1
0
Fork 0
forked from github/pelican

Try unescaped paths in intrasite link discovery

Some content parsers escape link paths in their HTML output
(i.e. docutils uses HTML escaping and markdown uses URL encoding).
Intrasite link discovery is refactored to also attempt HTML or URL
unescaped versions of the path in order to match more permissively.
This commit is contained in:
Deniz Turgut 2020-10-04 19:29:32 +03:00
commit fd0923d2f2
No known key found for this signature in database
GPG key ID: 87B7168D7AB3ED2F
2 changed files with 106 additions and 31 deletions

View file

@ -4,7 +4,8 @@ import locale
import logging
import os
import re
from urllib.parse import urljoin, urlparse, urlunparse
from html import unescape
from urllib.parse import unquote, urljoin, urlparse, urlunparse
import pytz
@ -250,38 +251,55 @@ class Content:
# XXX Put this in a different location.
if what in {'filename', 'static', 'attach'}:
if path.startswith('/'):
path = path[1:]
def _get_linked_content(key, url):
nonlocal value
def _find_path(path):
if path.startswith('/'):
path = path[1:]
else:
# relative to the source path of this content
path = self.get_relative_source_path(
os.path.join(self.relative_dir, path)
)
return self._context[key].get(path, None)
# try path
result = _find_path(url.path)
if result is not None:
return result
# try unquoted path
result = _find_path(unquote(url.path))
if result is not None:
return result
# try html unescaped url
unescaped_url = urlparse(unescape(url.geturl()))
result = _find_path(unescaped_url.path)
if result is not None:
value = unescaped_url
return result
# check if a static file is linked with {filename}
if what == 'filename' and key == 'generated_content':
linked_content = _get_linked_content('static_content', value)
if linked_content:
logger.warning(
'{filename} used for linking to static'
' content %s in %s. Use {static} instead',
value.path,
self.get_relative_source_path())
return linked_content
return None
if what == 'filename':
key = 'generated_content'
else:
# relative to the source path of this content
path = self.get_relative_source_path(
os.path.join(self.relative_dir, path)
)
key = 'static_content'
key = 'static_content' if what in ('static', 'attach')\
else 'generated_content'
def _get_linked_content(key, path):
    """Return the content registered under ``path``, or ``None``.

    ``self._context[key]`` is searched for ``path`` as given, then for
    ``path`` with every ``%20`` replaced by a space. If both miss while a
    ``{filename}`` link is being resolved against ``generated_content``,
    the lookup is repeated against ``static_content`` and a warning is
    logged when that succeeds.
    """
    # First attempt: the path exactly as it appears in the link.
    try:
        return self._context[key][path]
    except KeyError:
        pass

    # Markdown escapes spaces, try unescaping
    try:
        return self._context[key][path.replace('%20', ' ')]
    except KeyError:
        pass

    # Only a {filename} lookup in generated content gets the static
    # fallback; every other miss is final.
    if what != 'filename' or key != 'generated_content':
        return None

    static_match = _get_linked_content('static_content', path)
    if not static_match:
        return None

    logger.warning(
        '{filename} used for linking to static'
        ' content %s in %s. Use {static} instead',
        path,
        self.get_relative_source_path())
    return static_match
linked_content = _get_linked_content(key, path)
linked_content = _get_linked_content(key, value)
if linked_content:
if what == 'attach':
linked_content.attach_to(self)

View file

@ -30,6 +30,9 @@ class TestBase(LoggedTestCase):
'content': TEST_CONTENT,
'context': {
'localsiteurl': '',
'generated_content': {},
'static_content': {},
'static_links': set()
},
'metadata': {
'summary': TEST_SUMMARY,
@ -519,6 +522,60 @@ class TestPage(TestBase):
'<img src="http://static.cool.site/images/poster.jpg"/>'
)
def test_intrasite_link_escape(self):
    """Intrasite links resolve regardless of how their path is escaped.

    The same two links (one ``{filename}``, one ``{static}``) are rendered
    with plain, HTML-escaped, URL-escaped and mixed-escaped paths; every
    variant must produce the same output.
    """
    article = type(
        '_DummyArticle', (object,), {'url': 'article-spaces.html'})
    asset = type(
        '_DummyAsset', (object,), {'url': 'name@example.com'})

    args = self.page_kwargs.copy()
    args['settings'] = get_settings()
    args['source_path'] = 'content'
    args['context']['generated_content'] = {'article spaces.rst': article}
    args['context']['static_content'] = {'name@example.com': asset}

    expected_output = (
        'A simple test with a '
        '<a href="http://notmyidea.org/article-spaces.html#anchor">link</a> '
        '<a href="http://notmyidea.org/name@example.com#anchor">file</a>'
    )

    escaped_variants = (
        # not escaped
        (
            'A simple test with a '
            '<a href="{filename}article spaces.rst#anchor">link</a> '
            '<a href="{static}name@example.com#anchor">file</a>'
        ),
        # html escaped
        (
            'A simple test with a '
            '<a href="{filename}article spaces.rst#anchor">link</a> '
            '<a href="{static}name&#64;example.com#anchor">file</a>'
        ),
        # url escaped
        (
            'A simple test with a '
            '<a href="{filename}article%20spaces.rst#anchor">link</a> '
            '<a href="{static}name%40example.com#anchor">file</a>'
        ),
        # html and url escaped
        (
            'A simple test with a '
            '<a href="{filename}article%20spaces.rst#anchor">link</a> '
            '<a href="{static}name&#64;example.com#anchor">file</a>'
        ),
    )

    # Variants are rendered in the same order as they are listed above.
    for markup in escaped_variants:
        args['content'] = markup
        content = Page(**args).get_content('http://notmyidea.org')
        self.assertEqual(content, expected_output)
def test_intrasite_link_markdown_spaces(self):
cls_name = '_DummyArticle'
article = type(cls_name, (object,), {'url': 'article-spaces.html'})