Merge pull request #2812 from avaris/2646

Try unescaped paths in intrasite link discovery
This commit is contained in:
Justin Mayer 2020-10-19 20:43:18 +02:00 committed by GitHub
commit 197cd1e12e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 106 additions and 31 deletions

View file

@ -4,7 +4,8 @@ import locale
import logging import logging
import os import os
import re import re
from urllib.parse import urljoin, urlparse, urlunparse from html import unescape
from urllib.parse import unquote, urljoin, urlparse, urlunparse
import pytz import pytz
@ -250,6 +251,10 @@ class Content:
# XXX Put this in a different location. # XXX Put this in a different location.
if what in {'filename', 'static', 'attach'}: if what in {'filename', 'static', 'attach'}:
def _get_linked_content(key, url):
nonlocal value
def _find_path(path):
if path.startswith('/'): if path.startswith('/'):
path = path[1:] path = path[1:]
else: else:
@ -257,31 +262,44 @@ class Content:
path = self.get_relative_source_path( path = self.get_relative_source_path(
os.path.join(self.relative_dir, path) os.path.join(self.relative_dir, path)
) )
return self._context[key].get(path, None)
key = 'static_content' if what in ('static', 'attach')\ # try path
else 'generated_content' result = _find_path(url.path)
if result is not None:
return result
def _get_linked_content(key, path): # try unquoted path
try: result = _find_path(unquote(url.path))
return self._context[key][path] if result is not None:
except KeyError: return result
try:
# Markdown escapes spaces, try unescaping # try html unescaped url
return self._context[key][path.replace('%20', ' ')] unescaped_url = urlparse(unescape(url.geturl()))
except KeyError: result = _find_path(unescaped_url.path)
if result is not None:
value = unescaped_url
return result
# check if a static file is linked with {filename}
if what == 'filename' and key == 'generated_content': if what == 'filename' and key == 'generated_content':
key = 'static_content' linked_content = _get_linked_content('static_content', value)
linked_content = _get_linked_content(key, path)
if linked_content: if linked_content:
logger.warning( logger.warning(
'{filename} used for linking to static' '{filename} used for linking to static'
' content %s in %s. Use {static} instead', ' content %s in %s. Use {static} instead',
path, value.path,
self.get_relative_source_path()) self.get_relative_source_path())
return linked_content return linked_content
return None return None
linked_content = _get_linked_content(key, path) if what == 'filename':
key = 'generated_content'
else:
key = 'static_content'
linked_content = _get_linked_content(key, value)
if linked_content: if linked_content:
if what == 'attach': if what == 'attach':
linked_content.attach_to(self) linked_content.attach_to(self)

View file

@ -30,6 +30,9 @@ class TestBase(LoggedTestCase):
'content': TEST_CONTENT, 'content': TEST_CONTENT,
'context': { 'context': {
'localsiteurl': '', 'localsiteurl': '',
'generated_content': {},
'static_content': {},
'static_links': set()
}, },
'metadata': { 'metadata': {
'summary': TEST_SUMMARY, 'summary': TEST_SUMMARY,
@ -519,6 +522,60 @@ class TestPage(TestBase):
'<img src="http://static.cool.site/images/poster.jpg"/>' '<img src="http://static.cool.site/images/poster.jpg"/>'
) )
# Checks that {filename} and {static} intrasite links produce the same
# rendered output whether the link target is written literally, with an
# HTML character reference, percent-encoded, or a mix of the last two.
def test_intrasite_link_escape(self):
# Stand-in objects: the only attribute defined on them here is `url`.
article = type(
'_DummyArticle', (object,), {'url': 'article-spaces.html'})
asset = type(
'_DummyAsset', (object,), {'url': 'name@example.com'})
# NOTE(review): .copy() is shallow, so if page_kwargs is a plain dict the
# nested 'context' mapping is shared with self.page_kwargs and the two
# assignments below write into that shared mapping -- confirm the fixture
# is rebuilt for every test.
args = self.page_kwargs.copy()
args['settings'] = get_settings()
args['source_path'] = 'content'
# The lookup keys are the *unescaped* names: a literal space in the
# article path and a literal '@' in the static asset name.
args['context']['generated_content'] = {'article spaces.rst': article}
args['context']['static_content'] = {'name@example.com': asset}
# Every variant below must render to this exact string: each link points
# at the dummy object's `url` under the site URL, with '#anchor' kept.
expected_output = (
'A simple test with a '
'<a href="http://notmyidea.org/article-spaces.html#anchor">link</a> '
'<a href="http://notmyidea.org/name@example.com#anchor">file</a>'
)
# not escaped: both targets are written exactly as the lookup keys
args['content'] = (
'A simple test with a '
'<a href="{filename}article spaces.rst#anchor">link</a> '
'<a href="{static}name@example.com#anchor">file</a>'
)
content = Page(**args).get_content('http://notmyidea.org')
self.assertEqual(content, expected_output)
# html escaped: '@' in the static link is the character reference &#64;
# (the {filename} link still contains a literal space)
args['content'] = (
'A simple test with a '
'<a href="{filename}article spaces.rst#anchor">link</a> '
'<a href="{static}name&#64;example.com#anchor">file</a>'
)
content = Page(**args).get_content('http://notmyidea.org')
self.assertEqual(content, expected_output)
# url escaped: space is percent-encoded as %20 and '@' as %40
args['content'] = (
'A simple test with a '
'<a href="{filename}article%20spaces.rst#anchor">link</a> '
'<a href="{static}name%40example.com#anchor">file</a>'
)
content = Page(**args).get_content('http://notmyidea.org')
self.assertEqual(content, expected_output)
# html and url escaped: the {filename} link is percent-encoded while the
# {static} link uses the HTML character reference, in the same document
args['content'] = (
'A simple test with a '
'<a href="{filename}article%20spaces.rst#anchor">link</a> '
'<a href="{static}name&#64;example.com#anchor">file</a>'
)
content = Page(**args).get_content('http://notmyidea.org')
self.assertEqual(content, expected_output)
def test_intrasite_link_markdown_spaces(self): def test_intrasite_link_markdown_spaces(self):
cls_name = '_DummyArticle' cls_name = '_DummyArticle'
article = type(cls_name, (object,), {'url': 'article-spaces.html'}) article = type(cls_name, (object,), {'url': 'article-spaces.html'})