Try unescaped paths in intrasite link discovery

Some content parsers escape link paths in their HTML output (e.g. docutils uses HTML escaping and markdown uses URL encoding). Intrasite link discovery is refactored to also attempt HTML-unescaped or URL-unquoted versions of the path in order to match more permissively.
2025-10-15 20:28:56 +02:00 · 2020-10-04 19:29:32 +03:00 · 2020-10-04 19:29:32 +03:00 · fd0923d2f2
commit fd0923d2f2
parent 7a6686f467
2 changed files with 106 additions and 31 deletions
--- a/pelican/contents.py
+++ b/pelican/contents.py
@ -4,7 +4,8 @@ import locale
 import logging
 import os
 import re
-from urllib.parse import urljoin, urlparse, urlunparse
+from html import unescape
 from urllib.parse import unquote, urljoin, urlparse, urlunparse
 import pytz
@ -250,6 +251,10 @@ class Content:
        # XXX Put this in a different location.
        if what in {'filename', 'static', 'attach'}:
            def _get_linked_content(key, url):
                nonlocal value
                def _find_path(path):
                    if path.startswith('/'):
                        path = path[1:]
                    else:
@ -257,31 +262,44 @@ class Content:
                        path = self.get_relative_source_path(
                            os.path.join(self.relative_dir, path)
                        )
                    return self._context[key].get(path, None)
-            key = 'static_content' if what in ('static', 'attach')\
+                # try path
-                else 'generated_content'
+                result = _find_path(url.path)
                if result is not None:
                    return result
-            def _get_linked_content(key, path):
+                # try unquoted path
-                try:
+                result = _find_path(unquote(url.path))
-                    return self._context[key][path]
+                if result is not None:
-                except KeyError:
+                    return result
-                    try:
+
-                        # Markdown escapes spaces, try unescaping
+                # try html unescaped url
-                        return self._context[key][path.replace('%20', ' ')]
+                unescaped_url = urlparse(unescape(url.geturl()))
-                    except KeyError:
+                result = _find_path(unescaped_url.path)
                if result is not None:
                    value = unescaped_url
                    return result
                # check if a static file is linked with {filename}
                if what == 'filename' and key == 'generated_content':
-                            key = 'static_content'
+                    linked_content = _get_linked_content('static_content', value)
                            linked_content = _get_linked_content(key, path)
                    if linked_content:
                        logger.warning(
                            '{filename} used for linking to static'
                            ' content %s in %s. Use {static} instead',
-                                    path,
+                            value.path,
                            self.get_relative_source_path())
                        return linked_content
                return None
-            linked_content = _get_linked_content(key, path)
+            if what == 'filename':
                key = 'generated_content'
            else:
                key = 'static_content'
            linked_content = _get_linked_content(key, value)
            if linked_content:
                if what == 'attach':
                    linked_content.attach_to(self)
--- a/pelican/tests/test_contents.py
+++ b/pelican/tests/test_contents.py
@ -30,6 +30,9 @@ class TestBase(LoggedTestCase):
            'content': TEST_CONTENT,
            'context': {
                'localsiteurl': '',
                'generated_content': {},
                'static_content': {},
                'static_links': set()
            },
            'metadata': {
                'summary': TEST_SUMMARY,
@ -519,6 +522,60 @@ class TestPage(TestBase):
            '<img src="http://static.cool.site/images/poster.jpg"/>'
        )
    def test_intrasite_link_escape(self):
        """Check intrasite links written raw, HTML-escaped or URL-quoted.

        Every variant of the source content below must render to the same
        output, with both the ``{filename}`` and ``{static}`` links replaced.
        """
        dummy_article = type(
            '_DummyArticle', (object,), {'url': 'article-spaces.html'})
        dummy_asset = type(
            '_DummyAsset', (object,), {'url': 'name@example.com'})

        page_args = self.page_kwargs.copy()
        page_args['settings'] = get_settings()
        page_args['source_path'] = 'content'
        context = page_args['context']
        context['generated_content'] = {'article spaces.rst': dummy_article}
        context['static_content'] = {'name@example.com': dummy_asset}

        expected_output = (
            'A simple test with a '
            '<a href="http://notmyidea.org/article-spaces.html#anchor">link</a> '
            '<a href="http://notmyidea.org/name@example.com#anchor">file</a>'
        )

        # The same two links, spelled the different ways a content parser
        # may emit them.
        content_variants = (
            # not escaped
            (
                'A simple test with a '
                '<a href="{filename}article spaces.rst#anchor">link</a> '
                '<a href="{static}name@example.com#anchor">file</a>'
            ),
            # html escaped
            (
                'A simple test with a '
                '<a href="{filename}article spaces.rst#anchor">link</a> '
                '<a href="{static}name&#64;example.com#anchor">file</a>'
            ),
            # url escaped
            (
                'A simple test with a '
                '<a href="{filename}article%20spaces.rst#anchor">link</a> '
                '<a href="{static}name%40example.com#anchor">file</a>'
            ),
            # html and url escaped
            (
                'A simple test with a '
                '<a href="{filename}article%20spaces.rst#anchor">link</a> '
                '<a href="{static}name&#64;example.com#anchor">file</a>'
            ),
        )

        for source_content in content_variants:
            page_args['content'] = source_content
            rendered = Page(**page_args).get_content('http://notmyidea.org')
            self.assertEqual(rendered, expected_output)
    def test_intrasite_link_markdown_spaces(self):
        cls_name = '_DummyArticle'
        article = type(cls_name, (object,), {'url': 'article-spaces.html'})