From fd0923d2f24c9833021c01f247f97f4d1d8e67b4 Mon Sep 17 00:00:00 2001
From: Deniz Turgut <dturgut@gmail.com>
Date: Sun, 4 Oct 2020 19:29:32 +0300
Subject: [PATCH] Try unescaped paths in intrasite link discovery

Some content parsers escape link paths in their HTML output
(e.g. docutils uses HTML escaping and markdown uses URL encoding).
Intrasite link discovery is refactored to also attempt HTML- or
URL-unescaped versions of the path in order to match more permissively.
---
 pelican/contents.py            | 80 +++++++++++++++++++++-------------
 pelican/tests/test_contents.py | 57 ++++++++++++++++++++++++
 2 files changed, 106 insertions(+), 31 deletions(-)

diff --git a/pelican/contents.py b/pelican/contents.py
index 6470ee45..75cedcdc 100644
--- a/pelican/contents.py
+++ b/pelican/contents.py
@@ -4,7 +4,8 @@ import locale
 import logging
 import os
 import re
-from urllib.parse import urljoin, urlparse, urlunparse
+from html import unescape
+from urllib.parse import unquote, urljoin, urlparse, urlunparse
 
 import pytz
 
@@ -250,38 +251,55 @@ class Content:
 
         # XXX Put this in a different location.
         if what in {'filename', 'static', 'attach'}:
-            if path.startswith('/'):
-                path = path[1:]
+            def _get_linked_content(key, url):
+                nonlocal value
+
+                def _find_path(path):
+                    if path.startswith('/'):
+                        path = path[1:]
+                    else:
+                        # relative to the source path of this content
+                        path = self.get_relative_source_path(
+                            os.path.join(self.relative_dir, path)
+                        )
+                    return self._context[key].get(path, None)
+
+                # try path
+                result = _find_path(url.path)
+                if result is not None:
+                    return result
+
+                # try unquoted path
+                result = _find_path(unquote(url.path))
+                if result is not None:
+                    return result
+
+                # try html unescaped url
+                unescaped_url = urlparse(unescape(url.geturl()))
+                result = _find_path(unescaped_url.path)
+                if result is not None:
+                    value = unescaped_url
+                    return result
+
+                # check if a static file is linked with {filename}
+                if what == 'filename' and key == 'generated_content':
+                    linked_content = _get_linked_content('static_content', value)
+                    if linked_content:
+                        logger.warning(
+                            '{filename} used for linking to static'
+                            ' content %s in %s. Use {static} instead',
+                            value.path,
+                            self.get_relative_source_path())
+                        return linked_content
+
+                return None
+
+            if what == 'filename':
+                key = 'generated_content'
             else:
-                # relative to the source path of this content
-                path = self.get_relative_source_path(
-                    os.path.join(self.relative_dir, path)
-                )
+                key = 'static_content'
 
-            key = 'static_content' if what in ('static', 'attach')\
-                else 'generated_content'
-
-            def _get_linked_content(key, path):
-                try:
-                    return self._context[key][path]
-                except KeyError:
-                    try:
-                        # Markdown escapes spaces, try unescaping
-                        return self._context[key][path.replace('%20', ' ')]
-                    except KeyError:
-                        if what == 'filename' and key == 'generated_content':
-                            key = 'static_content'
-                            linked_content = _get_linked_content(key, path)
-                            if linked_content:
-                                logger.warning(
-                                    '{filename} used for linking to static'
-                                    ' content %s in %s. Use {static} instead',
-                                    path,
-                                    self.get_relative_source_path())
-                                return linked_content
-                        return None
-
-            linked_content = _get_linked_content(key, path)
+            linked_content = _get_linked_content(key, value)
             if linked_content:
                 if what == 'attach':
                     linked_content.attach_to(self)
diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py
index 1a520bc7..32012d4f 100644
--- a/pelican/tests/test_contents.py
+++ b/pelican/tests/test_contents.py
@@ -30,6 +30,9 @@ class TestBase(LoggedTestCase):
             'content': TEST_CONTENT,
             'context': {
                 'localsiteurl': '',
+                'generated_content': {},
+                'static_content': {},
+                'static_links': set()
             },
             'metadata': {
                 'summary': TEST_SUMMARY,
@@ -519,6 +522,60 @@ class TestPage(TestBase):
             '<img src="http://static.cool.site/images/poster.jpg"/>'
         )
 
+    def test_intrasite_link_escape(self):
+        article = type(
+            '_DummyArticle', (object,), {'url': 'article-spaces.html'})
+        asset = type(
+            '_DummyAsset', (object,), {'url': 'name@example.com'})
+
+        args = self.page_kwargs.copy()
+        args['settings'] = get_settings()
+        args['source_path'] = 'content'
+        args['context']['generated_content'] = {'article spaces.rst': article}
+        args['context']['static_content'] = {'name@example.com': asset}
+
+        expected_output = (
+            'A simple test with a '
+            '<a href="http://notmyidea.org/article-spaces.html#anchor">link</a> '
+            '<a href="http://notmyidea.org/name@example.com#anchor">file</a>'
+        )
+
+        # not escaped
+        args['content'] = (
+            'A simple test with a '
+            '<a href="{filename}article spaces.rst#anchor">link</a> '
+            '<a href="{static}name@example.com#anchor">file</a>'
+        )
+        content = Page(**args).get_content('http://notmyidea.org')
+        self.assertEqual(content, expected_output)
+
+        # html escaped
+        args['content'] = (
+            'A simple test with a '
+            '<a href="{filename}article spaces.rst#anchor">link</a> '
+            '<a href="{static}name&#64;example.com#anchor">file</a>'
+        )
+        content = Page(**args).get_content('http://notmyidea.org')
+        self.assertEqual(content, expected_output)
+
+        # url escaped
+        args['content'] = (
+            'A simple test with a '
+            '<a href="{filename}article%20spaces.rst#anchor">link</a> '
+            '<a href="{static}name%40example.com#anchor">file</a>'
+        )
+        content = Page(**args).get_content('http://notmyidea.org')
+        self.assertEqual(content, expected_output)
+
+        # html and url escaped
+        args['content'] = (
+            'A simple test with a '
+            '<a href="{filename}article%20spaces.rst#anchor">link</a> '
+            '<a href="{static}name&#64;example.com#anchor">file</a>'
+        )
+        content = Page(**args).get_content('http://notmyidea.org')
+        self.assertEqual(content, expected_output)
+
     def test_intrasite_link_markdown_spaces(self):
         cls_name = '_DummyArticle'
         article = type(cls_name, (object,), {'url': 'article-spaces.html'})