Merge pull request #1093 from honzajavorek/intrasite-link-parsing

Better intrasite link parsing
2025-10-15 20:28:56 +02:00 · 2013-09-25 07:58:27 -07:00 · 2013-09-25 07:58:27 -07:00 · dbbf95b184
commit dbbf95b184
parent 2c468f091a 6ed23fec7d
2 changed files with 86 additions and 15 deletions
--- a/pelican/contents.py
+++ b/pelican/contents.py
@@ -10,6 +10,11 @@ import os
 import re
 import sys

+try:
+    from urlparse import urlparse, urlunparse
+except ImportError:
+    from urllib.parse import urlparse, urlunparse
+
 from datetime import datetime


@@ -194,34 +199,36 @@ class Content(object):

        def replacer(m):
            what = m.group('what')
-            value = m.group('value')
+            value = urlparse(m.group('value'))
+            path = value.path
            origin = m.group('path')

-            # we support only filename for now. the plan is to support
-            # categories, tags, etc. in the future, but let's keep things
-            # simple for now.
-
            # XXX Put this in a different location.
            if what == 'filename':
-                if value.startswith('/'):
-                    value = value[1:]
+                if path.startswith('/'):
+                    path = path[1:]
                else:
                    # relative to the source path of this content
-                    value = self.get_relative_source_path(
-                        os.path.join(self.relative_dir, value)
+                    path = self.get_relative_source_path(
+                        os.path.join(self.relative_dir, path)
                    )

-                if value in self._context['filenames']:
+                if path in self._context['filenames']:
                    origin = '/'.join((siteurl,
-                             self._context['filenames'][value].url))
-                    origin = origin.replace('\\', '/')  # Fow windows paths.
+                             self._context['filenames'][path].url))
+                    origin = origin.replace('\\', '/')  # for Windows paths.
                else:
                    logger.warning("Unable to find {fn}, skipping url"
-                                   " replacement".format(fn=value))
+                                   " replacement".format(fn=path))
            elif what == 'category':
-                origin = Category(value, self.settings).url
+                origin = Category(path, self.settings).url
            elif what == 'tag':
-                origin = Tag(value, self.settings).url
+                origin = Tag(path, self.settings).url
+
+            # keep all other parts, such as query, fragment, etc.
+            parts = list(value)
+            parts[2] = origin
+            origin = urlunparse(parts)

            return ''.join((m.group('markup'), m.group('quote'), origin,
                            m.group('quote')))
--- a/pelican/tests/test_contents.py
+++ b/pelican/tests/test_contents.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals

+import six
 from datetime import datetime
 from sys import platform

@@ -204,6 +205,69 @@ class TestPage(unittest.TestCase):
                          ('A simple test, with a '
                           '<a href="category/category.html">link</a>'))

+    def test_intrasite_link(self):
+        # type() rejects a unicode class name in PY2 and a bytes one in PY3,
+        # which, combined with unicode_literals, leads to this insane line:
+        cls_name = '_DummyArticle' if six.PY3 else b'_DummyArticle'
+        article = type(cls_name, (object,), {'url': 'article.html'})
+
+        args = self.page_kwargs.copy()
+        args['settings'] = get_settings()
+        args['source_path'] = 'content'
+        args['context']['filenames'] = {'article.rst': article}
+
+        # Classic intrasite link via filename
+        args['content'] = (
+            'A simple test, with a '
+            '<a href="|filename|article.rst">link</a>'
+        )
+        content = Page(**args).get_content('http://notmyidea.org')
+        self.assertEquals(
+            content,
+            'A simple test, with a '
+            '<a href="http://notmyidea.org/article.html">link</a>'
+        )
+
+        # fragment
+        args['content'] = (
+            'A simple test, with a '
+            '<a href="|filename|article.rst#section-2">link</a>'
+        )
+        content = Page(**args).get_content('http://notmyidea.org')
+        self.assertEquals(
+            content,
+            'A simple test, with a '
+            '<a href="http://notmyidea.org/article.html#section-2">link</a>'
+        )
+
+        # query
+        args['content'] = (
+            'A simple test, with a '
+            '<a href="|filename|article.rst'
+            '?utm_whatever=234&highlight=word">link</a>'
+        )
+        content = Page(**args).get_content('http://notmyidea.org')
+        self.assertEquals(
+            content,
+            'A simple test, with a '
+            '<a href="http://notmyidea.org/article.html'
+            '?utm_whatever=234&highlight=word">link</a>'
+        )
+
+        # combination
+        args['content'] = (
+            'A simple test, with a '
+            '<a href="|filename|article.rst'
+            '?utm_whatever=234&highlight=word#section-2">link</a>'
+        )
+        content = Page(**args).get_content('http://notmyidea.org')
+        self.assertEquals(
+            content,
+            'A simple test, with a '
+            '<a href="http://notmyidea.org/article.html'
+            '?utm_whatever=234&highlight=word#section-2">link</a>'
+        )
+

 class TestArticle(TestPage):
    def test_template(self):