Make URL part joining aware of absolute URLs.

Previously, with RELATIVE_URLS disabled, when both SITEURL and STATIC_URL were absolute, the final generate data URLs looked wrong like this (two absolute URLs joined by `/`): http://your.site/http://static.your.site/image.png With this patch, the data URLs are correctly: http://static.your.site/image.png This also applies to all *_URL configuration options (for example, ability to have pages and articles on different domains) and behaves like one expects even with URLs starting with just `//`, thanks to making use of urllib.parse.urljoin(). However, when RELATIVE_URLS are enabled, urllib.parse.urljoin() doesn't handle the relative base correctly. In that case, simple os.path.join() is used. That, however, breaks the above case, but as RELATIVE_URLS are meant for local development (thus no data scattered across multiple domains), I don't see any problem. Just to clarify, this is a fully backwards-compatible change, it only enables new use cases that were impossible before.
2025-10-15 20:28:56 +02:00 · 2017-07-20 23:21:17 +02:00 · 2017-07-20 23:21:17 +02:00 · 0b13aa9b46
commit 0b13aa9b46
parent 1f30306e23
2 changed files with 73 additions and 6 deletions
--- a/pelican/contents.py
+++ b/pelican/contents.py
@ -11,7 +11,7 @@ import sys
 import pytz

 import six
-from six.moves.urllib.parse import urlparse, urlunparse
+from six.moves.urllib.parse import urljoin, urlparse, urlunparse

 from pelican import signals
 from pelican.settings import DEFAULT_CONFIG
@ -234,6 +234,25 @@ class Content(object):
        path = value.path
        origin = m.group('path')

+        # urllib.parse.urljoin() produces `a.html` for urljoin("..", "a.html")
+        # so if RELATIVE_URLS are enabled, we fall back to os.path.join() to
+        # properly get `../a.html`. However, os.path.join() produces
+        # `baz/http://foo/bar.html` for join("baz", "http://foo/bar.html")
+        # instead of correct "http://foo/bar.html", so one has to pick a side
+        # as there is no silver bullet.
+        if self.settings['RELATIVE_URLS']:
+            joiner = os.path.join
+        else:
+            joiner = urljoin
+
+            # However, it's not *that* simple: urljoin("blog", "index.html")
+            # produces just `index.html` instead of `blog/index.html` (unlike
+            # os.path.join()), so in order to get a correct answer one needs to
+            # append a trailing slash to siteurl in that case. This also makes
+            # the new behavior fully compatible with Pelican 3.7.1.
+            if not siteurl.endswith('/'):
+                siteurl += '/'
+
        # XXX Put this in a different location.
        if what in {'filename', 'attach'}:
            if path.startswith('/'):
@ -260,7 +279,7 @@ class Content(object):
                            "%s used {attach} link syntax on a "
                            "non-static file. Use {filename} instead.",
                            self.get_relative_source_path())
-                origin = '/'.join((siteurl, linked_content.url))
+                origin = joiner(siteurl, linked_content.url)
                origin = origin.replace('\\', '/')  # for Windows paths.
            else:
                logger.warning(
@ -269,13 +288,13 @@ class Content(object):
                        'limit_msg': ("Other resources were not found "
                                      "and their urls not replaced")})
        elif what == 'category':
-            origin = '/'.join((siteurl, Category(path, self.settings).url))
+            origin = joiner(siteurl, Category(path, self.settings).url)
        elif what == 'tag':
-            origin = '/'.join((siteurl, Tag(path, self.settings).url))
+            origin = joiner(siteurl, Tag(path, self.settings).url)
        elif what == 'index':
-            origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS']))
+            origin = joiner(siteurl, self.settings['INDEX_SAVE_AS'])
        elif what == 'author':
-            origin = '/'.join((siteurl, Author(path, self.settings).url))
+            origin = joiner(siteurl, Author(path, self.settings).url)
        else:
            logger.warning(
                "Replacement Indicator '%s' not recognized, "
--- a/pelican/tests/test_contents.py
+++ b/pelican/tests/test_contents.py
@ -397,6 +397,54 @@ class TestPage(LoggedTestCase):
            '</blockquote>'
        )

+    def test_intrasite_link_absolute(self):
+        """Test that absolute URLs are merged properly."""
+
+        args = self.page_kwargs.copy()
+        args['settings'] = get_settings(
+            STATIC_URL='http://static.cool.site/{path}',
+            ARTICLE_URL='http://blog.cool.site/{slug}.html')
+        args['source_path'] = 'content'
+        args['context']['filenames'] = {
+            'images/poster.jpg': Static('',
+                                        settings=args['settings'],
+                                        source_path='images/poster.jpg'),
+            'article.rst': Article('',
+                                   settings=args['settings'],
+                                   metadata={'slug': 'article',
+                                             'title': 'Article'})
+        }
+
+        # Article link will go to blog
+        args['content'] = (
+            '<a href="{filename}article.rst">Article</a>'
+        )
+        content = Page(**args).get_content('http://cool.site')
+        self.assertEqual(
+            content,
+            '<a href="http://blog.cool.site/article.html">Article</a>'
+        )
+
+        # Page link will go to the main site
+        args['content'] = (
+            '<a href="{index}">Index</a>'
+        )
+        content = Page(**args).get_content('http://cool.site')
+        self.assertEqual(
+            content,
+            '<a href="http://cool.site/index.html">Index</a>'
+        )
+
+        # Image link will go to static
+        args['content'] = (
+            '<img src="{filename}/images/poster.jpg"/>'
+        )
+        content = Page(**args).get_content('http://cool.site')
+        self.assertEqual(
+            content,
+            '<img src="http://static.cool.site/images/poster.jpg"/>'
+        )
+
    def test_intrasite_link_markdown_spaces(self):
        # Markdown introduces %20 instead of spaces, this tests that
        # we support markdown doing this.