From 1f30306e2329e5a1f0c5dd39844d9bb0a0c04573 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vladim=C3=ADr=20Vondru=C5=A1?= <mosra@centrum.cz>
Date: Tue, 19 Sep 2017 18:22:56 +0200
Subject: [PATCH 1/2] Make the internal link replacer function public.

So it can be used from outside.
---
 pelican/contents.py | 126 ++++++++++++++++++++++----------------------
 1 file changed, 63 insertions(+), 63 deletions(-)

diff --git a/pelican/contents.py b/pelican/contents.py
index 15770fc8..a534dbaa 100644
--- a/pelican/contents.py
+++ b/pelican/contents.py
@@ -228,6 +228,68 @@ class Content(object):
         key = key if self.in_default_lang else 'lang_%s' % key
         return self._expand_settings(key)
 
+    def _link_replacer(self, siteurl, m):
+        what = m.group('what')
+        value = urlparse(m.group('value'))
+        path = value.path
+        origin = m.group('path')
+
+        # XXX Put this in a different location.
+        if what in {'filename', 'attach'}:
+            if path.startswith('/'):
+                path = path[1:]
+            else:
+                # relative to the source path of this content
+                path = self.get_relative_source_path(
+                    os.path.join(self.relative_dir, path)
+                )
+
+            if path not in self._context['filenames']:
+                unquoted_path = path.replace('%20', ' ')
+
+                if unquoted_path in self._context['filenames']:
+                    path = unquoted_path
+
+            linked_content = self._context['filenames'].get(path)
+            if linked_content:
+                if what == 'attach':
+                    if isinstance(linked_content, Static):
+                        linked_content.attach_to(self)
+                    else:
+                        logger.warning(
+                            "%s used {attach} link syntax on a "
+                            "non-static file. Use {filename} instead.",
+                            self.get_relative_source_path())
+                origin = '/'.join((siteurl, linked_content.url))
+                origin = origin.replace('\\', '/')  # for Windows paths.
+            else:
+                logger.warning(
+                    "Unable to find '%s', skipping url replacement.",
+                    value.geturl(), extra={
+                        'limit_msg': ("Other resources were not found "
+                                      "and their urls not replaced")})
+        elif what == 'category':
+            origin = '/'.join((siteurl, Category(path, self.settings).url))
+        elif what == 'tag':
+            origin = '/'.join((siteurl, Tag(path, self.settings).url))
+        elif what == 'index':
+            origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS']))
+        elif what == 'author':
+            origin = '/'.join((siteurl, Author(path, self.settings).url))
+        else:
+            logger.warning(
+                "Replacement Indicator '%s' not recognized, "
+                "skipping replacement",
+                what)
+
+        # keep all other parts, such as query, fragment, etc.
+        parts = list(value)
+        parts[2] = origin
+        origin = urlunparse(parts)
+
+        return ''.join((m.group('markup'), m.group('quote'), origin,
+                        m.group('quote')))
+
     def _update_content(self, content, siteurl):
         """Update the content attribute.
 
@@ -251,69 +313,7 @@ class Content(object):
             \2""".format(instrasite_link_regex)
         hrefs = re.compile(regex, re.X)
 
-        def replacer(m):
-            what = m.group('what')
-            value = urlparse(m.group('value'))
-            path = value.path
-            origin = m.group('path')
-
-            # XXX Put this in a different location.
-            if what in {'filename', 'attach'}:
-                if path.startswith('/'):
-                    path = path[1:]
-                else:
-                    # relative to the source path of this content
-                    path = self.get_relative_source_path(
-                        os.path.join(self.relative_dir, path)
-                    )
-
-                if path not in self._context['filenames']:
-                    unquoted_path = path.replace('%20', ' ')
-
-                    if unquoted_path in self._context['filenames']:
-                        path = unquoted_path
-
-                linked_content = self._context['filenames'].get(path)
-                if linked_content:
-                    if what == 'attach':
-                        if isinstance(linked_content, Static):
-                            linked_content.attach_to(self)
-                        else:
-                            logger.warning(
-                                "%s used {attach} link syntax on a "
-                                "non-static file. Use {filename} instead.",
-                                self.get_relative_source_path())
-                    origin = '/'.join((siteurl, linked_content.url))
-                    origin = origin.replace('\\', '/')  # for Windows paths.
-                else:
-                    logger.warning(
-                        "Unable to find '%s', skipping url replacement.",
-                        value.geturl(), extra={
-                            'limit_msg': ("Other resources were not found "
-                                          "and their urls not replaced")})
-            elif what == 'category':
-                origin = '/'.join((siteurl, Category(path, self.settings).url))
-            elif what == 'tag':
-                origin = '/'.join((siteurl, Tag(path, self.settings).url))
-            elif what == 'index':
-                origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS']))
-            elif what == 'author':
-                origin = '/'.join((siteurl, Author(path, self.settings).url))
-            else:
-                logger.warning(
-                    "Replacement Indicator '%s' not recognized, "
-                    "skipping replacement",
-                    what)
-
-            # keep all other parts, such as query, fragment, etc.
-            parts = list(value)
-            parts[2] = origin
-            origin = urlunparse(parts)
-
-            return ''.join((m.group('markup'), m.group('quote'), origin,
-                            m.group('quote')))
-
-        return hrefs.sub(replacer, content)
+        return hrefs.sub(lambda m: self._link_replacer(siteurl, m), content)
 
     def get_siteurl(self):
         return self._context.get('localsiteurl', '')

From 0b13aa9b4610a8d26ece75cfcebebaa5fb3fa3de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vladim=C3=ADr=20Vondru=C5=A1?= <mosra@centrum.cz>
Date: Thu, 20 Jul 2017 23:21:17 +0200
Subject: [PATCH 2/2] Make URL part joining aware of absolute URLs.

Previously, with RELATIVE_URLS disabled, when both SITEURL and
STATIC_URL were absolute, the final generate data URLs looked wrong like
this (two absolute URLs joined by `/`):

    http://your.site/http://static.your.site/image.png

With this patch, the data URLs are correctly:

    http://static.your.site/image.png

This also applies to all *_URL configuration options (for example,
ability to have pages and articles on different domains) and behaves
like one expects even with URLs starting with just `//`, thanks to
making use of urllib.parse.urljoin().

However, when RELATIVE_URLS are enabled, urllib.parse.urljoin() doesn't
handle the relative base correctly. In that case, simple os.path.join()
is used. That, however, breaks the above case, but as RELATIVE_URLS are
meant for local development (thus no data scattered across multiple
domains), I don't see any problem.

Just to clarify, this is a fully backwards-compatible change, it only
enables new use cases that were impossible before.
---
 pelican/contents.py            | 31 +++++++++++++++++-----
 pelican/tests/test_contents.py | 48 ++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/pelican/contents.py b/pelican/contents.py
index a534dbaa..e84c8296 100644
--- a/pelican/contents.py
+++ b/pelican/contents.py
@@ -11,7 +11,7 @@ import sys
 import pytz
 
 import six
-from six.moves.urllib.parse import urlparse, urlunparse
+from six.moves.urllib.parse import urljoin, urlparse, urlunparse
 
 from pelican import signals
 from pelican.settings import DEFAULT_CONFIG
@@ -234,6 +234,25 @@ class Content(object):
         path = value.path
         origin = m.group('path')
 
+        # urllib.parse.urljoin() produces `a.html` for urljoin("..", "a.html")
+        # so if RELATIVE_URLS are enabled, we fall back to os.path.join() to
+        # properly get `../a.html`. However, os.path.join() produces
+        # `baz/http://foo/bar.html` for join("baz", "http://foo/bar.html")
+        # instead of correct "http://foo/bar.html", so one has to pick a side
+        # as there is no silver bullet.
+        if self.settings['RELATIVE_URLS']:
+            joiner = os.path.join
+        else:
+            joiner = urljoin
+
+            # However, it's not *that* simple: urljoin("blog", "index.html")
+            # produces just `index.html` instead of `blog/index.html` (unlike
+            # os.path.join()), so in order to get a correct answer one needs to
+            # append a trailing slash to siteurl in that case. This also makes
+            # the new behavior fully compatible with Pelican 3.7.1.
+            if not siteurl.endswith('/'):
+                siteurl += '/'
+
         # XXX Put this in a different location.
         if what in {'filename', 'attach'}:
             if path.startswith('/'):
@@ -260,7 +279,7 @@ class Content(object):
                             "%s used {attach} link syntax on a "
                             "non-static file. Use {filename} instead.",
                             self.get_relative_source_path())
-                origin = '/'.join((siteurl, linked_content.url))
+                origin = joiner(siteurl, linked_content.url)
                 origin = origin.replace('\\', '/')  # for Windows paths.
             else:
                 logger.warning(
@@ -269,13 +288,13 @@ class Content(object):
                         'limit_msg': ("Other resources were not found "
                                       "and their urls not replaced")})
         elif what == 'category':
-            origin = '/'.join((siteurl, Category(path, self.settings).url))
+            origin = joiner(siteurl, Category(path, self.settings).url)
         elif what == 'tag':
-            origin = '/'.join((siteurl, Tag(path, self.settings).url))
+            origin = joiner(siteurl, Tag(path, self.settings).url)
         elif what == 'index':
-            origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS']))
+            origin = joiner(siteurl, self.settings['INDEX_SAVE_AS'])
         elif what == 'author':
-            origin = '/'.join((siteurl, Author(path, self.settings).url))
+            origin = joiner(siteurl, Author(path, self.settings).url)
         else:
             logger.warning(
                 "Replacement Indicator '%s' not recognized, "
diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py
index d028c7a1..04c82f61 100644
--- a/pelican/tests/test_contents.py
+++ b/pelican/tests/test_contents.py
@@ -397,6 +397,54 @@ class TestPage(LoggedTestCase):
             '</blockquote>'
         )
 
+    def test_intrasite_link_absolute(self):
+        """Test that absolute URLs are merged properly."""
+
+        args = self.page_kwargs.copy()
+        args['settings'] = get_settings(
+            STATIC_URL='http://static.cool.site/{path}',
+            ARTICLE_URL='http://blog.cool.site/{slug}.html')
+        args['source_path'] = 'content'
+        args['context']['filenames'] = {
+            'images/poster.jpg': Static('',
+                                        settings=args['settings'],
+                                        source_path='images/poster.jpg'),
+            'article.rst': Article('',
+                                   settings=args['settings'],
+                                   metadata={'slug': 'article',
+                                             'title': 'Article'})
+        }
+
+        # Article link will go to blog
+        args['content'] = (
+            '<a href="{filename}article.rst">Article</a>'
+        )
+        content = Page(**args).get_content('http://cool.site')
+        self.assertEqual(
+            content,
+            '<a href="http://blog.cool.site/article.html">Article</a>'
+        )
+
+        # Page link will go to the main site
+        args['content'] = (
+            '<a href="{index}">Index</a>'
+        )
+        content = Page(**args).get_content('http://cool.site')
+        self.assertEqual(
+            content,
+            '<a href="http://cool.site/index.html">Index</a>'
+        )
+
+        # Image link will go to static
+        args['content'] = (
+            '<img src="{filename}/images/poster.jpg"/>'
+        )
+        content = Page(**args).get_content('http://cool.site')
+        self.assertEqual(
+            content,
+            '<img src="http://static.cool.site/images/poster.jpg"/>'
+        )
+
     def test_intrasite_link_markdown_spaces(self):
         # Markdown introduces %20 instead of spaces, this tests that
         # we support markdown doing this.