From 0b13aa9b4610a8d26ece75cfcebebaa5fb3fa3de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vladim=C3=ADr=20Vondru=C5=A1?= Date: Thu, 20 Jul 2017 23:21:17 +0200 Subject: [PATCH] Make URL part joining aware of absolute URLs. Previously, with RELATIVE_URLS disabled, when both SITEURL and STATIC_URL were absolute, the final generate data URLs looked wrong like this (two absolute URLs joined by `/`): http://your.site/http://static.your.site/image.png With this patch, the data URLs are correctly: http://static.your.site/image.png This also applies to all *_URL configuration options (for example, ability to have pages and articles on different domains) and behaves like one expects even with URLs starting with just `//`, thanks to making use of urllib.parse.urljoin(). However, when RELATIVE_URLS are enabled, urllib.parse.urljoin() doesn't handle the relative base correctly. In that case, simple os.path.join() is used. That, however, breaks the above case, but as RELATIVE_URLS are meant for local development (thus no data scattered across multiple domains), I don't see any problem. Just to clarify, this is a fully backwards-compatible change, it only enables new use cases that were impossible before. --- pelican/contents.py | 31 +++++++++++++++++----- pelican/tests/test_contents.py | 48 ++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 6 deletions(-) diff --git a/pelican/contents.py b/pelican/contents.py index a534dbaa..e84c8296 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -11,7 +11,7 @@ import sys import pytz import six -from six.moves.urllib.parse import urlparse, urlunparse +from six.moves.urllib.parse import urljoin, urlparse, urlunparse from pelican import signals from pelican.settings import DEFAULT_CONFIG @@ -234,6 +234,25 @@ class Content(object): path = value.path origin = m.group('path') + # urllib.parse.urljoin() produces `a.html` for urljoin("..", "a.html") + # so if RELATIVE_URLS are enabled, we fall back to os.path.join() to + # properly get `../a.html`. However, os.path.join() produces + # `baz/http://foo/bar.html` for join("baz", "http://foo/bar.html") + # instead of correct "http://foo/bar.html", so one has to pick a side + # as there is no silver bullet. + if self.settings['RELATIVE_URLS']: + joiner = os.path.join + else: + joiner = urljoin + + # However, it's not *that* simple: urljoin("blog", "index.html") + # produces just `index.html` instead of `blog/index.html` (unlike + # os.path.join()), so in order to get a correct answer one needs to + # append a trailing slash to siteurl in that case. This also makes + # the new behavior fully compatible with Pelican 3.7.1. + if not siteurl.endswith('/'): + siteurl += '/' + # XXX Put this in a different location. if what in {'filename', 'attach'}: if path.startswith('/'): @@ -260,7 +279,7 @@ class Content(object): "%s used {attach} link syntax on a " "non-static file. Use {filename} instead.", self.get_relative_source_path()) - origin = '/'.join((siteurl, linked_content.url)) + origin = joiner(siteurl, linked_content.url) origin = origin.replace('\\', '/') # for Windows paths. else: logger.warning( @@ -269,13 +288,13 @@ class Content(object): 'limit_msg': ("Other resources were not found " "and their urls not replaced")}) elif what == 'category': - origin = '/'.join((siteurl, Category(path, self.settings).url)) + origin = joiner(siteurl, Category(path, self.settings).url) elif what == 'tag': - origin = '/'.join((siteurl, Tag(path, self.settings).url)) + origin = joiner(siteurl, Tag(path, self.settings).url) elif what == 'index': - origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS'])) + origin = joiner(siteurl, self.settings['INDEX_SAVE_AS']) elif what == 'author': - origin = '/'.join((siteurl, Author(path, self.settings).url)) + origin = joiner(siteurl, Author(path, self.settings).url) else: logger.warning( "Replacement Indicator '%s' not recognized, " diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py index d028c7a1..04c82f61 100644 --- a/pelican/tests/test_contents.py +++ b/pelican/tests/test_contents.py @@ -397,6 +397,54 @@ class TestPage(LoggedTestCase): '' ) + def test_intrasite_link_absolute(self): + """Test that absolute URLs are merged properly.""" + + args = self.page_kwargs.copy() + args['settings'] = get_settings( + STATIC_URL='http://static.cool.site/{path}', + ARTICLE_URL='http://blog.cool.site/{slug}.html') + args['source_path'] = 'content' + args['context']['filenames'] = { + 'images/poster.jpg': Static('', + settings=args['settings'], + source_path='images/poster.jpg'), + 'article.rst': Article('', + settings=args['settings'], + metadata={'slug': 'article', + 'title': 'Article'}) + } + + # Article link will go to blog + args['content'] = ( + 'Article' + ) + content = Page(**args).get_content('http://cool.site') + self.assertEqual( + content, + 'Article' + ) + + # Page link will go to the main site + args['content'] = ( + 'Index' + ) + content = Page(**args).get_content('http://cool.site') + self.assertEqual( + content, + 'Index' + ) + + # Image link will go to static + args['content'] = ( + '' + ) + content = Page(**args).get_content('http://cool.site') + self.assertEqual( + content, + '' + ) + def test_intrasite_link_markdown_spaces(self): # Markdown introduces %20 instead of spaces, this tests that # we support markdown doing this.