Make URL part joining aware of absolute URLs.

Previously, with RELATIVE_URLS disabled, when both SITEURL and
STATIC_URL were absolute, the final generate data URLs looked wrong like
this (two absolute URLs joined by `/`):

    http://your.site/http://static.your.site/image.png

With this patch, the data URLs are correctly:

    http://static.your.site/image.png

This also applies to all *_URL configuration options (for example,
ability to have pages and articles on different domains) and behaves
like one expects even with URLs starting with just `//`, thanks to
making use of urllib.parse.urljoin().

However, when RELATIVE_URLS are enabled, urllib.parse.urljoin() doesn't
handle the relative base correctly. In that case, simple os.path.join()
is used. That, however, breaks the above case, but as RELATIVE_URLS are
meant for local development (thus no data scattered across multiple
domains), I don't see any problem.

Just to clarify, this is a fully backwards-compatible change, it only
enables new use cases that were impossible before.
This commit is contained in:
Vladimír Vondruš 2017-07-20 23:21:17 +02:00
commit 0b13aa9b46
2 changed files with 73 additions and 6 deletions

View file

@ -11,7 +11,7 @@ import sys
import pytz import pytz
import six import six
from six.moves.urllib.parse import urlparse, urlunparse from six.moves.urllib.parse import urljoin, urlparse, urlunparse
from pelican import signals from pelican import signals
from pelican.settings import DEFAULT_CONFIG from pelican.settings import DEFAULT_CONFIG
@ -234,6 +234,25 @@ class Content(object):
path = value.path path = value.path
origin = m.group('path') origin = m.group('path')
# urllib.parse.urljoin() produces `a.html` for urljoin("..", "a.html")
# so if RELATIVE_URLS are enabled, we fall back to os.path.join() to
# properly get `../a.html`. However, os.path.join() produces
# `baz/http://foo/bar.html` for join("baz", "http://foo/bar.html")
# instead of correct "http://foo/bar.html", so one has to pick a side
# as there is no silver bullet.
if self.settings['RELATIVE_URLS']:
joiner = os.path.join
else:
joiner = urljoin
# However, it's not *that* simple: urljoin("blog", "index.html")
# produces just `index.html` instead of `blog/index.html` (unlike
# os.path.join()), so in order to get a correct answer one needs to
# append a trailing slash to siteurl in that case. This also makes
# the new behavior fully compatible with Pelican 3.7.1.
if not siteurl.endswith('/'):
siteurl += '/'
# XXX Put this in a different location. # XXX Put this in a different location.
if what in {'filename', 'attach'}: if what in {'filename', 'attach'}:
if path.startswith('/'): if path.startswith('/'):
@ -260,7 +279,7 @@ class Content(object):
"%s used {attach} link syntax on a " "%s used {attach} link syntax on a "
"non-static file. Use {filename} instead.", "non-static file. Use {filename} instead.",
self.get_relative_source_path()) self.get_relative_source_path())
origin = '/'.join((siteurl, linked_content.url)) origin = joiner(siteurl, linked_content.url)
origin = origin.replace('\\', '/') # for Windows paths. origin = origin.replace('\\', '/') # for Windows paths.
else: else:
logger.warning( logger.warning(
@ -269,13 +288,13 @@ class Content(object):
'limit_msg': ("Other resources were not found " 'limit_msg': ("Other resources were not found "
"and their urls not replaced")}) "and their urls not replaced")})
elif what == 'category': elif what == 'category':
origin = '/'.join((siteurl, Category(path, self.settings).url)) origin = joiner(siteurl, Category(path, self.settings).url)
elif what == 'tag': elif what == 'tag':
origin = '/'.join((siteurl, Tag(path, self.settings).url)) origin = joiner(siteurl, Tag(path, self.settings).url)
elif what == 'index': elif what == 'index':
origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS'])) origin = joiner(siteurl, self.settings['INDEX_SAVE_AS'])
elif what == 'author': elif what == 'author':
origin = '/'.join((siteurl, Author(path, self.settings).url)) origin = joiner(siteurl, Author(path, self.settings).url)
else: else:
logger.warning( logger.warning(
"Replacement Indicator '%s' not recognized, " "Replacement Indicator '%s' not recognized, "

View file

@ -397,6 +397,54 @@ class TestPage(LoggedTestCase):
'</blockquote>' '</blockquote>'
) )
def test_intrasite_link_absolute(self):
"""Test that absolute URLs are merged properly."""
args = self.page_kwargs.copy()
args['settings'] = get_settings(
STATIC_URL='http://static.cool.site/{path}',
ARTICLE_URL='http://blog.cool.site/{slug}.html')
args['source_path'] = 'content'
args['context']['filenames'] = {
'images/poster.jpg': Static('',
settings=args['settings'],
source_path='images/poster.jpg'),
'article.rst': Article('',
settings=args['settings'],
metadata={'slug': 'article',
'title': 'Article'})
}
# Article link will go to blog
args['content'] = (
'<a href="{filename}article.rst">Article</a>'
)
content = Page(**args).get_content('http://cool.site')
self.assertEqual(
content,
'<a href="http://blog.cool.site/article.html">Article</a>'
)
# Page link will go to the main site
args['content'] = (
'<a href="{index}">Index</a>'
)
content = Page(**args).get_content('http://cool.site')
self.assertEqual(
content,
'<a href="http://cool.site/index.html">Index</a>'
)
# Image link will go to static
args['content'] = (
'<img src="{filename}/images/poster.jpg"/>'
)
content = Page(**args).get_content('http://cool.site')
self.assertEqual(
content,
'<img src="http://static.cool.site/images/poster.jpg"/>'
)
def test_intrasite_link_markdown_spaces(self): def test_intrasite_link_markdown_spaces(self):
# Markdown introduces %20 instead of spaces, this tests that # Markdown introduces %20 instead of spaces, this tests that
# we support markdown doing this. # we support markdown doing this.