Merge pull request #2196 from mosra/absolute-url-merging

Make URL part joining aware of absolute URLs
This commit is contained in:
Justin Mayer 2018-02-08 08:19:26 -08:00 committed by GitHub
commit e7ac0a9272
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 131 additions and 64 deletions

View file

@ -11,7 +11,7 @@ import sys
import pytz import pytz
import six import six
from six.moves.urllib.parse import urlparse, urlunparse from six.moves.urllib.parse import urljoin, urlparse, urlunparse
from pelican import signals from pelican import signals
from pelican.settings import DEFAULT_CONFIG from pelican.settings import DEFAULT_CONFIG
@ -228,6 +228,87 @@ class Content(object):
key = key if self.in_default_lang else 'lang_%s' % key key = key if self.in_default_lang else 'lang_%s' % key
return self._expand_settings(key) return self._expand_settings(key)
def _link_replacer(self, siteurl, m):
    """Build the replacement text for one matched intrasite link.

    :param siteurl: base URL (or relative prefix) that generated URLs are
        joined onto.
    :param m: regex match object exposing the named groups ``markup``,
        ``quote``, ``path``, ``what`` and ``value``.
    :return: ``markup + quote + url + quote``. ``url`` is the resolved
        link, or the originally matched ``path`` group when the link
        cannot be resolved (a warning is logged in that case). Params,
        query and fragment of the ``value`` group are carried over.
    """
    what = m.group('what')
    value = urlparse(m.group('value'))
    path = value.path
    origin = m.group('path')

    # urllib.parse.urljoin() produces `a.html` for urljoin("..", "a.html")
    # so if RELATIVE_URLS are enabled, we fall back to os.path.join() to
    # properly get `../a.html`.
    if self.settings['RELATIVE_URLS']:
        def joiner(base, url):
            # os.path.join() produces `baz/http://foo/bar.html` for
            # join("baz", "http://foo/bar.html"), so a target that
            # already names a host is returned untouched instead of
            # being glued onto the relative prefix.
            if urlparse(url).netloc:
                return url
            return os.path.join(base, url)
    else:
        joiner = urljoin

        # However, it's not *that* simple: urljoin("blog", "index.html")
        # produces just `index.html` instead of `blog/index.html` (unlike
        # os.path.join()), so in order to get a correct answer one needs to
        # append a trailing slash to siteurl in that case. This also makes
        # the new behavior fully compatible with Pelican 3.7.1.
        if not siteurl.endswith('/'):
            siteurl += '/'

    # XXX Put this in a different location.
    if what in {'filename', 'attach'}:
        if path.startswith('/'):
            path = path[1:]
        else:
            # relative to the source path of this content
            path = self.get_relative_source_path(
                os.path.join(self.relative_dir, path)
            )

        # The link may carry `%20` where the known filename has a space.
        if path not in self._context['filenames']:
            unquoted_path = path.replace('%20', ' ')

            if unquoted_path in self._context['filenames']:
                path = unquoted_path

        linked_content = self._context['filenames'].get(path)
        if linked_content:
            if what == 'attach':
                if isinstance(linked_content, Static):
                    linked_content.attach_to(self)
                else:
                    logger.warning(
                        "%s used {attach} link syntax on a "
                        "non-static file. Use {filename} instead.",
                        self.get_relative_source_path())
            origin = joiner(siteurl, linked_content.url)
            origin = origin.replace('\\', '/')  # for Windows paths.
        else:
            logger.warning(
                "Unable to find '%s', skipping url replacement.",
                value.geturl(), extra={
                    'limit_msg': ("Other resources were not found "
                                  "and their urls not replaced")})
    elif what == 'category':
        origin = joiner(siteurl, Category(path, self.settings).url)
    elif what == 'tag':
        origin = joiner(siteurl, Tag(path, self.settings).url)
    elif what == 'index':
        origin = joiner(siteurl, self.settings['INDEX_SAVE_AS'])
    elif what == 'author':
        origin = joiner(siteurl, Author(path, self.settings).url)
    else:
        logger.warning(
            "Replacement Indicator '%s' not recognized, "
            "skipping replacement",
            what)

    # keep all other parts, such as query, fragment, etc.
    parts = list(value)
    parts[2] = origin
    origin = urlunparse(parts)

    return ''.join((m.group('markup'), m.group('quote'), origin,
                    m.group('quote')))
def _update_content(self, content, siteurl): def _update_content(self, content, siteurl):
"""Update the content attribute. """Update the content attribute.
@ -251,69 +332,7 @@ class Content(object):
\2""".format(instrasite_link_regex) \2""".format(instrasite_link_regex)
hrefs = re.compile(regex, re.X) hrefs = re.compile(regex, re.X)
def replacer(m): return hrefs.sub(lambda m: self._link_replacer(siteurl, m), content)
what = m.group('what')
value = urlparse(m.group('value'))
path = value.path
origin = m.group('path')
# XXX Put this in a different location.
if what in {'filename', 'attach'}:
if path.startswith('/'):
path = path[1:]
else:
# relative to the source path of this content
path = self.get_relative_source_path(
os.path.join(self.relative_dir, path)
)
if path not in self._context['filenames']:
unquoted_path = path.replace('%20', ' ')
if unquoted_path in self._context['filenames']:
path = unquoted_path
linked_content = self._context['filenames'].get(path)
if linked_content:
if what == 'attach':
if isinstance(linked_content, Static):
linked_content.attach_to(self)
else:
logger.warning(
"%s used {attach} link syntax on a "
"non-static file. Use {filename} instead.",
self.get_relative_source_path())
origin = '/'.join((siteurl, linked_content.url))
origin = origin.replace('\\', '/') # for Windows paths.
else:
logger.warning(
"Unable to find '%s', skipping url replacement.",
value.geturl(), extra={
'limit_msg': ("Other resources were not found "
"and their urls not replaced")})
elif what == 'category':
origin = '/'.join((siteurl, Category(path, self.settings).url))
elif what == 'tag':
origin = '/'.join((siteurl, Tag(path, self.settings).url))
elif what == 'index':
origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS']))
elif what == 'author':
origin = '/'.join((siteurl, Author(path, self.settings).url))
else:
logger.warning(
"Replacement Indicator '%s' not recognized, "
"skipping replacement",
what)
# keep all other parts, such as query, fragment, etc.
parts = list(value)
parts[2] = origin
origin = urlunparse(parts)
return ''.join((m.group('markup'), m.group('quote'), origin,
m.group('quote')))
return hrefs.sub(replacer, content)
def get_siteurl(self): def get_siteurl(self):
return self._context.get('localsiteurl', '') return self._context.get('localsiteurl', '')

View file

@ -397,6 +397,54 @@ class TestPage(LoggedTestCase):
'</blockquote>' '</blockquote>'
) )
def test_intrasite_link_absolute(self):
    """Intrasite links must respect absolute ``*_URL`` settings.

    ``STATIC_URL`` and ``ARTICLE_URL`` point at hosts other than the
    site URL handed to ``get_content()``; each rendered link has to end
    up on the host its target is configured for.
    """
    args = self.page_kwargs.copy()
    settings = get_settings(
        STATIC_URL='http://static.cool.site/{path}',
        ARTICLE_URL='http://blog.cool.site/{slug}.html')
    args['settings'] = settings
    args['source_path'] = 'content'
    args['context']['filenames'] = {
        'images/poster.jpg': Static(
            '', settings=settings, source_path='images/poster.jpg'),
        'article.rst': Article(
            '', settings=settings,
            metadata={'slug': 'article', 'title': 'Article'}),
    }

    # (source markup, expected rendered markup)
    cases = (
        # Article link goes to the blog host.
        ('<a href="{filename}article.rst">Article</a>',
         '<a href="http://blog.cool.site/article.html">Article</a>'),
        # Index link stays on the main site.
        ('<a href="{index}">Index</a>',
         '<a href="http://cool.site/index.html">Index</a>'),
        # Image link goes to the static host.
        ('<img src="{filename}/images/poster.jpg"/>',
         '<img src="http://static.cool.site/images/poster.jpg"/>'),
    )

    for markup, expected in cases:
        args['content'] = markup
        rendered = Page(**args).get_content('http://cool.site')
        self.assertEqual(rendered, expected)
def test_intrasite_link_markdown_spaces(self): def test_intrasite_link_markdown_spaces(self):
# Markdown introduces %20 instead of spaces, this tests that # Markdown introduces %20 instead of spaces, this tests that
# we support markdown doing this. # we support markdown doing this.