Merge pull request #2196 from mosra/absolute-url-merging

Make URL part joining aware of absolute URLs
This commit is contained in:
Justin Mayer 2018-02-08 08:19:26 -08:00 committed by GitHub
commit e7ac0a9272
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 131 additions and 64 deletions

View file

@ -11,7 +11,7 @@ import sys
import pytz import pytz
import six import six
from six.moves.urllib.parse import urlparse, urlunparse from six.moves.urllib.parse import urljoin, urlparse, urlunparse
from pelican import signals from pelican import signals
from pelican.settings import DEFAULT_CONFIG from pelican.settings import DEFAULT_CONFIG
@ -228,35 +228,31 @@ class Content(object):
key = key if self.in_default_lang else 'lang_%s' % key key = key if self.in_default_lang else 'lang_%s' % key
return self._expand_settings(key) return self._expand_settings(key)
def _update_content(self, content, siteurl): def _link_replacer(self, siteurl, m):
"""Update the content attribute.
Change all the relative paths of the content to relative paths
suitable for the output content.
:param content: content resource that will be passed to the templates.
:param siteurl: siteurl which is locally generated by the writer in
case of RELATIVE_URLS.
"""
if not content:
return content
instrasite_link_regex = self.settings['INTRASITE_LINK_REGEX']
regex = r"""
(?P<markup><[^\>]+ # match tag with all url-value attributes
(?:href|src|poster|data|cite|formaction|action)\s*=\s*)
(?P<quote>["\']) # require value to be quoted
(?P<path>{0}(?P<value>.*?)) # the url value
\2""".format(instrasite_link_regex)
hrefs = re.compile(regex, re.X)
def replacer(m):
what = m.group('what') what = m.group('what')
value = urlparse(m.group('value')) value = urlparse(m.group('value'))
path = value.path path = value.path
origin = m.group('path') origin = m.group('path')
# urllib.parse.urljoin() produces `a.html` for urljoin("..", "a.html")
# so if RELATIVE_URLS are enabled, we fall back to os.path.join() to
# properly get `../a.html`. However, os.path.join() produces
# `baz/http://foo/bar.html` for join("baz", "http://foo/bar.html")
# instead of correct "http://foo/bar.html", so one has to pick a side
# as there is no silver bullet.
if self.settings['RELATIVE_URLS']:
joiner = os.path.join
else:
joiner = urljoin
# However, it's not *that* simple: urljoin("blog", "index.html")
# produces just `index.html` instead of `blog/index.html` (unlike
# os.path.join()), so in order to get a correct answer one needs to
# append a trailing slash to siteurl in that case. This also makes
# the new behavior fully compatible with Pelican 3.7.1.
if not siteurl.endswith('/'):
siteurl += '/'
# XXX Put this in a different location. # XXX Put this in a different location.
if what in {'filename', 'attach'}: if what in {'filename', 'attach'}:
if path.startswith('/'): if path.startswith('/'):
@ -283,7 +279,7 @@ class Content(object):
"%s used {attach} link syntax on a " "%s used {attach} link syntax on a "
"non-static file. Use {filename} instead.", "non-static file. Use {filename} instead.",
self.get_relative_source_path()) self.get_relative_source_path())
origin = '/'.join((siteurl, linked_content.url)) origin = joiner(siteurl, linked_content.url)
origin = origin.replace('\\', '/') # for Windows paths. origin = origin.replace('\\', '/') # for Windows paths.
else: else:
logger.warning( logger.warning(
@ -292,13 +288,13 @@ class Content(object):
'limit_msg': ("Other resources were not found " 'limit_msg': ("Other resources were not found "
"and their urls not replaced")}) "and their urls not replaced")})
elif what == 'category': elif what == 'category':
origin = '/'.join((siteurl, Category(path, self.settings).url)) origin = joiner(siteurl, Category(path, self.settings).url)
elif what == 'tag': elif what == 'tag':
origin = '/'.join((siteurl, Tag(path, self.settings).url)) origin = joiner(siteurl, Tag(path, self.settings).url)
elif what == 'index': elif what == 'index':
origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS'])) origin = joiner(siteurl, self.settings['INDEX_SAVE_AS'])
elif what == 'author': elif what == 'author':
origin = '/'.join((siteurl, Author(path, self.settings).url)) origin = joiner(siteurl, Author(path, self.settings).url)
else: else:
logger.warning( logger.warning(
"Replacement Indicator '%s' not recognized, " "Replacement Indicator '%s' not recognized, "
@ -313,7 +309,30 @@ class Content(object):
return ''.join((m.group('markup'), m.group('quote'), origin, return ''.join((m.group('markup'), m.group('quote'), origin,
m.group('quote'))) m.group('quote')))
return hrefs.sub(replacer, content) def _update_content(self, content, siteurl):
"""Update the content attribute.
Change all the relative paths of the content to relative paths
suitable for the output content.
:param content: content resource that will be passed to the templates.
:param siteurl: siteurl which is locally generated by the writer in
case of RELATIVE_URLS.
"""
if not content:
return content
instrasite_link_regex = self.settings['INTRASITE_LINK_REGEX']
regex = r"""
(?P<markup><[^\>]+ # match tag with all url-value attributes
(?:href|src|poster|data|cite|formaction|action)\s*=\s*)
(?P<quote>["\']) # require value to be quoted
(?P<path>{0}(?P<value>.*?)) # the url value
\2""".format(instrasite_link_regex)
hrefs = re.compile(regex, re.X)
return hrefs.sub(lambda m: self._link_replacer(siteurl, m), content)
def get_siteurl(self): def get_siteurl(self):
return self._context.get('localsiteurl', '') return self._context.get('localsiteurl', '')

View file

@ -397,6 +397,54 @@ class TestPage(LoggedTestCase):
'</blockquote>' '</blockquote>'
) )
def test_intrasite_link_absolute(self):
    """Check that intrasite links resolving to absolute URLs stay intact.

    STATIC_URL and ARTICLE_URL are configured with their own hosts, so
    links to the static file and to the article are expected to point at
    those hosts, while an ``{index}`` link is expected to be joined onto
    the site URL handed to ``get_content()``.
    """
    args = self.page_kwargs.copy()
    settings = get_settings(
        STATIC_URL='http://static.cool.site/{path}',
        ARTICLE_URL='http://blog.cool.site/{slug}.html')
    args['settings'] = settings
    args['source_path'] = 'content'

    # Linkable content known to the page, keyed by source path.
    poster = Static('', settings=settings, source_path='images/poster.jpg')
    article = Article('', settings=settings,
                      metadata={'slug': 'article', 'title': 'Article'})
    args['context']['filenames'] = {
        'images/poster.jpg': poster,
        'article.rst': article,
    }

    # (input markup, expected rendered markup) pairs, checked in order.
    cases = (
        # Article link will go to blog
        ('<a href="{filename}article.rst">Article</a>',
         '<a href="http://blog.cool.site/article.html">Article</a>'),
        # Page link will go to the main site
        ('<a href="{index}">Index</a>',
         '<a href="http://cool.site/index.html">Index</a>'),
        # Image link will go to static
        ('<img src="{filename}/images/poster.jpg"/>',
         '<img src="http://static.cool.site/images/poster.jpg"/>'),
    )
    for markup, expected in cases:
        args['content'] = markup
        rendered = Page(**args).get_content('http://cool.site')
        self.assertEqual(rendered, expected)
def test_intrasite_link_markdown_spaces(self): def test_intrasite_link_markdown_spaces(self):
# Markdown introduces %20 instead of spaces, this tests that # Markdown introduces %20 instead of spaces, this tests that
# we support markdown doing this. # we support markdown doing this.