From 1f30306e2329e5a1f0c5dd39844d9bb0a0c04573 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vladim=C3=ADr=20Vondru=C5=A1?= Date: Tue, 19 Sep 2017 18:22:56 +0200 Subject: [PATCH 1/2] Make the internal link replacer function public. So it can be used from outside. --- pelican/contents.py | 126 ++++++++++++++++++++++---------------------- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/pelican/contents.py b/pelican/contents.py index 15770fc8..a534dbaa 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -228,6 +228,68 @@ class Content(object): key = key if self.in_default_lang else 'lang_%s' % key return self._expand_settings(key) + def _link_replacer(self, siteurl, m): + what = m.group('what') + value = urlparse(m.group('value')) + path = value.path + origin = m.group('path') + + # XXX Put this in a different location. + if what in {'filename', 'attach'}: + if path.startswith('/'): + path = path[1:] + else: + # relative to the source path of this content + path = self.get_relative_source_path( + os.path.join(self.relative_dir, path) + ) + + if path not in self._context['filenames']: + unquoted_path = path.replace('%20', ' ') + + if unquoted_path in self._context['filenames']: + path = unquoted_path + + linked_content = self._context['filenames'].get(path) + if linked_content: + if what == 'attach': + if isinstance(linked_content, Static): + linked_content.attach_to(self) + else: + logger.warning( + "%s used {attach} link syntax on a " + "non-static file. Use {filename} instead.", + self.get_relative_source_path()) + origin = '/'.join((siteurl, linked_content.url)) + origin = origin.replace('\\', '/') # for Windows paths. + else: + logger.warning( + "Unable to find '%s', skipping url replacement.", + value.geturl(), extra={ + 'limit_msg': ("Other resources were not found " + "and their urls not replaced")}) + elif what == 'category': + origin = '/'.join((siteurl, Category(path, self.settings).url)) + elif what == 'tag': + origin = '/'.join((siteurl, Tag(path, self.settings).url)) + elif what == 'index': + origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS'])) + elif what == 'author': + origin = '/'.join((siteurl, Author(path, self.settings).url)) + else: + logger.warning( + "Replacement Indicator '%s' not recognized, " + "skipping replacement", + what) + + # keep all other parts, such as query, fragment, etc. + parts = list(value) + parts[2] = origin + origin = urlunparse(parts) + + return ''.join((m.group('markup'), m.group('quote'), origin, + m.group('quote'))) + def _update_content(self, content, siteurl): """Update the content attribute. @@ -251,69 +313,7 @@ class Content(object): \2""".format(instrasite_link_regex) hrefs = re.compile(regex, re.X) - def replacer(m): - what = m.group('what') - value = urlparse(m.group('value')) - path = value.path - origin = m.group('path') - - # XXX Put this in a different location. - if what in {'filename', 'attach'}: - if path.startswith('/'): - path = path[1:] - else: - # relative to the source path of this content - path = self.get_relative_source_path( - os.path.join(self.relative_dir, path) - ) - - if path not in self._context['filenames']: - unquoted_path = path.replace('%20', ' ') - - if unquoted_path in self._context['filenames']: - path = unquoted_path - - linked_content = self._context['filenames'].get(path) - if linked_content: - if what == 'attach': - if isinstance(linked_content, Static): - linked_content.attach_to(self) - else: - logger.warning( - "%s used {attach} link syntax on a " - "non-static file. Use {filename} instead.", - self.get_relative_source_path()) - origin = '/'.join((siteurl, linked_content.url)) - origin = origin.replace('\\', '/') # for Windows paths. - else: - logger.warning( - "Unable to find '%s', skipping url replacement.", - value.geturl(), extra={ - 'limit_msg': ("Other resources were not found " - "and their urls not replaced")}) - elif what == 'category': - origin = '/'.join((siteurl, Category(path, self.settings).url)) - elif what == 'tag': - origin = '/'.join((siteurl, Tag(path, self.settings).url)) - elif what == 'index': - origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS'])) - elif what == 'author': - origin = '/'.join((siteurl, Author(path, self.settings).url)) - else: - logger.warning( - "Replacement Indicator '%s' not recognized, " - "skipping replacement", - what) - - # keep all other parts, such as query, fragment, etc. - parts = list(value) - parts[2] = origin - origin = urlunparse(parts) - - return ''.join((m.group('markup'), m.group('quote'), origin, - m.group('quote'))) - - return hrefs.sub(replacer, content) + return hrefs.sub(lambda m: self._link_replacer(siteurl, m), content) def get_siteurl(self): return self._context.get('localsiteurl', '') From 0b13aa9b4610a8d26ece75cfcebebaa5fb3fa3de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vladim=C3=ADr=20Vondru=C5=A1?= Date: Thu, 20 Jul 2017 23:21:17 +0200 Subject: [PATCH 2/2] Make URL part joining aware of absolute URLs. Previously, with RELATIVE_URLS disabled, when both SITEURL and STATIC_URL were absolute, the final generate data URLs looked wrong like this (two absolute URLs joined by `/`): http://your.site/http://static.your.site/image.png With this patch, the data URLs are correctly: http://static.your.site/image.png This also applies to all *_URL configuration options (for example, ability to have pages and articles on different domains) and behaves like one expects even with URLs starting with just `//`, thanks to making use of urllib.parse.urljoin(). However, when RELATIVE_URLS are enabled, urllib.parse.urljoin() doesn't handle the relative base correctly. In that case, simple os.path.join() is used. That, however, breaks the above case, but as RELATIVE_URLS are meant for local development (thus no data scattered across multiple domains), I don't see any problem. Just to clarify, this is a fully backwards-compatible change, it only enables new use cases that were impossible before. --- pelican/contents.py | 31 +++++++++++++++++----- pelican/tests/test_contents.py | 48 ++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 6 deletions(-) diff --git a/pelican/contents.py b/pelican/contents.py index a534dbaa..e84c8296 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -11,7 +11,7 @@ import sys import pytz import six -from six.moves.urllib.parse import urlparse, urlunparse +from six.moves.urllib.parse import urljoin, urlparse, urlunparse from pelican import signals from pelican.settings import DEFAULT_CONFIG @@ -234,6 +234,25 @@ class Content(object): path = value.path origin = m.group('path') + # urllib.parse.urljoin() produces `a.html` for urljoin("..", "a.html") + # so if RELATIVE_URLS are enabled, we fall back to os.path.join() to + # properly get `../a.html`. However, os.path.join() produces + # `baz/http://foo/bar.html` for join("baz", "http://foo/bar.html") + # instead of correct "http://foo/bar.html", so one has to pick a side + # as there is no silver bullet. + if self.settings['RELATIVE_URLS']: + joiner = os.path.join + else: + joiner = urljoin + + # However, it's not *that* simple: urljoin("blog", "index.html") + # produces just `index.html` instead of `blog/index.html` (unlike + # os.path.join()), so in order to get a correct answer one needs to + # append a trailing slash to siteurl in that case. This also makes + # the new behavior fully compatible with Pelican 3.7.1. + if not siteurl.endswith('/'): + siteurl += '/' + # XXX Put this in a different location. if what in {'filename', 'attach'}: if path.startswith('/'): @@ -260,7 +279,7 @@ class Content(object): "%s used {attach} link syntax on a " "non-static file. Use {filename} instead.", self.get_relative_source_path()) - origin = '/'.join((siteurl, linked_content.url)) + origin = joiner(siteurl, linked_content.url) origin = origin.replace('\\', '/') # for Windows paths. else: logger.warning( @@ -269,13 +288,13 @@ class Content(object): 'limit_msg': ("Other resources were not found " "and their urls not replaced")}) elif what == 'category': - origin = '/'.join((siteurl, Category(path, self.settings).url)) + origin = joiner(siteurl, Category(path, self.settings).url) elif what == 'tag': - origin = '/'.join((siteurl, Tag(path, self.settings).url)) + origin = joiner(siteurl, Tag(path, self.settings).url) elif what == 'index': - origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS'])) + origin = joiner(siteurl, self.settings['INDEX_SAVE_AS']) elif what == 'author': - origin = '/'.join((siteurl, Author(path, self.settings).url)) + origin = joiner(siteurl, Author(path, self.settings).url) else: logger.warning( "Replacement Indicator '%s' not recognized, " diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py index d028c7a1..04c82f61 100644 --- a/pelican/tests/test_contents.py +++ b/pelican/tests/test_contents.py @@ -397,6 +397,54 @@ class TestPage(LoggedTestCase): '' ) + def test_intrasite_link_absolute(self): + """Test that absolute URLs are merged properly.""" + + args = self.page_kwargs.copy() + args['settings'] = get_settings( + STATIC_URL='http://static.cool.site/{path}', + ARTICLE_URL='http://blog.cool.site/{slug}.html') + args['source_path'] = 'content' + args['context']['filenames'] = { + 'images/poster.jpg': Static('', + settings=args['settings'], + source_path='images/poster.jpg'), + 'article.rst': Article('', + settings=args['settings'], + metadata={'slug': 'article', + 'title': 'Article'}) + } + + # Article link will go to blog + args['content'] = ( + 'Article' + ) + content = Page(**args).get_content('http://cool.site') + self.assertEqual( + content, + 'Article' + ) + + # Page link will go to the main site + args['content'] = ( + 'Index' + ) + content = Page(**args).get_content('http://cool.site') + self.assertEqual( + content, + 'Index' + ) + + # Image link will go to static + args['content'] = ( + '' + ) + content = Page(**args).get_content('http://cool.site') + self.assertEqual( + content, + '' + ) + def test_intrasite_link_markdown_spaces(self): # Markdown introduces %20 instead of spaces, this tests that # we support markdown doing this.