Merge pull request #2196 from mosra/absolute-url-merging

Make URL part joining aware of absolute URLs
2025-10-15 20:28:56 +02:00 · 2018-02-08 08:19:26 -08:00 · 2018-02-08 08:19:26 -08:00 · e7ac0a9272
commit e7ac0a9272
parent 34103cd5dd 0b13aa9b46
2 changed files with 131 additions and 64 deletions
--- a/pelican/contents.py
+++ b/pelican/contents.py
@ -11,7 +11,7 @@ import sys
 import pytz
 import six
-from six.moves.urllib.parse import urlparse, urlunparse
+from six.moves.urllib.parse import urljoin, urlparse, urlunparse
 from pelican import signals
 from pelican.settings import DEFAULT_CONFIG
@ -228,6 +228,87 @@ class Content(object):
        key = key if self.in_default_lang else 'lang_%s' % key
        return self._expand_settings(key)
    def _link_replacer(self, siteurl, m):
        """Return the replacement text for one matched intra-site link.

        ``m`` is a regex match that provides the named groups ``markup``,
        ``quote``, ``path``, ``what`` and ``value``. ``what`` selects the
        kind of target (``filename``, ``attach``, ``category``, ``tag``,
        ``index`` or ``author``); the path component of ``value`` is
        resolved to the target's URL, which is then joined onto ``siteurl``.
        The remaining URL components of ``value`` (query, fragment, ...) are
        carried over to the rewritten URL.

        If the target cannot be found in ``self._context['filenames']`` or
        ``what`` is not recognized, a warning is logged and the original
        attribute value (the ``path`` group) is emitted unchanged.

        The return value is ``markup + quote + url + quote``.
        """
        import posixpath

        what = m.group('what')
        value = urlparse(m.group('value'))
        path = value.path
        # Stays None unless one of the branches below computes a new URL.
        origin = None

        # urllib.parse.urljoin() produces `a.html` for urljoin("..", "a.html")
        # so if RELATIVE_URLS are enabled, we fall back to a path-style join
        # to properly get `../a.html`. However, a path-style join produces
        # `baz/http://foo/bar.html` for join("baz", "http://foo/bar.html")
        # instead of correct "http://foo/bar.html", so one has to pick a side
        # as there is no silver bullet.
        if self.settings['RELATIVE_URLS']:
            # posixpath instead of os.path: the result is a URL and must use
            # forward slashes on every platform. os.path.join() would insert
            # a backslash on Windows.
            joiner = posixpath.join
        else:
            joiner = urljoin

            # However, it's not *that* simple: urljoin("blog", "index.html")
            # produces just `index.html` instead of `blog/index.html` (unlike
            # a path-style join), so in order to get a correct answer one
            # needs to append a trailing slash to siteurl in that case. This
            # also makes the new behavior fully compatible with Pelican 3.7.1.
            if not siteurl.endswith('/'):
                siteurl += '/'

        # XXX Put this in a different location.
        if what in {'filename', 'attach'}:
            if path.startswith('/'):
                path = path[1:]
            else:
                # relative to the source path of this content
                path = self.get_relative_source_path(
                    os.path.join(self.relative_dir, path)
                )

            if path not in self._context['filenames']:
                # Retry with %20 turned back into spaces.
                unquoted_path = path.replace('%20', ' ')

                if unquoted_path in self._context['filenames']:
                    path = unquoted_path

            linked_content = self._context['filenames'].get(path)
            if linked_content:
                if what == 'attach':
                    if isinstance(linked_content, Static):
                        linked_content.attach_to(self)
                    else:
                        logger.warning(
                            "%s used {attach} link syntax on a "
                            "non-static file. Use {filename} instead.",
                            self.get_relative_source_path())
                origin = joiner(siteurl, linked_content.url)
                origin = origin.replace('\\', '/')  # for Windows paths.
            else:
                logger.warning(
                    "Unable to find '%s', skipping url replacement.",
                    value.geturl(), extra={
                        'limit_msg': ("Other resources were not found "
                                      "and their urls not replaced")})
        elif what == 'category':
            origin = joiner(siteurl, Category(path, self.settings).url)
        elif what == 'tag':
            origin = joiner(siteurl, Tag(path, self.settings).url)
        elif what == 'index':
            origin = joiner(siteurl, self.settings['INDEX_SAVE_AS'])
        elif what == 'author':
            origin = joiner(siteurl, Author(path, self.settings).url)
        else:
            logger.warning(
                "Replacement Indicator '%s' not recognized, "
                "skipping replacement",
                what)

        if origin is None:
            # Nothing was replaced: emit the attribute value as written.
            # Feeding it through urlunparse() together with the components
            # of `value` would append the query/fragment a second time.
            # NOTE(review): relies on the `path` group spanning the whole
            # attribute value (link indicator plus `value`) - confirm
            # against the regex built in _update_content().
            origin = m.group('path')
        else:
            # keep all other parts, such as query, fragment, etc.
            parts = list(value)
            parts[2] = origin
            origin = urlunparse(parts)

        return ''.join((m.group('markup'), m.group('quote'), origin,
                        m.group('quote')))
    def _update_content(self, content, siteurl):
        """Update the content attribute.
@ -251,69 +332,7 @@ class Content(object):
            \2""".format(instrasite_link_regex)
        hrefs = re.compile(regex, re.X)
-        def replacer(m):
+        return hrefs.sub(lambda m: self._link_replacer(siteurl, m), content)
            what = m.group('what')
            value = urlparse(m.group('value'))
            path = value.path
            origin = m.group('path')
            # XXX Put this in a different location.
            if what in {'filename', 'attach'}:
                if path.startswith('/'):
                    path = path[1:]
                else:
                    # relative to the source path of this content
                    path = self.get_relative_source_path(
                        os.path.join(self.relative_dir, path)
                    )
                if path not in self._context['filenames']:
                    unquoted_path = path.replace('%20', ' ')
                    if unquoted_path in self._context['filenames']:
                        path = unquoted_path
                linked_content = self._context['filenames'].get(path)
                if linked_content:
                    if what == 'attach':
                        if isinstance(linked_content, Static):
                            linked_content.attach_to(self)
                        else:
                            logger.warning(
                                "%s used {attach} link syntax on a "
                                "non-static file. Use {filename} instead.",
                                self.get_relative_source_path())
                    origin = '/'.join((siteurl, linked_content.url))
                    origin = origin.replace('\\', '/')  # for Windows paths.
                else:
                    logger.warning(
                        "Unable to find '%s', skipping url replacement.",
                        value.geturl(), extra={
                            'limit_msg': ("Other resources were not found "
                                          "and their urls not replaced")})
            elif what == 'category':
                origin = '/'.join((siteurl, Category(path, self.settings).url))
            elif what == 'tag':
                origin = '/'.join((siteurl, Tag(path, self.settings).url))
            elif what == 'index':
                origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS']))
            elif what == 'author':
                origin = '/'.join((siteurl, Author(path, self.settings).url))
            else:
                logger.warning(
                    "Replacement Indicator '%s' not recognized, "
                    "skipping replacement",
                    what)
            # keep all other parts, such as query, fragment, etc.
            parts = list(value)
            parts[2] = origin
            origin = urlunparse(parts)
            return ''.join((m.group('markup'), m.group('quote'), origin,
                            m.group('quote')))
        return hrefs.sub(replacer, content)
    def get_siteurl(self):
        """Return the ``localsiteurl`` context entry, or ``''`` if unset."""
        context = self._context
        return context.get('localsiteurl', '')
--- a/pelican/tests/test_contents.py
+++ b/pelican/tests/test_contents.py
@ -397,6 +397,54 @@ class TestPage(LoggedTestCase):
            '</blockquote>'
        )
    def test_intrasite_link_absolute(self):
        """Check that absolute target URLs are not glued onto the site URL."""
        args = self.page_kwargs.copy()
        settings = get_settings(
            STATIC_URL='http://static.cool.site/{path}',
            ARTICLE_URL='http://blog.cool.site/{slug}.html')
        args['settings'] = settings
        args['source_path'] = 'content'

        poster = Static('', settings=settings,
                        source_path='images/poster.jpg')
        article = Article('', settings=settings,
                          metadata={'slug': 'article', 'title': 'Article'})
        args['context']['filenames'] = {
            'images/poster.jpg': poster,
            'article.rst': article,
        }

        # (source markup, expected rendered markup)
        cases = (
            # an article link ends up on the blog host
            ('<a href="{filename}article.rst">Article</a>',
             '<a href="http://blog.cool.site/article.html">Article</a>'),
            # the index link stays on the main site
            ('<a href="{index}">Index</a>',
             '<a href="http://cool.site/index.html">Index</a>'),
            # an image link ends up on the static host
            ('<img src="{filename}/images/poster.jpg"/>',
             '<img src="http://static.cool.site/images/poster.jpg"/>'),
        )

        for markup, expected in cases:
            args['content'] = markup
            rendered = Page(**args).get_content('http://cool.site')
            self.assertEqual(rendered, expected)
    def test_intrasite_link_markdown_spaces(self):
        # Markdown introduces %20 instead of spaces, this tests that
        # we support markdown doing this.