From 04dba17b800b7c67e43706a8a164ebf697bba10f Mon Sep 17 00:00:00 2001 From: zhouji Date: Wed, 16 Oct 2013 17:06:56 +0800 Subject: [PATCH 1/2] Fix #1117 Make intra-link support all url-value HTML attributes. --- pelican/contents.py | 58 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/pelican/contents.py b/pelican/contents.py index dbc33716..39322e99 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -125,6 +125,52 @@ class Content(object): if 'summary' in metadata: self._summary = metadata['summary'] + # prepare the list of HTML tag attributes which have a URL value. + # refer: http://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value + self._url_attributes = { # each item in this set is a tuple composed by tag_name, attr_name + # HTML4 tags + ('a', 'href'), + ('applet', 'codebase'), + ('area', 'href'), + ('base', 'href'), + ('blockquote', 'cite'), + ('body', 'background'), + ('del', 'cite'), + ('form', 'action'), + ('frame', 'longdesc'), + ('frame', 'src'), + ('head', 'profile'), + ('iframe', 'longdesc'), + ('iframe', 'src'), + ('img', 'longdesc'), + ('img', 'src'), + ('img', 'usemap'), + ('input', 'src'), + ('input', 'usemap'), + ('ins', 'cite'), + ('link', 'href'), + ('object', 'classid'), + ('object', 'codebase'), + ('object', 'data'), + ('object', 'usemap'), + ('q', 'cite'), + ('script', 'src'), + + # HTML5 tags + ('audio', 'src'), + ('button', 'formaction'), + ('command', 'icon'), + ('embed', 'src'), + ('html', 'manifest'), + ('input', 'formaction'), + ('source', 'src'), + ('video', 'poster'), + ('video', 'src'), + } + """:type: set of (tuple of (string, string)""" + attribute_names = set(pair[1] for pair in self._url_attributes) + self._url_attr_pattern = '|'.join(attribute_names) + signals.content_object_init.send(self) def __str__(self): @@ -189,12 +235,12 @@ class Content(object): instrasite_link_regex = self.settings['INTRASITE_LINK_REGEX'] 
regex = r""" - (?P<markup><\s*[^\>]* # match tag with src and href attr - (?:href|src)\s*=) + (?P<markup><\s*(?P<tag>[^\s\>]+)[^\>]* # match tag with all url-value attributes + (?P<attr>{1})\s*=) (?P<quote>["\']) # require value to be quoted (?P<path>{0}(?P<value>.*?)) # the url value - \2""".format(instrasite_link_regex) + \4""".format(instrasite_link_regex, self._url_attr_pattern) hrefs = re.compile(regex, re.X) def replacer(m): @@ -203,6 +249,12 @@ class Content(object): path = value.path origin = m.group('path') + # verify HTML tag and attribute pair to avoid miss-replacing + tag = m.group('tag') + attr = m.group('attr') + if attr != 'href' and attr != 'src' and (tag, attr) not in self._url_attributes: + return m.group(0) + # XXX Put this in a different location. if what == 'filename': if path.startswith('/'): From e538aa2cdeb4eed2df40bcf0a414c0930ab05e25 Mon Sep 17 00:00:00 2001 From: zhouji Date: Thu, 17 Oct 2013 11:33:34 +0800 Subject: [PATCH 2/2] Fine-tune url-value HTML attributes list. --- pelican/contents.py | 58 ++-------------------------------- pelican/tests/test_contents.py | 55 ++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 55 deletions(-) diff --git a/pelican/contents.py b/pelican/contents.py index 39322e99..059c54a7 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -125,52 +125,6 @@ class Content(object): if 'summary' in metadata: self._summary = metadata['summary'] - # prepare the list of HTML tag attributes which have a URL value. 
- # refer: http://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value - self._url_attributes = { # each item in this set is a tuple composed by tag_name, attr_name - # HTML4 tags - ('a', 'href'), - ('applet', 'codebase'), - ('area', 'href'), - ('base', 'href'), - ('blockquote', 'cite'), - ('body', 'background'), - ('del', 'cite'), - ('form', 'action'), - ('frame', 'longdesc'), - ('frame', 'src'), - ('head', 'profile'), - ('iframe', 'longdesc'), - ('iframe', 'src'), - ('img', 'longdesc'), - ('img', 'src'), - ('img', 'usemap'), - ('input', 'src'), - ('input', 'usemap'), - ('ins', 'cite'), - ('link', 'href'), - ('object', 'classid'), - ('object', 'codebase'), - ('object', 'data'), - ('object', 'usemap'), - ('q', 'cite'), - ('script', 'src'), - - # HTML5 tags - ('audio', 'src'), - ('button', 'formaction'), - ('command', 'icon'), - ('embed', 'src'), - ('html', 'manifest'), - ('input', 'formaction'), - ('source', 'src'), - ('video', 'poster'), - ('video', 'src'), - } - """:type: set of (tuple of (string, string)""" - attribute_names = set(pair[1] for pair in self._url_attributes) - self._url_attr_pattern = '|'.join(attribute_names) - signals.content_object_init.send(self) def __str__(self): @@ -235,12 +189,12 @@ class Content(object): instrasite_link_regex = self.settings['INTRASITE_LINK_REGEX'] regex = r""" - (?P<markup><\s*(?P<tag>[^\s\>]+)[^\>]* # match tag with all url-value attributes - (?P<attr>{1})\s*=) + (?P<markup><\s*[^\>]* # match tag with all url-value attributes + (?:href|src|poster|data|cite|formaction|action)\s*=) (?P<quote>["\']) # require value to be quoted (?P<path>{0}(?P<value>.*?)) # the url value - \4""".format(instrasite_link_regex, self._url_attr_pattern) + \2""".format(instrasite_link_regex) hrefs = re.compile(regex, re.X) def replacer(m): @@ -249,12 +203,6 @@ class Content(object): path = value.path origin = m.group('path') - # verify HTML tag and attribute pair to avoid miss-replacing - tag = m.group('tag') - attr = m.group('attr') - if attr != 
'href' and attr != 'src' and (tag, attr) not in self._url_attributes: - return m.group(0) - # XXX Put this in a different location. if what == 'filename': if path.startswith('/'): diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py index 9c894ffc..92e61355 100644 --- a/pelican/tests/test_contents.py +++ b/pelican/tests/test_contents.py @@ -268,6 +268,61 @@ class TestPage(unittest.TestCase): '?utm_whatever=234&highlight=word#section-2">link' ) + def test_intrasite_link_more(self): + # type does not take unicode in PY2 and bytes in PY3, which in + # combination with unicode literals leads to following insane line: + cls_name = '_DummyAsset' if six.PY3 else b'_DummyAsset' + + args = self.page_kwargs.copy() + args['settings'] = get_settings() + args['source_path'] = 'content' + args['context']['filenames'] = { + 'images/poster.jpg': type(cls_name, (object,), {'url': 'images/poster.jpg'}), + 'assets/video.mp4': type(cls_name, (object,), {'url': 'assets/video.mp4'}), + 'images/graph.svg': type(cls_name, (object,), {'url': 'images/graph.svg'}), + 'reference.rst': type(cls_name, (object,), {'url': 'reference.html'}), + } + + # video.poster + args['content'] = ( + 'There is a video with poster ' + '' + ) + content = Page(**args).get_content('http://notmyidea.org') + self.assertEqual( + content, + 'There is a video with poster ' + '' + ) + + # object.data + args['content'] = ( + 'There is a svg object ' + '' + ) + content = Page(**args).get_content('http://notmyidea.org') + self.assertEqual( + content, + 'There is a svg object ' + '' + ) + + # blockquote.cite + args['content'] = ( + 'There is a blockquote with cite attribute ' + '
blah blah
' + ) + content = Page(**args).get_content('http://notmyidea.org') + self.assertEqual( + content, + 'There is a blockquote with cite attribute ' + '
blah blah
' + ) + class TestArticle(TestPage): def test_template(self):