From b70772717500394a44f494e8ee612bbba2297054 Mon Sep 17 00:00:00 2001 From: zhouji Date: Mon, 14 Oct 2013 17:18:57 +0800 Subject: [PATCH 1/3] a simple solution for #1117 --- pelican/contents.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pelican/contents.py b/pelican/contents.py index dbc33716..62406904 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -190,7 +190,7 @@ class Content(object): instrasite_link_regex = self.settings['INTRASITE_LINK_REGEX'] regex = r""" (?P<markup><\s*[^\>]* # match tag with src and href attr - (?:href|src)\s*=) + (?:href|src|poster)\s*=) (?P<quote>["\']) # require value to be quoted (?P<path>{0}(?P<value>.*?)) # the url value From d85ef66698f441b82b7d7f21c88f7ef2a396fee1 Mon Sep 17 00:00:00 2001 From: zhouji Date: Wed, 16 Oct 2013 16:43:34 +0800 Subject: [PATCH 2/3] Fix #1117 and add all possible url-value HTML attributes. --- pelican/contents.py | 60 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 4 deletions(-) mode change 100644 => 100755 pelican/contents.py diff --git a/pelican/contents.py b/pelican/contents.py old mode 100644 new mode 100755 index 62406904..f89b08ff --- a/pelican/contents.py +++ b/pelican/contents.py @@ -125,6 +125,52 @@ class Content(object): if 'summary' in metadata: self._summary = metadata['summary'] + # prepare the list of HTML tag attributes which have a URL value. 
+ # see: http://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value + self._url_attributes = { # each item in this set is a tuple composed of (tag_name, attr_name) + # HTML4 tags + ('a', 'href'), + ('applet', 'codebase'), + ('area', 'href'), + ('base', 'href'), + ('blockquote', 'cite'), + ('body', 'background'), + ('del', 'cite'), + ('form', 'action'), + ('frame', 'longdesc'), + ('frame', 'src'), + ('head', 'profile'), + ('iframe', 'longdesc'), + ('iframe', 'src'), + ('img', 'longdesc'), + ('img', 'src'), + ('img', 'usemap'), + ('input', 'src'), + ('input', 'usemap'), + ('ins', 'cite'), + ('link', 'href'), + ('object', 'classid'), + ('object', 'codebase'), + ('object', 'data'), + ('object', 'usemap'), + ('q', 'cite'), + ('script', 'src'), + + # HTML5 tags + ('audio', 'src'), + ('button', 'formaction'), + ('command', 'icon'), + ('embed', 'src'), + ('html', 'manifest'), + ('input', 'formaction'), + ('source', 'src'), + ('video', 'poster'), + ('video', 'src'), + } + """:type: set of (tuple of (string, string))""" + attribute_names = set(pair[1] for pair in self._url_attributes) + self._url_attr_pattern = '|'.join(attribute_names) + signals.content_object_init.send(self) def __str__(self): @@ -189,20 +235,26 @@ class Content(object): instrasite_link_regex = self.settings['INTRASITE_LINK_REGEX'] regex = r""" - (?P<markup><\s*[^\>]* # match tag with src and href attr - (?:href|src|poster)\s*=) - + (?P<markup><\s*(?P<tag>[^\s\>]+)[^\>]* # match tag with all url-value attributes + (?P<attr>{1})\s*=) (?P<quote>["\']) # require value to be quoted (?P<path>{0}(?P<value>.*?)) # the url value - \2""".format(instrasite_link_regex) + \4""".format(instrasite_link_regex, self._url_attr_pattern) hrefs = re.compile(regex, re.X) def replacer(m): + print(m.group(0)) what = m.group('what') value = urlparse(m.group('value')) path = value.path origin = m.group('path') + # verify HTML tag and attribute pair to avoid mis-replacing + tag = m.group('tag') + attr = m.group('attr') + if attr != 
'href' and attr != 'src' and (tag, attr) not in self._url_attributes: + return m.group(0) + + # XXX Put this in a different location. if what == 'filename': if path.startswith('/'): From fa150ad51daedefcf7d6c3a75a5c2c156e002695 Mon Sep 17 00:00:00 2001 From: zhouji Date: Wed, 16 Oct 2013 16:45:31 +0800 Subject: [PATCH 3/3] Restore file mode of contents.py, which was mistakenly changed in the last commit. --- pelican/contents.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 pelican/contents.py diff --git a/pelican/contents.py b/pelican/contents.py old mode 100755 new mode 100644