Fine-tune url-value HTML attributes list.

This commit is contained in:
zhouji 2013-10-17 11:33:34 +08:00
commit e538aa2cde
2 changed files with 58 additions and 55 deletions

View file

@ -125,52 +125,6 @@ class Content(object):
if 'summary' in metadata:
self._summary = metadata['summary']
# prepare the list of HTML tag attributes which have a URL value.
# refer: http://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
self._url_attributes = { # each item in this set is a tuple composed by tag_name, attr_name
# HTML4 tags
('a', 'href'),
('applet', 'codebase'),
('area', 'href'),
('base', 'href'),
('blockquote', 'cite'),
('body', 'background'),
('del', 'cite'),
('form', 'action'),
('frame', 'longdesc'),
('frame', 'src'),
('head', 'profile'),
('iframe', 'longdesc'),
('iframe', 'src'),
('img', 'longdesc'),
('img', 'src'),
('img', 'usemap'),
('input', 'src'),
('input', 'usemap'),
('ins', 'cite'),
('link', 'href'),
('object', 'classid'),
('object', 'codebase'),
('object', 'data'),
('object', 'usemap'),
('q', 'cite'),
('script', 'src'),
# HTML5 tags
('audio', 'src'),
('button', 'formaction'),
('command', 'icon'),
('embed', 'src'),
('html', 'manifest'),
('input', 'formaction'),
('source', 'src'),
('video', 'poster'),
('video', 'src'),
}
""":type: set of (tuple of (string, string)"""
attribute_names = set(pair[1] for pair in self._url_attributes)
self._url_attr_pattern = '|'.join(attribute_names)
signals.content_object_init.send(self)
def __str__(self):
@ -235,12 +189,12 @@ class Content(object):
instrasite_link_regex = self.settings['INTRASITE_LINK_REGEX']
regex = r"""
(?P<markup><\s*(?P<tag>[^\s\>]+)[^\>]* # match tag with all url-value attributes
(?P<attr>{1})\s*=)
(?P<markup><\s*[^\>]* # match tag with all url-value attributes
(?:href|src|poster|data|cite|formaction|action)\s*=)
(?P<quote>["\']) # require value to be quoted
(?P<path>{0}(?P<value>.*?)) # the url value
\4""".format(instrasite_link_regex, self._url_attr_pattern)
\2""".format(instrasite_link_regex)
hrefs = re.compile(regex, re.X)
def replacer(m):
@ -249,12 +203,6 @@ class Content(object):
path = value.path
origin = m.group('path')
# verify the HTML tag and attribute pair to avoid mis-replacing
tag = m.group('tag')
attr = m.group('attr')
if attr != 'href' and attr != 'src' and (tag, attr) not in self._url_attributes:
return m.group(0)
# XXX Put this in a different location.
if what == 'filename':
if path.startswith('/'):

View file

@ -268,6 +268,61 @@ class TestPage(unittest.TestCase):
'?utm_whatever=234&highlight=word#section-2">link</a>'
)
def test_intrasite_link_more(self):
    # Checks intrasite link substitution in URL-valued attributes other
    # than a plain ``href``: video ``poster``, source ``src``, object
    # ``data`` and blockquote ``cite``.

    # ``type()`` rejects unicode names on PY2 and bytes names on PY3; with
    # unicode literals in play the class name has to be picked per version.
    dummy_name = '_DummyAsset' if six.PY3 else b'_DummyAsset'
    site_url = 'http://notmyidea.org'

    page_args = self.page_kwargs.copy()
    page_args['settings'] = get_settings()
    page_args['source_path'] = 'content'

    # (source path, published url) pairs; each becomes a stub class that
    # exposes nothing but a ``url`` attribute.
    known_files = (
        ('images/poster.jpg', 'images/poster.jpg'),
        ('assets/video.mp4', 'assets/video.mp4'),
        ('images/graph.svg', 'images/graph.svg'),
        ('reference.rst', 'reference.html'),
    )
    page_args['context']['filenames'] = dict(
        (source, type(dummy_name, (object,), {'url': url}))
        for source, url in known_files
    )

    # (input markup, expected output) pairs, checked in order.
    cases = (
        # video.poster (plus a nested source.src using the |filename| form)
        (
            'There is a video with poster '
            '<video controls poster="{filename}/images/poster.jpg">'
            '<source src="|filename|/assets/video.mp4" type="video/mp4">'
            '</video>',
            'There is a video with poster '
            '<video controls poster="http://notmyidea.org/images/poster.jpg">'
            '<source src="http://notmyidea.org/assets/video.mp4" type="video/mp4">'
            '</video>',
        ),
        # object.data
        (
            'There is a svg object '
            '<object data="{filename}/images/graph.svg" type="image/svg+xml"></object>',
            'There is a svg object '
            '<object data="http://notmyidea.org/images/graph.svg" type="image/svg+xml"></object>',
        ),
        # blockquote.cite
        (
            'There is a blockquote with cite attribute '
            '<blockquote cite="{filename}reference.rst">blah blah</blockquote>',
            'There is a blockquote with cite attribute '
            '<blockquote cite="http://notmyidea.org/reference.html">blah blah</blockquote>',
        ),
    )

    for markup, expected in cases:
        page_args['content'] = markup
        rendered = Page(**page_args).get_content(site_url)
        self.assertEqual(rendered, expected)
class TestArticle(TestPage):
def test_template(self):