Merge pull request #1802 from andreacorbellini/better-word-recognition

When truncating, consider hyphens, apostrophes and HTML entities.
This commit is contained in:
Justin Mayer 2015-08-28 10:18:44 -07:00
commit ac151ef867
10 changed files with 95 additions and 24 deletions

View file

@ -90,7 +90,7 @@
<div class="section" id="testing-another-case"> <div class="section" id="testing-another-case">
<h2>Testing another case</h2> <h2>Testing another case</h2>
<p>This will now have a line number in 'custom' since it's the default in <p>This will now have a line number in 'custom' since it's the default in
pelican.conf, it ...</p></div> pelican.conf, it will ...</p></div>
<a class="readmore" href="/unbelievable.html">read more</a> <a class="readmore" href="/unbelievable.html">read more</a>
</div><!-- /.entry-content --> </div><!-- /.entry-content -->
</article></li> </article></li>

View file

@ -227,7 +227,7 @@ YEAH !</p>
<div class="section" id="testing-another-case"> <div class="section" id="testing-another-case">
<h2>Testing another case</h2> <h2>Testing another case</h2>
<p>This will now have a line number in 'custom' since it's the default in <p>This will now have a line number in 'custom' since it's the default in
pelican.conf, it ...</p></div> pelican.conf, it will ...</p></div>
<a class="readmore" href="/unbelievable.html">read more</a> <a class="readmore" href="/unbelievable.html">read more</a>
</div><!-- /.entry-content --> </div><!-- /.entry-content -->
</article></li> </article></li>

View file

@ -59,7 +59,7 @@
<div class="section" id="testing-another-case"> <div class="section" id="testing-another-case">
<h2>Testing another case</h2> <h2>Testing another case</h2>
<p>This will now have a line number in 'custom' since it's the default in <p>This will now have a line number in 'custom' since it's the default in
pelican.conf, it ...</p></div> pelican.conf, it will ...</p></div>
<a class="readmore" href="../unbelievable.html">read more</a> <a class="readmore" href="../unbelievable.html">read more</a>
<p>There are <a href="../unbelievable.html#disqus_thread">comments</a>.</p> </div><!-- /.entry-content --> <p>There are <a href="../unbelievable.html#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
</article></li> </article></li>

View file

@ -103,7 +103,7 @@
<div class="section" id="testing-another-case"> <div class="section" id="testing-another-case">
<h2>Testing another case</h2> <h2>Testing another case</h2>
<p>This will now have a line number in 'custom' since it's the default in <p>This will now have a line number in 'custom' since it's the default in
pelican.conf, it ...</p></div> pelican.conf, it will ...</p></div>
<a class="readmore" href="../unbelievable.html">read more</a> <a class="readmore" href="../unbelievable.html">read more</a>
<p>There are <a href="../unbelievable.html#disqus_thread">comments</a>.</p> </div><!-- /.entry-content --> <p>There are <a href="../unbelievable.html#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
</article></li> </article></li>

View file

@ -59,7 +59,7 @@
<div class="section" id="testing-another-case"> <div class="section" id="testing-another-case">
<h2>Testing another case</h2> <h2>Testing another case</h2>
<p>This will now have a line number in 'custom' since it's the default in <p>This will now have a line number in 'custom' since it's the default in
pelican.conf, it ...</p></div> pelican.conf, it will ...</p></div>
<a class="readmore" href="./unbelievable.html">read more</a> <a class="readmore" href="./unbelievable.html">read more</a>
<p>There are <a href="./unbelievable.html#disqus_thread">comments</a>.</p> </div><!-- /.entry-content --> <p>There are <a href="./unbelievable.html#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
</article></li> </article></li>

View file

@ -59,7 +59,7 @@
<div class="section" id="testing-another-case"> <div class="section" id="testing-another-case">
<h2>Testing another case</h2> <h2>Testing another case</h2>
<p>This will now have a line number in 'custom' since it's the default in <p>This will now have a line number in 'custom' since it's the default in
pelican.conf, it ...</p></div> pelican.conf, it will ...</p></div>
<a class="readmore" href="../posts/2010/octobre/15/unbelievable/">read more</a> <a class="readmore" href="../posts/2010/octobre/15/unbelievable/">read more</a>
<p>There are <a href="../posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p> </div><!-- /.entry-content --> <p>There are <a href="../posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
</article></li> </article></li>
@ -135,4 +135,4 @@ pelican.conf, it ...</p></div>
}()); }());
</script> </script>
</body> </body>
</html> </html>

View file

@ -103,7 +103,7 @@
<div class="section" id="testing-another-case"> <div class="section" id="testing-another-case">
<h2>Testing another case</h2> <h2>Testing another case</h2>
<p>This will now have a line number in 'custom' since it's the default in <p>This will now have a line number in 'custom' since it's the default in
pelican.conf, it ...</p></div> pelican.conf, it will ...</p></div>
<a class="readmore" href="../posts/2010/octobre/15/unbelievable/">read more</a> <a class="readmore" href="../posts/2010/octobre/15/unbelievable/">read more</a>
<p>There are <a href="../posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p> </div><!-- /.entry-content --> <p>There are <a href="../posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
</article></li> </article></li>
@ -175,4 +175,4 @@ pelican.conf, it ...</p></div>
}()); }());
</script> </script>
</body> </body>
</html> </html>

View file

@ -59,7 +59,7 @@
<div class="section" id="testing-another-case"> <div class="section" id="testing-another-case">
<h2>Testing another case</h2> <h2>Testing another case</h2>
<p>This will now have a line number in 'custom' since it's the default in <p>This will now have a line number in 'custom' since it's the default in
pelican.conf, it ...</p></div> pelican.conf, it will ...</p></div>
<a class="readmore" href="./posts/2010/octobre/15/unbelievable/">read more</a> <a class="readmore" href="./posts/2010/octobre/15/unbelievable/">read more</a>
<p>There are <a href="./posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p> </div><!-- /.entry-content --> <p>There are <a href="./posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
</article></li> </article></li>
@ -135,4 +135,4 @@ pelican.conf, it ...</p></div>
}()); }());
</script> </script>
</body> </body>
</html> </html>

View file

@ -146,31 +146,51 @@ class TestUtils(LoggedTestCase):
self.assertEqual(utils.get_relative_path(value), expected) self.assertEqual(utils.get_relative_path(value), expected)
def test_truncate_html_words(self): def test_truncate_html_words(self):
# Plain text.
self.assertEqual( self.assertEqual(
utils.truncate_html_words('short string', 20), utils.truncate_html_words('short string', 20),
'short string') 'short string')
self.assertEqual( self.assertEqual(
utils.truncate_html_words('word ' * 100, 20), utils.truncate_html_words('word ' * 100, 20),
'word ' * 20 + '...') 'word ' * 20 + '...')
# Words enclosed or separated by HTML tags.
self.assertEqual( self.assertEqual(
utils.truncate_html_words('<p>' + 'word ' * 100 + '</p>', 20), utils.truncate_html_words('<p>' + 'word ' * 100 + '</p>', 20),
'<p>' + 'word ' * 20 + '...</p>') '<p>' + 'word ' * 20 + '...</p>')
self.assertEqual( self.assertEqual(
utils.truncate_html_words( utils.truncate_html_words(
'<span\nstyle="\n...\n">' + 'word ' * 100 + '</span>', 20), '<span\nstyle="\n...\n">' + 'word ' * 100 + '</span>', 20),
'<span\nstyle="\n...\n">' + 'word ' * 20 + '...</span>') '<span\nstyle="\n...\n">' + 'word ' * 20 + '...</span>')
self.assertEqual( self.assertEqual(
utils.truncate_html_words('<br>' + 'word ' * 100, 20), utils.truncate_html_words('<br>' + 'word ' * 100, 20),
'<br>' + 'word ' * 20 + '...') '<br>' + 'word ' * 20 + '...')
self.assertEqual( self.assertEqual(
utils.truncate_html_words('<!-- comment -->' + 'word ' * 100, 20), utils.truncate_html_words('<!-- comment -->' + 'word ' * 100, 20),
'<!-- comment -->' + 'word ' * 20 + '...') '<!-- comment -->' + 'word ' * 20 + '...')
# Words with hyphens and apostrophes.
self.assertEqual(
utils.truncate_html_words("a-b " * 100, 20),
"a-b " * 20 + '...')
self.assertEqual(
utils.truncate_html_words("it's " * 100, 20),
"it's " * 20 + '...')
# Words with HTML entity references.
self.assertEqual(
utils.truncate_html_words("&eacute; " * 100, 20),
"&eacute; " * 20 + '...')
self.assertEqual(
utils.truncate_html_words("caf&eacute; " * 100, 20),
"caf&eacute; " * 20 + '...')
self.assertEqual(
utils.truncate_html_words("&egrave;lite " * 100, 20),
"&egrave;lite " * 20 + '...')
self.assertEqual(
utils.truncate_html_words("cafeti&eacute;re " * 100, 20),
"cafeti&eacute;re " * 20 + '...')
def test_process_translations(self): def test_process_translations(self):
# create a bunch of articles # create a bunch of articles
# 1: no translation metadata # 1: no translation metadata

View file

@ -25,6 +25,7 @@ from jinja2 import Markup
import pytz import pytz
import six import six
from six.moves import html_entities
from six.moves.html_parser import HTMLParser from six.moves.html_parser import HTMLParser
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -408,7 +409,8 @@ def posixize_path(rel_path):
class _HTMLWordTruncator(HTMLParser): class _HTMLWordTruncator(HTMLParser):
_word_regex = re.compile(r'\w[\w-]*', re.U) _word_regex = re.compile(r"\w[\w'-]*", re.U)
_word_prefix_regex = re.compile(r'\w', re.U)
_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', _singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area',
'hr', 'input') 'hr', 'input')
@ -420,17 +422,37 @@ class _HTMLWordTruncator(HTMLParser):
self.max_words = max_words self.max_words = max_words
self.words_found = 0 self.words_found = 0
self.open_tags = [] self.open_tags = []
self.last_word_end = None
self.truncate_at = None self.truncate_at = None
def getoffset(self):
    """Return the current parse position as a flat index into ``rawdata``.

    ``getpos()`` reports a (line number, offset within line) pair; the
    start of that line is located by stepping over each preceding
    newline in ``self.rawdata``, then the in-line offset is added.
    """
    lineno, column = self.getpos()
    start_of_line = 0
    newlines_to_skip = lineno - 1
    while newlines_to_skip > 0:
        # Jump to the character just after the next newline.
        start_of_line = self.rawdata.index('\n', start_of_line) + 1
        newlines_to_skip -= 1
    return start_of_line + column
def add_word(self, word_end):
    """Count one finished word whose last character ends at ``word_end``.

    Clears the pending-word marker and, when this word brings the count
    up to ``max_words``, records ``word_end`` as the truncation point.
    """
    self.last_word_end = None
    self.words_found += 1
    reached_limit = self.words_found == self.max_words
    if reached_limit:
        self.truncate_at = word_end
def add_last_word(self):
    """Commit the pending word, if there is one.

    A word is pending while ``last_word_end`` holds an offset; it is
    passed to ``add_word``. Does nothing when no word is pending.
    """
    pending_end = self.last_word_end
    if pending_end is None:
        return
    self.add_word(pending_end)
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
if self.truncate_at is not None: if self.truncate_at is not None:
return return
self.add_last_word()
if tag not in self._singlets: if tag not in self._singlets:
self.open_tags.insert(0, tag) self.open_tags.insert(0, tag)
def handle_endtag(self, tag): def handle_endtag(self, tag):
if self.truncate_at is not None: if self.truncate_at is not None:
return return
self.add_last_word()
try: try:
i = self.open_tags.index(tag) i = self.open_tags.index(tag)
except ValueError: except ValueError:
@ -442,20 +464,49 @@ class _HTMLWordTruncator(HTMLParser):
def handle_data(self, data): def handle_data(self, data):
word_end = 0 word_end = 0
offset = self.getoffset()
while self.words_found < self.max_words: while self.words_found < self.max_words:
match = self._word_regex.search(data, word_end) match = self._word_regex.search(data, word_end)
if not match: if not match:
break break
word_end = match.end(0)
self.words_found += 1
if self.words_found == self.max_words: if match.start(0) > 0:
line_start = 0 self.add_last_word()
lineno, line_offset = self.getpos()
for i in range(lineno - 1): word_end = match.end(0)
line_start = self.rawdata.index('\n', line_start) + 1 self.last_word_end = offset + word_end
self.truncate_at = line_start + line_offset + word_end
if word_end < len(data):
self.add_last_word()
def handle_ref(self, char):
    """Fold a character/entity reference into the word being tracked.

    ``char`` is the character the reference stands for, or ``''`` when
    it could not be resolved. If no word is pending, a word-starting
    character opens one; if a word is pending, a word character extends
    it and anything else closes it.

    The end of the reference is found by matching the reference text at
    the current offset instead of searching for the next ``';'``: a
    reference written without its closing semicolon would otherwise
    raise ``ValueError`` or be extended to an unrelated later ``';'``.
    """
    offset = self.getoffset()
    ref_match = re.compile(
        r'&(?:#(?:[0-9]+|[xX][0-9a-fA-F]+)|[a-zA-Z][-.a-zA-Z0-9]*);?'
    ).match(self.rawdata, offset)
    # Defensive fallback: if the text at ``offset`` is not a reference,
    # treat it as zero-length rather than raising.
    ref_end = ref_match.end() if ref_match else offset

    if self.last_word_end is None:
        if self._word_prefix_regex.match(char):
            self.last_word_end = ref_end
    else:
        if self._word_regex.match(char):
            self.last_word_end = ref_end
        else:
            self.add_last_word()
def handle_entityref(self, name):
    """Resolve a named entity reference and pass it to ``handle_ref``.

    ``name`` is looked up in ``html_entities.name2codepoint``. A known
    name is forwarded as its character; an unknown name is forwarded
    as the empty string.
    """
    codepoint = html_entities.name2codepoint.get(name)
    resolved = '' if codepoint is None else chr(codepoint)
    self.handle_ref(resolved)
def handle_charref(self, name):
    """Resolve a numeric character reference and pass it to ``handle_ref``.

    ``name`` is the reference body without ``&#`` and ``;``: decimal
    digits, or hexadecimal digits after an ``x``/``X`` prefix. The
    prefix is accepted in either case, since ``&#X41;`` is as valid as
    ``&#x41;``. A reference that cannot be turned into a character
    (malformed digits, or a code point ``chr`` rejects) is forwarded as
    the empty string, as unknown named entities are, instead of
    aborting the whole truncation with an exception.
    """
    try:
        if name[:1] in ('x', 'X'):
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        char = chr(codepoint)
    except (ValueError, OverflowError):
        # Malformed number or out-of-range code point.
        char = ''
    self.handle_ref(char)
def truncate_html_words(s, num, end_text='...'): def truncate_html_words(s, num, end_text='...'):