mirror of
https://github.com/getpelican/pelican.git
synced 2025-10-15 20:28:56 +02:00
Merge pull request #1802 from andreacorbellini/better-word-recognition
When truncating, consider hyphens, apostrophes and HTML entities.
This commit is contained in:
commit
ac151ef867
10 changed files with 95 additions and 24 deletions
|
|
@ -90,7 +90,7 @@
|
|||
<div class="section" id="testing-another-case">
|
||||
<h2>Testing another case</h2>
|
||||
<p>This will now have a line number in 'custom' since it's the default in
|
||||
pelican.conf, it ...</p></div>
|
||||
pelican.conf, it will ...</p></div>
|
||||
<a class="readmore" href="/unbelievable.html">read more</a>
|
||||
</div><!-- /.entry-content -->
|
||||
</article></li>
|
||||
|
|
|
|||
|
|
@ -227,7 +227,7 @@ YEAH !</p>
|
|||
<div class="section" id="testing-another-case">
|
||||
<h2>Testing another case</h2>
|
||||
<p>This will now have a line number in 'custom' since it's the default in
|
||||
pelican.conf, it ...</p></div>
|
||||
pelican.conf, it will ...</p></div>
|
||||
<a class="readmore" href="/unbelievable.html">read more</a>
|
||||
</div><!-- /.entry-content -->
|
||||
</article></li>
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@
|
|||
<div class="section" id="testing-another-case">
|
||||
<h2>Testing another case</h2>
|
||||
<p>This will now have a line number in 'custom' since it's the default in
|
||||
pelican.conf, it ...</p></div>
|
||||
pelican.conf, it will ...</p></div>
|
||||
<a class="readmore" href="../unbelievable.html">read more</a>
|
||||
<p>There are <a href="../unbelievable.html#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
||||
</article></li>
|
||||
|
|
|
|||
|
|
@ -103,7 +103,7 @@
|
|||
<div class="section" id="testing-another-case">
|
||||
<h2>Testing another case</h2>
|
||||
<p>This will now have a line number in 'custom' since it's the default in
|
||||
pelican.conf, it ...</p></div>
|
||||
pelican.conf, it will ...</p></div>
|
||||
<a class="readmore" href="../unbelievable.html">read more</a>
|
||||
<p>There are <a href="../unbelievable.html#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
||||
</article></li>
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@
|
|||
<div class="section" id="testing-another-case">
|
||||
<h2>Testing another case</h2>
|
||||
<p>This will now have a line number in 'custom' since it's the default in
|
||||
pelican.conf, it ...</p></div>
|
||||
pelican.conf, it will ...</p></div>
|
||||
<a class="readmore" href="./unbelievable.html">read more</a>
|
||||
<p>There are <a href="./unbelievable.html#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
||||
</article></li>
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@
|
|||
<div class="section" id="testing-another-case">
|
||||
<h2>Testing another case</h2>
|
||||
<p>This will now have a line number in 'custom' since it's the default in
|
||||
pelican.conf, it ...</p></div>
|
||||
pelican.conf, it will ...</p></div>
|
||||
<a class="readmore" href="../posts/2010/octobre/15/unbelievable/">read more</a>
|
||||
<p>There are <a href="../posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
||||
</article></li>
|
||||
|
|
@ -135,4 +135,4 @@ pelican.conf, it ...</p></div>
|
|||
}());
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
</html>
|
||||
|
|
|
|||
|
|
@ -103,7 +103,7 @@
|
|||
<div class="section" id="testing-another-case">
|
||||
<h2>Testing another case</h2>
|
||||
<p>This will now have a line number in 'custom' since it's the default in
|
||||
pelican.conf, it ...</p></div>
|
||||
pelican.conf, it will ...</p></div>
|
||||
<a class="readmore" href="../posts/2010/octobre/15/unbelievable/">read more</a>
|
||||
<p>There are <a href="../posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
||||
</article></li>
|
||||
|
|
@ -175,4 +175,4 @@ pelican.conf, it ...</p></div>
|
|||
}());
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
</html>
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@
|
|||
<div class="section" id="testing-another-case">
|
||||
<h2>Testing another case</h2>
|
||||
<p>This will now have a line number in 'custom' since it's the default in
|
||||
pelican.conf, it ...</p></div>
|
||||
pelican.conf, it will ...</p></div>
|
||||
<a class="readmore" href="./posts/2010/octobre/15/unbelievable/">read more</a>
|
||||
<p>There are <a href="./posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
||||
</article></li>
|
||||
|
|
@ -135,4 +135,4 @@ pelican.conf, it ...</p></div>
|
|||
}());
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
</html>
|
||||
|
|
|
|||
|
|
@ -146,31 +146,51 @@ class TestUtils(LoggedTestCase):
|
|||
self.assertEqual(utils.get_relative_path(value), expected)
|
||||
|
||||
def test_truncate_html_words(self):
|
||||
# Plain text.
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words('short string', 20),
|
||||
'short string')
|
||||
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words('word ' * 100, 20),
|
||||
'word ' * 20 + '...')
|
||||
|
||||
# Words enclosed or intervaled by HTML tags.
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words('<p>' + 'word ' * 100 + '</p>', 20),
|
||||
'<p>' + 'word ' * 20 + '...</p>')
|
||||
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words(
|
||||
'<span\nstyle="\n...\n">' + 'word ' * 100 + '</span>', 20),
|
||||
'<span\nstyle="\n...\n">' + 'word ' * 20 + '...</span>')
|
||||
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words('<br>' + 'word ' * 100, 20),
|
||||
'<br>' + 'word ' * 20 + '...')
|
||||
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words('<!-- comment -->' + 'word ' * 100, 20),
|
||||
'<!-- comment -->' + 'word ' * 20 + '...')
|
||||
|
||||
# Words with hypens and apostrophes.
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words("a-b " * 100, 20),
|
||||
"a-b " * 20 + '...')
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words("it's " * 100, 20),
|
||||
"it's " * 20 + '...')
|
||||
|
||||
# Words with HTML entity references.
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words("é " * 100, 20),
|
||||
"é " * 20 + '...')
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words("café " * 100, 20),
|
||||
"café " * 20 + '...')
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words("èlite " * 100, 20),
|
||||
"èlite " * 20 + '...')
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words("cafetiére " * 100, 20),
|
||||
"cafetiére " * 20 + '...')
|
||||
|
||||
def test_process_translations(self):
|
||||
# create a bunch of articles
|
||||
# 1: no translation metadata
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ from jinja2 import Markup
|
|||
import pytz
|
||||
|
||||
import six
|
||||
from six.moves import html_entities
|
||||
from six.moves.html_parser import HTMLParser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -408,7 +409,8 @@ def posixize_path(rel_path):
|
|||
|
||||
class _HTMLWordTruncator(HTMLParser):
|
||||
|
||||
_word_regex = re.compile(r'\w[\w-]*', re.U)
|
||||
_word_regex = re.compile(r"\w[\w'-]*", re.U)
|
||||
_word_prefix_regex = re.compile(r'\w', re.U)
|
||||
_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area',
|
||||
'hr', 'input')
|
||||
|
||||
|
|
@ -420,17 +422,37 @@ class _HTMLWordTruncator(HTMLParser):
|
|||
self.max_words = max_words
|
||||
self.words_found = 0
|
||||
self.open_tags = []
|
||||
self.last_word_end = None
|
||||
self.truncate_at = None
|
||||
|
||||
def getoffset(self):
|
||||
line_start = 0
|
||||
lineno, line_offset = self.getpos()
|
||||
for i in range(lineno - 1):
|
||||
line_start = self.rawdata.index('\n', line_start) + 1
|
||||
return line_start + line_offset
|
||||
|
||||
def add_word(self, word_end):
|
||||
self.words_found += 1
|
||||
self.last_word_end = None
|
||||
if self.words_found == self.max_words:
|
||||
self.truncate_at = word_end
|
||||
|
||||
def add_last_word(self):
|
||||
if self.last_word_end is not None:
|
||||
self.add_word(self.last_word_end)
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if self.truncate_at is not None:
|
||||
return
|
||||
self.add_last_word()
|
||||
if tag not in self._singlets:
|
||||
self.open_tags.insert(0, tag)
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if self.truncate_at is not None:
|
||||
return
|
||||
self.add_last_word()
|
||||
try:
|
||||
i = self.open_tags.index(tag)
|
||||
except ValueError:
|
||||
|
|
@ -442,20 +464,49 @@ class _HTMLWordTruncator(HTMLParser):
|
|||
|
||||
def handle_data(self, data):
|
||||
word_end = 0
|
||||
offset = self.getoffset()
|
||||
|
||||
while self.words_found < self.max_words:
|
||||
match = self._word_regex.search(data, word_end)
|
||||
if not match:
|
||||
break
|
||||
word_end = match.end(0)
|
||||
self.words_found += 1
|
||||
|
||||
if self.words_found == self.max_words:
|
||||
line_start = 0
|
||||
lineno, line_offset = self.getpos()
|
||||
for i in range(lineno - 1):
|
||||
line_start = self.rawdata.index('\n', line_start) + 1
|
||||
self.truncate_at = line_start + line_offset + word_end
|
||||
if match.start(0) > 0:
|
||||
self.add_last_word()
|
||||
|
||||
word_end = match.end(0)
|
||||
self.last_word_end = offset + word_end
|
||||
|
||||
if word_end < len(data):
|
||||
self.add_last_word()
|
||||
|
||||
def handle_ref(self, char):
|
||||
offset = self.getoffset()
|
||||
ref_end = self.rawdata.index(';', offset) + 1
|
||||
|
||||
if self.last_word_end is None:
|
||||
if self._word_prefix_regex.match(char):
|
||||
self.last_word_end = ref_end
|
||||
else:
|
||||
if self._word_regex.match(char):
|
||||
self.last_word_end = ref_end
|
||||
else:
|
||||
self.add_last_word()
|
||||
|
||||
def handle_entityref(self, name):
|
||||
try:
|
||||
codepoint = html_entities.name2codepoint[name]
|
||||
except KeyError:
|
||||
self.handle_ref('')
|
||||
else:
|
||||
self.handle_ref(chr(codepoint))
|
||||
|
||||
def handle_charref(self, name):
|
||||
if name.startswith('x'):
|
||||
codepoint = int(name[1:], 16)
|
||||
else:
|
||||
codepoint = int(name)
|
||||
self.handle_ref(chr(codepoint))
|
||||
|
||||
|
||||
def truncate_html_words(s, num, end_text='...'):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue