mirror of
https://github.com/getpelican/pelican.git
synced 2025-10-15 20:28:56 +02:00
Merge pull request #1802 from andreacorbellini/better-word-recognition
When truncating, consider hyphens, apostrophes and HTML entities.
This commit is contained in:
commit
ac151ef867
10 changed files with 95 additions and 24 deletions
|
|
@ -90,7 +90,7 @@
|
||||||
<div class="section" id="testing-another-case">
|
<div class="section" id="testing-another-case">
|
||||||
<h2>Testing another case</h2>
|
<h2>Testing another case</h2>
|
||||||
<p>This will now have a line number in 'custom' since it's the default in
|
<p>This will now have a line number in 'custom' since it's the default in
|
||||||
pelican.conf, it ...</p></div>
|
pelican.conf, it will ...</p></div>
|
||||||
<a class="readmore" href="/unbelievable.html">read more</a>
|
<a class="readmore" href="/unbelievable.html">read more</a>
|
||||||
</div><!-- /.entry-content -->
|
</div><!-- /.entry-content -->
|
||||||
</article></li>
|
</article></li>
|
||||||
|
|
|
||||||
|
|
@ -227,7 +227,7 @@ YEAH !</p>
|
||||||
<div class="section" id="testing-another-case">
|
<div class="section" id="testing-another-case">
|
||||||
<h2>Testing another case</h2>
|
<h2>Testing another case</h2>
|
||||||
<p>This will now have a line number in 'custom' since it's the default in
|
<p>This will now have a line number in 'custom' since it's the default in
|
||||||
pelican.conf, it ...</p></div>
|
pelican.conf, it will ...</p></div>
|
||||||
<a class="readmore" href="/unbelievable.html">read more</a>
|
<a class="readmore" href="/unbelievable.html">read more</a>
|
||||||
</div><!-- /.entry-content -->
|
</div><!-- /.entry-content -->
|
||||||
</article></li>
|
</article></li>
|
||||||
|
|
|
||||||
|
|
@ -59,7 +59,7 @@
|
||||||
<div class="section" id="testing-another-case">
|
<div class="section" id="testing-another-case">
|
||||||
<h2>Testing another case</h2>
|
<h2>Testing another case</h2>
|
||||||
<p>This will now have a line number in 'custom' since it's the default in
|
<p>This will now have a line number in 'custom' since it's the default in
|
||||||
pelican.conf, it ...</p></div>
|
pelican.conf, it will ...</p></div>
|
||||||
<a class="readmore" href="../unbelievable.html">read more</a>
|
<a class="readmore" href="../unbelievable.html">read more</a>
|
||||||
<p>There are <a href="../unbelievable.html#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
<p>There are <a href="../unbelievable.html#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
||||||
</article></li>
|
</article></li>
|
||||||
|
|
|
||||||
|
|
@ -103,7 +103,7 @@
|
||||||
<div class="section" id="testing-another-case">
|
<div class="section" id="testing-another-case">
|
||||||
<h2>Testing another case</h2>
|
<h2>Testing another case</h2>
|
||||||
<p>This will now have a line number in 'custom' since it's the default in
|
<p>This will now have a line number in 'custom' since it's the default in
|
||||||
pelican.conf, it ...</p></div>
|
pelican.conf, it will ...</p></div>
|
||||||
<a class="readmore" href="../unbelievable.html">read more</a>
|
<a class="readmore" href="../unbelievable.html">read more</a>
|
||||||
<p>There are <a href="../unbelievable.html#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
<p>There are <a href="../unbelievable.html#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
||||||
</article></li>
|
</article></li>
|
||||||
|
|
|
||||||
|
|
@ -59,7 +59,7 @@
|
||||||
<div class="section" id="testing-another-case">
|
<div class="section" id="testing-another-case">
|
||||||
<h2>Testing another case</h2>
|
<h2>Testing another case</h2>
|
||||||
<p>This will now have a line number in 'custom' since it's the default in
|
<p>This will now have a line number in 'custom' since it's the default in
|
||||||
pelican.conf, it ...</p></div>
|
pelican.conf, it will ...</p></div>
|
||||||
<a class="readmore" href="./unbelievable.html">read more</a>
|
<a class="readmore" href="./unbelievable.html">read more</a>
|
||||||
<p>There are <a href="./unbelievable.html#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
<p>There are <a href="./unbelievable.html#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
||||||
</article></li>
|
</article></li>
|
||||||
|
|
|
||||||
|
|
@ -59,7 +59,7 @@
|
||||||
<div class="section" id="testing-another-case">
|
<div class="section" id="testing-another-case">
|
||||||
<h2>Testing another case</h2>
|
<h2>Testing another case</h2>
|
||||||
<p>This will now have a line number in 'custom' since it's the default in
|
<p>This will now have a line number in 'custom' since it's the default in
|
||||||
pelican.conf, it ...</p></div>
|
pelican.conf, it will ...</p></div>
|
||||||
<a class="readmore" href="../posts/2010/octobre/15/unbelievable/">read more</a>
|
<a class="readmore" href="../posts/2010/octobre/15/unbelievable/">read more</a>
|
||||||
<p>There are <a href="../posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
<p>There are <a href="../posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
||||||
</article></li>
|
</article></li>
|
||||||
|
|
@ -135,4 +135,4 @@ pelican.conf, it ...</p></div>
|
||||||
}());
|
}());
|
||||||
</script>
|
</script>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|
|
||||||
|
|
@ -103,7 +103,7 @@
|
||||||
<div class="section" id="testing-another-case">
|
<div class="section" id="testing-another-case">
|
||||||
<h2>Testing another case</h2>
|
<h2>Testing another case</h2>
|
||||||
<p>This will now have a line number in 'custom' since it's the default in
|
<p>This will now have a line number in 'custom' since it's the default in
|
||||||
pelican.conf, it ...</p></div>
|
pelican.conf, it will ...</p></div>
|
||||||
<a class="readmore" href="../posts/2010/octobre/15/unbelievable/">read more</a>
|
<a class="readmore" href="../posts/2010/octobre/15/unbelievable/">read more</a>
|
||||||
<p>There are <a href="../posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
<p>There are <a href="../posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
||||||
</article></li>
|
</article></li>
|
||||||
|
|
@ -175,4 +175,4 @@ pelican.conf, it ...</p></div>
|
||||||
}());
|
}());
|
||||||
</script>
|
</script>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|
|
||||||
|
|
@ -59,7 +59,7 @@
|
||||||
<div class="section" id="testing-another-case">
|
<div class="section" id="testing-another-case">
|
||||||
<h2>Testing another case</h2>
|
<h2>Testing another case</h2>
|
||||||
<p>This will now have a line number in 'custom' since it's the default in
|
<p>This will now have a line number in 'custom' since it's the default in
|
||||||
pelican.conf, it ...</p></div>
|
pelican.conf, it will ...</p></div>
|
||||||
<a class="readmore" href="./posts/2010/octobre/15/unbelievable/">read more</a>
|
<a class="readmore" href="./posts/2010/octobre/15/unbelievable/">read more</a>
|
||||||
<p>There are <a href="./posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
<p>There are <a href="./posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p> </div><!-- /.entry-content -->
|
||||||
</article></li>
|
</article></li>
|
||||||
|
|
@ -135,4 +135,4 @@ pelican.conf, it ...</p></div>
|
||||||
}());
|
}());
|
||||||
</script>
|
</script>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|
|
||||||
|
|
@ -146,31 +146,51 @@ class TestUtils(LoggedTestCase):
|
||||||
self.assertEqual(utils.get_relative_path(value), expected)
|
self.assertEqual(utils.get_relative_path(value), expected)
|
||||||
|
|
||||||
def test_truncate_html_words(self):
|
def test_truncate_html_words(self):
|
||||||
|
# Plain text.
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
utils.truncate_html_words('short string', 20),
|
utils.truncate_html_words('short string', 20),
|
||||||
'short string')
|
'short string')
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
utils.truncate_html_words('word ' * 100, 20),
|
utils.truncate_html_words('word ' * 100, 20),
|
||||||
'word ' * 20 + '...')
|
'word ' * 20 + '...')
|
||||||
|
|
||||||
|
# Words enclosed or separated by HTML tags.
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
utils.truncate_html_words('<p>' + 'word ' * 100 + '</p>', 20),
|
utils.truncate_html_words('<p>' + 'word ' * 100 + '</p>', 20),
|
||||||
'<p>' + 'word ' * 20 + '...</p>')
|
'<p>' + 'word ' * 20 + '...</p>')
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
utils.truncate_html_words(
|
utils.truncate_html_words(
|
||||||
'<span\nstyle="\n...\n">' + 'word ' * 100 + '</span>', 20),
|
'<span\nstyle="\n...\n">' + 'word ' * 100 + '</span>', 20),
|
||||||
'<span\nstyle="\n...\n">' + 'word ' * 20 + '...</span>')
|
'<span\nstyle="\n...\n">' + 'word ' * 20 + '...</span>')
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
utils.truncate_html_words('<br>' + 'word ' * 100, 20),
|
utils.truncate_html_words('<br>' + 'word ' * 100, 20),
|
||||||
'<br>' + 'word ' * 20 + '...')
|
'<br>' + 'word ' * 20 + '...')
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
utils.truncate_html_words('<!-- comment -->' + 'word ' * 100, 20),
|
utils.truncate_html_words('<!-- comment -->' + 'word ' * 100, 20),
|
||||||
'<!-- comment -->' + 'word ' * 20 + '...')
|
'<!-- comment -->' + 'word ' * 20 + '...')
|
||||||
|
|
||||||
|
# Words with hyphens and apostrophes.
|
||||||
|
self.assertEqual(
|
||||||
|
utils.truncate_html_words("a-b " * 100, 20),
|
||||||
|
"a-b " * 20 + '...')
|
||||||
|
self.assertEqual(
|
||||||
|
utils.truncate_html_words("it's " * 100, 20),
|
||||||
|
"it's " * 20 + '...')
|
||||||
|
|
||||||
|
# Words with HTML entity references.
|
||||||
|
self.assertEqual(
|
||||||
|
utils.truncate_html_words("&eacute; " * 100, 20),
|
||||||
|
"&eacute; " * 20 + '...')
|
||||||
|
self.assertEqual(
|
||||||
|
utils.truncate_html_words("caf&eacute; " * 100, 20),
|
||||||
|
"caf&eacute; " * 20 + '...')
|
||||||
|
self.assertEqual(
|
||||||
|
utils.truncate_html_words("&egrave;lite " * 100, 20),
|
||||||
|
"&egrave;lite " * 20 + '...')
|
||||||
|
self.assertEqual(
|
||||||
|
utils.truncate_html_words("cafeti&eacute;re " * 100, 20),
|
||||||
|
"cafeti&eacute;re " * 20 + '...')
|
||||||
|
|
||||||
def test_process_translations(self):
|
def test_process_translations(self):
|
||||||
# create a bunch of articles
|
# create a bunch of articles
|
||||||
# 1: no translation metadata
|
# 1: no translation metadata
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,7 @@ from jinja2 import Markup
|
||||||
import pytz
|
import pytz
|
||||||
|
|
||||||
import six
|
import six
|
||||||
|
from six.moves import html_entities
|
||||||
from six.moves.html_parser import HTMLParser
|
from six.moves.html_parser import HTMLParser
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -408,7 +409,8 @@ def posixize_path(rel_path):
|
||||||
|
|
||||||
class _HTMLWordTruncator(HTMLParser):
|
class _HTMLWordTruncator(HTMLParser):
|
||||||
|
|
||||||
_word_regex = re.compile(r'\w[\w-]*', re.U)
|
_word_regex = re.compile(r"\w[\w'-]*", re.U)
|
||||||
|
_word_prefix_regex = re.compile(r'\w', re.U)
|
||||||
_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area',
|
_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area',
|
||||||
'hr', 'input')
|
'hr', 'input')
|
||||||
|
|
||||||
|
|
@ -420,17 +422,37 @@ class _HTMLWordTruncator(HTMLParser):
|
||||||
self.max_words = max_words
|
self.max_words = max_words
|
||||||
self.words_found = 0
|
self.words_found = 0
|
||||||
self.open_tags = []
|
self.open_tags = []
|
||||||
|
self.last_word_end = None
|
||||||
self.truncate_at = None
|
self.truncate_at = None
|
||||||
|
|
||||||
|
def getoffset(self):
|
||||||
|
line_start = 0
|
||||||
|
lineno, line_offset = self.getpos()
|
||||||
|
for i in range(lineno - 1):
|
||||||
|
line_start = self.rawdata.index('\n', line_start) + 1
|
||||||
|
return line_start + line_offset
|
||||||
|
|
||||||
|
def add_word(self, word_end):
|
||||||
|
self.words_found += 1
|
||||||
|
self.last_word_end = None
|
||||||
|
if self.words_found == self.max_words:
|
||||||
|
self.truncate_at = word_end
|
||||||
|
|
||||||
|
def add_last_word(self):
|
||||||
|
if self.last_word_end is not None:
|
||||||
|
self.add_word(self.last_word_end)
|
||||||
|
|
||||||
def handle_starttag(self, tag, attrs):
|
def handle_starttag(self, tag, attrs):
|
||||||
if self.truncate_at is not None:
|
if self.truncate_at is not None:
|
||||||
return
|
return
|
||||||
|
self.add_last_word()
|
||||||
if tag not in self._singlets:
|
if tag not in self._singlets:
|
||||||
self.open_tags.insert(0, tag)
|
self.open_tags.insert(0, tag)
|
||||||
|
|
||||||
def handle_endtag(self, tag):
|
def handle_endtag(self, tag):
|
||||||
if self.truncate_at is not None:
|
if self.truncate_at is not None:
|
||||||
return
|
return
|
||||||
|
self.add_last_word()
|
||||||
try:
|
try:
|
||||||
i = self.open_tags.index(tag)
|
i = self.open_tags.index(tag)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
|
|
@ -442,20 +464,49 @@ class _HTMLWordTruncator(HTMLParser):
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
word_end = 0
|
word_end = 0
|
||||||
|
offset = self.getoffset()
|
||||||
|
|
||||||
while self.words_found < self.max_words:
|
while self.words_found < self.max_words:
|
||||||
match = self._word_regex.search(data, word_end)
|
match = self._word_regex.search(data, word_end)
|
||||||
if not match:
|
if not match:
|
||||||
break
|
break
|
||||||
word_end = match.end(0)
|
|
||||||
self.words_found += 1
|
|
||||||
|
|
||||||
if self.words_found == self.max_words:
|
if match.start(0) > 0:
|
||||||
line_start = 0
|
self.add_last_word()
|
||||||
lineno, line_offset = self.getpos()
|
|
||||||
for i in range(lineno - 1):
|
word_end = match.end(0)
|
||||||
line_start = self.rawdata.index('\n', line_start) + 1
|
self.last_word_end = offset + word_end
|
||||||
self.truncate_at = line_start + line_offset + word_end
|
|
||||||
|
if word_end < len(data):
|
||||||
|
self.add_last_word()
|
||||||
|
|
||||||
|
def handle_ref(self, char):
|
||||||
|
offset = self.getoffset()
|
||||||
|
ref_end = self.rawdata.index(';', offset) + 1
|
||||||
|
|
||||||
|
if self.last_word_end is None:
|
||||||
|
if self._word_prefix_regex.match(char):
|
||||||
|
self.last_word_end = ref_end
|
||||||
|
else:
|
||||||
|
if self._word_regex.match(char):
|
||||||
|
self.last_word_end = ref_end
|
||||||
|
else:
|
||||||
|
self.add_last_word()
|
||||||
|
|
||||||
|
def handle_entityref(self, name):
|
||||||
|
try:
|
||||||
|
codepoint = html_entities.name2codepoint[name]
|
||||||
|
except KeyError:
|
||||||
|
self.handle_ref('')
|
||||||
|
else:
|
||||||
|
self.handle_ref(chr(codepoint))
|
||||||
|
|
||||||
|
def handle_charref(self, name):
|
||||||
|
if name.startswith('x'):
|
||||||
|
codepoint = int(name[1:], 16)
|
||||||
|
else:
|
||||||
|
codepoint = int(name)
|
||||||
|
self.handle_ref(chr(codepoint))
|
||||||
|
|
||||||
|
|
||||||
def truncate_html_words(s, num, end_text='...'):
|
def truncate_html_words(s, num, end_text='...'):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue