diff --git a/docs/changelog.rst b/docs/changelog.rst index 6cea6d93..5771abda 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -5,6 +5,7 @@ Release history ================ * Improve handling of links to intra-site resources +* Decode HTML entities within WordPress post titles on import 3.0 (2012-08-08) ================== diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index fc28c6a4..fceac4b5 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import argparse +from HTMLParser import HTMLParser import os import subprocess import sys @@ -29,7 +30,8 @@ def wp2fields(xml): if item.fetch('wp:status')[0].contents[0] == "publish": try: - title = item.title.contents[0] + # Use HTMLParser due to issues with BeautifulSoup 3 + title = HTMLParser().unescape(item.title.contents[0]) except IndexError: continue diff --git a/tests/content/wordpressexport.xml b/tests/content/wordpressexport.xml index d3e86cba..0d68f180 100644 --- a/tests/content/wordpressexport.xml +++ b/tests/content/wordpressexport.xml @@ -112,10 +112,10 @@ A normal post - http://thisisa.test/?p=173 + http://thisisa.test/?p=174 Thu, 01 Jan 1970 00:00:00 +0000 bob - http://thisisa.test/?p=173 + http://thisisa.test/?p=174 - 173 + 174 2012-02-16 15:52:55 0000-00-00 00:00:00 open @@ -574,5 +574,59 @@ Bottom line: don't mess up with birds]]> + + A normal post with some <html> entities in the title. You can't miss them. + http://thisisa.test/?p=175 + Thu, 01 Jan 1970 00:00:00 +0000 + bob + http://thisisa.test/?p=175 + + +
  • Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
  • +
  • Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
  • + + +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]>
    + + 175 + 2012-02-16 15:52:55 + 0000-00-00 00:00:00 + open + open + html-entity-test + publish + 0 + 0 + post + + 0 + + + _edit_last + + +
    diff --git a/tests/test_importer.py b/tests/test_importer.py index d4ff8205..959a556a 100644 --- a/tests/test_importer.py +++ b/tests/test_importer.py @@ -47,3 +47,13 @@ class TestWordpressXmlImporter(unittest.TestCase): rst_files = (r(f) for f in silent_f2p(posts, 'rst', temp, strip_raw=True)) self.assertFalse(any(' entities in the title. You can't miss them.") + self.assertTrue('&' not in title)