1
0
Fork 0
forked from github/pelican

Decode HTML entities in titles on WP import

This commit is contained in:
David Beitey 2012-11-03 21:55:56 +10:00
commit e2c3701757
4 changed files with 71 additions and 4 deletions

View file

@ -5,6 +5,7 @@ Release history
================ ================
* Improve handling of links to intra-site resources * Improve handling of links to intra-site resources
* Decode HTML entities within WordPress post titles on import
3.0 (2012-08-08) 3.0 (2012-08-08)
================== ==================

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
import argparse import argparse
from HTMLParser import HTMLParser
import os import os
import subprocess import subprocess
import sys import sys
@ -29,7 +30,8 @@ def wp2fields(xml):
if item.fetch('wp:status')[0].contents[0] == "publish": if item.fetch('wp:status')[0].contents[0] == "publish":
try: try:
title = item.title.contents[0] # Use HTMLParser due to issues with BeautifulSoup 3
title = HTMLParser().unescape(item.title.contents[0])
except IndexError: except IndexError:
continue continue

View file

@ -112,10 +112,10 @@
</item> </item>
<item> <item>
<title>A normal post</title> <title>A normal post</title>
<link>http://thisisa.test/?p=173</link> <link>http://thisisa.test/?p=174</link>
<pubDate>Thu, 01 Jan 1970 00:00:00 +0000</pubDate> <pubDate>Thu, 01 Jan 1970 00:00:00 +0000</pubDate>
<dc:creator>bob</dc:creator> <dc:creator>bob</dc:creator>
<guid isPermaLink="false">http://thisisa.test/?p=173</guid> <guid isPermaLink="false">http://thisisa.test/?p=174</guid>
<description></description> <description></description>
<content:encoded><![CDATA[Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod <content:encoded><![CDATA[Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
@ -146,7 +146,7 @@ consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]></content:encoded> proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded> <excerpt:encoded><![CDATA[]]></excerpt:encoded>
<wp:post_id>173</wp:post_id> <wp:post_id>174</wp:post_id>
<wp:post_date>2012-02-16 15:52:55</wp:post_date> <wp:post_date>2012-02-16 15:52:55</wp:post_date>
<wp:post_date_gmt>0000-00-00 00:00:00</wp:post_date_gmt> <wp:post_date_gmt>0000-00-00 00:00:00</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status> <wp:comment_status>open</wp:comment_status>
@ -574,5 +574,59 @@ Bottom line: don't mess up with birds]]></content:encoded>
<wp:meta_value><![CDATA[3]]></wp:meta_value> <wp:meta_value><![CDATA[3]]></wp:meta_value>
</wp:postmeta> </wp:postmeta>
</item> </item>
<item>
<title>A normal post with some &lt;html&gt; entities in the title. You can&#039;t miss them.</title>
<link>http://thisisa.test/?p=175</link>
<pubDate>Thu, 01 Jan 1970 00:00:00 +0000</pubDate>
<dc:creator>bob</dc:creator>
<guid isPermaLink="false">http://thisisa.test/?p=175</guid>
<description></description>
<content:encoded><![CDATA[Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
<ul>
<li>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</li>
<li>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</li>
</ul>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<wp:post_id>175</wp:post_id>
<wp:post_date>2012-02-16 15:52:55</wp:post_date>
<wp:post_date_gmt>0000-00-00 00:00:00</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>html-entity-test</wp:post_name>
<wp:status>publish</wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>post</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
<category domain="category" nicename="category-2"><![CDATA[Category 2]]></category>
<wp:postmeta>
<wp:meta_key>_edit_last</wp:meta_key>
<wp:meta_value><![CDATA[3]]></wp:meta_value>
</wp:postmeta>
</item>
</channel> </channel>
</rss> </rss>

View file

@ -47,3 +47,13 @@ class TestWordpressXmlImporter(unittest.TestCase):
rst_files = (r(f) for f in silent_f2p(posts, 'rst', temp, rst_files = (r(f) for f in silent_f2p(posts, 'rst', temp,
strip_raw=True)) strip_raw=True))
self.assertFalse(any('<iframe' in rst for rst in rst_files)) self.assertFalse(any('<iframe' in rst for rst in rst_files))
def test_decode_html_entities_in_titles(self):
posts = list(self.posts)
test_posts = [post for post in posts if post[2] == 'html-entity-test']
self.assertTrue(len(test_posts) == 1)
post = test_posts[0]
title = post[0]
self.assertTrue(title, "A normal post with some <html> entities in the title. You can't miss them.")
self.assertTrue('&' not in title)