Decode HTML entities in titles on WP import

This commit is contained in:
David Beitey 2012-11-03 21:55:56 +10:00
commit e2c3701757
4 changed files with 71 additions and 4 deletions

View file

@ -112,10 +112,10 @@
</item>
<item>
<title>A normal post</title>
<link>http://thisisa.test/?p=173</link>
<link>http://thisisa.test/?p=174</link>
<pubDate>Thu, 01 Jan 1970 00:00:00 +0000</pubDate>
<dc:creator>bob</dc:creator>
<guid isPermaLink="false">http://thisisa.test/?p=173</guid>
<guid isPermaLink="false">http://thisisa.test/?p=174</guid>
<description></description>
<content:encoded><![CDATA[Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
@ -146,7 +146,7 @@ consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<wp:post_id>173</wp:post_id>
<wp:post_id>174</wp:post_id>
<wp:post_date>2012-02-16 15:52:55</wp:post_date>
<wp:post_date_gmt>0000-00-00 00:00:00</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
@ -574,5 +574,59 @@ Bottom line: don't mess up with birds]]></content:encoded>
<wp:meta_value><![CDATA[3]]></wp:meta_value>
</wp:postmeta>
</item>
<item>
<title>A normal post with some &lt;html&gt; entities in the title. You can&#039;t miss them.</title>
<link>http://thisisa.test/?p=175</link>
<pubDate>Thu, 01 Jan 1970 00:00:00 +0000</pubDate>
<dc:creator>bob</dc:creator>
<guid isPermaLink="false">http://thisisa.test/?p=175</guid>
<description></description>
<content:encoded><![CDATA[Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
<ul>
<li>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</li>
<li>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</li>
</ul>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<wp:post_id>175</wp:post_id>
<wp:post_date>2012-02-16 15:52:55</wp:post_date>
<wp:post_date_gmt>0000-00-00 00:00:00</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>html-entity-test</wp:post_name>
<wp:status>publish</wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>post</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
<category domain="category" nicename="category-2"><![CDATA[Category 2]]></category>
<wp:postmeta>
<wp:meta_key>_edit_last</wp:meta_key>
<wp:meta_value><![CDATA[3]]></wp:meta_value>
</wp:postmeta>
</item>
</channel>
</rss>

View file

@ -47,3 +47,13 @@ class TestWordpressXmlImporter(unittest.TestCase):
rst_files = (r(f) for f in silent_f2p(posts, 'rst', temp,
strip_raw=True))
self.assertFalse(any('<iframe' in rst for rst in rst_files))
def test_decode_html_entities_in_titles(self):
posts = list(self.posts)
test_posts = [post for post in posts if post[2] == 'html-entity-test']
self.assertTrue(len(test_posts) == 1)
post = test_posts[0]
title = post[0]
self.assertTrue(title, "A normal post with some <html> entities in the title. You can't miss them.")
self.assertTrue('&' not in title)