forked from github/pelican
Decode HTML entities in titles on WP import
This commit is contained in:
parent
bfc963a065
commit
e2c3701757
4 changed files with 71 additions and 4 deletions
|
|
@ -5,6 +5,7 @@ Release history
|
|||
================
|
||||
|
||||
* Improve handling of links to intra-site resources
|
||||
* Decode HTML entities within WordPress post titles on import
|
||||
|
||||
3.0 (2012-08-08)
|
||||
==================
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import argparse
|
||||
from HTMLParser import HTMLParser
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
|
@ -29,7 +30,8 @@ def wp2fields(xml):
|
|||
if item.fetch('wp:status')[0].contents[0] == "publish":
|
||||
|
||||
try:
|
||||
title = item.title.contents[0]
|
||||
# Use HTMLParser due to issues with BeautifulSoup 3
|
||||
title = HTMLParser().unescape(item.title.contents[0])
|
||||
except IndexError:
|
||||
continue
|
||||
|
||||
|
|
|
|||
|
|
@ -112,10 +112,10 @@
|
|||
</item>
|
||||
<item>
|
||||
<title>A normal post</title>
|
||||
<link>http://thisisa.test/?p=173</link>
|
||||
<link>http://thisisa.test/?p=174</link>
|
||||
<pubDate>Thu, 01 Jan 1970 00:00:00 +0000</pubDate>
|
||||
<dc:creator>bob</dc:creator>
|
||||
<guid isPermaLink="false">http://thisisa.test/?p=173</guid>
|
||||
<guid isPermaLink="false">http://thisisa.test/?p=174</guid>
|
||||
<description></description>
|
||||
<content:encoded><![CDATA[Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
|
||||
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
|
||||
|
|
@ -146,7 +146,7 @@ consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
|
|||
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
|
||||
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]></content:encoded>
|
||||
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
|
||||
<wp:post_id>173</wp:post_id>
|
||||
<wp:post_id>174</wp:post_id>
|
||||
<wp:post_date>2012-02-16 15:52:55</wp:post_date>
|
||||
<wp:post_date_gmt>0000-00-00 00:00:00</wp:post_date_gmt>
|
||||
<wp:comment_status>open</wp:comment_status>
|
||||
|
|
@ -574,5 +574,59 @@ Bottom line: don't mess up with birds]]></content:encoded>
|
|||
<wp:meta_value><![CDATA[3]]></wp:meta_value>
|
||||
</wp:postmeta>
|
||||
</item>
|
||||
<item>
|
||||
<title>A normal post with some <html> entities in the title. You can't miss them.</title>
|
||||
<link>http://thisisa.test/?p=175</link>
|
||||
<pubDate>Thu, 01 Jan 1970 00:00:00 +0000</pubDate>
|
||||
<dc:creator>bob</dc:creator>
|
||||
<guid isPermaLink="false">http://thisisa.test/?p=175</guid>
|
||||
<description></description>
|
||||
<content:encoded><![CDATA[Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
|
||||
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
|
||||
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
|
||||
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
|
||||
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
|
||||
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
||||
|
||||
<ul>
|
||||
<li>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
|
||||
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
|
||||
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
|
||||
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
|
||||
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
|
||||
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</li>
|
||||
<li>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
|
||||
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
|
||||
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
|
||||
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
|
||||
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
|
||||
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</li>
|
||||
</ul>
|
||||
|
||||
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
|
||||
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
|
||||
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
|
||||
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
|
||||
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
|
||||
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]></content:encoded>
|
||||
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
|
||||
<wp:post_id>175</wp:post_id>
|
||||
<wp:post_date>2012-02-16 15:52:55</wp:post_date>
|
||||
<wp:post_date_gmt>0000-00-00 00:00:00</wp:post_date_gmt>
|
||||
<wp:comment_status>open</wp:comment_status>
|
||||
<wp:ping_status>open</wp:ping_status>
|
||||
<wp:post_name>html-entity-test</wp:post_name>
|
||||
<wp:status>publish</wp:status>
|
||||
<wp:post_parent>0</wp:post_parent>
|
||||
<wp:menu_order>0</wp:menu_order>
|
||||
<wp:post_type>post</wp:post_type>
|
||||
<wp:post_password></wp:post_password>
|
||||
<wp:is_sticky>0</wp:is_sticky>
|
||||
<category domain="category" nicename="category-2"><![CDATA[Category 2]]></category>
|
||||
<wp:postmeta>
|
||||
<wp:meta_key>_edit_last</wp:meta_key>
|
||||
<wp:meta_value><![CDATA[3]]></wp:meta_value>
|
||||
</wp:postmeta>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
||||
|
|
|
|||
|
|
@ -47,3 +47,13 @@ class TestWordpressXmlImporter(unittest.TestCase):
|
|||
rst_files = (r(f) for f in silent_f2p(posts, 'rst', temp,
|
||||
strip_raw=True))
|
||||
self.assertFalse(any('<iframe' in rst for rst in rst_files))
|
||||
|
||||
def test_decode_html_entities_in_titles(self):
|
||||
posts = list(self.posts)
|
||||
test_posts = [post for post in posts if post[2] == 'html-entity-test']
|
||||
self.assertTrue(len(test_posts) == 1)
|
||||
|
||||
post = test_posts[0]
|
||||
title = post[0]
|
||||
self.assertTrue(title, "A normal post with some <html> entities in the title. You can't miss them.")
|
||||
self.assertTrue('&' not in title)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue