forked from github/pelican
Decode HTML entities in titles on WP import
This commit is contained in:
parent
bfc963a065
commit
e2c3701757
4 changed files with 71 additions and 4 deletions
|
|
@ -5,6 +5,7 @@ Release history
|
||||||
================
|
================
|
||||||
|
|
||||||
* Improve handling of links to intra-site resources
|
* Improve handling of links to intra-site resources
|
||||||
|
* Decode HTML entities within WordPress post titles on import
|
||||||
|
|
||||||
3.0 (2012-08-08)
|
3.0 (2012-08-08)
|
||||||
==================
|
==================
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
from HTMLParser import HTMLParser
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
|
@ -29,7 +30,8 @@ def wp2fields(xml):
|
||||||
if item.fetch('wp:status')[0].contents[0] == "publish":
|
if item.fetch('wp:status')[0].contents[0] == "publish":
|
||||||
|
|
||||||
try:
|
try:
|
||||||
title = item.title.contents[0]
|
# Use HTMLParser due to issues with BeautifulSoup 3
|
||||||
|
title = HTMLParser().unescape(item.title.contents[0])
|
||||||
except IndexError:
|
except IndexError:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -112,10 +112,10 @@
|
||||||
</item>
|
</item>
|
||||||
<item>
|
<item>
|
||||||
<title>A normal post</title>
|
<title>A normal post</title>
|
||||||
<link>http://thisisa.test/?p=173</link>
|
<link>http://thisisa.test/?p=174</link>
|
||||||
<pubDate>Thu, 01 Jan 1970 00:00:00 +0000</pubDate>
|
<pubDate>Thu, 01 Jan 1970 00:00:00 +0000</pubDate>
|
||||||
<dc:creator>bob</dc:creator>
|
<dc:creator>bob</dc:creator>
|
||||||
<guid isPermaLink="false">http://thisisa.test/?p=173</guid>
|
<guid isPermaLink="false">http://thisisa.test/?p=174</guid>
|
||||||
<description></description>
|
<description></description>
|
||||||
<content:encoded><![CDATA[Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
|
<content:encoded><![CDATA[Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
|
||||||
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
|
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
|
||||||
|
|
@ -146,7 +146,7 @@ consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
|
||||||
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
|
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
|
||||||
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]></content:encoded>
|
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]></content:encoded>
|
||||||
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
|
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
|
||||||
<wp:post_id>173</wp:post_id>
|
<wp:post_id>174</wp:post_id>
|
||||||
<wp:post_date>2012-02-16 15:52:55</wp:post_date>
|
<wp:post_date>2012-02-16 15:52:55</wp:post_date>
|
||||||
<wp:post_date_gmt>0000-00-00 00:00:00</wp:post_date_gmt>
|
<wp:post_date_gmt>0000-00-00 00:00:00</wp:post_date_gmt>
|
||||||
<wp:comment_status>open</wp:comment_status>
|
<wp:comment_status>open</wp:comment_status>
|
||||||
|
|
@ -574,5 +574,59 @@ Bottom line: don't mess up with birds]]></content:encoded>
|
||||||
<wp:meta_value><![CDATA[3]]></wp:meta_value>
|
<wp:meta_value><![CDATA[3]]></wp:meta_value>
|
||||||
</wp:postmeta>
|
</wp:postmeta>
|
||||||
</item>
|
</item>
|
||||||
|
<item>
|
||||||
|
<title>A normal post with some &lt;html&gt; entities in the title. You can&#039;t miss them.</title>
|
||||||
|
<link>http://thisisa.test/?p=175</link>
|
||||||
|
<pubDate>Thu, 01 Jan 1970 00:00:00 +0000</pubDate>
|
||||||
|
<dc:creator>bob</dc:creator>
|
||||||
|
<guid isPermaLink="false">http://thisisa.test/?p=175</guid>
|
||||||
|
<description></description>
|
||||||
|
<content:encoded><![CDATA[Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
|
||||||
|
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
|
||||||
|
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
|
||||||
|
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
|
||||||
|
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
|
||||||
|
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
|
||||||
|
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
|
||||||
|
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
|
||||||
|
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
|
||||||
|
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
|
||||||
|
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</li>
|
||||||
|
<li>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
|
||||||
|
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
|
||||||
|
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
|
||||||
|
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
|
||||||
|
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
|
||||||
|
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
|
||||||
|
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
|
||||||
|
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
|
||||||
|
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
|
||||||
|
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
|
||||||
|
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]></content:encoded>
|
||||||
|
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
|
||||||
|
<wp:post_id>175</wp:post_id>
|
||||||
|
<wp:post_date>2012-02-16 15:52:55</wp:post_date>
|
||||||
|
<wp:post_date_gmt>0000-00-00 00:00:00</wp:post_date_gmt>
|
||||||
|
<wp:comment_status>open</wp:comment_status>
|
||||||
|
<wp:ping_status>open</wp:ping_status>
|
||||||
|
<wp:post_name>html-entity-test</wp:post_name>
|
||||||
|
<wp:status>publish</wp:status>
|
||||||
|
<wp:post_parent>0</wp:post_parent>
|
||||||
|
<wp:menu_order>0</wp:menu_order>
|
||||||
|
<wp:post_type>post</wp:post_type>
|
||||||
|
<wp:post_password></wp:post_password>
|
||||||
|
<wp:is_sticky>0</wp:is_sticky>
|
||||||
|
<category domain="category" nicename="category-2"><![CDATA[Category 2]]></category>
|
||||||
|
<wp:postmeta>
|
||||||
|
<wp:meta_key>_edit_last</wp:meta_key>
|
||||||
|
<wp:meta_value><![CDATA[3]]></wp:meta_value>
|
||||||
|
</wp:postmeta>
|
||||||
|
</item>
|
||||||
</channel>
|
</channel>
|
||||||
</rss>
|
</rss>
|
||||||
|
|
|
||||||
|
|
@ -47,3 +47,13 @@ class TestWordpressXmlImporter(unittest.TestCase):
|
||||||
rst_files = (r(f) for f in silent_f2p(posts, 'rst', temp,
|
rst_files = (r(f) for f in silent_f2p(posts, 'rst', temp,
|
||||||
strip_raw=True))
|
strip_raw=True))
|
||||||
self.assertFalse(any('<iframe' in rst for rst in rst_files))
|
self.assertFalse(any('<iframe' in rst for rst in rst_files))
|
||||||
|
|
||||||
|
def test_decode_html_entities_in_titles(self):
|
||||||
|
posts = list(self.posts)
|
||||||
|
test_posts = [post for post in posts if post[2] == 'html-entity-test']
|
||||||
|
self.assertTrue(len(test_posts) == 1)
|
||||||
|
|
||||||
|
post = test_posts[0]
|
||||||
|
title = post[0]
|
||||||
|
self.assertEqual(title, "A normal post with some <html> entities in the title. You can't miss them.")
|
||||||
|
self.assertTrue('&' not in title)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue