Decode HTML entities in titles on WP import

This commit is contained in:
David Beitey 2012-11-03 21:55:56 +10:00
commit e2c3701757
4 changed files with 71 additions and 4 deletions

View file

@@ -1,6 +1,7 @@
#!/usr/bin/env python
import argparse
from HTMLParser import HTMLParser
import os
import subprocess
import sys
@@ -29,7 +30,8 @@ def wp2fields(xml):
if item.fetch('wp:status')[0].contents[0] == "publish":
try:
title = item.title.contents[0]
# Use HTMLParser due to issues with BeautifulSoup 3
title = HTMLParser().unescape(item.title.contents[0])
except IndexError:
continue