#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function
import argparse
try:
# py3k import
from html.parser import HTMLParser
from urllib.request import urlretrieve
from urllib.parse import urlparse
from urllib.error import URLError
except ImportError:
# py2 import
from HTMLParser import HTMLParser # NOQA
from urllib import urlretrieve
from urlparse import urlparse
from urllib2 import URLError
import os
import re
import subprocess
import sys
import time
import logging
from codecs import open
from pelican.utils import slugify
from pelican.log import init
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
def decode_wp_content(content, br=True):
pre_tags = {}
if content.strip() == "":
return ""
content += "\n"
if "
")
last_pre = pre_parts.pop()
content = ""
pre_index = 0
for pre_part in pre_parts:
start = pre_part.find(""
content = content + pre_part[0:start] + name
pre_index += 1
content = content + last_pre
content = re.sub(r'
\s*
', "\n\n", content)
allblocks = ('(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|'
'td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|'
'map|area|blockquote|address|math|style|p|h[1-6]|hr|'
'fieldset|noscript|samp|legend|section|article|aside|'
'hgroup|header|footer|nav|figure|figcaption|details|'
'menu|summary)')
content = re.sub(r'(<' + allblocks + r'[^>]*>)', "\n\\1", content)
content = re.sub(r'(' + allblocks + r'>)', "\\1\n\n", content)
# content = content.replace("\r\n", "\n")
if "