#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function
import argparse
try:
from html import unescape # py3.4+
except ImportError:
from six.moves.html_parser import HTMLParser
unescape = HTMLParser().unescape
import os
import re
import subprocess
import sys
import time
import logging
from codecs import open
from six.moves.urllib.error import URLError
from six.moves.urllib.parse import urlparse
from six.moves.urllib.request import urlretrieve
from pelican.utils import slugify, SafeDatetime
from pelican.log import init
logger = logging.getLogger(__name__)
def decode_wp_content(content, br=True):
pre_tags = {}
if content.strip() == "":
return ""
content += "\n"
if "
")
last_pre = pre_parts.pop()
content = ""
pre_index = 0
for pre_part in pre_parts:
start = pre_part.find(""
content = content + pre_part[0:start] + name
pre_index += 1
content = content + last_pre
content = re.sub(r'
\s*
', "\n\n", content)
allblocks = ('(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|'
'td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|'
'map|area|blockquote|address|math|style|p|h[1-6]|hr|'
'fieldset|noscript|samp|legend|section|article|aside|'
'hgroup|header|footer|nav|figure|figcaption|details|'
'menu|summary)')
content = re.sub(r'(<' + allblocks + r'[^>]*>)', "\n\\1", content)
content = re.sub(r'(' + allblocks + r'>)', "\\1\n\n", content)
# content = content.replace("\r\n", "\n")
if "