diff --git a/pelican/tests/content/wordpress_content_decoded b/pelican/tests/content/wordpress_content_decoded new file mode 100644 index 00000000..6e91338c --- /dev/null +++ b/pelican/tests/content/wordpress_content_decoded @@ -0,0 +1,48 @@ +

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

+

+Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+ +
+
+  a = [1, 2, 3]
+  b = [4, 5, 6]
+  for i in zip(a, b):
+    print i
+
+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

diff --git a/pelican/tests/content/wordpress_content_encoded b/pelican/tests/content/wordpress_content_encoded new file mode 100644 index 00000000..da35de3b --- /dev/null +++ b/pelican/tests/content/wordpress_content_encoded @@ -0,0 +1,55 @@ +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + + + +
+Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+ + +
+
+  a = [1, 2, 3]
+  b = [4, 5, 6]
+  for i in zip(a, b):
+    print i
+
+
+ +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + diff --git a/pelican/tests/content/wordpressexport.xml b/pelican/tests/content/wordpressexport.xml index 0d68f180..56d9a458 100644 --- a/pelican/tests/content/wordpressexport.xml +++ b/pelican/tests/content/wordpressexport.xml @@ -628,5 +628,59 @@ proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]> + + Code in List + http://thisisa.test/?p=175 + Thu, 01 Jan 1970 00:00:00 +0000 + bob + http://thisisa.test/?p=175 + + +
  • List Item One!
  • +
  • List Item Two!
  • +
  • This is a code sample +
    +
    +  a = [1, 2, 3]
    +  b = [4, 5, 6]
    +  for i in zip(a, b):
    +    print i
    +
    +
  • +
  • List Item Four!
  • + + +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]>
    + + 175 + 2012-02-16 15:52:55 + 0000-00-00 00:00:00 + open + open + code-in-list-test + publish + 0 + 0 + post + + 0 + + + _edit_last + + +
    diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py index 3d297160..61424774 100644 --- a/pelican/tests/test_importer.py +++ b/pelican/tests/test_importer.py @@ -2,13 +2,20 @@ from __future__ import unicode_literals, print_function import os +import re -from pelican.tools.pelican_import import wp2fields, fields2pelican +from pelican.tools.pelican_import import wp2fields, fields2pelican, decode_wp_content from pelican.tests.support import (unittest, temporary_folder, mute, skipIfNoExecutable) CUR_DIR = os.path.dirname(__file__) WORDPRESS_XML_SAMPLE = os.path.join(CUR_DIR, 'content', 'wordpressexport.xml') +WORDPRESS_ENCODED_CONTENT_SAMPLE = os.path.join(CUR_DIR, + 'content', + 'wordpress_content_encoded') +WORDPRESS_DECODED_CONTENT_SAMPLE = os.path.join(CUR_DIR, + 'content', + 'wordpress_content_decoded') try: from bs4 import BeautifulSoup @@ -21,38 +28,33 @@ except ImportError: class TestWordpressXmlImporter(unittest.TestCase): def setUp(self): - self.posts = wp2fields(WORDPRESS_XML_SAMPLE) + self.posts = list(wp2fields(WORDPRESS_XML_SAMPLE)) def test_ignore_empty_posts(self): - - posts = list(self.posts) - self.assertTrue(posts) - for title, content, fname, date, author, categ, tags, format in posts: + self.assertTrue(self.posts) + for title, content, fname, date, author, categ, tags, format in self.posts: self.assertTrue(title.strip()) def test_can_toggle_raw_html_code_parsing(self): - - posts = list(self.posts) r = lambda f: open(f).read() silent_f2p = mute(True)(fields2pelican) with temporary_folder() as temp: - rst_files = (r(f) for f in silent_f2p(posts, 'markdown', temp)) + rst_files = (r(f) for f in silent_f2p(self.posts, 'markdown', temp)) self.assertTrue(any(' entities in the" " title. You can't miss them.") self.assertTrue('&' not in title) + + def test_decode_wp_content_returns_empty(self): + """ Check that given an empty string we return an empty string.""" + self.assertEqual(decode_wp_content(""), "") + + def test_decode_wp_content(self): + """ Check that we can decode a wordpress content string.""" + with open(WORDPRESS_ENCODED_CONTENT_SAMPLE, 'r') as encoded_file: + encoded_content = encoded_file.read() + with open(WORDPRESS_DECODED_CONTENT_SAMPLE, 'r') as decoded_file: + decoded_content = decoded_file.read() + self.assertEqual(decode_wp_content(encoded_content, br=False), decoded_content) + + def test_preserve_verbatim_formatting(self): + r = lambda f: open(f).read() + silent_f2p = mute(True)(fields2pelican) + test_post = filter(lambda p: p[0].startswith("Code in List"), self.posts) + with temporary_folder() as temp: + md = [r(f) for f in silent_f2p(test_post, 'markdown', temp)][0] + self.assertTrue(re.search(r'\s+a = \[1, 2, 3\]', md)) + self.assertTrue(re.search(r'\s+b = \[4, 5, 6\]', md)) + + for_line = re.search(r'\s+for i in zip\(a, b\):', md).group(0) + print_line = re.search(r'\s+print i', md).group(0) + self.assertTrue(for_line.rindex('for') < print_line.rindex('print')) + + def test_code_in_list(self): + r = lambda f: open(f).read() + silent_f2p = mute(True)(fields2pelican) + test_post = filter(lambda p: p[0].startswith("Code in List"), self.posts) + with temporary_folder() as temp: + md = [r(f) for f in silent_f2p(test_post, 'markdown', temp)][0] + sample_line = re.search(r'- This is a code sample', md).group(0) + code_line = re.search(r'\s+a = \[1, 2, 3\]', md).group(0) + self.assertTrue(sample_line.rindex('This') < code_line.rindex('a')) diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index b3587a13..53998eff 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -10,6 +10,7 @@ except ImportError: # py2 import from HTMLParser import HTMLParser # NOQA import os +import re import subprocess import sys import time @@ -19,6 +20,79 @@ from codecs import open from pelican.utils import slugify +def decode_wp_content(content, br=True): + pre_tags = {} + if content.strip() == "": + return "" + + content += "\n" + if "") + last_pre = pre_parts.pop() + content = "" + pre_index = 0 + + for pre_part in pre_parts: + start = pre_part.index("" + content = content + pre_part[0:start] + name + pre_index += 1 + content = content + last_pre + + content = re.sub(r'
    \s*
    ', "\n\n", content) + allblocks = ('(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|' + 'td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|' + 'map|area|blockquote|address|math|style|p|h[1-6]|hr|' + 'fieldset|noscript|samp|legend|section|article|aside|' + 'hgroup|header|footer|nav|figure|figcaption|details|' + 'menu|summary)') + content = re.sub(r'(<' + allblocks + r'[^>]*>)', "\n\\1", content) + content = re.sub(r'()', "\\1\n\n", content) + # content = content.replace("\r\n", "\n") + if " inside object/embed + content = re.sub(r'\s*]*)>\s*', "", content) + content = re.sub(r'\s*\s*', '', content) + # content = re.sub(r'/\n\n+/', '\n\n', content) + pgraphs = filter(lambda s: s != "", re.split(r'\n\s*\n', content)) + content = "" + for p in pgraphs: + content = content + "

    " + p.strip() + "

    \n" + # under certain strange conditions it could create a P of entirely whitespace + content = re.sub(r'

    \s*

    ', '', content) + content = re.sub(r'

    ([^<]+)', "

    \\1

    ", content) + # don't wrap tags + content = re.sub(r'

    \s*(]*>)\s*

    ', "\\1", content) + #problem with nested lists + content = re.sub(r'

    (', "\\1", content) + content = re.sub(r'

    ]*)>', "

    ", content) + content = content.replace('

    ', '

    ') + content = re.sub(r'

    \s*(]*>)', "\\1", content) + content = re.sub(r'(]*>)\s*

    ', "\\1", content) + if br: + def _preserve_newline(match): + return match.group(0).replace("\n", "") + content = re.sub(r'/<(script|style).*?<\/\\1>/s', _preserve_newline, content) + # optionally make line breaks + content = re.sub(r'(?)\s*\n', "
    \n", content) + content = content.replace("", "\n") + content = re.sub(r'(]*>)\s*
    ', "\\1", content) + content = re.sub(r'
    (\s*]*>)', '\\1', content) + content = re.sub(r'\n

    ', "

    ", content) + + if pre_tags: + def _multi_replace(dic, string): + pattern = r'|'.join(map(re.escape, dic.keys())) + return re.sub(pattern, lambda m: dic[m.group()], string) + content = _multi_replace(pre_tags, content) + + return content + + def wp2fields(xml): """Opens a wordpress XML file, and yield pelican fields""" try: @@ -55,7 +129,7 @@ def wp2fields(xml): tags = [tag.string for tag in item.findAll('category', {'domain' : 'post_tag'})] - yield (title, content, filename, date, author, categories, tags, "html") + yield (title, content, filename, date, author, categories, tags, "wp-html") def dc2fields(file): """Opens a Dotclear export file, and yield pelican fields""" @@ -257,15 +331,18 @@ def fields2pelican(fields, out_markup, output_path, dircat=False, strip_raw=Fals print(out_filename) - if in_markup == "html": + if in_markup in ("html", "wp-html"): html_filename = os.path.join(output_path, filename+'.html') with open(html_filename, 'w', encoding='utf-8') as fp: # Replace newlines with paragraphs wrapped with

    so # HTML is valid before conversion - paragraphs = content.splitlines() - paragraphs = ['

    {0}

    '.format(p) for p in paragraphs] - new_content = ''.join(paragraphs) + if in_markup == "wp-html": + new_content = decode_wp_content(content) + else: + paragraphs = content.splitlines() + paragraphs = ['

    {0}

    '.format(p) for p in paragraphs] + new_content = ''.join(paragraphs) fp.write(new_content)