From 150d1f05d0a25fc791e0e4b14060303a2742996b Mon Sep 17 00:00:00 2001 From: David Alfonso Date: Tue, 26 Jun 2018 18:47:42 +0200 Subject: [PATCH] Add pandoc2 support to pelican-import. Fix #2255 Specific options are passed to pandoc2 in order to get results similar to those with pandoc1: - Disable smart quotes from the markdown output. - Enable raw parsing from html. --- pelican/tests/content/wordpressexport.xml | 6 +++- pelican/tests/test_importer.py | 13 ++++++++ pelican/tools/pelican_import.py | 40 ++++++++++++++++++++--- 3 files changed, 53 insertions(+), 6 deletions(-) diff --git a/pelican/tests/content/wordpressexport.xml b/pelican/tests/content/wordpressexport.xml index 686b0fa7..9b194e8f 100644 --- a/pelican/tests/content/wordpressexport.xml +++ b/pelican/tests/content/wordpressexport.xml @@ -554,7 +554,11 @@ Pelicans are supposed to eat fish, damn it! -Bottom line: don't mess up with birds]]> +Bottom line: don't mess up with birds + +"That's a 'wonderful' shoe." + +“That’s a ‘magic’ sock.”]]> 173 2012-02-16 15:52:55 diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py index aa16f6ad..7bb4aa6e 100644 --- a/pelican/tests/test_importer.py +++ b/pelican/tests/test_importer.py @@ -268,6 +268,19 @@ class TestWordpressXmlImporter(unittest.TestCase): code_line = re.search(r'\s+a = \[1, 2, 3\]', md).group(0) self.assertTrue(sample_line.rindex('This') < code_line.rindex('a')) + def test_dont_use_smart_quotes(self): + def r(f): + with open(f, encoding='utf-8') as infile: + return infile.read() + silent_f2p = mute(True)(fields2pelican) + test_post = filter( + lambda p: p[0].startswith("Post with raw data"), + self.posts) + with temporary_folder() as temp: + md = [r(f) for f in silent_f2p(test_post, 'markdown', temp)][0] + escaped_quotes = re.search(r'\\[\'"“”‘’]', md) + self.assertFalse(escaped_quotes) + class TestBuildHeader(unittest.TestCase): def test_build_header(self): diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py 
index 63d1b336..461a3263 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -674,6 +674,22 @@ def download_attachments(output_path, urls): return locations +def is_pandoc_needed(fields): + in_markup_idx = 9 + return filter(lambda f: f[in_markup_idx] in ('html', 'wp-html'), fields) + + +def get_pandoc_version(): + cmd = ['pandoc', '--version'] + try: + output = subprocess.check_output(cmd, universal_newlines=True) + except (subprocess.CalledProcessError, OSError) as e: + logger.warning("Pandoc version unknown: %s", e) + return '' + + return output.split()[1] + + def update_links_to_attached_files(content, attachments): for old_url, new_path in attachments.items(): # url may occur both with http:// and https:// @@ -689,6 +705,14 @@ def fields2pelican( dircat=False, strip_raw=False, disable_slugs=False, dirpage=False, filename_template=None, filter_author=None, wp_custpost=False, wp_attach=False, attachments=None): + + pandoc_version = get_pandoc_version() + + if is_pandoc_needed(fields) and not pandoc_version: + error = ('Pandoc must be installed to complete the ' + 'requested import action.') + exit(error) + for (title, content, filename, date, author, categories, tags, status, kind, in_markup) in fields: if filter_author and filter_author != author: @@ -735,11 +759,17 @@ def fields2pelican( fp.write(new_content) - parse_raw = '--parse-raw' if not strip_raw else '' - cmd = ('pandoc --normalize {0} --from=html' - ' --to={1} -o "{2}" "{3}"') - cmd = cmd.format(parse_raw, out_markup, - out_filename, html_filename) + if pandoc_version[0] == '1': + parse_raw = '--parse-raw' if not strip_raw else '' + cmd = ('pandoc --normalize {0} --from=html' + ' --to={1} -o "{2}" "{3}"') + cmd = cmd.format(parse_raw, out_markup, + out_filename, html_filename) + else: + from_arg = '-f html+raw_html' if not strip_raw else '-f html' + cmd = ('pandoc {0} --to={1}-smart -o "{2}" "{3}"') + cmd = cmd.format(from_arg, out_markup, + out_filename, 
html_filename) try: rc = subprocess.call(cmd, shell=True)