From 150d1f05d0a25fc791e0e4b14060303a2742996b Mon Sep 17 00:00:00 2001
From: David Alfonso <developer@davidalfonso.es>
Date: Tue, 26 Jun 2018 18:47:42 +0200
Subject: [PATCH] Add pandoc2 support to pelican-import. Fix #2255

Specific options are passed to pandoc2 in order to get results similar
to those of pandoc1:

- Disable smart quotes from the markdown output.

- Enable raw parsing from html.
---
 pelican/tests/content/wordpressexport.xml |  6 +++-
 pelican/tests/test_importer.py            | 13 ++++++++
 pelican/tools/pelican_import.py           | 40 ++++++++++++++++++++---
 3 files changed, 53 insertions(+), 6 deletions(-)
diff --git a/pelican/tests/content/wordpressexport.xml b/pelican/tests/content/wordpressexport.xml
index 686b0fa7..9b194e8f 100644
--- a/pelican/tests/content/wordpressexport.xml
+++ b/pelican/tests/content/wordpressexport.xml
@@ -554,7 +554,11 @@ Pelicans are supposed to eat fish, damn it!
 
 <iframe width="420" height="315" src="http://www.youtube.com/embed/QNNl_uWmQXE" frameborder="0" allowfullscreen></iframe>
 
-Bottom line: don't mess up with birds]]></content:encoded>
+Bottom line: don't mess up with birds
+
+"That's a 'wonderful' shoe."
+
+“That’s a ‘magic’ sock.”]]></content:encoded>
         <excerpt:encoded><![CDATA[]]></excerpt:encoded>
         <wp:post_id>173</wp:post_id>
         <wp:post_date>2012-02-16 15:52:55</wp:post_date>
diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py
index aa16f6ad..7bb4aa6e 100644
--- a/pelican/tests/test_importer.py
+++ b/pelican/tests/test_importer.py
@@ -268,6 +268,19 @@ class TestWordpressXmlImporter(unittest.TestCase):
             code_line = re.search(r'\s+a = \[1, 2, 3\]', md).group(0)
             self.assertTrue(sample_line.rindex('This') < code_line.rindex('a'))
 
+    def test_dont_use_smart_quotes(self):
+        def r(f):  # return the UTF-8 decoded text of file f
+            with open(f, encoding='utf-8') as infile:
+                return infile.read()
+        silent_f2p = mute(True)(fields2pelican)
+        test_post = filter(
+            lambda p: p[0].startswith("Post with raw data"),
+            self.posts)
+        with temporary_folder() as temp:
+            md = [r(f) for f in silent_f2p(test_post, 'markdown', temp)][0]
+            escaped_quotes = re.search(r'\\[\'"“”‘’]', md)  # backslash + quote
+            self.assertFalse(escaped_quotes)
+
 
 class TestBuildHeader(unittest.TestCase):
     def test_build_header(self):
diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py
index 63d1b336..461a3263 100755
--- a/pelican/tools/pelican_import.py
+++ b/pelican/tools/pelican_import.py
@@ -674,6 +674,22 @@ def download_attachments(output_path, urls):
     return locations
 
 
+def is_pandoc_needed(fields):  # NOTE(review): py3 filter() is always truthy
+    in_markup_idx = 9  # in_markup is the 10th item of each fields tuple
+    return filter(lambda f: f[in_markup_idx] in ('html', 'wp-html'), fields)
+
+
+def get_pandoc_version():
+    """Return the 2nd token of `pandoc --version` output, '' on failure."""
+    try:
+        version_text = subprocess.check_output(
+            ['pandoc', '--version'], universal_newlines=True)
+    except (subprocess.CalledProcessError, OSError) as err:
+        logger.warning("Pandoc version unknown: %s", err)
+        return ''
+    return version_text.split()[1]
+
+
 def update_links_to_attached_files(content, attachments):
     for old_url, new_path in attachments.items():
         # url may occur both with http:// and https://
@@ -689,6 +705,14 @@ def fields2pelican(
         dircat=False, strip_raw=False, disable_slugs=False,
         dirpage=False, filename_template=None, filter_author=None,
         wp_custpost=False, wp_attach=False, attachments=None):
+
+    pandoc_version = get_pandoc_version()
+
+    if is_pandoc_needed(fields) and not pandoc_version:
+        error = ('Pandoc must be installed to complete the '
+                 'requested import action.')
+        exit(error)
+
     for (title, content, filename, date, author, categories, tags, status,
             kind, in_markup) in fields:
         if filter_author and filter_author != author:
@@ -735,11 +759,17 @@ def fields2pelican(
 
                 fp.write(new_content)
 
-            parse_raw = '--parse-raw' if not strip_raw else ''
-            cmd = ('pandoc --normalize {0} --from=html'
-                   ' --to={1} -o "{2}" "{3}"')
-            cmd = cmd.format(parse_raw, out_markup,
-                             out_filename, html_filename)
+            if pandoc_version[0] == '1':
+                parse_raw = '--parse-raw' if not strip_raw else ''
+                cmd = ('pandoc --normalize {0} --from=html'
+                       ' --to={1} -o "{2}" "{3}"')
+                cmd = cmd.format(parse_raw, out_markup,
+                                 out_filename, html_filename)
+            else:
+                from_arg = '-f html+raw_html' if not strip_raw else '-f html'
+                cmd = ('pandoc {0} --to={1}-smart -o "{2}" "{3}"')
+                cmd = cmd.format(from_arg, out_markup,
+                                 out_filename, html_filename)
 
             try:
                 rc = subprocess.call(cmd, shell=True)