From 219c01afb07e41020224e5bd92ecb48ba6405000 Mon Sep 17 00:00:00 2001 From: "Martin (mart-e)" Date: Wed, 29 Mar 2023 13:39:27 +0200 Subject: [PATCH] [IMP] pelican_import with gfm instead of markdown The markdown import of pandoc is their own flavour of markdown. It for instance uses fenced divs[1], which are not supported by python-markdown. When importing content from Wordpress, there are several issues, as explained in discussion 3113[2]. This change follows a discussion with a pandoc developer[3]. [1] https://pandoc.org/MANUAL.html#divs-and-spans [2] https://github.com/getpelican/pelican/discussions/3113 [3] https://fosstodon.org/@pandoc/110105559949588768 Take the following Wordpress blog post sample: ```html

Paragraph content


Some caption

``` Before this commit, it was imported as: ```md ``{=html} Paragraph content ``{=html} ``{=html} ::: wp-block-image

Some caption
::: ``{=html} ``` After this change: ```md Paragraph content

Some caption
``` Fixes #3113 --- pelican/tests/test_importer.py | 2 +- pelican/tools/pelican_import.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py index 76feb9ce..198ee0fe 100644 --- a/pelican/tests/test_importer.py +++ b/pelican/tests/test_importer.py @@ -317,7 +317,7 @@ class TestWordpressXmlImporter(unittest.TestCase): self.posts) with temporary_folder() as temp: md = [r(f) for f in silent_f2p(test_post, 'markdown', temp)][0] - sample_line = re.search(r'- This is a code sample', md).group(0) + sample_line = re.search(r'- This is a code sample', md).group(0) code_line = re.search(r'\s+a = \[1, 2, 3\]', md).group(0) self.assertTrue(sample_line.rindex('This') < code_line.rindex('a')) diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index 5b08b6b5..f8a6c631 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -839,12 +839,15 @@ def fields2pelican( if pandoc_version >= (1, 16) else '--no-wrap' cmd = ('pandoc --normalize {0} --from=html' ' --to={1} {2} -o "{3}" "{4}"') - cmd = cmd.format(parse_raw, out_markup, wrap_none, + cmd = cmd.format(parse_raw, + out_markup if out_markup != 'markdown' else "gfm", + wrap_none, out_filename, html_filename) else: from_arg = '-f html+raw_html' if not strip_raw else '-f html' cmd = ('pandoc {0} --to={1}-smart --wrap=none -o "{2}" "{3}"') - cmd = cmd.format(from_arg, out_markup, + cmd = cmd.format(from_arg, + out_markup if out_markup != 'markdown' else "gfm", out_filename, html_filename) try: