From 219c01afb07e41020224e5bd92ecb48ba6405000 Mon Sep 17 00:00:00 2001
From: "Martin (mart-e)" <me@mart-e.be>
Date: Wed, 29 Mar 2023 13:39:27 +0200
Subject: [PATCH] [IMP] pelican_import with gfm instead of markdown

The markdown output of pandoc is its own flavour of markdown. For
instance, it uses fenced divs[1], which are not supported by
python-markdown. When importing content from WordPress, there are
several issues, as explained in discussion 3113[2].
This change follows a discussion with a pandoc developer[3].

[1] https://pandoc.org/MANUAL.html#divs-and-spans
[2] https://github.com/getpelican/pelican/discussions/3113
[3] https://fosstodon.org/@pandoc/110105559949588768

Take the following WordPress blog post sample:
```html
<p><!-- wp:paragraph --></p>
<p>Paragraph content</p>
<p><!-- /wp:paragraph --></p>
<p><!-- wp:image {"align":"center","id":3747,"sizeSlug":"full"} --></p>
<div class="wp-block-image">
<figure class="aligncenter size-full"><img src="https://test.com/test.jpg" alt="" class="wp-image-3747" title="Some title"/><br />
<figcaption><em>Some caption</em></figcaption>
</figure>
</div>
<p><!-- /wp:image --></p>
```
Before this commit, it was imported as:

```md
`<!-- wp:paragraph -->`{=html}

Paragraph content

`<!-- /wp:paragraph -->`{=html}

`<!-- wp:image {"align":"center","id":3747,"sizeSlug":"full"} -->`{=html}

::: wp-block-image
<figure class="aligncenter size-full">
<img src="https://test.com/test.jpg" title="Some title"
class="wp-image-3747" /><br />

<figcaption><em>Some caption</em></figcaption>
</figure>
:::

`<!-- /wp:image -->`{=html}
```

After this change:
```md
<!-- wp:paragraph -->

Paragraph content

<!-- /wp:paragraph -->

<!-- wp:image {"align":"center","id":3747,"sizeSlug":"full"} -->

<div class="wp-block-image">

<figure class="aligncenter size-full">
<img src="https://test.com/test.jpg" title="Some title"
class="wp-image-3747" /><br />

<figcaption><em>Some caption</em></figcaption>
</figure>

</div>

<!-- /wp:image -->
```

Fixes #3113
---
 pelican/tests/test_importer.py  | 2 +-
 pelican/tools/pelican_import.py | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py
index 76feb9ce..198ee0fe 100644
--- a/pelican/tests/test_importer.py
+++ b/pelican/tests/test_importer.py
@@ -317,7 +317,7 @@ class TestWordpressXmlImporter(unittest.TestCase):
             self.posts)
         with temporary_folder() as temp:
             md = [r(f) for f in silent_f2p(test_post, 'markdown', temp)][0]
-            sample_line = re.search(r'-   This is a code sample', md).group(0)
+            sample_line = re.search(r'- This is a code sample', md).group(0)
             code_line = re.search(r'\s+a = \[1, 2, 3\]', md).group(0)
             self.assertTrue(sample_line.rindex('This') < code_line.rindex('a'))
 
diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py
index 5b08b6b5..f8a6c631 100755
--- a/pelican/tools/pelican_import.py
+++ b/pelican/tools/pelican_import.py
@@ -839,12 +839,15 @@ def fields2pelican(
                     if pandoc_version >= (1, 16) else '--no-wrap'
                 cmd = ('pandoc --normalize {0} --from=html'
                        ' --to={1} {2} -o "{3}" "{4}"')
-                cmd = cmd.format(parse_raw, out_markup, wrap_none,
+                cmd = cmd.format(parse_raw,
+                                 out_markup if out_markup != 'markdown' else "gfm",
+                                 wrap_none,
                                  out_filename, html_filename)
             else:
                 from_arg = '-f html+raw_html' if not strip_raw else '-f html'
                 cmd = ('pandoc {0} --to={1}-smart --wrap=none -o "{2}" "{3}"')
-                cmd = cmd.format(from_arg, out_markup,
+                cmd = cmd.format(from_arg,
+                                 out_markup if out_markup != 'markdown' else "gfm",
                                  out_filename, html_filename)
 
             try: