Use a tempfile for the intermediate HTML file for pandoc in the importer

This commit is contained in:
Deniz Turgut 2023-10-28 16:31:05 +03:00
commit 11c13ceae1
No known key found for this signature in database
GPG key ID: 87B7168D7AB3ED2F

View file

@ -7,6 +7,7 @@ import os
import re import re
import subprocess import subprocess
import sys import sys
import tempfile
import time import time
from collections import defaultdict from collections import defaultdict
from html import unescape from html import unescape
@ -785,9 +786,8 @@ def fields2pelican(
print(out_filename) print(out_filename)
if in_markup in ('html', 'wp-html'): if in_markup in ('html', 'wp-html'):
html_filename = os.path.join(output_path, filename + '.html') with tempfile.TemporaryDirectory() as tmpdir:
html_filename = os.path.join(tmpdir, 'pandoc-input.html')
with open(html_filename, 'w', encoding='utf-8') as fp:
# Replace newlines with paragraphs wrapped with <p> so # Replace newlines with paragraphs wrapped with <p> so
# HTML is valid before conversion # HTML is valid before conversion
if in_markup == 'wp-html': if in_markup == 'wp-html':
@ -796,41 +796,39 @@ def fields2pelican(
paragraphs = content.splitlines() paragraphs = content.splitlines()
paragraphs = ['<p>{}</p>'.format(p) for p in paragraphs] paragraphs = ['<p>{}</p>'.format(p) for p in paragraphs]
new_content = ''.join(paragraphs) new_content = ''.join(paragraphs)
with open(html_filename, 'w', encoding='utf-8') as fp:
fp.write(new_content)
fp.write(new_content) if pandoc_version < (2,):
parse_raw = '--parse-raw' if not strip_raw else ''
wrap_none = '--wrap=none' \
if pandoc_version >= (1, 16) else '--no-wrap'
cmd = ('pandoc --normalize {0} --from=html'
' --to={1} {2} -o "{3}" "{4}"')
cmd = cmd.format(parse_raw,
out_markup if out_markup != 'markdown' else "gfm",
wrap_none,
out_filename, html_filename)
else:
from_arg = '-f html+raw_html' if not strip_raw else '-f html'
cmd = ('pandoc {0} --to={1}-smart --wrap=none -o "{2}" "{3}"')
cmd = cmd.format(from_arg,
out_markup if out_markup != 'markdown' else "gfm",
out_filename, html_filename)
if pandoc_version < (2,): try:
parse_raw = '--parse-raw' if not strip_raw else '' rc = subprocess.call(cmd, shell=True)
wrap_none = '--wrap=none' \ if rc < 0:
if pandoc_version >= (1, 16) else '--no-wrap' error = 'Child was terminated by signal %d' % -rc
cmd = ('pandoc --normalize {0} --from=html' exit(error)
' --to={1} {2} -o "{3}" "{4}"')
cmd = cmd.format(parse_raw,
out_markup if out_markup != 'markdown' else "gfm",
wrap_none,
out_filename, html_filename)
else:
from_arg = '-f html+raw_html' if not strip_raw else '-f html'
cmd = ('pandoc {0} --to={1}-smart --wrap=none -o "{2}" "{3}"')
cmd = cmd.format(from_arg,
out_markup if out_markup != 'markdown' else "gfm",
out_filename, html_filename)
try: elif rc > 0:
rc = subprocess.call(cmd, shell=True) error = 'Please, check your Pandoc installation.'
if rc < 0: exit(error)
error = 'Child was terminated by signal %d' % -rc except OSError as e:
error = 'Pandoc execution failed: %s' % e
exit(error) exit(error)
elif rc > 0:
error = 'Please, check your Pandoc installation.'
exit(error)
except OSError as e:
error = 'Pandoc execution failed: %s' % e
exit(error)
os.remove(html_filename)
with open(out_filename, encoding='utf-8') as fs: with open(out_filename, encoding='utf-8') as fs:
content = fs.read() content = fs.read()
if out_markup == 'markdown': if out_markup == 'markdown':