Merge pull request #3221 from avaris/importer-fixes

This commit is contained in:
Justin Mayer 2023-10-28 16:10:47 +02:00 committed by GitHub
commit 6ed7395812
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 47 additions and 52 deletions

View file

@ -1,12 +1,9 @@
import datetime
import locale import locale
import os import os
import re import re
from posixpath import join as posix_join from posixpath import join as posix_join
from unittest.mock import patch from unittest.mock import patch
import dateutil.tz
from pelican.settings import DEFAULT_CONFIG from pelican.settings import DEFAULT_CONFIG
from pelican.tests.support import (mute, skipIfNoExecutable, temporary_folder, from pelican.tests.support import (mute, skipIfNoExecutable, temporary_folder,
unittest) unittest)
@ -46,12 +43,9 @@ class TestWithOsDefaults(unittest.TestCase):
def setUp(self): def setUp(self):
self.old_locale = locale.setlocale(locale.LC_ALL) self.old_locale = locale.setlocale(locale.LC_ALL)
locale.setlocale(locale.LC_ALL, 'C') locale.setlocale(locale.LC_ALL, 'C')
self.old_timezone = datetime.datetime.now(dateutil.tz.tzlocal()).tzname()
os.environ['TZ'] = 'UTC'
def tearDown(self): def tearDown(self):
locale.setlocale(locale.LC_ALL, self.old_locale) locale.setlocale(locale.LC_ALL, self.old_locale)
os.environ['TZ'] = self.old_timezone
@skipIfNoExecutable(['pandoc', '--version']) @skipIfNoExecutable(['pandoc', '--version'])
@ -502,7 +496,7 @@ class TestTumblrImporter(TestWithOsDefaults):
{ {
"type": "photo", "type": "photo",
"blog_name": "testy", "blog_name": "testy",
"date": "2019-11-07 21:26:40 GMT", "date": "2019-11-07 21:26:40 UTC",
"timestamp": 1573162000, "timestamp": 1573162000,
"format": "html", "format": "html",
"slug": "a-slug", "slug": "a-slug",
@ -528,7 +522,7 @@ class TestTumblrImporter(TestWithOsDefaults):
self.assertEqual( self.assertEqual(
[('Photo', [('Photo',
'<img alt="" src="https://..fccdc2360ba7182a.jpg" />\n', '<img alt="" src="https://..fccdc2360ba7182a.jpg" />\n',
'2019-11-07-a-slug', '2019-11-07 21:26:40', 'testy', ['photo'], '2019-11-07-a-slug', '2019-11-07 21:26:40+0000', 'testy', ['photo'],
['economics'], 'published', 'article', 'html')], ['economics'], 'published', 'article', 'html')],
posts, posts,
posts) posts)
@ -544,7 +538,7 @@ class TestTumblrImporter(TestWithOsDefaults):
"type": "video", "type": "video",
"blog_name": "testy", "blog_name": "testy",
"slug": "the-slug", "slug": "the-slug",
"date": "2017-07-07 20:31:41 GMT", "date": "2017-07-07 20:31:41 UTC",
"timestamp": 1499459501, "timestamp": 1499459501,
"state": "published", "state": "published",
"format": "html", "format": "html",
@ -583,7 +577,7 @@ class TestTumblrImporter(TestWithOsDefaults):
'<iframe>2</iframe>\n' '<iframe>2</iframe>\n'
'<iframe>3</iframe>\n', '<iframe>3</iframe>\n',
'2017-07-07-the-slug', '2017-07-07-the-slug',
'2017-07-07 20:31:41', 'testy', ['video'], [], 'published', '2017-07-07 20:31:41+0000', 'testy', ['video'], [], 'published',
'article', 'html')], 'article', 'html')],
posts, posts,
posts) posts)
@ -599,7 +593,7 @@ class TestTumblrImporter(TestWithOsDefaults):
"type": "video", "type": "video",
"blog_name": "testy", "blog_name": "testy",
"slug": "the-slug", "slug": "the-slug",
"date": "2016-08-14 16:37:35 GMT", "date": "2016-08-14 16:37:35 UTC",
"timestamp": 1471192655, "timestamp": 1471192655,
"state": "published", "state": "published",
"format": "html", "format": "html",
@ -638,7 +632,7 @@ class TestTumblrImporter(TestWithOsDefaults):
'v=b">via</a></p>\n<p>Caption</p>' 'v=b">via</a></p>\n<p>Caption</p>'
'<p>(This video isn\'t available anymore.)</p>\n', '<p>(This video isn\'t available anymore.)</p>\n',
'2016-08-14-the-slug', '2016-08-14-the-slug',
'2016-08-14 16:37:35', 'testy', ['video'], ['interviews'], '2016-08-14 16:37:35+0000', 'testy', ['video'], ['interviews'],
'published', 'article', 'html')], 'published', 'article', 'html')],
posts, posts,
posts) posts)

View file

@ -1,11 +1,13 @@
#!/usr/bin/env python #!/usr/bin/env python
import argparse import argparse
import datetime
import logging import logging
import os import os
import re import re
import subprocess import subprocess
import sys import sys
import tempfile
import time import time
from collections import defaultdict from collections import defaultdict
from html import unescape from html import unescape
@ -416,10 +418,12 @@ def tumblr2fields(api_key, blogname):
slug = post.get('slug') or slugify(title, regex_subs=subs) slug = post.get('slug') or slugify(title, regex_subs=subs)
tags = post.get('tags') tags = post.get('tags')
timestamp = post.get('timestamp') timestamp = post.get('timestamp')
date = SafeDatetime.fromtimestamp(int(timestamp)).strftime( date = SafeDatetime.fromtimestamp(
"%Y-%m-%d %H:%M:%S") int(timestamp), tz=datetime.timezone.utc
slug = SafeDatetime.fromtimestamp(int(timestamp)).strftime( ).strftime("%Y-%m-%d %H:%M:%S%z")
"%Y-%m-%d-") + slug slug = SafeDatetime.fromtimestamp(
int(timestamp), tz=datetime.timezone.utc
).strftime("%Y-%m-%d-") + slug
format = post.get('format') format = post.get('format')
content = post.get('body') content = post.get('body')
type = post.get('type') type = post.get('type')
@ -782,9 +786,8 @@ def fields2pelican(
print(out_filename) print(out_filename)
if in_markup in ('html', 'wp-html'): if in_markup in ('html', 'wp-html'):
html_filename = os.path.join(output_path, filename + '.html') with tempfile.TemporaryDirectory() as tmpdir:
html_filename = os.path.join(tmpdir, 'pandoc-input.html')
with open(html_filename, 'w', encoding='utf-8') as fp:
# Replace newlines with paragraphs wrapped with <p> so # Replace newlines with paragraphs wrapped with <p> so
# HTML is valid before conversion # HTML is valid before conversion
if in_markup == 'wp-html': if in_markup == 'wp-html':
@ -793,41 +796,39 @@ def fields2pelican(
paragraphs = content.splitlines() paragraphs = content.splitlines()
paragraphs = ['<p>{}</p>'.format(p) for p in paragraphs] paragraphs = ['<p>{}</p>'.format(p) for p in paragraphs]
new_content = ''.join(paragraphs) new_content = ''.join(paragraphs)
with open(html_filename, 'w', encoding='utf-8') as fp:
fp.write(new_content)
fp.write(new_content) if pandoc_version < (2,):
parse_raw = '--parse-raw' if not strip_raw else ''
wrap_none = '--wrap=none' \
if pandoc_version >= (1, 16) else '--no-wrap'
cmd = ('pandoc --normalize {0} --from=html'
' --to={1} {2} -o "{3}" "{4}"')
cmd = cmd.format(parse_raw,
out_markup if out_markup != 'markdown' else "gfm",
wrap_none,
out_filename, html_filename)
else:
from_arg = '-f html+raw_html' if not strip_raw else '-f html'
cmd = ('pandoc {0} --to={1}-smart --wrap=none -o "{2}" "{3}"')
cmd = cmd.format(from_arg,
out_markup if out_markup != 'markdown' else "gfm",
out_filename, html_filename)
if pandoc_version < (2,): try:
parse_raw = '--parse-raw' if not strip_raw else '' rc = subprocess.call(cmd, shell=True)
wrap_none = '--wrap=none' \ if rc < 0:
if pandoc_version >= (1, 16) else '--no-wrap' error = 'Child was terminated by signal %d' % -rc
cmd = ('pandoc --normalize {0} --from=html' exit(error)
' --to={1} {2} -o "{3}" "{4}"')
cmd = cmd.format(parse_raw,
out_markup if out_markup != 'markdown' else "gfm",
wrap_none,
out_filename, html_filename)
else:
from_arg = '-f html+raw_html' if not strip_raw else '-f html'
cmd = ('pandoc {0} --to={1}-smart --wrap=none -o "{2}" "{3}"')
cmd = cmd.format(from_arg,
out_markup if out_markup != 'markdown' else "gfm",
out_filename, html_filename)
try: elif rc > 0:
rc = subprocess.call(cmd, shell=True) error = 'Please, check your Pandoc installation.'
if rc < 0: exit(error)
error = 'Child was terminated by signal %d' % -rc except OSError as e:
error = 'Pandoc execution failed: %s' % e
exit(error) exit(error)
elif rc > 0:
error = 'Please, check your Pandoc installation.'
exit(error)
except OSError as e:
error = 'Pandoc execution failed: %s' % e
exit(error)
os.remove(html_filename)
with open(out_filename, encoding='utf-8') as fs: with open(out_filename, encoding='utf-8') as fs:
content = fs.read() content = fs.read()
if out_markup == 'markdown': if out_markup == 'markdown':