mirror of
https://github.com/getpelican/pelican.git
synced 2025-10-15 20:28:56 +02:00
Merge pull request #3221 from avaris/importer-fixes
This commit is contained in:
commit
6ed7395812
2 changed files with 47 additions and 52 deletions
|
|
@ -1,12 +1,9 @@
|
||||||
import datetime
|
|
||||||
import locale
|
import locale
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from posixpath import join as posix_join
|
from posixpath import join as posix_join
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
import dateutil.tz
|
|
||||||
|
|
||||||
from pelican.settings import DEFAULT_CONFIG
|
from pelican.settings import DEFAULT_CONFIG
|
||||||
from pelican.tests.support import (mute, skipIfNoExecutable, temporary_folder,
|
from pelican.tests.support import (mute, skipIfNoExecutable, temporary_folder,
|
||||||
unittest)
|
unittest)
|
||||||
|
|
@ -46,12 +43,9 @@ class TestWithOsDefaults(unittest.TestCase):
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.old_locale = locale.setlocale(locale.LC_ALL)
|
self.old_locale = locale.setlocale(locale.LC_ALL)
|
||||||
locale.setlocale(locale.LC_ALL, 'C')
|
locale.setlocale(locale.LC_ALL, 'C')
|
||||||
self.old_timezone = datetime.datetime.now(dateutil.tz.tzlocal()).tzname()
|
|
||||||
os.environ['TZ'] = 'UTC'
|
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
locale.setlocale(locale.LC_ALL, self.old_locale)
|
locale.setlocale(locale.LC_ALL, self.old_locale)
|
||||||
os.environ['TZ'] = self.old_timezone
|
|
||||||
|
|
||||||
|
|
||||||
@skipIfNoExecutable(['pandoc', '--version'])
|
@skipIfNoExecutable(['pandoc', '--version'])
|
||||||
|
|
@ -502,7 +496,7 @@ class TestTumblrImporter(TestWithOsDefaults):
|
||||||
{
|
{
|
||||||
"type": "photo",
|
"type": "photo",
|
||||||
"blog_name": "testy",
|
"blog_name": "testy",
|
||||||
"date": "2019-11-07 21:26:40 GMT",
|
"date": "2019-11-07 21:26:40 UTC",
|
||||||
"timestamp": 1573162000,
|
"timestamp": 1573162000,
|
||||||
"format": "html",
|
"format": "html",
|
||||||
"slug": "a-slug",
|
"slug": "a-slug",
|
||||||
|
|
@ -528,7 +522,7 @@ class TestTumblrImporter(TestWithOsDefaults):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
[('Photo',
|
[('Photo',
|
||||||
'<img alt="" src="https://..fccdc2360ba7182a.jpg" />\n',
|
'<img alt="" src="https://..fccdc2360ba7182a.jpg" />\n',
|
||||||
'2019-11-07-a-slug', '2019-11-07 21:26:40', 'testy', ['photo'],
|
'2019-11-07-a-slug', '2019-11-07 21:26:40+0000', 'testy', ['photo'],
|
||||||
['economics'], 'published', 'article', 'html')],
|
['economics'], 'published', 'article', 'html')],
|
||||||
posts,
|
posts,
|
||||||
posts)
|
posts)
|
||||||
|
|
@ -544,7 +538,7 @@ class TestTumblrImporter(TestWithOsDefaults):
|
||||||
"type": "video",
|
"type": "video",
|
||||||
"blog_name": "testy",
|
"blog_name": "testy",
|
||||||
"slug": "the-slug",
|
"slug": "the-slug",
|
||||||
"date": "2017-07-07 20:31:41 GMT",
|
"date": "2017-07-07 20:31:41 UTC",
|
||||||
"timestamp": 1499459501,
|
"timestamp": 1499459501,
|
||||||
"state": "published",
|
"state": "published",
|
||||||
"format": "html",
|
"format": "html",
|
||||||
|
|
@ -583,7 +577,7 @@ class TestTumblrImporter(TestWithOsDefaults):
|
||||||
'<iframe>2</iframe>\n'
|
'<iframe>2</iframe>\n'
|
||||||
'<iframe>3</iframe>\n',
|
'<iframe>3</iframe>\n',
|
||||||
'2017-07-07-the-slug',
|
'2017-07-07-the-slug',
|
||||||
'2017-07-07 20:31:41', 'testy', ['video'], [], 'published',
|
'2017-07-07 20:31:41+0000', 'testy', ['video'], [], 'published',
|
||||||
'article', 'html')],
|
'article', 'html')],
|
||||||
posts,
|
posts,
|
||||||
posts)
|
posts)
|
||||||
|
|
@ -599,7 +593,7 @@ class TestTumblrImporter(TestWithOsDefaults):
|
||||||
"type": "video",
|
"type": "video",
|
||||||
"blog_name": "testy",
|
"blog_name": "testy",
|
||||||
"slug": "the-slug",
|
"slug": "the-slug",
|
||||||
"date": "2016-08-14 16:37:35 GMT",
|
"date": "2016-08-14 16:37:35 UTC",
|
||||||
"timestamp": 1471192655,
|
"timestamp": 1471192655,
|
||||||
"state": "published",
|
"state": "published",
|
||||||
"format": "html",
|
"format": "html",
|
||||||
|
|
@ -638,7 +632,7 @@ class TestTumblrImporter(TestWithOsDefaults):
|
||||||
'v=b">via</a></p>\n<p>Caption</p>'
|
'v=b">via</a></p>\n<p>Caption</p>'
|
||||||
'<p>(This video isn\'t available anymore.)</p>\n',
|
'<p>(This video isn\'t available anymore.)</p>\n',
|
||||||
'2016-08-14-the-slug',
|
'2016-08-14-the-slug',
|
||||||
'2016-08-14 16:37:35', 'testy', ['video'], ['interviews'],
|
'2016-08-14 16:37:35+0000', 'testy', ['video'], ['interviews'],
|
||||||
'published', 'article', 'html')],
|
'published', 'article', 'html')],
|
||||||
posts,
|
posts,
|
||||||
posts)
|
posts)
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,13 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
import tempfile
|
||||||
import time
|
import time
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from html import unescape
|
from html import unescape
|
||||||
|
|
@ -416,10 +418,12 @@ def tumblr2fields(api_key, blogname):
|
||||||
slug = post.get('slug') or slugify(title, regex_subs=subs)
|
slug = post.get('slug') or slugify(title, regex_subs=subs)
|
||||||
tags = post.get('tags')
|
tags = post.get('tags')
|
||||||
timestamp = post.get('timestamp')
|
timestamp = post.get('timestamp')
|
||||||
date = SafeDatetime.fromtimestamp(int(timestamp)).strftime(
|
date = SafeDatetime.fromtimestamp(
|
||||||
"%Y-%m-%d %H:%M:%S")
|
int(timestamp), tz=datetime.timezone.utc
|
||||||
slug = SafeDatetime.fromtimestamp(int(timestamp)).strftime(
|
).strftime("%Y-%m-%d %H:%M:%S%z")
|
||||||
"%Y-%m-%d-") + slug
|
slug = SafeDatetime.fromtimestamp(
|
||||||
|
int(timestamp), tz=datetime.timezone.utc
|
||||||
|
).strftime("%Y-%m-%d-") + slug
|
||||||
format = post.get('format')
|
format = post.get('format')
|
||||||
content = post.get('body')
|
content = post.get('body')
|
||||||
type = post.get('type')
|
type = post.get('type')
|
||||||
|
|
@ -782,9 +786,8 @@ def fields2pelican(
|
||||||
print(out_filename)
|
print(out_filename)
|
||||||
|
|
||||||
if in_markup in ('html', 'wp-html'):
|
if in_markup in ('html', 'wp-html'):
|
||||||
html_filename = os.path.join(output_path, filename + '.html')
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
html_filename = os.path.join(tmpdir, 'pandoc-input.html')
|
||||||
with open(html_filename, 'w', encoding='utf-8') as fp:
|
|
||||||
# Replace newlines with paragraphs wrapped with <p> so
|
# Replace newlines with paragraphs wrapped with <p> so
|
||||||
# HTML is valid before conversion
|
# HTML is valid before conversion
|
||||||
if in_markup == 'wp-html':
|
if in_markup == 'wp-html':
|
||||||
|
|
@ -793,41 +796,39 @@ def fields2pelican(
|
||||||
paragraphs = content.splitlines()
|
paragraphs = content.splitlines()
|
||||||
paragraphs = ['<p>{}</p>'.format(p) for p in paragraphs]
|
paragraphs = ['<p>{}</p>'.format(p) for p in paragraphs]
|
||||||
new_content = ''.join(paragraphs)
|
new_content = ''.join(paragraphs)
|
||||||
|
with open(html_filename, 'w', encoding='utf-8') as fp:
|
||||||
|
fp.write(new_content)
|
||||||
|
|
||||||
fp.write(new_content)
|
if pandoc_version < (2,):
|
||||||
|
parse_raw = '--parse-raw' if not strip_raw else ''
|
||||||
|
wrap_none = '--wrap=none' \
|
||||||
|
if pandoc_version >= (1, 16) else '--no-wrap'
|
||||||
|
cmd = ('pandoc --normalize {0} --from=html'
|
||||||
|
' --to={1} {2} -o "{3}" "{4}"')
|
||||||
|
cmd = cmd.format(parse_raw,
|
||||||
|
out_markup if out_markup != 'markdown' else "gfm",
|
||||||
|
wrap_none,
|
||||||
|
out_filename, html_filename)
|
||||||
|
else:
|
||||||
|
from_arg = '-f html+raw_html' if not strip_raw else '-f html'
|
||||||
|
cmd = ('pandoc {0} --to={1}-smart --wrap=none -o "{2}" "{3}"')
|
||||||
|
cmd = cmd.format(from_arg,
|
||||||
|
out_markup if out_markup != 'markdown' else "gfm",
|
||||||
|
out_filename, html_filename)
|
||||||
|
|
||||||
if pandoc_version < (2,):
|
try:
|
||||||
parse_raw = '--parse-raw' if not strip_raw else ''
|
rc = subprocess.call(cmd, shell=True)
|
||||||
wrap_none = '--wrap=none' \
|
if rc < 0:
|
||||||
if pandoc_version >= (1, 16) else '--no-wrap'
|
error = 'Child was terminated by signal %d' % -rc
|
||||||
cmd = ('pandoc --normalize {0} --from=html'
|
exit(error)
|
||||||
' --to={1} {2} -o "{3}" "{4}"')
|
|
||||||
cmd = cmd.format(parse_raw,
|
|
||||||
out_markup if out_markup != 'markdown' else "gfm",
|
|
||||||
wrap_none,
|
|
||||||
out_filename, html_filename)
|
|
||||||
else:
|
|
||||||
from_arg = '-f html+raw_html' if not strip_raw else '-f html'
|
|
||||||
cmd = ('pandoc {0} --to={1}-smart --wrap=none -o "{2}" "{3}"')
|
|
||||||
cmd = cmd.format(from_arg,
|
|
||||||
out_markup if out_markup != 'markdown' else "gfm",
|
|
||||||
out_filename, html_filename)
|
|
||||||
|
|
||||||
try:
|
elif rc > 0:
|
||||||
rc = subprocess.call(cmd, shell=True)
|
error = 'Please, check your Pandoc installation.'
|
||||||
if rc < 0:
|
exit(error)
|
||||||
error = 'Child was terminated by signal %d' % -rc
|
except OSError as e:
|
||||||
|
error = 'Pandoc execution failed: %s' % e
|
||||||
exit(error)
|
exit(error)
|
||||||
|
|
||||||
elif rc > 0:
|
|
||||||
error = 'Please, check your Pandoc installation.'
|
|
||||||
exit(error)
|
|
||||||
except OSError as e:
|
|
||||||
error = 'Pandoc execution failed: %s' % e
|
|
||||||
exit(error)
|
|
||||||
|
|
||||||
os.remove(html_filename)
|
|
||||||
|
|
||||||
with open(out_filename, encoding='utf-8') as fs:
|
with open(out_filename, encoding='utf-8') as fs:
|
||||||
content = fs.read()
|
content = fs.read()
|
||||||
if out_markup == 'markdown':
|
if out_markup == 'markdown':
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue