From 83a8059d02af772e1e094e36d8a22fa66b5030db Mon Sep 17 00:00:00 2001
From: Deniz Turgut <dturgut@gmail.com>
Date: Sat, 28 Oct 2023 15:55:02 +0300
Subject: [PATCH 1/2] Force Tumblr importer timestamps to UTC with an explicit
 offset and adjust tests

---
 pelican/tests/test_importer.py  | 18 ++++++------------
 pelican/tools/pelican_import.py | 11 +++++++----
 2 files changed, 13 insertions(+), 16 deletions(-)
diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py
index f45f885c..0d9586f0 100644
--- a/pelican/tests/test_importer.py
+++ b/pelican/tests/test_importer.py
@@ -1,12 +1,9 @@
-import datetime
 import locale
 import os
 import re
 from posixpath import join as posix_join
 from unittest.mock import patch
 
-import dateutil.tz
-
 from pelican.settings import DEFAULT_CONFIG
 from pelican.tests.support import (mute, skipIfNoExecutable, temporary_folder,
                                    unittest)
@@ -46,12 +43,9 @@ class TestWithOsDefaults(unittest.TestCase):
     def setUp(self):
         self.old_locale = locale.setlocale(locale.LC_ALL)
         locale.setlocale(locale.LC_ALL, 'C')
-        self.old_timezone = datetime.datetime.now(dateutil.tz.tzlocal()).tzname()
-        os.environ['TZ'] = 'UTC'
 
     def tearDown(self):
         locale.setlocale(locale.LC_ALL, self.old_locale)
-        os.environ['TZ'] = self.old_timezone
 
 
 @skipIfNoExecutable(['pandoc', '--version'])
@@ -502,7 +496,7 @@ class TestTumblrImporter(TestWithOsDefaults):
                 {
                     "type": "photo",
                     "blog_name": "testy",
-                    "date": "2019-11-07 21:26:40 GMT",
+                    "date": "2019-11-07 21:26:40 UTC",
                     "timestamp": 1573162000,
                     "format": "html",
                     "slug": "a-slug",
@@ -528,7 +522,7 @@ class TestTumblrImporter(TestWithOsDefaults):
         self.assertEqual(
             [('Photo',
               '<img alt="" src="https://..fccdc2360ba7182a.jpg" />\n',
-              '2019-11-07-a-slug', '2019-11-07 21:26:40', 'testy', ['photo'],
+              '2019-11-07-a-slug', '2019-11-07 21:26:40+0000', 'testy', ['photo'],
               ['economics'], 'published', 'article', 'html')],
             posts,
             posts)
@@ -544,7 +538,7 @@ class TestTumblrImporter(TestWithOsDefaults):
                     "type": "video",
                     "blog_name": "testy",
                     "slug": "the-slug",
-                    "date": "2017-07-07 20:31:41 GMT",
+                    "date": "2017-07-07 20:31:41 UTC",
                     "timestamp": 1499459501,
                     "state": "published",
                     "format": "html",
@@ -583,7 +577,7 @@ class TestTumblrImporter(TestWithOsDefaults):
               '<iframe>2</iframe>\n'
               '<iframe>3</iframe>\n',
               '2017-07-07-the-slug',
-              '2017-07-07 20:31:41', 'testy', ['video'], [], 'published',
+              '2017-07-07 20:31:41+0000', 'testy', ['video'], [], 'published',
               'article', 'html')],
             posts,
             posts)
@@ -599,7 +593,7 @@ class TestTumblrImporter(TestWithOsDefaults):
                     "type": "video",
                     "blog_name": "testy",
                     "slug": "the-slug",
-                    "date": "2016-08-14 16:37:35 GMT",
+                    "date": "2016-08-14 16:37:35 UTC",
                     "timestamp": 1471192655,
                     "state": "published",
                     "format": "html",
@@ -638,7 +632,7 @@ class TestTumblrImporter(TestWithOsDefaults):
               'v=b">via</a></p>\n<p>Caption</p>'
               '<p>(This video isn\'t available anymore.)</p>\n',
               '2016-08-14-the-slug',
-              '2016-08-14 16:37:35', 'testy', ['video'], ['interviews'],
+              '2016-08-14 16:37:35+0000', 'testy', ['video'], ['interviews'],
               'published', 'article', 'html')],
             posts,
             posts)
diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py
index 16ce6305..44568161 100755
--- a/pelican/tools/pelican_import.py
+++ b/pelican/tools/pelican_import.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 import argparse
+import datetime
 import logging
 import os
 import re
@@ -416,10 +417,12 @@ def tumblr2fields(api_key, blogname):
             slug = post.get('slug') or slugify(title, regex_subs=subs)
             tags = post.get('tags')
             timestamp = post.get('timestamp')
-            date = SafeDatetime.fromtimestamp(int(timestamp)).strftime(
-                "%Y-%m-%d %H:%M:%S")
-            slug = SafeDatetime.fromtimestamp(int(timestamp)).strftime(
-                "%Y-%m-%d-") + slug
+            date = SafeDatetime.fromtimestamp(
+                int(timestamp), tz=datetime.timezone.utc
+            ).strftime("%Y-%m-%d %H:%M:%S%z")
+            slug = SafeDatetime.fromtimestamp(
+                int(timestamp), tz=datetime.timezone.utc
+            ).strftime("%Y-%m-%d-") + slug
             format = post.get('format')
             content = post.get('body')
             type = post.get('type')

From 11c13ceae1c72bd786a1b09657de2926eb6ae267 Mon Sep 17 00:00:00 2001
From: Deniz Turgut <dturgut@gmail.com>
Date: Sat, 28 Oct 2023 16:31:05 +0300
Subject: [PATCH 2/2] Use a temporary directory for the intermediate HTML file
 given to Pandoc in the importer

---
 pelican/tools/pelican_import.py | 64 ++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 33 deletions(-)

diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py
index 44568161..95e196ba 100755
--- a/pelican/tools/pelican_import.py
+++ b/pelican/tools/pelican_import.py
@@ -7,6 +7,7 @@ import os
 import re
 import subprocess
 import sys
+import tempfile
 import time
 from collections import defaultdict
 from html import unescape
@@ -785,9 +786,8 @@ def fields2pelican(
         print(out_filename)
 
         if in_markup in ('html', 'wp-html'):
-            html_filename = os.path.join(output_path, filename + '.html')
-
-            with open(html_filename, 'w', encoding='utf-8') as fp:
+            with tempfile.TemporaryDirectory() as tmpdir:
+                html_filename = os.path.join(tmpdir, 'pandoc-input.html')
                 # Replace newlines with paragraphs wrapped with <p> so
                 # HTML is valid before conversion
                 if in_markup == 'wp-html':
@@ -796,41 +796,39 @@ def fields2pelican(
                     paragraphs = content.splitlines()
                     paragraphs = ['<p>{}</p>'.format(p) for p in paragraphs]
                     new_content = ''.join(paragraphs)
+                with open(html_filename, 'w', encoding='utf-8') as fp:
+                    fp.write(new_content)
 
-                fp.write(new_content)
+                if pandoc_version < (2,):
+                    parse_raw = '--parse-raw' if not strip_raw else ''
+                    wrap_none = '--wrap=none' \
+                        if pandoc_version >= (1, 16) else '--no-wrap'
+                    cmd = ('pandoc --normalize {0} --from=html'
+                           ' --to={1} {2} -o "{3}" "{4}"')
+                    cmd = cmd.format(parse_raw,
+                                     out_markup if out_markup != 'markdown' else "gfm",
+                                     wrap_none,
+                                     out_filename, html_filename)
+                else:
+                    from_arg = '-f html+raw_html' if not strip_raw else '-f html'
+                    cmd = ('pandoc {0} --to={1}-smart --wrap=none -o "{2}" "{3}"')
+                    cmd = cmd.format(from_arg,
+                                     out_markup if out_markup != 'markdown' else "gfm",
+                                     out_filename, html_filename)
 
-            if pandoc_version < (2,):
-                parse_raw = '--parse-raw' if not strip_raw else ''
-                wrap_none = '--wrap=none' \
-                    if pandoc_version >= (1, 16) else '--no-wrap'
-                cmd = ('pandoc --normalize {0} --from=html'
-                       ' --to={1} {2} -o "{3}" "{4}"')
-                cmd = cmd.format(parse_raw,
-                                 out_markup if out_markup != 'markdown' else "gfm",
-                                 wrap_none,
-                                 out_filename, html_filename)
-            else:
-                from_arg = '-f html+raw_html' if not strip_raw else '-f html'
-                cmd = ('pandoc {0} --to={1}-smart --wrap=none -o "{2}" "{3}"')
-                cmd = cmd.format(from_arg,
-                                 out_markup if out_markup != 'markdown' else "gfm",
-                                 out_filename, html_filename)
+                try:
+                    rc = subprocess.call(cmd, shell=True)
+                    if rc < 0:
+                        error = 'Child was terminated by signal %d' % -rc
+                        exit(error)
 
-            try:
-                rc = subprocess.call(cmd, shell=True)
-                if rc < 0:
-                    error = 'Child was terminated by signal %d' % -rc
+                    elif rc > 0:
+                        error = 'Please, check your Pandoc installation.'
+                        exit(error)
+                except OSError as e:
+                    error = 'Pandoc execution failed: %s' % e
                     exit(error)
 
-                elif rc > 0:
-                    error = 'Please, check your Pandoc installation.'
-                    exit(error)
-            except OSError as e:
-                error = 'Pandoc execution failed: %s' % e
-                exit(error)
-
-            os.remove(html_filename)
-
             with open(out_filename, encoding='utf-8') as fs:
                 content = fs.read()
                 if out_markup == 'markdown':