mirror of
https://github.com/getpelican/pelican.git
synced 2025-10-15 20:28:56 +02:00
Merge pull request #3264 from boxydog/medium_importer
This commit is contained in:
commit
ff35d26cbc
7 changed files with 357 additions and 21 deletions
|
|
@ -439,8 +439,8 @@ For **Markdown**, one must rely on an extension. For example, using the `mdx_inc
|
||||||
Importing an existing site
|
Importing an existing site
|
||||||
==========================
|
==========================
|
||||||
|
|
||||||
It is possible to import your site from WordPress, Tumblr, Dotclear, and RSS
|
It is possible to import your site from several other blogging sites
|
||||||
feeds using a simple script. See :ref:`import`.
|
(such as WordPress, Tumblr, etc.) using a simple script. See :ref:`import`.
|
||||||
|
|
||||||
Translations
|
Translations
|
||||||
============
|
============
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ software to reStructuredText or Markdown. The supported import formats are:
|
||||||
|
|
||||||
- Blogger XML export
|
- Blogger XML export
|
||||||
- Dotclear export
|
- Dotclear export
|
||||||
|
- Medium export
|
||||||
- Tumblr API
|
- Tumblr API
|
||||||
- WordPress XML export
|
- WordPress XML export
|
||||||
- RSS/Atom feed
|
- RSS/Atom feed
|
||||||
|
|
@ -71,6 +72,7 @@ Optional arguments
|
||||||
-h, --help Show this help message and exit
|
-h, --help Show this help message and exit
|
||||||
--blogger Blogger XML export (default: False)
|
--blogger Blogger XML export (default: False)
|
||||||
--dotclear Dotclear export (default: False)
|
--dotclear Dotclear export (default: False)
|
||||||
|
--medium Medium export (default: False)
|
||||||
--tumblr Tumblr API (default: False)
|
--tumblr Tumblr API (default: False)
|
||||||
--wpfile WordPress XML export (default: False)
|
--wpfile WordPress XML export (default: False)
|
||||||
--feed Feed to parse (default: False)
|
--feed Feed to parse (default: False)
|
||||||
|
|
@ -86,8 +88,7 @@ Optional arguments
|
||||||
(default: False)
|
(default: False)
|
||||||
--filter-author Import only post from the specified author
|
--filter-author Import only post from the specified author
|
||||||
--strip-raw Strip raw HTML code that can't be converted to markup
|
--strip-raw Strip raw HTML code that can't be converted to markup
|
||||||
such as flash embeds or iframes (wordpress import
|
such as flash embeds or iframes (default: False)
|
||||||
only) (default: False)
|
|
||||||
--wp-custpost Put wordpress custom post types in directories. If
|
--wp-custpost Put wordpress custom post types in directories. If
|
||||||
used with --dir-cat option directories will be created
|
used with --dir-cat option directories will be created
|
||||||
as "/post_type/category/" (wordpress import only)
|
as "/post_type/category/" (wordpress import only)
|
||||||
|
|
@ -119,6 +120,14 @@ For Dotclear::
|
||||||
|
|
||||||
$ pelican-import --dotclear -o ~/output ~/backup.txt
|
$ pelican-import --dotclear -o ~/output ~/backup.txt
|
||||||
|
|
||||||
|
For Medium::
|
||||||
|
|
||||||
|
$ pelican-import --medium -o ~/output ~/medium-export/posts/
|
||||||
|
|
||||||
|
The Medium export is a zip file. Unzip it, and point this tool to the
|
||||||
|
"posts" subdirectory. For more information on how to export, see
|
||||||
|
https://help.medium.com/hc/en-us/articles/115004745787-Export-your-account-data.
|
||||||
|
|
||||||
For Tumblr::
|
For Tumblr::
|
||||||
|
|
||||||
$ pelican-import --tumblr -o ~/output --blogname=<blogname> <api_key>
|
$ pelican-import --tumblr -o ~/output --blogname=<blogname> <api_key>
|
||||||
|
|
|
||||||
4
pelican/tests/content/medium_post_content.txt
vendored
Normal file
4
pelican/tests/content/medium_post_content.txt
vendored
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
|
||||||
|
<hr/><h3>Title header</h3><p>A paragraph of content.</p><p>Paragraph number two.</p><p>A list:</p><ol><li>One.</li><li>Two.</li><li>Three.</li></ol><p>A link: <a data-href="https://example.com/example" href="https://example.com/example" target="_blank">link text</a>.</p><h3>Header 2</h3><p>A block quote:</p><blockquote>quote words <strong>strong words</strong></blockquote><p>after blockquote</p><figure><img data-height="282" data-image-id="image1.png" data-width="739" src="https://cdn-images-1.medium.com/max/800/image1.png"/><figcaption>A figure caption.</figcaption></figure><p>A final note: <a data-href="http://stats.stackexchange.com/" href="http://stats.stackexchange.com/" rel="noopener" target="_blank">Cross-Validated</a> has sometimes been helpful.</p><hr/><p><em>Next: </em><a data-href="https://medium.com/@username/post-url" href="https://medium.com/@username/post-url" target="_blank"><em>Next post</em>
|
||||||
|
</a></p>
|
||||||
|
<p>By <a href="https://medium.com/@username">User Name</a> on <a href="https://medium.com/p/medium-short-url"><time datetime="2017-04-21T17:11:55.799Z">April 21, 2017</time></a>.</p><p><a href="https://medium.com/@username/this-post-url">Canonical link</a></p><p>Exported from <a href="https://medium.com">Medium</a> on December 1, 2023.</p>
|
||||||
72
pelican/tests/content/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html
vendored
Normal file
72
pelican/tests/content/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html
vendored
Normal file
|
|
@ -0,0 +1,72 @@
|
||||||
|
<!DOCTYPE html><html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"><title>A title</title><style>
|
||||||
|
* {
|
||||||
|
font-family: Georgia, Cambria, "Times New Roman", Times, serif;
|
||||||
|
}
|
||||||
|
html, body {
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
h1 {
|
||||||
|
font-size: 50px;
|
||||||
|
margin-bottom: 17px;
|
||||||
|
color: #333;
|
||||||
|
}
|
||||||
|
h2 {
|
||||||
|
font-size: 24px;
|
||||||
|
line-height: 1.6;
|
||||||
|
margin: 30px 0 0 0;
|
||||||
|
margin-bottom: 18px;
|
||||||
|
margin-top: 33px;
|
||||||
|
color: #333;
|
||||||
|
}
|
||||||
|
h3 {
|
||||||
|
font-size: 30px;
|
||||||
|
margin: 10px 0 20px 0;
|
||||||
|
color: #333;
|
||||||
|
}
|
||||||
|
header {
|
||||||
|
width: 640px;
|
||||||
|
margin: auto;
|
||||||
|
}
|
||||||
|
section {
|
||||||
|
width: 640px;
|
||||||
|
margin: auto;
|
||||||
|
}
|
||||||
|
section p {
|
||||||
|
margin-bottom: 27px;
|
||||||
|
font-size: 20px;
|
||||||
|
line-height: 1.6;
|
||||||
|
color: #333;
|
||||||
|
}
|
||||||
|
section img {
|
||||||
|
max-width: 640px;
|
||||||
|
}
|
||||||
|
footer {
|
||||||
|
padding: 0 20px;
|
||||||
|
margin: 50px 0;
|
||||||
|
text-align: center;
|
||||||
|
font-size: 12px;
|
||||||
|
}
|
||||||
|
.aspectRatioPlaceholder {
|
||||||
|
max-width: auto !important;
|
||||||
|
max-height: auto !important;
|
||||||
|
}
|
||||||
|
.aspectRatioPlaceholder-fill {
|
||||||
|
padding-bottom: 0 !important;
|
||||||
|
}
|
||||||
|
header,
|
||||||
|
section[data-field=subtitle],
|
||||||
|
section[data-field=description] {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
</style></head><body><article class="h-entry">
|
||||||
|
<header>
|
||||||
|
<h1 class="p-name">A name (like title)</h1>
|
||||||
|
</header>
|
||||||
|
<section data-field="subtitle" class="p-summary">
|
||||||
|
Summary (first several words of content)
|
||||||
|
</section>
|
||||||
|
<section data-field="body" class="e-content">
|
||||||
|
<section name="ad15" class="section section--body section--first"><div class="section-divider"><hr class="section-divider"></div><div class="section-content"><div class="section-inner sectionLayout--insetColumn"><h3 name="20a3" id="20a3" class="graf graf--h3 graf--leading graf--title">Title header</h3><p name="e3d6" id="e3d6" class="graf graf--p graf-after--h3">A paragraph of content.</p><p name="c7a8" id="c7a8" class="graf graf--p graf-after--p">Paragraph number two.</p><p name="42aa" id="42aa" class="graf graf--p graf-after--p">A list:</p><ol class="postList"><li name="d65f" id="d65f" class="graf graf--li graf-after--p">One.</li><li name="232b" id="232b" class="graf graf--li graf-after--li">Two.</li><li name="ef87" id="ef87" class="graf graf--li graf-after--li">Three.</li></ol><p name="e743" id="e743" class="graf graf--p graf-after--p">A link: <a href="https://example.com/example" data-href="https://example.com/example" class="markup--anchor markup--p-anchor" target="_blank">link text</a>.</p><h3 name="4cfd" id="4cfd" class="graf graf--h3 graf-after--p">Header 2</h3><p name="433c" id="433c" class="graf graf--p graf-after--p">A block quote:</p><blockquote name="3537" id="3537" class="graf graf--blockquote graf-after--p">quote words <strong class="markup--strong markup--blockquote-strong">strong words</strong></blockquote><p name="00cc" id="00cc" class="graf graf--p graf-after--blockquote">after blockquote</p><figure name="edb0" id="edb0" class="graf graf--figure graf-after--p"><img class="graf-image" data-image-id="image1.png" data-width="739" data-height="282" src="https://cdn-images-1.medium.com/max/800/image1.png"><figcaption class="imageCaption">A figure caption.</figcaption></figure><p name="f401" id="f401" class="graf graf--p graf-after--p graf--trailing">A final note: <a href="http://stats.stackexchange.com/" data-href="http://stats.stackexchange.com/" class="markup--anchor markup--p-anchor" rel="noopener" target="_blank">Cross-Validated</a> has sometimes 
been helpful.</p></div></div></section><section name="09a3" class="section section--body section--last"><div class="section-divider"><hr class="section-divider"></div><div class="section-content"><div class="section-inner sectionLayout--insetColumn"><p name="81e8" id="81e8" class="graf graf--p graf--leading"><em class="markup--em markup--p-em">Next: </em><a href="https://medium.com/@username/post-url" data-href="https://medium.com/@username/post-url" class="markup--anchor markup--p-anchor" target="_blank"><em class="markup--em markup--p-em">Next post</em>
|
||||||
|
</section>
|
||||||
|
<footer><p>By <a href="https://medium.com/@username" class="p-author h-card">User Name</a> on <a href="https://medium.com/p/medium-short-url"><time class="dt-published" datetime="2017-04-21T17:11:55.799Z">April 21, 2017</time></a>.</p><p><a href="https://medium.com/@username/this-post-url" class="p-canonical">Canonical link</a></p><p>Exported from <a href="https://medium.com">Medium</a> on December 1, 2023.</p></footer></article></body></html>
|
||||||
|
|
@ -264,6 +264,7 @@ class TestArticlesGenerator(unittest.TestCase):
|
||||||
|
|
||||||
def test_generate_context(self):
|
def test_generate_context(self):
|
||||||
articles_expected = [
|
articles_expected = [
|
||||||
|
["A title", "published", "medium_posts", "article"],
|
||||||
["Article title", "published", "Default", "article"],
|
["Article title", "published", "Default", "article"],
|
||||||
[
|
[
|
||||||
"Article with markdown and summary metadata multi",
|
"Article with markdown and summary metadata multi",
|
||||||
|
|
@ -391,13 +392,24 @@ class TestArticlesGenerator(unittest.TestCase):
|
||||||
# terms of process order will define the name for that category
|
# terms of process order will define the name for that category
|
||||||
categories = [cat.name for cat, _ in self.generator.categories]
|
categories = [cat.name for cat, _ in self.generator.categories]
|
||||||
categories_alternatives = (
|
categories_alternatives = (
|
||||||
sorted(["Default", "TestCategory", "Yeah", "test", "指導書"]),
|
sorted(
|
||||||
sorted(["Default", "TestCategory", "yeah", "test", "指導書"]),
|
["Default", "TestCategory", "medium_posts", "Yeah", "test", "指導書"]
|
||||||
|
),
|
||||||
|
sorted(
|
||||||
|
["Default", "TestCategory", "medium_posts", "yeah", "test", "指導書"]
|
||||||
|
),
|
||||||
)
|
)
|
||||||
self.assertIn(sorted(categories), categories_alternatives)
|
self.assertIn(sorted(categories), categories_alternatives)
|
||||||
# test for slug
|
# test for slug
|
||||||
categories = [cat.slug for cat, _ in self.generator.categories]
|
categories = [cat.slug for cat, _ in self.generator.categories]
|
||||||
categories_expected = ["default", "testcategory", "yeah", "test", "zhi-dao-shu"]
|
categories_expected = [
|
||||||
|
"default",
|
||||||
|
"testcategory",
|
||||||
|
"medium_posts",
|
||||||
|
"yeah",
|
||||||
|
"test",
|
||||||
|
"zhi-dao-shu",
|
||||||
|
]
|
||||||
self.assertEqual(sorted(categories), sorted(categories_expected))
|
self.assertEqual(sorted(categories), sorted(categories_expected))
|
||||||
|
|
||||||
def test_do_not_use_folder_as_category(self):
|
def test_do_not_use_folder_as_category(self):
|
||||||
|
|
@ -549,7 +561,8 @@ class TestArticlesGenerator(unittest.TestCase):
|
||||||
granularity: {period["period"] for period in periods}
|
granularity: {period["period"] for period in periods}
|
||||||
for granularity, periods in period_archives.items()
|
for granularity, periods in period_archives.items()
|
||||||
}
|
}
|
||||||
expected = {"year": {(1970,), (2010,), (2012,), (2014,)}}
|
self.maxDiff = None
|
||||||
|
expected = {"year": {(1970,), (2010,), (2012,), (2014,), (2017,)}}
|
||||||
self.assertEqual(expected, abbreviated_archives)
|
self.assertEqual(expected, abbreviated_archives)
|
||||||
|
|
||||||
# Month archives enabled:
|
# Month archives enabled:
|
||||||
|
|
@ -570,7 +583,7 @@ class TestArticlesGenerator(unittest.TestCase):
|
||||||
for granularity, periods in period_archives.items()
|
for granularity, periods in period_archives.items()
|
||||||
}
|
}
|
||||||
expected = {
|
expected = {
|
||||||
"year": {(1970,), (2010,), (2012,), (2014,)},
|
"year": {(1970,), (2010,), (2012,), (2014,), (2017,)},
|
||||||
"month": {
|
"month": {
|
||||||
(1970, "January"),
|
(1970, "January"),
|
||||||
(2010, "December"),
|
(2010, "December"),
|
||||||
|
|
@ -578,6 +591,7 @@ class TestArticlesGenerator(unittest.TestCase):
|
||||||
(2012, "November"),
|
(2012, "November"),
|
||||||
(2012, "October"),
|
(2012, "October"),
|
||||||
(2014, "February"),
|
(2014, "February"),
|
||||||
|
(2017, "April"),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
self.assertEqual(expected, abbreviated_archives)
|
self.assertEqual(expected, abbreviated_archives)
|
||||||
|
|
@ -602,7 +616,7 @@ class TestArticlesGenerator(unittest.TestCase):
|
||||||
for granularity, periods in period_archives.items()
|
for granularity, periods in period_archives.items()
|
||||||
}
|
}
|
||||||
expected = {
|
expected = {
|
||||||
"year": {(1970,), (2010,), (2012,), (2014,)},
|
"year": {(1970,), (2010,), (2012,), (2014,), (2017,)},
|
||||||
"month": {
|
"month": {
|
||||||
(1970, "January"),
|
(1970, "January"),
|
||||||
(2010, "December"),
|
(2010, "December"),
|
||||||
|
|
@ -610,6 +624,7 @@ class TestArticlesGenerator(unittest.TestCase):
|
||||||
(2012, "November"),
|
(2012, "November"),
|
||||||
(2012, "October"),
|
(2012, "October"),
|
||||||
(2014, "February"),
|
(2014, "February"),
|
||||||
|
(2017, "April"),
|
||||||
},
|
},
|
||||||
"day": {
|
"day": {
|
||||||
(1970, "January", 1),
|
(1970, "January", 1),
|
||||||
|
|
@ -619,6 +634,7 @@ class TestArticlesGenerator(unittest.TestCase):
|
||||||
(2012, "October", 30),
|
(2012, "October", 30),
|
||||||
(2012, "October", 31),
|
(2012, "October", 31),
|
||||||
(2014, "February", 9),
|
(2014, "February", 9),
|
||||||
|
(2017, "April", 21),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
self.assertEqual(expected, abbreviated_archives)
|
self.assertEqual(expected, abbreviated_archives)
|
||||||
|
|
@ -836,8 +852,12 @@ class TestArticlesGenerator(unittest.TestCase):
|
||||||
|
|
||||||
categories = sorted([category.name for category, _ in generator.categories])
|
categories = sorted([category.name for category, _ in generator.categories])
|
||||||
categories_expected = [
|
categories_expected = [
|
||||||
sorted(["Default", "TestCategory", "yeah", "test", "指導書"]),
|
sorted(
|
||||||
sorted(["Default", "TestCategory", "Yeah", "test", "指導書"]),
|
["Default", "TestCategory", "medium_posts", "yeah", "test", "指導書"]
|
||||||
|
),
|
||||||
|
sorted(
|
||||||
|
["Default", "TestCategory", "medium_posts", "Yeah", "test", "指導書"]
|
||||||
|
),
|
||||||
]
|
]
|
||||||
self.assertIn(categories, categories_expected)
|
self.assertIn(categories, categories_expected)
|
||||||
|
|
||||||
|
|
@ -864,6 +884,7 @@ class TestArticlesGenerator(unittest.TestCase):
|
||||||
generator.generate_context()
|
generator.generate_context()
|
||||||
|
|
||||||
expected = [
|
expected = [
|
||||||
|
"A title",
|
||||||
"An Article With Code Block To Test Typogrify Ignore",
|
"An Article With Code Block To Test Typogrify Ignore",
|
||||||
"Article title",
|
"Article title",
|
||||||
"Article with Nonconformant HTML meta tags",
|
"Article with Nonconformant HTML meta tags",
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,10 @@ from pelican.tools.pelican_import import (
|
||||||
get_attachments,
|
get_attachments,
|
||||||
tumblr2fields,
|
tumblr2fields,
|
||||||
wp2fields,
|
wp2fields,
|
||||||
|
mediumpost2fields,
|
||||||
|
mediumposts2fields,
|
||||||
|
strip_medium_post_content,
|
||||||
|
medium_slug,
|
||||||
)
|
)
|
||||||
from pelican.utils import path_to_file_url, slugify
|
from pelican.utils import path_to_file_url, slugify
|
||||||
|
|
||||||
|
|
@ -708,3 +712,82 @@ class TestTumblrImporter(TestCaseWithCLocale):
|
||||||
posts,
|
posts,
|
||||||
posts,
|
posts,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestMediumImporter(TestCaseWithCLocale):
    """Tests for the Medium export helpers of ``pelican_import``."""

    def setUp(self):
        """Load the expected post content and build the expected field tuple."""
        super().setUp()
        self.test_content_root = "pelican/tests/content"
        # The content coming out of parsing is similar, but not the same.
        # Beautiful soup rearranges the order of attributes, for example.
        # So, we keep a copy of the content for the test.
        content_filename = f"{self.test_content_root}/medium_post_content.txt"
        with open(content_filename, encoding="utf-8") as the_content_file:
            the_content = the_content_file.read()
        # Many editors and scripts add a final newline, so live with that
        # in our test.  A unittest assertion is used instead of a bare
        # ``assert`` so the check survives ``python -O`` and reports an
        # empty fixture as a failure rather than an IndexError.
        self.assertTrue(
            the_content.endswith("\n"),
            f"{content_filename} is expected to end with a newline",
        )
        the_content = the_content[:-1]
        self.post_tuple = (
            "A title",
            the_content,
            # slug:
            "2017-04-21-medium-post",
            "2017-04-21 17:11",
            "User Name",
            None,
            (),
            "published",
            "article",
            "html",
        )

    def test_mediumpost2field(self):
        """Parse one post"""
        post_filename = f"{self.test_content_root}/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html"
        val = mediumpost2fields(post_filename)
        # No third argument: it is the failure *message*, not a value to compare.
        self.assertEqual(self.post_tuple, val)

    def test_mediumposts2field(self):
        """Parse all posts in an export directory"""
        posts = list(mediumposts2fields(f"{self.test_content_root}/medium_posts"))
        self.assertEqual(1, len(posts))
        self.assertEqual(self.post_tuple, posts[0])

    def test_strip_content(self):
        """Strip out unhelpful tags"""
        html_doc = (
            "<section>This keeps <i>lots</i> of <b>tags</b>, but not "
            "the <section>section</section> tags</section>"
        )
        soup = BeautifulSoup(html_doc, "html.parser")
        self.assertEqual(
            "This keeps <i>lots</i> of <b>tags</b>, but not the section tags",
            strip_medium_post_content(soup),
        )

    def test_medium_slug(self):
        """Turn export file names into slugs"""
        # Remove hex stuff at the end
        self.assertEqual(
            "2017-04-27_A-long-title",
            medium_slug(
                "medium-export/posts/2017-04-27_A-long-title--2971442227dd.html"
            ),
        )
        # Remove "--DRAFT" at the end
        self.assertEqual(
            "2017-04-27_A-long-title",
            medium_slug("medium-export/posts/2017-04-27_A-long-title--DRAFT.html"),
        )
        # Remove both (which happens)
        self.assertEqual(
            "draft_How-to-do", medium_slug("draft_How-to-do--DRAFT--87225c81dddd.html")
        )
        # If no hex stuff, leave it alone
        self.assertEqual(
            "2017-04-27_A-long-title",
            medium_slug("medium-export/posts/2017-04-27_A-long-title.html"),
        )
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,8 @@ from urllib.error import URLError
|
||||||
from urllib.parse import quote, urlparse, urlsplit, urlunsplit
|
from urllib.parse import quote, urlparse, urlsplit, urlunsplit
|
||||||
from urllib.request import urlretrieve
|
from urllib.request import urlretrieve
|
||||||
|
|
||||||
|
import dateutil.parser
|
||||||
|
|
||||||
# because logging.setLoggerClass has to be called before logging.getLogger
|
# because logging.setLoggerClass has to be called before logging.getLogger
|
||||||
from pelican.log import init
|
from pelican.log import init
|
||||||
from pelican.settings import DEFAULT_CONFIG
|
from pelican.settings import DEFAULT_CONFIG
|
||||||
|
|
@ -114,19 +116,25 @@ def decode_wp_content(content, br=True):
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
||||||
def _import_bs4():
    """Import and return the ``bs4`` module, otherwise ``sys.exit``.

    The import is done lazily so that BeautifulSoup is only required by the
    importers that actually parse XML or HTML input.
    """
    try:
        import bs4
    except ImportError:
        # This helper serves both the XML importers and the HTML (Medium)
        # importer, so the message must not claim it is XML-only.
        error = (
            'Missing dependency "BeautifulSoup4" (plus "lxml" for XML input) '
            "required to import XML and HTML files."
        )
        sys.exit(error)
    return bs4


def file_to_soup(xml, features="xml"):
    """Read a file and return its parsed BeautifulSoup tree.

    :param xml: path of the file to read; it is opened as UTF-8 text.  The
        name is historical, the file may just as well be HTML.
    :param features: parser features passed to ``BeautifulSoup``, e.g.
        ``"xml"`` (the default) or ``"html.parser"``.
    :return: the ``BeautifulSoup`` object built from the file content.
    """
    bs4 = _import_bs4()
    with open(xml, encoding="utf-8") as infile:
        markup = infile.read()
    return bs4.BeautifulSoup(markup, features)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -140,7 +148,7 @@ def get_filename(post_name, post_id):
|
||||||
def wp2fields(xml, wp_custpost=False):
|
def wp2fields(xml, wp_custpost=False):
|
||||||
"""Opens a wordpress XML file, and yield Pelican fields"""
|
"""Opens a wordpress XML file, and yield Pelican fields"""
|
||||||
|
|
||||||
soup = xml_to_soup(xml)
|
soup = file_to_soup(xml)
|
||||||
items = soup.rss.channel.findAll("item")
|
items = soup.rss.channel.findAll("item")
|
||||||
for item in items:
|
for item in items:
|
||||||
if item.find("status").string in ["publish", "draft"]:
|
if item.find("status").string in ["publish", "draft"]:
|
||||||
|
|
@ -210,7 +218,7 @@ def wp2fields(xml, wp_custpost=False):
|
||||||
def blogger2fields(xml):
|
def blogger2fields(xml):
|
||||||
"""Opens a blogger XML file, and yield Pelican fields"""
|
"""Opens a blogger XML file, and yield Pelican fields"""
|
||||||
|
|
||||||
soup = xml_to_soup(xml)
|
soup = file_to_soup(xml)
|
||||||
entries = soup.feed.findAll("entry")
|
entries = soup.feed.findAll("entry")
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
raw_kind = entry.find(
|
raw_kind = entry.find(
|
||||||
|
|
@ -536,6 +544,133 @@ def tumblr2fields(api_key, blogname):
|
||||||
posts = _get_tumblr_posts(api_key, blogname, offset)
|
posts = _get_tumblr_posts(api_key, blogname, offset)
|
||||||
|
|
||||||
|
|
||||||
|
def strip_medium_post_content(soup) -> str:
    """Strip some tags and attributes from medium post content.

    For example, the 'section' and 'div' tags cause trouble while rendering.

    The problem with these tags is you can get a section divider (--------------)
    that is not between two pieces of content.  For example:

      Some text.

      .. container:: section-divider

         --------------

      .. container:: section-content

      More content.

    In this case, pandoc complains: "Unexpected section title or transition."

    Also, the "id" and "name" attributes in tags cause similar problems.  They show
    up in .rst as extra junk that separates transitions.

    :param soup: a BeautifulSoup tree or tag; it is modified in place.
    :return: the concatenated markup of the children of ``soup`` after cleaning.
    """
    # Remove tags (keeping their children).
    # section and div cause problems
    # footer also can cause problems, and has nothing we want to keep
    # See https://stackoverflow.com/a/8439761
    invalid_tags = ["section", "div", "footer"]
    # find_all() returns a materialised result list, so unwrapping while
    # looping over it is safe.  unwrap()/find_all() are the current names of
    # the deprecated replaceWithChildren()/findAll() aliases.
    for match in soup.find_all(invalid_tags):
        match.unwrap()

    # Remove attributes
    # See https://stackoverflow.com/a/9045719
    invalid_attributes = {"name", "id", "class"}
    # find_all(True) yields every descendant *tag* (no text nodes), so no
    # isinstance check against bs4.element.Tag is needed.
    for tag in soup.find_all(True):
        tag.attrs = {
            key: value
            for key, value in tag.attrs.items()
            if key not in invalid_attributes
        }

    # Get the string of all content, keeping other tags
    return "".join(str(element) for element in soup.contents)
|
||||||
|
|
||||||
|
|
||||||
|
def mediumpost2fields(filepath: str) -> tuple:
    """Take an HTML post from a medium export, return Pelican fields.

    :param filepath: path of one exported Medium post (an HTML file).
    :return: a 10-tuple ``(title, content, slug, date, author, None, tags,
        status, kind, "html")``.  ``date`` is formatted ``%Y-%m-%d %H:%M`` or
        is ``None`` for a draft; ``tags`` is always empty and the sixth field
        is always ``None`` because the export carries no tag or category.
    :raises ValueError: if the post has no ``<section class="e-content">``.
    """
    # NOTE: no truthiness check on ``soup`` here -- a BeautifulSoup object is
    # always truthy, so such a check could never fire.
    soup = file_to_soup(filepath, "html.parser")
    kind = "article"

    content = soup.find("section", class_="e-content")
    if content is None:
        raise ValueError(f"{filepath}: Post has no content")

    # Tolerate a missing <title> tag as well as an empty one.
    title_tag = soup.find("title")
    title = (title_tag.string if title_tag is not None else None) or ""

    raw_date = soup.find("time", class_="dt-published")
    if raw_date:
        # This datetime can include timezone, e.g., "2017-04-21T17:11:55.799Z"
        # python before 3.11 can't parse the timezone using datetime.fromisoformat
        # See also https://docs.python.org/3.10/library/datetime.html#datetime.datetime.fromisoformat
        # "This does not support parsing arbitrary ISO 8601 strings"
        # So, we use dateutil.parser, which can handle it.
        date_object = dateutil.parser.parse(raw_date.attrs["datetime"])
        date = date_object.strftime("%Y-%m-%d %H:%M")
        status = "published"
    else:
        # Only published posts carry a "dt-published" time element.
        date = None
        status = "draft"

    author = soup.find("a", class_="p-author h-card")
    if author:
        author = author.string

    # Now that we're done with classes, we can strip the content
    content = strip_medium_post_content(content)

    # medium HTML export doesn't have tag or category
    # RSS feed has tags, but it doesn't have all the posts.
    tags = ()

    slug = medium_slug(filepath)

    # TODO: make the fields a python dataclass
    return (
        title,
        content,
        slug,
        date,
        author,
        None,
        tags,
        status,
        kind,
        "html",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def medium_slug(filepath: str) -> str:
    """Make the filepath of a medium exported file into a slug.

    :param filepath: path (or bare file name) of an exported Medium post.
    :return: the file name without directory, extension, trailing Medium id
        and "DRAFT" marker.
    """
    # slug: filename without extension
    slug = os.path.splitext(os.path.basename(filepath))[0]
    # A medium export filename looks like date_-title-...html
    # But, RST doesn't like "_-" (see https://github.com/sphinx-doc/sphinx/issues/4350)
    # so get rid of it
    slug = slug.replace("_-", "-")
    # drop the hex string medium puts on the end of the filename, why keep it.
    # e.g., "-a8a8a8a8" or "---a9a9a9a9"
    # also: drafts don't need "--DRAFT"
    # Only hex runs of at least 8 characters are treated as an id (assumes
    # Medium ids are never shorter -- TODO confirm); otherwise ordinary title
    # words made of the letters a-f ("feed", "add", ...) and the digits of
    # the date prefix would be stripped as well.
    return re.sub(r"(?:-+(?:[0-9a-f]{8,}|DRAFT))+$", "", slug)
|
||||||
|
|
||||||
|
|
||||||
|
def mediumposts2fields(medium_export_dir: str):
    """Take HTML posts in a medium export directory, and yield Pelican fields.

    :param medium_export_dir: the "posts" directory of an unzipped Medium
        export.
    :return: a generator yielding one ``mediumpost2fields`` tuple per
        ``.html`` file, in sorted file-name order.
    """
    # Sorted so the import order is reproducible; os.listdir() order is
    # arbitrary.
    for filename in sorted(os.listdir(medium_export_dir)):
        filepath = os.path.join(medium_export_dir, filename)
        # Skip sub-directories and stray non-post files (e.g. ".DS_Store")
        # instead of failing on them.
        if not filename.lower().endswith(".html") or not os.path.isfile(filepath):
            continue
        yield mediumpost2fields(filepath)
|
||||||
|
|
||||||
|
|
||||||
def feed2fields(file):
|
def feed2fields(file):
|
||||||
"""Read a feed and yield pelican fields"""
|
"""Read a feed and yield pelican fields"""
|
||||||
import feedparser
|
import feedparser
|
||||||
|
|
@ -711,7 +846,7 @@ def get_attachments(xml):
|
||||||
"""returns a dictionary of posts that have attachments with a list
|
"""returns a dictionary of posts that have attachments with a list
|
||||||
of the attachment_urls
|
of the attachment_urls
|
||||||
"""
|
"""
|
||||||
soup = xml_to_soup(xml)
|
soup = file_to_soup(xml)
|
||||||
items = soup.rss.channel.findAll("item")
|
items = soup.rss.channel.findAll("item")
|
||||||
names = {}
|
names = {}
|
||||||
attachments = []
|
attachments = []
|
||||||
|
|
@ -837,6 +972,9 @@ def fields2pelican(
|
||||||
posts_require_pandoc.append(filename)
|
posts_require_pandoc.append(filename)
|
||||||
|
|
||||||
slug = not disable_slugs and filename or None
|
slug = not disable_slugs and filename or None
|
||||||
|
assert slug is None or filename == os.path.basename(
|
||||||
|
filename
|
||||||
|
), f"filename is not a basename: {filename}"
|
||||||
|
|
||||||
if wp_attach and attachments:
|
if wp_attach and attachments:
|
||||||
try:
|
try:
|
||||||
|
|
@ -984,6 +1122,9 @@ def main():
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--dotclear", action="store_true", dest="dotclear", help="Dotclear export"
|
"--dotclear", action="store_true", dest="dotclear", help="Dotclear export"
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--medium", action="store_true", dest="medium", help="Medium export"
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--tumblr", action="store_true", dest="tumblr", help="Tumblr export"
|
"--tumblr", action="store_true", dest="tumblr", help="Tumblr export"
|
||||||
)
|
)
|
||||||
|
|
@ -1069,6 +1210,8 @@ def main():
|
||||||
input_type = "blogger"
|
input_type = "blogger"
|
||||||
elif args.dotclear:
|
elif args.dotclear:
|
||||||
input_type = "dotclear"
|
input_type = "dotclear"
|
||||||
|
elif args.medium:
|
||||||
|
input_type = "medium"
|
||||||
elif args.tumblr:
|
elif args.tumblr:
|
||||||
input_type = "tumblr"
|
input_type = "tumblr"
|
||||||
elif args.wpfile:
|
elif args.wpfile:
|
||||||
|
|
@ -1077,8 +1220,8 @@ def main():
|
||||||
input_type = "feed"
|
input_type = "feed"
|
||||||
else:
|
else:
|
||||||
error = (
|
error = (
|
||||||
"You must provide either --blogger, --dotclear, "
|
"You must provide one of --blogger, --dotclear, "
|
||||||
"--tumblr, --wpfile or --feed options"
|
"--medium, --tumblr, --wpfile or --feed options"
|
||||||
)
|
)
|
||||||
exit(error)
|
exit(error)
|
||||||
|
|
||||||
|
|
@ -1097,12 +1240,16 @@ def main():
|
||||||
fields = blogger2fields(args.input)
|
fields = blogger2fields(args.input)
|
||||||
elif input_type == "dotclear":
|
elif input_type == "dotclear":
|
||||||
fields = dc2fields(args.input)
|
fields = dc2fields(args.input)
|
||||||
|
elif input_type == "medium":
|
||||||
|
fields = mediumposts2fields(args.input)
|
||||||
elif input_type == "tumblr":
|
elif input_type == "tumblr":
|
||||||
fields = tumblr2fields(args.input, args.blogname)
|
fields = tumblr2fields(args.input, args.blogname)
|
||||||
elif input_type == "wordpress":
|
elif input_type == "wordpress":
|
||||||
fields = wp2fields(args.input, args.wp_custpost or False)
|
fields = wp2fields(args.input, args.wp_custpost or False)
|
||||||
elif input_type == "feed":
|
elif input_type == "feed":
|
||||||
fields = feed2fields(args.input)
|
fields = feed2fields(args.input)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unhandled input_type {input_type}")
|
||||||
|
|
||||||
if args.wp_attach:
|
if args.wp_attach:
|
||||||
attachments = get_attachments(args.input)
|
attachments = get_attachments(args.input)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue