1
0
Fork 0
forked from github/pelican

Merge pull request #3264 from boxydog/medium_importer

This commit is contained in:
Justin Mayer 2024-01-26 10:02:54 +01:00 committed by GitHub
commit ff35d26cbc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 357 additions and 21 deletions

View file

@ -15,6 +15,8 @@ from urllib.error import URLError
from urllib.parse import quote, urlparse, urlsplit, urlunsplit
from urllib.request import urlretrieve
import dateutil.parser
# because logging.setLoggerClass has to be called before logging.getLogger
from pelican.log import init
from pelican.settings import DEFAULT_CONFIG
@ -114,19 +116,25 @@ def decode_wp_content(content, br=True):
return content
def xml_to_soup(xml):
"""Opens an xml file"""
def _import_bs4():
"""Import and return bs4, otherwise sys.exit."""
try:
from bs4 import BeautifulSoup
import bs4
except ImportError:
error = (
'Missing dependency "BeautifulSoup4" and "lxml" required to '
"import XML files."
)
sys.exit(error)
return bs4
def file_to_soup(xml, features="xml"):
    """Read a file and return its contents parsed by BeautifulSoup.

    Args:
        xml: path of the file to read; it is opened as UTF-8 text.
        features: parser selection passed through to ``BeautifulSoup``
            (defaults to ``"xml"``).

    Returns:
        The ``bs4.BeautifulSoup`` object built from the file's contents.
    """
    bs4 = _import_bs4()
    with open(xml, encoding="utf-8") as infile:
        contents = infile.read()
    # Build the soup through the module returned by _import_bs4 so the
    # caller-selected parser is honoured.
    return bs4.BeautifulSoup(contents, features)
@ -140,7 +148,7 @@ def get_filename(post_name, post_id):
def wp2fields(xml, wp_custpost=False):
"""Opens a wordpress XML file, and yield Pelican fields"""
soup = xml_to_soup(xml)
soup = file_to_soup(xml)
items = soup.rss.channel.findAll("item")
for item in items:
if item.find("status").string in ["publish", "draft"]:
@ -210,7 +218,7 @@ def wp2fields(xml, wp_custpost=False):
def blogger2fields(xml):
"""Opens a blogger XML file, and yield Pelican fields"""
soup = xml_to_soup(xml)
soup = file_to_soup(xml)
entries = soup.feed.findAll("entry")
for entry in entries:
raw_kind = entry.find(
@ -536,6 +544,133 @@ def tumblr2fields(api_key, blogname):
posts = _get_tumblr_posts(api_key, blogname, offset)
def strip_medium_post_content(soup) -> str:
    """Strip some tags and attributes from medium post content.

    For example, the 'section' and 'div' tags cause trouble while rendering.

    The problem with these tags is you can get a section divider (--------------)
    that is not between two pieces of content.  For example:

      Some text.

      .. container:: section-divider

         --------------

      .. container:: section-content

      More content.

    In this case, pandoc complains: "Unexpected section title or transition."

    Also, the "id" and "name" attributes in tags cause similar problems.  They show
    up in .rst as extra junk that separates transitions.

    Note that ``soup`` is modified in place.

    Args:
        soup: the BeautifulSoup element holding the post content.

    Returns:
        The concatenated string form of every direct child of ``soup``
        after the clean-up.
    """
    # Remove tags, keeping their children in place.
    # section and div cause problems
    # footer also can cause problems, and has nothing we want to keep
    # See https://stackoverflow.com/a/8439761
    invalid_tags = ("section", "div", "footer")
    for tag_name in invalid_tags:
        for match in soup.find_all(tag_name):
            match.unwrap()

    # Remove attributes
    # See https://stackoverflow.com/a/9045719
    invalid_attributes = {"name", "id", "class"}
    bs4 = _import_bs4()
    for node in soup.descendants:
        # Only Tag nodes carry attributes; other descendants are skipped.
        if isinstance(node, bs4.element.Tag):
            node.attrs = {
                key: value
                for key, value in node.attrs.items()
                if key not in invalid_attributes
            }

    # Get the string of all content, keeping other tags
    return "".join(str(element) for element in soup.contents)
def mediumpost2fields(filepath: str) -> tuple:
    """Take an HTML post from a medium export, return Pelican fields.

    Args:
        filepath: path of a single exported HTML post.

    Returns:
        A 10-tuple ``(title, content, slug, date, author, None, tags,
        status, kind, "html")``.  ``date`` is a ``"%Y-%m-%d %H:%M"`` string,
        or ``None`` when the post has no ``dt-published`` time element, in
        which case ``status`` is ``"draft"`` instead of ``"published"``.
        ``tags`` is always empty and ``kind`` is always ``"article"``.

    Raises:
        ValueError: if the parsed document is falsy or it has no
            ``section`` element with class ``e-content``.
    """
    soup = file_to_soup(filepath, "html.parser")
    if not soup:
        raise ValueError(f"{filepath} could not be parsed by beautifulsoup")
    kind = "article"

    content = soup.find("section", class_="e-content")
    if not content:
        raise ValueError(f"{filepath}: Post has no content")

    # A missing <title> tag is treated like an empty one instead of raising
    # AttributeError on ``None.string``.
    title_tag = soup.find("title")
    title = (title_tag.string if title_tag is not None else None) or ""

    raw_date = soup.find("time", class_="dt-published")
    date = None
    if raw_date:
        # This datetime can include timezone, e.g., "2017-04-21T17:11:55.799Z"
        # python before 3.11 can't parse the timezone using datetime.fromisoformat
        # See also https://docs.python.org/3.10/library/datetime.html#datetime.datetime.fromisoformat
        # "This does not support parsing arbitrary ISO 8601 strings"
        # So, we use dateutil.parser, which can handle it.
        date_object = dateutil.parser.parse(raw_date.attrs["datetime"])
        date = date_object.strftime("%Y-%m-%d %H:%M")
        status = "published"
    else:
        status = "draft"

    author = soup.find("a", class_="p-author h-card")
    if author:
        author = author.string

    # Now that we're done with classes, we can strip the content
    content = strip_medium_post_content(content)

    # medium HTML export doesn't have tag or category
    # RSS feed has tags, but it doesn't have all the posts.
    tags = ()

    slug = medium_slug(filepath)

    # TODO: make the fields a python dataclass
    return (
        title,
        content,
        slug,
        date,
        author,
        None,
        tags,
        status,
        kind,
        "html",
    )
def medium_slug(filepath: str) -> str:
    """Derive a slug from the path of a medium exported file.

    Only the file name is used: the directory part and the extension are
    dropped, every "_-" becomes "-", and any trailing run of dash-separated
    hex strings or "DRAFT" markers is removed.
    """
    stem, _extension = os.path.splitext(os.path.basename(filepath))

    # A medium export filename looks like date_-title-...html
    # RST doesn't like "_-" (see https://github.com/sphinx-doc/sphinx/issues/4350),
    # so collapse it into a plain dash.
    dashed = stem.replace("_-", "-")

    # Medium appends a hex string to the filename (e.g., "-a8a8a8a8" or
    # "---a9a9a9a9") and marks drafts with "--DRAFT"; neither belongs in
    # the slug.
    return re.sub(r"((-)+([0-9a-f]+|DRAFT))+$", "", dashed)
def mediumposts2fields(medium_export_dir: str):
    """Take HTML posts in a medium export directory, and yield Pelican fields.

    Entries are visited in sorted name order so the output order does not
    depend on the arbitrary order of ``os.listdir``.  Entries that are not
    regular files (e.g. subdirectories) are skipped, since they cannot be
    read as posts.

    Args:
        medium_export_dir: directory containing the exported posts.

    Yields:
        The fields tuple produced by ``mediumpost2fields`` for each file.
    """
    for filename in sorted(os.listdir(medium_export_dir)):
        filepath = os.path.join(medium_export_dir, filename)
        if not os.path.isfile(filepath):
            continue
        yield mediumpost2fields(filepath)
def feed2fields(file):
"""Read a feed and yield pelican fields"""
import feedparser
@ -711,7 +846,7 @@ def get_attachments(xml):
"""returns a dictionary of posts that have attachments with a list
of the attachment_urls
"""
soup = xml_to_soup(xml)
soup = file_to_soup(xml)
items = soup.rss.channel.findAll("item")
names = {}
attachments = []
@ -837,6 +972,9 @@ def fields2pelican(
posts_require_pandoc.append(filename)
slug = not disable_slugs and filename or None
assert slug is None or filename == os.path.basename(
filename
), f"filename is not a basename: {filename}"
if wp_attach and attachments:
try:
@ -984,6 +1122,9 @@ def main():
parser.add_argument(
"--dotclear", action="store_true", dest="dotclear", help="Dotclear export"
)
parser.add_argument(
"--medium", action="store_true", dest="medium", help="Medium export"
)
parser.add_argument(
"--tumblr", action="store_true", dest="tumblr", help="Tumblr export"
)
@ -1069,6 +1210,8 @@ def main():
input_type = "blogger"
elif args.dotclear:
input_type = "dotclear"
elif args.medium:
input_type = "medium"
elif args.tumblr:
input_type = "tumblr"
elif args.wpfile:
@ -1077,8 +1220,8 @@ def main():
input_type = "feed"
else:
error = (
"You must provide either --blogger, --dotclear, "
"--tumblr, --wpfile or --feed options"
"You must provide one of --blogger, --dotclear, "
"--medium, --tumblr, --wpfile or --feed options"
)
exit(error)
@ -1097,12 +1240,16 @@ def main():
fields = blogger2fields(args.input)
elif input_type == "dotclear":
fields = dc2fields(args.input)
elif input_type == "medium":
fields = mediumposts2fields(args.input)
elif input_type == "tumblr":
fields = tumblr2fields(args.input, args.blogname)
elif input_type == "wordpress":
fields = wp2fields(args.input, args.wp_custpost or False)
elif input_type == "feed":
fields = feed2fields(args.input)
else:
raise ValueError(f"Unhandled input_type {input_type}")
if args.wp_attach:
attachments = get_attachments(args.input)