Merge pull request #3264 from boxydog/medium_importer

2024-01-26 10:02:54 +01:00 · 2024-01-26 10:02:54 +01:00 · ff35d26cbc
commit ff35d26cbc
parent dbf90a4821 d6a33f1d21
7 changed files with 357 additions and 21 deletions
--- a/pelican/tools/pelican_import.py
+++ b/pelican/tools/pelican_import.py
@ -15,6 +15,8 @@ from urllib.error import URLError
 from urllib.parse import quote, urlparse, urlsplit, urlunsplit
 from urllib.request import urlretrieve

+import dateutil.parser
+
 # because logging.setLoggerClass has to be called before logging.getLogger
 from pelican.log import init
 from pelican.settings import DEFAULT_CONFIG
@ -114,19 +116,25 @@ def decode_wp_content(content, br=True):
    return content


-def xml_to_soup(xml):
-    """Opens an xml file"""
+def _import_bs4():
+    """Import the ``bs4`` package on demand and return the module.
+
+    The import is done inside the function, so ``bs4`` is only needed when
+    an importer that parses markup is actually used.  If the import fails,
+    the process exits through ``sys.exit`` with a message naming the
+    missing dependencies.
+    """
    try:
-        from bs4 import BeautifulSoup
+        import bs4
    except ImportError:
        error = (
            'Missing dependency "BeautifulSoup4" and "lxml" required to '
            "import XML files."
        )
        sys.exit(error)
+    return bs4
+
+
+def file_to_soup(xml, features="xml"):
+    """Read a file as UTF-8 text and return it parsed by BeautifulSoup.
+
+    ``xml`` is the path of the file to open.  ``features`` is handed to
+    ``bs4.BeautifulSoup`` as its second argument; it defaults to ``"xml"``,
+    and ``mediumpost2fields`` passes ``"html.parser"`` instead.
+    """
+    bs4 = _import_bs4()
    with open(xml, encoding="utf-8") as infile:
        xmlfile = infile.read()
-    soup = BeautifulSoup(xmlfile, "xml")
+    soup = bs4.BeautifulSoup(xmlfile, features)
    return soup


@ -140,7 +148,7 @@ def get_filename(post_name, post_id):
 def wp2fields(xml, wp_custpost=False):
    """Opens a wordpress XML file, and yield Pelican fields"""

-    soup = xml_to_soup(xml)
+    soup = file_to_soup(xml)
    items = soup.rss.channel.findAll("item")
    for item in items:
        if item.find("status").string in ["publish", "draft"]:
@ -210,7 +218,7 @@ def wp2fields(xml, wp_custpost=False):
 def blogger2fields(xml):
    """Opens a blogger XML file, and yield Pelican fields"""

-    soup = xml_to_soup(xml)
+    soup = file_to_soup(xml)
    entries = soup.feed.findAll("entry")
    for entry in entries:
        raw_kind = entry.find(
@ -536,6 +544,133 @@ def tumblr2fields(api_key, blogname):
        posts = _get_tumblr_posts(api_key, blogname, offset)


+def strip_medium_post_content(soup) -> str:
+    """Strip some tags and attributes from medium post content.
+
+    For example, the 'section' and 'div' tags cause trouble while rendering.
+
+    The problem with these tags is you can get a section divider (--------------)
+    that is not between two pieces of content.  For example:
+
+      Some text.
+
+      .. container:: section-divider
+
+         --------------
+
+      .. container:: section-content
+
+      More content.
+
+    In this case, pandoc complains: "Unexpected section title or transition."
+
+    Also, the "id" and "name" attributes in tags cause similar problems.  They show
+    up in .rst as extra junk that separates transitions.
+
+    Args:
+        soup: The BeautifulSoup element holding the post content.  It is
+            modified in place.
+
+    Returns:
+        The markup of the children of ``soup`` joined into one string; the
+        element ``soup`` itself is not part of the result.
+    """
+    # Unwrap the troublesome tags: the tag goes away, its children stay.
+    # section and div cause problems
+    # footer also can cause problems, and has nothing we want to keep
+    # See https://stackoverflow.com/a/8439761
+    # find_all() accepts a list of names and returns a finished result list,
+    # so unwrapping while looping over it is safe.
+    invalid_tags = ["section", "div", "footer"]
+    for match in soup.find_all(invalid_tags):
+        match.unwrap()
+
+    # Remove attributes
+    # See https://stackoverflow.com/a/9045719
+    # find_all(True) yields every descendant tag and no text nodes, so no
+    # isinstance() filtering is needed here.
+    invalid_attributes = ("name", "id", "class")
+    for tag in soup.find_all(True):
+        for attribute in invalid_attributes:
+            tag.attrs.pop(attribute, None)
+
+    # Get the string of all content, keeping other tags
+    return "".join(str(element) for element in soup.contents)
+
+
+def mediumpost2fields(filepath: str) -> tuple:
+    """Take an HTML post from a medium export, return Pelican fields.
+
+    Args:
+        filepath: Path of one exported post (an HTML file).
+
+    Returns:
+        A 10-tuple ``(title, content, slug, date, author, None, tags, status,
+        kind, "html")``.  ``date`` is a ``"%Y-%m-%d %H:%M"`` string, or None
+        when the post has no ``dt-published`` time element; in that case
+        ``status`` is ``"draft"`` instead of ``"published"``.  ``author`` is
+        None when no author link is found.
+
+    Raises:
+        ValueError: If the file could not be parsed, or it has no
+            ``<section class="e-content">`` element.
+    """
+
+    soup = file_to_soup(filepath, "html.parser")
+    if not soup:
+        raise ValueError(f"{filepath} could not be parsed by beautifulsoup")
+    kind = "article"
+
+    content = soup.find("section", class_="e-content")
+    if not content:
+        raise ValueError(f"{filepath}: Post has no content")
+
+    # A post without a <title> element (or with an empty one) gets an empty
+    # title instead of failing with AttributeError on None.
+    title_tag = soup.find("title")
+    title = (title_tag.string if title_tag is not None else None) or ""
+
+    raw_date = soup.find("time", class_="dt-published")
+    date = None
+    if raw_date:
+        # This datetime can include timezone, e.g., "2017-04-21T17:11:55.799Z"
+        # python before 3.11 can't parse the timezone using datetime.fromisoformat
+        # See also https://docs.python.org/3.10/library/datetime.html#datetime.datetime.fromisoformat
+        # "This does not support parsing arbitrary ISO 8601 strings"
+        # So, we use dateutil.parser, which can handle it.
+        date_object = dateutil.parser.parse(raw_date.attrs["datetime"])
+        # The format below has no offset field, so the timezone is dropped
+        # from the resulting string.
+        date = date_object.strftime("%Y-%m-%d %H:%M")
+        status = "published"
+    else:
+        # No publication time in the export: treat the post as a draft.
+        status = "draft"
+    author = soup.find("a", class_="p-author h-card")
+    if author:
+        author = author.string
+
+    # Now that we're done with classes, we can strip the content
+    content = strip_medium_post_content(content)
+
+    # medium HTML export doesn't have tag or category
+    # RSS feed has tags, but it doesn't have all the posts.
+    tags = ()
+
+    slug = medium_slug(filepath)
+
+    # TODO: make the fields a python dataclass
+    return (
+        title,
+        content,
+        slug,
+        date,
+        author,
+        None,
+        tags,
+        status,
+        kind,
+        "html",
+    )
+
+
+def medium_slug(filepath: str) -> str:
+    """Turn the path of a medium exported file into a slug.
+
+    The slug is the file name without directory and extension, with the
+    "_-" separator collapsed to "-" and the trailing hex id and/or
+    "DRAFT" marker removed.
+    """
+    # A medium export filename looks like date_-title-...html
+    stem, _extension = os.path.splitext(os.path.basename(filepath))
+    # RST doesn't like "_-" (see https://github.com/sphinx-doc/sphinx/issues/4350)
+    stem = stem.replace("_-", "-")
+    # Drop what medium appends to the filename: a hex string such as
+    # "-a8a8a8a8" or "---a9a9a9a9", and "--DRAFT" for drafts.
+    # NOTE: every trailing dash-separated piece made only of [0-9a-f] is
+    # removed, so a title ending in a word like "cafe" loses that word too.
+    return re.sub(r"((-)+([0-9a-f]+|DRAFT))+$", "", stem)
+
+
+def mediumposts2fields(medium_export_dir: str):
+    """Take HTML posts in a medium export directory, and yield Pelican fields.
+
+    Args:
+        medium_export_dir: Directory holding the exported post files.
+
+    Yields:
+        The field tuple returned by ``mediumpost2fields`` for each regular
+        file in the directory, in sorted file-name order.
+    """
+    # os.listdir() returns entries in arbitrary order; sort them so that the
+    # import order is the same on every run and platform.
+    for filename in sorted(os.listdir(medium_export_dir)):
+        filepath = os.path.join(medium_export_dir, filename)
+        # Sub-directories cannot be opened as posts, so skip them.
+        if not os.path.isfile(filepath):
+            continue
+        yield mediumpost2fields(filepath)
+
+
 def feed2fields(file):
    """Read a feed and yield pelican fields"""
    import feedparser
@ -711,7 +846,7 @@ def get_attachments(xml):
    """returns a dictionary of posts that have attachments with a list
    of the attachment_urls
    """
-    soup = xml_to_soup(xml)
+    soup = file_to_soup(xml)
    items = soup.rss.channel.findAll("item")
    names = {}
    attachments = []
@ -837,6 +972,9 @@ def fields2pelican(
            posts_require_pandoc.append(filename)

        slug = not disable_slugs and filename or None
+        assert slug is None or filename == os.path.basename(
+            filename
+        ), f"filename is not a basename: {filename}"

        if wp_attach and attachments:
            try:
@ -984,6 +1122,9 @@ def main():
    parser.add_argument(
        "--dotclear", action="store_true", dest="dotclear", help="Dotclear export"
    )
+    parser.add_argument(
+        "--medium", action="store_true", dest="medium", help="Medium export"
+    )
    parser.add_argument(
        "--tumblr", action="store_true", dest="tumblr", help="Tumblr export"
    )
@ -1069,6 +1210,8 @@ def main():
        input_type = "blogger"
    elif args.dotclear:
        input_type = "dotclear"
+    elif args.medium:
+        input_type = "medium"
    elif args.tumblr:
        input_type = "tumblr"
    elif args.wpfile:
@ -1077,8 +1220,8 @@ def main():
        input_type = "feed"
    else:
        error = (
-            "You must provide either --blogger, --dotclear, "
-            "--tumblr, --wpfile or --feed options"
+            "You must provide one of --blogger, --dotclear, "
+            "--medium, --tumblr, --wpfile or --feed options"
        )
        exit(error)

@ -1097,12 +1240,16 @@ def main():
        fields = blogger2fields(args.input)
    elif input_type == "dotclear":
        fields = dc2fields(args.input)
+    elif input_type == "medium":
+        fields = mediumposts2fields(args.input)
    elif input_type == "tumblr":
        fields = tumblr2fields(args.input, args.blogname)
    elif input_type == "wordpress":
        fields = wp2fields(args.input, args.wp_custpost or False)
    elif input_type == "feed":
        fields = feed2fields(args.input)
+    else:
+        raise ValueError(f"Unhandled input_type {input_type}")

    if args.wp_attach:
        attachments = get_attachments(args.input)