Merge pull request #2395 from oulenz/import_from_blogger

Add Blogger XML backup importer
This commit is contained in:
Justin Mayer 2018-08-08 09:45:52 +02:00 committed by GitHub
commit e9b654bbaa
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 1218 additions and 33 deletions

View file

@ -9,10 +9,11 @@ Description
``pelican-import`` is a command-line tool for converting articles from other ``pelican-import`` is a command-line tool for converting articles from other
software to reStructuredText or Markdown. The supported import formats are: software to reStructuredText or Markdown. The supported import formats are:
- WordPress XML export - Blogger XML export
- Dotclear export - Dotclear export
- Posterous API - Posterous API
- Tumblr API - Tumblr API
- WordPress XML export
- RSS/Atom feed - RSS/Atom feed
The conversion from HTML to reStructuredText or Markdown relies on `Pandoc`_. The conversion from HTML to reStructuredText or Markdown relies on `Pandoc`_.
@ -40,8 +41,8 @@ Usage
:: ::
pelican-import [-h] [--wpfile] [--dotclear] [--posterous] [--tumblr] [--feed] [-o OUTPUT] pelican-import [-h] [--blogger] [--dotclear] [--posterous] [--tumblr] [--wpfile] [--feed]
[-m MARKUP] [--dir-cat] [--dir-page] [--strip-raw] [--wp-custpost] [-o OUTPUT] [-m MARKUP] [--dir-cat] [--dir-page] [--strip-raw] [--wp-custpost]
[--wp-attach] [--disable-slugs] [-e EMAIL] [-p PASSWORD] [-b BLOGNAME] [--wp-attach] [--disable-slugs] [-e EMAIL] [-p PASSWORD] [-b BLOGNAME]
input|api_token|api_key input|api_token|api_key
@ -57,10 +58,11 @@ Optional arguments
------------------ ------------------
-h, --help Show this help message and exit -h, --help Show this help message and exit
--wpfile WordPress XML export (default: False) --blogger Blogger XML export (default: False)
--dotclear Dotclear export (default: False) --dotclear Dotclear export (default: False)
--posterous Posterous API (default: False) --posterous Posterous API (default: False)
--tumblr Tumblr API (default: False) --tumblr Tumblr API (default: False)
--wpfile WordPress XML export (default: False)
--feed Feed to parse (default: False) --feed Feed to parse (default: False)
-o OUTPUT, --output OUTPUT -o OUTPUT, --output OUTPUT
Output path (default: content) Output path (default: content)
@ -70,7 +72,8 @@ Optional arguments
--dir-cat Put files in directories with categories name --dir-cat Put files in directories with categories name
(default: False) (default: False)
--dir-page Put files recognised as pages in "pages/" sub- --dir-page Put files recognised as pages in "pages/" sub-
directory (wordpress import only) (default: False) directory (blogger and wordpress import only)
(default: False)
--filter-author Import only post from the specified author --filter-author Import only post from the specified author
--strip-raw Strip raw HTML code that can't be converted to markup --strip-raw Strip raw HTML code that can't be converted to markup
such as flash embeds or iframes (wordpress import such as flash embeds or iframes (wordpress import
@ -102,9 +105,9 @@ Optional arguments
Examples Examples
======== ========
For WordPress:: For Blogger::
$ pelican-import --wpfile -o ~/output ~/posts.xml $ pelican-import --blogger -o ~/output ~/posts.xml
For Dotclear:: For Dotclear::
@ -118,6 +121,10 @@ For Tumblr::
$ pelican-import --tumblr -o ~/output --blogname=<blogname> <api_token> $ pelican-import --tumblr -o ~/output --blogname=<blogname> <api_token>
For WordPress::
$ pelican-import --wpfile -o ~/output ~/posts.xml
Tests Tests
===== =====

1067
pelican/tests/content/bloggerexport.xml vendored Normal file

File diff suppressed because it is too large Load diff

View file

@ -8,13 +8,15 @@ from codecs import open
from pelican.tests.support import (mute, skipIfNoExecutable, temporary_folder, from pelican.tests.support import (mute, skipIfNoExecutable, temporary_folder,
unittest) unittest)
from pelican.tools.pelican_import import (build_header, build_markdown_header, from pelican.tools.pelican_import import (blogger2fields, build_header,
build_markdown_header,
decode_wp_content, decode_wp_content,
download_attachments, fields2pelican, download_attachments, fields2pelican,
get_attachments, wp2fields) get_attachments, wp2fields)
from pelican.utils import path_to_file_url, slugify from pelican.utils import path_to_file_url, slugify
CUR_DIR = os.path.abspath(os.path.dirname(__file__)) CUR_DIR = os.path.abspath(os.path.dirname(__file__))
BLOGGER_XML_SAMPLE = os.path.join(CUR_DIR, 'content', 'bloggerexport.xml')
WORDPRESS_XML_SAMPLE = os.path.join(CUR_DIR, 'content', 'wordpressexport.xml') WORDPRESS_XML_SAMPLE = os.path.join(CUR_DIR, 'content', 'wordpressexport.xml')
WORDPRESS_ENCODED_CONTENT_SAMPLE = os.path.join(CUR_DIR, WORDPRESS_ENCODED_CONTENT_SAMPLE = os.path.join(CUR_DIR,
'content', 'content',
@ -34,6 +36,53 @@ except ImportError:
LXML = False LXML = False
@skipIfNoExecutable(['pandoc', '--version'])
@unittest.skipUnless(BeautifulSoup, 'Needs BeautifulSoup module')
class TestBloggerXmlImporter(unittest.TestCase):
    """Exercise ``blogger2fields`` against the bundled Blogger export sample.

    Each item yielded by ``blogger2fields`` is a tuple; the tests below read
    the title, filename, status and kind positions of that tuple.
    """

    # Positions inside the tuples yielded by ``blogger2fields``.
    TITLE = 0
    FILENAME = 2
    STATUS = 7
    KIND = 8

    def setUp(self):
        # Remember the active locale so tearDown can restore it, then switch
        # to the 'C' locale for the duration of the test.
        self.old_locale = locale.setlocale(locale.LC_ALL)
        locale.setlocale(locale.LC_ALL, str('C'))
        # Materialise the generator once so every test can iterate it freely.
        self.posts = list(blogger2fields(BLOGGER_XML_SAMPLE))

    def tearDown(self):
        locale.setlocale(locale.LC_ALL, self.old_locale)

    def _collect(self, wanted, where, equals):
        """Return the set of field ``wanted`` over posts whose field
        ``where`` equals ``equals``.
        """
        return set(post[wanted] for post in self.posts
                   if post[where] == equals)

    def test_recognise_kind_and_title(self):
        """The importer must emit only pages, articles and comments, classify
        each entry correctly and carry over the right titles.
        """
        seen_kinds = set(post[self.KIND] for post in self.posts)
        self.assertEqual({'page', 'article', 'comment'}, seen_kinds)

        self.assertEqual(
            {'Test page', 'Test page 2'},
            self._collect(self.TITLE, self.KIND, 'page'))

        self.assertEqual(
            {'Black as Egypt\'s Night', 'The Steel Windpipe'},
            self._collect(self.TITLE, self.KIND, 'article'))

        self.assertEqual(
            {'Mishka, always a pleasure to read your '
             'adventures!...'},
            self._collect(self.TITLE, self.KIND, 'comment'))

    def test_recognise_status_with_correct_filename(self):
        """The importer must emit only the statuses 'published' and 'draft',
        classify each entry correctly and derive the right filenames.
        """
        seen_statuses = set(post[self.STATUS] for post in self.posts)
        self.assertEqual({'published', 'draft'}, seen_statuses)

        # Drafts are named after the entry id.
        self.assertEqual(
            {'page-4386962582497458967',
             'post-1276418104709695660'},
            self._collect(self.FILENAME, self.STATUS, 'draft'))

        # Published entries are named after their URL, except comments,
        # which are named after the entry id.
        self.assertEqual(
            {'the-steel-windpipe',
             'test-page',
             'post-5590533389087749201'},
            self._collect(self.FILENAME, self.STATUS, 'published'))
@skipIfNoExecutable(['pandoc', '--version']) @skipIfNoExecutable(['pandoc', '--version'])
@unittest.skipUnless(BeautifulSoup, 'Needs BeautifulSoup module') @unittest.skipUnless(BeautifulSoup, 'Needs BeautifulSoup module')
class TestWordpressXmlImporter(unittest.TestCase): class TestWordpressXmlImporter(unittest.TestCase):

View file

@ -8,7 +8,6 @@ import os
import re import re
import subprocess import subprocess
import sys import sys
import time
from codecs import open from codecs import open
from collections import defaultdict from collections import defaultdict
@ -117,19 +116,18 @@ def decode_wp_content(content, br=True):
return content return content
def get_items(xml): def xml_to_soup(xml):
"""Opens a WordPress xml file and returns a list of items""" """Opens an xml file"""
try: try:
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
except ImportError: except ImportError:
error = ('Missing dependency "BeautifulSoup4" and "lxml" required to ' error = ('Missing dependency "BeautifulSoup4" and "lxml" required to '
'import WordPress XML files.') 'import XML files.')
sys.exit(error) sys.exit(error)
with open(xml, encoding='utf-8') as infile: with open(xml, encoding='utf-8') as infile:
xmlfile = infile.read() xmlfile = infile.read()
soup = BeautifulSoup(xmlfile, "xml") soup = BeautifulSoup(xmlfile, "xml")
items = soup.rss.channel.findAll('item') return soup
return items
def get_filename(filename, post_id): def get_filename(filename, post_id):
@ -142,7 +140,8 @@ def get_filename(filename, post_id):
def wp2fields(xml, wp_custpost=False): def wp2fields(xml, wp_custpost=False):
"""Opens a wordpress XML file, and yield Pelican fields""" """Opens a wordpress XML file, and yield Pelican fields"""
items = get_items(xml) soup = xml_to_soup(xml)
items = soup.rss.channel.findAll('item')
for item in items: for item in items:
if item.find('status').string in ["publish", "draft"]: if item.find('status').string in ["publish", "draft"]:
@ -163,8 +162,9 @@ def wp2fields(xml, wp_custpost=False):
if raw_date == u'0000-00-00 00:00:00': if raw_date == u'0000-00-00 00:00:00':
date = None date = None
else: else:
date_object = time.strptime(raw_date, '%Y-%m-%d %H:%M:%S') date_object = SafeDatetime.strptime(
date = time.strftime('%Y-%m-%d %H:%M', date_object) raw_date, '%Y-%m-%d %H:%M:%S')
date = date_object.strftime('%Y-%m-%d %H:%M')
author = item.find('creator').string author = item.find('creator').string
categories = [cat.string for cat categories = [cat.string for cat
@ -195,6 +195,59 @@ def wp2fields(xml, wp_custpost=False):
tags, status, kind, 'wp-html') tags, status, kind, 'wp-html')
def blogger2fields(xml):
    """Open a Blogger XML export and yield Pelican fields for each entry.

    Only entries whose kind is post, comment or page are yielded; entries of
    any other kind, or carrying no kind category at all, are skipped.

    :param xml: path of the Blogger XML export, passed to ``xml_to_soup``.
    :returns: a generator of tuples
        ``(title, content, filename, date, author, None, tags, status,
        kind, 'html')`` where ``kind`` is ``'article'``, ``'comment'`` or
        ``'page'`` and ``status`` is ``'published'`` or ``'draft'``.
    """
    kind_scheme = 'http://schemas.google.com/g/2005#kind'
    kinds = {
        'http://schemas.google.com/blogger/2008/kind#post': 'article',
        'http://schemas.google.com/blogger/2008/kind#comment': 'comment',
        'http://schemas.google.com/blogger/2008/kind#page': 'page',
    }

    soup = xml_to_soup(xml)
    entries = soup.feed.findAll('entry')
    for entry in entries:
        kind_tag = entry.find('category', {'scheme': kind_scheme})
        if kind_tag is None:
            # No kind category: treat like any other unrecognised entry.
            continue
        kind = kinds.get(kind_tag.get('term'))
        if kind is None:
            continue

        # Articles and pages are named after the basename of their alternate
        # link. Comments always use the id-based name. This is an explicit
        # test rather than an ``assert`` because asserts are stripped when
        # Python runs with -O.
        filename = None
        if kind != 'comment':
            link = entry.find('link', {'rel': 'alternate'})
            href = link.get('href') if link is not None else None
            if href:
                filename = os.path.splitext(os.path.basename(href))[0]
        if not filename:
            # No usable alternate link (e.g. drafts): fall back to the part
            # of the entry id after the last dot.
            filename = entry.find('id').string.split('.')[-1]

        title = entry.find('title').string or ''
        content = entry.find('content').string

        raw_date = entry.find('published').string
        if hasattr(SafeDatetime, 'fromisoformat'):
            date_object = SafeDatetime.fromisoformat(raw_date)
        else:
            # Without fromisoformat, parse only the first 23 characters
            # (date, time and milliseconds) and ignore whatever follows.
            date_object = SafeDatetime.strptime(
                raw_date[:23], '%Y-%m-%dT%H:%M:%S.%f')
        date = date_object.strftime('%Y-%m-%d %H:%M')

        author = entry.find('author').find('name').string

        # blogger posts only have tags, no category
        tags = [tag.get('term') for tag in entry.findAll(
            'category', {'scheme': 'http://www.blogger.com/atom/ns#'})]

        # Drafts have <app:control><app:draft>yes</app:draft></app:control>
        control = entry.find('control')
        draft = control.find('draft') if control is not None else None
        if draft is not None and draft.string == 'yes':
            status = 'draft'
        else:
            status = 'published'

        yield (title, content, filename, date, author, None, tags, status,
               kind, 'html')
def dc2fields(file): def dc2fields(file):
"""Opens a Dotclear export file, and yield pelican fields""" """Opens a Dotclear export file, and yield pelican fields"""
try: try:
@ -391,7 +444,6 @@ def posterous2fields(api_token, email, password):
def tumblr2fields(api_key, blogname): def tumblr2fields(api_key, blogname):
""" Imports Tumblr posts (API v2)""" """ Imports Tumblr posts (API v2)"""
from time import strftime, localtime
try: try:
# py3k import # py3k import
import json import json
@ -426,8 +478,10 @@ def tumblr2fields(api_key, blogname):
slug = post.get('slug') or slugify(title) slug = post.get('slug') or slugify(title)
tags = post.get('tags') tags = post.get('tags')
timestamp = post.get('timestamp') timestamp = post.get('timestamp')
date = strftime("%Y-%m-%d %H:%M:%S", localtime(int(timestamp))) date = SafeDatetime.fromtimestamp(int(timestamp)).strftime(
slug = strftime("%Y-%m-%d-", localtime(int(timestamp))) + slug "%Y-%m-%d %H:%M:%S")
slug = SafeDatetime.fromtimestamp(int(timestamp)).strftime(
"%Y-%m-%d-") + slug
format = post.get('format') format = post.get('format')
content = post.get('body') content = post.get('body')
type = post.get('type') type = post.get('type')
@ -499,7 +553,7 @@ def feed2fields(file):
import feedparser import feedparser
d = feedparser.parse(file) d = feedparser.parse(file)
for entry in d.entries: for entry in d.entries:
date = (time.strftime('%Y-%m-%d %H:%M', entry.updated_parsed) date = (entry.updated_parsed.strftime('%Y-%m-%d %H:%M')
if hasattr(entry, 'updated_parsed') else None) if hasattr(entry, 'updated_parsed') else None)
author = entry.author if hasattr(entry, 'author') else None author = entry.author if hasattr(entry, 'author') else None
tags = ([e['term'] for e in entry.tags] tags = ([e['term'] for e in entry.tags]
@ -619,7 +673,8 @@ def get_attachments(xml):
"""returns a dictionary of posts that have attachments with a list """returns a dictionary of posts that have attachments with a list
of the attachment_urls of the attachment_urls
""" """
items = get_items(xml) soup = xml_to_soup(xml)
items = soup.rss.channel.findAll('item')
names = {} names = {}
attachments = [] attachments = []
@ -809,16 +864,16 @@ def fields2pelican(
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Transform feed, WordPress, Tumblr, Dotclear, or " description="Transform feed, Blogger, Dotclear, Posterous, Tumblr, or "
"Posterous files into reST (rst) or Markdown (md) files. " "WordPress files into reST (rst) or Markdown (md) files. "
"Be sure to have pandoc installed.", "Be sure to have pandoc installed.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument( parser.add_argument(
dest='input', help='The input file to read') dest='input', help='The input file to read')
parser.add_argument( parser.add_argument(
'--wpfile', action='store_true', dest='wpfile', '--blogger', action='store_true', dest='blogger',
help='Wordpress XML export') help='Blogger XML export')
parser.add_argument( parser.add_argument(
'--dotclear', action='store_true', dest='dotclear', '--dotclear', action='store_true', dest='dotclear',
help='Dotclear export') help='Dotclear export')
@ -828,6 +883,9 @@ def main():
parser.add_argument( parser.add_argument(
'--tumblr', action='store_true', dest='tumblr', '--tumblr', action='store_true', dest='tumblr',
help='Tumblr export') help='Tumblr export')
parser.add_argument(
'--wpfile', action='store_true', dest='wpfile',
help='Wordpress XML export')
parser.add_argument( parser.add_argument(
'--feed', action='store_true', dest='feed', '--feed', action='store_true', dest='feed',
help='Feed to parse') help='Feed to parse')
@ -843,7 +901,7 @@ def main():
parser.add_argument( parser.add_argument(
'--dir-page', action='store_true', dest='dirpage', '--dir-page', action='store_true', dest='dirpage',
help=('Put files recognised as pages in "pages/" sub-directory' help=('Put files recognised as pages in "pages/" sub-directory'
' (wordpress import only)')) ' (blogger and wordpress import only)'))
parser.add_argument( parser.add_argument(
'--filter-author', dest='author', '--filter-author', dest='author',
help='Import only post from the specified author') help='Import only post from the specified author')
@ -885,19 +943,21 @@ def main():
args = parser.parse_args() args = parser.parse_args()
input_type = None input_type = None
if args.wpfile: if args.blogger:
input_type = 'wordpress' input_type = 'blogger'
elif args.dotclear: elif args.dotclear:
input_type = 'dotclear' input_type = 'dotclear'
elif args.posterous: elif args.posterous:
input_type = 'posterous' input_type = 'posterous'
elif args.tumblr: elif args.tumblr:
input_type = 'tumblr' input_type = 'tumblr'
elif args.wpfile:
input_type = 'wordpress'
elif args.feed: elif args.feed:
input_type = 'feed' input_type = 'feed'
else: else:
error = ('You must provide either --wpfile, --dotclear, ' error = ('You must provide either --blogger, --dotclear, '
'--posterous, --tumblr or --feed options') '--posterous, --tumblr, --wpfile or --feed options')
exit(error) exit(error)
if not os.path.exists(args.output): if not os.path.exists(args.output):
@ -912,14 +972,16 @@ def main():
'to use the --wp-attach option') 'to use the --wp-attach option')
exit(error) exit(error)
if input_type == 'wordpress': if input_type == 'blogger':
fields = wp2fields(args.input, args.wp_custpost or False) fields = blogger2fields(args.input)
elif input_type == 'dotclear': elif input_type == 'dotclear':
fields = dc2fields(args.input) fields = dc2fields(args.input)
elif input_type == 'posterous': elif input_type == 'posterous':
fields = posterous2fields(args.input, args.email, args.password) fields = posterous2fields(args.input, args.email, args.password)
elif input_type == 'tumblr': elif input_type == 'tumblr':
fields = tumblr2fields(args.input, args.blogname) fields = tumblr2fields(args.input, args.blogname)
elif input_type == 'wordpress':
fields = wp2fields(args.input, args.wp_custpost or False)
elif input_type == 'feed': elif input_type == 'feed':
fields = feed2fields(args.input) fields = feed2fields(args.input)