add blogger importer

This commit is contained in:
Oliver Urs Lenz 2018-08-07 14:06:46 +02:00
commit c388f14d3e
4 changed files with 1218 additions and 33 deletions

View file

@ -9,10 +9,11 @@ Description
``pelican-import`` is a command-line tool for converting articles from other
software to reStructuredText or Markdown. The supported import formats are:
- WordPress XML export
- Blogger XML export
- Dotclear export
- Posterous API
- Tumblr API
- WordPress XML export
- RSS/Atom feed
The conversion from HTML to reStructuredText or Markdown relies on `Pandoc`_.
@ -40,8 +41,8 @@ Usage
::
pelican-import [-h] [--wpfile] [--dotclear] [--posterous] [--tumblr] [--feed] [-o OUTPUT]
[-m MARKUP] [--dir-cat] [--dir-page] [--strip-raw] [--wp-custpost]
pelican-import [-h] [--blogger] [--dotclear] [--posterous] [--tumblr] [--wpfile] [--feed]
[-o OUTPUT] [-m MARKUP] [--dir-cat] [--dir-page] [--strip-raw] [--wp-custpost]
[--wp-attach] [--disable-slugs] [-e EMAIL] [-p PASSWORD] [-b BLOGNAME]
input|api_token|api_key
@ -57,10 +58,11 @@ Optional arguments
------------------
-h, --help Show this help message and exit
--wpfile WordPress XML export (default: False)
--blogger Blogger XML export (default: False)
--dotclear Dotclear export (default: False)
--posterous Posterous API (default: False)
--tumblr Tumblr API (default: False)
--wpfile WordPress XML export (default: False)
--feed Feed to parse (default: False)
-o OUTPUT, --output OUTPUT
Output path (default: content)
@ -70,7 +72,8 @@ Optional arguments
--dir-cat Put files in directories with categories name
(default: False)
--dir-page Put files recognised as pages in "pages/" sub-
directory (wordpress import only) (default: False)
directory (blogger and wordpress import only)
(default: False)
--filter-author Import only post from the specified author
--strip-raw Strip raw HTML code that can't be converted to markup
such as flash embeds or iframes (wordpress import
@ -102,9 +105,9 @@ Optional arguments
Examples
========
For WordPress::
For Blogger::
$ pelican-import --wpfile -o ~/output ~/posts.xml
$ pelican-import --blogger -o ~/output ~/posts.xml
For Dotclear::
@ -118,6 +121,10 @@ For Tumblr::
$ pelican-import --tumblr -o ~/output --blogname=<blogname> <api_token>
For WordPress::
$ pelican-import --wpfile -o ~/output ~/posts.xml
Tests
=====

1067
pelican/tests/content/bloggerexport.xml vendored Normal file

File diff suppressed because it is too large Load diff

View file

@ -8,13 +8,15 @@ from codecs import open
from pelican.tests.support import (mute, skipIfNoExecutable, temporary_folder,
unittest)
from pelican.tools.pelican_import import (build_header, build_markdown_header,
from pelican.tools.pelican_import import (blogger2fields, build_header,
build_markdown_header,
decode_wp_content,
download_attachments, fields2pelican,
get_attachments, wp2fields)
from pelican.utils import path_to_file_url, slugify
CUR_DIR = os.path.abspath(os.path.dirname(__file__))
BLOGGER_XML_SAMPLE = os.path.join(CUR_DIR, 'content', 'bloggerexport.xml')
WORDPRESS_XML_SAMPLE = os.path.join(CUR_DIR, 'content', 'wordpressexport.xml')
WORDPRESS_ENCODED_CONTENT_SAMPLE = os.path.join(CUR_DIR,
'content',
@ -34,6 +36,53 @@ except ImportError:
LXML = False
@skipIfNoExecutable(['pandoc', '--version'])
@unittest.skipUnless(BeautifulSoup, 'Needs BeautifulSoup module')
class TestBloggerXmlImporter(unittest.TestCase):
    """Checks the fields produced by ``blogger2fields`` for the sample export.

    Each imported item is a tuple; these tests read index 0 (title),
    index 2 (filename), index 7 (status) and index 8 (kind).
    """

    def setUp(self):
        # Switch to the "C" locale for the duration of the test and keep the
        # previous setting so that tearDown can put it back.
        self.old_locale = locale.setlocale(locale.LC_ALL)
        locale.setlocale(locale.LC_ALL, str('C'))
        self.posts = list(blogger2fields(BLOGGER_XML_SAMPLE))

    def tearDown(self):
        locale.setlocale(locale.LC_ALL, self.old_locale)

    def test_recognise_kind_and_title(self):
        """Check that the importer only outputs pages, articles and comments,
        that these are correctly identified and that titles are correct.
        """
        titles_by_kind = {}
        for post in self.posts:
            titles_by_kind.setdefault(post[8], set()).add(post[0])

        self.assertEqual({'page', 'article', 'comment'}, set(titles_by_kind))
        self.assertEqual({'Test page', 'Test page 2'},
                         titles_by_kind['page'])
        self.assertEqual({'Black as Egypt\'s Night', 'The Steel Windpipe'},
                         titles_by_kind['article'])
        self.assertEqual({'Mishka, always a pleasure to read your '
                          'adventures!...'},
                         titles_by_kind['comment'])

    def test_recognise_status_with_correct_filename(self):
        """Check that the importer outputs only statuses 'published' and
        'draft', that these are correctly identified and that filenames are
        correct.
        """
        filenames_by_status = {}
        for post in self.posts:
            filenames_by_status.setdefault(post[7], set()).add(post[2])

        self.assertEqual({'published', 'draft'}, set(filenames_by_status))
        # draft filenames are id-based
        self.assertEqual({'page-4386962582497458967',
                          'post-1276418104709695660'},
                         filenames_by_status['draft'])
        # published filenames are url-based, except comments
        self.assertEqual({'the-steel-windpipe',
                          'test-page',
                          'post-5590533389087749201'},
                         filenames_by_status['published'])
@skipIfNoExecutable(['pandoc', '--version'])
@unittest.skipUnless(BeautifulSoup, 'Needs BeautifulSoup module')
class TestWordpressXmlImporter(unittest.TestCase):

View file

@ -8,7 +8,6 @@ import os
import re
import subprocess
import sys
import time
from codecs import open
from collections import defaultdict
@ -117,19 +116,18 @@ def decode_wp_content(content, br=True):
return content
def get_items(xml):
"""Opens a WordPress xml file and returns a list of items"""
def xml_to_soup(xml):
"""Opens an xml file"""
try:
from bs4 import BeautifulSoup
except ImportError:
error = ('Missing dependency "BeautifulSoup4" and "lxml" required to '
'import WordPress XML files.')
'import XML files.')
sys.exit(error)
with open(xml, encoding='utf-8') as infile:
xmlfile = infile.read()
soup = BeautifulSoup(xmlfile, "xml")
items = soup.rss.channel.findAll('item')
return items
return soup
def get_filename(filename, post_id):
@ -142,7 +140,8 @@ def get_filename(filename, post_id):
def wp2fields(xml, wp_custpost=False):
"""Opens a wordpress XML file, and yield Pelican fields"""
items = get_items(xml)
soup = xml_to_soup(xml)
items = soup.rss.channel.findAll('item')
for item in items:
if item.find('status').string in ["publish", "draft"]:
@ -163,8 +162,9 @@ def wp2fields(xml, wp_custpost=False):
if raw_date == u'0000-00-00 00:00:00':
date = None
else:
date_object = time.strptime(raw_date, '%Y-%m-%d %H:%M:%S')
date = time.strftime('%Y-%m-%d %H:%M', date_object)
date_object = SafeDatetime.strptime(
raw_date, '%Y-%m-%d %H:%M:%S')
date = date_object.strftime('%Y-%m-%d %H:%M')
author = item.find('creator').string
categories = [cat.string for cat
@ -195,6 +195,59 @@ def wp2fields(xml, wp_custpost=False):
tags, status, kind, 'wp-html')
def blogger2fields(xml):
    """Opens a blogger XML file, and yield Pelican fields

    Every ``<entry>`` of the feed whose kind category marks it as a post,
    a comment or a page is converted; entries of any other kind (or with no
    kind category at all) are skipped.

    :param xml: path of the Blogger XML export to read
    :return: generator of tuples ``(title, content, filename, date, author,
        categories, tags, status, kind, 'html')`` where ``categories`` is
        always ``None``, ``status`` is ``'published'`` or ``'draft'`` and
        ``kind`` is ``'article'``, ``'comment'`` or ``'page'``
    """
    kind_scheme = 'http://schemas.google.com/g/2005#kind'
    kind_prefix = 'http://schemas.google.com/blogger/2008/kind#'
    kinds = {
        kind_prefix + 'post': 'article',
        kind_prefix + 'comment': 'comment',
        kind_prefix + 'page': 'page',
    }

    soup = xml_to_soup(xml)
    entries = soup.feed.findAll('entry')
    for entry in entries:
        kind_tag = entry.find('category', {'scheme': kind_scheme})
        if kind_tag is None:
            # No kind marker at all: nothing we know how to import.
            continue
        kind = kinds.get(kind_tag.get('term'))
        if kind is None:
            continue

        # Articles and pages are named after their public URL when they have
        # one; comments, and entries without a usable URL, fall back to the
        # last dot-separated component of the entry id.  This is an explicit
        # test rather than an ``assert`` so that it still works under
        # ``python -O``, which strips assert statements.
        filename = None
        if kind != 'comment':
            link = entry.find('link', {'rel': 'alternate'})
            href = link.get('href') if link is not None else None
            if href:
                filename = os.path.splitext(os.path.basename(href))[0]
        if not filename:
            filename = entry.find('id').string.split('.')[-1]

        title = entry.find('title').string or ''

        content = entry.find('content').string
        raw_date = entry.find('published').string
        date_object = None
        if hasattr(SafeDatetime, 'fromisoformat'):
            try:
                date_object = SafeDatetime.fromisoformat(raw_date)
            except ValueError:
                # Older implementations of fromisoformat() reject some valid
                # timestamps (e.g. a trailing "Z"); use strptime() below.
                pass
        if date_object is None:
            # Keep only "YYYY-MM-DDTHH:MM:SS.mmm", dropping the UTC offset.
            date_object = SafeDatetime.strptime(
                raw_date[:23], '%Y-%m-%dT%H:%M:%S.%f')
        date = date_object.strftime('%Y-%m-%d %H:%M')
        author = entry.find('author').find('name').string

        # blogger posts only have tags, no category
        tags = [tag.get('term') for tag in entry.findAll(
            'category', {'scheme': 'http://www.blogger.com/atom/ns#'})]

        # Drafts have <app:control><app:draft>yes</app:draft></app:control>
        control = entry.find('control')
        draft = control.find('draft') if control is not None else None
        if draft is not None and draft.string == 'yes':
            status = 'draft'
        else:
            status = 'published'

        yield (title, content, filename, date, author, None, tags, status,
               kind, 'html')
def dc2fields(file):
"""Opens a Dotclear export file, and yield pelican fields"""
try:
@ -391,7 +444,6 @@ def posterous2fields(api_token, email, password):
def tumblr2fields(api_key, blogname):
""" Imports Tumblr posts (API v2)"""
from time import strftime, localtime
try:
# py3k import
import json
@ -426,8 +478,10 @@ def tumblr2fields(api_key, blogname):
slug = post.get('slug') or slugify(title)
tags = post.get('tags')
timestamp = post.get('timestamp')
date = strftime("%Y-%m-%d %H:%M:%S", localtime(int(timestamp)))
slug = strftime("%Y-%m-%d-", localtime(int(timestamp))) + slug
date = SafeDatetime.fromtimestamp(int(timestamp)).strftime(
"%Y-%m-%d %H:%M:%S")
slug = SafeDatetime.fromtimestamp(int(timestamp)).strftime(
"%Y-%m-%d-") + slug
format = post.get('format')
content = post.get('body')
type = post.get('type')
@ -499,7 +553,7 @@ def feed2fields(file):
import feedparser
d = feedparser.parse(file)
for entry in d.entries:
date = (time.strftime('%Y-%m-%d %H:%M', entry.updated_parsed)
date = (entry.updated_parsed.strftime('%Y-%m-%d %H:%M')
if hasattr(entry, 'updated_parsed') else None)
author = entry.author if hasattr(entry, 'author') else None
tags = ([e['term'] for e in entry.tags]
@ -619,7 +673,8 @@ def get_attachments(xml):
"""returns a dictionary of posts that have attachments with a list
of the attachment_urls
"""
items = get_items(xml)
soup = xml_to_soup(xml)
items = soup.rss.channel.findAll('item')
names = {}
attachments = []
@ -807,16 +862,16 @@ def fields2pelican(
def main():
parser = argparse.ArgumentParser(
description="Transform feed, WordPress, Tumblr, Dotclear, or "
"Posterous files into reST (rst) or Markdown (md) files. "
description="Transform feed, Blogger, Dotclear, Posterous, Tumblr, or "
"WordPress files into reST (rst) or Markdown (md) files. "
"Be sure to have pandoc installed.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
dest='input', help='The input file to read')
parser.add_argument(
'--wpfile', action='store_true', dest='wpfile',
help='Wordpress XML export')
'--blogger', action='store_true', dest='blogger',
help='Blogger XML export')
parser.add_argument(
'--dotclear', action='store_true', dest='dotclear',
help='Dotclear export')
@ -826,6 +881,9 @@ def main():
parser.add_argument(
'--tumblr', action='store_true', dest='tumblr',
help='Tumblr export')
parser.add_argument(
'--wpfile', action='store_true', dest='wpfile',
help='Wordpress XML export')
parser.add_argument(
'--feed', action='store_true', dest='feed',
help='Feed to parse')
@ -841,7 +899,7 @@ def main():
parser.add_argument(
'--dir-page', action='store_true', dest='dirpage',
help=('Put files recognised as pages in "pages/" sub-directory'
' (wordpress import only)'))
' (blogger and wordpress import only)'))
parser.add_argument(
'--filter-author', dest='author',
help='Import only post from the specified author')
@ -883,19 +941,21 @@ def main():
args = parser.parse_args()
input_type = None
if args.wpfile:
input_type = 'wordpress'
if args.blogger:
input_type = 'blogger'
elif args.dotclear:
input_type = 'dotclear'
elif args.posterous:
input_type = 'posterous'
elif args.tumblr:
input_type = 'tumblr'
elif args.wpfile:
input_type = 'wordpress'
elif args.feed:
input_type = 'feed'
else:
error = ('You must provide either --wpfile, --dotclear, '
'--posterous, --tumblr or --feed options')
error = ('You must provide either --blogger, --dotclear, '
'--posterous, --tumblr, --wpfile or --feed options')
exit(error)
if not os.path.exists(args.output):
@ -910,14 +970,16 @@ def main():
'to use the --wp-attach option')
exit(error)
if input_type == 'wordpress':
fields = wp2fields(args.input, args.wp_custpost or False)
if input_type == 'blogger':
fields = blogger2fields(args.input)
elif input_type == 'dotclear':
fields = dc2fields(args.input)
elif input_type == 'posterous':
fields = posterous2fields(args.input, args.email, args.password)
elif input_type == 'tumblr':
fields = tumblr2fields(args.input, args.blogname)
elif input_type == 'wordpress':
fields = wp2fields(args.input, args.wp_custpost or False)
elif input_type == 'feed':
fields = feed2fields(args.input)