forked from github/pelican
add blogger importer
This commit is contained in:
parent
f2c3136921
commit
c388f14d3e
4 changed files with 1218 additions and 33 deletions
|
|
@ -9,10 +9,11 @@ Description
|
|||
``pelican-import`` is a command-line tool for converting articles from other
|
||||
software to reStructuredText or Markdown. The supported import formats are:
|
||||
|
||||
- WordPress XML export
|
||||
- Blogger XML export
|
||||
- Dotclear export
|
||||
- Posterous API
|
||||
- Tumblr API
|
||||
- WordPress XML export
|
||||
- RSS/Atom feed
|
||||
|
||||
The conversion from HTML to reStructuredText or Markdown relies on `Pandoc`_.
|
||||
|
|
@ -40,8 +41,8 @@ Usage
|
|||
|
||||
::
|
||||
|
||||
pelican-import [-h] [--wpfile] [--dotclear] [--posterous] [--tumblr] [--feed] [-o OUTPUT]
|
||||
[-m MARKUP] [--dir-cat] [--dir-page] [--strip-raw] [--wp-custpost]
|
||||
pelican-import [-h] [--blogger] [--dotclear] [--posterous] [--tumblr] [--wpfile] [--feed]
|
||||
[-o OUTPUT] [-m MARKUP] [--dir-cat] [--dir-page] [--strip-raw] [--wp-custpost]
|
||||
[--wp-attach] [--disable-slugs] [-e EMAIL] [-p PASSWORD] [-b BLOGNAME]
|
||||
input|api_token|api_key
|
||||
|
||||
|
|
@ -57,10 +58,11 @@ Optional arguments
|
|||
------------------
|
||||
|
||||
-h, --help Show this help message and exit
|
||||
--wpfile WordPress XML export (default: False)
|
||||
--blogger Blogger XML export (default: False)
|
||||
--dotclear Dotclear export (default: False)
|
||||
--posterous Posterous API (default: False)
|
||||
--tumblr Tumblr API (default: False)
|
||||
--wpfile WordPress XML export (default: False)
|
||||
--feed Feed to parse (default: False)
|
||||
-o OUTPUT, --output OUTPUT
|
||||
Output path (default: content)
|
||||
|
|
@ -70,7 +72,8 @@ Optional arguments
|
|||
--dir-cat Put files in directories named after their categories
|
||||
(default: False)
|
||||
--dir-page Put files recognised as pages in "pages/" sub-
|
||||
directory (wordpress import only) (default: False)
|
||||
directory (blogger and wordpress import only)
|
||||
(default: False)
|
||||
--filter-author Import only posts from the specified author
|
||||
--strip-raw Strip raw HTML code that can't be converted to markup
|
||||
such as flash embeds or iframes (wordpress import
|
||||
|
|
@ -102,9 +105,9 @@ Optional arguments
|
|||
Examples
|
||||
========
|
||||
|
||||
For WordPress::
|
||||
For Blogger::
|
||||
|
||||
$ pelican-import --wpfile -o ~/output ~/posts.xml
|
||||
$ pelican-import --blogger -o ~/output ~/posts.xml
|
||||
|
||||
For Dotclear::
|
||||
|
||||
|
|
@ -118,6 +121,10 @@ For Tumblr::
|
|||
|
||||
$ pelican-import --tumblr -o ~/output --blogname=<blogname> <api_token>
|
||||
|
||||
For WordPress::
|
||||
|
||||
$ pelican-import --wpfile -o ~/output ~/posts.xml
|
||||
|
||||
Tests
|
||||
=====
|
||||
|
||||
|
|
|
|||
1067
pelican/tests/content/bloggerexport.xml
vendored
Normal file
1067
pelican/tests/content/bloggerexport.xml
vendored
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -8,13 +8,15 @@ from codecs import open
|
|||
|
||||
from pelican.tests.support import (mute, skipIfNoExecutable, temporary_folder,
|
||||
unittest)
|
||||
from pelican.tools.pelican_import import (build_header, build_markdown_header,
|
||||
from pelican.tools.pelican_import import (blogger2fields, build_header,
|
||||
build_markdown_header,
|
||||
decode_wp_content,
|
||||
download_attachments, fields2pelican,
|
||||
get_attachments, wp2fields)
|
||||
from pelican.utils import path_to_file_url, slugify
|
||||
|
||||
CUR_DIR = os.path.abspath(os.path.dirname(__file__))
|
||||
BLOGGER_XML_SAMPLE = os.path.join(CUR_DIR, 'content', 'bloggerexport.xml')
|
||||
WORDPRESS_XML_SAMPLE = os.path.join(CUR_DIR, 'content', 'wordpressexport.xml')
|
||||
WORDPRESS_ENCODED_CONTENT_SAMPLE = os.path.join(CUR_DIR,
|
||||
'content',
|
||||
|
|
@ -34,6 +36,53 @@ except ImportError:
|
|||
LXML = False
|
||||
|
||||
|
||||
@skipIfNoExecutable(['pandoc', '--version'])
@unittest.skipUnless(BeautifulSoup, 'Needs BeautifulSoup module')
class TestBloggerXmlImporter(unittest.TestCase):
    """Run ``blogger2fields`` over the sample Blogger export and check it.

    Every imported post is a tuple; the tests below read the title
    (index 0), filename (index 2), status (index 7) and kind (index 8).
    """

    def setUp(self):
        # Remember the active locale, switch to "C" for the test, and
        # import the sample export once per test.
        self.old_locale = locale.setlocale(locale.LC_ALL)
        locale.setlocale(locale.LC_ALL, str('C'))
        self.posts = list(blogger2fields(BLOGGER_XML_SAMPLE))

    def tearDown(self):
        # Put back the locale that was active before setUp ran.
        locale.setlocale(locale.LC_ALL, self.old_locale)

    def test_recognise_kind_and_title(self):
        """Check that the importer only outputs pages, articles and
        comments, that these are correctly identified and that titles
        are correct.
        """
        # Group the titles by kind in a single pass over the posts.
        titles_by_kind = {}
        for post in self.posts:
            titles_by_kind.setdefault(post[8], set()).add(post[0])

        self.assertEqual({'page', 'article', 'comment'}, set(titles_by_kind))
        self.assertEqual({'Test page', 'Test page 2'},
                         titles_by_kind['page'])
        self.assertEqual({'Black as Egypt\'s Night', 'The Steel Windpipe'},
                         titles_by_kind['article'])
        self.assertEqual({'Mishka, always a pleasure to read your '
                          'adventures!...'},
                         titles_by_kind['comment'])

    def test_recognise_status_with_correct_filename(self):
        """Check that the importer outputs only statuses 'published' and
        'draft', that these are correctly identified and that filenames
        are correct.
        """
        # Group the filenames by status in a single pass over the posts.
        filenames_by_status = {}
        for post in self.posts:
            filenames_by_status.setdefault(post[7], set()).add(post[2])

        self.assertEqual({'published', 'draft'}, set(filenames_by_status))

        # draft filenames are id-based
        self.assertEqual({'page-4386962582497458967',
                          'post-1276418104709695660'},
                         filenames_by_status['draft'])

        # published filenames are url-based, except comments
        self.assertEqual({'the-steel-windpipe',
                          'test-page',
                          'post-5590533389087749201'},
                         filenames_by_status['published'])
|
||||
|
||||
|
||||
@skipIfNoExecutable(['pandoc', '--version'])
|
||||
@unittest.skipUnless(BeautifulSoup, 'Needs BeautifulSoup module')
|
||||
class TestWordpressXmlImporter(unittest.TestCase):
|
||||
|
|
|
|||
|
|
@ -8,7 +8,6 @@ import os
|
|||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from codecs import open
|
||||
from collections import defaultdict
|
||||
|
||||
|
|
@ -117,19 +116,18 @@ def decode_wp_content(content, br=True):
|
|||
return content
|
||||
|
||||
|
||||
def xml_to_soup(xml):
    """Read the XML file at path *xml* and return it as a BeautifulSoup tree.

    The file is read as UTF-8 text and parsed with BeautifulSoup's "xml"
    parser.

    Exits the program through ``sys.exit`` with an explanatory message when
    BeautifulSoup4 cannot be imported, or when BeautifulSoup reports that no
    parser is available for the "xml" feature (i.e. lxml is missing).
    """
    error = ('Missing dependency "BeautifulSoup4" and "lxml" required to '
             'import XML files.')
    try:
        from bs4 import BeautifulSoup, FeatureNotFound
    except ImportError:
        sys.exit(error)

    with open(xml, encoding='utf-8') as infile:
        xmlfile = infile.read()

    try:
        soup = BeautifulSoup(xmlfile, "xml")
    except FeatureNotFound:
        # bs4 is installed but has no XML parser to delegate to; report it
        # with the same message instead of letting a traceback escape.
        sys.exit(error)
    return soup


def get_items(xml):
    """Open a WordPress XML file and return the list of its ``item`` tags.

    Thin wrapper around :func:`xml_to_soup`, kept so that code written
    against the older name keeps working.
    """
    return xml_to_soup(xml).rss.channel.findAll('item')
|
||||
|
||||
|
||||
def get_filename(filename, post_id):
|
||||
|
|
@ -142,7 +140,8 @@ def get_filename(filename, post_id):
|
|||
def wp2fields(xml, wp_custpost=False):
|
||||
"""Opens a wordpress XML file, and yield Pelican fields"""
|
||||
|
||||
items = get_items(xml)
|
||||
soup = xml_to_soup(xml)
|
||||
items = soup.rss.channel.findAll('item')
|
||||
for item in items:
|
||||
|
||||
if item.find('status').string in ["publish", "draft"]:
|
||||
|
|
@ -163,8 +162,9 @@ def wp2fields(xml, wp_custpost=False):
|
|||
if raw_date == u'0000-00-00 00:00:00':
|
||||
date = None
|
||||
else:
|
||||
date_object = time.strptime(raw_date, '%Y-%m-%d %H:%M:%S')
|
||||
date = time.strftime('%Y-%m-%d %H:%M', date_object)
|
||||
date_object = SafeDatetime.strptime(
|
||||
raw_date, '%Y-%m-%d %H:%M:%S')
|
||||
date = date_object.strftime('%Y-%m-%d %H:%M')
|
||||
author = item.find('creator').string
|
||||
|
||||
categories = [cat.string for cat
|
||||
|
|
@ -195,6 +195,59 @@ def wp2fields(xml, wp_custpost=False):
|
|||
tags, status, kind, 'wp-html')
|
||||
|
||||
|
||||
def blogger2fields(xml):
    """Open a Blogger XML export and yield Pelican fields for each entry.

    Only posts, pages and comments are yielded; entries of any other kind,
    or without a kind category at all, are skipped.

    Each yielded tuple is ``(title, content, filename, date, author,
    category, tags, status, kind, format)`` where ``category`` is always
    ``None``, ``status`` is ``'published'`` or ``'draft'``, ``kind`` is
    ``'article'``, ``'page'`` or ``'comment'`` and ``format`` is ``'html'``.
    """
    kind_scheme = 'http://schemas.google.com/g/2005#kind'
    kind_prefix = 'http://schemas.google.com/blogger/2008/kind#'
    kinds = {
        kind_prefix + 'post': 'article',
        kind_prefix + 'comment': 'comment',
        kind_prefix + 'page': 'page',
    }

    soup = xml_to_soup(xml)
    for entry in soup.feed.findAll('entry'):
        kind_tag = entry.find('category', {'scheme': kind_scheme})
        if kind_tag is None:
            # No kind category: treat like any other unrecognised entry.
            continue
        kind = kinds.get(kind_tag.get('term'))
        if kind is None:
            continue

        # Posts and pages take their filename from the "alternate" link.
        # Comments, and entries lacking such a link or its href, fall back
        # to the last dot-separated component of the entry id.  This is an
        # explicit condition rather than an ``assert`` so the behaviour is
        # the same when Python runs with -O.
        filename = None
        if kind != 'comment':
            link = entry.find('link', {'rel': 'alternate'})
            href = link.get('href') if link is not None else None
            if href is not None:
                filename = os.path.splitext(os.path.basename(href))[0]
        if filename is None:
            filename = entry.find('id').string.split('.')[-1]

        title = entry.find('title').string or ''

        content = entry.find('content').string
        raw_date = entry.find('published').string
        if hasattr(SafeDatetime, 'fromisoformat'):
            date_object = SafeDatetime.fromisoformat(raw_date)
        else:
            # Without fromisoformat, parse only the first 23 characters
            # (date, time and fractional seconds) and drop the remainder.
            date_object = SafeDatetime.strptime(
                raw_date[:23], '%Y-%m-%dT%H:%M:%S.%f')
        date = date_object.strftime('%Y-%m-%d %H:%M')
        author = entry.find('author').find('name').string

        # blogger posts only have tags, no category
        tags = [tag.get('term') for tag in entry.findAll(
            'category', {'scheme': 'http://www.blogger.com/atom/ns#'})]

        # Drafts have <app:control><app:draft>yes</app:draft></app:control>
        control = entry.find('control')
        draft = control.find('draft') if control is not None else None
        if draft is not None and draft.string == 'yes':
            status = 'draft'
        else:
            status = 'published'

        yield (title, content, filename, date, author, None, tags, status,
               kind, 'html')
|
||||
|
||||
|
||||
def dc2fields(file):
|
||||
"""Opens a Dotclear export file, and yield pelican fields"""
|
||||
try:
|
||||
|
|
@ -391,7 +444,6 @@ def posterous2fields(api_token, email, password):
|
|||
|
||||
def tumblr2fields(api_key, blogname):
|
||||
""" Imports Tumblr posts (API v2)"""
|
||||
from time import strftime, localtime
|
||||
try:
|
||||
# py3k import
|
||||
import json
|
||||
|
|
@ -426,8 +478,10 @@ def tumblr2fields(api_key, blogname):
|
|||
slug = post.get('slug') or slugify(title)
|
||||
tags = post.get('tags')
|
||||
timestamp = post.get('timestamp')
|
||||
date = strftime("%Y-%m-%d %H:%M:%S", localtime(int(timestamp)))
|
||||
slug = strftime("%Y-%m-%d-", localtime(int(timestamp))) + slug
|
||||
date = SafeDatetime.fromtimestamp(int(timestamp)).strftime(
|
||||
"%Y-%m-%d %H:%M:%S")
|
||||
slug = SafeDatetime.fromtimestamp(int(timestamp)).strftime(
|
||||
"%Y-%m-%d-") + slug
|
||||
format = post.get('format')
|
||||
content = post.get('body')
|
||||
type = post.get('type')
|
||||
|
|
@ -499,7 +553,7 @@ def feed2fields(file):
|
|||
import feedparser
|
||||
d = feedparser.parse(file)
|
||||
for entry in d.entries:
|
||||
date = (time.strftime('%Y-%m-%d %H:%M', entry.updated_parsed)
|
||||
date = (entry.updated_parsed.strftime('%Y-%m-%d %H:%M')
|
||||
if hasattr(entry, 'updated_parsed') else None)
|
||||
author = entry.author if hasattr(entry, 'author') else None
|
||||
tags = ([e['term'] for e in entry.tags]
|
||||
|
|
@ -619,7 +673,8 @@ def get_attachments(xml):
|
|||
"""returns a dictionary of posts that have attachments with a list
|
||||
of the attachment_urls
|
||||
"""
|
||||
items = get_items(xml)
|
||||
soup = xml_to_soup(xml)
|
||||
items = soup.rss.channel.findAll('item')
|
||||
names = {}
|
||||
attachments = []
|
||||
|
||||
|
|
@ -807,16 +862,16 @@ def fields2pelican(
|
|||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Transform feed, WordPress, Tumblr, Dotclear, or "
|
||||
"Posterous files into reST (rst) or Markdown (md) files. "
|
||||
description="Transform feed, Blogger, Dotclear, Posterous, Tumblr, or "
|
||||
"WordPress files into reST (rst) or Markdown (md) files. "
|
||||
"Be sure to have pandoc installed.",
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
|
||||
parser.add_argument(
|
||||
dest='input', help='The input file to read')
|
||||
parser.add_argument(
|
||||
'--wpfile', action='store_true', dest='wpfile',
|
||||
help='Wordpress XML export')
|
||||
'--blogger', action='store_true', dest='blogger',
|
||||
help='Blogger XML export')
|
||||
parser.add_argument(
|
||||
'--dotclear', action='store_true', dest='dotclear',
|
||||
help='Dotclear export')
|
||||
|
|
@ -826,6 +881,9 @@ def main():
|
|||
parser.add_argument(
|
||||
'--tumblr', action='store_true', dest='tumblr',
|
||||
help='Tumblr export')
|
||||
parser.add_argument(
|
||||
'--wpfile', action='store_true', dest='wpfile',
|
||||
help='Wordpress XML export')
|
||||
parser.add_argument(
|
||||
'--feed', action='store_true', dest='feed',
|
||||
help='Feed to parse')
|
||||
|
|
@ -841,7 +899,7 @@ def main():
|
|||
parser.add_argument(
|
||||
'--dir-page', action='store_true', dest='dirpage',
|
||||
help=('Put files recognised as pages in "pages/" sub-directory'
|
||||
' (wordpress import only)'))
|
||||
' (blogger and wordpress import only)'))
|
||||
parser.add_argument(
|
||||
'--filter-author', dest='author',
|
||||
help='Import only post from the specified author')
|
||||
|
|
@ -883,19 +941,21 @@ def main():
|
|||
args = parser.parse_args()
|
||||
|
||||
input_type = None
|
||||
if args.wpfile:
|
||||
input_type = 'wordpress'
|
||||
if args.blogger:
|
||||
input_type = 'blogger'
|
||||
elif args.dotclear:
|
||||
input_type = 'dotclear'
|
||||
elif args.posterous:
|
||||
input_type = 'posterous'
|
||||
elif args.tumblr:
|
||||
input_type = 'tumblr'
|
||||
elif args.wpfile:
|
||||
input_type = 'wordpress'
|
||||
elif args.feed:
|
||||
input_type = 'feed'
|
||||
else:
|
||||
error = ('You must provide either --wpfile, --dotclear, '
|
||||
'--posterous, --tumblr or --feed options')
|
||||
error = ('You must provide either --blogger, --dotclear, '
|
||||
'--posterous, --tumblr, --wpfile or --feed options')
|
||||
exit(error)
|
||||
|
||||
if not os.path.exists(args.output):
|
||||
|
|
@ -910,14 +970,16 @@ def main():
|
|||
'to use the --wp-attach option')
|
||||
exit(error)
|
||||
|
||||
if input_type == 'wordpress':
|
||||
fields = wp2fields(args.input, args.wp_custpost or False)
|
||||
if input_type == 'blogger':
|
||||
fields = blogger2fields(args.input)
|
||||
elif input_type == 'dotclear':
|
||||
fields = dc2fields(args.input)
|
||||
elif input_type == 'posterous':
|
||||
fields = posterous2fields(args.input, args.email, args.password)
|
||||
elif input_type == 'tumblr':
|
||||
fields = tumblr2fields(args.input, args.blogname)
|
||||
elif input_type == 'wordpress':
|
||||
fields = wp2fields(args.input, args.wp_custpost or False)
|
||||
elif input_type == 'feed':
|
||||
fields = feed2fields(args.input)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue