From c388f14d3e9067ec9b6329a2eae3925d08d8cb8c Mon Sep 17 00:00:00 2001 From: Oliver Urs Lenz Date: Tue, 7 Aug 2018 14:06:46 +0200 Subject: [PATCH] add blogger importer --- docs/importer.rst | 21 +- pelican/tests/content/bloggerexport.xml | 1067 +++++++++++++++++++++++ pelican/tests/test_importer.py | 51 +- pelican/tools/pelican_import.py | 112 ++- 4 files changed, 1218 insertions(+), 33 deletions(-) create mode 100644 pelican/tests/content/bloggerexport.xml diff --git a/docs/importer.rst b/docs/importer.rst index 713cc3a3..e8614f2a 100644 --- a/docs/importer.rst +++ b/docs/importer.rst @@ -9,10 +9,11 @@ Description ``pelican-import`` is a command-line tool for converting articles from other software to reStructuredText or Markdown. The supported import formats are: -- WordPress XML export +- Blogger XML export - Dotclear export - Posterous API - Tumblr API +- WordPress XML export - RSS/Atom feed The conversion from HTML to reStructuredText or Markdown relies on `Pandoc`_. @@ -40,8 +41,8 @@ Usage :: - pelican-import [-h] [--wpfile] [--dotclear] [--posterous] [--tumblr] [--feed] [-o OUTPUT] - [-m MARKUP] [--dir-cat] [--dir-page] [--strip-raw] [--wp-custpost] + pelican-import [-h] [--blogger] [--dotclear] [--posterous] [--tumblr] [--wpfile] [--feed] + [-o OUTPUT] [-m MARKUP] [--dir-cat] [--dir-page] [--strip-raw] [--wp-custpost] [--wp-attach] [--disable-slugs] [-e EMAIL] [-p PASSWORD] [-b BLOGNAME] input|api_token|api_key @@ -57,10 +58,11 @@ Optional arguments ------------------ -h, --help Show this help message and exit - --wpfile WordPress XML export (default: False) + --blogger Blogger XML export (default: False) --dotclear Dotclear export (default: False) --posterous Posterous API (default: False) --tumblr Tumblr API (default: False) + --wpfile WordPress XML export (default: False) --feed Feed to parse (default: False) -o OUTPUT, --output OUTPUT Output path (default: content) @@ -70,7 +72,8 @@ Optional arguments --dir-cat Put files in directories with categories 
name (default: False) --dir-page Put files recognised as pages in "pages/" sub- - directory (wordpress import only) (default: False) + directory (blogger and wordpress import only) + (default: False) --filter-author Import only post from the specified author --strip-raw Strip raw HTML code that can't be converted to markup such as flash embeds or iframes (wordpress import @@ -102,9 +105,9 @@ Optional arguments Examples ======== -For WordPress:: +For Blogger:: - $ pelican-import --wpfile -o ~/output ~/posts.xml + $ pelican-import --blogger -o ~/output ~/posts.xml For Dotclear:: @@ -118,6 +121,10 @@ For Tumblr:: $ pelican-import --tumblr -o ~/output --blogname= +For WordPress:: + + $ pelican-import --wpfile -o ~/output ~/posts.xml + Tests ===== diff --git a/pelican/tests/content/bloggerexport.xml b/pelican/tests/content/bloggerexport.xml new file mode 100644 index 00000000..4bc0985a --- /dev/null +++ b/pelican/tests/content/bloggerexport.xml @@ -0,0 +1,1067 @@ + + + + tag:blogger.com,1999:blog-6303278419262689239.archive + + 2018-08-02T12:38:27.320-07:00 + + Notes of a Young Doctor + + + + + + Mikhail Afanasyevich Bulgakov + + https://www.blogger.com/profile/000082957 + + noreply@blogger.com + + + + Blogger + + + tag:blogger.com,1999:blog-6303278419262689239.layout + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Template: Notes of a Young Doctor + [Over 2000 lines of mostly css that we don't need here.] + + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_PUBLISHING_MODE + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Het type publicatie voor deze blog. 
+ PUBLISH_MODE_BLOGSPOT + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_ADMIN_PERMISSION + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + De lijst van e-mails van beheerders voor de blog. + mikhail.afanasyevich.bulgakov@gmail.com + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_ADULT_CONTENT + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hier wordt aangegeven of deze blog content voor volwassenen bevat + false + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_ALTERNATE_JSRENDER_ALLOWED + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hier wordt aangegeven of alternatieve weergaven in JavaScript zijn toegestaan + true + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_ANALYTICS_ACCOUNT_NUMBER + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Google Analytics-accountnummer voor een blog + + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_ARCHIVE_DATE_FORMAT + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Het getal van de datumnotatie voor de archiefindex + 9 + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_ARCHIVE_FREQUENCY + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hoe vaak deze blog moet worden gearchiveerd + 
MONTHLY + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_AUTHOR_PERMISSION + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + De lijst van e-mails van auteurs die toestemming hebben om te publiceren. + + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_BACKLINKS_ALLOWED + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hier wordt aangegeven of reactiebacklinks op de blog moeten worden getoond + false + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_BY_POST_ARCHIVING + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hier wordt aangegeven of iedere post moet worden voorzien van een archiefpagina + true + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_COMMENT_ACCESS + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Wie kan reacties achterlaten + BLOGGERS + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_COMMENT_CAPTCHA + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hier wordt aangegeven of personen die reacties geven, een Captcha (woordverificatie) moeten invullen + true + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_COMMENT_EMAIL + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Lijst met e-mailadressen om meldingen van 
nieuwe reacties naar te sturen + mikhail.afanasyevich.bulgakov@gmail.com + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_COMMENT_FEED + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Het type feed dat voor blogreacties moet worden gegeven + FULL + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_COMMENT_FORM_LOCATION + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Locatie van formulier voor blogreacties + EMBEDDED_IFRAME + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_COMMENT_MESSAGE + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Bericht bij blogreactie + + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_COMMENT_MODERATION + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hier wordt aangegeven of het modereren van reacties moet worden ingeschakeld + DISABLED + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_COMMENT_MODERATION_DELAY + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Aantal dagen waarna nieuwe reacties in aanmerking komen voor moderaten + 14 + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_COMMENT_MODERATION_EMAIL + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + E-mailadres waar meldingen binnenkomen over 
welke nieuwe reacties bewerkt of verwijderd moeten worden + + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_COMMENT_PROFILE_IMAGES + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hier wordt aangegeven of profielafbeeldingen in reacties moeten worden getoond + true + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_COMMENTS_ALLOWED + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hier wordt aangegeven of er reacties moeten worden weergegeven + true + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_COMMENTS_TIME_STAMP_FORMAT + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Getal van de tijdstempelnotatie voor reacties + 29 + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_CONVERT_LINE_BREAKS + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hier wordt aangegeven of de regelscheidingen moeten worden omgezet in <br />-tags in de posteditor + true + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_CUSTOM_ADS_TXT + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + De aangepaste ads.txt-content van de blog die aan advertentiezoekmachines wordt getoond. 
+ + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_CUSTOM_ADS_TXT_ENABLED + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Bepaalt of deze blog aangepaste ads.txt-content aan advertentiezoekmachines toont. + false + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_CUSTOM_PAGE_NOT_FOUND + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + De content die wordt weergegeven wanneer een post of pagina niet is gevonden. + + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_CUSTOM_ROBOTS_TXT + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + De aangepaste robots.txt-content van de blog wordt aan zoekmachines getoond. + + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_CUSTOM_ROBOTS_TXT_ENABLED + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Bepaalt of deze blog aangepaste robots.txt-content aan zoekmachines toont. 
+ false + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_DATE_FORMAT + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Het getal van de datumnotatie voor koppen + 26 + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_DEFAULT_BACKLINKS_MODE + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Standaardbacklinks voor posts + DEFAULT_HAVE_BACKLINKS + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_DEFAULT_COMMENTS_MODE + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Standaardreactie voor posts + DEFAULT_HAVE_COMMENTS + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_DESCRIPTION + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Een beschrijving van de blog + + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_EMAIL_POST_LINKS + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hier wordt aangegeven of er een link moet worden weergegeven waarmee gebruikers posts kunnen e-mailen + true + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_FEED_REDIRECT_URL + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + URL waar verzoeken om postfeed naartoe worden geleid + + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + 
noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_FLOAT_ALIGNMENT + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hier wordt aangegeven of zwevende uitlijning is ingeschakeld voor de blog + true + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_LOCALE + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Taal voor deze blog + nl + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_M2B_WHITELIST_EMAIL + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Lijst met e-mailadressen die via e-mail posts op de blog kunnen plaatsen. + + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_MAX_NUM + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Maximaal aantal items voor weergave op de hoofdpagina" + 100 + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_MAX_UNIT + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Eenheid van items voor weergave op de hoofdpagina + POSTS + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_META_DESCRIPTION + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + De metabeschrijving van de blog die wordt gebruikt door zoekmachines. 
+ + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_META_DESCRIPTION_ENABLED + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Of deze blog wordt weergegeven met metabeschrijvingen. + false + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_NAME + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + De naam van de blog + Notes of a Young Doctor + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_PER_POST_FEED + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Het type feed dat voor reacties op afzonderlijke posts moet worden gegeven + FULL + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_POST_FEED + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Het type feed dat voor blogposts moet worden gegeven + FULL + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_POST_FEED_FOOTER + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Voettekst om aan het einde van iedere vermelding in de postfeed toe te voegen + + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_POST_TEMPLATE + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + De template voor blogposts + + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + 
noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_PROMOTED + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hier wordt aangegeven of deze blog op Blogger kan worden aangeprezen + true + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_QUICK_EDITING + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hier wordt aangegeven of Snel bewerken is ingeschakeld + true + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_READ_ACCESS_MODE + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Het type toegang voor de lezers van de blog. + PUBLIC + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_READER_PERMISSION + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + De e-maillijst voor gebruikers die toestemming hebben om de blog te lezen. 
+ + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_SEARCHABLE + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hier wordt aangegeven of deze blog door zoekmachines moet worden geïndexeerd + true + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_SEND_EMAIL + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Door komma's gescheiden lijst met e-mailadressen om nieuwe blogposts naar te sturen + + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_SHOW_TITLE + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hier wordt aangegeven of het titelveld moet worden weergegeven + true + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_SHOW_URL + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Hier wordt aangegeven of er een verwante link in de postopsteller moet worden weergegeven + false + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_SUBDOMAIN + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Het subdomein van BlogSpot om je blog op te publiceren + youngdoctornotes + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_TIME_STAMP_FORMAT + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Het getal van de tijdstempelnotatie + 27 + + + + Mikhail 
Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_TIME_ZONE + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + De tijdzone voor deze blog + America/Los_Angeles + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.settings.BLOG_USE_LIGHTBOX + 2010-11-27T07:08:20.877-08:00 + 2018-08-02T12:38:27.320-07:00 + + Of afbeeldingen worden weergegeven in de lightbox wanneer erop wordt geklikt + true + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.post-1276418104709695660 + 2010-11-27T08:21:00.000-08:00 + 2018-08-02T12:22:48.286-07:00 + + yes + + + + + Black as Egypt's Night + Write next story here + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + 0 + + + tag:blogger.com,1999:blog-6303278419262689239.post-1858599377741856733 + 2010-11-27T07:12:00.000-08:00 + 2010-11-27T07:56:43.964-08:00 + + The Steel Windpipe + It was a cold Winter's night.<br /><br /><ul><li>Very cold indeed.</li><br /><li>Note to self: pad out ending</li></ul> + + + + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + 1 + + + tag:blogger.com,1999:blog-6303278419262689239.page-4386962582497458967 + 2018-08-02T12:38:00.001-07:00 + 2018-08-02T12:38:27.171-07:00 + + yes + + + Test page 2 + <div dir="ltr" style="text-align: left;" trbidi="on">This is a second test</div> + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.page-1406163839769953231 + 2018-08-02T12:37:00.004-07:00 + 2018-08-02T12:37:47.424-07:00 + + Test page + <div dir="ltr" 
style="text-align: left;" trbidi="on">This is a test.</div> + + + + + Mikhail Afanasyevich Bulgakov + https://www.blogger.com/profile/000082957 + noreply@blogger.com + + + + + tag:blogger.com,1999:blog-6303278419262689239.post-5590533389087749201 + 2010-11-29T12:35:44.027-08:00 + 2010-11-29T12:35:44.027-08:00 + + Mishka, always a pleasure to read your adventures!... + Mishka, always a pleasure to read your adventures!<br /><br />It's a shame you don't get more time for writing. + + + + + Thomas Isidore Noël Sankara + https://www.blogger.com/profile/0617349827 + noreply@blogger.com + + + + + + + \ No newline at end of file diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py index 7bb4aa6e..f913562f 100644 --- a/pelican/tests/test_importer.py +++ b/pelican/tests/test_importer.py @@ -8,13 +8,15 @@ from codecs import open from pelican.tests.support import (mute, skipIfNoExecutable, temporary_folder, unittest) -from pelican.tools.pelican_import import (build_header, build_markdown_header, +from pelican.tools.pelican_import import (blogger2fields, build_header, + build_markdown_header, decode_wp_content, download_attachments, fields2pelican, get_attachments, wp2fields) from pelican.utils import path_to_file_url, slugify CUR_DIR = os.path.abspath(os.path.dirname(__file__)) +BLOGGER_XML_SAMPLE = os.path.join(CUR_DIR, 'content', 'bloggerexport.xml') WORDPRESS_XML_SAMPLE = os.path.join(CUR_DIR, 'content', 'wordpressexport.xml') WORDPRESS_ENCODED_CONTENT_SAMPLE = os.path.join(CUR_DIR, 'content', @@ -34,6 +36,53 @@ except ImportError: LXML = False +@skipIfNoExecutable(['pandoc', '--version']) +@unittest.skipUnless(BeautifulSoup, 'Needs BeautifulSoup module') +class TestBloggerXmlImporter(unittest.TestCase): + + def setUp(self): + self.old_locale = locale.setlocale(locale.LC_ALL) + locale.setlocale(locale.LC_ALL, str('C')) + self.posts = list(blogger2fields(BLOGGER_XML_SAMPLE)) + + def tearDown(self): + locale.setlocale(locale.LC_ALL, self.old_locale) + 
+ + def test_recognise_kind_and_title(self): + """Check that importer only outputs pages, articles and comments, + that these are correctly identified and that titles are correct. + """ + kinds = {x[8] for x in self.posts} + self.assertEqual({'page', 'article', 'comment'}, kinds) + page_titles = {x[0] for x in self.posts if x[8] == 'page'} + self.assertEqual({'Test page', 'Test page 2'}, page_titles) + article_titles = {x[0] for x in self.posts if x[8] == 'article'} + self.assertEqual({'Black as Egypt\'s Night', 'The Steel Windpipe'}, + article_titles) + comment_titles = {x[0] for x in self.posts if x[8] == 'comment'} + self.assertEqual({'Mishka, always a pleasure to read your ' + 'adventures!...'}, + comment_titles) + + def test_recognise_status_with_correct_filename(self): + """Check that importer outputs only statuses 'published' and 'draft', + that these are correctly identified and that filenames are correct. + """ + statuses = {x[7] for x in self.posts} + self.assertEqual({'published', 'draft'}, statuses) + + draft_filenames = {x[2] for x in self.posts if x[7] == 'draft'} + # draft filenames are id-based + self.assertEqual({'page-4386962582497458967', + 'post-1276418104709695660'}, draft_filenames) + + published_filenames = {x[2] for x in self.posts if x[7] == 'published'} + # published filenames are url-based, except comments + self.assertEqual({'the-steel-windpipe', + 'test-page', + 'post-5590533389087749201'}, published_filenames) + + @skipIfNoExecutable(['pandoc', '--version']) @unittest.skipUnless(BeautifulSoup, 'Needs BeautifulSoup module') class TestWordpressXmlImporter(unittest.TestCase): diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index 461a3263..a3f2ffa8 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -8,7 +8,6 @@ import os import re import subprocess import sys -import time from codecs import open from collections import defaultdict @@ -117,19 +116,18 @@ def 
decode_wp_content(content, br=True): return content -def get_items(xml): - """Opens a WordPress xml file and returns a list of items""" +def xml_to_soup(xml): + """Opens an xml file""" try: from bs4 import BeautifulSoup except ImportError: error = ('Missing dependency "BeautifulSoup4" and "lxml" required to ' - 'import WordPress XML files.') + 'import XML files.') sys.exit(error) with open(xml, encoding='utf-8') as infile: xmlfile = infile.read() soup = BeautifulSoup(xmlfile, "xml") - items = soup.rss.channel.findAll('item') - return items + return soup def get_filename(filename, post_id): @@ -142,7 +140,8 @@ def get_filename(filename, post_id): def wp2fields(xml, wp_custpost=False): """Opens a wordpress XML file, and yield Pelican fields""" - items = get_items(xml) + soup = xml_to_soup(xml) + items = soup.rss.channel.findAll('item') for item in items: if item.find('status').string in ["publish", "draft"]: @@ -163,8 +162,9 @@ def wp2fields(xml, wp_custpost=False): if raw_date == u'0000-00-00 00:00:00': date = None else: - date_object = time.strptime(raw_date, '%Y-%m-%d %H:%M:%S') - date = time.strftime('%Y-%m-%d %H:%M', date_object) + date_object = SafeDatetime.strptime( + raw_date, '%Y-%m-%d %H:%M:%S') + date = date_object.strftime('%Y-%m-%d %H:%M') author = item.find('creator').string categories = [cat.string for cat @@ -195,6 +195,59 @@ def wp2fields(xml, wp_custpost=False): tags, status, kind, 'wp-html') +def blogger2fields(xml): + """Opens a blogger XML file, and yield Pelican fields""" + + soup = xml_to_soup(xml) + entries = soup.feed.findAll('entry') + for entry in entries: + raw_kind = entry.find( + 'category', {'scheme': 'http://schemas.google.com/g/2005#kind'} + ).get('term') + if raw_kind == 'http://schemas.google.com/blogger/2008/kind#post': + kind = 'article' + elif raw_kind == 'http://schemas.google.com/blogger/2008/kind#comment': + kind = 'comment' + elif raw_kind == 'http://schemas.google.com/blogger/2008/kind#page': + kind = 'page' + else: + 
continue + + try: + assert kind != 'comment' + filename = entry.find('link', {'rel': 'alternate'})['href'] + filename = os.path.splitext(os.path.basename(filename))[0] + except (AssertionError, TypeError, KeyError): + filename = entry.find('id').string.split('.')[-1] + + title = entry.find('title').string or '' + + content = entry.find('content').string + raw_date = entry.find('published').string + if hasattr(SafeDatetime, 'fromisoformat'): + date_object = SafeDatetime.fromisoformat(raw_date) + else: + date_object = SafeDatetime.strptime( + raw_date[:23], '%Y-%m-%dT%H:%M:%S.%f') + date = date_object.strftime('%Y-%m-%d %H:%M') + author = entry.find('author').find('name').string + + # blogger posts only have tags, no category + tags = [tag.get('term') for tag in entry.findAll( + 'category', {'scheme': 'http://www.blogger.com/atom/ns#'})] + + # Drafts have <app:control><app:draft>yes</app:draft></app:control> + status = 'published' + try: + if entry.find('control').find('draft').string == 'yes': + status = 'draft' + except AttributeError: + pass + + yield (title, content, filename, date, author, None, tags, status, + kind, 'html') + + def dc2fields(file): """Opens a Dotclear export file, and yield pelican fields""" try: @@ -391,7 +444,6 @@ def posterous2fields(api_token, email, password): def tumblr2fields(api_key, blogname): """ Imports Tumblr posts (API v2)""" - from time import strftime, localtime try: # py3k import import json @@ -426,8 +478,10 @@ def tumblr2fields(api_key, blogname): slug = post.get('slug') or slugify(title) tags = post.get('tags') timestamp = post.get('timestamp') - date = strftime("%Y-%m-%d %H:%M:%S", localtime(int(timestamp))) - slug = strftime("%Y-%m-%d-", localtime(int(timestamp))) + slug + date = SafeDatetime.fromtimestamp(int(timestamp)).strftime( + "%Y-%m-%d %H:%M:%S") + slug = SafeDatetime.fromtimestamp(int(timestamp)).strftime( + "%Y-%m-%d-") + slug format = post.get('format') content = post.get('body') type = post.get('type') @@ -499,7 +553,7 @@ def feed2fields(file): import 
feedparser d = feedparser.parse(file) for entry in d.entries: - date = (time.strftime('%Y-%m-%d %H:%M', entry.updated_parsed) + date = (entry.updated_parsed.strftime('%Y-%m-%d %H:%M') if hasattr(entry, 'updated_parsed') else None) author = entry.author if hasattr(entry, 'author') else None tags = ([e['term'] for e in entry.tags] @@ -619,7 +673,8 @@ def get_attachments(xml): """returns a dictionary of posts that have attachments with a list of the attachment_urls """ - items = get_items(xml) + soup = xml_to_soup(xml) + items = soup.rss.channel.findAll('item') names = {} attachments = [] @@ -807,16 +862,16 @@ def fields2pelican( def main(): parser = argparse.ArgumentParser( - description="Transform feed, WordPress, Tumblr, Dotclear, or " - "Posterous files into reST (rst) or Markdown (md) files. " + description="Transform feed, Blogger, Dotclear, Posterous, Tumblr, or " + "WordPress files into reST (rst) or Markdown (md) files. " "Be sure to have pandoc installed.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( dest='input', help='The input file to read') parser.add_argument( - '--wpfile', action='store_true', dest='wpfile', - help='Wordpress XML export') + '--blogger', action='store_true', dest='blogger', + help='Blogger XML export') parser.add_argument( '--dotclear', action='store_true', dest='dotclear', help='Dotclear export') @@ -826,6 +881,9 @@ def main(): parser.add_argument( '--tumblr', action='store_true', dest='tumblr', help='Tumblr export') + parser.add_argument( + '--wpfile', action='store_true', dest='wpfile', + help='Wordpress XML export') parser.add_argument( '--feed', action='store_true', dest='feed', help='Feed to parse') @@ -841,7 +899,7 @@ def main(): parser.add_argument( '--dir-page', action='store_true', dest='dirpage', help=('Put files recognised as pages in "pages/" sub-directory' - ' (wordpress import only)')) + ' (blogger and wordpress import only)')) parser.add_argument( '--filter-author', dest='author', 
help='Import only post from the specified author') @@ -883,19 +941,21 @@ def main(): args = parser.parse_args() input_type = None - if args.wpfile: - input_type = 'wordpress' + if args.blogger: + input_type = 'blogger' elif args.dotclear: input_type = 'dotclear' elif args.posterous: input_type = 'posterous' elif args.tumblr: input_type = 'tumblr' + elif args.wpfile: + input_type = 'wordpress' elif args.feed: input_type = 'feed' else: - error = ('You must provide either --wpfile, --dotclear, ' - '--posterous, --tumblr or --feed options') + error = ('You must provide either --blogger, --dotclear, ' + '--posterous, --tumblr, --wpfile or --feed options') exit(error) if not os.path.exists(args.output): @@ -910,14 +970,16 @@ def main(): 'to use the --wp-attach option') exit(error) - if input_type == 'wordpress': - fields = wp2fields(args.input, args.wp_custpost or False) + if input_type == 'blogger': + fields = blogger2fields(args.input) elif input_type == 'dotclear': fields = dc2fields(args.input) elif input_type == 'posterous': fields = posterous2fields(args.input, args.email, args.password) elif input_type == 'tumblr': fields = tumblr2fields(args.input, args.blogname) + elif input_type == 'wordpress': + fields = wp2fields(args.input, args.wp_custpost or False) elif input_type == 'feed': fields = feed2fields(args.input)