import os import re from posixpath import join as posix_join from unittest.mock import patch from pelican.settings import DEFAULT_CONFIG from pelican.tests.support import ( TestCaseWithCLocale, mute, skipIfNoExecutable, temporary_folder, unittest, ) from pelican.tools.pelican_import import ( blogger2fields, build_header, build_markdown_header, decode_wp_content, download_attachments, fields2pelican, get_attachments, medium_slug, mediumpost2fields, mediumposts2fields, strip_medium_post_content, tumblr2fields, wp2fields, ) from pelican.utils import path_to_file_url, slugify CUR_DIR = os.path.abspath(os.path.dirname(__file__)) BLOGGER_XML_SAMPLE = os.path.join(CUR_DIR, "content", "bloggerexport.xml") WORDPRESS_XML_SAMPLE = os.path.join(CUR_DIR, "content", "wordpressexport.xml") WORDPRESS_ENCODED_CONTENT_SAMPLE = os.path.join( CUR_DIR, "content", "wordpress_content_encoded" ) WORDPRESS_DECODED_CONTENT_SAMPLE = os.path.join( CUR_DIR, "content", "wordpress_content_decoded" ) try: from bs4 import BeautifulSoup except ImportError: BeautifulSoup = False try: import bs4.builder._lxml as LXML except ImportError: LXML = False @skipIfNoExecutable(["pandoc", "--version"]) @unittest.skipUnless(BeautifulSoup, "Needs BeautifulSoup module") class TestBloggerXmlImporter(TestCaseWithCLocale): def setUp(self): super().setUp() self.posts = blogger2fields(BLOGGER_XML_SAMPLE) def test_recognise_kind_and_title(self): """Check that importer only outputs pages, articles and comments, that these are correctly identified and that titles are correct. """ test_posts = list(self.posts) kinds = {x[8] for x in test_posts} self.assertEqual({"page", "article", "comment"}, kinds) page_titles = {x[0] for x in test_posts if x[8] == "page"} self.assertEqual({"Test page", "Test page 2"}, page_titles) article_titles = {x[0] for x in test_posts if x[8] == "article"} self.assertEqual( {"Black as Egypt's Night", "The Steel Windpipe"}, article_titles ) comment_titles = {x[0] for x in test_posts if x[8] == "comment"} self.assertEqual( {"Mishka, always a pleasure to read your " "adventures!..."}, comment_titles ) def test_recognise_status_with_correct_filename(self): """Check that importerer outputs only statuses 'published' and 'draft', that these are correctly identified and that filenames are correct. """ test_posts = list(self.posts) statuses = {x[7] for x in test_posts} self.assertEqual({"published", "draft"}, statuses) draft_filenames = {x[2] for x in test_posts if x[7] == "draft"} # draft filenames are id-based self.assertEqual( {"page-4386962582497458967", "post-1276418104709695660"}, draft_filenames ) published_filenames = {x[2] for x in test_posts if x[7] == "published"} # published filenames are url-based, except comments self.assertEqual( {"the-steel-windpipe", "test-page", "post-5590533389087749201"}, published_filenames, ) @skipIfNoExecutable(["pandoc", "--version"]) @unittest.skipUnless(BeautifulSoup, "Needs BeautifulSoup module") class TestWordpressXmlImporter(TestCaseWithCLocale): def setUp(self): super().setUp() self.posts = wp2fields(WORDPRESS_XML_SAMPLE) self.custposts = wp2fields(WORDPRESS_XML_SAMPLE, True) def test_ignore_empty_posts(self): self.assertTrue(self.posts) for ( title, content, fname, date, author, categ, tags, status, kind, format, ) in self.posts: self.assertTrue(title.strip()) def test_recognise_page_kind(self): """Check that we recognise pages in wordpress, as opposed to posts""" self.assertTrue(self.posts) # Collect (title, filename, kind) of non-empty posts recognised as page pages_data = [] for ( title, content, fname, date, author, categ, tags, status, kind, format, ) in self.posts: if kind == "page": pages_data.append((title, fname)) self.assertEqual(2, len(pages_data)) self.assertEqual(("Page", "contact"), pages_data[0]) self.assertEqual(("Empty Page", "empty"), pages_data[1]) def test_dirpage_directive_for_page_kind(self): silent_f2p = mute(True)(fields2pelican) test_post = filter(lambda p: p[0].startswith("Empty Page"), self.posts) with temporary_folder() as temp: fname = list(silent_f2p(test_post, "markdown", temp, dirpage=True))[0] self.assertTrue(fname.endswith("pages%sempty.md" % os.path.sep)) def test_dircat(self): silent_f2p = mute(True)(fields2pelican) test_posts = [] for post in self.posts: # check post kind if len(post[5]) > 0: # Has a category test_posts.append(post) with temporary_folder() as temp: fnames = list(silent_f2p(test_posts, "markdown", temp, dircat=True)) subs = DEFAULT_CONFIG["SLUG_REGEX_SUBSTITUTIONS"] index = 0 for post in test_posts: name = post[2] category = slugify(post[5][0], regex_subs=subs, preserve_case=True) name += ".md" filename = os.path.join(category, name) out_name = fnames[index] self.assertTrue(out_name.endswith(filename)) index += 1 def test_unless_custom_post_all_items_should_be_pages_or_posts(self): self.assertTrue(self.posts) pages_data = [] for ( title, content, fname, date, author, categ, tags, status, kind, format, ) in self.posts: if kind == "page" or kind == "article": pass else: pages_data.append((title, fname)) self.assertEqual(0, len(pages_data)) def test_recognise_custom_post_type(self): self.assertTrue(self.custposts) cust_data = [] for ( title, content, fname, date, author, categ, tags, status, kind, format, ) in self.custposts: if kind == "article" or kind == "page": pass else: cust_data.append((title, kind)) self.assertEqual(3, len(cust_data)) self.assertEqual(("A custom post in category 4", "custom1"), cust_data[0]) self.assertEqual(("A custom post in category 5", "custom1"), cust_data[1]) self.assertEqual( ("A 2nd custom post type also in category 5", "custom2"), cust_data[2] ) def test_custom_posts_put_in_own_dir(self): silent_f2p = mute(True)(fields2pelican) test_posts = [] for post in self.custposts: # check post kind if post[8] == "article" or post[8] == "page": pass else: test_posts.append(post) with temporary_folder() as temp: fnames = list(silent_f2p(test_posts, "markdown", temp, wp_custpost=True)) index = 0 for post in test_posts: name = post[2] kind = post[8] name += ".md" filename = os.path.join(kind, name) out_name = fnames[index] self.assertTrue(out_name.endswith(filename)) index += 1 def test_custom_posts_put_in_own_dir_and_catagory_sub_dir(self): silent_f2p = mute(True)(fields2pelican) test_posts = [] for post in self.custposts: # check post kind if post[8] == "article" or post[8] == "page": pass else: test_posts.append(post) with temporary_folder() as temp: fnames = list( silent_f2p(test_posts, "markdown", temp, wp_custpost=True, dircat=True) ) subs = DEFAULT_CONFIG["SLUG_REGEX_SUBSTITUTIONS"] index = 0 for post in test_posts: name = post[2] kind = post[8] category = slugify(post[5][0], regex_subs=subs, preserve_case=True) name += ".md" filename = os.path.join(kind, category, name) out_name = fnames[index] self.assertTrue(out_name.endswith(filename)) index += 1 def test_wp_custpost_true_dirpage_false(self): # pages should only be put in their own directory when dirpage = True silent_f2p = mute(True)(fields2pelican) test_posts = [] for post in self.custposts: # check post kind if post[8] == "page": test_posts.append(post) with temporary_folder() as temp: fnames = list( silent_f2p( test_posts, "markdown", temp, wp_custpost=True, dirpage=False ) ) index = 0 for post in test_posts: name = post[2] name += ".md" filename = os.path.join("pages", name) out_name = fnames[index] self.assertFalse(out_name.endswith(filename)) def test_can_toggle_raw_html_code_parsing(self): test_posts = list(self.posts) def r(f): with open(f, encoding="utf-8") as infile: return infile.read() silent_f2p = mute(True)(fields2pelican) with temporary_folder() as temp: rst_files = (r(f) for f in silent_f2p(test_posts, "markdown", temp)) self.assertTrue(any(" entities in " "the title. You can't miss them.", ) self.assertNotIn("&", title) def test_decode_wp_content_returns_empty(self): """Check that given an empty string we return an empty string.""" self.assertEqual(decode_wp_content(""), "") def test_decode_wp_content(self): """Check that we can decode a wordpress content string.""" with open(WORDPRESS_ENCODED_CONTENT_SAMPLE) as encoded_file: encoded_content = encoded_file.read() with open(WORDPRESS_DECODED_CONTENT_SAMPLE) as decoded_file: decoded_content = decoded_file.read() self.assertEqual( decode_wp_content(encoded_content, br=False), decoded_content ) def test_preserve_verbatim_formatting(self): def r(f): with open(f, encoding="utf-8") as infile: return infile.read() silent_f2p = mute(True)(fields2pelican) test_post = filter(lambda p: p[0].startswith("Code in List"), self.posts) with temporary_folder() as temp: md = [r(f) for f in silent_f2p(test_post, "markdown", temp)][0] self.assertTrue(re.search(r"\s+a = \[1, 2, 3\]", md)) self.assertTrue(re.search(r"\s+b = \[4, 5, 6\]", md)) for_line = re.search(r"\s+for i in zip\(a, b\):", md).group(0) print_line = re.search(r"\s+print i", md).group(0) self.assertTrue(for_line.rindex("for") < print_line.rindex("print")) def test_code_in_list(self): def r(f): with open(f, encoding="utf-8") as infile: return infile.read() silent_f2p = mute(True)(fields2pelican) test_post = filter(lambda p: p[0].startswith("Code in List"), self.posts) with temporary_folder() as temp: md = [r(f) for f in silent_f2p(test_post, "markdown", temp)][0] sample_line = re.search(r"- This is a code sample", md).group(0) code_line = re.search(r"\s+a = \[1, 2, 3\]", md).group(0) self.assertTrue(sample_line.rindex("This") < code_line.rindex("a")) def test_dont_use_smart_quotes(self): def r(f): with open(f, encoding="utf-8") as infile: return infile.read() silent_f2p = mute(True)(fields2pelican) test_post = filter(lambda p: p[0].startswith("Post with raw data"), self.posts) with temporary_folder() as temp: md = [r(f) for f in silent_f2p(test_post, "markdown", temp)][0] escaped_quotes = re.search(r'\\[\'"“”‘’]', md) self.assertFalse(escaped_quotes) def test_convert_caption_to_figure(self): def r(f): with open(f, encoding="utf-8") as infile: return infile.read() silent_f2p = mute(True)(fields2pelican) test_post = filter(lambda p: p[0].startswith("Caption on image"), self.posts) with temporary_folder() as temp: md = [r(f) for f in silent_f2p(test_post, "markdown", temp)][0] caption = re.search(r"\[caption", md) self.assertFalse(caption) for occurence in [ "/theme/img/xpelican.png.pagespeed.ic.Rjep0025-y.png", "/theme/img/xpelican-3.png.pagespeed.ic.m-NAIdRCOM.png", "/theme/img/xpelican.png.pagespeed.ic.Rjep0025-y.png", "This is a pelican", "This also a pelican", "Yet another pelican", ]: # pandoc 2.x converts into ![text](src) # pandoc 3.x converts into
src
text
self.assertIn(occurence, md) class TestBuildHeader(unittest.TestCase): def test_build_header(self): header = build_header("test", None, None, None, None, None) self.assertEqual(header, "test\n####\n\n") def test_build_header_with_fields(self): header_data = [ "Test Post", "2014-11-04", "Alexis Métaireau", ["Programming"], ["Pelican", "Python"], "test-post", ] expected_docutils = "\n".join( [ "Test Post", "#########", ":date: 2014-11-04", ":author: Alexis Métaireau", ":category: Programming", ":tags: Pelican, Python", ":slug: test-post", "\n", ] ) expected_md = "\n".join( [ "Title: Test Post", "Date: 2014-11-04", "Author: Alexis Métaireau", "Category: Programming", "Tags: Pelican, Python", "Slug: test-post", "\n", ] ) self.assertEqual(build_header(*header_data), expected_docutils) self.assertEqual(build_markdown_header(*header_data), expected_md) def test_build_header_with_east_asian_characters(self): header = build_header( "これは広い幅の文字だけで構成されたタイトルです", None, None, None, None, None, ) self.assertEqual( header, ( "これは広い幅の文字だけで構成されたタイトルです\n" "##############################################" "\n\n" ), ) def test_galleries_added_to_header(self): header = build_header( "test", None, None, None, None, None, attachments=["output/test1", "output/test2"], ) self.assertEqual( header, ("test\n####\n" ":attachments: output/test1, " "output/test2\n\n") ) def test_galleries_added_to_markdown_header(self): header = build_markdown_header( "test", None, None, None, None, None, attachments=["output/test1", "output/test2"], ) self.assertEqual( header, "Title: test\nAttachments: output/test1, output/test2\n\n" ) @unittest.skipUnless(BeautifulSoup, "Needs BeautifulSoup module") @unittest.skipUnless(LXML, "Needs lxml module") class TestWordpressXMLAttachements(TestCaseWithCLocale): def setUp(self): super().setUp() self.attachments = get_attachments(WORDPRESS_XML_SAMPLE) def test_recognise_attachments(self): self.assertTrue(self.attachments) self.assertTrue(len(self.attachments.keys()) == 3) def test_attachments_associated_with_correct_post(self): self.assertTrue(self.attachments) for post in self.attachments.keys(): if post is None: expected = { ( "https://upload.wikimedia.org/wikipedia/commons/" "thumb/2/2c/Pelican_lakes_entrance02.jpg/" "240px-Pelican_lakes_entrance02.jpg" ) } self.assertEqual(self.attachments[post], expected) elif post == "with-excerpt": expected_invalid = ( "http://thisurlisinvalid.notarealdomain/" "not_an_image.jpg" ) expected_pelikan = ( "http://en.wikipedia.org/wiki/" "File:Pelikan_Walvis_Bay.jpg" ) self.assertEqual( self.attachments[post], {expected_invalid, expected_pelikan} ) elif post == "with-tags": expected_invalid = "http://thisurlisinvalid.notarealdomain" self.assertEqual(self.attachments[post], {expected_invalid}) else: self.fail( "all attachments should match to a " f"filename or None, {post}" ) def test_download_attachments(self): real_file = os.path.join(CUR_DIR, "content/article.rst") good_url = path_to_file_url(real_file) bad_url = "http://localhost:1/not_a_file.txt" silent_da = mute()(download_attachments) with temporary_folder() as temp: locations = list(silent_da(temp, [good_url, bad_url])) self.assertEqual(1, len(locations)) directory = locations[0] self.assertTrue( directory.endswith(posix_join("content", "article.rst")), directory ) class TestTumblrImporter(TestCaseWithCLocale): @patch("pelican.tools.pelican_import._get_tumblr_posts") def test_posts(self, get): def get_posts(api_key, blogname, offset=0): if offset > 0: return [] return [ { "type": "photo", "blog_name": "testy", "date": "2019-11-07 21:26:40 UTC", "timestamp": 1573162000, "format": "html", "slug": "a-slug", "tags": ["economics"], "state": "published", "photos": [ { "caption": "", "original_size": { "url": "https://..fccdc2360ba7182a.jpg", "width": 634, "height": 789, }, } ], } ] get.side_effect = get_posts posts = list(tumblr2fields("api_key", "blogname")) self.assertEqual( [ ( "Photo", '\n', "2019-11-07-a-slug", "2019-11-07 21:26:40+0000", "testy", ["photo"], ["economics"], "published", "article", "html", ) ], posts, posts, ) @patch("pelican.tools.pelican_import._get_tumblr_posts") def test_video_embed(self, get): def get_posts(api_key, blogname, offset=0): if offset > 0: return [] return [ { "type": "video", "blog_name": "testy", "slug": "the-slug", "date": "2017-07-07 20:31:41 UTC", "timestamp": 1499459501, "state": "published", "format": "html", "tags": [], "source_url": "https://href.li/?https://www.youtube.com/a", "source_title": "youtube.com", "caption": "

Caption

", "player": [ {"width": 250, "embed_code": ""}, {"width": 400, "embed_code": ""}, {"width": 500, "embed_code": ""}, ], "video_type": "youtube", } ] get.side_effect = get_posts posts = list(tumblr2fields("api_key", "blogname")) self.assertEqual( [ ( "youtube.com", '

via

\n

Caption

' "\n" "\n" "\n", "2017-07-07-the-slug", "2017-07-07 20:31:41+0000", "testy", ["video"], [], "published", "article", "html", ) ], posts, posts, ) @patch("pelican.tools.pelican_import._get_tumblr_posts") def test_broken_video_embed(self, get): def get_posts(api_key, blogname, offset=0): if offset > 0: return [] return [ { "type": "video", "blog_name": "testy", "slug": "the-slug", "date": "2016-08-14 16:37:35 UTC", "timestamp": 1471192655, "state": "published", "format": "html", "tags": ["interviews"], "source_url": "https://href.li/?https://www.youtube.com/watch?v=b", "source_title": "youtube.com", "caption": "

Caption

", "player": [ { "width": 250, # If video is gone, embed_code is False "embed_code": False, }, {"width": 400, "embed_code": False}, {"width": 500, "embed_code": False}, ], "video_type": "youtube", } ] get.side_effect = get_posts posts = list(tumblr2fields("api_key", "blogname")) self.assertEqual( [ ( "youtube.com", '

via

\n

Caption

' "

(This video isn't available anymore.)

\n", "2016-08-14-the-slug", "2016-08-14 16:37:35+0000", "testy", ["video"], ["interviews"], "published", "article", "html", ) ], posts, posts, ) class TestMediumImporter(TestCaseWithCLocale): def setUp(self): super().setUp() self.test_content_root = "pelican/tests/content" # The content coming out of parsing is similar, but not the same. # Beautiful soup rearranges the order of attributes, for example. # So, we keep a copy of the content for the test. content_filename = f"{self.test_content_root}/medium_post_content.txt" with open(content_filename, encoding="utf-8") as the_content_file: # Many editors and scripts add a final newline, so live with that # in our test the_content = the_content_file.read() assert the_content[-1] == "\n" the_content = the_content[:-1] self.post_tuple = ( "A title", the_content, # slug: "2017-04-21-medium-post", "2017-04-21 17:11", "User Name", None, (), "published", "article", "html", ) def test_mediumpost2field(self): """Parse one post""" post_filename = f"{self.test_content_root}/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html" val = mediumpost2fields(post_filename) self.assertEqual(self.post_tuple, val, val) def test_mediumposts2field(self): """Parse all posts in an export directory""" posts = [ fields for fields in mediumposts2fields(f"{self.test_content_root}/medium_posts") ] self.assertEqual(1, len(posts)) self.assertEqual(self.post_tuple, posts[0]) def test_strip_content(self): """Strip out unhelpful tags""" html_doc = ( "
This keeps lots of tags, but not " "the
section
tags
" ) soup = BeautifulSoup(html_doc, "html.parser") self.assertEqual( "This keeps lots of tags, but not the section tags", strip_medium_post_content(soup), ) def test_medium_slug(self): # Remove hex stuff at the end self.assertEqual( "2017-04-27_A-long-title", medium_slug( "medium-export/posts/2017-04-27_A-long-title--2971442227dd.html" ), ) # Remove "--DRAFT" at the end self.assertEqual( "2017-04-27_A-long-title", medium_slug("medium-export/posts/2017-04-27_A-long-title--DRAFT.html"), ) # Remove both (which happens) self.assertEqual( "draft_How-to-do", medium_slug("draft_How-to-do--DRAFT--87225c81dddd.html") ) # If no hex stuff, leave it alone self.assertEqual( "2017-04-27_A-long-title", medium_slug("medium-export/posts/2017-04-27_A-long-title.html"), )