pelican/pelican/tests/test_importer.py

# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function

import os
import re

import locale
from pelican.tools.pelican_import import wp2fields, fields2pelican, decode_wp_content, build_header, build_markdown_header, get_attachments, download_attachments
from pelican.tests.support import (unittest, temporary_folder, mute,
                                   skipIfNoExecutable)

from pelican.utils import slugify

CUR_DIR = os.path.abspath(os.path.dirname(__file__))
WORDPRESS_XML_SAMPLE = os.path.join(CUR_DIR, 'content', 'wordpressexport.xml')
WORDPRESS_ENCODED_CONTENT_SAMPLE = os.path.join(CUR_DIR,
                                                'content',
                                                'wordpress_content_encoded')
WORDPRESS_DECODED_CONTENT_SAMPLE = os.path.join(CUR_DIR,
                                                'content',
                                                'wordpress_content_decoded')

try:
    from bs4 import BeautifulSoup
except ImportError:
    BeautifulSoup = False  # NOQA


@skipIfNoExecutable(['pandoc', '--version'])
@unittest.skipUnless(BeautifulSoup, 'Needs BeautifulSoup module')
class TestWordpressXmlImporter(unittest.TestCase):

    def setUp(self):
        self.old_locale = locale.setlocale(locale.LC_ALL)
        locale.setlocale(locale.LC_ALL, str('C'))
        self.posts = list(wp2fields(WORDPRESS_XML_SAMPLE))
        self.custposts = list(wp2fields(WORDPRESS_XML_SAMPLE, True))

    def tearDown(self):
        locale.setlocale(locale.LC_ALL, self.old_locale)

    def test_ignore_empty_posts(self):
        self.assertTrue(self.posts)
        for title, content, fname, date, author, categ, tags, kind, format in self.posts:
            self.assertTrue(title.strip())

    def test_recognise_page_kind(self):
        """ Check that we recognise pages in wordpress, as opposed to posts """
        self.assertTrue(self.posts)
        # Collect (title, filename, kind) of non-empty posts recognised as page
        pages_data = []
        for title, content, fname, date, author, categ, tags, kind, format in self.posts:
            if kind == 'page':
                pages_data.append((title, fname))
        self.assertEqual(2, len(pages_data))
        self.assertEqual(('Page', 'contact'), pages_data[0])
        self.assertEqual(('Empty Page', 'empty'), pages_data[1])

    def test_dirpage_directive_for_page_kind(self):
        silent_f2p = mute(True)(fields2pelican)
        test_post = filter(lambda p: p[0].startswith("Empty Page"), self.posts)
        with temporary_folder() as temp:
            fname = list(silent_f2p(test_post, 'markdown', temp, dirpage=True))[0]
            self.assertTrue(fname.endswith('pages%sempty.md' % os.path.sep))

    def test_dircat(self):
        silent_f2p = mute(True)(fields2pelican)
        test_posts = []
        for post in self.posts:
            # check post kind
            if len(post[5]) > 0: # Has a category
                test_posts.append(post)
        with temporary_folder() as temp:
            fnames = list(silent_f2p(test_posts, 'markdown', temp, dircat=True))
        index = 0
        for post in test_posts:
            name = post[2]
            category = slugify(post[5][0])
            name += '.md'
            filename = os.path.join(category, name)
            out_name = fnames[index]
            self.assertTrue(out_name.endswith(filename))
            index += 1

    def test_unless_custom_post_all_items_should_be_pages_or_posts(self):
        self.assertTrue(self.posts)
        pages_data = []
        for title, content, fname, date, author, categ, tags, kind, format in self.posts:
            if kind == 'page' or kind == 'article':
                pass
            else:
                pages_data.append((title, fname))
        self.assertEqual(0, len(pages_data))

    def test_recognise_custom_post_type(self):
        self.assertTrue(self.custposts)
        cust_data = []
        for title, content, fname, date, author, categ, tags, kind, format in self.custposts:
            if kind == 'article' or kind == 'page':
                pass
            else:
                cust_data.append((title, kind))
        self.assertEqual(3, len(cust_data))
        self.assertEqual(('A custom post in category 4', 'custom1'), cust_data[0])
        self.assertEqual(('A custom post in category 5', 'custom1'), cust_data[1])
        self.assertEqual(('A 2nd custom post type also in category 5', 'custom2'), cust_data[2])

    def test_custom_posts_put_in_own_dir(self):
        silent_f2p = mute(True)(fields2pelican)
        test_posts = []
        for post in self.custposts:
            # check post kind
            if post[7] == 'article' or post[7] == 'page':
                pass
            else:
                test_posts.append(post)
        with temporary_folder() as temp:
            fnames = list(silent_f2p(test_posts, 'markdown', temp, wp_custpost = True))
        index = 0
        for post in test_posts:
            name = post[2]
            kind = post[7]
            name += '.md'
            filename = os.path.join(kind, name)
            out_name = fnames[index]
            self.assertTrue(out_name.endswith(filename))
            index += 1

    def test_custom_posts_put_in_own_dir_and_catagory_sub_dir(self):
        silent_f2p = mute(True)(fields2pelican)
        test_posts = []
        for post in self.custposts:
            # check post kind
            if post[7] == 'article' or post[7] == 'page':
                pass
            else:
                test_posts.append(post)
        with temporary_folder() as temp:
            fnames = list(silent_f2p(test_posts, 'markdown', temp,
                wp_custpost=True, dircat=True))
        index = 0
        for post in test_posts:
            name = post[2]
            kind = post[7]
            category = slugify(post[5][0])
            name += '.md'
            filename = os.path.join(kind, category, name)
            out_name = fnames[index]
            self.assertTrue(out_name.endswith(filename))
            index += 1

    def test_wp_custpost_true_dirpage_false(self):
        #pages should only be put in their own directory when dirpage = True
        silent_f2p = mute(True)(fields2pelican)
        test_posts = []
        for post in self.custposts:
            # check post kind
            if post[7] == 'page':
                test_posts.append(post)
        with temporary_folder() as temp:
            fnames = list(silent_f2p(test_posts, 'markdown', temp,
                wp_custpost=True, dirpage=False))
        index = 0
        for post in test_posts:
            name = post[2]
            name += '.md'
            filename = os.path.join('pages', name)
            out_name = fnames[index]
            self.assertFalse(out_name.endswith(filename))


    def test_can_toggle_raw_html_code_parsing(self):
        def r(f):
            with open(f) as infile:
                return infile.read()
        silent_f2p = mute(True)(fields2pelican)

        with temporary_folder() as temp:

            rst_files = (r(f) for f in silent_f2p(self.posts, 'markdown', temp))
            self.assertTrue(any('<iframe' in rst for rst in rst_files))
            rst_files = (r(f) for f in silent_f2p(self.posts, 'markdown', temp,
                         strip_raw=True))
            self.assertFalse(any('<iframe' in rst for rst in rst_files))
            # no effect in rst
            rst_files = (r(f) for f in silent_f2p(self.posts, 'rst', temp))
            self.assertFalse(any('<iframe' in rst for rst in rst_files))
            rst_files = (r(f) for f in silent_f2p(self.posts, 'rst', temp,
                         strip_raw=True))
            self.assertFalse(any('<iframe' in rst for rst in rst_files))

    def test_decode_html_entities_in_titles(self):
        test_posts = [post for post in self.posts if post[2] == 'html-entity-test']
        self.assertEqual(len(test_posts), 1)

        post = test_posts[0]
        title = post[0]
        self.assertTrue(title, "A normal post with some <html> entities in the"
                               " title. You can't miss them.")
        self.assertNotIn('&', title)

    def test_decode_wp_content_returns_empty(self):
        """ Check that given an empty string we return an empty string."""
        self.assertEqual(decode_wp_content(""), "")

    def test_decode_wp_content(self):
        """ Check that we can decode a wordpress content string."""
        with open(WORDPRESS_ENCODED_CONTENT_SAMPLE, 'r') as encoded_file:
            encoded_content = encoded_file.read()
            with open(WORDPRESS_DECODED_CONTENT_SAMPLE, 'r') as decoded_file:
                decoded_content = decoded_file.read()
                self.assertEqual(decode_wp_content(encoded_content, br=False), decoded_content)

    def test_preserve_verbatim_formatting(self):
        def r(f):
            with open(f) as infile:
                return infile.read()
        silent_f2p = mute(True)(fields2pelican)
        test_post = filter(lambda p: p[0].startswith("Code in List"), self.posts)
        with temporary_folder() as temp:
            md = [r(f) for f in silent_f2p(test_post, 'markdown', temp)][0]
            self.assertTrue(re.search(r'\s+a = \[1, 2, 3\]', md))
            self.assertTrue(re.search(r'\s+b = \[4, 5, 6\]', md))

            for_line = re.search(r'\s+for i in zip\(a, b\):', md).group(0)
            print_line = re.search(r'\s+print i', md).group(0)
            self.assertTrue(for_line.rindex('for') < print_line.rindex('print'))

    def test_code_in_list(self):
        def r(f):
            with open(f) as infile:
                return infile.read()
        silent_f2p = mute(True)(fields2pelican)
        test_post = filter(lambda p: p[0].startswith("Code in List"), self.posts)
        with temporary_folder() as temp:
            md = [r(f) for f in silent_f2p(test_post, 'markdown', temp)][0]
            sample_line = re.search(r'-   This is a code sample', md).group(0)
            code_line = re.search(r'\s+a = \[1, 2, 3\]', md).group(0)
            self.assertTrue(sample_line.rindex('This') < code_line.rindex('a'))


class TestBuildHeader(unittest.TestCase):
    def test_build_header(self):
        header = build_header('test', None, None, None, None, None)
        self.assertEqual(header, 'test\n####\n\n')

    def test_build_header_with_east_asian_characters(self):
        header = build_header('これは広い幅の文字だけで構成されたタイトルです',
                None, None, None, None, None)

        self.assertEqual(header,
                'これは広い幅の文字だけで構成されたタイトルです\n' +
                '##############################################\n\n')

    def test_galleries_added_to_header(self):
        header = build_header('test', None, None, None, None,
                None, ['output/test1', 'output/test2'])
        self.assertEqual(header, 'test\n####\n' + ':attachments: output/test1, '
                + 'output/test2\n\n')

    def test_galleries_added_to_markdown_header(self):
        header = build_markdown_header('test', None, None, None, None, None,
            ['output/test1', 'output/test2'])
        self.assertEqual(header, 'Title: test\n' + 'Attachments: output/test1, '
                + 'output/test2\n\n')

@unittest.skipUnless(BeautifulSoup, 'Needs BeautifulSoup module')
class TestWordpressXMLAttachements(unittest.TestCase):
    def setUp(self):
        self.old_locale = locale.setlocale(locale.LC_ALL)
        locale.setlocale(locale.LC_ALL, str('C'))
        self.attachments = get_attachments(WORDPRESS_XML_SAMPLE)

    def tearDown(self):
        locale.setlocale(locale.LC_ALL, self.old_locale)

    def test_recognise_attachments(self):
        self.assertTrue(self.attachments)
        self.assertTrue(len(self.attachments.keys()) == 3)

    def test_attachments_associated_with_correct_post(self):
        self.assertTrue(self.attachments)
        for post in self.attachments.keys():
            if post is None:
                self.assertTrue(self.attachments[post][0] == 'https://upload.wikimedia.org/wikipedia/commons/thumb/2/2c/Pelican_lakes_entrance02.jpg/240px-Pelican_lakes_entrance02.jpg')
            elif post == 'with-excerpt':
                self.assertTrue(self.attachments[post][0] == 'http://thisurlisinvalid.notarealdomain/not_an_image.jpg')
                self.assertTrue(self.attachments[post][1] == 'http://en.wikipedia.org/wiki/File:Pelikan_Walvis_Bay.jpg')
            elif post == 'with-tags':
                self.assertTrue(self.attachments[post][0] == 'http://thisurlisinvalid.notarealdomain')
            else:
                self.fail('all attachments should match to a filename or None, {}'.format(post))

    def test_download_attachments(self):
        real_file = os.path.join(CUR_DIR, 'content/article.rst')
        good_url = 'file://' + real_file
        bad_url = 'http://localhost:1/not_a_file.txt'
        silent_da = mute()(download_attachments)
        with temporary_folder() as temp:
            #locations = download_attachments(temp, [good_url, bad_url])
            locations = list(silent_da(temp, [good_url, bad_url]))
            self.assertTrue(len(locations) == 1)
            directory = locations[0]
            self.assertTrue(directory.endswith('content/article.rst'))