Merge pull request #151 from saimn/importer

improvements for importers
2025-10-15 20:28:56 +02:00 · 2011-08-04 01:21:59 -07:00 · 2011-08-04 01:21:59 -07:00 · d6a2946aef
commit d6a2946aef
parent 4685e41173 7b7695509d
3 changed files with 105 additions and 37 deletions
--- a/docs/importer.rst
+++ b/docs/importer.rst
@ -0,0 +1,56 @@
+=================================
+ Import from other blog software
+=================================
+
+Description
+===========
+
+``importer.py`` is a command line tool for converting articles from other
+software to ReStructuredText. The supported formats are:
+
+- Wordpress XML export
+- Dotclear export
+- RSS/ATOM feed
+
+The conversion from HTML to ReStructuredText relies on `pandoc
+<http://johnmacfarlane.net/pandoc/>`_. For Dotclear, if the source posts are
+written with Markdown syntax, they will not be converted (as Pelican also
+supports Markdown).
+
+Usage
+"""""
+
+| importer.py [-h] [--wpfile] [--dotclear] [--feed] [-o OUTPUT]
+|                    [--dir-cat]
+|                    input
+
+Optional arguments:
+"""""""""""""""""""
+
+  -h, --help            show this help message and exit
+  --wpfile              Wordpress XML export
+  --dotclear            Dotclear export
+  --feed                Feed to parse
+  -o OUTPUT, --output OUTPUT
+                        Output path
+  --dir-cat             Put files in directories with categories name
+
+Examples
+========
+
+for Wordpress::
+
+    $ python2 tools/importer.py --wpfile -o ~/output ~/posts.xml
+
+for Dotclear::
+
+    $ python2 tools/importer.py --dotclear -o ~/output ~/backup.txt
+
+
+Tests
+=====
+
+To test the module, one can use sample files:
+
+- for Wordpress: http://wpcandy.com/made/the-sample-post-collection
+- for Dotclear: http://themes.dotaddict.org/files/public/downloads/lorem-backup.txt
--- a/docs/index.rst
+++ b/docs/index.rst
@ -42,8 +42,8 @@ If you want to see new features in Pelican, dont hesitate to tell me, to clone
 the repository, etc. That's open source, dude!

 Contact me at "alexis at notmyidea dot org" for any request/feedback! You can
-also join the team at `#pelican on irc.freenode.org 
-<irc://irc.freenode.net/pelican>`_ 
+also join the team at `#pelican on irc.freenode.org
+<irc://irc.freenode.net/pelican>`_
 (or if you don't have any IRC client, using `the webchat
 <http://webchat.freenode.net/?channels=pelican&uio=d4>`_)
 for quick feedback.
@ -55,11 +55,12 @@ A french version of the documentation is available at :doc:`fr/index`.

 .. toctree::
   :maxdepth: 2
-   
+
   getting_started
   settings
   themes
   internals
   pelican-themes
+   importer
   faq
   contribute
--- a/tools/importer.py
+++ b/tools/importer.py
@ -27,7 +27,9 @@ def wp2fields(xml):
            date = time.strftime("%Y-%m-%d %H:%M", date_object)

            author = item.fetch('dc:creator')[0].contents[0].title()
-            categories = [(cat['nicename'],cat.contents[0]) for cat in item.fetch(domain='category')]
+
+            categories = [cat.contents[0] for cat in item.fetch(domain='category')]
+            # caturl = [cat['nicename'] for cat in item.fetch(domain='category')]

            tags = [tag.contents[0].title() for tag in item.fetch(domain='tag', nicename=None)]

@ -101,16 +103,22 @@ def dc2fields(file):
        # post_meta = fields[27]
        # redirect_url = fields[28][:-1]

+        # remove seconds
+        post_creadt = ':'.join(post_creadt.split(':')[0:2])
+
        author = ""
-        categories = ""
+        categories = []
+        tags = []
+
        if cat_id:
-            categories = category_list[cat_id]
-        tags = ""
+            categories = [category_list[id].strip() for id in cat_id.split(',')]

        if post_format == "markdown":
            content = post_excerpt + post_content
        else:
            content = post_excerpt_xhtml + post_content_xhtml
+            content = content.replace('\\n', '')
+            post_format = "html"

        yield (post_title, content, post_url, post_creadt, author, categories, tags, post_format)

@ -135,7 +143,7 @@ def build_header(title, date, author, categories, tags):
    if date:
        header += ':date: %s\n' % date
    if categories:
-        header += ':category: %s\n' % categories
+        header += ':category: %s\n' % ', '.join(categories)
    if tags:
        header += ':tags: %s\n' % ', '.join(tags)
    header += '\n'
@ -147,52 +155,53 @@ def build_markdown_header(title, date, author, categories, tags):
    if date:
        header += 'Date: %s\n' % date
    if categories:
-        header += 'Category: %s\n' % categories
+        header += 'Category: %s\n' % ', '.join(categories)
    if tags:
        header += 'Tags: %s\n' % ', '.join(tags)
    header += '\n'
    return header

-def fields2pelican(fields, output_path):
+def fields2pelican(fields, output_path, dircat=False):
    for title, content, filename, date, author, categories, tags, markup in fields:
        if markup == "markdown":
-            md_filename = os.path.join(output_path, filename+'.md')
+            ext = '.md'
            header = build_markdown_header(title, date, author, categories, tags)
-
-            # content.replace('\r\n', '\n')
-
-            with open(md_filename, 'w', encoding='utf-8') as fp:
-                fp.write(header+content)
-
        else:
-            filename = os.path.basename(filename)
-            html_filename = os.path.join(output_path, filename+'.html')
+            ext = '.rst'
+            header = build_header(title, date, author, categories, tags)

-            # if(len(categories) == 1):
-            #     rst_filename = os.path.join(output_path, categories[0][0], filename+'.rst')
-            #     if not os.path.isdir(os.path.join(output_path, categories[0][0])):
-            #         os.mkdir(os.path.join(output_path, categories[0][0]))
-            # else:
-            rst_filename = os.path.join(output_path, filename+'.rst')
+        filename = os.path.basename(filename)
+
+        # option to put files in directories with categories names
+        if dircat and (len(categories) == 1):
+            catname = categories[0]
+            out_filename = os.path.join(output_path, catname, filename+'.rst')
+            if not os.path.isdir(os.path.join(output_path, catname)):
+                os.mkdir(os.path.join(output_path, catname))
+        else:
+            out_filename = os.path.join(output_path, filename+ext)
+
+        print out_filename
+
+        if markup == "html":
+            html_filename = os.path.join(output_path, filename+'.html')

            with open(html_filename, 'w', encoding='utf-8') as fp:
                fp.write(content)

-            print rst_filename
-            os.system('pandoc --normalize --reference-links --from=html --to=rst -o %s %s' % (rst_filename,
+            os.system('pandoc --normalize --reference-links --from=html --to=rst -o "%s" "%s"' % (out_filename,
                html_filename))

            os.remove(html_filename)

-            with open(rst_filename, 'r', encoding='utf-8') as fs:
+            with open(out_filename, 'r', encoding='utf-8') as fs:
                content = fs.read()
-            with open(rst_filename, 'w', encoding='utf-8') as fs:
-                # categories = [x[1] for x in categories]
-                header = build_header(title, date, author, categories, tags)
-                fs.write(header + content)
+
+        with open(out_filename, 'w', encoding='utf-8') as fs:
+            fs.write(header + content)


-def main(input_type, input, output_path):
+def main(input_type, input, output_path, dircat=False):
    if input_type == 'wordpress':
        fields = wp2fields(input)
    elif input_type == 'dotclear':
@ -200,13 +209,13 @@ def main(input_type, input, output_path):
    elif input_type == 'feed':
        fields = feed2fields(input)

-    fields2pelican(fields, output_path)
+    fields2pelican(fields, output_path, dircat=dircat)


 if __name__ == '__main__':
    parser = argparse.ArgumentParser(
-            description="Transform even feed or XML files to rst files."
-                    "Be sure to have pandoc installed")
+            description="Transform feed, Wordpress or Dotclear files to rst files."
+                        "Be sure to have pandoc installed")

    parser.add_argument(dest='input', help='The input file to read')
    parser.add_argument('--wpfile', action='store_true', dest='wpfile',
@ -214,9 +223,11 @@ if __name__ == '__main__':
    parser.add_argument('--dotclear', action='store_true', dest='dotclear',
            help='Dotclear export')
    parser.add_argument('--feed', action='store_true', dest='feed',
-            help='feed to parse')
+            help='Feed to parse')
    parser.add_argument('-o', '--output', dest='output', default='output',
            help='Output path')
+    parser.add_argument('--dir-cat', action='store_true', dest='dircat',
+            help='Put files in directories with categories name')
    args = parser.parse_args()

    input_type = None
@ -227,6 +238,6 @@ if __name__ == '__main__':
    elif args.feed:
        input_type = 'feed'
    else:
-        print "you must provide either --wpfile or --feed options"
+        print "you must provide either --wpfile, --dotclear or --feed options"
        exit()
-    main(input_type, args.input, args.output)
+    main(input_type, args.input, args.output, dircat=args.dircat)