Merge pull request #577 from davidjb/import-improvements-slug

Provide slug storage option for posts during Pelican import
2025-10-15 20:28:56 +02:00 · 2012-12-11 03:52:12 -08:00 · 2012-12-11 03:52:12 -08:00 · 98c8db568b
commit 98c8db568b
parent f79c844855 b4c5d7cf62
4 changed files with 64 additions and 16 deletions
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -9,6 +9,8 @@ Release history
 3.1 (2012-12-04)
 ================

+* Importer now stores slugs within files by default. This can be disabled with
+  the ``--disable-slugs`` option.
 * Improve handling of links to intra-site resources
 * Ensure WordPress import adds paragraphs for all types of line endings
  in post content
--- a/docs/importer.rst
+++ b/docs/importer.rst
@ -39,29 +39,44 @@ Usage
 """""

 | pelican-import [-h] [--wpfile] [--dotclear] [--feed] [-o OUTPUT]
-|                [-m MARKUP][--dir-cat]
+|                [-m MARKUP] [--dir-cat] [--strip-raw] [--disable-slugs]
 |                input

+Positional arguments
+""""""""""""""""""""
+
+  input                 The input file to read
+
 Optional arguments
 """"""""""""""""""

  -h, --help            show this help message and exit
-  --wpfile              Wordpress XML export
-  --dotclear            Dotclear export
-  --feed                Feed to parse
+  --wpfile              Wordpress XML export (default: False)
+  --dotclear            Dotclear export (default: False)
+  --feed                Feed to parse (default: False)
  -o OUTPUT, --output OUTPUT
-                        Output path
-  -m MARKUP             Output markup
+                        Output path (default: output)
+  -m MARKUP, --markup MARKUP
+                        Output markup format (supports rst & markdown)
+                        (default: rst)
  --dir-cat             Put files in directories with categories name
+                        (default: False)
+  --strip-raw           Strip raw HTML code that can't be converted to markup
+                        such as flash embeds or iframes (wordpress import
+                        only) (default: False)
+  --disable-slugs       Disable storing slugs from imported posts within
+                        output. With this disabled, your Pelican URLs may not
+                        be consistent with your original posts. (default:
+                        False)

 Examples
 ========

-for WordPress::
+For WordPress::

    $ pelican-import --wpfile -o ~/output ~/posts.xml

-for Dotclear::
+For Dotclear::

    $ pelican-import --dotclear -o ~/output ~/backup.txt

--- a/pelican/tools/pelican_import.py
+++ b/pelican/tools/pelican_import.py
@ -181,7 +181,7 @@ def feed2fields(file):
        yield (entry.title, entry.description, slug, date, author, [], tags, "html")


-def build_header(title, date, author, categories, tags):
+def build_header(title, date, author, categories, tags, slug):
    """Build a header from a list of fields"""
    header = '%s\n%s\n' % (title, '#' * len(title))
    if date:
@ -192,10 +192,12 @@ def build_header(title, date, author, categories, tags):
        header += ':category: %s\n' % ', '.join(categories)
    if tags:
        header += ':tags: %s\n' % ', '.join(tags)
+    if slug:
+        header += ':slug: %s\n' % slug
    header += '\n'
    return header

-def build_markdown_header(title, date, author, categories, tags):
+def build_markdown_header(title, date, author, categories, tags, slug):
    """Build a header from a list of fields"""
    header = 'Title: %s\n' % title
    if date:
@ -206,18 +208,21 @@ def build_markdown_header(title, date, author, categories, tags):
        header += 'Category: %s\n' % ', '.join(categories)
    if tags:
        header += 'Tags: %s\n' % ', '.join(tags)
+    if slug:
+        header += 'Slug: %s\n' % slug
    header += '\n'
    return header

-def fields2pelican(fields, out_markup, output_path, dircat=False, strip_raw=False):
+def fields2pelican(fields, out_markup, output_path, dircat=False, strip_raw=False, disable_slugs=False):
    for title, content, filename, date, author, categories, tags, in_markup in fields:
+        slug = not disable_slugs and filename or None
        if (in_markup == "markdown") or (out_markup == "markdown") :
            ext = '.md'
-            header = build_markdown_header(title, date, author, categories, tags)
+            header = build_markdown_header(title, date, author, categories, tags, slug)
        else:
            out_markup = "rst"
            ext = '.rst'
-            header = build_header(title, date, author, categories, tags)
+            header = build_header(title, date, author, categories, tags, slug)

        filename = os.path.basename(filename)

@ -278,8 +283,8 @@ def fields2pelican(fields, out_markup, output_path, dircat=False, strip_raw=Fals

 def main():
    parser = argparse.ArgumentParser(
-        description="Transform feed, Wordpress or Dotclear files to rst files."
-            "Be sure to have pandoc installed",
+        description="Transform feed, Wordpress or Dotclear files to reST (rst) "
+                    "or Markdown (md) files. Be sure to have pandoc installed.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(dest='input', help='The input file to read')
@ -298,6 +303,11 @@ def main():
    parser.add_argument('--strip-raw', action='store_true', dest='strip_raw',
        help="Strip raw HTML code that can't be converted to "
             "markup such as flash embeds or iframes (wordpress import only)")
+    parser.add_argument('--disable-slugs', action='store_true',
+        dest='disable_slugs',
+        help='Disable storing slugs from imported posts within output. '
+             'With this disabled, your Pelican URLs may not be consistent '
+             'with your original posts.')

    args = parser.parse_args()

@ -328,4 +338,5 @@ def main():

    fields2pelican(fields, args.markup, args.output,
                   dircat=args.dircat or False,
-                   strip_raw=args.strip_raw or False)
+                   strip_raw=args.strip_raw or False,
+                   disable_slugs=args.disable_slugs or False)
--- a/tests/test_importer.py
+++ b/tests/test_importer.py
@ -48,6 +48,26 @@ class TestWordpressXmlImporter(unittest.TestCase):
                         strip_raw=True))
            self.assertFalse(any('<iframe' in rst for rst in rst_files))

+    def test_can_toggle_slug_storage(self):
+
+        posts = list(self.posts)
+        r = lambda f: open(f).read()
+        silent_f2p = mute(True)(fields2pelican)
+
+        with temporary_folder() as temp:
+
+            rst_files = (r(f) for f in silent_f2p(posts, 'markdown', temp))
+            self.assertTrue(all('Slug:' in rst for rst in rst_files))
+            rst_files = (r(f) for f in silent_f2p(posts, 'markdown', temp,
+                         disable_slugs=True))
+            self.assertFalse(any('Slug:' in rst for rst in rst_files))
+
+            rst_files = (r(f) for f in silent_f2p(posts, 'rst', temp))
+            self.assertTrue(all(':slug:' in rst for rst in rst_files))
+            rst_files = (r(f) for f in silent_f2p(posts, 'rst', temp,
+                         disable_slugs=True))
+            self.assertFalse(any(':slug:' in rst for rst in rst_files))
+
    def test_decode_html_entities_in_titles(self):
        posts = list(self.posts)
        test_posts = [post for post in posts if post[2] == 'html-entity-test']