From 6888a046362316f98fb3aaf2982ca246ad724f30 Mon Sep 17 00:00:00 2001 From: Aaron Kavlie Date: Thu, 12 Apr 2012 19:38:59 -0700 Subject: [PATCH 1/8] Issue #311 Catch BeautifulSoup ImportError. --- pelican/tools/pelican_import.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index 57c4fc22..a4d64c67 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -13,7 +13,12 @@ from pelican.utils import slugify def wp2fields(xml): """Opens a wordpress XML file, and yield pelican fields""" - from BeautifulSoup import BeautifulStoneSoup + try: + from BeautifulSoup import BeautifulStoneSoup + except ImportError: + error = 'Missing dependency ' + \ + '"BeautifulSoup" required to import Wordpress files.' + sys.exit(error) xmlfile = open(xml, encoding='utf-8').read() soup = BeautifulStoneSoup(xmlfile) @@ -40,7 +45,13 @@ def wp2fields(xml): def dc2fields(file): """Opens a Dotclear export file, and yield pelican fields""" - from BeautifulSoup import BeautifulStoneSoup + try: + from BeautifulSoup import BeautifulStoneSoup + except ImportError: + error = 'Missing dependency ' + \ + '"BeautifulSoup" required to import Dotclear files.' + sys.exit(error) + in_cat = False in_post = False From 23c05ad7dbd46e61d1cd1cfe193510601d7c2299 Mon Sep 17 00:00:00 2001 From: Aaron Kavlie Date: Thu, 12 Apr 2012 19:53:03 -0700 Subject: [PATCH 2/8] Issue #311, #312 Document BeautifulSoup & pandoc deps. --- docs/importer.rst | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/docs/importer.rst b/docs/importer.rst index 377820af..96e9e729 100644 --- a/docs/importer.rst +++ b/docs/importer.rst @@ -19,6 +19,23 @@ The conversion from HTML to reStructuredText relies on `pandoc written with Markdown syntax, they will not be converted (as Pelican also supports Markdown). 
+Dependencies +"""""""""""" + +``pelican-import`` has two additional dependencies not included with pelican +by default: + +- BeautifulSoup +- pandoc + +BeautifulSoup can be installed like any other Python package:: + + $ pip install BeautifulSoup + +For pandoc, install a package for your operating system from the +`pandoc site <http://johnmacfarlane.net/pandoc/installing.html>`_. + + Usage """"" @@ -26,8 +43,8 @@ Usage | [--dir-cat] | input -Optional arguments: -""""""""""""""""""" +Optional arguments +"""""""""""""""""" -h, --help show this help message and exit --wpfile Wordpress XML export From 6577efc8f466acf8f6e639528959e2915a7f9413 Mon Sep 17 00:00:00 2001 From: Aaron Kavlie Date: Wed, 18 Apr 2012 00:20:54 -0700 Subject: [PATCH 3/8] Wrap paragraphs in

tags --- pelican/tools/pelican_import.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index a4d64c67..01253960 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -17,7 +17,7 @@ def wp2fields(xml): from BeautifulSoup import BeautifulStoneSoup except ImportError: error = 'Missing dependency ' + \ - '"BeautifulSoup" required to import Wordpress files.' + '"BeautifulSoup" required to import Wordpress XML files.' sys.exit(error) xmlfile = open(xml, encoding='utf-8').read() @@ -226,7 +226,10 @@ def fields2pelican(fields, out_markup, output_path, dircat=False): with open(html_filename, 'w', encoding='utf-8') as fp: # Replace simple newlines with
+newline so that the HTML file # represents the original post more accurately - content = content.replace("\n", "
\n") + paragraphs = content.split('\n\n') + paragraphs = ['

%s

' % p for p in paragraphs] + new_content = ''.join(paragraphs) + fp.write(content) cmd = 'pandoc --normalize --reference-links --from=html --to={0} -o "{1}" "{2}"'.format( From 9491bb40d4127b29e2dc68d421d96aca3eb32e98 Mon Sep 17 00:00:00 2001 From: Aaron Kavlie Date: Wed, 18 Apr 2012 00:24:52 -0700 Subject: [PATCH 4/8] Add --no-wrap option to pandoc, fixing issue with long links names (another fix for issue #314) --- pelican/tools/pelican_import.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index 01253960..9a19f33c 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -232,7 +232,7 @@ def fields2pelican(fields, out_markup, output_path, dircat=False): fp.write(content) - cmd = 'pandoc --normalize --reference-links --from=html --to={0} -o "{1}" "{2}"'.format( + cmd = 'pandoc --normalize --no-wrap --reference-links --from=html --to={0} -o "{1}" "{2}"'.format( out_markup, out_filename, html_filename) try: From cc30695b72772a5faab8bfccf217e2e1397b4f9f Mon Sep 17 00:00:00 2001 From: Aaron Kavlie Date: Wed, 18 Apr 2012 09:29:47 -0700 Subject: [PATCH 5/8] Correct comment; switch to new style string formatting. --- pelican/tools/pelican_import.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index 9a19f33c..b45d4fec 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -224,10 +224,10 @@ def fields2pelican(fields, out_markup, output_path, dircat=False): html_filename = os.path.join(output_path, filename+'.html') with open(html_filename, 'w', encoding='utf-8') as fp: - # Replace simple newlines with
+newline so that the HTML file - # represents the original post more accurately + # Replace newlines with paragraphs wrapped with

so + # HTML is valid before conversion paragraphs = content.split('\n\n') - paragraphs = ['

%s

' % p for p in paragraphs] + paragraphs = ['

{}

'.format(p) for p in paragraphs] new_content = ''.join(paragraphs) fp.write(content) From 36a53442821fbdf379e45c309906fdf0a6f30193 Mon Sep 17 00:00:00 2001 From: Aaron Kavlie Date: Wed, 18 Apr 2012 22:14:53 -0700 Subject: [PATCH 6/8] Beautify two-line string concat. --- pelican/tools/pelican_import.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index b45d4fec..fdf28d14 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -16,8 +16,8 @@ def wp2fields(xml): try: from BeautifulSoup import BeautifulStoneSoup except ImportError: - error = 'Missing dependency ' + \ - '"BeautifulSoup" required to import Wordpress XML files.' + error = ('Missing dependency ' + '"BeautifulSoup" required to import Wordpress XML files.') sys.exit(error) xmlfile = open(xml, encoding='utf-8').read() @@ -48,8 +48,8 @@ def dc2fields(file): try: from BeautifulSoup import BeautifulStoneSoup except ImportError: - error = 'Missing dependency ' + \ - '"BeautifulSoup" required to import Dotclear files.' + error = ('Missing dependency ' + '"BeautifulSoup" required to import Dotclear files.') sys.exit(error) From 5cad4c46f06963c58d43dbaf6e2f5addbec663ea Mon Sep 17 00:00:00 2001 From: Aaron Kavlie Date: Wed, 18 Apr 2012 22:17:43 -0700 Subject: [PATCH 7/8] Improve wording of docs re: pelican-import deps. --- docs/importer.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/importer.rst b/docs/importer.rst index 96e9e729..0147f900 100644 --- a/docs/importer.rst +++ b/docs/importer.rst @@ -22,8 +22,7 @@ supports Markdown). 
Dependencies """""""""""" -``pelican-import`` has two additional dependencies not included with pelican -by default: +``pelican-import`` has two dependencies not required by the rest of pelican: - BeautifulSoup - pandoc From 5710dc771d6951519eea5209f388fe17b79b973c Mon Sep 17 00:00:00 2001 From: Aaron Kavlie Date: Wed, 18 Apr 2012 22:28:49 -0700 Subject: [PATCH 8/8] Remove --no-wrap; change para formatting to unicode string. --- pelican/tools/pelican_import.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index fdf28d14..050b1010 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -227,12 +227,12 @@ def fields2pelican(fields, out_markup, output_path, dircat=False): # Replace newlines with paragraphs wrapped with

so # HTML is valid before conversion paragraphs = content.split('\n\n') - paragraphs = ['

{}

'.format(p) for p in paragraphs] + paragraphs = [u'

{}

'.format(p) for p in paragraphs] new_content = ''.join(paragraphs) fp.write(content) - cmd = 'pandoc --normalize --no-wrap --reference-links --from=html --to={0} -o "{1}" "{2}"'.format( + cmd = 'pandoc --normalize --reference-links --from=html --to={0} -o "{1}" "{2}"'.format( out_markup, out_filename, html_filename) try: