diff --git a/docs/faq.rst b/docs/faq.rst
index 80e14d21..bb9377e6 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -205,3 +205,22 @@ You can also disable generation of tag-related pages via::
 
     TAGS_SAVE_AS = ''
     TAG_SAVE_AS = ''
+
+Why does Pelican always write all HTML files even with content caching enabled?
+===============================================================================
+
+In order to reliably determine whether the HTML output would differ
+before writing it, a large part of the generation environment,
+including the template contexts, imported plugins, etc., would have
+to be saved and compared, at least in the form of a hash (which would
+require special handling of unhashable types), because plugins,
+pagination, and other settings can change the output in many
+different ways. This would cost considerably more processing time,
+memory, and storage space. Simply writing the files each time is both
+faster and more reliable.
+
+However, this means that the modification time of the files changes
+on every run, so an ``rsync``-based upload will transfer them even if
+their content hasn't changed. A simple solution is to make ``rsync``
+use the ``--checksum`` option, which compares file checksums, a much
+faster check than Pelican itself could perform.
diff --git a/docs/settings.rst b/docs/settings.rst
index 36cc3f9a..d8690230 100644
--- a/docs/settings.rst
+++ b/docs/settings.rst
@@ -173,6 +173,12 @@ Setting name (default value)
 `SLUGIFY_SOURCE` (``'input'``)  Specifies where you want the slug to be automatically generated from. Can be set to 'title' to use the 'Title:' metadata tag or 'basename' to use the articles basename when creating the slug.
+`CACHE_CONTENT` (``True``)  If ``True``, save read content in a cache file.
+                            See :ref:`reading_only_modified_content` for details about caching.
+`CACHE_DIRECTORY` (``cache``)  Directory in which to store cache files.
+`CHECK_MODIFIED_METHOD` (``mtime``)  Controls how files are checked for modifications: ``'mtime'`` or the name of a ``hashlib`` hash function, e.g. ``'md5'``.
+`LOAD_CONTENT_CACHE` (``True``)  If ``True``, load unmodified content from cache.
+`GZIP_CACHE` (``True``)  If ``True``, use gzip to (de)compress the cache files.
 ===============================================================================  =====================================================
 .. [#] Default is the system locale.
@@ -602,7 +608,7 @@ Setting name (default value)            What does it do?
 .. [3] %s is the language
 
 Ordering content
-=================
+================
 
 ================================================  =====================================================
 Setting name (default value)                      What does it do?
@@ -697,7 +703,6 @@ adding the following to your configuration::
 
     CSS_FILE = "wide.css"
 
-
 Logging
 =======
 
@@ -713,6 +718,61 @@ be filtered out. For example:
 
 ``[(logging.WARN, 'TAG_SAVE_AS is set to False')]``
+
+.. _reading_only_modified_content:
+
+Reading only modified content
+=============================
+
+To speed up the build process, Pelican can optionally read only the
+articles and pages whose content has been modified.
+
+When Pelican is about to read a content source file:
+
+1. The hash or modification time information for the file from a
+   previous build is loaded from a cache file if `LOAD_CONTENT_CACHE`
+   is ``True``. These cache files are stored in the `CACHE_DIRECTORY`
+   directory. If the file has no record in the cache file, it is read
+   as usual.
+2. The file is checked according to `CHECK_MODIFIED_METHOD`:
+
+   - If set to ``'mtime'``, the modification time of the file is
+     checked.
+   - If set to the name of a function provided by the ``hashlib``
+     module, e.g. ``'md5'``, the file hash is checked.
+   - If set to anything else, or if the necessary information about
+     the file cannot be found in the cache file, the content is read
+     as usual.
+
+3. If the file is considered unchanged, the content object saved for
+   that file during a previous build is loaded from the cache and the
+   file is not read.
+4. If the file is considered changed, the file is read, and the new
+   modification information and the content object are saved to the
+   cache if `CACHE_CONTENT` is ``True``.
+
+Checking modification times is faster than comparing file hashes, but
+it is not as reliable, because mtime information can be lost, e.g.
+when copying the content source files with the ``cp`` or ``rsync``
+commands without their mtime preservation mode (which, for example,
+``--archive`` enables).
+
+The cache files are Python pickles, so they may not be readable by
+different versions of Python, as the pickle format often changes. If
+such an error is encountered, the cache files have to be rebuilt using
+the ``pelican`` command-line option ``--full-rebuild``. The cache
+files also have to be rebuilt after changing the `GZIP_CACHE` setting,
+otherwise reading the existing cache files will fail.
+
+The ``--full-rebuild`` command-line option is also useful when the
+whole site needs to be regenerated, e.g. due to modifications of the
+settings file or the theme files. When Pelican runs in autoreload
+mode, a modification of the settings file or the theme will trigger a
+full rebuild automatically.
+
+Note that even when using cached content, all output is always
+written, so the modification times of the generated ``*.html`` files
+always change. Therefore, an ``rsync``-based upload may benefit from
+the ``--checksum`` option.
 
 Example settings
 ================
diff --git a/pelican/__init__.py b/pelican/__init__.py
index 494e7e43..b6bfe326 100644
--- a/pelican/__init__.py
+++ b/pelican/__init__.py
@@ -260,6 +260,10 @@ def parse_arguments():
                         action='store_true',
                         help='Relaunch pelican each time a modification occurs'
                         ' on the content files.')
+
+    parser.add_argument('-f', '--full-rebuild', action='store_true',
+        dest='full_rebuild', help='Rebuild everything by not loading from cache')
+
     return parser.parse_args()
 
@@ -275,6 +279,8 @@ def get_config(args):
         config['THEME'] = abstheme if os.path.exists(abstheme) else args.theme
     if args.delete_outputdir is not None:
         config['DELETE_OUTPUT_DIRECTORY'] = args.delete_outputdir
+    if args.full_rebuild:
+        config['LOAD_CONTENT_CACHE'] = False
 
     # argparse returns bytes in Py2. There is no definite answer as to which
     # encoding argparse (or sys.argv) uses.
@@ -327,6 +333,7 @@ def main():
         print(' --- AutoReload Mode: Monitoring `content`, `theme` and'
               ' `settings` for changes. ---')
 
+        first_run = True  # load the cache on the first run
         while True:
             try:
                 # Check source dir for changed files ending with the given
@@ -335,9 +342,14 @@ def main():
                 # have changed, no matter what extension the filenames
                 # have.
                 modified = {k: next(v) for k, v in watchers.items()}
+                original_load_cache = settings['LOAD_CONTENT_CACHE']
 
                 if modified['settings']:
                     pelican, settings = get_instance(args)
+                    if not first_run:
+                        original_load_cache = settings['LOAD_CONTENT_CACHE']
+                        # settings changed: invalidate the content cache
+                        pelican.settings['LOAD_CONTENT_CACHE'] = False
 
                 if any(modified.values()):
                     print('\n-> Modified: {}. 
re-generating...'.format( @@ -349,8 +361,15 @@ def main(): if modified['theme'] is None: logger.warning('Empty theme folder. Using `basic` ' 'theme.') + elif modified['theme']: + # theme modified, needs full rebuild -> no cache + if not first_run: # but not on first run + pelican.settings['LOAD_CONTENT_CACHE'] = False pelican.run() + first_run = False + # restore original caching policy + pelican.settings['LOAD_CONTENT_CACHE'] = original_load_cache except KeyboardInterrupt: logger.warning("Keyboard interrupt, quitting.") diff --git a/pelican/contents.py b/pelican/contents.py index 615a7fd8..c02047b8 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -325,6 +325,13 @@ class Content(object): os.path.abspath(self.settings['PATH'])) ) + def __eq__(self, other): + """Compare with metadata and content of other Content object""" + return other and self.metadata == other.metadata and self.content == other.content + + # keep basic hashing functionality for caching to work + __hash__ = object.__hash__ + class Page(Content): mandatory_properties = ('title',) diff --git a/pelican/generators.py b/pelican/generators.py index bfdac1a5..7c2dbbf2 100644 --- a/pelican/generators.py +++ b/pelican/generators.py @@ -20,14 +20,15 @@ from jinja2 import (Environment, FileSystemLoader, PrefixLoader, ChoiceLoader, from pelican.contents import Article, Draft, Page, Static, is_valid_content from pelican.readers import Readers -from pelican.utils import copy, process_translations, mkdir_p, DateFormatter +from pelican.utils import (copy, process_translations, mkdir_p, DateFormatter, + FileStampDataCacher) from pelican import signals logger = logging.getLogger(__name__) -class Generator(object): +class Generator(FileStampDataCacher): """Baseclass generator""" def __init__(self, context, settings, path, theme, output_path, **kwargs): @@ -73,6 +74,10 @@ class Generator(object): custom_filters = self.settings['JINJA_FILTERS'] self.env.filters.update(custom_filters) + # set up caching + super(Generator, self).__init__(settings, 'CACHE_CONTENT', + 'LOAD_CONTENT_CACHE') + signals.generator_init.send(self) def get_template(self, name): @@ -408,20 +413,24 @@ class ArticlesGenerator(Generator): for f in self.get_files( self.settings['ARTICLE_DIR'], exclude=self.settings['ARTICLE_EXCLUDES']): - try: - article = self.readers.read_file( - base_path=self.path, path=f, content_class=Article, - context=self.context, - preread_signal=signals.article_generator_preread, - preread_sender=self, - context_signal=signals.article_generator_context, - context_sender=self) - except Exception as e: - logger.warning('Could not process {}\n{}'.format(f, e)) - continue + article = self.get_cached_data(f, None) + if article is None: + try: + article = self.readers.read_file( + base_path=self.path, path=f, content_class=Article, + context=self.context, + preread_signal=signals.article_generator_preread, + preread_sender=self, + context_signal=signals.article_generator_context, + context_sender=self) + except Exception as e: + logger.warning('Could not process {}\n{}'.format(f, e)) + continue - if not is_valid_content(article, f): - continue + if not is_valid_content(article, f): + continue + + self.cache_data(f, article) self.add_source_path(article) @@ -502,7 +511,7 @@ class ArticlesGenerator(Generator): self._update_context(('articles', 'dates', 'tags', 'categories', 'tag_cloud', 'authors', 'related_posts')) - + self.save_cache() signals.article_generator_finalized.send(self) def generate_output(self, writer): @@ -527,20 +536,24 @@ class 
PagesGenerator(Generator): for f in self.get_files( self.settings['PAGE_DIR'], exclude=self.settings['PAGE_EXCLUDES']): - try: - page = self.readers.read_file( - base_path=self.path, path=f, content_class=Page, - context=self.context, - preread_signal=signals.page_generator_preread, - preread_sender=self, - context_signal=signals.page_generator_context, - context_sender=self) - except Exception as e: - logger.warning('Could not process {}\n{}'.format(f, e)) - continue + page = self.get_cached_data(f, None) + if page is None: + try: + page = self.readers.read_file( + base_path=self.path, path=f, content_class=Page, + context=self.context, + preread_signal=signals.page_generator_preread, + preread_sender=self, + context_signal=signals.page_generator_context, + context_sender=self) + except Exception as e: + logger.warning('Could not process {}\n{}'.format(f, e)) + continue - if not is_valid_content(page, f): - continue + if not is_valid_content(page, f): + continue + + self.cache_data(f, page) self.add_source_path(page) @@ -560,6 +573,7 @@ class PagesGenerator(Generator): self._update_context(('pages', )) self.context['PAGES'] = self.pages + self.save_cache() signals.page_generator_finalized.send(self) def generate_output(self, writer): diff --git a/pelican/settings.py b/pelican/settings.py index 7277c121..baf2a497 100644 --- a/pelican/settings.py +++ b/pelican/settings.py @@ -119,7 +119,12 @@ DEFAULT_CONFIG = { 'IGNORE_FILES': ['.#*'], 'SLUG_SUBSTITUTIONS': (), 'INTRASITE_LINK_REGEX': '[{|](?P.*?)[|}]', - 'SLUGIFY_SOURCE': 'title' + 'SLUGIFY_SOURCE': 'title', + 'CACHE_CONTENT': True, + 'CACHE_DIRECTORY': 'cache', + 'GZIP_CACHE': True, + 'CHECK_MODIFIED_METHOD': 'mtime', + 'LOAD_CONTENT_CACHE': True, } PYGMENTS_RST_OPTIONS = None diff --git a/pelican/tests/test_generators.py b/pelican/tests/test_generators.py index 6f13aeb6..a500f87a 100644 --- a/pelican/tests/test_generators.py +++ b/pelican/tests/test_generators.py @@ -42,6 +42,7 @@ class TestArticlesGenerator(unittest.TestCase): settings['DEFAULT_CATEGORY'] = 'Default' settings['DEFAULT_DATE'] = (1970, 1, 1) settings['READERS'] = {'asc': None} + settings['CACHE_CONTENT'] = False # cache not needed for this logic tests cls.generator = ArticlesGenerator( context=settings.copy(), settings=settings, @@ -50,8 +51,15 @@ class TestArticlesGenerator(unittest.TestCase): cls.articles = [[page.title, page.status, page.category.name, page.template] for page in cls.generator.articles] + def setUp(self): + self.temp_cache = mkdtemp(prefix='pelican_cache.') + + def tearDown(self): + rmtree(self.temp_cache) + def test_generate_feeds(self): settings = get_settings() + settings['CACHE_DIRECTORY'] = self.temp_cache generator = ArticlesGenerator( context=settings, settings=settings, path=None, theme=settings['THEME'], output_path=None) @@ -127,6 +135,7 @@ class TestArticlesGenerator(unittest.TestCase): settings['DEFAULT_CATEGORY'] = 'Default' settings['DEFAULT_DATE'] = (1970, 1, 1) settings['USE_FOLDER_AS_CATEGORY'] = False + settings['CACHE_DIRECTORY'] = self.temp_cache settings['READERS'] = {'asc': None} settings['filenames'] = {} generator = ArticlesGenerator( @@ -151,6 +160,7 @@ class TestArticlesGenerator(unittest.TestCase): def test_direct_templates_save_as_default(self): settings = get_settings(filenames={}) + settings['CACHE_DIRECTORY'] = self.temp_cache generator = ArticlesGenerator( context=settings, settings=settings, path=None, theme=settings['THEME'], output_path=None) @@ -165,6 +175,7 @@ class TestArticlesGenerator(unittest.TestCase): settings 
= get_settings()
         settings['DIRECT_TEMPLATES'] = ['archives']
         settings['ARCHIVES_SAVE_AS'] = 'archives/index.html'
+        settings['CACHE_DIRECTORY'] = self.temp_cache
         generator = ArticlesGenerator(
             context=settings, settings=settings, path=None,
             theme=settings['THEME'], output_path=None)
@@ -180,6 +191,7 @@
         settings = get_settings()
         settings['DIRECT_TEMPLATES'] = ['archives']
         settings['ARCHIVES_SAVE_AS'] = 'archives/index.html'
+        settings['CACHE_DIRECTORY'] = self.temp_cache
         generator = ArticlesGenerator(
             context=settings, settings=settings, path=None,
             theme=settings['THEME'], output_path=None)
@@ -206,6 +218,7 @@
 
         settings = get_settings(filenames={})
         settings['YEAR_ARCHIVE_SAVE_AS'] = 'posts/{date:%Y}/index.html'
+        settings['CACHE_DIRECTORY'] = self.temp_cache
         generator = ArticlesGenerator(
             context=settings, settings=settings, path=CONTENT_DIR,
             theme=settings['THEME'], output_path=None)
@@ -268,6 +281,25 @@
         authors_expected = ['alexis-metaireau', 'first-author',
                             'second-author']
         self.assertEqual(sorted(authors), sorted(authors_expected))
+
+    def test_content_caching(self):
+        """Test that the articles are read only once when caching"""
+        settings = get_settings(filenames={})
+        settings['CACHE_DIRECTORY'] = self.temp_cache
+        settings['READERS'] = {'asc': None}
+
+        generator = ArticlesGenerator(
+            context=settings.copy(), settings=settings,
+            path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
+        generator.generate_context()
+        self.assertTrue(hasattr(generator, '_cache'))
+
+        generator = ArticlesGenerator(
+            context=settings.copy(), settings=settings,
+            path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
+        generator.readers.read_file = MagicMock()
+        generator.generate_context()
+        self.assertEqual(generator.readers.read_file.call_count, 0)
+
 
 class TestPageGenerator(unittest.TestCase):
     # Note: Every time you want to test for a new field; Make sure the test
@@ -275,12 +307,19 @@
     # distill_pages Then update the assertEqual in test_generate_context
     # to match expected
 
+    def setUp(self):
+        self.temp_cache = mkdtemp(prefix='pelican_cache.')
+
+    def tearDown(self):
+        rmtree(self.temp_cache)
+
     def distill_pages(self, pages):
         return [[page.title, page.status, page.template] for page in pages]
 
     def test_generate_context(self):
         settings = get_settings(filenames={})
         settings['PAGE_DIR'] = 'TestPages' # relative to CUR_DIR
+        settings['CACHE_DIRECTORY'] = self.temp_cache
         settings['DEFAULT_DATE'] = (1970, 1, 1)
 
         generator = PagesGenerator(
@@ -306,6 +345,25 @@
         self.assertEqual(sorted(pages_expected), sorted(pages))
         self.assertEqual(sorted(hidden_pages_expected), sorted(hidden_pages))
 
+    def test_content_caching(self):
+        """Test that the pages are read only once when caching"""
+        settings = get_settings(filenames={})
+        settings['CACHE_DIRECTORY'] = self.temp_cache
+        settings['READERS'] = {'asc': None}
+
+        generator = PagesGenerator(
+            context=settings.copy(), settings=settings,
+            path=CUR_DIR, theme=settings['THEME'], output_path=None)
+        generator.generate_context()
+        self.assertTrue(hasattr(generator, '_cache'))
+
+        generator = PagesGenerator(
+            context=settings.copy(), settings=settings,
+            path=CUR_DIR, theme=settings['THEME'], output_path=None)
+        generator.readers.read_file = MagicMock()
+        generator.generate_context()
+        self.assertEqual(generator.readers.read_file.call_count, 0)
+
 
 class TestTemplatePagesGenerator(unittest.TestCase):
 
diff --git a/pelican/tests/test_pelican.py b/pelican/tests/test_pelican.py
index 2d4bbdfc..15876095 100644
--- a/pelican/tests/test_pelican.py
+++ b/pelican/tests/test_pelican.py
@@ -43,12 +43,14 @@ class TestPelican(LoggedTestCase):
     def setUp(self):
         super(TestPelican, self).setUp()
         self.temp_path = mkdtemp(prefix='pelicantests.')
+        self.temp_cache = mkdtemp(prefix='pelican_cache.')
         self.old_locale = locale.setlocale(locale.LC_ALL)
         self.maxDiff = None
         locale.setlocale(locale.LC_ALL, str('C'))
 
     def tearDown(self):
         rmtree(self.temp_path)
+        rmtree(self.temp_cache)
         locale.setlocale(locale.LC_ALL, self.old_locale)
         super(TestPelican, self).tearDown()
 
@@ -77,6 +79,7 @@ class TestPelican(LoggedTestCase):
         settings = read_settings(path=None, override={
             'PATH': INPUT_PATH,
             'OUTPUT_PATH': self.temp_path,
+            'CACHE_DIRECTORY': self.temp_cache,
             'LOCALE': locale.normalize('en_US'),
             })
         pelican = Pelican(settings=settings)
@@ -92,6 +95,7 @@ class TestPelican(LoggedTestCase):
         settings = read_settings(path=SAMPLE_CONFIG, override={
             'PATH': INPUT_PATH,
             'OUTPUT_PATH': self.temp_path,
+            'CACHE_DIRECTORY': self.temp_cache,
             'LOCALE': locale.normalize('en_US'),
             })
         pelican = Pelican(settings=settings)
@@ -103,6 +107,7 @@ class TestPelican(LoggedTestCase):
         settings = read_settings(path=SAMPLE_CONFIG, override={
             'PATH': INPUT_PATH,
             'OUTPUT_PATH': self.temp_path,
+            'CACHE_DIRECTORY': self.temp_cache,
             'THEME_STATIC_PATHS': [os.path.join(SAMPLES_PATH, 'very'),
                                    os.path.join(SAMPLES_PATH, 'kinda'),
                                    os.path.join(SAMPLES_PATH, 'theme_standard')]
@@ -123,6 +128,7 @@ class TestPelican(LoggedTestCase):
         settings = read_settings(path=SAMPLE_CONFIG, override={
             'PATH': INPUT_PATH,
             'OUTPUT_PATH': self.temp_path,
+            'CACHE_DIRECTORY': self.temp_cache,
             'THEME_STATIC_PATHS': [os.path.join(SAMPLES_PATH, 'theme_standard')]
             })
 
diff --git a/pelican/utils.py b/pelican/utils.py
index c5aacaa3..8c416921 100644
--- a/pelican/utils.py
+++ b/pelican/utils.py
@@ -12,6 +12,8 @@
 import pytz
 import re
 import shutil
 import traceback
+import pickle
+import hashlib
 
 from collections import Hashable
 from contextlib import contextmanager
@@ -545,3 +547,114 @@ def split_all(path):
             break
         path = head
     return components
+
+
+class FileDataCacher(object):
+    '''Class that can cache data contained in files'''
+
+    def __init__(self, settings, cache_policy_key, load_policy_key):
+        '''Load the specified cache within CACHE_DIRECTORY
+
+        Load only if *load_policy_key* in *settings* is True;
+        may use gzip compression if GZIP_CACHE is True.
+        Sets the caching policy according to *cache_policy_key*
+        in *settings*.
+        '''
+        self.settings = settings
+        name = self.__class__.__name__
+        self._cache_path = os.path.join(self.settings['CACHE_DIRECTORY'], name)
+        self._cache_data_policy = self.settings[cache_policy_key]
+        if self.settings['GZIP_CACHE']:
+            import gzip
+            self._cache_open = gzip.open
+        else:
+            self._cache_open = open
+        if not self.settings[load_policy_key]:
+            # caching may still be enabled, so the cache file opener set
+            # above is needed by save_cache() even when nothing is loaded
+            self._cache = {}
+            return
+        try:
+            with self._cache_open(self._cache_path, 'rb') as f:
+                self._cache = pickle.load(f)
+        except Exception:
+            self._cache = {}
+
+    def cache_data(self, filename, data):
+        '''Cache data for the given file'''
+        if not self._cache_data_policy:
+            return
+        self._cache[filename] = data
+
+    def get_cached_data(self, filename, default={}):
+        '''Get cached data for the given file
+
+        If no data is cached, return the default object.
+        '''
+        return self._cache.get(filename, default)
+
+    def save_cache(self):
+        '''Save the updated cache'''
+        if not self._cache_data_policy:
+            return
+        try:
+            mkdir_p(self.settings['CACHE_DIRECTORY'])
+            with self._cache_open(self._cache_path, 'wb') as f:
+                pickle.dump(self._cache, f)
+        except Exception as e:
+            logger.warning('Could not save cache {}\n{}'.format(
+                self._cache_path, e))
+
+
+class FileStampDataCacher(FileDataCacher):
+    '''Subclass that also caches the stamp of the file'''
+
+    def __init__(self, settings, cache_policy_key, load_policy_key):
+        '''This subclass additionally sets up the file stamp function'''
+        super(FileStampDataCacher, self).__init__(settings, cache_policy_key,
+                                                  load_policy_key)
+
+        method = self.settings['CHECK_MODIFIED_METHOD']
+        if method == 'mtime':
+            self._filestamp_func = os.path.getmtime
+        else:
+            try:
+                hash_func = getattr(hashlib, method)
+
+                def filestamp_func(filename):
+                    '''Return the hash digest of the file contents'''
+                    with open(filename, 'rb') as f:
+                        return hash_func(f.read()).digest()
+
+                self._filestamp_func = filestamp_func
+            except AttributeError:
+                self._filestamp_func = None
+
+    def cache_data(self, filename, data):
+        '''Cache the stamp and data for the given file'''
+        stamp = self._get_file_stamp(filename)
+        super(FileStampDataCacher, self).cache_data(filename, (stamp, data))
+
+    def _get_file_stamp(self, filename):
+        '''Return the stamp of the given file
+
+        Depending on CHECK_MODIFIED_METHOD, this is the modification
+        time for 'mtime', a hash digest for the name of a function in
+        the hashlib module, or an empty bytes string otherwise (also
+        if the file cannot be read).
+        '''
+        # self.path is provided by the class this is mixed into
+        # (e.g. a Generator)
+        filename = os.path.join(self.path, filename)
+        try:
+            return self._filestamp_func(filename)
+        except Exception:
+            return b''
+
+    def get_cached_data(self, filename, default=None):
+        '''Get the cached data for the given filename
+        if the file has not been modified.
+
+        If no record exists or the file has been modified, return the
+        default. Modification is checked by comparing the cached and
+        the current file stamp.
+        '''
+        stamp, data = super(FileStampDataCacher, self).get_cached_data(
+            filename, (None, default))
+        if stamp != self._get_file_stamp(filename):
+            return default
+        return data
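A minimal usage sketch of the ``FileStampDataCacher`` mix-in added to
``pelican/utils.py``, mirroring the pattern the patch applies in
``pelican/generators.py``; the ``DummyGenerator`` class, its ``read()``
helper, and the example paths are illustrative assumptions rather than
part of the patch::

    import os
    from tempfile import mkdtemp

    from pelican.settings import DEFAULT_CONFIG
    from pelican.utils import FileStampDataCacher


    class DummyGenerator(FileStampDataCacher):
        """Hypothetical generator-like class using the caching mix-in."""

        def __init__(self, settings, path):
            # _get_file_stamp() resolves source filenames against self.path,
            # just as the real generators do
            self.path = path
            super(DummyGenerator, self).__init__(settings, 'CACHE_CONTENT',
                                                 'LOAD_CONTENT_CACHE')

        def read(self, filename):
            # try the cache first; fall back to reading the file on a miss
            # or when the file stamp shows it was modified
            data = self.get_cached_data(filename, None)
            if data is None:
                with open(os.path.join(self.path, filename)) as f:
                    data = f.read()  # stand-in for Readers.read_file()
                self.cache_data(filename, data)
            return data


    settings = dict(DEFAULT_CONFIG, CACHE_DIRECTORY=mkdtemp())
    gen = DummyGenerator(settings, path='content')
    first = gen.read('article.rst')   # read from disk, then cached
    gen.save_cache()                  # persist the cache for the next build
    second = gen.read('article.rst')  # served from the cache while unmodified

The cache is keyed on the source path and stores a (stamp, data) pair, so a
generator only falls back to its reader when the file's stamp no longer
matches the cached one.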