From c1324b0206a70b8179689d2305c8de678d5e7b1d Mon Sep 17 00:00:00 2001
From: Ondrej Grover
Date: Sun, 20 Apr 2014 14:34:52 +0200
Subject: [PATCH] split content caching into two layers

This is a reworked and improved version of content caching.
Notable changes:

- by default, only the raw content and metadata returned by readers
  are cached, which should prevent conflicts with plugins; the speed
  benefit of caching full content objects is small with a simple setup
- renamed --full-rebuild to --ignore-cache
- added more elaborate logging to caching code
---
 README.rst                       |  1 +
 docs/index.rst                   |  1 +
 docs/settings.rst                | 41 +++++++++++++------
 pelican/__init__.py              | 24 +++++------
 pelican/generators.py            | 46 ++++++++++++++++-----
 pelican/readers.py               | 20 +++++++--
 pelican/settings.py              | 10 +++++
 pelican/tests/test_generators.py | 67 ++++++++++++++++++++++++++-----
 pelican/utils.py                 | 69 ++++++++++++++++++--------------
 9 files changed, 199 insertions(+), 80 deletions(-)

diff --git a/README.rst b/README.rst
index 20c3f217..bf506c5f 100644
--- a/README.rst
+++ b/README.rst
@@ -29,6 +29,7 @@ Pelican currently supports:
 * Code syntax highlighting
 * Import from WordPress, Dotclear, or RSS feeds
 * Integration with external tools: Twitter, Google Analytics, etc. (optional)
+* Fast rebuild times thanks to content caching and selective output writing.

 Have a look at the `Pelican documentation`_ for more information.

diff --git a/docs/index.rst b/docs/index.rst
index 43193e9e..c2deb6de 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -33,6 +33,7 @@ Pelican |version| currently supports:
 * Code syntax highlighting
 * Import from WordPress, Dotclear, or RSS feeds
 * Integration with external tools: Twitter, Google Analytics, etc. (optional)
+* Fast rebuild times thanks to content caching and selective output writing.

 Why the name "Pelican"?
 -----------------------
diff --git a/docs/settings.rst b/docs/settings.rst
index 0de811ec..1b4bae94 100644
--- a/docs/settings.rst
+++ b/docs/settings.rst
@@ -161,6 +161,7 @@ Setting name (default value)
 `_
 `WITH_FUTURE_DATES` (``True``)                       If disabled, content with dates in the future will get a
                                                      default status of ``draft``.
+                                                     See :ref:`reading_only_modified_content` for details.
 `INTRASITE_LINK_REGEX` (``'[{|](?P<path>.*?)[|}]'``) Regular expression that is used to parse internal links.
                                                      Default syntax of links to internal files, tags, etc., is to
                                                      enclose the identifier, say ``filename``, in ``{}`` or ``||``.
@@ -173,12 +174,16 @@ Setting name (default value)
 `SLUGIFY_SOURCE` (``'input'``)                       Specifies where you want the slug to be automatically generated from.
                                                      Can be set to 'title' to use the 'Title:' metadata tag or 'basename'
                                                      to use the articles basename when creating the slug.
-`CACHE_CONTENT` (``True``)                           If ``True``, save read content in a cache file.
+`CACHE_CONTENT` (``True``)                           If ``True``, save content in a cache file.
+                                                     See :ref:`reading_only_modified_content` for details about caching.
+`CONTENT_CACHING_LAYER` (``'reader'``)               If set to ``'reader'``, save only the raw content and metadata returned
+                                                     by readers; if set to ``'generator'``, save processed content objects.
 `CACHE_DIRECTORY` (``cache``)                        Directory in which to store cache files.
+`GZIP_CACHE` (``True``)                              If ``True``, use gzip to (de)compress the cache files.
 `CHECK_MODIFIED_METHOD` (``mtime``)                  Controls how files are checked for modifications.
 `LOAD_CONTENT_CACHE` (``True``)                      If ``True``, load unmodified content from cache.
-`GZIP_CACHE` (``True``)                              If ``True``, use gzip to (de)compress the cache files.
+`AUTORELOAD_IGNORE_CACHE` (``False``)                If ``True``, do not load content cache in autoreload mode
+                                                     when the settings file changes.
 `WRITE_SELECTED` (``[]``)                            If this list is not empty, **only** output files with their paths in
                                                      this list are written. Paths should be either relative to the current
                                                      working directory of Pelican or absolute. For possible use cases see
@@ -749,13 +754,21 @@ When Pelican is about to read some content source file:
    file cannot be found in the cache file, the content is read as
    usual.

-3. If the file is considered unchanged, the content object saved in a
+3. If the file is considered unchanged, the content data saved in a
    previous build corresponding to the file is loaded from the cache
    and the file is not read.

 4. If the file is considered changed, the file is read and the new
-   modification information and the content object are saved to the
+   modification information and the content data are saved to the
    cache if `CACHE_CONTENT` is ``True``.

+Depending on `CONTENT_CACHING_LAYER`, either the raw content and
+metadata returned by a reader are cached (``'reader'``) or the
+processed content object is cached (``'generator'``).
+Caching the processed content object may conflict with plugins (as
+some reading-related signals may be skipped) or e.g. with the
+`WITH_FUTURE_DATES` functionality (as the ``draft`` status of the
+cached content objects would not change automatically over time).
+
 Modification time based checking is faster than comparing file
 hashes, but is not as reliable, because mtime information can be
 lost when e.g. copying the content sources using the ``cp`` or ``rsync``
@@ -764,16 +777,18 @@ commands without the mtime preservation mode (invoked e.g. by

 The cache files are Python pickles, so they may not be readable by
 different versions of Python as the pickle format often changes. If
-such an error is encountered, the cache files have to be rebuilt
-using the pelican command-line option ``--full-rebuild``.
-The cache files also have to be rebuilt when changing the
-`GZIP_CACHE` setting for cache file reading to work.
+such an error is encountered, the cache files have to be rebuilt by
+running pelican after removing them or by using the pelican
+command-line option ``--ignore-cache``. The cache files also have to
+be rebuilt when changing the `GZIP_CACHE` setting for cache file
+reading to work.

-The ``--full-rebuild`` command-line option is also useful when the
-whole site needs to be regenerated due to e.g. modifications to the
-settings file or theme files. When pelican runs in autorealod mode,
-modification of the settings file or theme will trigger a full rebuild
-automatically.
+The ``--ignore-cache`` command-line option is also useful when the
+whole cache needs to be regenerated due to e.g. modifications to the
+settings file that affect the cached content, or simply for
+debugging purposes. When pelican runs in autoreload mode, modification
+of the settings file will make it ignore the cache automatically if
+`AUTORELOAD_IGNORE_CACHE` is ``True``.

 Note that even when using cached content, all output is always
 written, so the modification times of the ``*.html`` files always
 change.
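For reference, all the caching-related settings documented above are set
from the Pelican settings file. A minimal sketch (the values shown are
illustrative, not recommendations)::

    # pelicanconf.py -- content caching knobs (sketch)
    CACHE_CONTENT = True              # save content to a cache file between runs
    CONTENT_CACHING_LAYER = 'reader'  # 'reader' caches raw content/metadata,
                                      # 'generator' caches processed content objects
    CACHE_DIRECTORY = 'cache'         # where the pickled cache files are stored
    GZIP_CACHE = True                 # (de)compress cache files with gzip
    CHECK_MODIFIED_METHOD = 'mtime'   # or a hashlib function name such as 'md5'
    LOAD_CONTENT_CACHE = True         # set False (or pass --ignore-cache) to re-read everything
    AUTORELOAD_IGNORE_CACHE = False   # ignore the cache on settings changes in autoreload mode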
diff --git a/pelican/__init__.py b/pelican/__init__.py
index 077859bb..8cae468c 100644
--- a/pelican/__init__.py
+++ b/pelican/__init__.py
@@ -262,8 +262,9 @@ def parse_arguments():
                         help='Relaunch pelican each time a modification occurs'
                         ' on the content files.')

-    parser.add_argument('-f', '--full-rebuild', action='store_true',
-        dest='full_rebuild', help='Rebuild everything by not loading from cache')
+    parser.add_argument('-c', '--ignore-cache', action='store_true',
+        dest='ignore_cache', help='Ignore content cache '
+        'from previous runs by not loading cache files.')

     parser.add_argument('-w', '--write-selected', type=str,
         dest='selected_paths', default=None,
@@ -284,7 +285,7 @@ def get_config(args):
         config['THEME'] = abstheme if os.path.exists(abstheme) else args.theme
     if args.delete_outputdir is not None:
         config['DELETE_OUTPUT_DIRECTORY'] = args.delete_outputdir
-    if args.full_rebuild:
+    if args.ignore_cache:
         config['LOAD_CONTENT_CACHE'] = False
     if args.selected_paths:
         config['WRITE_SELECTED'] = args.selected_paths.split(',')
@@ -340,7 +341,10 @@ def main():
             print(' --- AutoReload Mode: Monitoring `content`, `theme` and'
                   ' `settings` for changes. ---')

-            first_run = True  # load cache on first run
+            def _ignore_cache(pelican_obj):
+                if pelican_obj.settings['AUTORELOAD_IGNORE_CACHE']:
+                    pelican_obj.settings['LOAD_CONTENT_CACHE'] = False
+
             while True:
                 try:
                     # Check source dir for changed files ending with the given
@@ -353,10 +357,9 @@ def main():
                     if modified['settings']:
                         pelican, settings = get_instance(args)

-                    if not first_run:
-                        original_load_cache = settings['LOAD_CONTENT_CACHE']
-                        # invalidate cache
-                        pelican.settings['LOAD_CONTENT_CACHE'] = False
+                    original_load_cache = settings['LOAD_CONTENT_CACHE']
+                    _ignore_cache(pelican)

                     if any(modified.values()):
                         print('\n-> Modified: {}. re-generating...'.format(
@@ -368,13 +371,8 @@ def main():
                         if modified['theme'] is None:
                             logger.warning('Empty theme folder. Using `basic` '
                                            'theme.')
-                        elif modified['theme']:
-                            # theme modified, needs full rebuild -> no cache
-                            if not first_run:  # but not on first run
-                                pelican.settings['LOAD_CONTENT_CACHE'] = False

                         pelican.run()
-                        first_run = False

                         # restore original caching policy
                         pelican.settings['LOAD_CONTENT_CACHE'] = original_load_cache
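The autoreload branch above reduces to a small pattern: remember the
configured policy, optionally skip cache loading for the regeneration
triggered by a settings-file change, regenerate, then restore the
policy. A standalone sketch of one loop iteration (simplified;
``regenerate`` stands in for ``pelican.run``)::

    def autoreload_step(settings, settings_changed, regenerate):
        '''One iteration of the autoreload loop (simplified sketch).'''
        original_load_cache = settings['LOAD_CONTENT_CACHE']
        if settings_changed and settings['AUTORELOAD_IGNORE_CACHE']:
            # the new settings may invalidate previously cached content,
            # so regenerate without loading the cache this time
            settings['LOAD_CONTENT_CACHE'] = False
        regenerate()
        # restore the original caching policy for the next iteration
        settings['LOAD_CONTENT_CACHE'] = original_load_cache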
diff --git a/pelican/generators.py b/pelican/generators.py
index a2d7320a..3cc84fa8 100644
--- a/pelican/generators.py
+++ b/pelican/generators.py
@@ -28,10 +28,11 @@ from pelican import signals
 logger = logging.getLogger(__name__)


-class Generator(FileStampDataCacher):
+class Generator(object):
     """Baseclass generator"""

-    def __init__(self, context, settings, path, theme, output_path, **kwargs):
+    def __init__(self, context, settings, path, theme, output_path,
+                 readers_cache_name='', **kwargs):
         self.context = context
         self.settings = settings
         self.path = path
@@ -41,7 +42,7 @@ class Generator(FileStampDataCacher):
         for arg, value in kwargs.items():
             setattr(self, arg, value)

-        self.readers = Readers(self.settings)
+        self.readers = Readers(self.settings, readers_cache_name)

         # templates cache
         self._templates = {}
@@ -74,10 +75,6 @@ class Generator(FileStampDataCacher):
             custom_filters = self.settings['JINJA_FILTERS']
             self.env.filters.update(custom_filters)

-        # set up caching
-        super(Generator, self).__init__(settings, 'CACHE_CONTENT',
-                                        'LOAD_CONTENT_CACHE')
-
         signals.generator_init.send(self)

     def get_template(self, name):
@@ -153,6 +150,35 @@ class Generator(FileStampDataCacher):
             self.context[item] = value


+class CachingGenerator(Generator, FileStampDataCacher):
+    '''Subclass of the Generator and FileStampDataCacher classes
+
+    Enables content caching, either at the generator or reader level.
+    '''
+
+    def __init__(self, *args, **kwargs):
+        '''Initialize the generator, then set up caching
+
+        note the multiple inheritance structure
+        '''
+        cls_name = self.__class__.__name__
+        Generator.__init__(self, *args,
+                           readers_cache_name=(cls_name + '-Readers'),
+                           **kwargs)
+
+        cache_this_level = self.settings['CONTENT_CACHING_LAYER'] == 'generator'
+        caching_policy = cache_this_level and self.settings['CACHE_CONTENT']
+        load_policy = cache_this_level and self.settings['LOAD_CONTENT_CACHE']
+        FileStampDataCacher.__init__(self, self.settings, cls_name,
+                                     caching_policy, load_policy)
+
+    def _get_file_stamp(self, filename):
+        '''Get filestamp for path relative to generator.path'''
+        filename = os.path.join(self.path, filename)
+        return super(Generator, self)._get_file_stamp(filename)
+
+
 class _FileLoader(BaseLoader):

     def __init__(self, path, basedir):
@@ -183,7 +209,7 @@ class TemplatePagesGenerator(Generator):
         del self.env.loader.loaders[0]


-class ArticlesGenerator(Generator):
+class ArticlesGenerator(CachingGenerator):
     """Generate blog articles"""

     def __init__(self, *args, **kwargs):
@@ -537,6 +563,7 @@ class ArticlesGenerator(Generator):
         self._update_context(('articles', 'dates', 'tags', 'categories',
                               'tag_cloud', 'authors', 'related_posts'))
         self.save_cache()
+        self.readers.save_cache()
         signals.article_generator_finalized.send(self)

     def generate_output(self, writer):
@@ -545,7 +572,7 @@ class ArticlesGenerator(Generator):
         signals.article_writer_finalized.send(self, writer=writer)


-class PagesGenerator(Generator):
+class PagesGenerator(CachingGenerator):
     """Generate pages"""

     def __init__(self, *args, **kwargs):
@@ -599,6 +626,7 @@ class PagesGenerator(Generator):
         self.context['PAGES'] = self.pages

         self.save_cache()
+        self.readers.save_cache()
         signals.page_generator_finalized.send(self)

     def generate_output(self, writer):
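The one subtle line in ``CachingGenerator`` is the ``super(Generator,
self)`` call in ``_get_file_stamp``: the MRO of ``CachingGenerator`` is
``CachingGenerator -> Generator -> FileStampDataCacher -> FileDataCacher``,
so starting the attribute lookup *after* ``Generator`` dispatches to
``FileStampDataCacher._get_file_stamp`` with the path already joined to
the generator's content directory. A toy demonstration of that dispatch
(hypothetical classes, not Pelican code)::

    class Cacher(object):
        def stamp(self, name):
            return 'stamp:' + name

    class Generator(object):
        pass

    class CachingGenerator(Generator, Cacher):
        path = '/content'

        def stamp(self, name):
            # start the MRO lookup after Generator, so this reaches Cacher
            return super(Generator, self).stamp(self.path + '/' + name)

    print(CachingGenerator().stamp('post.rst'))  # stamp:/content/post.rst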
diff --git a/pelican/readers.py b/pelican/readers.py
index fa9d92ae..c63b8981 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -33,7 +33,7 @@ except ImportError:

 from pelican import signals
 from pelican.contents import Page, Category, Tag, Author
-from pelican.utils import get_date, pelican_open
+from pelican.utils import get_date, pelican_open, FileStampDataCacher


 METADATA_PROCESSORS = {
@@ -382,7 +382,7 @@ class AsciiDocReader(BaseReader):
         return content, metadata


-class Readers(object):
+class Readers(FileStampDataCacher):
     """Interface for all readers.

     This class contains a mapping of file extensions / Reader classes, to know
@@ -392,7 +392,7 @@ class Readers(object):

     """

-    def __init__(self, settings=None):
+    def __init__(self, settings=None, cache_name=''):
         self.settings = settings or {}
         self.readers = {}
         self.reader_classes = {}
@@ -417,6 +417,15 @@ class Readers(object):

             self.readers[fmt] = reader_class(self.settings)

+        # set up caching
+        cache_this_level = (cache_name != '' and
+                            self.settings['CONTENT_CACHING_LAYER'] == 'reader')
+        caching_policy = cache_this_level and self.settings['CACHE_CONTENT']
+        load_policy = cache_this_level and self.settings['LOAD_CONTENT_CACHE']
+        super(Readers, self).__init__(settings, cache_name,
+                                      caching_policy, load_policy)
+
     @property
     def extensions(self):
         return self.readers.keys()
@@ -455,7 +464,10 @@ class Readers(object):
             source_path=source_path, settings=self.settings,
             process=reader.process_metadata))

-        content, reader_metadata = reader.read(path)
+        content, reader_metadata = self.get_cached_data(path, (None, None))
+        if content is None:
+            content, reader_metadata = reader.read(path)
+            self.cache_data(path, (content, reader_metadata))
         metadata.update(reader_metadata)

         if content:
diff --git a/pelican/settings.py b/pelican/settings.py
index 1d0ada0c..abf16b32 100644
--- a/pelican/settings.py
+++ b/pelican/settings.py
@@ -123,10 +123,12 @@ DEFAULT_CONFIG = {
     'INTRASITE_LINK_REGEX': '[{|](?P<path>.*?)[|}]',
     'SLUGIFY_SOURCE': 'title',
     'CACHE_CONTENT': True,
+    'CONTENT_CACHING_LAYER': 'reader',
     'CACHE_DIRECTORY': 'cache',
     'GZIP_CACHE': True,
     'CHECK_MODIFIED_METHOD': 'mtime',
     'LOAD_CONTENT_CACHE': True,
+    'AUTORELOAD_IGNORE_CACHE': False,
     'WRITE_SELECTED': [],
 }

@@ -266,6 +268,14 @@ def configure_settings(settings):
     if not 'FEED_DOMAIN' in settings:
         settings['FEED_DOMAIN'] = settings['SITEURL']

+    # check content caching layer and warn of incompatibilities
+    if (settings.get('CACHE_CONTENT', False) and
+            settings.get('CONTENT_CACHING_LAYER', '') == 'generator' and
+            settings.get('WITH_FUTURE_DATES', DEFAULT_CONFIG['WITH_FUTURE_DATES'])):
+        logger.warning('WITH_FUTURE_DATES conflicts with '
+                       "CONTENT_CACHING_LAYER set to 'generator', "
+                       "use 'reader' layer instead")
+
     # Warn if feeds are generated with both SITEURL & FEED_DOMAIN undefined
     feed_keys = [
         'FEED_ATOM', 'FEED_RSS',
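The reader-level caching added to ``Readers.read_file`` is a plain
read-through cache: look the path up, fall back to the real reader on a
miss, then store the fresh result. The same pattern in isolation (a
sketch, with a dict standing in for ``FileStampDataCacher``, which
additionally validates a file stamp before trusting a hit)::

    def read_file_cached(cache, path, read):
        '''Return (content, metadata) for path, parsing only on a cache miss.'''
        content, metadata = cache.get(path, (None, None))
        if content is None:
            content, metadata = read(path)  # the expensive parse
            cache[path] = (content, metadata)
        return content, metadata

One caveat the sketch makes visible: ``None`` doubles as the miss
marker, so a file whose reader legitimately returns ``None`` content
would be re-read on every run.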
diff --git a/pelican/tests/test_generators.py b/pelican/tests/test_generators.py
index f951f0cb..9463047e 100644
--- a/pelican/tests/test_generators.py
+++ b/pelican/tests/test_generators.py
@@ -288,10 +288,11 @@ class TestArticlesGenerator(unittest.TestCase):
         authors_expected = ['alexis-metaireau', 'first-author', 'second-author']
         self.assertEqual(sorted(authors), sorted(authors_expected))

-    def test_content_caching(self):
-        """Test that the articles are read only once when caching"""
+    def test_article_object_caching(self):
+        """Test Article objects caching at the generator level"""
         settings = get_settings(filenames={})
         settings['CACHE_DIRECTORY'] = self.temp_cache
+        settings['CONTENT_CACHING_LAYER'] = 'generator'
         settings['READERS'] = {'asc': None}

         generator = ArticlesGenerator(
@@ -307,10 +308,32 @@ class TestArticlesGenerator(unittest.TestCase):
         generator.generate_context()
         generator.readers.read_file.assert_called_count == 0

-    def test_full_rebuild(self):
+    def test_reader_content_caching(self):
+        """Test raw content caching at the reader level"""
+        settings = get_settings(filenames={})
+        settings['CACHE_DIRECTORY'] = self.temp_cache
+        settings['READERS'] = {'asc': None}
+
+        generator = ArticlesGenerator(
+            context=settings.copy(), settings=settings,
+            path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
+        generator.generate_context()
+        self.assertTrue(hasattr(generator.readers, '_cache'))
+
+        generator = ArticlesGenerator(
+            context=settings.copy(), settings=settings,
+            path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
+        readers = generator.readers.readers
+        for reader in readers.values():
+            reader.read = MagicMock()
+        generator.generate_context()
+        for reader in readers.values():
+            self.assertEqual(reader.read.call_count, 0)
+
+    def test_ignore_cache(self):
         """Test that all the articles are read again when not loading cache
-        used in --full-rebuild or autoreload mode"""
+        used in --ignore-cache or autoreload mode"""
         settings = get_settings(filenames={})
         settings['CACHE_DIRECTORY'] = self.temp_cache
         settings['READERS'] = {'asc': None}
@@ -376,30 +399,52 @@ class TestPageGenerator(unittest.TestCase):
         self.assertEqual(sorted(pages_expected), sorted(pages))
         self.assertEqual(sorted(hidden_pages_expected), sorted(hidden_pages))

-    def test_content_caching(self):
-        """Test that the pages are read only once when caching"""
+    def test_page_object_caching(self):
+        """Test Page objects caching at the generator level"""
         settings = get_settings(filenames={})
-        settings['CACHE_DIRECTORY'] = 'cache_dir' #TODO
         settings['CACHE_DIRECTORY'] = self.temp_cache
+        settings['CONTENT_CACHING_LAYER'] = 'generator'
         settings['READERS'] = {'asc': None}

         generator = PagesGenerator(
             context=settings.copy(), settings=settings,
-            path=CUR_DIR, theme=settings['THEME'], output_path=None)
+            path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
         generator.generate_context()
         self.assertTrue(hasattr(generator, '_cache'))

         generator = PagesGenerator(
             context=settings.copy(), settings=settings,
-            path=CUR_DIR, theme=settings['THEME'], output_path=None)
+            path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
         generator.readers.read_file = MagicMock()
         generator.generate_context()
         generator.readers.read_file.assert_called_count == 0

-    def test_full_rebuild(self):
+    def test_reader_content_caching(self):
+        """Test raw content caching at the reader level"""
+        settings = get_settings(filenames={})
+        settings['CACHE_DIRECTORY'] = self.temp_cache
+        settings['READERS'] = {'asc': None}
+
+        generator = PagesGenerator(
+            context=settings.copy(), settings=settings,
+            path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
+        generator.generate_context()
+        self.assertTrue(hasattr(generator.readers, '_cache'))
+
+        generator = PagesGenerator(
+            context=settings.copy(), settings=settings,
+            path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
+        readers = generator.readers.readers
+        for reader in readers.values():
+            reader.read = MagicMock()
+        generator.generate_context()
+        for reader in readers.values():
+            self.assertEqual(reader.read.call_count, 0)
+
+    def test_ignore_cache(self):
         """Test that all the pages are read again when not loading cache
-        used in --full-rebuild or autoreload mode"""
+        used in --ignore-cache or autoreload mode"""
         settings = get_settings(filenames={})
         settings['CACHE_DIRECTORY'] = self.temp_cache
         settings['READERS'] = {'asc': None}
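A note on the assertion style in these tests: with the ``mock`` library
contemporary to this patch, ``some_mock.assert_called_count == 0`` was
a silent no-op, since accessing an unknown attribute on a ``MagicMock``
merely creates a child mock and the comparison result is discarded
(newer ``unittest.mock`` versions raise ``AttributeError`` for
attributes starting with ``assert``). Checking ``call_count``, as the
reader-level tests above now do, asserts something real::

    from mock import MagicMock  # unittest.mock in recent Python versions

    read = MagicMock()
    read('some/path')
    assert read.call_count == 1  # counts actual calls to the mock
    read.reset_mock()
    assert read.call_count == 0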
diff --git a/pelican/utils.py b/pelican/utils.py
index cda3108e..7b58a231 100644
--- a/pelican/utils.py
+++ b/pelican/utils.py
@@ -552,28 +552,30 @@ def split_all(path):
 class FileDataCacher(object):
     '''Class that can cache data contained in files'''

-    def __init__(self, settings, cache_policy_key, load_policy_key):
-        '''Load the specified cache within CACHE_DIRECTORY
+    def __init__(self, settings, cache_name, caching_policy, load_policy):
+        '''Load the specified cache within CACHE_DIRECTORY in settings

-        only if load_policy_key in setttings is True,
-        May use gzip if GZIP_CACHE.
-        Sets caching policy according to *cache_policy_key*
-        in *settings*
+        only if *load_policy* is True.
+        May use gzip if GZIP_CACHE in settings is True.
+        Sets caching policy according to *caching_policy*.
         '''
         self.settings = settings
-        name = self.__class__.__name__
-        self._cache_path = os.path.join(self.settings['CACHE_DIRECTORY'], name)
-        self._cache_data_policy = self.settings[cache_policy_key]
+        self._cache_path = os.path.join(self.settings['CACHE_DIRECTORY'],
+                                        cache_name)
+        self._cache_data_policy = caching_policy
         if self.settings['GZIP_CACHE']:
             import gzip
             self._cache_open = gzip.open
         else:
             self._cache_open = open
-        if self.settings[load_policy_key]:
+        if load_policy:
             try:
-                with self._cache_open(self._cache_path, 'rb') as f:
-                    self._cache = pickle.load(f)
-            except Exception as e:
+                with self._cache_open(self._cache_path, 'rb') as fhandle:
+                    self._cache = pickle.load(fhandle)
+            except (IOError, OSError, pickle.UnpicklingError) as err:
+                logger.warning(('Cannot load cache {}, '
+                                'proceeding with empty cache.\n{}').format(
+                                    self._cache_path, err))
                 self._cache = {}
         else:
             self._cache = {}
@@ -583,7 +585,7 @@ class FileDataCacher(object):
         if self._cache_data_policy:
             self._cache[filename] = data

-    def get_cached_data(self, filename, default={}):
+    def get_cached_data(self, filename, default=None):
         '''Get cached data for the given file

         if no data is cached, return the default object
@@ -595,20 +597,23 @@ class FileDataCacher(object):
         if self._cache_data_policy:
             try:
                 mkdir_p(self.settings['CACHE_DIRECTORY'])
-                with self._cache_open(self._cache_path, 'wb') as f:
-                    pickle.dump(self._cache, f)
-            except Exception as e:
+                with self._cache_open(self._cache_path, 'wb') as fhandle:
+                    pickle.dump(self._cache, fhandle)
+            except (IOError, OSError, pickle.PicklingError) as err:
                 logger.warning('Could not save cache {}\n{}'.format(
-                    self._cache_path, e))
+                    self._cache_path, err))


 class FileStampDataCacher(FileDataCacher):
     '''Subclass that also caches the stamp of the file'''

-    def __init__(self, settings, cache_policy_key, load_policy_key):
-        '''This sublcass additionaly sets filestamp function'''
-        super(FileStampDataCacher, self).__init__(settings, cache_policy_key,
-                                                  load_policy_key)
+    def __init__(self, settings, cache_name, caching_policy, load_policy):
+        '''This subclass additionally sets up the filestamp function
+        used to check files for modifications
+        '''
+        super(FileStampDataCacher, self).__init__(settings, cache_name,
+                                                  caching_policy,
+                                                  load_policy)

         method = self.settings['CHECK_MODIFIED_METHOD']

         if method == 'mtime':
@@ -616,10 +621,14 @@ class FileStampDataCacher(FileDataCacher):
         else:
             try:
                 hash_func = getattr(hashlib, method)
-                def 
filestamp_func(buf):
-                    return hash_func(buf).digest()
+                def filestamp_func(filename):
+                    '''return hash of file contents'''
+                    with open(filename, 'rb') as fhandle:
+                        return hash_func(fhandle.read()).digest()
                 self._filestamp_func = filestamp_func
-            except ImportError:
+            except AttributeError as err:
+                logger.warning('Could not get hashing function\n{}'.format(
+                    err))
                 self._filestamp_func = None

     def cache_data(self, filename, data):
@@ -636,11 +645,11 @@ class FileStampDataCacher(FileDataCacher):
         a hash for a function name in the hashlib module
         or an empty bytes string otherwise
         '''
-        filename = os.path.join(self.path, filename)
         try:
-            with open(filename, 'rb') as f:
-                return self._filestamp_func(f.read())
-        except Exception:
+            return self._filestamp_func(filename)
+        except (IOError, OSError, TypeError) as err:
+            logger.warning('Cannot get modification stamp for {}\n{}'.format(
+                filename, err))
             return b''

     def get_cached_data(self, filename, default=None):
@@ -648,7 +657,7 @@ class FileStampDataCacher(FileDataCacher):

         if the file has not been modified.
         If no record exists or file has been modified, return default.
-        Modification is checked by compaing the cached
+        Modification is checked by comparing the cached
         and current file stamp.
         '''
         stamp, data = super(FileStampDataCacher, self).get_cached_data(