split content caching into two layers

This is a reworked and improved version of content caching.
Notable changes:
- by default only raw content and metadata returned by readers are
  cached, which should prevent conflicts with plugins; the speed benefit
  of content objects caching is not very big with a simple setup
- renamed --full-rebuild to --ignore-cache
- added more elaborate logging to caching code
This commit is contained in:
Ondrej Grover 2014-04-20 14:34:52 +02:00
commit c1324b0206
9 changed files with 200 additions and 81 deletions

View file

@ -262,8 +262,9 @@ def parse_arguments():
help='Relaunch pelican each time a modification occurs'
' on the content files.')
parser.add_argument('-f', '--full-rebuild', action='store_true',
dest='full_rebuild', help='Rebuild everything by not loading from cache')
parser.add_argument('-c', '--ignore-cache', action='store_true',
dest='ignore_cache', help='Ignore content cache '
'from previous runs by not loading cache files.')
parser.add_argument('-w', '--write-selected', type=str,
dest='selected_paths', default=None,
@ -284,7 +285,7 @@ def get_config(args):
config['THEME'] = abstheme if os.path.exists(abstheme) else args.theme
if args.delete_outputdir is not None:
config['DELETE_OUTPUT_DIRECTORY'] = args.delete_outputdir
if args.full_rebuild:
if args.ignore_cache:
config['LOAD_CONTENT_CACHE'] = False
if args.selected_paths:
config['WRITE_SELECTED'] = args.selected_paths.split(',')
@ -340,7 +341,10 @@ def main():
print(' --- AutoReload Mode: Monitoring `content`, `theme` and'
' `settings` for changes. ---')
first_run = True # load cache on first run
def _ignore_cache(pelican_obj):
    """Switch off cache loading on *pelican_obj* if autoreload requires it.

    When the ``AUTORELOAD_IGNORE_CACHE`` setting is truthy, the
    ``LOAD_CONTENT_CACHE`` setting of the given object is forced to
    ``False``; otherwise the settings are left untouched.
    """
    conf = pelican_obj.settings
    if not conf['AUTORELOAD_IGNORE_CACHE']:
        return
    conf['LOAD_CONTENT_CACHE'] = False
while True:
try:
# Check source dir for changed files ending with the given
@ -353,10 +357,9 @@ def main():
if modified['settings']:
pelican, settings = get_instance(args)
if not first_run:
original_load_cache = settings['LOAD_CONTENT_CACHE']
# invalidate cache
pelican.settings['LOAD_CONTENT_CACHE'] = False
original_load_cache = settings['LOAD_CONTENT_CACHE']
print(pelican.settings['AUTORELOAD_IGNORE_CACHE'])
_ignore_cache(pelican)
if any(modified.values()):
print('\n-> Modified: {}. re-generating...'.format(
@ -368,13 +371,8 @@ def main():
if modified['theme'] is None:
logger.warning('Empty theme folder. Using `basic` '
'theme.')
elif modified['theme']:
# theme modified, needs full rebuild -> no cache
if not first_run: # but not on first run
pelican.settings['LOAD_CONTENT_CACHE'] = False
pelican.run()
first_run = False
# restore original caching policy
pelican.settings['LOAD_CONTENT_CACHE'] = original_load_cache

View file

@ -28,10 +28,11 @@ from pelican import signals
logger = logging.getLogger(__name__)
class Generator(FileStampDataCacher):
class Generator(object):
"""Baseclass generator"""
def __init__(self, context, settings, path, theme, output_path, **kwargs):
def __init__(self, context, settings, path, theme, output_path,
readers_cache_name='', **kwargs):
self.context = context
self.settings = settings
self.path = path
@ -41,7 +42,7 @@ class Generator(FileStampDataCacher):
for arg, value in kwargs.items():
setattr(self, arg, value)
self.readers = Readers(self.settings)
self.readers = Readers(self.settings, readers_cache_name)
# templates cache
self._templates = {}
@ -74,10 +75,6 @@ class Generator(FileStampDataCacher):
custom_filters = self.settings['JINJA_FILTERS']
self.env.filters.update(custom_filters)
# set up caching
super(Generator, self).__init__(settings, 'CACHE_CONTENT',
'LOAD_CONTENT_CACHE')
signals.generator_init.send(self)
def get_template(self, name):
@ -153,6 +150,35 @@ class Generator(FileStampDataCacher):
self.context[item] = value
class CachingGenerator(Generator, FileStampDataCacher):
    '''Subclass of Generator and FileStampDataCacher classes

    enables content caching, either at the generator or reader level.
    Which level actually caches is decided by the CONTENT_CACHING_LAYER
    setting: this class only caches when it is set to 'generator'.
    '''

    def __init__(self, *args, **kwargs):
        '''Initialize the generator, then set up caching

        note the multiple inheritance structure: both parent initializers
        are called explicitly because they take different arguments.
        All positional and keyword arguments are forwarded to
        Generator.__init__.
        '''
        cls_name = self.__class__.__name__
        # The readers get their own cache name derived from the concrete
        # generator class name.
        Generator.__init__(self, *args,
                           readers_cache_name=(cls_name + '-Readers'),
                           **kwargs)

        # Generator-level caching is only active when explicitly selected;
        # otherwise both policies evaluate to False.
        cache_this_level = self.settings['CONTENT_CACHING_LAYER'] == 'generator'
        caching_policy = cache_this_level and self.settings['CACHE_CONTENT']
        load_policy = cache_this_level and self.settings['LOAD_CONTENT_CACHE']
        FileStampDataCacher.__init__(self, self.settings, cls_name,
                                     caching_policy, load_policy)

    def _get_file_stamp(self, filename):
        '''Get filestamp for path relative to generator.path'''
        filename = os.path.join(self.path, filename)
        # Start the MRO lookup after this class (not after Generator), so
        # the call keeps working if Generator ever gains such a method.
        return super(CachingGenerator, self)._get_file_stamp(filename)
class _FileLoader(BaseLoader):
def __init__(self, path, basedir):
@ -183,7 +209,7 @@ class TemplatePagesGenerator(Generator):
del self.env.loader.loaders[0]
class ArticlesGenerator(Generator):
class ArticlesGenerator(CachingGenerator):
"""Generate blog articles"""
def __init__(self, *args, **kwargs):
@ -537,6 +563,7 @@ class ArticlesGenerator(Generator):
self._update_context(('articles', 'dates', 'tags', 'categories',
'tag_cloud', 'authors', 'related_posts'))
self.save_cache()
self.readers.save_cache()
signals.article_generator_finalized.send(self)
def generate_output(self, writer):
@ -545,7 +572,7 @@ class ArticlesGenerator(Generator):
signals.article_writer_finalized.send(self, writer=writer)
class PagesGenerator(Generator):
class PagesGenerator(CachingGenerator):
"""Generate pages"""
def __init__(self, *args, **kwargs):
@ -599,6 +626,7 @@ class PagesGenerator(Generator):
self.context['PAGES'] = self.pages
self.save_cache()
self.readers.save_cache()
signals.page_generator_finalized.send(self)
def generate_output(self, writer):

View file

@ -33,7 +33,7 @@ except ImportError:
from pelican import signals
from pelican.contents import Page, Category, Tag, Author
from pelican.utils import get_date, pelican_open
from pelican.utils import get_date, pelican_open, FileStampDataCacher
METADATA_PROCESSORS = {
@ -382,7 +382,7 @@ class AsciiDocReader(BaseReader):
return content, metadata
class Readers(object):
class Readers(FileStampDataCacher):
"""Interface for all readers.
This class contains a mapping of file extensions / Reader classes, to know
@ -392,7 +392,7 @@ class Readers(object):
"""
def __init__(self, settings=None):
def __init__(self, settings=None, cache_name=''):
self.settings = settings or {}
self.readers = {}
self.reader_classes = {}
@ -417,6 +417,15 @@ class Readers(object):
self.readers[fmt] = reader_class(self.settings)
# set up caching
cache_this_level = (cache_name != '' and
self.settings['CONTENT_CACHING_LAYER'] == 'reader')
caching_policy = cache_this_level and self.settings['CACHE_CONTENT']
load_policy = cache_this_level and self.settings['LOAD_CONTENT_CACHE']
super(Readers, self).__init__(settings, cache_name,
caching_policy, load_policy,
)
@property
def extensions(self):
return self.readers.keys()
@ -455,7 +464,10 @@ class Readers(object):
source_path=source_path, settings=self.settings,
process=reader.process_metadata))
content, reader_metadata = reader.read(path)
content, reader_metadata = self.get_cached_data(path, (None, None))
if content is None:
content, reader_metadata = reader.read(path)
self.cache_data(path, (content, reader_metadata))
metadata.update(reader_metadata)
if content:

View file

@ -123,10 +123,12 @@ DEFAULT_CONFIG = {
'INTRASITE_LINK_REGEX': '[{|](?P<what>.*?)[|}]',
'SLUGIFY_SOURCE': 'title',
'CACHE_CONTENT': True,
'CONTENT_CACHING_LAYER': 'reader',
'CACHE_DIRECTORY': 'cache',
'GZIP_CACHE': True,
'CHECK_MODIFIED_METHOD': 'mtime',
'LOAD_CONTENT_CACHE': True,
'AUTORELOAD_IGNORE_CACHE': False,
'WRITE_SELECTED': [],
}
@ -266,6 +268,14 @@ def configure_settings(settings):
if not 'FEED_DOMAIN' in settings:
settings['FEED_DOMAIN'] = settings['SITEURL']
# check content caching layer and warn of incompatibilities
if (settings.get('CACHE_CONTENT', False) and
settings.get('CONTENT_CACHING_LAYER', '') == 'generator' and
settings.get('WITH_FUTURE_DATES', DEFAULT_CONFIG['WITH_FUTURE_DATES'])):
logger.warning('WITH_FUTURE_DATES conflicts with '
"CONTENT_CACHING_LAYER set to 'generator', "
"use 'reader' layer instead")
# Warn if feeds are generated with both SITEURL & FEED_DOMAIN undefined
feed_keys = [
'FEED_ATOM', 'FEED_RSS',

View file

@ -288,10 +288,11 @@ class TestArticlesGenerator(unittest.TestCase):
authors_expected = ['alexis-metaireau', 'first-author', 'second-author']
self.assertEqual(sorted(authors), sorted(authors_expected))
def test_content_caching(self):
"""Test that the articles are read only once when caching"""
def test_article_object_caching(self):
"""Test Article objects caching at the generator level"""
settings = get_settings(filenames={})
settings['CACHE_DIRECTORY'] = self.temp_cache
settings['CONTENT_CACHING_LAYER'] = 'generator'
settings['READERS'] = {'asc': None}
generator = ArticlesGenerator(
@ -307,10 +308,32 @@ class TestArticlesGenerator(unittest.TestCase):
generator.generate_context()
generator.readers.read_file.assert_called_count == 0
def test_full_rebuild(self):
def test_reader_content_caching(self):
    """Test raw content caching at the reader level"""
    settings = get_settings(filenames={})
    settings['CACHE_DIRECTORY'] = self.temp_cache
    settings['READERS'] = {'asc': None}

    # First pass: let the readers process the content and fill the cache.
    generator = ArticlesGenerator(
        context=settings.copy(), settings=settings,
        path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
    generator.generate_context()
    self.assertTrue(hasattr(generator.readers, '_cache'))

    # Second pass with a fresh generator: every reader's read() is replaced
    # by a mock so that any call that bypasses the cache is recorded.
    generator = ArticlesGenerator(
        context=settings.copy(), settings=settings,
        path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
    readers = generator.readers.readers
    for reader in readers.values():
        reader.read = MagicMock()
    generator.generate_context()
    for reader in readers.values():
        # A bare ``mock.assert_called_count == 0`` expression checks
        # nothing; assert on the recorded call count instead.
        self.assertEqual(reader.read.call_count, 0)
def test_ignore_cache(self):
"""Test that all the articles are read again when not loading cache
used in --full-rebuild or autoreload mode"""
used in --ignore-cache or autoreload mode"""
settings = get_settings(filenames={})
settings['CACHE_DIRECTORY'] = self.temp_cache
settings['READERS'] = {'asc': None}
@ -376,30 +399,52 @@ class TestPageGenerator(unittest.TestCase):
self.assertEqual(sorted(pages_expected), sorted(pages))
self.assertEqual(sorted(hidden_pages_expected), sorted(hidden_pages))
def test_content_caching(self):
"""Test that the pages are read only once when caching"""
def test_page_object_caching(self):
"""Test Page objects caching at the generator level"""
settings = get_settings(filenames={})
settings['CACHE_DIRECTORY'] = 'cache_dir' #TODO
settings['CACHE_DIRECTORY'] = self.temp_cache
settings['CONTENT_CACHING_LAYER'] = 'generator'
settings['READERS'] = {'asc': None}
generator = PagesGenerator(
context=settings.copy(), settings=settings,
path=CUR_DIR, theme=settings['THEME'], output_path=None)
path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
generator.generate_context()
self.assertTrue(hasattr(generator, '_cache'))
generator = PagesGenerator(
context=settings.copy(), settings=settings,
path=CUR_DIR, theme=settings['THEME'], output_path=None)
path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
generator.readers.read_file = MagicMock()
generator.generate_context()
generator.readers.read_file.assert_called_count == 0
def test_full_rebuild(self):
def test_reader_content_caching(self):
    """Test raw content caching at the reader level"""
    settings = get_settings(filenames={})
    settings['CACHE_DIRECTORY'] = self.temp_cache
    settings['READERS'] = {'asc': None}

    # First pass: let the readers process the content and fill the cache.
    generator = PagesGenerator(
        context=settings.copy(), settings=settings,
        path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
    generator.generate_context()
    self.assertTrue(hasattr(generator.readers, '_cache'))

    # Second pass with a fresh generator: every reader's read() is replaced
    # by a mock so that any call that bypasses the cache is recorded.
    generator = PagesGenerator(
        context=settings.copy(), settings=settings,
        path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
    readers = generator.readers.readers
    for reader in readers.values():
        reader.read = MagicMock()
    generator.generate_context()
    for reader in readers.values():
        # A bare ``mock.assert_called_count == 0`` expression checks
        # nothing; assert on the recorded call count instead.
        self.assertEqual(reader.read.call_count, 0)
def test_ignore_cache(self):
"""Test that all the pages are read again when not loading cache
used in --full-rebuild or autoreload mode"""
used in --ignore_cache or autoreload mode"""
settings = get_settings(filenames={})
settings['CACHE_DIRECTORY'] = self.temp_cache
settings['READERS'] = {'asc': None}

View file

@ -552,28 +552,30 @@ def split_all(path):
class FileDataCacher(object):
'''Class that can cache data contained in files'''
def __init__(self, settings, cache_policy_key, load_policy_key):
'''Load the specified cache within CACHE_DIRECTORY
def __init__(self, settings, cache_name, caching_policy, load_policy):
'''Load the specified cache within CACHE_DIRECTORY in settings
only if load_policy_key in setttings is True,
May use gzip if GZIP_CACHE.
Sets caching policy according to *cache_policy_key*
in *settings*
only if *load_policy* is True,
May use gzip if GZIP_CACHE ins settings is True.
Sets caching policy according to *caching_policy*.
'''
self.settings = settings
name = self.__class__.__name__
self._cache_path = os.path.join(self.settings['CACHE_DIRECTORY'], name)
self._cache_data_policy = self.settings[cache_policy_key]
self._cache_path = os.path.join(self.settings['CACHE_DIRECTORY'],
cache_name)
self._cache_data_policy = caching_policy
if self.settings['GZIP_CACHE']:
import gzip
self._cache_open = gzip.open
else:
self._cache_open = open
if self.settings[load_policy_key]:
if load_policy:
try:
with self._cache_open(self._cache_path, 'rb') as f:
self._cache = pickle.load(f)
except Exception as e:
with self._cache_open(self._cache_path, 'rb') as fhandle:
self._cache = pickle.load(fhandle)
except (IOError, OSError, pickle.UnpicklingError) as err:
logger.warning(('Cannot load cache {}, '
'proceeding with empty cache.\n{}').format(
self._cache_path, err))
self._cache = {}
else:
self._cache = {}
@ -583,7 +585,7 @@ class FileDataCacher(object):
if self._cache_data_policy:
self._cache[filename] = data
def get_cached_data(self, filename, default={}):
def get_cached_data(self, filename, default=None):
'''Get cached data for the given file
if no data is cached, return the default object
@ -595,20 +597,23 @@ class FileDataCacher(object):
if self._cache_data_policy:
try:
mkdir_p(self.settings['CACHE_DIRECTORY'])
with self._cache_open(self._cache_path, 'wb') as f:
pickle.dump(self._cache, f)
except Exception as e:
with self._cache_open(self._cache_path, 'wb') as fhandle:
pickle.dump(self._cache, fhandle)
except (IOError, OSError, pickle.PicklingError) as err:
logger.warning('Could not save cache {}\n{}'.format(
self._cache_path, e))
self._cache_path, err))
class FileStampDataCacher(FileDataCacher):
'''Subclass that also caches the stamp of the file'''
def __init__(self, settings, cache_policy_key, load_policy_key):
'''This sublcass additionaly sets filestamp function'''
super(FileStampDataCacher, self).__init__(settings, cache_policy_key,
load_policy_key)
def __init__(self, settings, cache_name, caching_policy, load_policy):
'''This sublcass additionaly sets filestamp function
and base path for filestamping operations
'''
super(FileStampDataCacher, self).__init__(settings, cache_name,
caching_policy,
load_policy)
method = self.settings['CHECK_MODIFIED_METHOD']
if method == 'mtime':
@ -616,10 +621,14 @@ class FileStampDataCacher(FileDataCacher):
else:
try:
hash_func = getattr(hashlib, method)
def filestamp_func(buf):
return hash_func(buf).digest()
def filestamp_func(filename):
    '''Hash the raw bytes of *filename* and return the binary digest.

    The hashing constructor is taken from the enclosing scope
    (``hash_func``); the file is opened in binary mode and read in full.
    '''
    with open(filename, 'rb') as source:
        raw = source.read()
    return hash_func(raw).digest()
self._filestamp_func = filestamp_func
except ImportError:
except AttributeError as err:
logger.warning('Could not get hashing function\n{}'.format(
err))
self._filestamp_func = None
def cache_data(self, filename, data):
@ -636,11 +645,11 @@ class FileStampDataCacher(FileDataCacher):
a hash for a function name in the hashlib module
or an empty bytes string otherwise
'''
filename = os.path.join(self.path, filename)
try:
with open(filename, 'rb') as f:
return self._filestamp_func(f.read())
except Exception:
return self._filestamp_func(filename)
except (IOError, OSError, TypeError) as err:
logger.warning('Cannot get modification stamp for {}\n{}'.format(
filename, err))
return b''
def get_cached_data(self, filename, default=None):
@ -648,7 +657,7 @@ class FileStampDataCacher(FileDataCacher):
if the file has not been modified.
If no record exists or file has been modified, return default.
Modification is checked by compaing the cached
Modification is checked by comparing the cached
and current file stamp.
'''
stamp, data = super(FileStampDataCacher, self).get_cached_data(