Cache content to speed up reading. Fixes #224.

Cache content once it has been read, so that it does not have to be read
again on the next run if its source has not been modified.
Ondrej Grover 2014-02-15 21:20:51 +01:00 committed by Justin Mayer
commit fd77926700
9 changed files with 336 additions and 34 deletions

View file

@@ -260,6 +260,10 @@ def parse_arguments():
action='store_true',
help='Relaunch pelican each time a modification occurs'
' on the content files.')
parser.add_argument('-f', '--full-rebuild', action='store_true',
dest='full_rebuild', help='Rebuild everything by not loading from cache')
return parser.parse_args()
@@ -275,6 +279,8 @@ def get_config(args):
config['THEME'] = abstheme if os.path.exists(abstheme) else args.theme
if args.delete_outputdir is not None:
config['DELETE_OUTPUT_DIRECTORY'] = args.delete_outputdir
if args.full_rebuild:
config['LOAD_CONTENT_CACHE'] = False
# argparse returns bytes in Py2. There is no definite answer as to which
# encoding argparse (or sys.argv) uses.
@@ -327,6 +333,7 @@ def main():
print(' --- AutoReload Mode: Monitoring `content`, `theme` and'
' `settings` for changes. ---')
first_run = True # load cache on first run
while True:
try:
# Check source dir for changed files ending with the given
@@ -335,9 +342,14 @@ def main():
# have changed, no matter what extension the filenames
# have.
modified = {k: next(v) for k, v in watchers.items()}
original_load_cache = settings['LOAD_CONTENT_CACHE']
if modified['settings']:
pelican, settings = get_instance(args)
if not first_run:
original_load_cache = settings['LOAD_CONTENT_CACHE']
# invalidate cache
pelican.settings['LOAD_CONTENT_CACHE'] = False
if any(modified.values()):
print('\n-> Modified: {}. re-generating...'.format(
@@ -349,8 +361,15 @@ def main():
if modified['theme'] is None:
logger.warning('Empty theme folder. Using `basic` '
'theme.')
elif modified['theme']:
# theme modified, needs full rebuild -> no cache
if not first_run: # but not on first run
pelican.settings['LOAD_CONTENT_CACHE'] = False
pelican.run()
first_run = False
# restore original caching policy
pelican.settings['LOAD_CONTENT_CACHE'] = original_load_cache
except KeyboardInterrupt:
logger.warning("Keyboard interrupt, quitting.")

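The loop above saves the caching policy on each pass, disables it for a single run when the settings or theme change after the first pass, and restores it once the run finishes. A minimal, self-contained sketch of that toggling; watch() and build() here are hypothetical stand-ins for Pelican's internals, not its API:

    import itertools

    def watch(ticks=itertools.count()):
        # hypothetical watcher: pretend only the theme changed on pass two
        return {'settings': False, 'theme': next(ticks) == 1}

    def build(settings):
        print('building, cache enabled:', settings['LOAD_CONTENT_CACHE'])

    settings = {'LOAD_CONTENT_CACHE': True}
    first_run = True
    for _ in range(3):  # stands in for the `while True` loop
        modified = watch()
        original_load_cache = settings['LOAD_CONTENT_CACHE']
        if modified['theme'] and not first_run:
            settings['LOAD_CONTENT_CACHE'] = False  # invalidate for this run
        build(settings)
        first_run = False
        settings['LOAD_CONTENT_CACHE'] = original_load_cache  # restore policy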
View file

@@ -325,6 +325,13 @@ class Content(object):
os.path.abspath(self.settings['PATH']))
)
def __eq__(self, other):
"""Compare with metadata and content of other Content object"""
return other and self.metadata == other.metadata and self.content == other.content
# keep basic hashing functionality for caching to work
__hash__ = object.__hash__
class Page(Content):
mandatory_properties = ('title',)

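Restoring __hash__ above is not cosmetic: in Python 3, defining __eq__ on a class sets __hash__ to None, making instances unhashable; as the comment notes, the caching machinery needs basic hashing to keep working. A small demonstration of the difference:

    class WithoutHash(object):
        def __eq__(self, other):
            return True

    class WithHash(object):
        def __eq__(self, other):
            return True
        # keep identity-based hashing, as Content does above
        __hash__ = object.__hash__

    try:
        hash(WithoutHash())  # raises TypeError under Python 3
    except TypeError as e:
        print('unhashable:', e)
    print('hashable again:', hash(WithHash()))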
View file

@@ -20,14 +20,15 @@ from jinja2 import (Environment, FileSystemLoader, PrefixLoader, ChoiceLoader,
from pelican.contents import Article, Draft, Page, Static, is_valid_content
from pelican.readers import Readers
from pelican.utils import copy, process_translations, mkdir_p, DateFormatter
from pelican.utils import (copy, process_translations, mkdir_p, DateFormatter,
FileStampDataCacher)
from pelican import signals
logger = logging.getLogger(__name__)
class Generator(object):
class Generator(FileStampDataCacher):
"""Baseclass generator"""
def __init__(self, context, settings, path, theme, output_path, **kwargs):
@@ -73,6 +74,10 @@ class Generator(object):
custom_filters = self.settings['JINJA_FILTERS']
self.env.filters.update(custom_filters)
# set up caching
super(Generator, self).__init__(settings, 'CACHE_CONTENT',
'LOAD_CONTENT_CACHE')
signals.generator_init.send(self)
def get_template(self, name):
@@ -408,20 +413,24 @@ class ArticlesGenerator(Generator):
for f in self.get_files(
self.settings['ARTICLE_DIR'],
exclude=self.settings['ARTICLE_EXCLUDES']):
try:
article = self.readers.read_file(
base_path=self.path, path=f, content_class=Article,
context=self.context,
preread_signal=signals.article_generator_preread,
preread_sender=self,
context_signal=signals.article_generator_context,
context_sender=self)
except Exception as e:
logger.warning('Could not process {}\n{}'.format(f, e))
continue
article = self.get_cached_data(f, None)
if article is None:
try:
article = self.readers.read_file(
base_path=self.path, path=f, content_class=Article,
context=self.context,
preread_signal=signals.article_generator_preread,
preread_sender=self,
context_signal=signals.article_generator_context,
context_sender=self)
except Exception as e:
logger.warning('Could not process {}\n{}'.format(f, e))
continue
if not is_valid_content(article, f):
continue
if not is_valid_content(article, f):
continue
self.cache_data(f, article)
self.add_source_path(article)
@@ -502,7 +511,7 @@ class ArticlesGenerator(Generator):
self._update_context(('articles', 'dates', 'tags', 'categories',
'tag_cloud', 'authors', 'related_posts'))
self.save_cache()
signals.article_generator_finalized.send(self)
def generate_output(self, writer):
@@ -527,20 +536,24 @@ class PagesGenerator(Generator):
for f in self.get_files(
self.settings['PAGE_DIR'],
exclude=self.settings['PAGE_EXCLUDES']):
try:
page = self.readers.read_file(
base_path=self.path, path=f, content_class=Page,
context=self.context,
preread_signal=signals.page_generator_preread,
preread_sender=self,
context_signal=signals.page_generator_context,
context_sender=self)
except Exception as e:
logger.warning('Could not process {}\n{}'.format(f, e))
continue
page = self.get_cached_data(f, None)
if page is None:
try:
page = self.readers.read_file(
base_path=self.path, path=f, content_class=Page,
context=self.context,
preread_signal=signals.page_generator_preread,
preread_sender=self,
context_signal=signals.page_generator_context,
context_sender=self)
except Exception as e:
logger.warning('Could not process {}\n{}'.format(f, e))
continue
if not is_valid_content(page, f):
continue
if not is_valid_content(page, f):
continue
self.cache_data(f, page)
self.add_source_path(page)
@@ -560,6 +573,7 @@ class PagesGenerator(Generator):
self._update_context(('pages', ))
self.context['PAGES'] = self.pages
self.save_cache()
signals.page_generator_finalized.send(self)
def generate_output(self, writer):

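Both generators now share the same read-through shape: consult the cache keyed by source path, fall back to the reader on a miss, validate, store the fresh result, and persist everything once via save_cache(). Stripped of Pelican specifics, the pattern looks roughly like this; read() and valid() are hypothetical stand-ins for Readers.read_file and is_valid_content:

    def read(f):
        # hypothetical reader; may raise for unreadable sources
        return f.upper()

    def valid(item):
        return bool(item)

    def generate(files, cache):
        for f in files:
            item = cache.get(f)          # get_cached_data(f, None)
            if item is None:
                try:
                    item = read(f)       # readers.read_file(...)
                except Exception as e:
                    print('Could not process {}\n{}'.format(f, e))
                    continue
                if not valid(item):      # is_valid_content(item, f)
                    continue
                cache[f] = item          # cache_data(f, item)
            yield item
        # a real run would call save_cache() after the loop

    cache = {}
    print(list(generate(['a.rst', 'b.md'], cache)))  # cold: reads both
    print(list(generate(['a.rst', 'b.md'], cache)))  # warm: no reads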
View file

@@ -119,7 +119,12 @@ DEFAULT_CONFIG = {
'IGNORE_FILES': ['.#*'],
'SLUG_SUBSTITUTIONS': (),
'INTRASITE_LINK_REGEX': '[{|](?P<what>.*?)[|}]',
'SLUGIFY_SOURCE': 'title'
'SLUGIFY_SOURCE': 'title',
'CACHE_CONTENT': True,
'CACHE_DIRECTORY': 'cache',
'GZIP_CACHE': True,
'CHECK_MODIFIED_METHOD': 'mtime',
'LOAD_CONTENT_CACHE': True,
}
PYGMENTS_RST_OPTIONS = None

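These defaults can be overridden per site; a pelicanconf.py fragment exercising only the keys introduced in this commit might look like this (values are illustrative, not recommendations):

    CACHE_CONTENT = True            # store content read during this run
    LOAD_CONTENT_CACHE = True       # reuse cached content on the next run
    CACHE_DIRECTORY = 'cache'       # where the pickled caches are written
    GZIP_CACHE = True               # gzip-compress the cache files
    CHECK_MODIFIED_METHOD = 'md5'   # 'mtime' or any hashlib algorithm name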
View file

@@ -42,6 +42,7 @@ class TestArticlesGenerator(unittest.TestCase):
settings['DEFAULT_CATEGORY'] = 'Default'
settings['DEFAULT_DATE'] = (1970, 1, 1)
settings['READERS'] = {'asc': None}
settings['CACHE_CONTENT'] = False  # cache not needed for these logic tests
cls.generator = ArticlesGenerator(
context=settings.copy(), settings=settings,
@@ -50,8 +51,15 @@ class TestArticlesGenerator(unittest.TestCase):
cls.articles = [[page.title, page.status, page.category.name,
page.template] for page in cls.generator.articles]
def setUp(self):
self.temp_cache = mkdtemp(prefix='pelican_cache.')
def tearDown(self):
rmtree(self.temp_cache)
def test_generate_feeds(self):
settings = get_settings()
settings['CACHE_DIRECTORY'] = self.temp_cache
generator = ArticlesGenerator(
context=settings, settings=settings,
path=None, theme=settings['THEME'], output_path=None)
@@ -127,6 +135,7 @@ class TestArticlesGenerator(unittest.TestCase):
settings['DEFAULT_CATEGORY'] = 'Default'
settings['DEFAULT_DATE'] = (1970, 1, 1)
settings['USE_FOLDER_AS_CATEGORY'] = False
settings['CACHE_DIRECTORY'] = self.temp_cache
settings['READERS'] = {'asc': None}
settings['filenames'] = {}
generator = ArticlesGenerator(
@@ -151,6 +160,7 @@ class TestArticlesGenerator(unittest.TestCase):
def test_direct_templates_save_as_default(self):
settings = get_settings(filenames={})
settings['CACHE_DIRECTORY'] = self.temp_cache
generator = ArticlesGenerator(
context=settings, settings=settings,
path=None, theme=settings['THEME'], output_path=None)
@@ -165,6 +175,7 @@ class TestArticlesGenerator(unittest.TestCase):
settings = get_settings()
settings['DIRECT_TEMPLATES'] = ['archives']
settings['ARCHIVES_SAVE_AS'] = 'archives/index.html'
settings['CACHE_DIRECTORY'] = self.temp_cache
generator = ArticlesGenerator(
context=settings, settings=settings,
path=None, theme=settings['THEME'], output_path=None)
@@ -180,6 +191,7 @@ class TestArticlesGenerator(unittest.TestCase):
settings = get_settings()
settings['DIRECT_TEMPLATES'] = ['archives']
settings['ARCHIVES_SAVE_AS'] = 'archives/index.html'
settings['CACHE_DIRECTORY'] = self.temp_cache
generator = ArticlesGenerator(
context=settings, settings=settings,
path=None, theme=settings['THEME'], output_path=None)
@@ -206,6 +218,7 @@ class TestArticlesGenerator(unittest.TestCase):
settings = get_settings(filenames={})
settings['YEAR_ARCHIVE_SAVE_AS'] = 'posts/{date:%Y}/index.html'
settings['CACHE_DIRECTORY'] = self.temp_cache
generator = ArticlesGenerator(
context=settings, settings=settings,
path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
@@ -268,6 +281,25 @@ class TestArticlesGenerator(unittest.TestCase):
authors_expected = ['alexis-metaireau', 'first-author', 'second-author']
self.assertEqual(sorted(authors), sorted(authors_expected))
def test_content_caching(self):
"""Test that the articles are read only once when caching"""
settings = get_settings(filenames={})
settings['CACHE_DIRECTORY'] = self.temp_cache
settings['READERS'] = {'asc': None}
generator = ArticlesGenerator(
context=settings.copy(), settings=settings,
path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
generator.generate_context()
self.assertTrue(hasattr(generator, '_cache'))
generator = ArticlesGenerator(
context=settings.copy(), settings=settings,
path=CONTENT_DIR, theme=settings['THEME'], output_path=None)
generator.readers.read_file = MagicMock()
generator.generate_context()
self.assertEqual(generator.readers.read_file.call_count, 0)
class TestPageGenerator(unittest.TestCase):
# Note: Every time you want to test for a new field; Make sure the test
@@ -275,12 +307,19 @@ class TestPageGenerator(unittest.TestCase):
# distill_pages Then update the assertEqual in test_generate_context
# to match expected
def setUp(self):
self.temp_cache = mkdtemp(prefix='pelican_cache.')
def tearDown(self):
rmtree(self.temp_cache)
def distill_pages(self, pages):
return [[page.title, page.status, page.template] for page in pages]
def test_generate_context(self):
settings = get_settings(filenames={})
settings['PAGE_DIR'] = 'TestPages' # relative to CUR_DIR
settings['CACHE_DIRECTORY'] = self.temp_cache
settings['DEFAULT_DATE'] = (1970, 1, 1)
generator = PagesGenerator(
@@ -306,6 +345,26 @@ class TestPageGenerator(unittest.TestCase):
self.assertEqual(sorted(pages_expected), sorted(pages))
self.assertEqual(sorted(hidden_pages_expected), sorted(hidden_pages))
def test_content_caching(self):
"""Test that the pages are read only once when caching"""
settings = get_settings(filenames={})
settings['CACHE_DIRECTORY'] = self.temp_cache
settings['READERS'] = {'asc': None}
generator = PagesGenerator(
context=settings.copy(), settings=settings,
path=CUR_DIR, theme=settings['THEME'], output_path=None)
generator.generate_context()
self.assertTrue(hasattr(generator, '_cache'))
generator = PagesGenerator(
context=settings.copy(), settings=settings,
path=CUR_DIR, theme=settings['THEME'], output_path=None)
generator.readers.read_file = MagicMock()
generator.generate_context()
self.assertEqual(generator.readers.read_file.call_count, 0)
class TestTemplatePagesGenerator(unittest.TestCase):

View file

@@ -43,12 +43,14 @@ class TestPelican(LoggedTestCase):
def setUp(self):
super(TestPelican, self).setUp()
self.temp_path = mkdtemp(prefix='pelicantests.')
self.temp_cache = mkdtemp(prefix='pelican_cache.')
self.old_locale = locale.setlocale(locale.LC_ALL)
self.maxDiff = None
locale.setlocale(locale.LC_ALL, str('C'))
def tearDown(self):
rmtree(self.temp_path)
rmtree(self.temp_cache)
locale.setlocale(locale.LC_ALL, self.old_locale)
super(TestPelican, self).tearDown()
@@ -77,6 +79,7 @@ class TestPelican(LoggedTestCase):
settings = read_settings(path=None, override={
'PATH': INPUT_PATH,
'OUTPUT_PATH': self.temp_path,
'CACHE_DIRECTORY': self.temp_cache,
'LOCALE': locale.normalize('en_US'),
})
pelican = Pelican(settings=settings)
@@ -92,6 +95,7 @@ class TestPelican(LoggedTestCase):
settings = read_settings(path=SAMPLE_CONFIG, override={
'PATH': INPUT_PATH,
'OUTPUT_PATH': self.temp_path,
'CACHE_DIRECTORY': self.temp_cache,
'LOCALE': locale.normalize('en_US'),
})
pelican = Pelican(settings=settings)
@@ -103,6 +107,7 @@ class TestPelican(LoggedTestCase):
settings = read_settings(path=SAMPLE_CONFIG, override={
'PATH': INPUT_PATH,
'OUTPUT_PATH': self.temp_path,
'CACHE_DIRECTORY': self.temp_cache,
'THEME_STATIC_PATHS': [os.path.join(SAMPLES_PATH, 'very'),
os.path.join(SAMPLES_PATH, 'kinda'),
os.path.join(SAMPLES_PATH, 'theme_standard')]
@@ -123,6 +128,7 @@ class TestPelican(LoggedTestCase):
settings = read_settings(path=SAMPLE_CONFIG, override={
'PATH': INPUT_PATH,
'OUTPUT_PATH': self.temp_path,
'CACHE_DIRECTORY': self.temp_cache,
'THEME_STATIC_PATHS': [os.path.join(SAMPLES_PATH, 'theme_standard')]
})

View file

@@ -12,6 +12,8 @@ import pytz
import re
import shutil
import traceback
import pickle
import hashlib
from collections import Hashable
from contextlib import contextmanager
@@ -545,3 +547,114 @@ def split_all(path):
break
path = head
return components
class FileDataCacher(object):
'''Class that can cache data associated with source files'''
def __init__(self, settings, cache_policy_key, load_policy_key):
'''Load the cache stored within CACHE_DIRECTORY,
but only if *load_policy_key* in *settings* is True.
May use gzip compression if GZIP_CACHE is enabled.
Sets the caching policy according to *cache_policy_key*
in *settings*.
'''
self.settings = settings
name = self.__class__.__name__
self._cache_path = os.path.join(self.settings['CACHE_DIRECTORY'], name)
self._cache_data_policy = self.settings[cache_policy_key]
# set the opener before any early return so save_cache() can use it
if self.settings['GZIP_CACHE']:
    import gzip
    self._cache_open = gzip.open
else:
    self._cache_open = open
if not self.settings[load_policy_key]:
    self._cache = {}
    return
try:
    with self._cache_open(self._cache_path, 'rb') as f:
        self._cache = pickle.load(f)
except Exception:
    self._cache = {}
def cache_data(self, filename, data):
'''Cache data for given file'''
if not self._cache_data_policy:
return
self._cache[filename] = data
def get_cached_data(self, filename, default=None):
    '''Get the cached data for the given file.
    If no data is cached, return the default object.
    '''
    return self._cache.get(filename, default)
def save_cache(self):
'''Save the updated cache'''
if not self._cache_data_policy:
return
try:
mkdir_p(self.settings['CACHE_DIRECTORY'])
with self._cache_open(self._cache_path, 'wb') as f:
pickle.dump(self._cache, f)
except Exception as e:
logger.warning('Could not save cache {}\n{}'.format(
self._cache_path, e))
class FileStampDataCacher(FileDataCacher):
'''Subclass that also caches a stamp of each file'''
def __init__(self, settings, cache_policy_key, load_policy_key):
    '''This subclass additionally sets the file stamp function'''
super(FileStampDataCacher, self).__init__(settings, cache_policy_key,
load_policy_key)
method = self.settings['CHECK_MODIFIED_METHOD']
if method == 'mtime':
self._filestamp_func = os.path.getmtime
else:
try:
    hash_func = getattr(hashlib, method)
    def filestamp_func(filename):
        # hash the file contents to detect modification
        with open(filename, 'rb') as f:
            return hash_func(f.read()).digest()
    self._filestamp_func = filestamp_func
except AttributeError:
    # getattr raises AttributeError for unknown hashlib names
    self._filestamp_func = None
def cache_data(self, filename, data):
'''Cache stamp and data for the given file'''
stamp = self._get_file_stamp(filename)
super(FileStampDataCacher, self).cache_data(filename, (stamp, data))
def _get_file_stamp(self, filename):
    '''Return a stamp for the given file.
    Depending on CHECK_MODIFIED_METHOD,
    this is a float for 'mtime',
    a hash digest for a function name in the hashlib module,
    or an empty bytes string if the stamp could not be determined.
    '''
    # subclasses are expected to provide self.path (Generator does)
    filename = os.path.join(self.path, filename)
    try:
        return self._filestamp_func(filename)
    except Exception:
        return b''
def get_cached_data(self, filename, default=None):
'''Get the cached data for the given filename
if the file has not been modified.
If no record exists or the file has been modified,
return the default.
Modification is detected by comparing the cached
and current file stamps.
'''
stamp, data = super(FileStampDataCacher, self).get_cached_data(
filename, (None, default))
if stamp != self._get_file_stamp(filename):
return default
return data
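A minimal sketch of how a consumer is expected to drive this class; ReaderCache is hypothetical, and note that _get_file_stamp assumes the subclass provides a path attribute, as Generator does:

    class ReaderCache(FileStampDataCacher):
        def __init__(self, settings):
            # _get_file_stamp joins filenames against self.path
            self.path = settings['PATH']
            super(ReaderCache, self).__init__(
                settings, 'CACHE_CONTENT', 'LOAD_CONTENT_CACHE')

    settings = {
        'PATH': 'content',
        'CACHE_DIRECTORY': 'cache',
        'CACHE_CONTENT': True,
        'LOAD_CONTENT_CACHE': True,
        'GZIP_CACHE': True,
        'CHECK_MODIFIED_METHOD': 'mtime',
    }
    cache = ReaderCache(settings)
    data = cache.get_cached_data('article.rst')  # None on a miss or stale stamp
    if data is None:
        data = 'freshly read content'
        cache.cache_data('article.rst', data)    # stores (stamp, data)
    cache.save_cache()                           # pickled to cache/ReaderCache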