mirror of
https://github.com/getpelican/pelican.git
synced 2026-06-05 05:46:55 +02:00
Merge 9ed0eb1c0c into 3c69dc68d2
This commit is contained in:
commit
32f47fb847
8 changed files with 226 additions and 17 deletions
4
RELEASE.md
Normal file
4
RELEASE.md
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
Release type: minor
|
||||
|
||||
Added HEADING_METADATA feature for extracting metadata from Markdown
|
||||
headings using configurable patterns and level mappings.
|
||||
|
|
@ -24,6 +24,7 @@ Release history
|
|||
4.11.0 - 2025-01-15
|
||||
===================
|
||||
|
||||
- Add HEADING_METADATA feature for Markdown title extraction, allowing metadata to be extracted from headings based on configurable patterns and level mappings
|
||||
- Add setting to selectively omit Typogrify filters `(#3439) <https://github.com/getpelican/pelican/pull/3439>`_
|
||||
- Add more blocks to the Simple theme’s base template, making it easier to create new themes by inheriting from the Simple theme `(#3405) <https://github.com/getpelican/pelican/pull/3405>`_
|
||||
- Fix auto-reload behavior upon changes to the theme, content or settings. Make default ``IGNORE_FILES`` recursively ignore all hidden files as well as the `default filters <https://watchfiles.helpmanual.io/api/filters/#watchfiles.DefaultFilter.ignore_dirs>`_ from ``watchfiles.DefaultFilter``. `(#3441) <https://github.com/getpelican/pelican/pull/3441>`_
|
||||
|
|
|
|||
|
|
@ -175,7 +175,38 @@ example, if you would like to extract both the date and the slug, you could set
|
|||
something like: ``'(?P<date>\d{4}-\d{2}-\d{2})_(?P<slug>.*)'``
|
||||
|
||||
Please note that the metadata available inside your files takes precedence over
|
||||
the metadata extracted from the filename.
|
||||
metadata extracted from the filename.
|
||||
|
||||
Metadata extraction from Markdown headings
|
||||
==========================================
|
||||
|
||||
For Markdown files, you can also extract metadata from headings using the
|
||||
``HEADING_METADATA`` setting. This is particularly useful for articles where you
|
||||
want to use the first heading as the title without explicitly declaring it in
|
||||
the metadata section.
|
||||
|
||||
When ``HEADING_METADATA`` is enabled, Pelican can extract metadata from headings
|
||||
using either level mappings or custom regex patterns. Consider Markdown files
|
||||
like::
|
||||
|
||||
Date: 2023-12-01
|
||||
Category: tech
|
||||
|
||||
# My Article Title
|
||||
|
||||
Content goes here...
|
||||
|
||||
This allows their titles to be extracted automatically without requiring
|
||||
explicit ``Title:`` metadata. Note that Markdown files must begin with the
|
||||
metadata block, and headings should follow it.
|
||||
|
||||
The headings used for metadata extraction are automatically removed from the
|
||||
content to avoid duplication in the output.
|
||||
|
||||
See the :ref:`HEADING_METADATA <settings/HEADING_METADATA>`,
|
||||
:ref:`HEADING_METADATA_MAP <settings/HEADING_METADATA_MAP>`, and
|
||||
:ref:`HEADING_METADATA_PATTERNS <settings/HEADING_METADATA_PATTERNS>` settings
|
||||
in the documentation for configuration options.
|
||||
|
||||
Pages
|
||||
=====
|
||||
|
|
|
|||
|
|
@ -977,8 +977,71 @@ Metadata
|
|||
"static/robots.txt": {"path": "robots.txt"},
|
||||
}
|
||||
|
||||
.. _group name notation:
|
||||
https://docs.python.org/3/library/re.html#regular-expression-syntax
|
||||
.. data:: HEADING_METADATA
|
||||
|
||||
Enable or disable metadata extraction from Markdown headings. When set to
|
||||
``True``, Pelican will extract metadata from headings according to the
|
||||
``HEADING_METADATA_MAP`` and ``HEADING_METADATA_PATTERNS`` settings. Pelican
|
||||
removes extracted headings from the content to avoid duplication. The default
|
||||
is ``False``.
|
||||
|
||||
.. data:: HEADING_METADATA_MAP
|
||||
|
||||
A mapping of heading levels to metadata field names. This allows metadata to
|
||||
be extracted from specific heading levels. For example, ``{1: 'title'}`` will
|
||||
map level 1 headings (``# Heading``) to the ``title`` metadata field.
|
||||
Multiple levels can be mapped to different fields. The default is::
|
||||
|
||||
HEADING_METADATA_MAP = {
|
||||
1: 'title', # # Heading → title
|
||||
2: 'subtitle', # ## Heading → subtitle
|
||||
3: 'summary', # ### Heading → summary
|
||||
}
|
||||
|
||||
.. data:: HEADING_METADATA_PATTERNS
|
||||
|
||||
Custom regex patterns for extracting metadata from headings. This provides
|
||||
more flexibility than ``HEADING_METADATA_MAP`` by allowing you to match
|
||||
specific heading patterns. The patterns are processed before the level
|
||||
mapping, if both are configured. The default is::
|
||||
|
||||
HEADING_METADATA_PATTERNS = {
|
||||
'author': r'^###\s+Author[:\s]+(.+)$',
|
||||
'date': r'^###\s+Date[:\s]+(.+)$',
|
||||
'title': r'^#\s+(.+)$',
|
||||
'subtitle': r'^##\s+(.+)$',
|
||||
'summary': r'^###\s+(.+)$',
|
||||
}
|
||||
|
||||
Example usage::
|
||||
|
||||
# Enable heading metadata extraction
|
||||
HEADING_METADATA = True
|
||||
|
||||
# Simple title extraction from first level heading
|
||||
HEADING_METADATA_PATTERNS = {
|
||||
'title': r'^#\s+(.+)$'
|
||||
}
|
||||
|
||||
# Or use level mapping instead
|
||||
HEADING_METADATA_MAP = {
|
||||
1: 'title'
|
||||
}
|
||||
|
||||
Consider Markdown files like::
|
||||
|
||||
Date: 2023-12-01
|
||||
Category: tech
|
||||
|
||||
# My Article Title
|
||||
|
||||
Content goes here...
|
||||
|
||||
This allows their titles to be extracted automatically without requiring
|
||||
explicit ``Title:`` metadata.
|
||||
|
||||
.. _`group name notation`:
|
||||
https://docs.python.org/3/library/re.html#regular-expression-syntax
|
||||
|
||||
The default is ``{}``.
|
||||
|
||||
|
|
|
|||
|
|
@ -255,7 +255,7 @@ class PrintSettings(argparse.Action):
|
|||
init_logging(name=__name__)
|
||||
|
||||
try:
|
||||
instance, settings = get_instance(namespace)
|
||||
_instance, settings = get_instance(namespace)
|
||||
except Exception as e:
|
||||
logger.critical("%s", e.__class__.__name__, exc_info=True)
|
||||
console.print_exception()
|
||||
|
|
|
|||
|
|
@ -341,18 +341,115 @@ class MarkdownReader(BaseReader):
|
|||
output[name] = self.process_metadata(name, value[0])
|
||||
return output
|
||||
|
||||
def _extract_heading_metadata(self, text, source_path):
    """Extract metadata from Markdown headings based on configuration.

    Every line of ``text`` is first tried against the regexes in the
    ``HEADING_METADATA_PATTERNS`` setting, in dict order; the first
    pattern that matches claims the line. Only when no custom pattern
    matches is the ``HEADING_METADATA_MAP`` level mapping consulted. For
    each field, only the first matching heading is kept.

    Lines inside fenced code blocks (````` ``` ````` / ``~~~``) and lines
    indented as code (four spaces or a tab) are ignored, so that shell or
    Python comments such as ``# install deps`` are neither mistaken for a
    title nor scheduled for removal from the content.

    Args:
        text (str): Raw Markdown text
        source_path (str): Path to source file for error reporting

    Returns:
        tuple: (extracted_metadata, lines_to_remove)
            - extracted_metadata (dict): Field name -> processed value
            - lines_to_remove (set): 1-based line numbers of the headings
              that supplied a metadata value
    """
    heading_map = self.settings.get("HEADING_METADATA_MAP", {})
    heading_patterns = self.settings.get("HEADING_METADATA_PATTERNS", {})

    # Compile once instead of re-resolving every pattern for every line.
    # re.compile() returns already-compiled patterns unchanged.
    compiled_patterns = [
        (field_name, re.compile(pattern))
        for field_name, pattern in heading_patterns.items()
    ]
    level_re = re.compile(r"^(#{1,6})\s+(.+)$")

    raw_metadata = {}
    lines_to_remove = set()
    open_fence = None  # "```" or "~~~" while inside a fenced code block

    for line_num, line in enumerate(text.split("\n"), 1):
        stripped = line.strip()

        # Track fenced code blocks; nothing inside them is a heading.
        if open_fence is None:
            if stripped[:3] in ("```", "~~~"):
                open_fence = stripped[:3]
                continue
        else:
            if stripped.startswith(open_fence):
                open_fence = None
            continue

        # Four leading spaces or a tab mark an indented code block.
        if line.startswith(("    ", "\t")):
            continue

        # Custom patterns take priority over the level mapping.
        custom_matched = False
        for field_name, pattern in compiled_patterns:
            match = pattern.match(stripped)
            if match:
                if field_name not in raw_metadata:  # keep first occurrence
                    raw_metadata[field_name] = match.group(1).strip()
                    lines_to_remove.add(line_num)
                custom_matched = True
                break

        # The level mapping is only a fallback for lines that no custom
        # pattern claimed.
        if custom_matched or not heading_map:
            continue

        heading_match = level_re.match(stripped)
        if heading_match:
            level = len(heading_match.group(1))
            field_name = heading_map.get(level)
            if field_name and field_name not in raw_metadata:
                raw_metadata[field_name] = heading_match.group(2).strip()
                lines_to_remove.add(line_num)

    # Run the values through the reader's regular metadata processing, the
    # same hook used for metadata declared in the file's metadata block, so
    # that heading-derived fields get the same treatment.
    processed_metadata = {}
    for field_name, value in raw_metadata.items():
        try:
            value = self.process_metadata(field_name, value)
        except (ValueError, TypeError, AttributeError) as e:
            # Best effort: keep the raw string rather than abort the read.
            logger.warning(
                "Metadata processor for '%s' failed on '%s' in '%s': %s",
                field_name,
                value,
                source_path,
                e,
            )
        processed_metadata[field_name] = value

    return processed_metadata, lines_to_remove
|
||||
|
||||
def read(self, source_path):
    """Read a Markdown file and return its rendered content and metadata.

    When the ``HEADING_METADATA`` setting is truthy, headings are scanned
    for metadata before conversion and the headings that supplied a value
    are dropped from the text that gets rendered. Metadata produced by the
    Markdown conversion itself overrides heading-derived values.

    Returns:
        tuple: (content, metadata)
    """
    self._source_path = source_path
    self._md = Markdown(**self.settings["MARKDOWN"])

    with pelican_open(source_path) as text:
        from_headings, consumed = {}, set()
        if self.settings.get("HEADING_METADATA", False):
            from_headings, consumed = self._extract_heading_metadata(
                text, source_path
            )

        # Strip the headings that were turned into metadata so they do not
        # show up a second time in the rendered body.
        if consumed:
            kept = []
            for number, row in enumerate(text.split("\n"), start=1):
                if number in consumed:
                    continue
                kept.append(row)
            text = "\n".join(kept)

        content = self._md.convert(text)

    declared = (
        self._parse_metadata(self._md.Meta) if hasattr(self._md, "Meta") else {}
    )

    # Explicitly declared metadata wins over anything taken from headings.
    from_headings.update(declared)
    return content, from_headings
|
||||
|
||||
def disabled_message(self) -> str:
|
||||
|
|
@ -805,7 +902,7 @@ def parse_path_metadata(source_path, settings=None, process=None):
|
|||
"""
|
||||
metadata = {}
|
||||
dirname, basename = os.path.split(source_path)
|
||||
base, ext = os.path.splitext(basename)
|
||||
base, _ext = os.path.splitext(basename)
|
||||
subdir = os.path.basename(dirname)
|
||||
if settings:
|
||||
checks = []
|
||||
|
|
|
|||
|
|
@ -177,6 +177,19 @@ DEFAULT_CONFIG = {
|
|||
"CHECK_MODIFIED_METHOD": "mtime",
|
||||
"LOAD_CONTENT_CACHE": False,
|
||||
"FORMATTED_FIELDS": ["summary"],
|
||||
"HEADING_METADATA": False,
|
||||
"HEADING_METADATA_MAP": {
|
||||
1: "title", # # Heading → title
|
||||
2: "subtitle", # ## Heading → subtitle
|
||||
3: "summary", # ### Heading → summary
|
||||
},
|
||||
"HEADING_METADATA_PATTERNS": {
|
||||
"author": r"^###\s+Author[:\s]+(.+)$",
|
||||
"date": r"^###\s+Date[:\s]+(.+)$",
|
||||
"title": r"^#\s+(.+)$",
|
||||
"subtitle": r"^##\s+(.+)$",
|
||||
"summary": r"^###\s+(.+)$",
|
||||
},
|
||||
"PORT": 8000,
|
||||
"BIND": "127.0.0.1",
|
||||
}
|
||||
|
|
@ -242,7 +255,7 @@ def get_settings_from_module(module: ModuleType | None = None) -> Settings:
|
|||
def get_settings_from_file(path: str) -> Settings:
|
||||
"""Loads settings from a file path, returning a dict."""
|
||||
|
||||
name, ext = os.path.splitext(os.path.basename(path))
|
||||
name, _ext = os.path.splitext(os.path.basename(path))
|
||||
module = load_source(name, path)
|
||||
return get_settings_from_module(module)
|
||||
|
||||
|
|
|
|||
|
|
@ -244,7 +244,7 @@ class RstReaderTest(ReaderTest):
|
|||
def test_article_metadata_key_lowercase(self):
|
||||
# Keys of metadata should be lowercase.
|
||||
reader = readers.RstReader(settings=get_settings())
|
||||
content, metadata = reader.read(_path("article_with_uppercase_metadata.rst"))
|
||||
_content, metadata = reader.read(_path("article_with_uppercase_metadata.rst"))
|
||||
|
||||
self.assertIn("category", metadata, "Key should be lowercase.")
|
||||
self.assertEqual("Yeah", metadata.get("category"), "Value keeps case.")
|
||||
|
|
@ -627,7 +627,7 @@ class RstReaderTest(ReaderTest):
|
|||
class MdReaderTest(ReaderTest):
|
||||
def test_article_with_metadata(self):
|
||||
reader = readers.MarkdownReader(settings=get_settings())
|
||||
content, metadata = reader.read(_path("article_with_md_extension.md"))
|
||||
_, metadata = reader.read(_path("article_with_md_extension.md"))
|
||||
expected = {
|
||||
"category": "test",
|
||||
"title": "Test md File",
|
||||
|
|
@ -638,7 +638,7 @@ class MdReaderTest(ReaderTest):
|
|||
}
|
||||
self.assertDictHasSubset(metadata, expected)
|
||||
|
||||
content, metadata = reader.read(
|
||||
_content, metadata = reader.read(
|
||||
_path("article_with_markdown_and_nonascii_summary.md")
|
||||
)
|
||||
expected = {
|
||||
|
|
@ -700,7 +700,7 @@ class MdReaderTest(ReaderTest):
|
|||
reader = readers.MarkdownReader(settings=get_settings())
|
||||
# test to ensure the md file extension is being processed by the
|
||||
# correct reader
|
||||
content, metadata = reader.read(_path("article_with_md_extension.md"))
|
||||
content, _ = reader.read(_path("article_with_md_extension.md"))
|
||||
expected = (
|
||||
"<h1>Test Markdown File Header</h1>\n"
|
||||
"<h2>Used for pelican test</h2>\n"
|
||||
|
|
@ -709,7 +709,7 @@ class MdReaderTest(ReaderTest):
|
|||
self.assertEqual(content, expected)
|
||||
# test to ensure the mkd file extension is being processed by the
|
||||
# correct reader
|
||||
content, metadata = reader.read(_path("article_with_mkd_extension.mkd"))
|
||||
content, _ = reader.read(_path("article_with_mkd_extension.mkd"))
|
||||
expected = (
|
||||
"<h1>Test Markdown File Header</h1>\n<h2>Used for pelican"
|
||||
" test</h2>\n<p>This is another markdown test file. Uses"
|
||||
|
|
@ -718,9 +718,7 @@ class MdReaderTest(ReaderTest):
|
|||
self.assertEqual(content, expected)
|
||||
# test to ensure the markdown file extension is being processed by the
|
||||
# correct reader
|
||||
content, metadata = reader.read(
|
||||
_path("article_with_markdown_extension.markdown")
|
||||
)
|
||||
content, _ = reader.read(_path("article_with_markdown_extension.markdown"))
|
||||
expected = (
|
||||
"<h1>Test Markdown File Header</h1>\n<h2>Used for pelican"
|
||||
" test</h2>\n<p>This is another markdown test file. Uses"
|
||||
|
|
@ -729,7 +727,7 @@ class MdReaderTest(ReaderTest):
|
|||
self.assertEqual(content, expected)
|
||||
# test to ensure the mdown file extension is being processed by the
|
||||
# correct reader
|
||||
content, metadata = reader.read(_path("article_with_mdown_extension.mdown"))
|
||||
content, _metadata = reader.read(_path("article_with_mdown_extension.mdown"))
|
||||
expected = (
|
||||
"<h1>Test Markdown File Header</h1>\n<h2>Used for pelican"
|
||||
" test</h2>\n<p>This is another markdown test file. Uses"
|
||||
|
|
@ -825,7 +823,9 @@ class MdReaderTest(ReaderTest):
|
|||
|
||||
def test_duplicate_tags_or_authors_are_removed(self):
|
||||
reader = readers.MarkdownReader(settings=get_settings())
|
||||
content, metadata = reader.read(_path("article_with_duplicate_tags_authors.md"))
|
||||
_content, metadata = reader.read(
|
||||
_path("article_with_duplicate_tags_authors.md")
|
||||
)
|
||||
expected = {
|
||||
"tags": ["foo", "bar", "foobar"],
|
||||
"authors": ["Author, First", "Author, Second"],
|
||||
|
|
@ -837,7 +837,7 @@ class MdReaderTest(ReaderTest):
|
|||
settings["FORMATTED_FIELDS"] = ["summary"]
|
||||
|
||||
reader = readers.MarkdownReader(settings=settings)
|
||||
content, metadata = reader.read(
|
||||
_content, metadata = reader.read(
|
||||
_path("article_with_markdown_and_nested_metadata.md")
|
||||
)
|
||||
expected = {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue