This commit is contained in:
Matěj Cepl 2026-05-02 13:16:03 +00:00 committed by GitHub
commit 32f47fb847
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 226 additions and 17 deletions

4
RELEASE.md Normal file
View file

@ -0,0 +1,4 @@
Release type: minor
Added HEADING_METADATA feature for extracting metadata from Markdown
headings using configurable patterns and level mappings.

View file

@ -24,6 +24,7 @@ Release history
4.11.0 - 2025-01-15
===================
- Add HEADING_METADATA feature for Markdown title extraction, allowing metadata to be extracted from headings based on configurable patterns and level mappings
- Add setting to selectively omit Typogrify filters `(#3439) <https://github.com/getpelican/pelican/pull/3439>`_
- Add more blocks to the Simple themes base template, making it easier to create new themes by inheriting from the Simple theme `(#3405) <https://github.com/getpelican/pelican/pull/3405>`_
- Fix auto-reload behavior upon changes to the theme, content or settings. Make default ``IGNORE_FILES`` recursively ignore all hidden files as well as the `default filters <https://watchfiles.helpmanual.io/api/filters/#watchfiles.DefaultFilter.ignore_dirs>`_ from ``watchfiles.DefaultFilter``. `(#3441) <https://github.com/getpelican/pelican/pull/3441>`_

View file

@ -175,7 +175,38 @@ example, if you would like to extract both the date and the slug, you could set
something like: ``'(?P<date>\d{4}-\d{2}-\d{2})_(?P<slug>.*)'``
Please note that the metadata available inside your files takes precedence over
the metadata extracted from the filename.
metadata extracted from the filename.
Metadata extraction from Markdown headings
==========================================
For Markdown files, you can also extract metadata from headings using the
``HEADING_METADATA`` setting. This is particularly useful for articles where you
want to use the first heading as the title without explicitly declaring it in
the metadata section.
When ``HEADING_METADATA`` is enabled, Pelican can extract metadata from headings
using either level mappings or custom regex patterns. This allows Markdown files
like::
Date: 2023-12-01
Category: tech
# My Article Title
Content goes here...
With this layout, the title is taken from the first level-1 heading, so no
explicit ``Title:`` metadata is required. Note that Markdown files must begin
with the metadata block, and headings should follow it.
The headings used for metadata extraction are automatically removed from the
content to avoid duplication in the output.
See the :ref:`HEADING_METADATA <settings/HEADING_METADATA>`,
:ref:`HEADING_METADATA_MAP <settings/HEADING_METADATA_MAP>`, and
:ref:`HEADING_METADATA_PATTERNS <settings/HEADING_METADATA_PATTERNS>` settings
in the documentation for configuration options.
Pages
=====

View file

@ -977,8 +977,71 @@ Metadata
"static/robots.txt": {"path": "robots.txt"},
}
.. _group name notation:
https://docs.python.org/3/library/re.html#regular-expression-syntax
.. data:: HEADING_METADATA
Enable or disable metadata extraction from Markdown headings. When set to
``True``, Pelican will extract metadata from headings according to the
``HEADING_METADATA_MAP`` and ``HEADING_METADATA_PATTERNS`` settings. Pelican
removes extracted headings from the content to avoid duplication. The default
is ``False``.
.. data:: HEADING_METADATA_MAP
A mapping of heading levels to metadata field names. This allows metadata to
be extracted from specific heading levels. For example, ``{1: 'title'}`` will
map level 1 headings (``# Heading``) to the ``title`` metadata field.
Multiple levels can be mapped to different fields. The default is::
HEADING_METADATA_MAP = {
1: 'title', # # Heading → title
2: 'subtitle', # ## Heading → subtitle
3: 'summary', # ### Heading → summary
}
.. data:: HEADING_METADATA_PATTERNS
Custom regex patterns for extracting metadata from headings. This provides
more flexibility than ``HEADING_METADATA_MAP`` by allowing you to match
specific heading patterns. The patterns are processed before the level
mapping, if both are configured. The default is::
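HEADING_METADATA_PATTERNS = {
# Patterns are tried in order and the first match wins, so the more
# specific "Author"/"Date" patterns come before the generic level-3 one.
'author': r'^###\s+Author[:\s]+(.+)$',
'date': r'^###\s+Date[:\s]+(.+)$',
'title': r'^#\s+(.+)$',
'subtitle': r'^##\s+(.+)$',
'summary': r'^###\s+(.+)$',
}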
Example usage::
# Enable heading metadata extraction
HEADING_METADATA = True
# Simple title extraction from first level heading
HEADING_METADATA_PATTERNS = {
'title': r'^#\s+(.+)$'
}
# Or use level mapping instead
HEADING_METADATA_MAP = {
1: 'title'
}
This allows Markdown files like::
Date: 2023-12-01
Category: tech
# My Article Title
Content goes here...
With this configuration, the title is extracted from the heading
automatically, without requiring explicit ``Title:`` metadata.
.. _`group name notation`:
https://docs.python.org/3/library/re.html#regular-expression-syntax
The default is ``{}``.

View file

@ -255,7 +255,7 @@ class PrintSettings(argparse.Action):
init_logging(name=__name__)
try:
instance, settings = get_instance(namespace)
_instance, settings = get_instance(namespace)
except Exception as e:
logger.critical("%s", e.__class__.__name__, exc_info=True)
console.print_exception()

View file

@ -341,18 +341,115 @@ class MarkdownReader(BaseReader):
output[name] = self.process_metadata(name, value[0])
return output
def _extract_heading_metadata(self, text, source_path):
"""Extract metadata from Markdown headings based on configuration.
Args:
text (str): Raw Markdown text
source_path (str): Path to source file for error reporting
Returns:
tuple: (extracted_metadata, lines_to_remove)
- extracted_metadata (dict): Extracted metadata from headings
- lines_to_remove (set): Line numbers of headings used for metadata
"""
metadata = OrderedDict()
lines_to_remove = set()
# Get configuration
heading_map = self.settings.get("HEADING_METADATA_MAP", {})
heading_patterns = self.settings.get("HEADING_METADATA_PATTERNS", {})
# Extract metadata using both level mapping and custom patterns
lines = text.split("\n")
for line_num, line in enumerate(lines, 1):
custom_matched = False
# Try custom patterns first
for field_name, pattern in heading_patterns.items():
match = re.match(pattern, line.strip())
if match:
value = match.group(1).strip()
if field_name not in metadata: # Only keep first occurrence
metadata[field_name] = [value]
lines_to_remove.add(line_num)
custom_matched = True
break
# Try level mapping (always run, not just if no custom matched)
if heading_map and not custom_matched:
# Match heading levels (# ## ### etc.)
heading_match = re.match(r"^(#{1,6})\s+(.+)$", line.strip())
if heading_match:
level = len(heading_match.group(1))
title = heading_match.group(2).strip()
# Map level to field name
field_name = heading_map.get(level)
if field_name and field_name not in metadata:
metadata[field_name] = [title]
lines_to_remove.add(line_num)
# Process extracted metadata through existing processors
processed_metadata = {}
metadata_processors = self.settings.get("METADATA_PROCESSORS", {})
for field_name, values in metadata.items():
if isinstance(values, list) and len(values) == 1:
values = values[0]
# Apply metadata processor if available
if field_name in metadata_processors:
processor = metadata_processors[field_name]
try:
values = processor(values, self.settings)
except (ValueError, TypeError, AttributeError) as e:
logger.warning(
f"Metadata processor for '{field_name}' failed on "
f"'{values}' in '{source_path}': {e}"
)
processed_metadata[field_name] = values
return processed_metadata, lines_to_remove
def read(self, source_path):
    """Read a Markdown file and return its rendered content and metadata.

    When the ``HEADING_METADATA`` setting is truthy, metadata is first pulled
    out of the headings and those heading lines are dropped from the text
    before it is converted. Metadata parsed from the converted document
    overrides any value that came from a heading.

    Returns:
        tuple: (content, metadata)
    """
    self._source_path = source_path
    self._md = Markdown(**self.settings["MARKDOWN"])

    with pelican_open(source_path) as text:
        from_headings, consumed = {}, set()
        if self.settings.get("HEADING_METADATA", False):
            from_headings, consumed = self._extract_heading_metadata(
                text, source_path
            )
            if consumed:
                # Drop the heading lines that were turned into metadata;
                # line numbers are 1-based.
                text = "\n".join(
                    row
                    for number, row in enumerate(text.split("\n"), 1)
                    if number not in consumed
                )
        content = self._md.convert(text)

    declared = (
        self._parse_metadata(self._md.Meta) if hasattr(self._md, "Meta") else {}
    )

    # Explicitly declared metadata wins over heading-derived metadata.
    from_headings.update(declared)
    return content, from_headings
def disabled_message(self) -> str:
@ -805,7 +902,7 @@ def parse_path_metadata(source_path, settings=None, process=None):
"""
metadata = {}
dirname, basename = os.path.split(source_path)
base, ext = os.path.splitext(basename)
base, _ext = os.path.splitext(basename)
subdir = os.path.basename(dirname)
if settings:
checks = []

View file

@ -177,6 +177,19 @@ DEFAULT_CONFIG = {
"CHECK_MODIFIED_METHOD": "mtime",
"LOAD_CONTENT_CACHE": False,
"FORMATTED_FIELDS": ["summary"],
"HEADING_METADATA": False,
"HEADING_METADATA_MAP": {
1: "title", # # Heading → title
2: "subtitle", # ## Heading → subtitle
3: "summary", # ### Heading → summary
},
"HEADING_METADATA_PATTERNS": {
"author": r"^###\s+Author[:\s]+(.+)$",
"date": r"^###\s+Date[:\s]+(.+)$",
"title": r"^#\s+(.+)$",
"subtitle": r"^##\s+(.+)$",
"summary": r"^###\s+(.+)$",
},
"PORT": 8000,
"BIND": "127.0.0.1",
}
@ -242,7 +255,7 @@ def get_settings_from_module(module: ModuleType | None = None) -> Settings:
def get_settings_from_file(path: str) -> Settings:
"""Loads settings from a file path, returning a dict."""
name, ext = os.path.splitext(os.path.basename(path))
name, _ext = os.path.splitext(os.path.basename(path))
module = load_source(name, path)
return get_settings_from_module(module)

View file

@ -244,7 +244,7 @@ class RstReaderTest(ReaderTest):
def test_article_metadata_key_lowercase(self):
# Keys of metadata should be lowercase.
reader = readers.RstReader(settings=get_settings())
content, metadata = reader.read(_path("article_with_uppercase_metadata.rst"))
_content, metadata = reader.read(_path("article_with_uppercase_metadata.rst"))
self.assertIn("category", metadata, "Key should be lowercase.")
self.assertEqual("Yeah", metadata.get("category"), "Value keeps case.")
@ -627,7 +627,7 @@ class RstReaderTest(ReaderTest):
class MdReaderTest(ReaderTest):
def test_article_with_metadata(self):
reader = readers.MarkdownReader(settings=get_settings())
content, metadata = reader.read(_path("article_with_md_extension.md"))
_, metadata = reader.read(_path("article_with_md_extension.md"))
expected = {
"category": "test",
"title": "Test md File",
@ -638,7 +638,7 @@ class MdReaderTest(ReaderTest):
}
self.assertDictHasSubset(metadata, expected)
content, metadata = reader.read(
_content, metadata = reader.read(
_path("article_with_markdown_and_nonascii_summary.md")
)
expected = {
@ -700,7 +700,7 @@ class MdReaderTest(ReaderTest):
reader = readers.MarkdownReader(settings=get_settings())
# test to ensure the md file extension is being processed by the
# correct reader
content, metadata = reader.read(_path("article_with_md_extension.md"))
content, _ = reader.read(_path("article_with_md_extension.md"))
expected = (
"<h1>Test Markdown File Header</h1>\n"
"<h2>Used for pelican test</h2>\n"
@ -709,7 +709,7 @@ class MdReaderTest(ReaderTest):
self.assertEqual(content, expected)
# test to ensure the mkd file extension is being processed by the
# correct reader
content, metadata = reader.read(_path("article_with_mkd_extension.mkd"))
content, _ = reader.read(_path("article_with_mkd_extension.mkd"))
expected = (
"<h1>Test Markdown File Header</h1>\n<h2>Used for pelican"
" test</h2>\n<p>This is another markdown test file. Uses"
@ -718,9 +718,7 @@ class MdReaderTest(ReaderTest):
self.assertEqual(content, expected)
# test to ensure the markdown file extension is being processed by the
# correct reader
content, metadata = reader.read(
_path("article_with_markdown_extension.markdown")
)
content, _ = reader.read(_path("article_with_markdown_extension.markdown"))
expected = (
"<h1>Test Markdown File Header</h1>\n<h2>Used for pelican"
" test</h2>\n<p>This is another markdown test file. Uses"
@ -729,7 +727,7 @@ class MdReaderTest(ReaderTest):
self.assertEqual(content, expected)
# test to ensure the mdown file extension is being processed by the
# correct reader
content, metadata = reader.read(_path("article_with_mdown_extension.mdown"))
content, _metadata = reader.read(_path("article_with_mdown_extension.mdown"))
expected = (
"<h1>Test Markdown File Header</h1>\n<h2>Used for pelican"
" test</h2>\n<p>This is another markdown test file. Uses"
@ -825,7 +823,9 @@ class MdReaderTest(ReaderTest):
def test_duplicate_tags_or_authors_are_removed(self):
reader = readers.MarkdownReader(settings=get_settings())
content, metadata = reader.read(_path("article_with_duplicate_tags_authors.md"))
_content, metadata = reader.read(
_path("article_with_duplicate_tags_authors.md")
)
expected = {
"tags": ["foo", "bar", "foobar"],
"authors": ["Author, First", "Author, Second"],
@ -837,7 +837,7 @@ class MdReaderTest(ReaderTest):
settings["FORMATTED_FIELDS"] = ["summary"]
reader = readers.MarkdownReader(settings=settings)
content, metadata = reader.read(
_content, metadata = reader.read(
_path("article_with_markdown_and_nested_metadata.md")
)
expected = {