Merge 9ed0eb1c0c into 3c69dc68d2

2026-06-05 05:46:55 +02:00 · 2026-05-02 13:16:03 +00:00 · 2026-05-02 13:16:03 +00:00 · 32f47fb847
commit 32f47fb847
parent 3c69dc68d2 9ed0eb1c0c
8 changed files with 226 additions and 17 deletions
--- a/RELEASE.md
+++ b/RELEASE.md
@ -0,0 +1,4 @@
+Release type: minor
+
+Added HEADING_METADATA feature for extracting metadata from Markdown
+headings using configurable patterns and level mappings.
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -24,6 +24,7 @@ Release history
 4.11.0 - 2025-01-15
 ===================

+- Add HEADING_METADATA feature for Markdown title extraction, allowing metadata to be extracted from headings based on configurable patterns and level mappings
 - Add setting to selectively omit Typogrify filters `(#3439) <https://github.com/getpelican/pelican/pull/3439>`_
 - Add more blocks to the Simple theme’s base template, making it easier to create new themes by inheriting from the Simple theme `(#3405) <https://github.com/getpelican/pelican/pull/3405>`_
 - Fix auto-reload behavior upon changes to the theme, content or settings. Make default ``IGNORE_FILES`` recursively ignore all hidden files as well as the `default filters <https://watchfiles.helpmanual.io/api/filters/#watchfiles.DefaultFilter.ignore_dirs>`_ from ``watchfiles.DefaultFilter``. `(#3441) <https://github.com/getpelican/pelican/pull/3441>`_
--- a/docs/content.rst
+++ b/docs/content.rst
@ -175,7 +175,38 @@ example, if you would like to extract both the date and the slug, you could set
 something like: ``'(?P<date>\d{4}-\d{2}-\d{2})_(?P<slug>.*)'``

 Please note that the metadata available inside your files takes precedence over
-the metadata extracted from the filename.
+metadata extracted from the filename.
+
+Metadata extraction from Markdown headings
+==========================================
+
+For Markdown files, you can also extract metadata from headings using the
+``HEADING_METADATA`` setting. This is particularly useful for articles where you
+want to use the first heading as the title without explicitly declaring it in
+the metadata section.
+
+When ``HEADING_METADATA`` is enabled, Pelican can extract metadata from headings
+using either level mappings or custom regex patterns. This allows Markdown files
+like::
+
+    Date: 2023-12-01
+    Category: tech
+
+    # My Article Title
+
+    Content goes here...
+
+to have their titles extracted automatically without requiring
+explicit ``Title:`` metadata. Note that Markdown files must begin with the
+metadata block, and headings should follow it.
+
+The headings used for metadata extraction are automatically removed from the
+content to avoid duplication in the output.
+
+See the :ref:`HEADING_METADATA <settings/HEADING_METADATA>`,
+:ref:`HEADING_METADATA_MAP <settings/HEADING_METADATA_MAP>`, and
+:ref:`HEADING_METADATA_PATTERNS <settings/HEADING_METADATA_PATTERNS>` settings
+in the documentation for configuration options.

 Pages
 =====
--- a/docs/settings.rst
+++ b/docs/settings.rst
@ -977,8 +977,71 @@ Metadata
           "static/robots.txt": {"path": "robots.txt"},
       }

-   .. _group name notation:
-      https://docs.python.org/3/library/re.html#regular-expression-syntax
+.. data:: HEADING_METADATA
+
+   Enable or disable metadata extraction from Markdown headings. When set to
+   ``True``, Pelican will extract metadata from headings according to the
+   ``HEADING_METADATA_MAP`` and ``HEADING_METADATA_PATTERNS`` settings. Pelican
+   removes extracted headings from the content to avoid duplication. The default
+   is ``False``.
+
+.. data:: HEADING_METADATA_MAP
+
+   A mapping of heading levels to metadata field names. This allows metadata to
+   be extracted from specific heading levels. For example, ``{1: 'title'}`` will
+   map level 1 headings (``# Heading``) to the ``title`` metadata field.
+   Multiple levels can be mapped to different fields. The default is::
+
+       HEADING_METADATA_MAP = {
+           1: 'title',      # # Heading → title
+           2: 'subtitle',   # ## Heading → subtitle
+           3: 'summary',    # ### Heading → summary
+       }
+
+.. data:: HEADING_METADATA_PATTERNS
+
+   Custom regex patterns for extracting metadata from headings. This provides
+   more flexibility than ``HEADING_METADATA_MAP`` by allowing you to match
+   specific heading patterns. Patterns are tried in order (first match wins)
+   and take precedence over the level mapping. The default is::
+
+       HEADING_METADATA_PATTERNS = {
+           'author': r'^###\s+Author[:\s]+(.+)$',
+           'date': r'^###\s+Date[:\s]+(.+)$',
+           'title': r'^#\s+(.+)$',
+           'subtitle': r'^##\s+(.+)$',
+           'summary': r'^###\s+(.+)$',
+       }
+
+   Example usage::
+
+       # Enable heading metadata extraction
+       HEADING_METADATA = True
+
+       # Simple title extraction from first level heading
+       HEADING_METADATA_PATTERNS = {
+           'title': r'^#\s+(.+)$'
+       }
+
+       # Or use level mapping instead
+       HEADING_METADATA_MAP = {
+           1: 'title'
+       }
+
+   This allows Markdown files like::
+
+       Date: 2023-12-01
+       Category: tech
+
+       # My Article Title
+
+       Content goes here...
+
+   to have their titles extracted automatically without requiring
+   explicit ``Title:`` metadata.
+
+.. _`group name notation`:
+   https://docs.python.org/3/library/re.html#regular-expression-syntax

   The default is ``{}``.

--- a/pelican/init.py
+++ b/pelican/init.py
@ -255,7 +255,7 @@ class PrintSettings(argparse.Action):
        init_logging(name=__name__)

        try:
-            instance, settings = get_instance(namespace)
+            _instance, settings = get_instance(namespace)
        except Exception as e:
            logger.critical("%s", e.__class__.__name__, exc_info=True)
            console.print_exception()
--- a/pelican/readers.py
+++ b/pelican/readers.py
@ -341,18 +341,115 @@ class MarkdownReader(BaseReader):
                output[name] = self.process_metadata(name, value[0])
        return output

+    def _extract_heading_metadata(self, text, source_path):
+        """Extract metadata from Markdown headings based on configuration.
+
+        Args:
+            text (str): Raw Markdown text
+            source_path (str): Path to source file for error reporting
+
+        Returns:
+            tuple: (extracted_metadata, lines_to_remove)
+                - extracted_metadata (dict): Extracted metadata from headings
+                - lines_to_remove (set): Line numbers of headings used for metadata
+        """
+        metadata = OrderedDict()
+        lines_to_remove = set()
+
+        # Get configuration
+        heading_map = self.settings.get("HEADING_METADATA_MAP", {})
+        heading_patterns = self.settings.get("HEADING_METADATA_PATTERNS", {})
+
+        # Extract metadata using both level mapping and custom patterns
+        lines = text.split("\n")
+        for line_num, line in enumerate(lines, 1):
+            custom_matched = False
+
+            # Try custom patterns first
+            for field_name, pattern in heading_patterns.items():
+                match = re.match(pattern, line.strip())
+                if match:
+                    value = match.group(1).strip()
+                    if field_name not in metadata:  # Only keep first occurrence
+                        metadata[field_name] = [value]
+                        lines_to_remove.add(line_num)
+                    custom_matched = True
+                    break
+
+            # Fall back to level mapping only when no custom pattern matched
+            if heading_map and not custom_matched:
+                # Match heading levels (# ## ### etc.)
+                heading_match = re.match(r"^(#{1,6})\s+(.+)$", line.strip())
+                if heading_match:
+                    level = len(heading_match.group(1))
+                    title = heading_match.group(2).strip()
+
+                    # Map level to field name
+                    field_name = heading_map.get(level)
+                    if field_name and field_name not in metadata:
+                        metadata[field_name] = [title]
+                        lines_to_remove.add(line_num)
+
+        # Process extracted metadata through existing processors
+        processed_metadata = {}
+        metadata_processors = self.settings.get("METADATA_PROCESSORS", {})
+
+        for field_name, values in metadata.items():
+            if isinstance(values, list) and len(values) == 1:
+                values = values[0]
+
+            # Apply metadata processor if available
+            if field_name in metadata_processors:
+                processor = metadata_processors[field_name]
+                try:
+                    values = processor(values, self.settings)
+                except (ValueError, TypeError, AttributeError) as e:
+                    logger.warning(
+                        f"Metadata processor for '{field_name}' failed on "
+                        f"'{values}' in '{source_path}': {e}"
+                    )
+
+            processed_metadata[field_name] = values
+
+        return processed_metadata, lines_to_remove
+
    def read(self, source_path):
        """Parse content and metadata of markdown files"""

        self._source_path = source_path
        self._md = Markdown(**self.settings["MARKDOWN"])
        with pelican_open(source_path) as text:
+            # Extract heading metadata before markdown conversion
+            heading_metadata = {}
+            lines_to_remove = set()
+            if self.settings.get("HEADING_METADATA", False):
+                heading_metadata, lines_to_remove = self._extract_heading_metadata(
+                    text, source_path
+                )
+
+            # Remove the heading lines that were used for metadata extraction
+            if lines_to_remove:
+                text_lines = text.split("\n")
+                # Filter out the lines that were used for metadata
+                filtered_lines = [
+                    line
+                    for line_num, line in enumerate(text_lines, 1)
+                    if line_num not in lines_to_remove
+                ]
+                text = "\n".join(filtered_lines)
+
            content = self._md.convert(text)

        if hasattr(self._md, "Meta"):
            metadata = self._parse_metadata(self._md.Meta)
        else:
            metadata = {}
+
+        # Merge heading metadata with regular metadata
+        # Regular metadata takes precedence over heading metadata
+        heading_metadata.update(metadata)
+        metadata = heading_metadata
+
        return content, metadata

    def disabled_message(self) -> str:
@ -805,7 +902,7 @@ def parse_path_metadata(source_path, settings=None, process=None):
    """
    metadata = {}
    dirname, basename = os.path.split(source_path)
-    base, ext = os.path.splitext(basename)
+    base, _ext = os.path.splitext(basename)
    subdir = os.path.basename(dirname)
    if settings:
        checks = []
--- a/pelican/settings.py
+++ b/pelican/settings.py
@ -177,6 +177,19 @@ DEFAULT_CONFIG = {
    "CHECK_MODIFIED_METHOD": "mtime",
    "LOAD_CONTENT_CACHE": False,
    "FORMATTED_FIELDS": ["summary"],
+    "HEADING_METADATA": False,
+    "HEADING_METADATA_MAP": {
+        1: "title",  # # Heading → title
+        2: "subtitle",  # ## Heading → subtitle
+        3: "summary",  # ### Heading → summary
+    },
+    "HEADING_METADATA_PATTERNS": {
+        "author": r"^###\s+Author[:\s]+(.+)$",
+        "date": r"^###\s+Date[:\s]+(.+)$",
+        "title": r"^#\s+(.+)$",
+        "subtitle": r"^##\s+(.+)$",
+        "summary": r"^###\s+(.+)$",
+    },
    "PORT": 8000,
    "BIND": "127.0.0.1",
 }
@ -242,7 +255,7 @@ def get_settings_from_module(module: ModuleType | None = None) -> Settings:
 def get_settings_from_file(path: str) -> Settings:
    """Loads settings from a file path, returning a dict."""

-    name, ext = os.path.splitext(os.path.basename(path))
+    name, _ext = os.path.splitext(os.path.basename(path))
    module = load_source(name, path)
    return get_settings_from_module(module)

--- a/pelican/tests/test_readers.py
+++ b/pelican/tests/test_readers.py
@ -244,7 +244,7 @@ class RstReaderTest(ReaderTest):
    def test_article_metadata_key_lowercase(self):
        # Keys of metadata should be lowercase.
        reader = readers.RstReader(settings=get_settings())
-        content, metadata = reader.read(_path("article_with_uppercase_metadata.rst"))
+        _content, metadata = reader.read(_path("article_with_uppercase_metadata.rst"))

        self.assertIn("category", metadata, "Key should be lowercase.")
        self.assertEqual("Yeah", metadata.get("category"), "Value keeps case.")
@ -627,7 +627,7 @@ class RstReaderTest(ReaderTest):
 class MdReaderTest(ReaderTest):
    def test_article_with_metadata(self):
        reader = readers.MarkdownReader(settings=get_settings())
-        content, metadata = reader.read(_path("article_with_md_extension.md"))
+        _, metadata = reader.read(_path("article_with_md_extension.md"))
        expected = {
            "category": "test",
            "title": "Test md File",
@ -638,7 +638,7 @@ class MdReaderTest(ReaderTest):
        }
        self.assertDictHasSubset(metadata, expected)

-        content, metadata = reader.read(
+        _content, metadata = reader.read(
            _path("article_with_markdown_and_nonascii_summary.md")
        )
        expected = {
@ -700,7 +700,7 @@ class MdReaderTest(ReaderTest):
        reader = readers.MarkdownReader(settings=get_settings())
        # test to ensure the md file extension is being processed by the
        # correct reader
-        content, metadata = reader.read(_path("article_with_md_extension.md"))
+        content, _ = reader.read(_path("article_with_md_extension.md"))
        expected = (
            "<h1>Test Markdown File Header</h1>\n"
            "<h2>Used for pelican test</h2>\n"
@ -709,7 +709,7 @@ class MdReaderTest(ReaderTest):
        self.assertEqual(content, expected)
        # test to ensure the mkd file extension is being processed by the
        # correct reader
-        content, metadata = reader.read(_path("article_with_mkd_extension.mkd"))
+        content, _ = reader.read(_path("article_with_mkd_extension.mkd"))
        expected = (
            "<h1>Test Markdown File Header</h1>\n<h2>Used for pelican"
            " test</h2>\n<p>This is another markdown test file.  Uses"
@ -718,9 +718,7 @@ class MdReaderTest(ReaderTest):
        self.assertEqual(content, expected)
        # test to ensure the markdown file extension is being processed by the
        # correct reader
-        content, metadata = reader.read(
-            _path("article_with_markdown_extension.markdown")
-        )
+        content, _ = reader.read(_path("article_with_markdown_extension.markdown"))
        expected = (
            "<h1>Test Markdown File Header</h1>\n<h2>Used for pelican"
            " test</h2>\n<p>This is another markdown test file.  Uses"
@ -729,7 +727,7 @@ class MdReaderTest(ReaderTest):
        self.assertEqual(content, expected)
        # test to ensure the mdown file extension is being processed by the
        # correct reader
-        content, metadata = reader.read(_path("article_with_mdown_extension.mdown"))
+        content, _metadata = reader.read(_path("article_with_mdown_extension.mdown"))
        expected = (
            "<h1>Test Markdown File Header</h1>\n<h2>Used for pelican"
            " test</h2>\n<p>This is another markdown test file.  Uses"
@ -825,7 +823,9 @@ class MdReaderTest(ReaderTest):

    def test_duplicate_tags_or_authors_are_removed(self):
        reader = readers.MarkdownReader(settings=get_settings())
-        content, metadata = reader.read(_path("article_with_duplicate_tags_authors.md"))
+        _content, metadata = reader.read(
+            _path("article_with_duplicate_tags_authors.md")
+        )
        expected = {
            "tags": ["foo", "bar", "foobar"],
            "authors": ["Author, First", "Author, Second"],
@ -837,7 +837,7 @@ class MdReaderTest(ReaderTest):
        settings["FORMATTED_FIELDS"] = ["summary"]

        reader = readers.MarkdownReader(settings=settings)
-        content, metadata = reader.read(
+        _content, metadata = reader.read(
            _path("article_with_markdown_and_nested_metadata.md")
        )
        expected = {