diff --git a/RELEASE.md b/RELEASE.md new file mode 100644 index 00000000..056b6a5e --- /dev/null +++ b/RELEASE.md @@ -0,0 +1,4 @@ +Release type: minor + +Added HEADING_METADATA feature for extracting metadata from Markdown +headings using configurable patterns and level mappings. diff --git a/docs/changelog.rst b/docs/changelog.rst index 71cb851d..8ba7f819 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -24,6 +24,7 @@ Release history 4.11.0 - 2025-01-15 =================== +- Add HEADING_METADATA feature for Markdown title extraction, allowing metadata to be extracted from headings based on configurable patterns and level mappings - Add setting to selectively omit Typogrify filters `(#3439) `_ - Add more blocks to the Simple theme’s base template, making it easier to create new themes by inheriting from the Simple theme `(#3405) `_ - Fix auto-reload behavior upon changes to the theme, content or settings. Make default ``IGNORE_FILES`` recursively ignore all hidden files as well as the `default filters `_ from ``watchfiles.DefaultFilter``. `(#3441) `_ diff --git a/docs/content.rst b/docs/content.rst index 6c58aa11..e3101abc 100644 --- a/docs/content.rst +++ b/docs/content.rst @@ -175,7 +175,38 @@ example, if you would like to extract both the date and the slug, you could set something like: ``'(?P<date>\d{4}-\d{2}-\d{2})_(?P<slug>.*)'`` Please note that the metadata available inside your files takes precedence over -the metadata extracted from the filename. +metadata extracted from the filename. + +Metadata extraction from Markdown headings +========================================== + +For Markdown files, you can also extract metadata from headings using the +``HEADING_METADATA`` setting. This is particularly useful for articles where you +want to use the first heading as the title without explicitly declaring it in +the metadata section.
+ +When ``HEADING_METADATA`` is enabled, Pelican can extract metadata from headings +using either level mappings or custom regex patterns. This allows Markdown files +like:: + + Date: 2023-12-01 + Category: tech + + # My Article Title + + Content goes here... + +Their titles are then extracted automatically without requiring +explicit ``Title:`` metadata. Note that Markdown files must begin with the +metadata block, and headings should follow it. + +The headings used for metadata extraction are automatically removed from the +content to avoid duplication in the output. + +See the :ref:`HEADING_METADATA `, +:ref:`HEADING_METADATA_MAP `, and +:ref:`HEADING_METADATA_PATTERNS ` settings +in the documentation for configuration options. Pages ===== diff --git a/docs/settings.rst b/docs/settings.rst index 3fac5f18..cdbc66d4 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -977,8 +977,71 @@ Metadata "static/robots.txt": {"path": "robots.txt"}, } - .. _group name notation: - https://docs.python.org/3/library/re.html#regular-expression-syntax +.. data:: HEADING_METADATA + + Enable or disable metadata extraction from Markdown headings. When set to + ``True``, Pelican will extract metadata from headings according to the + ``HEADING_METADATA_MAP`` and ``HEADING_METADATA_PATTERNS`` settings. Pelican + removes extracted headings from the content to avoid duplication. The default + is ``False``. + +.. data:: HEADING_METADATA_MAP + + A mapping of heading levels to metadata field names. This allows metadata to + be extracted from specific heading levels. For example, ``{1: 'title'}`` will + map level 1 headings (``# Heading``) to the ``title`` metadata field. + Multiple levels can be mapped to different fields. The default is:: + + HEADING_METADATA_MAP = { + 1: 'title', # # Heading → title + 2: 'subtitle', # ## Heading → subtitle + 3: 'summary', # ### Heading → summary + } + +..
data:: HEADING_METADATA_PATTERNS + + Custom regex patterns for extracting metadata from headings. This provides + more flexibility than ``HEADING_METADATA_MAP`` by allowing you to match + specific heading patterns. The patterns are processed before the level + mapping, if both are configured. The default is:: + + HEADING_METADATA_PATTERNS = { + 'author': r'^###\s+Author[:\s]+(.+)$', + 'date': r'^###\s+Date[:\s]+(.+)$', + 'title': r'^#\s+(.+)$', + 'subtitle': r'^##\s+(.+)$', + 'summary': r'^###\s+(.+)$', + } + + Example usage:: + + # Enable heading metadata extraction + HEADING_METADATA = True + + # Simple title extraction from first level heading + HEADING_METADATA_PATTERNS = { + 'title': r'^#\s+(.+)$' + } + + # Or use level mapping instead + HEADING_METADATA_MAP = { + 1: 'title' + } + + This allows Markdown files like:: + + Date: 2023-12-01 + Category: tech + + # My Article Title + + Content goes here... + + Their titles are then extracted automatically without requiring + explicit ``Title:`` metadata. + +.. _`group name notation`: + https://docs.python.org/3/library/re.html#regular-expression-syntax The default is ``{}``. diff --git a/pelican/__init__.py b/pelican/__init__.py index a69e574b..19b6605c 100644 --- a/pelican/__init__.py +++ b/pelican/__init__.py @@ -255,7 +255,7 @@ class PrintSettings(argparse.Action): init_logging(name=__name__) try: - instance, settings = get_instance(namespace) + _instance, settings = get_instance(namespace) except Exception as e: logger.critical("%s", e.__class__.__name__, exc_info=True) console.print_exception() diff --git a/pelican/readers.py b/pelican/readers.py index 508d655f..9a3559ba 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -341,18 +341,115 @@ class MarkdownReader(BaseReader): output[name] = self.process_metadata(name, value[0]) return output + def _extract_heading_metadata(self, text, source_path): + """Extract metadata from Markdown headings based on configuration.
+ + Args: + text (str): Raw Markdown text + source_path (str): Path to source file for error reporting + + Returns: + tuple: (extracted_metadata, lines_to_remove) + - extracted_metadata (dict): Extracted metadata from headings + - lines_to_remove (set): 1-based line numbers of headings used for metadata + """ + metadata = OrderedDict() + lines_to_remove = set() + + # Get configuration + heading_map = self.settings.get("HEADING_METADATA_MAP", {}) + heading_patterns = self.settings.get("HEADING_METADATA_PATTERNS", {}) + + # Extract metadata using both level mapping and custom patterns + lines = text.split("\n") + for line_num, line in enumerate(lines, 1): + custom_matched = False + + # Try custom patterns first + for field_name, pattern in heading_patterns.items(): + match = re.match(pattern, line.strip()) + if match: + value = match.group(1).strip() + if field_name not in metadata: # Only keep first occurrence + metadata[field_name] = [value] + lines_to_remove.add(line_num) + custom_matched = True + break + + # Fall back to level mapping only when no custom pattern matched + if heading_map and not custom_matched: + # Match heading levels (# ## ### etc.)
+ heading_match = re.match(r"^(#{1,6})\s+(.+)$", line.strip()) + if heading_match: + level = len(heading_match.group(1)) + title = heading_match.group(2).strip() + + # Map level to field name + field_name = heading_map.get(level) + if field_name and field_name not in metadata: + metadata[field_name] = [title] + lines_to_remove.add(line_num) + + # Process extracted metadata through existing processors + processed_metadata = {} + metadata_processors = self.settings.get("METADATA_PROCESSORS", {}) + + for field_name, values in metadata.items(): + if isinstance(values, list) and len(values) == 1: + values = values[0] + + # Apply metadata processor if available + if field_name in metadata_processors: + processor = metadata_processors[field_name] + try: + values = processor(values, self.settings) + except (ValueError, TypeError, AttributeError) as e: + logger.warning( + f"Metadata processor for '{field_name}' failed on " + f"'{values}' in '{source_path}': {e}" + ) + + processed_metadata[field_name] = values + + return processed_metadata, lines_to_remove + def read(self, source_path): """Parse content and metadata of markdown files""" self._source_path = source_path self._md = Markdown(**self.settings["MARKDOWN"]) with pelican_open(source_path) as text: + # Extract heading metadata before markdown conversion + heading_metadata = {} + lines_to_remove = set() + if self.settings.get("HEADING_METADATA", False): + heading_metadata, lines_to_remove = self._extract_heading_metadata( + text, source_path + ) + + # Remove the heading lines that were used for metadata extraction + if lines_to_remove: + text_lines = text.split("\n") + # Filter out the lines that were used for metadata + filtered_lines = [ + line + for line_num, line in enumerate(text_lines, 1) + if line_num not in lines_to_remove + ] + text = "\n".join(filtered_lines) + content = self._md.convert(text) if hasattr(self._md, "Meta"): metadata = self._parse_metadata(self._md.Meta) else: metadata = {} + + # Merge heading 
metadata with regular metadata + # Regular metadata takes precedence over heading metadata + heading_metadata.update(metadata) + metadata = heading_metadata + return content, metadata def disabled_message(self) -> str: @@ -805,7 +902,7 @@ def parse_path_metadata(source_path, settings=None, process=None): """ metadata = {} dirname, basename = os.path.split(source_path) - base, ext = os.path.splitext(basename) + base, _ext = os.path.splitext(basename) subdir = os.path.basename(dirname) if settings: checks = [] diff --git a/pelican/settings.py b/pelican/settings.py index 45957fd0..f3425d1a 100644 --- a/pelican/settings.py +++ b/pelican/settings.py @@ -177,6 +177,19 @@ DEFAULT_CONFIG = { "CHECK_MODIFIED_METHOD": "mtime", "LOAD_CONTENT_CACHE": False, "FORMATTED_FIELDS": ["summary"], + "HEADING_METADATA": False, + "HEADING_METADATA_MAP": { + 1: "title", # # Heading → title + 2: "subtitle", # ## Heading → subtitle + 3: "summary", # ### Heading → summary + }, + "HEADING_METADATA_PATTERNS": { + "author": r"^###\s+Author[:\s]+(.+)$", + "date": r"^###\s+Date[:\s]+(.+)$", + "title": r"^#\s+(.+)$", + "subtitle": r"^##\s+(.+)$", + "summary": r"^###\s+(.+)$", + }, "PORT": 8000, "BIND": "127.0.0.1", } @@ -242,7 +255,7 @@ def get_settings_from_module(module: ModuleType | None = None) -> Settings: def get_settings_from_file(path: str) -> Settings: """Loads settings from a file path, returning a dict.""" - name, ext = os.path.splitext(os.path.basename(path)) + name, _ext = os.path.splitext(os.path.basename(path)) module = load_source(name, path) return get_settings_from_module(module) diff --git a/pelican/tests/test_readers.py b/pelican/tests/test_readers.py index 751088f7..e171ef10 100644 --- a/pelican/tests/test_readers.py +++ b/pelican/tests/test_readers.py @@ -244,7 +244,7 @@ class RstReaderTest(ReaderTest): def test_article_metadata_key_lowercase(self): # Keys of metadata should be lowercase. 
reader = readers.RstReader(settings=get_settings()) - content, metadata = reader.read(_path("article_with_uppercase_metadata.rst")) + _content, metadata = reader.read(_path("article_with_uppercase_metadata.rst")) self.assertIn("category", metadata, "Key should be lowercase.") self.assertEqual("Yeah", metadata.get("category"), "Value keeps case.") @@ -627,7 +627,7 @@ class RstReaderTest(ReaderTest): class MdReaderTest(ReaderTest): def test_article_with_metadata(self): reader = readers.MarkdownReader(settings=get_settings()) - content, metadata = reader.read(_path("article_with_md_extension.md")) + _, metadata = reader.read(_path("article_with_md_extension.md")) expected = { "category": "test", "title": "Test md File", @@ -638,7 +638,7 @@ class MdReaderTest(ReaderTest): } self.assertDictHasSubset(metadata, expected) - content, metadata = reader.read( + _content, metadata = reader.read( _path("article_with_markdown_and_nonascii_summary.md") ) expected = { @@ -700,7 +700,7 @@ class MdReaderTest(ReaderTest): reader = readers.MarkdownReader(settings=get_settings()) # test to ensure the md file extension is being processed by the # correct reader - content, metadata = reader.read(_path("article_with_md_extension.md")) + content, _ = reader.read(_path("article_with_md_extension.md")) expected = ( "

Test Markdown File Header

\n" "

Used for pelican test

\n" @@ -709,7 +709,7 @@ class MdReaderTest(ReaderTest): self.assertEqual(content, expected) # test to ensure the mkd file extension is being processed by the # correct reader - content, metadata = reader.read(_path("article_with_mkd_extension.mkd")) + content, _ = reader.read(_path("article_with_mkd_extension.mkd")) expected = ( "

Test Markdown File Header

\n

Used for pelican" " test

\n

This is another markdown test file. Uses" @@ -718,9 +718,7 @@ class MdReaderTest(ReaderTest): self.assertEqual(content, expected) # test to ensure the markdown file extension is being processed by the # correct reader - content, metadata = reader.read( - _path("article_with_markdown_extension.markdown") - ) + content, _ = reader.read(_path("article_with_markdown_extension.markdown")) expected = ( "

Test Markdown File Header

\n

Used for pelican" " test

\n

This is another markdown test file. Uses" @@ -729,7 +727,7 @@ class MdReaderTest(ReaderTest): self.assertEqual(content, expected) # test to ensure the mdown file extension is being processed by the # correct reader - content, metadata = reader.read(_path("article_with_mdown_extension.mdown")) + content, _metadata = reader.read(_path("article_with_mdown_extension.mdown")) expected = ( "

Test Markdown File Header

\n

Used for pelican" " test

\n

This is another markdown test file. Uses" @@ -825,7 +823,9 @@ class MdReaderTest(ReaderTest): def test_duplicate_tags_or_authors_are_removed(self): reader = readers.MarkdownReader(settings=get_settings()) - content, metadata = reader.read(_path("article_with_duplicate_tags_authors.md")) + _content, metadata = reader.read( + _path("article_with_duplicate_tags_authors.md") + ) expected = { "tags": ["foo", "bar", "foobar"], "authors": ["Author, First", "Author, Second"], @@ -837,7 +837,7 @@ class MdReaderTest(ReaderTest): settings["FORMATTED_FIELDS"] = ["summary"] reader = readers.MarkdownReader(settings=settings) - content, metadata = reader.read( + _content, metadata = reader.read( _path("article_with_markdown_and_nested_metadata.md") ) expected = {