From 45e7c1270eefe740051175b3e96ed5718d1e369a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 11 Jul 2025 16:17:18 +0200 Subject: [PATCH 01/85] implement md-header-splitter and add tests --- .../preprocessors/markdown_header_splitter.py | 298 ++++++++++++++++++ .../test_markdown_header_splitter.py | 87 +++++ 2 files changed, 385 insertions(+) create mode 100644 haystack/components/preprocessors/markdown_header_splitter.py create mode 100644 test/components/preprocessors/test_markdown_header_splitter.py diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py new file mode 100644 index 0000000000..91247237e9 --- /dev/null +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -0,0 +1,298 @@ +import logging +import re +from typing import Any, Dict, List, Literal, Optional + +from haystack import Document, component, default_from_dict, default_to_dict +from haystack.components.preprocessors import DocumentSplitter + +logger = logging.getLogger(__name__) + + +class CustomDocumentSplitter(DocumentSplitter): + """ + Custom DocumentSplitter that supports splitting functions returning dicts with 'content' and 'meta'. 
+ """ + + def __init__(self, *args, page_break_character="\\f", **kwargs): + super().__init__(*args, **kwargs) + self.page_break_character = page_break_character + + def _flatten_dict(self, d: Dict, prefix: str = "", target_dict: Optional[Dict] = None) -> Dict: + """Helper method to flatten a nested dictionary.""" + if target_dict is None: + target_dict = {} + + for key, value in d.items(): + new_key = f"{prefix}{key}" if prefix else key + + if isinstance(value, dict): + self._flatten_dict(value, f"{new_key}_", target_dict) + else: + target_dict[new_key] = value + + return target_dict + + def _process_split_content(self, split_content: str, split_index: int) -> int: + """Process the content of a split and return the number of page breaks.""" + if not isinstance(split_content, str): + return 0 + + page_breaks = split_content.count(self.page_break_character) + if page_breaks > 0: + logger.debug(f"Found {page_breaks} page breaks in split {split_index}") + return page_breaks + + def _split_by_function(self, doc: Document) -> List[Document]: + """Split document using a custom function that returns dictionaries with 'content' and 'meta'.""" + logger.debug(f"Splitting document with id={doc.id}") + splits = self.splitting_function(doc.content) + docs = [] + + # calculate total pages and set current page + total_pages = doc.meta.get("total_pages", 0) or doc.content.count(self.page_break_character) + 1 + current_page = doc.meta.get("page_number", 1) + logger.debug(f"Starting page number: {current_page}, Total pages: {total_pages}") + + # get meta for each split + for i, split in enumerate(splits): + meta = {} + if doc.meta: + meta = self._flatten_dict(doc.meta) + + # add standard metadata + meta.update({"source_id": doc.id, "split_id": i, "total_pages": total_pages, "page_number": current_page}) + + # get page number based on page breaks + page_breaks = self._process_split_content(split["content"], i) + current_page += page_breaks + + # add split-specific metadata + if 
split.get("meta"): + meta.update(self._flatten_dict(split.get("meta"))) + + docs.append(Document(content=split["content"], meta=meta)) + + logger.debug(f"Split into {len(docs)} documents for id={doc.id}, final page: {current_page}") + return docs + + +@component +class MarkdownHeaderSplitter: + """ + A custom component that splits documents at markdown headers with optional secondary splitting. + + :param enforce_first_header: If True, ensures the first header is always included in the parent headers. + This is useful for docling outputs where header levels are uniformly detected and the first header + is often overwritten. Defaults to False. + :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). + :param secondary_split: Optional secondary split condition after header splitting. + Options are "none", "word", "passage", "period", "line". Defaults to "none". + :param split_length: The maximum number of units in each split when using secondary splitting. Defaults to 200. + :param split_overlap: The number of overlapping units for each split when using secondary splitting. Defaults to 0. + :param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0. 
+ """ + + def __init__( + self, + enforce_first_header: bool = False, + page_break_character: str = "\\f", + secondary_split: Literal["none", "word", "passage", "period", "line"] = "none", + split_length: int = 200, + split_overlap: int = 0, + split_threshold: int = 0, + ): + self.enforce_first_header = enforce_first_header + self.page_break_character = page_break_character + self.secondary_split = secondary_split + self.split_length = split_length + self.split_overlap = split_overlap + self.split_threshold = split_threshold + + def _split_by_markdown_headers(self, text: str, enforce_first_header: Optional[bool] = None) -> List[Dict]: + """Split text by markdown headers and create chunks with appropriate metadata.""" + logger.debug("Splitting text by markdown headers") + + # find headers + pattern = r"(?m)^(#{1,6}) (.+)$" + matches = list(re.finditer(pattern, text)) + + # return unsplit if no headers found + if not matches: + logger.info("No headers found in document; returning full document as single chunk.") + return [{"content": text, "meta": {"header": None, "parentheaders": []}}] + + # process headers and build chunks + chunks = [] + header_stack = [None] * 6 + active_parents = [] + first_header = matches[0].group(2).strip() + + for i, match in enumerate(matches): + # Extract header info + header_prefix = match.group(1) + header_text = match.group(2).strip() + level = len(header_prefix) + + # get content + start = match.end() + end = matches[i + 1].start() if i + 1 < len(matches) else len(text) + content = text[start:end].strip() + + # update header stack to track nesting + header_stack[level - 1] = header_text + for j in range(level, 6): + header_stack[j] = None + + # skip splits w/o content + if not content: + # Add as parent for subsequent headers + active_parents = [h for h in header_stack[: level - 1] if h] + active_parents.append(header_text) + continue + + # get parent headers + parentheaders = list(active_parents) + + # enforce first header if needed + 
if enforce_first_header and first_header and (not parentheaders or parentheaders[0] != first_header): + parentheaders = [first_header] + [h for h in parentheaders if h != first_header] + + logger.debug(f"Creating chunk for header '{header_text}' at level {level}") + + chunks.append( + { + "content": f"{header_prefix} {header_text}\n{content}", + "meta": {"header": header_text, "parentheaders": parentheaders}, + } + ) + + # reset active parents + active_parents = [h for h in header_stack[: level - 1] if h] + + logger.info(f"Split into {len(chunks)} chunks by markdown headers.") + return chunks + + def _apply_secondary_splitting(self, documents: List[Document]) -> List[Document]: + """ + Apply secondary splitting while preserving header metadata and structure. + + Ensures page counting is maintained across splits. + """ + if self.secondary_split == "none": + return documents + + logger.info(f"Applying secondary splitting by {self.secondary_split}") + result_docs = [] + + for doc in documents: + # extract header information + header_match = re.search(r"(#{1,6}) (.+)(?:\n|$)", doc.content) + if header_match: + header_prefix = header_match.group(0) + "\n" + content_for_splitting = doc.content[header_match.end() :] + else: + header_prefix = "" + content_for_splitting = doc.content + + if not content_for_splitting.strip(): # skip empty content + result_docs.append(doc) + continue + + # track page from meta + current_page = doc.meta.get("page_number", 1) + + secondary_splitter = DocumentSplitter( + split_by=self.secondary_split, + split_length=self.split_length, + split_overlap=self.split_overlap, + split_threshold=self.split_threshold, + ) + + # apply secondary splitting + temp_doc = Document(content=content_for_splitting, meta=doc.meta) + secondary_splits = secondary_splitter.run(documents=[temp_doc])["documents"] + parent_headers = doc.meta.get("parentheaders", []) + first_header = parent_headers[0] if parent_headers else None + accumulated_page_breaks = 0 # track page 
breaks + + # split processing + for i, split in enumerate(secondary_splits): + # calculate page number for this split + if i > 0: # page break counting + prev_content = secondary_splits[i - 1].content + page_breaks = prev_content.count(self.page_break_character) + accumulated_page_breaks += page_breaks + + # set page number to meta + split.meta["page_number"] = current_page + accumulated_page_breaks + + if header_prefix: # add header prefix to content + split.content = header_prefix + split.content + + # preserve header metadata + for key in ["header", "parentheaders"]: + if key in doc.meta: + split.meta[key] = doc.meta[key] + + # enforce first header if needed + if self.enforce_first_header and first_header: + parentheaders = split.meta.get("parentheaders", []) + if not parentheaders: + split.meta["parentheaders"] = [first_header] + elif parentheaders[0] != first_header: + split.meta["parentheaders"] = [first_header] + [h for h in parentheaders if h != first_header] + # preserve primary split ID + if "split_id" in doc.meta: + split.meta["header_split_id"] = doc.meta["split_id"] + + result_docs.append(split) + + logger.info(f"Secondary splitting complete. Final count: {len(result_docs)} documents.") + return result_docs + + @component.output_types(documents=List[Document]) + def run(self, documents: List[Document], enforce_first_header: Optional[bool] = None) -> Dict[str, List[Document]]: + """ + Run the markdown header splitter with optional secondary splitting. + + :param documents: List of documents to split + :param enforce_first_header: If True, ensures the first header is included in all parentheaders. + If None, uses the value from initialization. 
+ """ + logger.info(f"Processing {len(documents)} documents with enforce_first_header={enforce_first_header}") + + # split by markdown headers + header_splitter = CustomDocumentSplitter( + split_by="function", + splitting_function=lambda text: self._split_by_markdown_headers(text, enforce_first_header), + page_break_character=self.page_break_character, + ) + + # get splits + header_split_docs = header_splitter.run(documents=documents)["documents"] + logger.info(f"Header splitting produced {len(header_split_docs)} documents") + + # apply secondary splitting if requested + if self.secondary_split != "none": + final_docs = self._apply_secondary_splitting(header_split_docs) + else: + final_docs = header_split_docs + + return {"documents": final_docs} + + def to_dict(self) -> Dict[str, Any]: + """Serialize component to dictionary.""" + return default_to_dict( + self, + enforce_first_header=self.enforce_first_header, + page_break_character=self.page_break_character, + secondary_split=self.secondary_split, + split_length=self.split_length, + split_overlap=self.split_overlap, + split_threshold=self.split_threshold, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "MarkdownHeaderSplitter": + """Deserialize component from dictionary.""" + return default_from_dict(cls, data) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py new file mode 100644 index 0000000000..7ae3aeb039 --- /dev/null +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -0,0 +1,87 @@ +import pytest +from haystack import Document + +from deepset_cloud_custom_nodes.splitters.markdown_header_splitter import ( + MarkdownHeaderSplitter, +) + + +@pytest.fixture +def sample_text(): + return ( + "# Header 1\n" + "Content under header 1.\n" + "## Header 1.1\n" + "### Subheader 1.1.1\n" + "Content under sub-header 1.1.1\n" + "## Header 1.2\n" + "### Subheader 1.2.1\n" + "Content under header 
1.2.1.\n" + "### Subheader 1.2.2\n" + "Content under header 1.2.2.\n" + "### Subheader 1.2.3\n" + "Content under header 1.2.3." + ) + + +def test_basic_split(sample_text): + splitter = MarkdownHeaderSplitter() + docs = [Document(content=sample_text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + + # Should split into all headers with content + headers = [doc.meta["header"] for doc in split_docs] + assert "Header 1" in headers + assert "Subheader 1.1.1" in headers + assert "Subheader 1.2.1" in headers + assert "Subheader 1.2.2" in headers + assert "Subheader 1.2.3" in headers + + # Check that content is present and correct + for doc in split_docs: + assert doc.content.startswith("#") or doc.content.startswith("##") or doc.content.startswith("###") + assert doc.meta.get("header") is not None + + +def test_parentheaders(sample_text): + splitter = MarkdownHeaderSplitter() + docs = [Document(content=sample_text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + + # Find a subheader and check parentheaders + subheader_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.2") + assert "Header 1" in subheader_doc.meta["parentheaders"] + assert "Header 1.2" in subheader_doc.meta["parentheaders"] + + +def test_enforce_first_header(sample_text): + splitter = MarkdownHeaderSplitter(enforce_first_header=True) + docs = [Document(content=sample_text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + + # All parentheaders should start with the first header + first_header = "Header 1" + for doc in split_docs: + if doc.meta["parentheaders"]: + assert doc.meta["parentheaders"][0] == first_header + + +def test_no_headers(): + splitter = MarkdownHeaderSplitter() + docs = [Document(content="Just some text without headers.")] + result = splitter.run(documents=docs) + assert len(result["documents"]) == 1 + + +def test_multiple_documents(sample_text): + splitter = 
MarkdownHeaderSplitter() + docs = [ + Document(content=sample_text), + Document(content="# Another Header\nSome content."), + ] + result = splitter.run(documents=docs) + split_docs = result["documents"] + assert any(doc.meta["header"] == "Another Header" for doc in split_docs) From edfd644a6fece9714b6ec67248920707ed52fda9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 29 Jul 2025 15:49:45 +0200 Subject: [PATCH 02/85] rework md-header splitter to rewrite md-header levels --- .../preprocessors/markdown_header_splitter.py | 161 +++++++++++++++--- .../test_markdown_header_splitter.py | 11 +- 2 files changed, 137 insertions(+), 35 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 91247237e9..c51ff606cf 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -81,9 +81,8 @@ class MarkdownHeaderSplitter: """ A custom component that splits documents at markdown headers with optional secondary splitting. - :param enforce_first_header: If True, ensures the first header is always included in the parent headers. - This is useful for docling outputs where header levels are uniformly detected and the first header - is often overwritten. Defaults to False. + :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure. + Useful for documents where all headers use the same level. Defaults to False. :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). :param secondary_split: Optional secondary split condition after header splitting. Options are "none", "word", "passage", "period", "line". Defaults to "none". 
@@ -94,21 +93,94 @@ class MarkdownHeaderSplitter: def __init__( self, - enforce_first_header: bool = False, + infer_header_levels: bool = False, page_break_character: str = "\\f", secondary_split: Literal["none", "word", "passage", "period", "line"] = "none", split_length: int = 200, split_overlap: int = 0, split_threshold: int = 0, ): - self.enforce_first_header = enforce_first_header + self.infer_header_levels = infer_header_levels self.page_break_character = page_break_character self.secondary_split = secondary_split self.split_length = split_length self.split_overlap = split_overlap self.split_threshold = split_threshold - def _split_by_markdown_headers(self, text: str, enforce_first_header: Optional[bool] = None) -> List[Dict]: + def _infer_and_rewrite_header_levels(self, text: str) -> str: + """ + Infer and rewrite header levels in the markdown text. + + This function analyzes the document structure to infer proper header levels: + - First header is always level 1 + - If there's content between headers, the next header stays at the same level + - If there's no content between headers, the next header goes one level deeper + - Header levels never exceed 6 (the maximum in markdown) + + This is useful for documents where all headers are at the same level, such as + output from document conversion tools like docling. 
+ """ + logger.debug("Inferring and rewriting header levels") + + # find headers + pattern = r"(?m)^(#{1,6}) (.+)$" + matches = list(re.finditer(pattern, text)) + + if not matches: + logger.info("No headers found in document; skipping header level inference.") + return text + + modified_text = text + offset = 0 # track offset due to length changes in headers + + # track header structure + current_level = 1 + header_stack = [1] # always start with level 1 + + for i, match in enumerate(matches): + original_header = match.group(0) + header_text = match.group(2).strip() + + # check if there's content between this header and the previous one + has_content = False + if i > 0: + prev_end = matches[i - 1].end() + current_start = match.start() + content_between = text[prev_end:current_start].strip() + has_content = bool(content_between) + + # first header is always level 1 + if i == 0: + inferred_level = 1 + elif has_content: + # stay at the same level if there's content + inferred_level = current_level + else: + # go one level deeper if there's no content + inferred_level = min(current_level + 1, 6) + + # update tracking variables + current_level = inferred_level + header_stack = header_stack[:inferred_level] + while len(header_stack) < inferred_level: + header_stack.append(1) + + # new header with inferred level + new_prefix = "#" * inferred_level + new_header = f"{new_prefix} {header_text}" + + # replace old header + start_pos = match.start() + offset + end_pos = match.end() + offset + modified_text = modified_text[:start_pos] + new_header + modified_text[end_pos:] + + # update offset + offset += len(new_header) - len(original_header) + + logger.info(f"Rewrote {len(matches)} headers with inferred levels.") + return modified_text + + def _split_by_markdown_headers(self, text: str) -> List[Dict]: """Split text by markdown headers and create chunks with appropriate metadata.""" logger.debug("Splitting text by markdown headers") @@ -125,10 +197,9 @@ def 
_split_by_markdown_headers(self, text: str, enforce_first_header: Optional[b chunks = [] header_stack = [None] * 6 active_parents = [] - first_header = matches[0].group(2).strip() for i, match in enumerate(matches): - # Extract header info + # extract header info header_prefix = match.group(1) header_text = match.group(2).strip() level = len(header_prefix) @@ -153,10 +224,6 @@ def _split_by_markdown_headers(self, text: str, enforce_first_header: Optional[b # get parent headers parentheaders = list(active_parents) - # enforce first header if needed - if enforce_first_header and first_header and (not parentheaders or parentheaders[0] != first_header): - parentheaders = [first_header] + [h for h in parentheaders if h != first_header] - logger.debug(f"Creating chunk for header '{header_text}' at level {level}") chunks.append( @@ -211,8 +278,6 @@ def _apply_secondary_splitting(self, documents: List[Document]) -> List[Document # apply secondary splitting temp_doc = Document(content=content_for_splitting, meta=doc.meta) secondary_splits = secondary_splitter.run(documents=[temp_doc])["documents"] - parent_headers = doc.meta.get("parentheaders", []) - first_header = parent_headers[0] if parent_headers else None accumulated_page_breaks = 0 # track page breaks # split processing @@ -234,13 +299,6 @@ def _apply_secondary_splitting(self, documents: List[Document]) -> List[Document if key in doc.meta: split.meta[key] = doc.meta[key] - # enforce first header if needed - if self.enforce_first_header and first_header: - parentheaders = split.meta.get("parentheaders", []) - if not parentheaders: - split.meta["parentheaders"] = [first_header] - elif parentheaders[0] != first_header: - split.meta["parentheaders"] = [first_header] + [h for h in parentheaders if h != first_header] # preserve primary split ID if "split_id" in doc.meta: split.meta["header_split_id"] = doc.meta["split_id"] @@ -251,25 +309,34 @@ def _apply_secondary_splitting(self, documents: List[Document]) -> 
List[Document return result_docs @component.output_types(documents=List[Document]) - def run(self, documents: List[Document], enforce_first_header: Optional[bool] = None) -> Dict[str, List[Document]]: + def run(self, documents: List[Document], infer_header_levels: Optional[bool] = None) -> Dict[str, List[Document]]: """ Run the markdown header splitter with optional secondary splitting. :param documents: List of documents to split - :param enforce_first_header: If True, ensures the first header is included in all parentheaders. + :param infer_header_levels: If True, attempts to infer and rewrite header levels before splitting. If None, uses the value from initialization. """ - logger.info(f"Processing {len(documents)} documents with enforce_first_header={enforce_first_header}") + infer_header_levels = infer_header_levels if infer_header_levels is not None else self.infer_header_levels + + # process documents - preprocess if told to + processed_documents = [] + for doc in documents: + if infer_header_levels: + content = self._infer_and_rewrite_header_levels(doc.content) + processed_documents.append(Document(content=content, meta=doc.meta, id=doc.id)) + else: + processed_documents.append(doc) # split by markdown headers header_splitter = CustomDocumentSplitter( split_by="function", - splitting_function=lambda text: self._split_by_markdown_headers(text, enforce_first_header), + splitting_function=lambda text: self._split_by_markdown_headers(text), page_break_character=self.page_break_character, ) # get splits - header_split_docs = header_splitter.run(documents=documents)["documents"] + header_split_docs = header_splitter.run(documents=processed_documents)["documents"] logger.info(f"Header splitting produced {len(header_split_docs)} documents") # apply secondary splitting if requested @@ -284,7 +351,7 @@ def to_dict(self) -> Dict[str, Any]: """Serialize component to dictionary.""" return default_to_dict( self, - enforce_first_header=self.enforce_first_header, + 
infer_header_levels=self.infer_header_levels, page_break_character=self.page_break_character, secondary_split=self.secondary_split, split_length=self.split_length, @@ -296,3 +363,43 @@ def to_dict(self) -> Dict[str, Any]: def from_dict(cls, data: Dict[str, Any]) -> "MarkdownHeaderSplitter": """Deserialize component from dictionary.""" return default_from_dict(cls, data) + + +# TODO: move to proper test file once ready +if __name__ == "__main__": + print() + print("===== Example 1: Regular splitting =====") + splitter = MarkdownHeaderSplitter() + content = """# Header 1 +## Subheader 1.1 +Content under subheader 1.1. +## Subheader 1.2 +### Subheader 1.2.1 +Content under subheader 1.2.1.""" + print("Original content:") + print(content) + example_doc = Document(content=content) + result = splitter.run(documents=[example_doc]) + for doc in result["documents"]: + print("\n---Document---") + print(doc.content) + print(doc.meta) + + print() + print("===== Example 2: Splitting with header inference =====") + splitter = MarkdownHeaderSplitter(infer_header_levels=True) + content = """## Header 1 +## Subheader 1.1 +Content under subheader 1.1. 
+## Subheader 1.2 +## Subheader 1.2.1 +Content under subheader 1.2.1.""" + print("Original content:") + print(content) + example_doc = Document(content=content) + result = splitter.run(documents=[example_doc]) + print("\nAfter header inference and splitting:") + for doc in result["documents"]: + print("\n---Document---") + print(doc.content) + print(doc.meta) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 7ae3aeb039..4207ea7b8c 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -1,9 +1,7 @@ import pytest -from haystack import Document -from deepset_cloud_custom_nodes.splitters.markdown_header_splitter import ( - MarkdownHeaderSplitter, -) +from haystack import Document +from haystack.components.preprocessors.markdown_header_splitter import MarkdownHeaderSplitter @pytest.fixture @@ -78,10 +76,7 @@ def test_no_headers(): def test_multiple_documents(sample_text): splitter = MarkdownHeaderSplitter() - docs = [ - Document(content=sample_text), - Document(content="# Another Header\nSome content."), - ] + docs = [Document(content=sample_text), Document(content="# Another Header\nSome content.")] result = splitter.run(documents=docs) split_docs = result["documents"] assert any(doc.meta["header"] == "Another Header" for doc in split_docs) From cd55f132729bc26a2b3402fe3794ff357cf29a3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 29 Jul 2025 15:53:54 +0200 Subject: [PATCH 03/85] remove deprecated test --- .../preprocessors/test_markdown_header_splitter.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 4207ea7b8c..89a551bd2a 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py 
+++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -54,19 +54,6 @@ def test_parentheaders(sample_text): assert "Header 1.2" in subheader_doc.meta["parentheaders"] -def test_enforce_first_header(sample_text): - splitter = MarkdownHeaderSplitter(enforce_first_header=True) - docs = [Document(content=sample_text)] - result = splitter.run(documents=docs) - split_docs = result["documents"] - - # All parentheaders should start with the first header - first_header = "Header 1" - for doc in split_docs: - if doc.meta["parentheaders"]: - assert doc.meta["parentheaders"][0] == first_header - - def test_no_headers(): splitter = MarkdownHeaderSplitter() docs = [Document(content="Just some text without headers.")] From dafe1bdefcf18e3a55cbd5201ea66aa2336026d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= <45487933+OGuggenbuehl@users.noreply.github.com> Date: Tue, 9 Sep 2025 14:32:09 +0200 Subject: [PATCH 04/85] Update haystack/components/preprocessors/markdown_header_splitter.py use haystack logging Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- haystack/components/preprocessors/markdown_header_splitter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index c51ff606cf..7104eea0f5 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -1,8 +1,7 @@ -import logging import re from typing import Any, Dict, List, Literal, Optional -from haystack import Document, component, default_from_dict, default_to_dict +from haystack import Document, component, default_from_dict, default_to_dict, logging from haystack.components.preprocessors import DocumentSplitter logger = logging.getLogger(__name__) From 6da2513017821b4c4cc173e54d2925d23dc544e5 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 14:35:26 +0200 Subject: [PATCH 05/85] use native types --- .../preprocessors/markdown_header_splitter.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 7104eea0f5..994c587f6f 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -1,5 +1,5 @@ import re -from typing import Any, Dict, List, Literal, Optional +from typing import Any, Literal, Optional from haystack import Document, component, default_from_dict, default_to_dict, logging from haystack.components.preprocessors import DocumentSplitter @@ -16,7 +16,7 @@ def __init__(self, *args, page_break_character="\\f", **kwargs): super().__init__(*args, **kwargs) self.page_break_character = page_break_character - def _flatten_dict(self, d: Dict, prefix: str = "", target_dict: Optional[Dict] = None) -> Dict: + def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = None) -> dict: """Helper method to flatten a nested dictionary.""" if target_dict is None: target_dict = {} @@ -41,7 +41,7 @@ def _process_split_content(self, split_content: str, split_index: int) -> int: logger.debug(f"Found {page_breaks} page breaks in split {split_index}") return page_breaks - def _split_by_function(self, doc: Document) -> List[Document]: + def _split_by_function(self, doc: Document) -> list[Document]: """Split document using a custom function that returns dictionaries with 'content' and 'meta'.""" logger.debug(f"Splitting document with id={doc.id}") splits = self.splitting_function(doc.content) @@ -179,7 +179,7 @@ def _infer_and_rewrite_header_levels(self, text: str) -> str: logger.info(f"Rewrote {len(matches)} headers with inferred levels.") return modified_text - def 
_split_by_markdown_headers(self, text: str) -> List[Dict]: + def _split_by_markdown_headers(self, text: str) -> list[dict]: """Split text by markdown headers and create chunks with appropriate metadata.""" logger.debug("Splitting text by markdown headers") @@ -238,7 +238,7 @@ def _split_by_markdown_headers(self, text: str) -> List[Dict]: logger.info(f"Split into {len(chunks)} chunks by markdown headers.") return chunks - def _apply_secondary_splitting(self, documents: List[Document]) -> List[Document]: + def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document]: """ Apply secondary splitting while preserving header metadata and structure. @@ -307,8 +307,8 @@ def _apply_secondary_splitting(self, documents: List[Document]) -> List[Document logger.info(f"Secondary splitting complete. Final count: {len(result_docs)} documents.") return result_docs - @component.output_types(documents=List[Document]) - def run(self, documents: List[Document], infer_header_levels: Optional[bool] = None) -> Dict[str, List[Document]]: + @component.output_types(documents=list[Document]) + def run(self, documents: list[Document], infer_header_levels: Optional[bool] = None) -> dict[str, list[Document]]: """ Run the markdown header splitter with optional secondary splitting. 
@@ -346,7 +346,7 @@ def run(self, documents: List[Document], infer_header_levels: Optional[bool] = N return {"documents": final_docs} - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> dict[str, Any]: """Serialize component to dictionary.""" return default_to_dict( self, @@ -359,7 +359,7 @@ def to_dict(self) -> Dict[str, Any]: ) @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "MarkdownHeaderSplitter": + def from_dict(cls, data: dict[str, Any]) -> "MarkdownHeaderSplitter": """Deserialize component from dictionary.""" return default_from_dict(cls, data) From 96e616c7bbb5eae1c474656163179555176caf78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 14:42:47 +0200 Subject: [PATCH 06/85] move to haystack logging --- .../preprocessors/markdown_header_splitter.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 994c587f6f..0aa92d5d5f 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -38,19 +38,27 @@ def _process_split_content(self, split_content: str, split_index: int) -> int: page_breaks = split_content.count(self.page_break_character) if page_breaks > 0: - logger.debug(f"Found {page_breaks} page breaks in split {split_index}") + logger.debug( + "Found {page_breaks} page breaks in split {split_index}", + page_breaks=page_breaks, + split_index=split_index, + ) return page_breaks def _split_by_function(self, doc: Document) -> list[Document]: """Split document using a custom function that returns dictionaries with 'content' and 'meta'.""" - logger.debug(f"Splitting document with id={doc.id}") + logger.debug("Splitting document with id={doc_id}", doc_id=doc.id) splits = self.splitting_function(doc.content) docs = [] # calculate total pages and set 
current page total_pages = doc.meta.get("total_pages", 0) or doc.content.count(self.page_break_character) + 1 current_page = doc.meta.get("page_number", 1) - logger.debug(f"Starting page number: {current_page}, Total pages: {total_pages}") + logger.debug( + "Starting page number: {current_page}, Total pages: {total_pages}", + current_page=current_page, + total_pages=total_pages, + ) # get meta for each split for i, split in enumerate(splits): @@ -71,7 +79,12 @@ def _split_by_function(self, doc: Document) -> list[Document]: docs.append(Document(content=split["content"], meta=meta)) - logger.debug(f"Split into {len(docs)} documents for id={doc.id}, final page: {current_page}") + logger.debug( + "Split into {num_docs} documents for id={doc_id}, final page: {current_page}", + num_docs=len(docs), + doc_id=doc.id, + current_page=current_page, + ) return docs @@ -176,7 +189,7 @@ def _infer_and_rewrite_header_levels(self, text: str) -> str: # update offset offset += len(new_header) - len(original_header) - logger.info(f"Rewrote {len(matches)} headers with inferred levels.") + logger.info("Rewrote {num_headers} headers with inferred levels.", num_headers=len(matches)) return modified_text def _split_by_markdown_headers(self, text: str) -> list[dict]: @@ -223,7 +236,9 @@ def _split_by_markdown_headers(self, text: str) -> list[dict]: # get parent headers parentheaders = list(active_parents) - logger.debug(f"Creating chunk for header '{header_text}' at level {level}") + logger.debug( + "Creating chunk for header '{header_text}' at level {level}", header_text=header_text, level=level + ) chunks.append( { @@ -235,7 +250,7 @@ def _split_by_markdown_headers(self, text: str) -> list[dict]: # reset active parents active_parents = [h for h in header_stack[: level - 1] if h] - logger.info(f"Split into {len(chunks)} chunks by markdown headers.") + logger.info("Split into {num_chunks} chunks by markdown headers.", num_chunks=len(chunks)) return chunks def 
_apply_secondary_splitting(self, documents: list[Document]) -> list[Document]: @@ -247,7 +262,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document if self.secondary_split == "none": return documents - logger.info(f"Applying secondary splitting by {self.secondary_split}") + logger.info("Applying secondary splitting by {secondary_split}", secondary_split=self.secondary_split) result_docs = [] for doc in documents: @@ -304,7 +319,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document result_docs.append(split) - logger.info(f"Secondary splitting complete. Final count: {len(result_docs)} documents.") + logger.info("Secondary splitting complete. Final count: {final_count} documents.", final_count=len(result_docs)) return result_docs @component.output_types(documents=list[Document]) @@ -336,7 +351,7 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N # get splits header_split_docs = header_splitter.run(documents=processed_documents)["documents"] - logger.info(f"Header splitting produced {len(header_split_docs)} documents") + logger.info("Header splitting produced {num_docs} documents", num_docs=len(header_split_docs)) # apply secondary splitting if requested if self.secondary_split != "none": From c3e397f991689ce05337bf7f14aa27b179dfb92e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 15:17:53 +0200 Subject: [PATCH 07/85] docstrings improvements --- .../preprocessors/markdown_header_splitter.py | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 0aa92d5d5f..3927346534 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -10,10 +10,15 @@ class 
CustomDocumentSplitter(DocumentSplitter): """ Custom DocumentSplitter that supports splitting functions returning dicts with 'content' and 'meta'. + + :param split_by: The method to split by. Must be "function" for this custom splitter. + :param splitting_function: The custom splitting function that takes a string and returns a list of dicts + with 'content' and optional 'meta' keys. + :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). """ - def __init__(self, *args, page_break_character="\\f", **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, page_break_character="\\f"): + super().__init__() self.page_break_character = page_break_character def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = None) -> dict: @@ -92,15 +97,6 @@ def _split_by_function(self, doc: Document) -> list[Document]: class MarkdownHeaderSplitter: """ A custom component that splits documents at markdown headers with optional secondary splitting. - - :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure. - Useful for documents where all headers use the same level. Defaults to False. - :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). - :param secondary_split: Optional secondary split condition after header splitting. - Options are "none", "word", "passage", "period", "line". Defaults to "none". - :param split_length: The maximum number of units in each split when using secondary splitting. Defaults to 200. - :param split_overlap: The number of overlapping units for each split when using secondary splitting. Defaults to 0. - :param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0. """ def __init__( @@ -112,6 +108,18 @@ def __init__( split_overlap: int = 0, split_threshold: int = 0, ): + """ + Initialize the MarkdownHeaderSplitter. 
+ + :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure. + Useful for documents where all headers use the same level. Defaults to False. + :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). + :param secondary_split: Optional secondary split condition after header splitting. + Options are "none", "word", "passage", "period", "line". Defaults to "none". + :param split_length: The maximum number of units in each split when using secondary splitting. Defaults to 200. + :param split_overlap: The number of overlapping units for each split when using secondary splitting. Defaults to 0. + :param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0. + """ self.infer_header_levels = infer_header_levels self.page_break_character = page_break_character self.secondary_split = secondary_split From 1ca9803d4a3c161fdd47183e61e5fb4c1874597e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= <45487933+OGuggenbuehl@users.noreply.github.com> Date: Tue, 9 Sep 2025 15:17:01 +0200 Subject: [PATCH 08/85] Update haystack/components/preprocessors/markdown_header_splitter.py remove temp toc Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- haystack/components/preprocessors/markdown_header_splitter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 3927346534..b973b623e9 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -298,8 +298,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document ) # apply secondary splitting - temp_doc = Document(content=content_for_splitting, meta=doc.meta) - secondary_splits = 
secondary_splitter.run(documents=[temp_doc])["documents"] + secondary_splits = secondary_splitter.run(documents=[Document(content=content_for_splitting, meta=doc.meta)])["documents"] accumulated_page_breaks = 0 # track page breaks # split processing From 6c496003f448bf576305343b58ca34b12505598a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 15:23:33 +0200 Subject: [PATCH 09/85] fix CustomDocumentSplitter arguments --- .../preprocessors/markdown_header_splitter.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index b973b623e9..516d391931 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -10,15 +10,17 @@ class CustomDocumentSplitter(DocumentSplitter): """ Custom DocumentSplitter that supports splitting functions returning dicts with 'content' and 'meta'. - - :param split_by: The method to split by. Must be "function" for this custom splitter. - :param splitting_function: The custom splitting function that takes a string and returns a list of dicts - with 'content' and optional 'meta' keys. - :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). """ - def __init__(self, page_break_character="\\f"): - super().__init__() + def __init__(self, split_by="function", splitting_function=None, page_break_character="\\f"): + """ + Initialize the CustomDocumentSplitter. + + :param split_by: The method to split by. Must be "function" for custom splitting functions. + :param splitting_function: A custom function that takes a string and returns a list of dicts with 'content' and optional 'meta'. + :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). 
+ """ + super().__init__(split_by=split_by, splitting_function=splitting_function) self.page_break_character = page_break_character def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = None) -> dict: @@ -298,7 +300,9 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document ) # apply secondary splitting - secondary_splits = secondary_splitter.run(documents=[Document(content=content_for_splitting, meta=doc.meta)])["documents"] + secondary_splits = secondary_splitter.run( + documents=[Document(content=content_for_splitting, meta=doc.meta)] + )["documents"] accumulated_page_breaks = 0 # track page breaks # split processing From 9c23202d1d5cd1bfeae5c5ac08e1f8e488b6a17f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 15:29:56 +0200 Subject: [PATCH 10/85] remove header prefix from content --- .../components/preprocessors/markdown_header_splitter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 516d391931..5658797a32 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -316,8 +316,9 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # set page number to meta split.meta["page_number"] = current_page + accumulated_page_breaks - if header_prefix: # add header prefix to content - split.content = header_prefix + split.content + ## deactivated: header prefix to content + # if header_prefix: + # split.content = header_prefix + split.content # preserve header metadata for key in ["header", "parentheaders"]: From b24d92d23210d56539e85a69d45a75622fd709c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 15:42:27 +0200 Subject: [PATCH 11/85] rework split_id assignment to 
avoid collisions --- .../preprocessors/markdown_header_splitter.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 5658797a32..7d09ce4591 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -279,10 +279,8 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # extract header information header_match = re.search(r"(#{1,6}) (.+)(?:\n|$)", doc.content) if header_match: - header_prefix = header_match.group(0) + "\n" content_for_splitting = doc.content[header_match.end() :] else: - header_prefix = "" content_for_splitting = doc.content if not content_for_splitting.strip(): # skip empty content @@ -316,21 +314,17 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # set page number to meta split.meta["page_number"] = current_page + accumulated_page_breaks - ## deactivated: header prefix to content - # if header_prefix: - # split.content = header_prefix + split.content - # preserve header metadata for key in ["header", "parentheaders"]: if key in doc.meta: split.meta[key] = doc.meta[key] - # preserve primary split ID - if "split_id" in doc.meta: - split.meta["header_split_id"] = doc.meta["split_id"] - result_docs.append(split) + # assign unique, sequential split_id to all final chunks + for idx, doc in enumerate(result_docs): + doc.meta["split_id"] = idx + logger.info("Secondary splitting complete. 
Final count: {final_count} documents.", final_count=len(result_docs)) return result_docs From 7b8150e69cda684dbad44ce38890e05c5d09a2ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 15:46:30 +0200 Subject: [PATCH 12/85] remove unneeded dese methods --- .../preprocessors/markdown_header_splitter.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 7d09ce4591..963b46ceb5 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -367,23 +367,6 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N return {"documents": final_docs} - def to_dict(self) -> dict[str, Any]: - """Serialize component to dictionary.""" - return default_to_dict( - self, - infer_header_levels=self.infer_header_levels, - page_break_character=self.page_break_character, - secondary_split=self.secondary_split, - split_length=self.split_length, - split_overlap=self.split_overlap, - split_threshold=self.split_threshold, - ) - - @classmethod - def from_dict(cls, data: dict[str, Any]) -> "MarkdownHeaderSplitter": - """Deserialize component from dictionary.""" - return default_from_dict(cls, data) - # TODO: move to proper test file once ready if __name__ == "__main__": From f0852218298c3648e963b526cda132a30a8c3f48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 16:02:30 +0200 Subject: [PATCH 13/85] cleanup --- .../preprocessors/markdown_header_splitter.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 963b46ceb5..8b997b5173 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py 
+++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -1,7 +1,7 @@ import re -from typing import Any, Literal, Optional +from typing import Literal, Optional -from haystack import Document, component, default_from_dict, default_to_dict, logging +from haystack import Document, component, logging from haystack.components.preprocessors import DocumentSplitter logger = logging.getLogger(__name__) @@ -73,8 +73,8 @@ def _split_by_function(self, doc: Document) -> list[Document]: if doc.meta: meta = self._flatten_dict(doc.meta) - # add standard metadata - meta.update({"source_id": doc.id, "split_id": i, "total_pages": total_pages, "page_number": current_page}) + # add standard metadata (no split_id here) + meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) # get page number based on page breaks page_breaks = self._process_split_content(split["content"], i) @@ -365,6 +365,10 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N else: final_docs = header_split_docs + # assign unique, sequential split_id to all final chunks + for idx, doc in enumerate(final_docs): + doc.meta["split_id"] = idx + return {"documents": final_docs} From 3490d89056e419df831057266a2f8d88e97cd108 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 16:14:43 +0200 Subject: [PATCH 14/85] cleanup --- .../preprocessors/markdown_header_splitter.py | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 8b997b5173..c1e739ac00 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -1,5 +1,5 @@ import re -from typing import Literal, Optional +from typing import Callable, Literal, Optional from haystack import Document, component, 
logging from haystack.components.preprocessors import DocumentSplitter @@ -7,18 +7,28 @@ logger = logging.getLogger(__name__) -class CustomDocumentSplitter(DocumentSplitter): +class _CustomDocumentSplitter(DocumentSplitter): """ - Custom DocumentSplitter that supports splitting functions returning dicts with 'content' and 'meta'. + Internal helper class that extends DocumentSplitter to support splitting functions. + + This class handles splitting functions that return dictionaries with 'content' and 'meta' + keys instead of just strings. For internal use only within the MarkdownHeaderSplitter. """ - def __init__(self, split_by="function", splitting_function=None, page_break_character="\\f"): + def __init__( + self, + split_by: str = "function", + splitting_function: Optional[Callable] = None, + page_break_character: str = "\\f", + ): """ - Initialize the CustomDocumentSplitter. + Initialize the _CustomDocumentSplitter. :param split_by: The method to split by. Must be "function" for custom splitting functions. - :param splitting_function: A custom function that takes a string and returns a list of dicts with 'content' and optional 'meta'. - :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). + :param splitting_function: A custom function that takes a string and returns a list of dicts + with 'content' and optional 'meta'. + :param page_break_character: Character used to identify page breaks. + Defaults to form feed ("\\f"). """ super().__init__(split_by=split_by, splitting_function=splitting_function) self.page_break_character = page_break_character @@ -119,7 +129,8 @@ def __init__( :param secondary_split: Optional secondary split condition after header splitting. Options are "none", "word", "passage", "period", "line". Defaults to "none". :param split_length: The maximum number of units in each split when using secondary splitting. Defaults to 200. 
- :param split_overlap: The number of overlapping units for each split when using secondary splitting. Defaults to 0. + :param split_overlap: The number of overlapping units for each split when using secondary splitting. + Defaults to 0. :param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0. """ self.infer_header_levels = infer_header_levels @@ -349,7 +360,7 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N processed_documents.append(doc) # split by markdown headers - header_splitter = CustomDocumentSplitter( + header_splitter = _CustomDocumentSplitter( split_by="function", splitting_function=lambda text: self._split_by_markdown_headers(text), page_break_character=self.page_break_character, From 0bf3187fc70d6f809056847f0a45329dd023a059 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 16 Sep 2025 15:55:45 +0200 Subject: [PATCH 15/85] add tests cleanup --- .../test_markdown_header_splitter.py | 203 +++++++++++++++++- 1 file changed, 193 insertions(+), 10 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 89a551bd2a..2c97270290 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -4,6 +4,7 @@ from haystack.components.preprocessors.markdown_header_splitter import MarkdownHeaderSplitter +# Fixtures @pytest.fixture def sample_text(): return ( @@ -22,6 +23,7 @@ def sample_text(): ) +# Basic splitting and structure def test_basic_split(sample_text): splitter = MarkdownHeaderSplitter() docs = [Document(content=sample_text)] @@ -42,28 +44,209 @@ def test_basic_split(sample_text): assert doc.meta.get("header") is not None -def test_parentheaders(sample_text): +def test_split_parentheaders(sample_text): splitter = MarkdownHeaderSplitter() - docs = 
[Document(content=sample_text)] + docs = [Document(content=sample_text), Document(content="# H1\n## H2\n### H3\nContent")] result = splitter.run(documents=docs) split_docs = result["documents"] - - # Find a subheader and check parentheaders + # Check parentheaders for both a deep subheader and a simple one subheader_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.2") assert "Header 1" in subheader_doc.meta["parentheaders"] assert "Header 1.2" in subheader_doc.meta["parentheaders"] + h3_doc = next((doc for doc in split_docs if doc.meta["header"] == "H3"), None) + if h3_doc: + assert h3_doc.meta["parentheaders"] == ["H1", "H2"] + + +def test_split_no_headers(): + splitter = MarkdownHeaderSplitter() + docs = [Document(content="No headers here."), Document(content="Just some text without headers.")] + result = splitter.run(documents=docs) + split_docs = result["documents"] + # Should return one doc per input, header is None + assert len(split_docs) == 2 + for doc in split_docs: + assert doc.meta["header"] is None -def test_no_headers(): +def test_split_multiple_documents(sample_text): splitter = MarkdownHeaderSplitter() - docs = [Document(content="Just some text without headers.")] + docs = [ + Document(content=sample_text), + Document(content="# Another Header\nSome content."), + Document(content="# H1\nA"), + Document(content="# H2\nB"), + ] + result = splitter.run(documents=docs) + split_docs = result["documents"] + headers = {doc.meta["header"] for doc in split_docs} + assert {"Another Header", "H1", "H2"}.issubset(headers) + + +def test_split_only_headers(): + text = "# H1\n# H2\n# H3" + splitter = MarkdownHeaderSplitter() + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + # Should not create chunks for headers with no content + assert len(split_docs) == 0 + + +# Header inference and overrides +def test_split_infer_header_levels(): + text = "## H1\n## H2\nContent" + splitter = 
MarkdownHeaderSplitter(infer_header_levels=True) + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + # Should rewrite headers to # and ## + assert split_docs[0].content.startswith("## H2") or split_docs[0].content.startswith("# H1") + + +def test_infer_header_levels_complex(): + """Test header level inference with a complex document structure.""" + text = ( + "## All Headers Same Level\n" + "Some content\n" + "## Second Header\n" + "Some content\n" # Added content to ensure headers are processed correctly + "## Third Header With No Content\n" + "## Fourth Header With No Content\n" + "## Fifth Header\n" + "More content" + ) + + splitter = MarkdownHeaderSplitter(infer_header_levels=True) + docs = [Document(content=text)] result = splitter.run(documents=docs) - assert len(result["documents"]) == 1 + split_docs = result["documents"] + # Get docs by header content to avoid position assumptions + first_doc = next((doc for doc in split_docs if "All Headers Same Level" in doc.content), None) + second_doc = next((doc for doc in split_docs if "Second Header" in doc.content), None) -def test_multiple_documents(sample_text): + # First header should be level 1 + assert first_doc and "# All Headers Same Level" in first_doc.content + + # Second header with content should stay at level 1 + assert second_doc and "# Second Header" in second_doc.content + + +def test_infer_header_levels_override_both_directions(): + text = "## H1\n## H2\nContent" + docs = [Document(content=text)] + + # False at init, True at run + splitter = MarkdownHeaderSplitter(infer_header_levels=False) + result = splitter.run(documents=docs, infer_header_levels=True) + assert "# " in result["documents"][0].content + + # True at init, False at run + splitter = MarkdownHeaderSplitter(infer_header_levels=True) + result = splitter.run(documents=docs, infer_header_levels=False) + assert all("## " in doc.content for doc in result["documents"]) + + +# Metadata 
preservation +def test_preserve_document_metadata(): + """Test that document metadata is preserved through splitting.""" splitter = MarkdownHeaderSplitter() - docs = [Document(content=sample_text), Document(content="# Another Header\nSome content.")] + docs = [Document(content="# Header\nContent", meta={"source": "test", "importance": "high", "custom_field": 123})] + result = splitter.run(documents=docs) split_docs = result["documents"] - assert any(doc.meta["header"] == "Another Header" for doc in split_docs) + + # Original metadata should be preserved + assert split_docs[0].meta["source"] == "test" + assert split_docs[0].meta["importance"] == "high" + assert split_docs[0].meta["custom_field"] == 123 + + # New metadata should be added + assert "header" in split_docs[0].meta + assert "split_id" in split_docs[0].meta + + +# Error and edge case handling +def test_non_text_document(caplog): + """Test that the component correctly handles non-text documents.""" + splitter = MarkdownHeaderSplitter() + docs = [Document(content=None)] + + # Should raise ValueError about text documents + with pytest.raises(ValueError, match="only works with text documents"): + splitter.run(documents=docs) + + +def test_empty_document_list(): + """Test handling of an empty document list.""" + splitter = MarkdownHeaderSplitter() + result = splitter.run(documents=[]) + assert result["documents"] == [] + + +def test_invalid_secondary_split(): + """Test that an invalid secondary split type raises an error.""" + # In MarkdownHeaderSplitter, this is validated at DocumentSplitter instantiation time in _apply_secondary_splitting + splitter = MarkdownHeaderSplitter(secondary_split="invalid_split_type") + docs = [Document(content="# Header\nContent")] + + # Error should be raised when run is called and secondary splitter is created + with pytest.raises(ValueError, match="split_by must be one of"): + splitter.run(documents=docs) + + +def test_invalid_split_parameters(): + """Test invalid split 
parameter validation.""" + # Similar to invalid_secondary_split, validation happens at DocumentSplitter instantiation + + # Test split_length validation + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=0) + docs = [Document(content="# Header\nContent")] + with pytest.raises(ValueError, match="split_length must be greater than 0"): + splitter.run(documents=docs) + + # Test split_overlap validation + splitter = MarkdownHeaderSplitter(secondary_split="word", split_overlap=-1) + docs = [Document(content="# Header\nContent")] + with pytest.raises(ValueError, match="split_overlap must be greater than or equal to 0"): + splitter.run(documents=docs) + + +def test_empty_content_handling(): + """Test handling of documents with empty content.""" + splitter = MarkdownHeaderSplitter() + docs = [Document(content="")] + result = splitter.run(documents=docs) + + # DocumentSplitter skips empty documents by default + assert len(result["documents"]) == 0 + + +# Output format and split ID checks +def test_document_splitting_format(): + """Test that the format of split documents is correct.""" + splitter = MarkdownHeaderSplitter() + docs = [Document(content="# Header\nContent")] + result = splitter.run(documents=docs) + + # Basic validation of the output structure + assert isinstance(result, dict) + assert "documents" in result + assert isinstance(result["documents"], list) + + +def test_split_id_sequentiality_primary_and_secondary(): + text = "# Header\n" + "Word " * 30 + # Test primary splitting + splitter = MarkdownHeaderSplitter() + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_ids = [doc.meta["split_id"] for doc in result["documents"]] + assert split_ids == list(range(len(split_ids))) + + # Test secondary splitting + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=5) + result = splitter.run(documents=docs) + split_ids = [doc.meta["split_id"] for doc in result["documents"]] + assert split_ids == 
list(range(len(split_ids))) From d87ef9736ea84dc4b0dfdfdcf92d00e2abbb727a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 19 Sep 2025 16:36:53 +0200 Subject: [PATCH 16/85] move initialization of secondary-splitter out of run method --- .../preprocessors/markdown_header_splitter.py | 24 ++++++++----- .../test_markdown_header_splitter.py | 36 +++++++++---------- 2 files changed, 33 insertions(+), 27 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index c1e739ac00..a03b858341 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -140,6 +140,17 @@ def __init__( self.split_overlap = split_overlap self.split_threshold = split_threshold + # initialize secondary_splitter only if needed + if self.secondary_split != "none": + self.secondary_splitter = DocumentSplitter( + split_by=self.secondary_split, + split_length=self.split_length, + split_overlap=self.split_overlap, + split_threshold=self.split_threshold, + ) + else: + self.secondary_splitter = None + def _infer_and_rewrite_header_levels(self, text: str) -> str: """ Infer and rewrite header levels in the markdown text. 
@@ -301,15 +312,8 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # track page from meta current_page = doc.meta.get("page_number", 1) - secondary_splitter = DocumentSplitter( - split_by=self.secondary_split, - split_length=self.split_length, - split_overlap=self.split_overlap, - split_threshold=self.split_threshold, - ) - - # apply secondary splitting - secondary_splits = secondary_splitter.run( + # use the pre-initialized secondary splitter + secondary_splits = self.secondary_splitter.run( documents=[Document(content=content_for_splitting, meta=doc.meta)] )["documents"] accumulated_page_breaks = 0 # track page breaks @@ -421,3 +425,5 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N print("\n---Document---") print(doc.content) print(doc.meta) + print(doc.content) + print(doc.meta) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 2c97270290..a89396e2c8 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -185,32 +185,21 @@ def test_empty_document_list(): assert result["documents"] == [] -def test_invalid_secondary_split(): - """Test that an invalid secondary split type raises an error.""" - # In MarkdownHeaderSplitter, this is validated at DocumentSplitter instantiation time in _apply_secondary_splitting - splitter = MarkdownHeaderSplitter(secondary_split="invalid_split_type") - docs = [Document(content="# Header\nContent")] - - # Error should be raised when run is called and secondary splitter is created +def test_invalid_secondary_split_at_init(): + """Test that an invalid secondary split type raises an error at initialization time.""" with pytest.raises(ValueError, match="split_by must be one of"): - splitter.run(documents=docs) + MarkdownHeaderSplitter(secondary_split="invalid_split_type") -def 
test_invalid_split_parameters(): - """Test invalid split parameter validation.""" - # Similar to invalid_secondary_split, validation happens at DocumentSplitter instantiation - +def test_invalid_split_parameters_at_init(): + """Test invalid split parameter validation at initialization time.""" # Test split_length validation - splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=0) - docs = [Document(content="# Header\nContent")] with pytest.raises(ValueError, match="split_length must be greater than 0"): - splitter.run(documents=docs) + MarkdownHeaderSplitter(secondary_split="word", split_length=0) # Test split_overlap validation - splitter = MarkdownHeaderSplitter(secondary_split="word", split_overlap=-1) - docs = [Document(content="# Header\nContent")] with pytest.raises(ValueError, match="split_overlap must be greater than or equal to 0"): - splitter.run(documents=docs) + MarkdownHeaderSplitter(secondary_split="word", split_overlap=-1) def test_empty_content_handling(): @@ -250,3 +239,14 @@ def test_split_id_sequentiality_primary_and_secondary(): result = splitter.run(documents=docs) split_ids = [doc.meta["split_id"] for doc in result["documents"]] assert split_ids == list(range(len(split_ids))) + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_ids = [doc.meta["split_id"] for doc in result["documents"]] + assert split_ids == list(range(len(split_ids))) + + # Test secondary splitting + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=5) + result = splitter.run(documents=docs) + split_ids = [doc.meta["split_id"] for doc in result["documents"]] + assert split_ids == list(range(len(split_ids))) + assert split_ids == list(range(len(split_ids))) From 84e34edae3134326dd5638ca669fa73871762474 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 19 Sep 2025 16:52:18 +0200 Subject: [PATCH 17/85] move _custom_document_splitter to class method --- 
.../preprocessors/markdown_header_splitter.py | 179 +++++++----------- 1 file changed, 68 insertions(+), 111 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index a03b858341..c6e2e2b1a9 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -7,104 +7,6 @@ logger = logging.getLogger(__name__) -class _CustomDocumentSplitter(DocumentSplitter): - """ - Internal helper class that extends DocumentSplitter to support splitting functions. - - This class handles splitting functions that return dictionaries with 'content' and 'meta' - keys instead of just strings. For internal use only within the MarkdownHeaderSplitter. - """ - - def __init__( - self, - split_by: str = "function", - splitting_function: Optional[Callable] = None, - page_break_character: str = "\\f", - ): - """ - Initialize the _CustomDocumentSplitter. - - :param split_by: The method to split by. Must be "function" for custom splitting functions. - :param splitting_function: A custom function that takes a string and returns a list of dicts - with 'content' and optional 'meta'. - :param page_break_character: Character used to identify page breaks. - Defaults to form feed ("\\f"). 
- """ - super().__init__(split_by=split_by, splitting_function=splitting_function) - self.page_break_character = page_break_character - - def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = None) -> dict: - """Helper method to flatten a nested dictionary.""" - if target_dict is None: - target_dict = {} - - for key, value in d.items(): - new_key = f"{prefix}{key}" if prefix else key - - if isinstance(value, dict): - self._flatten_dict(value, f"{new_key}_", target_dict) - else: - target_dict[new_key] = value - - return target_dict - - def _process_split_content(self, split_content: str, split_index: int) -> int: - """Process the content of a split and return the number of page breaks.""" - if not isinstance(split_content, str): - return 0 - - page_breaks = split_content.count(self.page_break_character) - if page_breaks > 0: - logger.debug( - "Found {page_breaks} page breaks in split {split_index}", - page_breaks=page_breaks, - split_index=split_index, - ) - return page_breaks - - def _split_by_function(self, doc: Document) -> list[Document]: - """Split document using a custom function that returns dictionaries with 'content' and 'meta'.""" - logger.debug("Splitting document with id={doc_id}", doc_id=doc.id) - splits = self.splitting_function(doc.content) - docs = [] - - # calculate total pages and set current page - total_pages = doc.meta.get("total_pages", 0) or doc.content.count(self.page_break_character) + 1 - current_page = doc.meta.get("page_number", 1) - logger.debug( - "Starting page number: {current_page}, Total pages: {total_pages}", - current_page=current_page, - total_pages=total_pages, - ) - - # get meta for each split - for i, split in enumerate(splits): - meta = {} - if doc.meta: - meta = self._flatten_dict(doc.meta) - - # add standard metadata (no split_id here) - meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) - - # get page number based on page breaks - page_breaks = 
self._process_split_content(split["content"], i) - current_page += page_breaks - - # add split-specific metadata - if split.get("meta"): - meta.update(self._flatten_dict(split.get("meta"))) - - docs.append(Document(content=split["content"], meta=meta)) - - logger.debug( - "Split into {num_docs} documents for id={doc_id}, final page: {current_page}", - num_docs=len(docs), - doc_id=doc.id, - current_page=current_page, - ) - return docs - - @component class MarkdownHeaderSplitter: """ @@ -343,6 +245,61 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document logger.info("Secondary splitting complete. Final count: {final_count} documents.", final_count=len(result_docs)) return result_docs + def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = None) -> dict: + if target_dict is None: + target_dict = {} + for key, value in d.items(): + new_key = f"{prefix}{key}" if prefix else key + if isinstance(value, dict): + self._flatten_dict(value, f"{new_key}_", target_dict) + else: + target_dict[new_key] = value + return target_dict + + def _process_split_content(self, split_content: str, split_index: int) -> int: + if not isinstance(split_content, str): + return 0 + page_breaks = split_content.count(self.page_break_character) + if page_breaks > 0: + logger.debug( + "Found {page_breaks} page breaks in split {split_index}", + page_breaks=page_breaks, + split_index=split_index, + ) + return page_breaks + + def _split_documents_by_function(self, documents: list[Document], splitting_function: Callable) -> list[Document]: + result_docs = [] + for doc in documents: + logger.debug("Splitting document with id={doc_id}", doc_id=doc.id) + splits = splitting_function(doc.content) + docs = [] + total_pages = doc.meta.get("total_pages", 0) or doc.content.count(self.page_break_character) + 1 + current_page = doc.meta.get("page_number", 1) + logger.debug( + "Starting page number: {current_page}, Total pages: {total_pages}", + 
current_page=current_page, + total_pages=total_pages, + ) + for i, split in enumerate(splits): + meta = {} + if doc.meta: + meta = self._flatten_dict(doc.meta) + meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) + page_breaks = self._process_split_content(split["content"], i) + current_page += page_breaks + if split.get("meta"): + meta.update(self._flatten_dict(split.get("meta"))) + docs.append(Document(content=split["content"], meta=meta)) + logger.debug( + "Split into {num_docs} documents for id={doc_id}, final page: {current_page}", + num_docs=len(docs), + doc_id=doc.id, + current_page=current_page, + ) + result_docs.extend(docs) + return result_docs + @component.output_types(documents=list[Document]) def run(self, documents: list[Document], infer_header_levels: Optional[bool] = None) -> dict[str, list[Document]]: """ @@ -352,35 +309,37 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N :param infer_header_levels: If True, attempts to infer and rewrite header levels before splitting. If None, uses the value from initialization. 
""" + # validate input documents + for doc in documents: + if not isinstance(doc.content, str): + raise ValueError("MarkdownHeaderSplitter only works with text documents (str content).") + infer_header_levels = infer_header_levels if infer_header_levels is not None else self.infer_header_levels - # process documents - preprocess if told to processed_documents = [] for doc in documents: + # skip empty documents + if not doc.content or not doc.content.strip(): + continue if infer_header_levels: content = self._infer_and_rewrite_header_levels(doc.content) processed_documents.append(Document(content=content, meta=doc.meta, id=doc.id)) else: processed_documents.append(doc) - # split by markdown headers - header_splitter = _CustomDocumentSplitter( - split_by="function", - splitting_function=lambda text: self._split_by_markdown_headers(text), - page_break_character=self.page_break_character, - ) + if not processed_documents: + return {"documents": []} - # get splits - header_split_docs = header_splitter.run(documents=processed_documents)["documents"] + header_split_docs = self._split_documents_by_function( + processed_documents, splitting_function=self._split_by_markdown_headers + ) logger.info("Header splitting produced {num_docs} documents", num_docs=len(header_split_docs)) - # apply secondary splitting if requested if self.secondary_split != "none": final_docs = self._apply_secondary_splitting(header_split_docs) else: final_docs = header_split_docs - # assign unique, sequential split_id to all final chunks for idx, doc in enumerate(final_docs): doc.meta["split_id"] = idx @@ -425,5 +384,3 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N print("\n---Document---") print(doc.content) print(doc.meta) - print(doc.content) - print(doc.meta) From 32b09585015c5d6d641fa79921394eec8ccf3746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 19 Sep 2025 17:02:08 +0200 Subject: [PATCH 18/85] removed the 
_CustomDocumentSplitter class. splitting logic is now encapsulated within the MarkdownHeaderSplitter class as private methods. --- .../components/preprocessors/markdown_header_splitter.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index c6e2e2b1a9..aa8af1fd96 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -10,7 +10,14 @@ @component class MarkdownHeaderSplitter: """ - A custom component that splits documents at markdown headers with optional secondary splitting. + Split documents at Markdown headers, with optional secondary splitting and header level inference. + + This component processes text documents by: + - Splitting them into chunks at Markdown headers (e.g., '#', '##', etc.), preserving header hierarchy as metadata. + - Optionally inferring and rewriting header levels for documents where header structure is ambiguous. + - Optionally applying a secondary split (by word, passage, period, or line) to each chunk. + This is done in haystack's DocumentSplitter. + - Preserving and propagating metadata such as parent headers, page numbers, and split IDs. 
""" def __init__( From 69b79532eff332b7c7fce8467ec7b7f477f383b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 19 Sep 2025 17:18:29 +0200 Subject: [PATCH 19/85] return to standard feed-forward character and add tests for page break handling --- .../preprocessors/markdown_header_splitter.py | 4 +- .../test_markdown_header_splitter.py | 62 +++++++++++++++++++ 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index aa8af1fd96..35389c3d62 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -23,7 +23,7 @@ class MarkdownHeaderSplitter: def __init__( self, infer_header_levels: bool = False, - page_break_character: str = "\\f", + page_break_character: str = "\f", secondary_split: Literal["none", "word", "passage", "period", "line"] = "none", split_length: int = 200, split_overlap: int = 0, @@ -34,7 +34,7 @@ def __init__( :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure. Useful for documents where all headers use the same level. Defaults to False. - :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). + :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\f"). :param secondary_split: Optional secondary split condition after header splitting. Options are "none", "word", "passage", "period", "line". Defaults to "none". :param split_length: The maximum number of units in each split when using secondary splitting. Defaults to 200. 
diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index a89396e2c8..c01b624275 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -250,3 +250,65 @@ def test_split_id_sequentiality_primary_and_secondary(): split_ids = [doc.meta["split_id"] for doc in result["documents"]] assert split_ids == list(range(len(split_ids))) assert split_ids == list(range(len(split_ids))) + + +def test_secondary_split_with_overlap(): + text = "# Header\n" + "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10" + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=4, split_overlap=2) + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + # Overlap of 2, so each chunk after the first should share 2 words with previous + assert len(split_docs) > 1 + for i in range(1, len(split_docs)): + prev_words = split_docs[i - 1].content.split() + curr_words = split_docs[i].content.split() + # The overlap should be the last 2 words of previous == first 2 of current + assert prev_words[-2:] == curr_words[:2] + + +def test_secondary_split_with_threshold(): + text = "# Header\n" + " ".join([f"word{i}" for i in range(1, 11)]) + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3, split_threshold=2) + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + # The last chunk should have at least split_threshold words if possible + for doc in split_docs[:-1]: + assert len(doc.content.split()) == 3 + # The last chunk should have at least 2 words (threshold) + assert len(split_docs[-1].content.split()) >= 2 + + +def test_page_break_handling_in_secondary_split(): + text = "# Header\nFirst page\fSecond page\fThird page" + splitter = 
MarkdownHeaderSplitter(secondary_split="word", split_length=2) + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + # The page_number should increment at each page break + page_numbers = [doc.meta.get("page_number") for doc in split_docs] + # Should start at 1 and increment at each \f + assert page_numbers[0] == 1 + assert 2 in page_numbers + # Remove: assert 3 in page_numbers + # Instead, check that the max page number is 2 or 3, depending on split alignment + assert max(page_numbers) >= 2 + + +def test_page_break_handling_with_multiple_headers(): + text = "# Header 1\nPage 1\fPage 2\n# Header 2\nPage 3\fPage 4" + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=2) + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + # Collect page numbers for each header + header1_pages = [doc.meta.get("page_number") for doc in split_docs if doc.meta.get("header") == "Header 1"] + header2_pages = [doc.meta.get("page_number") for doc in split_docs if doc.meta.get("header") == "Header 2"] + # Both headers should have splits with page_number 1 and 2 for Header 1, and 1 and 2 for Header 2 + # (relative to their own chunk) + assert min(header1_pages) == 1 + assert max(header1_pages) >= 2 + # header2_pages may start at 2 if the previous header's last chunk ended with a page break + assert min(header2_pages) >= 1 + assert max(header2_pages) >= 2 From f5b91f06aa9a4eff864bd61c193ff6b61d124b49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 19 Sep 2025 17:23:40 +0200 Subject: [PATCH 20/85] quit exposing splitting_function param since it shouldn't be changed anyway --- .../components/preprocessors/markdown_header_splitter.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py 
index 35389c3d62..484417db57 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -275,11 +275,11 @@ def _process_split_content(self, split_content: str, split_index: int) -> int: ) return page_breaks - def _split_documents_by_function(self, documents: list[Document], splitting_function: Callable) -> list[Document]: + def _split_documents_by_markdown_headers(self, documents: list[Document]) -> list[Document]: result_docs = [] for doc in documents: logger.debug("Splitting document with id={doc_id}", doc_id=doc.id) - splits = splitting_function(doc.content) + splits = self._split_by_markdown_headers(doc.content) docs = [] total_pages = doc.meta.get("total_pages", 0) or doc.content.count(self.page_break_character) + 1 current_page = doc.meta.get("page_number", 1) @@ -337,9 +337,7 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N if not processed_documents: return {"documents": []} - header_split_docs = self._split_documents_by_function( - processed_documents, splitting_function=self._split_by_markdown_headers - ) + header_split_docs = self._split_documents_by_markdown_headers(processed_documents) logger.info("Header splitting produced {num_docs} documents", num_docs=len(header_split_docs)) if self.secondary_split != "none": From 83e5579d4086504b39c88d44fb0ffe52bf9d5b2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 19 Sep 2025 17:24:35 +0200 Subject: [PATCH 21/85] remove test section in module --- .../preprocessors/markdown_header_splitter.py | 40 ------------------- 1 file changed, 40 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 484417db57..d913bf7ba7 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ 
-349,43 +349,3 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N doc.meta["split_id"] = idx return {"documents": final_docs} - - -# TODO: move to proper test file once ready -if __name__ == "__main__": - print() - print("===== Example 1: Regular splitting =====") - splitter = MarkdownHeaderSplitter() - content = """# Header 1 -## Subheader 1.1 -Content under subheader 1.1. -## Subheader 1.2 -### Subheader 1.2.1 -Content under subheader 1.2.1.""" - print("Original content:") - print(content) - example_doc = Document(content=content) - result = splitter.run(documents=[example_doc]) - for doc in result["documents"]: - print("\n---Document---") - print(doc.content) - print(doc.meta) - - print() - print("===== Example 2: Splitting with header inference =====") - splitter = MarkdownHeaderSplitter(infer_header_levels=True) - content = """## Header 1 -## Subheader 1.1 -Content under subheader 1.1. -## Subheader 1.2 -## Subheader 1.2.1 -Content under subheader 1.2.1.""" - print("Original content:") - print(content) - example_doc = Document(content=content) - result = splitter.run(documents=[example_doc]) - print("\nAfter header inference and splitting:") - for doc in result["documents"]: - print("\n---Document---") - print(doc.content) - print(doc.meta) From f3625f528df9f25f421ef1a46fd48077483324c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 19 Sep 2025 17:26:22 +0200 Subject: [PATCH 22/85] add license header --- .../components/preprocessors/markdown_header_splitter.py | 6 +++++- .../preprocessors/test_markdown_header_splitter.py | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index d913bf7ba7..f95e1e01ba 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -1,5 +1,9 @@ 
+# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + import re -from typing import Callable, Literal, Optional +from typing import Literal, Optional from haystack import Document, component, logging from haystack.components.preprocessors import DocumentSplitter diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index c01b624275..86b27a1c24 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + import pytest from haystack import Document From 526ac4f87a84178b55a2b0d1e2c64ce84cc35411 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 19 Sep 2025 18:02:46 +0200 Subject: [PATCH 23/85] add release note --- ...d-md-header-splitter-df5c024a6ddd2718.yaml | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 releasenotes/notes/add-md-header-splitter-df5c024a6ddd2718.yaml diff --git a/releasenotes/notes/add-md-header-splitter-df5c024a6ddd2718.yaml b/releasenotes/notes/add-md-header-splitter-df5c024a6ddd2718.yaml new file mode 100644 index 0000000000..48cf170d4e --- /dev/null +++ b/releasenotes/notes/add-md-header-splitter-df5c024a6ddd2718.yaml @@ -0,0 +1,36 @@ +--- +highlights: > + Added a MarkdownHeaderSplitter component for splitting documents at Markdown headers, with optional header level inference and secondary splitting based on Haystack's DocumentSplitter. This enables a more appropriate splitting logic for Markdown documents where sections are defined by headers, improving the structure and relevance of the resulting document chunks for downstream tasks. 
+ +features: + - | + Introduced the `MarkdownHeaderSplitter` component: + - Splits documents into chunks at Markdown headers (`#`, `##`, etc.), preserving header hierarchy as metadata. + - Optionally infers and rewrites header levels for documents where header structure is ambiguous (e.g. documents parsed using Docling). + - Supports secondary splitting (by word, passage, period, or line) for further chunking after header-based splitting using Haystack's `DocumentSplitter`. + - Preserves and propagates metadata such as parent headers and page numbers. + - Handles edge cases such as documents with no headers, empty content, and non-text documents. + +upgrade: + - | + No upgrade actions required. This is a new component and does not affect existing pipelines. + +enhancements: + - | + Improves preprocessing flexibility for Markdown documents, making it easier to build indexing pipelines for Markdown-files. + +issues: + - | + Inferring header levels can only move downwards in the header hierarchy (e.g., `##` to `###`), not back up (e.g., `###` to `##`), meaning that the algorithm may not perfectly reconstruct the original header hierarchy in cases where header levels move up again. + +deprecations: + - | + None. + +security: + - | + No security-related changes. + +fixes: + - | + N/A (new feature). 
From a46ac62782911102c84bfb974f69c39cf04b099e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 23 Sep 2025 11:08:54 +0200 Subject: [PATCH 24/85] minor refactor for type safety --- .../preprocessors/markdown_header_splitter.py | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index f95e1e01ba..63f163dca4 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -54,6 +54,7 @@ def __init__( self.split_threshold = split_threshold # initialize secondary_splitter only if needed + self.secondary_splitter: Optional[DocumentSplitter] if self.secondary_split != "none": self.secondary_splitter = DocumentSplitter( split_by=self.secondary_split, @@ -151,9 +152,9 @@ def _split_by_markdown_headers(self, text: str) -> list[dict]: return [{"content": text, "meta": {"header": None, "parentheaders": []}}] # process headers and build chunks - chunks = [] - header_stack = [None] * 6 - active_parents = [] + chunks: list[dict] = [] + header_stack: list[Optional[str]] = [None] * 6 + active_parents: list[str] = [] for i, match in enumerate(matches): # extract header info @@ -174,7 +175,7 @@ def _split_by_markdown_headers(self, text: str) -> list[dict]: # skip splits w/o content if not content: # Add as parent for subsequent headers - active_parents = [h for h in header_stack[: level - 1] if h] + active_parents = [h for h in header_stack[: level - 1] if h is not None] active_parents.append(header_text) continue @@ -193,7 +194,7 @@ def _split_by_markdown_headers(self, text: str) -> list[dict]: ) # reset active parents - active_parents = [h for h in header_stack[: level - 1] if h] + active_parents = [h for h in header_stack[: level - 1] if h is not None] logger.info("Split into {num_chunks} chunks by markdown 
headers.", num_chunks=len(chunks)) return chunks @@ -211,14 +212,16 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document result_docs = [] for doc in documents: + if doc.content is None: + result_docs.append(doc) + continue # extract header information header_match = re.search(r"(#{1,6}) (.+)(?:\n|$)", doc.content) + content_for_splitting: str = doc.content if header_match: content_for_splitting = doc.content[header_match.end() :] - else: - content_for_splitting = doc.content - if not content_for_splitting.strip(): # skip empty content + if not content_for_splitting or not content_for_splitting.strip(): # skip empty content result_docs.append(doc) continue @@ -226,6 +229,9 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document current_page = doc.meta.get("page_number", 1) # use the pre-initialized secondary splitter + if self.secondary_splitter is None: + result_docs.append(doc) + continue secondary_splits = self.secondary_splitter.run( documents=[Document(content=content_for_splitting, meta=doc.meta)] )["documents"] @@ -236,8 +242,9 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # calculate page number for this split if i > 0: # page break counting prev_content = secondary_splits[i - 1].content - page_breaks = prev_content.count(self.page_break_character) - accumulated_page_breaks += page_breaks + if prev_content is not None: + page_breaks = prev_content.count(self.page_break_character) + accumulated_page_breaks += page_breaks # set page number to meta split.meta["page_number"] = current_page + accumulated_page_breaks @@ -283,10 +290,14 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis result_docs = [] for doc in documents: logger.debug("Splitting document with id={doc_id}", doc_id=doc.id) + if doc.content is None: + continue splits = self._split_by_markdown_headers(doc.content) docs = [] - total_pages = doc.meta.get("total_pages", 0) 
or doc.content.count(self.page_break_character) + 1 - current_page = doc.meta.get("page_number", 1) + total_pages = doc.meta.get("total_pages", 0) if doc.meta else 0 + if not total_pages: + total_pages = doc.content.count(self.page_break_character) + 1 + current_page = doc.meta.get("page_number", 1) if doc.meta else 1 logger.debug( "Starting page number: {current_page}, Total pages: {total_pages}", current_page=current_page, @@ -300,7 +311,7 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis page_breaks = self._process_split_content(split["content"], i) current_page += page_breaks if split.get("meta"): - meta.update(self._flatten_dict(split.get("meta"))) + meta.update(self._flatten_dict(split.get("meta") or {})) docs.append(Document(content=split["content"], meta=meta)) logger.debug( "Split into {num_docs} documents for id={doc_id}, final page: {current_page}", From 821d907015f9970033658cdc07452c142d177d7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= <45487933+OGuggenbuehl@users.noreply.github.com> Date: Tue, 23 Sep 2025 11:22:14 +0200 Subject: [PATCH 25/85] Update haystack/components/preprocessors/markdown_header_splitter.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- haystack/components/preprocessors/markdown_header_splitter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 63f163dca4..20f591ae46 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -26,6 +26,7 @@ class MarkdownHeaderSplitter: def __init__( self, + *, infer_header_levels: bool = False, page_break_character: str = "\f", secondary_split: Literal["none", "word", "passage", "period", "line"] = "none", From c630e14f7de8a9fd54d7d0b72ef0add4941bc653 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 23 Sep 2025 11:35:11 +0200 Subject: [PATCH 26/85] remove unneeded release notes entries --- ...d-md-header-splitter-df5c024a6ddd2718.yaml | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/releasenotes/notes/add-md-header-splitter-df5c024a6ddd2718.yaml b/releasenotes/notes/add-md-header-splitter-df5c024a6ddd2718.yaml index 48cf170d4e..bb5cbec612 100644 --- a/releasenotes/notes/add-md-header-splitter-df5c024a6ddd2718.yaml +++ b/releasenotes/notes/add-md-header-splitter-df5c024a6ddd2718.yaml @@ -1,7 +1,4 @@ --- -highlights: > - Added a MarkdownHeaderSplitter component for splitting documents at Markdown headers, with optional header level inference and secondary splitting based on Haystack's DocumentSplitter. This enables a more appropriate splitting logic for Markdown documents where sections are defined by headers, improving the structure and relevance of the resulting document chunks for downstream tasks. - features: - | Introduced the `MarkdownHeaderSplitter` component: @@ -10,27 +7,3 @@ features: - Supports secondary splitting (by word, passage, period, or line) for further chunking after header-based splitting using Haystack's `DocumentSplitter`. - Preserves and propagates metadata such as parent headers and page numbers. - Handles edge cases such as documents with no headers, empty content, and non-text documents. - -upgrade: - - | - No upgrade actions required. This is a new component and does not affect existing pipelines. - -enhancements: - - | - Improves preprocessing flexibility for Markdown documents, making it easier to build indexing pipelines for Markdown-files. - -issues: - - | - Inferring header levels can only move downwards in the header hierarchy (e.g., `##` to `###`), not back up (e.g., `###` to `##`), meaning that the algorithm may not perfectly reconstruct the original header hierarchy in cases where header levels move up again. - -deprecations: - - | - None. 
- -security: - - | - No security-related changes. - -fixes: - - | - N/A (new feature). From fa53e1b8b59ed64dd7015c551e4c953d5a9dbc77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 23 Sep 2025 11:42:05 +0200 Subject: [PATCH 27/85] improved documentation for methods --- .../components/preprocessors/markdown_header_splitter.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 20f591ae46..0ec25d7ff4 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -139,7 +139,7 @@ def _infer_and_rewrite_header_levels(self, text: str) -> str: logger.info("Rewrote {num_headers} headers with inferred levels.", num_headers=len(matches)) return modified_text - def _split_by_markdown_headers(self, text: str) -> list[dict]: + def _split_text_by_markdown_headers(self, text: str) -> list[dict]: """Split text by markdown headers and create chunks with appropriate metadata.""" logger.debug("Splitting text by markdown headers") @@ -265,6 +265,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document return result_docs def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = None) -> dict: + """Flatten a nested dictionary, concatenating keys with underscores.""" if target_dict is None: target_dict = {} for key, value in d.items(): @@ -276,6 +277,7 @@ def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = return target_dict def _process_split_content(self, split_content: str, split_index: int) -> int: + """Count page breaks in the split content and log if any are found.""" if not isinstance(split_content, str): return 0 page_breaks = split_content.count(self.page_break_character) @@ -288,12 +290,13 @@ def _process_split_content(self, 
split_content: str, split_index: int) -> int: return page_breaks def _split_documents_by_markdown_headers(self, documents: list[Document]) -> list[Document]: + """Split a list of documents by markdown headers, preserving metadata.""" result_docs = [] for doc in documents: logger.debug("Splitting document with id={doc_id}", doc_id=doc.id) if doc.content is None: continue - splits = self._split_by_markdown_headers(doc.content) + splits = self._split_text_by_markdown_headers(doc.content) docs = [] total_pages = doc.meta.get("total_pages", 0) if doc.meta else 0 if not total_pages: From 1e6cbe39d7ded6508f98b6d47e60561a9ec7ac23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 23 Sep 2025 14:07:43 +0200 Subject: [PATCH 28/85] improve method naming --- .../components/preprocessors/markdown_header_splitter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 0ec25d7ff4..b83080840e 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -66,7 +66,7 @@ def __init__( else: self.secondary_splitter = None - def _infer_and_rewrite_header_levels(self, text: str) -> str: + def _infer_header_levels(self, text: str) -> str: """ Infer and rewrite header levels in the markdown text. 
@@ -276,7 +276,7 @@ def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = target_dict[new_key] = value return target_dict - def _process_split_content(self, split_content: str, split_index: int) -> int: + def _count_page_breaks(self, split_content: str, split_index: int) -> int: """Count page breaks in the split content and log if any are found.""" if not isinstance(split_content, str): return 0 @@ -312,7 +312,7 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis if doc.meta: meta = self._flatten_dict(doc.meta) meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) - page_breaks = self._process_split_content(split["content"], i) + page_breaks = self._count_page_breaks(split["content"], i) current_page += page_breaks if split.get("meta"): meta.update(self._flatten_dict(split.get("meta") or {})) @@ -348,7 +348,7 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N if not doc.content or not doc.content.strip(): continue if infer_header_levels: - content = self._infer_and_rewrite_header_levels(doc.content) + content = self._infer_header_levels(doc.content) processed_documents.append(Document(content=content, meta=doc.meta, id=doc.id)) else: processed_documents.append(doc) From e756d998f242889533fe1456f880db0d0681de30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 23 Sep 2025 14:21:07 +0200 Subject: [PATCH 29/85] improved page-number assignment & added return in docstring minor cleanup --- .../preprocessors/markdown_header_splitter.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index b83080840e..07429f2202 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ 
-259,6 +259,8 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # assign unique, sequential split_id to all final chunks for idx, doc in enumerate(result_docs): + if doc.meta is None: + doc.meta = {} doc.meta["split_id"] = idx logger.info("Secondary splitting complete. Final count: {final_count} documents.", final_count=len(result_docs)) @@ -334,6 +336,12 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N :param documents: List of documents to split :param infer_header_levels: If True, attempts to infer and rewrite header levels before splitting. If None, uses the value from initialization. + + :returns: A dictionary with the following key: + - `documents`: List of documents with the split texts. Each document includes: + - A metadata field `source_id` to track the original document. + - A metadata field `page_number` to track the original page number. + - All other metadata copied from the original document. """ # validate input documents for doc in documents: @@ -363,8 +371,10 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N final_docs = self._apply_secondary_splitting(header_split_docs) else: final_docs = header_split_docs - - for idx, doc in enumerate(final_docs): - doc.meta["split_id"] = idx + # assign split_id only if secondary splitting is not applied + for idx, doc in enumerate(final_docs): + if doc.meta is None: + doc.meta = {} + doc.meta["split_id"] = idx return {"documents": final_docs} From c48bdcf7a1162b372ac62c1745e760c88cc03c1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Wed, 24 Sep 2025 10:31:51 +0200 Subject: [PATCH 30/85] unified page-counting --- .../preprocessors/markdown_header_splitter.py | 55 ++++++++++++------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 
07429f2202..070df0cffb 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -236,19 +236,15 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document secondary_splits = self.secondary_splitter.run( documents=[Document(content=content_for_splitting, meta=doc.meta)] )["documents"] - accumulated_page_breaks = 0 # track page breaks # split processing for i, split in enumerate(secondary_splits): # calculate page number for this split - if i > 0: # page break counting - prev_content = secondary_splits[i - 1].content - if prev_content is not None: - page_breaks = prev_content.count(self.page_break_character) - accumulated_page_breaks += page_breaks + if i > 0 and secondary_splits[i - 1].content: + _, current_page = self._count_page_breaks_and_update(secondary_splits[i - 1].content, current_page) # set page number to meta - split.meta["page_number"] = current_page + accumulated_page_breaks + split.meta["page_number"] = current_page # preserve header metadata for key in ["header", "parentheaders"]: @@ -278,18 +274,29 @@ def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = target_dict[new_key] = value return target_dict - def _count_page_breaks(self, split_content: str, split_index: int) -> int: - """Count page breaks in the split content and log if any are found.""" - if not isinstance(split_content, str): - return 0 - page_breaks = split_content.count(self.page_break_character) + def _count_page_breaks_and_update(self, content: str, current_page: int) -> tuple[int, int]: + """ + Count page breaks in content and return updated page count. 
+ + :param content: Content to check for page breaks + :param current_page: Current page number + :return: Tuple of (page_breaks_count, new_current_page) + """ + if not isinstance(content, str): + return 0, current_page + + page_breaks = content.count(self.page_break_character) + new_page_number = current_page + page_breaks + if page_breaks > 0: logger.debug( - "Found {page_breaks} page breaks in split {split_index}", + "Found {page_breaks} page breaks, page number updated: {old} → {new}", page_breaks=page_breaks, - split_index=split_index, + old=current_page, + new=new_page_number, ) - return page_breaks + + return page_breaks, new_page_number def _split_documents_by_markdown_headers(self, documents: list[Document]) -> list[Document]: """Split a list of documents by markdown headers, preserving metadata.""" @@ -300,9 +307,8 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis continue splits = self._split_text_by_markdown_headers(doc.content) docs = [] - total_pages = doc.meta.get("total_pages", 0) if doc.meta else 0 - if not total_pages: - total_pages = doc.content.count(self.page_break_character) + 1 + total_pages = self._calculate_total_pages(doc.content, doc.meta.get("total_pages", 0) if doc.meta else 0) + current_page = doc.meta.get("page_number", 1) if doc.meta else 1 logger.debug( "Starting page number: {current_page}, Total pages: {total_pages}", @@ -314,8 +320,7 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis if doc.meta: meta = self._flatten_dict(doc.meta) meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) - page_breaks = self._count_page_breaks(split["content"], i) - current_page += page_breaks + _, current_page = self._count_page_breaks_and_update(split["content"], current_page) if split.get("meta"): meta.update(self._flatten_dict(split.get("meta") or {})) docs.append(Document(content=split["content"], meta=meta)) @@ -328,6 +333,16 @@ def 
_split_documents_by_markdown_headers(self, documents: list[Document]) -> lis result_docs.extend(docs) return result_docs + def _calculate_total_pages(self, content: str, existing_total: int = 0) -> int: + """Calculate total pages based on content and existing metadata.""" + if existing_total > 0: + return existing_total + + if not isinstance(content, str): + return 1 + + return content.count(self.page_break_character) + 1 + @component.output_types(documents=list[Document]) def run(self, documents: list[Document], infer_header_levels: Optional[bool] = None) -> dict[str, list[Document]]: """ From decaadffdff553b5fa6634c7c675224e843a568c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Wed, 24 Sep 2025 10:38:12 +0200 Subject: [PATCH 31/85] simplify conditional secondary-split initialization and usage --- .../preprocessors/markdown_header_splitter.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 070df0cffb..95bcc13f95 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -55,7 +55,6 @@ def __init__( self.split_threshold = split_threshold # initialize secondary_splitter only if needed - self.secondary_splitter: Optional[DocumentSplitter] if self.secondary_split != "none": self.secondary_splitter = DocumentSplitter( split_by=self.secondary_split, @@ -63,8 +62,6 @@ def __init__( split_overlap=self.split_overlap, split_threshold=self.split_threshold, ) - else: - self.secondary_splitter = None def _infer_header_levels(self, text: str) -> str: """ @@ -216,6 +213,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document if doc.content is None: result_docs.append(doc) continue + # extract header information header_match = re.search(r"(#{1,6}) (.+)(?:\n|$)", 
doc.content) content_for_splitting: str = doc.content @@ -229,10 +227,6 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # track page from meta current_page = doc.meta.get("page_number", 1) - # use the pre-initialized secondary splitter - if self.secondary_splitter is None: - result_docs.append(doc) - continue secondary_splits = self.secondary_splitter.run( documents=[Document(content=content_for_splitting, meta=doc.meta)] )["documents"] @@ -382,11 +376,13 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N header_split_docs = self._split_documents_by_markdown_headers(processed_documents) logger.info("Header splitting produced {num_docs} documents", num_docs=len(header_split_docs)) - if self.secondary_split != "none": - final_docs = self._apply_secondary_splitting(header_split_docs) - else: - final_docs = header_split_docs - # assign split_id only if secondary splitting is not applied + # secondary splitting if configured + final_docs = ( + self._apply_secondary_splitting(header_split_docs) if self.secondary_split != "none" else header_split_docs + ) + + # assign split_id if not already done in secondary splitting + if self.secondary_split == "none": for idx, doc in enumerate(final_docs): if doc.meta is None: doc.meta = {} From 3ef71c4fb376f287764c97bec5640e2684440831 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Wed, 24 Sep 2025 13:54:12 +0200 Subject: [PATCH 32/85] fix linting error --- haystack/components/preprocessors/markdown_header_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 95bcc13f95..62b4520f0b 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -309,7 +309,7 @@ def _split_documents_by_markdown_headers(self, 
documents: list[Document]) -> lis current_page=current_page, total_pages=total_pages, ) - for i, split in enumerate(splits): + for split in splits: meta = {} if doc.meta: meta = self._flatten_dict(doc.meta) From 0fbea3a220e89987e083ba0b76d1abcadfa2b48e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Wed, 24 Sep 2025 16:39:25 +0200 Subject: [PATCH 33/85] clearly specify the use of ATX-style headers (#) only --- haystack/components/preprocessors/markdown_header_splitter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 62b4520f0b..8fcb5c7b3a 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -14,7 +14,7 @@ @component class MarkdownHeaderSplitter: """ - Split documents at Markdown headers, with optional secondary splitting and header level inference. + Split documents at ATX-style Markdown headers (#), with optional secondary splitting and header level inference. This component processes text documents by: - Splitting them into chunks at Markdown headers (e.g., '#', '##', etc.), preserving header hierarchy as metadata. 
@@ -137,7 +137,7 @@ def _infer_header_levels(self, text: str) -> str: return modified_text def _split_text_by_markdown_headers(self, text: str) -> list[dict]: - """Split text by markdown headers and create chunks with appropriate metadata.""" + """Split text by ATX-style headers (#) and create chunks with appropriate metadata.""" logger.debug("Splitting text by markdown headers") # find headers From 38119a6a087d7ae01e33965d118ffb77dc49ecb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Wed, 24 Sep 2025 16:47:37 +0200 Subject: [PATCH 34/85] reference doc_id when logging no headers found --- .../preprocessors/markdown_header_splitter.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 8fcb5c7b3a..077b7d981e 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -63,7 +63,7 @@ def __init__( split_threshold=self.split_threshold, ) - def _infer_header_levels(self, text: str) -> str: + def _infer_header_levels(self, text: str, doc_id: Optional[str] = None) -> str: """ Infer and rewrite header levels in the markdown text. @@ -75,6 +75,9 @@ def _infer_header_levels(self, text: str) -> str: This is useful for documents where all headers are at the same level, such as output from document conversion tools like docling. 
+ + :param text: The text to process + :param doc_id: Optional document ID for logging context """ logger.debug("Inferring and rewriting header levels") @@ -83,7 +86,10 @@ def _infer_header_levels(self, text: str) -> str: matches = list(re.finditer(pattern, text)) if not matches: - logger.info("No headers found in document; skipping header level inference.") + logger.info( + "No headers found in document{doc_ref}; skipping header level inference.", + doc_ref=f" (id: {doc_id})" if doc_id else "", + ) return text modified_text = text @@ -365,7 +371,7 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N if not doc.content or not doc.content.strip(): continue if infer_header_levels: - content = self._infer_header_levels(doc.content) + content = self._infer_header_levels(doc.content, doc_id=doc.id) processed_documents.append(Document(content=content, meta=doc.meta, id=doc.id)) else: processed_documents.append(doc) From e12e7f75e28ef7c1c88150b3cf87e12a175c0f2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Wed, 24 Sep 2025 16:50:42 +0200 Subject: [PATCH 35/85] initialize md-header pattern as private variable once --- .../components/preprocessors/markdown_header_splitter.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 077b7d981e..44dc40beeb 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -53,6 +53,7 @@ def __init__( self.split_length = split_length self.split_overlap = split_overlap self.split_threshold = split_threshold + self._header_pattern = r"(?m)^(#{1,6}) (.+)$" # ATX-style .md-headers # initialize secondary_splitter only if needed if self.secondary_split != "none": @@ -82,8 +83,7 @@ def _infer_header_levels(self, text: str, doc_id: Optional[str] = None) 
-> str: logger.debug("Inferring and rewriting header levels") # find headers - pattern = r"(?m)^(#{1,6}) (.+)$" - matches = list(re.finditer(pattern, text)) + matches = list(re.finditer(self._header_pattern, text)) if not matches: logger.info( @@ -147,8 +147,7 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: logger.debug("Splitting text by markdown headers") # find headers - pattern = r"(?m)^(#{1,6}) (.+)$" - matches = list(re.finditer(pattern, text)) + matches = list(re.finditer(self._header_pattern, text)) # return unsplit if no headers found if not matches: @@ -221,7 +220,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document continue # extract header information - header_match = re.search(r"(#{1,6}) (.+)(?:\n|$)", doc.content) + header_match = re.search(self._header_pattern, doc.content) content_for_splitting: str = doc.content if header_match: content_for_splitting = doc.content[header_match.end() :] From f31528e83242f7c70f97a06151104001e053852a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 25 Sep 2025 10:35:41 +0200 Subject: [PATCH 36/85] add example to for inferring header levels to docstring --- .../preprocessors/markdown_header_splitter.py | 33 +++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 44dc40beeb..923beba75a 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -38,7 +38,20 @@ def __init__( Initialize the MarkdownHeaderSplitter. :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure. - Useful for documents where all headers use the same level. Defaults to False. 
+ Useful for documents where all headers use the same level (e.g., all "##", as with PDFs parsed by Docling). + For example, a document like: + "## Title + ## Introduction + Introductory text + ## Methods + Method details" + Would be normalized to: + "# Title + ## Introduction + Introductory text + ## Methods + Method details" + This attempts to maintain proper hierarchical structure. Defaults to False. :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\f"). :param secondary_split: Optional secondary split condition after header splitting. Options are "none", "word", "passage", "period", "line". Defaults to "none". @@ -348,8 +361,22 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N Run the markdown header splitter with optional secondary splitting. :param documents: List of documents to split - :param infer_header_levels: If True, attempts to infer and rewrite header levels before splitting. - If None, uses the value from initialization. + :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure. + Useful for documents where all headers use the same level (e.g., all "##", as with PDFs parsed by Docling). + For example, a document like: + "## Title + ## Introduction + Introductory text + ## Methods + Method details" + Would be normalized to: + "# Title + ## Introduction + Introductory text + ## Methods + Method details" + This attempts to maintain proper hierarchical structure. Defaults to False. + If None, uses the instance's initialized infer_header_levels setting. :returns: A dictionary with the following key: - `documents`: List of documents with the split texts. 
Each document includes: From cee156c216164c3dfa54d6b60a4bd4519166d50b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 25 Sep 2025 10:48:49 +0200 Subject: [PATCH 37/85] improve empty document handling add more logging for empty documents --- .../preprocessors/markdown_header_splitter.py | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 923beba75a..0f248ced89 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -33,6 +33,7 @@ def __init__( split_length: int = 200, split_overlap: int = 0, split_threshold: int = 0, + skip_empty_documents: bool = True, ): """ Initialize the MarkdownHeaderSplitter. @@ -59,6 +60,8 @@ def __init__( :param split_overlap: The number of overlapping units for each split when using secondary splitting. Defaults to 0. :param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0. + :param skip_empty_documents: If True, skip documents with empty content. If False, process empty documents. + Defaults to True. 
""" self.infer_header_levels = infer_header_levels self.page_break_character = page_break_character @@ -66,6 +69,7 @@ def __init__( self.split_length = split_length self.split_overlap = split_overlap self.split_threshold = split_threshold + self.skip_empty_documents = skip_empty_documents self._header_pattern = r"(?m)^(#{1,6}) (.+)$" # ATX-style .md-headers # initialize secondary_splitter only if needed @@ -386,6 +390,13 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N """ # validate input documents for doc in documents: + if doc.content is None: + raise ValueError( + ( + "MarkdownHeaderSplitter only works with text documents but content for document ID" + f" {doc.id} is None." + ) + ) if not isinstance(doc.content, str): raise ValueError("MarkdownHeaderSplitter only works with text documents (str content).") @@ -393,9 +404,20 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N processed_documents = [] for doc in documents: - # skip empty documents + # handle empty documents if not doc.content or not doc.content.strip(): - continue + if self.skip_empty_documents: + logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id) + continue + else: + # keep empty documents + processed_documents.append(doc) + logger.warning( + "Document ID {doc_id} has an empty content. 
Keeping this document as per configuration.", + doc_id=doc.id, + ) + continue + if infer_header_levels: content = self._infer_header_levels(doc.content, doc_id=doc.id) processed_documents.append(Document(content=content, meta=doc.meta, id=doc.id)) From c63035f5999c64b43b9a436364f4816a79232732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 25 Sep 2025 11:36:31 +0200 Subject: [PATCH 38/85] more explicit testing for inferred headers --- .../test_markdown_header_splitter.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 86b27a1c24..7db00dc927 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -104,8 +104,21 @@ def test_split_infer_header_levels(): docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - # Should rewrite headers to # and ## - assert split_docs[0].content.startswith("## H2") or split_docs[0].content.startswith("# H1") + + # Should have exactly one document + assert len(split_docs) == 1 + + # Extract header information from metadata instead of content + h1_doc = next((doc for doc in split_docs if doc.meta["header"] == "H1"), None) + h2_doc = next((doc for doc in split_docs if doc.meta["header"] == "H2"), None) + + # Check proper doc creation + assert h1_doc is None + assert h2_doc is not None + + # Check that headers are properly leveled (looking at content) + assert "H1" in h2_doc.meta["parentheaders"] + assert "## H2" in h2_doc.content def test_infer_header_levels_complex(): From cf1b82071fde5f42440271d12e2adfb4ed44b313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 25 Sep 2025 11:41:17 +0200 Subject: [PATCH 39/85] fix linting issue --- 
.../preprocessors/markdown_header_splitter.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 0f248ced89..6d059889df 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -409,14 +409,13 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N if self.skip_empty_documents: logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id) continue - else: - # keep empty documents - processed_documents.append(doc) - logger.warning( - "Document ID {doc_id} has an empty content. Keeping this document as per configuration.", - doc_id=doc.id, - ) - continue + # keep empty documents + processed_documents.append(doc) + logger.warning( + "Document ID {doc_id} has an empty content. Keeping this document as per configuration.", + doc_id=doc.id, + ) + continue if infer_header_levels: content = self._infer_header_levels(doc.content, doc_id=doc.id) From 22369b6dad89c73719e57d0934a62611d8995375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 26 Sep 2025 17:15:13 +0200 Subject: [PATCH 40/85] improved empty content handling test cases --- .../preprocessors/test_markdown_header_splitter.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 7db00dc927..40380d9db7 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -221,13 +221,16 @@ def test_invalid_split_parameters_at_init(): def test_empty_content_handling(): """Test handling of documents with empty content.""" - splitter = 
MarkdownHeaderSplitter() + splitter_skip = MarkdownHeaderSplitter() # skip empty documents by default docs = [Document(content="")] - result = splitter.run(documents=docs) - - # DocumentSplitter skips empty documents by default + result = splitter_skip.run(documents=docs) assert len(result["documents"]) == 0 + splitter_no_skip = MarkdownHeaderSplitter(skip_empty_documents=False) + docs = [Document(content="")] + result = splitter_no_skip.run(documents=docs) + assert len(result["documents"]) == 1 + # Output format and split ID checks def test_document_splitting_format(): From 316ebec2ad6031df8eef4aeda9158d46f92547d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 29 Sep 2025 15:59:43 +0200 Subject: [PATCH 41/85] remove all functionality related to inferring md-header levels --- .../preprocessors/markdown_header_splitter.py | 134 ++---------------- .../test_markdown_header_splitter.py | 68 --------- 2 files changed, 8 insertions(+), 194 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 6d059889df..245b51d8d1 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -14,20 +14,18 @@ @component class MarkdownHeaderSplitter: """ - Split documents at ATX-style Markdown headers (#), with optional secondary splitting and header level inference. + Split documents at ATX-style Markdown headers (#), with optional secondary splitting. This component processes text documents by: - Splitting them into chunks at Markdown headers (e.g., '#', '##', etc.), preserving header hierarchy as metadata. - - Optionally inferring and rewriting header levels for documents where header structure is ambiguous. - - Optionally applying a secondary split (by word, passage, period, or line) to each chunk. - This is done in haystack's DocumentSplitter. 
+ - Optionally applying a secondary split (by word, passage, period, or line) to each chunk + (using haystack's DocumentSplitter). - Preserving and propagating metadata such as parent headers, page numbers, and split IDs. """ def __init__( self, *, - infer_header_levels: bool = False, page_break_character: str = "\f", secondary_split: Literal["none", "word", "passage", "period", "line"] = "none", split_length: int = 200, @@ -38,21 +36,6 @@ def __init__( """ Initialize the MarkdownHeaderSplitter. - :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure. - Useful for documents where all headers use the same level (e.g., all "##", as with PDFs parsed by Docling). - For example, a document like: - "## Title - ## Introduction - Introductory text - ## Methods - Method details" - Would be normalized to: - "# Title - ## Introduction - Introductory text - ## Methods - Method details" - This attempts to maintain proper hierarchical structure. Defaults to False. :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\f"). :param secondary_split: Optional secondary split condition after header splitting. Options are "none", "word", "passage", "period", "line". Defaults to "none". @@ -63,7 +46,6 @@ def __init__( :param skip_empty_documents: If True, skip documents with empty content. If False, process empty documents. Defaults to True. """ - self.infer_header_levels = infer_header_levels self.page_break_character = page_break_character self.secondary_split = secondary_split self.split_length = split_length @@ -81,84 +63,6 @@ def __init__( split_threshold=self.split_threshold, ) - def _infer_header_levels(self, text: str, doc_id: Optional[str] = None) -> str: - """ - Infer and rewrite header levels in the markdown text. 
- - This function analyzes the document structure to infer proper header levels: - - First header is always level 1 - - If there's content between headers, the next header stays at the same level - - If there's no content between headers, the next header goes one level deeper - - Header levels never exceed 6 (the maximum in markdown) - - This is useful for documents where all headers are at the same level, such as - output from document conversion tools like docling. - - :param text: The text to process - :param doc_id: Optional document ID for logging context - """ - logger.debug("Inferring and rewriting header levels") - - # find headers - matches = list(re.finditer(self._header_pattern, text)) - - if not matches: - logger.info( - "No headers found in document{doc_ref}; skipping header level inference.", - doc_ref=f" (id: {doc_id})" if doc_id else "", - ) - return text - - modified_text = text - offset = 0 # track offset due to length changes in headers - - # track header structure - current_level = 1 - header_stack = [1] # always start with level 1 - - for i, match in enumerate(matches): - original_header = match.group(0) - header_text = match.group(2).strip() - - # check if there's content between this header and the previous one - has_content = False - if i > 0: - prev_end = matches[i - 1].end() - current_start = match.start() - content_between = text[prev_end:current_start].strip() - has_content = bool(content_between) - - # first header is always level 1 - if i == 0: - inferred_level = 1 - elif has_content: - # stay at the same level if there's content - inferred_level = current_level - else: - # go one level deeper if there's no content - inferred_level = min(current_level + 1, 6) - - # update tracking variables - current_level = inferred_level - header_stack = header_stack[:inferred_level] - while len(header_stack) < inferred_level: - header_stack.append(1) - - # new header with inferred level - new_prefix = "#" * inferred_level - new_header = 
f"{new_prefix} {header_text}" - - # replace old header - start_pos = match.start() + offset - end_pos = match.end() + offset - modified_text = modified_text[:start_pos] + new_header + modified_text[end_pos:] - - # update offset - offset += len(new_header) - len(original_header) - - logger.info("Rewrote {num_headers} headers with inferred levels.", num_headers=len(matches)) - return modified_text - def _split_text_by_markdown_headers(self, text: str) -> list[dict]: """Split text by ATX-style headers (#) and create chunks with appropriate metadata.""" logger.debug("Splitting text by markdown headers") @@ -360,33 +264,17 @@ def _calculate_total_pages(self, content: str, existing_total: int = 0) -> int: return content.count(self.page_break_character) + 1 @component.output_types(documents=list[Document]) - def run(self, documents: list[Document], infer_header_levels: Optional[bool] = None) -> dict[str, list[Document]]: + def run(self, documents: list[Document]) -> dict[str, list[Document]]: """ Run the markdown header splitter with optional secondary splitting. :param documents: List of documents to split - :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure. - Useful for documents where all headers use the same level (e.g., all "##", as with PDFs parsed by Docling). - For example, a document like: - "## Title - ## Introduction - Introductory text - ## Methods - Method details" - Would be normalized to: - "# Title - ## Introduction - Introductory text - ## Methods - Method details" - This attempts to maintain proper hierarchical structure. Defaults to False. - If None, uses the instance's initialized infer_header_levels setting. :returns: A dictionary with the following key: - `documents`: List of documents with the split texts. Each document includes: - - A metadata field `source_id` to track the original document. - - A metadata field `page_number` to track the original page number. 
- - All other metadata copied from the original document. + - A metadata field `source_id` to track the original document. + - A metadata field `page_number` to track the original page number. + - All other metadata copied from the original document. """ # validate input documents for doc in documents: @@ -400,8 +288,6 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N if not isinstance(doc.content, str): raise ValueError("MarkdownHeaderSplitter only works with text documents (str content).") - infer_header_levels = infer_header_levels if infer_header_levels is not None else self.infer_header_levels - processed_documents = [] for doc in documents: # handle empty documents @@ -417,11 +303,7 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N ) continue - if infer_header_levels: - content = self._infer_header_levels(doc.content, doc_id=doc.id) - processed_documents.append(Document(content=content, meta=doc.meta, id=doc.id)) - else: - processed_documents.append(doc) + processed_documents.append(doc) if not processed_documents: return {"documents": []} diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 40380d9db7..19bf520626 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -97,74 +97,6 @@ def test_split_only_headers(): assert len(split_docs) == 0 -# Header inference and overrides -def test_split_infer_header_levels(): - text = "## H1\n## H2\nContent" - splitter = MarkdownHeaderSplitter(infer_header_levels=True) - docs = [Document(content=text)] - result = splitter.run(documents=docs) - split_docs = result["documents"] - - # Should have exactly one document - assert len(split_docs) == 1 - - # Extract header information from metadata instead of content - h1_doc = next((doc for doc in split_docs if doc.meta["header"] 
== "H1"), None) - h2_doc = next((doc for doc in split_docs if doc.meta["header"] == "H2"), None) - - # Check proper doc creation - assert h1_doc is None - assert h2_doc is not None - - # Check that headers are properly leveled (looking at content) - assert "H1" in h2_doc.meta["parentheaders"] - assert "## H2" in h2_doc.content - - -def test_infer_header_levels_complex(): - """Test header level inference with a complex document structure.""" - text = ( - "## All Headers Same Level\n" - "Some content\n" - "## Second Header\n" - "Some content\n" # Added content to ensure headers are processed correctly - "## Third Header With No Content\n" - "## Fourth Header With No Content\n" - "## Fifth Header\n" - "More content" - ) - - splitter = MarkdownHeaderSplitter(infer_header_levels=True) - docs = [Document(content=text)] - result = splitter.run(documents=docs) - split_docs = result["documents"] - - # Get docs by header content to avoid position assumptions - first_doc = next((doc for doc in split_docs if "All Headers Same Level" in doc.content), None) - second_doc = next((doc for doc in split_docs if "Second Header" in doc.content), None) - - # First header should be level 1 - assert first_doc and "# All Headers Same Level" in first_doc.content - - # Second header with content should stay at level 1 - assert second_doc and "# Second Header" in second_doc.content - - -def test_infer_header_levels_override_both_directions(): - text = "## H1\n## H2\nContent" - docs = [Document(content=text)] - - # False at init, True at run - splitter = MarkdownHeaderSplitter(infer_header_levels=False) - result = splitter.run(documents=docs, infer_header_levels=True) - assert "# " in result["documents"][0].content - - # True at init, False at run - splitter = MarkdownHeaderSplitter(infer_header_levels=True) - result = splitter.run(documents=docs, infer_header_levels=False) - assert all("## " in doc.content for doc in result["documents"]) - - # Metadata preservation def 
test_preserve_document_metadata(): """Test that document metadata is preserved through splitting.""" From d5e462c98b64020819b44025cf17c319dec9cbdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 30 Sep 2025 12:18:46 +0200 Subject: [PATCH 42/85] compile regex-pattern in init for performance gains --- haystack/components/preprocessors/markdown_header_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 245b51d8d1..e31dd55ac6 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -52,7 +52,7 @@ def __init__( self.split_overlap = split_overlap self.split_threshold = split_threshold self.skip_empty_documents = skip_empty_documents - self._header_pattern = r"(?m)^(#{1,6}) (.+)$" # ATX-style .md-headers + self._header_pattern = re.compile(r"(?m)^(#{1,6}) (.+)$") # ATX-style .md-headers # initialize secondary_splitter only if needed if self.secondary_split != "none": From 4089ddc5cbd5fe688026e2d035c9e321703deb2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= <45487933+OGuggenbuehl@users.noreply.github.com> Date: Mon, 13 Oct 2025 15:27:03 +0200 Subject: [PATCH 43/85] Update haystack/components/preprocessors/markdown_header_splitter.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- haystack/components/preprocessors/markdown_header_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index e31dd55ac6..45174f353a 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -27,7 +27,7 @@ def __init__( self, *, 
page_break_character: str = "\f", - secondary_split: Literal["none", "word", "passage", "period", "line"] = "none", + secondary_split: Optional[Literal["word", "passage", "period", "line"]] = None, split_length: int = 200, split_overlap: int = 0, split_threshold: int = 0, From 20d172ef1a39d9c41880a51e26b1de756ee25e0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 13 Oct 2025 16:09:12 +0200 Subject: [PATCH 44/85] change all "none" to proper None values --- .../preprocessors/markdown_header_splitter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 45174f353a..4167751324 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -38,7 +38,7 @@ def __init__( :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\f"). :param secondary_split: Optional secondary split condition after header splitting. - Options are "none", "word", "passage", "period", "line". Defaults to "none". + Options are None, "word", "passage", "period", "line". Defaults to None. :param split_length: The maximum number of units in each split when using secondary splitting. Defaults to 200. :param split_overlap: The number of overlapping units for each split when using secondary splitting. Defaults to 0. @@ -55,7 +55,7 @@ def __init__( self._header_pattern = re.compile(r"(?m)^(#{1,6}) (.+)$") # ATX-style .md-headers # initialize secondary_splitter only if needed - if self.secondary_split != "none": + if self.secondary_split: self.secondary_splitter = DocumentSplitter( split_by=self.secondary_split, split_length=self.split_length, @@ -129,7 +129,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document Ensures page counting is maintained across splits. 
""" - if self.secondary_split == "none": + if not self.secondary_split: return documents logger.info("Applying secondary splitting by {secondary_split}", secondary_split=self.secondary_split) @@ -313,11 +313,11 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: # secondary splitting if configured final_docs = ( - self._apply_secondary_splitting(header_split_docs) if self.secondary_split != "none" else header_split_docs + self._apply_secondary_splitting(header_split_docs) if not self.secondary_split else header_split_docs ) # assign split_id if not already done in secondary splitting - if self.secondary_split == "none": + if not self.secondary_split: for idx, doc in enumerate(final_docs): if doc.meta is None: doc.meta = {} From a7c6725a53447310e6feef1b09fd2a473729dfaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 13 Oct 2025 16:14:50 +0200 Subject: [PATCH 45/85] fix minor --- haystack/components/preprocessors/markdown_header_splitter.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 4167751324..a00e7ff9fc 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -312,9 +312,7 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: logger.info("Header splitting produced {num_docs} documents", num_docs=len(header_split_docs)) # secondary splitting if configured - final_docs = ( - self._apply_secondary_splitting(header_split_docs) if not self.secondary_split else header_split_docs - ) + final_docs = self._apply_secondary_splitting(header_split_docs) if self.secondary_split else header_split_docs # assign split_id if not already done in secondary splitting if not self.secondary_split: From c9c44eee30bc96e09954bcc95fd6dbbe776eeada Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 13 Oct 2025 16:48:25 +0200 Subject: [PATCH 46/85] explicitly test doc content --- .../test_markdown_header_splitter.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 19bf520626..998fbeebde 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -43,8 +43,23 @@ def test_basic_split(sample_text): assert "Subheader 1.2.3" in headers # Check that content is present and correct + header1_doc = next(doc for doc in split_docs if doc.meta["header"] == "Header 1") + assert "Content under header 1." in header1_doc.content + + subheader111_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.1.1") + assert "Content under sub-header 1.1.1" in subheader111_doc.content + + subheader121_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.1") + assert "Content under header 1.2.1." in subheader121_doc.content + + subheader122_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.2") + assert "Content under header 1.2.2." in subheader122_doc.content + + subheader123_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.3") + assert "Content under header 1.2.3." 
in subheader123_doc.content + + # Ensure all documents have a header in their metadata for doc in split_docs: - assert doc.content.startswith("#") or doc.content.startswith("##") or doc.content.startswith("###") assert doc.meta.get("header") is not None From 0e36419750d2930e8f1ea682afc1837dc759ba61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 13 Oct 2025 16:49:55 +0200 Subject: [PATCH 47/85] rename parentheaders to parent_headers --- .../components/preprocessors/markdown_header_splitter.py | 8 ++++---- .../preprocessors/test_markdown_header_splitter.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index a00e7ff9fc..3da29749a2 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -73,7 +73,7 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: # return unsplit if no headers found if not matches: logger.info("No headers found in document; returning full document as single chunk.") - return [{"content": text, "meta": {"header": None, "parentheaders": []}}] + return [{"content": text, "meta": {"header": None, "parent_headers": []}}] # process headers and build chunks chunks: list[dict] = [] @@ -104,7 +104,7 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: continue # get parent headers - parentheaders = list(active_parents) + parent_headers = list(active_parents) logger.debug( "Creating chunk for header '{header_text}' at level {level}", header_text=header_text, level=level @@ -113,7 +113,7 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: chunks.append( { "content": f"{header_prefix} {header_text}\n{content}", - "meta": {"header": header_text, "parentheaders": parentheaders}, + "meta": {"header": header_text, "parent_headers": 
parent_headers}, } ) @@ -167,7 +167,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document split.meta["page_number"] = current_page # preserve header metadata - for key in ["header", "parentheaders"]: + for key in ["header", "parent_headers"]: if key in doc.meta: split.meta[key] = doc.meta[key] diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 998fbeebde..d52b8cbf29 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -70,11 +70,11 @@ def test_split_parentheaders(sample_text): split_docs = result["documents"] # Check parentheaders for both a deep subheader and a simple one subheader_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.2") - assert "Header 1" in subheader_doc.meta["parentheaders"] - assert "Header 1.2" in subheader_doc.meta["parentheaders"] + assert "Header 1" in subheader_doc.meta["parent_headers"] + assert "Header 1.2" in subheader_doc.meta["parent_headers"] h3_doc = next((doc for doc in split_docs if doc.meta["header"] == "H3"), None) if h3_doc: - assert h3_doc.meta["parentheaders"] == ["H1", "H2"] + assert h3_doc.meta["parent_headers"] == ["H1", "H2"] def test_split_no_headers(): From edc60b5b948f03bd1783d001f8d03ec430a83a8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 13 Oct 2025 16:54:33 +0200 Subject: [PATCH 48/85] test split_id, doc length --- .../preprocessors/test_markdown_header_splitter.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index d52b8cbf29..ae3c5be8a2 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py 
@@ -73,8 +73,7 @@ def test_split_parentheaders(sample_text): assert "Header 1" in subheader_doc.meta["parent_headers"] assert "Header 1.2" in subheader_doc.meta["parent_headers"] h3_doc = next((doc for doc in split_docs if doc.meta["header"] == "H3"), None) - if h3_doc: - assert h3_doc.meta["parent_headers"] == ["H1", "H2"] + assert h3_doc.meta["parent_headers"] == ["H1", "H2"] def test_split_no_headers(): @@ -98,9 +97,17 @@ def test_split_multiple_documents(sample_text): ] result = splitter.run(documents=docs) split_docs = result["documents"] + + assert len(split_docs) == 8 + headers = {doc.meta["header"] for doc in split_docs} assert {"Another Header", "H1", "H2"}.issubset(headers) + # Verify that all documents have a split_id and they're sequential + split_ids = [doc.meta.get("split_id") for doc in split_docs] + assert all(split_id is not None for split_id in split_ids) + assert split_ids == list(range(len(split_ids))) + def test_split_only_headers(): text = "# H1\n# H2\n# H3" From 995c1219aacf371704532e397f33523486dcd895 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 13 Oct 2025 16:56:14 +0200 Subject: [PATCH 49/85] check meta content --- test/components/preprocessors/test_markdown_header_splitter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index ae3c5be8a2..ae3ed0eed4 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -135,7 +135,9 @@ def test_preserve_document_metadata(): # New metadata should be added assert "header" in split_docs[0].meta + assert split_docs[0].meta["header"] == "Header" assert "split_id" in split_docs[0].meta + assert split_docs[0].meta["split_id"] == 0 # Error and edge case handling From 223a676f2c932519ecbd58ea731ccbf482f9bc4e Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 13 Oct 2025 16:59:08 +0200 Subject: [PATCH 50/85] remove unneeded test --- .../preprocessors/test_markdown_header_splitter.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index ae3ed0eed4..9273a3f30f 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -188,19 +188,6 @@ def test_empty_content_handling(): assert len(result["documents"]) == 1 -# Output format and split ID checks -def test_document_splitting_format(): - """Test that the format of split documents is correct.""" - splitter = MarkdownHeaderSplitter() - docs = [Document(content="# Header\nContent")] - result = splitter.run(documents=docs) - - # Basic validation of the output structure - assert isinstance(result, dict) - assert "documents" in result - assert isinstance(result["documents"], list) - - def test_split_id_sequentiality_primary_and_secondary(): text = "# Header\n" + "Word " * 30 # Test primary splitting From babc7d98b8520e519697a69a52ba1d4db2b9649d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 13 Oct 2025 17:06:25 +0200 Subject: [PATCH 51/85] make split_id testing more robust --- .../test_markdown_header_splitter.py | 39 ++++++++++++------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 9273a3f30f..bd0fb5fa2d 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -188,30 +188,41 @@ def test_empty_content_handling(): assert len(result["documents"]) == 1 -def test_split_id_sequentiality_primary_and_secondary(): - text = "# Header\n" + "Word " 
* 30 +def test_split_id_sequentiality_primary_and_secondary(sample_text): # Test primary splitting splitter = MarkdownHeaderSplitter() - docs = [Document(content=text)] + docs = [Document(content=sample_text)] result = splitter.run(documents=docs) - split_ids = [doc.meta["split_id"] for doc in result["documents"]] + split_docs = result["documents"] + + # Test number of documents + assert len(split_docs) == 5 + + # Check that split_ids are sequential + split_ids = [doc.meta["split_id"] for doc in split_docs] assert split_ids == list(range(len(split_ids))) # Test secondary splitting - splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=5) - result = splitter.run(documents=docs) - split_ids = [doc.meta["split_id"] for doc in result["documents"]] - assert split_ids == list(range(len(split_ids))) - docs = [Document(content=text)] + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3) + docs = [Document(content=sample_text)] result = splitter.run(documents=docs) - split_ids = [doc.meta["split_id"] for doc in result["documents"]] + split_docs = result["documents"] + + # Test number of documents + assert len(split_docs) == 10 + + split_ids = [doc.meta["split_id"] for doc in split_docs] assert split_ids == list(range(len(split_ids))) - # Test secondary splitting - splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=5) + # Test with multiple input documents + docs = [Document(content=sample_text), Document(content="# Another Header\nSome more content here.")] result = splitter.run(documents=docs) - split_ids = [doc.meta["split_id"] for doc in result["documents"]] - assert split_ids == list(range(len(split_ids))) + split_docs = result["documents"] + + # Test number of documents + assert len(split_docs) == 12 + + split_ids = [doc.meta["split_id"] for doc in split_docs] assert split_ids == list(range(len(split_ids))) From e488edc65146a1a4b6c955c1e9ae17ecd63aaae8 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 14 Oct 2025 14:37:23 +0200 Subject: [PATCH 52/85] more realistic overlap test sample --- .../test_markdown_header_splitter.py | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index bd0fb5fa2d..2018ee1c1d 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -227,18 +227,29 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text): def test_secondary_split_with_overlap(): - text = "# Header\n" + "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10" + realistic_text = ( + "# Introduction\n" + "This is the introduction section with some words for testing overlap splitting. " + "It should be split into chunks with overlap.\n" + "## Details\n" + "Here are more details about the topic. " + "Splitting should work across multiple headers and content blocks.\n" + "### Subsection\n" + "This subsection contains additional information and should also be split with overlap." 
+ ) splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=4, split_overlap=2) - docs = [Document(content=text)] + docs = [Document(content=realistic_text)] result = splitter.run(documents=docs) split_docs = result["documents"] - # Overlap of 2, so each chunk after the first should share 2 words with previous - assert len(split_docs) > 1 + assert len(split_docs) == 21 + for i in range(1, len(split_docs)): - prev_words = split_docs[i - 1].content.split() - curr_words = split_docs[i].content.split() - # The overlap should be the last 2 words of previous == first 2 of current - assert prev_words[-2:] == curr_words[:2] + prev_doc = split_docs[i - 1] + curr_doc = split_docs[i] + if prev_doc.meta["header"] == curr_doc.meta["header"]: # only check overlap within same header + prev_words = prev_doc.content.split() + curr_words = curr_doc.content.split() + assert prev_words[-2:] == curr_words[:2] def test_secondary_split_with_threshold(): From c0efda3ec0d7c0b6f25575e1f9c5356bb9c8a79f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 14 Oct 2025 14:46:18 +0200 Subject: [PATCH 53/85] assign split_id globally to all output docs --- .../preprocessors/markdown_header_splitter.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 3da29749a2..af5c362309 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -173,12 +173,6 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document result_docs.append(split) - # assign unique, sequential split_id to all final chunks - for idx, doc in enumerate(result_docs): - if doc.meta is None: - doc.meta = {} - doc.meta["split_id"] = idx - logger.info("Secondary splitting complete. 
Final count: {final_count} documents.", final_count=len(result_docs)) return result_docs @@ -314,11 +308,10 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: # secondary splitting if configured final_docs = self._apply_secondary_splitting(header_split_docs) if self.secondary_split else header_split_docs - # assign split_id if not already done in secondary splitting - if not self.secondary_split: - for idx, doc in enumerate(final_docs): - if doc.meta is None: - doc.meta = {} - doc.meta["split_id"] = idx + # assign split_id to all output documents + for idx, doc in enumerate(final_docs): + if doc.meta is None: + doc.meta = {} + doc.meta["split_id"] = idx return {"documents": final_docs} From 893e3dec766c22111aad0a97f7c8726ed006a033 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 14 Oct 2025 14:56:14 +0200 Subject: [PATCH 54/85] test page numbers explicitly --- .../preprocessors/test_markdown_header_splitter.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 2018ee1c1d..226c28c8e3 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -290,10 +290,7 @@ def test_page_break_handling_with_multiple_headers(): # Collect page numbers for each header header1_pages = [doc.meta.get("page_number") for doc in split_docs if doc.meta.get("header") == "Header 1"] header2_pages = [doc.meta.get("page_number") for doc in split_docs if doc.meta.get("header") == "Header 2"] - # Both headers should have splits with page_number 1 and 2 for Header 1, and 1 and 2 for Header 2 - # (relative to their own chunk) assert min(header1_pages) == 1 - assert max(header1_pages) >= 2 - # header2_pages may start at 2 if the previous header's last chunk ended with a page break - assert
min(header2_pages) >= 1 - assert max(header2_pages) >= 2 + assert max(header1_pages) == 2 + assert min(header2_pages) == 2 + assert max(header2_pages) == 3 From 9abf10b17bd98ddbe27283b36f0570c97ac3a00d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 14 Oct 2025 14:58:56 +0200 Subject: [PATCH 55/85] cleanup pagebreak test --- .../preprocessors/test_markdown_header_splitter.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 226c28c8e3..1ca748b585 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -267,18 +267,14 @@ def test_secondary_split_with_threshold(): def test_page_break_handling_in_secondary_split(): text = "# Header\nFirst page\fSecond page\fThird page" - splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=2) + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=1) docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - # The page_number should increment at each page break page_numbers = [doc.meta.get("page_number") for doc in split_docs] # Should start at 1 and increment at each \f assert page_numbers[0] == 1 - assert 2 in page_numbers - # Remove: assert 3 in page_numbers - # Instead, check that the max page number is 2 or 3, depending on split alignment - assert max(page_numbers) >= 2 + assert max(page_numbers) == 3 def test_page_break_handling_with_multiple_headers(): From 11da0a86a945bd6888fe13d804164df0fa9c1e50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 14 Oct 2025 15:00:27 +0200 Subject: [PATCH 56/85] minor --- test/components/preprocessors/test_markdown_header_splitter.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 1ca748b585..ca4360905e 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -258,7 +258,6 @@ def test_secondary_split_with_threshold(): docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - # The last chunk should have at least split_threshold words if possible for doc in split_docs[:-1]: assert len(doc.content.split()) == 3 # The last chunk should have at least 2 words (threshold) From 32d8c6862b3cb876ee28a019a1ff003ba61913b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 14 Oct 2025 15:11:53 +0200 Subject: [PATCH 57/85] return doc unchunked if no headers have content --- .../components/preprocessors/markdown_header_splitter.py | 7 +++++++ .../preprocessors/test_markdown_header_splitter.py | 5 +++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index af5c362309..59d8f35bc3 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -79,6 +79,7 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: chunks: list[dict] = [] header_stack: list[Optional[str]] = [None] * 6 active_parents: list[str] = [] + has_content = False # Flag to track if any header has content for i, match in enumerate(matches): # extract header info @@ -103,6 +104,7 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: active_parents.append(header_text) continue + has_content = True # At least one header has content # get parent headers parent_headers = list(active_parents) @@ -120,6 +122,11 @@ def 
_split_text_by_markdown_headers(self, text: str) -> list[dict]: # reset active parents active_parents = [h for h in header_stack[: level - 1] if h is not None] + # return doc unchunked if no headers have content + if not has_content: + logger.info("Document contains only headers with no content; returning original document.") + return [{"content": text, "meta": {}}] + logger.info("Split into {num_chunks} chunks by markdown headers.", num_chunks=len(chunks)) return chunks diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index ca4360905e..a37c2c9b0a 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -115,8 +115,9 @@ def test_split_only_headers(): docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - # Should not create chunks for headers with no content - assert len(split_docs) == 0 + # Return doc without content unchunked + assert len(split_docs) == 1 + assert split_docs[0].content == text # Metadata preservation From bcf56cac1fd6feaec525a7b499ddebdfb5dc827b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 11:35:15 +0200 Subject: [PATCH 58/85] add doc-id to logging statement for unsplit documents --- .../preprocessors/markdown_header_splitter.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 59d8f35bc3..31d2b686fe 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -63,7 +63,7 @@ def __init__( split_threshold=self.split_threshold, ) - def _split_text_by_markdown_headers(self, text: str) -> list[dict]: + def 
_split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: """Split text by ATX-style headers (#) and create chunks with appropriate metadata.""" logger.debug("Splitting text by markdown headers") @@ -72,7 +72,9 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: # return unsplit if no headers found if not matches: - logger.info("No headers found in document; returning full document as single chunk.") + logger.info( + "No headers found in document {doc_id}; returning full document as single chunk.", doc_id=doc_id + ) return [{"content": text, "meta": {"header": None, "parent_headers": []}}] # process headers and build chunks @@ -99,13 +101,12 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: # skip splits w/o content if not content: - # Add as parent for subsequent headers + # add as parent for subsequent headers active_parents = [h for h in header_stack[: level - 1] if h is not None] active_parents.append(header_text) continue - has_content = True # At least one header has content - # get parent headers + has_content = True # at least one header has content parent_headers = list(active_parents) logger.debug( @@ -124,7 +125,9 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: # return doc unchunked if no headers have content if not has_content: - logger.info("Document contains only headers with no content; returning original document.") + logger.info( + "Document {doc_id} contains only headers with no content; returning original document.", doc_id=doc_id + ) return [{"content": text, "meta": {}}] logger.info("Split into {num_chunks} chunks by markdown headers.", num_chunks=len(chunks)) @@ -226,7 +229,7 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis logger.debug("Splitting document with id={doc_id}", doc_id=doc.id) if doc.content is None: continue - splits = self._split_text_by_markdown_headers(doc.content) + splits = 
self._split_text_by_markdown_headers(doc.content, doc.id) docs = [] total_pages = self._calculate_total_pages(doc.content, doc.meta.get("total_pages", 0) if doc.meta else 0) From c5415ec2133a505c328e2ceb2e7a6c3f462ffe3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 11:42:58 +0200 Subject: [PATCH 59/85] remove unneeded logs --- .../components/preprocessors/markdown_header_splitter.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 31d2b686fe..f44b6d1977 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -130,7 +130,6 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: ) return [{"content": text, "meta": {}}] - logger.info("Split into {num_chunks} chunks by markdown headers.", num_chunks=len(chunks)) return chunks def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document]: @@ -142,7 +141,6 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document if not self.secondary_split: return documents - logger.info("Applying secondary splitting by {secondary_split}", secondary_split=self.secondary_split) result_docs = [] for doc in documents: @@ -183,7 +181,9 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document result_docs.append(split) - logger.info("Secondary splitting complete. Final count: {final_count} documents.", final_count=len(result_docs)) + logger.debug( + "Secondary splitting complete. 
Final count: {final_count} documents.", final_count=len(result_docs) + ) return result_docs def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = None) -> dict: @@ -313,7 +313,6 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: return {"documents": []} header_split_docs = self._split_documents_by_markdown_headers(processed_documents) - logger.info("Header splitting produced {num_docs} documents", num_docs=len(header_split_docs)) # secondary splitting if configured final_docs = self._apply_secondary_splitting(header_split_docs) if self.secondary_split else header_split_docs From dff06bc0dd93b34774810b2d63784b59a1005680 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 11:49:38 +0200 Subject: [PATCH 60/85] minor cleanup --- haystack/components/preprocessors/markdown_header_splitter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index f44b6d1977..6f2ce68947 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -278,6 +278,7 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: - `documents`: List of documents with the split texts. Each document includes: - A metadata field `source_id` to track the original document. - A metadata field `page_number` to track the original page number. + - A metadata field `split_id` to uniquely identify each split chunk. - All other metadata copied from the original document. 
""" # validate input documents @@ -319,8 +320,6 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: # assign split_id to all output documents for idx, doc in enumerate(final_docs): - if doc.meta is None: - doc.meta = {} doc.meta["split_id"] = idx return {"documents": final_docs} From a54d25a7ce67ace21a0473059493eaf12f4f40cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 11:56:21 +0200 Subject: [PATCH 61/85] simplify page-number tracking method to not return count, just the updated page number --- .../preprocessors/markdown_header_splitter.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 6f2ce68947..1222f87142 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -169,7 +169,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document for i, split in enumerate(secondary_splits): # calculate page number for this split if i > 0 and secondary_splits[i - 1].content: - _, current_page = self._count_page_breaks_and_update(secondary_splits[i - 1].content, current_page) + current_page = self._update_page_number_with_breaks(secondary_splits[i - 1].content, current_page) # set page number to meta split.meta["page_number"] = current_page @@ -198,16 +198,16 @@ def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = target_dict[new_key] = value return target_dict - def _count_page_breaks_and_update(self, content: str, current_page: int) -> tuple[int, int]: + def _update_page_number_with_breaks(self, content: str, current_page: int) -> int: """ - Count page breaks in content and return updated page count. + Update page number based on page breaks in content. 
:param content: Content to check for page breaks :param current_page: Current page number - :return: Tuple of (page_breaks_count, new_current_page) + :return: New current page number """ if not isinstance(content, str): - return 0, current_page + return current_page page_breaks = content.count(self.page_break_character) new_page_number = current_page + page_breaks @@ -220,7 +220,7 @@ def _count_page_breaks_and_update(self, content: str, current_page: int) -> tupl new=new_page_number, ) - return page_breaks, new_page_number + return new_page_number def _split_documents_by_markdown_headers(self, documents: list[Document]) -> list[Document]: """Split a list of documents by markdown headers, preserving metadata.""" @@ -244,7 +244,7 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis if doc.meta: meta = self._flatten_dict(doc.meta) meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) - _, current_page = self._count_page_breaks_and_update(split["content"], current_page) + current_page = self._update_page_number_with_breaks(split["content"], current_page) if split.get("meta"): meta.update(self._flatten_dict(split.get("meta") or {})) docs.append(Document(content=split["content"], meta=meta)) From a34c7a6fb135fa142e3396193432029c844aac20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 12:02:14 +0200 Subject: [PATCH 62/85] add dev comment to mypy check for doc.content is None --- haystack/components/preprocessors/markdown_header_splitter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 1222f87142..aaf825c5aa 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -227,6 +227,7 @@ def _split_documents_by_markdown_headers(self, documents: 
list[Document]) -> lis result_docs = [] for doc in documents: logger.debug("Splitting document with id={doc_id}", doc_id=doc.id) + # mypy: doc.content is Optional[str], so we must check for None before passing to splitting method if doc.content is None: continue splits = self._split_text_by_markdown_headers(doc.content, doc.id) From 7bc798e3cd8fe45cfd922e15d6015a4f6e536179 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= <45487933+OGuggenbuehl@users.noreply.github.com> Date: Thu, 16 Oct 2025 12:03:37 +0200 Subject: [PATCH 63/85] Update haystack/components/preprocessors/markdown_header_splitter.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- haystack/components/preprocessors/markdown_header_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index aaf825c5aa..187a01dd19 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -232,7 +232,7 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis continue splits = self._split_text_by_markdown_headers(doc.content, doc.id) docs = [] - total_pages = self._calculate_total_pages(doc.content, doc.meta.get("total_pages", 0) if doc.meta else 0) + total_pages = self._calculate_total_pages(doc.content, 0) current_page = doc.meta.get("page_number", 1) if doc.meta else 1 logger.debug( From a7eef6b7bc265d882a90af872d00ceed2ef1cc15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 12:23:05 +0200 Subject: [PATCH 64/85] remove split meta flattening --- .../components/preprocessors/markdown_header_splitter.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py 
b/haystack/components/preprocessors/markdown_header_splitter.py index 187a01dd19..3ea9ee4828 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -246,8 +246,6 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis meta = self._flatten_dict(doc.meta) meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) current_page = self._update_page_number_with_breaks(split["content"], current_page) - if split.get("meta"): - meta.update(self._flatten_dict(split.get("meta") or {})) docs.append(Document(content=split["content"], meta=meta)) logger.debug( "Split into {num_docs} documents for id={doc_id}, final page: {current_page}", @@ -263,9 +261,6 @@ def _calculate_total_pages(self, content: str, existing_total: int = 0) -> int: if existing_total > 0: return existing_total - if not isinstance(content, str): - return 1 - return content.count(self.page_break_character) + 1 @component.output_types(documents=list[Document]) From 5b5fc93330f9138bc8e5a77e8803108cea06438d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 12:25:20 +0200 Subject: [PATCH 65/85] keep empty meta return consistent --- haystack/components/preprocessors/markdown_header_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 3ea9ee4828..c43a2f1f95 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -75,7 +75,7 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: logger.info( "No headers found in document {doc_id}; returning full document as single chunk.", doc_id=doc_id ) - return [{"content": text, "meta": {"header": None, "parent_headers": 
[]}}] + return [{"content": text, "meta": {}}] # process headers and build chunks chunks: list[dict] = [] From 8ef5af032c57cf23cf54629444d230ed2e609b8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 12:26:38 +0200 Subject: [PATCH 66/85] remove unneeded content is none check --- haystack/components/preprocessors/markdown_header_splitter.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index c43a2f1f95..cc1ee0448e 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -138,9 +138,6 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document Ensures page counting is maintained across splits. """ - if not self.secondary_split: - return documents - result_docs = [] for doc in documents: From f1e3739f6b95b6c4550662a8f199ebecce8d89c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 12:40:31 +0200 Subject: [PATCH 67/85] update tests to reflect empty meta dict for unsplit docs --- haystack/components/preprocessors/markdown_header_splitter.py | 2 ++ .../components/preprocessors/test_markdown_header_splitter.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index cc1ee0448e..f4e51527eb 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -242,6 +242,8 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis if doc.meta: meta = self._flatten_dict(doc.meta) meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) + if split.get("meta"): + 
meta.update(split["meta"]) current_page = self._update_page_number_with_breaks(split["content"], current_page) docs.append(Document(content=split["content"], meta=meta)) logger.debug( diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index a37c2c9b0a..aa6a6e9235 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -81,10 +81,10 @@ def test_split_no_headers(): docs = [Document(content="No headers here."), Document(content="Just some text without headers.")] result = splitter.run(documents=docs) split_docs = result["documents"] - # Should return one doc per input, header is None + # Should return one doc per input, and no header key in meta assert len(split_docs) == 2 for doc in split_docs: - assert doc.meta["header"] is None + assert "header" not in doc.meta def test_split_multiple_documents(sample_text): From df7e775a9967b3417b0c9420be6e3418727670f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 13:42:20 +0200 Subject: [PATCH 68/85] clean up total_page counts --- .../preprocessors/markdown_header_splitter.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index f4e51527eb..130a1ce417 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -221,6 +221,7 @@ def _update_page_number_with_breaks(self, content: str, current_page: int) -> in def _split_documents_by_markdown_headers(self, documents: list[Document]) -> list[Document]: """Split a list of documents by markdown headers, preserving metadata.""" + result_docs = [] for doc in documents: logger.debug("Splitting document with id={doc_id}", 
doc_id=doc.id) @@ -229,11 +230,11 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis continue splits = self._split_text_by_markdown_headers(doc.content, doc.id) docs = [] - total_pages = self._calculate_total_pages(doc.content, 0) current_page = doc.meta.get("page_number", 1) if doc.meta else 1 + total_pages = doc.content.count(self.page_break_character) + 1 logger.debug( - "Starting page number: {current_page}, Total pages: {total_pages}", + "Processing page number: {current_page} out of {total_pages}", current_page=current_page, total_pages=total_pages, ) @@ -241,7 +242,7 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis meta = {} if doc.meta: meta = self._flatten_dict(doc.meta) - meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) + meta.update({"source_id": doc.id, "page_number": current_page}) if split.get("meta"): meta.update(split["meta"]) current_page = self._update_page_number_with_breaks(split["content"], current_page) @@ -255,13 +256,6 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis result_docs.extend(docs) return result_docs - def _calculate_total_pages(self, content: str, existing_total: int = 0) -> int: - """Calculate total pages based on content and existing metadata.""" - if existing_total > 0: - return existing_total - - return content.count(self.page_break_character) + 1 - @component.output_types(documents=list[Document]) def run(self, documents: list[Document]) -> dict[str, list[Document]]: """ From 3c1c3762307ddb42ed6d2478da13718f3e919ffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 13:46:01 +0200 Subject: [PATCH 69/85] remove unneeded meta check --- .../components/preprocessors/test_markdown_header_splitter.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py 
b/test/components/preprocessors/test_markdown_header_splitter.py index aa6a6e9235..02c0c7beac 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -58,10 +58,6 @@ def test_basic_split(sample_text): subheader123_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.3") assert "Content under header 1.2.3." in subheader123_doc.content - # Ensure all documents have a header in their metadata - for doc in split_docs: - assert doc.meta.get("header") is not None - def test_split_parentheaders(sample_text): splitter = MarkdownHeaderSplitter() From 86feef6844aed0bf5856ab3267b17d2a895bda67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= <45487933+OGuggenbuehl@users.noreply.github.com> Date: Thu, 16 Oct 2025 14:23:12 +0200 Subject: [PATCH 70/85] Update test/components/preprocessors/test_markdown_header_splitter.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../test_markdown_header_splitter.py | 60 +++++++++++++++++-- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 02c0c7beac..71e08c2168 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -280,9 +280,57 @@ def test_page_break_handling_with_multiple_headers(): result = splitter.run(documents=docs) split_docs = result["documents"] # Collect page numbers for each header - header1_pages = [doc.meta.get("page_number") for doc in split_docs if doc.meta.get("header") == "Header 1"] - header2_pages = [doc.meta.get("page_number") for doc in split_docs if doc.meta.get("header") == "Header 2"] - assert min(header1_pages) == 1 - assert max(header1_pages) == 2 - assert min(header2_pages) == 2 - assert max(header2_pages) == 3 + 
assert len(split_docs) == 4 + + # Split 1 + assert split_docs[0].content == "\nPage 1\fPage " + assert split_docs[0].meta == { + "source_id": ANY, + "total_pages": 3, + "page_number": 1, + "header": "Header 1", + "parent_headers": [], + "split_id": 0, + "split_idx_start": 0, + } + + # Split 2 + assert split_docs[1].content == "2" + assert split_docs[1].meta == { + "source_id": ANY, + "total_pages": 3, + "page_number": 2, + "header": "Header 1", + "parent_headers": [], + "split_id": 1, + "split_idx_start": 13, + } + + # Split 3 + assert split_docs[2].content == "\nPage 3\fPage " + assert split_docs[2].meta == { + "source_id": ANY, + "total_pages": 3, + "page_number": 2, + "header": "Header 2", + "parent_headers": [], + "split_id": 2, + "split_idx_start": 0, + } + + # Split 4 + assert split_docs[3].content == "4" + assert split_docs[3].meta == { + "source_id": ANY, + "total_pages": 3, + "page_number": 3, + "header": "Header 2", + "parent_headers": [], + "split_id": 3, + "split_idx_start": 13, + } + + # Check reconstruction + # NOTE: This doesn't seem to pass currently + reconstructed_text = "".join(doc.content for doc in split_docs) + assert reconstructed_text == text From c22b57db35e67dc9844016373ca52943ea3c7e93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 17 Oct 2025 18:01:17 +0200 Subject: [PATCH 71/85] implement keep_headers parameter --- .../preprocessors/markdown_header_splitter.py | 54 +++++++---- .../test_markdown_header_splitter.py | 92 ++++++++----------- 2 files changed, 74 insertions(+), 72 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 130a1ce417..993c5fb720 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -27,6 +27,7 @@ def __init__( self, *, page_break_character: str = "\f", + keep_headers: bool = True, 
secondary_split: Optional[Literal["word", "passage", "period", "line"]] = None, split_length: int = 200, split_overlap: int = 0, @@ -37,6 +38,8 @@ def __init__( Initialize the MarkdownHeaderSplitter. :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\f"). + :param keep_headers: If True, headers are kept in the content. If False, headers are moved to metadata. + Defaults to True. :param secondary_split: Optional secondary split condition after header splitting. Options are None, "word", "passage", "period", "line". Defaults to None. :param split_length: The maximum number of units in each split when using secondary splitting. Defaults to 200. @@ -52,6 +55,7 @@ def __init__( self.split_overlap = split_overlap self.split_threshold = split_threshold self.skip_empty_documents = skip_empty_documents + self.keep_headers = keep_headers self._header_pattern = re.compile(r"(?m)^(#{1,6}) (.+)$") # ATX-style .md-headers # initialize secondary_splitter only if needed @@ -80,8 +84,9 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: # process headers and build chunks chunks: list[dict] = [] header_stack: list[Optional[str]] = [None] * 6 - active_parents: list[str] = [] - has_content = False # Flag to track if any header has content + active_parents: list[str] = [] # track active parent headers + pending_headers: list[str] = [] # store empty headers to prepend to next content + has_content = False # flag to track if any header has content for i, match in enumerate(matches): # extract header info @@ -99,11 +104,16 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: for j in range(level, 6): header_stack[j] = None + # prepare header_line if keep_headers + header_line = f"{header_prefix} {header_text}" + # skip splits w/o content if not content: # add as parent for subsequent headers active_parents = [h for h in header_stack[: level - 1] if h is not None] 
active_parents.append(header_text) + if self.keep_headers: + pending_headers.append(header_line) continue has_content = True # at least one header has content @@ -113,12 +123,21 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: "Creating chunk for header '{header_text}' at level {level}", header_text=header_text, level=level ) - chunks.append( - { - "content": f"{header_prefix} {header_text}\n{content}", - "meta": {"header": header_text, "parent_headers": parent_headers}, - } - ) + if self.keep_headers: + # add pending & current header to content + chunk_content = "" + if pending_headers: + chunk_content += "\n".join(pending_headers) + "\n" + chunk_content += f"{header_line}\n{content}" + chunks.append( + { + "content": chunk_content, + "meta": {} if self.keep_headers else {"header": header_text, "parent_headers": parent_headers}, + } + ) + pending_headers = [] # reset pending headers + else: + chunks.append({"content": content, "meta": {"header": header_text, "parent_headers": parent_headers}}) # reset active parents active_parents = [h for h in header_stack[: level - 1] if h is not None] @@ -145,11 +164,13 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document result_docs.append(doc) continue - # extract header information - header_match = re.search(self._header_pattern, doc.content) content_for_splitting: str = doc.content - if header_match: - content_for_splitting = doc.content[header_match.end() :] + + if not self.keep_headers: # skip header extraction if keep_headers + # extract header information + header_match = re.search(self._header_pattern, doc.content) + if header_match: + content_for_splitting = doc.content[header_match.end() :] if not content_for_splitting or not content_for_splitting.strip(): # skip empty content result_docs.append(doc) @@ -171,10 +192,11 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # set page number to meta split.meta["page_number"] 
= current_page - # preserve header metadata - for key in ["header", "parent_headers"]: - if key in doc.meta: - split.meta[key] = doc.meta[key] + # preserve header metadata if we're not keeping headers in content + if not self.keep_headers: + for key in ["header", "parent_headers"]: + if key in doc.meta: + split.meta[key] = doc.meta[key] result_docs.append(split) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 71e08c2168..0d1091797b 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 +from unittest.mock import ANY + import pytest from haystack import Document @@ -29,7 +31,7 @@ def sample_text(): # Basic splitting and structure def test_basic_split(sample_text): - splitter = MarkdownHeaderSplitter() + splitter = MarkdownHeaderSplitter(keep_headers=False) docs = [Document(content=sample_text)] result = splitter.run(documents=docs) split_docs = result["documents"] @@ -60,7 +62,7 @@ def test_basic_split(sample_text): def test_split_parentheaders(sample_text): - splitter = MarkdownHeaderSplitter() + splitter = MarkdownHeaderSplitter(keep_headers=False) docs = [Document(content=sample_text), Document(content="# H1\n## H2\n### H3\nContent")] result = splitter.run(documents=docs) split_docs = result["documents"] @@ -84,7 +86,7 @@ def test_split_no_headers(): def test_split_multiple_documents(sample_text): - splitter = MarkdownHeaderSplitter() + splitter = MarkdownHeaderSplitter(keep_headers=False) docs = [ Document(content=sample_text), Document(content="# Another Header\nSome content."), @@ -119,7 +121,7 @@ def test_split_only_headers(): # Metadata preservation def test_preserve_document_metadata(): """Test that document metadata is preserved through splitting.""" - splitter = MarkdownHeaderSplitter() + splitter = 
MarkdownHeaderSplitter(keep_headers=False) docs = [Document(content="# Header\nContent", meta={"source": "test", "importance": "high", "custom_field": 123})] result = splitter.run(documents=docs) @@ -187,7 +189,7 @@ def test_empty_content_handling(): def test_split_id_sequentiality_primary_and_secondary(sample_text): # Test primary splitting - splitter = MarkdownHeaderSplitter() + splitter = MarkdownHeaderSplitter(keep_headers=False) docs = [Document(content=sample_text)] result = splitter.run(documents=docs) split_docs = result["documents"] @@ -206,7 +208,7 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text): split_docs = result["documents"] # Test number of documents - assert len(split_docs) == 10 + assert len(split_docs) == 12 split_ids = [doc.meta["split_id"] for doc in split_docs] assert split_ids == list(range(len(split_ids))) @@ -217,14 +219,14 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text): split_docs = result["documents"] # Test number of documents - assert len(split_docs) == 12 + assert len(split_docs) == 14 split_ids = [doc.meta["split_id"] for doc in split_docs] assert split_ids == list(range(len(split_ids))) def test_secondary_split_with_overlap(): - realistic_text = ( + text = ( "# Introduction\n" "This is the introduction section with some words for testing overlap splitting. " "It should be split into chunks with overlap.\n" @@ -234,8 +236,8 @@ def test_secondary_split_with_overlap(): "### Subsection\n" "This subsection contains additional information and should also be split with overlap." 
) - splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=4, split_overlap=2) - docs = [Document(content=realistic_text)] + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=4, split_overlap=2, keep_headers=False) + docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] assert len(split_docs) == 21 @@ -251,7 +253,7 @@ def test_secondary_split_with_overlap(): def test_secondary_split_with_threshold(): text = "# Header\n" + " ".join([f"word{i}" for i in range(1, 11)]) - splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3, split_threshold=2) + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3, split_threshold=2, keep_headers=False) docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] @@ -274,63 +276,41 @@ def test_page_break_handling_in_secondary_split(): def test_page_break_handling_with_multiple_headers(): - text = "# Header 1\nPage 1\fPage 2\n# Header 2\nPage 3\fPage 4" - splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=2) + text = "# Header\nFirst page\f Second page\f Third page" + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=1, keep_headers=True) docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - # Collect page numbers for each header - assert len(split_docs) == 4 + assert len(split_docs) == 7 # Split 1 - assert split_docs[0].content == "\nPage 1\fPage " - assert split_docs[0].meta == { - "source_id": ANY, - "total_pages": 3, - "page_number": 1, - "header": "Header 1", - "parent_headers": [], - "split_id": 0, - "split_idx_start": 0, - } + assert split_docs[0].content == "# " + assert split_docs[0].meta == {"source_id": ANY, "page_number": 1, "split_id": 0, "split_idx_start": 0} # Split 2 - assert split_docs[1].content == "2" - assert split_docs[1].meta == { - "source_id": ANY, - 
"total_pages": 3, - "page_number": 2, - "header": "Header 1", - "parent_headers": [], - "split_id": 1, - "split_idx_start": 13, - } + assert split_docs[1].content == "Header\nFirst " + assert split_docs[1].meta == {"source_id": ANY, "page_number": 1, "split_id": 1, "split_idx_start": 2} # Split 3 - assert split_docs[2].content == "\nPage 3\fPage " - assert split_docs[2].meta == { - "source_id": ANY, - "total_pages": 3, - "page_number": 2, - "header": "Header 2", - "parent_headers": [], - "split_id": 2, - "split_idx_start": 0, - } + assert split_docs[2].content == "page\f " + assert split_docs[2].meta == {"source_id": ANY, "page_number": 1, "split_id": 2, "split_idx_start": 15} # Split 4 - assert split_docs[3].content == "4" - assert split_docs[3].meta == { - "source_id": ANY, - "total_pages": 3, - "page_number": 3, - "header": "Header 2", - "parent_headers": [], - "split_id": 3, - "split_idx_start": 13, - } + assert split_docs[3].content == "Second " + assert split_docs[3].meta == {"source_id": ANY, "page_number": 2, "split_id": 3, "split_idx_start": 21} + + # Split 5 + assert split_docs[4].content == "page\f " + assert split_docs[4].meta == {"source_id": ANY, "page_number": 2, "split_id": 4, "split_idx_start": 28} + + # Split 6 + assert split_docs[5].content == "Third " + assert split_docs[5].meta == {"source_id": ANY, "page_number": 3, "split_id": 5, "split_idx_start": 34} + + # Split 7 + assert split_docs[6].content == "page" + assert split_docs[6].meta == {"source_id": ANY, "page_number": 3, "split_id": 6, "split_idx_start": 40} # Check reconstruction - # NOTE: This doesn't seem to pass currently reconstructed_text = "".join(doc.content for doc in split_docs) assert reconstructed_text == text From 7c03a0494aff8c075b85e3cfa1697ebdb45ab438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 17 Oct 2025 19:18:01 +0200 Subject: [PATCH 72/85] remove meta-dict flattening --- .../preprocessors/markdown_header_splitter.py | 14 +------------- 
1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 993c5fb720..4e837a8db5 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -205,18 +205,6 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document ) return result_docs - def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = None) -> dict: - """Flatten a nested dictionary, concatenating keys with underscores.""" - if target_dict is None: - target_dict = {} - for key, value in d.items(): - new_key = f"{prefix}{key}" if prefix else key - if isinstance(value, dict): - self._flatten_dict(value, f"{new_key}_", target_dict) - else: - target_dict[new_key] = value - return target_dict - def _update_page_number_with_breaks(self, content: str, current_page: int) -> int: """ Update page number based on page breaks in content. 
@@ -263,7 +251,7 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis for split in splits: meta = {} if doc.meta: - meta = self._flatten_dict(doc.meta) + meta = doc.meta.copy() meta.update({"source_id": doc.id, "page_number": current_page}) if split.get("meta"): meta.update(split["meta"]) From 9a8ca7676d90a19638533f42199e693d5185fb34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 21 Oct 2025 10:56:48 +0200 Subject: [PATCH 73/85] add minor sanity checks --- test/components/preprocessors/test_markdown_header_splitter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 0d1091797b..67f1e040e1 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -83,6 +83,9 @@ def test_split_no_headers(): assert len(split_docs) == 2 for doc in split_docs: assert "header" not in doc.meta + # Sanity Checks + assert split_docs[0].content == docs[0].content + assert split_docs[1].content == docs[1].content def test_split_multiple_documents(sample_text): From 2f1e2037d94f0be8bab8158924ab9046fd7bff05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= <45487933+OGuggenbuehl@users.noreply.github.com> Date: Tue, 21 Oct 2025 10:53:51 +0200 Subject: [PATCH 74/85] Update test/components/preprocessors/test_markdown_header_splitter.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../preprocessors/test_markdown_header_splitter.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 67f1e040e1..8d5694e6cd 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ 
b/test/components/preprocessors/test_markdown_header_splitter.py @@ -45,8 +45,13 @@ def test_basic_split(sample_text): assert "Subheader 1.2.3" in headers # Check that content is present and correct - header1_doc = next(doc for doc in split_docs if doc.meta["header"] == "Header 1") - assert "Content under header 1." in header1_doc.content + # Test first split + header1_doc = split_docs[0] + assert header1_doc.meta["header"] == "Header 1" + assert header1_doc.meta["split_id"] == 0 + assert header1_doc.meta["page_number"] == 1 + assert header1_doc.meta["parent_headers"] == [] + assert header1_doc.content == "# Header 1\nContent under header 1." subheader111_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.1.1") assert "Content under sub-header 1.1.1" in subheader111_doc.content From b22feb5c001155223dda12af69dff784a0c4e5af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 10:33:46 +0100 Subject: [PATCH 75/85] add warmup --- .../preprocessors/markdown_header_splitter.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 4e837a8db5..f6792a1862 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -57,6 +57,7 @@ def __init__( self.skip_empty_documents = skip_empty_documents self.keep_headers = keep_headers self._header_pattern = re.compile(r"(?m)^(#{1,6}) (.+)$") # ATX-style .md-headers + self._is_warmed_up = False # initialize secondary_splitter only if needed if self.secondary_split: @@ -67,6 +68,14 @@ def __init__( split_threshold=self.split_threshold, ) + def warm_up(self): + """ + Warm up the MarkdownHeaderSplitter. 
+ """ + if self.secondary_split and not self._is_warmed_up: + self.secondary_splitter.warm_up() + self._is_warmed_up = True + def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: """Split text by ATX-style headers (#) and create chunks with appropriate metadata.""" logger.debug("Splitting text by markdown headers") @@ -97,7 +106,7 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: # get content start = match.end() end = matches[i + 1].start() if i + 1 < len(matches) else len(text) - content = text[start:end].strip() + content = text[start:end] # update header stack to track nesting header_stack[level - 1] = header_text From 85018315d8af7d8497a76aeefb0ca3cc8b2df6c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= <45487933+OGuggenbuehl@users.noreply.github.com> Date: Fri, 21 Nov 2025 10:29:47 +0100 Subject: [PATCH 76/85] Update haystack/components/preprocessors/markdown_header_splitter.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../components/preprocessors/markdown_header_splitter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index f6792a1862..cf933abae6 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -46,8 +46,9 @@ def __init__( :param split_overlap: The number of overlapping units for each split when using secondary splitting. Defaults to 0. :param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0. - :param skip_empty_documents: If True, skip documents with empty content. If False, process empty documents. - Defaults to True. + :param skip_empty_documents: Choose whether to skip documents with empty content. Default is True. 
+ Set to False when downstream components in the Pipeline (like LLMDocumentContentExtractor) can extract text + from non-textual documents. """ self.page_break_character = page_break_character self.secondary_split = secondary_split From 23da68ed7b2ffe858f5a0f1fd955c4b64461b6af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 10:54:54 +0100 Subject: [PATCH 77/85] fix splitting when keeping headers --- .../preprocessors/markdown_header_splitter.py | 4 +- .../test_markdown_header_splitter.py | 124 ++++++++++++++---- 2 files changed, 102 insertions(+), 26 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index cf933abae6..463ea6ceed 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -118,7 +118,7 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: header_line = f"{header_prefix} {header_text}" # skip splits w/o content - if not content: + if not content.strip(): # add as parent for subsequent headers active_parents = [h for h in header_stack[: level - 1] if h is not None] active_parents.append(header_text) @@ -138,7 +138,7 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: chunk_content = "" if pending_headers: chunk_content += "\n".join(pending_headers) + "\n" - chunk_content += f"{header_line}\n{content}" + chunk_content += f"{header_line}{content}" chunks.append( { "content": chunk_content, diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 8d5694e6cd..0cfc732bbe 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -31,39 +31,115 @@ def sample_text(): # Basic splitting and 
structure def test_basic_split(sample_text): - splitter = MarkdownHeaderSplitter(keep_headers=False) + splitter = MarkdownHeaderSplitter() docs = [Document(content=sample_text)] result = splitter.run(documents=docs) split_docs = result["documents"] - # Should split into all headers with content - headers = [doc.meta["header"] for doc in split_docs] - assert "Header 1" in headers - assert "Subheader 1.1.1" in headers - assert "Subheader 1.2.1" in headers - assert "Subheader 1.2.2" in headers - assert "Subheader 1.2.3" in headers - # Check that content is present and correct # Test first split header1_doc = split_docs[0] - assert header1_doc.meta["header"] == "Header 1" + # assert header1_doc.meta["header"] == "Header 1" assert header1_doc.meta["split_id"] == 0 assert header1_doc.meta["page_number"] == 1 - assert header1_doc.meta["parent_headers"] == [] - assert header1_doc.content == "# Header 1\nContent under header 1." - - subheader111_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.1.1") - assert "Content under sub-header 1.1.1" in subheader111_doc.content - - subheader121_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.1") - assert "Content under header 1.2.1." in subheader121_doc.content - - subheader122_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.2") - assert "Content under header 1.2.2." in subheader122_doc.content - - subheader123_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.3") - assert "Content under header 1.2.3." 
in subheader123_doc.content + # assert header1_doc.meta["parent_headers"] == [] + assert header1_doc.content == "# Header 1\nContent under header 1.\n" + + # Test second split + subheader111_doc = split_docs[1] + # assert subheader111_doc.meta["header"] == "Subheader 1.1.1" + assert subheader111_doc.meta["split_id"] == 1 + assert subheader111_doc.meta["page_number"] == 1 + # assert subheader111_doc.meta["parent_headers"] == ["Header 1", "Header 1.1"] + assert subheader111_doc.content == "## Header 1.1\n### Subheader 1.1.1\nContent under sub-header 1.1.1\n" + + # Test third split + subheader121_doc = split_docs[2] + # assert subheader121_doc.meta["header"] == "Subheader 1.2.1" + assert subheader121_doc.meta["split_id"] == 2 + assert subheader121_doc.meta["page_number"] == 1 + # assert subheader121_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] + assert subheader121_doc.content == "## Header 1.2\n### Subheader 1.2.1\nContent under header 1.2.1.\n" + + # Test fourth split + subheader122_doc = split_docs[3] + # assert subheader122_doc.meta["header"] == "Subheader 1.2.2" + assert subheader122_doc.meta["split_id"] == 3 + assert subheader122_doc.meta["page_number"] == 1 + # assert subheader122_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] + assert subheader122_doc.content == "### Subheader 1.2.2\nContent under header 1.2.2.\n" + + # Test fifth split + subheader123_doc = split_docs[4] + # assert subheader123_doc.meta["header"] == "Subheader 1.2.3" + assert subheader123_doc.meta["split_id"] == 4 + assert subheader123_doc.meta["page_number"] == 1 + # assert subheader123_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] + assert subheader123_doc.content == "### Subheader 1.2.3\nContent under header 1.2.3." 
+ + # Sanity check: reconstruct original text + reconstructed_doc = "".join([doc.content for doc in split_docs]) + assert reconstructed_doc == sample_text + + +# def test_split_without_headers(sample_text): +# splitter = MarkdownHeaderSplitter(keep_headers=False) +# docs = [Document(content=sample_text)] +# result = splitter.run(documents=docs) +# split_docs = result["documents"] + +# # Should split into all headers with content +# headers = [doc.meta["header"] for doc in split_docs] +# assert "Header 1" in headers +# assert "Subheader 1.1.1" in headers +# assert "Subheader 1.2.1" in headers +# assert "Subheader 1.2.2" in headers +# assert "Subheader 1.2.3" in headers + +# # Check that content is present and correct +# # Test first split +# header1_doc = split_docs[0] +# # assert header1_doc.meta["header"] == "Header 1" +# assert header1_doc.meta["split_id"] == 0 +# assert header1_doc.meta["page_number"] == 1 +# # assert header1_doc.meta["parent_headers"] == [] +# assert header1_doc.content == "# Header 1\n\nContent under header 1.\n" + +# # Test second split +# subheader111_doc = split_docs[1] +# # assert subheader111_doc.meta["header"] == "Subheader 1.1.1" +# assert subheader111_doc.meta["split_id"] == 1 +# assert subheader111_doc.meta["page_number"] == 1 +# # assert subheader111_doc.meta["parent_headers"] == ["Header 1", "Header 1.1"] +# assert subheader111_doc.content == "## Header 1.1\n\n### Subheader 1.1.1\nContent under sub-header 1.1.1\n" + +# # Test third split +# subheader121_doc = split_docs[2] +# # assert subheader121_doc.meta["header"] == "Subheader 1.2.1" +# assert subheader121_doc.meta["split_id"] == 2 +# assert subheader121_doc.meta["page_number"] == 1 +# # assert subheader121_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] +# assert subheader121_doc.content == "## Header 1.2\n\n### Subheader 1.2.1\nContent under header 1.2.1.\n" + +# # Test fourth split +# subheader122_doc = split_docs[3] +# # assert subheader122_doc.meta["header"] == 
"Subheader 1.2.2" +# assert subheader122_doc.meta["split_id"] == 3 +# assert subheader122_doc.meta["page_number"] == 1 +# # assert subheader122_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] +# assert subheader122_doc.content == "### Subheader 1.2.2\n\nContent under header 1.2.2.\n" + +# # Test fifth split +# subheader123_doc = split_docs[4] +# # assert subheader123_doc.meta["header"] == "Subheader 1.2.3" +# assert subheader123_doc.meta["split_id"] == 4 +# assert subheader123_doc.meta["page_number"] == 1 +# # assert subheader123_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] +# assert subheader123_doc.content == "### Subheader 1.2.3\n\nContent under header 1.2.3.\n" + +# # Sanity check: reconstruct original text +# reconstructed_doc = "".join([doc.content for doc in split_docs]) +# assert reconstructed_doc == sample_text def test_split_parentheaders(sample_text): From ccc10577c5196cea168b307ba815814aa23eef40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 11:34:27 +0100 Subject: [PATCH 78/85] test cleanup to cover keep_headers=True --- .../test_markdown_header_splitter.py | 35 +++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 0cfc732bbe..ad9cf2a6c4 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -205,7 +205,7 @@ def test_split_only_headers(): # Metadata preservation def test_preserve_document_metadata(): """Test that document metadata is preserved through splitting.""" - splitter = MarkdownHeaderSplitter(keep_headers=False) + splitter = MarkdownHeaderSplitter(keep_headers=False) # keep_headers=True case is covered by this test too docs = [Document(content="# Header\nContent", meta={"source": "test", "importance": "high", "custom_field": 
123})] result = splitter.run(documents=docs) @@ -273,7 +273,7 @@ def test_empty_content_handling(): def test_split_id_sequentiality_primary_and_secondary(sample_text): # Test primary splitting - splitter = MarkdownHeaderSplitter(keep_headers=False) + splitter = MarkdownHeaderSplitter() docs = [Document(content=sample_text)] result = splitter.run(documents=docs) split_docs = result["documents"] @@ -320,6 +320,7 @@ def test_secondary_split_with_overlap(): "### Subsection\n" "This subsection contains additional information and should also be split with overlap." ) + # keep_headers=False splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=4, split_overlap=2, keep_headers=False) docs = [Document(content=text)] result = splitter.run(documents=docs) @@ -333,10 +334,38 @@ def test_secondary_split_with_overlap(): prev_words = prev_doc.content.split() curr_words = curr_doc.content.split() assert prev_words[-2:] == curr_words[:2] + # keep_headers=True + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=4, split_overlap=2) + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + assert len(split_docs) == 24 + + assert split_docs[0].content.startswith("# Introduction") + assert all("header" not in doc.meta for doc in split_docs) def test_secondary_split_with_threshold(): text = "# Header\n" + " ".join([f"word{i}" for i in range(1, 11)]) + # keep_headers=True + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3, split_threshold=2, keep_headers=True) + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + for i, doc in enumerate(split_docs): + words = doc.content.split() + if i == 0: + # First chunk includes header-hashtag plus split_length words + assert words[:2] == ["#", "Header"] + assert len(words) == 4 + elif i < len(split_docs) - 1: + # Subsequent chunks should have split_length words + assert len(words) 
== 3 + else: + # Last chunk should have at least split_threshold words + assert len(words) >= 2 + + # keep_headers=False splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3, split_threshold=2, keep_headers=False) docs = [Document(content=text)] result = splitter.run(documents=docs) @@ -361,7 +390,7 @@ def test_page_break_handling_in_secondary_split(): def test_page_break_handling_with_multiple_headers(): text = "# Header\nFirst page\f Second page\f Third page" - splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=1, keep_headers=True) + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=1) docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] From c4a5c171b8e9d6a90425fccbfd9d91bc091d9834 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 11:47:08 +0100 Subject: [PATCH 79/85] add tests for keep_headers=False splitting --- .../preprocessors/markdown_header_splitter.py | 2 + .../test_markdown_header_splitter.py | 112 +++++++++--------- 2 files changed, 56 insertions(+), 58 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 463ea6ceed..f28437dcc0 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -108,6 +108,8 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: start = match.end() end = matches[i + 1].start() if i + 1 < len(matches) else len(text) content = text[start:end] + if not self.keep_headers and content.startswith("\n"): + content = content[1:] # update header stack to track nesting header_stack[level - 1] = header_text diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 
ad9cf2a6c4..fb51c3e8de 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -82,64 +82,60 @@ def test_basic_split(sample_text): assert reconstructed_doc == sample_text -# def test_split_without_headers(sample_text): -# splitter = MarkdownHeaderSplitter(keep_headers=False) -# docs = [Document(content=sample_text)] -# result = splitter.run(documents=docs) -# split_docs = result["documents"] - -# # Should split into all headers with content -# headers = [doc.meta["header"] for doc in split_docs] -# assert "Header 1" in headers -# assert "Subheader 1.1.1" in headers -# assert "Subheader 1.2.1" in headers -# assert "Subheader 1.2.2" in headers -# assert "Subheader 1.2.3" in headers - -# # Check that content is present and correct -# # Test first split -# header1_doc = split_docs[0] -# # assert header1_doc.meta["header"] == "Header 1" -# assert header1_doc.meta["split_id"] == 0 -# assert header1_doc.meta["page_number"] == 1 -# # assert header1_doc.meta["parent_headers"] == [] -# assert header1_doc.content == "# Header 1\n\nContent under header 1.\n" - -# # Test second split -# subheader111_doc = split_docs[1] -# # assert subheader111_doc.meta["header"] == "Subheader 1.1.1" -# assert subheader111_doc.meta["split_id"] == 1 -# assert subheader111_doc.meta["page_number"] == 1 -# # assert subheader111_doc.meta["parent_headers"] == ["Header 1", "Header 1.1"] -# assert subheader111_doc.content == "## Header 1.1\n\n### Subheader 1.1.1\nContent under sub-header 1.1.1\n" - -# # Test third split -# subheader121_doc = split_docs[2] -# # assert subheader121_doc.meta["header"] == "Subheader 1.2.1" -# assert subheader121_doc.meta["split_id"] == 2 -# assert subheader121_doc.meta["page_number"] == 1 -# # assert subheader121_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] -# assert subheader121_doc.content == "## Header 1.2\n\n### Subheader 1.2.1\nContent under header 1.2.1.\n" - -# # 
Test fourth split -# subheader122_doc = split_docs[3] -# # assert subheader122_doc.meta["header"] == "Subheader 1.2.2" -# assert subheader122_doc.meta["split_id"] == 3 -# assert subheader122_doc.meta["page_number"] == 1 -# # assert subheader122_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] -# assert subheader122_doc.content == "### Subheader 1.2.2\n\nContent under header 1.2.2.\n" - -# # Test fifth split -# subheader123_doc = split_docs[4] -# # assert subheader123_doc.meta["header"] == "Subheader 1.2.3" -# assert subheader123_doc.meta["split_id"] == 4 -# assert subheader123_doc.meta["page_number"] == 1 -# # assert subheader123_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] -# assert subheader123_doc.content == "### Subheader 1.2.3\n\nContent under header 1.2.3.\n" - -# # Sanity check: reconstruct original text -# reconstructed_doc = "".join([doc.content for doc in split_docs]) -# assert reconstructed_doc == sample_text +def test_split_without_headers(sample_text): + splitter = MarkdownHeaderSplitter(keep_headers=False) + docs = [Document(content=sample_text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + + # Should split into all headers with content + headers = [doc.meta["header"] for doc in split_docs] + assert "Header 1" in headers + assert "Subheader 1.1.1" in headers + assert "Subheader 1.2.1" in headers + assert "Subheader 1.2.2" in headers + assert "Subheader 1.2.3" in headers + + # Check that content is present and correct + # Test first split + header1_doc = split_docs[0] + assert header1_doc.meta["header"] == "Header 1" + assert header1_doc.meta["split_id"] == 0 + assert header1_doc.meta["page_number"] == 1 + assert header1_doc.meta["parent_headers"] == [] + assert header1_doc.content == "Content under header 1.\n" + + # Test second split + subheader111_doc = split_docs[1] + assert subheader111_doc.meta["header"] == "Subheader 1.1.1" + assert subheader111_doc.meta["split_id"] == 1 + assert 
subheader111_doc.meta["page_number"] == 1 + assert subheader111_doc.meta["parent_headers"] == ["Header 1", "Header 1.1"] + assert subheader111_doc.content == "Content under sub-header 1.1.1\n" + + # Test third split + subheader121_doc = split_docs[2] + assert subheader121_doc.meta["header"] == "Subheader 1.2.1" + assert subheader121_doc.meta["split_id"] == 2 + assert subheader121_doc.meta["page_number"] == 1 + assert subheader121_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] + assert subheader121_doc.content == "Content under header 1.2.1.\n" + + # Test fourth split + subheader122_doc = split_docs[3] + assert subheader122_doc.meta["header"] == "Subheader 1.2.2" + assert subheader122_doc.meta["split_id"] == 3 + assert subheader122_doc.meta["page_number"] == 1 + assert subheader122_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] + assert subheader122_doc.content == "Content under header 1.2.2.\n" + + # Test fifth split + subheader123_doc = split_docs[4] + assert subheader123_doc.meta["header"] == "Subheader 1.2.3" + assert subheader123_doc.meta["split_id"] == 4 + assert subheader123_doc.meta["page_number"] == 1 + assert subheader123_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] + assert subheader123_doc.content == "Content under header 1.2.3." 
def test_split_parentheaders(sample_text): From f3d77990549bc6ace3f4d75d3411fca0d54e4bbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 11:57:53 +0100 Subject: [PATCH 80/85] remove strip() --- .../preprocessors/markdown_header_splitter.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index f28437dcc0..f053b19667 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -101,7 +101,7 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: for i, match in enumerate(matches): # extract header info header_prefix = match.group(1) - header_text = match.group(2).strip() + header_text = match.group(2) level = len(header_prefix) # get content @@ -109,22 +109,20 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: end = matches[i + 1].start() if i + 1 < len(matches) else len(text) content = text[start:end] if not self.keep_headers and content.startswith("\n"): - content = content[1:] + content = content[1:] # remove leading newline if headers not kept # update header stack to track nesting header_stack[level - 1] = header_text for j in range(level, 6): header_stack[j] = None - # prepare header_line if keep_headers - header_line = f"{header_prefix} {header_text}" - # skip splits w/o content - if not content.strip(): + if not content.strip(): # this strip is needed to avoid counting whitespace as content # add as parent for subsequent headers active_parents = [h for h in header_stack[: level - 1] if h is not None] active_parents.append(header_text) if self.keep_headers: + header_line = f"{header_prefix} {header_text}" pending_headers.append(header_line) continue @@ -136,6 +134,7 @@ def _split_text_by_markdown_headers(self, text: str, 
doc_id: str) -> list[dict]: ) if self.keep_headers: + header_line = f"{header_prefix} {header_text}" # add pending & current header to content chunk_content = "" if pending_headers: From f842fdb6650c87eb5ec57b6c36c0afedaa081843 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 12:20:05 +0100 Subject: [PATCH 81/85] simplify doc handling --- .../preprocessors/markdown_header_splitter.py | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index f053b19667..5bb2047838 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -183,10 +183,6 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document if header_match: content_for_splitting = doc.content[header_match.end() :] - if not content_for_splitting or not content_for_splitting.strip(): # skip empty content - result_docs.append(doc) - continue - # track page from meta current_page = doc.meta.get("page_number", 1) @@ -303,7 +299,7 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: if not isinstance(doc.content, str): raise ValueError("MarkdownHeaderSplitter only works with text documents (str content).") - processed_documents = [] + final_docs = [] for doc in documents: # handle empty documents if not doc.content or not doc.content.strip(): @@ -311,22 +307,23 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id) continue # keep empty documents - processed_documents.append(doc) + final_docs.append(doc) logger.warning( "Document ID {doc_id} has an empty content. 
Keeping this document as per configuration.", doc_id=doc.id, ) continue - processed_documents.append(doc) + # split this document by headers + header_split_docs = self._split_documents_by_markdown_headers([doc]) - if not processed_documents: - return {"documents": []} - - header_split_docs = self._split_documents_by_markdown_headers(processed_documents) + # apply secondary splitting if configured + if self.secondary_split: + doc_splits = self._apply_secondary_splitting(header_split_docs) + else: + doc_splits = header_split_docs - # secondary splitting if configured - final_docs = self._apply_secondary_splitting(header_split_docs) if self.secondary_split else header_split_docs + final_docs.extend(doc_splits) # assign split_id to all output documents for idx, doc in enumerate(final_docs): From c7fc2e45d25949c8cb6a4bd12d839447be26f6f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 13:00:56 +0100 Subject: [PATCH 82/85] fix split id assignment --- .../preprocessors/markdown_header_splitter.py | 23 ++++++++----- .../test_markdown_header_splitter.py | 33 +++++++++++++------ 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 5bb2047838..e7dca1b68e 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -169,6 +169,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document Ensures page counting is maintained across splits. 
""" result_docs = [] + current_split_id = 0 # track split_id across all secondary splits from the same parent for doc in documents: if doc.content is None: @@ -186,8 +187,11 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # track page from meta current_page = doc.meta.get("page_number", 1) + # create a clean meta dict without split_id for secondary splitting + clean_meta = {k: v for k, v in doc.meta.items() if k != "split_id"} + secondary_splits = self.secondary_splitter.run( - documents=[Document(content=content_for_splitting, meta=doc.meta)] + documents=[Document(content=content_for_splitting, meta=clean_meta)] )["documents"] # split processing @@ -196,8 +200,13 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document if i > 0 and secondary_splits[i - 1].content: current_page = self._update_page_number_with_breaks(secondary_splits[i - 1].content, current_page) - # set page number to meta + # set page number and split_id to meta split.meta["page_number"] = current_page + split.meta["split_id"] = current_split_id + # ensure source_id is preserved from the original document + if "source_id" in doc.meta: + split.meta["source_id"] = doc.meta["source_id"] + current_split_id += 1 # preserve header metadata if we're not keeping headers in content if not self.keep_headers: @@ -255,11 +264,11 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis current_page=current_page, total_pages=total_pages, ) - for split in splits: + for split_idx, split in enumerate(splits): meta = {} if doc.meta: meta = doc.meta.copy() - meta.update({"source_id": doc.id, "page_number": current_page}) + meta.update({"source_id": doc.id, "page_number": current_page, "split_id": split_idx}) if split.get("meta"): meta.update(split["meta"]) current_page = self._update_page_number_with_breaks(split["content"], current_page) @@ -284,7 +293,7 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: 
- `documents`: List of documents with the split texts. Each document includes: - A metadata field `source_id` to track the original document. - A metadata field `page_number` to track the original page number. - - A metadata field `split_id` to uniquely identify each split chunk. + - A metadata field `split_id` to identify the split chunk index within its parent document. - All other metadata copied from the original document. """ # validate input documents @@ -325,8 +334,4 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: final_docs.extend(doc_splits) - # assign split_id to all output documents - for idx, doc in enumerate(final_docs): - doc.meta["split_id"] = idx - return {"documents": final_docs} diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index fb51c3e8de..e37f06484e 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +from collections import defaultdict from unittest.mock import ANY import pytest @@ -181,10 +182,14 @@ def test_split_multiple_documents(sample_text): headers = {doc.meta["header"] for doc in split_docs} assert {"Another Header", "H1", "H2"}.issubset(headers) - # Verify that all documents have a split_id and they're sequential - split_ids = [doc.meta.get("split_id") for doc in split_docs] - assert all(split_id is not None for split_id in split_ids) - assert split_ids == list(range(len(split_ids))) + # Verify that split_ids are per-parent-document + splits_by_source = defaultdict(list) + for doc in split_docs: + splits_by_source[doc.meta["source_id"]].append(doc.meta["split_id"]) + + # Each parent document should have split_ids starting from 0 + for source_id, split_ids in splits_by_source.items(): + assert split_ids == list(range(len(split_ids))), f"Split IDs for {source_id} should 
be sequential from 0" def test_split_only_headers(): @@ -268,7 +273,7 @@ def test_empty_content_handling(): def test_split_id_sequentiality_primary_and_secondary(sample_text): - # Test primary splitting + # Test primary splitting with single document splitter = MarkdownHeaderSplitter() docs = [Document(content=sample_text)] result = splitter.run(documents=docs) @@ -277,11 +282,11 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text): # Test number of documents assert len(split_docs) == 5 - # Check that split_ids are sequential + # Check that split_ids are sequential from 0 for this single parent document split_ids = [doc.meta["split_id"] for doc in split_docs] assert split_ids == list(range(len(split_ids))) - # Test secondary splitting + # Test secondary splitting with single document splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3) docs = [Document(content=sample_text)] result = splitter.run(documents=docs) @@ -290,10 +295,12 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text): # Test number of documents assert len(split_docs) == 12 + # Check that split_ids are sequential from 0 for this single parent document split_ids = [doc.meta["split_id"] for doc in split_docs] assert split_ids == list(range(len(split_ids))) - # Test with multiple input documents + # Test with multiple input documents - each should have its own split_id sequence + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3) # Use fresh instance docs = [Document(content=sample_text), Document(content="# Another Header\nSome more content here.")] result = splitter.run(documents=docs) split_docs = result["documents"] @@ -301,8 +308,14 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text): # Test number of documents assert len(split_docs) == 14 - split_ids = [doc.meta["split_id"] for doc in split_docs] - assert split_ids == list(range(len(split_ids))) + # Verify split_ids are per-parent-document + 
splits_by_source = defaultdict(list) + for doc in split_docs: + splits_by_source[doc.meta["source_id"]].append(doc.meta["split_id"]) + + # Each parent document should have split_ids starting from 0 + for source_id, split_ids in splits_by_source.items(): + assert split_ids == list(range(len(split_ids))), f"Split IDs for {source_id} should be sequential from 0" def test_secondary_split_with_overlap(): From 64ff6fb1927a8f46c27ab687538b533c660f5c9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 13:09:47 +0100 Subject: [PATCH 83/85] test cleanup --- .../preprocessors/test_markdown_header_splitter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index e37f06484e..82b55963ac 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -188,8 +188,8 @@ def test_split_multiple_documents(sample_text): splits_by_source[doc.meta["source_id"]].append(doc.meta["split_id"]) # Each parent document should have split_ids starting from 0 - for source_id, split_ids in splits_by_source.items(): - assert split_ids == list(range(len(split_ids))), f"Split IDs for {source_id} should be sequential from 0" + for split_ids in splits_by_source.values(): + assert split_ids == list(range(len(split_ids))) def test_split_only_headers(): @@ -314,8 +314,8 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text): splits_by_source[doc.meta["source_id"]].append(doc.meta["split_id"]) # Each parent document should have split_ids starting from 0 - for source_id, split_ids in splits_by_source.items(): - assert split_ids == list(range(len(split_ids))), f"Split IDs for {source_id} should be sequential from 0" + for split_ids in splits_by_source.values(): + assert split_ids == list(range(len(split_ids))) def 
test_secondary_split_with_overlap(): From eb3e568bb73565c30b629d47c2ad3f0f9a596e45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 13:29:31 +0100 Subject: [PATCH 84/85] test splits more explicitly --- .../test_markdown_header_splitter.py | 37 +++++++++---------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 82b55963ac..440135bc14 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -361,28 +361,25 @@ def test_secondary_split_with_threshold(): docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - for i, doc in enumerate(split_docs): - words = doc.content.split() - if i == 0: - # First chunk includes header-hashtag plus split_length words - assert words[:2] == ["#", "Header"] - assert len(words) == 4 - elif i < len(split_docs) - 1: - # Subsequent chunks should have split_length words - assert len(words) == 3 - else: - # Last chunk should have at least split_threshold words - assert len(words) >= 2 + + # Explicitly test each split + assert len(split_docs) == 4 + assert len(split_docs[0].content.split()) == 4 # "# Header" + 2 words + assert len(split_docs[1].content.split()) == 3 # 3 words (split_length) + assert len(split_docs[2].content.split()) == 3 # 3 words (split_length) + assert len(split_docs[3].content.split()) == 2 # 2 words (meets threshold) # keep_headers=False splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3, split_threshold=2, keep_headers=False) docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - for doc in split_docs[:-1]: - assert len(doc.content.split()) == 3 - # The last chunk should have at least 2 words (threshold) - assert 
len(split_docs[-1].content.split()) >= 2 + + # Explicitly test each split + assert len(split_docs) == 3 + assert len(split_docs[0].content.split()) == 3 # 3 words + assert len(split_docs[1].content.split()) == 3 # 3 words + assert len(split_docs[2].content.split()) == 4 # 4 words (due to threshold, not possible to split 3-1) def test_page_break_handling_in_secondary_split(): @@ -391,10 +388,10 @@ def test_page_break_handling_in_secondary_split(): docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - page_numbers = [doc.meta.get("page_number") for doc in split_docs] - # Should start at 1 and increment at each \f - assert page_numbers[0] == 1 - assert max(page_numbers) == 3 + # Explicitly check the page number of each split + expected_page_numbers = [1, 1, 1, 2, 3] + actual_page_numbers = [doc.meta.get("page_number") for doc in split_docs] + assert actual_page_numbers == expected_page_numbers def test_page_break_handling_with_multiple_headers(): From ad155cc8b2036e039a728d4fc919ff9ec5f61f78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 15:06:43 +0100 Subject: [PATCH 85/85] cleanup tests minor commenting --- .../preprocessors/markdown_header_splitter.py | 2 +- .../test_markdown_header_splitter.py | 22 +++++-------------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index e7dca1b68e..02ccf8c99c 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -311,7 +311,7 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: final_docs = [] for doc in documents: # handle empty documents - if not doc.content or not doc.content.strip(): + if not doc.content or not doc.content.strip(): # avoid counting whitespace as content if 
self.skip_empty_documents: logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id) continue diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 440135bc14..3efa22fd89 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -40,45 +40,35 @@ def test_basic_split(sample_text): # Check that content is present and correct # Test first split header1_doc = split_docs[0] - # assert header1_doc.meta["header"] == "Header 1" assert header1_doc.meta["split_id"] == 0 assert header1_doc.meta["page_number"] == 1 - # assert header1_doc.meta["parent_headers"] == [] assert header1_doc.content == "# Header 1\nContent under header 1.\n" # Test second split subheader111_doc = split_docs[1] - # assert subheader111_doc.meta["header"] == "Subheader 1.1.1" assert subheader111_doc.meta["split_id"] == 1 assert subheader111_doc.meta["page_number"] == 1 - # assert subheader111_doc.meta["parent_headers"] == ["Header 1", "Header 1.1"] assert subheader111_doc.content == "## Header 1.1\n### Subheader 1.1.1\nContent under sub-header 1.1.1\n" # Test third split subheader121_doc = split_docs[2] - # assert subheader121_doc.meta["header"] == "Subheader 1.2.1" assert subheader121_doc.meta["split_id"] == 2 assert subheader121_doc.meta["page_number"] == 1 - # assert subheader121_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] assert subheader121_doc.content == "## Header 1.2\n### Subheader 1.2.1\nContent under header 1.2.1.\n" # Test fourth split subheader122_doc = split_docs[3] - # assert subheader122_doc.meta["header"] == "Subheader 1.2.2" assert subheader122_doc.meta["split_id"] == 3 assert subheader122_doc.meta["page_number"] == 1 - # assert subheader122_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] assert subheader122_doc.content == "### Subheader 
1.2.2\nContent under header 1.2.2.\n" # Test fifth split subheader123_doc = split_docs[4] - # assert subheader123_doc.meta["header"] == "Subheader 1.2.3" assert subheader123_doc.meta["split_id"] == 4 assert subheader123_doc.meta["page_number"] == 1 - # assert subheader123_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] assert subheader123_doc.content == "### Subheader 1.2.3\nContent under header 1.2.3." - # Sanity check: reconstruct original text + # Reconstruct original text reconstructed_doc = "".join([doc.content for doc in split_docs]) assert reconstructed_doc == sample_text @@ -299,7 +289,7 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text): split_ids = [doc.meta["split_id"] for doc in split_docs] assert split_ids == list(range(len(split_ids))) - # Test with multiple input documents - each should have its own split_id sequence + # Test with multiple input documents; each should have its own split_id sequence splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3) # Use fresh instance docs = [Document(content=sample_text), Document(content="# Another Header\nSome more content here.")] result = splitter.run(documents=docs) @@ -383,13 +373,13 @@ def test_secondary_split_with_threshold(): def test_page_break_handling_in_secondary_split(): - text = "# Header\nFirst page\fSecond page\fThird page" + text = "# Header\nFirst page\f Second page\f Third page" splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=1) docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - # Explicitly check the page number of each split - expected_page_numbers = [1, 1, 1, 2, 3] + + expected_page_numbers = [1, 1, 1, 2, 2, 3, 3] actual_page_numbers = [doc.meta.get("page_number") for doc in split_docs] assert actual_page_numbers == expected_page_numbers @@ -430,6 +420,6 @@ def test_page_break_handling_with_multiple_headers(): assert split_docs[6].content == "page" assert 
split_docs[6].meta == {"source_id": ANY, "page_number": 3, "split_id": 6, "split_idx_start": 40} - # Check reconstruction + # Reconstruct original text reconstructed_text = "".join(doc.content for doc in split_docs) assert reconstructed_text == text