From 45e7c1270eefe740051175b3e96ed5718d1e369a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 11 Jul 2025 16:17:18 +0200 Subject: [PATCH 01/85] implement md-header-splitter and add tests --- .../preprocessors/markdown_header_splitter.py | 298 ++++++++++++++++++ .../test_markdown_header_splitter.py | 87 +++++ 2 files changed, 385 insertions(+) create mode 100644 haystack/components/preprocessors/markdown_header_splitter.py create mode 100644 test/components/preprocessors/test_markdown_header_splitter.py diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py new file mode 100644 index 0000000000..91247237e9 --- /dev/null +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -0,0 +1,298 @@ +import logging +import re +from typing import Any, Dict, List, Literal, Optional + +from haystack import Document, component, default_from_dict, default_to_dict +from haystack.components.preprocessors import DocumentSplitter + +logger = logging.getLogger(__name__) + + +class CustomDocumentSplitter(DocumentSplitter): + """ + Custom DocumentSplitter that supports splitting functions returning dicts with 'content' and 'meta'. 
+ """ + + def __init__(self, *args, page_break_character="\\f", **kwargs): + super().__init__(*args, **kwargs) + self.page_break_character = page_break_character + + def _flatten_dict(self, d: Dict, prefix: str = "", target_dict: Optional[Dict] = None) -> Dict: + """Helper method to flatten a nested dictionary.""" + if target_dict is None: + target_dict = {} + + for key, value in d.items(): + new_key = f"{prefix}{key}" if prefix else key + + if isinstance(value, dict): + self._flatten_dict(value, f"{new_key}_", target_dict) + else: + target_dict[new_key] = value + + return target_dict + + def _process_split_content(self, split_content: str, split_index: int) -> int: + """Process the content of a split and return the number of page breaks.""" + if not isinstance(split_content, str): + return 0 + + page_breaks = split_content.count(self.page_break_character) + if page_breaks > 0: + logger.debug(f"Found {page_breaks} page breaks in split {split_index}") + return page_breaks + + def _split_by_function(self, doc: Document) -> List[Document]: + """Split document using a custom function that returns dictionaries with 'content' and 'meta'.""" + logger.debug(f"Splitting document with id={doc.id}") + splits = self.splitting_function(doc.content) + docs = [] + + # calculate total pages and set current page + total_pages = doc.meta.get("total_pages", 0) or doc.content.count(self.page_break_character) + 1 + current_page = doc.meta.get("page_number", 1) + logger.debug(f"Starting page number: {current_page}, Total pages: {total_pages}") + + # get meta for each split + for i, split in enumerate(splits): + meta = {} + if doc.meta: + meta = self._flatten_dict(doc.meta) + + # add standard metadata + meta.update({"source_id": doc.id, "split_id": i, "total_pages": total_pages, "page_number": current_page}) + + # get page number based on page breaks + page_breaks = self._process_split_content(split["content"], i) + current_page += page_breaks + + # add split-specific metadata + if 
split.get("meta"): + meta.update(self._flatten_dict(split.get("meta"))) + + docs.append(Document(content=split["content"], meta=meta)) + + logger.debug(f"Split into {len(docs)} documents for id={doc.id}, final page: {current_page}") + return docs + + +@component +class MarkdownHeaderSplitter: + """ + A custom component that splits documents at markdown headers with optional secondary splitting. + + :param enforce_first_header: If True, ensures the first header is always included in the parent headers. + This is useful for docling outputs where header levels are uniformly detected and the first header + is often overwritten. Defaults to False. + :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). + :param secondary_split: Optional secondary split condition after header splitting. + Options are "none", "word", "passage", "period", "line". Defaults to "none". + :param split_length: The maximum number of units in each split when using secondary splitting. Defaults to 200. + :param split_overlap: The number of overlapping units for each split when using secondary splitting. Defaults to 0. + :param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0. 
+ """ + + def __init__( + self, + enforce_first_header: bool = False, + page_break_character: str = "\\f", + secondary_split: Literal["none", "word", "passage", "period", "line"] = "none", + split_length: int = 200, + split_overlap: int = 0, + split_threshold: int = 0, + ): + self.enforce_first_header = enforce_first_header + self.page_break_character = page_break_character + self.secondary_split = secondary_split + self.split_length = split_length + self.split_overlap = split_overlap + self.split_threshold = split_threshold + + def _split_by_markdown_headers(self, text: str, enforce_first_header: Optional[bool] = None) -> List[Dict]: + """Split text by markdown headers and create chunks with appropriate metadata.""" + logger.debug("Splitting text by markdown headers") + + # find headers + pattern = r"(?m)^(#{1,6}) (.+)$" + matches = list(re.finditer(pattern, text)) + + # return unsplit if no headers found + if not matches: + logger.info("No headers found in document; returning full document as single chunk.") + return [{"content": text, "meta": {"header": None, "parentheaders": []}}] + + # process headers and build chunks + chunks = [] + header_stack = [None] * 6 + active_parents = [] + first_header = matches[0].group(2).strip() + + for i, match in enumerate(matches): + # Extract header info + header_prefix = match.group(1) + header_text = match.group(2).strip() + level = len(header_prefix) + + # get content + start = match.end() + end = matches[i + 1].start() if i + 1 < len(matches) else len(text) + content = text[start:end].strip() + + # update header stack to track nesting + header_stack[level - 1] = header_text + for j in range(level, 6): + header_stack[j] = None + + # skip splits w/o content + if not content: + # Add as parent for subsequent headers + active_parents = [h for h in header_stack[: level - 1] if h] + active_parents.append(header_text) + continue + + # get parent headers + parentheaders = list(active_parents) + + # enforce first header if needed + 
if enforce_first_header and first_header and (not parentheaders or parentheaders[0] != first_header): + parentheaders = [first_header] + [h for h in parentheaders if h != first_header] + + logger.debug(f"Creating chunk for header '{header_text}' at level {level}") + + chunks.append( + { + "content": f"{header_prefix} {header_text}\n{content}", + "meta": {"header": header_text, "parentheaders": parentheaders}, + } + ) + + # reset active parents + active_parents = [h for h in header_stack[: level - 1] if h] + + logger.info(f"Split into {len(chunks)} chunks by markdown headers.") + return chunks + + def _apply_secondary_splitting(self, documents: List[Document]) -> List[Document]: + """ + Apply secondary splitting while preserving header metadata and structure. + + Ensures page counting is maintained across splits. + """ + if self.secondary_split == "none": + return documents + + logger.info(f"Applying secondary splitting by {self.secondary_split}") + result_docs = [] + + for doc in documents: + # extract header information + header_match = re.search(r"(#{1,6}) (.+)(?:\n|$)", doc.content) + if header_match: + header_prefix = header_match.group(0) + "\n" + content_for_splitting = doc.content[header_match.end() :] + else: + header_prefix = "" + content_for_splitting = doc.content + + if not content_for_splitting.strip(): # skip empty content + result_docs.append(doc) + continue + + # track page from meta + current_page = doc.meta.get("page_number", 1) + + secondary_splitter = DocumentSplitter( + split_by=self.secondary_split, + split_length=self.split_length, + split_overlap=self.split_overlap, + split_threshold=self.split_threshold, + ) + + # apply secondary splitting + temp_doc = Document(content=content_for_splitting, meta=doc.meta) + secondary_splits = secondary_splitter.run(documents=[temp_doc])["documents"] + parent_headers = doc.meta.get("parentheaders", []) + first_header = parent_headers[0] if parent_headers else None + accumulated_page_breaks = 0 # track page 
breaks + + # split processing + for i, split in enumerate(secondary_splits): + # calculate page number for this split + if i > 0: # page break counting + prev_content = secondary_splits[i - 1].content + page_breaks = prev_content.count(self.page_break_character) + accumulated_page_breaks += page_breaks + + # set page number to meta + split.meta["page_number"] = current_page + accumulated_page_breaks + + if header_prefix: # add header prefix to content + split.content = header_prefix + split.content + + # preserve header metadata + for key in ["header", "parentheaders"]: + if key in doc.meta: + split.meta[key] = doc.meta[key] + + # enforce first header if needed + if self.enforce_first_header and first_header: + parentheaders = split.meta.get("parentheaders", []) + if not parentheaders: + split.meta["parentheaders"] = [first_header] + elif parentheaders[0] != first_header: + split.meta["parentheaders"] = [first_header] + [h for h in parentheaders if h != first_header] + # preserve primary split ID + if "split_id" in doc.meta: + split.meta["header_split_id"] = doc.meta["split_id"] + + result_docs.append(split) + + logger.info(f"Secondary splitting complete. Final count: {len(result_docs)} documents.") + return result_docs + + @component.output_types(documents=List[Document]) + def run(self, documents: List[Document], enforce_first_header: Optional[bool] = None) -> Dict[str, List[Document]]: + """ + Run the markdown header splitter with optional secondary splitting. + + :param documents: List of documents to split + :param enforce_first_header: If True, ensures the first header is included in all parentheaders. + If None, uses the value from initialization. 
+ """ + logger.info(f"Processing {len(documents)} documents with enforce_first_header={enforce_first_header}") + + # split by markdown headers + header_splitter = CustomDocumentSplitter( + split_by="function", + splitting_function=lambda text: self._split_by_markdown_headers(text, enforce_first_header), + page_break_character=self.page_break_character, + ) + + # get splits + header_split_docs = header_splitter.run(documents=documents)["documents"] + logger.info(f"Header splitting produced {len(header_split_docs)} documents") + + # apply secondary splitting if requested + if self.secondary_split != "none": + final_docs = self._apply_secondary_splitting(header_split_docs) + else: + final_docs = header_split_docs + + return {"documents": final_docs} + + def to_dict(self) -> Dict[str, Any]: + """Serialize component to dictionary.""" + return default_to_dict( + self, + enforce_first_header=self.enforce_first_header, + page_break_character=self.page_break_character, + secondary_split=self.secondary_split, + split_length=self.split_length, + split_overlap=self.split_overlap, + split_threshold=self.split_threshold, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "MarkdownHeaderSplitter": + """Deserialize component from dictionary.""" + return default_from_dict(cls, data) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py new file mode 100644 index 0000000000..7ae3aeb039 --- /dev/null +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -0,0 +1,87 @@ +import pytest +from haystack import Document + +from deepset_cloud_custom_nodes.splitters.markdown_header_splitter import ( + MarkdownHeaderSplitter, +) + + +@pytest.fixture +def sample_text(): + return ( + "# Header 1\n" + "Content under header 1.\n" + "## Header 1.1\n" + "### Subheader 1.1.1\n" + "Content under sub-header 1.1.1\n" + "## Header 1.2\n" + "### Subheader 1.2.1\n" + "Content under header 
1.2.1.\n" + "### Subheader 1.2.2\n" + "Content under header 1.2.2.\n" + "### Subheader 1.2.3\n" + "Content under header 1.2.3." + ) + + +def test_basic_split(sample_text): + splitter = MarkdownHeaderSplitter() + docs = [Document(content=sample_text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + + # Should split into all headers with content + headers = [doc.meta["header"] for doc in split_docs] + assert "Header 1" in headers + assert "Subheader 1.1.1" in headers + assert "Subheader 1.2.1" in headers + assert "Subheader 1.2.2" in headers + assert "Subheader 1.2.3" in headers + + # Check that content is present and correct + for doc in split_docs: + assert doc.content.startswith("#") or doc.content.startswith("##") or doc.content.startswith("###") + assert doc.meta.get("header") is not None + + +def test_parentheaders(sample_text): + splitter = MarkdownHeaderSplitter() + docs = [Document(content=sample_text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + + # Find a subheader and check parentheaders + subheader_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.2") + assert "Header 1" in subheader_doc.meta["parentheaders"] + assert "Header 1.2" in subheader_doc.meta["parentheaders"] + + +def test_enforce_first_header(sample_text): + splitter = MarkdownHeaderSplitter(enforce_first_header=True) + docs = [Document(content=sample_text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + + # All parentheaders should start with the first header + first_header = "Header 1" + for doc in split_docs: + if doc.meta["parentheaders"]: + assert doc.meta["parentheaders"][0] == first_header + + +def test_no_headers(): + splitter = MarkdownHeaderSplitter() + docs = [Document(content="Just some text without headers.")] + result = splitter.run(documents=docs) + assert len(result["documents"]) == 1 + + +def test_multiple_documents(sample_text): + splitter = 
MarkdownHeaderSplitter() + docs = [ + Document(content=sample_text), + Document(content="# Another Header\nSome content."), + ] + result = splitter.run(documents=docs) + split_docs = result["documents"] + assert any(doc.meta["header"] == "Another Header" for doc in split_docs) From edfd644a6fece9714b6ec67248920707ed52fda9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 29 Jul 2025 15:49:45 +0200 Subject: [PATCH 02/85] rework md-header splitter to rewrite md-header levels --- .../preprocessors/markdown_header_splitter.py | 161 +++++++++++++++--- .../test_markdown_header_splitter.py | 11 +- 2 files changed, 137 insertions(+), 35 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 91247237e9..c51ff606cf 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -81,9 +81,8 @@ class MarkdownHeaderSplitter: """ A custom component that splits documents at markdown headers with optional secondary splitting. - :param enforce_first_header: If True, ensures the first header is always included in the parent headers. - This is useful for docling outputs where header levels are uniformly detected and the first header - is often overwritten. Defaults to False. + :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure. + Useful for documents where all headers use the same level. Defaults to False. :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). :param secondary_split: Optional secondary split condition after header splitting. Options are "none", "word", "passage", "period", "line". Defaults to "none". 
@@ -94,21 +93,94 @@ class MarkdownHeaderSplitter: def __init__( self, - enforce_first_header: bool = False, + infer_header_levels: bool = False, page_break_character: str = "\\f", secondary_split: Literal["none", "word", "passage", "period", "line"] = "none", split_length: int = 200, split_overlap: int = 0, split_threshold: int = 0, ): - self.enforce_first_header = enforce_first_header + self.infer_header_levels = infer_header_levels self.page_break_character = page_break_character self.secondary_split = secondary_split self.split_length = split_length self.split_overlap = split_overlap self.split_threshold = split_threshold - def _split_by_markdown_headers(self, text: str, enforce_first_header: Optional[bool] = None) -> List[Dict]: + def _infer_and_rewrite_header_levels(self, text: str) -> str: + """ + Infer and rewrite header levels in the markdown text. + + This function analyzes the document structure to infer proper header levels: + - First header is always level 1 + - If there's content between headers, the next header stays at the same level + - If there's no content between headers, the next header goes one level deeper + - Header levels never exceed 6 (the maximum in markdown) + + This is useful for documents where all headers are at the same level, such as + output from document conversion tools like docling. 
+ """ + logger.debug("Inferring and rewriting header levels") + + # find headers + pattern = r"(?m)^(#{1,6}) (.+)$" + matches = list(re.finditer(pattern, text)) + + if not matches: + logger.info("No headers found in document; skipping header level inference.") + return text + + modified_text = text + offset = 0 # track offset due to length changes in headers + + # track header structure + current_level = 1 + header_stack = [1] # always start with level 1 + + for i, match in enumerate(matches): + original_header = match.group(0) + header_text = match.group(2).strip() + + # check if there's content between this header and the previous one + has_content = False + if i > 0: + prev_end = matches[i - 1].end() + current_start = match.start() + content_between = text[prev_end:current_start].strip() + has_content = bool(content_between) + + # first header is always level 1 + if i == 0: + inferred_level = 1 + elif has_content: + # stay at the same level if there's content + inferred_level = current_level + else: + # go one level deeper if there's no content + inferred_level = min(current_level + 1, 6) + + # update tracking variables + current_level = inferred_level + header_stack = header_stack[:inferred_level] + while len(header_stack) < inferred_level: + header_stack.append(1) + + # new header with inferred level + new_prefix = "#" * inferred_level + new_header = f"{new_prefix} {header_text}" + + # replace old header + start_pos = match.start() + offset + end_pos = match.end() + offset + modified_text = modified_text[:start_pos] + new_header + modified_text[end_pos:] + + # update offset + offset += len(new_header) - len(original_header) + + logger.info(f"Rewrote {len(matches)} headers with inferred levels.") + return modified_text + + def _split_by_markdown_headers(self, text: str) -> List[Dict]: """Split text by markdown headers and create chunks with appropriate metadata.""" logger.debug("Splitting text by markdown headers") @@ -125,10 +197,9 @@ def 
_split_by_markdown_headers(self, text: str, enforce_first_header: Optional[b chunks = [] header_stack = [None] * 6 active_parents = [] - first_header = matches[0].group(2).strip() for i, match in enumerate(matches): - # Extract header info + # extract header info header_prefix = match.group(1) header_text = match.group(2).strip() level = len(header_prefix) @@ -153,10 +224,6 @@ def _split_by_markdown_headers(self, text: str, enforce_first_header: Optional[b # get parent headers parentheaders = list(active_parents) - # enforce first header if needed - if enforce_first_header and first_header and (not parentheaders or parentheaders[0] != first_header): - parentheaders = [first_header] + [h for h in parentheaders if h != first_header] - logger.debug(f"Creating chunk for header '{header_text}' at level {level}") chunks.append( @@ -211,8 +278,6 @@ def _apply_secondary_splitting(self, documents: List[Document]) -> List[Document # apply secondary splitting temp_doc = Document(content=content_for_splitting, meta=doc.meta) secondary_splits = secondary_splitter.run(documents=[temp_doc])["documents"] - parent_headers = doc.meta.get("parentheaders", []) - first_header = parent_headers[0] if parent_headers else None accumulated_page_breaks = 0 # track page breaks # split processing @@ -234,13 +299,6 @@ def _apply_secondary_splitting(self, documents: List[Document]) -> List[Document if key in doc.meta: split.meta[key] = doc.meta[key] - # enforce first header if needed - if self.enforce_first_header and first_header: - parentheaders = split.meta.get("parentheaders", []) - if not parentheaders: - split.meta["parentheaders"] = [first_header] - elif parentheaders[0] != first_header: - split.meta["parentheaders"] = [first_header] + [h for h in parentheaders if h != first_header] # preserve primary split ID if "split_id" in doc.meta: split.meta["header_split_id"] = doc.meta["split_id"] @@ -251,25 +309,34 @@ def _apply_secondary_splitting(self, documents: List[Document]) -> 
List[Document return result_docs @component.output_types(documents=List[Document]) - def run(self, documents: List[Document], enforce_first_header: Optional[bool] = None) -> Dict[str, List[Document]]: + def run(self, documents: List[Document], infer_header_levels: Optional[bool] = None) -> Dict[str, List[Document]]: """ Run the markdown header splitter with optional secondary splitting. :param documents: List of documents to split - :param enforce_first_header: If True, ensures the first header is included in all parentheaders. + :param infer_header_levels: If True, attempts to infer and rewrite header levels before splitting. If None, uses the value from initialization. """ - logger.info(f"Processing {len(documents)} documents with enforce_first_header={enforce_first_header}") + infer_header_levels = infer_header_levels if infer_header_levels is not None else self.infer_header_levels + + # process documents - preprocess if told to + processed_documents = [] + for doc in documents: + if infer_header_levels: + content = self._infer_and_rewrite_header_levels(doc.content) + processed_documents.append(Document(content=content, meta=doc.meta, id=doc.id)) + else: + processed_documents.append(doc) # split by markdown headers header_splitter = CustomDocumentSplitter( split_by="function", - splitting_function=lambda text: self._split_by_markdown_headers(text, enforce_first_header), + splitting_function=lambda text: self._split_by_markdown_headers(text), page_break_character=self.page_break_character, ) # get splits - header_split_docs = header_splitter.run(documents=documents)["documents"] + header_split_docs = header_splitter.run(documents=processed_documents)["documents"] logger.info(f"Header splitting produced {len(header_split_docs)} documents") # apply secondary splitting if requested @@ -284,7 +351,7 @@ def to_dict(self) -> Dict[str, Any]: """Serialize component to dictionary.""" return default_to_dict( self, - enforce_first_header=self.enforce_first_header, + 
infer_header_levels=self.infer_header_levels, page_break_character=self.page_break_character, secondary_split=self.secondary_split, split_length=self.split_length, @@ -296,3 +363,43 @@ def to_dict(self) -> Dict[str, Any]: def from_dict(cls, data: Dict[str, Any]) -> "MarkdownHeaderSplitter": """Deserialize component from dictionary.""" return default_from_dict(cls, data) + + +# TODO: move to proper test file once ready +if __name__ == "__main__": + print() + print("===== Example 1: Regular splitting =====") + splitter = MarkdownHeaderSplitter() + content = """# Header 1 +## Subheader 1.1 +Content under subheader 1.1. +## Subheader 1.2 +### Subheader 1.2.1 +Content under subheader 1.2.1.""" + print("Original content:") + print(content) + example_doc = Document(content=content) + result = splitter.run(documents=[example_doc]) + for doc in result["documents"]: + print("\n---Document---") + print(doc.content) + print(doc.meta) + + print() + print("===== Example 2: Splitting with header inference =====") + splitter = MarkdownHeaderSplitter(infer_header_levels=True) + content = """## Header 1 +## Subheader 1.1 +Content under subheader 1.1. 
+## Subheader 1.2 +## Subheader 1.2.1 +Content under subheader 1.2.1.""" + print("Original content:") + print(content) + example_doc = Document(content=content) + result = splitter.run(documents=[example_doc]) + print("\nAfter header inference and splitting:") + for doc in result["documents"]: + print("\n---Document---") + print(doc.content) + print(doc.meta) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 7ae3aeb039..4207ea7b8c 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -1,9 +1,7 @@ import pytest -from haystack import Document -from deepset_cloud_custom_nodes.splitters.markdown_header_splitter import ( - MarkdownHeaderSplitter, -) +from haystack import Document +from haystack.components.preprocessors.markdown_header_splitter import MarkdownHeaderSplitter @pytest.fixture @@ -78,10 +76,7 @@ def test_no_headers(): def test_multiple_documents(sample_text): splitter = MarkdownHeaderSplitter() - docs = [ - Document(content=sample_text), - Document(content="# Another Header\nSome content."), - ] + docs = [Document(content=sample_text), Document(content="# Another Header\nSome content.")] result = splitter.run(documents=docs) split_docs = result["documents"] assert any(doc.meta["header"] == "Another Header" for doc in split_docs) From cd55f132729bc26a2b3402fe3794ff357cf29a3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 29 Jul 2025 15:53:54 +0200 Subject: [PATCH 03/85] remove deprecated test --- .../preprocessors/test_markdown_header_splitter.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 4207ea7b8c..89a551bd2a 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py 
+++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -54,19 +54,6 @@ def test_parentheaders(sample_text): assert "Header 1.2" in subheader_doc.meta["parentheaders"] -def test_enforce_first_header(sample_text): - splitter = MarkdownHeaderSplitter(enforce_first_header=True) - docs = [Document(content=sample_text)] - result = splitter.run(documents=docs) - split_docs = result["documents"] - - # All parentheaders should start with the first header - first_header = "Header 1" - for doc in split_docs: - if doc.meta["parentheaders"]: - assert doc.meta["parentheaders"][0] == first_header - - def test_no_headers(): splitter = MarkdownHeaderSplitter() docs = [Document(content="Just some text without headers.")] From dafe1bdefcf18e3a55cbd5201ea66aa2336026d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= <45487933+OGuggenbuehl@users.noreply.github.com> Date: Tue, 9 Sep 2025 14:32:09 +0200 Subject: [PATCH 04/85] Update haystack/components/preprocessors/markdown_header_splitter.py use haystack logging Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- haystack/components/preprocessors/markdown_header_splitter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index c51ff606cf..7104eea0f5 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -1,8 +1,7 @@ -import logging import re from typing import Any, Dict, List, Literal, Optional -from haystack import Document, component, default_from_dict, default_to_dict +from haystack import Document, component, default_from_dict, default_to_dict, logging from haystack.components.preprocessors import DocumentSplitter logger = logging.getLogger(__name__) From 6da2513017821b4c4cc173e54d2925d23dc544e5 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 14:35:26 +0200 Subject: [PATCH 05/85] use native types --- .../preprocessors/markdown_header_splitter.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 7104eea0f5..994c587f6f 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -1,5 +1,5 @@ import re -from typing import Any, Dict, List, Literal, Optional +from typing import Any, Literal, Optional from haystack import Document, component, default_from_dict, default_to_dict, logging from haystack.components.preprocessors import DocumentSplitter @@ -16,7 +16,7 @@ def __init__(self, *args, page_break_character="\\f", **kwargs): super().__init__(*args, **kwargs) self.page_break_character = page_break_character - def _flatten_dict(self, d: Dict, prefix: str = "", target_dict: Optional[Dict] = None) -> Dict: + def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = None) -> dict: """Helper method to flatten a nested dictionary.""" if target_dict is None: target_dict = {} @@ -41,7 +41,7 @@ def _process_split_content(self, split_content: str, split_index: int) -> int: logger.debug(f"Found {page_breaks} page breaks in split {split_index}") return page_breaks - def _split_by_function(self, doc: Document) -> List[Document]: + def _split_by_function(self, doc: Document) -> list[Document]: """Split document using a custom function that returns dictionaries with 'content' and 'meta'.""" logger.debug(f"Splitting document with id={doc.id}") splits = self.splitting_function(doc.content) @@ -179,7 +179,7 @@ def _infer_and_rewrite_header_levels(self, text: str) -> str: logger.info(f"Rewrote {len(matches)} headers with inferred levels.") return modified_text - def 
_split_by_markdown_headers(self, text: str) -> List[Dict]: + def _split_by_markdown_headers(self, text: str) -> list[dict]: """Split text by markdown headers and create chunks with appropriate metadata.""" logger.debug("Splitting text by markdown headers") @@ -238,7 +238,7 @@ def _split_by_markdown_headers(self, text: str) -> List[Dict]: logger.info(f"Split into {len(chunks)} chunks by markdown headers.") return chunks - def _apply_secondary_splitting(self, documents: List[Document]) -> List[Document]: + def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document]: """ Apply secondary splitting while preserving header metadata and structure. @@ -307,8 +307,8 @@ def _apply_secondary_splitting(self, documents: List[Document]) -> List[Document logger.info(f"Secondary splitting complete. Final count: {len(result_docs)} documents.") return result_docs - @component.output_types(documents=List[Document]) - def run(self, documents: List[Document], infer_header_levels: Optional[bool] = None) -> Dict[str, List[Document]]: + @component.output_types(documents=list[Document]) + def run(self, documents: list[Document], infer_header_levels: Optional[bool] = None) -> dict[str, list[Document]]: """ Run the markdown header splitter with optional secondary splitting. 
@@ -346,7 +346,7 @@ def run(self, documents: List[Document], infer_header_levels: Optional[bool] = N return {"documents": final_docs} - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> dict[str, Any]: """Serialize component to dictionary.""" return default_to_dict( self, @@ -359,7 +359,7 @@ def to_dict(self) -> Dict[str, Any]: ) @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "MarkdownHeaderSplitter": + def from_dict(cls, data: dict[str, Any]) -> "MarkdownHeaderSplitter": """Deserialize component from dictionary.""" return default_from_dict(cls, data) From 96e616c7bbb5eae1c474656163179555176caf78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 14:42:47 +0200 Subject: [PATCH 06/85] move to haystack logging --- .../preprocessors/markdown_header_splitter.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 994c587f6f..0aa92d5d5f 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -38,19 +38,27 @@ def _process_split_content(self, split_content: str, split_index: int) -> int: page_breaks = split_content.count(self.page_break_character) if page_breaks > 0: - logger.debug(f"Found {page_breaks} page breaks in split {split_index}") + logger.debug( + "Found {page_breaks} page breaks in split {split_index}", + page_breaks=page_breaks, + split_index=split_index, + ) return page_breaks def _split_by_function(self, doc: Document) -> list[Document]: """Split document using a custom function that returns dictionaries with 'content' and 'meta'.""" - logger.debug(f"Splitting document with id={doc.id}") + logger.debug("Splitting document with id={doc_id}", doc_id=doc.id) splits = self.splitting_function(doc.content) docs = [] # calculate total pages and set 
current page total_pages = doc.meta.get("total_pages", 0) or doc.content.count(self.page_break_character) + 1 current_page = doc.meta.get("page_number", 1) - logger.debug(f"Starting page number: {current_page}, Total pages: {total_pages}") + logger.debug( + "Starting page number: {current_page}, Total pages: {total_pages}", + current_page=current_page, + total_pages=total_pages, + ) # get meta for each split for i, split in enumerate(splits): @@ -71,7 +79,12 @@ def _split_by_function(self, doc: Document) -> list[Document]: docs.append(Document(content=split["content"], meta=meta)) - logger.debug(f"Split into {len(docs)} documents for id={doc.id}, final page: {current_page}") + logger.debug( + "Split into {num_docs} documents for id={doc_id}, final page: {current_page}", + num_docs=len(docs), + doc_id=doc.id, + current_page=current_page, + ) return docs @@ -176,7 +189,7 @@ def _infer_and_rewrite_header_levels(self, text: str) -> str: # update offset offset += len(new_header) - len(original_header) - logger.info(f"Rewrote {len(matches)} headers with inferred levels.") + logger.info("Rewrote {num_headers} headers with inferred levels.", num_headers=len(matches)) return modified_text def _split_by_markdown_headers(self, text: str) -> list[dict]: @@ -223,7 +236,9 @@ def _split_by_markdown_headers(self, text: str) -> list[dict]: # get parent headers parentheaders = list(active_parents) - logger.debug(f"Creating chunk for header '{header_text}' at level {level}") + logger.debug( + "Creating chunk for header '{header_text}' at level {level}", header_text=header_text, level=level + ) chunks.append( { @@ -235,7 +250,7 @@ def _split_by_markdown_headers(self, text: str) -> list[dict]: # reset active parents active_parents = [h for h in header_stack[: level - 1] if h] - logger.info(f"Split into {len(chunks)} chunks by markdown headers.") + logger.info("Split into {num_chunks} chunks by markdown headers.", num_chunks=len(chunks)) return chunks def 
_apply_secondary_splitting(self, documents: list[Document]) -> list[Document]: @@ -247,7 +262,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document if self.secondary_split == "none": return documents - logger.info(f"Applying secondary splitting by {self.secondary_split}") + logger.info("Applying secondary splitting by {secondary_split}", secondary_split=self.secondary_split) result_docs = [] for doc in documents: @@ -304,7 +319,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document result_docs.append(split) - logger.info(f"Secondary splitting complete. Final count: {len(result_docs)} documents.") + logger.info("Secondary splitting complete. Final count: {final_count} documents.", final_count=len(result_docs)) return result_docs @component.output_types(documents=list[Document]) @@ -336,7 +351,7 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N # get splits header_split_docs = header_splitter.run(documents=processed_documents)["documents"] - logger.info(f"Header splitting produced {len(header_split_docs)} documents") + logger.info("Header splitting produced {num_docs} documents", num_docs=len(header_split_docs)) # apply secondary splitting if requested if self.secondary_split != "none": From c3e397f991689ce05337bf7f14aa27b179dfb92e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 15:17:53 +0200 Subject: [PATCH 07/85] docstrings improvements --- .../preprocessors/markdown_header_splitter.py | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 0aa92d5d5f..3927346534 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -10,10 +10,15 @@ class 
CustomDocumentSplitter(DocumentSplitter): """ Custom DocumentSplitter that supports splitting functions returning dicts with 'content' and 'meta'. + + :param split_by: The method to split by. Must be "function" for this custom splitter. + :param splitting_function: The custom splitting function that takes a string and returns a list of dicts + with 'content' and optional 'meta' keys. + :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). """ - def __init__(self, *args, page_break_character="\\f", **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, page_break_character="\\f"): + super().__init__() self.page_break_character = page_break_character def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = None) -> dict: @@ -92,15 +97,6 @@ def _split_by_function(self, doc: Document) -> list[Document]: class MarkdownHeaderSplitter: """ A custom component that splits documents at markdown headers with optional secondary splitting. - - :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure. - Useful for documents where all headers use the same level. Defaults to False. - :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). - :param secondary_split: Optional secondary split condition after header splitting. - Options are "none", "word", "passage", "period", "line". Defaults to "none". - :param split_length: The maximum number of units in each split when using secondary splitting. Defaults to 200. - :param split_overlap: The number of overlapping units for each split when using secondary splitting. Defaults to 0. - :param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0. """ def __init__( @@ -112,6 +108,18 @@ def __init__( split_overlap: int = 0, split_threshold: int = 0, ): + """ + Initialize the MarkdownHeaderSplitter. 
+ + :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure. + Useful for documents where all headers use the same level. Defaults to False. + :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). + :param secondary_split: Optional secondary split condition after header splitting. + Options are "none", "word", "passage", "period", "line". Defaults to "none". + :param split_length: The maximum number of units in each split when using secondary splitting. Defaults to 200. + :param split_overlap: The number of overlapping units for each split when using secondary splitting. Defaults to 0. + :param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0. + """ self.infer_header_levels = infer_header_levels self.page_break_character = page_break_character self.secondary_split = secondary_split From 1ca9803d4a3c161fdd47183e61e5fb4c1874597e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= <45487933+OGuggenbuehl@users.noreply.github.com> Date: Tue, 9 Sep 2025 15:17:01 +0200 Subject: [PATCH 08/85] Update haystack/components/preprocessors/markdown_header_splitter.py remove temp toc Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- haystack/components/preprocessors/markdown_header_splitter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 3927346534..b973b623e9 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -298,8 +298,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document ) # apply secondary splitting - temp_doc = Document(content=content_for_splitting, meta=doc.meta) - secondary_splits = 
secondary_splitter.run(documents=[temp_doc])["documents"] + secondary_splits = secondary_splitter.run(documents=[Document(content=content_for_splitting, meta=doc.meta)])["documents"] accumulated_page_breaks = 0 # track page breaks # split processing From 6c496003f448bf576305343b58ca34b12505598a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 15:23:33 +0200 Subject: [PATCH 09/85] fix CustomDocumentSplitter arguments --- .../preprocessors/markdown_header_splitter.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index b973b623e9..516d391931 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -10,15 +10,17 @@ class CustomDocumentSplitter(DocumentSplitter): """ Custom DocumentSplitter that supports splitting functions returning dicts with 'content' and 'meta'. - - :param split_by: The method to split by. Must be "function" for this custom splitter. - :param splitting_function: The custom splitting function that takes a string and returns a list of dicts - with 'content' and optional 'meta' keys. - :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). """ - def __init__(self, page_break_character="\\f"): - super().__init__() + def __init__(self, split_by="function", splitting_function=None, page_break_character="\\f"): + """ + Initialize the CustomDocumentSplitter. + + :param split_by: The method to split by. Must be "function" for custom splitting functions. + :param splitting_function: A custom function that takes a string and returns a list of dicts with 'content' and optional 'meta'. + :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). 
+ """ + super().__init__(split_by=split_by, splitting_function=splitting_function) self.page_break_character = page_break_character def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = None) -> dict: @@ -298,7 +300,9 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document ) # apply secondary splitting - secondary_splits = secondary_splitter.run(documents=[Document(content=content_for_splitting, meta=doc.meta)])["documents"] + secondary_splits = secondary_splitter.run( + documents=[Document(content=content_for_splitting, meta=doc.meta)] + )["documents"] accumulated_page_breaks = 0 # track page breaks # split processing From 9c23202d1d5cd1bfeae5c5ac08e1f8e488b6a17f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 15:29:56 +0200 Subject: [PATCH 10/85] remove header prefix from content --- .../components/preprocessors/markdown_header_splitter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 516d391931..5658797a32 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -316,8 +316,9 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # set page number to meta split.meta["page_number"] = current_page + accumulated_page_breaks - if header_prefix: # add header prefix to content - split.content = header_prefix + split.content + ## deactivated: header prefix to content + # if header_prefix: + # split.content = header_prefix + split.content # preserve header metadata for key in ["header", "parentheaders"]: From b24d92d23210d56539e85a69d45a75622fd709c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 15:42:27 +0200 Subject: [PATCH 11/85] rework split_id assignment to 
avoid collisions --- .../preprocessors/markdown_header_splitter.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 5658797a32..7d09ce4591 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -279,10 +279,8 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # extract header information header_match = re.search(r"(#{1,6}) (.+)(?:\n|$)", doc.content) if header_match: - header_prefix = header_match.group(0) + "\n" content_for_splitting = doc.content[header_match.end() :] else: - header_prefix = "" content_for_splitting = doc.content if not content_for_splitting.strip(): # skip empty content @@ -316,21 +314,17 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # set page number to meta split.meta["page_number"] = current_page + accumulated_page_breaks - ## deactivated: header prefix to content - # if header_prefix: - # split.content = header_prefix + split.content - # preserve header metadata for key in ["header", "parentheaders"]: if key in doc.meta: split.meta[key] = doc.meta[key] - # preserve primary split ID - if "split_id" in doc.meta: - split.meta["header_split_id"] = doc.meta["split_id"] - result_docs.append(split) + # assign unique, sequential split_id to all final chunks + for idx, doc in enumerate(result_docs): + doc.meta["split_id"] = idx + logger.info("Secondary splitting complete. 
Final count: {final_count} documents.", final_count=len(result_docs)) return result_docs From 7b8150e69cda684dbad44ce38890e05c5d09a2ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 15:46:30 +0200 Subject: [PATCH 12/85] remove unneeded dese methods --- .../preprocessors/markdown_header_splitter.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 7d09ce4591..963b46ceb5 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -367,23 +367,6 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N return {"documents": final_docs} - def to_dict(self) -> dict[str, Any]: - """Serialize component to dictionary.""" - return default_to_dict( - self, - infer_header_levels=self.infer_header_levels, - page_break_character=self.page_break_character, - secondary_split=self.secondary_split, - split_length=self.split_length, - split_overlap=self.split_overlap, - split_threshold=self.split_threshold, - ) - - @classmethod - def from_dict(cls, data: dict[str, Any]) -> "MarkdownHeaderSplitter": - """Deserialize component from dictionary.""" - return default_from_dict(cls, data) - # TODO: move to proper test file once ready if __name__ == "__main__": From f0852218298c3648e963b526cda132a30a8c3f48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 16:02:30 +0200 Subject: [PATCH 13/85] cleanup --- .../preprocessors/markdown_header_splitter.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 963b46ceb5..8b997b5173 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py 
+++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -1,7 +1,7 @@ import re -from typing import Any, Literal, Optional +from typing import Literal, Optional -from haystack import Document, component, default_from_dict, default_to_dict, logging +from haystack import Document, component, logging from haystack.components.preprocessors import DocumentSplitter logger = logging.getLogger(__name__) @@ -73,8 +73,8 @@ def _split_by_function(self, doc: Document) -> list[Document]: if doc.meta: meta = self._flatten_dict(doc.meta) - # add standard metadata - meta.update({"source_id": doc.id, "split_id": i, "total_pages": total_pages, "page_number": current_page}) + # add standard metadata (no split_id here) + meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) # get page number based on page breaks page_breaks = self._process_split_content(split["content"], i) @@ -365,6 +365,10 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N else: final_docs = header_split_docs + # assign unique, sequential split_id to all final chunks + for idx, doc in enumerate(final_docs): + doc.meta["split_id"] = idx + return {"documents": final_docs} From 3490d89056e419df831057266a2f8d88e97cd108 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 9 Sep 2025 16:14:43 +0200 Subject: [PATCH 14/85] cleanup --- .../preprocessors/markdown_header_splitter.py | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 8b997b5173..c1e739ac00 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -1,5 +1,5 @@ import re -from typing import Literal, Optional +from typing import Callable, Literal, Optional from haystack import Document, component, 
logging from haystack.components.preprocessors import DocumentSplitter @@ -7,18 +7,28 @@ logger = logging.getLogger(__name__) -class CustomDocumentSplitter(DocumentSplitter): +class _CustomDocumentSplitter(DocumentSplitter): """ - Custom DocumentSplitter that supports splitting functions returning dicts with 'content' and 'meta'. + Internal helper class that extends DocumentSplitter to support splitting functions. + + This class handles splitting functions that return dictionaries with 'content' and 'meta' + keys instead of just strings. For internal use only within the MarkdownHeaderSplitter. """ - def __init__(self, split_by="function", splitting_function=None, page_break_character="\\f"): + def __init__( + self, + split_by: str = "function", + splitting_function: Optional[Callable] = None, + page_break_character: str = "\\f", + ): """ - Initialize the CustomDocumentSplitter. + Initialize the _CustomDocumentSplitter. :param split_by: The method to split by. Must be "function" for custom splitting functions. - :param splitting_function: A custom function that takes a string and returns a list of dicts with 'content' and optional 'meta'. - :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). + :param splitting_function: A custom function that takes a string and returns a list of dicts + with 'content' and optional 'meta'. + :param page_break_character: Character used to identify page breaks. + Defaults to form feed ("\\f"). """ super().__init__(split_by=split_by, splitting_function=splitting_function) self.page_break_character = page_break_character @@ -119,7 +129,8 @@ def __init__( :param secondary_split: Optional secondary split condition after header splitting. Options are "none", "word", "passage", "period", "line". Defaults to "none". :param split_length: The maximum number of units in each split when using secondary splitting. Defaults to 200. 
- :param split_overlap: The number of overlapping units for each split when using secondary splitting. Defaults to 0. + :param split_overlap: The number of overlapping units for each split when using secondary splitting. + Defaults to 0. :param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0. """ self.infer_header_levels = infer_header_levels @@ -349,7 +360,7 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N processed_documents.append(doc) # split by markdown headers - header_splitter = CustomDocumentSplitter( + header_splitter = _CustomDocumentSplitter( split_by="function", splitting_function=lambda text: self._split_by_markdown_headers(text), page_break_character=self.page_break_character, From 0bf3187fc70d6f809056847f0a45329dd023a059 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 16 Sep 2025 15:55:45 +0200 Subject: [PATCH 15/85] add tests cleanup --- .../test_markdown_header_splitter.py | 203 +++++++++++++++++- 1 file changed, 193 insertions(+), 10 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 89a551bd2a..2c97270290 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -4,6 +4,7 @@ from haystack.components.preprocessors.markdown_header_splitter import MarkdownHeaderSplitter +# Fixtures @pytest.fixture def sample_text(): return ( @@ -22,6 +23,7 @@ def sample_text(): ) +# Basic splitting and structure def test_basic_split(sample_text): splitter = MarkdownHeaderSplitter() docs = [Document(content=sample_text)] @@ -42,28 +44,209 @@ def test_basic_split(sample_text): assert doc.meta.get("header") is not None -def test_parentheaders(sample_text): +def test_split_parentheaders(sample_text): splitter = MarkdownHeaderSplitter() - docs = 
[Document(content=sample_text)] + docs = [Document(content=sample_text), Document(content="# H1\n## H2\n### H3\nContent")] result = splitter.run(documents=docs) split_docs = result["documents"] - - # Find a subheader and check parentheaders + # Check parentheaders for both a deep subheader and a simple one subheader_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.2") assert "Header 1" in subheader_doc.meta["parentheaders"] assert "Header 1.2" in subheader_doc.meta["parentheaders"] + h3_doc = next((doc for doc in split_docs if doc.meta["header"] == "H3"), None) + if h3_doc: + assert h3_doc.meta["parentheaders"] == ["H1", "H2"] + + +def test_split_no_headers(): + splitter = MarkdownHeaderSplitter() + docs = [Document(content="No headers here."), Document(content="Just some text without headers.")] + result = splitter.run(documents=docs) + split_docs = result["documents"] + # Should return one doc per input, header is None + assert len(split_docs) == 2 + for doc in split_docs: + assert doc.meta["header"] is None -def test_no_headers(): +def test_split_multiple_documents(sample_text): splitter = MarkdownHeaderSplitter() - docs = [Document(content="Just some text without headers.")] + docs = [ + Document(content=sample_text), + Document(content="# Another Header\nSome content."), + Document(content="# H1\nA"), + Document(content="# H2\nB"), + ] + result = splitter.run(documents=docs) + split_docs = result["documents"] + headers = {doc.meta["header"] for doc in split_docs} + assert {"Another Header", "H1", "H2"}.issubset(headers) + + +def test_split_only_headers(): + text = "# H1\n# H2\n# H3" + splitter = MarkdownHeaderSplitter() + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + # Should not create chunks for headers with no content + assert len(split_docs) == 0 + + +# Header inference and overrides +def test_split_infer_header_levels(): + text = "## H1\n## H2\nContent" + splitter = 
MarkdownHeaderSplitter(infer_header_levels=True) + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + # Should rewrite headers to # and ## + assert split_docs[0].content.startswith("## H2") or split_docs[0].content.startswith("# H1") + + +def test_infer_header_levels_complex(): + """Test header level inference with a complex document structure.""" + text = ( + "## All Headers Same Level\n" + "Some content\n" + "## Second Header\n" + "Some content\n" # Added content to ensure headers are processed correctly + "## Third Header With No Content\n" + "## Fourth Header With No Content\n" + "## Fifth Header\n" + "More content" + ) + + splitter = MarkdownHeaderSplitter(infer_header_levels=True) + docs = [Document(content=text)] result = splitter.run(documents=docs) - assert len(result["documents"]) == 1 + split_docs = result["documents"] + # Get docs by header content to avoid position assumptions + first_doc = next((doc for doc in split_docs if "All Headers Same Level" in doc.content), None) + second_doc = next((doc for doc in split_docs if "Second Header" in doc.content), None) -def test_multiple_documents(sample_text): + # First header should be level 1 + assert first_doc and "# All Headers Same Level" in first_doc.content + + # Second header with content should stay at level 1 + assert second_doc and "# Second Header" in second_doc.content + + +def test_infer_header_levels_override_both_directions(): + text = "## H1\n## H2\nContent" + docs = [Document(content=text)] + + # False at init, True at run + splitter = MarkdownHeaderSplitter(infer_header_levels=False) + result = splitter.run(documents=docs, infer_header_levels=True) + assert "# " in result["documents"][0].content + + # True at init, False at run + splitter = MarkdownHeaderSplitter(infer_header_levels=True) + result = splitter.run(documents=docs, infer_header_levels=False) + assert all("## " in doc.content for doc in result["documents"]) + + +# Metadata 
preservation +def test_preserve_document_metadata(): + """Test that document metadata is preserved through splitting.""" splitter = MarkdownHeaderSplitter() - docs = [Document(content=sample_text), Document(content="# Another Header\nSome content.")] + docs = [Document(content="# Header\nContent", meta={"source": "test", "importance": "high", "custom_field": 123})] + result = splitter.run(documents=docs) split_docs = result["documents"] - assert any(doc.meta["header"] == "Another Header" for doc in split_docs) + + # Original metadata should be preserved + assert split_docs[0].meta["source"] == "test" + assert split_docs[0].meta["importance"] == "high" + assert split_docs[0].meta["custom_field"] == 123 + + # New metadata should be added + assert "header" in split_docs[0].meta + assert "split_id" in split_docs[0].meta + + +# Error and edge case handling +def test_non_text_document(caplog): + """Test that the component correctly handles non-text documents.""" + splitter = MarkdownHeaderSplitter() + docs = [Document(content=None)] + + # Should raise ValueError about text documents + with pytest.raises(ValueError, match="only works with text documents"): + splitter.run(documents=docs) + + +def test_empty_document_list(): + """Test handling of an empty document list.""" + splitter = MarkdownHeaderSplitter() + result = splitter.run(documents=[]) + assert result["documents"] == [] + + +def test_invalid_secondary_split(): + """Test that an invalid secondary split type raises an error.""" + # In MarkdownHeaderSplitter, this is validated at DocumentSplitter instantiation time in _apply_secondary_splitting + splitter = MarkdownHeaderSplitter(secondary_split="invalid_split_type") + docs = [Document(content="# Header\nContent")] + + # Error should be raised when run is called and secondary splitter is created + with pytest.raises(ValueError, match="split_by must be one of"): + splitter.run(documents=docs) + + +def test_invalid_split_parameters(): + """Test invalid split 
parameter validation.""" + # Similar to invalid_secondary_split, validation happens at DocumentSplitter instantiation + + # Test split_length validation + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=0) + docs = [Document(content="# Header\nContent")] + with pytest.raises(ValueError, match="split_length must be greater than 0"): + splitter.run(documents=docs) + + # Test split_overlap validation + splitter = MarkdownHeaderSplitter(secondary_split="word", split_overlap=-1) + docs = [Document(content="# Header\nContent")] + with pytest.raises(ValueError, match="split_overlap must be greater than or equal to 0"): + splitter.run(documents=docs) + + +def test_empty_content_handling(): + """Test handling of documents with empty content.""" + splitter = MarkdownHeaderSplitter() + docs = [Document(content="")] + result = splitter.run(documents=docs) + + # DocumentSplitter skips empty documents by default + assert len(result["documents"]) == 0 + + +# Output format and split ID checks +def test_document_splitting_format(): + """Test that the format of split documents is correct.""" + splitter = MarkdownHeaderSplitter() + docs = [Document(content="# Header\nContent")] + result = splitter.run(documents=docs) + + # Basic validation of the output structure + assert isinstance(result, dict) + assert "documents" in result + assert isinstance(result["documents"], list) + + +def test_split_id_sequentiality_primary_and_secondary(): + text = "# Header\n" + "Word " * 30 + # Test primary splitting + splitter = MarkdownHeaderSplitter() + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_ids = [doc.meta["split_id"] for doc in result["documents"]] + assert split_ids == list(range(len(split_ids))) + + # Test secondary splitting + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=5) + result = splitter.run(documents=docs) + split_ids = [doc.meta["split_id"] for doc in result["documents"]] + assert split_ids == 
list(range(len(split_ids))) From d87ef9736ea84dc4b0dfdfdcf92d00e2abbb727a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 19 Sep 2025 16:36:53 +0200 Subject: [PATCH 16/85] move initialization of secondary-splitter out of run method --- .../preprocessors/markdown_header_splitter.py | 24 ++++++++----- .../test_markdown_header_splitter.py | 36 +++++++++---------- 2 files changed, 33 insertions(+), 27 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index c1e739ac00..a03b858341 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -140,6 +140,17 @@ def __init__( self.split_overlap = split_overlap self.split_threshold = split_threshold + # initialize secondary_splitter only if needed + if self.secondary_split != "none": + self.secondary_splitter = DocumentSplitter( + split_by=self.secondary_split, + split_length=self.split_length, + split_overlap=self.split_overlap, + split_threshold=self.split_threshold, + ) + else: + self.secondary_splitter = None + def _infer_and_rewrite_header_levels(self, text: str) -> str: """ Infer and rewrite header levels in the markdown text. 
@@ -301,15 +312,8 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # track page from meta current_page = doc.meta.get("page_number", 1) - secondary_splitter = DocumentSplitter( - split_by=self.secondary_split, - split_length=self.split_length, - split_overlap=self.split_overlap, - split_threshold=self.split_threshold, - ) - - # apply secondary splitting - secondary_splits = secondary_splitter.run( + # use the pre-initialized secondary splitter + secondary_splits = self.secondary_splitter.run( documents=[Document(content=content_for_splitting, meta=doc.meta)] )["documents"] accumulated_page_breaks = 0 # track page breaks @@ -421,3 +425,5 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N print("\n---Document---") print(doc.content) print(doc.meta) + print(doc.content) + print(doc.meta) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 2c97270290..a89396e2c8 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -185,32 +185,21 @@ def test_empty_document_list(): assert result["documents"] == [] -def test_invalid_secondary_split(): - """Test that an invalid secondary split type raises an error.""" - # In MarkdownHeaderSplitter, this is validated at DocumentSplitter instantiation time in _apply_secondary_splitting - splitter = MarkdownHeaderSplitter(secondary_split="invalid_split_type") - docs = [Document(content="# Header\nContent")] - - # Error should be raised when run is called and secondary splitter is created +def test_invalid_secondary_split_at_init(): + """Test that an invalid secondary split type raises an error at initialization time.""" with pytest.raises(ValueError, match="split_by must be one of"): - splitter.run(documents=docs) + MarkdownHeaderSplitter(secondary_split="invalid_split_type") -def 
test_invalid_split_parameters(): - """Test invalid split parameter validation.""" - # Similar to invalid_secondary_split, validation happens at DocumentSplitter instantiation - +def test_invalid_split_parameters_at_init(): + """Test invalid split parameter validation at initialization time.""" # Test split_length validation - splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=0) - docs = [Document(content="# Header\nContent")] with pytest.raises(ValueError, match="split_length must be greater than 0"): - splitter.run(documents=docs) + MarkdownHeaderSplitter(secondary_split="word", split_length=0) # Test split_overlap validation - splitter = MarkdownHeaderSplitter(secondary_split="word", split_overlap=-1) - docs = [Document(content="# Header\nContent")] with pytest.raises(ValueError, match="split_overlap must be greater than or equal to 0"): - splitter.run(documents=docs) + MarkdownHeaderSplitter(secondary_split="word", split_overlap=-1) def test_empty_content_handling(): @@ -250,3 +239,14 @@ def test_split_id_sequentiality_primary_and_secondary(): result = splitter.run(documents=docs) split_ids = [doc.meta["split_id"] for doc in result["documents"]] assert split_ids == list(range(len(split_ids))) + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_ids = [doc.meta["split_id"] for doc in result["documents"]] + assert split_ids == list(range(len(split_ids))) + + # Test secondary splitting + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=5) + result = splitter.run(documents=docs) + split_ids = [doc.meta["split_id"] for doc in result["documents"]] + assert split_ids == list(range(len(split_ids))) + assert split_ids == list(range(len(split_ids))) From 84e34edae3134326dd5638ca669fa73871762474 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 19 Sep 2025 16:52:18 +0200 Subject: [PATCH 17/85] move _custom_document_splitter to class method --- 
.../preprocessors/markdown_header_splitter.py | 179 +++++++----------- 1 file changed, 68 insertions(+), 111 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index a03b858341..c6e2e2b1a9 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -7,104 +7,6 @@ logger = logging.getLogger(__name__) -class _CustomDocumentSplitter(DocumentSplitter): - """ - Internal helper class that extends DocumentSplitter to support splitting functions. - - This class handles splitting functions that return dictionaries with 'content' and 'meta' - keys instead of just strings. For internal use only within the MarkdownHeaderSplitter. - """ - - def __init__( - self, - split_by: str = "function", - splitting_function: Optional[Callable] = None, - page_break_character: str = "\\f", - ): - """ - Initialize the _CustomDocumentSplitter. - - :param split_by: The method to split by. Must be "function" for custom splitting functions. - :param splitting_function: A custom function that takes a string and returns a list of dicts - with 'content' and optional 'meta'. - :param page_break_character: Character used to identify page breaks. - Defaults to form feed ("\\f"). 
- """ - super().__init__(split_by=split_by, splitting_function=splitting_function) - self.page_break_character = page_break_character - - def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = None) -> dict: - """Helper method to flatten a nested dictionary.""" - if target_dict is None: - target_dict = {} - - for key, value in d.items(): - new_key = f"{prefix}{key}" if prefix else key - - if isinstance(value, dict): - self._flatten_dict(value, f"{new_key}_", target_dict) - else: - target_dict[new_key] = value - - return target_dict - - def _process_split_content(self, split_content: str, split_index: int) -> int: - """Process the content of a split and return the number of page breaks.""" - if not isinstance(split_content, str): - return 0 - - page_breaks = split_content.count(self.page_break_character) - if page_breaks > 0: - logger.debug( - "Found {page_breaks} page breaks in split {split_index}", - page_breaks=page_breaks, - split_index=split_index, - ) - return page_breaks - - def _split_by_function(self, doc: Document) -> list[Document]: - """Split document using a custom function that returns dictionaries with 'content' and 'meta'.""" - logger.debug("Splitting document with id={doc_id}", doc_id=doc.id) - splits = self.splitting_function(doc.content) - docs = [] - - # calculate total pages and set current page - total_pages = doc.meta.get("total_pages", 0) or doc.content.count(self.page_break_character) + 1 - current_page = doc.meta.get("page_number", 1) - logger.debug( - "Starting page number: {current_page}, Total pages: {total_pages}", - current_page=current_page, - total_pages=total_pages, - ) - - # get meta for each split - for i, split in enumerate(splits): - meta = {} - if doc.meta: - meta = self._flatten_dict(doc.meta) - - # add standard metadata (no split_id here) - meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) - - # get page number based on page breaks - page_breaks = 
self._process_split_content(split["content"], i) - current_page += page_breaks - - # add split-specific metadata - if split.get("meta"): - meta.update(self._flatten_dict(split.get("meta"))) - - docs.append(Document(content=split["content"], meta=meta)) - - logger.debug( - "Split into {num_docs} documents for id={doc_id}, final page: {current_page}", - num_docs=len(docs), - doc_id=doc.id, - current_page=current_page, - ) - return docs - - @component class MarkdownHeaderSplitter: """ @@ -343,6 +245,61 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document logger.info("Secondary splitting complete. Final count: {final_count} documents.", final_count=len(result_docs)) return result_docs + def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = None) -> dict: + if target_dict is None: + target_dict = {} + for key, value in d.items(): + new_key = f"{prefix}{key}" if prefix else key + if isinstance(value, dict): + self._flatten_dict(value, f"{new_key}_", target_dict) + else: + target_dict[new_key] = value + return target_dict + + def _process_split_content(self, split_content: str, split_index: int) -> int: + if not isinstance(split_content, str): + return 0 + page_breaks = split_content.count(self.page_break_character) + if page_breaks > 0: + logger.debug( + "Found {page_breaks} page breaks in split {split_index}", + page_breaks=page_breaks, + split_index=split_index, + ) + return page_breaks + + def _split_documents_by_function(self, documents: list[Document], splitting_function: Callable) -> list[Document]: + result_docs = [] + for doc in documents: + logger.debug("Splitting document with id={doc_id}", doc_id=doc.id) + splits = splitting_function(doc.content) + docs = [] + total_pages = doc.meta.get("total_pages", 0) or doc.content.count(self.page_break_character) + 1 + current_page = doc.meta.get("page_number", 1) + logger.debug( + "Starting page number: {current_page}, Total pages: {total_pages}", + 
current_page=current_page, + total_pages=total_pages, + ) + for i, split in enumerate(splits): + meta = {} + if doc.meta: + meta = self._flatten_dict(doc.meta) + meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) + page_breaks = self._process_split_content(split["content"], i) + current_page += page_breaks + if split.get("meta"): + meta.update(self._flatten_dict(split.get("meta"))) + docs.append(Document(content=split["content"], meta=meta)) + logger.debug( + "Split into {num_docs} documents for id={doc_id}, final page: {current_page}", + num_docs=len(docs), + doc_id=doc.id, + current_page=current_page, + ) + result_docs.extend(docs) + return result_docs + @component.output_types(documents=list[Document]) def run(self, documents: list[Document], infer_header_levels: Optional[bool] = None) -> dict[str, list[Document]]: """ @@ -352,35 +309,37 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N :param infer_header_levels: If True, attempts to infer and rewrite header levels before splitting. If None, uses the value from initialization. 
""" + # validate input documents + for doc in documents: + if not isinstance(doc.content, str): + raise ValueError("MarkdownHeaderSplitter only works with text documents (str content).") + infer_header_levels = infer_header_levels if infer_header_levels is not None else self.infer_header_levels - # process documents - preprocess if told to processed_documents = [] for doc in documents: + # skip empty documents + if not doc.content or not doc.content.strip(): + continue if infer_header_levels: content = self._infer_and_rewrite_header_levels(doc.content) processed_documents.append(Document(content=content, meta=doc.meta, id=doc.id)) else: processed_documents.append(doc) - # split by markdown headers - header_splitter = _CustomDocumentSplitter( - split_by="function", - splitting_function=lambda text: self._split_by_markdown_headers(text), - page_break_character=self.page_break_character, - ) + if not processed_documents: + return {"documents": []} - # get splits - header_split_docs = header_splitter.run(documents=processed_documents)["documents"] + header_split_docs = self._split_documents_by_function( + processed_documents, splitting_function=self._split_by_markdown_headers + ) logger.info("Header splitting produced {num_docs} documents", num_docs=len(header_split_docs)) - # apply secondary splitting if requested if self.secondary_split != "none": final_docs = self._apply_secondary_splitting(header_split_docs) else: final_docs = header_split_docs - # assign unique, sequential split_id to all final chunks for idx, doc in enumerate(final_docs): doc.meta["split_id"] = idx @@ -425,5 +384,3 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N print("\n---Document---") print(doc.content) print(doc.meta) - print(doc.content) - print(doc.meta) From 32b09585015c5d6d641fa79921394eec8ccf3746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 19 Sep 2025 17:02:08 +0200 Subject: [PATCH 18/85] removed the 
_CustomDocumentSplitter class. splitting logic is now encapsulated within the MarkdownHeaderSplitter class as private methods. --- .../components/preprocessors/markdown_header_splitter.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index c6e2e2b1a9..aa8af1fd96 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -10,7 +10,14 @@ @component class MarkdownHeaderSplitter: """ - A custom component that splits documents at markdown headers with optional secondary splitting. + Split documents at Markdown headers, with optional secondary splitting and header level inference. + + This component processes text documents by: + - Splitting them into chunks at Markdown headers (e.g., '#', '##', etc.), preserving header hierarchy as metadata. + - Optionally inferring and rewriting header levels for documents where header structure is ambiguous. + - Optionally applying a secondary split (by word, passage, period, or line) to each chunk. + This is done in haystack's DocumentSplitter. + - Preserving and propagating metadata such as parent headers, page numbers, and split IDs. 
""" def __init__( From 69b79532eff332b7c7fce8467ec7b7f477f383b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 19 Sep 2025 17:18:29 +0200 Subject: [PATCH 19/85] return to standard feed-forward character and add tests for page break handling --- .../preprocessors/markdown_header_splitter.py | 4 +- .../test_markdown_header_splitter.py | 62 +++++++++++++++++++ 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index aa8af1fd96..35389c3d62 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -23,7 +23,7 @@ class MarkdownHeaderSplitter: def __init__( self, infer_header_levels: bool = False, - page_break_character: str = "\\f", + page_break_character: str = "\f", secondary_split: Literal["none", "word", "passage", "period", "line"] = "none", split_length: int = 200, split_overlap: int = 0, @@ -34,7 +34,7 @@ def __init__( :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure. Useful for documents where all headers use the same level. Defaults to False. - :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). + :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\f"). :param secondary_split: Optional secondary split condition after header splitting. Options are "none", "word", "passage", "period", "line". Defaults to "none". :param split_length: The maximum number of units in each split when using secondary splitting. Defaults to 200. 
diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index a89396e2c8..c01b624275 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -250,3 +250,65 @@ def test_split_id_sequentiality_primary_and_secondary(): split_ids = [doc.meta["split_id"] for doc in result["documents"]] assert split_ids == list(range(len(split_ids))) assert split_ids == list(range(len(split_ids))) + + +def test_secondary_split_with_overlap(): + text = "# Header\n" + "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10" + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=4, split_overlap=2) + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + # Overlap of 2, so each chunk after the first should share 2 words with previous + assert len(split_docs) > 1 + for i in range(1, len(split_docs)): + prev_words = split_docs[i - 1].content.split() + curr_words = split_docs[i].content.split() + # The overlap should be the last 2 words of previous == first 2 of current + assert prev_words[-2:] == curr_words[:2] + + +def test_secondary_split_with_threshold(): + text = "# Header\n" + " ".join([f"word{i}" for i in range(1, 11)]) + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3, split_threshold=2) + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + # The last chunk should have at least split_threshold words if possible + for doc in split_docs[:-1]: + assert len(doc.content.split()) == 3 + # The last chunk should have at least 2 words (threshold) + assert len(split_docs[-1].content.split()) >= 2 + + +def test_page_break_handling_in_secondary_split(): + text = "# Header\nFirst page\fSecond page\fThird page" + splitter = 
MarkdownHeaderSplitter(secondary_split="word", split_length=2) + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + # The page_number should increment at each page break + page_numbers = [doc.meta.get("page_number") for doc in split_docs] + # Should start at 1 and increment at each \f + assert page_numbers[0] == 1 + assert 2 in page_numbers + # Remove: assert 3 in page_numbers + # Instead, check that the max page number is 2 or 3, depending on split alignment + assert max(page_numbers) >= 2 + + +def test_page_break_handling_with_multiple_headers(): + text = "# Header 1\nPage 1\fPage 2\n# Header 2\nPage 3\fPage 4" + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=2) + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + # Collect page numbers for each header + header1_pages = [doc.meta.get("page_number") for doc in split_docs if doc.meta.get("header") == "Header 1"] + header2_pages = [doc.meta.get("page_number") for doc in split_docs if doc.meta.get("header") == "Header 2"] + # Both headers should have splits with page_number 1 and 2 for Header 1, and 1 and 2 for Header 2 + # (relative to their own chunk) + assert min(header1_pages) == 1 + assert max(header1_pages) >= 2 + # header2_pages may start at 2 if the previous header's last chunk ended with a page break + assert min(header2_pages) >= 1 + assert max(header2_pages) >= 2 From f5b91f06aa9a4eff864bd61c193ff6b61d124b49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 19 Sep 2025 17:23:40 +0200 Subject: [PATCH 20/85] quit exposing splitting_function param since it shouldn't be changed anyway --- .../components/preprocessors/markdown_header_splitter.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py 
index 35389c3d62..484417db57 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -275,11 +275,11 @@ def _process_split_content(self, split_content: str, split_index: int) -> int: ) return page_breaks - def _split_documents_by_function(self, documents: list[Document], splitting_function: Callable) -> list[Document]: + def _split_documents_by_markdown_headers(self, documents: list[Document]) -> list[Document]: result_docs = [] for doc in documents: logger.debug("Splitting document with id={doc_id}", doc_id=doc.id) - splits = splitting_function(doc.content) + splits = self._split_by_markdown_headers(doc.content) docs = [] total_pages = doc.meta.get("total_pages", 0) or doc.content.count(self.page_break_character) + 1 current_page = doc.meta.get("page_number", 1) @@ -337,9 +337,7 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N if not processed_documents: return {"documents": []} - header_split_docs = self._split_documents_by_function( - processed_documents, splitting_function=self._split_by_markdown_headers - ) + header_split_docs = self._split_documents_by_markdown_headers(processed_documents) logger.info("Header splitting produced {num_docs} documents", num_docs=len(header_split_docs)) if self.secondary_split != "none": From 83e5579d4086504b39c88d44fb0ffe52bf9d5b2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 19 Sep 2025 17:24:35 +0200 Subject: [PATCH 21/85] remove test section in module --- .../preprocessors/markdown_header_splitter.py | 40 ------------------- 1 file changed, 40 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 484417db57..d913bf7ba7 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ 
-349,43 +349,3 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N doc.meta["split_id"] = idx return {"documents": final_docs} - - -# TODO: move to proper test file once ready -if __name__ == "__main__": - print() - print("===== Example 1: Regular splitting =====") - splitter = MarkdownHeaderSplitter() - content = """# Header 1 -## Subheader 1.1 -Content under subheader 1.1. -## Subheader 1.2 -### Subheader 1.2.1 -Content under subheader 1.2.1.""" - print("Original content:") - print(content) - example_doc = Document(content=content) - result = splitter.run(documents=[example_doc]) - for doc in result["documents"]: - print("\n---Document---") - print(doc.content) - print(doc.meta) - - print() - print("===== Example 2: Splitting with header inference =====") - splitter = MarkdownHeaderSplitter(infer_header_levels=True) - content = """## Header 1 -## Subheader 1.1 -Content under subheader 1.1. -## Subheader 1.2 -## Subheader 1.2.1 -Content under subheader 1.2.1.""" - print("Original content:") - print(content) - example_doc = Document(content=content) - result = splitter.run(documents=[example_doc]) - print("\nAfter header inference and splitting:") - for doc in result["documents"]: - print("\n---Document---") - print(doc.content) - print(doc.meta) From f3625f528df9f25f421ef1a46fd48077483324c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 19 Sep 2025 17:26:22 +0200 Subject: [PATCH 22/85] add license header --- .../components/preprocessors/markdown_header_splitter.py | 6 +++++- .../preprocessors/test_markdown_header_splitter.py | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index d913bf7ba7..f95e1e01ba 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -1,5 +1,9 @@ 
+# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + import re -from typing import Callable, Literal, Optional +from typing import Literal, Optional from haystack import Document, component, logging from haystack.components.preprocessors import DocumentSplitter diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index c01b624275..86b27a1c24 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + import pytest from haystack import Document From 526ac4f87a84178b55a2b0d1e2c64ce84cc35411 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 19 Sep 2025 18:02:46 +0200 Subject: [PATCH 23/85] add release note --- ...d-md-header-splitter-df5c024a6ddd2718.yaml | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 releasenotes/notes/add-md-header-splitter-df5c024a6ddd2718.yaml diff --git a/releasenotes/notes/add-md-header-splitter-df5c024a6ddd2718.yaml b/releasenotes/notes/add-md-header-splitter-df5c024a6ddd2718.yaml new file mode 100644 index 0000000000..48cf170d4e --- /dev/null +++ b/releasenotes/notes/add-md-header-splitter-df5c024a6ddd2718.yaml @@ -0,0 +1,36 @@ +--- +highlights: > + Added a MarkdownHeaderSplitter component for splitting documents at Markdown headers, with optional header level inference and secondary splitting based on Haystack's DocumentSplitter. This enables a more appropriate splitting logic for Markdown documents where sections are defined by headers, improving the structure and relevance of the resulting document chunks for downstream tasks. 
+ +features: + - | + Introduced the `MarkdownHeaderSplitter` component: + - Splits documents into chunks at Markdown headers (`#`, `##`, etc.), preserving header hierarchy as metadata. + - Optionally infers and rewrites header levels for documents where header structure is ambiguous (e.g. documents parsed using Docling). + - Supports secondary splitting (by word, passage, period, or line) for further chunking after header-based splitting using Haystack's `DocumentSplitter`. + - Preserves and propagates metadata such as parent headers and page numbers. + - Handles edge cases such as documents with no headers, empty content, and non-text documents. + +upgrade: + - | + No upgrade actions required. This is a new component and does not affect existing pipelines. + +enhancements: + - | + Improves preprocessing flexibility for Markdown documents, making it easier to build indexing pipelines for Markdown-files. + +issues: + - | + Inferring header levels can only move downwards in the header hierarchy (e.g., `##` to `###`), not back up (e.g., `###` to `##`), meaning that the algorithm may not perfectly reconstruct the original header hierarchy in cases where header levels move up again. + +deprecations: + - | + None. + +security: + - | + No security-related changes. + +fixes: + - | + N/A (new feature). 
From a46ac62782911102c84bfb974f69c39cf04b099e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 23 Sep 2025 11:08:54 +0200 Subject: [PATCH 24/85] minor refactor for type safety --- .../preprocessors/markdown_header_splitter.py | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index f95e1e01ba..63f163dca4 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -54,6 +54,7 @@ def __init__( self.split_threshold = split_threshold # initialize secondary_splitter only if needed + self.secondary_splitter: Optional[DocumentSplitter] if self.secondary_split != "none": self.secondary_splitter = DocumentSplitter( split_by=self.secondary_split, @@ -151,9 +152,9 @@ def _split_by_markdown_headers(self, text: str) -> list[dict]: return [{"content": text, "meta": {"header": None, "parentheaders": []}}] # process headers and build chunks - chunks = [] - header_stack = [None] * 6 - active_parents = [] + chunks: list[dict] = [] + header_stack: list[Optional[str]] = [None] * 6 + active_parents: list[str] = [] for i, match in enumerate(matches): # extract header info @@ -174,7 +175,7 @@ def _split_by_markdown_headers(self, text: str) -> list[dict]: # skip splits w/o content if not content: # Add as parent for subsequent headers - active_parents = [h for h in header_stack[: level - 1] if h] + active_parents = [h for h in header_stack[: level - 1] if h is not None] active_parents.append(header_text) continue @@ -193,7 +194,7 @@ def _split_by_markdown_headers(self, text: str) -> list[dict]: ) # reset active parents - active_parents = [h for h in header_stack[: level - 1] if h] + active_parents = [h for h in header_stack[: level - 1] if h is not None] logger.info("Split into {num_chunks} chunks by markdown 
headers.", num_chunks=len(chunks)) return chunks @@ -211,14 +212,16 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document result_docs = [] for doc in documents: + if doc.content is None: + result_docs.append(doc) + continue # extract header information header_match = re.search(r"(#{1,6}) (.+)(?:\n|$)", doc.content) + content_for_splitting: str = doc.content if header_match: content_for_splitting = doc.content[header_match.end() :] - else: - content_for_splitting = doc.content - if not content_for_splitting.strip(): # skip empty content + if not content_for_splitting or not content_for_splitting.strip(): # skip empty content result_docs.append(doc) continue @@ -226,6 +229,9 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document current_page = doc.meta.get("page_number", 1) # use the pre-initialized secondary splitter + if self.secondary_splitter is None: + result_docs.append(doc) + continue secondary_splits = self.secondary_splitter.run( documents=[Document(content=content_for_splitting, meta=doc.meta)] )["documents"] @@ -236,8 +242,9 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # calculate page number for this split if i > 0: # page break counting prev_content = secondary_splits[i - 1].content - page_breaks = prev_content.count(self.page_break_character) - accumulated_page_breaks += page_breaks + if prev_content is not None: + page_breaks = prev_content.count(self.page_break_character) + accumulated_page_breaks += page_breaks # set page number to meta split.meta["page_number"] = current_page + accumulated_page_breaks @@ -283,10 +290,14 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis result_docs = [] for doc in documents: logger.debug("Splitting document with id={doc_id}", doc_id=doc.id) + if doc.content is None: + continue splits = self._split_by_markdown_headers(doc.content) docs = [] - total_pages = doc.meta.get("total_pages", 0) 
or doc.content.count(self.page_break_character) + 1 - current_page = doc.meta.get("page_number", 1) + total_pages = doc.meta.get("total_pages", 0) if doc.meta else 0 + if not total_pages: + total_pages = doc.content.count(self.page_break_character) + 1 + current_page = doc.meta.get("page_number", 1) if doc.meta else 1 logger.debug( "Starting page number: {current_page}, Total pages: {total_pages}", current_page=current_page, @@ -300,7 +311,7 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis page_breaks = self._process_split_content(split["content"], i) current_page += page_breaks if split.get("meta"): - meta.update(self._flatten_dict(split.get("meta"))) + meta.update(self._flatten_dict(split.get("meta") or {})) docs.append(Document(content=split["content"], meta=meta)) logger.debug( "Split into {num_docs} documents for id={doc_id}, final page: {current_page}", From 821d907015f9970033658cdc07452c142d177d7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= <45487933+OGuggenbuehl@users.noreply.github.com> Date: Tue, 23 Sep 2025 11:22:14 +0200 Subject: [PATCH 25/85] Update haystack/components/preprocessors/markdown_header_splitter.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- haystack/components/preprocessors/markdown_header_splitter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 63f163dca4..20f591ae46 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -26,6 +26,7 @@ class MarkdownHeaderSplitter: def __init__( self, + *, infer_header_levels: bool = False, page_break_character: str = "\f", secondary_split: Literal["none", "word", "passage", "period", "line"] = "none", From c630e14f7de8a9fd54d7d0b72ef0add4941bc653 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 23 Sep 2025 11:35:11 +0200 Subject: [PATCH 26/85] remove unneeded release notes entries --- ...d-md-header-splitter-df5c024a6ddd2718.yaml | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/releasenotes/notes/add-md-header-splitter-df5c024a6ddd2718.yaml b/releasenotes/notes/add-md-header-splitter-df5c024a6ddd2718.yaml index 48cf170d4e..bb5cbec612 100644 --- a/releasenotes/notes/add-md-header-splitter-df5c024a6ddd2718.yaml +++ b/releasenotes/notes/add-md-header-splitter-df5c024a6ddd2718.yaml @@ -1,7 +1,4 @@ --- -highlights: > - Added a MarkdownHeaderSplitter component for splitting documents at Markdown headers, with optional header level inference and secondary splitting based on Haystack's DocumentSplitter. This enables a more appropriate splitting logic for Markdown documents where sections are defined by headers, improving the structure and relevance of the resulting document chunks for downstream tasks. - features: - | Introduced the `MarkdownHeaderSplitter` component: @@ -10,27 +7,3 @@ features: - Supports secondary splitting (by word, passage, period, or line) for further chunking after header-based splitting using Haystack's `DocumentSplitter`. - Preserves and propagates metadata such as parent headers and page numbers. - Handles edge cases such as documents with no headers, empty content, and non-text documents. - -upgrade: - - | - No upgrade actions required. This is a new component and does not affect existing pipelines. - -enhancements: - - | - Improves preprocessing flexibility for Markdown documents, making it easier to build indexing pipelines for Markdown-files. - -issues: - - | - Inferring header levels can only move downwards in the header hierarchy (e.g., `##` to `###`), not back up (e.g., `###` to `##`), meaning that the algorithm may not perfectly reconstruct the original header hierarchy in cases where header levels move up again. - -deprecations: - - | - None. 
- -security: - - | - No security-related changes. - -fixes: - - | - N/A (new feature). From fa53e1b8b59ed64dd7015c551e4c953d5a9dbc77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 23 Sep 2025 11:42:05 +0200 Subject: [PATCH 27/85] improved documentation for methods --- .../components/preprocessors/markdown_header_splitter.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 20f591ae46..0ec25d7ff4 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -139,7 +139,7 @@ def _infer_and_rewrite_header_levels(self, text: str) -> str: logger.info("Rewrote {num_headers} headers with inferred levels.", num_headers=len(matches)) return modified_text - def _split_by_markdown_headers(self, text: str) -> list[dict]: + def _split_text_by_markdown_headers(self, text: str) -> list[dict]: """Split text by markdown headers and create chunks with appropriate metadata.""" logger.debug("Splitting text by markdown headers") @@ -265,6 +265,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document return result_docs def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = None) -> dict: + """Flatten a nested dictionary, concatenating keys with underscores.""" if target_dict is None: target_dict = {} for key, value in d.items(): @@ -276,6 +277,7 @@ def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = return target_dict def _process_split_content(self, split_content: str, split_index: int) -> int: + """Count page breaks in the split content and log if any are found.""" if not isinstance(split_content, str): return 0 page_breaks = split_content.count(self.page_break_character) @@ -288,12 +290,13 @@ def _process_split_content(self, 
split_content: str, split_index: int) -> int: return page_breaks def _split_documents_by_markdown_headers(self, documents: list[Document]) -> list[Document]: + """Split a list of documents by markdown headers, preserving metadata.""" result_docs = [] for doc in documents: logger.debug("Splitting document with id={doc_id}", doc_id=doc.id) if doc.content is None: continue - splits = self._split_by_markdown_headers(doc.content) + splits = self._split_text_by_markdown_headers(doc.content) docs = [] total_pages = doc.meta.get("total_pages", 0) if doc.meta else 0 if not total_pages: From 1e6cbe39d7ded6508f98b6d47e60561a9ec7ac23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 23 Sep 2025 14:07:43 +0200 Subject: [PATCH 28/85] improve method naming --- .../components/preprocessors/markdown_header_splitter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 0ec25d7ff4..b83080840e 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -66,7 +66,7 @@ def __init__( else: self.secondary_splitter = None - def _infer_and_rewrite_header_levels(self, text: str) -> str: + def _infer_header_levels(self, text: str) -> str: """ Infer and rewrite header levels in the markdown text. 
@@ -276,7 +276,7 @@ def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = target_dict[new_key] = value return target_dict - def _process_split_content(self, split_content: str, split_index: int) -> int: + def _count_page_breaks(self, split_content: str, split_index: int) -> int: """Count page breaks in the split content and log if any are found.""" if not isinstance(split_content, str): return 0 @@ -312,7 +312,7 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis if doc.meta: meta = self._flatten_dict(doc.meta) meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) - page_breaks = self._process_split_content(split["content"], i) + page_breaks = self._count_page_breaks(split["content"], i) current_page += page_breaks if split.get("meta"): meta.update(self._flatten_dict(split.get("meta") or {})) @@ -348,7 +348,7 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N if not doc.content or not doc.content.strip(): continue if infer_header_levels: - content = self._infer_and_rewrite_header_levels(doc.content) + content = self._infer_header_levels(doc.content) processed_documents.append(Document(content=content, meta=doc.meta, id=doc.id)) else: processed_documents.append(doc) From e756d998f242889533fe1456f880db0d0681de30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 23 Sep 2025 14:21:07 +0200 Subject: [PATCH 29/85] improved page-number assignment & added return in docstring minor cleanup --- .../preprocessors/markdown_header_splitter.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index b83080840e..07429f2202 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ 
-259,6 +259,8 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # assign unique, sequential split_id to all final chunks for idx, doc in enumerate(result_docs): + if doc.meta is None: + doc.meta = {} doc.meta["split_id"] = idx logger.info("Secondary splitting complete. Final count: {final_count} documents.", final_count=len(result_docs)) @@ -334,6 +336,12 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N :param documents: List of documents to split :param infer_header_levels: If True, attempts to infer and rewrite header levels before splitting. If None, uses the value from initialization. + + :returns: A dictionary with the following key: + - `documents`: List of documents with the split texts. Each document includes: + - A metadata field `source_id` to track the original document. + - A metadata field `page_number` to track the original page number. + - All other metadata copied from the original document. """ # validate input documents for doc in documents: @@ -363,8 +371,10 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N final_docs = self._apply_secondary_splitting(header_split_docs) else: final_docs = header_split_docs - - for idx, doc in enumerate(final_docs): - doc.meta["split_id"] = idx + # assign split_id only if secondary splitting is not applied + for idx, doc in enumerate(final_docs): + if doc.meta is None: + doc.meta = {} + doc.meta["split_id"] = idx return {"documents": final_docs} From c48bdcf7a1162b372ac62c1745e760c88cc03c1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Wed, 24 Sep 2025 10:31:51 +0200 Subject: [PATCH 30/85] unified page-counting --- .../preprocessors/markdown_header_splitter.py | 55 ++++++++++++------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 
07429f2202..070df0cffb 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -236,19 +236,15 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document secondary_splits = self.secondary_splitter.run( documents=[Document(content=content_for_splitting, meta=doc.meta)] )["documents"] - accumulated_page_breaks = 0 # track page breaks # split processing for i, split in enumerate(secondary_splits): # calculate page number for this split - if i > 0: # page break counting - prev_content = secondary_splits[i - 1].content - if prev_content is not None: - page_breaks = prev_content.count(self.page_break_character) - accumulated_page_breaks += page_breaks + if i > 0 and secondary_splits[i - 1].content: + _, current_page = self._count_page_breaks_and_update(secondary_splits[i - 1].content, current_page) # set page number to meta - split.meta["page_number"] = current_page + accumulated_page_breaks + split.meta["page_number"] = current_page # preserve header metadata for key in ["header", "parentheaders"]: @@ -278,18 +274,29 @@ def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = target_dict[new_key] = value return target_dict - def _count_page_breaks(self, split_content: str, split_index: int) -> int: - """Count page breaks in the split content and log if any are found.""" - if not isinstance(split_content, str): - return 0 - page_breaks = split_content.count(self.page_break_character) + def _count_page_breaks_and_update(self, content: str, current_page: int) -> tuple[int, int]: + """ + Count page breaks in content and return updated page count. 
+ + :param content: Content to check for page breaks + :param current_page: Current page number + :return: Tuple of (page_breaks_count, new_current_page) + """ + if not isinstance(content, str): + return 0, current_page + + page_breaks = content.count(self.page_break_character) + new_page_number = current_page + page_breaks + if page_breaks > 0: logger.debug( - "Found {page_breaks} page breaks in split {split_index}", + "Found {page_breaks} page breaks, page number updated: {old} → {new}", page_breaks=page_breaks, - split_index=split_index, + old=current_page, + new=new_page_number, ) - return page_breaks + + return page_breaks, new_page_number def _split_documents_by_markdown_headers(self, documents: list[Document]) -> list[Document]: """Split a list of documents by markdown headers, preserving metadata.""" @@ -300,9 +307,8 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis continue splits = self._split_text_by_markdown_headers(doc.content) docs = [] - total_pages = doc.meta.get("total_pages", 0) if doc.meta else 0 - if not total_pages: - total_pages = doc.content.count(self.page_break_character) + 1 + total_pages = self._calculate_total_pages(doc.content, doc.meta.get("total_pages", 0) if doc.meta else 0) + current_page = doc.meta.get("page_number", 1) if doc.meta else 1 logger.debug( "Starting page number: {current_page}, Total pages: {total_pages}", @@ -314,8 +320,7 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis if doc.meta: meta = self._flatten_dict(doc.meta) meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) - page_breaks = self._count_page_breaks(split["content"], i) - current_page += page_breaks + _, current_page = self._count_page_breaks_and_update(split["content"], current_page) if split.get("meta"): meta.update(self._flatten_dict(split.get("meta") or {})) docs.append(Document(content=split["content"], meta=meta)) @@ -328,6 +333,16 @@ def 
_split_documents_by_markdown_headers(self, documents: list[Document]) -> lis result_docs.extend(docs) return result_docs + def _calculate_total_pages(self, content: str, existing_total: int = 0) -> int: + """Calculate total pages based on content and existing metadata.""" + if existing_total > 0: + return existing_total + + if not isinstance(content, str): + return 1 + + return content.count(self.page_break_character) + 1 + @component.output_types(documents=list[Document]) def run(self, documents: list[Document], infer_header_levels: Optional[bool] = None) -> dict[str, list[Document]]: """ From decaadffdff553b5fa6634c7c675224e843a568c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Wed, 24 Sep 2025 10:38:12 +0200 Subject: [PATCH 31/85] simplify conditional secondary-split initialization and usage --- .../preprocessors/markdown_header_splitter.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 070df0cffb..95bcc13f95 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -55,7 +55,6 @@ def __init__( self.split_threshold = split_threshold # initialize secondary_splitter only if needed - self.secondary_splitter: Optional[DocumentSplitter] if self.secondary_split != "none": self.secondary_splitter = DocumentSplitter( split_by=self.secondary_split, @@ -63,8 +62,6 @@ def __init__( split_overlap=self.split_overlap, split_threshold=self.split_threshold, ) - else: - self.secondary_splitter = None def _infer_header_levels(self, text: str) -> str: """ @@ -216,6 +213,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document if doc.content is None: result_docs.append(doc) continue + # extract header information header_match = re.search(r"(#{1,6}) (.+)(?:\n|$)", 
doc.content) content_for_splitting: str = doc.content @@ -229,10 +227,6 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # track page from meta current_page = doc.meta.get("page_number", 1) - # use the pre-initialized secondary splitter - if self.secondary_splitter is None: - result_docs.append(doc) - continue secondary_splits = self.secondary_splitter.run( documents=[Document(content=content_for_splitting, meta=doc.meta)] )["documents"] @@ -382,11 +376,13 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N header_split_docs = self._split_documents_by_markdown_headers(processed_documents) logger.info("Header splitting produced {num_docs} documents", num_docs=len(header_split_docs)) - if self.secondary_split != "none": - final_docs = self._apply_secondary_splitting(header_split_docs) - else: - final_docs = header_split_docs - # assign split_id only if secondary splitting is not applied + # secondary splitting if configured + final_docs = ( + self._apply_secondary_splitting(header_split_docs) if self.secondary_split != "none" else header_split_docs + ) + + # assign split_id if not already done in secondary splitting + if self.secondary_split == "none": for idx, doc in enumerate(final_docs): if doc.meta is None: doc.meta = {} From 3ef71c4fb376f287764c97bec5640e2684440831 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Wed, 24 Sep 2025 13:54:12 +0200 Subject: [PATCH 32/85] fix linting error --- haystack/components/preprocessors/markdown_header_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 95bcc13f95..62b4520f0b 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -309,7 +309,7 @@ def _split_documents_by_markdown_headers(self, 
documents: list[Document]) -> lis current_page=current_page, total_pages=total_pages, ) - for i, split in enumerate(splits): + for split in splits: meta = {} if doc.meta: meta = self._flatten_dict(doc.meta) From 0fbea3a220e89987e083ba0b76d1abcadfa2b48e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Wed, 24 Sep 2025 16:39:25 +0200 Subject: [PATCH 33/85] clearly specify the use of ATX-style headers (#) only --- haystack/components/preprocessors/markdown_header_splitter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 62b4520f0b..8fcb5c7b3a 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -14,7 +14,7 @@ @component class MarkdownHeaderSplitter: """ - Split documents at Markdown headers, with optional secondary splitting and header level inference. + Split documents at ATX-style Markdown headers (#), with optional secondary splitting and header level inference. This component processes text documents by: - Splitting them into chunks at Markdown headers (e.g., '#', '##', etc.), preserving header hierarchy as metadata. 
@@ -137,7 +137,7 @@ def _infer_header_levels(self, text: str) -> str: return modified_text def _split_text_by_markdown_headers(self, text: str) -> list[dict]: - """Split text by markdown headers and create chunks with appropriate metadata.""" + """Split text by ATX-style headers (#) and create chunks with appropriate metadata.""" logger.debug("Splitting text by markdown headers") # find headers From 38119a6a087d7ae01e33965d118ffb77dc49ecb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Wed, 24 Sep 2025 16:47:37 +0200 Subject: [PATCH 34/85] reference doc_id when logging no headers found --- .../preprocessors/markdown_header_splitter.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 8fcb5c7b3a..077b7d981e 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -63,7 +63,7 @@ def __init__( split_threshold=self.split_threshold, ) - def _infer_header_levels(self, text: str) -> str: + def _infer_header_levels(self, text: str, doc_id: Optional[str] = None) -> str: """ Infer and rewrite header levels in the markdown text. @@ -75,6 +75,9 @@ def _infer_header_levels(self, text: str) -> str: This is useful for documents where all headers are at the same level, such as output from document conversion tools like docling. 
+ + :param text: The text to process + :param doc_id: Optional document ID for logging context """ logger.debug("Inferring and rewriting header levels") @@ -83,7 +86,10 @@ def _infer_header_levels(self, text: str) -> str: matches = list(re.finditer(pattern, text)) if not matches: - logger.info("No headers found in document; skipping header level inference.") + logger.info( + "No headers found in document{doc_ref}; skipping header level inference.", + doc_ref=f" (id: {doc_id})" if doc_id else "", + ) return text modified_text = text @@ -365,7 +371,7 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N if not doc.content or not doc.content.strip(): continue if infer_header_levels: - content = self._infer_header_levels(doc.content) + content = self._infer_header_levels(doc.content, doc_id=doc.id) processed_documents.append(Document(content=content, meta=doc.meta, id=doc.id)) else: processed_documents.append(doc) From e12e7f75e28ef7c1c88150b3cf87e12a175c0f2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Wed, 24 Sep 2025 16:50:42 +0200 Subject: [PATCH 35/85] initialize md-header pattern as private variable once --- .../components/preprocessors/markdown_header_splitter.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 077b7d981e..44dc40beeb 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -53,6 +53,7 @@ def __init__( self.split_length = split_length self.split_overlap = split_overlap self.split_threshold = split_threshold + self._header_pattern = r"(?m)^(#{1,6}) (.+)$" # ATX-style .md-headers # initialize secondary_splitter only if needed if self.secondary_split != "none": @@ -82,8 +83,7 @@ def _infer_header_levels(self, text: str, doc_id: Optional[str] = None) 
-> str: logger.debug("Inferring and rewriting header levels") # find headers - pattern = r"(?m)^(#{1,6}) (.+)$" - matches = list(re.finditer(pattern, text)) + matches = list(re.finditer(self._header_pattern, text)) if not matches: logger.info( @@ -147,8 +147,7 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: logger.debug("Splitting text by markdown headers") # find headers - pattern = r"(?m)^(#{1,6}) (.+)$" - matches = list(re.finditer(pattern, text)) + matches = list(re.finditer(self._header_pattern, text)) # return unsplit if no headers found if not matches: @@ -221,7 +220,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document continue # extract header information - header_match = re.search(r"(#{1,6}) (.+)(?:\n|$)", doc.content) + header_match = re.search(self._header_pattern, doc.content) content_for_splitting: str = doc.content if header_match: content_for_splitting = doc.content[header_match.end() :] From f31528e83242f7c70f97a06151104001e053852a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 25 Sep 2025 10:35:41 +0200 Subject: [PATCH 36/85] add example to for inferring header levels to docstring --- .../preprocessors/markdown_header_splitter.py | 33 +++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 44dc40beeb..923beba75a 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -38,7 +38,20 @@ def __init__( Initialize the MarkdownHeaderSplitter. :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure. - Useful for documents where all headers use the same level. Defaults to False. 
+ Useful for documents where all headers use the same level (e.g., all "##", as with PDFs parsed by Docling). + For example, a document like: + "## Title + ## Introduction + Introductory text + ## Methods + Method details" + Would be normalized to: + "# Title + ## Introduction + Introductory text + ## Methods + Method details" + This attempts to maintain proper hierarchical structure. Defaults to False. :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\f"). :param secondary_split: Optional secondary split condition after header splitting. Options are "none", "word", "passage", "period", "line". Defaults to "none". @@ -348,8 +361,22 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N Run the markdown header splitter with optional secondary splitting. :param documents: List of documents to split - :param infer_header_levels: If True, attempts to infer and rewrite header levels before splitting. - If None, uses the value from initialization. + :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure. + Useful for documents where all headers use the same level (e.g., all "##", as with PDFs parsed by Docling). + For example, a document like: + "## Title + ## Introduction + Introductory text + ## Methods + Method details" + Would be normalized to: + "# Title + ## Introduction + Introductory text + ## Methods + Method details" + This attempts to maintain proper hierarchical structure. Defaults to False. + If None, uses the instance's initialized infer_header_levels setting. :returns: A dictionary with the following key: - `documents`: List of documents with the split texts. 
Each document includes: From cee156c216164c3dfa54d6b60a4bd4519166d50b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 25 Sep 2025 10:48:49 +0200 Subject: [PATCH 37/85] improve empty document handling add more logging for empty documents --- .../preprocessors/markdown_header_splitter.py | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 923beba75a..0f248ced89 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -33,6 +33,7 @@ def __init__( split_length: int = 200, split_overlap: int = 0, split_threshold: int = 0, + skip_empty_documents: bool = True, ): """ Initialize the MarkdownHeaderSplitter. @@ -59,6 +60,8 @@ def __init__( :param split_overlap: The number of overlapping units for each split when using secondary splitting. Defaults to 0. :param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0. + :param skip_empty_documents: If True, skip documents with empty content. If False, process empty documents. + Defaults to True. 
""" self.infer_header_levels = infer_header_levels self.page_break_character = page_break_character @@ -66,6 +69,7 @@ def __init__( self.split_length = split_length self.split_overlap = split_overlap self.split_threshold = split_threshold + self.skip_empty_documents = skip_empty_documents self._header_pattern = r"(?m)^(#{1,6}) (.+)$" # ATX-style .md-headers # initialize secondary_splitter only if needed @@ -386,6 +390,13 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N """ # validate input documents for doc in documents: + if doc.content is None: + raise ValueError( + ( + "MarkdownHeaderSplitter only works with text documents but content for document ID" + f" {doc.id} is None." + ) + ) if not isinstance(doc.content, str): raise ValueError("MarkdownHeaderSplitter only works with text documents (str content).") @@ -393,9 +404,20 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N processed_documents = [] for doc in documents: - # skip empty documents + # handle empty documents if not doc.content or not doc.content.strip(): - continue + if self.skip_empty_documents: + logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id) + continue + else: + # keep empty documents + processed_documents.append(doc) + logger.warning( + "Document ID {doc_id} has an empty content. 
Keeping this document as per configuration.", + doc_id=doc.id, + ) + continue + if infer_header_levels: content = self._infer_header_levels(doc.content, doc_id=doc.id) processed_documents.append(Document(content=content, meta=doc.meta, id=doc.id)) From c63035f5999c64b43b9a436364f4816a79232732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 25 Sep 2025 11:36:31 +0200 Subject: [PATCH 38/85] more explicit testing for inferred headers --- .../test_markdown_header_splitter.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 86b27a1c24..7db00dc927 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -104,8 +104,21 @@ def test_split_infer_header_levels(): docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - # Should rewrite headers to # and ## - assert split_docs[0].content.startswith("## H2") or split_docs[0].content.startswith("# H1") + + # Should have exactly one document + assert len(split_docs) == 1 + + # Extract header information from metadata instead of content + h1_doc = next((doc for doc in split_docs if doc.meta["header"] == "H1"), None) + h2_doc = next((doc for doc in split_docs if doc.meta["header"] == "H2"), None) + + # Check proper doc creation + assert h1_doc is None + assert h2_doc is not None + + # Check that headers are properly leveled (looking at content) + assert "H1" in h2_doc.meta["parentheaders"] + assert "## H2" in h2_doc.content def test_infer_header_levels_complex(): From cf1b82071fde5f42440271d12e2adfb4ed44b313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 25 Sep 2025 11:41:17 +0200 Subject: [PATCH 39/85] fix linting issue --- 
.../preprocessors/markdown_header_splitter.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 0f248ced89..6d059889df 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -409,14 +409,13 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N if self.skip_empty_documents: logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id) continue - else: - # keep empty documents - processed_documents.append(doc) - logger.warning( - "Document ID {doc_id} has an empty content. Keeping this document as per configuration.", - doc_id=doc.id, - ) - continue + # keep empty documents + processed_documents.append(doc) + logger.warning( + "Document ID {doc_id} has an empty content. Keeping this document as per configuration.", + doc_id=doc.id, + ) + continue if infer_header_levels: content = self._infer_header_levels(doc.content, doc_id=doc.id) From 22369b6dad89c73719e57d0934a62611d8995375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 26 Sep 2025 17:15:13 +0200 Subject: [PATCH 40/85] improved empty content handling test cases --- .../preprocessors/test_markdown_header_splitter.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 7db00dc927..40380d9db7 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -221,13 +221,16 @@ def test_invalid_split_parameters_at_init(): def test_empty_content_handling(): """Test handling of documents with empty content.""" - splitter = 
MarkdownHeaderSplitter() + splitter_skip = MarkdownHeaderSplitter() # skip empty documents by default docs = [Document(content="")] - result = splitter.run(documents=docs) - - # DocumentSplitter skips empty documents by default + result = splitter_skip.run(documents=docs) assert len(result["documents"]) == 0 + splitter_no_skip = MarkdownHeaderSplitter(skip_empty_documents=False) + docs = [Document(content="")] + result = splitter_no_skip.run(documents=docs) + assert len(result["documents"]) == 1 + # Output format and split ID checks def test_document_splitting_format(): From 316ebec2ad6031df8eef4aeda9158d46f92547d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 29 Sep 2025 15:59:43 +0200 Subject: [PATCH 41/85] remove all functionality related to inferring md-header levels --- .../preprocessors/markdown_header_splitter.py | 134 ++---------------- .../test_markdown_header_splitter.py | 68 --------- 2 files changed, 8 insertions(+), 194 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 6d059889df..245b51d8d1 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -14,20 +14,18 @@ @component class MarkdownHeaderSplitter: """ - Split documents at ATX-style Markdown headers (#), with optional secondary splitting and header level inference. + Split documents at ATX-style Markdown headers (#), with optional secondary splitting. This component processes text documents by: - Splitting them into chunks at Markdown headers (e.g., '#', '##', etc.), preserving header hierarchy as metadata. - - Optionally inferring and rewriting header levels for documents where header structure is ambiguous. - - Optionally applying a secondary split (by word, passage, period, or line) to each chunk. - This is done in haystack's DocumentSplitter. 
+ - Optionally applying a secondary split (by word, passage, period, or line) to each chunk + (using haystack's DocumentSplitter). - Preserving and propagating metadata such as parent headers, page numbers, and split IDs. """ def __init__( self, *, - infer_header_levels: bool = False, page_break_character: str = "\f", secondary_split: Literal["none", "word", "passage", "period", "line"] = "none", split_length: int = 200, @@ -38,21 +36,6 @@ def __init__( """ Initialize the MarkdownHeaderSplitter. - :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure. - Useful for documents where all headers use the same level (e.g., all "##", as with PDFs parsed by Docling). - For example, a document like: - "## Title - ## Introduction - Introductory text - ## Methods - Method details" - Would be normalized to: - "# Title - ## Introduction - Introductory text - ## Methods - Method details" - This attempts to maintain proper hierarchical structure. Defaults to False. :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\f"). :param secondary_split: Optional secondary split condition after header splitting. Options are "none", "word", "passage", "period", "line". Defaults to "none". @@ -63,7 +46,6 @@ def __init__( :param skip_empty_documents: If True, skip documents with empty content. If False, process empty documents. Defaults to True. """ - self.infer_header_levels = infer_header_levels self.page_break_character = page_break_character self.secondary_split = secondary_split self.split_length = split_length @@ -81,84 +63,6 @@ def __init__( split_threshold=self.split_threshold, ) - def _infer_header_levels(self, text: str, doc_id: Optional[str] = None) -> str: - """ - Infer and rewrite header levels in the markdown text. 
- - This function analyzes the document structure to infer proper header levels: - - First header is always level 1 - - If there's content between headers, the next header stays at the same level - - If there's no content between headers, the next header goes one level deeper - - Header levels never exceed 6 (the maximum in markdown) - - This is useful for documents where all headers are at the same level, such as - output from document conversion tools like docling. - - :param text: The text to process - :param doc_id: Optional document ID for logging context - """ - logger.debug("Inferring and rewriting header levels") - - # find headers - matches = list(re.finditer(self._header_pattern, text)) - - if not matches: - logger.info( - "No headers found in document{doc_ref}; skipping header level inference.", - doc_ref=f" (id: {doc_id})" if doc_id else "", - ) - return text - - modified_text = text - offset = 0 # track offset due to length changes in headers - - # track header structure - current_level = 1 - header_stack = [1] # always start with level 1 - - for i, match in enumerate(matches): - original_header = match.group(0) - header_text = match.group(2).strip() - - # check if there's content between this header and the previous one - has_content = False - if i > 0: - prev_end = matches[i - 1].end() - current_start = match.start() - content_between = text[prev_end:current_start].strip() - has_content = bool(content_between) - - # first header is always level 1 - if i == 0: - inferred_level = 1 - elif has_content: - # stay at the same level if there's content - inferred_level = current_level - else: - # go one level deeper if there's no content - inferred_level = min(current_level + 1, 6) - - # update tracking variables - current_level = inferred_level - header_stack = header_stack[:inferred_level] - while len(header_stack) < inferred_level: - header_stack.append(1) - - # new header with inferred level - new_prefix = "#" * inferred_level - new_header = 
f"{new_prefix} {header_text}" - - # replace old header - start_pos = match.start() + offset - end_pos = match.end() + offset - modified_text = modified_text[:start_pos] + new_header + modified_text[end_pos:] - - # update offset - offset += len(new_header) - len(original_header) - - logger.info("Rewrote {num_headers} headers with inferred levels.", num_headers=len(matches)) - return modified_text - def _split_text_by_markdown_headers(self, text: str) -> list[dict]: """Split text by ATX-style headers (#) and create chunks with appropriate metadata.""" logger.debug("Splitting text by markdown headers") @@ -360,33 +264,17 @@ def _calculate_total_pages(self, content: str, existing_total: int = 0) -> int: return content.count(self.page_break_character) + 1 @component.output_types(documents=list[Document]) - def run(self, documents: list[Document], infer_header_levels: Optional[bool] = None) -> dict[str, list[Document]]: + def run(self, documents: list[Document]) -> dict[str, list[Document]]: """ Run the markdown header splitter with optional secondary splitting. :param documents: List of documents to split - :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure. - Useful for documents where all headers use the same level (e.g., all "##", as with PDFs parsed by Docling). - For example, a document like: - "## Title - ## Introduction - Introductory text - ## Methods - Method details" - Would be normalized to: - "# Title - ## Introduction - Introductory text - ## Methods - Method details" - This attempts to maintain proper hierarchical structure. Defaults to False. - If None, uses the instance's initialized infer_header_levels setting. :returns: A dictionary with the following key: - `documents`: List of documents with the split texts. Each document includes: - - A metadata field `source_id` to track the original document. - - A metadata field `page_number` to track the original page number. 
- - All other metadata copied from the original document. + - A metadata field `source_id` to track the original document. + - A metadata field `page_number` to track the original page number. + - All other metadata copied from the original document. """ # validate input documents for doc in documents: @@ -400,8 +288,6 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N if not isinstance(doc.content, str): raise ValueError("MarkdownHeaderSplitter only works with text documents (str content).") - infer_header_levels = infer_header_levels if infer_header_levels is not None else self.infer_header_levels - processed_documents = [] for doc in documents: # handle empty documents @@ -417,11 +303,7 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N ) continue - if infer_header_levels: - content = self._infer_header_levels(doc.content, doc_id=doc.id) - processed_documents.append(Document(content=content, meta=doc.meta, id=doc.id)) - else: - processed_documents.append(doc) + processed_documents.append(doc) if not processed_documents: return {"documents": []} diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 40380d9db7..19bf520626 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -97,74 +97,6 @@ def test_split_only_headers(): assert len(split_docs) == 0 -# Header inference and overrides -def test_split_infer_header_levels(): - text = "## H1\n## H2\nContent" - splitter = MarkdownHeaderSplitter(infer_header_levels=True) - docs = [Document(content=text)] - result = splitter.run(documents=docs) - split_docs = result["documents"] - - # Should have exactly one document - assert len(split_docs) == 1 - - # Extract header information from metadata instead of content - h1_doc = next((doc for doc in split_docs if doc.meta["header"] 
== "H1"), None) - h2_doc = next((doc for doc in split_docs if doc.meta["header"] == "H2"), None) - - # Check proper doc creation - assert h1_doc is None - assert h2_doc is not None - - # Check that headers are properly leveled (looking at content) - assert "H1" in h2_doc.meta["parentheaders"] - assert "## H2" in h2_doc.content - - -def test_infer_header_levels_complex(): - """Test header level inference with a complex document structure.""" - text = ( - "## All Headers Same Level\n" - "Some content\n" - "## Second Header\n" - "Some content\n" # Added content to ensure headers are processed correctly - "## Third Header With No Content\n" - "## Fourth Header With No Content\n" - "## Fifth Header\n" - "More content" - ) - - splitter = MarkdownHeaderSplitter(infer_header_levels=True) - docs = [Document(content=text)] - result = splitter.run(documents=docs) - split_docs = result["documents"] - - # Get docs by header content to avoid position assumptions - first_doc = next((doc for doc in split_docs if "All Headers Same Level" in doc.content), None) - second_doc = next((doc for doc in split_docs if "Second Header" in doc.content), None) - - # First header should be level 1 - assert first_doc and "# All Headers Same Level" in first_doc.content - - # Second header with content should stay at level 1 - assert second_doc and "# Second Header" in second_doc.content - - -def test_infer_header_levels_override_both_directions(): - text = "## H1\n## H2\nContent" - docs = [Document(content=text)] - - # False at init, True at run - splitter = MarkdownHeaderSplitter(infer_header_levels=False) - result = splitter.run(documents=docs, infer_header_levels=True) - assert "# " in result["documents"][0].content - - # True at init, False at run - splitter = MarkdownHeaderSplitter(infer_header_levels=True) - result = splitter.run(documents=docs, infer_header_levels=False) - assert all("## " in doc.content for doc in result["documents"]) - - # Metadata preservation def 
test_preserve_document_metadata(): """Test that document metadata is preserved through splitting.""" From d5e462c98b64020819b44025cf17c319dec9cbdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 30 Sep 2025 12:18:46 +0200 Subject: [PATCH 42/85] compile regex-pattern in init for performance gains --- haystack/components/preprocessors/markdown_header_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 245b51d8d1..e31dd55ac6 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -52,7 +52,7 @@ def __init__( self.split_overlap = split_overlap self.split_threshold = split_threshold self.skip_empty_documents = skip_empty_documents - self._header_pattern = r"(?m)^(#{1,6}) (.+)$" # ATX-style .md-headers + self._header_pattern = re.compile(r"(?m)^(#{1,6}) (.+)$") # ATX-style .md-headers # initialize secondary_splitter only if needed if self.secondary_split != "none": From 4089ddc5cbd5fe688026e2d035c9e321703deb2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= <45487933+OGuggenbuehl@users.noreply.github.com> Date: Mon, 13 Oct 2025 15:27:03 +0200 Subject: [PATCH 43/85] Update haystack/components/preprocessors/markdown_header_splitter.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- haystack/components/preprocessors/markdown_header_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index e31dd55ac6..45174f353a 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -27,7 +27,7 @@ def __init__( self, *, 
page_break_character: str = "\f", - secondary_split: Literal["none", "word", "passage", "period", "line"] = "none", + secondary_split: Optional[Literal["word", "passage", "period", "line"]] = None, split_length: int = 200, split_overlap: int = 0, split_threshold: int = 0, From 20d172ef1a39d9c41880a51e26b1de756ee25e0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 13 Oct 2025 16:09:12 +0200 Subject: [PATCH 44/85] change all "none" to proper None values --- .../preprocessors/markdown_header_splitter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 45174f353a..4167751324 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -38,7 +38,7 @@ def __init__( :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\f"). :param secondary_split: Optional secondary split condition after header splitting. - Options are "none", "word", "passage", "period", "line". Defaults to "none". + Options are None, "word", "passage", "period", "line". Defaults to None. :param split_length: The maximum number of units in each split when using secondary splitting. Defaults to 200. :param split_overlap: The number of overlapping units for each split when using secondary splitting. Defaults to 0. @@ -55,7 +55,7 @@ def __init__( self._header_pattern = re.compile(r"(?m)^(#{1,6}) (.+)$") # ATX-style .md-headers # initialize secondary_splitter only if needed - if self.secondary_split != "none": + if self.secondary_split: self.secondary_splitter = DocumentSplitter( split_by=self.secondary_split, split_length=self.split_length, @@ -129,7 +129,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document Ensures page counting is maintained across splits. 
""" - if self.secondary_split == "none": + if not self.secondary_split: return documents logger.info("Applying secondary splitting by {secondary_split}", secondary_split=self.secondary_split) @@ -313,11 +313,11 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: # secondary splitting if configured final_docs = ( - self._apply_secondary_splitting(header_split_docs) if self.secondary_split != "none" else header_split_docs + self._apply_secondary_splitting(header_split_docs) if not self.secondary_split else header_split_docs ) # assign split_id if not already done in secondary splitting - if self.secondary_split == "none": + if not self.secondary_split: for idx, doc in enumerate(final_docs): if doc.meta is None: doc.meta = {} From a7c6725a53447310e6feef1b09fd2a473729dfaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 13 Oct 2025 16:14:50 +0200 Subject: [PATCH 45/85] fix minor --- haystack/components/preprocessors/markdown_header_splitter.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 4167751324..a00e7ff9fc 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -312,9 +312,7 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: logger.info("Header splitting produced {num_docs} documents", num_docs=len(header_split_docs)) # secondary splitting if configured - final_docs = ( - self._apply_secondary_splitting(header_split_docs) if not self.secondary_split else header_split_docs - ) + final_docs = self._apply_secondary_splitting(header_split_docs) if self.secondary_split else header_split_docs # assign split_id if not already done in secondary splitting if not self.secondary_split: From c9c44eee30bc96e09954bcc95fd6dbbe776eeada Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 13 Oct 2025 16:48:25 +0200 Subject: [PATCH 46/85] explicitly test doc content --- .../test_markdown_header_splitter.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 19bf520626..998fbeebde 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -43,8 +43,23 @@ def test_basic_split(sample_text): assert "Subheader 1.2.3" in headers # Check that content is present and correct + header1_doc = next(doc for doc in split_docs if doc.meta["header"] == "Header 1") + assert "Content under header 1." in header1_doc.content + + subheader111_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.1.1") + assert "Content under sub-header 1.1.1" in subheader111_doc.content + + subheader121_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.1") + assert "Content under header 1.2.1." in subheader121_doc.content + + subheader122_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.2") + assert "Content under header 1.2.2." in subheader122_doc.content + + subheader123_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.3") + assert "Content under header 1.2.3." 
in subheader123_doc.content + + # Ensure all documents have a header in their metadata for doc in split_docs: - assert doc.content.startswith("#") or doc.content.startswith("##") or doc.content.startswith("###") assert doc.meta.get("header") is not None From 0e36419750d2930e8f1ea682afc1837dc759ba61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 13 Oct 2025 16:49:55 +0200 Subject: [PATCH 47/85] rename parentheaders to parent_headers --- .../components/preprocessors/markdown_header_splitter.py | 8 ++++---- .../preprocessors/test_markdown_header_splitter.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index a00e7ff9fc..3da29749a2 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -73,7 +73,7 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: # return unsplit if no headers found if not matches: logger.info("No headers found in document; returning full document as single chunk.") - return [{"content": text, "meta": {"header": None, "parentheaders": []}}] + return [{"content": text, "meta": {"header": None, "parent_headers": []}}] # process headers and build chunks chunks: list[dict] = [] @@ -104,7 +104,7 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: continue # get parent headers - parentheaders = list(active_parents) + parent_headers = list(active_parents) logger.debug( "Creating chunk for header '{header_text}' at level {level}", header_text=header_text, level=level @@ -113,7 +113,7 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: chunks.append( { "content": f"{header_prefix} {header_text}\n{content}", - "meta": {"header": header_text, "parentheaders": parentheaders}, + "meta": {"header": header_text, "parent_headers": 
parent_headers}, } ) @@ -167,7 +167,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document split.meta["page_number"] = current_page # preserve header metadata - for key in ["header", "parentheaders"]: + for key in ["header", "parent_headers"]: if key in doc.meta: split.meta[key] = doc.meta[key] diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 998fbeebde..d52b8cbf29 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -70,11 +70,11 @@ def test_split_parentheaders(sample_text): split_docs = result["documents"] # Check parentheaders for both a deep subheader and a simple one subheader_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.2") - assert "Header 1" in subheader_doc.meta["parentheaders"] - assert "Header 1.2" in subheader_doc.meta["parentheaders"] + assert "Header 1" in subheader_doc.meta["parent_headers"] + assert "Header 1.2" in subheader_doc.meta["parent_headers"] h3_doc = next((doc for doc in split_docs if doc.meta["header"] == "H3"), None) if h3_doc: - assert h3_doc.meta["parentheaders"] == ["H1", "H2"] + assert h3_doc.meta["parent_headers"] == ["H1", "H2"] def test_split_no_headers(): From edc60b5b948f03bd1783d001f8d03ec430a83a8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 13 Oct 2025 16:54:33 +0200 Subject: [PATCH 48/85] test split_id, doc length --- .../preprocessors/test_markdown_header_splitter.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index d52b8cbf29..ae3c5be8a2 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py 
@@ -73,8 +73,7 @@ def test_split_parentheaders(sample_text): assert "Header 1" in subheader_doc.meta["parent_headers"] assert "Header 1.2" in subheader_doc.meta["parent_headers"] h3_doc = next((doc for doc in split_docs if doc.meta["header"] == "H3"), None) - if h3_doc: - assert h3_doc.meta["parent_headers"] == ["H1", "H2"] + assert h3_doc.meta["parent_headers"] == ["H1", "H2"] def test_split_no_headers(): @@ -98,9 +97,17 @@ def test_split_multiple_documents(sample_text): ] result = splitter.run(documents=docs) split_docs = result["documents"] + + assert len(split_docs) == 8 + headers = {doc.meta["header"] for doc in split_docs} assert {"Another Header", "H1", "H2"}.issubset(headers) + # Verify that all documents have a split_id and they're sequential + split_ids = [doc.meta.get("split_id") for doc in split_docs] + assert all(split_id is not None for split_id in split_ids) + assert split_ids == list(range(len(split_ids))) + def test_split_only_headers(): text = "# H1\n# H2\n# H3" From 995c1219aacf371704532e397f33523486dcd895 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 13 Oct 2025 16:56:14 +0200 Subject: [PATCH 49/85] check meta content --- test/components/preprocessors/test_markdown_header_splitter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index ae3c5be8a2..ae3ed0eed4 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -135,7 +135,9 @@ def test_preserve_document_metadata(): # New metadata should be added assert "header" in split_docs[0].meta + assert split_docs[0].meta["header"] == "Header" assert "split_id" in split_docs[0].meta + assert split_docs[0].meta["split_id"] == 0 # Error and edge case handling From 223a676f2c932519ecbd58ea731ccbf482f9bc4e Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 13 Oct 2025 16:59:08 +0200 Subject: [PATCH 50/85] remove unneeded test --- .../preprocessors/test_markdown_header_splitter.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index ae3ed0eed4..9273a3f30f 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -188,19 +188,6 @@ def test_empty_content_handling(): assert len(result["documents"]) == 1 -# Output format and split ID checks -def test_document_splitting_format(): - """Test that the format of split documents is correct.""" - splitter = MarkdownHeaderSplitter() - docs = [Document(content="# Header\nContent")] - result = splitter.run(documents=docs) - - # Basic validation of the output structure - assert isinstance(result, dict) - assert "documents" in result - assert isinstance(result["documents"], list) - - def test_split_id_sequentiality_primary_and_secondary(): text = "# Header\n" + "Word " * 30 # Test primary splitting From babc7d98b8520e519697a69a52ba1d4db2b9649d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Mon, 13 Oct 2025 17:06:25 +0200 Subject: [PATCH 51/85] make split_id testing more robust --- .../test_markdown_header_splitter.py | 39 ++++++++++++------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 9273a3f30f..bd0fb5fa2d 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -188,30 +188,41 @@ def test_empty_content_handling(): assert len(result["documents"]) == 1 -def test_split_id_sequentiality_primary_and_secondary(): - text = "# Header\n" + "Word " 
* 30 +def test_split_id_sequentiality_primary_and_secondary(sample_text): # Test primary splitting splitter = MarkdownHeaderSplitter() - docs = [Document(content=text)] + docs = [Document(content=sample_text)] result = splitter.run(documents=docs) - split_ids = [doc.meta["split_id"] for doc in result["documents"]] + split_docs = result["documents"] + + # Test number of documents + assert len(split_docs) == 5 + + # Check that split_ids are sequential + split_ids = [doc.meta["split_id"] for doc in split_docs] assert split_ids == list(range(len(split_ids))) # Test secondary splitting - splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=5) - result = splitter.run(documents=docs) - split_ids = [doc.meta["split_id"] for doc in result["documents"]] - assert split_ids == list(range(len(split_ids))) - docs = [Document(content=text)] + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3) + docs = [Document(content=sample_text)] result = splitter.run(documents=docs) - split_ids = [doc.meta["split_id"] for doc in result["documents"]] + split_docs = result["documents"] + + # Test number of documents + assert len(split_docs) == 10 + + split_ids = [doc.meta["split_id"] for doc in split_docs] assert split_ids == list(range(len(split_ids))) - # Test secondary splitting - splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=5) + # Test with multiple input documents + docs = [Document(content=sample_text), Document(content="# Another Header\nSome more content here.")] result = splitter.run(documents=docs) - split_ids = [doc.meta["split_id"] for doc in result["documents"]] - assert split_ids == list(range(len(split_ids))) + split_docs = result["documents"] + + # Test number of documents + assert len(split_docs) == 12 + + split_ids = [doc.meta["split_id"] for doc in split_docs] assert split_ids == list(range(len(split_ids))) From e488edc65146a1a4b6c955c1e9ae17ecd63aaae8 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 14 Oct 2025 14:37:23 +0200 Subject: [PATCH 52/85] more realistic overlap test sample --- .../test_markdown_header_splitter.py | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index bd0fb5fa2d..2018ee1c1d 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -227,18 +227,29 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text): def test_secondary_split_with_overlap(): - text = "# Header\n" + "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10" + realistic_text = ( + "# Introduction\n" + "This is the introduction section with some words for testing overlap splitting. " + "It should be split into chunks with overlap.\n" + "## Details\n" + "Here are more details about the topic. " + "Splitting should work across multiple headers and content blocks.\n" + "### Subsection\n" + "This subsection contains additional information and should also be split with overlap." 
+ ) splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=4, split_overlap=2) - docs = [Document(content=text)] + docs = [Document(content=realistic_text)] result = splitter.run(documents=docs) split_docs = result["documents"] - # Overlap of 2, so each chunk after the first should share 2 words with previous - assert len(split_docs) > 1 + assert len(split_docs) == 21 + for i in range(1, len(split_docs)): - prev_words = split_docs[i - 1].content.split() - curr_words = split_docs[i].content.split() - # The overlap should be the last 2 words of previous == first 2 of current - assert prev_words[-2:] == curr_words[:2] + prev_doc = split_docs[i - 1] + curr_doc = split_docs[i] + if prev_doc.meta["header"] == curr_doc.meta["header"]: # only check overlap within same header + prev_words = prev_doc.content.split() + curr_words = curr_doc.content.split() + assert prev_words[-2:] == curr_words[:2] def test_secondary_split_with_threshold(): From c0efda3ec0d7c0b6f25575e1f9c5356bb9c8a79f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 14 Oct 2025 14:46:18 +0200 Subject: [PATCH 53/85] assign split_id globally to all output docs --- .../preprocessors/markdown_header_splitter.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 3da29749a2..af5c362309 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -173,12 +173,6 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document result_docs.append(split) - # assign unique, sequential split_id to all final chunks - for idx, doc in enumerate(result_docs): - if doc.meta is None: - doc.meta = {} - doc.meta["split_id"] = idx - logger.info("Secondary splitting complete. 
Final count: {final_count} documents.", final_count=len(result_docs)) return result_docs @@ -314,11 +308,10 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: # secondary splitting if configured final_docs = self._apply_secondary_splitting(header_split_docs) if self.secondary_split else header_split_docs - # assign split_id if not already done in secondary splitting - if not self.secondary_split: - for idx, doc in enumerate(final_docs): - if doc.meta is None: - doc.meta = {} - doc.meta["split_id"] = idx + # assign split_id to all output documents + for idx, doc in enumerate(final_docs): + if doc.meta is None: + doc.meta = {} + doc.meta["split_id"] = idx return {"documents": final_docs} From 893e3dec766c22111aad0a97f7c8726ed006a033 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 14 Oct 2025 14:56:14 +0200 Subject: [PATCH 54/85] test page numbers explicitly --- .../preprocessors/test_markdown_header_splitter.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 2018ee1c1d..226c28c8e3 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -290,10 +290,7 @@ def test_page_break_handling_with_multiple_headers(): # Collect page numbers for each header header1_pages = [doc.meta.get("page_number") for doc in split_docs if doc.meta.get("header") == "Header 1"] header2_pages = [doc.meta.get("page_number") for doc in split_docs if doc.meta.get("header") == "Header 2"] - # Both headers should have splits with page_number 1 and 2 for Header 1, and 1 and 2 for Header 2 - # (relative to their own chunk) assert min(header1_pages) == 1 - assert max(header1_pages) >= 2 - # header2_pages may start at 2 if the previous header's last chunk ended with a page break - assert
min(header2_pages) >= 1 - assert max(header2_pages) >= 2 + assert max(header1_pages) == 2 + assert min(header2_pages) == 2 + assert max(header2_pages) == 3 From 9abf10b17bd98ddbe27283b36f0570c97ac3a00d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 14 Oct 2025 14:58:56 +0200 Subject: [PATCH 55/85] cleanup pagebreak test --- .../preprocessors/test_markdown_header_splitter.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 226c28c8e3..1ca748b585 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -267,18 +267,14 @@ def test_secondary_split_with_threshold(): def test_page_break_handling_in_secondary_split(): text = "# Header\nFirst page\fSecond page\fThird page" - splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=2) + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=1) docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - # The page_number should increment at each page break page_numbers = [doc.meta.get("page_number") for doc in split_docs] # Should start at 1 and increment at each \f assert page_numbers[0] == 1 - assert 2 in page_numbers - # Remove: assert 3 in page_numbers - # Instead, check that the max page number is 2 or 3, depending on split alignment - assert max(page_numbers) >= 2 + assert max(page_numbers) == 3 def test_page_break_handling_with_multiple_headers(): From 11da0a86a945bd6888fe13d804164df0fa9c1e50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 14 Oct 2025 15:00:27 +0200 Subject: [PATCH 56/85] minor --- test/components/preprocessors/test_markdown_header_splitter.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 1ca748b585..ca4360905e 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -258,7 +258,6 @@ def test_secondary_split_with_threshold(): docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - # The last chunk should have at least split_threshold words if possible for doc in split_docs[:-1]: assert len(doc.content.split()) == 3 # The last chunk should have at least 2 words (threshold) From 32d8c6862b3cb876ee28a019a1ff003ba61913b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 14 Oct 2025 15:11:53 +0200 Subject: [PATCH 57/85] return doc unchunked if no headers have content --- .../components/preprocessors/markdown_header_splitter.py | 7 +++++++ .../preprocessors/test_markdown_header_splitter.py | 5 +++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index af5c362309..59d8f35bc3 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -79,6 +79,7 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: chunks: list[dict] = [] header_stack: list[Optional[str]] = [None] * 6 active_parents: list[str] = [] + has_content = False # Flag to track if any header has content for i, match in enumerate(matches): # extract header info @@ -103,6 +104,7 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: active_parents.append(header_text) continue + has_content = True # At least one header has content # get parent headers parent_headers = list(active_parents) @@ -120,6 +122,11 @@ def 
_split_text_by_markdown_headers(self, text: str) -> list[dict]: # reset active parents active_parents = [h for h in header_stack[: level - 1] if h is not None] + # return doc unchunked if no headers have content + if not has_content: + logger.info("Document contains only headers with no content; returning original document.") + return [{"content": text, "meta": {}}] + logger.info("Split into {num_chunks} chunks by markdown headers.", num_chunks=len(chunks)) return chunks diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index ca4360905e..a37c2c9b0a 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -115,8 +115,9 @@ def test_split_only_headers(): docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - # Should not create chunks for headers with no content - assert len(split_docs) == 0 + # Return doc without content unchunked + assert len(split_docs) == 1 + assert split_docs[0].content == text # Metadata preservation From bcf56cac1fd6feaec525a7b499ddebdfb5dc827b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 11:35:15 +0200 Subject: [PATCH 58/85] add doc-id to logging statement for unsplit documents --- .../preprocessors/markdown_header_splitter.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 59d8f35bc3..31d2b686fe 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -63,7 +63,7 @@ def __init__( split_threshold=self.split_threshold, ) - def _split_text_by_markdown_headers(self, text: str) -> list[dict]: + def 
_split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: """Split text by ATX-style headers (#) and create chunks with appropriate metadata.""" logger.debug("Splitting text by markdown headers") @@ -72,7 +72,9 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: # return unsplit if no headers found if not matches: - logger.info("No headers found in document; returning full document as single chunk.") + logger.info( + "No headers found in document {doc_id}; returning full document as single chunk.", doc_id=doc_id + ) return [{"content": text, "meta": {"header": None, "parent_headers": []}}] # process headers and build chunks @@ -99,13 +101,12 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: # skip splits w/o content if not content: - # Add as parent for subsequent headers + # add as parent for subsequent headers active_parents = [h for h in header_stack[: level - 1] if h is not None] active_parents.append(header_text) continue - has_content = True # At least one header has content - # get parent headers + has_content = True # at least one header has content parent_headers = list(active_parents) logger.debug( @@ -124,7 +125,9 @@ def _split_text_by_markdown_headers(self, text: str) -> list[dict]: # return doc unchunked if no headers have content if not has_content: - logger.info("Document contains only headers with no content; returning original document.") + logger.info( + "Document {doc_id} contains only headers with no content; returning original document.", doc_id=doc_id + ) return [{"content": text, "meta": {}}] logger.info("Split into {num_chunks} chunks by markdown headers.", num_chunks=len(chunks)) @@ -226,7 +229,7 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis logger.debug("Splitting document with id={doc_id}", doc_id=doc.id) if doc.content is None: continue - splits = self._split_text_by_markdown_headers(doc.content) + splits = 
self._split_text_by_markdown_headers(doc.content, doc.id) docs = [] total_pages = self._calculate_total_pages(doc.content, doc.meta.get("total_pages", 0) if doc.meta else 0) From c5415ec2133a505c328e2ceb2e7a6c3f462ffe3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 11:42:58 +0200 Subject: [PATCH 59/85] remove unneeded logs --- .../components/preprocessors/markdown_header_splitter.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 31d2b686fe..f44b6d1977 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -130,7 +130,6 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: ) return [{"content": text, "meta": {}}] - logger.info("Split into {num_chunks} chunks by markdown headers.", num_chunks=len(chunks)) return chunks def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document]: @@ -142,7 +141,6 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document if not self.secondary_split: return documents - logger.info("Applying secondary splitting by {secondary_split}", secondary_split=self.secondary_split) result_docs = [] for doc in documents: @@ -183,7 +181,9 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document result_docs.append(split) - logger.info("Secondary splitting complete. Final count: {final_count} documents.", final_count=len(result_docs)) + logger.debug( + "Secondary splitting complete. 
Final count: {final_count} documents.", final_count=len(result_docs) + ) return result_docs def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = None) -> dict: @@ -313,7 +313,6 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: return {"documents": []} header_split_docs = self._split_documents_by_markdown_headers(processed_documents) - logger.info("Header splitting produced {num_docs} documents", num_docs=len(header_split_docs)) # secondary splitting if configured final_docs = self._apply_secondary_splitting(header_split_docs) if self.secondary_split else header_split_docs From dff06bc0dd93b34774810b2d63784b59a1005680 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 11:49:38 +0200 Subject: [PATCH 60/85] minor cleanup --- haystack/components/preprocessors/markdown_header_splitter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index f44b6d1977..6f2ce68947 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -278,6 +278,7 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: - `documents`: List of documents with the split texts. Each document includes: - A metadata field `source_id` to track the original document. - A metadata field `page_number` to track the original page number. + - A metadata field `split_id` to uniquely identify each split chunk. - All other metadata copied from the original document. 
""" # validate input documents @@ -319,8 +320,6 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: # assign split_id to all output documents for idx, doc in enumerate(final_docs): - if doc.meta is None: - doc.meta = {} doc.meta["split_id"] = idx return {"documents": final_docs} From a54d25a7ce67ace21a0473059493eaf12f4f40cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 11:56:21 +0200 Subject: [PATCH 61/85] simplify page-number tracking method to not return count, just the updated page number --- .../preprocessors/markdown_header_splitter.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 6f2ce68947..1222f87142 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -169,7 +169,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document for i, split in enumerate(secondary_splits): # calculate page number for this split if i > 0 and secondary_splits[i - 1].content: - _, current_page = self._count_page_breaks_and_update(secondary_splits[i - 1].content, current_page) + current_page = self._update_page_number_with_breaks(secondary_splits[i - 1].content, current_page) # set page number to meta split.meta["page_number"] = current_page @@ -198,16 +198,16 @@ def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = target_dict[new_key] = value return target_dict - def _count_page_breaks_and_update(self, content: str, current_page: int) -> tuple[int, int]: + def _update_page_number_with_breaks(self, content: str, current_page: int) -> int: """ - Count page breaks in content and return updated page count. + Update page number based on page breaks in content. 
:param content: Content to check for page breaks :param current_page: Current page number - :return: Tuple of (page_breaks_count, new_current_page) + :return: New current page number """ if not isinstance(content, str): - return 0, current_page + return current_page page_breaks = content.count(self.page_break_character) new_page_number = current_page + page_breaks @@ -220,7 +220,7 @@ def _count_page_breaks_and_update(self, content: str, current_page: int) -> tupl new=new_page_number, ) - return page_breaks, new_page_number + return new_page_number def _split_documents_by_markdown_headers(self, documents: list[Document]) -> list[Document]: """Split a list of documents by markdown headers, preserving metadata.""" @@ -244,7 +244,7 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis if doc.meta: meta = self._flatten_dict(doc.meta) meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) - _, current_page = self._count_page_breaks_and_update(split["content"], current_page) + current_page = self._update_page_number_with_breaks(split["content"], current_page) if split.get("meta"): meta.update(self._flatten_dict(split.get("meta") or {})) docs.append(Document(content=split["content"], meta=meta)) From a34c7a6fb135fa142e3396193432029c844aac20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 12:02:14 +0200 Subject: [PATCH 62/85] add dev comment to mypy check for doc.content is None --- haystack/components/preprocessors/markdown_header_splitter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 1222f87142..aaf825c5aa 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -227,6 +227,7 @@ def _split_documents_by_markdown_headers(self, documents: 
list[Document]) -> lis result_docs = [] for doc in documents: logger.debug("Splitting document with id={doc_id}", doc_id=doc.id) + # mypy: doc.content is Optional[str], so we must check for None before passing to splitting method if doc.content is None: continue splits = self._split_text_by_markdown_headers(doc.content, doc.id) From 7bc798e3cd8fe45cfd922e15d6015a4f6e536179 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= <45487933+OGuggenbuehl@users.noreply.github.com> Date: Thu, 16 Oct 2025 12:03:37 +0200 Subject: [PATCH 63/85] Update haystack/components/preprocessors/markdown_header_splitter.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- haystack/components/preprocessors/markdown_header_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index aaf825c5aa..187a01dd19 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -232,7 +232,7 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis continue splits = self._split_text_by_markdown_headers(doc.content, doc.id) docs = [] - total_pages = self._calculate_total_pages(doc.content, doc.meta.get("total_pages", 0) if doc.meta else 0) + total_pages = self._calculate_total_pages(doc.content, 0) current_page = doc.meta.get("page_number", 1) if doc.meta else 1 logger.debug( From a7eef6b7bc265d882a90af872d00ceed2ef1cc15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 12:23:05 +0200 Subject: [PATCH 64/85] remove split meta flattening --- .../components/preprocessors/markdown_header_splitter.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py 
b/haystack/components/preprocessors/markdown_header_splitter.py index 187a01dd19..3ea9ee4828 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -246,8 +246,6 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis meta = self._flatten_dict(doc.meta) meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) current_page = self._update_page_number_with_breaks(split["content"], current_page) - if split.get("meta"): - meta.update(self._flatten_dict(split.get("meta") or {})) docs.append(Document(content=split["content"], meta=meta)) logger.debug( "Split into {num_docs} documents for id={doc_id}, final page: {current_page}", @@ -263,9 +261,6 @@ def _calculate_total_pages(self, content: str, existing_total: int = 0) -> int: if existing_total > 0: return existing_total - if not isinstance(content, str): - return 1 - return content.count(self.page_break_character) + 1 @component.output_types(documents=list[Document]) From 5b5fc93330f9138bc8e5a77e8803108cea06438d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 12:25:20 +0200 Subject: [PATCH 65/85] keep empty meta return consistent --- haystack/components/preprocessors/markdown_header_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 3ea9ee4828..c43a2f1f95 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -75,7 +75,7 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: logger.info( "No headers found in document {doc_id}; returning full document as single chunk.", doc_id=doc_id ) - return [{"content": text, "meta": {"header": None, "parent_headers": 
[]}}] + return [{"content": text, "meta": {}}] # process headers and build chunks chunks: list[dict] = [] From 8ef5af032c57cf23cf54629444d230ed2e609b8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 12:26:38 +0200 Subject: [PATCH 66/85] remove unneeded content is none check --- haystack/components/preprocessors/markdown_header_splitter.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index c43a2f1f95..cc1ee0448e 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -138,9 +138,6 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document Ensures page counting is maintained across splits. """ - if not self.secondary_split: - return documents - result_docs = [] for doc in documents: From f1e3739f6b95b6c4550662a8f199ebecce8d89c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 12:40:31 +0200 Subject: [PATCH 67/85] update tests to reflect empty meta dict for unsplit docs --- haystack/components/preprocessors/markdown_header_splitter.py | 2 ++ .../components/preprocessors/test_markdown_header_splitter.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index cc1ee0448e..f4e51527eb 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -242,6 +242,8 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis if doc.meta: meta = self._flatten_dict(doc.meta) meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) + if split.get("meta"): + 
meta.update(split["meta"]) current_page = self._update_page_number_with_breaks(split["content"], current_page) docs.append(Document(content=split["content"], meta=meta)) logger.debug( diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index a37c2c9b0a..aa6a6e9235 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -81,10 +81,10 @@ def test_split_no_headers(): docs = [Document(content="No headers here."), Document(content="Just some text without headers.")] result = splitter.run(documents=docs) split_docs = result["documents"] - # Should return one doc per input, header is None + # Should return one doc per input, and no header key in meta assert len(split_docs) == 2 for doc in split_docs: - assert doc.meta["header"] is None + assert "header" not in doc.meta def test_split_multiple_documents(sample_text): From df7e775a9967b3417b0c9420be6e3418727670f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 13:42:20 +0200 Subject: [PATCH 68/85] clean up total_page counts --- .../preprocessors/markdown_header_splitter.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index f4e51527eb..130a1ce417 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -221,6 +221,7 @@ def _update_page_number_with_breaks(self, content: str, current_page: int) -> in def _split_documents_by_markdown_headers(self, documents: list[Document]) -> list[Document]: """Split a list of documents by markdown headers, preserving metadata.""" + result_docs = [] for doc in documents: logger.debug("Splitting document with id={doc_id}", 
doc_id=doc.id) @@ -229,11 +230,11 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis continue splits = self._split_text_by_markdown_headers(doc.content, doc.id) docs = [] - total_pages = self._calculate_total_pages(doc.content, 0) current_page = doc.meta.get("page_number", 1) if doc.meta else 1 + total_pages = doc.content.count(self.page_break_character) + 1 logger.debug( - "Starting page number: {current_page}, Total pages: {total_pages}", + "Processing page number: {current_page} out of {total_pages}", current_page=current_page, total_pages=total_pages, ) @@ -241,7 +242,7 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis meta = {} if doc.meta: meta = self._flatten_dict(doc.meta) - meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page}) + meta.update({"source_id": doc.id, "page_number": current_page}) if split.get("meta"): meta.update(split["meta"]) current_page = self._update_page_number_with_breaks(split["content"], current_page) @@ -255,13 +256,6 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis result_docs.extend(docs) return result_docs - def _calculate_total_pages(self, content: str, existing_total: int = 0) -> int: - """Calculate total pages based on content and existing metadata.""" - if existing_total > 0: - return existing_total - - return content.count(self.page_break_character) + 1 - @component.output_types(documents=list[Document]) def run(self, documents: list[Document]) -> dict[str, list[Document]]: """ From 3c1c3762307ddb42ed6d2478da13718f3e919ffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Thu, 16 Oct 2025 13:46:01 +0200 Subject: [PATCH 69/85] remove unneeded meta check --- .../components/preprocessors/test_markdown_header_splitter.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py 
b/test/components/preprocessors/test_markdown_header_splitter.py index aa6a6e9235..02c0c7beac 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -58,10 +58,6 @@ def test_basic_split(sample_text): subheader123_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.3") assert "Content under header 1.2.3." in subheader123_doc.content - # Ensure all documents have a header in their metadata - for doc in split_docs: - assert doc.meta.get("header") is not None - def test_split_parentheaders(sample_text): splitter = MarkdownHeaderSplitter() From 86feef6844aed0bf5856ab3267b17d2a895bda67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= <45487933+OGuggenbuehl@users.noreply.github.com> Date: Thu, 16 Oct 2025 14:23:12 +0200 Subject: [PATCH 70/85] Update test/components/preprocessors/test_markdown_header_splitter.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../test_markdown_header_splitter.py | 60 +++++++++++++++++-- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 02c0c7beac..71e08c2168 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -280,9 +280,57 @@ def test_page_break_handling_with_multiple_headers(): result = splitter.run(documents=docs) split_docs = result["documents"] # Collect page numbers for each header - header1_pages = [doc.meta.get("page_number") for doc in split_docs if doc.meta.get("header") == "Header 1"] - header2_pages = [doc.meta.get("page_number") for doc in split_docs if doc.meta.get("header") == "Header 2"] - assert min(header1_pages) == 1 - assert max(header1_pages) == 2 - assert min(header2_pages) == 2 - assert max(header2_pages) == 3 + 
assert len(split_docs) == 4 + + # Split 1 + assert split_docs[0].content == "\nPage 1\fPage " + assert split_docs[0].meta == { + "source_id": ANY, + "total_pages": 3, + "page_number": 1, + "header": "Header 1", + "parent_headers": [], + "split_id": 0, + "split_idx_start": 0, + } + + # Split 2 + assert split_docs[1].content == "2" + assert split_docs[1].meta == { + "source_id": ANY, + "total_pages": 3, + "page_number": 2, + "header": "Header 1", + "parent_headers": [], + "split_id": 1, + "split_idx_start": 13, + } + + # Split 3 + assert split_docs[2].content == "\nPage 3\fPage " + assert split_docs[2].meta == { + "source_id": ANY, + "total_pages": 3, + "page_number": 2, + "header": "Header 2", + "parent_headers": [], + "split_id": 2, + "split_idx_start": 0, + } + + # Split 4 + assert split_docs[3].content == "4" + assert split_docs[3].meta == { + "source_id": ANY, + "total_pages": 3, + "page_number": 3, + "header": "Header 2", + "parent_headers": [], + "split_id": 3, + "split_idx_start": 13, + } + + # Check reconstruction + # NOTE: This doesn't seem to pass currently + reconstructed_text = "".join(doc.content for doc in split_docs) + assert reconstructed_text == text From c22b57db35e67dc9844016373ca52943ea3c7e93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 17 Oct 2025 18:01:17 +0200 Subject: [PATCH 71/85] implement keep_headers parameter --- .../preprocessors/markdown_header_splitter.py | 54 +++++++---- .../test_markdown_header_splitter.py | 92 ++++++++----------- 2 files changed, 74 insertions(+), 72 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 130a1ce417..993c5fb720 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -27,6 +27,7 @@ def __init__( self, *, page_break_character: str = "\f", + keep_headers: bool = True, 
secondary_split: Optional[Literal["word", "passage", "period", "line"]] = None, split_length: int = 200, split_overlap: int = 0, @@ -37,6 +38,8 @@ def __init__( Initialize the MarkdownHeaderSplitter. :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\f"). + :param keep_headers: If True, headers are kept in the content. If False, headers are moved to metadata. + Defaults to True. :param secondary_split: Optional secondary split condition after header splitting. Options are None, "word", "passage", "period", "line". Defaults to None. :param split_length: The maximum number of units in each split when using secondary splitting. Defaults to 200. @@ -52,6 +55,7 @@ def __init__( self.split_overlap = split_overlap self.split_threshold = split_threshold self.skip_empty_documents = skip_empty_documents + self.keep_headers = keep_headers self._header_pattern = re.compile(r"(?m)^(#{1,6}) (.+)$") # ATX-style .md-headers # initialize secondary_splitter only if needed @@ -80,8 +84,9 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: # process headers and build chunks chunks: list[dict] = [] header_stack: list[Optional[str]] = [None] * 6 - active_parents: list[str] = [] - has_content = False # Flag to track if any header has content + active_parents: list[str] = [] # track active parent headers + pending_headers: list[str] = [] # store empty headers to prepend to next content + has_content = False # flag to track if any header has content for i, match in enumerate(matches): # extract header info @@ -99,11 +104,16 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: for j in range(level, 6): header_stack[j] = None + # prepare header_line if keep_headers + header_line = f"{header_prefix} {header_text}" + # skip splits w/o content if not content: # add as parent for subsequent headers active_parents = [h for h in header_stack[: level - 1] if h is not None] 
active_parents.append(header_text) + if self.keep_headers: + pending_headers.append(header_line) continue has_content = True # at least one header has content @@ -113,12 +123,21 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: "Creating chunk for header '{header_text}' at level {level}", header_text=header_text, level=level ) - chunks.append( - { - "content": f"{header_prefix} {header_text}\n{content}", - "meta": {"header": header_text, "parent_headers": parent_headers}, - } - ) + if self.keep_headers: + # add pending & current header to content + chunk_content = "" + if pending_headers: + chunk_content += "\n".join(pending_headers) + "\n" + chunk_content += f"{header_line}\n{content}" + chunks.append( + { + "content": chunk_content, + "meta": {} if self.keep_headers else {"header": header_text, "parent_headers": parent_headers}, + } + ) + pending_headers = [] # reset pending headers + else: + chunks.append({"content": content, "meta": {"header": header_text, "parent_headers": parent_headers}}) # reset active parents active_parents = [h for h in header_stack[: level - 1] if h is not None] @@ -145,11 +164,13 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document result_docs.append(doc) continue - # extract header information - header_match = re.search(self._header_pattern, doc.content) content_for_splitting: str = doc.content - if header_match: - content_for_splitting = doc.content[header_match.end() :] + + if not self.keep_headers: # skip header extraction if keep_headers + # extract header information + header_match = re.search(self._header_pattern, doc.content) + if header_match: + content_for_splitting = doc.content[header_match.end() :] if not content_for_splitting or not content_for_splitting.strip(): # skip empty content result_docs.append(doc) @@ -171,10 +192,11 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # set page number to meta split.meta["page_number"] 
= current_page - # preserve header metadata - for key in ["header", "parent_headers"]: - if key in doc.meta: - split.meta[key] = doc.meta[key] + # preserve header metadata if we're not keeping headers in content + if not self.keep_headers: + for key in ["header", "parent_headers"]: + if key in doc.meta: + split.meta[key] = doc.meta[key] result_docs.append(split) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 71e08c2168..0d1091797b 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 +from unittest.mock import ANY + import pytest from haystack import Document @@ -29,7 +31,7 @@ def sample_text(): # Basic splitting and structure def test_basic_split(sample_text): - splitter = MarkdownHeaderSplitter() + splitter = MarkdownHeaderSplitter(keep_headers=False) docs = [Document(content=sample_text)] result = splitter.run(documents=docs) split_docs = result["documents"] @@ -60,7 +62,7 @@ def test_basic_split(sample_text): def test_split_parentheaders(sample_text): - splitter = MarkdownHeaderSplitter() + splitter = MarkdownHeaderSplitter(keep_headers=False) docs = [Document(content=sample_text), Document(content="# H1\n## H2\n### H3\nContent")] result = splitter.run(documents=docs) split_docs = result["documents"] @@ -84,7 +86,7 @@ def test_split_no_headers(): def test_split_multiple_documents(sample_text): - splitter = MarkdownHeaderSplitter() + splitter = MarkdownHeaderSplitter(keep_headers=False) docs = [ Document(content=sample_text), Document(content="# Another Header\nSome content."), @@ -119,7 +121,7 @@ def test_split_only_headers(): # Metadata preservation def test_preserve_document_metadata(): """Test that document metadata is preserved through splitting.""" - splitter = MarkdownHeaderSplitter() + splitter = 
MarkdownHeaderSplitter(keep_headers=False) docs = [Document(content="# Header\nContent", meta={"source": "test", "importance": "high", "custom_field": 123})] result = splitter.run(documents=docs) @@ -187,7 +189,7 @@ def test_empty_content_handling(): def test_split_id_sequentiality_primary_and_secondary(sample_text): # Test primary splitting - splitter = MarkdownHeaderSplitter() + splitter = MarkdownHeaderSplitter(keep_headers=False) docs = [Document(content=sample_text)] result = splitter.run(documents=docs) split_docs = result["documents"] @@ -206,7 +208,7 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text): split_docs = result["documents"] # Test number of documents - assert len(split_docs) == 10 + assert len(split_docs) == 12 split_ids = [doc.meta["split_id"] for doc in split_docs] assert split_ids == list(range(len(split_ids))) @@ -217,14 +219,14 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text): split_docs = result["documents"] # Test number of documents - assert len(split_docs) == 12 + assert len(split_docs) == 14 split_ids = [doc.meta["split_id"] for doc in split_docs] assert split_ids == list(range(len(split_ids))) def test_secondary_split_with_overlap(): - realistic_text = ( + text = ( "# Introduction\n" "This is the introduction section with some words for testing overlap splitting. " "It should be split into chunks with overlap.\n" @@ -234,8 +236,8 @@ def test_secondary_split_with_overlap(): "### Subsection\n" "This subsection contains additional information and should also be split with overlap." 
) - splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=4, split_overlap=2) - docs = [Document(content=realistic_text)] + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=4, split_overlap=2, keep_headers=False) + docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] assert len(split_docs) == 21 @@ -251,7 +253,7 @@ def test_secondary_split_with_overlap(): def test_secondary_split_with_threshold(): text = "# Header\n" + " ".join([f"word{i}" for i in range(1, 11)]) - splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3, split_threshold=2) + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3, split_threshold=2, keep_headers=False) docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] @@ -274,63 +276,41 @@ def test_page_break_handling_in_secondary_split(): def test_page_break_handling_with_multiple_headers(): - text = "# Header 1\nPage 1\fPage 2\n# Header 2\nPage 3\fPage 4" - splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=2) + text = "# Header\nFirst page\f Second page\f Third page" + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=1, keep_headers=True) docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - # Collect page numbers for each header - assert len(split_docs) == 4 + assert len(split_docs) == 7 # Split 1 - assert split_docs[0].content == "\nPage 1\fPage " - assert split_docs[0].meta == { - "source_id": ANY, - "total_pages": 3, - "page_number": 1, - "header": "Header 1", - "parent_headers": [], - "split_id": 0, - "split_idx_start": 0, - } + assert split_docs[0].content == "# " + assert split_docs[0].meta == {"source_id": ANY, "page_number": 1, "split_id": 0, "split_idx_start": 0} # Split 2 - assert split_docs[1].content == "2" - assert split_docs[1].meta == { - "source_id": ANY, - 
"total_pages": 3, - "page_number": 2, - "header": "Header 1", - "parent_headers": [], - "split_id": 1, - "split_idx_start": 13, - } + assert split_docs[1].content == "Header\nFirst " + assert split_docs[1].meta == {"source_id": ANY, "page_number": 1, "split_id": 1, "split_idx_start": 2} # Split 3 - assert split_docs[2].content == "\nPage 3\fPage " - assert split_docs[2].meta == { - "source_id": ANY, - "total_pages": 3, - "page_number": 2, - "header": "Header 2", - "parent_headers": [], - "split_id": 2, - "split_idx_start": 0, - } + assert split_docs[2].content == "page\f " + assert split_docs[2].meta == {"source_id": ANY, "page_number": 1, "split_id": 2, "split_idx_start": 15} # Split 4 - assert split_docs[3].content == "4" - assert split_docs[3].meta == { - "source_id": ANY, - "total_pages": 3, - "page_number": 3, - "header": "Header 2", - "parent_headers": [], - "split_id": 3, - "split_idx_start": 13, - } + assert split_docs[3].content == "Second " + assert split_docs[3].meta == {"source_id": ANY, "page_number": 2, "split_id": 3, "split_idx_start": 21} + + # Split 5 + assert split_docs[4].content == "page\f " + assert split_docs[4].meta == {"source_id": ANY, "page_number": 2, "split_id": 4, "split_idx_start": 28} + + # Split 6 + assert split_docs[5].content == "Third " + assert split_docs[5].meta == {"source_id": ANY, "page_number": 3, "split_id": 5, "split_idx_start": 34} + + # Split 7 + assert split_docs[6].content == "page" + assert split_docs[6].meta == {"source_id": ANY, "page_number": 3, "split_id": 6, "split_idx_start": 40} # Check reconstruction - # NOTE: This doesn't seem to pass currently reconstructed_text = "".join(doc.content for doc in split_docs) assert reconstructed_text == text From 7c03a0494aff8c075b85e3cfa1697ebdb45ab438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 17 Oct 2025 19:18:01 +0200 Subject: [PATCH 72/85] remove meta-dict flattening --- .../preprocessors/markdown_header_splitter.py | 14 +------------- 
1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 993c5fb720..4e837a8db5 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -205,18 +205,6 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document ) return result_docs - def _flatten_dict(self, d: dict, prefix: str = "", target_dict: Optional[dict] = None) -> dict: - """Flatten a nested dictionary, concatenating keys with underscores.""" - if target_dict is None: - target_dict = {} - for key, value in d.items(): - new_key = f"{prefix}{key}" if prefix else key - if isinstance(value, dict): - self._flatten_dict(value, f"{new_key}_", target_dict) - else: - target_dict[new_key] = value - return target_dict - def _update_page_number_with_breaks(self, content: str, current_page: int) -> int: """ Update page number based on page breaks in content. 
@@ -263,7 +251,7 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis for split in splits: meta = {} if doc.meta: - meta = self._flatten_dict(doc.meta) + meta = doc.meta.copy() meta.update({"source_id": doc.id, "page_number": current_page}) if split.get("meta"): meta.update(split["meta"]) From 9a8ca7676d90a19638533f42199e693d5185fb34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Tue, 21 Oct 2025 10:56:48 +0200 Subject: [PATCH 73/85] add minor sanity checks --- test/components/preprocessors/test_markdown_header_splitter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 0d1091797b..67f1e040e1 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -83,6 +83,9 @@ def test_split_no_headers(): assert len(split_docs) == 2 for doc in split_docs: assert "header" not in doc.meta + # Sanity Checks + assert split_docs[0].content == docs[0].content + assert split_docs[1].content == docs[1].content def test_split_multiple_documents(sample_text): From 2f1e2037d94f0be8bab8158924ab9046fd7bff05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= <45487933+OGuggenbuehl@users.noreply.github.com> Date: Tue, 21 Oct 2025 10:53:51 +0200 Subject: [PATCH 74/85] Update test/components/preprocessors/test_markdown_header_splitter.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../preprocessors/test_markdown_header_splitter.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 67f1e040e1..8d5694e6cd 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ 
b/test/components/preprocessors/test_markdown_header_splitter.py @@ -45,8 +45,13 @@ def test_basic_split(sample_text): assert "Subheader 1.2.3" in headers # Check that content is present and correct - header1_doc = next(doc for doc in split_docs if doc.meta["header"] == "Header 1") - assert "Content under header 1." in header1_doc.content + # Test first split + header1_doc = split_docs[0] + assert header1_doc.meta["header"] == "Header 1" + assert header1_doc.meta["split_id"] == 0 + assert header1_doc.meta["page_number"] == 1 + assert header1_doc.meta["parent_headers"] == [] + assert header1_doc.content == "# Header 1\nContent under header 1." subheader111_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.1.1") assert "Content under sub-header 1.1.1" in subheader111_doc.content From b22feb5c001155223dda12af69dff784a0c4e5af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 10:33:46 +0100 Subject: [PATCH 75/85] add warmup --- .../preprocessors/markdown_header_splitter.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 4e837a8db5..f6792a1862 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -57,6 +57,7 @@ def __init__( self.skip_empty_documents = skip_empty_documents self.keep_headers = keep_headers self._header_pattern = re.compile(r"(?m)^(#{1,6}) (.+)$") # ATX-style .md-headers + self._is_warmed_up = False # initialize secondary_splitter only if needed if self.secondary_split: @@ -67,6 +68,14 @@ def __init__( split_threshold=self.split_threshold, ) + def warm_up(self): + """ + Warm up the MarkdownHeaderSplitter. 
+ """ + if self.secondary_split and not self._is_warmed_up: + self.secondary_splitter.warm_up() + self._is_warmed_up = True + def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: """Split text by ATX-style headers (#) and create chunks with appropriate metadata.""" logger.debug("Splitting text by markdown headers") @@ -97,7 +106,7 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: # get content start = match.end() end = matches[i + 1].start() if i + 1 < len(matches) else len(text) - content = text[start:end].strip() + content = text[start:end] # update header stack to track nesting header_stack[level - 1] = header_text From 85018315d8af7d8497a76aeefb0ca3cc8b2df6c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= <45487933+OGuggenbuehl@users.noreply.github.com> Date: Fri, 21 Nov 2025 10:29:47 +0100 Subject: [PATCH 76/85] Update haystack/components/preprocessors/markdown_header_splitter.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../components/preprocessors/markdown_header_splitter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index f6792a1862..cf933abae6 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -46,8 +46,9 @@ def __init__( :param split_overlap: The number of overlapping units for each split when using secondary splitting. Defaults to 0. :param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0. - :param skip_empty_documents: If True, skip documents with empty content. If False, process empty documents. - Defaults to True. + :param skip_empty_documents: Choose whether to skip documents with empty content. Default is True. 
+ Set to False when downstream components in the Pipeline (like LLMDocumentContentExtractor) can extract text + from non-textual documents. """ self.page_break_character = page_break_character self.secondary_split = secondary_split From 23da68ed7b2ffe858f5a0f1fd955c4b64461b6af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 10:54:54 +0100 Subject: [PATCH 77/85] fix splitting when keeping headers --- .../preprocessors/markdown_header_splitter.py | 4 +- .../test_markdown_header_splitter.py | 124 ++++++++++++++---- 2 files changed, 102 insertions(+), 26 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index cf933abae6..463ea6ceed 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -118,7 +118,7 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: header_line = f"{header_prefix} {header_text}" # skip splits w/o content - if not content: + if not content.strip(): # add as parent for subsequent headers active_parents = [h for h in header_stack[: level - 1] if h is not None] active_parents.append(header_text) @@ -138,7 +138,7 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: chunk_content = "" if pending_headers: chunk_content += "\n".join(pending_headers) + "\n" - chunk_content += f"{header_line}\n{content}" + chunk_content += f"{header_line}{content}" chunks.append( { "content": chunk_content, diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 8d5694e6cd..0cfc732bbe 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -31,39 +31,115 @@ def sample_text(): # Basic splitting and 
structure def test_basic_split(sample_text): - splitter = MarkdownHeaderSplitter(keep_headers=False) + splitter = MarkdownHeaderSplitter() docs = [Document(content=sample_text)] result = splitter.run(documents=docs) split_docs = result["documents"] - # Should split into all headers with content - headers = [doc.meta["header"] for doc in split_docs] - assert "Header 1" in headers - assert "Subheader 1.1.1" in headers - assert "Subheader 1.2.1" in headers - assert "Subheader 1.2.2" in headers - assert "Subheader 1.2.3" in headers - # Check that content is present and correct # Test first split header1_doc = split_docs[0] - assert header1_doc.meta["header"] == "Header 1" + # assert header1_doc.meta["header"] == "Header 1" assert header1_doc.meta["split_id"] == 0 assert header1_doc.meta["page_number"] == 1 - assert header1_doc.meta["parent_headers"] == [] - assert header1_doc.content == "# Header 1\nContent under header 1." - - subheader111_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.1.1") - assert "Content under sub-header 1.1.1" in subheader111_doc.content - - subheader121_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.1") - assert "Content under header 1.2.1." in subheader121_doc.content - - subheader122_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.2") - assert "Content under header 1.2.2." in subheader122_doc.content - - subheader123_doc = next(doc for doc in split_docs if doc.meta["header"] == "Subheader 1.2.3") - assert "Content under header 1.2.3." 
in subheader123_doc.content + # assert header1_doc.meta["parent_headers"] == [] + assert header1_doc.content == "# Header 1\nContent under header 1.\n" + + # Test second split + subheader111_doc = split_docs[1] + # assert subheader111_doc.meta["header"] == "Subheader 1.1.1" + assert subheader111_doc.meta["split_id"] == 1 + assert subheader111_doc.meta["page_number"] == 1 + # assert subheader111_doc.meta["parent_headers"] == ["Header 1", "Header 1.1"] + assert subheader111_doc.content == "## Header 1.1\n### Subheader 1.1.1\nContent under sub-header 1.1.1\n" + + # Test third split + subheader121_doc = split_docs[2] + # assert subheader121_doc.meta["header"] == "Subheader 1.2.1" + assert subheader121_doc.meta["split_id"] == 2 + assert subheader121_doc.meta["page_number"] == 1 + # assert subheader121_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] + assert subheader121_doc.content == "## Header 1.2\n### Subheader 1.2.1\nContent under header 1.2.1.\n" + + # Test fourth split + subheader122_doc = split_docs[3] + # assert subheader122_doc.meta["header"] == "Subheader 1.2.2" + assert subheader122_doc.meta["split_id"] == 3 + assert subheader122_doc.meta["page_number"] == 1 + # assert subheader122_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] + assert subheader122_doc.content == "### Subheader 1.2.2\nContent under header 1.2.2.\n" + + # Test fifth split + subheader123_doc = split_docs[4] + # assert subheader123_doc.meta["header"] == "Subheader 1.2.3" + assert subheader123_doc.meta["split_id"] == 4 + assert subheader123_doc.meta["page_number"] == 1 + # assert subheader123_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] + assert subheader123_doc.content == "### Subheader 1.2.3\nContent under header 1.2.3." 
+ + # Sanity check: reconstruct original text + reconstructed_doc = "".join([doc.content for doc in split_docs]) + assert reconstructed_doc == sample_text + + +# def test_split_without_headers(sample_text): +# splitter = MarkdownHeaderSplitter(keep_headers=False) +# docs = [Document(content=sample_text)] +# result = splitter.run(documents=docs) +# split_docs = result["documents"] + +# # Should split into all headers with content +# headers = [doc.meta["header"] for doc in split_docs] +# assert "Header 1" in headers +# assert "Subheader 1.1.1" in headers +# assert "Subheader 1.2.1" in headers +# assert "Subheader 1.2.2" in headers +# assert "Subheader 1.2.3" in headers + +# # Check that content is present and correct +# # Test first split +# header1_doc = split_docs[0] +# # assert header1_doc.meta["header"] == "Header 1" +# assert header1_doc.meta["split_id"] == 0 +# assert header1_doc.meta["page_number"] == 1 +# # assert header1_doc.meta["parent_headers"] == [] +# assert header1_doc.content == "# Header 1\n\nContent under header 1.\n" + +# # Test second split +# subheader111_doc = split_docs[1] +# # assert subheader111_doc.meta["header"] == "Subheader 1.1.1" +# assert subheader111_doc.meta["split_id"] == 1 +# assert subheader111_doc.meta["page_number"] == 1 +# # assert subheader111_doc.meta["parent_headers"] == ["Header 1", "Header 1.1"] +# assert subheader111_doc.content == "## Header 1.1\n\n### Subheader 1.1.1\nContent under sub-header 1.1.1\n" + +# # Test third split +# subheader121_doc = split_docs[2] +# # assert subheader121_doc.meta["header"] == "Subheader 1.2.1" +# assert subheader121_doc.meta["split_id"] == 2 +# assert subheader121_doc.meta["page_number"] == 1 +# # assert subheader121_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] +# assert subheader121_doc.content == "## Header 1.2\n\n### Subheader 1.2.1\nContent under header 1.2.1.\n" + +# # Test fourth split +# subheader122_doc = split_docs[3] +# # assert subheader122_doc.meta["header"] == 
"Subheader 1.2.2" +# assert subheader122_doc.meta["split_id"] == 3 +# assert subheader122_doc.meta["page_number"] == 1 +# # assert subheader122_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] +# assert subheader122_doc.content == "### Subheader 1.2.2\n\nContent under header 1.2.2.\n" + +# # Test fifth split +# subheader123_doc = split_docs[4] +# # assert subheader123_doc.meta["header"] == "Subheader 1.2.3" +# assert subheader123_doc.meta["split_id"] == 4 +# assert subheader123_doc.meta["page_number"] == 1 +# # assert subheader123_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] +# assert subheader123_doc.content == "### Subheader 1.2.3\n\nContent under header 1.2.3.\n" + +# # Sanity check: reconstruct original text +# reconstructed_doc = "".join([doc.content for doc in split_docs]) +# assert reconstructed_doc == sample_text def test_split_parentheaders(sample_text): From ccc10577c5196cea168b307ba815814aa23eef40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 11:34:27 +0100 Subject: [PATCH 78/85] test cleanup to cover keep_headers=True --- .../test_markdown_header_splitter.py | 35 +++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 0cfc732bbe..ad9cf2a6c4 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -205,7 +205,7 @@ def test_split_only_headers(): # Metadata preservation def test_preserve_document_metadata(): """Test that document metadata is preserved through splitting.""" - splitter = MarkdownHeaderSplitter(keep_headers=False) + splitter = MarkdownHeaderSplitter(keep_headers=False) # keep_headers=True case is covered by this test too docs = [Document(content="# Header\nContent", meta={"source": "test", "importance": "high", "custom_field": 
123})] result = splitter.run(documents=docs) @@ -273,7 +273,7 @@ def test_empty_content_handling(): def test_split_id_sequentiality_primary_and_secondary(sample_text): # Test primary splitting - splitter = MarkdownHeaderSplitter(keep_headers=False) + splitter = MarkdownHeaderSplitter() docs = [Document(content=sample_text)] result = splitter.run(documents=docs) split_docs = result["documents"] @@ -320,6 +320,7 @@ def test_secondary_split_with_overlap(): "### Subsection\n" "This subsection contains additional information and should also be split with overlap." ) + # keep_headers=False splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=4, split_overlap=2, keep_headers=False) docs = [Document(content=text)] result = splitter.run(documents=docs) @@ -333,10 +334,38 @@ def test_secondary_split_with_overlap(): prev_words = prev_doc.content.split() curr_words = curr_doc.content.split() assert prev_words[-2:] == curr_words[:2] + # keep_headers=True + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=4, split_overlap=2) + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + assert len(split_docs) == 24 + + assert split_docs[0].content.startswith("# Introduction") + assert all("header" not in doc.meta for doc in split_docs) def test_secondary_split_with_threshold(): text = "# Header\n" + " ".join([f"word{i}" for i in range(1, 11)]) + # keep_headers=True + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3, split_threshold=2, keep_headers=True) + docs = [Document(content=text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + for i, doc in enumerate(split_docs): + words = doc.content.split() + if i == 0: + # First chunk includes header-hashtag plus split_length words + assert words[:2] == ["#", "Header"] + assert len(words) == 4 + elif i < len(split_docs) - 1: + # Subsequent chunks should have split_length words + assert len(words) 
== 3 + else: + # Last chunk should have at least split_threshold words + assert len(words) >= 2 + + # keep_headers=False splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3, split_threshold=2, keep_headers=False) docs = [Document(content=text)] result = splitter.run(documents=docs) @@ -361,7 +390,7 @@ def test_page_break_handling_in_secondary_split(): def test_page_break_handling_with_multiple_headers(): text = "# Header\nFirst page\f Second page\f Third page" - splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=1, keep_headers=True) + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=1) docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] From c4a5c171b8e9d6a90425fccbfd9d91bc091d9834 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 11:47:08 +0100 Subject: [PATCH 79/85] add tests for keep_headers=False splitting --- .../preprocessors/markdown_header_splitter.py | 2 + .../test_markdown_header_splitter.py | 112 +++++++++--------- 2 files changed, 56 insertions(+), 58 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 463ea6ceed..f28437dcc0 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -108,6 +108,8 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: start = match.end() end = matches[i + 1].start() if i + 1 < len(matches) else len(text) content = text[start:end] + if not self.keep_headers and content.startswith("\n"): + content = content[1:] # update header stack to track nesting header_stack[level - 1] = header_text diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 
ad9cf2a6c4..fb51c3e8de 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -82,64 +82,60 @@ def test_basic_split(sample_text): assert reconstructed_doc == sample_text -# def test_split_without_headers(sample_text): -# splitter = MarkdownHeaderSplitter(keep_headers=False) -# docs = [Document(content=sample_text)] -# result = splitter.run(documents=docs) -# split_docs = result["documents"] - -# # Should split into all headers with content -# headers = [doc.meta["header"] for doc in split_docs] -# assert "Header 1" in headers -# assert "Subheader 1.1.1" in headers -# assert "Subheader 1.2.1" in headers -# assert "Subheader 1.2.2" in headers -# assert "Subheader 1.2.3" in headers - -# # Check that content is present and correct -# # Test first split -# header1_doc = split_docs[0] -# # assert header1_doc.meta["header"] == "Header 1" -# assert header1_doc.meta["split_id"] == 0 -# assert header1_doc.meta["page_number"] == 1 -# # assert header1_doc.meta["parent_headers"] == [] -# assert header1_doc.content == "# Header 1\n\nContent under header 1.\n" - -# # Test second split -# subheader111_doc = split_docs[1] -# # assert subheader111_doc.meta["header"] == "Subheader 1.1.1" -# assert subheader111_doc.meta["split_id"] == 1 -# assert subheader111_doc.meta["page_number"] == 1 -# # assert subheader111_doc.meta["parent_headers"] == ["Header 1", "Header 1.1"] -# assert subheader111_doc.content == "## Header 1.1\n\n### Subheader 1.1.1\nContent under sub-header 1.1.1\n" - -# # Test third split -# subheader121_doc = split_docs[2] -# # assert subheader121_doc.meta["header"] == "Subheader 1.2.1" -# assert subheader121_doc.meta["split_id"] == 2 -# assert subheader121_doc.meta["page_number"] == 1 -# # assert subheader121_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] -# assert subheader121_doc.content == "## Header 1.2\n\n### Subheader 1.2.1\nContent under header 1.2.1.\n" - -# # 
Test fourth split -# subheader122_doc = split_docs[3] -# # assert subheader122_doc.meta["header"] == "Subheader 1.2.2" -# assert subheader122_doc.meta["split_id"] == 3 -# assert subheader122_doc.meta["page_number"] == 1 -# # assert subheader122_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] -# assert subheader122_doc.content == "### Subheader 1.2.2\n\nContent under header 1.2.2.\n" - -# # Test fifth split -# subheader123_doc = split_docs[4] -# # assert subheader123_doc.meta["header"] == "Subheader 1.2.3" -# assert subheader123_doc.meta["split_id"] == 4 -# assert subheader123_doc.meta["page_number"] == 1 -# # assert subheader123_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] -# assert subheader123_doc.content == "### Subheader 1.2.3\n\nContent under header 1.2.3.\n" - -# # Sanity check: reconstruct original text -# reconstructed_doc = "".join([doc.content for doc in split_docs]) -# assert reconstructed_doc == sample_text +def test_split_without_headers(sample_text): + splitter = MarkdownHeaderSplitter(keep_headers=False) + docs = [Document(content=sample_text)] + result = splitter.run(documents=docs) + split_docs = result["documents"] + + # Should split into all headers with content + headers = [doc.meta["header"] for doc in split_docs] + assert "Header 1" in headers + assert "Subheader 1.1.1" in headers + assert "Subheader 1.2.1" in headers + assert "Subheader 1.2.2" in headers + assert "Subheader 1.2.3" in headers + + # Check that content is present and correct + # Test first split + header1_doc = split_docs[0] + assert header1_doc.meta["header"] == "Header 1" + assert header1_doc.meta["split_id"] == 0 + assert header1_doc.meta["page_number"] == 1 + assert header1_doc.meta["parent_headers"] == [] + assert header1_doc.content == "Content under header 1.\n" + + # Test second split + subheader111_doc = split_docs[1] + assert subheader111_doc.meta["header"] == "Subheader 1.1.1" + assert subheader111_doc.meta["split_id"] == 1 + assert 
subheader111_doc.meta["page_number"] == 1 + assert subheader111_doc.meta["parent_headers"] == ["Header 1", "Header 1.1"] + assert subheader111_doc.content == "Content under sub-header 1.1.1\n" + + # Test third split + subheader121_doc = split_docs[2] + assert subheader121_doc.meta["header"] == "Subheader 1.2.1" + assert subheader121_doc.meta["split_id"] == 2 + assert subheader121_doc.meta["page_number"] == 1 + assert subheader121_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] + assert subheader121_doc.content == "Content under header 1.2.1.\n" + + # Test fourth split + subheader122_doc = split_docs[3] + assert subheader122_doc.meta["header"] == "Subheader 1.2.2" + assert subheader122_doc.meta["split_id"] == 3 + assert subheader122_doc.meta["page_number"] == 1 + assert subheader122_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] + assert subheader122_doc.content == "Content under header 1.2.2.\n" + + # Test fifth split + subheader123_doc = split_docs[4] + assert subheader123_doc.meta["header"] == "Subheader 1.2.3" + assert subheader123_doc.meta["split_id"] == 4 + assert subheader123_doc.meta["page_number"] == 1 + assert subheader123_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] + assert subheader123_doc.content == "Content under header 1.2.3." 
def test_split_parentheaders(sample_text): From f3d77990549bc6ace3f4d75d3411fca0d54e4bbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 11:57:53 +0100 Subject: [PATCH 80/85] remove strip() --- .../preprocessors/markdown_header_splitter.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index f28437dcc0..f053b19667 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -101,7 +101,7 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: for i, match in enumerate(matches): # extract header info header_prefix = match.group(1) - header_text = match.group(2).strip() + header_text = match.group(2) level = len(header_prefix) # get content @@ -109,22 +109,20 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: end = matches[i + 1].start() if i + 1 < len(matches) else len(text) content = text[start:end] if not self.keep_headers and content.startswith("\n"): - content = content[1:] + content = content[1:] # remove leading newline if headers not kept # update header stack to track nesting header_stack[level - 1] = header_text for j in range(level, 6): header_stack[j] = None - # prepare header_line if keep_headers - header_line = f"{header_prefix} {header_text}" - # skip splits w/o content - if not content.strip(): + if not content.strip(): # this strip is needed to avoid counting whitespace as content # add as parent for subsequent headers active_parents = [h for h in header_stack[: level - 1] if h is not None] active_parents.append(header_text) if self.keep_headers: + header_line = f"{header_prefix} {header_text}" pending_headers.append(header_line) continue @@ -136,6 +134,7 @@ def _split_text_by_markdown_headers(self, text: str, 
doc_id: str) -> list[dict]: ) if self.keep_headers: + header_line = f"{header_prefix} {header_text}" # add pending & current header to content chunk_content = "" if pending_headers: From f842fdb6650c87eb5ec57b6c36c0afedaa081843 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 12:20:05 +0100 Subject: [PATCH 81/85] simplify doc handling --- .../preprocessors/markdown_header_splitter.py | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index f053b19667..5bb2047838 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -183,10 +183,6 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document if header_match: content_for_splitting = doc.content[header_match.end() :] - if not content_for_splitting or not content_for_splitting.strip(): # skip empty content - result_docs.append(doc) - continue - # track page from meta current_page = doc.meta.get("page_number", 1) @@ -303,7 +299,7 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: if not isinstance(doc.content, str): raise ValueError("MarkdownHeaderSplitter only works with text documents (str content).") - processed_documents = [] + final_docs = [] for doc in documents: # handle empty documents if not doc.content or not doc.content.strip(): @@ -311,22 +307,23 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id) continue # keep empty documents - processed_documents.append(doc) + final_docs.append(doc) logger.warning( "Document ID {doc_id} has an empty content. 
Keeping this document as per configuration.", doc_id=doc.id, ) continue - processed_documents.append(doc) + # split this document by headers + header_split_docs = self._split_documents_by_markdown_headers([doc]) - if not processed_documents: - return {"documents": []} - - header_split_docs = self._split_documents_by_markdown_headers(processed_documents) + # apply secondary splitting if configured + if self.secondary_split: + doc_splits = self._apply_secondary_splitting(header_split_docs) + else: + doc_splits = header_split_docs - # secondary splitting if configured - final_docs = self._apply_secondary_splitting(header_split_docs) if self.secondary_split else header_split_docs + final_docs.extend(doc_splits) # assign split_id to all output documents for idx, doc in enumerate(final_docs): From c7fc2e45d25949c8cb6a4bd12d839447be26f6f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 13:00:56 +0100 Subject: [PATCH 82/85] fix split id assignment --- .../preprocessors/markdown_header_splitter.py | 23 ++++++++----- .../test_markdown_header_splitter.py | 33 +++++++++++++------ 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 5bb2047838..e7dca1b68e 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -169,6 +169,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document Ensures page counting is maintained across splits. 
""" result_docs = [] + current_split_id = 0 # track split_id across all secondary splits from the same parent for doc in documents: if doc.content is None: @@ -186,8 +187,11 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document # track page from meta current_page = doc.meta.get("page_number", 1) + # create a clean meta dict without split_id for secondary splitting + clean_meta = {k: v for k, v in doc.meta.items() if k != "split_id"} + secondary_splits = self.secondary_splitter.run( - documents=[Document(content=content_for_splitting, meta=doc.meta)] + documents=[Document(content=content_for_splitting, meta=clean_meta)] )["documents"] # split processing @@ -196,8 +200,13 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document if i > 0 and secondary_splits[i - 1].content: current_page = self._update_page_number_with_breaks(secondary_splits[i - 1].content, current_page) - # set page number to meta + # set page number and split_id to meta split.meta["page_number"] = current_page + split.meta["split_id"] = current_split_id + # ensure source_id is preserved from the original document + if "source_id" in doc.meta: + split.meta["source_id"] = doc.meta["source_id"] + current_split_id += 1 # preserve header metadata if we're not keeping headers in content if not self.keep_headers: @@ -255,11 +264,11 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> lis current_page=current_page, total_pages=total_pages, ) - for split in splits: + for split_idx, split in enumerate(splits): meta = {} if doc.meta: meta = doc.meta.copy() - meta.update({"source_id": doc.id, "page_number": current_page}) + meta.update({"source_id": doc.id, "page_number": current_page, "split_id": split_idx}) if split.get("meta"): meta.update(split["meta"]) current_page = self._update_page_number_with_breaks(split["content"], current_page) @@ -284,7 +293,7 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: 
- `documents`: List of documents with the split texts. Each document includes: - A metadata field `source_id` to track the original document. - A metadata field `page_number` to track the original page number. - - A metadata field `split_id` to uniquely identify each split chunk. + - A metadata field `split_id` to identify the split chunk index within its parent document. - All other metadata copied from the original document. """ # validate input documents @@ -325,8 +334,4 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: final_docs.extend(doc_splits) - # assign split_id to all output documents - for idx, doc in enumerate(final_docs): - doc.meta["split_id"] = idx - return {"documents": final_docs} diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index fb51c3e8de..e37f06484e 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +from collections import defaultdict from unittest.mock import ANY import pytest @@ -181,10 +182,14 @@ def test_split_multiple_documents(sample_text): headers = {doc.meta["header"] for doc in split_docs} assert {"Another Header", "H1", "H2"}.issubset(headers) - # Verify that all documents have a split_id and they're sequential - split_ids = [doc.meta.get("split_id") for doc in split_docs] - assert all(split_id is not None for split_id in split_ids) - assert split_ids == list(range(len(split_ids))) + # Verify that split_ids are per-parent-document + splits_by_source = defaultdict(list) + for doc in split_docs: + splits_by_source[doc.meta["source_id"]].append(doc.meta["split_id"]) + + # Each parent document should have split_ids starting from 0 + for source_id, split_ids in splits_by_source.items(): + assert split_ids == list(range(len(split_ids))), f"Split IDs for {source_id} should 
be sequential from 0" def test_split_only_headers(): @@ -268,7 +273,7 @@ def test_empty_content_handling(): def test_split_id_sequentiality_primary_and_secondary(sample_text): - # Test primary splitting + # Test primary splitting with single document splitter = MarkdownHeaderSplitter() docs = [Document(content=sample_text)] result = splitter.run(documents=docs) @@ -277,11 +282,11 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text): # Test number of documents assert len(split_docs) == 5 - # Check that split_ids are sequential + # Check that split_ids are sequential from 0 for this single parent document split_ids = [doc.meta["split_id"] for doc in split_docs] assert split_ids == list(range(len(split_ids))) - # Test secondary splitting + # Test secondary splitting with single document splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3) docs = [Document(content=sample_text)] result = splitter.run(documents=docs) @@ -290,10 +295,12 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text): # Test number of documents assert len(split_docs) == 12 + # Check that split_ids are sequential from 0 for this single parent document split_ids = [doc.meta["split_id"] for doc in split_docs] assert split_ids == list(range(len(split_ids))) - # Test with multiple input documents + # Test with multiple input documents - each should have its own split_id sequence + splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3) # Use fresh instance docs = [Document(content=sample_text), Document(content="# Another Header\nSome more content here.")] result = splitter.run(documents=docs) split_docs = result["documents"] @@ -301,8 +308,14 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text): # Test number of documents assert len(split_docs) == 14 - split_ids = [doc.meta["split_id"] for doc in split_docs] - assert split_ids == list(range(len(split_ids))) + # Verify split_ids are per-parent-document + 
splits_by_source = defaultdict(list) + for doc in split_docs: + splits_by_source[doc.meta["source_id"]].append(doc.meta["split_id"]) + + # Each parent document should have split_ids starting from 0 + for source_id, split_ids in splits_by_source.items(): + assert split_ids == list(range(len(split_ids))), f"Split IDs for {source_id} should be sequential from 0" def test_secondary_split_with_overlap(): From 64ff6fb1927a8f46c27ab687538b533c660f5c9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 13:09:47 +0100 Subject: [PATCH 83/85] test cleanup --- .../preprocessors/test_markdown_header_splitter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index e37f06484e..82b55963ac 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -188,8 +188,8 @@ def test_split_multiple_documents(sample_text): splits_by_source[doc.meta["source_id"]].append(doc.meta["split_id"]) # Each parent document should have split_ids starting from 0 - for source_id, split_ids in splits_by_source.items(): - assert split_ids == list(range(len(split_ids))), f"Split IDs for {source_id} should be sequential from 0" + for split_ids in splits_by_source.values(): + assert split_ids == list(range(len(split_ids))) def test_split_only_headers(): @@ -314,8 +314,8 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text): splits_by_source[doc.meta["source_id"]].append(doc.meta["split_id"]) # Each parent document should have split_ids starting from 0 - for source_id, split_ids in splits_by_source.items(): - assert split_ids == list(range(len(split_ids))), f"Split IDs for {source_id} should be sequential from 0" + for split_ids in splits_by_source.values(): + assert split_ids == list(range(len(split_ids))) def 
test_secondary_split_with_overlap(): From eb3e568bb73565c30b629d47c2ad3f0f9a596e45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 13:29:31 +0100 Subject: [PATCH 84/85] test splits more explicitly --- .../test_markdown_header_splitter.py | 37 +++++++++---------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 82b55963ac..440135bc14 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -361,28 +361,25 @@ def test_secondary_split_with_threshold(): docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - for i, doc in enumerate(split_docs): - words = doc.content.split() - if i == 0: - # First chunk includes header-hashtag plus split_length words - assert words[:2] == ["#", "Header"] - assert len(words) == 4 - elif i < len(split_docs) - 1: - # Subsequent chunks should have split_length words - assert len(words) == 3 - else: - # Last chunk should have at least split_threshold words - assert len(words) >= 2 + + # Explicitly test each split + assert len(split_docs) == 4 + assert len(split_docs[0].content.split()) == 4 # "# Header" + 2 words + assert len(split_docs[1].content.split()) == 3 # 3 words (split_length) + assert len(split_docs[2].content.split()) == 3 # 3 words (split_length) + assert len(split_docs[3].content.split()) == 2 # 2 words (meets threshold) # keep_headers=False splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3, split_threshold=2, keep_headers=False) docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - for doc in split_docs[:-1]: - assert len(doc.content.split()) == 3 - # The last chunk should have at least 2 words (threshold) - assert 
len(split_docs[-1].content.split()) >= 2 + + # Explicitly test each split + assert len(split_docs) == 3 + assert len(split_docs[0].content.split()) == 3 # 3 words + assert len(split_docs[1].content.split()) == 3 # 3 words + assert len(split_docs[2].content.split()) == 4 # 4 words (due to threshold, not possible to split 3-1) def test_page_break_handling_in_secondary_split(): @@ -391,10 +388,10 @@ def test_page_break_handling_in_secondary_split(): docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - page_numbers = [doc.meta.get("page_number") for doc in split_docs] - # Should start at 1 and increment at each \f - assert page_numbers[0] == 1 - assert max(page_numbers) == 3 + # Explicitly check the page number of each split + expected_page_numbers = [1, 1, 1, 2, 3] + actual_page_numbers = [doc.meta.get("page_number") for doc in split_docs] + assert actual_page_numbers == expected_page_numbers def test_page_break_handling_with_multiple_headers(): From ad155cc8b2036e039a728d4fc919ff9ec5f61f78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Guggenb=C3=BChl?= Date: Fri, 21 Nov 2025 15:06:43 +0100 Subject: [PATCH 85/85] cleanup tests minor commenting --- .../preprocessors/markdown_header_splitter.py | 2 +- .../test_markdown_header_splitter.py | 22 +++++-------------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index e7dca1b68e..02ccf8c99c 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -311,7 +311,7 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: final_docs = [] for doc in documents: # handle empty documents - if not doc.content or not doc.content.strip(): + if not doc.content or not doc.content.strip(): # avoid counting whitespace as content if 
self.skip_empty_documents: logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id) continue diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 440135bc14..3efa22fd89 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -40,45 +40,35 @@ def test_basic_split(sample_text): # Check that content is present and correct # Test first split header1_doc = split_docs[0] - # assert header1_doc.meta["header"] == "Header 1" assert header1_doc.meta["split_id"] == 0 assert header1_doc.meta["page_number"] == 1 - # assert header1_doc.meta["parent_headers"] == [] assert header1_doc.content == "# Header 1\nContent under header 1.\n" # Test second split subheader111_doc = split_docs[1] - # assert subheader111_doc.meta["header"] == "Subheader 1.1.1" assert subheader111_doc.meta["split_id"] == 1 assert subheader111_doc.meta["page_number"] == 1 - # assert subheader111_doc.meta["parent_headers"] == ["Header 1", "Header 1.1"] assert subheader111_doc.content == "## Header 1.1\n### Subheader 1.1.1\nContent under sub-header 1.1.1\n" # Test third split subheader121_doc = split_docs[2] - # assert subheader121_doc.meta["header"] == "Subheader 1.2.1" assert subheader121_doc.meta["split_id"] == 2 assert subheader121_doc.meta["page_number"] == 1 - # assert subheader121_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] assert subheader121_doc.content == "## Header 1.2\n### Subheader 1.2.1\nContent under header 1.2.1.\n" # Test fourth split subheader122_doc = split_docs[3] - # assert subheader122_doc.meta["header"] == "Subheader 1.2.2" assert subheader122_doc.meta["split_id"] == 3 assert subheader122_doc.meta["page_number"] == 1 - # assert subheader122_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] assert subheader122_doc.content == "### Subheader 
1.2.2\nContent under header 1.2.2.\n" # Test fifth split subheader123_doc = split_docs[4] - # assert subheader123_doc.meta["header"] == "Subheader 1.2.3" assert subheader123_doc.meta["split_id"] == 4 assert subheader123_doc.meta["page_number"] == 1 - # assert subheader123_doc.meta["parent_headers"] == ["Header 1", "Header 1.2"] assert subheader123_doc.content == "### Subheader 1.2.3\nContent under header 1.2.3." - # Sanity check: reconstruct original text + # Reconstruct original text reconstructed_doc = "".join([doc.content for doc in split_docs]) assert reconstructed_doc == sample_text @@ -299,7 +289,7 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text): split_ids = [doc.meta["split_id"] for doc in split_docs] assert split_ids == list(range(len(split_ids))) - # Test with multiple input documents - each should have its own split_id sequence + # Test with multiple input documents; each should have its own split_id sequence splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3) # Use fresh instance docs = [Document(content=sample_text), Document(content="# Another Header\nSome more content here.")] result = splitter.run(documents=docs) @@ -383,13 +373,13 @@ def test_secondary_split_with_threshold(): def test_page_break_handling_in_secondary_split(): - text = "# Header\nFirst page\fSecond page\fThird page" + text = "# Header\nFirst page\f Second page\f Third page" splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=1) docs = [Document(content=text)] result = splitter.run(documents=docs) split_docs = result["documents"] - # Explicitly check the page number of each split - expected_page_numbers = [1, 1, 1, 2, 3] + + expected_page_numbers = [1, 1, 1, 2, 2, 3, 3] actual_page_numbers = [doc.meta.get("page_number") for doc in split_docs] assert actual_page_numbers == expected_page_numbers @@ -430,6 +420,6 @@ def test_page_break_handling_with_multiple_headers(): assert split_docs[6].content == "page" assert 
split_docs[6].meta == {"source_id": ANY, "page_number": 3, "split_id": 6, "split_idx_start": 40} - # Check reconstruction + # Reconstruct original text reconstructed_text = "".join(doc.content for doc in split_docs) assert reconstructed_text == text