mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-07 22:11:51 +00:00
text-splitters: fix stale header metadata in ExperimentalMarkdownSyntaxTextSplitter (#31622)
**Description:** Previously, when transitioning from a deeper Markdown header (e.g., `###`) to a shallower one (e.g., `##`), the `ExperimentalMarkdownSyntaxTextSplitter` retained the deeper header in the metadata. This commit updates the `_resolve_header_stack` method to remove headers at the same or deeper levels before appending the current header. As a result, each chunk now reflects only the active header context. This fixes unexpected metadata leakage across sections in nested Markdown documents.

Additionally, test cases have been updated to:
- Validate correct header resolution and metadata assignment.
- Cover edge cases with nested headers and horizontal rules.

**Issue:** Fixes [#31596](https://github.com/langchain-ai/langchain/issues/31596)

**Dependencies:** None

**Twitter handle:** [_RaghuKapur](https://twitter.com/_RaghuKapur)

**LinkedIn:** [https://www.linkedin.com/in/raghukapur/](https://www.linkedin.com/in/raghukapur/)
This commit is contained in:
@@ -376,10 +376,10 @@ class ExperimentalMarkdownSyntaxTextSplitter:
|
||||
|
||||
def _resolve_header_stack(self, header_depth: int, header_text: str) -> None:
    """Update ``self.current_header_stack`` for a newly encountered header.

    The stack holds ``(depth, text)`` tuples. The first entry whose depth is
    greater than or equal to ``header_depth``, together with every entry
    after it, is dropped; ``(header_depth, header_text)`` is then appended.
    A header therefore replaces any sibling at its own depth and discards
    deeper headers left over from a previous section, so stale header
    context does not leak into later chunks.

    Args:
        header_depth: Depth of the header being entered (per the commit
            description, the Markdown level: ``##`` is shallower than
            ``###``).
        header_text: Text of the header, stored alongside its depth.
    """
    for i, (depth, _) in enumerate(self.current_header_stack):
        if depth >= header_depth:
            # Truncate everything from this level onward. An equal-depth
            # entry needs no special case: it is removed here and the new
            # header is appended in its place below.
            self.current_header_stack = self.current_header_stack[:i]
            break
    self.current_header_stack.append((header_depth, header_text))
|
||||
|
||||
def _resolve_code_chunk(self, current_line: str, raw_lines: List[str]) -> str:
|
||||
|
Reference in New Issue
Block a user