mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-04 20:28:10 +00:00
Fix bug in MarkdownHeaderTextSplitter for codeblock (#10262)
- Description: The previous version of MarkdownHeaderTextSplitter did not account for '#' appearing inside fenced code blocks, so a line such as a '# comment' within a code block was treated as a Markdown header and the text was split incorrectly. This PR fixes the issue by tracking whether the current line is inside a code block and, if so, appending it to the current content without checking it against the headers. - Issue: - Dependencies: No - Tag maintainer: - Twitter handle: cc @baskaryan @eyurtsev @rlancemartin --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
656d2303f7
commit
f1269830a0
@ -390,8 +390,22 @@ class MarkdownHeaderTextSplitter:
|
||||
header_stack: List[HeaderType] = []
|
||||
initial_metadata: Dict[str, str] = {}
|
||||
|
||||
in_code_block = False
|
||||
|
||||
for line in lines:
|
||||
stripped_line = line.strip()
|
||||
|
||||
if stripped_line.startswith("```"):
|
||||
# code block in one row
|
||||
if stripped_line.count("```") >= 2:
|
||||
in_code_block = False
|
||||
else:
|
||||
in_code_block = not in_code_block
|
||||
|
||||
if in_code_block:
|
||||
current_content.append(stripped_line)
|
||||
continue
|
||||
|
||||
# Check each line against each of the header types (e.g., #, ##)
|
||||
for sep, name in self.headers_to_split_on:
|
||||
# Check if line starts with a header that we intend to split on
|
||||
|
@ -783,6 +783,10 @@ ____________
|
||||
#### Code blocks
|
||||
```
|
||||
This is a code block
|
||||
|
||||
# sample code
|
||||
a = 1
|
||||
b = 2
|
||||
```
|
||||
"""
|
||||
chunks = splitter.split_text(code)
|
||||
@ -808,6 +812,8 @@ This is a code block
|
||||
"```",
|
||||
"This is a code",
|
||||
"block",
|
||||
"# sample code",
|
||||
"a = 1\nb = 2",
|
||||
"```",
|
||||
]
|
||||
# Special test for special characters
|
||||
|
Loading…
Reference in New Issue
Block a user