mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-04 12:18:24 +00:00
Fix bug in MarkdownHeaderTextSplitter for codeblock (#10262)
- Description: The previous version of MarkdownHeaderTextSplitter did not account for '#' appearing inside code blocks, so such lines could be treated as headers and the text was split incorrectly. This PR fixes the issue by tracking whether the current line is inside a fenced (```) code block. - Issue: - Dependencies: No - Tag maintainer: - Twitter handle: cc @baskaryan @eyurtsev @rlancemartin --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
656d2303f7
commit
f1269830a0
@ -390,8 +390,22 @@ class MarkdownHeaderTextSplitter:
|
|||||||
header_stack: List[HeaderType] = []
|
header_stack: List[HeaderType] = []
|
||||||
initial_metadata: Dict[str, str] = {}
|
initial_metadata: Dict[str, str] = {}
|
||||||
|
|
||||||
|
in_code_block = False
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
stripped_line = line.strip()
|
stripped_line = line.strip()
|
||||||
|
|
||||||
|
if stripped_line.startswith("```"):
|
||||||
|
# code block in one row
|
||||||
|
if stripped_line.count("```") >= 2:
|
||||||
|
in_code_block = False
|
||||||
|
else:
|
||||||
|
in_code_block = not in_code_block
|
||||||
|
|
||||||
|
if in_code_block:
|
||||||
|
current_content.append(stripped_line)
|
||||||
|
continue
|
||||||
|
|
||||||
# Check each line against each of the header types (e.g., #, ##)
|
# Check each line against each of the header types (e.g., #, ##)
|
||||||
for sep, name in self.headers_to_split_on:
|
for sep, name in self.headers_to_split_on:
|
||||||
# Check if line starts with a header that we intend to split on
|
# Check if line starts with a header that we intend to split on
|
||||||
|
@ -783,6 +783,10 @@ ____________
|
|||||||
#### Code blocks
|
#### Code blocks
|
||||||
```
|
```
|
||||||
This is a code block
|
This is a code block
|
||||||
|
|
||||||
|
# sample code
|
||||||
|
a = 1
|
||||||
|
b = 2
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
chunks = splitter.split_text(code)
|
chunks = splitter.split_text(code)
|
||||||
@ -808,6 +812,8 @@ This is a code block
|
|||||||
"```",
|
"```",
|
||||||
"This is a code",
|
"This is a code",
|
||||||
"block",
|
"block",
|
||||||
|
"# sample code",
|
||||||
|
"a = 1\nb = 2",
|
||||||
"```",
|
"```",
|
||||||
]
|
]
|
||||||
# Special test for special characters
|
# Special test for special characters
|
||||||
|
Loading…
Reference in New Issue
Block a user