Fix token text splitter duplicates (#14848)
- **Description:**
  - Add a break case to `text_splitter.py::split_text_on_tokens()` to avoid an unwanted item at the end of the result.
  - Add a test case to enforce the behavior.
- **Issue:**
  - #14649
  - #5897
- **Dependencies:** n/a

---

**Quick illustration of change:**

```
text = "foo bar baz 123"
tokenizer = Tokenizer(
    chunk_overlap=3,
    tokens_per_chunk=7
)
output = split_text_on_tokens(text=text, tokenizer=tokenizer)
```

output before change: `["foo bar", "bar baz", "baz 123", "123"]`
output after change: `["foo bar", "bar baz", "baz 123"]`
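Note that `Tokenizer` also requires `decode`/`encode` callables, which the quick illustration above leaves out; filling them in with the same character-level codec used by the new unit test makes the illustration runnable end to end:

```python
from langchain.text_splitter import Tokenizer, split_text_on_tokens

text = "foo bar baz 123"

# Character-level "tokenizer": every character is one token id.
tokenizer = Tokenizer(
    chunk_overlap=3,
    tokens_per_chunk=7,
    decode=lambda ids: "".join(chr(i) for i in ids),
    encode=lambda s: [ord(c) for c in s],
)

output = split_text_on_tokens(text=text, tokenizer=tokenizer)
print(output)
# before this fix: ['foo bar', 'bar baz', 'baz 123', '123']
# after this fix:  ['foo bar', 'bar baz', 'baz 123']
```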
parent 14d04180eb
commit ea331f3136
```diff
@@ -670,6 +670,8 @@ def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]:
     chunk_ids = input_ids[start_idx:cur_idx]
     while start_idx < len(input_ids):
         splits.append(tokenizer.decode(chunk_ids))
+        if cur_idx == len(input_ids):
+            break
         start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap
         cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
         chunk_ids = input_ids[start_idx:cur_idx]
```
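For context around the two added lines, the loop reads roughly as follows after the patch; this is a standalone sketch, and the setup lines above the hunk are reconstructed rather than copied from the file:

```python
from typing import List

from langchain.text_splitter import Tokenizer


def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]:
    """Split incoming text and return chunks using the tokenizer."""
    splits: List[str] = []
    input_ids = tokenizer.encode(text)
    start_idx = 0
    cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
    chunk_ids = input_ids[start_idx:cur_idx]
    while start_idx < len(input_ids):
        splits.append(tokenizer.decode(chunk_ids))
        if cur_idx == len(input_ids):
            # The current window already reaches the end of the input, so the
            # text is fully covered; without this break the loop would slide
            # once more and emit a trailing, overlap-only chunk.
            break
        start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap
        cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
        chunk_ids = input_ids[start_idx:cur_idx]
    return splits
```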
```diff
@@ -13,6 +13,8 @@ from langchain.text_splitter import (
     MarkdownHeaderTextSplitter,
     PythonCodeTextSplitter,
     RecursiveCharacterTextSplitter,
+    Tokenizer,
+    split_text_on_tokens,
 )
 
 FAKE_PYTHON_TEXT = """
```
```diff
@@ -1175,3 +1177,18 @@ def test_html_header_text_splitter(tmp_path: Path) -> None:
     docs_from_file = splitter.split_text_from_file(tmp_path / "doc.html")
 
     assert docs_from_file == expected
+
+
+def test_split_text_on_tokens() -> None:
+    """Test splitting by tokens per chunk."""
+    text = "foo bar baz 123"
+
+    tokenizer = Tokenizer(
+        chunk_overlap=3,
+        tokens_per_chunk=7,
+        decode=(lambda it: "".join(chr(i) for i in it)),
+        encode=(lambda it: [ord(c) for c in it]),
+    )
+    output = split_text_on_tokens(text=text, tokenizer=tokenizer)
+    expected_output = ["foo bar", "bar baz", "baz 123"]
+    assert output == expected_output
```
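Why those three chunks are the expected output: with the character-level codec the input encodes to 15 token ids, the window is `tokens_per_chunk = 7` wide, and the stride is `tokens_per_chunk - chunk_overlap = 4`, so the windows are `[0:7]`, `[4:11]`, and `[8:15]`; the last window ends exactly at `len(input_ids)`, which is the condition the new `break` checks. A small standalone trace of that arithmetic (plain strings, not the library code):

```python
# Reproduce the window arithmetic behind the expected chunks in the test above.
text = "foo bar baz 123"                    # 15 characters -> 15 token ids
tokens_per_chunk, chunk_overlap = 7, 3
stride = tokens_per_chunk - chunk_overlap   # 4

start, n = 0, len(text)
windows = []
while start < n:
    end = min(start + tokens_per_chunk, n)
    windows.append((start, end, text[start:end]))
    if end == n:  # same condition the patch adds: stop at the final window
        break
    start += stride

print(windows)
# [(0, 7, 'foo bar'), (4, 11, 'bar baz'), (8, 15, 'baz 123')]
```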