Mirror of https://github.com/hwchase17/langchain.git (synced 2025-07-01 19:03:25 +00:00)
Fix token text splitter duplicates (#14848)
- **Description:**
  - Add a break case to `text_splitter.py::split_text_on_tokens()` to avoid an unwanted extra item at the end of the result.
  - Add a test case to enforce the behavior.
- **Issue:**
  - #14649
  - #5897
- **Dependencies:** n/a

---

**Quick illustration of change:**

```
text = "foo bar baz 123"
tokenizer = Tokenizer(
    chunk_overlap=3,
    tokens_per_chunk=7
)
output = split_text_on_tokens(text=text, tokenizer=tokenizer)
```

Output before change: `["foo bar", "bar baz", "baz 123", "123"]`
Output after change: `["foo bar", "bar baz", "baz 123"]`
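To make the duplicate concrete, here is a minimal, self-contained sketch (a standalone re-implementation of the old loop for illustration, not the library code itself) that reproduces the trailing chunk from the example above, using the same character-level tokenizer as the new test:

```python
from typing import List


def old_split(ids: List[int], tokens_per_chunk: int, chunk_overlap: int) -> List[str]:
    """Pre-fix loop: keeps windowing even after cur_idx has reached the end."""
    splits: List[str] = []
    start_idx = 0
    cur_idx = min(start_idx + tokens_per_chunk, len(ids))
    while start_idx < len(ids):
        splits.append("".join(chr(i) for i in ids[start_idx:cur_idx]))
        # No `break` here: after "baz 123" ends exactly at len(ids), the loop
        # still advances start_idx by tokens_per_chunk - chunk_overlap (= 4)
        # and emits the leftover overlap "123" as an extra chunk.
        start_idx += tokens_per_chunk - chunk_overlap
        cur_idx = min(start_idx + tokens_per_chunk, len(ids))
    return splits


ids = [ord(c) for c in "foo bar baz 123"]  # 15 character-level "tokens"
print(old_split(ids, tokens_per_chunk=7, chunk_overlap=3))
# ['foo bar', 'bar baz', 'baz 123', '123']
```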
This commit is contained in: parent 14d04180eb, commit ea331f3136
`text_splitter.py`:

```diff
@@ -670,6 +670,8 @@ def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]:
     chunk_ids = input_ids[start_idx:cur_idx]
     while start_idx < len(input_ids):
         splits.append(tokenizer.decode(chunk_ids))
+        if cur_idx == len(input_ids):
+            break
         start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap
         cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
         chunk_ids = input_ids[start_idx:cur_idx]
```
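For readers without the surrounding file, here is a self-contained sketch of the patched function. The lines outside the hunk and the `Tokenizer` shape are assumptions inferred from the variables the hunk references and from the test below, not quoted from the repository:

```python
from dataclasses import dataclass
from typing import Callable, List


@dataclass(frozen=True)
class Tokenizer:
    """Assumed shape of the Tokenizer helper, inferred from the test below."""

    chunk_overlap: int
    tokens_per_chunk: int
    decode: Callable[[List[int]], str]
    encode: Callable[[str], List[int]]


def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]:
    """Split text into windows of tokens_per_chunk tokens with chunk_overlap overlap."""
    splits: List[str] = []
    input_ids = tokenizer.encode(text)
    start_idx = 0
    cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
    chunk_ids = input_ids[start_idx:cur_idx]
    while start_idx < len(input_ids):
        splits.append(tokenizer.decode(chunk_ids))
        if cur_idx == len(input_ids):
            # New in this commit: the current window already ends at the last
            # token, so stop instead of emitting an overlap-only duplicate.
            break
        start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap
        cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
        chunk_ids = input_ids[start_idx:cur_idx]
    return splits
```

Run against the illustration's input, this returns `['foo bar', 'bar baz', 'baz 123']` with no trailing overlap chunk.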
Text splitter tests (imports):

```diff
@@ -13,6 +13,8 @@ from langchain.text_splitter import (
     MarkdownHeaderTextSplitter,
     PythonCodeTextSplitter,
     RecursiveCharacterTextSplitter,
+    Tokenizer,
+    split_text_on_tokens,
 )
 
 FAKE_PYTHON_TEXT = """
```
Text splitter tests (new test case):

```diff
@@ -1175,3 +1177,18 @@ def test_html_header_text_splitter(tmp_path: Path) -> None:
     docs_from_file = splitter.split_text_from_file(tmp_path / "doc.html")
 
     assert docs_from_file == expected
+
+
+def test_split_text_on_tokens() -> None:
+    """Test splitting by tokens per chunk."""
+    text = "foo bar baz 123"
+
+    tokenizer = Tokenizer(
+        chunk_overlap=3,
+        tokens_per_chunk=7,
+        decode=(lambda it: "".join(chr(i) for i in it)),
+        encode=(lambda it: [ord(c) for c in it]),
+    )
+    output = split_text_on_tokens(text=text, tokenizer=tokenizer)
+    expected_output = ["foo bar", "bar baz", "baz 123"]
+    assert output == expected_output
```
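The test drives the splitter with a character-level encoder. Below is a hypothetical usage sketch with a real BPE tokenizer; it assumes the tiktoken package is available, the `Tokenizer` fields mirror the construction in the test above, and the import path matches the hunk's `from langchain.text_splitter import ...`:

```python
import tiktoken

from langchain.text_splitter import Tokenizer, split_text_on_tokens

enc = tiktoken.get_encoding("cl100k_base")
tokenizer = Tokenizer(
    chunk_overlap=10,
    tokens_per_chunk=100,
    decode=enc.decode,
    encode=lambda text: enc.encode(text),
)

chunks = split_text_on_tokens(text="some long document ...", tokenizer=tokenizer)
# With the fix, the final chunk is no longer followed by a short overlap-only duplicate.
```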