From e61c80247205d33e159e94a906230f90fb953739 Mon Sep 17 00:00:00 2001 From: Venrite Date: Sat, 26 Jul 2025 18:18:34 -0700 Subject: [PATCH 1/5] Fix: improved start_index logic in TextSplitter when called with tokens --- libs/text-splitters/langchain_text_splitters/base.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/libs/text-splitters/langchain_text_splitters/base.py b/libs/text-splitters/langchain_text_splitters/base.py index 891f988d448..e479743ea5d 100644 --- a/libs/text-splitters/langchain_text_splitters/base.py +++ b/libs/text-splitters/langchain_text_splitters/base.py @@ -79,11 +79,20 @@ class TextSplitter(BaseDocumentTransformer, ABC): for i, text in enumerate(texts): index = 0 previous_chunk_len = 0 + not_found = -1 + retries = 1 for chunk in self.split_text(text): metadata = copy.deepcopy(_metadatas[i]) if self._add_start_index: + search_boundary = len(chunk) + 1 offset = index + previous_chunk_len - self._chunk_overlap - index = text.find(chunk, max(0, offset)) + prev_index = index + index = text.find(chunk, max(0, offset), max(0,offset) + search_boundary) + if index == not_found: + for token_offset in range(0,retries): + index = text.find(chunk, prev_index + token_offset) + if index is prev_index: + continue metadata["start_index"] = index previous_chunk_len = len(chunk) new_doc = Document(page_content=chunk, metadata=metadata) From 74af25e2c1ac6cbada88085bcb3deba663e905d4 Mon Sep 17 00:00:00 2001 From: Venrite Date: Mon, 28 Jul 2025 15:45:22 -0700 Subject: [PATCH 2/5] undoing previous work --- .../langchain_text_splitters/base.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/base.py b/libs/text-splitters/langchain_text_splitters/base.py index e479743ea5d..2f3891f64f3 100644 --- a/libs/text-splitters/langchain_text_splitters/base.py +++ b/libs/text-splitters/langchain_text_splitters/base.py @@ -79,20 +79,11 @@ class TextSplitter(BaseDocumentTransformer, ABC): for i, text in enumerate(texts): index = 0 previous_chunk_len = 0 - not_found = -1 - retries = 1 for chunk in self.split_text(text): metadata = copy.deepcopy(_metadatas[i]) if self._add_start_index: - search_boundary = len(chunk) + 1 offset = index + previous_chunk_len - self._chunk_overlap - prev_index = index - index = text.find(chunk, max(0, offset), max(0,offset) + search_boundary) - if index == not_found: - for token_offset in range(0,retries): - index = text.find(chunk, prev_index + token_offset) - if index is prev_index: - continue + index = text.find(chunk, max(0, offset)) metadata["start_index"] = index previous_chunk_len = len(chunk) new_doc = Document(page_content=chunk, metadata=metadata) @@ -234,7 +225,6 @@ class TextSplitter(BaseDocumentTransformer, ABC): class TokenTextSplitter(TextSplitter): """Splitting text to tokens using model tokenizer.""" - def __init__( self, encoding_name: str = "gpt2", @@ -294,8 +284,7 @@ class TokenTextSplitter(TextSplitter): ) return split_text_on_tokens(text=text, tokenizer=tokenizer) - - + class Language(str, Enum): """Enum of the programming languages.""" @@ -357,3 +346,4 @@ def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> list[str]: cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids)) chunk_ids = input_ids[start_idx:cur_idx] return splits + From a997a90b86e84b6c26a4c55743459dcc583e9b4f Mon Sep 17 00:00:00 2001 From: Venrite Date: Mon, 28 Jul 2025 16:41:09 -0700 Subject: [PATCH 3/5] Override of create_documents for 
TokenTextSplitter --- .../langchain_text_splitters/base.py | 27 +++++++++++++ .../tests/unit_tests/test_text_splitters.py | 38 +++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/libs/text-splitters/langchain_text_splitters/base.py b/libs/text-splitters/langchain_text_splitters/base.py index 2f3891f64f3..7ff118f9a3f 100644 --- a/libs/text-splitters/langchain_text_splitters/base.py +++ b/libs/text-splitters/langchain_text_splitters/base.py @@ -1,4 +1,5 @@ from __future__ import annotations +import os import copy import logging @@ -74,6 +75,8 @@ class TextSplitter(BaseDocumentTransformer, ABC): self, texts: list[str], metadatas: Optional[list[dict[Any, Any]]] = None ) -> list[Document]: """Create documents from a list of texts.""" + if isinstance(self,TokenTextSplitter): + return self.token_create_documents(texts,metadatas) _metadatas = metadatas or [{}] * len(texts) documents = [] for i, text in enumerate(texts): @@ -285,6 +288,30 @@ class TokenTextSplitter(TextSplitter): return split_text_on_tokens(text=text, tokenizer=tokenizer) + def create_documents( + self, texts: list[str], metadatas: Optional[list[dict[Any, Any]]] = None +) -> list[Document]: + """Override to create documents from a list of tokens.""" + _metadatas = metadatas or [{}] * len(texts) + documents = [] + for i, text in enumerate(texts): + metadata = _metadatas[i] + input_ids = self._tokenizer.encode(text) + start_idx = 0 + char_index = 0 + while start_idx < len(input_ids): + end_idx = min(start_idx + self._chunk_size, len(input_ids)) + chunk_ids = input_ids[start_idx:end_idx] + chunk_text = self._tokenizer.decode(chunk_ids) + if self._add_start_index: + char_index = text.find(chunk_text, char_index) + metadata["start_index"] = char_index + documents.append(Document(page_content=chunk_text,metadata=metadata)) + if end_idx == len(input_ids): + break + start_idx += self._chunk_size - self._chunk_overlap + return documents + class Language(str, Enum): """Enum of the programming languages.""" diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 0d72e806309..683a57b026d 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -13,6 +13,7 @@ from langchain_text_splitters import ( RecursiveCharacterTextSplitter, TextSplitter, Tokenizer, + TokenTextSplitter ) from langchain_text_splitters.base import split_text_on_tokens from langchain_text_splitters.character import CharacterTextSplitter @@ -3666,3 +3667,40 @@ def test_character_text_splitter_chunk_size_effect( keep_separator=False, ) assert splitter.split_text(text) == expected + +def test_token_splitter_create_documents() -> None: + splitter = TokenTextSplitter( + add_start_index=True, + chunk_size=10, + chunk_overlap=5 + ) + text=""" + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, + sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. + Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. + Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. + Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 
+ """ + docs = splitter.create_documents([text]) + for doc in docs: + s_i = doc.metadata["start_index"] + assert text[s_i : s_i + len(doc.page_content)] == doc.page_content + +def test_token_splitter_create_documents_repeat_text() -> None: + splitter = TokenTextSplitter( + add_start_index=True, + chunk_size=10, + chunk_overlap=5 + ) + text=""" + "the quick brown fox jumped over the lazy fox + the quick brown fox jumped over the lazy fox + the quick brown fox jumped over the lazy fox + the quick brown fox jumped over the lazy fox + the quick brown fox jumped over the lazy fox" + """ + docs = splitter.create_documents([text]) + for doc in docs: + s_i = doc.metadata["start_index"] + assert text[s_i : s_i + len(doc.page_content)] == doc.page_content + From d50965f7b285d8ee62ecd3e23a8490025e71c74b Mon Sep 17 00:00:00 2001 From: Venrite Date: Mon, 28 Jul 2025 16:52:36 -0700 Subject: [PATCH 4/5] Override of create_documents for TokenTextSplitter, removed unused import --- libs/text-splitters/langchain_text_splitters/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/text-splitters/langchain_text_splitters/base.py b/libs/text-splitters/langchain_text_splitters/base.py index 7ff118f9a3f..6f513071c1f 100644 --- a/libs/text-splitters/langchain_text_splitters/base.py +++ b/libs/text-splitters/langchain_text_splitters/base.py @@ -1,5 +1,4 @@ from __future__ import annotations -import os import copy import logging From 68bede3e249e4310f37e08ff82d82758d28e6e0a Mon Sep 17 00:00:00 2001 From: Venrite Date: Mon, 28 Jul 2025 17:07:31 -0700 Subject: [PATCH 5/5] Running make lint, make format, and make test --- .../langchain_text_splitters/base.py | 19 ++++++++--------- .../tests/unit_tests/test_text_splitters.py | 21 +++++++------------ 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/base.py b/libs/text-splitters/langchain_text_splitters/base.py index 6f513071c1f..2150d834085 100644 --- a/libs/text-splitters/langchain_text_splitters/base.py +++ b/libs/text-splitters/langchain_text_splitters/base.py @@ -74,8 +74,6 @@ class TextSplitter(BaseDocumentTransformer, ABC): self, texts: list[str], metadatas: Optional[list[dict[Any, Any]]] = None ) -> list[Document]: """Create documents from a list of texts.""" - if isinstance(self,TokenTextSplitter): - return self.token_create_documents(texts,metadatas) _metadatas = metadatas or [{}] * len(texts) documents = [] for i, text in enumerate(texts): @@ -227,6 +225,7 @@ class TextSplitter(BaseDocumentTransformer, ABC): class TokenTextSplitter(TextSplitter): """Splitting text to tokens using model tokenizer.""" + def __init__( self, encoding_name: str = "gpt2", @@ -286,10 +285,10 @@ class TokenTextSplitter(TextSplitter): ) return split_text_on_tokens(text=text, tokenizer=tokenizer) - + def create_documents( - self, texts: list[str], metadatas: Optional[list[dict[Any, Any]]] = None -) -> list[Document]: + self, texts: list[str], metadatas: Optional[list[dict[Any, Any]]] = None + ) -> list[Document]: """Override to create documents from a list of tokens.""" _metadatas = metadatas or [{}] * len(texts) documents = [] @@ -299,18 +298,19 @@ class TokenTextSplitter(TextSplitter): start_idx = 0 char_index = 0 while start_idx < len(input_ids): - end_idx = min(start_idx + self._chunk_size, len(input_ids)) - chunk_ids = input_ids[start_idx:end_idx] + end_idx = min(start_idx + self._chunk_size, len(input_ids)) + chunk_ids = input_ids[start_idx:end_idx] chunk_text = self._tokenizer.decode(chunk_ids) if 
self._add_start_index: char_index = text.find(chunk_text, char_index) metadata["start_index"] = char_index - documents.append(Document(page_content=chunk_text,metadata=metadata)) + documents.append(Document(page_content=chunk_text, metadata=metadata)) if end_idx == len(input_ids): break start_idx += self._chunk_size - self._chunk_overlap return documents - + + class Language(str, Enum): """Enum of the programming languages.""" @@ -372,4 +372,3 @@ def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> list[str]: cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids)) chunk_ids = input_ids[start_idx:cur_idx] return splits - diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 683a57b026d..73208b7f1ac 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -13,7 +13,7 @@ from langchain_text_splitters import ( RecursiveCharacterTextSplitter, TextSplitter, Tokenizer, - TokenTextSplitter + TokenTextSplitter, ) from langchain_text_splitters.base import split_text_on_tokens from langchain_text_splitters.character import CharacterTextSplitter @@ -3668,13 +3668,10 @@ def test_character_text_splitter_chunk_size_effect( ) assert splitter.split_text(text) == expected + def test_token_splitter_create_documents() -> None: - splitter = TokenTextSplitter( - add_start_index=True, - chunk_size=10, - chunk_overlap=5 - ) - text=""" + splitter = TokenTextSplitter(add_start_index=True, chunk_size=10, chunk_overlap=5) + text = """ "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. @@ -3686,13 +3683,10 @@ def test_token_splitter_create_documents() -> None: s_i = doc.metadata["start_index"] assert text[s_i : s_i + len(doc.page_content)] == doc.page_content + def test_token_splitter_create_documents_repeat_text() -> None: - splitter = TokenTextSplitter( - add_start_index=True, - chunk_size=10, - chunk_overlap=5 - ) - text=""" + splitter = TokenTextSplitter(add_start_index=True,chunk_size=10,chunk_overlap=5) + text = """ "the quick brown fox jumped over the lazy fox the quick brown fox jumped over the lazy fox the quick brown fox jumped over the lazy fox @@ -3703,4 +3697,3 @@ def test_token_splitter_create_documents_repeat_text() -> None: for doc in docs: s_i = doc.metadata["start_index"] assert text[s_i : s_i + len(doc.page_content)] == doc.page_content -
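For reviewers, a minimal usage sketch of the behavior this series is meant to guarantee (illustration only, not part of the patches; it assumes `tiktoken` is installed for TokenTextSplitter's default "gpt2" encoding):

# Mirrors the repeated-text unit test added in PATCH 3/5.
from langchain_text_splitters import TokenTextSplitter

splitter = TokenTextSplitter(add_start_index=True, chunk_size=10, chunk_overlap=5)
text = "the quick brown fox jumped over the lazy fox " * 5

docs = splitter.create_documents([text])
for doc in docs:
    start = doc.metadata["start_index"]
    # With the create_documents override, start_index is the character offset of
    # each token-based chunk, so slicing the source text reproduces the chunk.
    assert text[start : start + len(doc.page_content)] == doc.page_content

The motivation, as I read the series: the base-class implementation estimates the next search offset as index + previous_chunk_len - chunk_overlap, but chunk_overlap is a token count being subtracted from a character position, so the text.find hint can point past the chunk's true start and match a later occurrence (or return -1) on repetitive text. The override instead searches forward from the previous chunk's found position, which is what the two new unit tests assert.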