From e61c80247205d33e159e94a906230f90fb953739 Mon Sep 17 00:00:00 2001 From: Venrite Date: Sat, 26 Jul 2025 18:18:34 -0700 Subject: [PATCH] Fix: improved start_index logic in TextSplitter when called with tokens --- libs/text-splitters/langchain_text_splitters/base.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/libs/text-splitters/langchain_text_splitters/base.py b/libs/text-splitters/langchain_text_splitters/base.py index 891f988d448..e479743ea5d 100644 --- a/libs/text-splitters/langchain_text_splitters/base.py +++ b/libs/text-splitters/langchain_text_splitters/base.py @@ -79,11 +79,20 @@ class TextSplitter(BaseDocumentTransformer, ABC): for i, text in enumerate(texts): index = 0 previous_chunk_len = 0 + not_found = -1 + retries = 1 for chunk in self.split_text(text): metadata = copy.deepcopy(_metadatas[i]) if self._add_start_index: + search_boundary = len(chunk) + 1 offset = index + previous_chunk_len - self._chunk_overlap - index = text.find(chunk, max(0, offset)) + prev_index = index + index = text.find(chunk, max(0, offset), max(0,offset) + search_boundary) + if index == not_found: + for token_offset in range(0,retries): + index = text.find(chunk, prev_index + token_offset) + if index is prev_index: + continue metadata["start_index"] = index previous_chunk_len = len(chunk) new_doc = Document(page_content=chunk, metadata=metadata)