Fix: improved start_index logic in TextSplitter when called with tokens

2025-08-03 02:06:33 +00:00 · 2025-07-26 18:18:34 -07:00 · 2025-07-26 18:18:34 -07:00 · e61c802472
commit e61c802472
parent 5ecbb5f277
1 changed file with 10 additions and 1 deletion
--- a/libs/text-splitters/langchain_text_splitters/base.py
+++ b/libs/text-splitters/langchain_text_splitters/base.py
@@ -79,11 +79,20 @@ class TextSplitter(BaseDocumentTransformer, ABC):
        for i, text in enumerate(texts):
            index = 0
            previous_chunk_len = 0
            not_found = -1
            retries = 1
            for chunk in self.split_text(text):
                metadata = copy.deepcopy(_metadatas[i])
                if self._add_start_index:
                    search_boundary = len(chunk) + 1 
                    offset = index + previous_chunk_len - self._chunk_overlap
-                    index = text.find(chunk, max(0, offset))
+                    prev_index = index
                    index = text.find(chunk, max(0, offset), max(0,offset) + search_boundary)
                    if index == not_found:
                        for token_offset in range(0,retries):
                            index = text.find(chunk, prev_index + token_offset)
                            if index is prev_index:
                                continue
                    metadata["start_index"] = index
                    previous_chunk_len = len(chunk)
                new_doc = Document(page_content=chunk, metadata=metadata)