patch[experimental] Fix start_index in SemanticChunker (#24761)

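- Because chunks are built by joining sentences with a space, they often cannot be found verbatim in the original text, so the `start_index` computed with `text.find` is very likely to be -1.
- The simplest fix is to use the running offset of each chunk (the cumulative length of the preceding chunks) as its `start_index`.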
2025-08-20 09:57:32 +00:00 · 2024-08-23 02:59:40 +08:00 · 2024-08-23 02:59:40 +08:00 · 8bde04079b
commit 8bde04079b
parent 6fbd53bc60
1 changed files with 3 additions and 3 deletions
--- a/libs/experimental/langchain_experimental/text_splitter.py
+++ b/libs/experimental/langchain_experimental/text_splitter.py
@ -262,14 +262,14 @@ class SemanticChunker(BaseDocumentTransformer):
        _metadatas = metadatas or [{}] * len(texts)
        documents = []
        for i, text in enumerate(texts):
-            index = -1
+            start_index = 0
            for chunk in self.split_text(text):
                metadata = copy.deepcopy(_metadatas[i])
                if self._add_start_index:
-                    index = text.find(chunk, index + 1)
-                    metadata["start_index"] = index
+                    metadata["start_index"] = start_index
                new_doc = Document(page_content=chunk, metadata=metadata)
                documents.append(new_doc)
+                start_index += len(chunk)
        return documents

    def split_documents(self, documents: Iterable[Document]) -> List[Document]: