diff --git a/libs/experimental/langchain_experimental/text_splitter.py b/libs/experimental/langchain_experimental/text_splitter.py index be3a795b06a..2151a0e5ce9 100644 --- a/libs/experimental/langchain_experimental/text_splitter.py +++ b/libs/experimental/langchain_experimental/text_splitter.py @@ -262,14 +262,14 @@ class SemanticChunker(BaseDocumentTransformer): _metadatas = metadatas or [{}] * len(texts) documents = [] for i, text in enumerate(texts): - index = -1 + start_index = 0 for chunk in self.split_text(text): metadata = copy.deepcopy(_metadatas[i]) if self._add_start_index: - index = text.find(chunk, index + 1) - metadata["start_index"] = index + metadata["start_index"] = start_index new_doc = Document(page_content=chunk, metadata=metadata) documents.append(new_doc) + start_index += len(chunk) return documents def split_documents(self, documents: Iterable[Document]) -> List[Document]: