From 8bde04079b26379b874a10fa26c954192bf2e0fc Mon Sep 17 00:00:00 2001
From: ZhangShenao <15201440436@163.com>
Date: Fri, 23 Aug 2024 02:59:40 +0800
Subject: [PATCH] patch[experimental] Fix start_index in `SemanticChunker`
 (#24761)

- Because chunks are joined with a space before being searched for, they
  cannot be found verbatim in the original text, so `text.find` returns -1
  for `start_index` in most cases.
- The simplest fix is to accumulate the chunk lengths and use the running
  offset as `start_index`.
---
 libs/experimental/langchain_experimental/text_splitter.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libs/experimental/langchain_experimental/text_splitter.py b/libs/experimental/langchain_experimental/text_splitter.py
index be3a795b06a..2151a0e5ce9 100644
--- a/libs/experimental/langchain_experimental/text_splitter.py
+++ b/libs/experimental/langchain_experimental/text_splitter.py
@@ -262,14 +262,14 @@ class SemanticChunker(BaseDocumentTransformer):
         _metadatas = metadatas or [{}] * len(texts)
         documents = []
         for i, text in enumerate(texts):
-            index = -1
+            start_index = 0
             for chunk in self.split_text(text):
                 metadata = copy.deepcopy(_metadatas[i])
                 if self._add_start_index:
-                    index = text.find(chunk, index + 1)
-                    metadata["start_index"] = index
+                    metadata["start_index"] = start_index
                 new_doc = Document(page_content=chunk, metadata=metadata)
                 documents.append(new_doc)
+                start_index += len(chunk)
         return documents
 
     def split_documents(self, documents: Iterable[Document]) -> List[Document]:
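
Note (reviewer sketch, not part of the patch): a minimal, self-contained
example of the fixed behavior. `FakeEmbeddings` is a hypothetical stand-in
so the snippet runs without a real embedding model; any `Embeddings`
implementation would be used the same way.

```python
from typing import List

from langchain_core.embeddings import Embeddings
from langchain_experimental.text_splitter import SemanticChunker


class FakeEmbeddings(Embeddings):
    """Deterministic toy embeddings, just enough to drive the chunker."""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        # Map each text to a small non-zero 2-d vector so the cosine
        # similarities the chunker computes are well defined and vary.
        return [[1.0 + len(t) % 7, 1.0 + ord(t[0]) % 5] for t in texts]

    def embed_query(self, text: str) -> List[float]:
        return self.embed_documents([text])[0]


chunker = SemanticChunker(FakeEmbeddings(), add_start_index=True)
docs = chunker.create_documents(
    ["First sentence here. Second sentence follows. A third one ends it."]
)
for doc in docs:
    # With this patch, start_index is the cumulative length of all
    # preceding chunks rather than the result of text.find, so it is
    # never -1.
    print(doc.metadata["start_index"], repr(doc.page_content))
```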