patch[experimental] Fix start_index in SemanticChunker (#24761)

- Because chunks are joined by a space, they cannot be found in the original
text, so the final `start_index` is very likely to be -1.
- The simplest fix is to use the chunk's running character offset as the
`start_index`.
This commit is contained in:
ZhangShenao 2024-08-23 02:59:40 +08:00 committed by GitHub
parent 6fbd53bc60
commit 8bde04079b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -262,14 +262,14 @@ class SemanticChunker(BaseDocumentTransformer):
_metadatas = metadatas or [{}] * len(texts) _metadatas = metadatas or [{}] * len(texts)
documents = [] documents = []
for i, text in enumerate(texts): for i, text in enumerate(texts):
index = -1 start_index = 0
for chunk in self.split_text(text): for chunk in self.split_text(text):
metadata = copy.deepcopy(_metadatas[i]) metadata = copy.deepcopy(_metadatas[i])
if self._add_start_index: if self._add_start_index:
index = text.find(chunk, index + 1) metadata["start_index"] = start_index
metadata["start_index"] = index
new_doc = Document(page_content=chunk, metadata=metadata) new_doc = Document(page_content=chunk, metadata=metadata)
documents.append(new_doc) documents.append(new_doc)
start_index += len(chunk)
return documents return documents
def split_documents(self, documents: Iterable[Document]) -> List[Document]: def split_documents(self, documents: Iterable[Document]) -> List[Document]: