mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-05 12:48:12 +00:00
patch[experimental] Fix start_index in SemanticChunker
(#24761)
- Because chunks are joined by a space, they can't be found verbatim in the original text, so the final `start_index` is very likely to be -1. - The simplest fix is to use the natural (cumulative) offset of each chunk as `start_index`.
This commit is contained in:
parent
6fbd53bc60
commit
8bde04079b
@ -262,14 +262,14 @@ class SemanticChunker(BaseDocumentTransformer):
|
|||||||
_metadatas = metadatas or [{}] * len(texts)
|
_metadatas = metadatas or [{}] * len(texts)
|
||||||
documents = []
|
documents = []
|
||||||
for i, text in enumerate(texts):
|
for i, text in enumerate(texts):
|
||||||
index = -1
|
start_index = 0
|
||||||
for chunk in self.split_text(text):
|
for chunk in self.split_text(text):
|
||||||
metadata = copy.deepcopy(_metadatas[i])
|
metadata = copy.deepcopy(_metadatas[i])
|
||||||
if self._add_start_index:
|
if self._add_start_index:
|
||||||
index = text.find(chunk, index + 1)
|
metadata["start_index"] = start_index
|
||||||
metadata["start_index"] = index
|
|
||||||
new_doc = Document(page_content=chunk, metadata=metadata)
|
new_doc = Document(page_content=chunk, metadata=metadata)
|
||||||
documents.append(new_doc)
|
documents.append(new_doc)
|
||||||
|
start_index += len(chunk)
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
def split_documents(self, documents: Iterable[Document]) -> List[Document]:
|
def split_documents(self, documents: Iterable[Document]) -> List[Document]:
|
||||||
|
Loading…
Reference in New Issue
Block a user