mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-05 04:38:26 +00:00
patch[experimental] Fix start_index in SemanticChunker
(#24761)
- Because chunks are joined by a space, they often cannot be found verbatim in the original text, so the final `start_index` is very likely to be -1. - The simplest fix is to use the natural index of the chunk as `start_index`.
This commit is contained in:
parent
6fbd53bc60
commit
8bde04079b
@ -262,14 +262,14 @@ class SemanticChunker(BaseDocumentTransformer):
|
||||
_metadatas = metadatas or [{}] * len(texts)
|
||||
documents = []
|
||||
for i, text in enumerate(texts):
|
||||
index = -1
|
||||
start_index = 0
|
||||
for chunk in self.split_text(text):
|
||||
metadata = copy.deepcopy(_metadatas[i])
|
||||
if self._add_start_index:
|
||||
index = text.find(chunk, index + 1)
|
||||
metadata["start_index"] = index
|
||||
metadata["start_index"] = start_index
|
||||
new_doc = Document(page_content=chunk, metadata=metadata)
|
||||
documents.append(new_doc)
|
||||
start_index += len(chunk)
|
||||
return documents
|
||||
|
||||
def split_documents(self, documents: Iterable[Document]) -> List[Document]:
|
||||
|
Loading…
Reference in New Issue
Block a user