mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-30 16:24:24 +00:00
Fix: improved start_index logic in TextSplitter when called with tokens
This commit is contained in:
parent
5ecbb5f277
commit
e61c802472
@ -79,11 +79,20 @@ class TextSplitter(BaseDocumentTransformer, ABC):
|
||||
for i, text in enumerate(texts):
|
||||
index = 0
|
||||
previous_chunk_len = 0
|
||||
not_found = -1
|
||||
retries = 1
|
||||
for chunk in self.split_text(text):
|
||||
metadata = copy.deepcopy(_metadatas[i])
|
||||
if self._add_start_index:
|
||||
search_boundary = len(chunk) + 1
|
||||
offset = index + previous_chunk_len - self._chunk_overlap
|
||||
index = text.find(chunk, max(0, offset))
|
||||
prev_index = index
|
||||
index = text.find(chunk, max(0, offset), max(0,offset) + search_boundary)
|
||||
if index == not_found:
|
||||
for token_offset in range(0,retries):
|
||||
index = text.find(chunk, prev_index + token_offset)
|
||||
if index is prev_index:
|
||||
continue
|
||||
metadata["start_index"] = index
|
||||
previous_chunk_len = len(chunk)
|
||||
new_doc = Document(page_content=chunk, metadata=metadata)
|
||||
|
Loading…
Reference in New Issue
Block a user