mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-06 13:33:37 +00:00
core[minor]: add new clean up strategy "scoped_full" to indexing (#28505)
~Note that this PR is now a Draft, so I didn't add the change to the `aindex` function and didn't add tests for my change. After we have an agreement on the direction, I will add commits.~ `batch_size` is very difficult to choose because setting a large number like >10000 will impact VectorDB and RecordManager, while setting a small number will delete records unnecessarily, leading to redundant work, as the `IMPORTANT` section says. On the other hand, we can't use `full` because the loader returns just a subset of the dataset in our use case. I guess many people are in the same situation as us. So, as one possible solution, I would like to introduce a new argument, `scoped_full_cleanup`. This argument will be valid only when `cleanup` is Full. If True, Full cleanup deletes all documents that haven't been updated AND that are associated with source ids that were seen during indexing. Default is False. This change keeps backward compatibility. --------- Co-authored-by: Eugene Yurtsev <eugene@langchain.dev> Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
@@ -364,6 +364,306 @@ async def test_aincremental_fails_with_bad_source_ids(
|
||||
)
|
||||
|
||||
|
||||
def test_index_simple_delete_scoped_full(
    record_manager: InMemoryRecordManager, vector_store: InMemoryVectorStore
) -> None:
    """Check ``index`` results across four ``scoped_full`` cleanup passes.

    Passes one and two load three documents with source "1" and one with
    source "2". Passes three and four load only two source "1" documents, one
    of which has new content.
    """
    initial_docs = [
        Document(page_content=text, metadata={"source": source})
        for text, source in (
            ("This is a test document.", "1"),
            ("This is another document.", "1"),
            ("This is yet another document.", "1"),
            ("This is a test document from another source.", "2"),
        )
    ]
    loader = ToyLoader(documents=initial_docs)

    # Pass 1: all four documents are expected to be reported as added.
    day_one = datetime(2021, 1, 1).timestamp()
    with patch.object(record_manager, "get_time", return_value=day_one):
        outcome = index(
            loader,
            record_manager,
            vector_store,
            cleanup="scoped_full",
            source_id_key="source",
        )
        assert outcome == {
            "num_added": 4,
            "num_deleted": 0,
            "num_skipped": 0,
            "num_updated": 0,
        }

    # Pass 2: same loader at a later timestamp; everything should be skipped.
    day_two = datetime(2021, 1, 2).timestamp()
    with patch.object(record_manager, "get_time", return_value=day_two):
        outcome = index(
            loader,
            record_manager,
            vector_store,
            cleanup="scoped_full",
            source_id_key="source",
        )
        assert outcome == {
            "num_added": 0,
            "num_deleted": 0,
            "num_skipped": 4,
            "num_updated": 0,
        }

    # From here on only source "1" is loaded: one document with new content
    # and one whose content matches a document from the first loader.
    mutated_docs = [
        Document(page_content=text, metadata={"source": "1"})
        for text in ("mutated document 1", "This is another document.")
    ]
    loader = ToyLoader(documents=mutated_docs)

    # Pass 3: one add, one skip and two deletions are expected. The store
    # check below shows the source "2" document is still present afterwards.
    day_three = datetime(2021, 1, 3).timestamp()
    with patch.object(record_manager, "get_time", return_value=day_three):
        outcome = index(
            loader,
            record_manager,
            vector_store,
            cleanup="scoped_full",
            source_id_key="source",
        )
        assert outcome == {
            "num_added": 1,
            "num_deleted": 2,
            "num_skipped": 1,
            "num_updated": 0,
        }

    stored_texts = {
        # Type check suppressed: each uid comes from the store itself, so a
        # document (not None) is expected back.
        vector_store.get_by_ids([uid])[0].page_content  # type: ignore
        for uid in vector_store.store
    }
    assert stored_texts == {
        "mutated document 1",
        "This is another document.",
        "This is a test document from another source.",
    }

    # Pass 4: re-running the same loader should change nothing.
    day_four = datetime(2021, 1, 4).timestamp()
    with patch.object(record_manager, "get_time", return_value=day_four):
        outcome = index(
            loader,
            record_manager,
            vector_store,
            cleanup="scoped_full",
            source_id_key="source",
        )
        assert outcome == {
            "num_added": 0,
            "num_deleted": 0,
            "num_skipped": 2,
            "num_updated": 0,
        }
|
||||
|
||||
|
||||
async def test_aindex_simple_delete_scoped_full(
    arecord_manager: InMemoryRecordManager, vector_store: InMemoryVectorStore
) -> None:
    """Check ``aindex`` results across four ``scoped_full`` cleanup passes.

    Passes one and two load three documents with source "1" and one with
    source "2". Passes three and four load only two source "1" documents, one
    of which has new content.
    """
    initial_docs = [
        Document(page_content=text, metadata={"source": source})
        for text, source in (
            ("This is a test document.", "1"),
            ("This is another document.", "1"),
            ("This is yet another document.", "1"),
            ("This is a test document from another source.", "2"),
        )
    ]
    loader = ToyLoader(documents=initial_docs)

    # Pass 1: all four documents are expected to be reported as added.
    day_one = datetime(2021, 1, 1).timestamp()
    with patch.object(arecord_manager, "get_time", return_value=day_one):
        outcome = await aindex(
            loader,
            arecord_manager,
            vector_store,
            cleanup="scoped_full",
            source_id_key="source",
        )
        assert outcome == {
            "num_added": 4,
            "num_deleted": 0,
            "num_skipped": 0,
            "num_updated": 0,
        }

    # Pass 2: same loader at a later timestamp; everything should be skipped.
    day_two = datetime(2021, 1, 2).timestamp()
    with patch.object(arecord_manager, "get_time", return_value=day_two):
        outcome = await aindex(
            loader,
            arecord_manager,
            vector_store,
            cleanup="scoped_full",
            source_id_key="source",
        )
        assert outcome == {
            "num_added": 0,
            "num_deleted": 0,
            "num_skipped": 4,
            "num_updated": 0,
        }

    # From here on only source "1" is loaded: one document with new content
    # and one whose content matches a document from the first loader.
    mutated_docs = [
        Document(page_content=text, metadata={"source": "1"})
        for text in ("mutated document 1", "This is another document.")
    ]
    loader = ToyLoader(documents=mutated_docs)

    # Pass 3: one add, one skip and two deletions are expected. The store
    # check below shows the source "2" document is still present afterwards.
    day_three = datetime(2021, 1, 3).timestamp()
    with patch.object(arecord_manager, "get_time", return_value=day_three):
        outcome = await aindex(
            loader,
            arecord_manager,
            vector_store,
            cleanup="scoped_full",
            source_id_key="source",
        )
        assert outcome == {
            "num_added": 1,
            "num_deleted": 2,
            "num_skipped": 1,
            "num_updated": 0,
        }

    stored_texts = {
        # Type check suppressed: each uid comes from the store itself, so a
        # document (not None) is expected back.
        vector_store.get_by_ids([uid])[0].page_content  # type: ignore
        for uid in vector_store.store
    }
    assert stored_texts == {
        "mutated document 1",
        "This is another document.",
        "This is a test document from another source.",
    }

    # Pass 4: re-running the same loader should change nothing.
    day_four = datetime(2021, 1, 4).timestamp()
    with patch.object(arecord_manager, "get_time", return_value=day_four):
        outcome = await aindex(
            loader,
            arecord_manager,
            vector_store,
            cleanup="scoped_full",
            source_id_key="source",
        )
        assert outcome == {
            "num_added": 0,
            "num_deleted": 0,
            "num_skipped": 2,
            "num_updated": 0,
        }
|
||||
|
||||
|
||||
def test_scoped_full_fails_with_bad_source_ids(
    record_manager: InMemoryRecordManager, vector_store: InMemoryVectorStore
) -> None:
    """Check ``scoped_full`` cleanup raises ``ValueError`` for bad source ids."""
    # The third document deliberately carries ``None`` as its source.
    docs = [
        Document(page_content=text, metadata={"source": source})
        for text, source in (
            ("This is a test document.", "1"),
            ("This is another document.", "2"),
            ("This is yet another document.", None),
        )
    ]
    loader = ToyLoader(documents=docs)

    # Case 1: no ``source_id_key`` is supplied together with this cleanup mode.
    with pytest.raises(ValueError):
        index(loader, record_manager, vector_store, cleanup="scoped_full")

    # Case 2: ``source_id_key`` is supplied this time, so the error is
    # presumably caused by the document whose source is None -- confirm
    # against ``index``.
    with pytest.raises(ValueError):
        index(
            loader,
            record_manager,
            vector_store,
            cleanup="scoped_full",
            source_id_key="source",
        )
|
||||
|
||||
|
||||
async def test_ascoped_full_fails_with_bad_source_ids(
    arecord_manager: InMemoryRecordManager, vector_store: InMemoryVectorStore
) -> None:
    """Check async ``scoped_full`` cleanup raises ``ValueError`` for bad ids."""
    # The third document deliberately carries ``None`` as its source.
    docs = [
        Document(page_content=text, metadata={"source": source})
        for text, source in (
            ("This is a test document.", "1"),
            ("This is another document.", "2"),
            ("This is yet another document.", None),
        )
    ]
    loader = ToyLoader(documents=docs)

    # Case 1: no ``source_id_key`` is supplied together with this cleanup mode.
    with pytest.raises(ValueError):
        await aindex(loader, arecord_manager, vector_store, cleanup="scoped_full")

    # Case 2: ``source_id_key`` is supplied this time, so the error is
    # presumably caused by the document whose source is None -- confirm
    # against ``aindex``.
    with pytest.raises(ValueError):
        await aindex(
            loader,
            arecord_manager,
            vector_store,
            cleanup="scoped_full",
            source_id_key="source",
        )
|
||||
|
||||
|
||||
def test_no_delete(
|
||||
record_manager: InMemoryRecordManager, vector_store: InMemoryVectorStore
|
||||
) -> None:
|
||||
|
Reference in New Issue
Block a user