core[patch]: stop deleting records with "scoped_full" when doc is empty (#30520)

Fix a bug that caused `index` and `aindex` with `cleanup="scoped_full"` to delete existing records when called with no input documents.
This commit is contained in:
Keiichi Hirobe 2025-03-28 00:04:34 +09:00 committed by GitHub
parent b28a474e79
commit 956b09f468
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 158 additions and 2 deletions

View File

@ -473,7 +473,9 @@ def index(
record_manager.delete_keys(uids_to_delete)
num_deleted += len(uids_to_delete)
if cleanup == "full" or cleanup == "scoped_full":
if cleanup == "full" or (
cleanup == "scoped_full" and scoped_full_cleanup_source_ids
):
delete_group_ids: Optional[Sequence[str]] = None
if cleanup == "scoped_full":
delete_group_ids = list(scoped_full_cleanup_source_ids)
@ -786,7 +788,9 @@ async def aindex(
await record_manager.adelete_keys(uids_to_delete)
num_deleted += len(uids_to_delete)
if cleanup == "full" or cleanup == "scoped_full":
if cleanup == "full" or (
cleanup == "scoped_full" and scoped_full_cleanup_source_ids
):
delete_group_ids: Optional[Sequence[str]] = None
if cleanup == "scoped_full":
delete_group_ids = list(scoped_full_cleanup_source_ids)

View File

@ -822,6 +822,158 @@ async def test_ascoped_full_fails_with_bad_source_ids(
)
def test_index_empty_doc_scoped_full(
    record_manager: InMemoryRecordManager, vector_store: InMemoryVectorStore
) -> None:
    """Indexing an empty loader with ``scoped_full`` cleanup deletes nothing.

    Four documents (three with source "1", one with source "2") are indexed,
    re-indexed unchanged, and then an empty loader is indexed. Every run is
    expected to report ``num_deleted == 0``.
    """
    source_one_contents = (
        "This is a test document.",
        "This is another document.",
        "This is yet another document.",
    )
    seeded_docs = [
        Document(page_content=content, metadata={"source": "1"})
        for content in source_one_contents
    ]
    seeded_docs.append(
        Document(
            page_content="This is a test document from another source.",
            metadata={"source": "2"},
        )
    )
    populated_loader = ToyLoader(documents=seeded_docs)

    # Run 1: all four documents are expected to be added.
    day_one = datetime(2021, 1, 1).timestamp()
    with patch.object(record_manager, "get_time", return_value=day_one):
        first_run = index(
            populated_loader,
            record_manager,
            vector_store,
            cleanup="scoped_full",
            source_id_key="source",
        )
        assert first_run == {
            "num_added": 4,
            "num_deleted": 0,
            "num_skipped": 0,
            "num_updated": 0,
        }

    # Run 2: same documents at a later time are expected to be skipped.
    day_two = datetime(2021, 1, 2).timestamp()
    with patch.object(record_manager, "get_time", return_value=day_two):
        second_run = index(
            populated_loader,
            record_manager,
            vector_store,
            cleanup="scoped_full",
            source_id_key="source",
        )
        assert second_run == {
            "num_added": 0,
            "num_deleted": 0,
            "num_skipped": 4,
            "num_updated": 0,
        }

    # Run 3: an empty loader must leave every counter at zero; in
    # particular nothing may be reported as deleted.
    empty_loader = ToyLoader(documents=[])
    day_three = datetime(2021, 1, 3).timestamp()
    with patch.object(record_manager, "get_time", return_value=day_three):
        third_run = index(
            empty_loader,
            record_manager,
            vector_store,
            cleanup="scoped_full",
            source_id_key="source",
        )
        assert third_run == {
            "num_added": 0,
            "num_deleted": 0,
            "num_skipped": 0,
            "num_updated": 0,
        }
async def test_aindex_empty_doc_scoped_full(
    arecord_manager: InMemoryRecordManager, vector_store: InMemoryVectorStore
) -> None:
    """Async indexing of an empty loader with ``scoped_full`` deletes nothing.

    Four documents (three with source "1", one with source "2") are indexed
    through ``aindex``, re-indexed unchanged, and then an empty loader is
    indexed. Every run is expected to report ``num_deleted == 0``.
    """
    source_one_contents = (
        "This is a test document.",
        "This is another document.",
        "This is yet another document.",
    )
    seeded_docs = [
        Document(page_content=content, metadata={"source": "1"})
        for content in source_one_contents
    ]
    seeded_docs.append(
        Document(
            page_content="This is a test document from another source.",
            metadata={"source": "2"},
        )
    )
    populated_loader = ToyLoader(documents=seeded_docs)

    # Run 1: all four documents are expected to be added.
    day_one = datetime(2021, 1, 1).timestamp()
    with patch.object(arecord_manager, "get_time", return_value=day_one):
        first_run = await aindex(
            populated_loader,
            arecord_manager,
            vector_store,
            cleanup="scoped_full",
            source_id_key="source",
        )
        assert first_run == {
            "num_added": 4,
            "num_deleted": 0,
            "num_skipped": 0,
            "num_updated": 0,
        }

    # Run 2: same documents at a later time are expected to be skipped.
    day_two = datetime(2021, 1, 2).timestamp()
    with patch.object(arecord_manager, "get_time", return_value=day_two):
        second_run = await aindex(
            populated_loader,
            arecord_manager,
            vector_store,
            cleanup="scoped_full",
            source_id_key="source",
        )
        assert second_run == {
            "num_added": 0,
            "num_deleted": 0,
            "num_skipped": 4,
            "num_updated": 0,
        }

    # Run 3: an empty loader must leave every counter at zero; in
    # particular nothing may be reported as deleted.
    empty_loader = ToyLoader(documents=[])
    day_three = datetime(2021, 1, 3).timestamp()
    with patch.object(arecord_manager, "get_time", return_value=day_three):
        third_run = await aindex(
            empty_loader,
            arecord_manager,
            vector_store,
            cleanup="scoped_full",
            source_id_key="source",
        )
        assert third_run == {
            "num_added": 0,
            "num_deleted": 0,
            "num_skipped": 0,
            "num_updated": 0,
        }
def test_no_delete(
record_manager: InMemoryRecordManager, vector_store: InMemoryVectorStore
) -> None: