core[patch]: Revert PR #25754 and add unit tests (#28702)

I reported the bug 2 weeks ago here:
https://github.com/langchain-ai/langchain/issues/28447

I believe this is a critical bug for the indexer, so this PR reverts
the change and adds unit tests to prevent similar bugs from being
introduced in the future.

@eyurtsev Could you check this?
This commit is contained in:
Keiichi Hirobe
2024-12-14 05:13:06 +09:00
committed by GitHub
parent b0a298894d
commit da28cf1f54
2 changed files with 94 additions and 306 deletions

View File

@@ -408,18 +408,17 @@ def index(
# mypy isn't good enough to determine that source ids cannot be None
# here due to a check that's happening above, so we check again.
if any(source_id is None for source_id in source_ids):
msg = "Source ids cannot be if cleanup=='incremental'."
raise AssertionError(msg)
for source_id in source_ids:
if source_id is None:
msg = "Source ids cannot be None here."
raise AssertionError(msg)
indexed_source_ids = cast(
Sequence[str], [source_id_assigner(doc) for doc in docs_to_index]
)
_source_ids = cast(Sequence[str], source_ids)
uids_to_delete = record_manager.list_keys(
group_ids=indexed_source_ids, before=index_start_dt
group_ids=_source_ids, before=index_start_dt
)
if indexed_source_ids and uids_to_delete:
if uids_to_delete:
# Then delete from vector store.
destination.delete(uids_to_delete)
# First delete from record store.
@@ -669,18 +668,17 @@ async def aindex(
# mypy isn't good enough to determine that source ids cannot be None
# here due to a check that's happening above, so we check again.
if any(source_id is None for source_id in source_ids):
msg = "Source ids cannot be if cleanup=='incremental'."
raise AssertionError(msg)
for source_id in source_ids:
if source_id is None:
msg = "Source ids cannot be None here."
raise AssertionError(msg)
indexed_source_ids = cast(
Sequence[str], [source_id_assigner(doc) for doc in docs_to_index]
)
_source_ids = cast(Sequence[str], source_ids)
uids_to_delete = await record_manager.alist_keys(
group_ids=indexed_source_ids, before=index_start_dt
group_ids=_source_ids, before=index_start_dt
)
if indexed_source_ids and uids_to_delete:
if uids_to_delete:
# Then delete from vector store.
await destination.adelete(uids_to_delete)
# First delete from record store.