community[patch]: Implement Unique ID Enforcement in FAISS (#17244)

**Description:**
Implemented unique ID validation in the FAISS component to ensure all
document IDs are distinct. This update resolves issues related to
non-unique IDs, such as inconsistent behavior during deletion processes.
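
As a rough illustration of how the new validation surfaces to callers (a minimal sketch; the `embeddings` object stands in for whatever `Embeddings` implementation you already use and is assumed to be defined):

```python
from langchain_community.vectorstores import FAISS

texts = ["foo", "bar", "baz"]
ids = ["id1", "id1", "id2"]  # "id1" appears twice

try:
    # `embeddings` is an assumption: any Embeddings implementation works here.
    FAISS.from_texts(texts, embeddings, ids=ids)
except ValueError as err:
    # With this change, duplicate ids are rejected up front rather than
    # causing the inconsistent deletion behavior described above.
    print(err)  # -> Duplicate ids found in the ids list.
```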
ByeongUk Choi, 2024-02-09 05:03:33 +09:00 (committed by GitHub)
commit b88329e9a5 (parent 88609565a3)
2 changed files with 15 additions and 0 deletions


@@ -190,6 +190,9 @@ class FAISS(VectorStore):
         _len_check_if_sized(documents, embeddings, "documents", "embeddings")
         _len_check_if_sized(documents, ids, "documents", "ids")
+        if ids and len(ids) != len(set(ids)):
+            raise ValueError("Duplicate ids found in the ids list.")
+
         # Add to the index.
         vector = np.array(embeddings, dtype=np.float32)
         if self._normalize_L2:
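
The guard itself is the standard length-versus-set comparison: building a `set` drops repeated values, so any shrinkage in length means at least one id occurs more than once. A tiny standalone sketch of the condition that triggers the `ValueError`:

```python
ids = ["id1", "id1", "id2"]

# set() deduplicates, so a shorter set means the list contained repeats.
if ids and len(ids) != len(set(ids)):
    print("duplicates present")  # in FAISS, this is where the ValueError is raised
```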


@@ -774,3 +774,15 @@ async def test_async_delete() -> None:
     result = await docsearch.asimilarity_search("bar", k=2)
     assert sorted([d.page_content for d in result]) == ["baz", "foo"]
     assert docsearch.index_to_docstore_id == {0: ids[0], 1: ids[2]}
+
+
+@pytest.mark.requires("faiss")
+def test_faiss_with_duplicate_ids() -> None:
+    """Test whether FAISS raises an exception for duplicate ids."""
+    texts = ["foo", "bar", "baz"]
+    duplicate_ids = ["id1", "id1", "id2"]
+
+    with pytest.raises(ValueError) as exc_info:
+        FAISS.from_texts(texts, FakeEmbeddings(), ids=duplicate_ids)
+
+    assert "Duplicate ids found in the ids list." in str(exc_info.value)