From 603a0bea29f53d51870873b4714aa79067285744 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Singh <9626333+SKRohit@users.noreply.github.com> Date: Wed, 12 Jul 2023 13:05:49 +0530 Subject: [PATCH] Fixes incorrect docstore creation in faiss.py (#7026) - **Description**: The current implementation assumes that `texts` and `ids` have the same length. If the length of the passed `ids` differs from the length of the passed `texts`, the current code `dict(zip(index_to_id.values(), documents))` does not fail or give any warning; it silently creates docstore entries only for the passed `ids`, i.e. if `ids = ['A']` and `texts=["I love Open Source","I love langchain"]` then only one docstore entry will be created. Instead, either two entries should be created, assuming the same id value for all the elements of `texts`, or an error should be raised. - **Issue**: My change fixes this by adding an explicit length check before the docstore is built (the `zip` call itself is kept). This way, if the lengths of `ids` and `texts` do not match, an explicit `Exception` is raised. @rlancemartin, @eyurtsev --- langchain/vectorstores/faiss.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/langchain/vectorstores/faiss.py b/langchain/vectorstores/faiss.py index 74b5d747414..eb3df5685c7 100644 --- a/langchain/vectorstores/faiss.py +++ b/langchain/vectorstores/faiss.py @@ -521,6 +521,13 @@ class FAISS(VectorStore): metadata = metadatas[i] if metadatas else {} documents.append(Document(page_content=text, metadata=metadata)) index_to_id = dict(enumerate(ids)) + + if len(index_to_id) != len(documents): + raise Exception( + f"{len(index_to_id)} ids provided for {len(documents)} documents." + " Each document should have an id." + ) + docstore = InMemoryDocstore(dict(zip(index_to_id.values(), documents))) return cls( embedding.embed_query,