From 7bc4e320f1b41b03208af863dd7d9234bf4c3bde Mon Sep 17 00:00:00 2001 From: Vincent Min <93780551+VMinB12@users.noreply.github.com> Date: Fri, 25 Oct 2024 23:07:04 +0200 Subject: [PATCH] core[patch]: improve performance of InMemoryVectorStore (#27538) **Description:** We improve the performance of the InMemoryVectorStore. **Issue:** Originally, similarity was computed document by document: ``` for doc in self.store.values(): vector = doc["vector"] similarity = float(cosine_similarity([embedding], [vector]).item(0)) ``` This is inefficient and does not make use of numpy vectorization. This PR computes the similarity in one vectorized go: ``` docs = list(self.store.values()) similarity = cosine_similarity([embedding], [doc["vector"] for doc in docs]) ``` **Dependencies:** None **Twitter handle:** @b12_consulting, @Vincent_Min --------- Co-authored-by: Eugene Yurtsev --- .../langchain_core/vectorstores/in_memory.py | 47 ++++++++++++------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/libs/core/langchain_core/vectorstores/in_memory.py b/libs/core/langchain_core/vectorstores/in_memory.py index 06aa13f785a..d6eb978d643 100644 --- a/libs/core/langchain_core/vectorstores/in_memory.py +++ b/libs/core/langchain_core/vectorstores/in_memory.py @@ -329,23 +329,38 @@ class InMemoryVectorStore(VectorStore): filter: Optional[Callable[[Document], bool]] = None, **kwargs: Any, ) -> list[tuple[Document, float, list[float]]]: - result = [] - for doc in self.store.values(): - vector = doc["vector"] - similarity = float(cosine_similarity([embedding], [vector]).item(0)) - result.append( - ( - Document( - id=doc["id"], page_content=doc["text"], metadata=doc["metadata"] - ), - similarity, - vector, - ) - ) - result.sort(key=lambda x: x[1], reverse=True) + # get all docs with fixed order in list + docs = list(self.store.values()) + if filter is not None: - result = [r for r in result if filter(r[0])] - return result[:k] + docs = [ + doc + for doc in docs + if 
filter(Document(page_content=doc["text"], metadata=doc["metadata"])) + ] + + if not docs: + return [] + + similarity = cosine_similarity([embedding], [doc["vector"] for doc in docs])[0] + + # get the indices ordered by similarity score + top_k_idx = similarity.argsort()[::-1][:k] + + return [ + ( + Document( + id=doc_dict["id"], + page_content=doc_dict["text"], + metadata=doc_dict["metadata"], + ), + float(similarity[idx].item()), + doc_dict["vector"], + ) + for idx in top_k_idx + # Assign using walrus operator to avoid multiple lookups + if (doc_dict := docs[idx]) + ] def similarity_search_with_score_by_vector( self,