# ---------------------------------------------------------------------------
# File: libs/community/langchain_community/vectorstores/inmemory.py
# ---------------------------------------------------------------------------
import uuid
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple

import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore

from langchain_community.utils.math import cosine_similarity
from langchain_community.vectorstores.utils import maximal_marginal_relevance


class InMemoryVectorStore(VectorStore):
    """In-memory implementation of VectorStore using a dictionary.

    Every stored text is kept as a record in ``self.store`` keyed by a
    generated UUID string. Searches score each record against the query
    vector with ``cosine_similarity`` and rank the records by that score.

    Args:
        embedding: embedding function to use.
    """

    def __init__(self, embedding: Embeddings) -> None:
        # Maps document id -> {"id", "vector", "text", "metadata"}.
        self.store: Dict[str, Dict[str, Any]] = {}
        self.embedding = embedding

    @property
    def embeddings(self) -> Embeddings:
        """Return the embedding function this store was created with."""
        return self.embedding

    def delete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Remove the records with the given ids.

        Unknown ids are ignored. Passing ``None`` or an empty sequence
        leaves the store untouched.

        Args:
            ids: ids of the records to remove.
            **kwargs: accepted for interface compatibility; unused.
        """
        if ids:
            for _id in ids:
                self.store.pop(_id, None)

    async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Async variant of ``delete``; the work itself runs synchronously."""
        self.delete(ids, **kwargs)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed ``texts`` and add them to the store.

        Args:
            texts: texts to add. Any iterable is accepted, including a
                one-shot generator.
            metadatas: optional metadata dicts, indexed in step with
                ``texts``. When omitted or empty, each record gets ``{}``.
            **kwargs: accepted for interface compatibility; unused.

        Returns:
            The generated ids, in the same order as ``texts``.
        """
        # Materialise once: ``texts`` may be a generator, and it is needed
        # both for embedding and for building the records. Iterating it a
        # second time would silently yield nothing.
        text_list = list(texts)
        vectors = self.embedding.embed_documents(text_list)

        ids: List[str] = []
        for i, text in enumerate(text_list):
            doc_id = str(uuid.uuid4())
            ids.append(doc_id)
            self.store[doc_id] = {
                "id": doc_id,
                "vector": vectors[i],
                "text": text,
                "metadata": metadatas[i] if metadatas else {},
            }
        return ids

    async def aadd_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Async variant of ``add_texts``; the work itself runs synchronously."""
        return self.add_texts(texts, metadatas, **kwargs)

    @staticmethod
    def _to_document(entry: Dict[str, Any]) -> Document:
        """Build a ``Document`` from a stored record's text and metadata."""
        return Document(page_content=entry["text"], metadata=entry["metadata"])

    def _entries_by_similarity(
        self, embedding: List[float]
    ) -> List[Tuple[Dict[str, Any], float]]:
        """Score every stored record against ``embedding``, best first.

        Returns:
            ``(record, similarity)`` pairs sorted by descending similarity.
            ``list.sort`` is stable, so records with equal scores keep
            their insertion order.
        """
        scored = [
            (
                entry,
                float(cosine_similarity([embedding], [entry["vector"]]).item(0)),
            )
            for entry in self.store.values()
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored

    def similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return the ``k`` records most similar to ``embedding``.

        Args:
            embedding: query vector.
            k: maximum number of results.
            **kwargs: accepted for interface compatibility; unused.

        Returns:
            ``(Document, cosine similarity)`` pairs, most similar first.
        """
        ranked = self._entries_by_similarity(embedding)
        return [(self._to_document(entry), score) for entry, score in ranked[:k]]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Embed ``query`` and return the ``k`` most similar documents with scores."""
        embedding = self.embedding.embed_query(query)
        return self.similarity_search_with_score_by_vector(embedding, k, **kwargs)

    async def asimilarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Async variant of ``similarity_search_with_score`` (runs synchronously)."""
        return self.similarity_search_with_score(query, k, **kwargs)

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return the ``k`` documents most similar to ``embedding``, without scores."""
        docs_and_scores = self.similarity_search_with_score_by_vector(
            embedding, k, **kwargs
        )
        return [doc for doc, _ in docs_and_scores]

    async def asimilarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async variant of ``similarity_search_by_vector`` (runs synchronously)."""
        return self.similarity_search_by_vector(embedding, k, **kwargs)

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Embed ``query`` and return the ``k`` most similar documents."""
        return [doc for doc, _ in self.similarity_search_with_score(query, k, **kwargs)]

    async def asimilarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async variant of ``similarity_search`` (runs synchronously)."""
        return self.similarity_search(query, k, **kwargs)

    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Select documents with ``maximal_marginal_relevance``.

        The ``fetch_k`` records most similar to ``embedding`` are taken as
        candidates, and ``maximal_marginal_relevance`` then picks up to
        ``k`` of them.

        Args:
            embedding: query vector.
            k: number of documents to return.
            fetch_k: number of top-similarity candidates handed to MMR.
            lambda_mult: passed through to ``maximal_marginal_relevance``.
            **kwargs: accepted for interface compatibility; unused.

        Returns:
            The selected documents, in the order MMR returned their indices.
        """
        prefetch_hits = self._entries_by_similarity(embedding)[:fetch_k]

        mmr_chosen_indices = maximal_marginal_relevance(
            np.array(embedding, dtype=np.float32),
            [entry["vector"] for entry, _ in prefetch_hits],
            k=k,
            lambda_mult=lambda_mult,
        )
        # The returned indices refer to positions within ``prefetch_hits``.
        return [self._to_document(prefetch_hits[idx][0]) for idx in mmr_chosen_indices]

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Embed ``query`` and run ``max_marginal_relevance_search_by_vector``."""
        embedding_vector = self.embedding.embed_query(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding_vector,
            k,
            fetch_k,
            lambda_mult=lambda_mult,
            **kwargs,
        )

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Create a store and populate it with ``texts``.

        Args:
            texts: texts to embed and store.
            embedding: embedding function to use.
            metadatas: optional metadata dicts, indexed in step with ``texts``.
            **kwargs: accepted for interface compatibility; unused.

        Returns:
            The populated store.
        """
        store = cls(
            embedding=embedding,
        )
        store.add_texts(texts=texts, metadatas=metadatas)
        return store

    @classmethod
    async def afrom_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Async variant of ``from_texts``; the work itself runs synchronously."""
        return cls.from_texts(texts, embedding, metadatas, **kwargs)


# ---------------------------------------------------------------------------
# File: libs/community/tests/unit_tests/vectorstores/test_inmemory.py
# ---------------------------------------------------------------------------
from langchain_core.documents import Document

from langchain_community.vectorstores.inmemory import InMemoryVectorStore
from tests.integration_tests.vectorstores.fake_embeddings import (
    ConsistentFakeEmbeddings,
)


async def test_inmemory() -> None:
    """Test end to end construction and search."""
    store = await InMemoryVectorStore.afrom_texts(
        ["foo", "bar", "baz"], ConsistentFakeEmbeddings()
    )
    output = await store.asimilarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]

    output = await store.asimilarity_search("bar", k=2)
    assert output == [Document(page_content="bar"), Document(page_content="baz")]

    output2 = await store.asimilarity_search_with_score("bar", k=2)
    assert output2[0][1] > output2[1][1]


async def test_inmemory_mmr() -> None:
    """Test MMR search when k is larger than the number of stored texts."""
    texts = ["foo", "foo", "fou", "foy"]
    docsearch = await InMemoryVectorStore.afrom_texts(texts, ConsistentFakeEmbeddings())
    # make sure k may exceed the docstore size
    output = await docsearch.amax_marginal_relevance_search(
        "foo", k=10, lambda_mult=0.1
    )
    assert len(output) == len(texts)
    assert output[0] == Document(page_content="foo")
    assert output[1] == Document(page_content="foy")


def test_inmemory_add_texts_from_generator() -> None:
    """A one-shot iterable must be stored, not merely embedded."""
    store = InMemoryVectorStore(ConsistentFakeEmbeddings())
    ids = store.add_texts(text for text in ["foo", "bar"])
    assert len(ids) == 2
    assert [store.store[_id]["text"] for _id in ids] == ["foo", "bar"]