From f765e8fa9d766632be685479cfb0e2553b8f6691 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Mon, 8 Jul 2024 17:11:51 -0400 Subject: [PATCH] core[minor],community[patch],standard-tests[patch]: Move InMemoryImplementation to langchain-core (#23986) This PR moves the in memory implementation to langchain-core. * The implementation remains importable from langchain-community. * Supporting utilities are marked as private for now. --- .../vectorstores/inmemory.py | 252 +--------------- .../langchain_core/vectorstores/__init__.py | 9 + .../{vectorstores.py => vectorstores/base.py} | 0 .../langchain_core/vectorstores/in_memory.py | 275 ++++++++++++++++++ .../core/langchain_core/vectorstores/utils.py | 100 +++++++ libs/core/poetry.lock | 23 +- libs/core/pyproject.toml | 6 + .../unit_tests/vectorstores/test_in_memory.py | 97 ++++++ .../unit_tests/test_in_memory_vectorstore.py | 12 +- 9 files changed, 515 insertions(+), 259 deletions(-) create mode 100644 libs/core/langchain_core/vectorstores/__init__.py rename libs/core/langchain_core/{vectorstores.py => vectorstores/base.py} (100%) create mode 100644 libs/core/langchain_core/vectorstores/in_memory.py create mode 100644 libs/core/langchain_core/vectorstores/utils.py create mode 100644 libs/core/tests/unit_tests/vectorstores/test_in_memory.py diff --git a/libs/community/langchain_community/vectorstores/inmemory.py b/libs/community/langchain_community/vectorstores/inmemory.py index 61a8aa13d24..997633ec928 100644 --- a/libs/community/langchain_community/vectorstores/inmemory.py +++ b/libs/community/langchain_community/vectorstores/inmemory.py @@ -1,249 +1,5 @@ -import json -import uuid -from pathlib import Path -from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple +from langchain_core.vectorstores import InMemoryVectorStore -import numpy as np -from langchain_core.documents import Document -from langchain_core.embeddings import Embeddings -from langchain_core.indexing import UpsertResponse 
-from langchain_core.load import dumpd, load -from langchain_core.vectorstores import VectorStore - -from langchain_community.utils.math import cosine_similarity -from langchain_community.vectorstores.utils import maximal_marginal_relevance - - -class InMemoryVectorStore(VectorStore): - """In-memory implementation of VectorStore using a dictionary. - Uses numpy to compute cosine similarity for search. - - Args: - embedding: embedding function to use. - """ - - def __init__(self, embedding: Embeddings) -> None: - self.store: Dict[str, Dict[str, Any]] = {} - self.embedding = embedding - - @property - def embeddings(self) -> Embeddings: - return self.embedding - - def delete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None: - if ids: - for _id in ids: - self.store.pop(_id, None) - - async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None: - self.delete(ids) - - def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse: - vectors = self.embedding.embed_documents([item.page_content for item in items]) - ids = [] - for item, vector in zip(items, vectors): - doc_id = item.id if item.id else str(uuid.uuid4()) - ids.append(doc_id) - self.store[doc_id] = { - "id": doc_id, - "vector": vector, - "text": item.page_content, - "metadata": item.metadata, - } - return { - "succeeded": ids, - "failed": [], - } - - def get_by_ids(self, ids: Sequence[str], /) -> List[Document]: - """Get documents by their ids.""" - documents = [] - - for doc_id in ids: - doc = self.store.get(doc_id) - if doc: - documents.append( - Document( - id=doc["id"], - page_content=doc["text"], - metadata=doc["metadata"], - ) - ) - return documents - - async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]: - return self.get_by_ids(ids) - - async def aadd_texts( - self, - texts: Iterable[str], - metadatas: Optional[List[dict]] = None, - **kwargs: Any, - ) -> List[str]: - return self.add_texts(texts, metadatas, **kwargs) - - def 
_similarity_search_with_score_by_vector( - self, - embedding: List[float], - k: int = 4, - filter: Optional[Callable[[Document], bool]] = None, - **kwargs: Any, - ) -> List[Tuple[Document, float, List[float]]]: - result = [] - for doc in self.store.values(): - vector = doc["vector"] - similarity = float(cosine_similarity([embedding], [vector]).item(0)) - result.append( - ( - Document( - id=doc["id"], page_content=doc["text"], metadata=doc["metadata"] - ), - similarity, - vector, - ) - ) - result.sort(key=lambda x: x[1], reverse=True) - if filter is not None: - result = [r for r in result if filter(r[0])] - return result[:k] - - def similarity_search_with_score_by_vector( - self, - embedding: List[float], - k: int = 4, - filter: Optional[Callable[[Document], bool]] = None, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - return [ - (doc, similarity) - for doc, similarity, _ in self._similarity_search_with_score_by_vector( - embedding=embedding, k=k, filter=filter, **kwargs - ) - ] - - def similarity_search_with_score( - self, - query: str, - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - embedding = self.embedding.embed_query(query) - docs = self.similarity_search_with_score_by_vector( - embedding, - k, - **kwargs, - ) - return docs - - async def asimilarity_search_with_score( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Tuple[Document, float]]: - return self.similarity_search_with_score(query, k, **kwargs) - - def similarity_search_by_vector( - self, - embedding: List[float], - k: int = 4, - **kwargs: Any, - ) -> List[Document]: - docs_and_scores = self.similarity_search_with_score_by_vector( - embedding, - k, - **kwargs, - ) - return [doc for doc, _ in docs_and_scores] - - async def asimilarity_search_by_vector( - self, embedding: List[float], k: int = 4, **kwargs: Any - ) -> List[Document]: - return self.similarity_search_by_vector(embedding, k, **kwargs) - - def similarity_search( - self, query: str, k: int = 4, **kwargs: 
Any - ) -> List[Document]: - return [doc for doc, _ in self.similarity_search_with_score(query, k, **kwargs)] - - async def asimilarity_search( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Document]: - return self.similarity_search(query, k, **kwargs) - - def max_marginal_relevance_search_by_vector( - self, - embedding: List[float], - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - **kwargs: Any, - ) -> List[Document]: - prefetch_hits = self._similarity_search_with_score_by_vector( - embedding=embedding, - k=fetch_k, - **kwargs, - ) - - mmr_chosen_indices = maximal_marginal_relevance( - np.array(embedding, dtype=np.float32), - [vector for _, _, vector in prefetch_hits], - k=k, - lambda_mult=lambda_mult, - ) - return [prefetch_hits[idx][0] for idx in mmr_chosen_indices] - - def max_marginal_relevance_search( - self, - query: str, - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - **kwargs: Any, - ) -> List[Document]: - embedding_vector = self.embedding.embed_query(query) - return self.max_marginal_relevance_search_by_vector( - embedding_vector, - k, - fetch_k, - lambda_mult=lambda_mult, - **kwargs, - ) - - @classmethod - def from_texts( - cls, - texts: List[str], - embedding: Embeddings, - metadatas: Optional[List[dict]] = None, - **kwargs: Any, - ) -> "InMemoryVectorStore": - store = cls( - embedding=embedding, - ) - store.add_texts(texts=texts, metadatas=metadatas, **kwargs) - return store - - @classmethod - async def afrom_texts( - cls, - texts: List[str], - embedding: Embeddings, - metadatas: Optional[List[dict]] = None, - **kwargs: Any, - ) -> "InMemoryVectorStore": - return cls.from_texts(texts, embedding, metadatas, **kwargs) - - @classmethod - def load( - cls, path: str, embedding: Embeddings, **kwargs: Any - ) -> "InMemoryVectorStore": - _path: Path = Path(path) - with _path.open("r") as f: - store = load(json.load(f)) - vectorstore = cls(embedding=embedding, **kwargs) - vectorstore.store = store - return 
vectorstore - - def dump(self, path: str) -> None: - _path: Path = Path(path) - _path.parent.mkdir(exist_ok=True, parents=True) - with _path.open("w") as f: - json.dump(dumpd(self.store), f, indent=2) +__all__ = [ + "InMemoryVectorStore", +] diff --git a/libs/core/langchain_core/vectorstores/__init__.py b/libs/core/langchain_core/vectorstores/__init__.py new file mode 100644 index 00000000000..c80958679f1 --- /dev/null +++ b/libs/core/langchain_core/vectorstores/__init__.py @@ -0,0 +1,9 @@ +from langchain_core.vectorstores.base import VST, VectorStore, VectorStoreRetriever +from langchain_core.vectorstores.in_memory import InMemoryVectorStore + +__all__ = [ + "VectorStore", + "VST", + "VectorStoreRetriever", + "InMemoryVectorStore", +] diff --git a/libs/core/langchain_core/vectorstores.py b/libs/core/langchain_core/vectorstores/base.py similarity index 100% rename from libs/core/langchain_core/vectorstores.py rename to libs/core/langchain_core/vectorstores/base.py diff --git a/libs/core/langchain_core/vectorstores/in_memory.py b/libs/core/langchain_core/vectorstores/in_memory.py new file mode 100644 index 00000000000..deb93a5ce91 --- /dev/null +++ b/libs/core/langchain_core/vectorstores/in_memory.py @@ -0,0 +1,275 @@ +from __future__ import annotations + +import json +import uuid +from pathlib import Path +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Sequence, + Tuple, +) + +from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings +from langchain_core.load import dumpd, load +from langchain_core.vectorstores import VectorStore +from langchain_core.vectorstores.utils import _cosine_similarity as cosine_similarity +from langchain_core.vectorstores.utils import ( + _maximal_marginal_relevance as maximal_marginal_relevance, +) + +if TYPE_CHECKING: + from langchain_core.indexing import UpsertResponse + + +class InMemoryVectorStore(VectorStore): + """In-memory implementation of 
VectorStore using a dictionary. + + Uses numpy to compute cosine similarity for search. + + Args: + embedding: embedding function to use. + """ + + def __init__(self, embedding: Embeddings) -> None: + """Initialize with the given embedding function.""" + # TODO: would be nice to change to + # Dict[str, Document] at some point (will be a breaking change) + self.store: Dict[str, Dict[str, Any]] = {} + self.embedding = embedding + + @property + def embeddings(self) -> Embeddings: + return self.embedding + + def delete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None: + if ids: + for _id in ids: + self.store.pop(_id, None) + + async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None: + self.delete(ids) + + def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse: + vectors = self.embedding.embed_documents([item.page_content for item in items]) + ids = [] + for item, vector in zip(items, vectors): + doc_id = item.id if item.id else str(uuid.uuid4()) + ids.append(doc_id) + self.store[doc_id] = { + "id": doc_id, + "vector": vector, + "text": item.page_content, + "metadata": item.metadata, + } + return { + "succeeded": ids, + "failed": [], + } + + def get_by_ids(self, ids: Sequence[str], /) -> List[Document]: + """Get documents by their ids.""" + documents = [] + + for doc_id in ids: + doc = self.store.get(doc_id) + if doc: + documents.append( + Document( + id=doc["id"], + page_content=doc["text"], + metadata=doc["metadata"], + ) + ) + return documents + + async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]: + return self.get_by_ids(ids) + + async def aadd_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> List[str]: + return self.add_texts(texts, metadatas, **kwargs) + + def _similarity_search_with_score_by_vector( + self, + embedding: List[float], + k: int = 4, + filter: Optional[Callable[[Document], bool]] = None, + **kwargs: Any, + ) 
-> List[Tuple[Document, float, List[float]]]: + result = [] + for doc in self.store.values(): + vector = doc["vector"] + similarity = float(cosine_similarity([embedding], [vector]).item(0)) + result.append( + ( + Document( + id=doc["id"], page_content=doc["text"], metadata=doc["metadata"] + ), + similarity, + vector, + ) + ) + result.sort(key=lambda x: x[1], reverse=True) + if filter is not None: + result = [r for r in result if filter(r[0])] + return result[:k] + + def similarity_search_with_score_by_vector( + self, + embedding: List[float], + k: int = 4, + filter: Optional[Callable[[Document], bool]] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + return [ + (doc, similarity) + for doc, similarity, _ in self._similarity_search_with_score_by_vector( + embedding=embedding, k=k, filter=filter, **kwargs + ) + ] + + def similarity_search_with_score( + self, + query: str, + k: int = 4, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + embedding = self.embedding.embed_query(query) + docs = self.similarity_search_with_score_by_vector( + embedding, + k, + **kwargs, + ) + return docs + + async def asimilarity_search_with_score( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Tuple[Document, float]]: + return self.similarity_search_with_score(query, k, **kwargs) + + def similarity_search_by_vector( + self, + embedding: List[float], + k: int = 4, + **kwargs: Any, + ) -> List[Document]: + docs_and_scores = self.similarity_search_with_score_by_vector( + embedding, + k, + **kwargs, + ) + return [doc for doc, _ in docs_and_scores] + + async def asimilarity_search_by_vector( + self, embedding: List[float], k: int = 4, **kwargs: Any + ) -> List[Document]: + return self.similarity_search_by_vector(embedding, k, **kwargs) + + def similarity_search( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Document]: + return [doc for doc, _ in self.similarity_search_with_score(query, k, **kwargs)] + + async def asimilarity_search( + self, query: str, 
k: int = 4, **kwargs: Any + ) -> List[Document]: + return self.similarity_search(query, k, **kwargs) + + def max_marginal_relevance_search_by_vector( + self, + embedding: List[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> List[Document]: + prefetch_hits = self._similarity_search_with_score_by_vector( + embedding=embedding, + k=fetch_k, + **kwargs, + ) + + try: + import numpy as np + except ImportError: + raise ImportError( + "numpy must be installed to use max_marginal_relevance_search " + "pip install numpy" + ) + + mmr_chosen_indices = maximal_marginal_relevance( + np.array(embedding, dtype=np.float32), + [vector for _, _, vector in prefetch_hits], + k=k, + lambda_mult=lambda_mult, + ) + return [prefetch_hits[idx][0] for idx in mmr_chosen_indices] + + def max_marginal_relevance_search( + self, + query: str, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> List[Document]: + embedding_vector = self.embedding.embed_query(query) + return self.max_marginal_relevance_search_by_vector( + embedding_vector, + k, + fetch_k, + lambda_mult=lambda_mult, + **kwargs, + ) + + @classmethod + def from_texts( + cls, + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> "InMemoryVectorStore": + store = cls( + embedding=embedding, + ) + store.add_texts(texts=texts, metadatas=metadatas, **kwargs) + return store + + @classmethod + async def afrom_texts( + cls, + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> "InMemoryVectorStore": + return cls.from_texts(texts, embedding, metadatas, **kwargs) + + @classmethod + def load( + cls, path: str, embedding: Embeddings, **kwargs: Any + ) -> "InMemoryVectorStore": + _path: Path = Path(path) + with _path.open("r") as f: + store = load(json.load(f)) + vectorstore = cls(embedding=embedding, **kwargs) + vectorstore.store = store + return 
vectorstore + + def dump(self, path: str) -> None: + _path: Path = Path(path) + _path.parent.mkdir(exist_ok=True, parents=True) + with _path.open("w") as f: + json.dump(dumpd(self.store), f, indent=2) diff --git a/libs/core/langchain_core/vectorstores/utils.py b/libs/core/langchain_core/vectorstores/utils.py new file mode 100644 index 00000000000..115ca6f0f9f --- /dev/null +++ b/libs/core/langchain_core/vectorstores/utils.py @@ -0,0 +1,100 @@ +"""Internal utilities for the in memory implementation of VectorStore. + +These are part of a private API and users should not use them directly +as they can change without notice. +""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, List, Union + +if TYPE_CHECKING: + import numpy as np + + Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray] + +logger = logging.getLogger(__name__) + + +def _cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray: + """Row-wise cosine similarity between two equal-width matrices.""" + try: + import numpy as np + except ImportError: + raise ImportError( + "cosine_similarity requires numpy to be installed. " + "Please install numpy with `pip install numpy`." + ) + + if len(X) == 0 or len(Y) == 0: + return np.array([]) + + X = np.array(X) + Y = np.array(Y) + if X.shape[1] != Y.shape[1]: + raise ValueError( + f"Number of columns in X and Y must be the same. X has shape {X.shape} " + f"and Y has shape {Y.shape}." + ) + try: + import simsimd as simd # type: ignore + + X = np.array(X, dtype=np.float32) + Y = np.array(Y, dtype=np.float32) + Z = 1 - np.array(simd.cdist(X, Y, metric="cosine")) + return Z + except ImportError: + logger.debug( + "Unable to import simsimd, defaulting to NumPy implementation. If you want " + "to use simsimd please install with `pip install simsimd`." + ) + X_norm = np.linalg.norm(X, axis=1) + Y_norm = np.linalg.norm(Y, axis=1) + # Ignore divide-by-zero runtime warnings, as those cases are handled below. 
+ with np.errstate(divide="ignore", invalid="ignore"): + similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm) + similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0 + return similarity + + +def _maximal_marginal_relevance( + query_embedding: np.ndarray, + embedding_list: list, + lambda_mult: float = 0.5, + k: int = 4, +) -> List[int]: + """Calculate maximal marginal relevance.""" + try: + import numpy as np + except ImportError: + raise ImportError( + "maximal_marginal_relevance requires numpy to be installed. " + "Please install numpy with `pip install numpy`." + ) + + if min(k, len(embedding_list)) <= 0: + return [] + if query_embedding.ndim == 1: + query_embedding = np.expand_dims(query_embedding, axis=0) + similarity_to_query = _cosine_similarity(query_embedding, embedding_list)[0] + most_similar = int(np.argmax(similarity_to_query)) + idxs = [most_similar] + selected = np.array([embedding_list[most_similar]]) + while len(idxs) < min(k, len(embedding_list)): + best_score = -np.inf + idx_to_add = -1 + similarity_to_selected = _cosine_similarity(embedding_list, selected) + for i, query_score in enumerate(similarity_to_query): + if i in idxs: + continue + redundant_score = max(similarity_to_selected[i]) + equation_score = ( + lambda_mult * query_score - (1 - lambda_mult) * redundant_score + ) + if equation_score > best_score: + best_score = equation_score + idx_to_add = i + idxs.append(idx_to_add) + selected = np.append(selected, [embedding_list[idx_to_add]], axis=0) + return idxs diff --git a/libs/core/poetry.lock b/libs/core/poetry.lock index 30e62659c1e..5bb2de9d1d7 100644 --- a/libs/core/poetry.lock +++ b/libs/core/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. 
[[package]] name = "annotated-types" @@ -1197,6 +1197,24 @@ files = [ {file = "jupyterlab_widgets-3.0.11.tar.gz", hash = "sha256:dd5ac679593c969af29c9bed054c24f26842baa51352114736756bc035deee27"}, ] +[[package]] +name = "langchain-standard-tests" +version = "0.1.1" +description = "Standard tests for LangChain implementations" +optional = false +python-versions = ">=3.8.1,<4.0" +files = [] +develop = true + +[package.dependencies] +httpx = "^0.27.0" +langchain-core = ">=0.1.40,<0.3" +pytest = ">=7,<9" + +[package.source] +type = "directory" +url = "../standard-tests" + [[package]] name = "langchain-text-splitters" version = "0.2.2" @@ -2185,7 +2203,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -3004,4 +3021,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "0a40678314005533ead4fefdbfd8bd27b043641ba99c2211409d8039703ed516" +content-hash = 
"8db47de0615d9a5324dc0e28f6110908e9b16ccfee699aeafef21f68c879e62a" diff --git a/libs/core/pyproject.toml b/libs/core/pyproject.toml index 62e270f4013..f735b840249 100644 --- a/libs/core/pyproject.toml +++ b/libs/core/pyproject.toml @@ -85,6 +85,12 @@ pytest-asyncio = "^0.21.1" grandalf = "^0.8" pytest-profiling = "^1.7.0" responses = "^0.25.0" + +[tool.poetry.group.test.dependencies.langchain-standard-tests] +path = "../standard-tests" +develop = true + + [[tool.poetry.group.test.dependencies.numpy]] version = "^1.24.0" python = "<3.12" diff --git a/libs/core/tests/unit_tests/vectorstores/test_in_memory.py b/libs/core/tests/unit_tests/vectorstores/test_in_memory.py new file mode 100644 index 00000000000..057d5321f4e --- /dev/null +++ b/libs/core/tests/unit_tests/vectorstores/test_in_memory.py @@ -0,0 +1,97 @@ +from pathlib import Path + +import pytest +from langchain_standard_tests.integration_tests.vectorstores import ( + AsyncReadWriteTestSuite, + ReadWriteTestSuite, +) + +from langchain_core.documents import Document +from langchain_core.embeddings.fake import DeterministicFakeEmbedding +from langchain_core.vectorstores import InMemoryVectorStore +from tests.unit_tests.stubs import AnyStr + + +class TestInMemoryReadWriteTestSuite(ReadWriteTestSuite): + @pytest.fixture + def vectorstore(self) -> InMemoryVectorStore: + return InMemoryVectorStore(embedding=self.get_embeddings()) + + +class TestAsyncInMemoryReadWriteTestSuite(AsyncReadWriteTestSuite): + @pytest.fixture + async def vectorstore(self) -> InMemoryVectorStore: + return InMemoryVectorStore(embedding=self.get_embeddings()) + + +async def test_inmemory() -> None: + """Test end to end construction and search.""" + store = await InMemoryVectorStore.afrom_texts( + ["foo", "bar", "baz"], DeterministicFakeEmbedding(size=6) + ) + output = await store.asimilarity_search("foo", k=1) + assert output == [Document(page_content="foo", id=AnyStr())] + + output = await store.asimilarity_search("bar", k=2) + assert output 
== [ + Document(page_content="bar", id=AnyStr()), + Document(page_content="baz", id=AnyStr()), + ] + + output2 = await store.asimilarity_search_with_score("bar", k=2) + assert output2[0][1] > output2[1][1] + + +async def test_add_by_ids() -> None: + vectorstore = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=6)) + + # Check sync version + ids1 = vectorstore.add_texts(["foo", "bar", "baz"], ids=["1", "2", "3"]) + assert ids1 == ["1", "2", "3"] + assert sorted(vectorstore.store.keys()) == ["1", "2", "3"] + + ids2 = await vectorstore.aadd_texts(["foo", "bar", "baz"], ids=["4", "5", "6"]) + assert ids2 == ["4", "5", "6"] + assert sorted(vectorstore.store.keys()) == ["1", "2", "3", "4", "5", "6"] + + +async def test_inmemory_mmr() -> None: + texts = ["foo", "foo", "fou", "foy"] + docsearch = await InMemoryVectorStore.afrom_texts( + texts, DeterministicFakeEmbedding(size=6) + ) + # make sure we can request k > docstore size + output = await docsearch.amax_marginal_relevance_search( + "foo", k=10, lambda_mult=0.1 + ) + assert len(output) == len(texts) + assert output[0] == Document(page_content="foo", id=AnyStr()) + assert output[1] == Document(page_content="foy", id=AnyStr()) + + +async def test_inmemory_dump_load(tmp_path: Path) -> None: + """Test that a dumped store can be reloaded and searched.""" + embedding = DeterministicFakeEmbedding(size=6) + store = await InMemoryVectorStore.afrom_texts(["foo", "bar", "baz"], embedding) + output = await store.asimilarity_search("foo", k=1) + + test_file = str(tmp_path / "test.json") + store.dump(test_file) + + loaded_store = InMemoryVectorStore.load(test_file, embedding) + loaded_output = await loaded_store.asimilarity_search("foo", k=1) + + assert output == loaded_output + + +async def test_inmemory_filter() -> None: + """Test similarity search with a metadata filter.""" + store = await InMemoryVectorStore.afrom_texts( + ["foo", "bar"], + DeterministicFakeEmbedding(size=6), + [{"id": 1}, {"id": 2}], + ) + output = await 
store.asimilarity_search( + "baz", filter=lambda doc: doc.metadata["id"] == 1 + ) + assert output == [Document(page_content="foo", metadata={"id": 1}, id=AnyStr())] diff --git a/libs/standard-tests/tests/unit_tests/test_in_memory_vectorstore.py b/libs/standard-tests/tests/unit_tests/test_in_memory_vectorstore.py index d34bf25c388..9b710877707 100644 --- a/libs/standard-tests/tests/unit_tests/test_in_memory_vectorstore.py +++ b/libs/standard-tests/tests/unit_tests/test_in_memory_vectorstore.py @@ -1,18 +1,14 @@ import pytest -from langchain_core.vectorstores import VectorStore +from langchain_core.vectorstores import ( + InMemoryVectorStore, + VectorStore, +) from langchain_standard_tests.integration_tests.vectorstores import ( AsyncReadWriteTestSuite, ReadWriteTestSuite, ) -# We'll need to move this dependency to core -pytest.importorskip("langchain_community") - -from langchain_community.vectorstores.inmemory import ( # type: ignore # noqa - InMemoryVectorStore, -) - class TestInMemoryVectorStore(ReadWriteTestSuite): @pytest.fixture