From fc79b372cb4b83566eca79f88178f0d49d665d15 Mon Sep 17 00:00:00 2001 From: Jesse S Date: Mon, 20 May 2024 18:01:47 -0700 Subject: [PATCH] community[minor]: add aerospike vectorstore integration (#21735) Please let me know if you see any possible areas of improvement. I would very much appreciate your constructive criticism if time allows. **Description:** - Added a aerospike vector store integration that utilizes [Aerospike-Vector-Search](https://aerospike.com/products/vector-database-search-llm/) add-on. - Added both unit tests and integration tests - Added a docker compose file for spinning up a test environment - Added a notebook **Dependencies:** any dependencies required for this change - aerospike-vector-search **Twitter handle:** - No twitter, you can use my GitHub handle or LinkedIn if you'd like Thanks! --------- Co-authored-by: Jesse Schumacher Co-authored-by: Bagatur --- docs/docs/how_to/indexing.ipynb | 2 +- .../integrations/vectorstores/aerospike.ipynb | 706 +++++++++++++++ .../vectorstores/__init__.py | 5 + .../vectorstores/aerospike.py | 598 +++++++++++++ libs/community/poetry.lock | 26 +- libs/community/pyproject.toml | 2 + .../aerospike/aerospike-proximus.yml | 36 + .../docker-compose/aerospike/aerospike.conf | 62 ++ .../aerospike/docker-compose.yml | 23 + .../vectorstores/test_aerospike.py | 838 ++++++++++++++++++ .../unit_tests/vectorstores/test_aerospike.py | 378 ++++++++ .../unit_tests/vectorstores/test_imports.py | 1 + .../vectorstores/test_indexing_docs.py | 1 + .../vectorstores/test_public_api.py | 1 + 14 files changed, 2675 insertions(+), 4 deletions(-) create mode 100644 docs/docs/integrations/vectorstores/aerospike.ipynb create mode 100644 libs/community/langchain_community/vectorstores/aerospike.py create mode 100644 libs/community/tests/integration_tests/vectorstores/docker-compose/aerospike/aerospike-proximus.yml create mode 100644 libs/community/tests/integration_tests/vectorstores/docker-compose/aerospike/aerospike.conf create mode 
100644 libs/community/tests/integration_tests/vectorstores/docker-compose/aerospike/docker-compose.yml create mode 100644 libs/community/tests/integration_tests/vectorstores/test_aerospike.py create mode 100644 libs/community/tests/unit_tests/vectorstores/test_aerospike.py diff --git a/docs/docs/how_to/indexing.ipynb b/docs/docs/how_to/indexing.ipynb index a80dbd12814..48c4401b6e4 100644 --- a/docs/docs/how_to/indexing.ipynb +++ b/docs/docs/how_to/indexing.ipynb @@ -60,7 +60,7 @@ " * document addition by id (`add_documents` method with `ids` argument)\n", " * delete by id (`delete` method with `ids` argument)\n", "\n", - "Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MyScale`, `OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `VDMS`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`, `TencentVectorDB`, `OpenSearchVectorSearch`.\n", + "Compatible Vectorstores: `Aerospike`, `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MyScale`, `OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `VDMS`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`, `TencentVectorDB`, `OpenSearchVectorSearch`.\n", " \n", "## Caution\n", "\n", diff --git a/docs/docs/integrations/vectorstores/aerospike.ipynb b/docs/docs/integrations/vectorstores/aerospike.ipynb new file mode 100644 index 00000000000..9a72250db10 --- /dev/null +++ b/docs/docs/integrations/vectorstores/aerospike.ipynb @@ -0,0 +1,706 @@ +{ + 
"cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Aerospike\n", + "\n", + "[Aerospike Vector Search](https://aerospike.com/docs/vector) (AVS) is an\n", + "extension to the Aerospike Database that enables searches across very large\n", + "datasets stored in Aerospike. This new service lives outside of Aerospike and\n", + "builds an index to perform those searches.\n", + "\n", + "This notebook showcases the functionality of the LangChain Aerospike VectorStore\n", + "integration.\n", + "\n", + "## Install AVS\n", + "\n", + "Before using this notebook, we need to have a running AVS instance. Use one of\n", + "the [available installation methods](https://aerospike.com/docs/vector/install). \n", + "\n", + "When finished, store your AVS instance's IP address and port to use later\n", + "in this demo:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "PROXIMUS_HOST = \"\"\n", + "PROXIMUS_PORT = 5000" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install Dependencies \n", + "The `sentence-transformers` dependency is large. This step could take several minutes to complete." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "!pip install --upgrade --quiet aerospike-vector-search==0.6.1 sentence-transformers langchain" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download Quotes Dataset\n", + "\n", + "We will download a dataset of approximately 100,000 quotes and use a subset of those quotes for semantic search." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-05-10 17:28:17-- https://github.com/aerospike/aerospike-vector-search-examples/raw/7dfab0fccca0852a511c6803aba46578729694b5/quote-semantic-search/container-volumes/quote-search/data/quotes.csv.tgz\n", + "Resolving github.com (github.com)... 140.82.116.4\n", + "Connecting to github.com (github.com)|140.82.116.4|:443... connected.\n", + "HTTP request sent, awaiting response... 302 Found\n", + "Location: https://raw.githubusercontent.com/aerospike/aerospike-vector-search-examples/7dfab0fccca0852a511c6803aba46578729694b5/quote-semantic-search/container-volumes/quote-search/data/quotes.csv.tgz [following]\n", + "--2024-05-10 17:28:17-- https://raw.githubusercontent.com/aerospike/aerospike-vector-search-examples/7dfab0fccca0852a511c6803aba46578729694b5/quote-semantic-search/container-volumes/quote-search/data/quotes.csv.tgz\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 11597643 (11M) [application/octet-stream]\n", + "Saving to: ‘quotes.csv.tgz’\n", + "\n", + "quotes.csv.tgz 100%[===================>] 11.06M 1.94MB/s in 6.1s \n", + "\n", + "2024-05-10 17:28:23 (1.81 MB/s) - ‘quotes.csv.tgz’ saved [11597643/11597643]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://github.com/aerospike/aerospike-vector-search-examples/raw/7dfab0fccca0852a511c6803aba46578729694b5/quote-semantic-search/container-volumes/quote-search/data/quotes.csv.tgz" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the Quotes Into Documents\n", + "\n", + "We will load our quotes dataset using the `CSVLoader` document loader. 
In this case, `lazy_load` returns an iterator to ingest our quotes more efficiently. In this example, we only load 5,000 quotes." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import itertools\n", + "import os\n", + "import tarfile\n", + "\n", + "from langchain_community.document_loaders.csv_loader import CSVLoader\n", + "\n", + "filename = \"./quotes.csv\"\n", + "\n", + "if not os.path.exists(filename) and os.path.exists(filename + \".tgz\"):\n", + " # Untar the file\n", + " with tarfile.open(filename + \".tgz\", \"r:gz\") as tar:\n", + " tar.extractall(path=os.path.dirname(filename))\n", + "\n", + "NUM_QUOTES = 5000\n", + "documents = CSVLoader(filename, metadata_columns=[\"author\", \"category\"]).lazy_load()\n", + "documents = list(\n", + " itertools.islice(documents, NUM_QUOTES)\n", + ") # Allows us to slice an iterator" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content=\"quote: I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.\" metadata={'source': './quotes.csv', 'row': 0, 'author': 'Marilyn Monroe', 'category': 'attributed-no-source, best, life, love, mistakes, out-of-control, truth, worst'}\n" + ] + } + ], + "source": [ + "print(documents[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create your Embedder\n", + "\n", + "In this step, we use HuggingFaceEmbeddings and the \"all-MiniLM-L6-v2\" sentence transformer model to embed our documents so we can perform a vector search." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "60662fc2676a46a2ac48fbf30d9c85fe", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "modules.json: 0%| | 0.00/349 [00:00 Any: + try: + from aerospike_vector_search import Client + except ImportError as e: + raise ImportError( + "Could not import aerospike_vector_search python package. " + "Please install it with `pip install aerospike_vector`." + ) from e + return Client + + +AVST = TypeVar("AVST", bound="Aerospike") + + +class Aerospike(VectorStore): + """`Aerospike` vector store. + + To use, you should have the ``aerospike_vector_search`` python package installed. + """ + + def __init__( + self, + client: Client, + embedding: Union[Embeddings, Callable], + namespace: str, + index_name: Optional[str] = None, + vector_key: str = "_vector", + text_key: str = "_text", + id_key: str = "_id", + set_name: Optional[str] = None, + distance_strategy: Optional[ + Union[DistanceStrategy, VectorDistanceMetric] + ] = DistanceStrategy.EUCLIDEAN_DISTANCE, + ): + """Initialize with Aerospike client. + + Args: + client: Aerospike client. + embedding: Embeddings object or Callable (deprecated) to embed text. + namespace: Namespace to use for storing vectors. This should match + index_name: Name of the index previously created in Aerospike. This + vector_key: Key to use for vector in metadata. This should match the + key used during index creation. + text_key: Key to use for text in metadata. + id_key: Key to use for id in metadata. + set_name: Default set name to use for storing vectors. + distance_strategy: Distance strategy to use for similarity search + This should match the distance strategy used during index creation. + """ + + aerospike = _import_aerospike() + + if not isinstance(embedding, Embeddings): + warnings.warn( + "Passing in `embedding` as a Callable is deprecated. 
Please pass in an" + " Embeddings object instead." + ) + + if not isinstance(client, aerospike): + raise ValueError( + f"client should be an instance of aerospike_vector_search.Client, " + f"got {type(client)}" + ) + + self._client = client + self._embedding = embedding + self._text_key = text_key + self._vector_key = vector_key + self._id_key = id_key + self._index_name = index_name + self._namespace = namespace + self._set_name = set_name + self._distance_strategy = self.convert_distance_strategy(distance_strategy) + + @property + def embeddings(self) -> Optional[Embeddings]: + """Access the query embedding object if available.""" + if isinstance(self._embedding, Embeddings): + return self._embedding + return None + + def _embed_documents(self, texts: Iterable[str]) -> List[List[float]]: + """Embed search docs.""" + if isinstance(self._embedding, Embeddings): + return self._embedding.embed_documents(list(texts)) + return [self._embedding(t) for t in texts] + + def _embed_query(self, text: str) -> List[float]: + """Embed query text.""" + if isinstance(self._embedding, Embeddings): + return self._embedding.embed_query(text) + return self._embedding(text) + + @staticmethod + def convert_distance_strategy( + distance_strategy: Union[VectorDistanceMetric, DistanceStrategy], + ) -> DistanceStrategy: + """ + Convert Aerospikes distance strategy to langchains DistanceStrategy + enum. This is a convenience method to allow users to pass in the same + distance metric used to create the index. 
+ """ + from aerospike_vector_search.types import VectorDistanceMetric + + if isinstance(distance_strategy, DistanceStrategy): + return distance_strategy + + if distance_strategy == VectorDistanceMetric.COSINE: + return DistanceStrategy.COSINE + + if distance_strategy == VectorDistanceMetric.DOT_PRODUCT: + return DistanceStrategy.DOT_PRODUCT + + if distance_strategy == VectorDistanceMetric.SQUARED_EUCLIDEAN: + return DistanceStrategy.EUCLIDEAN_DISTANCE + + raise ValueError( + "Unknown distance strategy, must be cosine, dot_product" ", or euclidean" + ) + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + set_name: Optional[str] = None, + embedding_chunk_size: int = 1000, + index_name: Optional[str] = None, + wait_for_index: bool = True, + **kwargs: Any, + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore. + + + Args: + texts: Iterable of strings to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + ids: Optional list of ids to associate with the texts. + set_name: Optional aerospike set name to add the texts to. + batch_size: Batch size to use when adding the texts to the vectorstore. + embedding_chunk_size: Chunk size to use when embedding the texts. + index_name: Optional aerospike index name used for waiting for index + completion. If not provided, the default index_name will be used. + wait_for_index: If True, wait for the all the texts to be indexed + before returning. Requires index_name to be provided. Defaults + to True. + **kwargs: Additional keyword arguments to pass to the client upsert call. + + Returns: + List of ids from adding the texts into the vectorstore. 
+ + """ + if set_name is None: + set_name = self._set_name + + if index_name is None: + index_name = self._index_name + + if wait_for_index and index_name is None: + raise ValueError("if wait_for_index is True, index_name must be provided") + + texts = list(texts) + ids = ids or [str(uuid.uuid4()) for _ in texts] + + # We need to shallow copy so that we can add the vector and text keys + if metadatas: + metadatas = [m.copy() for m in metadatas] + else: + metadatas = metadatas or [{} for _ in texts] + + for i in range(0, len(texts), embedding_chunk_size): + chunk_texts = texts[i : i + embedding_chunk_size] + chunk_ids = ids[i : i + embedding_chunk_size] + chunk_metadatas = metadatas[i : i + embedding_chunk_size] + embeddings = self._embed_documents(chunk_texts) + + for metadata, embedding, text in zip( + chunk_metadatas, embeddings, chunk_texts + ): + metadata[self._vector_key] = embedding + metadata[self._text_key] = text + + for id, metadata in zip(chunk_ids, chunk_metadatas): + metadata[self._id_key] = id + self._client.upsert( + namespace=self._namespace, + key=id, + set_name=set_name, + record_data=metadata, + **kwargs, + ) + + if wait_for_index: + self._client.wait_for_index_completion( + namespace=self._namespace, + name=index_name, + ) + + return ids + + def delete( + self, + ids: Optional[List[str]] = None, + set_name: Optional[str] = None, + **kwargs: Any, + ) -> Optional[bool]: + """Delete by vector ID or other criteria. + + Args: + ids: List of ids to delete. + **kwargs: Other keyword arguments to pass to client delete call. + + Returns: + Optional[bool]: True if deletion is successful, + False otherwise, None if not implemented. 
+ """ + from aerospike_vector_search import AVSServerError + + if ids: + for id in ids: + try: + self._client.delete( + namespace=self._namespace, + key=id, + set_name=set_name, + **kwargs, + ) + except AVSServerError: + return False + + return True + + def similarity_search_with_score( + self, + query: str, + k: int = 4, + metadata_keys: Optional[List[str]] = None, + index_name: Optional[str] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return aerospike documents most similar to query, along with scores. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + metadata_keys: List of metadata keys to return with the documents. + If None, all metadata keys will be returned. Defaults to None. + index_name: Name of the index to search. Overrides the default + index_name. + kwargs: Additional keyword arguments to pass to the search method. + + Returns: + List of Documents most similar to the query and associated scores. + """ + + return self.similarity_search_by_vector_with_score( + self._embed_query(query), + k=k, + metadata_keys=metadata_keys, + index_name=index_name, + **kwargs, + ) + + def similarity_search_by_vector_with_score( + self, + embedding: List[float], + k: int = 4, + metadata_keys: Optional[List[str]] = None, + index_name: Optional[str] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return aerospike documents most similar to embedding, along with scores. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + metadata_keys: List of metadata keys to return with the documents. + If None, all metadata keys will be returned. Defaults to None. + index_name: Name of the index to search. Overrides the default + index_name. + kwargs: Additional keyword arguments to pass to the client + vector_search method. + + Returns: + List of Documents most similar to the query and associated scores. 
+ + """ + + docs = [] + + if metadata_keys and self._text_key not in metadata_keys: + metadata_keys = [self._text_key] + metadata_keys + + if index_name is None: + index_name = self._index_name + + if index_name is None: + raise ValueError("index_name must be provided") + + results: list[Neighbor] = self._client.vector_search( + index_name=index_name, + namespace=self._namespace, + query=embedding, + limit=k, + field_names=metadata_keys, + **kwargs, + ) + + for result in results: + metadata = result.fields + + if self._text_key in metadata: + text = metadata.pop(self._text_key) + score = result.distance + docs.append((Document(page_content=text, metadata=metadata), score)) + else: + logger.warning( + f"Found document with no `{self._text_key}` key. Skipping." + ) + continue + + return docs + + def similarity_search_by_vector( + self, + embedding: List[float], + k: int = 4, + metadata_keys: Optional[List[str]] = None, + index_name: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs most similar to embedding vector. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + metadata_keys: List of metadata keys to return with the documents. + If None, all metadata keys will be returned. Defaults to None. + index_name: Name of the index to search. Overrides the default + index_name. + kwargs: Additional keyword arguments to pass to the search method. + + + Returns: + List of Documents most similar to the query vector. + """ + return [ + doc + for doc, _ in self.similarity_search_by_vector_with_score( + embedding, + k=k, + metadata_keys=metadata_keys, + index_name=index_name, + **kwargs, + ) + ] + + def similarity_search( + self, + query: str, + k: int = 4, + metadata_keys: Optional[List[str]] = None, + index_name: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + """Return aerospike documents most similar to query. 
+ + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + metadata_keys: List of metadata keys to return with the documents. + If None, all metadata keys will be returned. Defaults to None. + index_name: Optional name of the index to search. Overrides the + default index_name. + + Returns: + List of Documents most similar to the query and score for each + """ + docs_and_scores = self.similarity_search_with_score( + query, k=k, metadata_keys=metadata_keys, index_name=index_name, **kwargs + ) + return [doc for doc, _ in docs_and_scores] + + def _select_relevance_score_fn(self) -> Callable[[float], float]: + """ + The 'correct' relevance function + may differ depending on a few things, including: + - the distance / similarity metric used by the VectorStore + - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) + - embedding dimensionality + - etc. + + 0 is dissimilar, 1 is similar. + + Aerospike's relevance_fn assume euclidean and dot product embeddings are + normalized to unit norm. + """ + if self._distance_strategy == DistanceStrategy.COSINE: + return self._cosine_relevance_score_fn + elif self._distance_strategy == DistanceStrategy.DOT_PRODUCT: + return self._max_inner_product_relevance_score_fn + elif self._distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE: + return self._euclidean_relevance_score_fn + else: + raise ValueError( + "Unknown distance strategy, must be cosine, dot_product" + ", or euclidean" + ) + + @staticmethod + def _cosine_relevance_score_fn(score: float) -> float: + """Aerospike returns cosine distance scores between [0,2] + + 0 is dissimilar, 1 is similar. 
+ """ + return 1 - (score / 2) + + def max_marginal_relevance_search_by_vector( + self, + embedding: List[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + metadata_keys: Optional[List[str]] = None, + index_name: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree of + diversity among the results with 0 corresponding to maximum + diversity and 1 to minimum diversity. Defaults to 0.5. + metadata_keys: List of metadata keys to return with the documents. + If None, all metadata keys will be returned. Defaults to None. + index_name: Optional name of the index to search. Overrides the + default index_name. + Returns: + List of Documents selected by maximal marginal relevance. 
+ """ + + if metadata_keys and self._vector_key not in metadata_keys: + metadata_keys = [self._vector_key] + metadata_keys + + docs = self.similarity_search_by_vector( + embedding, + k=fetch_k, + metadata_keys=metadata_keys, + index_name=index_name, + **kwargs, + ) + mmr_selected = maximal_marginal_relevance( + np.array([embedding], dtype=np.float32), + [doc.metadata[self._vector_key] for doc in docs], + k=k, + lambda_mult=lambda_mult, + ) + + if metadata_keys and self._vector_key in metadata_keys: + for i in mmr_selected: + docs[i].metadata.pop(self._vector_key) + + return [docs[i] for i in mmr_selected] + + def max_marginal_relevance_search( + self, + query: str, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + metadata_keys: Optional[List[str]] = None, + index_name: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + index_name: Name of the index to search. + Returns: + List of Documents selected by maximal marginal relevance. 
+ """ + embedding = self._embed_query(query) + return self.max_marginal_relevance_search_by_vector( + embedding, + k, + fetch_k, + lambda_mult, + metadata_keys=metadata_keys, + index_name=index_name, + **kwargs, + ) + + @classmethod + def from_texts( + cls, + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + client: Client = None, + namespace: str = "test", + index_name: Optional[str] = None, + ids: Optional[List[str]] = None, + embeddings_chunk_size: int = 1000, + client_kwargs: Optional[dict] = None, + **kwargs: Any, + ) -> Aerospike: + """ + This is a user friendly interface that: + 1. Embeds text. + 2. Converts the texts into documents. + 3. Adds the documents to a provided Aerospike index + + This is intended to be a quick way to get started. + + Example: + .. code-block:: python + + from langchain_community.vectorstores import Aerospike + from langchain_openai import OpenAIEmbeddings + from aerospike_vector_search import Client, HostPort + + client = Client(seeds=HostPort(host="localhost", port=5000)) + aerospike = Aerospike.from_texts( + ["foo", "bar", "baz"], + embedder, + client, + "namespace", + index_name="index", + vector_key="vector", + distance_strategy=MODEL_DISTANCE_CALC, + ) + """ + aerospike = cls( + client, + embedding, + namespace, + **kwargs, + ) + + aerospike.add_texts( + texts, + metadatas=metadatas, + ids=ids, + index_name=index_name, + embedding_chunk_size=embeddings_chunk_size, + **(client_kwargs or {}), + ) + return aerospike diff --git a/libs/community/poetry.lock b/libs/community/poetry.lock index b9a063f9296..915ff9e4bd7 100644 --- a/libs/community/poetry.lock +++ b/libs/community/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. 
[[package]] name = "aenum" @@ -12,6 +12,21 @@ files = [ {file = "aenum-3.1.15.tar.gz", hash = "sha256:8cbd76cd18c4f870ff39b24284d3ea028fbe8731a58df3aa581e434c575b9559"}, ] +[[package]] +name = "aerospike-vector-search" +version = "0.6.1" +description = "Aerospike Vector Search Client Library for Python" +optional = true +python-versions = ">3.8" +files = [ + {file = "aerospike-vector-search-0.6.1.tar.gz", hash = "sha256:1d3dcf84221a08434a0b2fb4bbac040b3718a169cdd7e44a725eae2fdbad6a43"}, + {file = "aerospike_vector_search-0.6.1-py3-none-any.whl", hash = "sha256:cc7cc7c829f218c4ee9ccd93ca0ecad7104d81deac236309dcdf87e9c399fd35"}, +] + +[package.dependencies] +grpcio = "*" +protobuf = "*" + [[package]] name = "aiodns" version = "3.1.1" @@ -6691,26 +6706,31 @@ python-versions = ">=3.8" files = [ {file = "PyMuPDF-1.23.26-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:645a05321aecc8c45739f71f0eb574ce33138d19189582ffa5241fea3a8e2549"}, {file = "PyMuPDF-1.23.26-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:2dfc9e010669ae92fade6fb72aaea49ebe3b8dcd7ee4dcbbe50115abcaa4d3fe"}, + {file = "PyMuPDF-1.23.26-cp310-none-manylinux2014_aarch64.whl", hash = "sha256:734ee380b3abd038602be79114194a3cb74ac102b7c943bcb333104575922c50"}, {file = "PyMuPDF-1.23.26-cp310-none-manylinux2014_x86_64.whl", hash = "sha256:b22f8d854f8196ad5b20308c1cebad3d5189ed9f0988acbafa043947ea7e6c55"}, {file = "PyMuPDF-1.23.26-cp310-none-win32.whl", hash = "sha256:cc0f794e3466bc96b5bf79d42fbc1551428751e3fef38ebc10ac70396b676144"}, {file = "PyMuPDF-1.23.26-cp310-none-win_amd64.whl", hash = "sha256:2eb701247d8e685a24e45899d1175f01a3ce5fc792a4431c91fbb68633b29298"}, {file = "PyMuPDF-1.23.26-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:e2804a64bb57da414781e312fb0561f6be67658ad57ed4a73dce008b23fc70a6"}, {file = "PyMuPDF-1.23.26-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:97b40bb22e3056874634617a90e0ed24a5172cf71791b9e25d1d91c6743bc567"}, + {file = 
"PyMuPDF-1.23.26-cp311-none-manylinux2014_aarch64.whl", hash = "sha256:fab8833559bc47ab26ce736f915b8fc1dd37c108049b90396f7cd5e1004d7593"}, {file = "PyMuPDF-1.23.26-cp311-none-manylinux2014_x86_64.whl", hash = "sha256:f25aafd3e7fb9d7761a22acf2b67d704f04cc36d4dc33a3773f0eb3f4ec3606f"}, {file = "PyMuPDF-1.23.26-cp311-none-win32.whl", hash = "sha256:05e672ed3e82caca7ef02a88ace30130b1dd392a1190f03b2b58ffe7aa331400"}, {file = "PyMuPDF-1.23.26-cp311-none-win_amd64.whl", hash = "sha256:92b3c4dd4d0491d495f333be2d41f4e1c155a409bc9d04b5ff29655dccbf4655"}, {file = "PyMuPDF-1.23.26-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:a217689ede18cc6991b4e6a78afee8a440b3075d53b9dec4ba5ef7487d4547e9"}, {file = "PyMuPDF-1.23.26-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:42ad2b819b90ce1947e11b90ec5085889df0a2e3aa0207bc97ecacfc6157cabc"}, + {file = "PyMuPDF-1.23.26-cp312-none-manylinux2014_aarch64.whl", hash = "sha256:99607649f89a02bba7d8ebe96e2410664316adc95e9337f7dfeff6a154f93049"}, {file = "PyMuPDF-1.23.26-cp312-none-manylinux2014_x86_64.whl", hash = "sha256:bb42d4b8407b4de7cb58c28f01449f16f32a6daed88afb41108f1aeb3552bdd4"}, {file = "PyMuPDF-1.23.26-cp312-none-win32.whl", hash = "sha256:c40d044411615e6f0baa7d3d933b3032cf97e168c7fa77d1be8a46008c109aee"}, {file = "PyMuPDF-1.23.26-cp312-none-win_amd64.whl", hash = "sha256:3f876533aa7f9a94bcd9a0225ce72571b7808260903fec1d95c120bc842fb52d"}, {file = "PyMuPDF-1.23.26-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:52df831d46beb9ff494f5fba3e5d069af6d81f49abf6b6e799ee01f4f8fa6799"}, {file = "PyMuPDF-1.23.26-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:0bbb0cf6593e53524f3fc26fb5e6ead17c02c64791caec7c4afe61b677dedf80"}, + {file = "PyMuPDF-1.23.26-cp38-none-manylinux2014_aarch64.whl", hash = "sha256:5ef4360f20015673c20cf59b7e19afc97168795188c584254ed3778cde43ce77"}, {file = "PyMuPDF-1.23.26-cp38-none-manylinux2014_x86_64.whl", hash = "sha256:d7cd88842b2e7f4c71eef4d87c98c35646b80b60e6375392d7ce40e519261f59"}, {file = 
"PyMuPDF-1.23.26-cp38-none-win32.whl", hash = "sha256:6577e2f473625e2d0df5f5a3bf1e4519e94ae749733cc9937994d1b256687bfa"}, {file = "PyMuPDF-1.23.26-cp38-none-win_amd64.whl", hash = "sha256:fbe1a3255b2cd0d769b2da2c4efdd0c0f30d4961a1aac02c0f75cf951b337aa4"}, {file = "PyMuPDF-1.23.26-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:73fce034f2afea886a59ead2d0caedf27e2b2a8558b5da16d0286882e0b1eb82"}, {file = "PyMuPDF-1.23.26-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:b3de8618b7cb5b36db611083840b3bcf09b11a893e2d8262f4e042102c7e65de"}, + {file = "PyMuPDF-1.23.26-cp39-none-manylinux2014_aarch64.whl", hash = "sha256:879e7f5ad35709d8760ab6103c3d5dac8ab8043a856ab3653fd324af7358ee87"}, {file = "PyMuPDF-1.23.26-cp39-none-manylinux2014_x86_64.whl", hash = "sha256:deee96c2fd415ded7b5070d8d5b2c60679aee6ed0e28ac0d2cb998060d835c2c"}, {file = "PyMuPDF-1.23.26-cp39-none-win32.whl", hash = "sha256:9f7f4ef99dd8ac97fb0b852efa3dcbee515798078b6c79a6a13c7b1e7c5d41a4"}, {file = "PyMuPDF-1.23.26-cp39-none-win_amd64.whl", hash = "sha256:ba9a54552c7afb9ec85432c765e2fa9a81413acfaa7d70db7c9b528297749e5b"}, @@ -10079,9 +10099,9 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [extras] cli = ["typer"] -extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "azure-ai-documentintelligence", "azure-identity", "azure-search-documents", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cloudpickle", "cloudpickle", "cohere", "databricks-vectorsearch", "datasets", "dgml-utils", "elasticsearch", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "friendli-client", "geopandas", "gitpython", "google-cloud-documentai", "gql", "gradientai", "hdbcli", "hologres-vector", "html2text", "httpx", "httpx-sse", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "nvidia-riva-client", "oci", "openai", "openapi-pydantic", 
"oracle-ads", "oracledb", "pandas", "pdfminer-six", "pgvector", "praw", "premai", "psychicapi", "py-trello", "pyjwt", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "rdflib", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "tidb-vector", "timescale-vector", "tqdm", "tree-sitter", "tree-sitter-languages", "upstash-redis", "vdms", "xata", "xmltodict"] +extended-testing = ["aerospike-vector-search", "aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "azure-ai-documentintelligence", "azure-identity", "azure-search-documents", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cloudpickle", "cloudpickle", "cohere", "databricks-vectorsearch", "datasets", "dgml-utils", "elasticsearch", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "friendli-client", "geopandas", "gitpython", "google-cloud-documentai", "gql", "gradientai", "hdbcli", "hologres-vector", "html2text", "httpx", "httpx-sse", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "nvidia-riva-client", "oci", "openai", "openapi-pydantic", "oracle-ads", "oracledb", "pandas", "pdfminer-six", "pgvector", "praw", "premai", "psychicapi", "py-trello", "pyjwt", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "rdflib", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "tidb-vector", "timescale-vector", "tqdm", "tree-sitter", "tree-sitter-languages", "upstash-redis", "vdms", "xata", "xmltodict"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "34179305bdc9ea3a20dd788263a3671c0917dace6513e75d0171a24d9e2cb77b" +content-hash = "6fbb50e2a8146f8fc2590c8de1a194c7bbc7dd2cfd3d2fd090247aadc01e63f1" diff --git a/libs/community/pyproject.toml 
b/libs/community/pyproject.toml index b9a306f3c94..64c50283543 100644 --- a/libs/community/pyproject.toml +++ b/libs/community/pyproject.toml @@ -104,6 +104,7 @@ vdms = {version = "^0.0.20", optional = true} httpx-sse = {version = "^0.4.0", optional = true} pyjwt = {version = "^2.8.0", optional = true} oracledb = {version = "^2.2.0", optional = true} +aerospike-vector-search = {version = "^0.6.1", optional = true} [tool.poetry.group.test] optional = true @@ -201,6 +202,7 @@ cli = ["typer"] # Please use new-line on formatting to make it easier to add new packages without # merge-conflicts extended_testing = [ + "aerospike-vector-search", "aleph-alpha-client", "aiosqlite", "assemblyai", diff --git a/libs/community/tests/integration_tests/vectorstores/docker-compose/aerospike/aerospike-proximus.yml b/libs/community/tests/integration_tests/vectorstores/docker-compose/aerospike/aerospike-proximus.yml new file mode 100644 index 00000000000..24870678065 --- /dev/null +++ b/libs/community/tests/integration_tests/vectorstores/docker-compose/aerospike/aerospike-proximus.yml @@ -0,0 +1,36 @@ +cluster: + + # Unique identifier for this cluster. + cluster-name: aerospike-vector + +# The Proximus service listening ports, TLS and network interface. +service: + ports: + 5002: {} + # Uncomment for local debugging + advertised-listeners: + default: + address: 127.0.0.1 + port: 5002 + +# Management API listening ports, TLS and network interface. +manage: + ports: + 5040: {} + +# Intra cluster interconnect listening ports, TLS and network interface. +interconnect: + ports: + 5001: {} + +# Target Aerospike cluster +aerospike: + seeds: + - aerospike: + port: 3000 + +# The logging properties. 
+logging: + enable-console-logging: true + levels: + metrics-ticker: off diff --git a/libs/community/tests/integration_tests/vectorstores/docker-compose/aerospike/aerospike.conf b/libs/community/tests/integration_tests/vectorstores/docker-compose/aerospike/aerospike.conf new file mode 100644 index 00000000000..fba3a7a33e9 --- /dev/null +++ b/libs/community/tests/integration_tests/vectorstores/docker-compose/aerospike/aerospike.conf @@ -0,0 +1,62 @@ +# Aerospike database configuration file for use with systemd. + +service { + cluster-name quote-demo + proto-fd-max 15000 +} + + +logging { + file /var/log/aerospike/aerospike.log { + context any info + } + + # Send log messages to stdout + console { + context any info + context query critical + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode multicast + multicast-group 239.1.99.222 + port 9918 + interval 150 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 1 + nsup-period 60 + + storage-engine device { + file /opt/aerospike/data/test.dat + filesize 1G + } +} + +namespace proximus-meta { + replication-factor 1 + nsup-period 100 + + storage-engine memory { + data-size 1G + } +} + diff --git a/libs/community/tests/integration_tests/vectorstores/docker-compose/aerospike/docker-compose.yml b/libs/community/tests/integration_tests/vectorstores/docker-compose/aerospike/docker-compose.yml new file mode 100644 index 00000000000..ea6642dfc97 --- /dev/null +++ b/libs/community/tests/integration_tests/vectorstores/docker-compose/aerospike/docker-compose.yml @@ -0,0 +1,23 @@ +services: + aerospike: + image: aerospike/aerospike-server-enterprise:7.0.0.2 + ports: + - "3000:3000" + networks: + - aerospike-test + volumes: + - .:/opt/aerospike/etc/aerospike + command: + - "--config-file" + - "/opt/aerospike/etc/aerospike/aerospike.conf" + proximus: + image: aerospike/aerospike-proximus:0.4.0 + ports: + - "5002:5002" + networks: + - 
aerospike-test + volumes: + - .:/etc/aerospike-proximus + +networks: + aerospike-test: {} diff --git a/libs/community/tests/integration_tests/vectorstores/test_aerospike.py b/libs/community/tests/integration_tests/vectorstores/test_aerospike.py new file mode 100644 index 00000000000..4bcbce11fea --- /dev/null +++ b/libs/community/tests/integration_tests/vectorstores/test_aerospike.py @@ -0,0 +1,838 @@ +"""Test Aerospike functionality.""" + +import inspect +import os +import subprocess +import time +from typing import Any, Generator + +import pytest +from langchain_core.documents import Document + +from langchain_community.vectorstores.aerospike import ( + Aerospike, +) +from langchain_community.vectorstores.utils import DistanceStrategy +from tests.integration_tests.vectorstores.fake_embeddings import ( + ConsistentFakeEmbeddings, +) + +pytestmark = pytest.mark.requires("aerospike_vector_search") + +TEST_INDEX_NAME = "test-index" +TEST_NAMESPACE = "test" +TEST_AEROSPIKE_HOST_PORT = ("localhost", 5002) +TEXT_KEY = "_text" +VECTOR_KEY = "_vector" +ID_KEY = "_id" +EUCLIDEAN_SCORE = 1.0 +DIR_PATH = os.path.dirname(os.path.realpath(__file__)) + "/docker-compose/aerospike" +FEAT_KEY_PATH = DIR_PATH + "/features.conf" + + +def compose_up() -> None: + subprocess.run(["docker", "compose", "up", "-d"], cwd=DIR_PATH) + time.sleep(10) + + +def compose_down() -> None: + subprocess.run(["docker", "compose", "down"], cwd=DIR_PATH) + + +@pytest.fixture(scope="class", autouse=True) +def docker_compose() -> Generator[None, None, None]: + try: + import aerospike_vector_search # noqa + except ImportError: + pytest.skip("aerospike_vector_search not installed") + + if not os.path.exists(FEAT_KEY_PATH): + pytest.skip( + "Aerospike feature key file not found at path {}".format(FEAT_KEY_PATH) + ) + + compose_up() + yield + compose_down() + + +@pytest.fixture(scope="class") +def seeds() -> Generator[Any, None, None]: + try: + from aerospike_vector_search.types import HostPort + except 
ImportError: + pytest.skip("aerospike_vector_search not installed") + + yield HostPort( + host=TEST_AEROSPIKE_HOST_PORT[0], + port=TEST_AEROSPIKE_HOST_PORT[1], + ) + + +@pytest.fixture(scope="class") +@pytest.mark.requires("aerospike_vector_search") +def admin_client(seeds: Any) -> Generator[Any, None, None]: + try: + from aerospike_vector_search.admin import Client as AdminClient + except ImportError: + pytest.skip("aerospike_vector_search not installed") + + with AdminClient(seeds=seeds) as admin_client: + yield admin_client + + +@pytest.fixture(scope="class") +@pytest.mark.requires("aerospike_vector_search") +def client(seeds: Any) -> Generator[Any, None, None]: + try: + from aerospike_vector_search import Client + except ImportError: + pytest.skip("aerospike_vector_search not installed") + + with Client(seeds=seeds) as client: + yield client + + +@pytest.fixture +def embedder() -> Any: + return ConsistentFakeEmbeddings() + + +@pytest.fixture +def aerospike( + client: Any, embedder: ConsistentFakeEmbeddings +) -> Generator[Aerospike, None, None]: + yield Aerospike( + client, + embedder, + TEST_NAMESPACE, + vector_key=VECTOR_KEY, + text_key=TEXT_KEY, + id_key=ID_KEY, + ) + + +def get_func_name() -> str: + """ + Used to get the name of the calling function. The name is used for the index + and set name in Aerospike tests for debugging purposes. 
+ """ + return inspect.stack()[1].function + + +""" +TODO: Add tests for delete() +""" + + +class TestAerospike: + def test_from_text( + self, + client: Any, + admin_client: Any, + embedder: ConsistentFakeEmbeddings, + ) -> None: + index_name = set_name = get_func_name() + admin_client.index_create( + namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + ) + aerospike = Aerospike.from_texts( + ["foo", "bar", "baz", "bay", "bax", "baw", "bav"], + embedder, + client=client, + namespace=TEST_NAMESPACE, + index_name=index_name, + ids=["1", "2", "3", "4", "5", "6", "7"], + set_name=set_name, + ) + + expected = [ + Document( + page_content="foo", + metadata={ + ID_KEY: "1", + "_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0], + }, + ), + Document( + page_content="bar", + metadata={ + ID_KEY: "2", + "_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + }, + ), + Document( + page_content="baz", + metadata={ + ID_KEY: "3", + "_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0], + }, + ), + ] + actual = aerospike.search( + "foo", k=3, index_name=index_name, search_type="similarity" + ) + + assert actual == expected + + def test_from_documents( + self, + client: Any, + admin_client: Any, + embedder: ConsistentFakeEmbeddings, + ) -> None: + index_name = set_name = get_func_name() + admin_client.index_create( + namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + ) + documents = [ + Document( + page_content="foo", + metadata={ + ID_KEY: "1", + "_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0], + }, + ), + Document( + page_content="bar", + metadata={ + ID_KEY: "2", + "_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + }, + ), + Document( + page_content="baz", + metadata={ + ID_KEY: "3", + "_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0], + }, + ), + Document( + page_content="bay", + metadata={ + ID_KEY: 
"4", + "_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0], + }, + ), + Document( + page_content="bax", + metadata={ + ID_KEY: "5", + "_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0], + }, + ), + Document( + page_content="baw", + metadata={ + ID_KEY: "6", + "_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 5.0], + }, + ), + Document( + page_content="bav", + metadata={ + ID_KEY: "7", + "_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 6.0], + }, + ), + ] + aerospike = Aerospike.from_documents( + documents, + embedder, + client=client, + namespace=TEST_NAMESPACE, + index_name=index_name, + ids=["1", "2", "3", "4", "5", "6", "7"], + set_name=set_name, + ) + + actual = aerospike.search( + "foo", k=3, index_name=index_name, search_type="similarity" + ) + + expected = documents[:3] + + assert actual == expected + + def test_delete(self, aerospike: Aerospike, admin_client: Any, client: Any) -> None: + """Test end to end construction and search.""" + + index_name = set_name = get_func_name() + admin_client.index_create( + namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + ) + + aerospike.add_texts( + ["foo", "bar", "baz"], + ids=["1", "2", "3"], + index_name=index_name, + set_name=set_name, + ) + + assert client.exists(namespace=TEST_NAMESPACE, set_name=set_name, key="1") + assert client.exists(namespace=TEST_NAMESPACE, set_name=set_name, key="2") + assert client.exists(namespace=TEST_NAMESPACE, set_name=set_name, key="3") + + aerospike.delete(["1", "2", "3"], set_name=set_name) + + assert not client.exists(namespace=TEST_NAMESPACE, set_name=set_name, key="1") + assert not client.exists(namespace=TEST_NAMESPACE, set_name=set_name, key="2") + assert not client.exists(namespace=TEST_NAMESPACE, set_name=set_name, key="3") + + def test_search_blocking(self, aerospike: Aerospike, admin_client: Any) -> None: + """Test end to end construction and search.""" + + index_name = set_name = 
get_func_name() + admin_client.index_create( + namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + ) + + aerospike.add_texts( + ["foo", "bar", "baz"], + ids=["1", "2", "3"], + index_name=index_name, + set_name=set_name, + ) # Blocks until all vectors are indexed + expected = [Document(page_content="foo", metadata={ID_KEY: "1"})] + actual = aerospike.search( + "foo", + k=1, + index_name=index_name, + search_type="similarity", + metadata_keys=[ID_KEY], + ) + + assert actual == expected + + def test_search_nonblocking(self, aerospike: Aerospike, admin_client: Any) -> None: + """Test end to end construction and search.""" + + index_name = set_name = get_func_name() + admin_client.index_create( + namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + ) + + aerospike.add_texts( + ["foo", "bar", "baz"], + ids=["1", "2", "3"], + index_name=index_name, + set_name=set_name, + wait_for_index=True, + ) # blocking + aerospike.add_texts( + ["bay"], index_name=index_name, set_name=set_name, wait_for_index=False + ) + expected = [ + Document(page_content="foo", metadata={ID_KEY: "1"}), + Document(page_content="bar", metadata={ID_KEY: "2"}), + Document(page_content="baz", metadata={ID_KEY: "3"}), + ] + actual = aerospike.search( + "foo", + k=4, + index_name=index_name, + search_type="similarity", + metadata_keys=[ID_KEY], + ) + + # "bay" + assert actual == expected + + def test_similarity_search_with_score( + self, aerospike: Aerospike, admin_client: Any + ) -> None: + """Test end to end construction and search.""" + + expected = [(Document(page_content="foo", metadata={ID_KEY: "1"}), 0.0)] + index_name = set_name = get_func_name() + admin_client.index_create( + namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + ) + aerospike.add_texts( + ["foo", "bar", "baz"], + ids=["1", "2", "3"], + index_name=index_name, + 
set_name=set_name, + ) + actual = aerospike.similarity_search_with_score( + "foo", k=1, index_name=index_name, metadata_keys=[ID_KEY] + ) + + assert actual == expected + + def test_similarity_search_by_vector_with_score( + self, + aerospike: Aerospike, + admin_client: Any, + embedder: ConsistentFakeEmbeddings, + ) -> None: + """Test end to end construction and search.""" + + expected = [ + (Document(page_content="foo", metadata={"a": "b", ID_KEY: "1"}), 0.0) + ] + index_name = set_name = get_func_name() + admin_client.index_create( + namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + ) + aerospike.add_texts( + ["foo", "bar", "baz"], + ids=["1", "2", "3"], + index_name=index_name, + set_name=set_name, + metadatas=[{"a": "b", "1": "2"}, {"a": "c"}, {"a": "d"}], + ) + actual = aerospike.similarity_search_by_vector_with_score( + embedder.embed_query("foo"), + k=1, + index_name=index_name, + metadata_keys=["a", ID_KEY], + ) + + assert actual == expected + + def test_similarity_search_by_vector( + self, + aerospike: Aerospike, + admin_client: Any, + embedder: ConsistentFakeEmbeddings, + ) -> None: + """Test end to end construction and search.""" + + expected = [ + Document(page_content="foo", metadata={"a": "b", ID_KEY: "1"}), + Document(page_content="bar", metadata={"a": "c", ID_KEY: "2"}), + ] + index_name = set_name = get_func_name() + admin_client.index_create( + namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + ) + aerospike.add_texts( + ["foo", "bar", "baz"], + ids=["1", "2", "3"], + index_name=index_name, + set_name=set_name, + metadatas=[{"a": "b", "1": "2"}, {"a": "c"}, {"a": "d"}], + ) + actual = aerospike.similarity_search_by_vector( + embedder.embed_query("foo"), + k=2, + index_name=index_name, + metadata_keys=["a", ID_KEY], + ) + + assert actual == expected + + def test_similarity_search(self, aerospike: Aerospike, admin_client: Any) -> None: + 
"""Test end to end construction and search.""" + + expected = [ + Document(page_content="foo", metadata={ID_KEY: "1"}), + Document(page_content="bar", metadata={ID_KEY: "2"}), + Document(page_content="baz", metadata={ID_KEY: "3"}), + ] + index_name = set_name = get_func_name() + admin_client.index_create( + namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + ) + aerospike.add_texts( + ["foo", "bar", "baz"], + ids=["1", "2", "3"], + index_name=index_name, + set_name=set_name, + ) # blocking + actual = aerospike.similarity_search( + "foo", k=3, index_name=index_name, metadata_keys=[ID_KEY] + ) + + assert actual == expected + + def test_max_marginal_relevance_search_by_vector( + self, + client: Any, + admin_client: Any, + embedder: ConsistentFakeEmbeddings, + ) -> None: + """Test max marginal relevance search.""" + + index_name = set_name = get_func_name() + admin_client.index_create( + namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + ) + aerospike = Aerospike.from_texts( + ["foo", "bar", "baz", "bay", "bax", "baw", "bav"], + embedder, + client=client, + namespace=TEST_NAMESPACE, + index_name=index_name, + ids=["1", "2", "3", "4", "5", "6", "7"], + set_name=set_name, + ) + + mmr_output = aerospike.max_marginal_relevance_search_by_vector( + embedder.embed_query("foo"), index_name=index_name, k=3, fetch_k=3 + ) + sim_output = aerospike.similarity_search("foo", index_name=index_name, k=3) + + assert len(mmr_output) == 3 + assert mmr_output == sim_output + + mmr_output = aerospike.max_marginal_relevance_search_by_vector( + embedder.embed_query("foo"), index_name=index_name, k=2, fetch_k=3 + ) + + assert len(mmr_output) == 2 + assert mmr_output[0].page_content == "foo" + assert mmr_output[1].page_content == "bar" + + mmr_output = aerospike.max_marginal_relevance_search_by_vector( + embedder.embed_query("foo"), + index_name=index_name, + k=2, + fetch_k=3, + 
lambda_mult=0.1, # more diversity + ) + + assert len(mmr_output) == 2 + assert mmr_output[0].page_content == "foo" + assert mmr_output[1].page_content == "baz" + + # if fetch_k < k, then the output will be less than k + mmr_output = aerospike.max_marginal_relevance_search_by_vector( + embedder.embed_query("foo"), index_name=index_name, k=3, fetch_k=2 + ) + assert len(mmr_output) == 2 + + def test_max_marginal_relevance_search( + self, aerospike: Aerospike, admin_client: Any + ) -> None: + """Test max marginal relevance search.""" + + index_name = set_name = get_func_name() + admin_client.index_create( + namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + ) + aerospike.add_texts( + ["foo", "bar", "baz", "bay", "bax", "baw", "bav"], + ids=["1", "2", "3", "4", "5", "6", "7"], + index_name=index_name, + set_name=set_name, + ) + + mmr_output = aerospike.max_marginal_relevance_search( + "foo", index_name=index_name, k=3, fetch_k=3 + ) + sim_output = aerospike.similarity_search("foo", index_name=index_name, k=3) + + assert len(mmr_output) == 3 + assert mmr_output == sim_output + + mmr_output = aerospike.max_marginal_relevance_search( + "foo", index_name=index_name, k=2, fetch_k=3 + ) + + assert len(mmr_output) == 2 + assert mmr_output[0].page_content == "foo" + assert mmr_output[1].page_content == "bar" + + mmr_output = aerospike.max_marginal_relevance_search( + "foo", + index_name=index_name, + k=2, + fetch_k=3, + lambda_mult=0.1, # more diversity + ) + + assert len(mmr_output) == 2 + assert mmr_output[0].page_content == "foo" + assert mmr_output[1].page_content == "baz" + + # if fetch_k < k, then the output will be less than k + mmr_output = aerospike.max_marginal_relevance_search( + "foo", index_name=index_name, k=3, fetch_k=2 + ) + assert len(mmr_output) == 2 + + def test_cosine_distance(self, aerospike: Aerospike, admin_client: Any) -> None: + """Test cosine distance.""" + from aerospike_vector_search import 
types + + index_name = set_name = get_func_name() + admin_client.index_create( + namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + vector_distance_metric=types.VectorDistanceMetric.COSINE, + ) + aerospike.add_texts( + ["foo", "bar", "baz"], + ids=["1", "2", "3"], + index_name=index_name, + set_name=set_name, + ) # blocking + + """ + foo vector = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0] + far vector = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0] + cosine similarity ~= 0.71 + cosine distance ~= 1 - cosine similarity = 0.29 + """ + expected = pytest.approx(0.292, abs=0.002) + output = aerospike.similarity_search_with_score( + "far", index_name=index_name, k=3 + ) + + _, actual_score = output[2] + + assert actual_score == expected + + def test_dot_product_distance( + self, aerospike: Aerospike, admin_client: Any + ) -> None: + """Test dot product distance.""" + from aerospike_vector_search import types + + index_name = set_name = get_func_name() + admin_client.index_create( + namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + vector_distance_metric=types.VectorDistanceMetric.DOT_PRODUCT, + ) + aerospike.add_texts( + ["foo", "bar", "baz"], + ids=["1", "2", "3"], + index_name=index_name, + set_name=set_name, + ) # blocking + + """ + foo vector = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0] + far vector = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0] + dot product = 9.0 + dot product distance = dot product * -1 = -9.0 + """ + expected = -9.0 + output = aerospike.similarity_search_with_score( + "far", index_name=index_name, k=3 + ) + + _, actual_score = output[2] + + assert actual_score == expected + + def test_euclidean_distance(self, aerospike: Aerospike, admin_client: Any) -> None: + """Test dot product distance.""" + from aerospike_vector_search import types + + index_name = set_name = get_func_name() + admin_client.index_create( 
+ namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + vector_distance_metric=types.VectorDistanceMetric.SQUARED_EUCLIDEAN, + ) + aerospike.add_texts( + ["foo", "bar", "baz"], + ids=["1", "2", "3"], + index_name=index_name, + set_name=set_name, + ) # blocking + + """ + foo vector = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0] + far vector = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0] + euclidean distance = 9.0 + """ + expected = 9.0 + output = aerospike.similarity_search_with_score( + "far", index_name=index_name, k=3 + ) + + _, actual_score = output[2] + + assert actual_score == expected + + def test_as_retriever(self, aerospike: Aerospike, admin_client: Any) -> None: + index_name = set_name = get_func_name() + admin_client.index_create( + namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + ) + aerospike.add_texts( + ["foo", "foo", "foo", "foo", "bar"], + ids=["1", "2", "3", "4", "5"], + index_name=index_name, + set_name=set_name, + ) # blocking + + aerospike._index_name = index_name + retriever = aerospike.as_retriever( + search_type="similarity", search_kwargs={"k": 3} + ) + results = retriever.invoke("foo") + assert len(results) == 3 + assert all([d.page_content == "foo" for d in results]) + + def test_as_retriever_distance_threshold( + self, aerospike: Aerospike, admin_client: Any + ) -> None: + from aerospike_vector_search import types + + aerospike._distance_strategy = DistanceStrategy.COSINE + index_name = set_name = get_func_name() + admin_client.index_create( + namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + vector_distance_metric=types.VectorDistanceMetric.COSINE, + ) + aerospike.add_texts( + ["foo1", "foo2", "foo3", "bar4", "bar5", "bar6", "bar7", "bar8"], + ids=["1", "2", "3", "4", "5", "6", "7", "8"], + index_name=index_name, + set_name=set_name, + ) # blocking + + 
aerospike._index_name = index_name + retriever = aerospike.as_retriever( + search_type="similarity_score_threshold", + search_kwargs={"k": 9, "score_threshold": 0.90}, + ) + results = retriever.invoke("foo1") + + assert all([d.page_content.startswith("foo") for d in results]) + assert len(results) == 3 + + def test_as_retriever_add_documents( + self, aerospike: Aerospike, admin_client: Any + ) -> None: + from aerospike_vector_search import types + + aerospike._distance_strategy = DistanceStrategy.COSINE + index_name = set_name = get_func_name() + admin_client.index_create( + namespace=TEST_NAMESPACE, + sets=set_name, + name=index_name, + vector_field=VECTOR_KEY, + dimensions=10, + vector_distance_metric=types.VectorDistanceMetric.COSINE, + ) + retriever = aerospike.as_retriever( + search_type="similarity_score_threshold", + search_kwargs={"k": 9, "score_threshold": 0.90}, + ) + + documents = [ + Document( + page_content="foo1", + metadata={ + "a": 1, + }, + ), + Document( + page_content="foo2", + metadata={ + "a": 2, + }, + ), + Document( + page_content="foo3", + metadata={ + "a": 3, + }, + ), + Document( + page_content="bar4", + metadata={ + "a": 4, + }, + ), + Document( + page_content="bar5", + metadata={ + "a": 5, + }, + ), + Document( + page_content="bar6", + metadata={ + "a": 6, + }, + ), + Document( + page_content="bar7", + metadata={ + "a": 7, + }, + ), + ] + retriever.add_documents( + documents, + ids=["1", "2", "3", "4", "5", "6", "7", "8"], + index_name=index_name, + set_name=set_name, + wait_for_index=True, + ) + + aerospike._index_name = index_name + results = retriever.invoke("foo1") + + assert all([d.page_content.startswith("foo") for d in results]) + assert len(results) == 3 diff --git a/libs/community/tests/unit_tests/vectorstores/test_aerospike.py b/libs/community/tests/unit_tests/vectorstores/test_aerospike.py new file mode 100644 index 00000000000..6ff4bca9958 --- /dev/null +++ b/libs/community/tests/unit_tests/vectorstores/test_aerospike.py @@ 
-0,0 +1,378 @@ +import sys +from typing import Any, Callable, Generator +from unittest.mock import MagicMock, Mock, call + +import pytest +from langchain_core.documents import Document + +from langchain_community.vectorstores.aerospike import Aerospike +from langchain_community.vectorstores.utils import DistanceStrategy +from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings + +pytestmark = pytest.mark.requires("aerospike_vector_search") and pytest.mark.skipif( + sys.version_info < (3, 9), reason="requires python3.9 or higher" +) + + +@pytest.fixture(scope="module") +def client() -> Generator[Any, None, None]: + try: + from aerospike_vector_search import Client + from aerospike_vector_search.types import HostPort + except ImportError: + pytest.skip("aerospike_vector_search not installed") + + client = Client( + seeds=[ + HostPort(host="dummy-host", port=3000), + ], + ) + + yield client + + client.close() + + +@pytest.fixture +def mock_client(mocker: Any) -> None: + try: + from aerospike_vector_search import Client + except ImportError: + pytest.skip("aerospike_vector_search not installed") + + return mocker.MagicMock(Client) + + +def test_aerospike(client: Any) -> None: + """Ensure an error is raised when search with score in hybrid mode + because in this case Elasticsearch does not return any score. 
+ """ + from aerospike_vector_search import AVSError + + query_string = "foo" + embedding = FakeEmbeddings() + + store = Aerospike( + client=client, + embedding=embedding, + text_key="text", + vector_key="vector", + index_name="dummy_index", + namespace="test", + set_name="testset", + distance_strategy=DistanceStrategy.COSINE, + ) + + # TODO: Remove grpc import when aerospike_vector_search wraps grpc errors + with pytest.raises(AVSError): + store.similarity_search_by_vector(embedding.embed_query(query_string)) + + +def test_init_aerospike_distance(client: Any) -> None: + from aerospike_vector_search.types import VectorDistanceMetric + + embedding = FakeEmbeddings() + aerospike = Aerospike( + client=client, + embedding=embedding, + text_key="text", + vector_key="vector", + index_name="dummy_index", + namespace="test", + set_name="testset", + distance_strategy=VectorDistanceMetric.COSINE, + ) + + assert aerospike._distance_strategy == DistanceStrategy.COSINE + + +def test_init_bad_embedding(client: Any) -> None: + def bad_embedding() -> None: + return None + + with pytest.warns( + UserWarning, + match=( + "Passing in `embedding` as a Callable is deprecated. Please pass" + + " in an Embeddings object instead." 
+ ), + ): + Aerospike( + client=client, + embedding=bad_embedding, + text_key="text", + vector_key="vector", + index_name="dummy_index", + namespace="test", + set_name="testset", + distance_strategy=DistanceStrategy.COSINE, + )
+
+
+def test_init_bad_client(client: Any) -> None:
+    """Constructing Aerospike with an object of the wrong client type fails."""
+
+    class BadClient:
+        pass
+
+    # BadClient is local to this function, so its class repr is
+    # "<class '<module>.test_init_bad_client.<locals>.BadClient'>", where the
+    # module part depends on how the test file is imported. The message appears
+    # to embed that repr (note the trailing "'>"), so accept any qualifier
+    # instead of pinning one. The dot in the package name is escaped so it is
+    # matched literally rather than as a regex wildcard.
+    with pytest.raises(
+        ValueError,
+        match=(
+            r"client should be an instance of aerospike_vector_search\.Client,"
+            r" got .*BadClient'>"
+        ),
+    ):
+        Aerospike(
+            client=BadClient(),
+            embedding=FakeEmbeddings(),
+            text_key="text",
+            vector_key="vector",
+            index_name="dummy_index",
+            namespace="test",
+            set_name="testset",
+            distance_strategy=DistanceStrategy.COSINE,
+        )
+
+
+def test_convert_distance_strategy(client: Any) -> None:
+    """AVS distance metrics map to DistanceStrategy; unsupported ones raise."""
+    from aerospike_vector_search.types import VectorDistanceMetric
+
+    aerospike = Aerospike(
+        client=client,
+        embedding=FakeEmbeddings(),
+        text_key="text",
+        vector_key="vector",
+        index_name="dummy_index",
+        namespace="test",
+        set_name="testset",
+        distance_strategy=DistanceStrategy.COSINE,
+    )
+
+    # (AVS metric, expected LangChain strategy) pairs checked by this test.
+    expected_by_metric = [
+        (VectorDistanceMetric.COSINE, DistanceStrategy.COSINE),
+        (VectorDistanceMetric.DOT_PRODUCT, DistanceStrategy.DOT_PRODUCT),
+        (VectorDistanceMetric.SQUARED_EUCLIDEAN, DistanceStrategy.EUCLIDEAN_DISTANCE),
+    ]
+    for metric, expected_strategy in expected_by_metric:
+        assert aerospike.convert_distance_strategy(metric) == expected_strategy
+
+    # HAMMING has no counterpart in the table above and must be rejected.
+    with pytest.raises(ValueError):
+        aerospike.convert_distance_strategy(VectorDistanceMetric.HAMMING)
+
+
+def test_add_texts_wait_for_index_error(client: Any) -> None:
+    """add_texts(wait_for_index=True) without any index_name raises ValueError."""
+    # index_name is deliberately not passed: its absence triggers the error.
+    aerospike = Aerospike(
+        client=client,
+        embedding=FakeEmbeddings(),
+        text_key="text",
+        vector_key="vector",
+        namespace="test",
+        set_name="testset",
+        distance_strategy=DistanceStrategy.COSINE,
+    )
+
+    with pytest.raises(
+        ValueError, match="if wait_for_index is True, index_name must be provided"
+    ):
+        aerospike.add_texts(["foo", "bar"], wait_for_index=True)
+
+
+def test_add_texts_returns_ids(mock_client: MagicMock) -> None:
+    """add_texts returns the given ids, upserts each record and awaits the index.
+
+    Also checks that the per-call ``set_name`` and ``index_name`` are the ones
+    forwarded to the client.
+    """
+    aerospike = Aerospike(
+        client=mock_client,
+        embedding=FakeEmbeddings(),
+        text_key="text",
+        vector_key="vector",
+        namespace="test",
+        set_name="testset",
+        distance_strategy=DistanceStrategy.COSINE,
+    )
+
+    expected = ["0", "1"]
+    actual = aerospike.add_texts(
+        ["foo", "bar"],
+        metadatas=[{"foo": 0}, {"bar": 1}],
+        ids=["0", "1"],
+        set_name="otherset",
+        index_name="dummy_index",
+        wait_for_index=True,
+    )
+
+    assert expected == actual
+    # Each record carries its id, text, vector and metadata flattened into one
+    # dict. NOTE(review): the vectors' last component (0.0 / 1.0) presumably
+    # comes from FakeEmbeddings encoding the text position -- confirm there.
+    mock_client.upsert.assert_has_calls(
+        calls=[
+            call(
+                namespace="test",
+                key="0",
+                set_name="otherset",
+                record_data={
+                    "_id": "0",
+                    "text": "foo",
+                    "vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
+                    "foo": 0,
+                },
+            ),
+            call(
+                namespace="test",
+                key="1",
+                set_name="otherset",
+                record_data={
+                    "_id": "1",
+                    "text": "bar",
+                    "vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+                    "bar": 1,
+                },
+            ),
+        ]
+    )
+    mock_client.wait_for_index_completion.assert_called_once_with(
+        namespace="test",
+        name="dummy_index",
+    )
+
+
+def test_delete_returns_false(mock_client: MagicMock) -> None:
+    """delete() gives a falsy result when the client raises AVSServerError."""
+    from aerospike_vector_search import AVSServerError
+
+    # The inner Mock is the side-effect callable; invoking it raises the error.
+    mock_client.delete.side_effect = Mock(side_effect=AVSServerError(rpc_error=""))
+    aerospike = Aerospike(
+        client=mock_client,
+        embedding=FakeEmbeddings(),
+        text_key="text",
+        vector_key="vector",
+        namespace="test",
+        set_name="testset",
+        distance_strategy=DistanceStrategy.COSINE,
+    )
+
+    assert not aerospike.delete(["foo", "bar"], set_name="testset")
+    # Exactly one client call, for the first key: "bar" is never attempted.
+    mock_client.delete.assert_called_once_with(
+        namespace="test", key="foo", set_name="testset"
+    )
+
+
+def test_similarity_search_by_vector_with_score_missing_index_name(
+    client: Any,
+) -> None:
+    """Searching without an index_name on the store or the call raises."""
+    # index_name is deliberately not passed: its absence triggers the error.
+    aerospike = Aerospike(
+        client=client,
+        embedding=FakeEmbeddings(),
+        text_key="text",
+        vector_key="vector",
+        namespace="test",
+        set_name="testset",
+        distance_strategy=DistanceStrategy.COSINE,
+    )
+
+    with pytest.raises(ValueError, match="index_name must be provided"):
+        aerospike.similarity_search_by_vector_with_score([1.0, 2.0, 3.0])
+
+
+def test_similarity_search_by_vector_with_score_filters_missing_text_key(
+    mock_client: MagicMock,
+) -> None:
+    """Neighbors whose fields lack the text key are left out of the results."""
+    from aerospike_vector_search.types import Neighbor
+
+    text_key = "text"
+    # key2 has no text field and must not appear in the output.
+    mock_client.vector_search.return_value = [
+        Neighbor(key="key1", fields={text_key: 1}, distance=1.0),
+        Neighbor(key="key2", fields={}, distance=0.0),
+        Neighbor(key="key3", fields={text_key: 3}, distance=3.0),
+    ]
+    aerospike = Aerospike(
+        client=mock_client,
+        embedding=FakeEmbeddings(),
+        text_key=text_key,
+        vector_key="vector",
+        index_name="dummy_index",
+        namespace="test",
+        set_name="testset",
+        distance_strategy=DistanceStrategy.COSINE,
+    )
+
+    actual = aerospike.similarity_search_by_vector_with_score(
+        [1.0, 2.0, 3.0], k=10, metadata_keys=["foo"]
+    )
+
+    # The integer text values come back as string page_content.
+    expected = [
+        (Document(page_content="1"), 1.0),
+        (Document(page_content="3"), 3.0),
+    ]
+    # The text key is requested from the client ahead of the metadata keys.
+    mock_client.vector_search.assert_called_once_with(
+        index_name="dummy_index",
+        namespace="test",
+        query=[1.0, 2.0, 3.0],
+        limit=10,
+        field_names=[text_key, "foo"],
+    )
+
+    assert expected == actual
+
+
+def test_similarity_search_by_vector_with_score_overwrite_index_name(
+    mock_client: MagicMock,
+) -> None:
+    """A per-call index_name takes precedence over the constructor's."""
+    mock_client.vector_search.return_value = []
+    aerospike = Aerospike(
+        client=mock_client,
+        embedding=FakeEmbeddings(),
+        text_key="text",
+        vector_key="vector",
+        index_name="dummy_index",
+        namespace="test",
+        set_name="testset",
+        distance_strategy=DistanceStrategy.COSINE,
+    )
+
+    aerospike.similarity_search_by_vector_with_score(
+        [1.0, 2.0, 3.0], index_name="other_index"
+    )
+
+    # With k and metadata_keys left out, the client sees limit=4 and no
+    # field restriction.
+    mock_client.vector_search.assert_called_once_with(
+        index_name="other_index",
+        namespace="test",
+        query=[1.0, 2.0, 3.0],
+        limit=4,
+        field_names=None,
+    )
+
+
+@pytest.mark.parametrize(
+    "distance_strategy,expected_fn",
+    [
+        (DistanceStrategy.COSINE, Aerospike._cosine_relevance_score_fn),
+        (DistanceStrategy.EUCLIDEAN_DISTANCE, Aerospike._euclidean_relevance_score_fn),
+        (DistanceStrategy.DOT_PRODUCT, Aerospike._max_inner_product_relevance_score_fn),
+        (DistanceStrategy.JACCARD, ValueError),
+    ],
+)
+def test_select_relevance_score_fn(
+    client: Any, distance_strategy: DistanceStrategy, expected_fn: Callable
+) -> None:
+    """Each distance strategy selects its relevance function, or raises.
+
+    A table entry of ``ValueError`` in place of a function marks a strategy
+    for which ``_select_relevance_score_fn`` is expected to raise.
+    """
+    store = Aerospike(
+        client=client,
+        embedding=FakeEmbeddings(),
+        text_key="text",
+        vector_key="vector",
+        index_name="dummy_index",
+        namespace="test",
+        set_name="testset",
+        distance_strategy=distance_strategy,
+    )
+
+    # Error case first, so the happy path below stays unindented.
+    if expected_fn is ValueError:
+        with pytest.raises(ValueError):
+            store._select_relevance_score_fn()
+        return
+
+    assert store._select_relevance_score_fn() == expected_fn
diff --git a/libs/community/tests/unit_tests/vectorstores/test_imports.py b/libs/community/tests/unit_tests/vectorstores/test_imports.py index ba3cbdf8f3f..560e857d072 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_imports.py +++ b/libs/community/tests/unit_tests/vectorstores/test_imports.py @@ -4,6 +4,7 @@ from langchain_community import vectorstores from langchain_community.vectorstores import __all__, _module_lookup EXPECTED_ALL = [ + "Aerospike", "AlibabaCloudOpenSearch", "AlibabaCloudOpenSearchSettings", "AnalyticDB", diff --git a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py index 3658f5bf540..79d8f583eb9 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py +++ b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py @@ -46,6 +46,7 @@ def test_compatible_vectorstore_documentation() -> None: # These are mentioned in the indexing.ipynb documentation documented = { + "Aerospike", "AnalyticDB", "AstraDB", "AzureCosmosDBVectorSearch", diff --git 
a/libs/community/tests/unit_tests/vectorstores/test_public_api.py b/libs/community/tests/unit_tests/vectorstores/test_public_api.py index 958f7a85f10..a13cd1e0dc7 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_public_api.py +++ b/libs/community/tests/unit_tests/vectorstores/test_public_api.py @@ -2,6 +2,7 @@ from langchain_community.vectorstores import __all__ as public_api _EXPECTED = [ + "Aerospike", "AlibabaCloudOpenSearch", "AlibabaCloudOpenSearchSettings", "AnalyticDB",