Mirror of https://github.com/hwchase17/langchain.git, synced 2025-06-22 14:49:29 +00:00
community[minor]: add aerospike vectorstore integration (#21735)
Please let me know if you see any possible areas of improvement. I would very much appreciate your constructive criticism if time allows.

**Description:**
- Added an Aerospike vector store integration that utilizes the [Aerospike Vector Search](https://aerospike.com/products/vector-database-search-llm/) add-on.
- Added both unit tests and integration tests.
- Added a docker-compose file for spinning up a test environment.
- Added a notebook.

**Dependencies:**
- aerospike-vector-search

**Twitter handle:**
- No Twitter; you can use my GitHub handle or LinkedIn if you'd like.

Thanks!

---------

Co-authored-by: Jesse Schumacher <jschumacher@aerospike.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in: parent 3587c60396, commit fc79b372cb
@@ -60,7 +60,7 @@
|
||||
" * document addition by id (`add_documents` method with `ids` argument)\n",
|
||||
" * delete by id (`delete` method with `ids` argument)\n",
|
||||
"\n",
|
||||
"Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MyScale`, `OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `VDMS`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`, `TencentVectorDB`, `OpenSearchVectorSearch`.\n",
|
||||
"Compatible Vectorstores: `Aerospike`, `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MyScale`, `OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `VDMS`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`, `TencentVectorDB`, `OpenSearchVectorSearch`.\n",
|
||||
" \n",
|
||||
"## Caution\n",
|
||||
"\n",
|
||||
|
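The two bullet points above are exactly what a vector store must support to be listed as compatible with the indexing API: `add_documents` accepting an `ids` argument and `delete` accepting an `ids` argument. Below is a minimal sketch of those two calls against the new `Aerospike` store; the `client`, `embedder`, namespace, and index name are assumptions borrowed from the notebook added later in this commit.

```python
from langchain_core.documents import Document
from langchain_community.vectorstores import Aerospike

# Assumes an AVS `client`, an `embedder`, and a pre-created index,
# set up as in the notebook below.
store = Aerospike(
    client,
    embedder,
    "test",
    index_name="quote-miniLM-L6-v2",
    vector_key="vector",
)

# Document addition by id (`add_documents` with `ids`) ...
store.add_documents(
    [Document(page_content="quote: Rebellions are built on hope.")],
    ids=["quote-1"],
)

# ... and deletion by id (`delete` with `ids`).
store.delete(ids=["quote-1"])
```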
docs/docs/integrations/vectorstores/aerospike.ipynb (new file, 706 lines)
@@ -0,0 +1,706 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Aerospike\n",
|
||||
"\n",
|
||||
"[Aerospike Vector Search](https://aerospike.com/docs/vector) (AVS) is an\n",
|
||||
"extension to the Aerospike Database that enables searches across very large\n",
|
||||
"datasets stored in Aerospike. This new service lives outside of Aerospike and\n",
|
||||
"builds an index to perform those searches.\n",
|
||||
"\n",
|
||||
"This notebook showcases the functionality of the LangChain Aerospike VectorStore\n",
|
||||
"integration.\n",
|
||||
"\n",
|
||||
"## Install AVS\n",
|
||||
"\n",
|
||||
"Before using this notebook, we need to have a running AVS instance. Use one of\n",
|
||||
"the [available installation methods](https://aerospike.com/docs/vector/install). \n",
|
||||
"\n",
|
||||
"When finished, store your AVS instance's IP address and port to use later\n",
|
||||
"in this demo:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"PROXIMUS_HOST = \"<avs-ip>\"\n",
|
||||
"PROXIMUS_PORT = 5000"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Install Dependencies \n",
|
||||
"The `sentence-transformers` dependency is large. This step could take several minutes to complete."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "shellscript"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install --upgrade --quiet aerospike-vector-search==0.6.1 sentence-transformers langchain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Download Quotes Dataset\n",
|
||||
"\n",
|
||||
"We will download a dataset of approximately 100,000 quotes and use a subset of those quotes for semantic search."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"--2024-05-10 17:28:17-- https://github.com/aerospike/aerospike-vector-search-examples/raw/7dfab0fccca0852a511c6803aba46578729694b5/quote-semantic-search/container-volumes/quote-search/data/quotes.csv.tgz\n",
|
||||
"Resolving github.com (github.com)... 140.82.116.4\n",
|
||||
"Connecting to github.com (github.com)|140.82.116.4|:443... connected.\n",
|
||||
"HTTP request sent, awaiting response... 302 Found\n",
|
||||
"Location: https://raw.githubusercontent.com/aerospike/aerospike-vector-search-examples/7dfab0fccca0852a511c6803aba46578729694b5/quote-semantic-search/container-volumes/quote-search/data/quotes.csv.tgz [following]\n",
|
||||
"--2024-05-10 17:28:17-- https://raw.githubusercontent.com/aerospike/aerospike-vector-search-examples/7dfab0fccca0852a511c6803aba46578729694b5/quote-semantic-search/container-volumes/quote-search/data/quotes.csv.tgz\n",
|
||||
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n",
|
||||
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 11597643 (11M) [application/octet-stream]\n",
|
||||
"Saving to: ‘quotes.csv.tgz’\n",
|
||||
"\n",
|
||||
"quotes.csv.tgz 100%[===================>] 11.06M 1.94MB/s in 6.1s \n",
|
||||
"\n",
|
||||
"2024-05-10 17:28:23 (1.81 MB/s) - ‘quotes.csv.tgz’ saved [11597643/11597643]\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!wget https://github.com/aerospike/aerospike-vector-search-examples/raw/7dfab0fccca0852a511c6803aba46578729694b5/quote-semantic-search/container-volumes/quote-search/data/quotes.csv.tgz"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load the Quotes Into Documents\n",
|
||||
"\n",
|
||||
"We will load our quotes dataset using the `CSVLoader` document loader. In this case, `lazy_load` returns an iterator to ingest our quotes more efficiently. In this example, we only load 5,000 quotes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import itertools\n",
|
||||
"import os\n",
|
||||
"import tarfile\n",
|
||||
"\n",
|
||||
"from langchain_community.document_loaders.csv_loader import CSVLoader\n",
|
||||
"\n",
|
||||
"filename = \"./quotes.csv\"\n",
|
||||
"\n",
|
||||
"if not os.path.exists(filename) and os.path.exists(filename + \".tgz\"):\n",
|
||||
" # Untar the file\n",
|
||||
" with tarfile.open(filename + \".tgz\", \"r:gz\") as tar:\n",
|
||||
" tar.extractall(path=os.path.dirname(filename))\n",
|
||||
"\n",
|
||||
"NUM_QUOTES = 5000\n",
|
||||
"documents = CSVLoader(filename, metadata_columns=[\"author\", \"category\"]).lazy_load()\n",
|
||||
"documents = list(\n",
|
||||
" itertools.islice(documents, NUM_QUOTES)\n",
|
||||
") # Allows us to slice an iterator"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"page_content=\"quote: I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.\" metadata={'source': './quotes.csv', 'row': 0, 'author': 'Marilyn Monroe', 'category': 'attributed-no-source, best, life, love, mistakes, out-of-control, truth, worst'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(documents[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create your Embedder\n",
|
||||
"\n",
|
||||
"In this step, we use HuggingFaceEmbeddings and the \"all-MiniLM-L6-v2\" sentence transformer model to embed our documents so we can perform a vector search."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "60662fc2676a46a2ac48fbf30d9c85fe",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"modules.json: 0%| | 0.00/349 [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "319412217d3944488f135c8bf8bca73b",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"config_sentence_transformers.json: 0%| | 0.00/116 [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "eb020ec2e2f4486294f85c490ef4a387",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"README.md: 0%| | 0.00/10.7k [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "65d248263e4049bea4f6b554640a6aae",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"sentence_bert_config.json: 0%| | 0.00/53.0 [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
||||
" warnings.warn(\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "c6b09a49fbd84c799ea28ace296406e3",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"config.json: 0%| | 0.00/612 [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
||||
" warnings.warn(\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "7e649688c67544d5af6bdd883c47d315",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"model.safetensors: 0%| | 0.00/90.9M [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "de447c7e4df1485ead14efae1faf96d6",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"tokenizer_config.json: 0%| | 0.00/350 [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "83ad1f289cd04f73aafca01a8e68e63b",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "2b612221e29e433cb50a54a6b838f5af",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"tokenizer.json: 0%| | 0.00/466k [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "1f5f0c29c58642478cd665731728dad0",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"special_tokens_map.json: 0%| | 0.00/112 [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "dff1d16a5a6d4d20ac39adb5c9425cf6",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"1_Pooling/config.json: 0%| | 0.00/190 [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from aerospike_vector_search.types import VectorDistanceMetric\n",
|
||||
"from langchain_community.embeddings import HuggingFaceEmbeddings\n",
|
||||
"\n",
|
||||
"MODEL_DIM = 384\n",
|
||||
"MODEL_DISTANCE_CALC = VectorDistanceMetric.COSINE\n",
|
||||
"embedder = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create an Aerospike Index and Embed Documents\n",
|
||||
"\n",
|
||||
"Before we add documents, we need to create an index in the Aerospike Database. In the example below, we use some convenience code that checks to see if the expected index already exists."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"quote-miniLM-L6-v2 does not exist. Creating index\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from aerospike_vector_search import AdminClient, Client, HostPort\n",
|
||||
"from aerospike_vector_search.types import VectorDistanceMetric\n",
|
||||
"from langchain_community.vectorstores import Aerospike\n",
|
||||
"\n",
|
||||
"# Here we are using the AVS host and port you configured earlier\n",
|
||||
"seed = HostPort(host=PROXIMUS_HOST, port=PROXIMUS_PORT)\n",
|
||||
"\n",
|
||||
"# The namespace where our vectors will be stored. This should match a namespace configured in your Aerospike database.\n",
|
||||
"NAMESPACE = \"test\"\n",
|
||||
"\n",
|
||||
"# The name of our new index.\n",
|
||||
"INDEX_NAME = \"quote-miniLM-L6-v2\"\n",
|
||||
"\n",
|
||||
"# AVS needs to know which metadata key contains our vector when creating the index and inserting documents.\n",
|
||||
"VECTOR_KEY = \"vector\"\n",
|
||||
"\n",
|
||||
"client = Client(seeds=seed)\n",
|
||||
"admin_client = AdminClient(\n",
|
||||
" seeds=seed,\n",
|
||||
")\n",
|
||||
"index_exists = False\n",
|
||||
"\n",
|
||||
"# Check if the index already exists. If not, create it\n",
|
||||
"for index in admin_client.index_list():\n",
|
||||
" if index[\"id\"][\"namespace\"] == NAMESPACE and index[\"id\"][\"name\"] == INDEX_NAME:\n",
|
||||
" index_exists = True\n",
|
||||
" print(f\"{INDEX_NAME} already exists. Skipping creation\")\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
"if not index_exists:\n",
|
||||
" print(f\"{INDEX_NAME} does not exist. Creating index\")\n",
|
||||
" admin_client.index_create(\n",
|
||||
" namespace=NAMESPACE,\n",
|
||||
" name=INDEX_NAME,\n",
|
||||
" vector_field=VECTOR_KEY,\n",
|
||||
" vector_distance_metric=MODEL_DISTANCE_CALC,\n",
|
||||
" dimensions=MODEL_DIM,\n",
|
||||
" index_meta_data={\n",
|
||||
" \"model\": \"miniLM-L6-v2\",\n",
|
||||
" \"date\": \"05/04/2024\",\n",
|
||||
" \"dim\": str(MODEL_DIM),\n",
|
||||
" \"distance\": \"cosine\",\n",
|
||||
" },\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"admin_client.close()\n",
|
||||
"\n",
|
||||
"docstore = Aerospike.from_documents(\n",
|
||||
" documents,\n",
|
||||
" embedder,\n",
|
||||
" client=client,\n",
|
||||
" namespace=NAMESPACE,\n",
|
||||
" vector_key=VECTOR_KEY,\n",
|
||||
" index_name=INDEX_NAME,\n",
|
||||
" distance_strategy=MODEL_DISTANCE_CALC,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Search the Documents\n",
|
||||
"Now that we have embedded our vectors, we can use vector search on our quotes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"~~~~ Document 0 ~~~~\n",
|
||||
"auto-generated id: f53589dd-e3e0-4f55-8214-766ca8dc082f\n",
|
||||
"author: Carl Sagan, Cosmos\n",
|
||||
"quote: The Cosmos is all that is or was or ever will be. Our feeblest contemplations of the Cosmos stir us -- there is a tingling in the spine, a catch in the voice, a faint sensation, as if a distant memory, of falling from a height. We know we are approaching the greatest of mysteries.\n",
|
||||
"~~~~~~~~~~~~~~~~~~~~\n",
|
||||
"\n",
|
||||
"~~~~ Document 1 ~~~~\n",
|
||||
"auto-generated id: dde3e5d1-30b7-47b4-aab7-e319d14e1810\n",
|
||||
"author: Elizabeth Gilbert\n",
|
||||
"quote: The love that moves the sun and the other stars.\n",
|
||||
"~~~~~~~~~~~~~~~~~~~~\n",
|
||||
"\n",
|
||||
"~~~~ Document 2 ~~~~\n",
|
||||
"auto-generated id: fd56575b-2091-45e7-91c1-9efff2fe5359\n",
|
||||
"author: Renee Ahdieh, The Rose & the Dagger\n",
|
||||
"quote: From the stars, to the stars.\n",
|
||||
"~~~~~~~~~~~~~~~~~~~~\n",
|
||||
"\n",
|
||||
"~~~~ Document 3 ~~~~\n",
|
||||
"auto-generated id: 8567ed4e-885b-44a7-b993-e0caf422b3c9\n",
|
||||
"author: Dante Alighieri, Paradiso\n",
|
||||
"quote: Love, that moves the sun and the other stars\n",
|
||||
"~~~~~~~~~~~~~~~~~~~~\n",
|
||||
"\n",
|
||||
"~~~~ Document 4 ~~~~\n",
|
||||
"auto-generated id: f868c25e-c54d-48cd-a5a8-14bf402f9ea8\n",
|
||||
"author: Thich Nhat Hanh, Teachings on Love\n",
|
||||
"quote: Through my love for you, I want to express my love for the whole cosmos, the whole of humanity, and all beings. By living with you, I want to learn to love everyone and all species. If I succeed in loving you, I will be able to love everyone and all species on Earth... This is the real message of love.\n",
|
||||
"~~~~~~~~~~~~~~~~~~~~\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query = \"A quote about the beauty of the cosmos\"\n",
|
||||
"docs = docstore.similarity_search(\n",
|
||||
" query, k=5, index_name=INDEX_NAME, metadata_keys=[\"_id\", \"author\"]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def print_documents(docs):\n",
|
||||
" for i, doc in enumerate(docs):\n",
|
||||
" print(\"~~~~ Document\", i, \"~~~~\")\n",
|
||||
" print(\"auto-generated id:\", doc.metadata[\"_id\"])\n",
|
||||
" print(\"author: \", doc.metadata[\"author\"])\n",
|
||||
" print(doc.page_content)\n",
|
||||
" print(\"~~~~~~~~~~~~~~~~~~~~\\n\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print_documents(docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Embedding Additional Quotes as Text\n",
|
||||
"\n",
|
||||
"We can use `add_texts` to add additional quotes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"New IDs\n",
|
||||
"['972846bd-87ae-493b-8ba3-a3d023c03948', '8171122e-cbda-4eb7-a711-6625b120893b', '53b54409-ac19-4d90-b518-d7c40bf5ee5d']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docstore = Aerospike(\n",
|
||||
" client,\n",
|
||||
" embedder,\n",
|
||||
" NAMESPACE,\n",
|
||||
" index_name=INDEX_NAME,\n",
|
||||
" vector_key=VECTOR_KEY,\n",
|
||||
" distance_strategy=MODEL_DISTANCE_CALC,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"ids = docstore.add_texts(\n",
|
||||
" [\n",
|
||||
" \"quote: Rebellions are built on hope.\",\n",
|
||||
" \"quote: Logic is the beginning of wisdom, not the end.\",\n",
|
||||
" \"quote: If wishes were fishes, we’d all cast nets.\",\n",
|
||||
" ],\n",
|
||||
" metadatas=[\n",
|
||||
" {\"author\": \"Jyn Erso, Rogue One\"},\n",
|
||||
" {\"author\": \"Spock, Star Trek\"},\n",
|
||||
" {\"author\": \"Frank Herbert, Dune\"},\n",
|
||||
" ],\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"New IDs\")\n",
|
||||
"print(ids)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Search Documents Using Max Marginal Relevance Search\n",
|
||||
"\n",
|
||||
"We can use max marginal relevance search to find vectors that are similar to our query but dissimilar to each other. In this example, we create a retriever object using `as_retriever`, but this could be done just as easily by calling `docstore.max_marginal_relevance_search` directly. The `lambda_mult` search argument determines the diversity of our query response. 0 corresponds to maximum diversity and 1 to minimum diversity."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"~~~~ Document 0 ~~~~\n",
|
||||
"auto-generated id: 67d5b23f-b2d2-4872-80ad-5834ea08aa64\n",
|
||||
"author: John Grogan, Marley and Me: Life and Love With the World's Worst Dog\n",
|
||||
"quote: Such short little lives our pets have to spend with us, and they spend most of it waiting for us to come home each day. It is amazing how much love and laughter they bring into our lives and even how much closer we become with each other because of them.\n",
|
||||
"~~~~~~~~~~~~~~~~~~~~\n",
|
||||
"\n",
|
||||
"~~~~ Document 1 ~~~~\n",
|
||||
"auto-generated id: a9b28eb0-a21c-45bf-9e60-ab2b80e988d8\n",
|
||||
"author: John Grogan, Marley and Me: Life and Love With the World's Worst Dog\n",
|
||||
"quote: Dogs are great. Bad dogs, if you can really call them that, are perhaps the greatest of them all.\n",
|
||||
"~~~~~~~~~~~~~~~~~~~~\n",
|
||||
"\n",
|
||||
"~~~~ Document 2 ~~~~\n",
|
||||
"auto-generated id: ee7434c8-2551-4651-8a22-58514980fb4a\n",
|
||||
"author: Colleen Houck, Tiger's Curse\n",
|
||||
"quote: He then put both hands on the door on either side of my head and leaned in close, pinning me against it. I trembled like a downy rabbit caught in the clutches of a wolf. The wolf came closer. He bent his head and began nuzzling my cheek. The problem was…I wanted the wolf to devour me.\n",
|
||||
"~~~~~~~~~~~~~~~~~~~~\n",
|
||||
"\n",
|
||||
"~~~~ Document 3 ~~~~\n",
|
||||
"auto-generated id: 9170804c-a155-473b-ab93-8a561dd48f91\n",
|
||||
"author: Ray Bradbury\n",
|
||||
"quote: Stuff your eyes with wonder,\" he said, \"live as if you'd drop dead in ten seconds. See the world. It's more fantastic than any dream made or paid for in factories. Ask no guarantees, ask for no security, there never was such an animal. And if there were, it would be related to the great sloth which hangs upside down in a tree all day every day, sleeping its life away. To hell with that,\" he said, \"shake the tree and knock the great sloth down on his ass.\n",
|
||||
"~~~~~~~~~~~~~~~~~~~~\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query = \"A quote about our favorite four-legged pets\"\n",
|
||||
"retriever = docstore.as_retriever(\n",
|
||||
" search_type=\"mmr\", search_kwargs={\"fetch_k\": 20, \"lambda_mult\": 0.7}\n",
|
||||
")\n",
|
||||
"matched_docs = retriever.invoke(query)\n",
|
||||
"\n",
|
||||
"print_documents(matched_docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Search Documents with a Relevance Threshold\n",
|
||||
"\n",
|
||||
"Another useful feature is a similarity search with a relevance threshold. Generally, we only want results that are most similar to our query but also within some range of proximity. A relevance of 1 is most similar and a relevance of 0 is most dissimilar."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"~~~~ Document 0 ~~~~\n",
|
||||
"auto-generated id: 2c1d6ee1-b742-45ea-bed6-24a1f655c849\n",
|
||||
"author: Roy T. Bennett, The Light in the Heart\n",
|
||||
"quote: Never lose hope. Storms make people stronger and never last forever.\n",
|
||||
"~~~~~~~~~~~~~~~~~~~~\n",
|
||||
"\n",
|
||||
"~~~~ Document 1 ~~~~\n",
|
||||
"auto-generated id: 5962c2cf-ffb5-4e03-9257-bdd630b5c7e9\n",
|
||||
"author: Roy T. Bennett, The Light in the Heart\n",
|
||||
"quote: Difficulties and adversities viciously force all their might on us and cause us to fall apart, but they are necessary elements of individual growth and reveal our true potential. We have got to endure and overcome them, and move forward. Never lose hope. Storms make people stronger and never last forever.\n",
|
||||
"~~~~~~~~~~~~~~~~~~~~\n",
|
||||
"\n",
|
||||
"~~~~ Document 2 ~~~~\n",
|
||||
"auto-generated id: 3bbcc4ca-de89-4196-9a46-190a50bf6c47\n",
|
||||
"author: Vincent van Gogh, The Letters of Vincent van Gogh\n",
|
||||
"quote: There is peace even in the storm\n",
|
||||
"~~~~~~~~~~~~~~~~~~~~\n",
|
||||
"\n",
|
||||
"~~~~ Document 3 ~~~~\n",
|
||||
"auto-generated id: 37d8cf02-fc2f-429d-b2b6-260a05286108\n",
|
||||
"author: Edwin Morgan, A Book of Lives\n",
|
||||
"quote: Valentine WeatherKiss me with rain on your eyelashes,come on, let us sway together,under the trees, and to hell with thunder.\n",
|
||||
"~~~~~~~~~~~~~~~~~~~~\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query = \"A quote about stormy weather\"\n",
|
||||
"retriever = docstore.as_retriever(\n",
|
||||
" search_type=\"similarity_score_threshold\",\n",
|
||||
" search_kwargs={\n",
|
||||
" \"score_threshold\": 0.4\n",
|
||||
" }, # A greater value returns items with more relevance\n",
|
||||
")\n",
|
||||
"matched_docs = retriever.invoke(query)\n",
|
||||
"\n",
|
||||
"print_documents(matched_docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Clean up\n",
|
||||
"\n",
|
||||
"We need to make sure we close our client to release resources and clean up threads."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"client.close()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Ready. Set. Search!\n",
|
||||
"\n",
|
||||
"Now that you are up to speed with Aerospike Vector Search's LangChain integration, you have the power of the Aerospike Database and the LangChain ecosystem at your fingertips. Happy building!"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
@@ -27,6 +27,9 @@ if TYPE_CHECKING:
|
||||
VectorStore,
|
||||
)
|
||||
|
||||
from langchain_community.vectorstores.aerospike import (
|
||||
Aerospike,
|
||||
)
|
||||
from langchain_community.vectorstores.alibabacloud_opensearch import (
|
||||
AlibabaCloudOpenSearch,
|
||||
AlibabaCloudOpenSearchSettings,
|
||||
@@ -292,6 +295,7 @@ if TYPE_CHECKING:
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"Aerospike",
|
||||
"AlibabaCloudOpenSearch",
|
||||
"AlibabaCloudOpenSearchSettings",
|
||||
"AnalyticDB",
|
||||
@@ -389,6 +393,7 @@ __all__ = [
|
||||
]
|
||||
|
||||
_module_lookup = {
|
||||
"Aerospike": "langchain_community.vectorstores.aerospike",
|
||||
"AlibabaCloudOpenSearch": "langchain_community.vectorstores.alibabacloud_opensearch", # noqa: E501
|
||||
"AlibabaCloudOpenSearchSettings": "langchain_community.vectorstores.alibabacloud_opensearch", # noqa: E501
|
||||
"AnalyticDB": "langchain_community.vectorstores.analyticdb",
|
||||
|
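The `_module_lookup` entry added here is what lets `from langchain_community.vectorstores import Aerospike` resolve without importing the module (and its optional dependency) eagerly. Below is a minimal sketch of that lazy-import mechanism, assuming the package routes attribute access through a module-level `__getattr__`; the exact implementation in `langchain_community` may differ slightly.

```python
import importlib
from typing import Any

_module_lookup = {
    "Aerospike": "langchain_community.vectorstores.aerospike",
    # ... other vector stores ...
}


def __getattr__(name: str) -> Any:
    # Import the backing module only when the name is actually requested,
    # so `aerospike_vector_search` is not needed just to import the package.
    if name in _module_lookup:
        module = importlib.import_module(_module_lookup[name])
        return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```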
libs/community/langchain_community/vectorstores/aerospike.py (new file, 598 lines)
@@ -0,0 +1,598 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
import warnings
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
TypeVar,
|
||||
Union,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
|
||||
from langchain_community.vectorstores.utils import (
|
||||
DistanceStrategy,
|
||||
maximal_marginal_relevance,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from aerospike_vector_search import Client
|
||||
from aerospike_vector_search.types import Neighbor, VectorDistanceMetric
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _import_aerospike() -> Any:
|
||||
try:
|
||||
from aerospike_vector_search import Client
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"Could not import aerospike_vector_search python package. "
|
||||
"Please install it with `pip install aerospike-vector-search`."
|
||||
) from e
|
||||
return Client
|
||||
|
||||
|
||||
AVST = TypeVar("AVST", bound="Aerospike")
|
||||
|
||||
|
||||
class Aerospike(VectorStore):
|
||||
"""`Aerospike` vector store.
|
||||
|
||||
To use, you should have the ``aerospike_vector_search`` python package installed.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
client: Client,
|
||||
embedding: Union[Embeddings, Callable],
|
||||
namespace: str,
|
||||
index_name: Optional[str] = None,
|
||||
vector_key: str = "_vector",
|
||||
text_key: str = "_text",
|
||||
id_key: str = "_id",
|
||||
set_name: Optional[str] = None,
|
||||
distance_strategy: Optional[
|
||||
Union[DistanceStrategy, VectorDistanceMetric]
|
||||
] = DistanceStrategy.EUCLIDEAN_DISTANCE,
|
||||
):
|
||||
"""Initialize with Aerospike client.
|
||||
|
||||
Args:
|
||||
client: Aerospike client.
|
||||
embedding: Embeddings object or Callable (deprecated) to embed text.
|
||||
namespace: Namespace to use for storing vectors. This should match the namespace used when creating the index.
|
||||
index_name: Name of the index previously created in Aerospike. This should match the name used when creating the index.
|
||||
vector_key: Key to use for vector in metadata. This should match the
|
||||
key used during index creation.
|
||||
text_key: Key to use for text in metadata.
|
||||
id_key: Key to use for id in metadata.
|
||||
set_name: Default set name to use for storing vectors.
|
||||
distance_strategy: Distance strategy to use for similarity search
|
||||
This should match the distance strategy used during index creation.
|
||||
"""
|
||||
|
||||
aerospike = _import_aerospike()
|
||||
|
||||
if not isinstance(embedding, Embeddings):
|
||||
warnings.warn(
|
||||
"Passing in `embedding` as a Callable is deprecated. Please pass in an"
|
||||
" Embeddings object instead."
|
||||
)
|
||||
|
||||
if not isinstance(client, aerospike):
|
||||
raise ValueError(
|
||||
f"client should be an instance of aerospike_vector_search.Client, "
|
||||
f"got {type(client)}"
|
||||
)
|
||||
|
||||
self._client = client
|
||||
self._embedding = embedding
|
||||
self._text_key = text_key
|
||||
self._vector_key = vector_key
|
||||
self._id_key = id_key
|
||||
self._index_name = index_name
|
||||
self._namespace = namespace
|
||||
self._set_name = set_name
|
||||
self._distance_strategy = self.convert_distance_strategy(distance_strategy)
|
||||
|
||||
@property
|
||||
def embeddings(self) -> Optional[Embeddings]:
|
||||
"""Access the query embedding object if available."""
|
||||
if isinstance(self._embedding, Embeddings):
|
||||
return self._embedding
|
||||
return None
|
||||
|
||||
def _embed_documents(self, texts: Iterable[str]) -> List[List[float]]:
|
||||
"""Embed search docs."""
|
||||
if isinstance(self._embedding, Embeddings):
|
||||
return self._embedding.embed_documents(list(texts))
|
||||
return [self._embedding(t) for t in texts]
|
||||
|
||||
def _embed_query(self, text: str) -> List[float]:
|
||||
"""Embed query text."""
|
||||
if isinstance(self._embedding, Embeddings):
|
||||
return self._embedding.embed_query(text)
|
||||
return self._embedding(text)
|
||||
|
||||
@staticmethod
|
||||
def convert_distance_strategy(
|
||||
distance_strategy: Union[VectorDistanceMetric, DistanceStrategy],
|
||||
) -> DistanceStrategy:
|
||||
"""
|
||||
Convert Aerospike's distance strategy to LangChain's DistanceStrategy
|
||||
enum. This is a convenience method to allow users to pass in the same
|
||||
distance metric used to create the index.
|
||||
"""
|
||||
from aerospike_vector_search.types import VectorDistanceMetric
|
||||
|
||||
if isinstance(distance_strategy, DistanceStrategy):
|
||||
return distance_strategy
|
||||
|
||||
if distance_strategy == VectorDistanceMetric.COSINE:
|
||||
return DistanceStrategy.COSINE
|
||||
|
||||
if distance_strategy == VectorDistanceMetric.DOT_PRODUCT:
|
||||
return DistanceStrategy.DOT_PRODUCT
|
||||
|
||||
if distance_strategy == VectorDistanceMetric.SQUARED_EUCLIDEAN:
|
||||
return DistanceStrategy.EUCLIDEAN_DISTANCE
|
||||
|
||||
raise ValueError(
|
||||
"Unknown distance strategy, must be cosine, dot_product" ", or euclidean"
|
||||
)
|
||||
|
||||
def add_texts(
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
ids: Optional[List[str]] = None,
|
||||
set_name: Optional[str] = None,
|
||||
embedding_chunk_size: int = 1000,
|
||||
index_name: Optional[str] = None,
|
||||
wait_for_index: bool = True,
|
||||
**kwargs: Any,
|
||||
) -> List[str]:
|
||||
"""Run more texts through the embeddings and add to the vectorstore.
|
||||
|
||||
|
||||
Args:
|
||||
texts: Iterable of strings to add to the vectorstore.
|
||||
metadatas: Optional list of metadatas associated with the texts.
|
||||
ids: Optional list of ids to associate with the texts.
|
||||
set_name: Optional aerospike set name to add the texts to.
|
||||
|
||||
embedding_chunk_size: Chunk size to use when embedding the texts.
|
||||
index_name: Optional aerospike index name used for waiting for index
|
||||
completion. If not provided, the default index_name will be used.
|
||||
wait_for_index: If True, wait for all the texts to be indexed
|
||||
before returning. Requires index_name to be provided. Defaults
|
||||
to True.
|
||||
**kwargs: Additional keyword arguments to pass to the client upsert call.
|
||||
|
||||
Returns:
|
||||
List of ids from adding the texts into the vectorstore.
|
||||
|
||||
"""
|
||||
if set_name is None:
|
||||
set_name = self._set_name
|
||||
|
||||
if index_name is None:
|
||||
index_name = self._index_name
|
||||
|
||||
if wait_for_index and index_name is None:
|
||||
raise ValueError("if wait_for_index is True, index_name must be provided")
|
||||
|
||||
texts = list(texts)
|
||||
ids = ids or [str(uuid.uuid4()) for _ in texts]
|
||||
|
||||
# We need to shallow copy so that we can add the vector and text keys
|
||||
if metadatas:
|
||||
metadatas = [m.copy() for m in metadatas]
|
||||
else:
|
||||
metadatas = metadatas or [{} for _ in texts]
|
||||
|
||||
for i in range(0, len(texts), embedding_chunk_size):
|
||||
chunk_texts = texts[i : i + embedding_chunk_size]
|
||||
chunk_ids = ids[i : i + embedding_chunk_size]
|
||||
chunk_metadatas = metadatas[i : i + embedding_chunk_size]
|
||||
embeddings = self._embed_documents(chunk_texts)
|
||||
|
||||
for metadata, embedding, text in zip(
|
||||
chunk_metadatas, embeddings, chunk_texts
|
||||
):
|
||||
metadata[self._vector_key] = embedding
|
||||
metadata[self._text_key] = text
|
||||
|
||||
for id, metadata in zip(chunk_ids, chunk_metadatas):
|
||||
metadata[self._id_key] = id
|
||||
self._client.upsert(
|
||||
namespace=self._namespace,
|
||||
key=id,
|
||||
set_name=set_name,
|
||||
record_data=metadata,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if wait_for_index:
|
||||
self._client.wait_for_index_completion(
|
||||
namespace=self._namespace,
|
||||
name=index_name,
|
||||
)
|
||||
|
||||
return ids
|
||||
|
||||
def delete(
|
||||
self,
|
||||
ids: Optional[List[str]] = None,
|
||||
set_name: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> Optional[bool]:
|
||||
"""Delete by vector ID or other criteria.
|
||||
|
||||
Args:
|
||||
ids: List of ids to delete.
|
||||
**kwargs: Other keyword arguments to pass to client delete call.
|
||||
|
||||
Returns:
|
||||
Optional[bool]: True if deletion is successful,
|
||||
False otherwise, None if not implemented.
|
||||
"""
|
||||
from aerospike_vector_search import AVSServerError
|
||||
|
||||
if ids:
|
||||
for id in ids:
|
||||
try:
|
||||
self._client.delete(
|
||||
namespace=self._namespace,
|
||||
key=id,
|
||||
set_name=set_name,
|
||||
**kwargs,
|
||||
)
|
||||
except AVSServerError:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def similarity_search_with_score(
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
metadata_keys: Optional[List[str]] = None,
|
||||
index_name: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Return aerospike documents most similar to query, along with scores.
|
||||
|
||||
Args:
|
||||
query: Text to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
metadata_keys: List of metadata keys to return with the documents.
|
||||
If None, all metadata keys will be returned. Defaults to None.
|
||||
index_name: Name of the index to search. Overrides the default
|
||||
index_name.
|
||||
kwargs: Additional keyword arguments to pass to the search method.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the query and associated scores.
|
||||
"""
|
||||
|
||||
return self.similarity_search_by_vector_with_score(
|
||||
self._embed_query(query),
|
||||
k=k,
|
||||
metadata_keys=metadata_keys,
|
||||
index_name=index_name,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def similarity_search_by_vector_with_score(
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = 4,
|
||||
metadata_keys: Optional[List[str]] = None,
|
||||
index_name: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Return aerospike documents most similar to embedding, along with scores.
|
||||
|
||||
Args:
|
||||
embedding: Embedding to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
metadata_keys: List of metadata keys to return with the documents.
|
||||
If None, all metadata keys will be returned. Defaults to None.
|
||||
index_name: Name of the index to search. Overrides the default
|
||||
index_name.
|
||||
kwargs: Additional keyword arguments to pass to the client
|
||||
vector_search method.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the query and associated scores.
|
||||
|
||||
"""
|
||||
|
||||
docs = []
|
||||
|
||||
if metadata_keys and self._text_key not in metadata_keys:
|
||||
metadata_keys = [self._text_key] + metadata_keys
|
||||
|
||||
if index_name is None:
|
||||
index_name = self._index_name
|
||||
|
||||
if index_name is None:
|
||||
raise ValueError("index_name must be provided")
|
||||
|
||||
results: list[Neighbor] = self._client.vector_search(
|
||||
index_name=index_name,
|
||||
namespace=self._namespace,
|
||||
query=embedding,
|
||||
limit=k,
|
||||
field_names=metadata_keys,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
for result in results:
|
||||
metadata = result.fields
|
||||
|
||||
if self._text_key in metadata:
|
||||
text = metadata.pop(self._text_key)
|
||||
score = result.distance
|
||||
docs.append((Document(page_content=text, metadata=metadata), score))
|
||||
else:
|
||||
logger.warning(
|
||||
f"Found document with no `{self._text_key}` key. Skipping."
|
||||
)
|
||||
continue
|
||||
|
||||
return docs
|
||||
|
||||
def similarity_search_by_vector(
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = 4,
|
||||
metadata_keys: Optional[List[str]] = None,
|
||||
index_name: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs most similar to embedding vector.
|
||||
|
||||
Args:
|
||||
embedding: Embedding to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
metadata_keys: List of metadata keys to return with the documents.
|
||||
If None, all metadata keys will be returned. Defaults to None.
|
||||
index_name: Name of the index to search. Overrides the default
|
||||
index_name.
|
||||
kwargs: Additional keyword arguments to pass to the search method.
|
||||
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the query vector.
|
||||
"""
|
||||
return [
|
||||
doc
|
||||
for doc, _ in self.similarity_search_by_vector_with_score(
|
||||
embedding,
|
||||
k=k,
|
||||
metadata_keys=metadata_keys,
|
||||
index_name=index_name,
|
||||
**kwargs,
|
||||
)
|
||||
]
|
||||
|
||||
def similarity_search(
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
metadata_keys: Optional[List[str]] = None,
|
||||
index_name: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return aerospike documents most similar to query.
|
||||
|
||||
Args:
|
||||
query: Text to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
metadata_keys: List of metadata keys to return with the documents.
|
||||
If None, all metadata keys will be returned. Defaults to None.
|
||||
index_name: Optional name of the index to search. Overrides the
|
||||
default index_name.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the query.
|
||||
"""
|
||||
docs_and_scores = self.similarity_search_with_score(
|
||||
query, k=k, metadata_keys=metadata_keys, index_name=index_name, **kwargs
|
||||
)
|
||||
return [doc for doc, _ in docs_and_scores]
|
||||
|
||||
def _select_relevance_score_fn(self) -> Callable[[float], float]:
|
||||
"""
|
||||
The 'correct' relevance function
|
||||
may differ depending on a few things, including:
|
||||
- the distance / similarity metric used by the VectorStore
|
||||
- the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
|
||||
- embedding dimensionality
|
||||
- etc.
|
||||
|
||||
0 is dissimilar, 1 is similar.
|
||||
|
||||
Aerospike's relevance functions assume euclidean and dot product embeddings are
|
||||
normalized to unit norm.
|
||||
"""
|
||||
if self._distance_strategy == DistanceStrategy.COSINE:
|
||||
return self._cosine_relevance_score_fn
|
||||
elif self._distance_strategy == DistanceStrategy.DOT_PRODUCT:
|
||||
return self._max_inner_product_relevance_score_fn
|
||||
elif self._distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
|
||||
return self._euclidean_relevance_score_fn
|
||||
else:
|
||||
raise ValueError(
|
||||
"Unknown distance strategy, must be cosine, dot_product"
|
||||
", or euclidean"
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _cosine_relevance_score_fn(score: float) -> float:
|
||||
"""Aerospike returns cosine distance scores between [0,2]
|
||||
|
||||
This maps them to relevance scores in [0, 1], where 0 is dissimilar and 1 is similar.
|
||||
"""
|
||||
return 1 - (score / 2)
|
||||
|
||||
def max_marginal_relevance_search_by_vector(
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = 4,
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
metadata_keys: Optional[List[str]] = None,
|
||||
index_name: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs selected using the maximal marginal relevance.
|
||||
|
||||
Maximal marginal relevance optimizes for similarity to query AND diversity
|
||||
among selected documents.
|
||||
|
||||
Args:
|
||||
embedding: Embedding to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
|
||||
lambda_mult: Number between 0 and 1 that determines the degree of
|
||||
diversity among the results with 0 corresponding to maximum
|
||||
diversity and 1 to minimum diversity. Defaults to 0.5.
|
||||
metadata_keys: List of metadata keys to return with the documents.
|
||||
If None, all metadata keys will be returned. Defaults to None.
|
||||
index_name: Optional name of the index to search. Overrides the
|
||||
default index_name.
|
||||
Returns:
|
||||
List of Documents selected by maximal marginal relevance.
|
||||
"""
|
||||
|
||||
if metadata_keys and self._vector_key not in metadata_keys:
|
||||
metadata_keys = [self._vector_key] + metadata_keys
|
||||
|
||||
docs = self.similarity_search_by_vector(
|
||||
embedding,
|
||||
k=fetch_k,
|
||||
metadata_keys=metadata_keys,
|
||||
index_name=index_name,
|
||||
**kwargs,
|
||||
)
|
||||
mmr_selected = maximal_marginal_relevance(
|
||||
np.array([embedding], dtype=np.float32),
|
||||
[doc.metadata[self._vector_key] for doc in docs],
|
||||
k=k,
|
||||
lambda_mult=lambda_mult,
|
||||
)
|
||||
|
||||
if metadata_keys and self._vector_key in metadata_keys:
|
||||
for i in mmr_selected:
|
||||
docs[i].metadata.pop(self._vector_key)
|
||||
|
||||
return [docs[i] for i in mmr_selected]
|
||||
|
||||
def max_marginal_relevance_search(
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
metadata_keys: Optional[List[str]] = None,
|
||||
index_name: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs selected using the maximal marginal relevance.
|
||||
|
||||
Maximal marginal relevance optimizes for similarity to query AND diversity
|
||||
among selected documents.
|
||||
|
||||
Args:
|
||||
query: Text to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
|
||||
lambda_mult: Number between 0 and 1 that determines the degree
|
||||
of diversity among the results with 0 corresponding
|
||||
to maximum diversity and 1 to minimum diversity.
|
||||
Defaults to 0.5.
|
||||
index_name: Name of the index to search.
|
||||
Returns:
|
||||
List of Documents selected by maximal marginal relevance.
|
||||
"""
|
||||
embedding = self._embed_query(query)
|
||||
return self.max_marginal_relevance_search_by_vector(
|
||||
embedding,
|
||||
k,
|
||||
fetch_k,
|
||||
lambda_mult,
|
||||
metadata_keys=metadata_keys,
|
||||
index_name=index_name,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_texts(
|
||||
cls,
|
||||
texts: List[str],
|
||||
embedding: Embeddings,
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
client: Client = None,
|
||||
namespace: str = "test",
|
||||
index_name: Optional[str] = None,
|
||||
ids: Optional[List[str]] = None,
|
||||
embeddings_chunk_size: int = 1000,
|
||||
client_kwargs: Optional[dict] = None,
|
||||
**kwargs: Any,
|
||||
) -> Aerospike:
|
||||
"""
|
||||
This is a user-friendly interface that:
|
||||
1. Embeds text.
|
||||
2. Converts the texts into documents.
|
||||
3. Adds the documents to a provided Aerospike index
|
||||
|
||||
This is intended to be a quick way to get started.
|
||||
|
||||
Example:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.vectorstores import Aerospike
|
||||
from langchain_openai import OpenAIEmbeddings
|
||||
from aerospike_vector_search import Client, HostPort
from aerospike_vector_search.types import VectorDistanceMetric
|
||||
|
||||
client = Client(seeds=HostPort(host="localhost", port=5000))
embedder = OpenAIEmbeddings()
|
||||
aerospike = Aerospike.from_texts(
|
||||
["foo", "bar", "baz"],
|
||||
embedder,
|
||||
client=client,
|
||||
namespace="namespace",
|
||||
index_name="index",
|
||||
vector_key="vector",
|
||||
distance_strategy=VectorDistanceMetric.COSINE,
|
||||
)
|
||||
"""
|
||||
aerospike = cls(
|
||||
client,
|
||||
embedding,
|
||||
namespace,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
aerospike.add_texts(
|
||||
texts,
|
||||
metadatas=metadatas,
|
||||
ids=ids,
|
||||
index_name=index_name,
|
||||
embedding_chunk_size=embeddings_chunk_size,
|
||||
**(client_kwargs or {}),
|
||||
)
|
||||
return aerospike
|
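One note on the scoring above: `similarity_search_with_score` returns raw AVS distances, and `_select_relevance_score_fn` is what the base class's `similarity_search_with_relevance_scores` (and the `score_threshold` retriever shown in the notebook) uses to map those distances into [0, 1]. A small worked example of the cosine case, using the same formula as `_cosine_relevance_score_fn`:

```python
def cosine_relevance(distance: float) -> float:
    # Mirrors Aerospike._cosine_relevance_score_fn: AVS cosine distances fall in [0, 2].
    return 1 - (distance / 2)


assert cosine_relevance(0.0) == 1.0  # identical direction -> most relevant
assert cosine_relevance(1.0) == 0.5  # orthogonal
assert cosine_relevance(2.0) == 0.0  # opposite direction -> least relevant

# So the notebook's `score_threshold: 0.4` keeps documents whose
# cosine distance is at most 1.2.
```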
libs/community/poetry.lock (generated, 26 lines changed)
@@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aenum"
|
||||
@@ -12,6 +12,21 @@ files = [
|
||||
{file = "aenum-3.1.15.tar.gz", hash = "sha256:8cbd76cd18c4f870ff39b24284d3ea028fbe8731a58df3aa581e434c575b9559"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aerospike-vector-search"
|
||||
version = "0.6.1"
|
||||
description = "Aerospike Vector Search Client Library for Python"
|
||||
optional = true
|
||||
python-versions = ">3.8"
|
||||
files = [
|
||||
{file = "aerospike-vector-search-0.6.1.tar.gz", hash = "sha256:1d3dcf84221a08434a0b2fb4bbac040b3718a169cdd7e44a725eae2fdbad6a43"},
|
||||
{file = "aerospike_vector_search-0.6.1-py3-none-any.whl", hash = "sha256:cc7cc7c829f218c4ee9ccd93ca0ecad7104d81deac236309dcdf87e9c399fd35"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
grpcio = "*"
|
||||
protobuf = "*"
|
||||
|
||||
[[package]]
|
||||
name = "aiodns"
|
||||
version = "3.1.1"
|
||||
@@ -6691,26 +6706,31 @@ python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "PyMuPDF-1.23.26-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:645a05321aecc8c45739f71f0eb574ce33138d19189582ffa5241fea3a8e2549"},
|
||||
{file = "PyMuPDF-1.23.26-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:2dfc9e010669ae92fade6fb72aaea49ebe3b8dcd7ee4dcbbe50115abcaa4d3fe"},
|
||||
{file = "PyMuPDF-1.23.26-cp310-none-manylinux2014_aarch64.whl", hash = "sha256:734ee380b3abd038602be79114194a3cb74ac102b7c943bcb333104575922c50"},
|
||||
{file = "PyMuPDF-1.23.26-cp310-none-manylinux2014_x86_64.whl", hash = "sha256:b22f8d854f8196ad5b20308c1cebad3d5189ed9f0988acbafa043947ea7e6c55"},
|
||||
{file = "PyMuPDF-1.23.26-cp310-none-win32.whl", hash = "sha256:cc0f794e3466bc96b5bf79d42fbc1551428751e3fef38ebc10ac70396b676144"},
|
||||
{file = "PyMuPDF-1.23.26-cp310-none-win_amd64.whl", hash = "sha256:2eb701247d8e685a24e45899d1175f01a3ce5fc792a4431c91fbb68633b29298"},
|
||||
{file = "PyMuPDF-1.23.26-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:e2804a64bb57da414781e312fb0561f6be67658ad57ed4a73dce008b23fc70a6"},
|
||||
{file = "PyMuPDF-1.23.26-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:97b40bb22e3056874634617a90e0ed24a5172cf71791b9e25d1d91c6743bc567"},
|
||||
{file = "PyMuPDF-1.23.26-cp311-none-manylinux2014_aarch64.whl", hash = "sha256:fab8833559bc47ab26ce736f915b8fc1dd37c108049b90396f7cd5e1004d7593"},
|
||||
{file = "PyMuPDF-1.23.26-cp311-none-manylinux2014_x86_64.whl", hash = "sha256:f25aafd3e7fb9d7761a22acf2b67d704f04cc36d4dc33a3773f0eb3f4ec3606f"},
|
||||
{file = "PyMuPDF-1.23.26-cp311-none-win32.whl", hash = "sha256:05e672ed3e82caca7ef02a88ace30130b1dd392a1190f03b2b58ffe7aa331400"},
|
||||
{file = "PyMuPDF-1.23.26-cp311-none-win_amd64.whl", hash = "sha256:92b3c4dd4d0491d495f333be2d41f4e1c155a409bc9d04b5ff29655dccbf4655"},
|
||||
{file = "PyMuPDF-1.23.26-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:a217689ede18cc6991b4e6a78afee8a440b3075d53b9dec4ba5ef7487d4547e9"},
|
||||
{file = "PyMuPDF-1.23.26-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:42ad2b819b90ce1947e11b90ec5085889df0a2e3aa0207bc97ecacfc6157cabc"},
|
||||
{file = "PyMuPDF-1.23.26-cp312-none-manylinux2014_aarch64.whl", hash = "sha256:99607649f89a02bba7d8ebe96e2410664316adc95e9337f7dfeff6a154f93049"},
|
||||
{file = "PyMuPDF-1.23.26-cp312-none-manylinux2014_x86_64.whl", hash = "sha256:bb42d4b8407b4de7cb58c28f01449f16f32a6daed88afb41108f1aeb3552bdd4"},
|
||||
{file = "PyMuPDF-1.23.26-cp312-none-win32.whl", hash = "sha256:c40d044411615e6f0baa7d3d933b3032cf97e168c7fa77d1be8a46008c109aee"},
|
||||
{file = "PyMuPDF-1.23.26-cp312-none-win_amd64.whl", hash = "sha256:3f876533aa7f9a94bcd9a0225ce72571b7808260903fec1d95c120bc842fb52d"},
|
||||
{file = "PyMuPDF-1.23.26-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:52df831d46beb9ff494f5fba3e5d069af6d81f49abf6b6e799ee01f4f8fa6799"},
|
||||
{file = "PyMuPDF-1.23.26-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:0bbb0cf6593e53524f3fc26fb5e6ead17c02c64791caec7c4afe61b677dedf80"},
|
||||
{file = "PyMuPDF-1.23.26-cp38-none-manylinux2014_aarch64.whl", hash = "sha256:5ef4360f20015673c20cf59b7e19afc97168795188c584254ed3778cde43ce77"},
|
||||
{file = "PyMuPDF-1.23.26-cp38-none-manylinux2014_x86_64.whl", hash = "sha256:d7cd88842b2e7f4c71eef4d87c98c35646b80b60e6375392d7ce40e519261f59"},
|
||||
{file = "PyMuPDF-1.23.26-cp38-none-win32.whl", hash = "sha256:6577e2f473625e2d0df5f5a3bf1e4519e94ae749733cc9937994d1b256687bfa"},
|
||||
{file = "PyMuPDF-1.23.26-cp38-none-win_amd64.whl", hash = "sha256:fbe1a3255b2cd0d769b2da2c4efdd0c0f30d4961a1aac02c0f75cf951b337aa4"},
|
||||
{file = "PyMuPDF-1.23.26-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:73fce034f2afea886a59ead2d0caedf27e2b2a8558b5da16d0286882e0b1eb82"},
|
||||
{file = "PyMuPDF-1.23.26-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:b3de8618b7cb5b36db611083840b3bcf09b11a893e2d8262f4e042102c7e65de"},
|
||||
{file = "PyMuPDF-1.23.26-cp39-none-manylinux2014_aarch64.whl", hash = "sha256:879e7f5ad35709d8760ab6103c3d5dac8ab8043a856ab3653fd324af7358ee87"},
|
||||
{file = "PyMuPDF-1.23.26-cp39-none-manylinux2014_x86_64.whl", hash = "sha256:deee96c2fd415ded7b5070d8d5b2c60679aee6ed0e28ac0d2cb998060d835c2c"},
|
||||
{file = "PyMuPDF-1.23.26-cp39-none-win32.whl", hash = "sha256:9f7f4ef99dd8ac97fb0b852efa3dcbee515798078b6c79a6a13c7b1e7c5d41a4"},
|
||||
{file = "PyMuPDF-1.23.26-cp39-none-win_amd64.whl", hash = "sha256:ba9a54552c7afb9ec85432c765e2fa9a81413acfaa7d70db7c9b528297749e5b"},
|
||||
@@ -10079,9 +10099,9 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
|
||||
|
||||
[extras]
|
||||
cli = ["typer"]
|
||||
extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "azure-ai-documentintelligence", "azure-identity", "azure-search-documents", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cloudpickle", "cloudpickle", "cohere", "databricks-vectorsearch", "datasets", "dgml-utils", "elasticsearch", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "friendli-client", "geopandas", "gitpython", "google-cloud-documentai", "gql", "gradientai", "hdbcli", "hologres-vector", "html2text", "httpx", "httpx-sse", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "nvidia-riva-client", "oci", "openai", "openapi-pydantic", "oracle-ads", "oracledb", "pandas", "pdfminer-six", "pgvector", "praw", "premai", "psychicapi", "py-trello", "pyjwt", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "rdflib", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "tidb-vector", "timescale-vector", "tqdm", "tree-sitter", "tree-sitter-languages", "upstash-redis", "vdms", "xata", "xmltodict"]
|
||||
extended-testing = ["aerospike-vector-search", "aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "azure-ai-documentintelligence", "azure-identity", "azure-search-documents", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cloudpickle", "cloudpickle", "cohere", "databricks-vectorsearch", "datasets", "dgml-utils", "elasticsearch", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "friendli-client", "geopandas", "gitpython", "google-cloud-documentai", "gql", "gradientai", "hdbcli", "hologres-vector", "html2text", "httpx", "httpx-sse", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "nvidia-riva-client", "oci", "openai", "openapi-pydantic", "oracle-ads", "oracledb", "pandas", "pdfminer-six", "pgvector", "praw", "premai", "psychicapi", "py-trello", "pyjwt", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "rdflib", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "tidb-vector", "timescale-vector", "tqdm", "tree-sitter", "tree-sitter-languages", "upstash-redis", "vdms", "xata", "xmltodict"]
|
||||
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
content-hash = "34179305bdc9ea3a20dd788263a3671c0917dace6513e75d0171a24d9e2cb77b"
|
||||
content-hash = "6fbb50e2a8146f8fc2590c8de1a194c7bbc7dd2cfd3d2fd090247aadc01e63f1"
|
||||
|
@@ -104,6 +104,7 @@ vdms = {version = "^0.0.20", optional = true}
httpx-sse = {version = "^0.4.0", optional = true}
pyjwt = {version = "^2.8.0", optional = true}
oracledb = {version = "^2.2.0", optional = true}
+aerospike-vector-search = {version = "^0.6.1", optional = true}

[tool.poetry.group.test]
optional = true
@@ -201,6 +202,7 @@ cli = ["typer"]
# Please use new-line on formatting to make it easier to add new packages without
# merge-conflicts
extended_testing = [
+    "aerospike-vector-search",
    "aleph-alpha-client",
    "aiosqlite",
    "assemblyai",
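Note that `aerospike-vector-search` is wired in only as an optional dependency behind the `extended_testing` extra, so the Aerospike tests assume the package is available in the test environment; installing it separately (for example with `pip install aerospike-vector-search`) is an assumption about the reader's setup rather than part of this change.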
@@ -0,0 +1,36 @@
cluster:

  # Unique identifier for this cluster.
  cluster-name: aerospike-vector

# The Proximus service listening ports, TLS and network interface.
service:
  ports:
    5002: {}
  # Uncomment for local debugging
  advertised-listeners:
    default:
      address: 127.0.0.1
      port: 5002

# Management API listening ports, TLS and network interface.
manage:
  ports:
    5040: {}

# Intra cluster interconnect listening ports, TLS and network interface.
interconnect:
  ports:
    5001: {}

# Target Aerospike cluster
aerospike:
  seeds:
    - aerospike:
        port: 3000

# The logging properties.
logging:
  enable-console-logging: true
  levels:
    metrics-ticker: off
@@ -0,0 +1,62 @@
# Aerospike database configuration file for use with systemd.

service {
    cluster-name quote-demo
    proto-fd-max 15000
}


logging {
    file /var/log/aerospike/aerospike.log {
        context any info
    }

    # Send log messages to stdout
    console {
        context any info
        context query critical
    }
}

network {
    service {
        address any
        port 3000
    }

    heartbeat {
        mode multicast
        multicast-group 239.1.99.222
        port 9918
        interval 150
        timeout 10
    }

    fabric {
        port 3001
    }

    info {
        port 3003
    }
}

namespace test {
    replication-factor 1
    nsup-period 60

    storage-engine device {
        file /opt/aerospike/data/test.dat
        filesize 1G
    }
}

namespace proximus-meta {
    replication-factor 1
    nsup-period 100

    storage-engine memory {
        data-size 1G
    }
}
@@ -0,0 +1,23 @@
services:
  aerospike:
    image: aerospike/aerospike-server-enterprise:7.0.0.2
    ports:
      - "3000:3000"
    networks:
      - aerospike-test
    volumes:
      - .:/opt/aerospike/etc/aerospike
    command:
      - "--config-file"
      - "/opt/aerospike/etc/aerospike/aerospike.conf"
  proximus:
    image: aerospike/aerospike-proximus:0.4.0
    ports:
      - "5002:5002"
    networks:
      - aerospike-test
    volumes:
      - .:/etc/aerospike-proximus

networks:
  aerospike-test: {}
@@ -0,0 +1,838 @@
|
||||
"""Test Aerospike functionality."""
|
||||
|
||||
import inspect
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
from typing import Any, Generator
|
||||
|
||||
import pytest
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.vectorstores.aerospike import (
|
||||
Aerospike,
|
||||
)
|
||||
from langchain_community.vectorstores.utils import DistanceStrategy
|
||||
from tests.integration_tests.vectorstores.fake_embeddings import (
|
||||
ConsistentFakeEmbeddings,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.requires("aerospike_vector_search")
|
||||
|
||||
TEST_INDEX_NAME = "test-index"
|
||||
TEST_NAMESPACE = "test"
|
||||
TEST_AEROSPIKE_HOST_PORT = ("localhost", 5002)
|
||||
TEXT_KEY = "_text"
|
||||
VECTOR_KEY = "_vector"
|
||||
ID_KEY = "_id"
|
||||
EUCLIDEAN_SCORE = 1.0
|
||||
DIR_PATH = os.path.dirname(os.path.realpath(__file__)) + "/docker-compose/aerospike"
|
||||
FEAT_KEY_PATH = DIR_PATH + "/features.conf"
|
||||
|
||||
|
||||
def compose_up() -> None:
|
||||
subprocess.run(["docker", "compose", "up", "-d"], cwd=DIR_PATH)
|
||||
time.sleep(10)
|
||||
|
||||
|
||||
def compose_down() -> None:
|
||||
subprocess.run(["docker", "compose", "down"], cwd=DIR_PATH)
|
||||
|
||||
|
||||
@pytest.fixture(scope="class", autouse=True)
|
||||
def docker_compose() -> Generator[None, None, None]:
|
||||
try:
|
||||
import aerospike_vector_search # noqa
|
||||
except ImportError:
|
||||
pytest.skip("aerospike_vector_search not installed")
|
||||
|
||||
if not os.path.exists(FEAT_KEY_PATH):
|
||||
pytest.skip(
|
||||
"Aerospike feature key file not found at path {}".format(FEAT_KEY_PATH)
|
||||
)
|
||||
|
||||
compose_up()
|
||||
yield
|
||||
compose_down()
|
||||
|
||||
|
||||
@pytest.fixture(scope="class")
|
||||
def seeds() -> Generator[Any, None, None]:
|
||||
try:
|
||||
from aerospike_vector_search.types import HostPort
|
||||
except ImportError:
|
||||
pytest.skip("aerospike_vector_search not installed")
|
||||
|
||||
yield HostPort(
|
||||
host=TEST_AEROSPIKE_HOST_PORT[0],
|
||||
port=TEST_AEROSPIKE_HOST_PORT[1],
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="class")
|
||||
@pytest.mark.requires("aerospike_vector_search")
|
||||
def admin_client(seeds: Any) -> Generator[Any, None, None]:
|
||||
try:
|
||||
from aerospike_vector_search.admin import Client as AdminClient
|
||||
except ImportError:
|
||||
pytest.skip("aerospike_vector_search not installed")
|
||||
|
||||
with AdminClient(seeds=seeds) as admin_client:
|
||||
yield admin_client
|
||||
|
||||
|
||||
@pytest.fixture(scope="class")
|
||||
@pytest.mark.requires("aerospike_vector_search")
|
||||
def client(seeds: Any) -> Generator[Any, None, None]:
|
||||
try:
|
||||
from aerospike_vector_search import Client
|
||||
except ImportError:
|
||||
pytest.skip("aerospike_vector_search not installed")
|
||||
|
||||
with Client(seeds=seeds) as client:
|
||||
yield client
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def embedder() -> Any:
|
||||
return ConsistentFakeEmbeddings()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def aerospike(
|
||||
client: Any, embedder: ConsistentFakeEmbeddings
|
||||
) -> Generator[Aerospike, None, None]:
|
||||
yield Aerospike(
|
||||
client,
|
||||
embedder,
|
||||
TEST_NAMESPACE,
|
||||
vector_key=VECTOR_KEY,
|
||||
text_key=TEXT_KEY,
|
||||
id_key=ID_KEY,
|
||||
)
|
||||
|
||||
|
||||
def get_func_name() -> str:
|
||||
"""
|
||||
Used to get the name of the calling function. The name is used for the index
|
||||
and set name in Aerospike tests for debugging purposes.
|
||||
"""
|
||||
return inspect.stack()[1].function
|
||||
|
||||
|
||||
"""
|
||||
TODO: Add tests for delete()
|
||||
"""
|
||||
|
||||
|
||||
class TestAerospike:
|
||||
def test_from_text(
|
||||
self,
|
||||
client: Any,
|
||||
admin_client: Any,
|
||||
embedder: ConsistentFakeEmbeddings,
|
||||
) -> None:
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
)
|
||||
aerospike = Aerospike.from_texts(
|
||||
["foo", "bar", "baz", "bay", "bax", "baw", "bav"],
|
||||
embedder,
|
||||
client=client,
|
||||
namespace=TEST_NAMESPACE,
|
||||
index_name=index_name,
|
||||
ids=["1", "2", "3", "4", "5", "6", "7"],
|
||||
set_name=set_name,
|
||||
)
|
||||
|
||||
expected = [
|
||||
Document(
|
||||
page_content="foo",
|
||||
metadata={
|
||||
ID_KEY: "1",
|
||||
"_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="bar",
|
||||
metadata={
|
||||
ID_KEY: "2",
|
||||
"_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="baz",
|
||||
metadata={
|
||||
ID_KEY: "3",
|
||||
"_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0],
|
||||
},
|
||||
),
|
||||
]
|
||||
actual = aerospike.search(
|
||||
"foo", k=3, index_name=index_name, search_type="similarity"
|
||||
)
|
||||
|
||||
assert actual == expected
|
||||
|
||||
def test_from_documents(
|
||||
self,
|
||||
client: Any,
|
||||
admin_client: Any,
|
||||
embedder: ConsistentFakeEmbeddings,
|
||||
) -> None:
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
)
|
||||
documents = [
|
||||
Document(
|
||||
page_content="foo",
|
||||
metadata={
|
||||
ID_KEY: "1",
|
||||
"_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="bar",
|
||||
metadata={
|
||||
ID_KEY: "2",
|
||||
"_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="baz",
|
||||
metadata={
|
||||
ID_KEY: "3",
|
||||
"_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0],
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="bay",
|
||||
metadata={
|
||||
ID_KEY: "4",
|
||||
"_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0],
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="bax",
|
||||
metadata={
|
||||
ID_KEY: "5",
|
||||
"_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0],
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="baw",
|
||||
metadata={
|
||||
ID_KEY: "6",
|
||||
"_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 5.0],
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="bav",
|
||||
metadata={
|
||||
ID_KEY: "7",
|
||||
"_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 6.0],
|
||||
},
|
||||
),
|
||||
]
|
||||
aerospike = Aerospike.from_documents(
|
||||
documents,
|
||||
embedder,
|
||||
client=client,
|
||||
namespace=TEST_NAMESPACE,
|
||||
index_name=index_name,
|
||||
ids=["1", "2", "3", "4", "5", "6", "7"],
|
||||
set_name=set_name,
|
||||
)
|
||||
|
||||
actual = aerospike.search(
|
||||
"foo", k=3, index_name=index_name, search_type="similarity"
|
||||
)
|
||||
|
||||
expected = documents[:3]
|
||||
|
||||
assert actual == expected
|
||||
|
||||
def test_delete(self, aerospike: Aerospike, admin_client: Any, client: Any) -> None:
|
||||
"""Test end to end construction and search."""
|
||||
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
)
|
||||
|
||||
aerospike.add_texts(
|
||||
["foo", "bar", "baz"],
|
||||
ids=["1", "2", "3"],
|
||||
index_name=index_name,
|
||||
set_name=set_name,
|
||||
)
|
||||
|
||||
assert client.exists(namespace=TEST_NAMESPACE, set_name=set_name, key="1")
|
||||
assert client.exists(namespace=TEST_NAMESPACE, set_name=set_name, key="2")
|
||||
assert client.exists(namespace=TEST_NAMESPACE, set_name=set_name, key="3")
|
||||
|
||||
aerospike.delete(["1", "2", "3"], set_name=set_name)
|
||||
|
||||
assert not client.exists(namespace=TEST_NAMESPACE, set_name=set_name, key="1")
|
||||
assert not client.exists(namespace=TEST_NAMESPACE, set_name=set_name, key="2")
|
||||
assert not client.exists(namespace=TEST_NAMESPACE, set_name=set_name, key="3")
|
||||
|
||||
def test_search_blocking(self, aerospike: Aerospike, admin_client: Any) -> None:
|
||||
"""Test end to end construction and search."""
|
||||
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
)
|
||||
|
||||
aerospike.add_texts(
|
||||
["foo", "bar", "baz"],
|
||||
ids=["1", "2", "3"],
|
||||
index_name=index_name,
|
||||
set_name=set_name,
|
||||
) # Blocks until all vectors are indexed
|
||||
expected = [Document(page_content="foo", metadata={ID_KEY: "1"})]
|
||||
actual = aerospike.search(
|
||||
"foo",
|
||||
k=1,
|
||||
index_name=index_name,
|
||||
search_type="similarity",
|
||||
metadata_keys=[ID_KEY],
|
||||
)
|
||||
|
||||
assert actual == expected
|
||||
|
||||
def test_search_nonblocking(self, aerospike: Aerospike, admin_client: Any) -> None:
|
||||
"""Test end to end construction and search."""
|
||||
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
)
|
||||
|
||||
aerospike.add_texts(
|
||||
["foo", "bar", "baz"],
|
||||
ids=["1", "2", "3"],
|
||||
index_name=index_name,
|
||||
set_name=set_name,
|
||||
wait_for_index=True,
|
||||
) # blocking
|
||||
aerospike.add_texts(
|
||||
["bay"], index_name=index_name, set_name=set_name, wait_for_index=False
|
||||
)
|
||||
expected = [
|
||||
Document(page_content="foo", metadata={ID_KEY: "1"}),
|
||||
Document(page_content="bar", metadata={ID_KEY: "2"}),
|
||||
Document(page_content="baz", metadata={ID_KEY: "3"}),
|
||||
]
|
||||
actual = aerospike.search(
|
||||
"foo",
|
||||
k=4,
|
||||
index_name=index_name,
|
||||
search_type="similarity",
|
||||
metadata_keys=[ID_KEY],
|
||||
)
|
||||
|
||||
# "bay"
|
||||
assert actual == expected
|
||||
|
||||
def test_similarity_search_with_score(
|
||||
self, aerospike: Aerospike, admin_client: Any
|
||||
) -> None:
|
||||
"""Test end to end construction and search."""
|
||||
|
||||
expected = [(Document(page_content="foo", metadata={ID_KEY: "1"}), 0.0)]
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
)
|
||||
aerospike.add_texts(
|
||||
["foo", "bar", "baz"],
|
||||
ids=["1", "2", "3"],
|
||||
index_name=index_name,
|
||||
set_name=set_name,
|
||||
)
|
||||
actual = aerospike.similarity_search_with_score(
|
||||
"foo", k=1, index_name=index_name, metadata_keys=[ID_KEY]
|
||||
)
|
||||
|
||||
assert actual == expected
|
||||
|
||||
def test_similarity_search_by_vector_with_score(
|
||||
self,
|
||||
aerospike: Aerospike,
|
||||
admin_client: Any,
|
||||
embedder: ConsistentFakeEmbeddings,
|
||||
) -> None:
|
||||
"""Test end to end construction and search."""
|
||||
|
||||
expected = [
|
||||
(Document(page_content="foo", metadata={"a": "b", ID_KEY: "1"}), 0.0)
|
||||
]
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
)
|
||||
aerospike.add_texts(
|
||||
["foo", "bar", "baz"],
|
||||
ids=["1", "2", "3"],
|
||||
index_name=index_name,
|
||||
set_name=set_name,
|
||||
metadatas=[{"a": "b", "1": "2"}, {"a": "c"}, {"a": "d"}],
|
||||
)
|
||||
actual = aerospike.similarity_search_by_vector_with_score(
|
||||
embedder.embed_query("foo"),
|
||||
k=1,
|
||||
index_name=index_name,
|
||||
metadata_keys=["a", ID_KEY],
|
||||
)
|
||||
|
||||
assert actual == expected
|
||||
|
||||
def test_similarity_search_by_vector(
|
||||
self,
|
||||
aerospike: Aerospike,
|
||||
admin_client: Any,
|
||||
embedder: ConsistentFakeEmbeddings,
|
||||
) -> None:
|
||||
"""Test end to end construction and search."""
|
||||
|
||||
expected = [
|
||||
Document(page_content="foo", metadata={"a": "b", ID_KEY: "1"}),
|
||||
Document(page_content="bar", metadata={"a": "c", ID_KEY: "2"}),
|
||||
]
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
)
|
||||
aerospike.add_texts(
|
||||
["foo", "bar", "baz"],
|
||||
ids=["1", "2", "3"],
|
||||
index_name=index_name,
|
||||
set_name=set_name,
|
||||
metadatas=[{"a": "b", "1": "2"}, {"a": "c"}, {"a": "d"}],
|
||||
)
|
||||
actual = aerospike.similarity_search_by_vector(
|
||||
embedder.embed_query("foo"),
|
||||
k=2,
|
||||
index_name=index_name,
|
||||
metadata_keys=["a", ID_KEY],
|
||||
)
|
||||
|
||||
assert actual == expected
|
||||
|
||||
def test_similarity_search(self, aerospike: Aerospike, admin_client: Any) -> None:
|
||||
"""Test end to end construction and search."""
|
||||
|
||||
expected = [
|
||||
Document(page_content="foo", metadata={ID_KEY: "1"}),
|
||||
Document(page_content="bar", metadata={ID_KEY: "2"}),
|
||||
Document(page_content="baz", metadata={ID_KEY: "3"}),
|
||||
]
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
)
|
||||
aerospike.add_texts(
|
||||
["foo", "bar", "baz"],
|
||||
ids=["1", "2", "3"],
|
||||
index_name=index_name,
|
||||
set_name=set_name,
|
||||
) # blocking
|
||||
actual = aerospike.similarity_search(
|
||||
"foo", k=3, index_name=index_name, metadata_keys=[ID_KEY]
|
||||
)
|
||||
|
||||
assert actual == expected
|
||||
|
||||
def test_max_marginal_relevance_search_by_vector(
|
||||
self,
|
||||
client: Any,
|
||||
admin_client: Any,
|
||||
embedder: ConsistentFakeEmbeddings,
|
||||
) -> None:
|
||||
"""Test max marginal relevance search."""
|
||||
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
)
|
||||
aerospike = Aerospike.from_texts(
|
||||
["foo", "bar", "baz", "bay", "bax", "baw", "bav"],
|
||||
embedder,
|
||||
client=client,
|
||||
namespace=TEST_NAMESPACE,
|
||||
index_name=index_name,
|
||||
ids=["1", "2", "3", "4", "5", "6", "7"],
|
||||
set_name=set_name,
|
||||
)
|
||||
|
||||
mmr_output = aerospike.max_marginal_relevance_search_by_vector(
|
||||
embedder.embed_query("foo"), index_name=index_name, k=3, fetch_k=3
|
||||
)
|
||||
sim_output = aerospike.similarity_search("foo", index_name=index_name, k=3)
|
||||
|
||||
assert len(mmr_output) == 3
|
||||
assert mmr_output == sim_output
|
||||
|
||||
mmr_output = aerospike.max_marginal_relevance_search_by_vector(
|
||||
embedder.embed_query("foo"), index_name=index_name, k=2, fetch_k=3
|
||||
)
|
||||
|
||||
assert len(mmr_output) == 2
|
||||
assert mmr_output[0].page_content == "foo"
|
||||
assert mmr_output[1].page_content == "bar"
|
||||
|
||||
mmr_output = aerospike.max_marginal_relevance_search_by_vector(
|
||||
embedder.embed_query("foo"),
|
||||
index_name=index_name,
|
||||
k=2,
|
||||
fetch_k=3,
|
||||
lambda_mult=0.1, # more diversity
|
||||
)
|
||||
|
||||
assert len(mmr_output) == 2
|
||||
assert mmr_output[0].page_content == "foo"
|
||||
assert mmr_output[1].page_content == "baz"
|
||||
|
||||
# if fetch_k < k, then the output will be less than k
|
||||
mmr_output = aerospike.max_marginal_relevance_search_by_vector(
|
||||
embedder.embed_query("foo"), index_name=index_name, k=3, fetch_k=2
|
||||
)
|
||||
assert len(mmr_output) == 2
|
||||
|
||||
def test_max_marginal_relevance_search(
|
||||
self, aerospike: Aerospike, admin_client: Any
|
||||
) -> None:
|
||||
"""Test max marginal relevance search."""
|
||||
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
)
|
||||
aerospike.add_texts(
|
||||
["foo", "bar", "baz", "bay", "bax", "baw", "bav"],
|
||||
ids=["1", "2", "3", "4", "5", "6", "7"],
|
||||
index_name=index_name,
|
||||
set_name=set_name,
|
||||
)
|
||||
|
||||
mmr_output = aerospike.max_marginal_relevance_search(
|
||||
"foo", index_name=index_name, k=3, fetch_k=3
|
||||
)
|
||||
sim_output = aerospike.similarity_search("foo", index_name=index_name, k=3)
|
||||
|
||||
assert len(mmr_output) == 3
|
||||
assert mmr_output == sim_output
|
||||
|
||||
mmr_output = aerospike.max_marginal_relevance_search(
|
||||
"foo", index_name=index_name, k=2, fetch_k=3
|
||||
)
|
||||
|
||||
assert len(mmr_output) == 2
|
||||
assert mmr_output[0].page_content == "foo"
|
||||
assert mmr_output[1].page_content == "bar"
|
||||
|
||||
mmr_output = aerospike.max_marginal_relevance_search(
|
||||
"foo",
|
||||
index_name=index_name,
|
||||
k=2,
|
||||
fetch_k=3,
|
||||
lambda_mult=0.1, # more diversity
|
||||
)
|
||||
|
||||
assert len(mmr_output) == 2
|
||||
assert mmr_output[0].page_content == "foo"
|
||||
assert mmr_output[1].page_content == "baz"
|
||||
|
||||
# if fetch_k < k, then the output will be less than k
|
||||
mmr_output = aerospike.max_marginal_relevance_search(
|
||||
"foo", index_name=index_name, k=3, fetch_k=2
|
||||
)
|
||||
assert len(mmr_output) == 2
|
||||
|
||||
def test_cosine_distance(self, aerospike: Aerospike, admin_client: Any) -> None:
|
||||
"""Test cosine distance."""
|
||||
from aerospike_vector_search import types
|
||||
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
vector_distance_metric=types.VectorDistanceMetric.COSINE,
|
||||
)
|
||||
aerospike.add_texts(
|
||||
["foo", "bar", "baz"],
|
||||
ids=["1", "2", "3"],
|
||||
index_name=index_name,
|
||||
set_name=set_name,
|
||||
) # blocking
|
||||
|
||||
"""
|
||||
foo vector = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0]
|
||||
far vector = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0]
|
||||
cosine similarity ~= 0.71
|
||||
cosine distance ~= 1 - cosine similarity = 0.29
|
||||
"""
|
||||
expected = pytest.approx(0.292, abs=0.002)
|
||||
output = aerospike.similarity_search_with_score(
|
||||
"far", index_name=index_name, k=3
|
||||
)
|
||||
|
||||
_, actual_score = output[2]
|
||||
|
||||
assert actual_score == expected
|
||||
|
||||
def test_dot_product_distance(
|
||||
self, aerospike: Aerospike, admin_client: Any
|
||||
) -> None:
|
||||
"""Test dot product distance."""
|
||||
from aerospike_vector_search import types
|
||||
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
vector_distance_metric=types.VectorDistanceMetric.DOT_PRODUCT,
|
||||
)
|
||||
aerospike.add_texts(
|
||||
["foo", "bar", "baz"],
|
||||
ids=["1", "2", "3"],
|
||||
index_name=index_name,
|
||||
set_name=set_name,
|
||||
) # blocking
|
||||
|
||||
"""
|
||||
foo vector = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0]
|
||||
far vector = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0]
|
||||
dot product = 9.0
|
||||
dot product distance = dot product * -1 = -9.0
|
||||
"""
|
||||
expected = -9.0
|
||||
output = aerospike.similarity_search_with_score(
|
||||
"far", index_name=index_name, k=3
|
||||
)
|
||||
|
||||
_, actual_score = output[2]
|
||||
|
||||
assert actual_score == expected
|
||||
|
||||
def test_euclidean_distance(self, aerospike: Aerospike, admin_client: Any) -> None:
|
||||
"""Test dot product distance."""
|
||||
from aerospike_vector_search import types
|
||||
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
vector_distance_metric=types.VectorDistanceMetric.SQUARED_EUCLIDEAN,
|
||||
)
|
||||
aerospike.add_texts(
|
||||
["foo", "bar", "baz"],
|
||||
ids=["1", "2", "3"],
|
||||
index_name=index_name,
|
||||
set_name=set_name,
|
||||
) # blocking
|
||||
|
||||
"""
|
||||
foo vector = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0]
|
||||
far vector = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0]
|
||||
        squared euclidean distance = 9.0
|
||||
"""
|
||||
expected = 9.0
|
||||
output = aerospike.similarity_search_with_score(
|
||||
"far", index_name=index_name, k=3
|
||||
)
|
||||
|
||||
_, actual_score = output[2]
|
||||
|
||||
assert actual_score == expected
|
||||
|
||||
def test_as_retriever(self, aerospike: Aerospike, admin_client: Any) -> None:
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
)
|
||||
aerospike.add_texts(
|
||||
["foo", "foo", "foo", "foo", "bar"],
|
||||
ids=["1", "2", "3", "4", "5"],
|
||||
index_name=index_name,
|
||||
set_name=set_name,
|
||||
) # blocking
|
||||
|
||||
aerospike._index_name = index_name
|
||||
retriever = aerospike.as_retriever(
|
||||
search_type="similarity", search_kwargs={"k": 3}
|
||||
)
|
||||
results = retriever.invoke("foo")
|
||||
assert len(results) == 3
|
||||
assert all([d.page_content == "foo" for d in results])
|
||||
|
||||
def test_as_retriever_distance_threshold(
|
||||
self, aerospike: Aerospike, admin_client: Any
|
||||
) -> None:
|
||||
from aerospike_vector_search import types
|
||||
|
||||
aerospike._distance_strategy = DistanceStrategy.COSINE
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
vector_distance_metric=types.VectorDistanceMetric.COSINE,
|
||||
)
|
||||
aerospike.add_texts(
|
||||
["foo1", "foo2", "foo3", "bar4", "bar5", "bar6", "bar7", "bar8"],
|
||||
ids=["1", "2", "3", "4", "5", "6", "7", "8"],
|
||||
index_name=index_name,
|
||||
set_name=set_name,
|
||||
) # blocking
|
||||
|
||||
aerospike._index_name = index_name
|
||||
retriever = aerospike.as_retriever(
|
||||
search_type="similarity_score_threshold",
|
||||
search_kwargs={"k": 9, "score_threshold": 0.90},
|
||||
)
|
||||
results = retriever.invoke("foo1")
|
||||
|
||||
assert all([d.page_content.startswith("foo") for d in results])
|
||||
assert len(results) == 3
|
||||
|
||||
def test_as_retriever_add_documents(
|
||||
self, aerospike: Aerospike, admin_client: Any
|
||||
) -> None:
|
||||
from aerospike_vector_search import types
|
||||
|
||||
aerospike._distance_strategy = DistanceStrategy.COSINE
|
||||
index_name = set_name = get_func_name()
|
||||
admin_client.index_create(
|
||||
namespace=TEST_NAMESPACE,
|
||||
sets=set_name,
|
||||
name=index_name,
|
||||
vector_field=VECTOR_KEY,
|
||||
dimensions=10,
|
||||
vector_distance_metric=types.VectorDistanceMetric.COSINE,
|
||||
)
|
||||
retriever = aerospike.as_retriever(
|
||||
search_type="similarity_score_threshold",
|
||||
search_kwargs={"k": 9, "score_threshold": 0.90},
|
||||
)
|
||||
|
||||
documents = [
|
||||
Document(
|
||||
page_content="foo1",
|
||||
metadata={
|
||||
"a": 1,
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="foo2",
|
||||
metadata={
|
||||
"a": 2,
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="foo3",
|
||||
metadata={
|
||||
"a": 3,
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="bar4",
|
||||
metadata={
|
||||
"a": 4,
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="bar5",
|
||||
metadata={
|
||||
"a": 5,
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="bar6",
|
||||
metadata={
|
||||
"a": 6,
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="bar7",
|
||||
metadata={
|
||||
"a": 7,
|
||||
},
|
||||
),
|
||||
]
|
||||
retriever.add_documents(
|
||||
documents,
|
||||
ids=["1", "2", "3", "4", "5", "6", "7", "8"],
|
||||
index_name=index_name,
|
||||
set_name=set_name,
|
||||
wait_for_index=True,
|
||||
)
|
||||
|
||||
aerospike._index_name = index_name
|
||||
results = retriever.invoke("foo1")
|
||||
|
||||
assert all([d.page_content.startswith("foo") for d in results])
|
||||
assert len(results) == 3
|
libs/community/tests/unit_tests/vectorstores/test_aerospike.py (new file, 378 lines)
@@ -0,0 +1,378 @@
|
||||
import sys
|
||||
from typing import Any, Callable, Generator
|
||||
from unittest.mock import MagicMock, Mock, call
|
||||
|
||||
import pytest
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.vectorstores.aerospike import Aerospike
|
||||
from langchain_community.vectorstores.utils import DistanceStrategy
|
||||
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
||||
|
||||
pytestmark = [
    pytest.mark.requires("aerospike_vector_search"),
    pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher"),
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def client() -> Generator[Any, None, None]:
|
||||
try:
|
||||
from aerospike_vector_search import Client
|
||||
from aerospike_vector_search.types import HostPort
|
||||
except ImportError:
|
||||
pytest.skip("aerospike_vector_search not installed")
|
||||
|
||||
client = Client(
|
||||
seeds=[
|
||||
HostPort(host="dummy-host", port=3000),
|
||||
],
|
||||
)
|
||||
|
||||
yield client
|
||||
|
||||
client.close()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_client(mocker: Any) -> None:
|
||||
try:
|
||||
from aerospike_vector_search import Client
|
||||
except ImportError:
|
||||
pytest.skip("aerospike_vector_search not installed")
|
||||
|
||||
return mocker.MagicMock(Client)
|
||||
|
||||
|
||||
def test_aerospike(client: Any) -> None:
|
||||
"""Ensure an error is raised when search with score in hybrid mode
|
||||
because in this case Elasticsearch does not return any score.
|
||||
"""
|
||||
from aerospike_vector_search import AVSError
|
||||
|
||||
query_string = "foo"
|
||||
embedding = FakeEmbeddings()
|
||||
|
||||
store = Aerospike(
|
||||
client=client,
|
||||
embedding=embedding,
|
||||
text_key="text",
|
||||
vector_key="vector",
|
||||
index_name="dummy_index",
|
||||
namespace="test",
|
||||
set_name="testset",
|
||||
distance_strategy=DistanceStrategy.COSINE,
|
||||
)
|
||||
|
||||
# TODO: Remove grpc import when aerospike_vector_search wraps grpc errors
|
||||
with pytest.raises(AVSError):
|
||||
store.similarity_search_by_vector(embedding.embed_query(query_string))
|
||||
|
||||
|
||||
def test_init_aerospike_distance(client: Any) -> None:
|
||||
from aerospike_vector_search.types import VectorDistanceMetric
|
||||
|
||||
embedding = FakeEmbeddings()
|
||||
aerospike = Aerospike(
|
||||
client=client,
|
||||
embedding=embedding,
|
||||
text_key="text",
|
||||
vector_key="vector",
|
||||
index_name="dummy_index",
|
||||
namespace="test",
|
||||
set_name="testset",
|
||||
distance_strategy=VectorDistanceMetric.COSINE,
|
||||
)
|
||||
|
||||
assert aerospike._distance_strategy == DistanceStrategy.COSINE
|
||||
|
||||
|
||||
def test_init_bad_embedding(client: Any) -> None:
|
||||
def bad_embedding() -> None:
|
||||
return None
|
||||
|
||||
with pytest.warns(
|
||||
UserWarning,
|
||||
match=(
|
||||
"Passing in `embedding` as a Callable is deprecated. Please pass"
|
||||
+ " in an Embeddings object instead."
|
||||
),
|
||||
):
|
||||
Aerospike(
|
||||
client=client,
|
||||
embedding=bad_embedding,
|
||||
text_key="text",
|
||||
vector_key="vector",
|
||||
index_name="dummy_index",
|
||||
namespace="test",
|
||||
set_name="testset",
|
||||
distance_strategy=DistanceStrategy.COSINE,
|
||||
)
|
||||
|
||||
|
||||
def test_init_bad_client(client: Any) -> None:
|
||||
class BadClient:
|
||||
pass
|
||||
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match=(
|
||||
"client should be an instance of aerospike_vector_search.Client,"
|
||||
+ " got <class 'tests.unit_tests.vectorstores.test_aerospike."
|
||||
+ "test_init_bad_client.<locals>.BadClient'>"
|
||||
),
|
||||
):
|
||||
Aerospike(
|
||||
client=BadClient(),
|
||||
embedding=FakeEmbeddings(),
|
||||
text_key="text",
|
||||
vector_key="vector",
|
||||
index_name="dummy_index",
|
||||
namespace="test",
|
||||
set_name="testset",
|
||||
distance_strategy=DistanceStrategy.COSINE,
|
||||
)
|
||||
|
||||
|
||||
def test_convert_distance_strategy(client: Any) -> None:
|
||||
from aerospike_vector_search.types import VectorDistanceMetric
|
||||
|
||||
aerospike = Aerospike(
|
||||
client=client,
|
||||
embedding=FakeEmbeddings(),
|
||||
text_key="text",
|
||||
vector_key="vector",
|
||||
index_name="dummy_index",
|
||||
namespace="test",
|
||||
set_name="testset",
|
||||
distance_strategy=DistanceStrategy.COSINE,
|
||||
)
|
||||
|
||||
converted_strategy = aerospike.convert_distance_strategy(
|
||||
VectorDistanceMetric.COSINE
|
||||
)
|
||||
assert converted_strategy == DistanceStrategy.COSINE
|
||||
|
||||
converted_strategy = aerospike.convert_distance_strategy(
|
||||
VectorDistanceMetric.DOT_PRODUCT
|
||||
)
|
||||
assert converted_strategy == DistanceStrategy.DOT_PRODUCT
|
||||
|
||||
converted_strategy = aerospike.convert_distance_strategy(
|
||||
VectorDistanceMetric.SQUARED_EUCLIDEAN
|
||||
)
|
||||
assert converted_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
aerospike.convert_distance_strategy(VectorDistanceMetric.HAMMING)
|
||||
|
||||
|
||||
def test_add_texts_wait_for_index_error(client: Any) -> None:
|
||||
aerospike = Aerospike(
|
||||
client=client,
|
||||
embedding=FakeEmbeddings(),
|
||||
text_key="text",
|
||||
vector_key="vector",
|
||||
# index_name="dummy_index",
|
||||
namespace="test",
|
||||
set_name="testset",
|
||||
distance_strategy=DistanceStrategy.COSINE,
|
||||
)
|
||||
|
||||
with pytest.raises(
|
||||
ValueError, match="if wait_for_index is True, index_name must be provided"
|
||||
):
|
||||
aerospike.add_texts(["foo", "bar"], wait_for_index=True)
|
||||
|
||||
|
||||
def test_add_texts_returns_ids(mock_client: MagicMock) -> None:
|
||||
aerospike = Aerospike(
|
||||
client=mock_client,
|
||||
embedding=FakeEmbeddings(),
|
||||
text_key="text",
|
||||
vector_key="vector",
|
||||
namespace="test",
|
||||
set_name="testset",
|
||||
distance_strategy=DistanceStrategy.COSINE,
|
||||
)
|
||||
|
||||
excepted = ["0", "1"]
|
||||
actual = aerospike.add_texts(
|
||||
["foo", "bar"],
|
||||
metadatas=[{"foo": 0}, {"bar": 1}],
|
||||
ids=["0", "1"],
|
||||
set_name="otherset",
|
||||
index_name="dummy_index",
|
||||
wait_for_index=True,
|
||||
)
|
||||
|
||||
    assert expected == actual
|
||||
mock_client.upsert.assert_has_calls(
|
||||
calls=[
|
||||
call(
|
||||
namespace="test",
|
||||
key="0",
|
||||
set_name="otherset",
|
||||
record_data={
|
||||
"_id": "0",
|
||||
"text": "foo",
|
||||
"vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
|
||||
"foo": 0,
|
||||
},
|
||||
),
|
||||
call(
|
||||
namespace="test",
|
||||
key="1",
|
||||
set_name="otherset",
|
||||
record_data={
|
||||
"_id": "1",
|
||||
"text": "bar",
|
||||
"vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
|
||||
"bar": 1,
|
||||
},
|
||||
),
|
||||
]
|
||||
)
|
||||
mock_client.wait_for_index_completion.assert_called_once_with(
|
||||
namespace="test",
|
||||
name="dummy_index",
|
||||
)
|
||||
|
||||
|
||||
def test_delete_returns_false(mock_client: MagicMock) -> None:
|
||||
from aerospike_vector_search import AVSServerError
|
||||
|
||||
mock_client.delete.side_effect = Mock(side_effect=AVSServerError(rpc_error=""))
|
||||
aerospike = Aerospike(
|
||||
client=mock_client,
|
||||
embedding=FakeEmbeddings(),
|
||||
text_key="text",
|
||||
vector_key="vector",
|
||||
namespace="test",
|
||||
set_name="testset",
|
||||
distance_strategy=DistanceStrategy.COSINE,
|
||||
)
|
||||
|
||||
assert not aerospike.delete(["foo", "bar"], set_name="testset")
|
||||
mock_client.delete.assert_called_once_with(
|
||||
namespace="test", key="foo", set_name="testset"
|
||||
)
|
||||
|
||||
|
||||
def test_similarity_search_by_vector_with_score_missing_index_name(
|
||||
client: Any,
|
||||
) -> None:
|
||||
aerospike = Aerospike(
|
||||
client=client,
|
||||
embedding=FakeEmbeddings(),
|
||||
text_key="text",
|
||||
vector_key="vector",
|
||||
# index_name="dummy_index",
|
||||
namespace="test",
|
||||
set_name="testset",
|
||||
distance_strategy=DistanceStrategy.COSINE,
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match="index_name must be provided"):
|
||||
aerospike.similarity_search_by_vector_with_score([1.0, 2.0, 3.0])
|
||||
|
||||
|
||||
def test_similarity_search_by_vector_with_score_filters_missing_text_key(
|
||||
mock_client: MagicMock,
|
||||
) -> None:
|
||||
from aerospike_vector_search.types import Neighbor
|
||||
|
||||
text_key = "text"
|
||||
mock_client.vector_search.return_value = [
|
||||
Neighbor(key="key1", fields={text_key: 1}, distance=1.0),
|
||||
Neighbor(key="key2", fields={}, distance=0.0),
|
||||
Neighbor(key="key3", fields={text_key: 3}, distance=3.0),
|
||||
]
|
||||
aerospike = Aerospike(
|
||||
client=mock_client,
|
||||
embedding=FakeEmbeddings(),
|
||||
text_key=text_key,
|
||||
vector_key="vector",
|
||||
index_name="dummy_index",
|
||||
namespace="test",
|
||||
set_name="testset",
|
||||
distance_strategy=DistanceStrategy.COSINE,
|
||||
)
|
||||
|
||||
actual = aerospike.similarity_search_by_vector_with_score(
|
||||
[1.0, 2.0, 3.0], k=10, metadata_keys=["foo"]
|
||||
)
|
||||
|
||||
expected = [
|
||||
(Document(page_content="1"), 1.0),
|
||||
(Document(page_content="3"), 3.0),
|
||||
]
|
||||
mock_client.vector_search.assert_called_once_with(
|
||||
index_name="dummy_index",
|
||||
namespace="test",
|
||||
query=[1.0, 2.0, 3.0],
|
||||
limit=10,
|
||||
field_names=[text_key, "foo"],
|
||||
)
|
||||
|
||||
assert expected == actual
|
||||
|
||||
|
||||
def test_similarity_search_by_vector_with_score_overwrite_index_name(
|
||||
mock_client: MagicMock,
|
||||
) -> None:
|
||||
mock_client.vector_search.return_value = []
|
||||
aerospike = Aerospike(
|
||||
client=mock_client,
|
||||
embedding=FakeEmbeddings(),
|
||||
text_key="text",
|
||||
vector_key="vector",
|
||||
index_name="dummy_index",
|
||||
namespace="test",
|
||||
set_name="testset",
|
||||
distance_strategy=DistanceStrategy.COSINE,
|
||||
)
|
||||
|
||||
aerospike.similarity_search_by_vector_with_score(
|
||||
[1.0, 2.0, 3.0], index_name="other_index"
|
||||
)
|
||||
|
||||
mock_client.vector_search.assert_called_once_with(
|
||||
index_name="other_index",
|
||||
namespace="test",
|
||||
query=[1.0, 2.0, 3.0],
|
||||
limit=4,
|
||||
field_names=None,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"distance_strategy,expected_fn",
|
||||
[
|
||||
(DistanceStrategy.COSINE, Aerospike._cosine_relevance_score_fn),
|
||||
(DistanceStrategy.EUCLIDEAN_DISTANCE, Aerospike._euclidean_relevance_score_fn),
|
||||
(DistanceStrategy.DOT_PRODUCT, Aerospike._max_inner_product_relevance_score_fn),
|
||||
(DistanceStrategy.JACCARD, ValueError),
|
||||
],
|
||||
)
|
||||
def test_select_relevance_score_fn(
|
||||
client: Any, distance_strategy: DistanceStrategy, expected_fn: Callable
|
||||
) -> None:
|
||||
aerospike = Aerospike(
|
||||
client=client,
|
||||
embedding=FakeEmbeddings(),
|
||||
text_key="text",
|
||||
vector_key="vector",
|
||||
index_name="dummy_index",
|
||||
namespace="test",
|
||||
set_name="testset",
|
||||
distance_strategy=distance_strategy,
|
||||
)
|
||||
|
||||
if expected_fn == ValueError:
|
||||
with pytest.raises(ValueError):
|
||||
aerospike._select_relevance_score_fn()
|
||||
|
||||
else:
|
||||
fn = aerospike._select_relevance_score_fn()
|
||||
|
||||
assert fn == expected_fn
|
@@ -4,6 +4,7 @@ from langchain_community import vectorstores
from langchain_community.vectorstores import __all__, _module_lookup

EXPECTED_ALL = [
+    "Aerospike",
    "AlibabaCloudOpenSearch",
    "AlibabaCloudOpenSearchSettings",
    "AnalyticDB",

@@ -46,6 +46,7 @@ def test_compatible_vectorstore_documentation() -> None:

    # These are mentioned in the indexing.ipynb documentation
    documented = {
+        "Aerospike",
        "AnalyticDB",
        "AstraDB",
        "AzureCosmosDBVectorSearch",

@@ -2,6 +2,7 @@
from langchain_community.vectorstores import __all__ as public_api

_EXPECTED = [
+    "Aerospike",
    "AlibabaCloudOpenSearch",
    "AlibabaCloudOpenSearchSettings",
    "AnalyticDB",