mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-05 06:33:20 +00:00
community[minor]: Add DocumentDBVectorSearch VectorStore (#17757)
**Description:** - Added Amazon DocumentDB Vector Search integration (HNSW index) - Added integration tests - Updated AWS documentation with DocumentDB Vector Search instructions - Added notebook for DocumentDB integration with example usage --------- Co-authored-by: EC2 Default User <ec2-user@ip-172-31-95-226.ec2.internal>
This commit is contained in:
parent
51f3902bc4
commit
1b4dcf22f3
@ -220,6 +220,35 @@ See a [usage example](/docs/integrations/vectorstores/opensearch#using-aos-amazo
|
||||
from langchain_community.vectorstores import OpenSearchVectorSearch
|
||||
```
|
||||
|
||||
### Amazon DocumentDB Vector Search
|
||||
|
||||
>[Amazon DocumentDB (with MongoDB Compatibility)](https://docs.aws.amazon.com/documentdb/) makes it easy to set up, operate, and scale MongoDB-compatible databases in the cloud.
|
||||
> With Amazon DocumentDB, you can run the same application code and use the same drivers and tools that you use with MongoDB.
|
||||
> Vector search for Amazon DocumentDB combines the flexibility and rich querying capability of a JSON-based document database with the power of vector search.
|
||||
|
||||
#### Installation and Setup
|
||||
|
||||
See [detailed configuration instructions](/docs/integrations/vectorstores/documentdb).
|
||||
|
||||
We need to install the `pymongo` Python package.
|
||||
|
||||
```bash
|
||||
pip install pymongo
|
||||
```
|
||||
|
||||
#### Deploy DocumentDB on AWS
|
||||
|
||||
[Amazon DocumentDB (with MongoDB Compatibility)](https://docs.aws.amazon.com/documentdb/) is a fast, reliable, and fully managed database service. Amazon DocumentDB makes it easy to set up, operate, and scale MongoDB-compatible databases in the cloud.
|
||||
|
||||
AWS offers services for computing, databases, storage, analytics, and other functionality. For an overview of all AWS services, see [Cloud Computing with Amazon Web Services](https://aws.amazon.com/what-is-aws/).
|
||||
|
||||
See a [usage example](/docs/integrations/vectorstores/documentdb).
|
||||
|
||||
```python
|
||||
from langchain_community.vectorstores import DocumentDBVectorSearch
|
||||
```
|
||||
|
||||
|
||||
## Tools
|
||||
|
||||
### AWS Lambda
|
||||
|
477
docs/docs/integrations/vectorstores/documentdb.ipynb
Normal file
477
docs/docs/integrations/vectorstores/documentdb.ipynb
Normal file
@ -0,0 +1,477 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "245c0aa70db77606",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"# Amazon Document DB\n",
|
||||
"\n",
|
||||
">[Amazon DocumentDB (with MongoDB Compatibility)](https://docs.aws.amazon.com/documentdb/) makes it easy to set up, operate, and scale MongoDB-compatible databases in the cloud.\n",
|
||||
"> With Amazon DocumentDB, you can run the same application code and use the same drivers and tools that you use with MongoDB.\n",
|
||||
"> Vector search for Amazon DocumentDB combines the flexibility and rich querying capability of a JSON-based document database with the power of vector search.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"This notebook shows you how to use [Amazon DocumentDB Vector Search](https://docs.aws.amazon.com/documentdb/latest/developerguide/vector-search.html) to store documents in collections, create indices and perform vector search queries using approximate nearest neighbor search with similarity metrics such as \"cosine\", \"euclidean\", and \"dotProduct\". By default, DocumentDB creates Hierarchical Navigable Small World (HNSW) indexes. To learn about other supported vector index types, please refer to the document linked above.\n",
|
||||
"\n",
|
||||
"To use DocumentDB, you must first deploy a cluster. Please refer to the [Developer Guide](https://docs.aws.amazon.com/documentdb/latest/developerguide/what-is.html) for more details.\n",
|
||||
"\n",
|
||||
"[Sign Up](https://aws.amazon.com/free/) for free to get started today.\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "ab8e45f5bd435ade",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-10-10T17:20:00.721985Z",
|
||||
"start_time": "2023-10-10T17:19:57.996265Z"
|
||||
},
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install pymongo"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "9c7ce9e7b26efbb0",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-10-10T17:50:03.615234Z",
|
||||
"start_time": "2023-10-10T17:50:03.604289Z"
|
||||
},
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import getpass\n",
|
||||
"\n",
|
||||
"# DocumentDB connection string\n",
|
||||
"# i.e., \"mongodb://{username}:{pass}@{cluster_endpoint}:{port}/?{params}\"\n",
|
||||
"CONNECTION_STRING = getpass.getpass(\"DocumentDB Cluster URI:\")\n",
|
||||
"\n",
|
||||
"INDEX_NAME = \"izzy-test-index\"\n",
|
||||
"NAMESPACE = \"izzy_test_db.izzy_test_collection\"\n",
|
||||
"DB_NAME, COLLECTION_NAME = NAMESPACE.split(\".\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f2e66b097c6ce2e3",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"We want to use `OpenAIEmbeddings` so we need to set up our OpenAI environment variables. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "4a052d99c6b8a2a7",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-10-10T17:50:11.712929Z",
|
||||
"start_time": "2023-10-10T17:50:11.703871Z"
|
||||
},
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import getpass\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# Set up the OpenAI Environment Variables\n",
|
||||
"os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")\n",
|
||||
"os.environ[\n",
|
||||
" \"OPENAI_EMBEDDINGS_DEPLOYMENT\"\n",
|
||||
"] = \"smart-agent-embedding-ada\" # the deployment name for the embedding model\n",
|
||||
"os.environ[\"OPENAI_EMBEDDINGS_MODEL_NAME\"] = \"text-embedding-ada-002\" # the model name"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ebaa28c6e2b35063",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"Now, we will load the documents into the collection, create the index, and then perform queries against the index.\n",
|
||||
"\n",
|
||||
"Please refer to the [documentation](https://docs.aws.amazon.com/documentdb/latest/developerguide/vector-search.html) if you have questions about certain parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "183741cf8f4c7c53",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-10-10T17:50:16.732718Z",
|
||||
"start_time": "2023-10-10T17:50:16.716642Z"
|
||||
},
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import TextLoader\n",
|
||||
"from langchain.embeddings import OpenAIEmbeddings\n",
|
||||
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||
"from langchain.vectorstores.documentdb import (\n",
|
||||
" DocumentDBSimilarityType,\n",
|
||||
" DocumentDBVectorSearch,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"SOURCE_FILE_NAME = \"../../modules/state_of_the_union.txt\"\n",
|
||||
"\n",
|
||||
"loader = TextLoader(SOURCE_FILE_NAME)\n",
|
||||
"documents = loader.load()\n",
|
||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
||||
"docs = text_splitter.split_documents(documents)\n",
|
||||
"\n",
|
||||
"# OpenAI Settings\n",
|
||||
"model_deployment = os.getenv(\n",
|
||||
" \"OPENAI_EMBEDDINGS_DEPLOYMENT\", \"smart-agent-embedding-ada\"\n",
|
||||
")\n",
|
||||
"model_name = os.getenv(\"OPENAI_EMBEDDINGS_MODEL_NAME\", \"text-embedding-ada-002\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"openai_embeddings: OpenAIEmbeddings = OpenAIEmbeddings(\n",
|
||||
" deployment=model_deployment, model=model_name\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "39ae6058c2f7fdf1",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-10-10T17:51:17.980698Z",
|
||||
"start_time": "2023-10-10T17:51:11.786336Z"
|
||||
},
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{ 'createdCollectionAutomatically' : false,\n",
|
||||
" 'numIndexesBefore' : 1,\n",
|
||||
" 'numIndexesAfter' : 2,\n",
|
||||
" 'ok' : 1,\n",
|
||||
" 'operationTime' : Timestamp(1703656982, 1)}"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from pymongo import MongoClient\n",
|
||||
"\n",
|
||||
"INDEX_NAME = \"izzy-test-index-2\"\n",
|
||||
"NAMESPACE = \"izzy_test_db.izzy_test_collection\"\n",
|
||||
"DB_NAME, COLLECTION_NAME = NAMESPACE.split(\".\")\n",
|
||||
"\n",
|
||||
"client: MongoClient = MongoClient(CONNECTION_STRING)\n",
|
||||
"collection = client[DB_NAME][COLLECTION_NAME]\n",
|
||||
"\n",
|
||||
"model_deployment = os.getenv(\n",
|
||||
" \"OPENAI_EMBEDDINGS_DEPLOYMENT\", \"smart-agent-embedding-ada\"\n",
|
||||
")\n",
|
||||
"model_name = os.getenv(\"OPENAI_EMBEDDINGS_MODEL_NAME\", \"text-embedding-ada-002\")\n",
|
||||
"\n",
|
||||
"vectorstore = DocumentDBVectorSearch.from_documents(\n",
|
||||
" documents=docs,\n",
|
||||
" embedding=openai_embeddings,\n",
|
||||
" collection=collection,\n",
|
||||
" index_name=INDEX_NAME,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# number of dimensions used by model above\n",
|
||||
"dimensions = 1536\n",
|
||||
"\n",
|
||||
"# specify similarity algorithm, valid options are:\n",
|
||||
"# cosine (COS), euclidean (EUC), dotProduct (DOT)\n",
|
||||
"similarity_algorithm = DocumentDBSimilarityType.COS\n",
|
||||
"\n",
|
||||
"vectorstore.create_index(dimensions, similarity_algorithm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0666efbe",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# perform a similarity search between the embedding of the query and the embeddings of the documents\n",
|
||||
"query = \"What did the President say about Ketanji Brown Jackson\"\n",
|
||||
"docs = vectorstore.similarity_search(query)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "48b6dcca",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
|
||||
"\n",
|
||||
"Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
|
||||
"\n",
|
||||
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
|
||||
"\n",
|
||||
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].page_content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "37e4df8c7d7db851",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"Once the documents have been loaded and the index has been created, you can now instantiate the vector store directly and run queries against the index"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3c218ab6f59301f7",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-10-10T17:52:14.994861Z",
|
||||
"start_time": "2023-10-10T17:52:13.986379Z"
|
||||
},
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vectorstore = DocumentDBVectorSearch.from_connection_string(\n",
|
||||
" connection_string=CONNECTION_STRING,\n",
|
||||
" namespace=NAMESPACE,\n",
|
||||
" embedding=openai_embeddings,\n",
|
||||
" index_name=INDEX_NAME,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# perform a similarity search between a query and the ingested documents\n",
|
||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||
"docs = vectorstore.similarity_search(query)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ba431631-eb5c-4559-b504-4546a9247048",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
|
||||
"\n",
|
||||
"Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
|
||||
"\n",
|
||||
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
|
||||
"\n",
|
||||
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].page_content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fd67e4d92c9ab32f",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-10-10T17:53:21.145431Z",
|
||||
"start_time": "2023-10-10T17:53:20.884531Z"
|
||||
},
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# perform a similarity search between a query and the ingested documents\n",
|
||||
"query = \"Which stats did the President share about the U.S. economy\"\n",
|
||||
"docs = vectorstore.similarity_search(query)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b63c73c7e905001c",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"And unlike the $2 Trillion tax cut passed in the previous administration that benefitted the top 1% of Americans, the American Rescue Plan helped working people—and left no one behind. \n",
|
||||
"\n",
|
||||
"And it worked. It created jobs. Lots of jobs. \n",
|
||||
"\n",
|
||||
"In fact—our economy created over 6.5 Million new jobs just last year, more jobs created in one year \n",
|
||||
"than ever before in the history of America. \n",
|
||||
"\n",
|
||||
"Our economy grew at a rate of 5.7% last year, the strongest growth in nearly 40 years, the first step in bringing fundamental change to an economy that hasn’t worked for the working people of this nation for too long. \n",
|
||||
"\n",
|
||||
"For the past 40 years we were told that if we gave tax breaks to those at the very top, the benefits would trickle down to everyone else. \n",
|
||||
"\n",
|
||||
"But that trickle-down theory led to weaker economic growth, lower wages, bigger deficits, and the widest gap between those at the top and everyone else in nearly a century.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].page_content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0f9ded8b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Question Answering"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "67351360",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"qa_retriever = vectorstore.as_retriever(\n",
|
||||
" search_type=\"similarity\",\n",
|
||||
" search_kwargs={\"k\": 25},\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aadaeca5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.prompts import PromptTemplate\n",
|
||||
"\n",
|
||||
"prompt_template = \"\"\"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n",
|
||||
"\n",
|
||||
"{context}\n",
|
||||
"\n",
|
||||
"Question: {question}\n",
|
||||
"\"\"\"\n",
|
||||
"PROMPT = PromptTemplate(\n",
|
||||
" template=prompt_template, input_variables=[\"context\", \"question\"]\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2280140e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chains import RetrievalQA\n",
|
||||
"from langchain_openai import OpenAI\n",
|
||||
"\n",
|
||||
"qa = RetrievalQA.from_chain_type(\n",
|
||||
" llm=OpenAI(),\n",
|
||||
" chain_type=\"stuff\",\n",
|
||||
" retriever=qa_retriever,\n",
|
||||
" return_source_documents=True,\n",
|
||||
" chain_type_kwargs={\"prompt\": PROMPT},\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"docs = qa({\"query\": \"gpt-4 compute requirements\"})\n",
|
||||
"\n",
|
||||
"print(docs[\"result\"])\n",
|
||||
"print(docs[\"source_documents\"])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -192,6 +192,14 @@ def _import_docarray_inmemory() -> Any:
|
||||
return DocArrayInMemorySearch
|
||||
|
||||
|
||||
def _import_documentdb() -> Any:
    """Import the DocumentDB vector store on demand and return its class.

    The import happens inside the function so that the module defining
    ``DocumentDBVectorSearch`` is only loaded when this helper is called.
    """
    from langchain_community.vectorstores.documentdb import (
        DocumentDBVectorSearch as vector_store_cls,
    )

    return vector_store_cls
|
||||
|
||||
|
||||
def _import_elasticsearch() -> Any:
|
||||
from langchain_community.vectorstores.elasticsearch import ElasticsearchStore
|
||||
|
||||
@ -563,6 +571,8 @@ def __getattr__(name: str) -> Any:
|
||||
return _import_dingo()
|
||||
elif name == "DocArrayInMemorySearch":
|
||||
return _import_docarray_inmemory()
|
||||
elif name == "DocumentDBVectorSearch":
|
||||
return _import_documentdb()
|
||||
elif name == "DocArrayHnswSearch":
|
||||
return _import_docarray_hnsw()
|
||||
elif name == "ElasticsearchStore":
|
||||
@ -754,4 +764,5 @@ __all__ = [
|
||||
"VectorStore",
|
||||
"NeuralDBVectorStore",
|
||||
"Lantern",
|
||||
"DocumentDBVectorSearch",
|
||||
]
|
||||
|
361
libs/community/langchain_community/vectorstores/documentdb.py
Normal file
361
libs/community/langchain_community/vectorstores/documentdb.py
Normal file
@ -0,0 +1,361 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from enum import Enum
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
Generator,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
TypeVar,
|
||||
Union,
|
||||
)
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from pymongo.collection import Collection
|
||||
|
||||
|
||||
# Before Python 3.11 native StrEnum is not available
|
||||
class DocumentDBSimilarityType(str, Enum):
    """Similarity functions that can be requested for a DocumentDB vector index.

    Each member is also a ``str`` whose value is the name sent to DocumentDB.
    """

    COS = "cosine"
    """Cosine similarity."""

    DOT = "dotProduct"
    """Dot product."""

    EUC = "euclidean"
    """Euclidean distance."""
|
||||
|
||||
|
||||
DocumentDBDocumentType = TypeVar("DocumentDBDocumentType", bound=Dict[str, Any])
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_INSERT_BATCH_SIZE = 128
|
||||
|
||||
|
||||
class DocumentDBVectorSearch(VectorStore):
    """`Amazon DocumentDB (with MongoDB compatibility)` vector store.

    Please refer to the official Vector Search documentation for more details:
    https://docs.aws.amazon.com/documentdb/latest/developerguide/vector-search.html

    To use, you should have both:
    - the ``pymongo`` python package installed
    - a connection string and credentials associated with a DocumentDB cluster

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import DocumentDBVectorSearch
            from langchain_community.embeddings.openai import OpenAIEmbeddings
            from pymongo import MongoClient

            mongo_client = MongoClient("<YOUR-CONNECTION-STRING>")
            collection = mongo_client["<db_name>"]["<collection_name>"]
            embeddings = OpenAIEmbeddings()
            vectorstore = DocumentDBVectorSearch(collection, embeddings)
    """

    def __init__(
        self,
        collection: Collection[DocumentDBDocumentType],
        embedding: Embeddings,
        *,
        index_name: str = "vectorSearchIndex",
        text_key: str = "textContent",
        embedding_key: str = "vectorContent",
    ):
        """Constructor for DocumentDBVectorSearch

        Args:
            collection: MongoDB collection to add the texts to.
            embedding: Text embedding model to use.
            index_name: Name of the Vector Search index.
            text_key: MongoDB field that will contain the text
                for each document.
            embedding_key: MongoDB field that will contain the embedding
                for each document.
        """
        self._collection = collection
        self._embedding = embedding
        self._index_name = index_name
        self._text_key = text_key
        self._embedding_key = embedding_key
        # Searches on this instance use cosine similarity until
        # ``create_index`` is called with a different similarity type.
        self._similarity_type = DocumentDBSimilarityType.COS

    @property
    def embeddings(self) -> Embeddings:
        """The embedding model passed at construction time."""
        return self._embedding

    def get_index_name(self) -> str:
        """Returns the index name

        Returns:
            The index name given at instance construction.
        """
        return self._index_name

    @classmethod
    def from_connection_string(
        cls,
        connection_string: str,
        namespace: str,
        embedding: Embeddings,
        **kwargs: Any,
    ) -> DocumentDBVectorSearch:
        """Creates an Instance of DocumentDBVectorSearch from a Connection String

        Args:
            connection_string: The DocumentDB cluster endpoint connection string
            namespace: The namespace (database.collection). Everything after
                the first ``.`` is taken as the collection name, so collection
                names that themselves contain dots are supported.
            embedding: The embedding utility
            **kwargs: Dynamic keyword arguments, forwarded to the constructor

        Returns:
            an instance of the vector store

        Raises:
            ImportError: If ``pymongo`` is not installed.
            ValueError: If ``namespace`` does not contain a ``.`` separator.
        """
        try:
            from pymongo import MongoClient
        except ImportError as e:
            raise ImportError(
                "Could not import pymongo, please install it with "
                "`pip install pymongo`."
            ) from e
        if "." not in namespace:
            raise ValueError(
                f"Invalid namespace {namespace!r}: "
                "expected the form '<database>.<collection>'."
            )
        # Split on the first dot only: the database name comes first and the
        # remainder (which may contain further dots) is the collection name.
        db_name, collection_name = namespace.split(".", 1)
        client: MongoClient = MongoClient(connection_string)
        collection = client[db_name][collection_name]
        return cls(collection, embedding, **kwargs)

    def index_exists(self) -> bool:
        """Verifies if the specified index name during instance
        construction exists on the collection

        Returns:
            Returns True on success and False if no such index exists
            on the collection
        """
        target_name = self._index_name
        return any(
            index["name"] == target_name for index in self._collection.list_indexes()
        )

    def delete_index(self) -> None:
        """Deletes the index specified during instance construction if it exists"""
        if self.index_exists():
            self._collection.drop_index(self._index_name)
            # Raises OperationFailure on an error (e.g. trying to drop
            # an index that does not exist)

    def create_index(
        self,
        dimensions: int = 1536,
        similarity: DocumentDBSimilarityType = DocumentDBSimilarityType.COS,
        m: int = 16,
        ef_construction: int = 64,
    ) -> dict[str, Any]:
        """Creates an index using the index name specified at
        instance construction

        The chosen ``similarity`` is also remembered on this instance and used
        by subsequent similarity searches.

        Args:
            dimensions: Number of dimensions for vector similarity.
                The maximum number of supported dimensions is 2000

            similarity: Similarity algorithm to use with the HNSW index.
                Possible options are:
                    - DocumentDBSimilarityType.COS (cosine distance),
                    - DocumentDBSimilarityType.EUC (Euclidean distance), and
                    - DocumentDBSimilarityType.DOT (dot product).

            m: Specifies the max number of connections for an HNSW index.
                Large impact on memory consumption.

            ef_construction: Specifies the size of the dynamic candidate list
                for constructing the graph for HNSW index. Higher values lead
                to more accurate results but slower indexing speed.

        Returns:
            An object describing the created index
        """
        self._similarity_type = similarity

        # prepare the command
        create_index_commands = {
            "createIndexes": self._collection.name,
            "indexes": [
                {
                    "name": self._index_name,
                    "key": {self._embedding_key: "vector"},
                    "vectorOptions": {
                        "type": "hnsw",
                        "similarity": similarity,
                        "dimensions": dimensions,
                        "m": m,
                        "efConstruction": ef_construction,
                    },
                }
            ],
        }

        # retrieve the database object
        current_database = self._collection.database

        # invoke the command from the database object
        create_index_responses: dict[str, Any] = current_database.command(
            create_index_commands
        )

        return create_index_responses

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[Dict[str, Any]]] = None,
        **kwargs: Any,
    ) -> List:
        """Embed texts and insert them into the collection in batches.

        Args:
            texts: The strings to embed and store. Any iterable is accepted,
                including one-shot iterators and generators.
            metadatas: Optional metadata dicts paired positionally with
                ``texts``. When omitted or empty, every text gets empty
                metadata.
            **kwargs: ``batch_size`` sets how many texts are sent per insert
                (defaults to ``DEFAULT_INSERT_BATCH_SIZE``).

        Returns:
            The ids of the inserted documents, in insertion order.
        """
        batch_size = kwargs.get("batch_size", DEFAULT_INSERT_BATCH_SIZE)
        # Build (text, metadata) pairs in a single pass over ``texts``. The
        # default-metadata branch must not iterate ``texts`` a second time:
        # doing so would silently consume (and drop) every other item when
        # ``texts`` is a one-shot iterator.
        text_metadata_pairs: Union[zip, Generator] = (
            zip(texts, metadatas) if metadatas else ((text, {}) for text in texts)
        )
        texts_batch = []
        metadatas_batch = []
        result_ids = []
        for i, (text, metadata) in enumerate(text_metadata_pairs):
            texts_batch.append(text)
            metadatas_batch.append(metadata)
            if (i + 1) % batch_size == 0:
                result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
                texts_batch = []
                metadatas_batch = []
        # Flush the final, partially filled batch.
        if texts_batch:
            result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
        return result_ids

    def _insert_texts(self, texts: List[str], metadatas: List[Dict[str, Any]]) -> List:
        """Used to Load Documents into the collection

        Args:
            texts: The list of documents strings to load
            metadatas: The list of metadata objects associated with each document

        Returns:
            The ids of the inserted documents (empty if ``texts`` is empty).
        """
        # If the text is empty, then exit early
        if not texts:
            return []

        # Embed and create the documents
        embeddings = self._embedding.embed_documents(texts)
        # Metadata keys are merged last, so a metadata key equal to the text or
        # embedding field name overrides that field in the stored document.
        to_insert = [
            {self._text_key: t, self._embedding_key: embedding, **m}
            for t, m, embedding in zip(texts, metadatas, embeddings)
        ]
        # insert the documents in DocumentDB
        insert_result = self._collection.insert_many(to_insert)  # type: ignore
        return insert_result.inserted_ids

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        collection: Optional[Collection[DocumentDBDocumentType]] = None,
        **kwargs: Any,
    ) -> DocumentDBVectorSearch:
        """Create a vector store on ``collection`` and load ``texts`` into it.

        Args:
            texts: The strings to embed and store.
            embedding: Text embedding model to use.
            metadatas: Optional metadata dicts paired positionally with ``texts``.
            collection: The collection to store the documents in (required).
            **kwargs: Forwarded to the constructor.

        Returns:
            The populated vector store.

        Raises:
            ValueError: If ``collection`` is not provided.
        """
        if collection is None:
            raise ValueError("Must provide 'collection' named parameter.")
        vectorstore = cls(collection, embedding, **kwargs)
        vectorstore.add_texts(texts, metadatas=metadatas)
        return vectorstore

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        """Delete the documents with the given ids, one at a time.

        Args:
            ids: The document ids to delete.
            **kwargs: Unused.

        Returns:
            True once a delete has been issued for every id.

        Raises:
            ValueError: If ``ids`` is None.
        """
        if ids is None:
            raise ValueError("No document ids provided to delete.")

        for document_id in ids:
            self.delete_document_by_id(document_id)
        return True

    def delete_document_by_id(self, document_id: Optional[str] = None) -> None:
        """Removes a Specific Document by Id

        Args:
            document_id: The document identifier

        Raises:
            ValueError: If ``document_id`` is None.
            ImportError: If ``bson`` cannot be imported.
        """
        if document_id is None:
            raise ValueError("No document id provided to delete.")

        try:
            from bson.objectid import ObjectId
        except ImportError as e:
            # ``bson`` is distributed as part of pymongo; the standalone
            # ``bson`` package on PyPI is a different project and must not be
            # installed alongside it.
            raise ImportError(
                "Unable to import bson, please install with `pip install pymongo`."
            ) from e

        self._collection.delete_one({"_id": ObjectId(document_id)})

    def _similarity_search_without_score(
        self, embeddings: List[float], k: int = 4, ef_search: int = 40
    ) -> List[Document]:
        """Returns a list of documents.

        Args:
            embeddings: The query vector
            k: the number of documents to return
            ef_search: Specifies the size of the dynamic candidate list
                that HNSW index uses during search. A higher value of
                efSearch provides better recall at cost of speed.

        Returns:
            A list of documents closest to the query vector. The text field
            becomes ``page_content``; every other field of the returned
            record is placed in ``metadata``.
        """
        pipeline: List[dict[str, Any]] = [
            {
                "$search": {
                    "vectorSearch": {
                        "vector": embeddings,
                        "path": self._embedding_key,
                        "similarity": self._similarity_type,
                        "k": k,
                        "efSearch": ef_search,
                    }
                }
            }
        ]

        cursor = self._collection.aggregate(pipeline)

        docs = []

        for res in cursor:
            # Remove the text from the record so that only the remaining
            # fields end up in the metadata.
            text = res.pop(self._text_key)
            docs.append(Document(page_content=text, metadata=res))

        return docs

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        ef_search: int = 40,
        **kwargs: Any,
    ) -> List[Document]:
        """Embed ``query`` and return the most similar stored documents.

        Args:
            query: The text to search for.
            k: The number of documents to return.
            ef_search: Size of the dynamic candidate list the HNSW index uses
                during search.
            **kwargs: Unused.

        Returns:
            A list of documents closest to the query.
        """
        embeddings = self._embedding.embed_query(query)
        return self._similarity_search_without_score(
            embeddings=embeddings, k=k, ef_search=ef_search
        )
|
@ -0,0 +1,390 @@
|
||||
"""Test DocumentDBVectorSearch functionality."""
|
||||
import logging
|
||||
import os
|
||||
from time import sleep
|
||||
from typing import Any, Optional
|
||||
|
||||
import pytest
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.embeddings import OpenAIEmbeddings
|
||||
from langchain_community.vectorstores.documentdb import (
|
||||
DocumentDBSimilarityType,
|
||||
DocumentDBVectorSearch,
|
||||
)
|
||||
|
||||
# Emit DEBUG-level log records while the integration tests run.
logging.basicConfig(level=logging.DEBUG)

# OpenAI embedding settings; each can be overridden through the environment.
model_deployment = os.environ.get(
    "OPENAI_EMBEDDINGS_DEPLOYMENT", "smart-agent-embedding-ada"
)
model_name = os.environ.get("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")

# Target index and collection; the namespace is "<database>.<collection>".
INDEX_NAME = "langchain-test-index"
NAMESPACE = "langchain_test_db.langchain_test_collection"
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
CONNECTION_STRING = os.environ.get("DOCUMENTDB_URI", "")

# Index parameters the tests pass to ``create_index``.
dimensions = 1536
similarity_algorithm = DocumentDBSimilarityType.COS
|
||||
|
||||
|
||||
def prepare_collection() -> Any:
    """Connect with ``CONNECTION_STRING`` and return the test collection.

    ``pymongo`` is imported lazily so that merely importing this module does
    not require the driver to be installed.
    """
    from pymongo import MongoClient

    client: MongoClient = MongoClient(CONNECTION_STRING)
    database = client[DB_NAME]
    return database[COLLECTION_NAME]
|
||||
|
||||
|
||||
@pytest.fixture()
def collection() -> Any:
    """Provide the test collection returned by ``prepare_collection``."""
    test_collection = prepare_collection()
    return test_collection
|
||||
|
||||
|
||||
@pytest.fixture()
def embedding_openai() -> Any:
    """Provide an ``OpenAIEmbeddings`` built from the module-level settings."""
    return OpenAIEmbeddings(
        deployment=model_deployment, model=model_name, chunk_size=1
    )
|
||||
|
||||
|
||||
"""
|
||||
This is how to run the integration tests:
|
||||
|
||||
cd libs/community
|
||||
make test TEST_FILE=tests/integration_tests/vectorstores/test_documentdb.py
|
||||
|
||||
NOTE: You will first need to follow the contributor setup steps:
|
||||
https://python.langchain.com/docs/contributing/code. You will also need to install
|
||||
`pymongo` via `poetry`. You can also run the test directly using `pytest`, but please
|
||||
make sure to install all dependencies.
|
||||
"""
|
||||
|
||||
|
||||
class TestDocumentDBVectorSearch:
    """Integration tests for ``DocumentDBVectorSearch``.

    The tests talk to the collection returned by ``prepare_collection`` and
    embed text through OpenAI, so they need ``OPENAI_API_KEY`` to be set.
    Every test starts from an empty collection without indexes.
    """

    @classmethod
    def setup_class(cls) -> None:
        """Require an OpenAI key and an initially empty test collection."""
        if not os.getenv("OPENAI_API_KEY"):
            raise ValueError("OPENAI_API_KEY environment variable is not set")

        # ensure the test collection is empty
        collection = prepare_collection()
        assert collection.count_documents({}) == 0  # type: ignore[index]  # noqa: E501

    @classmethod
    def teardown_class(cls) -> None:
        """Leave the collection empty and without indexes after the run."""
        collection = prepare_collection()
        # delete all the documents in the collection
        collection.delete_many({})  # type: ignore[index]
        collection.drop_indexes()

    @pytest.fixture(autouse=True)
    def setup(self) -> None:
        """Reset the collection before every test."""
        collection = prepare_collection()
        # delete all the documents in the collection
        collection.delete_many({})  # type: ignore[index]
        collection.drop_indexes()

    def test_from_documents_cosine_distance(
        self, embedding_openai: OpenAIEmbeddings, collection: Any
    ) -> None:
        """Test end to end construction and search."""
        documents = [
            Document(page_content="Dogs are tough.", metadata={"a": 1}),
            Document(page_content="Cats have fluff.", metadata={"b": 1}),
            Document(page_content="What is a sandwich?", metadata={"c": 1}),
            Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
        ]

        vectorstore = DocumentDBVectorSearch.from_documents(
            documents,
            embedding_openai,
            collection=collection,
            index_name=INDEX_NAME,
        )
        sleep(1)  # waits for DocumentDB to save contents to the collection

        # Create the HNSW index that will be leveraged later for vector search
        vectorstore.create_index(dimensions, similarity_algorithm)
        sleep(2)  # waits for the index to be set up

        output = vectorstore.similarity_search("Sandwich", k=1)

        assert output
        assert output[0].page_content == "What is a sandwich?"
        assert output[0].metadata["c"] == 1
        vectorstore.delete_index()

    def test_from_documents_inner_product(
        self, embedding_openai: OpenAIEmbeddings, collection: Any
    ) -> None:
        """Test end to end construction and search."""
        documents = [
            Document(page_content="Dogs are tough.", metadata={"a": 1}),
            Document(page_content="Cats have fluff.", metadata={"b": 1}),
            Document(page_content="What is a sandwich?", metadata={"c": 1}),
            Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
        ]

        vectorstore = DocumentDBVectorSearch.from_documents(
            documents,
            embedding_openai,
            collection=collection,
            index_name=INDEX_NAME,
        )
        sleep(1)  # waits for DocumentDB to save contents to the collection

        # Create the HNSW index that will be leveraged later for vector search
        vectorstore.create_index(dimensions, DocumentDBSimilarityType.DOT)
        sleep(2)  # waits for the index to be set up

        output = vectorstore.similarity_search("Sandwich", k=1, ef_search=100)

        assert output
        assert output[0].page_content == "What is a sandwich?"
        assert output[0].metadata["c"] == 1
        vectorstore.delete_index()

    def test_from_texts_cosine_distance(
        self, embedding_openai: OpenAIEmbeddings, collection: Any
    ) -> None:
        """Build the store from plain texts and search with cosine similarity."""
        texts = [
            "Dogs are tough.",
            "Cats have fluff.",
            "What is a sandwich?",
            "That fence is purple.",
        ]
        vectorstore = DocumentDBVectorSearch.from_texts(
            texts,
            embedding_openai,
            collection=collection,
            index_name=INDEX_NAME,
        )

        # Create the HNSW index that will be leveraged later for vector search
        vectorstore.create_index(dimensions, similarity_algorithm)
        sleep(2)  # waits for the index to be set up

        output = vectorstore.similarity_search("Sandwich", k=1)

        # Fail with a clear assertion, not an IndexError, on an empty result.
        assert output
        assert output[0].page_content == "What is a sandwich?"
        vectorstore.delete_index()

    def test_from_texts_with_metadatas_cosine_distance(
        self, embedding_openai: OpenAIEmbeddings, collection: Any
    ) -> None:
        """Build the store from texts plus metadata; search with cosine."""
        texts = [
            "Dogs are tough.",
            "Cats have fluff.",
            "What is a sandwich?",
            "The fence is purple.",
        ]
        metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
        vectorstore = DocumentDBVectorSearch.from_texts(
            texts,
            embedding_openai,
            metadatas=metadatas,
            collection=collection,
            index_name=INDEX_NAME,
        )

        # Create the HNSW index that will be leveraged later for vector search
        vectorstore.create_index(dimensions, similarity_algorithm)
        sleep(2)  # waits for the index to be set up

        output = vectorstore.similarity_search("Sandwich", k=1)

        assert output
        assert output[0].page_content == "What is a sandwich?"
        assert output[0].metadata["c"] == 1

        vectorstore.delete_index()

    def test_from_texts_with_metadatas_delete_one(
        self, embedding_openai: OpenAIEmbeddings, collection: Any
    ) -> None:
        """Delete the best match by id and check it no longer comes back."""
        texts = [
            "Dogs are tough.",
            "Cats have fluff.",
            "What is a sandwich?",
            "The fence is purple.",
        ]
        metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
        vectorstore = DocumentDBVectorSearch.from_texts(
            texts,
            embedding_openai,
            metadatas=metadatas,
            collection=collection,
            index_name=INDEX_NAME,
        )

        # Create the HNSW index that will be leveraged later for vector search
        vectorstore.create_index(dimensions, similarity_algorithm)
        sleep(2)  # waits for the index to be set up

        output = vectorstore.similarity_search("Sandwich", k=1)

        assert output
        assert output[0].page_content == "What is a sandwich?"
        assert output[0].metadata["c"] == 1

        first_document_id = str(output[0].metadata["_id"])

        vectorstore.delete_document_by_id(first_document_id)
        sleep(2)  # waits for the index to be updated

        output2 = vectorstore.similarity_search("Sandwich", k=1, ef_search=10)
        assert output2
        assert output2[0].page_content != "What is a sandwich?"

        vectorstore.delete_index()

    def test_from_texts_with_metadatas_delete_multiple(
        self, embedding_openai: OpenAIEmbeddings, collection: Any
    ) -> None:
        """Delete three of the four documents and check only one remains."""
        texts = [
            "Dogs are tough.",
            "Cats have fluff.",
            "What is a sandwich?",
            "The fence is purple.",
        ]
        metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
        vectorstore = DocumentDBVectorSearch.from_texts(
            texts,
            embedding_openai,
            metadatas=metadatas,
            collection=collection,
            index_name=INDEX_NAME,
        )

        # Create the HNSW index that will be leveraged later for vector search
        vectorstore.create_index(dimensions, similarity_algorithm)
        sleep(2)  # waits for the index to be set up

        output = vectorstore.similarity_search("Sandwich", k=5)

        # Every id is converted to ``str`` so the three entries are uniform.
        # Explicit indices keep the IndexError if fewer than three documents
        # come back.
        document_ids = [str(output[i].metadata["_id"]) for i in range(3)]
        vectorstore.delete(document_ids)
        sleep(2)  # waits for the index to be updated

        output_2 = vectorstore.similarity_search("Sandwich", k=5)
        assert output
        assert output_2

        assert len(output) == 4  # we should see all the four documents
        assert (
            len(output_2) == 1
        )  # we should see only one document left after three have been deleted

        vectorstore.delete_index()

    def test_from_texts_with_metadatas_inner_product(
        self, embedding_openai: OpenAIEmbeddings, collection: Any
    ) -> None:
        """Build the store from texts plus metadata; search with dot product."""
        texts = [
            "Dogs are tough.",
            "Cats have fluff.",
            "What is a sandwich?",
            "The fence is purple.",
        ]
        metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
        vectorstore = DocumentDBVectorSearch.from_texts(
            texts,
            embedding_openai,
            metadatas=metadatas,
            collection=collection,
            index_name=INDEX_NAME,
        )

        # Create the HNSW index that will be leveraged later for vector search
        vectorstore.create_index(dimensions, DocumentDBSimilarityType.DOT)
        sleep(2)  # waits for the index to be set up

        output = vectorstore.similarity_search("Sandwich", k=1)

        assert output
        assert output[0].page_content == "What is a sandwich?"
        assert output[0].metadata["c"] == 1
        vectorstore.delete_index()

    def test_from_texts_with_metadatas_euclidean_distance(
        self, embedding_openai: OpenAIEmbeddings, collection: Any
    ) -> None:
        """Build the store from texts plus metadata; search with Euclidean."""
        texts = [
            "Dogs are tough.",
            "Cats have fluff.",
            "What is a sandwich?",
            "The fence is purple.",
        ]
        metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
        vectorstore = DocumentDBVectorSearch.from_texts(
            texts,
            embedding_openai,
            metadatas=metadatas,
            collection=collection,
            index_name=INDEX_NAME,
        )

        # Create the HNSW index that will be leveraged later for vector search
        vectorstore.create_index(dimensions, DocumentDBSimilarityType.EUC)
        sleep(2)  # waits for the index to be set up

        output = vectorstore.similarity_search("Sandwich", k=1)

        assert output
        assert output[0].page_content == "What is a sandwich?"
        assert output[0].metadata["c"] == 1
        vectorstore.delete_index()

    def invoke_delete_with_no_args(
        self, embedding_openai: OpenAIEmbeddings, collection: Any
    ) -> Optional[bool]:
        """Call ``delete()`` without ids on a store built from the URI."""
        vectorstore: DocumentDBVectorSearch = (
            DocumentDBVectorSearch.from_connection_string(
                CONNECTION_STRING,
                NAMESPACE,
                embedding_openai,
                index_name=INDEX_NAME,
            )
        )

        return vectorstore.delete()

    def invoke_delete_by_id_with_no_args(
        self, embedding_openai: OpenAIEmbeddings, collection: Any
    ) -> None:
        """Call ``delete_document_by_id()`` without an id."""
        vectorstore: DocumentDBVectorSearch = (
            DocumentDBVectorSearch.from_connection_string(
                CONNECTION_STRING,
                NAMESPACE,
                embedding_openai,
                index_name=INDEX_NAME,
            )
        )

        vectorstore.delete_document_by_id()

    def test_invalid_arguments_to_delete(
        self, embedding_openai: OpenAIEmbeddings, collection: Any
    ) -> None:
        """``delete()`` without ids must raise a descriptive ``ValueError``."""
        with pytest.raises(ValueError) as exception_info:
            self.invoke_delete_with_no_args(embedding_openai, collection)
        # Checked after the ``with`` block: a statement placed after the
        # raising call inside the block would never execute.
        assert str(exception_info.value) == "No document ids provided to delete."

    def test_no_arguments_to_delete_by_id(
        self, embedding_openai: OpenAIEmbeddings, collection: Any
    ) -> None:
        """``delete_document_by_id()`` without an id must raise ``ValueError``."""
        # ``ValueError`` rather than a bare ``Exception`` so that an unrelated
        # failure (e.g. a connection error) cannot satisfy the expectation.
        with pytest.raises(ValueError) as exception_info:
            self.invoke_delete_by_id_with_no_args(embedding_openai, collection)
        assert str(exception_info.value) == "No document id provided to delete."
|
@ -57,6 +57,7 @@ def test_compatible_vectorstore_documentation() -> None:
|
||||
"DatabricksVectorSearch",
|
||||
"DeepLake",
|
||||
"Dingo",
|
||||
"DocumentDBVectorSearch",
|
||||
"ElasticVectorSearch",
|
||||
"ElasticsearchStore",
|
||||
"FAISS",
|
||||
|
@ -24,6 +24,7 @@ _EXPECTED = [
|
||||
"DistanceStrategy",
|
||||
"DocArrayHnswSearch",
|
||||
"DocArrayInMemorySearch",
|
||||
"DocumentDBVectorSearch",
|
||||
"ElasticKnnSearch",
|
||||
"ElasticVectorSearch",
|
||||
"ElasticsearchStore",
|
||||
|
Loading…
Reference in New Issue
Block a user