mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-18 21:41:24 +00:00
adding MongoDBAtlasVectorSearch (#5338)
# Add MongoDBAtlasVectorSearch for the python library Fixes #5337 --------- Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
This commit is contained in:
parent
c4b502a470
commit
a61b7f7e7c
@ -0,0 +1,170 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "683953b3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# MongoDB Atlas Vector Search\n",
|
||||
"\n",
|
||||
">[MongoDB Atlas](https://www.mongodb.com/docs/atlas/) is a document database managed in the cloud. It also enables Lucene and its vector search feature.\n",
|
||||
"\n",
|
||||
"This notebook shows how to use the functionality related to the `MongoDB Atlas Vector Search` feature where you can store your embeddings in MongoDB documents and create a Lucene vector index to perform a KNN search.\n",
|
||||
"\n",
|
||||
"It uses the [knnBeta Operator](https://www.mongodb.com/docs/atlas/atlas-search/knn-beta) available in MongoDB Atlas Search. This feature is in early access and available only for evaluation purposes, to validate functionality, and to gather feedback from a small closed group of early access users. It is not recommended for production deployments as we may introduce breaking changes.\n",
|
||||
"\n",
|
||||
"To use MongoDB Atlas, you must have first deployed a cluster. Free clusters are available. \n",
|
||||
"Here is the MongoDB Atlas [quick start](https://www.mongodb.com/docs/atlas/getting-started/)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b4c41cad-08ef-4f72-a545-2151e4598efe",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install pymongo"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c1e38361-c1fe-4ac6-86e9-c90ebaf7ae87",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"MONGODB_ATLAS_URI = os.environ['MONGODB_ATLAS_URI']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "320af802-9271-46ee-948f-d2453933d44b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We want to use `OpenAIEmbeddings` so we have to get the OpenAI API Key. Make sure the environment variable `OPENAI_API_KEY` is set up before proceeding."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1f3ecc42",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now, let's create a Lucene vector index on your cluster. In the below example, `embedding` is the name of the field that contains the embedding vector. Please refer to the [documentation](https://www.mongodb.com/docs/atlas/atlas-search/define-field-mappings-for-vector-search) to get more details on how to define an Atlas Search index.\n",
|
||||
"You can name the index `langchain_demo` and create the index on the namespace `lanchain_db.langchain_col`. Finally, write the following definition in the JSON editor:\n",
|
||||
"\n",
|
||||
"```json\n",
|
||||
"{\n",
|
||||
" \"mappings\": {\n",
|
||||
" \"dynamic\": true,\n",
|
||||
" \"fields\": {\n",
|
||||
" \"embedding\": {\n",
|
||||
" \"dimensions\": 1536,\n",
|
||||
" \"similarity\": \"cosine\",\n",
|
||||
" \"type\": \"knnVector\"\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "aac9563e",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
||||
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||
"from langchain.vectorstores import MongoDBAtlasVectorSearch\n",
|
||||
"from langchain.document_loaders import TextLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "a3c3999a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import TextLoader\n",
|
||||
"loader = TextLoader('../../../state_of_the_union.txt')\n",
|
||||
"documents = loader.load()\n",
|
||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
||||
"docs = text_splitter.split_documents(documents)\n",
|
||||
"\n",
|
||||
"embeddings = OpenAIEmbeddings()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6e104aee",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pymongo import MongoClient\n",
|
||||
"\n",
|
||||
"# initialize MongoDB python client\n",
|
||||
"client = MongoClient(MONGODB_ATLAS_CONNECTION_STRING)\n",
|
||||
"\n",
|
||||
"db_name = \"lanchain_db\"\n",
|
||||
"collection_name = \"langchain_col\"\n",
|
||||
"namespace = f\"{db_name}.{collection_name}\"\n",
|
||||
"index_name = \"langchain_demo\"\n",
|
||||
"\n",
|
||||
"# insert the documents in MongoDB Atlas with their embedding\n",
|
||||
"docsearch = MongoDBAtlasVectorSearch.from_documents(\n",
|
||||
" docs,\n",
|
||||
" embeddings,\n",
|
||||
" client=client,\n",
|
||||
" namespace=namespace,\n",
|
||||
" index_name=index_name\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# perform a similarity search between the embedding of the query and the embeddings of the documents\n",
|
||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||
"docs = docsearch.similarity_search(query)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9c608226",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(docs[0].page_content)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -10,6 +10,7 @@ from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
|
||||
from langchain.vectorstores.faiss import FAISS
|
||||
from langchain.vectorstores.lancedb import LanceDB
|
||||
from langchain.vectorstores.milvus import Milvus
|
||||
from langchain.vectorstores.mongodb_atlas import MongoDBAtlasVectorSearch
|
||||
from langchain.vectorstores.myscale import MyScale, MyScaleSettings
|
||||
from langchain.vectorstores.opensearch_vector_search import OpenSearchVectorSearch
|
||||
from langchain.vectorstores.pinecone import Pinecone
|
||||
@ -38,6 +39,7 @@ __all__ = [
|
||||
"AtlasDB",
|
||||
"DeepLake",
|
||||
"Annoy",
|
||||
"MongoDBAtlasVectorSearch",
|
||||
"MyScale",
|
||||
"MyScaleSettings",
|
||||
"SKLearnVectorStore",
|
||||
|
270
langchain/vectorstores/mongodb_atlas.py
Normal file
270
langchain/vectorstores/mongodb_atlas.py
Normal file
@ -0,0 +1,270 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
Generator,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Union,
|
||||
)
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.embeddings.base import Embeddings
|
||||
from langchain.vectorstores.base import VectorStore
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pymongo import MongoClient
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default number of texts that add_texts groups into one batch (one embedding call
# and one insert) when the caller does not pass a ``batch_size`` keyword argument.
DEFAULT_INSERT_BATCH_SIZE = 100
|
||||
|
||||
|
||||
class MongoDBAtlasVectorSearch(VectorStore):
    """Wrapper around MongoDB Atlas Vector Search.

    To use, you should have both:
    - the ``pymongo`` python package installed
    - a connection string associated with a MongoDB Atlas Cluster having deployed an
        Atlas Search index

    Example:
        .. code-block:: python

            from langchain.vectorstores import MongoDBAtlasVectorSearch
            from langchain.embeddings.openai import OpenAIEmbeddings
            from pymongo import MongoClient

            mongo_client = MongoClient("<YOUR-CONNECTION-STRING>")
            namespace = "<db_name>.<collection_name>"
            embeddings = OpenAIEmbeddings()
            vectorstore = MongoDBAtlasVectorSearch(mongo_client, namespace, embeddings)
    """

    def __init__(
        self,
        client: MongoClient,
        namespace: str,
        embedding: Embeddings,
        *,
        index_name: str = "default",
        text_key: str = "text",
        embedding_key: str = "embedding",
    ):
        """
        Args:
            client: MongoDB client.
            namespace: MongoDB namespace to add the texts to, written as
                ``<db_name>.<collection_name>``. Only the first ``.`` separates
                the database from the collection, so collection names that
                themselves contain dots are accepted.
            embedding: Text embedding model to use.
            index_name: Name of the Atlas Search index passed to ``$search``.
            text_key: MongoDB field that will contain the text for each
                document.
            embedding_key: MongoDB field that will contain the embedding for
                each document.

        Raises:
            ValueError: If ``namespace`` contains no ``.`` separator.
        """
        # partition() splits on the first dot only; a plain split(".") would
        # reject namespaces such as "db.coll.sub".
        db_name, separator, collection_name = namespace.partition(".")
        if not separator:
            raise ValueError(
                "namespace must have the form '<db_name>.<collection_name>', "
                f"got {namespace!r}."
            )
        self._client = client
        self._collection = client[db_name][collection_name]
        self._embedding = embedding
        self._index_name = index_name
        self._text_key = text_key
        self._embedding_key = embedding_key

    @classmethod
    def from_connection_string(
        cls,
        connection_string: str,
        namespace: str,
        embedding: Embeddings,
        **kwargs: Any,
    ) -> MongoDBAtlasVectorSearch:
        """Create the vector store from a MongoDB connection string.

        Args:
            connection_string: Connection string handed to ``MongoClient``.
            namespace: MongoDB namespace, ``<db_name>.<collection_name>``.
            embedding: Text embedding model to use.
            **kwargs: Forwarded to the constructor (``index_name``,
                ``text_key``, ``embedding_key``).

        Returns:
            A new ``MongoDBAtlasVectorSearch`` bound to a freshly created client.

        Raises:
            ImportError: If ``pymongo`` is not installed.
        """
        try:
            from pymongo import MongoClient
        except ImportError:
            raise ImportError(
                "Could not import pymongo, please install it with "
                "`pip install pymongo`."
            )
        client: MongoClient = MongoClient(connection_string)
        return cls(client, namespace, embedding, **kwargs)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[Dict[str, Any]]] = None,
        **kwargs: Any,
    ) -> List:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore. One-shot
                iterators (e.g. generators) are supported.
            metadatas: Optional list of metadatas associated with the texts.
            **kwargs: ``batch_size`` sets how many texts are embedded and
                inserted per batch (default ``DEFAULT_INSERT_BATCH_SIZE``);
                values below 1 behave like 1.

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        from itertools import repeat

        batch_size = kwargs.get("batch_size", DEFAULT_INSERT_BATCH_SIZE)
        # The default metadata stream must not iterate ``texts`` itself: when
        # ``texts`` is a generator that would steal every other text from zip().
        # An endless stream of fresh dicts is safe because zip() stops as soon
        # as ``texts`` is exhausted.
        _metadatas: Union[List, Generator] = metadatas or ({} for _ in repeat(None))
        texts_batch: List[str] = []
        metadatas_batch: List[Dict[str, Any]] = []
        result_ids: List = []
        for text, metadata in zip(texts, _metadatas):
            texts_batch.append(text)
            metadatas_batch.append(metadata)
            # Flush on a full batch; ">=" also avoids a ZeroDivisionError-style
            # failure for batch_size=0 by degrading to one insert per text.
            if len(texts_batch) >= batch_size:
                result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
                texts_batch = []
                metadatas_batch = []
        if texts_batch:
            result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
        return result_ids

    def _insert_texts(self, texts: List[str], metadatas: List[Dict[str, Any]]) -> List:
        """Embed one batch of texts and insert it with a single ``insert_many``.

        Args:
            texts: Texts of the batch.
            metadatas: Metadata dicts, paired positionally with ``texts``.

        Returns:
            The ``inserted_ids`` reported by MongoDB, or ``[]`` for an empty batch.
        """
        if not texts:
            return []
        # Embed and create the documents
        embeddings = self._embedding.embed_documents(texts)
        # Metadata is spread last, so a metadata key equal to the text or
        # embedding key overrides that field.
        to_insert = [
            {self._text_key: t, self._embedding_key: embedding, **m}
            for t, m, embedding in zip(texts, metadatas, embeddings)
        ]
        # insert the documents in MongoDB Atlas
        insert_result = self._collection.insert_many(to_insert)
        return insert_result.inserted_ids

    def similarity_search_with_score(
        self,
        query: str,
        *,
        k: int = 4,
        pre_filter: Optional[dict] = None,
        post_filter_pipeline: Optional[List[Dict]] = None,
    ) -> List[Tuple[Document, float]]:
        """Return MongoDB documents most similar to query, along with scores.

        Use the knnBeta Operator available in MongoDB Atlas Search
        This feature is in early access and available only for evaluation purposes, to
        validate functionality, and to gather feedback from a small closed group of
        early access users. It is not recommended for production deployments as we
        may introduce breaking changes.
        For more: https://www.mongodb.com/docs/atlas/atlas-search/knn-beta

        Args:
            query: Text to look up documents similar to.
            k: Optional Number of Documents to return. Defaults to 4.
            pre_filter: Optional Dictionary of argument(s) to prefilter on document
                fields.
            post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages
                following the knnBeta search.

        Returns:
            List of Documents most similar to the query and score for each
        """
        knn_beta = {
            "vector": self._embedding.embed_query(query),
            "path": self._embedding_key,
            "k": k,
        }
        if pre_filter:
            knn_beta["filter"] = pre_filter
        pipeline = [
            {
                "$search": {
                    "index": self._index_name,
                    "knnBeta": knn_beta,
                }
            },
            # Expose the search score and drop the embedding vector from results.
            {"$project": {"score": {"$meta": "searchScore"}, self._embedding_key: 0}},
        ]
        if post_filter_pipeline is not None:
            pipeline.extend(post_filter_pipeline)
        cursor = self._collection.aggregate(pipeline)
        docs = []
        for res in cursor:
            # Whatever remains after removing text and score becomes metadata.
            text = res.pop(self._text_key)
            score = res.pop("score")
            docs.append((Document(page_content=text, metadata=res), score))
        return docs

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        pre_filter: Optional[dict] = None,
        post_filter_pipeline: Optional[List[Dict]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return MongoDB documents most similar to query.

        Use the knnBeta Operator available in MongoDB Atlas Search
        This feature is in early access and available only for evaluation purposes, to
        validate functionality, and to gather feedback from a small closed group of
        early access users. It is not recommended for production deployments as we may
        introduce breaking changes.
        For more: https://www.mongodb.com/docs/atlas/atlas-search/knn-beta

        Args:
            query: Text to look up documents similar to.
            k: Optional Number of Documents to return. Defaults to 4.
            pre_filter: Optional Dictionary of argument(s) to prefilter on document
                fields.
            post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages
                following the knnBeta search.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            List of Documents most similar to the query (scores are dropped).
        """
        docs_and_scores = self.similarity_search_with_score(
            query,
            k=k,
            pre_filter=pre_filter,
            post_filter_pipeline=post_filter_pipeline,
        )
        return [doc for doc, _ in docs_and_scores]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        client: Optional[MongoClient] = None,
        namespace: Optional[str] = None,
        **kwargs: Any,
    ) -> MongoDBAtlasVectorSearch:
        """Construct MongoDBAtlasVectorSearch wrapper from raw documents.

        This is a user-friendly interface that:
            1. Embeds documents.
            2. Adds the documents to a provided MongoDB Atlas Vector Search index
                (Lucene)

        This is intended to be a quick way to get started.

        Args:
            texts: Texts to embed and insert.
            embedding: Text embedding model to use.
            metadatas: Optional list of metadatas associated with the texts.
            client: MongoDB client (required).
            namespace: MongoDB namespace, ``<db_name>.<collection_name>``
                (required).
            **kwargs: Forwarded to the constructor (``index_name``,
                ``text_key``, ``embedding_key``).

        Raises:
            ValueError: If ``client`` or ``namespace`` is missing.

        Example:
            .. code-block:: python
                from pymongo import MongoClient

                from langchain.vectorstores import MongoDBAtlasVectorSearch
                from langchain.embeddings import OpenAIEmbeddings

                client = MongoClient("<YOUR-CONNECTION-STRING>")
                namespace = "<db_name>.<collection_name>"
                embeddings = OpenAIEmbeddings()
                vectorstore = MongoDBAtlasVectorSearch.from_texts(
                    texts,
                    embeddings,
                    metadatas=metadatas,
                    client=client,
                    namespace=namespace
                )
        """
        # Identity check: "missing" means None, independent of how the client
        # object defines truthiness.
        if client is None or not namespace:
            raise ValueError("Must provide 'client' and 'namespace' named parameters.")
        vecstore = cls(client, namespace, embedding, **kwargs)
        vecstore.add_texts(texts, metadatas=metadatas)
        return vecstore
|
6
poetry.lock
generated
6
poetry.lock
generated
@ -6965,7 +6965,7 @@ tests = ["duckdb", "polars[pandas,pyarrow]", "pytest"]
|
||||
name = "pymongo"
|
||||
version = "4.3.3"
|
||||
description = "Python driver for MongoDB <http://www.mongodb.org>"
|
||||
category = "dev"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
@ -10948,7 +10948,7 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
|
||||
cffi = ["cffi (>=1.11)"]
|
||||
|
||||
[extras]
|
||||
all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "langkit", "lark", "lxml", "manifest-ml", "momento", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "redis", "requests-toolbelt", "sentence-transformers", "spacy", "steamship", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
|
||||
all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "langkit", "lark", "lxml", "manifest-ml", "momento", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "redis", "requests-toolbelt", "sentence-transformers", "spacy", "steamship", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
|
||||
azure = ["azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-core", "azure-cosmos", "azure-identity", "openai"]
|
||||
cohere = ["cohere"]
|
||||
docarray = ["docarray"]
|
||||
@ -10962,4 +10962,4 @@ text-helpers = ["chardet"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
content-hash = "b3dc23f376de141d22b729d038144a1e6d66983a910160c3500fe0d79f8e5917"
|
||||
content-hash = "937d2f0165f6aa381ea1e26002272a92b189ab18607bd05895e36d23f56978f4"
|
||||
|
@ -36,6 +36,7 @@ jinja2 = {version = "^3", optional = true}
|
||||
tiktoken = {version = "^0.3.2", optional = true, python="^3.9"}
|
||||
pinecone-client = {version = "^2", optional = true}
|
||||
pinecone-text = {version = "^0.4.2", optional = true}
|
||||
pymongo = {version = "^4.3.3", optional = true}
|
||||
clickhouse-connect = {version="^0.5.14", optional=true}
|
||||
weaviate-client = {version = "^3", optional = true}
|
||||
google-api-python-client = {version = "2.70.0", optional = true}
|
||||
@ -159,6 +160,7 @@ elasticsearch = {extras = ["async"], version = "^8.6.2"}
|
||||
redis = "^4.5.4"
|
||||
pinecone-client = "^2.2.1"
|
||||
pinecone-text = "^0.4.2"
|
||||
pymongo = "^4.3.3"
|
||||
clickhouse-connect = "^0.5.14"
|
||||
pgvector = "^0.1.6"
|
||||
transformers = "^4.27.4"
|
||||
@ -174,7 +176,6 @@ gptcache = "^0.1.9"
|
||||
promptlayer = "^0.1.80"
|
||||
tair = "^1.3.3"
|
||||
wikipedia = "^1"
|
||||
pymongo = "^4.3.3"
|
||||
cassandra-driver = "^3.27.0"
|
||||
arxiv = "^1.4"
|
||||
mastodon-py = "^1.8.1"
|
||||
@ -234,6 +235,7 @@ all = [
|
||||
"jinja2",
|
||||
"pinecone-client",
|
||||
"pinecone-text",
|
||||
"pymongo",
|
||||
"weaviate-client",
|
||||
"redis",
|
||||
"google-api-python-client",
|
||||
|
@ -22,4 +22,8 @@ PINECONE_ENVIRONMENT=us-west4-gcp
|
||||
# details here https://learn.microsoft.com/en-us/dotnet/api/azure.identity.defaultazurecredential?view=azure-dotnet
|
||||
POWERBI_DATASET_ID=_powerbi_dataset_id_here
|
||||
POWERBI_TABLE_NAME=_test_table_name_here
|
||||
POWERBI_NUMROWS=_num_rows_in_your_test_table
|
||||
POWERBI_NUMROWS=_num_rows_in_your_test_table
|
||||
|
||||
|
||||
# MongoDB Atlas Vector Search
|
||||
MONGODB_ATLAS_URI=your_mongodb_atlas_connection_string
|
135
tests/integration_tests/vectorstores/test_mongodb_atlas.py
Normal file
135
tests/integration_tests/vectorstores/test_mongodb_atlas.py
Normal file
@ -0,0 +1,135 @@
|
||||
"""Test MongoDB Atlas Vector Search functionality."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from time import sleep
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.embeddings.base import Embeddings
|
||||
from langchain.vectorstores.mongodb_atlas import MongoDBAtlasVectorSearch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pymongo import MongoClient
|
||||
|
||||
INDEX_NAME = "langchain-test-index"
|
||||
NAMESPACE = "langchain_test_db.langchain_test_collection"
|
||||
# Atlas connection string read from the MONGODB_ATLAS_URI environment variable;
# os.environ.get leaves this as None when the variable is not set.
CONNECTION_STRING = os.environ.get("MONGODB_ATLAS_URI")
|
||||
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
|
||||
|
||||
|
||||
def get_test_client() -> Optional[MongoClient]:
    """Build the MongoDB client shared by this test module.

    Returns:
        A ``MongoClient`` created from ``CONNECTION_STRING``, or ``None`` when
        the client cannot be created (for example ``pymongo`` is not installed).
    """
    try:
        from pymongo import MongoClient

        client: MongoClient = MongoClient(CONNECTION_STRING)
        return client
    except Exception:
        # Deliberately best-effort: any ordinary failure yields None. Unlike a
        # bare ``except:``, this lets KeyboardInterrupt and SystemExit propagate.
        return None
|
||||
|
||||
|
||||
# Instantiate as constant instead of pytest fixture to prevent needing to make multiple
|
||||
# connections.
|
||||
TEST_CLIENT = get_test_client()
|
||||
|
||||
|
||||
class TestMongoDBAtlasVectorSearch:
    """Integration tests for ``MongoDBAtlasVectorSearch`` using ``TEST_CLIENT``."""

    @classmethod
    def setup_class(cls) -> None:
        """Check that the test collection holds no documents before any test."""
        # ensure the test collection is empty
        collection = TEST_CLIENT[DB_NAME][COLLECTION_NAME]  # type: ignore[index]
        assert collection.count_documents({}) == 0

    @classmethod
    def teardown_class(cls) -> None:
        """Remove every document from the test collection after the last test."""
        collection = TEST_CLIENT[DB_NAME][COLLECTION_NAME]  # type: ignore[index]
        collection.delete_many({})

    @pytest.fixture(autouse=True)
    def setup(self) -> None:
        """Empty the test collection before each test."""
        collection = TEST_CLIENT[DB_NAME][COLLECTION_NAME]  # type: ignore[index]
        collection.delete_many({})

    def test_from_documents(self, embedding_openai: Embeddings) -> None:
        """Test end to end construction and search."""
        contents_and_metadata = [
            ("Dogs are tough.", {"a": 1}),
            ("Cats have fluff.", {"b": 1}),
            ("What is a sandwich?", {"c": 1}),
            ("That fence is purple.", {"d": 1, "e": 2}),
        ]
        documents = [
            Document(page_content=content, metadata=metadata)
            for content, metadata in contents_and_metadata
        ]
        store = MongoDBAtlasVectorSearch.from_documents(
            documents,
            embedding_openai,
            client=TEST_CLIENT,
            namespace=NAMESPACE,
            index_name=INDEX_NAME,
        )
        sleep(1)  # waits for mongot to update Lucene's index
        hits = store.similarity_search("Sandwich", k=1)
        best = hits[0]
        assert best.page_content == "What is a sandwich?"
        assert best.metadata["c"] == 1

    def test_from_texts(self, embedding_openai: Embeddings) -> None:
        """Build the store from plain texts and search it."""
        store = MongoDBAtlasVectorSearch.from_texts(
            [
                "Dogs are tough.",
                "Cats have fluff.",
                "What is a sandwich?",
                "That fence is purple.",
            ],
            embedding_openai,
            client=TEST_CLIENT,
            namespace=NAMESPACE,
            index_name=INDEX_NAME,
        )
        sleep(1)  # waits for mongot to update Lucene's index
        hits = store.similarity_search("Sandwich", k=1)
        assert hits[0].page_content == "What is a sandwich?"

    def test_from_texts_with_metadatas(self, embedding_openai: Embeddings) -> None:
        """Build the store from texts plus metadata and check both come back."""
        store = MongoDBAtlasVectorSearch.from_texts(
            [
                "Dogs are tough.",
                "Cats have fluff.",
                "What is a sandwich?",
                "The fence is purple.",
            ],
            embedding_openai,
            metadatas=[{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}],
            client=TEST_CLIENT,
            namespace=NAMESPACE,
            index_name=INDEX_NAME,
        )
        sleep(1)  # waits for mongot to update Lucene's index
        hits = store.similarity_search("Sandwich", k=1)
        best = hits[0]
        assert best.page_content == "What is a sandwich?"
        assert best.metadata["c"] == 1

    def test_from_texts_with_metadatas_and_pre_filter(
        self, embedding_openai: Embeddings
    ) -> None:
        """Search with a range pre-filter on ``c`` and expect no results."""
        store = MongoDBAtlasVectorSearch.from_texts(
            [
                "Dogs are tough.",
                "Cats have fluff.",
                "What is a sandwich?",
                "The fence is purple.",
            ],
            embedding_openai,
            metadatas=[{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}],
            client=TEST_CLIENT,
            namespace=NAMESPACE,
            index_name=INDEX_NAME,
        )
        sleep(1)  # waits for mongot to update Lucene's index
        range_filter = {"range": {"lte": 0, "path": "c"}}
        hits = store.similarity_search("Sandwich", k=1, pre_filter=range_filter)
        assert hits == []
|
Loading…
Reference in New Issue
Block a user