LangChain Community: VectorStores: Azure Cosmos DB Filtered Vector Search (#24087)

Thank you for contributing to LangChain!

- This PR adds vector search filtering for Azure Cosmos DB Mongo vCore
and NoSQL.


- [ ] **PR message**: ***Delete this entire checklist*** and replace
with
    - **Description:** a description of the change
    - **Issue:** the issue # it fixes, if applicable
    - **Dependencies:** any dependencies required for this change
- **Twitter handle:** if your PR gets announced, and you'd like a
mention, we'll gladly shout you out!


- [ ] **Add tests and docs**: If you're adding a new integration, please
include
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.


- [ ] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional
ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in
langchain.

If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
This commit is contained in:
Aayush Kataria
2024-07-23 16:59:23 -07:00
committed by GitHub
parent ac41c97d21
commit 0f45ac4088
4 changed files with 222 additions and 42 deletions

View File

@@ -25,7 +25,7 @@ model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")
# Names of the vector indexes created by these integration tests.
INDEX_NAME = "langchain-test-index"
INDEX_NAME_VECTOR_HNSW = "langchain-test-index-hnsw"
# "<database>.<collection>" namespace; split into its two parts below.
NAMESPACE = "langchain_test_db.langchain_test_collection"
# The connection string is read from the environment only. A literal URI with
# an embedded username/password must never be committed to the repository.
# NOTE(review): any credential that was previously hard-coded here should be
# treated as compromised and rotated.
CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "")
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
num_lists = 3

View File

@@ -104,6 +104,7 @@ class TestAzureCosmosDBNoSqlVectorSearch:
),
indexing_policy=get_vector_indexing_policy("flat"),
cosmos_container_properties={"partition_key": partition_key},
cosmos_database_properties={},
)
sleep(1) # waits for Cosmos DB to save contents to the collection
@@ -139,6 +140,7 @@ class TestAzureCosmosDBNoSqlVectorSearch:
),
indexing_policy=get_vector_indexing_policy("flat"),
cosmos_container_properties={"partition_key": partition_key},
cosmos_database_properties={},
)
sleep(1) # waits for Cosmos DB to save contents to the collection
@@ -154,3 +156,60 @@ class TestAzureCosmosDBNoSqlVectorSearch:
assert output2
assert output2[0].page_content != "Dogs are tough."
safe_delete_database(cosmos_client)
def test_from_documents_cosine_distance_with_filtering(
    self,
    cosmos_client: Any,
    partition_key: Any,
    azure_openai_embeddings: OpenAIEmbeddings,
) -> None:
    """Test similarity search with and without a ``pre_filter``.

    Builds a store from four documents and then checks that:

    * an unfiltered search with ``k=4`` returns all four documents;
    * a ``where_clause`` on ``c.metadata.a=1`` narrows the result to two;
    * adding the ``limit_offset_clause`` "OFFSET 0 LIMIT 1" narrows it to one.

    In every case the top hit is expected to be "Dogs are tough." with
    ``metadata["a"] == 1``.
    """
    documents = [
        Document(page_content="Dogs are tough.", metadata={"a": 1}),
        Document(page_content="Cats have fluff.", metadata={"a": 1}),
        Document(page_content="What is a sandwich?", metadata={"c": 1}),
        Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
    ]
    # Shared by both filtered searches below.
    where_clause = "WHERE c.metadata.a=1"

    try:
        store = AzureCosmosDBNoSqlVectorSearch.from_documents(
            documents,
            azure_openai_embeddings,
            cosmos_client=cosmos_client,
            database_name=database_name,
            container_name=container_name,
            vector_embedding_policy=get_vector_embedding_policy(
                "cosine", "float32", 400
            ),
            indexing_policy=get_vector_indexing_policy("flat"),
            cosmos_container_properties={"partition_key": partition_key},
            cosmos_database_properties={},
        )
        sleep(1)  # waits for Cosmos DB to save contents to the collection

        # Baseline: no filter, every document comes back.
        output = store.similarity_search("Dogs", k=4)
        assert len(output) == 4
        assert output[0].page_content == "Dogs are tough."
        assert output[0].metadata["a"] == 1

        # WHERE clause only: just the two documents with metadata.a == 1.
        pre_filter = {
            "where_clause": where_clause,
        }
        output = store.similarity_search(
            "Dogs", k=4, pre_filter=pre_filter, with_embedding=True
        )
        assert len(output) == 2
        assert output[0].page_content == "Dogs are tough."
        assert output[0].metadata["a"] == 1

        # WHERE plus OFFSET/LIMIT: the same filter capped at one result.
        pre_filter = {
            "where_clause": where_clause,
            "limit_offset_clause": "OFFSET 0 LIMIT 1",
        }
        output = store.similarity_search("Dogs", k=4, pre_filter=pre_filter)
        assert len(output) == 1
        assert output[0].page_content == "Dogs are tough."
        assert output[0].metadata["a"] == 1
    finally:
        # Clean up even when an assertion above fails, so a failed run does
        # not leave the test database behind for subsequent tests.
        # NOTE(review): presumably safe_delete_database tolerates a database
        # that was never created - confirm against its definition.
        safe_delete_database(cosmos_client)