diff --git a/docs/docs/integrations/vectorstores/databricks_vector_search.ipynb b/docs/docs/integrations/vectorstores/databricks_vector_search.ipynb index 50cc25b1119..8509b506910 100644 --- a/docs/docs/integrations/vectorstores/databricks_vector_search.ipynb +++ b/docs/docs/integrations/vectorstores/databricks_vector_search.ipynb @@ -174,7 +174,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Similarity search" + "## Similarity search\n", + "Optional keyword arguments to similarity_search include specifying k number of documents to retrieve, \n", + "a filters dictionary for metadata filtering based on [this syntax](https://docs.databricks.com/en/generative-ai/create-query-vector-search.html#use-filters-on-queries),\n", + "as well as the [query_type](https://api-docs.databricks.com/python/vector-search/databricks.vector_search.html#databricks.vector_search.index.VectorSearchIndex.similarity_search) which can be ANN or HYBRID " ] }, { diff --git a/libs/community/langchain_community/vectorstores/databricks_vector_search.py b/libs/community/langchain_community/vectorstores/databricks_vector_search.py index 0a5e7a6d514..dd76b61797a 100644 --- a/libs/community/langchain_community/vectorstores/databricks_vector_search.py +++ b/libs/community/langchain_community/vectorstores/databricks_vector_search.py @@ -277,7 +277,13 @@ class DatabricksVectorSearch(VectorStore): return True def similarity_search( - self, query: str, k: int = 4, filters: Optional[Any] = None, **kwargs: Any + self, + query: str, + k: int = 4, + filters: Optional[Any] = None, + *, + query_type: Optional[str] = None, + **kwargs: Any, ) -> List[Document]: """Return docs most similar to query. @@ -285,17 +291,24 @@ class DatabricksVectorSearch(VectorStore): query: Text to look up documents similar to. k: Number of Documents to return. Defaults to 4. filters: Filters to apply to the query. Defaults to None. + query_type: The type of this query. Supported values are "ANN" and "HYBRID". 
Returns: List of Documents most similar to the embedding. """ docs_with_score = self.similarity_search_with_score( - query=query, k=k, filters=filters, **kwargs + query=query, k=k, filters=filters, query_type=query_type, **kwargs ) return [doc for doc, _ in docs_with_score] def similarity_search_with_score( - self, query: str, k: int = 4, filters: Optional[Any] = None, **kwargs: Any + self, + query: str, + k: int = 4, + filters: Optional[Any] = None, + *, + query_type: Optional[str] = None, + **kwargs: Any, ) -> List[Tuple[Document, float]]: """Return docs most similar to query, along with scores. @@ -303,6 +316,7 @@ class DatabricksVectorSearch(VectorStore): query: Text to look up documents similar to. k: Number of Documents to return. Defaults to 4. filters: Filters to apply to the query. Defaults to None. + query_type: The type of this query. Supported values are "ANN" and "HYBRID". Returns: List of Documents most similar to the embedding and score for each. @@ -321,6 +335,7 @@ class DatabricksVectorSearch(VectorStore): query_vector=query_vector, filters=filters, num_results=k, + query_type=query_type, ) return self._parse_search_response(search_resp) @@ -343,6 +358,8 @@ class DatabricksVectorSearch(VectorStore): fetch_k: int = 20, lambda_mult: float = 0.5, filters: Optional[Any] = None, + *, + query_type: Optional[str] = None, **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance. @@ -359,6 +376,7 @@ class DatabricksVectorSearch(VectorStore): to maximum diversity and 1 to minimum diversity. Defaults to 0.5. filters: Filters to apply to the query. Defaults to None. + query_type: The type of this query. Supported values are "ANN" and "HYBRID". Returns: List of Documents selected by maximal marginal relevance. 
""" @@ -377,6 +395,7 @@ class DatabricksVectorSearch(VectorStore): fetch_k, lambda_mult=lambda_mult, filters=filters, + query_type=query_type, ) return docs @@ -387,6 +406,8 @@ class DatabricksVectorSearch(VectorStore): fetch_k: int = 20, lambda_mult: float = 0.5, filters: Optional[Any] = None, + *, + query_type: Optional[str] = None, **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance. @@ -403,6 +424,7 @@ class DatabricksVectorSearch(VectorStore): to maximum diversity and 1 to minimum diversity. Defaults to 0.5. filters: Filters to apply to the query. Defaults to None. + query_type: The type of this query. Supported values are "ANN" and "HYBRID". Returns: List of Documents selected by maximal marginal relevance. """ @@ -420,6 +442,7 @@ class DatabricksVectorSearch(VectorStore): query_vector=embedding, filters=filters, num_results=fetch_k, + query_type=query_type, ) embeddings_result_index = ( @@ -449,6 +472,8 @@ class DatabricksVectorSearch(VectorStore): embedding: List[float], k: int = 4, filters: Optional[Any] = None, + *, + query_type: Optional[str] = None, **kwargs: Any, ) -> List[Document]: """Return docs most similar to embedding vector. @@ -457,12 +482,13 @@ class DatabricksVectorSearch(VectorStore): embedding: Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. filters: Filters to apply to the query. Defaults to None. + query_type: The type of this query. Supported values are "ANN" and "HYBRID". Returns: List of Documents most similar to the embedding. 
""" docs_with_score = self.similarity_search_by_vector_with_score( - embedding=embedding, k=k, filters=filters, **kwargs + embedding=embedding, k=k, filters=filters, query_type=query_type, **kwargs ) return [doc for doc, _ in docs_with_score] @@ -471,6 +497,8 @@ class DatabricksVectorSearch(VectorStore): embedding: List[float], k: int = 4, filters: Optional[Any] = None, + *, + query_type: Optional[str] = None, **kwargs: Any, ) -> List[Tuple[Document, float]]: """Return docs most similar to embedding vector, along with scores. @@ -479,6 +507,7 @@ class DatabricksVectorSearch(VectorStore): embedding: Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. filters: Filters to apply to the query. Defaults to None. + query_type: The type of this query. Supported values are "ANN" and "HYBRID". Returns: List of Documents most similar to the embedding and score for each. @@ -493,6 +522,7 @@ class DatabricksVectorSearch(VectorStore): query_vector=embedding, filters=filters, num_results=k, + query_type=query_type, ) return self._parse_search_response(search_resp) diff --git a/libs/community/tests/unit_tests/vectorstores/test_databricks_vector_search.py b/libs/community/tests/unit_tests/vectorstores/test_databricks_vector_search.py index e2528cb04f8..703a6e84a82 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_databricks_vector_search.py +++ b/libs/community/tests/unit_tests/vectorstores/test_databricks_vector_search.py @@ -167,6 +167,12 @@ EXAMPLE_SEARCH_RESPONSE_WITH_EMBEDDING = { "next_page_token": "", } +ALL_QUERY_TYPES = [ + None, + "ANN", + "HYBRID", +] + def mock_index(index_details: dict) -> MagicMock: from databricks.vector_search.client import VectorSearchIndex @@ -475,8 +481,10 @@ def test_delete_fail_no_ids() -> None: @pytest.mark.requires("databricks", "databricks.vector_search") -@pytest.mark.parametrize("index_details", ALL_INDEXES) -def test_similarity_search(index_details: dict) -> None: 
+@pytest.mark.parametrize( + "index_details, query_type", itertools.product(ALL_INDEXES, ALL_QUERY_TYPES) +) +def test_similarity_search(index_details: dict, query_type: Optional[str]) -> None: index = mock_index(index_details) index.similarity_search.return_value = EXAMPLE_SEARCH_RESPONSE vectorsearch = default_databricks_vector_search(index) @@ -484,7 +492,9 @@ def test_similarity_search(index_details: dict) -> None: filters = {"some filter": True} limit = 7 - search_result = vectorsearch.similarity_search(query, k=limit, filters=filters) + search_result = vectorsearch.similarity_search( + query, k=limit, filters=filters, query_type=query_type + ) if index_details == DELTA_SYNC_INDEX_MANAGED_EMBEDDINGS: index.similarity_search.assert_called_once_with( columns=[DEFAULT_PRIMARY_KEY, DEFAULT_TEXT_COLUMN], @@ -492,6 +502,7 @@ def test_similarity_search(index_details: dict) -> None: query_vector=None, filters=filters, num_results=limit, + query_type=query_type, ) else: index.similarity_search.assert_called_once_with( @@ -500,6 +511,7 @@ def test_similarity_search(index_details: dict) -> None: query_vector=DEFAULT_EMBEDDING_MODEL.embed_query(query), filters=filters, num_results=limit, + query_type=query_type, ) assert len(search_result) == len(fake_texts) assert sorted([d.page_content for d in search_result]) == sorted(fake_texts) @@ -620,6 +632,7 @@ def test_similarity_search_by_vector(index_details: dict) -> None: query_vector=query_embedding, filters=filters, num_results=limit, + query_type=None, ) assert len(search_result) == len(fake_texts) assert sorted([d.page_content for d in search_result]) == sorted(fake_texts)