From a3c10fc6ce7b32466d6e333cb67b88e57d61d4e4 Mon Sep 17 00:00:00 2001 From: Lage Ragnarsson Date: Tue, 16 Jul 2024 00:14:08 +0200 Subject: [PATCH] community: Add support for specifying hybrid search for Databricks vector search (#23528) **Description:** Databricks Vector Search recently added support for hybrid keyword-similarity search. See [usage examples](https://docs.databricks.com/en/generative-ai/create-query-vector-search.html#query-a-vector-search-endpoint) from their documentation. This PR updates the Langchain vectorstore interface for Databricks to enable the user to pass the *query_type* parameter to *similarity_search* to make use of this functionality. By default, there will not be any changes for existing users of this interface. To use the new hybrid search feature, it is now possible to do ```python # ... dvs = DatabricksVectorSearch(index) dvs.similarity_search("my search query", query_type="HYBRID") ``` Or using the retriever: ```python retriever = dvs.as_retriever( search_kwargs={ "query_type": "HYBRID", } ) retriever.invoke("my search query") ``` --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Erick Friis --- .../databricks_vector_search.ipynb | 5 ++- .../vectorstores/databricks_vector_search.py | 38 +++++++++++++++++-- .../test_databricks_vector_search.py | 19 ++++++++-- 3 files changed, 54 insertions(+), 8 deletions(-) diff --git a/docs/docs/integrations/vectorstores/databricks_vector_search.ipynb b/docs/docs/integrations/vectorstores/databricks_vector_search.ipynb index 50cc25b1119..8509b506910 100644 --- a/docs/docs/integrations/vectorstores/databricks_vector_search.ipynb +++ b/docs/docs/integrations/vectorstores/databricks_vector_search.ipynb @@ -174,7 +174,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Similarity search" + "## Similarity search\n", + "Optional keyword arguments to similarity_search include specifying k number of documents to retrieve, \n", + "a filters 
dictionary for metadata filtering based on [this syntax](https://docs.databricks.com/en/generative-ai/create-query-vector-search.html#use-filters-on-queries),\n", + "as well as the [query_type](https://api-docs.databricks.com/python/vector-search/databricks.vector_search.html#databricks.vector_search.index.VectorSearchIndex.similarity_search) which can be ANN or HYBRID " ] }, { diff --git a/libs/community/langchain_community/vectorstores/databricks_vector_search.py b/libs/community/langchain_community/vectorstores/databricks_vector_search.py index 0a5e7a6d514..dd76b61797a 100644 --- a/libs/community/langchain_community/vectorstores/databricks_vector_search.py +++ b/libs/community/langchain_community/vectorstores/databricks_vector_search.py @@ -277,7 +277,13 @@ class DatabricksVectorSearch(VectorStore): return True def similarity_search( - self, query: str, k: int = 4, filters: Optional[Any] = None, **kwargs: Any + self, + query: str, + k: int = 4, + filters: Optional[Any] = None, + *, + query_type: Optional[str] = None, + **kwargs: Any, ) -> List[Document]: """Return docs most similar to query. @@ -285,17 +291,24 @@ class DatabricksVectorSearch(VectorStore): query: Text to look up documents similar to. k: Number of Documents to return. Defaults to 4. filters: Filters to apply to the query. Defaults to None. + query_type: The type of this query. Supported values are "ANN" and "HYBRID". Returns: List of Documents most similar to the embedding. 
""" docs_with_score = self.similarity_search_with_score( - query=query, k=k, filters=filters, **kwargs + query=query, k=k, filters=filters, query_type=query_type, **kwargs ) return [doc for doc, _ in docs_with_score] def similarity_search_with_score( - self, query: str, k: int = 4, filters: Optional[Any] = None, **kwargs: Any + self, + query: str, + k: int = 4, + filters: Optional[Any] = None, + *, + query_type: Optional[str] = None, + **kwargs: Any, ) -> List[Tuple[Document, float]]: """Return docs most similar to query, along with scores. @@ -303,6 +316,7 @@ class DatabricksVectorSearch(VectorStore): query: Text to look up documents similar to. k: Number of Documents to return. Defaults to 4. filters: Filters to apply to the query. Defaults to None. + query_type: The type of this query. Supported values are "ANN" and "HYBRID". Returns: List of Documents most similar to the embedding and score for each. @@ -321,6 +335,7 @@ class DatabricksVectorSearch(VectorStore): query_vector=query_vector, filters=filters, num_results=k, + query_type=query_type, ) return self._parse_search_response(search_resp) @@ -343,6 +358,8 @@ class DatabricksVectorSearch(VectorStore): fetch_k: int = 20, lambda_mult: float = 0.5, filters: Optional[Any] = None, + *, + query_type: Optional[str] = None, **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance. @@ -359,6 +376,7 @@ class DatabricksVectorSearch(VectorStore): to maximum diversity and 1 to minimum diversity. Defaults to 0.5. filters: Filters to apply to the query. Defaults to None. + query_type: The type of this query. Supported values are "ANN" and "HYBRID". Returns: List of Documents selected by maximal marginal relevance. 
""" @@ -377,6 +395,7 @@ class DatabricksVectorSearch(VectorStore): fetch_k, lambda_mult=lambda_mult, filters=filters, + query_type=query_type, ) return docs @@ -387,6 +406,8 @@ class DatabricksVectorSearch(VectorStore): fetch_k: int = 20, lambda_mult: float = 0.5, filters: Optional[Any] = None, + *, + query_type: Optional[str] = None, **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance. @@ -403,6 +424,7 @@ class DatabricksVectorSearch(VectorStore): to maximum diversity and 1 to minimum diversity. Defaults to 0.5. filters: Filters to apply to the query. Defaults to None. + query_type: The type of this query. Supported values are "ANN" and "HYBRID". Returns: List of Documents selected by maximal marginal relevance. """ @@ -420,6 +442,7 @@ class DatabricksVectorSearch(VectorStore): query_vector=embedding, filters=filters, num_results=fetch_k, + query_type=query_type, ) embeddings_result_index = ( @@ -449,6 +472,8 @@ class DatabricksVectorSearch(VectorStore): embedding: List[float], k: int = 4, filters: Optional[Any] = None, + *, + query_type: Optional[str] = None, **kwargs: Any, ) -> List[Document]: """Return docs most similar to embedding vector. @@ -457,12 +482,13 @@ class DatabricksVectorSearch(VectorStore): embedding: Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. filters: Filters to apply to the query. Defaults to None. + query_type: The type of this query. Supported values are "ANN" and "HYBRID". Returns: List of Documents most similar to the embedding. 
""" docs_with_score = self.similarity_search_by_vector_with_score( - embedding=embedding, k=k, filters=filters, **kwargs + embedding=embedding, k=k, filters=filters, query_type=query_type, **kwargs ) return [doc for doc, _ in docs_with_score] @@ -471,6 +497,8 @@ class DatabricksVectorSearch(VectorStore): embedding: List[float], k: int = 4, filters: Optional[Any] = None, + *, + query_type: Optional[str] = None, **kwargs: Any, ) -> List[Tuple[Document, float]]: """Return docs most similar to embedding vector, along with scores. @@ -479,6 +507,7 @@ class DatabricksVectorSearch(VectorStore): embedding: Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. filters: Filters to apply to the query. Defaults to None. + query_type: The type of this query. Supported values are "ANN" and "HYBRID". Returns: List of Documents most similar to the embedding and score for each. @@ -493,6 +522,7 @@ class DatabricksVectorSearch(VectorStore): query_vector=embedding, filters=filters, num_results=k, + query_type=query_type, ) return self._parse_search_response(search_resp) diff --git a/libs/community/tests/unit_tests/vectorstores/test_databricks_vector_search.py b/libs/community/tests/unit_tests/vectorstores/test_databricks_vector_search.py index e2528cb04f8..703a6e84a82 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_databricks_vector_search.py +++ b/libs/community/tests/unit_tests/vectorstores/test_databricks_vector_search.py @@ -167,6 +167,12 @@ EXAMPLE_SEARCH_RESPONSE_WITH_EMBEDDING = { "next_page_token": "", } +ALL_QUERY_TYPES = [ + None, + "ANN", + "HYBRID", +] + def mock_index(index_details: dict) -> MagicMock: from databricks.vector_search.client import VectorSearchIndex @@ -475,8 +481,10 @@ def test_delete_fail_no_ids() -> None: @pytest.mark.requires("databricks", "databricks.vector_search") -@pytest.mark.parametrize("index_details", ALL_INDEXES) -def test_similarity_search(index_details: dict) -> None: 
+@pytest.mark.parametrize( + "index_details, query_type", itertools.product(ALL_INDEXES, ALL_QUERY_TYPES) +) +def test_similarity_search(index_details: dict, query_type: Optional[str]) -> None: index = mock_index(index_details) index.similarity_search.return_value = EXAMPLE_SEARCH_RESPONSE vectorsearch = default_databricks_vector_search(index) @@ -484,7 +492,9 @@ def test_similarity_search(index_details: dict) -> None: filters = {"some filter": True} limit = 7 - search_result = vectorsearch.similarity_search(query, k=limit, filters=filters) + search_result = vectorsearch.similarity_search( + query, k=limit, filters=filters, query_type=query_type + ) if index_details == DELTA_SYNC_INDEX_MANAGED_EMBEDDINGS: index.similarity_search.assert_called_once_with( columns=[DEFAULT_PRIMARY_KEY, DEFAULT_TEXT_COLUMN], @@ -492,6 +502,7 @@ def test_similarity_search(index_details: dict) -> None: query_vector=None, filters=filters, num_results=limit, + query_type=query_type, ) else: index.similarity_search.assert_called_once_with( @@ -500,6 +511,7 @@ def test_similarity_search(index_details: dict) -> None: query_vector=DEFAULT_EMBEDDING_MODEL.embed_query(query), filters=filters, num_results=limit, + query_type=query_type, ) assert len(search_result) == len(fake_texts) assert sorted([d.page_content for d in search_result]) == sorted(fake_texts) @@ -620,6 +632,7 @@ def test_similarity_search_by_vector(index_details: dict) -> None: query_vector=query_embedding, filters=filters, num_results=limit, + query_type=None, ) assert len(search_result) == len(fake_texts) assert sorted([d.page_content for d in search_result]) == sorted(fake_texts)