community: Add support for specifying hybrid search for Databricks vector search (#23528)

**Description:**

Databricks Vector Search recently added support for hybrid
keyword-similarity search.
See [usage
examples](https://docs.databricks.com/en/generative-ai/create-query-vector-search.html#query-a-vector-search-endpoint)
from their documentation.

This PR updates the LangChain vector store interface for Databricks to
enable the user to pass the *query_type* parameter to
*similarity_search* to make use of this functionality.
Existing users of this interface are unaffected: when *query_type* is
not given, behavior is unchanged. To use the new hybrid search feature,
pass `query_type="HYBRID"`:

```python
# ...
dvs = DatabricksVectorSearch(index)
dvs.similarity_search("my search query", query_type="HYBRID")
```

Or using the retriever:

```python
retriever = dvs.as_retriever(
    search_kwargs={
        "query_type": "HYBRID",
    }
)
retriever.invoke("my search query")
```

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Lage Ragnarsson 2024-07-16 00:14:08 +02:00 committed by GitHub
parent 5171ffc026
commit a3c10fc6ce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 54 additions and 8 deletions

View File

@ -174,7 +174,10 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Similarity search" "## Similarity search\n",
"Optional keyword arguments to similarity_search include specifying k number of documents to retrieve, \n",
"a filters dictionary for metadata filtering based on [this syntax](https://docs.databricks.com/en/generative-ai/create-query-vector-search.html#use-filters-on-queries),\n",
"as well as the [query_type](https://api-docs.databricks.com/python/vector-search/databricks.vector_search.html#databricks.vector_search.index.VectorSearchIndex.similarity_search) which can be ANN or HYBRID "
] ]
}, },
{ {

View File

@ -277,7 +277,13 @@ class DatabricksVectorSearch(VectorStore):
return True return True
def similarity_search( def similarity_search(
self, query: str, k: int = 4, filters: Optional[Any] = None, **kwargs: Any self,
query: str,
k: int = 4,
filters: Optional[Any] = None,
*,
query_type: Optional[str] = None,
**kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Return docs most similar to query. """Return docs most similar to query.
@ -285,17 +291,24 @@ class DatabricksVectorSearch(VectorStore):
query: Text to look up documents similar to. query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4. k: Number of Documents to return. Defaults to 4.
filters: Filters to apply to the query. Defaults to None. filters: Filters to apply to the query. Defaults to None.
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
Returns: Returns:
List of Documents most similar to the embedding. List of Documents most similar to the embedding.
""" """
docs_with_score = self.similarity_search_with_score( docs_with_score = self.similarity_search_with_score(
query=query, k=k, filters=filters, **kwargs query=query, k=k, filters=filters, query_type=query_type, **kwargs
) )
return [doc for doc, _ in docs_with_score] return [doc for doc, _ in docs_with_score]
def similarity_search_with_score( def similarity_search_with_score(
self, query: str, k: int = 4, filters: Optional[Any] = None, **kwargs: Any self,
query: str,
k: int = 4,
filters: Optional[Any] = None,
*,
query_type: Optional[str] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]: ) -> List[Tuple[Document, float]]:
"""Return docs most similar to query, along with scores. """Return docs most similar to query, along with scores.
@ -303,6 +316,7 @@ class DatabricksVectorSearch(VectorStore):
query: Text to look up documents similar to. query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4. k: Number of Documents to return. Defaults to 4.
filters: Filters to apply to the query. Defaults to None. filters: Filters to apply to the query. Defaults to None.
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
Returns: Returns:
List of Documents most similar to the embedding and score for each. List of Documents most similar to the embedding and score for each.
@ -321,6 +335,7 @@ class DatabricksVectorSearch(VectorStore):
query_vector=query_vector, query_vector=query_vector,
filters=filters, filters=filters,
num_results=k, num_results=k,
query_type=query_type,
) )
return self._parse_search_response(search_resp) return self._parse_search_response(search_resp)
@ -343,6 +358,8 @@ class DatabricksVectorSearch(VectorStore):
fetch_k: int = 20, fetch_k: int = 20,
lambda_mult: float = 0.5, lambda_mult: float = 0.5,
filters: Optional[Any] = None, filters: Optional[Any] = None,
*,
query_type: Optional[str] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Return docs selected using the maximal marginal relevance. """Return docs selected using the maximal marginal relevance.
@ -359,6 +376,7 @@ class DatabricksVectorSearch(VectorStore):
to maximum diversity and 1 to minimum diversity. to maximum diversity and 1 to minimum diversity.
Defaults to 0.5. Defaults to 0.5.
filters: Filters to apply to the query. Defaults to None. filters: Filters to apply to the query. Defaults to None.
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
Returns: Returns:
List of Documents selected by maximal marginal relevance. List of Documents selected by maximal marginal relevance.
""" """
@ -377,6 +395,7 @@ class DatabricksVectorSearch(VectorStore):
fetch_k, fetch_k,
lambda_mult=lambda_mult, lambda_mult=lambda_mult,
filters=filters, filters=filters,
query_type=query_type,
) )
return docs return docs
@ -387,6 +406,8 @@ class DatabricksVectorSearch(VectorStore):
fetch_k: int = 20, fetch_k: int = 20,
lambda_mult: float = 0.5, lambda_mult: float = 0.5,
filters: Optional[Any] = None, filters: Optional[Any] = None,
*,
query_type: Optional[str] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Return docs selected using the maximal marginal relevance. """Return docs selected using the maximal marginal relevance.
@ -403,6 +424,7 @@ class DatabricksVectorSearch(VectorStore):
to maximum diversity and 1 to minimum diversity. to maximum diversity and 1 to minimum diversity.
Defaults to 0.5. Defaults to 0.5.
filters: Filters to apply to the query. Defaults to None. filters: Filters to apply to the query. Defaults to None.
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
Returns: Returns:
List of Documents selected by maximal marginal relevance. List of Documents selected by maximal marginal relevance.
""" """
@ -420,6 +442,7 @@ class DatabricksVectorSearch(VectorStore):
query_vector=embedding, query_vector=embedding,
filters=filters, filters=filters,
num_results=fetch_k, num_results=fetch_k,
query_type=query_type,
) )
embeddings_result_index = ( embeddings_result_index = (
@ -449,6 +472,8 @@ class DatabricksVectorSearch(VectorStore):
embedding: List[float], embedding: List[float],
k: int = 4, k: int = 4,
filters: Optional[Any] = None, filters: Optional[Any] = None,
*,
query_type: Optional[str] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Return docs most similar to embedding vector. """Return docs most similar to embedding vector.
@ -457,12 +482,13 @@ class DatabricksVectorSearch(VectorStore):
embedding: Embedding to look up documents similar to. embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4. k: Number of Documents to return. Defaults to 4.
filters: Filters to apply to the query. Defaults to None. filters: Filters to apply to the query. Defaults to None.
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
Returns: Returns:
List of Documents most similar to the embedding. List of Documents most similar to the embedding.
""" """
docs_with_score = self.similarity_search_by_vector_with_score( docs_with_score = self.similarity_search_by_vector_with_score(
embedding=embedding, k=k, filters=filters, **kwargs embedding=embedding, k=k, filters=filters, query_type=query_type, **kwargs
) )
return [doc for doc, _ in docs_with_score] return [doc for doc, _ in docs_with_score]
@ -471,6 +497,8 @@ class DatabricksVectorSearch(VectorStore):
embedding: List[float], embedding: List[float],
k: int = 4, k: int = 4,
filters: Optional[Any] = None, filters: Optional[Any] = None,
*,
query_type: Optional[str] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Tuple[Document, float]]: ) -> List[Tuple[Document, float]]:
"""Return docs most similar to embedding vector, along with scores. """Return docs most similar to embedding vector, along with scores.
@ -479,6 +507,7 @@ class DatabricksVectorSearch(VectorStore):
embedding: Embedding to look up documents similar to. embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4. k: Number of Documents to return. Defaults to 4.
filters: Filters to apply to the query. Defaults to None. filters: Filters to apply to the query. Defaults to None.
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
Returns: Returns:
List of Documents most similar to the embedding and score for each. List of Documents most similar to the embedding and score for each.
@ -493,6 +522,7 @@ class DatabricksVectorSearch(VectorStore):
query_vector=embedding, query_vector=embedding,
filters=filters, filters=filters,
num_results=k, num_results=k,
query_type=query_type,
) )
return self._parse_search_response(search_resp) return self._parse_search_response(search_resp)

View File

@ -167,6 +167,12 @@ EXAMPLE_SEARCH_RESPONSE_WITH_EMBEDDING = {
"next_page_token": "", "next_page_token": "",
} }
ALL_QUERY_TYPES = [
None,
"ANN",
"HYBRID",
]
def mock_index(index_details: dict) -> MagicMock: def mock_index(index_details: dict) -> MagicMock:
from databricks.vector_search.client import VectorSearchIndex from databricks.vector_search.client import VectorSearchIndex
@ -475,8 +481,10 @@ def test_delete_fail_no_ids() -> None:
@pytest.mark.requires("databricks", "databricks.vector_search") @pytest.mark.requires("databricks", "databricks.vector_search")
@pytest.mark.parametrize("index_details", ALL_INDEXES) @pytest.mark.parametrize(
def test_similarity_search(index_details: dict) -> None: "index_details, query_type", itertools.product(ALL_INDEXES, ALL_QUERY_TYPES)
)
def test_similarity_search(index_details: dict, query_type: Optional[str]) -> None:
index = mock_index(index_details) index = mock_index(index_details)
index.similarity_search.return_value = EXAMPLE_SEARCH_RESPONSE index.similarity_search.return_value = EXAMPLE_SEARCH_RESPONSE
vectorsearch = default_databricks_vector_search(index) vectorsearch = default_databricks_vector_search(index)
@ -484,7 +492,9 @@ def test_similarity_search(index_details: dict) -> None:
filters = {"some filter": True} filters = {"some filter": True}
limit = 7 limit = 7
search_result = vectorsearch.similarity_search(query, k=limit, filters=filters) search_result = vectorsearch.similarity_search(
query, k=limit, filters=filters, query_type=query_type
)
if index_details == DELTA_SYNC_INDEX_MANAGED_EMBEDDINGS: if index_details == DELTA_SYNC_INDEX_MANAGED_EMBEDDINGS:
index.similarity_search.assert_called_once_with( index.similarity_search.assert_called_once_with(
columns=[DEFAULT_PRIMARY_KEY, DEFAULT_TEXT_COLUMN], columns=[DEFAULT_PRIMARY_KEY, DEFAULT_TEXT_COLUMN],
@ -492,6 +502,7 @@ def test_similarity_search(index_details: dict) -> None:
query_vector=None, query_vector=None,
filters=filters, filters=filters,
num_results=limit, num_results=limit,
query_type=query_type,
) )
else: else:
index.similarity_search.assert_called_once_with( index.similarity_search.assert_called_once_with(
@ -500,6 +511,7 @@ def test_similarity_search(index_details: dict) -> None:
query_vector=DEFAULT_EMBEDDING_MODEL.embed_query(query), query_vector=DEFAULT_EMBEDDING_MODEL.embed_query(query),
filters=filters, filters=filters,
num_results=limit, num_results=limit,
query_type=query_type,
) )
assert len(search_result) == len(fake_texts) assert len(search_result) == len(fake_texts)
assert sorted([d.page_content for d in search_result]) == sorted(fake_texts) assert sorted([d.page_content for d in search_result]) == sorted(fake_texts)
@ -620,6 +632,7 @@ def test_similarity_search_by_vector(index_details: dict) -> None:
query_vector=query_embedding, query_vector=query_embedding,
filters=filters, filters=filters,
num_results=limit, num_results=limit,
query_type=None,
) )
assert len(search_result) == len(fake_texts) assert len(search_result) == len(fake_texts)
assert sorted([d.page_content for d in search_result]) == sorted(fake_texts) assert sorted([d.page_content for d in search_result]) == sorted(fake_texts)