community: Add support for specifying hybrid search for Databricks vector search (#23528)

**Description:**

Databricks Vector Search recently added support for hybrid
keyword-similarity search.
See [usage
examples](https://docs.databricks.com/en/generative-ai/create-query-vector-search.html#query-a-vector-search-endpoint)
from their documentation.

This PR updates the LangChain vectorstore interface for Databricks to
enable the user to pass the *query_type* parameter to
*similarity_search* to make use of this functionality.
By default, nothing changes for existing users of this
interface. To use the new hybrid search feature, pass
*query_type="HYBRID"*:

```python
# ...
dvs = DatabricksVectorSearch(index)
dvs.similarity_search("my search query", query_type="HYBRID")
```

Or using the retriever:

```python
retriever = dvs.as_retriever(
    search_kwargs={
        "query_type": "HYBRID",
    }
)
retriever.invoke("my search query")
```

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Lage Ragnarsson 2024-07-16 00:14:08 +02:00 committed by GitHub
parent 5171ffc026
commit a3c10fc6ce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 54 additions and 8 deletions

View File

@ -174,7 +174,10 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Similarity search"
"## Similarity search\n",
"Optional keyword arguments to similarity_search include specifying k, the number of documents to retrieve, \n",
"a filters dictionary for metadata filtering based on [this syntax](https://docs.databricks.com/en/generative-ai/create-query-vector-search.html#use-filters-on-queries),\n",
"as well as the [query_type](https://api-docs.databricks.com/python/vector-search/databricks.vector_search.html#databricks.vector_search.index.VectorSearchIndex.similarity_search) which can be ANN or HYBRID."
]
},
{

View File

@ -277,7 +277,13 @@ class DatabricksVectorSearch(VectorStore):
return True
def similarity_search(
self, query: str, k: int = 4, filters: Optional[Any] = None, **kwargs: Any
self,
query: str,
k: int = 4,
filters: Optional[Any] = None,
*,
query_type: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to query.
@ -285,17 +291,24 @@ class DatabricksVectorSearch(VectorStore):
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filters: Filters to apply to the query. Defaults to None.
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
Returns:
List of Documents most similar to the embedding.
"""
docs_with_score = self.similarity_search_with_score(
query=query, k=k, filters=filters, **kwargs
query=query, k=k, filters=filters, query_type=query_type, **kwargs
)
return [doc for doc, _ in docs_with_score]
def similarity_search_with_score(
self, query: str, k: int = 4, filters: Optional[Any] = None, **kwargs: Any
self,
query: str,
k: int = 4,
filters: Optional[Any] = None,
*,
query_type: Optional[str] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query, along with scores.
@ -303,6 +316,7 @@ class DatabricksVectorSearch(VectorStore):
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filters: Filters to apply to the query. Defaults to None.
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
Returns:
List of Documents most similar to the embedding and score for each.
@ -321,6 +335,7 @@ class DatabricksVectorSearch(VectorStore):
query_vector=query_vector,
filters=filters,
num_results=k,
query_type=query_type,
)
return self._parse_search_response(search_resp)
@ -343,6 +358,8 @@ class DatabricksVectorSearch(VectorStore):
fetch_k: int = 20,
lambda_mult: float = 0.5,
filters: Optional[Any] = None,
*,
query_type: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
@ -359,6 +376,7 @@ class DatabricksVectorSearch(VectorStore):
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
filters: Filters to apply to the query. Defaults to None.
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
Returns:
List of Documents selected by maximal marginal relevance.
"""
@ -377,6 +395,7 @@ class DatabricksVectorSearch(VectorStore):
fetch_k,
lambda_mult=lambda_mult,
filters=filters,
query_type=query_type,
)
return docs
@ -387,6 +406,8 @@ class DatabricksVectorSearch(VectorStore):
fetch_k: int = 20,
lambda_mult: float = 0.5,
filters: Optional[Any] = None,
*,
query_type: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
@ -403,6 +424,7 @@ class DatabricksVectorSearch(VectorStore):
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
filters: Filters to apply to the query. Defaults to None.
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
Returns:
List of Documents selected by maximal marginal relevance.
"""
@ -420,6 +442,7 @@ class DatabricksVectorSearch(VectorStore):
query_vector=embedding,
filters=filters,
num_results=fetch_k,
query_type=query_type,
)
embeddings_result_index = (
@ -449,6 +472,8 @@ class DatabricksVectorSearch(VectorStore):
embedding: List[float],
k: int = 4,
filters: Optional[Any] = None,
*,
query_type: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to embedding vector.
@ -457,12 +482,13 @@ class DatabricksVectorSearch(VectorStore):
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filters: Filters to apply to the query. Defaults to None.
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
Returns:
List of Documents most similar to the embedding.
"""
docs_with_score = self.similarity_search_by_vector_with_score(
embedding=embedding, k=k, filters=filters, **kwargs
embedding=embedding, k=k, filters=filters, query_type=query_type, **kwargs
)
return [doc for doc, _ in docs_with_score]
@ -471,6 +497,8 @@ class DatabricksVectorSearch(VectorStore):
embedding: List[float],
k: int = 4,
filters: Optional[Any] = None,
*,
query_type: Optional[str] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs most similar to embedding vector, along with scores.
@ -479,6 +507,7 @@ class DatabricksVectorSearch(VectorStore):
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filters: Filters to apply to the query. Defaults to None.
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
Returns:
List of Documents most similar to the embedding and score for each.
@ -493,6 +522,7 @@ class DatabricksVectorSearch(VectorStore):
query_vector=embedding,
filters=filters,
num_results=k,
query_type=query_type,
)
return self._parse_search_response(search_resp)

View File

@ -167,6 +167,12 @@ EXAMPLE_SEARCH_RESPONSE_WITH_EMBEDDING = {
"next_page_token": "",
}
ALL_QUERY_TYPES = [
None,
"ANN",
"HYBRID",
]
def mock_index(index_details: dict) -> MagicMock:
from databricks.vector_search.client import VectorSearchIndex
@ -475,8 +481,10 @@ def test_delete_fail_no_ids() -> None:
@pytest.mark.requires("databricks", "databricks.vector_search")
@pytest.mark.parametrize("index_details", ALL_INDEXES)
def test_similarity_search(index_details: dict) -> None:
@pytest.mark.parametrize(
"index_details, query_type", itertools.product(ALL_INDEXES, ALL_QUERY_TYPES)
)
def test_similarity_search(index_details: dict, query_type: Optional[str]) -> None:
index = mock_index(index_details)
index.similarity_search.return_value = EXAMPLE_SEARCH_RESPONSE
vectorsearch = default_databricks_vector_search(index)
@ -484,7 +492,9 @@ def test_similarity_search(index_details: dict) -> None:
filters = {"some filter": True}
limit = 7
search_result = vectorsearch.similarity_search(query, k=limit, filters=filters)
search_result = vectorsearch.similarity_search(
query, k=limit, filters=filters, query_type=query_type
)
if index_details == DELTA_SYNC_INDEX_MANAGED_EMBEDDINGS:
index.similarity_search.assert_called_once_with(
columns=[DEFAULT_PRIMARY_KEY, DEFAULT_TEXT_COLUMN],
@ -492,6 +502,7 @@ def test_similarity_search(index_details: dict) -> None:
query_vector=None,
filters=filters,
num_results=limit,
query_type=query_type,
)
else:
index.similarity_search.assert_called_once_with(
@ -500,6 +511,7 @@ def test_similarity_search(index_details: dict) -> None:
query_vector=DEFAULT_EMBEDDING_MODEL.embed_query(query),
filters=filters,
num_results=limit,
query_type=query_type,
)
assert len(search_result) == len(fake_texts)
assert sorted([d.page_content for d in search_result]) == sorted(fake_texts)
@ -620,6 +632,7 @@ def test_similarity_search_by_vector(index_details: dict) -> None:
query_vector=query_embedding,
filters=filters,
num_results=limit,
query_type=None,
)
assert len(search_result) == len(fake_texts)
assert sorted([d.page_content for d in search_result]) == sorted(fake_texts)