mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-20 22:03:52 +00:00
community: Add support for specifying hybrid search for Databricks vector search (#23528)
**Description:** Databricks Vector Search recently added support for hybrid keyword-similarity search. See the [usage examples](https://docs.databricks.com/en/generative-ai/create-query-vector-search.html#query-a-vector-search-endpoint) in their documentation.

This PR updates the LangChain vectorstore interface for Databricks to enable the user to pass the *query_type* parameter to *similarity_search* to make use of this functionality. By default, there will not be any changes for existing users of this interface.

To use the new hybrid search feature, it is now possible to do:

```python
# ...
dvs = DatabricksVectorSearch(index)
dvs.similarity_search("my search query", query_type="HYBRID")
```

Or using the retriever:

```python
retriever = dvs.as_retriever(
    search_kwargs={
        "query_type": "HYBRID",
    }
)
retriever.invoke("my search query")
```

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
5171ffc026
commit
a3c10fc6ce
@ -174,7 +174,10 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Similarity search"
|
||||
"## Similarity search\n",
|
||||
"Optional keyword arguments to similarity_search include specifying the number k of documents to retrieve, \n",
|
||||
"a filters dictionary for metadata filtering based on [this syntax](https://docs.databricks.com/en/generative-ai/create-query-vector-search.html#use-filters-on-queries),\n",
|
||||
"as well as the [query_type](https://api-docs.databricks.com/python/vector-search/databricks.vector_search.html#databricks.vector_search.index.VectorSearchIndex.similarity_search) which can be ANN or HYBRID."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -277,7 +277,13 @@ class DatabricksVectorSearch(VectorStore):
|
||||
return True
|
||||
|
||||
def similarity_search(
|
||||
self, query: str, k: int = 4, filters: Optional[Any] = None, **kwargs: Any
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
filters: Optional[Any] = None,
|
||||
*,
|
||||
query_type: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs most similar to query.
|
||||
|
||||
@ -285,17 +291,24 @@ class DatabricksVectorSearch(VectorStore):
|
||||
query: Text to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
filters: Filters to apply to the query. Defaults to None.
|
||||
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the embedding.
|
||||
"""
|
||||
docs_with_score = self.similarity_search_with_score(
|
||||
query=query, k=k, filters=filters, **kwargs
|
||||
query=query, k=k, filters=filters, query_type=query_type, **kwargs
|
||||
)
|
||||
return [doc for doc, _ in docs_with_score]
|
||||
|
||||
def similarity_search_with_score(
|
||||
self, query: str, k: int = 4, filters: Optional[Any] = None, **kwargs: Any
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
filters: Optional[Any] = None,
|
||||
*,
|
||||
query_type: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Return docs most similar to query, along with scores.
|
||||
|
||||
@ -303,6 +316,7 @@ class DatabricksVectorSearch(VectorStore):
|
||||
query: Text to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
filters: Filters to apply to the query. Defaults to None.
|
||||
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the embedding and score for each.
|
||||
@ -321,6 +335,7 @@ class DatabricksVectorSearch(VectorStore):
|
||||
query_vector=query_vector,
|
||||
filters=filters,
|
||||
num_results=k,
|
||||
query_type=query_type,
|
||||
)
|
||||
return self._parse_search_response(search_resp)
|
||||
|
||||
@ -343,6 +358,8 @@ class DatabricksVectorSearch(VectorStore):
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
filters: Optional[Any] = None,
|
||||
*,
|
||||
query_type: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs selected using the maximal marginal relevance.
|
||||
@ -359,6 +376,7 @@ class DatabricksVectorSearch(VectorStore):
|
||||
to maximum diversity and 1 to minimum diversity.
|
||||
Defaults to 0.5.
|
||||
filters: Filters to apply to the query. Defaults to None.
|
||||
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
|
||||
Returns:
|
||||
List of Documents selected by maximal marginal relevance.
|
||||
"""
|
||||
@ -377,6 +395,7 @@ class DatabricksVectorSearch(VectorStore):
|
||||
fetch_k,
|
||||
lambda_mult=lambda_mult,
|
||||
filters=filters,
|
||||
query_type=query_type,
|
||||
)
|
||||
return docs
|
||||
|
||||
@ -387,6 +406,8 @@ class DatabricksVectorSearch(VectorStore):
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
filters: Optional[Any] = None,
|
||||
*,
|
||||
query_type: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs selected using the maximal marginal relevance.
|
||||
@ -403,6 +424,7 @@ class DatabricksVectorSearch(VectorStore):
|
||||
to maximum diversity and 1 to minimum diversity.
|
||||
Defaults to 0.5.
|
||||
filters: Filters to apply to the query. Defaults to None.
|
||||
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
|
||||
Returns:
|
||||
List of Documents selected by maximal marginal relevance.
|
||||
"""
|
||||
@ -420,6 +442,7 @@ class DatabricksVectorSearch(VectorStore):
|
||||
query_vector=embedding,
|
||||
filters=filters,
|
||||
num_results=fetch_k,
|
||||
query_type=query_type,
|
||||
)
|
||||
|
||||
embeddings_result_index = (
|
||||
@ -449,6 +472,8 @@ class DatabricksVectorSearch(VectorStore):
|
||||
embedding: List[float],
|
||||
k: int = 4,
|
||||
filters: Optional[Any] = None,
|
||||
*,
|
||||
query_type: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs most similar to embedding vector.
|
||||
@ -457,12 +482,13 @@ class DatabricksVectorSearch(VectorStore):
|
||||
embedding: Embedding to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
filters: Filters to apply to the query. Defaults to None.
|
||||
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the embedding.
|
||||
"""
|
||||
docs_with_score = self.similarity_search_by_vector_with_score(
|
||||
embedding=embedding, k=k, filters=filters, **kwargs
|
||||
embedding=embedding, k=k, filters=filters, query_type=query_type, **kwargs
|
||||
)
|
||||
return [doc for doc, _ in docs_with_score]
|
||||
|
||||
@ -471,6 +497,8 @@ class DatabricksVectorSearch(VectorStore):
|
||||
embedding: List[float],
|
||||
k: int = 4,
|
||||
filters: Optional[Any] = None,
|
||||
*,
|
||||
query_type: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Return docs most similar to embedding vector, along with scores.
|
||||
@ -479,6 +507,7 @@ class DatabricksVectorSearch(VectorStore):
|
||||
embedding: Embedding to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
filters: Filters to apply to the query. Defaults to None.
|
||||
query_type: The type of this query. Supported values are "ANN" and "HYBRID".
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the embedding and score for each.
|
||||
@ -493,6 +522,7 @@ class DatabricksVectorSearch(VectorStore):
|
||||
query_vector=embedding,
|
||||
filters=filters,
|
||||
num_results=k,
|
||||
query_type=query_type,
|
||||
)
|
||||
return self._parse_search_response(search_resp)
|
||||
|
||||
|
@ -167,6 +167,12 @@ EXAMPLE_SEARCH_RESPONSE_WITH_EMBEDDING = {
|
||||
"next_page_token": "",
|
||||
}
|
||||
|
||||
ALL_QUERY_TYPES = [
|
||||
None,
|
||||
"ANN",
|
||||
"HYBRID",
|
||||
]
|
||||
|
||||
|
||||
def mock_index(index_details: dict) -> MagicMock:
|
||||
from databricks.vector_search.client import VectorSearchIndex
|
||||
@ -475,8 +481,10 @@ def test_delete_fail_no_ids() -> None:
|
||||
|
||||
|
||||
@pytest.mark.requires("databricks", "databricks.vector_search")
|
||||
@pytest.mark.parametrize("index_details", ALL_INDEXES)
|
||||
def test_similarity_search(index_details: dict) -> None:
|
||||
@pytest.mark.parametrize(
|
||||
"index_details, query_type", itertools.product(ALL_INDEXES, ALL_QUERY_TYPES)
|
||||
)
|
||||
def test_similarity_search(index_details: dict, query_type: Optional[str]) -> None:
|
||||
index = mock_index(index_details)
|
||||
index.similarity_search.return_value = EXAMPLE_SEARCH_RESPONSE
|
||||
vectorsearch = default_databricks_vector_search(index)
|
||||
@ -484,7 +492,9 @@ def test_similarity_search(index_details: dict) -> None:
|
||||
filters = {"some filter": True}
|
||||
limit = 7
|
||||
|
||||
search_result = vectorsearch.similarity_search(query, k=limit, filters=filters)
|
||||
search_result = vectorsearch.similarity_search(
|
||||
query, k=limit, filters=filters, query_type=query_type
|
||||
)
|
||||
if index_details == DELTA_SYNC_INDEX_MANAGED_EMBEDDINGS:
|
||||
index.similarity_search.assert_called_once_with(
|
||||
columns=[DEFAULT_PRIMARY_KEY, DEFAULT_TEXT_COLUMN],
|
||||
@ -492,6 +502,7 @@ def test_similarity_search(index_details: dict) -> None:
|
||||
query_vector=None,
|
||||
filters=filters,
|
||||
num_results=limit,
|
||||
query_type=query_type,
|
||||
)
|
||||
else:
|
||||
index.similarity_search.assert_called_once_with(
|
||||
@ -500,6 +511,7 @@ def test_similarity_search(index_details: dict) -> None:
|
||||
query_vector=DEFAULT_EMBEDDING_MODEL.embed_query(query),
|
||||
filters=filters,
|
||||
num_results=limit,
|
||||
query_type=query_type,
|
||||
)
|
||||
assert len(search_result) == len(fake_texts)
|
||||
assert sorted([d.page_content for d in search_result]) == sorted(fake_texts)
|
||||
@ -620,6 +632,7 @@ def test_similarity_search_by_vector(index_details: dict) -> None:
|
||||
query_vector=query_embedding,
|
||||
filters=filters,
|
||||
num_results=limit,
|
||||
query_type=None,
|
||||
)
|
||||
assert len(search_result) == len(fake_texts)
|
||||
assert sorted([d.page_content for d in search_result]) == sorted(fake_texts)
|
||||
|
Loading…
Reference in New Issue
Block a user