community[minor]: Improve Cassandra VectorStore as_retriever (#22465)

The `as_retriever` API of `VectorStore` doesn't explicitly expose the
parameters `search_type` and `search_kwargs`, so they are not well
documented.
This PR improves `as_retriever` for the Cassandra VectorStore by making
these parameters explicit.

NB: An alternative would have been to modify `as_retriever` in
`VectorStore` itself. But there is probably a good reason these parameters
were not exposed in the first place. Is it because implementations may
decide not to support them and instead use fixed values when creating the
VectorStoreRetriever?
This commit is contained in:
Christophe Bornet 2024-06-04 15:51:17 +02:00 committed by GitHub
parent 23bba18f92
commit 9a8fe58ebe
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -24,7 +24,7 @@ if typing.TYPE_CHECKING:
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore
from langchain_core.vectorstores import VectorStore, VectorStoreRetriever
from langchain_community.utilities.cassandra import SetupMode
from langchain_community.vectorstores.utils import maximal_marginal_relevance
@ -1073,3 +1073,77 @@ class Cassandra(VectorStore):
body_index_options=body_index_options,
**kwargs,
)
def as_retriever(
    self,
    search_type: str = "similarity",
    search_kwargs: Optional[Dict[str, Any]] = None,
    tags: Optional[List[str]] = None,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> VectorStoreRetriever:
    """Return VectorStoreRetriever initialized from this VectorStore.

    Args:
        search_type: Defines the type of search that
            the Retriever should perform.
            Can be "similarity" (default), "mmr", or
            "similarity_score_threshold".
        search_kwargs: Keyword arguments to pass to the
            search function. Can include things like:
                k: Number of documents to return (Default: 4)
                score_threshold: Minimum relevance threshold
                    for similarity_score_threshold
                fetch_k: Number of documents to pass to MMR algorithm
                    (Default: 20)
                lambda_mult: Diversity of results returned by MMR;
                    1 for minimum diversity and 0 for maximum.
                    (Default: 0.5)
                filter: Filter by document metadata
            When omitted (or empty), an empty dict is passed on.
        tags: List of tags associated with the retriever. The tags
            returned by ``self._get_retriever_tags()`` are always
            appended to this list.
        metadata: Metadata associated with the retriever.
        kwargs: Other arguments passed to the VectorStoreRetriever init.

    Returns:
        Retriever for VectorStore.

    Examples:

    .. code-block:: python

        # Retrieve more documents with higher diversity
        # Useful if your dataset has many similar documents
        docsearch.as_retriever(
            search_type="mmr",
            search_kwargs={'k': 6, 'lambda_mult': 0.25}
        )

        # Fetch more documents for the MMR algorithm to consider
        # But only return the top 5
        docsearch.as_retriever(
            search_type="mmr",
            search_kwargs={'k': 5, 'fetch_k': 50}
        )

        # Only retrieve documents that have a relevance score
        # Above a certain threshold
        docsearch.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={'score_threshold': 0.8}
        )

        # Only get the single most similar document from the dataset
        docsearch.as_retriever(search_kwargs={'k': 1})

        # Use a filter to only retrieve documents from a specific paper
        docsearch.as_retriever(
            search_kwargs={'filter': {'paper_title':'GPT-4 Technical Report'}}
        )
    """
    # Parenthesize the fallback: `+` binds tighter than `or`, so the
    # unparenthesized form `tags or [] + ...` would drop the store's own
    # retriever tags whenever the caller supplies a non-empty `tags` list.
    # Concatenation builds a new list, so the caller's list is not mutated.
    _tags = (tags or []) + self._get_retriever_tags()
    return VectorStoreRetriever(
        vectorstore=self,
        search_type=search_type,
        search_kwargs=search_kwargs or {},
        tags=_tags,
        metadata=metadata,
        **kwargs,
    )