From 9a8fe58ebeeea7020480e789aabc0cc0bdf607f8 Mon Sep 17 00:00:00 2001 From: Christophe Bornet Date: Tue, 4 Jun 2024 15:51:17 +0200 Subject: [PATCH] community[minor]: Improve Cassandra VectorStore as_retriever (#22465) The Vectorstore's API `as_retriever` doesn't expose explicitly the parameters `search_type` and `search_kwargs` and so these are not well documented. This PR improves `as_retriever` for the Cassandra VectorStore by making these parameters explicit. NB: An alternative would have been to modify `as_retriever` in `Vectorstore`. But there's probably a good reason these were not exposed in the first place ? Is it because implementations may decide to not support them and have fixed values when creating the VectorStoreRetriever ? --- .../vectorstores/cassandra.py | 76 ++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/libs/community/langchain_community/vectorstores/cassandra.py b/libs/community/langchain_community/vectorstores/cassandra.py index a20c1dc6561..1c4abfbb8e9 100644 --- a/libs/community/langchain_community/vectorstores/cassandra.py +++ b/libs/community/langchain_community/vectorstores/cassandra.py @@ -24,7 +24,7 @@ if typing.TYPE_CHECKING: from langchain_core.documents import Document from langchain_core.embeddings import Embeddings -from langchain_core.vectorstores import VectorStore +from langchain_core.vectorstores import VectorStore, VectorStoreRetriever from langchain_community.utilities.cassandra import SetupMode from langchain_community.vectorstores.utils import maximal_marginal_relevance @@ -1073,3 +1073,77 @@ class Cassandra(VectorStore): body_index_options=body_index_options, **kwargs, ) + + def as_retriever( + self, + search_type: str = "similarity", + search_kwargs: Optional[Dict[str, Any]] = None, + tags: Optional[List[str]] = None, + metadata: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> VectorStoreRetriever: + """Return VectorStoreRetriever initialized from this VectorStore. + + Args: + search_type: Defines the type of search that + the Retriever should perform. + Can be "similarity" (default), "mmr", or + "similarity_score_threshold". + search_kwargs: Keyword arguments to pass to the + search function. Can include things like: + k: Amount of documents to return (Default: 4) + score_threshold: Minimum relevance threshold + for similarity_score_threshold + fetch_k: Amount of documents to pass to MMR algorithm (Default: 20) + lambda_mult: Diversity of results returned by MMR; + 1 for minimum diversity and 0 for maximum. (Default: 0.5) + filter: Filter by document metadata + tags: List of tags associated with the retriever. + metadata: Metadata associated with the retriever. + kwargs: Other arguments passed to the VectorStoreRetriever init. + + Returns: + Retriever for VectorStore. + + Examples: + + .. code-block:: python + + # Retrieve more documents with higher diversity + # Useful if your dataset has many similar documents + docsearch.as_retriever( + search_type="mmr", + search_kwargs={'k': 6, 'lambda_mult': 0.25} + ) + + # Fetch more documents for the MMR algorithm to consider + # But only return the top 5 + docsearch.as_retriever( + search_type="mmr", + search_kwargs={'k': 5, 'fetch_k': 50} + ) + + # Only retrieve documents that have a relevance score + # Above a certain threshold + docsearch.as_retriever( + search_type="similarity_score_threshold", + search_kwargs={'score_threshold': 0.8} + ) + + # Only get the single most similar document from the dataset + docsearch.as_retriever(search_kwargs={'k': 1}) + + # Use a filter to only retrieve documents from a specific paper + docsearch.as_retriever( + search_kwargs={'filter': {'paper_title':'GPT-4 Technical Report'}} + ) + """ + _tags = tags or [] + self._get_retriever_tags() + return VectorStoreRetriever( + vectorstore=self, + search_type=search_type, + search_kwargs=search_kwargs or {}, + tags=_tags, + metadata=metadata, + **kwargs, + )