From 9a8fe58ebeeea7020480e789aabc0cc0bdf607f8 Mon Sep 17 00:00:00 2001
From: Christophe Bornet <cbornet@hotmail.com>
Date: Tue, 4 Jun 2024 15:51:17 +0200
Subject: [PATCH] community[minor]: Improve Cassandra VectorStore as_retriever
 (#22465)

The Vectorstore's API `as_retriever` doesn't expose explicitly the
parameters `search_type` and `search_kwargs` and so these are not well
documented.
This PR improves `as_retriever` for the Cassandra VectorStore by making
these parameters explicit.

NB: An alternative would have been to modify `as_retriever` in
`Vectorstore`. But there's probably a good reason these were not exposed
in the first place ? Is it because implementations may decide to not
support them and have fixed values when creating the
VectorStoreRetriever ?
---
 .../vectorstores/cassandra.py                 | 76 ++++++++++++++++++-
 1 file changed, 75 insertions(+), 1 deletion(-)

diff --git a/libs/community/langchain_community/vectorstores/cassandra.py b/libs/community/langchain_community/vectorstores/cassandra.py
index a20c1dc6561..1c4abfbb8e9 100644
--- a/libs/community/langchain_community/vectorstores/cassandra.py
+++ b/libs/community/langchain_community/vectorstores/cassandra.py
@@ -24,7 +24,7 @@ if typing.TYPE_CHECKING:
 
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
-from langchain_core.vectorstores import VectorStore
+from langchain_core.vectorstores import VectorStore, VectorStoreRetriever
 
 from langchain_community.utilities.cassandra import SetupMode
 from langchain_community.vectorstores.utils import maximal_marginal_relevance
@@ -1073,3 +1073,77 @@ class Cassandra(VectorStore):
             body_index_options=body_index_options,
             **kwargs,
         )
+
+    def as_retriever(
+        self,
+        search_type: str = "similarity",
+        search_kwargs: Optional[Dict[str, Any]] = None,
+        tags: Optional[List[str]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> VectorStoreRetriever:
+        """Return VectorStoreRetriever initialized from this VectorStore.
+
+        Args:
+            search_type: Defines the type of search that
+                the Retriever should perform.
+                Can be "similarity" (default), "mmr", or
+                "similarity_score_threshold".
+            search_kwargs: Keyword arguments to pass to the
+                search function. Can include things like:
+                    k: Amount of documents to return (Default: 4)
+                    score_threshold: Minimum relevance threshold
+                        for similarity_score_threshold
+                    fetch_k: Amount of documents to pass to MMR algorithm (Default: 20)
+                    lambda_mult: Diversity of results returned by MMR;
+                        1 for minimum diversity and 0 for maximum. (Default: 0.5)
+                    filter: Filter by document metadata
+            tags: List of tags associated with the retriever.
+            metadata: Metadata associated with the retriever.
+            kwargs: Other arguments passed to the VectorStoreRetriever init.
+
+        Returns:
+            Retriever for VectorStore.
+
+        Examples:
+
+        .. code-block:: python
+
+            # Retrieve more documents with higher diversity
+            # Useful if your dataset has many similar documents
+            docsearch.as_retriever(
+                search_type="mmr",
+                search_kwargs={'k': 6, 'lambda_mult': 0.25}
+            )
+
+            # Fetch more documents for the MMR algorithm to consider
+            # But only return the top 5
+            docsearch.as_retriever(
+                search_type="mmr",
+                search_kwargs={'k': 5, 'fetch_k': 50}
+            )
+
+            # Only retrieve documents that have a relevance score
+            # Above a certain threshold
+            docsearch.as_retriever(
+                search_type="similarity_score_threshold",
+                search_kwargs={'score_threshold': 0.8}
+            )
+
+            # Only get the single most similar document from the dataset
+            docsearch.as_retriever(search_kwargs={'k': 1})
+
+            # Use a filter to only retrieve documents from a specific paper
+            docsearch.as_retriever(
+                search_kwargs={'filter': {'paper_title':'GPT-4 Technical Report'}}
+            )
+        """
+        _tags = tags or [] + self._get_retriever_tags()
+        return VectorStoreRetriever(
+            vectorstore=self,
+            search_type=search_type,
+            search_kwargs=search_kwargs or {},
+            tags=_tags,
+            metadata=metadata,
+            **kwargs,
+        )