diff --git a/libs/community/langchain_community/vectorstores/cassandra.py b/libs/community/langchain_community/vectorstores/cassandra.py index ac70c21d1ce..6eae961e4b6 100644 --- a/libs/community/langchain_community/vectorstores/cassandra.py +++ b/libs/community/langchain_community/vectorstores/cassandra.py @@ -57,17 +57,12 @@ class Cassandra(VectorStore): keyspace: Cassandra key space. table_name: Cassandra table. ttl_seconds: Optional time-to-live for the added texts. + body_index_options: Optional options used to create the body index. + Eg. body_index_options = [cassio.table.cql.STANDARD_ANALYZER] """ _embedding_dimension: Union[int, None] - @staticmethod - def _filter_to_metadata(filter_dict: Optional[Dict[str, str]]) -> Dict[str, Any]: - if filter_dict is None: - return {} - else: - return filter_dict - def _get_embedding_dimension(self) -> int: if self._embedding_dimension is None: self._embedding_dimension = len( @@ -82,6 +77,8 @@ class Cassandra(VectorStore): keyspace: str, table_name: str, ttl_seconds: Optional[int] = None, + *, + body_index_options: Optional[List[Tuple[str, Any]]] = None, ) -> None: try: from cassio.table import MetadataVectorCassandraTable @@ -99,6 +96,10 @@ class Cassandra(VectorStore): # self._embedding_dimension = None # + kwargs = {} + if body_index_options is not None: + kwargs["body_index_options"] = body_index_options + self.table = MetadataVectorCassandraTable( session=session, keyspace=keyspace, @@ -106,6 +107,7 @@ class Cassandra(VectorStore): vector_dimension=self._get_embedding_dimension(), metadata_indexing="all", primary_key_type="TEXT", + **kwargs, ) @property @@ -212,22 +214,30 @@ class Cassandra(VectorStore): embedding: List[float], k: int = 4, filter: Optional[Dict[str, str]] = None, + body_search: Optional[Union[str, List[str]]] = None, ) -> List[Tuple[Document, float, str]]: """Return docs most similar to embedding vector. Args: - embedding (str): Embedding to look up documents similar to. - k (int): Number of Documents to return. Defaults to 4. + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter: Filter on the metadata to apply. + body_search: Document textual search terms to apply. + Only supported by Astra DB at the moment. Returns: List of (Document, score, id), the most similar to the query vector. """ - search_metadata = self._filter_to_metadata(filter) + kwargs: Dict[str, Any] = {} + if filter is not None: + kwargs["metadata"] = filter + if body_search is not None: + kwargs["body_search"] = body_search # hits = self.table.metric_ann_search( vector=embedding, n=k, metric="cos", - metadata=search_metadata, + **kwargs, ) # We stick to 'cos' distance as it can be normalized on a 0-1 axis # (1=most relevant), as required by this class' contract. @@ -248,12 +258,14 @@ class Cassandra(VectorStore): query: str, k: int = 4, filter: Optional[Dict[str, str]] = None, + body_search: Optional[Union[str, List[str]]] = None, ) -> List[Tuple[Document, float, str]]: embedding_vector = self.embedding.embed_query(query) return self.similarity_search_with_score_id_by_vector( embedding=embedding_vector, k=k, filter=filter, + body_search=body_search, ) # id-unaware search facilities @@ -262,12 +274,16 @@ class Cassandra(VectorStore): embedding: List[float], k: int = 4, filter: Optional[Dict[str, str]] = None, + body_search: Optional[Union[str, List[str]]] = None, ) -> List[Tuple[Document, float]]: """Return docs most similar to embedding vector. Args: - embedding (str): Embedding to look up documents similar to. - k (int): Number of Documents to return. Defaults to 4. + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter: Filter on the metadata to apply. + body_search: Document textual search terms to apply. + Only supported by Astra DB at the moment. Returns: List of (Document, score), the most similar to the query vector. """ @@ -277,6 +293,7 @@ class Cassandra(VectorStore): embedding=embedding, k=k, filter=filter, + body_search=body_search, ) ] @@ -285,6 +302,7 @@ class Cassandra(VectorStore): query: str, k: int = 4, filter: Optional[Dict[str, str]] = None, + body_search: Optional[Union[str, List[str]]] = None, **kwargs: Any, ) -> List[Document]: embedding_vector = self.embedding.embed_query(query) @@ -292,6 +310,7 @@ class Cassandra(VectorStore): embedding_vector, k, filter=filter, + body_search=body_search, ) def similarity_search_by_vector( @@ -299,6 +318,7 @@ class Cassandra(VectorStore): embedding: List[float], k: int = 4, filter: Optional[Dict[str, str]] = None, + body_search: Optional[Union[str, List[str]]] = None, **kwargs: Any, ) -> List[Document]: return [ @@ -307,6 +327,7 @@ class Cassandra(VectorStore): embedding, k, filter=filter, + body_search=body_search, ) ] @@ -315,12 +336,14 @@ class Cassandra(VectorStore): query: str, k: int = 4, filter: Optional[Dict[str, str]] = None, + body_search: Optional[Union[str, List[str]]] = None, ) -> List[Tuple[Document, float]]: embedding_vector = self.embedding.embed_query(query) return self.similarity_search_with_score_by_vector( embedding_vector, k, filter=filter, + body_search=body_search, ) def max_marginal_relevance_search_by_vector( @@ -330,6 +353,7 @@ class Cassandra(VectorStore): fetch_k: int = 20, lambda_mult: float = 0.5, filter: Optional[Dict[str, str]] = None, + body_search: Optional[Union[str, List[str]]] = None, **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance. @@ -337,22 +361,31 @@ class Cassandra(VectorStore): among selected documents. Args: embedding: Embedding to look up documents similar to. - k: Number of Documents to return. + k: Number of Documents to return. Defaults to 4. fetch_k: Number of Documents to fetch to pass to MMR algorithm. + Defaults to 20. lambda_mult: Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + filter: Filter on the metadata to apply. + body_search: Document textual search terms to apply. + Only supported by Astra DB at the moment. Returns: List of Documents selected by maximal marginal relevance. """ - search_metadata = self._filter_to_metadata(filter) + _kwargs: Dict[str, Any] = {} + if filter is not None: + _kwargs["metadata"] = filter + if body_search is not None: + _kwargs["body_search"] = body_search prefetch_hits = list( self.table.metric_ann_search( vector=embedding, n=fetch_k, metric="cos", - metadata=search_metadata, + **_kwargs, ) ) # let the mmr utility pick the *indices* in the above array @@ -382,6 +415,7 @@ class Cassandra(VectorStore): fetch_k: int = 20, lambda_mult: float = 0.5, filter: Optional[Dict[str, str]] = None, + body_search: Optional[Union[str, List[str]]] = None, **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance. @@ -389,12 +423,16 @@ class Cassandra(VectorStore): among selected documents. Args: query: Text to look up documents similar to. - k: Number of Documents to return. + k: Number of Documents to return. Defaults to 4. fetch_k: Number of Documents to fetch to pass to MMR algorithm. + Defaults to 20. lambda_mult: Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. + filter: Filter on the metadata to apply. + body_search: Document textual search terms to apply. + Only supported by Astra DB at the moment. Returns: List of Documents selected by maximal marginal relevance. """ @@ -405,6 +443,7 @@ class Cassandra(VectorStore): fetch_k, lambda_mult=lambda_mult, filter=filter, + body_search=body_search, ) @classmethod @@ -420,6 +459,7 @@ class Cassandra(VectorStore): ids: Optional[List[str]] = None, batch_size: int = 16, ttl_seconds: Optional[int] = None, + body_index_options: Optional[List[Tuple[str, Any]]] = None, **kwargs: Any, ) -> CVST: """Create a Cassandra vectorstore from raw texts. @@ -435,6 +475,8 @@ class Cassandra(VectorStore): batch_size: Number of concurrent requests to send to the server. Defaults to 16. ttl_seconds: Optional time-to-live for the added texts. + body_index_options: Optional options used to create the body index. + Eg. body_index_options = [cassio.table.cql.STANDARD_ANALYZER] Returns: a Cassandra vectorstore. @@ -451,6 +493,7 @@ class Cassandra(VectorStore): keyspace=keyspace, table_name=table_name, ttl_seconds=ttl_seconds, + body_index_options=body_index_options, ) store.add_texts( texts=texts, metadatas=metadatas, ids=ids, batch_size=batch_size @@ -469,6 +512,7 @@ class Cassandra(VectorStore): ids: Optional[List[str]] = None, batch_size: int = 16, ttl_seconds: Optional[int] = None, + body_index_options: Optional[List[Tuple[str, Any]]] = None, **kwargs: Any, ) -> CVST: """Create a Cassandra vectorstore from a document list. @@ -483,6 +527,8 @@ class Cassandra(VectorStore): batch_size: Number of concurrent requests to send to the server. Defaults to 16. ttl_seconds: Optional time-to-live for the added documents. + body_index_options: Optional options used to create the body index. + Eg. body_index_options = [cassio.table.cql.STANDARD_ANALYZER] Returns: a Cassandra vectorstore. @@ -499,5 +545,6 @@ class Cassandra(VectorStore): ids=ids, batch_size=batch_size, ttl_seconds=ttl_seconds, + body_index_options=body_index_options, **kwargs, )