Vector store support for Cassandra (#6426)

This addresses #6291, adding support for using Cassandra (and compatible databases, such as DataStax Astra DB) as a [Vector Store](https://cwiki.apache.org/confluence/display/CASSANDRA/CEP-30%3A+Approximate+Nearest+Neighbor(ANN)+Vector+Search+via+Storage-Attached+Indexes). A new class `Cassandra` is introduced, which complies with the contract and interface for a vector store, along with the corresponding integration test, a sample notebook and a modified dependency toml. Dependencies: the implementation relies on the library `cassio`, which simplifies interacting with Cassandra for ML- and LLM-oriented workloads. CassIO, in turn, uses the low-level `cassandra-driver` driver to communicate with the database. The former is added as an optional dependency (and to `extended_testing`); the latter was already in the project. Integration testing relies on a locally-running instance of Cassandra. A detailed description of how to compile and run it can be found [here](https://cassio.org/more_info/#use-a-local-vector-capable-cassandra) (at the time of writing, the feature has not yet made it into a release). During development of the integration tests, I added a new "fake embedding" class for what I consider a more controlled way of testing the MMR search method. Likewise, I had to amend what looked like a glitch in the behaviour of `ConsistentFakeEmbeddings` whereby an `embed_query` call would have bypassed storage of the requested text in the class cache for use in later repeated invocations. @dev2049 might be the right person to tag here for a review. Thank you! --------- Co-authored-by: rlm <pexpresss31@gmail.com>
2025-09-13 13:36:15 +00:00 · 2023-06-20 19:46:20 +02:00
parent cac6e45a67
commit 22af93d851
5 changed files with 835 additions and 0 deletions
--- a/tests/integration_tests/vectorstores/fake_embeddings.py
+++ b/tests/integration_tests/vectorstores/fake_embeddings.py
@@ -1,4 +1,5 @@
 """Fake Embedding class for testing purposes."""
+import math
 from typing import List

 from langchain.embeddings.base import Embeddings
@@ -45,3 +46,29 @@ class ConsistentFakeEmbeddings(FakeEmbeddings):
        if text not in self.known_texts:
            return [float(1.0)] * 9 + [float(0.0)]
        return [float(1.0)] * 9 + [float(self.known_texts.index(text))]
+
+
+class AngularTwoDimensionalEmbeddings(Embeddings):
+    """
+    Map angle strings (in units of pi) onto unit vectors of the 2D circle.
+    """
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """
+        Embed every entry of `texts`, in order, through `embed_query`.
+        """
+        return list(map(self.embed_query, texts))
+
+    def embed_query(self, text: str) -> List[float]:
+        """
+        Map `text` onto a two-component vector (list of floats).
+        Text that parses as a float is an angle in units of pi and
+        gives [cos(angle * pi), sin(angle * pi)]; if a ValueError
+        is raised instead, the degenerate [0.0, 0.0] is returned.
+        """
+        try:
+            radians = float(text) * math.pi
+            return [math.cos(radians), math.sin(radians)]
+        except ValueError:
+            # Not a usable angle: treated as a plain test string.
+            return [0.0, 0.0]
--- a/tests/integration_tests/vectorstores/test_cassandra.py
+++ b/tests/integration_tests/vectorstores/test_cassandra.py
@@ -0,0 +1,135 @@
+"""Test Cassandra functionality."""
+from typing import List, Optional, Type
+
+from cassandra.cluster import Cluster
+
+from langchain.docstore.document import Document
+from langchain.vectorstores import Cassandra
+from tests.integration_tests.vectorstores.fake_embeddings import (
+    AngularTwoDimensionalEmbeddings,
+    ConsistentFakeEmbeddings,
+    Embeddings,
+)
+
+
+def _vectorstore_from_texts(
+    texts: List[str],
+    metadatas: Optional[List[dict]] = None,
+    embedding_class: Type[Embeddings] = ConsistentFakeEmbeddings,
+    drop: bool = True,
+) -> Cassandra:
+    """Create a Cassandra store from `texts` in the test keyspace and table.
+
+    The keyspace is created if missing; the table is dropped first if `drop`.
+    """
+    ks_name = "vector_test_keyspace"
+    tbl_name = "vector_test_table"
+    # Cluster() is called with no arguments, so the driver defaults apply.
+    db_cluster = Cluster()
+    db_session = db_cluster.connect()
+    create_keyspace_cql = (
+        f"CREATE KEYSPACE IF NOT EXISTS {ks_name} "
+        "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}"
+    )
+    db_session.execute(create_keyspace_cql)
+    if drop:
+        db_session.execute(f"DROP TABLE IF EXISTS {ks_name}.{tbl_name}")
+    return Cassandra.from_texts(
+        texts,
+        embedding_class(),
+        metadatas=metadatas,
+        session=db_session,
+        keyspace=ks_name,
+        table_name=tbl_name,
+    )
+
+
+def test_cassandra() -> None:
+    """Check that a store built from texts returns the best match for a query."""
+    store = _vectorstore_from_texts(["foo", "bar", "baz"])
+    hits = store.similarity_search("foo", k=1)
+    expected = [Document(page_content="foo")]
+    assert hits == expected
+
+
+def test_cassandra_with_score() -> None:
+    """Check that scored search returns all documents, best match first."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": page} for page, _ in enumerate(texts)]
+    store = _vectorstore_from_texts(texts, metadatas=metadatas)
+    scored_hits = store.similarity_search_with_score("foo", k=3)
+    docs = [doc for doc, _ in scored_hits]
+    scores = [score for _, score in scored_hits]
+    assert docs == [
+        Document(page_content="foo", metadata={"page": 0}),
+        Document(page_content="bar", metadata={"page": 1}),
+        Document(page_content="baz", metadata={"page": 2}),
+    ]
+    assert scores[0] > scores[1] > scores[2]
+
+
+def test_cassandra_max_marginal_relevance_search() -> None:
+    r"""
+    Test end to end construction and MMR search.
+    The embedding function used here ensures `texts` become
+    the following vectors on a circle (numbered v0 through v3):
+
+           ______ v2
+          /      \
+         /        \  v1
+    v3  |     .    | query
+         \        /  v0
+          \______/                 (N.B. very crude drawing)
+
+    With fetch_k==3 and k==2, when query is at (1, 0),
+    one expects that v2 and v0 are returned (in some order).
+    """
+    texts = ["-0.125", "+0.125", "+0.25", "+1.0"]
+    metadatas = [{"page": page} for page, _ in enumerate(texts)]
+    docsearch = _vectorstore_from_texts(
+        texts, metadatas=metadatas, embedding_class=AngularTwoDimensionalEmbeddings
+    )
+    output = docsearch.max_marginal_relevance_search("0.0", k=2, fetch_k=3)
+    output_set = {
+        (mmr_doc.page_content, mmr_doc.metadata["page"]) for mmr_doc in output
+    }
+    assert output_set == {
+        ("+0.25", 2),
+        ("-0.125", 0),
+    }
+
+
+def test_cassandra_add_extra() -> None:
+    """Check the number of stored entries after further `add_texts` calls."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": page} for page, _ in enumerate(texts)]
+    store = _vectorstore_from_texts(texts, metadatas=metadatas)
+
+    # Re-adding the initial texts is expected not to yield extra results.
+    store.add_texts(texts, metadatas)
+    store.add_texts(["foo2", "bar2", "baz2"], metadatas)
+
+    hits = store.similarity_search("foo", k=10)
+    assert len(hits) == 6
+
+
+def test_cassandra_no_drop() -> None:
+    """Check that a store re-created with drop=False keeps the earlier entries."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": page} for page, _ in enumerate(texts)]
+    first_store = _vectorstore_from_texts(texts, metadatas=metadatas)
+    del first_store
+
+    # Build a second store with three more texts and without dropping the table.
+    more_texts = ["foo2", "bar2", "baz2"]
+    store = _vectorstore_from_texts(more_texts, metadatas=metadatas, drop=False)
+    hits = store.similarity_search("foo", k=10)
+    assert len(hits) == 6
+
+
+# if __name__ == "__main__":
+#     test_cassandra()
+#     test_cassandra_with_score()
+#     test_cassandra_max_marginal_relevance_search()
+#     test_cassandra_add_extra()
+#     test_cassandra_no_drop()