community: Cassandra Vector Store: modernize implementation (#27253)

**Description:** 

This PR updates `CassandraGraphVectorStore` to be based on
`CassandraVectorStore`, instead of using a custom CQL implementation.
This allows users using a `CassandraVectorStore` to upgrade to a
`GraphVectorStore` without having to change their database schema or
re-embed documents.

This PR also updates the documentation of the `GraphVectorStore` base
class and contains native async implementations for the standard graph
methods: `traversal_search` and `mmr_traversal_search` in
`CassandraVectorStore`.

**Issue:** No issue number.

**Dependencies:** https://github.com/langchain-ai/langchain/pull/27078
(already-merged)

**Lint and test**: 
- Lint and tests all pass, including existing
`CassandraGraphVectorStore` tests.
- Also added numerous additional tests based on the tests in
`langchain-astradb`, which cover many more scenarios than the existing
tests for `Cassandra` and `CassandraGraphVectorStore`.

**BREAKING CHANGE**

Note that this is a breaking change for existing users of
`CassandraGraphVectorStore`. They will need to wipe their database table
and restart.

However:
- The interfaces have not changed; only the underlying storage
mechanism has.
- Anyone using `langchain_community.vectorstores.Cassandra` can instead
use `langchain_community.graph_vectorstores.CassandraGraphVectorStore`
and they will gain Graph capabilities without having to re-embed their
existing documents. This is the primary goal of this PR.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Eric Pinzur
2024-10-22 20:11:11 +02:00
committed by GitHub
parent 0640cbf2f1
commit f636c83321
9 changed files with 4070 additions and 679 deletions

View File

@@ -0,0 +1,143 @@
from __future__ import annotations
import math
from langchain_community.graph_vectorstores.mmr_helper import MmrHelper
# Every candidate id used by the tests below: "-1".."-5" and "+1".."+5".
IDS = {f"{sign}{number}" for sign in "-+" for number in range(1, 6)}
class TestMmrHelper:
    """Unit tests for ``MmrHelper``."""

    def _negative_candidates(self) -> list[tuple[str, list[int]]]:
        """Return fresh ``(id, embedding)`` pairs for the five "-N" candidates."""
        return [
            ("-1", [3, 5]),
            ("-2", [3, 5]),
            ("-3", [2, 6]),
            ("-4", [1, 6]),
            ("-5", [0, 6]),
        ]

    def test_mmr_helper_functional(self) -> None:
        """Candidates accumulate across calls and each pop removes one of them."""
        helper = MmrHelper(k=3, query_embedding=[6, 5], lambda_mult=0.5)
        assert len(list(helper.candidate_ids())) == 0

        # Each candidate is registered through its own add_candidates call.
        for doc_id, vector in self._negative_candidates():
            helper.add_candidates({doc_id: vector})
        assert len(list(helper.candidate_ids())) == 5

        for doc_id, vector in (
            ("+1", [5, 3]),
            ("+2", [5, 3]),
            ("+3", [6, 2]),
            ("+4", [6, 1]),
            ("+5", [6, 0]),
        ):
            helper.add_candidates({doc_id: vector})
        assert len(list(helper.candidate_ids())) == 10

        for already_popped in range(3):
            best_id = helper.pop_best()
            assert best_id in IDS
            remaining = list(helper.candidate_ids())
            assert len(remaining) == 9 - already_popped
            # The popped id must no longer be offered as a candidate.
            assert best_id not in helper.candidate_ids()

    def test_mmr_helper_max_diversity(self) -> None:
        """With ``lambda_mult=0`` the two picks are expected to be "-1" and "-5"."""
        helper = MmrHelper(k=2, query_embedding=[6, 5], lambda_mult=0)
        for doc_id, vector in self._negative_candidates():
            helper.add_candidates({doc_id: vector})

        first = helper.pop_best()
        second = helper.pop_best()
        # Order is not asserted, only which two ids were chosen.
        assert {first, second} == {"-1", "-5"}

    def test_mmr_helper_max_similarity(self) -> None:
        """With ``lambda_mult=1`` the two picks are expected to be "-1" and "-2"."""
        helper = MmrHelper(k=2, query_embedding=[6, 5], lambda_mult=1)
        for doc_id, vector in self._negative_candidates():
            helper.add_candidates({doc_id: vector})

        first = helper.pop_best()
        second = helper.pop_best()
        # Order is not asserted, only which two ids were chosen.
        assert {first, second} == {"-1", "-2"}

    def test_mmr_helper_add_candidate(self) -> None:
        """After adding "a" (equal to the query) and "b", ``best_id`` is "a"."""
        helper = MmrHelper(5, [0.0, 1.0])
        candidates = {"a": [0.0, 1.0], "b": [1.0, 0.0]}
        helper.add_candidates(candidates)
        assert helper.best_id == "a"

    def test_mmr_helper_pop_best(self) -> None:
        """``pop_best`` yields "a", then "b", then ``None`` once nothing is left."""
        helper = MmrHelper(5, [0.0, 1.0])
        candidates = {"a": [0.0, 1.0], "b": [1.0, 0.0]}
        helper.add_candidates(candidates)

        for expected_id in ("a", "b"):
            assert helper.pop_best() == expected_id
        assert helper.pop_best() is None

    def angular_embedding(self, angle: float) -> list[float]:
        """Return the unit-circle point at ``angle * pi`` radians as ``[x, y]``."""
        radians = angle * math.pi
        return [math.cos(radians), math.sin(radians)]

    def test_mmr_helper_added_documents(self) -> None:
        r"""Check selection when candidates are discovered in two rounds.

        The candidates are points on the unit circle (see ``angular_embedding``),
        with the query at angle 0::

                  ______ v2
                /        \
               /          |  v1
          v3  |     .     |  query
               \          |  v0
                \________/          (N.B. very crude drawing)

        ``v0`` and ``v1`` are added first; ``v0`` is expected to be popped because
        it is marginally closer to the query.  ``v2`` and ``v3`` are only added
        after that first pop.  The second pop is expected to be ``v2``: ``v1`` is
        nearer the query but "too similar" to the already selected ``v0``.
        """
        helper = MmrHelper(5, self.angular_embedding(0.0))

        # Round one: the two nearest neighbours of the query at 0.0.
        first_round = {
            "v0": self.angular_embedding(-0.124),
            "v1": self.angular_embedding(+0.127),
        }
        helper.add_candidates(first_round)
        assert helper.pop_best() == "v0"

        # Round two: nodes discovered only after v0 has been selected.
        second_round = {
            "v2": self.angular_embedding(+0.25),
            "v3": self.angular_embedding(+1.0),
        }
        helper.add_candidates(second_round)
        assert helper.pop_best() == "v2"

        # Scores recorded for the two selections, in selection order.
        for position, expected in enumerate((0.9251, 0.7071)):
            assert math.isclose(
                helper.selected_similarity_scores[position], expected, abs_tol=0.0001
            )
        for position, expected in enumerate((0.4625, 0.1608)):
            assert math.isclose(
                helper.selected_mmr_scores[position], expected, abs_tol=0.0001
            )