community: Added Lantern as VectorStore (#12951)
Support [Lantern](https://github.com/lanterndata/lantern) as a new VectorStore type.

- Added Lantern as a VectorStore. It supports three distance functions (`l2 squared`, `cosine`, and `hamming`) and uses an `HNSW` index.
- Added tests.
- Added an example notebook.
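For orientation, a minimal usage sketch of the new store, assuming a local Postgres instance with the Lantern extension installed; the collection name and credentials are illustrative, and the calls mirror the tests added in this commit:

from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import Lantern

# Illustrative local connection; any Lantern-enabled Postgres works.
connection_string = Lantern.connection_string_from_db_params(
    driver="psycopg2",
    host="localhost",
    port=5432,
    database="postgres",
    user="postgres",
    password="postgres",
)

store = Lantern.from_texts(
    texts=["foo", "bar", "baz"],
    embedding=FakeEmbeddings(size=1536),
    collection_name="demo_collection",  # hypothetical name
    connection_string=connection_string,
    distance_strategy="hamming",  # per the commit message: "l2 squared", "cosine", "hamming"
)
print(store.similarity_search("foo", k=1))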
@@ -458,6 +458,12 @@ def _import_zilliz() -> Any:
     return Zilliz
 
 
+def _import_lantern() -> Any:
+    from langchain_community.vectorstores.lantern import Lantern
+
+    return Lantern
+
+
 def __getattr__(name: str) -> Any:
     if name == "AnalyticDB":
         return _import_analyticdb()
@@ -599,6 +605,8 @@ def __getattr__(name: str) -> Any:
         return _import_zilliz()
     elif name == "VespaStore":
         return _import_vespa()
+    elif name == "Lantern":
+        return _import_lantern()
     else:
         raise AttributeError(f"Could not find: {name}")
@@ -673,4 +681,5 @@ __all__ = [
     "TencentVectorDB",
     "AzureCosmosDBVectorSearch",
     "VectorStore",
+    "Lantern",
 ]
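The `__getattr__` hook above is the module-level lazy-import pattern (PEP 562): `langchain_community.vectorstores.lantern` is imported only when the `Lantern` name is first accessed on the package, not at package import time. A stripped-down sketch of the same mechanism:

from typing import Any


def __getattr__(name: str) -> Any:
    # Invoked only when `name` is not already a regular module attribute,
    # so the import below is deferred until the first access to `Lantern`.
    if name == "Lantern":
        from langchain_community.vectorstores.lantern import Lantern

        return Lantern
    raise AttributeError(f"Could not find: {name}")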
|
1018
libs/community/langchain_community/vectorstores/lantern.py
Normal file
1018
libs/community/langchain_community/vectorstores/lantern.py
Normal file
File diff suppressed because it is too large
Load Diff
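Because the implementation diff is suppressed, here is a hedged skeleton of the surface the tests below exercise; signatures are inferred from call sites only and may differ from the actual lantern.py:

# Inferred from the call sites in the tests below; not the actual source.
class Lantern(VectorStore):
    @classmethod
    def connection_string_from_db_params(
        cls, driver: str, host: str, port: int, database: str, user: str, password: str
    ) -> str: ...

    @classmethod
    def from_texts(
        cls, texts, embedding, metadatas=None, ids=None, collection_name=...,
        connection_string=..., distance_strategy=..., pre_delete_collection=False,
    ): ...

    @classmethod
    def from_embeddings(cls, text_embeddings, embedding, **kwargs): ...

    def similarity_search(self, query, k=4, filter=None): ...
    def similarity_search_with_score(self, query, k=4, filter=None): ...
    def max_marginal_relevance_search(self, query, k=4, fetch_k=20): ...
    def delete(self, ids): ...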
@@ -0,0 +1,319 @@
"""Test Lantern functionality."""
import os
from typing import List, Tuple

from langchain_core.documents import Document

from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import Lantern

CONNECTION_STRING = Lantern.connection_string_from_db_params(
    driver=os.environ.get("TEST_LANTERN_DRIVER", "psycopg2"),
    host=os.environ.get("TEST_LANTERN_HOST", "localhost"),
    port=int(os.environ.get("TEST_LANTERN_PORT", "5432")),
    database=os.environ.get("TEST_LANTERN_DATABASE", "postgres"),
    user=os.environ.get("TEST_LANTERN_USER", "postgres"),
    password=os.environ.get("TEST_LANTERN_PASSWORD", "postgres"),
)


ADA_TOKEN_COUNT = 1536


def fix_distance_precision(
    results: List[Tuple[Document, float]], precision: int = 2
) -> List[Tuple[Document, float]]:
    # Round each returned distance to the given precision so floating-point
    # jitter does not break the exact comparisons in the assertions below.
    return list(
        map(lambda x: (x[0], float(f"{{:.{precision}f}}".format(x[1]))), results)
    )


class FakeEmbeddingsWithAdaDimension(FakeEmbeddings):
    """Fake embeddings functionality for testing."""

    def __init__(self):
        super(FakeEmbeddingsWithAdaDimension, self).__init__(size=ADA_TOKEN_COUNT)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return simple embeddings."""
        return [
            [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(i)] for i in range(len(texts))
        ]

    def embed_query(self, text: str) -> List[float]:
        """Return simple embeddings."""
        return [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(0.0)]
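

# With these fakes, every vector agrees on the first 1535 coordinates and
# differs only in the last one (0.0, 1.0, 2.0, ...). Queries embed with a
# trailing 0.0, so "foo" is always an exact match and distances grow with
# the document index.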


def test_lantern() -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    docsearch = Lantern.from_texts(
        texts=texts,
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]


def test_lantern_embeddings() -> None:
    """Test end to end construction with embeddings and search."""
    texts = ["foo", "bar", "baz"]
    text_embeddings = FakeEmbeddingsWithAdaDimension().embed_documents(texts)
    text_embedding_pairs = list(zip(texts, text_embeddings))
    docsearch = Lantern.from_embeddings(
        text_embeddings=text_embedding_pairs,
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]


def test_lantern_embeddings_distance_strategy() -> None:
    """Test end to end construction with embeddings, a non-default
    distance strategy, and search."""
    texts = ["foo", "bar", "baz"]
    text_embeddings = FakeEmbeddingsWithAdaDimension().embed_documents(texts)
    text_embedding_pairs = list(zip(texts, text_embeddings))
    docsearch = Lantern.from_embeddings(
        text_embeddings=text_embedding_pairs,
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        connection_string=CONNECTION_STRING,
        distance_strategy="hamming",
        pre_delete_collection=True,
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]


def test_lantern_with_metadatas() -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Lantern.from_texts(
        texts=texts,
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=metadatas,
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo", metadata={"page": "0"})]


def test_lantern_with_metadatas_with_scores() -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Lantern.from_texts(
        texts=texts,
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=metadatas,
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    output = fix_distance_precision(docsearch.similarity_search_with_score("foo", k=1))
    assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)]


def test_lantern_with_filter_match() -> None:
    """Test search with a metadata filter that matches the nearest document."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Lantern.from_texts(
        texts=texts,
        collection_name="test_collection_filter",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=metadatas,
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    output = fix_distance_precision(
        docsearch.similarity_search_with_score("foo", k=1, filter={"page": "0"})
    )
    assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)]


def test_lantern_with_filter_distant_match() -> None:
    """Test search with a metadata filter that matches only a more distant document."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Lantern.from_texts(
        texts=texts,
        collection_name="test_collection_filter",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=metadatas,
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    output = fix_distance_precision(
        docsearch.similarity_search_with_score("foo", k=1, filter={"page": "2"})
    )
    # "baz"'s true distance (~0.0013) rounds to 0.0 at the default precision of 2.
    assert output == [(Document(page_content="baz", metadata={"page": "2"}), 0.0)]


def test_lantern_with_filter_no_match() -> None:
    """Test search with a metadata filter that matches no documents."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Lantern.from_texts(
        texts=texts,
        collection_name="test_collection_filter",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=metadatas,
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    output = docsearch.similarity_search_with_score("foo", k=1, filter={"page": "5"})
    assert output == []
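

# The filters above are exact-match dicts; the test below exercises the
# operator form {"IN": [...]} for set membership.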


def test_lantern_with_filter_in_set() -> None:
    """Test search with an `IN` set-membership metadata filter."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Lantern.from_texts(
        texts=texts,
        collection_name="test_collection_filter",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=metadatas,
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    output = fix_distance_precision(
        docsearch.similarity_search_with_score(
            "foo", k=2, filter={"page": {"IN": ["0", "2"]}}
        ),
        4,
    )
    # At precision 4, "baz"'s cosine distance survives rounding:
    # 1 - 1535 / sqrt(1535 * 1539) ≈ 0.0013.
    assert output == [
        (Document(page_content="foo", metadata={"page": "0"}), 0.0),
        (Document(page_content="baz", metadata={"page": "2"}), 0.0013),
    ]


def test_lantern_delete_docs() -> None:
    """Add and delete documents."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Lantern.from_texts(
        texts=texts,
        collection_name="test_collection_filter",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=metadatas,
        ids=["1", "2", "3"],
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    docsearch.delete(["1", "2", "3"])
    output = docsearch.similarity_search("foo", k=3)
    assert output == []


def test_lantern_relevance_score() -> None:
    """Test to make sure the relevance score is scaled to 0-1."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Lantern.from_texts(
        texts=texts,
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=metadatas,
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )

    # Relevance is 1 - distance (cosine here), so the exact match scores 1.0
    # and scores decrease with the document index.
    output = fix_distance_precision(
        docsearch.similarity_search_with_relevance_scores("foo", k=3), 4
    )
    assert output == [
        (Document(page_content="foo", metadata={"page": "0"}), 1.0),
        (Document(page_content="bar", metadata={"page": "1"}), 0.9997),
        (Document(page_content="baz", metadata={"page": "2"}), 0.9987),
    ]


def test_lantern_retriever_search_threshold() -> None:
    """Test using retriever for searching with threshold."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Lantern.from_texts(
        texts=texts,
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=metadatas,
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )

    retriever = docsearch.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k": 3, "score_threshold": 0.999},
    )
    output = retriever.get_relevant_documents("summer")
    assert output == [
        Document(page_content="foo", metadata={"page": "0"}),
        Document(page_content="bar", metadata={"page": "1"}),
    ]


def test_lantern_retriever_search_threshold_custom_normalization_fn() -> None:
    """Test searching with threshold and a custom normalization function."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Lantern.from_texts(
        texts=texts,
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=metadatas,
        connection_string=CONNECTION_STRING,
        relevance_score_fn=lambda d: d * 0,
        pre_delete_collection=True,
    )

    retriever = docsearch.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k": 3, "score_threshold": 0.9999},
    )
    output = retriever.get_relevant_documents("foo")
    assert output == [
        Document(page_content="foo", metadata={"page": "0"}),
    ]
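

# MMR (max marginal relevance) fetches the fetch_k nearest candidates, then
# re-ranks them to trade off query similarity against diversity among results.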


def test_lantern_max_marginal_relevance_search() -> None:
    """Test max marginal relevance search."""
    texts = ["foo", "bar", "baz"]
    docsearch = Lantern.from_texts(
        texts=texts,
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    output = docsearch.max_marginal_relevance_search("foo", k=1, fetch_k=3)
    assert output == [Document(page_content="foo")]


def test_lantern_max_marginal_relevance_search_with_score() -> None:
    """Test max marginal relevance search with relevance scores."""
    texts = ["foo", "bar", "baz"]
    docsearch = Lantern.from_texts(
        texts=texts,
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    output = fix_distance_precision(
        docsearch.max_marginal_relevance_search_with_score("foo", k=1, fetch_k=3)
    )
    assert output == [(Document(page_content="foo"), 0.0)]
@@ -29,6 +29,7 @@ _EXPECTED = [
     "FAISS",
     "Hologres",
     "LanceDB",
+    "Lantern",
     "LLMRails",
     "Marqo",
     "MatchingEngine",