community[minor]: Add DuckDB as a vectorstore (#18916)

DuckDB provides a cosine similarity function over its list and array data types, which lets it serve as a vector store.
- **Description:** The latest version of DuckDB ships a cosine similarity function that works with its list and array column types. This PR surfaces that functionality to LangChain as a `DuckDB` vector store; a minimal sketch of the underlying DuckDB call follows this list.
- **Dependencies:** duckdb 0.10.0
- **Twitter handle:** @igocrite
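
For context, a minimal sketch of the DuckDB feature this wraps, assuming duckdb >= 0.10.0 and its `list_cosine_similarity` function; the table name and vectors below are illustrative and not taken from this PR:

```python
import duckdb

# In-memory database; any DuckDB connection should behave the same way.
conn = duckdb.connect(":memory:")
conn.execute("CREATE TABLE docs (id INTEGER, text VARCHAR, embedding FLOAT[])")
conn.execute(
    "INSERT INTO docs VALUES"
    " (1, 'text 1', [0.1, 0.2, 0.3]),"
    " (2, 'text 2', [0.9, 0.1, 0.0])"
)

# Rank rows by cosine similarity between the stored list column and a
# query vector, using the list_cosine_similarity function added in 0.10.0.
rows = conn.execute(
    """
    SELECT text,
           list_cosine_similarity(embedding, [0.1, 0.2, 0.25]::FLOAT[]) AS similarity
    FROM docs
    ORDER BY similarity DESC
    """
).fetchall()
print(rows)
```

The tests further down exercise the LangChain-facing API built on top of this.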

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Authored by Hugoberry on 2024-03-25 07:02:35 +00:00, committed by GitHub
commit 96dc180883 (parent fa6397d76a)
5 changed files with 533 additions and 0 deletions


@@ -0,0 +1,160 @@
from typing import Dict, Iterator, List
from uuid import uuid4

import duckdb
import pytest

from langchain_community.vectorstores import DuckDB
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings


@pytest.fixture
def duckdb_connection() -> Iterator[duckdb.DuckDBPyConnection]:
    # Set up a temporary in-memory DuckDB database
    conn = duckdb.connect(":memory:")
    yield conn
    conn.close()


@pytest.fixture
def embeddings() -> FakeEmbeddings:
    return FakeEmbeddings()


@pytest.fixture
def texts() -> List[str]:
    return ["text 1", "text 2", "item 3"]


@pytest.fixture
def metadatas() -> List[Dict[str, str]]:
    return [
        {"source": "Document 1"},
        {"source": "Document 2"},
        {"source": "Document 3"},
    ]


@pytest.mark.requires("duckdb")
def test_duckdb_with_connection(
    duckdb_connection: duckdb.DuckDBPyConnection,
    embeddings: FakeEmbeddings,
    texts: List[str],
) -> None:
    store = DuckDB(
        connection=duckdb_connection, embedding=embeddings, table_name="test_table"
    )
    store.add_texts(texts)
    result = store.similarity_search("text 1")
    result_texts = [doc.page_content for doc in result]
    assert "text 1" in result_texts


@pytest.mark.requires("duckdb")
def test_duckdb_without_connection(
    embeddings: FakeEmbeddings, texts: List[str]
) -> None:
    store = DuckDB(embedding=embeddings, table_name="test_table")
    store.add_texts(texts)
    result = store.similarity_search("text 1")
    result_texts = [doc.page_content for doc in result]
    assert "text 1" in result_texts


@pytest.mark.requires("duckdb")
def test_duckdb_add_texts(embeddings: FakeEmbeddings) -> None:
    store = DuckDB(embedding=embeddings, table_name="test_table")
    store.add_texts(["text 2"])
    result = store.similarity_search("text 2")
    result_texts = [doc.page_content for doc in result]
    assert "text 2" in result_texts


@pytest.mark.requires("duckdb")
def test_duckdb_add_texts_with_metadata(
    duckdb_connection: duckdb.DuckDBPyConnection, embeddings: FakeEmbeddings
) -> None:
    store = DuckDB(
        connection=duckdb_connection,
        embedding=embeddings,
        table_name="test_table_with_metadata",
    )
    texts = ["text with metadata 1", "text with metadata 2"]
    metadatas = [
        {"author": "Author 1", "date": "2021-01-01"},
        {"author": "Author 2", "date": "2021-02-01"},
    ]
    # Add texts along with their metadata
    store.add_texts(texts, metadatas=metadatas)
    # Perform a similarity search to retrieve the documents
    result = store.similarity_search("text with metadata", k=2)
    # Check if the metadata is correctly associated with the texts
    assert len(result) == 2, "Should return two results"
    assert (
        result[0].metadata.get("author") == "Author 1"
    ), "Metadata for Author 1 should be correctly retrieved"
    assert (
        result[0].metadata.get("date") == "2021-01-01"
    ), "Date for Author 1 should be correctly retrieved"
    assert (
        result[1].metadata.get("author") == "Author 2"
    ), "Metadata for Author 2 should be correctly retrieved"
    assert (
        result[1].metadata.get("date") == "2021-02-01"
    ), "Date for Author 2 should be correctly retrieved"


@pytest.mark.requires("duckdb")
def test_duckdb_add_texts_with_predefined_ids(
    duckdb_connection: duckdb.DuckDBPyConnection, embeddings: FakeEmbeddings
) -> None:
    store = DuckDB(
        connection=duckdb_connection,
        embedding=embeddings,
        table_name="test_table_predefined_ids",
    )
    texts = ["unique text 1", "unique text 2"]
    predefined_ids = [str(uuid4()), str(uuid4())]  # Generate unique IDs
    # Add texts with the predefined IDs
    store.add_texts(texts, ids=predefined_ids)
    # Perform a similarity search for each text and check if it's found
    for text in texts:
        result = store.similarity_search(text)
        found_texts = [doc.page_content for doc in result]
        assert (
            text in found_texts
        ), f"Text '{text}' was not found in the search results."


@pytest.mark.requires("duckdb")
def test_duckdb_from_texts(
    duckdb_connection: duckdb.DuckDBPyConnection,
    embeddings: FakeEmbeddings,
    texts: List[str],
    metadatas: List[Dict[str, str]],
) -> None:
    # Initialize DuckDB from texts using the from_texts class method
    store = DuckDB.from_texts(
        texts=texts,
        embedding=embeddings,
        metadatas=metadatas,
        connection=duckdb_connection,
        table_name="test_from_texts_table",
    )
    # Perform a similarity search to retrieve the documents
    query_text = "sample text"
    result = store.similarity_search(query_text, k=2)
    # Verify that the vector store was populated and can return results
    assert len(result) > 0, "Should return at least one result"
    # Optionally, check that metadata is correctly associated with the texts
    for doc in result:
        assert "source" in doc.metadata, "Document metadata should include 'source' key"


@@ -28,6 +28,7 @@ _EXPECTED = [
"DocArrayHnswSearch",
"DocArrayInMemorySearch",
"DocumentDBVectorSearch",
"DuckDB",
"ElasticKnnSearch",
"ElasticVectorSearch",
"ElasticsearchStore",