community[patch]: DuckDB VS - expose similarity, improve performance of from_texts (#20971)

3 fixes of DuckDB vector store: - unify defaults in constructor and from_texts (users no longer have to specify `vector_key`). - include search similarity into output metadata (fixes #20969) - significantly improve performance of `from_documents` Dependencies: added Pandas to speed up `from_documents`. I was thinking about CSV and JSON options, but I expect trouble loading JSON values this way and also CSV and JSON options require storing data to disk. Anyway, the poetry file for langchain-community already contains a dependency on Pandas. --------- Co-authored-by: Bagatur <baskaryan@gmail.com> Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: ccurme <chester.curme@gmail.com>
2025-09-26 13:59:49 +00:00 · 2024-05-25 00:17:52 +02:00
parent 42207f5bef
commit cccc8fbe2f
2 changed files with 60 additions and 22 deletions
--- a/docs/docs/integrations/vectorstores/duckdb.ipynb
+++ b/docs/docs/integrations/vectorstores/duckdb.ipynb
@@ -14,7 +14,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "! pip install duckdb langchain-community"
+    "! pip install duckdb langchain langchain-community langchain-openai"
   ]
  },
  {
@@ -86,7 +86,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@@ -100,9 +100,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.2"
+   "version": "3.9.1"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
--- a/libs/community/langchain_community/vectorstores/duckdb.py
+++ b/libs/community/langchain_community/vectorstores/duckdb.py
@@ -2,13 +2,23 @@
 from __future__ import annotations

 import json
+import logging
 import uuid
+import warnings
 from typing import Any, Iterable, List, Optional, Type

 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
 from langchain_core.vectorstores import VST, VectorStore

+logger = logging.getLogger(__name__)
+
+DEFAULT_VECTOR_KEY = "embedding"
+DEFAULT_ID_KEY = "id"
+DEFAULT_TEXT_KEY = "text"
+DEFAULT_TABLE_NAME = "embeddings"
+SIMILARITY_ALIAS = "similarity_score"
+

 class DuckDB(VectorStore):
    """`DuckDB` vector store.
@@ -76,10 +86,10 @@ class DuckDB(VectorStore):
        *,
        connection: Optional[Any] = None,
        embedding: Embeddings,
-        vector_key: str = "embedding",
-        id_key: str = "id",
-        text_key: str = "text",
-        table_name: str = "vectorstore",
+        vector_key: str = DEFAULT_VECTOR_KEY,
+        id_key: str = DEFAULT_ID_KEY,
+        text_key: str = DEFAULT_TEXT_KEY,
+        table_name: str = DEFAULT_TABLE_NAME,
    ):
        """Initialize with DuckDB connection and setup for vector storage."""
        try:
@@ -100,8 +110,6 @@ class DuckDB(VectorStore):
            raise ValueError("An embedding function or model must be provided.")

        if connection is None:
-            import warnings
-
            warnings.warn(
                "No DuckDB connection provided. A new connection will be created."
                "This connection is running in memory and no data will be persisted."
@@ -138,6 +146,17 @@ class DuckDB(VectorStore):
        Returns:
            List of ids of the added texts.
        """
+        have_pandas = False
+        try:
+            import pandas as pd
+
+            have_pandas = True
+        except ImportError:
+            logger.info(
+                "Unable to import pandas. "
+                "Install it with `pip install -U pandas` "
+                "to improve performance of add_texts()."
+            )

        # Extract ids from kwargs or generate new ones if not provided
        ids = kwargs.pop("ids", [str(uuid.uuid4()) for _ in texts])
@@ -145,6 +164,7 @@ class DuckDB(VectorStore):
        # Embed texts and create documents
        ids = ids or [str(uuid.uuid4()) for _ in texts]
        embeddings = self._embedding.embed_documents(list(texts))
+        data = []
        for idx, text in enumerate(texts):
            embedding = embeddings[idx]
            # Serialize metadata if present, else default to None
@@ -153,9 +173,26 @@ class DuckDB(VectorStore):
                if metadatas and idx < len(metadatas)
                else None
            )
+            if have_pandas:
+                data.append(
+                    {
+                        self._id_key: ids[idx],
+                        self._text_key: text,
+                        self._vector_key: embedding,
+                        "metadata": metadata,
+                    }
+                )
+            else:
+                self._connection.execute(
+                    f"INSERT INTO {self._table_name} VALUES (?,?,?,?)",
+                    [ids[idx], text, embedding, metadata],
+                )
+
+        if have_pandas:
+            # noinspection PyUnusedLocal
+            df = pd.DataFrame.from_dict(data)  # noqa: F841
            self._connection.execute(
-                f"INSERT INTO {self._table_name} VALUES (?,?,?,?)",
-                [ids[idx], text, embedding, metadata],
+                f"INSERT INTO {self._table_name} SELECT * FROM df",
            )
        return ids

@@ -181,20 +218,21 @@ class DuckDB(VectorStore):
            self._table.select(
                *[
                    self.duckdb.StarExpression(exclude=[]),
-                    list_cosine_similarity.alias("similarity"),
+                    list_cosine_similarity.alias(SIMILARITY_ALIAS),
                ]
            )
-            .order("similarity desc")
+            .order(f"{SIMILARITY_ALIAS} desc")
            .limit(k)
-            .select(
-                self.duckdb.StarExpression(exclude=["similarity", self._vector_key])
-            )
            .fetchdf()
        )
        return [
            Document(
                page_content=docs[self._text_key][idx],
-                metadata=json.loads(docs["metadata"][idx])
+                metadata={
+                    **json.loads(docs["metadata"][idx]),
+                    # using underscore prefix to avoid conflicts with user metadata keys
+                    f"_{SIMILARITY_ALIAS}": docs[SIMILARITY_ALIAS][idx],
+                }
                if docs["metadata"][idx]
                else {},
            )
@@ -231,10 +269,10 @@ class DuckDB(VectorStore):

        # Extract kwargs for DuckDB instance creation
        connection = kwargs.get("connection", None)
-        vector_key = kwargs.get("vector_key", "vector")
-        id_key = kwargs.get("id_key", "id")
-        text_key = kwargs.get("text_key", "text")
-        table_name = kwargs.get("table_name", "embeddings")
+        vector_key = kwargs.get("vector_key", DEFAULT_VECTOR_KEY)
+        id_key = kwargs.get("id_key", DEFAULT_ID_KEY)
+        text_key = kwargs.get("text_key", DEFAULT_TEXT_KEY)
+        table_name = kwargs.get("table_name", DEFAULT_TABLE_NAME)

        # Create an instance of DuckDB
        instance = DuckDB(