From f9bb5ec5d0e0db27782db9b896fe7adea23ff936 Mon Sep 17 00:00:00 2001
From: Dhruvajyoti Sarma <onlinedhruva71@gmail.com>
Date: Fri, 4 Apr 2025 21:49:18 +0530
Subject: [PATCH] feature: removed pandas dataframe dependency for
 similarity_search when using DuckDB as vector store (#30445)

- [ ] **PR title**: "community: Removes pandas dependency for using
DuckDB for similarity search"


- [ ] **PR message**:
    - **Description:** Removes the pandas dependency when using DuckDB for
      similarity search. The previous pandas-based implementation remains
      available as `similarity_search_pd`; the new pandas-free implementation
      takes over the `similarity_search` name, so callers need no code changes.
      The return format is unchanged.
    - **Issue:** Issue #29933 and update on PR #30435
    - **Dependencies:** No dependencies
---
 .../vectorstores/duckdb.py                    | 54 ++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/libs/community/langchain_community/vectorstores/duckdb.py b/libs/community/langchain_community/vectorstores/duckdb.py
index a11bf87071e..89b4a2f576a 100644
--- a/libs/community/langchain_community/vectorstores/duckdb.py
+++ b/libs/community/langchain_community/vectorstores/duckdb.py
@@ -18,6 +18,9 @@ DEFAULT_ID_KEY = "id"
 DEFAULT_TEXT_KEY = "text"
 DEFAULT_TABLE_NAME = "embeddings"
 SIMILARITY_ALIAS = "similarity_score"
+DUCKDB_FETCHALL_PAGE_CONTENT_INDEX = 1  # fetchall() row slot read as page_content
+DUCKDB_FETCHALL_METADATA_INDEX = 3  # fetchall() row slot holding the metadata JSON
+DUCKDB_FETCHALL_SIMILARITY_SCORE_INDEX = 4  # fetchall() row slot with the score
 
 
 class DuckDB(VectorStore):
@@ -198,10 +201,12 @@ class DuckDB(VectorStore):
             )
         return ids
 
-    def similarity_search(
+    def similarity_search_pd(
         self, query: str, k: int = 4, **kwargs: Any
     ) -> List[Document]:
         """Performs a similarity search for a given query string.
+        Requires pandas to be installed.
+        This was the previously executed method for similarity search.
 
         Args:
             query: The query string to search for.
@@ -246,6 +251,53 @@ class DuckDB(VectorStore):
             for idx in range(len(docs))
         ]
 
+    def similarity_search(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Document]:
+        """Return up to ``k`` documents most similar to ``query``.
+
+        Reads rows with ``fetchall()``, so pandas is not required. Fields are
+        picked by position using the ``DUCKDB_FETCHALL_*_INDEX`` constants.
+
+        Args:
+            query: The query string to search for.
+            k: The number of similar texts to return.
+
+        Returns:
+            Documents ordered by descending cosine similarity; each
+            non-empty metadata dict also carries ``_similarity_score``.
+        """
+        query_vector = self._embedding.embed_query(query)  # type: ignore
+        score = self.duckdb.FunctionExpression(
+            "list_cosine_similarity",
+            self.duckdb.ColumnExpression(self._vector_key),
+            self.duckdb.ConstantExpression(query_vector),
+        ).alias(SIMILARITY_ALIAS)
+        # Every table column first, then the score as the last column.
+        ranked = self._table.select(
+            self.duckdb.StarExpression(exclude=[]), score
+        ).order(f"{SIMILARITY_ALIAS} desc")
+        rows = ranked.limit(k).fetchall()
+
+        results: List[Document] = []
+        for row in rows:
+            page_content = row[DUCKDB_FETCHALL_PAGE_CONTENT_INDEX]
+            raw_metadata = row[DUCKDB_FETCHALL_METADATA_INDEX]
+            if raw_metadata:
+                metadata = {
+                    **json.loads(raw_metadata),
+                    # using underscore prefix to avoid conflicts with user
+                    # metadata keys
+                    f"_{SIMILARITY_ALIAS}": row[
+                        DUCKDB_FETCHALL_SIMILARITY_SCORE_INDEX
+                    ],
+                }
+            else:
+                # Rows without metadata get an empty dict (no score attached).
+                metadata = {}
+            results.append(Document(page_content=page_content, metadata=metadata))
+        return results
+
     @classmethod
     def from_texts(
         cls: Type[VST],