feature: removed pandas dataframe dependency for similarity_search when using DuckDB as vector store (#30445)

- [ ] **PR title**: "community: Removes pandas dependency for using DuckDB for similarity search"
- [ ] **PR message**:
  - **Description:** Removes the pandas dependency when using DuckDB for similarity search. The previous pandas-based implementation still exists as `similarity_search_pd`, while the new pandas-free implementation is `similarity_search`, so existing callers require no code changes. The return format remains the same.
  - **Issue:** Issue #29933 and update on PR #30435
  - **Dependencies:** No dependencies
2025-06-24 15:43:54 +00:00 · 2025-04-04 21:49:18 +05:30 · 2025-04-04 21:49:18 +05:30 · f9bb5ec5d0
commit f9bb5ec5d0
parent f79473b752
1 changed file with 53 additions and 1 deletion
--- a/libs/community/langchain_community/vectorstores/duckdb.py
+++ b/libs/community/langchain_community/vectorstores/duckdb.py
@ -18,6 +18,9 @@ DEFAULT_ID_KEY = "id"
 DEFAULT_TEXT_KEY = "text"
 DEFAULT_TABLE_NAME = "embeddings"
 SIMILARITY_ALIAS = "similarity_score"
+DUCKDB_FETCHALL_PAGE_CONTENT_INDEX = 1  # row position read as page_content
+DUCKDB_FETCHALL_METADATA_INDEX = 3  # row position of the JSON metadata string
+DUCKDB_FETCHALL_SIMILARITY_SCORE_INDEX = 4  # score; assumes 4 table columns - TODO confirm


 class DuckDB(VectorStore):
@ -198,10 +201,12 @@ class DuckDB(VectorStore):
            )
        return ids

-    def similarity_search(
+    def similarity_search_pd(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Performs a similarity search for a given query string.
+        Requires pandas to be installed.
+        This was the previously executed method for similarity search.

        Args:
            query: The query string to search for.
@ -246,6 +251,53 @@ class DuckDB(VectorStore):
            for idx in range(len(docs))
        ]

+    def similarity_search(
+        self, query: str, k: int = 4, **kwargs: Any  # kwargs is not used in this body
+    ) -> List[Document]:
+        """Performs a similarity search for a given query string.
+        Does not require pandas to be installed.
+
+        Args:
+            query: The query string to search for.
+            k: The number of similar texts to return.
+
+        Returns:
+            A list of Documents most similar to the query.
+        """
+        # Rows from fetchall() are read by position: table columns, then the score.
+        embedding = self._embedding.embed_query(query)  # type: ignore
+        list_cosine_similarity = self.duckdb.FunctionExpression(
+            "list_cosine_similarity",
+            self.duckdb.ColumnExpression(self._vector_key),
+            self.duckdb.ConstantExpression(embedding),
+        )
+        docs = (
+            self._table.select(
+                *[
+                    self.duckdb.StarExpression(exclude=[]),
+                    list_cosine_similarity.alias(SIMILARITY_ALIAS),
+                ]
+            )
+            .order(f"{SIMILARITY_ALIAS} desc")  # highest similarity first
+            .limit(k)
+            .fetchall()
+        )
+        return [
+            Document(
+                page_content=docs[idx][DUCKDB_FETCHALL_PAGE_CONTENT_INDEX],
+                metadata={
+                    **json.loads(docs[idx][DUCKDB_FETCHALL_METADATA_INDEX]),
+                    # underscore prefix keeps the score from clashing with user metadata keys
+                    f"_{SIMILARITY_ALIAS}": docs[idx][
+                        DUCKDB_FETCHALL_SIMILARITY_SCORE_INDEX
+                    ],
+                }
+                if docs[idx][DUCKDB_FETCHALL_METADATA_INDEX]
+                else {},  # falsy metadata: empty dict, so no score key either
+            )
+            for idx in range(len(docs))
+        ]
+
    @classmethod
    def from_texts(
        cls: Type[VST],