From f9bb5ec5d0e0db27782db9b896fe7adea23ff936 Mon Sep 17 00:00:00 2001 From: Dhruvajyoti Sarma Date: Fri, 4 Apr 2025 21:49:18 +0530 Subject: [PATCH] feature: removed pandas dataframe dependency for similarity_search when using DuckDB as vector store (#30445) - [ ] **PR title**: "community: Removes pandas dependency for using DuckDB for similarity search" - [ ] **PR message**: - **Description:** Removes pandas dependency for using DuckDB for similarity search. The old function still exists as `similarity_search_pd`, while the new one is at `similarity_search` and requires no code changes. Return format remains the same. - **Issue:** Issue #29933 and update on PR #30435 - **Dependencies:** No dependencies --- .../vectorstores/duckdb.py | 54 ++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/libs/community/langchain_community/vectorstores/duckdb.py b/libs/community/langchain_community/vectorstores/duckdb.py index a11bf87071e..89b4a2f576a 100644 --- a/libs/community/langchain_community/vectorstores/duckdb.py +++ b/libs/community/langchain_community/vectorstores/duckdb.py @@ -18,6 +18,9 @@ DEFAULT_ID_KEY = "id" DEFAULT_TEXT_KEY = "text" DEFAULT_TABLE_NAME = "embeddings" SIMILARITY_ALIAS = "similarity_score" +DUCKDB_FETCHALL_PAGE_CONTENT_INDEX = 1 +DUCKDB_FETCHALL_METADATA_INDEX = 3 +DUCKDB_FETCHALL_SIMILARITY_SCORE_INDEX = 4 class DuckDB(VectorStore): @@ -198,10 +201,12 @@ class DuckDB(VectorStore): ) return ids - def similarity_search( + def similarity_search_pd( self, query: str, k: int = 4, **kwargs: Any ) -> List[Document]: """Performs a similarity search for a given query string. + Requires pandas to be installed. + This was the previously executed method for similarity search. Args: query: The query string to search for. 
def similarity_search(
    self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
    """Return the stored documents closest to ``query``, without pandas.

    The query is embedded with the store's embedding object, each row is
    scored with DuckDB's ``list_cosine_similarity`` between the vector
    column and that embedding, and the ``k`` highest-scoring rows are read
    back as plain tuples through ``fetchall()``.

    Args:
        query: The query string to search for.
        k: The number of similar texts to return.
        **kwargs: Accepted but not used by this method.

    Returns:
        A list of Documents most similar to the query, highest score first.
        Each document's metadata is the row's JSON-decoded metadata plus the
        score stored under ``SIMILARITY_ALIAS`` prefixed with an underscore.
        A row whose metadata value is empty yields an empty metadata dict,
        so it carries no score.
    """
    query_vector = self._embedding.embed_query(query)  # type: ignore
    duck = self.duckdb

    # Per-row score: cosine similarity of the stored vector and the query.
    score_expr = duck.FunctionExpression(
        "list_cosine_similarity",
        duck.ColumnExpression(self._vector_key),
        duck.ConstantExpression(query_vector),
    )

    # Every table column followed by the aliased score, best matches first.
    ranked = self._table.select(
        duck.StarExpression(exclude=[]),
        score_expr.alias(SIMILARITY_ALIAS),
    ).order(f"{SIMILARITY_ALIAS} desc")
    rows = ranked.limit(k).fetchall()

    results: List[Document] = []
    for row in rows:
        # fetchall() rows are tuples; fields are picked by fixed position.
        text = row[DUCKDB_FETCHALL_PAGE_CONTENT_INDEX]
        raw_metadata = row[DUCKDB_FETCHALL_METADATA_INDEX]
        if raw_metadata:
            metadata = {
                **json.loads(raw_metadata),
                # using underscore prefix to avoid conflicts with user metadata keys
                f"_{SIMILARITY_ALIAS}": row[DUCKDB_FETCHALL_SIMILARITY_SCORE_INDEX],
            }
        else:
            metadata = {}
        results.append(Document(page_content=text, metadata=metadata))
    return results