feature: removed pandas dataframe dependency for similarity_search when using DuckDB as vector store (#30445)

- [ ] **PR title**: "community: Removes pandas dependency for using
DuckDB for similarity search"


- [ ] **PR message**: 
- **Description:** Removes the pandas dependency from similarity search
when DuckDB is used as the vector store. The previous pandas-based
implementation is still available as `similarity_search_pd`; the new
pandas-free implementation lives at `similarity_search`, so callers need
no code changes. The return format remains the same.
    - **Issue:** Issue #29933 and update on PR #30435 
    - **Dependencies:** No dependencies
This commit is contained in:
Dhruvajyoti Sarma 2025-04-04 21:49:18 +05:30 committed by GitHub
parent f79473b752
commit f9bb5ec5d0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -18,6 +18,9 @@ DEFAULT_ID_KEY = "id"
DEFAULT_TEXT_KEY = "text"
DEFAULT_TABLE_NAME = "embeddings"
# Alias given to the computed cosine-similarity column in search queries.
SIMILARITY_ALIAS = "similarity_score"
# Positions inside each row tuple that similarity_search reads back from
# fetchall(). Each row is every table column (star select) followed by the
# SIMILARITY_ALIAS column.
# NOTE(review): these values assume a four-column table with the text in the
# second column and the metadata in the fourth -- confirm against the table
# creation statement before changing the schema.
DUCKDB_FETCHALL_PAGE_CONTENT_INDEX = 1
DUCKDB_FETCHALL_METADATA_INDEX = 3
DUCKDB_FETCHALL_SIMILARITY_SCORE_INDEX = 4
class DuckDB(VectorStore):
@ -198,10 +201,12 @@ class DuckDB(VectorStore):
)
return ids
def similarity_search(
def similarity_search_pd(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
"""Performs a similarity search for a given query string.
Requires pandas to be installed.
This was the previously executed method for similarity search.
Args:
query: The query string to search for.
@ -246,6 +251,53 @@ class DuckDB(VectorStore):
for idx in range(len(docs))
]
def similarity_search(
    self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
    """Return the stored documents most similar to ``query``.

    The query is embedded, every row of the table is scored with DuckDB's
    ``list_cosine_similarity`` against the stored vectors, and the best
    ``k`` rows are read back as plain tuples via ``fetchall()``. No pandas
    DataFrame is involved.

    Args:
        query: The query string to search for.
        k: The maximum number of documents to return.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A list of Documents ordered by descending similarity. When a row
        has stored metadata, the document's metadata is the decoded JSON
        plus a ``_similarity_score`` entry; when the stored metadata is
        empty or missing, the document's metadata is an empty dict.
    """
    query_vector = self._embedding.embed_query(query)  # type: ignore

    # Expression scoring each stored vector against the query vector.
    similarity = self.duckdb.FunctionExpression(
        "list_cosine_similarity",
        self.duckdb.ColumnExpression(self._vector_key),
        self.duckdb.ConstantExpression(query_vector),
    )
    all_columns = self.duckdb.StarExpression(exclude=[])
    scored = similarity.alias(SIMILARITY_ALIAS)

    # Every table column first, then the score, so the fixed row indices
    # used below line up with the tuples returned by fetchall().
    ranked = self._table.select(all_columns, scored)
    ranked = ranked.order(f"{SIMILARITY_ALIAS} desc")
    rows = ranked.limit(k).fetchall()

    results: List[Document] = []
    for row in rows:
        text = row[DUCKDB_FETCHALL_PAGE_CONTENT_INDEX]
        raw_metadata = row[DUCKDB_FETCHALL_METADATA_INDEX]
        if raw_metadata:
            metadata = {
                **json.loads(raw_metadata),
                # The leading underscore keeps the score from colliding
                # with metadata keys supplied by the user.
                f"_{SIMILARITY_ALIAS}": row[
                    DUCKDB_FETCHALL_SIMILARITY_SCORE_INDEX
                ],
            }
        else:
            metadata = {}
        results.append(Document(page_content=text, metadata=metadata))
    return results
@classmethod
def from_texts(
cls: Type[VST],