mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-24 15:43:54 +00:00
feature: removed pandas dataframe dependency for similarity_search when using DuckDB as vector store (#30445)
- [ ] **PR title**: "community: Removes pandas dependency for using DuckDB for similarity search" - [ ] **PR message**: - **Description:** Removes pandas dependency for using DuckDB for similarity search. The old function still exists as `similarity_search_pd`, while the new one is at `similarity_search` and requires no code changes. Return format remains the same. - **Issue:** Issue #29933 and update on PR #30435 - **Dependencies:** No dependencies
This commit is contained in:
parent
f79473b752
commit
f9bb5ec5d0
@ -18,6 +18,9 @@ DEFAULT_ID_KEY = "id"
|
||||
DEFAULT_TEXT_KEY = "text"
|
||||
DEFAULT_TABLE_NAME = "embeddings"
|
||||
SIMILARITY_ALIAS = "similarity_score"
|
||||
# Positions used to index each row tuple returned by ``fetchall()`` in
# ``DuckDB.similarity_search``.
# NOTE(review): these assume a fixed column order in the table, with the
# similarity alias selected after all table columns — confirm against the
# table schema.
# Position of the value used as ``Document.page_content``.
DUCKDB_FETCHALL_PAGE_CONTENT_INDEX = 1
|
||||
# Position of the metadata value, which is decoded with ``json.loads``.
DUCKDB_FETCHALL_METADATA_INDEX = 3
|
||||
# Position of the value stored under the ``_similarity_score`` metadata key.
DUCKDB_FETCHALL_SIMILARITY_SCORE_INDEX = 4
|
||||
|
||||
|
||||
class DuckDB(VectorStore):
|
||||
@ -198,10 +201,12 @@ class DuckDB(VectorStore):
|
||||
)
|
||||
return ids
|
||||
|
||||
def similarity_search(
|
||||
def similarity_search_pd(
|
||||
self, query: str, k: int = 4, **kwargs: Any
|
||||
) -> List[Document]:
|
||||
"""Performs a similarity search for a given query string.
|
||||
Requires pandas to be installed.
|
||||
This was the previously executed method for similarity search.
|
||||
|
||||
Args:
|
||||
query: The query string to search for.
|
||||
@ -246,6 +251,53 @@ class DuckDB(VectorStore):
|
||||
for idx in range(len(docs))
|
||||
]
|
||||
|
||||
def similarity_search(
    self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
    """Performs a similarity search for a given query string.

    Does not require pandas to be installed: result rows are read back as
    plain tuples via ``fetchall()``.

    Args:
        query: The query string to search for.
        k: The number of similar texts to return.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A list of Documents most similar to the query, ordered by
        descending ``list_cosine_similarity`` score. When a row has
        metadata, the score is added to it under the
        ``_similarity_score`` key; rows whose metadata value is empty or
        NULL get an empty metadata dict (without the score).
    """
    embedding = self._embedding.embed_query(query)  # type: ignore
    list_cosine_similarity = self.duckdb.FunctionExpression(
        "list_cosine_similarity",
        self.duckdb.ColumnExpression(self._vector_key),
        self.duckdb.ConstantExpression(embedding),
    )
    # Select every table column plus the computed score as the last column,
    # so the score lands at DUCKDB_FETCHALL_SIMILARITY_SCORE_INDEX.
    rows = (
        self._table.select(
            self.duckdb.StarExpression(exclude=[]),
            list_cosine_similarity.alias(SIMILARITY_ALIAS),
        )
        .order(f"{SIMILARITY_ALIAS} desc")
        .limit(k)
        .fetchall()
    )

    documents: List[Document] = []
    for row in rows:
        raw_metadata = row[DUCKDB_FETCHALL_METADATA_INDEX]
        if raw_metadata:
            metadata = {
                **json.loads(raw_metadata),
                # using underscore prefix to avoid conflicts with user metadata keys
                f"_{SIMILARITY_ALIAS}": row[DUCKDB_FETCHALL_SIMILARITY_SCORE_INDEX],
            }
        else:
            # Nothing to decode; the score is intentionally not attached
            # here to keep the existing return format.
            metadata = {}
        documents.append(
            Document(
                page_content=row[DUCKDB_FETCHALL_PAGE_CONTENT_INDEX],
                metadata=metadata,
            )
        )
    return documents
|
||||
|
||||
@classmethod
|
||||
def from_texts(
|
||||
cls: Type[VST],
|
||||
|
Loading…
Reference in New Issue
Block a user