mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-06 11:37:12 +00:00
community[patch]: DuckDB VS - expose similarity, improve performance of from_texts (#20971)
3 fixes of DuckDB vector store: - unify defaults in constructor and from_texts (users no longer have to specify `vector_key`). - include search similarity into output metadata (fixes #20969) - significantly improve performance of `from_documents` Dependencies: added Pandas to speed up `from_documents`. I was thinking about CSV and JSON options, but I expect trouble loading JSON values this way and also CSV and JSON options require storing data to disk. Anyway, the poetry file for langchain-community already contains a dependency on Pandas. --------- Co-authored-by: Bagatur <baskaryan@gmail.com> Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
parent
42207f5bef
commit
cccc8fbe2f
@ -14,7 +14,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! pip install duckdb langchain-community"
|
||||
"! pip install duckdb langchain langchain-community langchain-openai"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -86,7 +86,7 @@
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@ -100,9 +100,9 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.2"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
|
@ -2,13 +2,23 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
import warnings
|
||||
from typing import Any, Iterable, List, Optional, Type
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.vectorstores import VST, VectorStore
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_VECTOR_KEY = "embedding"
|
||||
DEFAULT_ID_KEY = "id"
|
||||
DEFAULT_TEXT_KEY = "text"
|
||||
DEFAULT_TABLE_NAME = "embeddings"
|
||||
SIMILARITY_ALIAS = "similarity_score"
|
||||
|
||||
|
||||
class DuckDB(VectorStore):
|
||||
"""`DuckDB` vector store.
|
||||
@ -76,10 +86,10 @@ class DuckDB(VectorStore):
|
||||
*,
|
||||
connection: Optional[Any] = None,
|
||||
embedding: Embeddings,
|
||||
vector_key: str = "embedding",
|
||||
id_key: str = "id",
|
||||
text_key: str = "text",
|
||||
table_name: str = "vectorstore",
|
||||
vector_key: str = DEFAULT_VECTOR_KEY,
|
||||
id_key: str = DEFAULT_ID_KEY,
|
||||
text_key: str = DEFAULT_TEXT_KEY,
|
||||
table_name: str = DEFAULT_TABLE_NAME,
|
||||
):
|
||||
"""Initialize with DuckDB connection and setup for vector storage."""
|
||||
try:
|
||||
@ -100,8 +110,6 @@ class DuckDB(VectorStore):
|
||||
raise ValueError("An embedding function or model must be provided.")
|
||||
|
||||
if connection is None:
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"No DuckDB connection provided. A new connection will be created."
|
||||
"This connection is running in memory and no data will be persisted."
|
||||
@ -138,6 +146,17 @@ class DuckDB(VectorStore):
|
||||
Returns:
|
||||
List of ids of the added texts.
|
||||
"""
|
||||
have_pandas = False
|
||||
try:
|
||||
import pandas as pd
|
||||
|
||||
have_pandas = True
|
||||
except ImportError:
|
||||
logger.info(
|
||||
"Unable to import pandas. "
|
||||
"Install it with `pip install -U pandas` "
|
||||
"to improve performance of add_texts()."
|
||||
)
|
||||
|
||||
# Extract ids from kwargs or generate new ones if not provided
|
||||
ids = kwargs.pop("ids", [str(uuid.uuid4()) for _ in texts])
|
||||
@ -145,6 +164,7 @@ class DuckDB(VectorStore):
|
||||
# Embed texts and create documents
|
||||
ids = ids or [str(uuid.uuid4()) for _ in texts]
|
||||
embeddings = self._embedding.embed_documents(list(texts))
|
||||
data = []
|
||||
for idx, text in enumerate(texts):
|
||||
embedding = embeddings[idx]
|
||||
# Serialize metadata if present, else default to None
|
||||
@ -153,9 +173,26 @@ class DuckDB(VectorStore):
|
||||
if metadatas and idx < len(metadatas)
|
||||
else None
|
||||
)
|
||||
if have_pandas:
|
||||
data.append(
|
||||
{
|
||||
self._id_key: ids[idx],
|
||||
self._text_key: text,
|
||||
self._vector_key: embedding,
|
||||
"metadata": metadata,
|
||||
}
|
||||
)
|
||||
else:
|
||||
self._connection.execute(
|
||||
f"INSERT INTO {self._table_name} VALUES (?,?,?,?)",
|
||||
[ids[idx], text, embedding, metadata],
|
||||
)
|
||||
|
||||
if have_pandas:
|
||||
# noinspection PyUnusedLocal
|
||||
df = pd.DataFrame.from_dict(data) # noqa: F841
|
||||
self._connection.execute(
|
||||
f"INSERT INTO {self._table_name} VALUES (?,?,?,?)",
|
||||
[ids[idx], text, embedding, metadata],
|
||||
f"INSERT INTO {self._table_name} SELECT * FROM df",
|
||||
)
|
||||
return ids
|
||||
|
||||
@ -181,20 +218,21 @@ class DuckDB(VectorStore):
|
||||
self._table.select(
|
||||
*[
|
||||
self.duckdb.StarExpression(exclude=[]),
|
||||
list_cosine_similarity.alias("similarity"),
|
||||
list_cosine_similarity.alias(SIMILARITY_ALIAS),
|
||||
]
|
||||
)
|
||||
.order("similarity desc")
|
||||
.order(f"{SIMILARITY_ALIAS} desc")
|
||||
.limit(k)
|
||||
.select(
|
||||
self.duckdb.StarExpression(exclude=["similarity", self._vector_key])
|
||||
)
|
||||
.fetchdf()
|
||||
)
|
||||
return [
|
||||
Document(
|
||||
page_content=docs[self._text_key][idx],
|
||||
metadata=json.loads(docs["metadata"][idx])
|
||||
metadata={
|
||||
**json.loads(docs["metadata"][idx]),
|
||||
# using underscore prefix to avoid conflicts with user metadata keys
|
||||
f"_{SIMILARITY_ALIAS}": docs[SIMILARITY_ALIAS][idx],
|
||||
}
|
||||
if docs["metadata"][idx]
|
||||
else {},
|
||||
)
|
||||
@ -231,10 +269,10 @@ class DuckDB(VectorStore):
|
||||
|
||||
# Extract kwargs for DuckDB instance creation
|
||||
connection = kwargs.get("connection", None)
|
||||
vector_key = kwargs.get("vector_key", "vector")
|
||||
id_key = kwargs.get("id_key", "id")
|
||||
text_key = kwargs.get("text_key", "text")
|
||||
table_name = kwargs.get("table_name", "embeddings")
|
||||
vector_key = kwargs.get("vector_key", DEFAULT_VECTOR_KEY)
|
||||
id_key = kwargs.get("id_key", DEFAULT_ID_KEY)
|
||||
text_key = kwargs.get("text_key", DEFAULT_TEXT_KEY)
|
||||
table_name = kwargs.get("table_name", DEFAULT_TABLE_NAME)
|
||||
|
||||
# Create an instance of DuckDB
|
||||
instance = DuckDB(
|
||||
|
Loading…
Reference in New Issue
Block a user