mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-06 03:27:55 +00:00
community[patch]: DuckDB VS - expose similarity, improve performance of from_texts (#20971)
Three fixes for the DuckDB vector store: - unify the defaults in the constructor and `from_texts` (users no longer have to specify `vector_key`). - include the search similarity in the output metadata (fixes #20969). - significantly improve the performance of `from_documents`. Dependencies: added Pandas to speed up `from_documents`. I considered CSV and JSON options, but I expect trouble loading JSON values this way, and both the CSV and JSON options require storing data to disk. In any case, the poetry file for langchain-community already contains a dependency on Pandas. --------- Co-authored-by: Bagatur <baskaryan@gmail.com> Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
parent
42207f5bef
commit
cccc8fbe2f
@ -14,7 +14,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"! pip install duckdb langchain-community"
|
"! pip install duckdb langchain langchain-community langchain-openai"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -86,7 +86,7 @@
|
|||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3",
|
"display_name": "Python 3 (ipykernel)",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -100,9 +100,9 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.12.2"
|
"version": "3.9.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
"nbformat_minor": 2
|
"nbformat_minor": 4
|
||||||
}
|
}
|
||||||
|
@ -2,13 +2,23 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import uuid
|
import uuid
|
||||||
|
import warnings
|
||||||
from typing import Any, Iterable, List, Optional, Type
|
from typing import Any, Iterable, List, Optional, Type
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from langchain_core.embeddings import Embeddings
|
from langchain_core.embeddings import Embeddings
|
||||||
from langchain_core.vectorstores import VST, VectorStore
|
from langchain_core.vectorstores import VST, VectorStore
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
DEFAULT_VECTOR_KEY = "embedding"
|
||||||
|
DEFAULT_ID_KEY = "id"
|
||||||
|
DEFAULT_TEXT_KEY = "text"
|
||||||
|
DEFAULT_TABLE_NAME = "embeddings"
|
||||||
|
SIMILARITY_ALIAS = "similarity_score"
|
||||||
|
|
||||||
|
|
||||||
class DuckDB(VectorStore):
|
class DuckDB(VectorStore):
|
||||||
"""`DuckDB` vector store.
|
"""`DuckDB` vector store.
|
||||||
@ -76,10 +86,10 @@ class DuckDB(VectorStore):
|
|||||||
*,
|
*,
|
||||||
connection: Optional[Any] = None,
|
connection: Optional[Any] = None,
|
||||||
embedding: Embeddings,
|
embedding: Embeddings,
|
||||||
vector_key: str = "embedding",
|
vector_key: str = DEFAULT_VECTOR_KEY,
|
||||||
id_key: str = "id",
|
id_key: str = DEFAULT_ID_KEY,
|
||||||
text_key: str = "text",
|
text_key: str = DEFAULT_TEXT_KEY,
|
||||||
table_name: str = "vectorstore",
|
table_name: str = DEFAULT_TABLE_NAME,
|
||||||
):
|
):
|
||||||
"""Initialize with DuckDB connection and setup for vector storage."""
|
"""Initialize with DuckDB connection and setup for vector storage."""
|
||||||
try:
|
try:
|
||||||
@ -100,8 +110,6 @@ class DuckDB(VectorStore):
|
|||||||
raise ValueError("An embedding function or model must be provided.")
|
raise ValueError("An embedding function or model must be provided.")
|
||||||
|
|
||||||
if connection is None:
|
if connection is None:
|
||||||
import warnings
|
|
||||||
|
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"No DuckDB connection provided. A new connection will be created."
|
"No DuckDB connection provided. A new connection will be created."
|
||||||
"This connection is running in memory and no data will be persisted."
|
"This connection is running in memory and no data will be persisted."
|
||||||
@ -138,6 +146,17 @@ class DuckDB(VectorStore):
|
|||||||
Returns:
|
Returns:
|
||||||
List of ids of the added texts.
|
List of ids of the added texts.
|
||||||
"""
|
"""
|
||||||
|
have_pandas = False
|
||||||
|
try:
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
have_pandas = True
|
||||||
|
except ImportError:
|
||||||
|
logger.info(
|
||||||
|
"Unable to import pandas. "
|
||||||
|
"Install it with `pip install -U pandas` "
|
||||||
|
"to improve performance of add_texts()."
|
||||||
|
)
|
||||||
|
|
||||||
# Extract ids from kwargs or generate new ones if not provided
|
# Extract ids from kwargs or generate new ones if not provided
|
||||||
ids = kwargs.pop("ids", [str(uuid.uuid4()) for _ in texts])
|
ids = kwargs.pop("ids", [str(uuid.uuid4()) for _ in texts])
|
||||||
@ -145,6 +164,7 @@ class DuckDB(VectorStore):
|
|||||||
# Embed texts and create documents
|
# Embed texts and create documents
|
||||||
ids = ids or [str(uuid.uuid4()) for _ in texts]
|
ids = ids or [str(uuid.uuid4()) for _ in texts]
|
||||||
embeddings = self._embedding.embed_documents(list(texts))
|
embeddings = self._embedding.embed_documents(list(texts))
|
||||||
|
data = []
|
||||||
for idx, text in enumerate(texts):
|
for idx, text in enumerate(texts):
|
||||||
embedding = embeddings[idx]
|
embedding = embeddings[idx]
|
||||||
# Serialize metadata if present, else default to None
|
# Serialize metadata if present, else default to None
|
||||||
@ -153,9 +173,26 @@ class DuckDB(VectorStore):
|
|||||||
if metadatas and idx < len(metadatas)
|
if metadatas and idx < len(metadatas)
|
||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
|
if have_pandas:
|
||||||
|
data.append(
|
||||||
|
{
|
||||||
|
self._id_key: ids[idx],
|
||||||
|
self._text_key: text,
|
||||||
|
self._vector_key: embedding,
|
||||||
|
"metadata": metadata,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self._connection.execute(
|
||||||
|
f"INSERT INTO {self._table_name} VALUES (?,?,?,?)",
|
||||||
|
[ids[idx], text, embedding, metadata],
|
||||||
|
)
|
||||||
|
|
||||||
|
if have_pandas:
|
||||||
|
# noinspection PyUnusedLocal
|
||||||
|
df = pd.DataFrame.from_dict(data) # noqa: F841
|
||||||
self._connection.execute(
|
self._connection.execute(
|
||||||
f"INSERT INTO {self._table_name} VALUES (?,?,?,?)",
|
f"INSERT INTO {self._table_name} SELECT * FROM df",
|
||||||
[ids[idx], text, embedding, metadata],
|
|
||||||
)
|
)
|
||||||
return ids
|
return ids
|
||||||
|
|
||||||
@ -181,20 +218,21 @@ class DuckDB(VectorStore):
|
|||||||
self._table.select(
|
self._table.select(
|
||||||
*[
|
*[
|
||||||
self.duckdb.StarExpression(exclude=[]),
|
self.duckdb.StarExpression(exclude=[]),
|
||||||
list_cosine_similarity.alias("similarity"),
|
list_cosine_similarity.alias(SIMILARITY_ALIAS),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
.order("similarity desc")
|
.order(f"{SIMILARITY_ALIAS} desc")
|
||||||
.limit(k)
|
.limit(k)
|
||||||
.select(
|
|
||||||
self.duckdb.StarExpression(exclude=["similarity", self._vector_key])
|
|
||||||
)
|
|
||||||
.fetchdf()
|
.fetchdf()
|
||||||
)
|
)
|
||||||
return [
|
return [
|
||||||
Document(
|
Document(
|
||||||
page_content=docs[self._text_key][idx],
|
page_content=docs[self._text_key][idx],
|
||||||
metadata=json.loads(docs["metadata"][idx])
|
metadata={
|
||||||
|
**json.loads(docs["metadata"][idx]),
|
||||||
|
# using underscore prefix to avoid conflicts with user metadata keys
|
||||||
|
f"_{SIMILARITY_ALIAS}": docs[SIMILARITY_ALIAS][idx],
|
||||||
|
}
|
||||||
if docs["metadata"][idx]
|
if docs["metadata"][idx]
|
||||||
else {},
|
else {},
|
||||||
)
|
)
|
||||||
@ -231,10 +269,10 @@ class DuckDB(VectorStore):
|
|||||||
|
|
||||||
# Extract kwargs for DuckDB instance creation
|
# Extract kwargs for DuckDB instance creation
|
||||||
connection = kwargs.get("connection", None)
|
connection = kwargs.get("connection", None)
|
||||||
vector_key = kwargs.get("vector_key", "vector")
|
vector_key = kwargs.get("vector_key", DEFAULT_VECTOR_KEY)
|
||||||
id_key = kwargs.get("id_key", "id")
|
id_key = kwargs.get("id_key", DEFAULT_ID_KEY)
|
||||||
text_key = kwargs.get("text_key", "text")
|
text_key = kwargs.get("text_key", DEFAULT_TEXT_KEY)
|
||||||
table_name = kwargs.get("table_name", "embeddings")
|
table_name = kwargs.get("table_name", DEFAULT_TABLE_NAME)
|
||||||
|
|
||||||
# Create an instance of DuckDB
|
# Create an instance of DuckDB
|
||||||
instance = DuckDB(
|
instance = DuckDB(
|
||||||
|
Loading…
Reference in New Issue
Block a user