community[patch]: DuckDB VS - expose similarity, improve performance of from_texts (#20971)

Three fixes for the DuckDB vector store:
- unify the defaults used by the constructor and `from_texts` (users no longer have to
specify `vector_key`).
- include the search similarity score in the output document metadata (fixes #20969); see the usage sketch below.
- significantly improve the performance of `from_documents`.
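
A minimal usage sketch of the similarity fix (illustrative only: it relies on the default in-memory connection and uses `FakeEmbeddings` purely to keep the snippet self-contained; the underscore-prefixed `_similarity_score` key follows this patch):

```python
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores.duckdb import DuckDB

# Any Embeddings implementation works; FakeEmbeddings keeps the sketch runnable offline.
store = DuckDB.from_texts(
    ["ducks quack", "geese honk"],
    FakeEmbeddings(size=16),
    metadatas=[{"source": "a"}, {"source": "b"}],
)

for doc in store.similarity_search("quack", k=2):
    # With this patch the cosine similarity is attached to the document metadata
    # under an underscore-prefixed key, to avoid clashing with user-supplied keys.
    # Note: it is only attached to rows that were stored with metadata.
    print(doc.page_content, doc.metadata.get("_similarity_score"))
```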

Dependencies: pandas is now used to speed up `from_documents`.
I also considered CSV- and JSON-based loading, but I expect trouble loading JSON
values that way, and both options would require writing the data to disk first.
In any case, the poetry file for langchain-community already declares a
dependency on pandas.
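
For reference, the speedup works by collecting the whole batch into one pandas DataFrame and letting DuckDB's replacement scan read that DataFrame by its Python variable name, so everything is inserted with a single SQL statement instead of one `execute()` per row. A standalone sketch of the pattern (the schema below is illustrative, not the exact table the vector store creates):

```python
import duckdb
import pandas as pd

con = duckdb.connect(":memory:")
con.execute(
    "CREATE TABLE embeddings (id VARCHAR, text VARCHAR, embedding FLOAT[], metadata VARCHAR)"
)

# Accumulate the whole batch in memory first ...
df = pd.DataFrame(
    [
        {"id": "1", "text": "ducks quack", "embedding": [0.1, 0.2], "metadata": '{"source": "a"}'},
        {"id": "2", "text": "geese honk", "embedding": [0.3, 0.4], "metadata": '{"source": "b"}'},
    ]
)

# ... then insert it in one statement. DuckDB resolves the bare name `df`
# against the calling Python scope (a replacement scan); no explicit register() is needed.
con.execute("INSERT INTO embeddings SELECT * FROM df")
print(con.execute("SELECT id, text FROM embeddings").fetchall())
```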

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
Co-authored-by: ccurme <chester.curme@gmail.com>
Jan Soubusta, 2024-05-25 00:17:52 +02:00, committed by GitHub
parent 42207f5bef
commit cccc8fbe2f
2 changed files with 60 additions and 22 deletions

File 1: DuckDB vector store documentation notebook

@@ -14,7 +14,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "! pip install duckdb langchain-community"
+    "! pip install duckdb langchain langchain-community langchain-openai"
    ]
   },
   {
@@ -86,7 +86,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -100,9 +100,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.2"
+   "version": "3.9.1"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }

File 2: DuckDB vector store module (langchain_community)

@@ -2,13 +2,23 @@
 from __future__ import annotations
 
 import json
+import logging
 import uuid
+import warnings
 from typing import Any, Iterable, List, Optional, Type
 
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
 from langchain_core.vectorstores import VST, VectorStore
 
+logger = logging.getLogger(__name__)
+
+DEFAULT_VECTOR_KEY = "embedding"
+DEFAULT_ID_KEY = "id"
+DEFAULT_TEXT_KEY = "text"
+DEFAULT_TABLE_NAME = "embeddings"
+
+SIMILARITY_ALIAS = "similarity_score"
+
 
 class DuckDB(VectorStore):
     """`DuckDB` vector store.
@@ -76,10 +86,10 @@ class DuckDB(VectorStore):
         *,
         connection: Optional[Any] = None,
         embedding: Embeddings,
-        vector_key: str = "embedding",
-        id_key: str = "id",
-        text_key: str = "text",
-        table_name: str = "vectorstore",
+        vector_key: str = DEFAULT_VECTOR_KEY,
+        id_key: str = DEFAULT_ID_KEY,
+        text_key: str = DEFAULT_TEXT_KEY,
+        table_name: str = DEFAULT_TABLE_NAME,
     ):
         """Initialize with DuckDB connection and setup for vector storage."""
         try:
@@ -100,8 +110,6 @@ class DuckDB(VectorStore):
             raise ValueError("An embedding function or model must be provided.")
 
         if connection is None:
-            import warnings
-
             warnings.warn(
                 "No DuckDB connection provided. A new connection will be created."
                 "This connection is running in memory and no data will be persisted."
@@ -138,6 +146,17 @@ class DuckDB(VectorStore):
         Returns:
             List of ids of the added texts.
         """
+        have_pandas = False
+        try:
+            import pandas as pd
+
+            have_pandas = True
+        except ImportError:
+            logger.info(
+                "Unable to import pandas. "
+                "Install it with `pip install -U pandas` "
+                "to improve performance of add_texts()."
+            )
+
         # Extract ids from kwargs or generate new ones if not provided
         ids = kwargs.pop("ids", [str(uuid.uuid4()) for _ in texts])
@@ -145,6 +164,7 @@ class DuckDB(VectorStore):
         # Embed texts and create documents
         ids = ids or [str(uuid.uuid4()) for _ in texts]
         embeddings = self._embedding.embed_documents(list(texts))
+        data = []
         for idx, text in enumerate(texts):
             embedding = embeddings[idx]
             # Serialize metadata if present, else default to None
@@ -153,9 +173,26 @@ class DuckDB(VectorStore):
                 if metadatas and idx < len(metadatas)
                 else None
             )
-            self._connection.execute(
-                f"INSERT INTO {self._table_name} VALUES (?,?,?,?)",
-                [ids[idx], text, embedding, metadata],
-            )
+            if have_pandas:
+                data.append(
+                    {
+                        self._id_key: ids[idx],
+                        self._text_key: text,
+                        self._vector_key: embedding,
+                        "metadata": metadata,
+                    }
+                )
+            else:
+                self._connection.execute(
+                    f"INSERT INTO {self._table_name} VALUES (?,?,?,?)",
+                    [ids[idx], text, embedding, metadata],
+                )
+
+        if have_pandas:
+            # noinspection PyUnusedLocal
+            df = pd.DataFrame.from_dict(data)  # noqa: F841
+            self._connection.execute(
+                f"INSERT INTO {self._table_name} SELECT * FROM df",
+            )
 
         return ids
@@ -181,20 +218,21 @@ class DuckDB(VectorStore):
             self._table.select(
                 *[
                     self.duckdb.StarExpression(exclude=[]),
-                    list_cosine_similarity.alias("similarity"),
+                    list_cosine_similarity.alias(SIMILARITY_ALIAS),
                 ]
             )
-            .order("similarity desc")
+            .order(f"{SIMILARITY_ALIAS} desc")
             .limit(k)
-            .select(
-                self.duckdb.StarExpression(exclude=["similarity", self._vector_key])
-            )
             .fetchdf()
         )
         return [
             Document(
                 page_content=docs[self._text_key][idx],
-                metadata=json.loads(docs["metadata"][idx])
+                metadata={
+                    **json.loads(docs["metadata"][idx]),
+                    # using underscore prefix to avoid conflicts with user metadata keys
+                    f"_{SIMILARITY_ALIAS}": docs[SIMILARITY_ALIAS][idx],
+                }
                 if docs["metadata"][idx]
                 else {},
             )
@@ -231,10 +269,10 @@ class DuckDB(VectorStore):
 
         # Extract kwargs for DuckDB instance creation
         connection = kwargs.get("connection", None)
-        vector_key = kwargs.get("vector_key", "vector")
-        id_key = kwargs.get("id_key", "id")
-        text_key = kwargs.get("text_key", "text")
-        table_name = kwargs.get("table_name", "embeddings")
+        vector_key = kwargs.get("vector_key", DEFAULT_VECTOR_KEY)
+        id_key = kwargs.get("id_key", DEFAULT_ID_KEY)
+        text_key = kwargs.get("text_key", DEFAULT_TEXT_KEY)
+        table_name = kwargs.get("table_name", DEFAULT_TABLE_NAME)
 
         # Create an instance of DuckDB
         instance = DuckDB(