community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)
Moved the following modules to the new package langchain-community in a backwards compatible fashion:

```
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
```

Moved the following to core:

```
mv langchain/langchain/utils/json_schema.py core/langchain_core/utils
mv langchain/langchain/utils/html.py core/langchain_core/utils
mv langchain/langchain/utils/strings.py core/langchain_core/utils
cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py
rm langchain/langchain/utils/env.py
```

See .scripts/community_split/script_integrations.sh for all changes.
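Because the move is backwards compatible, existing `langchain` imports keep resolving while new code can target `langchain_community` directly. A minimal sketch of the two equivalent import paths (module names taken from the move list above):

```python
# Old path: still works via the compatibility re-exports left in `langchain`.
from langchain.vectorstores import SupabaseVectorStore

# New path: imports the implementation from its new home in `langchain-community`.
from langchain_community.vectorstores import SupabaseVectorStore
```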
libs/community/langchain_community/vectorstores/supabase.py (new file, 466 lines added)
@@ -0,0 +1,466 @@
from __future__ import annotations

import uuid
from itertools import repeat
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
    Type,
    Union,
)

import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore

from langchain_community.vectorstores.utils import maximal_marginal_relevance

if TYPE_CHECKING:
    import supabase


class SupabaseVectorStore(VectorStore):
    """`Supabase Postgres` vector store.

    It assumes you have the `pgvector` extension installed and a
    `match_documents` (or similar) function. For more details:
    https://integrations.langchain.com/vectorstores?integration_name=SupabaseVectorStore

    You can implement your own `match_documents` function in order to limit the
    search space to a subset of documents based on your own authorization or
    business logic. A minimal example function is sketched at the end of this
    docstring.

    Note that the Supabase Python client does not yet support async operations.

    If you'd like to use `max_marginal_relevance_search`, please review the
    instructions below on modifying the `match_documents` function to return
    matched embeddings.

    Examples:

    .. code-block:: python

        from langchain_community.embeddings.openai import OpenAIEmbeddings
        from langchain_core.documents import Document
        from langchain_community.vectorstores import SupabaseVectorStore
        from supabase.client import create_client

        docs = [
            Document(page_content="foo", metadata={"id": 1}),
        ]
        embeddings = OpenAIEmbeddings()
        supabase_client = create_client("my_supabase_url", "my_supabase_key")
        vector_store = SupabaseVectorStore.from_documents(
            docs,
            embeddings,
            client=supabase_client,
            table_name="documents",
            query_name="match_documents",
            chunk_size=500,
        )

    To load from an existing table:

    .. code-block:: python

        from langchain_community.embeddings.openai import OpenAIEmbeddings
        from langchain_community.vectorstores import SupabaseVectorStore
        from supabase.client import create_client

        embeddings = OpenAIEmbeddings()
        supabase_client = create_client("my_supabase_url", "my_supabase_key")
        vector_store = SupabaseVectorStore(
            client=supabase_client,
            embedding=embeddings,
            table_name="documents",
            query_name="match_documents",
        )
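
    If you have not yet created a `match_documents` function, a minimal sketch
    looks like the following. It is hypothetical: it assumes a `documents` table
    with `id`, `content`, `metadata`, and `embedding` columns and 1536-dimension
    vectors, matching the `query_embedding` and `filter` arguments this class
    sends via RPC; adapt names and dimensions to your schema.

    .. code-block:: sql

        -- Hypothetical minimal match function; adjust table and column names.
        CREATE FUNCTION match_documents(query_embedding vector(1536),
                                        filter jsonb DEFAULT '{}')
        RETURNS TABLE(id uuid, content text, metadata jsonb, similarity float)
        LANGUAGE plpgsql
        AS $$
        BEGIN
            RETURN QUERY
            SELECT
                documents.id,
                documents.content,
                documents.metadata,
                1 - (documents.embedding <=> query_embedding) AS similarity
            FROM documents
            WHERE documents.metadata @> filter
            ORDER BY documents.embedding <=> query_embedding;
        END;
        $$;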
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
client: supabase.client.Client,
|
||||
embedding: Embeddings,
|
||||
table_name: str,
|
||||
chunk_size: int = 500,
|
||||
query_name: Union[str, None] = None,
|
||||
) -> None:
|
||||
"""Initialize with supabase client."""
|
||||
try:
|
||||
import supabase # noqa: F401
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Could not import supabase python package. "
|
||||
"Please install it with `pip install supabase`."
|
||||
)
|
||||
|
||||
self._client = client
|
||||
self._embedding: Embeddings = embedding
|
||||
self.table_name = table_name or "documents"
|
||||
self.query_name = query_name or "match_documents"
|
||||
self.chunk_size = chunk_size or 500
|
||||
# According to the SupabaseVectorStore JS implementation, the best chunk size
|
||||
# is 500. Though for large datasets it can be too large so it is configurable.
|
||||
|
||||
@property
|
||||
def embeddings(self) -> Embeddings:
|
||||
return self._embedding
|
||||
|
||||
    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[Dict[Any, Any]]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        ids = ids or [str(uuid.uuid4()) for _ in texts]
        docs = self._texts_to_documents(texts, metadatas)

        vectors = self._embedding.embed_documents(list(texts))
        return self.add_vectors(vectors, docs, ids)

    @classmethod
    def from_texts(
        cls: Type["SupabaseVectorStore"],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        client: Optional[supabase.client.Client] = None,
        table_name: Optional[str] = "documents",
        query_name: Union[str, None] = "match_documents",
        chunk_size: int = 500,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> "SupabaseVectorStore":
        """Return VectorStore initialized from texts and embeddings."""

        if not client:
            raise ValueError("Supabase client is required.")

        if not table_name:
            raise ValueError("Supabase document table_name is required.")

        embeddings = embedding.embed_documents(texts)
        ids = [str(uuid.uuid4()) for _ in texts]
        docs = cls._texts_to_documents(texts, metadatas)
        cls._add_vectors(client, table_name, embeddings, docs, ids, chunk_size)

        return cls(
            client=client,
            embedding=embedding,
            table_name=table_name,
            query_name=query_name,
            chunk_size=chunk_size,
        )

    def add_vectors(
        self,
        vectors: List[List[float]],
        documents: List[Document],
        ids: List[str],
    ) -> List[str]:
        return self._add_vectors(
            self._client, self.table_name, vectors, documents, ids, self.chunk_size
        )

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        filter: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        vector = self._embedding.embed_query(query)
        return self.similarity_search_by_vector(vector, k=k, filter=filter, **kwargs)

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        result = self.similarity_search_by_vector_with_relevance_scores(
            embedding, k=k, filter=filter, **kwargs
        )

        documents = [doc for doc, _ in result]

        return documents

    def similarity_search_with_relevance_scores(
        self,
        query: str,
        k: int = 4,
        filter: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        vector = self._embedding.embed_query(query)
        return self.similarity_search_by_vector_with_relevance_scores(
            vector, k=k, filter=filter
        )

    def match_args(
        self, query: List[float], filter: Optional[Dict[str, Any]]
    ) -> Dict[str, Any]:
        ret: Dict[str, Any] = dict(query_embedding=query)
        if filter:
            ret["filter"] = filter
        return ret

    def similarity_search_by_vector_with_relevance_scores(
        self,
        query: List[float],
        k: int,
        filter: Optional[Dict[str, Any]] = None,
        postgrest_filter: Optional[str] = None,
    ) -> List[Tuple[Document, float]]:
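        """Return documents and relevance scores for the given query embedding.

        A ``postgrest_filter`` string is forwarded to PostgREST as an
        ``and=(...)`` query parameter. A minimal sketch of a call (the filter
        value is hypothetical and assumes a jsonb ``metadata`` column):

        .. code-block:: python

            store.similarity_search_by_vector_with_relevance_scores(
                query=query_embedding,
                k=4,
                postgrest_filter="metadata->>source.eq.website",
            )
        """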
        match_documents_params = self.match_args(query, filter)
        query_builder = self._client.rpc(self.query_name, match_documents_params)

        if postgrest_filter:
            query_builder.params = query_builder.params.set(
                "and", f"({postgrest_filter})"
            )

        query_builder.params = query_builder.params.set("limit", k)

        res = query_builder.execute()

        match_result = [
            (
                Document(
                    metadata=search.get("metadata", {}),  # type: ignore
                    page_content=search.get("content", ""),
                ),
                search.get("similarity", 0.0),
            )
            for search in res.data
            if search.get("content")
        ]

        return match_result

    def similarity_search_by_vector_returning_embeddings(
        self,
        query: List[float],
        k: int,
        filter: Optional[Dict[str, Any]] = None,
        postgrest_filter: Optional[str] = None,
    ) -> List[Tuple[Document, float, np.ndarray[np.float32, Any]]]:
        match_documents_params = self.match_args(query, filter)
        query_builder = self._client.rpc(self.query_name, match_documents_params)

        if postgrest_filter:
            query_builder.params = query_builder.params.set(
                "and", f"({postgrest_filter})"
            )

        query_builder.params = query_builder.params.set("limit", k)

        res = query_builder.execute()

        match_result = [
            (
                Document(
                    metadata=search.get("metadata", {}),  # type: ignore
                    page_content=search.get("content", ""),
                ),
                search.get("similarity", 0.0),
                # Supabase returns a vector type as its string representation (!).
                # This is a hack to convert the string to a numpy array.
                np.fromstring(
                    search.get("embedding", "").strip("[]"), np.float32, sep=","
                ),
            )
            for search in res.data
            if search.get("content")
        ]

        return match_result

    @staticmethod
    def _texts_to_documents(
        texts: Iterable[str],
        metadatas: Optional[Iterable[Dict[Any, Any]]] = None,
    ) -> List[Document]:
        """Return list of Documents from list of texts and metadatas."""
        if metadatas is None:
            metadatas = repeat({})

        docs = [
            Document(page_content=text, metadata=metadata)
            for text, metadata in zip(texts, metadatas)
        ]

        return docs

    @staticmethod
    def _add_vectors(
        client: supabase.client.Client,
        table_name: str,
        vectors: List[List[float]],
        documents: List[Document],
        ids: List[str],
        chunk_size: int,
    ) -> List[str]:
        """Add vectors to Supabase table."""

        rows: List[Dict[str, Any]] = [
            {
                "id": ids[idx],
                "content": documents[idx].page_content,
                "embedding": embedding,
                "metadata": documents[idx].metadata,  # type: ignore
            }
            for idx, embedding in enumerate(vectors)
        ]

        id_list: List[str] = []
        for i in range(0, len(rows), chunk_size):
            chunk = rows[i : i + chunk_size]

            result = client.from_(table_name).upsert(chunk).execute()  # type: ignore

            if len(result.data) == 0:
                raise Exception("Error inserting: No rows added")

            # VectorStore.add_vectors returns ids as strings
            ids = [str(i.get("id")) for i in result.data if i.get("id")]

            id_list.extend(ids)

        return id_list

    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                of diversity among the results with 0 corresponding
                to maximum diversity and 1 to minimum diversity.
                Defaults to 0.5.
        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        result = self.similarity_search_by_vector_returning_embeddings(
            embedding, fetch_k
        )

        matched_documents = [doc_tuple[0] for doc_tuple in result]
        matched_embeddings = [doc_tuple[2] for doc_tuple in result]

        mmr_selected = maximal_marginal_relevance(
            np.array([embedding], dtype=np.float32),
            matched_embeddings,
            k=k,
            lambda_mult=lambda_mult,
        )

        filtered_documents = [matched_documents[i] for i in mmr_selected]

        return filtered_documents

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                of diversity among the results with 0 corresponding
                to maximum diversity and 1 to minimum diversity.
                Defaults to 0.5.
        Returns:
            List of Documents selected by maximal marginal relevance.

        `max_marginal_relevance_search` requires that `query_name` returns matched
        embeddings alongside the matched documents. The following function
        demonstrates how to do this:

        ```sql
        CREATE FUNCTION match_documents_embeddings(query_embedding vector(1536),
                                                   match_count int)
            RETURNS TABLE(
                id uuid,
                content text,
                metadata jsonb,
                embedding vector(1536),
                similarity float)
            LANGUAGE plpgsql
            AS $$
            #variable_conflict use_column
            BEGIN
                RETURN QUERY
                SELECT
                    id,
                    content,
                    metadata,
                    embedding,
                    1 - (docstore.embedding <=> query_embedding) AS similarity
                FROM
                    docstore
                ORDER BY
                    docstore.embedding <=> query_embedding
                LIMIT match_count;
            END;
            $$;
        ```
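
        Once such a function exists, point the vector store at it via
        `query_name`. A minimal sketch, assuming the `match_documents_embeddings`
        function above has been created in your database:

        ```python
        vector_store = SupabaseVectorStore(
            client=supabase_client,
            embedding=embeddings,
            table_name="documents",
            query_name="match_documents_embeddings",
        )
        docs = vector_store.max_marginal_relevance_search("query", k=4)
        ```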
"""
|
||||
embedding = self._embedding.embed_query(query)
|
||||
docs = self.max_marginal_relevance_search_by_vector(
|
||||
embedding, k, fetch_k, lambda_mult=lambda_mult
|
||||
)
|
||||
return docs
|
||||
|
||||
    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> None:
        """Delete by vector IDs.

        Args:
            ids: List of ids to delete.
        """

        if ids is None:
            raise ValueError("No ids provided to delete.")

        rows: List[Dict[str, Any]] = [
            {
                "id": id,
            }
            for id in ids
        ]

        # TODO: Check if this can be done in bulk
        for row in rows:
            self._client.from_(self.table_name).delete().eq("id", row["id"]).execute()