mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-21 14:18:52 +00:00
community[patch]: surrealdb provide functions for MMR (Maximal Marginal Relevance) (#21185)
This PR contains 4 added functions: - max_marginal_relevance_search_by_vector - amax_marginal_relevance_search_by_vector - max_marginal_relevance_search - amax_marginal_relevance_search I'm no langchain expert, but I tried to inspect other vectorstore sources like chroma to build these functions for SurrealDB. If someone has some changes for me, please let me know. Otherwise I would be happy if these changes were added to the repository, so that I can use the original repo and not my local monkey-patched version. --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
58b6c72375
commit
12d65f17ff
@ -1,16 +1,22 @@
|
||||
import asyncio
|
||||
from typing import (
|
||||
Any,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
|
||||
from langchain_community.vectorstores.utils import maximal_marginal_relevance
|
||||
|
||||
DEFAULT_K = 4 # Number of Documents to return.
|
||||
|
||||
|
||||
class SurrealDBStore(VectorStore):
|
||||
"""
|
||||
@ -202,14 +208,20 @@ class SurrealDBStore(VectorStore):
|
||||
return asyncio.run(_delete(ids, **kwargs))
|
||||
|
||||
async def _asimilarity_search_by_vector_with_score(
|
||||
self, embedding: List[float], k: int = 4, **kwargs: Any
|
||||
) -> List[Tuple[Document, float]]:
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = DEFAULT_K,
|
||||
*,
|
||||
filter: Optional[Dict[str, str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float, Any]]:
|
||||
"""Run similarity search for query embedding asynchronously
|
||||
and return documents and scores
|
||||
|
||||
Args:
|
||||
embedding (List[float]): Query embedding.
|
||||
k (int): Number of results to return. Defaults to 4.
|
||||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar along with scores
|
||||
@ -220,14 +232,29 @@ class SurrealDBStore(VectorStore):
|
||||
"k": k,
|
||||
"score_threshold": kwargs.get("score_threshold", 0),
|
||||
}
|
||||
|
||||
# build additional filter criteria
|
||||
custom_filter = ""
|
||||
if filter:
|
||||
for key in filter:
|
||||
# check value type
|
||||
if type(filter[key]) in [str, bool]:
|
||||
filter_value = f"'{filter[key]}'"
|
||||
else:
|
||||
filter_value = f"{filter[key]}"
|
||||
|
||||
custom_filter += f"and metadata.{key} = {filter_value} "
|
||||
|
||||
query = f"""
|
||||
select
|
||||
id,
|
||||
text,
|
||||
metadata,
|
||||
embedding,
|
||||
vector::similarity::cosine(embedding, $embedding) as similarity
|
||||
from ⟨{args["collection"]}⟩
|
||||
where vector::similarity::cosine(embedding, $embedding) >= $score_threshold
|
||||
{custom_filter}
|
||||
order by similarity desc LIMIT $k;
|
||||
"""
|
||||
results = await self.sdb.query(query, args)
|
||||
@ -247,21 +274,28 @@ class SurrealDBStore(VectorStore):
|
||||
(
|
||||
Document(
|
||||
page_content=doc["text"],
|
||||
metadata={"id": doc["id"], **(doc.get("metadata", None) or {})},
|
||||
metadata={"id": doc["id"], **(doc.get("metadata") or {})},
|
||||
),
|
||||
doc["similarity"],
|
||||
doc["embedding"],
|
||||
)
|
||||
for doc in result["result"]
|
||||
]
|
||||
|
||||
async def asimilarity_search_with_relevance_scores(
|
||||
self, query: str, k: int = 4, **kwargs: Any
|
||||
self,
|
||||
query: str,
|
||||
k: int = DEFAULT_K,
|
||||
*,
|
||||
filter: Optional[Dict[str, str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Run similarity search asynchronously and return relevance scores
|
||||
|
||||
Args:
|
||||
query (str): Query
|
||||
k (int): Number of results to return. Defaults to 4.
|
||||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar along with relevance scores
|
||||
@ -269,21 +303,27 @@ class SurrealDBStore(VectorStore):
|
||||
query_embedding = self.embedding_function.embed_query(query)
|
||||
return [
|
||||
(document, similarity)
|
||||
for document, similarity in (
|
||||
for document, similarity, _ in (
|
||||
await self._asimilarity_search_by_vector_with_score(
|
||||
query_embedding, k, **kwargs
|
||||
query_embedding, k, filter=filter, **kwargs
|
||||
)
|
||||
)
|
||||
]
|
||||
|
||||
def similarity_search_with_relevance_scores(
|
||||
self, query: str, k: int = 4, **kwargs: Any
|
||||
self,
|
||||
query: str,
|
||||
k: int = DEFAULT_K,
|
||||
*,
|
||||
filter: Optional[Dict[str, str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Run similarity search synchronously and return relevance scores
|
||||
|
||||
Args:
|
||||
query (str): Query
|
||||
k (int): Number of results to return. Defaults to 4.
|
||||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar along with relevance scores
|
||||
@ -294,19 +334,25 @@ class SurrealDBStore(VectorStore):
|
||||
):
|
||||
await self.initialize()
|
||||
return await self.asimilarity_search_with_relevance_scores(
|
||||
query, k, **kwargs
|
||||
query, k, filter=filter, **kwargs
|
||||
)
|
||||
|
||||
return asyncio.run(_similarity_search_with_relevance_scores())
|
||||
|
||||
async def asimilarity_search_with_score(
|
||||
self, query: str, k: int = 4, **kwargs: Any
|
||||
self,
|
||||
query: str,
|
||||
k: int = DEFAULT_K,
|
||||
*,
|
||||
filter: Optional[Dict[str, str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Run similarity search asynchronously and return distance scores
|
||||
|
||||
Args:
|
||||
query (str): Query
|
||||
k (int): Number of results to return. Defaults to 4.
|
||||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar along with relevance distance scores
|
||||
@ -314,21 +360,27 @@ class SurrealDBStore(VectorStore):
|
||||
query_embedding = self.embedding_function.embed_query(query)
|
||||
return [
|
||||
(document, similarity)
|
||||
for document, similarity in (
|
||||
for document, similarity, _ in (
|
||||
await self._asimilarity_search_by_vector_with_score(
|
||||
query_embedding, k, **kwargs
|
||||
query_embedding, k, filter=filter, **kwargs
|
||||
)
|
||||
)
|
||||
]
|
||||
|
||||
def similarity_search_with_score(
|
||||
self, query: str, k: int = 4, **kwargs: Any
|
||||
self,
|
||||
query: str,
|
||||
k: int = DEFAULT_K,
|
||||
*,
|
||||
filter: Optional[Dict[str, str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Run similarity search synchronously and return distance scores
|
||||
|
||||
Args:
|
||||
query (str): Query
|
||||
k (int): Number of results to return. Defaults to 4.
|
||||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar along with relevance distance scores
|
||||
@ -336,37 +388,51 @@ class SurrealDBStore(VectorStore):
|
||||
|
||||
async def _similarity_search_with_score() -> List[Tuple[Document, float]]:
|
||||
await self.initialize()
|
||||
return await self.asimilarity_search_with_score(query, k, **kwargs)
|
||||
return await self.asimilarity_search_with_score(
|
||||
query, k, filter=filter, **kwargs
|
||||
)
|
||||
|
||||
return asyncio.run(_similarity_search_with_score())
|
||||
|
||||
async def asimilarity_search_by_vector(
|
||||
self, embedding: List[float], k: int = 4, **kwargs: Any
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = DEFAULT_K,
|
||||
*,
|
||||
filter: Optional[Dict[str, str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Run similarity search on query embedding asynchronously
|
||||
|
||||
Args:
|
||||
embedding (List[float]): Query embedding
|
||||
k (int): Number of results to return. Defaults to 4.
|
||||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the query
|
||||
"""
|
||||
return [
|
||||
document
|
||||
for document, _ in await self._asimilarity_search_by_vector_with_score(
|
||||
embedding, k, **kwargs
|
||||
for document, _, _ in await self._asimilarity_search_by_vector_with_score(
|
||||
embedding, k, filter=filter, **kwargs
|
||||
)
|
||||
]
|
||||
|
||||
def similarity_search_by_vector(
|
||||
self, embedding: List[float], k: int = 4, **kwargs: Any
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = DEFAULT_K,
|
||||
*,
|
||||
filter: Optional[Dict[str, str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Run similarity search on query embedding
|
||||
|
||||
Args:
|
||||
embedding (List[float]): Query embedding
|
||||
k (int): Number of results to return. Defaults to 4.
|
||||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the query
|
||||
@ -374,33 +440,49 @@ class SurrealDBStore(VectorStore):
|
||||
|
||||
async def _similarity_search_by_vector() -> List[Document]:
|
||||
await self.initialize()
|
||||
return await self.asimilarity_search_by_vector(embedding, k, **kwargs)
|
||||
return await self.asimilarity_search_by_vector(
|
||||
embedding, k, filter=filter, **kwargs
|
||||
)
|
||||
|
||||
return asyncio.run(_similarity_search_by_vector())
|
||||
|
||||
async def asimilarity_search(
|
||||
self, query: str, k: int = 4, **kwargs: Any
|
||||
self,
|
||||
query: str,
|
||||
k: int = DEFAULT_K,
|
||||
*,
|
||||
filter: Optional[Dict[str, str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Run similarity search on query asynchronously
|
||||
|
||||
Args:
|
||||
query (str): Query
|
||||
k (int): Number of results to return. Defaults to 4.
|
||||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the query
|
||||
"""
|
||||
query_embedding = self.embedding_function.embed_query(query)
|
||||
return await self.asimilarity_search_by_vector(query_embedding, k, **kwargs)
|
||||
return await self.asimilarity_search_by_vector(
|
||||
query_embedding, k, filter=filter, **kwargs
|
||||
)
|
||||
|
||||
def similarity_search(
|
||||
self, query: str, k: int = 4, **kwargs: Any
|
||||
self,
|
||||
query: str,
|
||||
k: int = DEFAULT_K,
|
||||
*,
|
||||
filter: Optional[Dict[str, str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Run similarity search on query
|
||||
|
||||
Args:
|
||||
query (str): Query
|
||||
k (int): Number of results to return. Defaults to 4.
|
||||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the query
|
||||
@ -408,10 +490,164 @@ class SurrealDBStore(VectorStore):
|
||||
|
||||
async def _similarity_search() -> List[Document]:
|
||||
await self.initialize()
|
||||
return await self.asimilarity_search(query, k, **kwargs)
|
||||
return await self.asimilarity_search(query, k, filter=filter, **kwargs)
|
||||
|
||||
return asyncio.run(_similarity_search())
|
||||
|
||||
async def amax_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = DEFAULT_K,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Asynchronously select documents for an embedding using MMR.

    Maximal marginal relevance (MMR) balances similarity to the query
    against diversity among the documents that are selected.

    Args:
        embedding: Query embedding to look up similar documents for.
        k: Number of Documents to return. Defaults to DEFAULT_K.
        fetch_k: Number of candidates requested from the similarity search
            and handed to the MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results, with 0 corresponding to
            maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None.
        **kwargs: Forwarded unchanged to the underlying similarity search.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    candidates = await self._asimilarity_search_by_vector_with_score(
        embedding, fetch_k, filter=filter, **kwargs
    )

    # Each candidate tuple carries the document first and its stored
    # embedding last; split them into two index-aligned lists.
    documents = []
    stored_embeddings = []
    for candidate in candidates:
        documents.append(candidate[0])
        stored_embeddings.append(candidate[-1])

    query_vector = np.array(embedding, dtype=np.float32)
    chosen_indices = maximal_marginal_relevance(
        query_vector,
        stored_embeddings,
        lambda_mult=lambda_mult,
        k=k,
    )

    # MMR returns positions into the candidate list, not documents.
    return [documents[index] for index in chosen_indices]
|
||||
|
||||
def max_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = DEFAULT_K,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Synchronously select documents for an embedding using MMR.

    Blocking counterpart of ``amax_marginal_relevance_search_by_vector``:
    it awaits ``initialize()`` and then the async search inside a single
    ``asyncio.run`` call.

    Maximal marginal relevance (MMR) balances similarity to the query
    against diversity among the documents that are selected.

    Args:
        embedding: Query embedding to look up similar documents for.
        k: Number of Documents to return. Defaults to DEFAULT_K.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results, with 0 corresponding to
            maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None.
        **kwargs: Forwarded unchanged to the async implementation.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """

    async def _initialize_then_search() -> List[Document]:
        # Initialise the store before querying, within the same run.
        await self.initialize()
        selected = await self.amax_marginal_relevance_search_by_vector(
            embedding, k, fetch_k, lambda_mult, filter=filter, **kwargs
        )
        return selected

    return asyncio.run(_initialize_then_search())
|
||||
|
||||
async def amax_marginal_relevance_search(
    self,
    query: str,
    k: int = DEFAULT_K,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND
    diversity among selected documents. The query text is embedded with
    ``self.embedding_function`` and the search is delegated to
    ``amax_marginal_relevance_search_by_vector``.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to DEFAULT_K, the
            same default used by the other search methods.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results, with 0 corresponding to
            maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None.
        **kwargs: Forwarded unchanged to the by-vector search.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    embedding = self.embedding_function.embed_query(query)
    return await self.amax_marginal_relevance_search_by_vector(
        embedding, k, fetch_k, lambda_mult, filter=filter, **kwargs
    )
|
||||
|
||||
def max_marginal_relevance_search(
    self,
    query: str,
    k: int = DEFAULT_K,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Synchronously select documents for a text query using MMR.

    Blocking counterpart of ``amax_marginal_relevance_search``: it awaits
    ``initialize()`` and then the async search inside a single
    ``asyncio.run`` call.

    Maximal marginal relevance (MMR) balances similarity to the query
    against diversity among the documents that are selected.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to DEFAULT_K.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results, with 0 corresponding to
            maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None.
        **kwargs: Forwarded unchanged to the async implementation.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """

    async def _initialize_then_search() -> List[Document]:
        # Initialise the store before querying, within the same run.
        await self.initialize()
        selected = await self.amax_marginal_relevance_search(
            query, k, fetch_k, lambda_mult, filter=filter, **kwargs
        )
        return selected

    return asyncio.run(_initialize_then_search())
|
||||
|
||||
@classmethod
|
||||
async def afrom_texts(
|
||||
cls,
|
||||
|
Loading…
Reference in New Issue
Block a user