community[patch]: surrealdb provide functions for MMR (Maximal Marginal Relevance) (#21185)

This PR contains 4 added functions:

- max_marginal_relevance_search_by_vector
- amax_marginal_relevance_search_by_vector
- max_marginal_relevance_search
- amax_marginal_relevance_search

I'm no langchain expert, but I tried to inspect other vectorstore sources
like chroma to build these functions for SurrealDB. If anyone has
changes for me, please let me know. Otherwise I would be happy if these
changes were added to the repository, so that I can use the original repo
and not my locally monkey-patched version.

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Sky 2024-05-23 00:53:55 +02:00 committed by GitHub
parent 58b6c72375
commit 12d65f17ff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,16 +1,22 @@
import asyncio
from typing import (
Any,
Dict,
Iterable,
List,
Optional,
Tuple,
)
import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore
from langchain_community.vectorstores.utils import maximal_marginal_relevance
DEFAULT_K = 4 # Number of Documents to return.
class SurrealDBStore(VectorStore):
"""
@ -202,14 +208,20 @@ class SurrealDBStore(VectorStore):
return asyncio.run(_delete(ids, **kwargs))
async def _asimilarity_search_by_vector_with_score(
self, embedding: List[float], k: int = 4, **kwargs: Any
) -> List[Tuple[Document, float]]:
self,
embedding: List[float],
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float, Any]]:
"""Run similarity search for query embedding asynchronously
and return documents and scores
Args:
embedding (List[float]): Query embedding.
k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List of Documents most similar along with scores
@ -220,14 +232,29 @@ class SurrealDBStore(VectorStore):
"k": k,
"score_threshold": kwargs.get("score_threshold", 0),
}
# build additional filter criteria
custom_filter = ""
if filter:
for key in filter:
# check value type
if type(filter[key]) in [str, bool]:
filter_value = f"'{filter[key]}'"
else:
filter_value = f"{filter[key]}"
custom_filter += f"and metadata.{key} = {filter_value} "
query = f"""
select
id,
text,
metadata,
embedding,
vector::similarity::cosine(embedding, $embedding) as similarity
from {args["collection"]}
where vector::similarity::cosine(embedding, $embedding) >= $score_threshold
{custom_filter}
order by similarity desc LIMIT $k;
"""
results = await self.sdb.query(query, args)
@ -247,21 +274,28 @@ class SurrealDBStore(VectorStore):
(
Document(
page_content=doc["text"],
metadata={"id": doc["id"], **(doc.get("metadata", None) or {})},
metadata={"id": doc["id"], **(doc.get("metadata") or {})},
),
doc["similarity"],
doc["embedding"],
)
for doc in result["result"]
]
async def asimilarity_search_with_relevance_scores(
self, query: str, k: int = 4, **kwargs: Any
self,
query: str,
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Run similarity search asynchronously and return relevance scores
Args:
query (str): Query
k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List of Documents most similar along with relevance scores
@ -269,21 +303,27 @@ class SurrealDBStore(VectorStore):
query_embedding = self.embedding_function.embed_query(query)
return [
(document, similarity)
for document, similarity in (
for document, similarity, _ in (
await self._asimilarity_search_by_vector_with_score(
query_embedding, k, **kwargs
query_embedding, k, filter=filter, **kwargs
)
)
]
def similarity_search_with_relevance_scores(
self, query: str, k: int = 4, **kwargs: Any
self,
query: str,
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Run similarity search synchronously and return relevance scores
Args:
query (str): Query
k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List of Documents most similar along with relevance scores
@ -294,19 +334,25 @@ class SurrealDBStore(VectorStore):
):
await self.initialize()
return await self.asimilarity_search_with_relevance_scores(
query, k, **kwargs
query, k, filter=filter, **kwargs
)
return asyncio.run(_similarity_search_with_relevance_scores())
async def asimilarity_search_with_score(
self, query: str, k: int = 4, **kwargs: Any
self,
query: str,
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Run similarity search asynchronously and return distance scores
Args:
query (str): Query
k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List of Documents most similar along with relevance distance scores
@ -314,21 +360,27 @@ class SurrealDBStore(VectorStore):
query_embedding = self.embedding_function.embed_query(query)
return [
(document, similarity)
for document, similarity in (
for document, similarity, _ in (
await self._asimilarity_search_by_vector_with_score(
query_embedding, k, **kwargs
query_embedding, k, filter=filter, **kwargs
)
)
]
def similarity_search_with_score(
self, query: str, k: int = 4, **kwargs: Any
self,
query: str,
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Run similarity search synchronously and return distance scores
Args:
query (str): Query
k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List of Documents most similar along with relevance distance scores
@ -336,37 +388,51 @@ class SurrealDBStore(VectorStore):
async def _similarity_search_with_score() -> List[Tuple[Document, float]]:
await self.initialize()
return await self.asimilarity_search_with_score(query, k, **kwargs)
return await self.asimilarity_search_with_score(
query, k, filter=filter, **kwargs
)
return asyncio.run(_similarity_search_with_score())
async def asimilarity_search_by_vector(
self, embedding: List[float], k: int = 4, **kwargs: Any
self,
embedding: List[float],
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Document]:
"""Run similarity search on query embedding asynchronously
Args:
embedding (List[float]): Query embedding
k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List of Documents most similar to the query
"""
return [
document
for document, _ in await self._asimilarity_search_by_vector_with_score(
embedding, k, **kwargs
for document, _, _ in await self._asimilarity_search_by_vector_with_score(
embedding, k, filter=filter, **kwargs
)
]
def similarity_search_by_vector(
self, embedding: List[float], k: int = 4, **kwargs: Any
self,
embedding: List[float],
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Document]:
"""Run similarity search on query embedding
Args:
embedding (List[float]): Query embedding
k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List of Documents most similar to the query
@ -374,33 +440,49 @@ class SurrealDBStore(VectorStore):
async def _similarity_search_by_vector() -> List[Document]:
await self.initialize()
return await self.asimilarity_search_by_vector(embedding, k, **kwargs)
return await self.asimilarity_search_by_vector(
embedding, k, filter=filter, **kwargs
)
return asyncio.run(_similarity_search_by_vector())
async def asimilarity_search(
self, query: str, k: int = 4, **kwargs: Any
self,
query: str,
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Document]:
"""Run similarity search on query asynchronously
Args:
query (str): Query
k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List of Documents most similar to the query
"""
query_embedding = self.embedding_function.embed_query(query)
return await self.asimilarity_search_by_vector(query_embedding, k, **kwargs)
return await self.asimilarity_search_by_vector(
query_embedding, k, filter=filter, **kwargs
)
def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
self,
query: str,
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Document]:
"""Run similarity search on query
Args:
query (str): Query
k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List of Documents most similar to the query
@ -408,10 +490,164 @@ class SurrealDBStore(VectorStore):
async def _similarity_search() -> List[Document]:
await self.initialize()
return await self.asimilarity_search(query, k, **kwargs)
return await self.asimilarity_search(query, k, filter=filter, **kwargs)
return asyncio.run(_similarity_search())
async def amax_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = DEFAULT_K,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Asynchronously pick documents by maximal marginal relevance (MMR).

    The store is first asked for the ``fetch_k`` nearest candidates to
    ``embedding``; MMR then re-ranks those candidates, trading off
    similarity to the query against diversity among the picked documents.

    Args:
        embedding: Query embedding to look up similar documents for.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of candidate Documents fetched from the store and
            handed to the MMR algorithm.
        lambda_mult: Number between 0 and 1 controlling diversity among the
            results; 0 corresponds to maximum diversity and 1 to minimum
            diversity. Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None.
        **kwargs: Forwarded unchanged to the underlying similarity search.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    candidates = await self._asimilarity_search_by_vector_with_score(
        embedding, fetch_k, filter=filter, **kwargs
    )

    # Each candidate is a (document, similarity, stored_embedding) triple;
    # MMR only needs the documents and their stored embeddings.
    candidate_docs = []
    candidate_vectors = []
    for document, _similarity, stored_embedding in candidates:
        candidate_docs.append(document)
        candidate_vectors.append(stored_embedding)

    query_vector = np.array(embedding, dtype=np.float32)
    # The helper returns positions into the candidate list, not documents.
    chosen_positions = maximal_marginal_relevance(
        query_vector,
        candidate_vectors,
        k=k,
        lambda_mult=lambda_mult,
    )
    return [candidate_docs[position] for position in chosen_positions]
def max_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = DEFAULT_K,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Synchronously pick documents by maximal marginal relevance (MMR).

    Blocking counterpart of ``amax_marginal_relevance_search_by_vector``:
    it initializes the store and runs the async implementation to
    completion via ``asyncio.run``.

    Args:
        embedding: Query embedding to look up similar documents for.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of candidate Documents fetched from the store and
            handed to the MMR algorithm.
        lambda_mult: Number between 0 and 1 controlling diversity among the
            results; 0 corresponds to maximum diversity and 1 to minimum
            diversity. Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None.
        **kwargs: Forwarded unchanged to the async implementation.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """

    async def _search() -> List[Document]:
        # The connection must be initialized before any query is issued.
        await self.initialize()
        selected = await self.amax_marginal_relevance_search_by_vector(
            embedding, k, fetch_k, lambda_mult, filter=filter, **kwargs
        )
        return selected

    # NOTE: asyncio.run() cannot be used from a thread that already has a
    # running event loop; use the async variant in that situation.
    return asyncio.run(_search())
async def amax_marginal_relevance_search(
    self,
    query: str,
    k: int = DEFAULT_K,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND
    diversity among selected documents. The query text is embedded with
    the store's embedding function and the search is delegated to
    ``amax_marginal_relevance_search_by_vector``.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to ``DEFAULT_K`` (4),
            the same default used by the other search methods.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None.
        **kwargs: Forwarded unchanged to the by-vector search.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    # Embedding is computed synchronously, matching the other async
    # search methods of this store.
    embedding = self.embedding_function.embed_query(query)
    return await self.amax_marginal_relevance_search_by_vector(
        embedding, k, fetch_k, lambda_mult, filter=filter, **kwargs
    )
def max_marginal_relevance_search(
    self,
    query: str,
    k: int = DEFAULT_K,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Synchronously pick documents for a text query using MMR.

    Blocking counterpart of ``amax_marginal_relevance_search``: it
    initializes the store and runs the async implementation to completion
    via ``asyncio.run``. Maximal marginal relevance balances similarity to
    the query against diversity among the selected documents.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of candidate Documents fetched from the store and
            handed to the MMR algorithm.
        lambda_mult: Number between 0 and 1 controlling diversity among the
            results; 0 corresponds to maximum diversity and 1 to minimum
            diversity. Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None.
        **kwargs: Forwarded unchanged to the async implementation.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """

    async def _search() -> List[Document]:
        # The connection must be initialized before any query is issued.
        await self.initialize()
        selected = await self.amax_marginal_relevance_search(
            query, k, fetch_k, lambda_mult, filter=filter, **kwargs
        )
        return selected

    # NOTE: asyncio.run() cannot be used from a thread that already has a
    # running event loop; use the async variant in that situation.
    return asyncio.run(_search())
@classmethod
async def afrom_texts(
cls,