community[patch]: surrealdb provide functions for MMR (Maximal Marginal Relevance) (#21185)

This PR contains 4 added functions:

- max_marginal_relevance_search_by_vector
- amax_marginal_relevance_search_by_vector
- max_marginal_relevance_search
- amax_marginal_relevance_search

I'm no langchain expert, but I tried to inspect other vectorstore sources
like chroma to build these functions for SurrealDB. If anyone has
changes for me, please let me know. Otherwise I would be happy if these
changes were added to the repository, so that I can use the original repo
and not my locally monkey-patched version.

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Sky 2024-05-23 00:53:55 +02:00 committed by GitHub
parent 58b6c72375
commit 12d65f17ff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,16 +1,22 @@
import asyncio import asyncio
from typing import ( from typing import (
Any, Any,
Dict,
Iterable, Iterable,
List, List,
Optional, Optional,
Tuple, Tuple,
) )
import numpy as np
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore from langchain_core.vectorstores import VectorStore
from langchain_community.vectorstores.utils import maximal_marginal_relevance
DEFAULT_K = 4 # Number of Documents to return.
class SurrealDBStore(VectorStore): class SurrealDBStore(VectorStore):
""" """
@ -202,14 +208,20 @@ class SurrealDBStore(VectorStore):
return asyncio.run(_delete(ids, **kwargs)) return asyncio.run(_delete(ids, **kwargs))
async def _asimilarity_search_by_vector_with_score( async def _asimilarity_search_by_vector_with_score(
self, embedding: List[float], k: int = 4, **kwargs: Any self,
) -> List[Tuple[Document, float]]: embedding: List[float],
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float, Any]]:
"""Run similarity search for query embedding asynchronously """Run similarity search for query embedding asynchronously
and return documents and scores and return documents and scores
Args: Args:
embedding (List[float]): Query embedding. embedding (List[float]): Query embedding.
k (int): Number of results to return. Defaults to 4. k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns: Returns:
List of Documents most similar along with scores List of Documents most similar along with scores
@ -220,14 +232,29 @@ class SurrealDBStore(VectorStore):
"k": k, "k": k,
"score_threshold": kwargs.get("score_threshold", 0), "score_threshold": kwargs.get("score_threshold", 0),
} }
# build additional filter criteria
custom_filter = ""
if filter:
for key in filter:
# check value type
if type(filter[key]) in [str, bool]:
filter_value = f"'{filter[key]}'"
else:
filter_value = f"{filter[key]}"
custom_filter += f"and metadata.{key} = {filter_value} "
query = f""" query = f"""
select select
id, id,
text, text,
metadata, metadata,
embedding,
vector::similarity::cosine(embedding, $embedding) as similarity vector::similarity::cosine(embedding, $embedding) as similarity
from {args["collection"]} from {args["collection"]}
where vector::similarity::cosine(embedding, $embedding) >= $score_threshold where vector::similarity::cosine(embedding, $embedding) >= $score_threshold
{custom_filter}
order by similarity desc LIMIT $k; order by similarity desc LIMIT $k;
""" """
results = await self.sdb.query(query, args) results = await self.sdb.query(query, args)
@ -247,21 +274,28 @@ class SurrealDBStore(VectorStore):
( (
Document( Document(
page_content=doc["text"], page_content=doc["text"],
metadata={"id": doc["id"], **(doc.get("metadata", None) or {})}, metadata={"id": doc["id"], **(doc.get("metadata") or {})},
), ),
doc["similarity"], doc["similarity"],
doc["embedding"],
) )
for doc in result["result"] for doc in result["result"]
] ]
async def asimilarity_search_with_relevance_scores( async def asimilarity_search_with_relevance_scores(
self, query: str, k: int = 4, **kwargs: Any self,
query: str,
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]: ) -> List[Tuple[Document, float]]:
"""Run similarity search asynchronously and return relevance scores """Run similarity search asynchronously and return relevance scores
Args: Args:
query (str): Query query (str): Query
k (int): Number of results to return. Defaults to 4. k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns: Returns:
List of Documents most similar along with relevance scores List of Documents most similar along with relevance scores
@ -269,21 +303,27 @@ class SurrealDBStore(VectorStore):
query_embedding = self.embedding_function.embed_query(query) query_embedding = self.embedding_function.embed_query(query)
return [ return [
(document, similarity) (document, similarity)
for document, similarity in ( for document, similarity, _ in (
await self._asimilarity_search_by_vector_with_score( await self._asimilarity_search_by_vector_with_score(
query_embedding, k, **kwargs query_embedding, k, filter=filter, **kwargs
) )
) )
] ]
def similarity_search_with_relevance_scores( def similarity_search_with_relevance_scores(
self, query: str, k: int = 4, **kwargs: Any self,
query: str,
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]: ) -> List[Tuple[Document, float]]:
"""Run similarity search synchronously and return relevance scores """Run similarity search synchronously and return relevance scores
Args: Args:
query (str): Query query (str): Query
k (int): Number of results to return. Defaults to 4. k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns: Returns:
List of Documents most similar along with relevance scores List of Documents most similar along with relevance scores
@ -294,19 +334,25 @@ class SurrealDBStore(VectorStore):
): ):
await self.initialize() await self.initialize()
return await self.asimilarity_search_with_relevance_scores( return await self.asimilarity_search_with_relevance_scores(
query, k, **kwargs query, k, filter=filter, **kwargs
) )
return asyncio.run(_similarity_search_with_relevance_scores()) return asyncio.run(_similarity_search_with_relevance_scores())
async def asimilarity_search_with_score( async def asimilarity_search_with_score(
self, query: str, k: int = 4, **kwargs: Any self,
query: str,
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]: ) -> List[Tuple[Document, float]]:
"""Run similarity search asynchronously and return distance scores """Run similarity search asynchronously and return distance scores
Args: Args:
query (str): Query query (str): Query
k (int): Number of results to return. Defaults to 4. k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns: Returns:
List of Documents most similar along with relevance distance scores List of Documents most similar along with relevance distance scores
@ -314,21 +360,27 @@ class SurrealDBStore(VectorStore):
query_embedding = self.embedding_function.embed_query(query) query_embedding = self.embedding_function.embed_query(query)
return [ return [
(document, similarity) (document, similarity)
for document, similarity in ( for document, similarity, _ in (
await self._asimilarity_search_by_vector_with_score( await self._asimilarity_search_by_vector_with_score(
query_embedding, k, **kwargs query_embedding, k, filter=filter, **kwargs
) )
) )
] ]
def similarity_search_with_score( def similarity_search_with_score(
self, query: str, k: int = 4, **kwargs: Any self,
query: str,
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]: ) -> List[Tuple[Document, float]]:
"""Run similarity search synchronously and return distance scores """Run similarity search synchronously and return distance scores
Args: Args:
query (str): Query query (str): Query
k (int): Number of results to return. Defaults to 4. k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns: Returns:
List of Documents most similar along with relevance distance scores List of Documents most similar along with relevance distance scores
@ -336,37 +388,51 @@ class SurrealDBStore(VectorStore):
async def _similarity_search_with_score() -> List[Tuple[Document, float]]: async def _similarity_search_with_score() -> List[Tuple[Document, float]]:
await self.initialize() await self.initialize()
return await self.asimilarity_search_with_score(query, k, **kwargs) return await self.asimilarity_search_with_score(
query, k, filter=filter, **kwargs
)
return asyncio.run(_similarity_search_with_score()) return asyncio.run(_similarity_search_with_score())
async def asimilarity_search_by_vector( async def asimilarity_search_by_vector(
self, embedding: List[float], k: int = 4, **kwargs: Any self,
embedding: List[float],
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Run similarity search on query embedding asynchronously """Run similarity search on query embedding asynchronously
Args: Args:
embedding (List[float]): Query embedding embedding (List[float]): Query embedding
k (int): Number of results to return. Defaults to 4. k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns: Returns:
List of Documents most similar to the query List of Documents most similar to the query
""" """
return [ return [
document document
for document, _ in await self._asimilarity_search_by_vector_with_score( for document, _, _ in await self._asimilarity_search_by_vector_with_score(
embedding, k, **kwargs embedding, k, filter=filter, **kwargs
) )
] ]
def similarity_search_by_vector( def similarity_search_by_vector(
self, embedding: List[float], k: int = 4, **kwargs: Any self,
embedding: List[float],
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Run similarity search on query embedding """Run similarity search on query embedding
Args: Args:
embedding (List[float]): Query embedding embedding (List[float]): Query embedding
k (int): Number of results to return. Defaults to 4. k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns: Returns:
List of Documents most similar to the query List of Documents most similar to the query
@ -374,33 +440,49 @@ class SurrealDBStore(VectorStore):
async def _similarity_search_by_vector() -> List[Document]: async def _similarity_search_by_vector() -> List[Document]:
await self.initialize() await self.initialize()
return await self.asimilarity_search_by_vector(embedding, k, **kwargs) return await self.asimilarity_search_by_vector(
embedding, k, filter=filter, **kwargs
)
return asyncio.run(_similarity_search_by_vector()) return asyncio.run(_similarity_search_by_vector())
async def asimilarity_search( async def asimilarity_search(
self, query: str, k: int = 4, **kwargs: Any self,
query: str,
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Run similarity search on query asynchronously """Run similarity search on query asynchronously
Args: Args:
query (str): Query query (str): Query
k (int): Number of results to return. Defaults to 4. k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns: Returns:
List of Documents most similar to the query List of Documents most similar to the query
""" """
query_embedding = self.embedding_function.embed_query(query) query_embedding = self.embedding_function.embed_query(query)
return await self.asimilarity_search_by_vector(query_embedding, k, **kwargs) return await self.asimilarity_search_by_vector(
query_embedding, k, filter=filter, **kwargs
)
def similarity_search( def similarity_search(
self, query: str, k: int = 4, **kwargs: Any self,
query: str,
k: int = DEFAULT_K,
*,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Run similarity search on query """Run similarity search on query
Args: Args:
query (str): Query query (str): Query
k (int): Number of results to return. Defaults to 4. k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns: Returns:
List of Documents most similar to the query List of Documents most similar to the query
@ -408,10 +490,164 @@ class SurrealDBStore(VectorStore):
async def _similarity_search() -> List[Document]: async def _similarity_search() -> List[Document]:
await self.initialize() await self.initialize()
return await self.asimilarity_search(query, k, **kwargs) return await self.asimilarity_search(query, k, filter=filter, **kwargs)
return asyncio.run(_similarity_search()) return asyncio.run(_similarity_search())
async def amax_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = DEFAULT_K,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Asynchronously pick documents for an embedding using MMR.

    Maximal marginal relevance (MMR) balances similarity to the query
    against diversity among the documents that are returned.

    Args:
        embedding: Query embedding to search with.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of candidate Documents fetched from the store and
            handed to the MMR algorithm.
        lambda_mult: Number between 0 and 1 controlling diversity among the
            results; 0 corresponds to maximum diversity and 1 to minimum
            diversity. Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    # Candidate pool: the similarity search is asked for fetch_k rows, not k.
    candidates = await self._asimilarity_search_by_vector_with_score(
        embedding, fetch_k, filter=filter, **kwargs
    )

    # Each row is a tuple whose first item is the Document and whose last
    # item is the embedding stored alongside it.
    documents = []
    stored_vectors = []
    for row in candidates:
        documents.append(row[0])
        stored_vectors.append(row[-1])

    query_vector = np.array(embedding, dtype=np.float32)
    chosen = maximal_marginal_relevance(
        query_vector,
        stored_vectors,
        k=k,
        lambda_mult=lambda_mult,
    )

    # MMR yields indices into the candidate pool; map them back to Documents.
    return [documents[idx] for idx in chosen]
def max_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = DEFAULT_K,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Pick documents for an embedding using MMR (synchronous wrapper).

    Maximal marginal relevance (MMR) balances similarity to the query
    against diversity among the documents that are returned. This method
    initializes the store and then drives
    ``amax_marginal_relevance_search_by_vector`` to completion with
    ``asyncio.run``.

    Args:
        embedding: Query embedding to search with.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of candidate Documents fetched and handed to the
            MMR algorithm.
        lambda_mult: Number between 0 and 1 controlling diversity among the
            results; 0 corresponds to maximum diversity and 1 to minimum
            diversity. Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """

    async def _run() -> List[Document]:
        # The store must be initialized before the async search is awaited.
        await self.initialize()
        selected = await self.amax_marginal_relevance_search_by_vector(
            embedding, k, fetch_k, lambda_mult, filter=filter, **kwargs
        )
        return selected

    return asyncio.run(_run())
async def amax_marginal_relevance_search(
    self,
    query: str,
    k: int = DEFAULT_K,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Asynchronously pick documents for a text query using MMR.

    Maximal marginal relevance (MMR) balances similarity to the query
    against diversity among the documents that are returned. The query text
    is embedded with ``self.embedding_function.embed_query`` and the search
    is delegated to ``amax_marginal_relevance_search_by_vector``.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to ``DEFAULT_K`` (4),
            matching the other search methods of this store.
        fetch_k: Number of candidate Documents fetched and handed to the
            MMR algorithm.
        lambda_mult: Number between 0 and 1 controlling diversity among the
            results; 0 corresponds to maximum diversity and 1 to minimum
            diversity. Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    # NOTE: embed_query is called synchronously here, as in the other
    # async search methods of this class.
    embedding = self.embedding_function.embed_query(query)
    return await self.amax_marginal_relevance_search_by_vector(
        embedding, k, fetch_k, lambda_mult, filter=filter, **kwargs
    )
def max_marginal_relevance_search(
    self,
    query: str,
    k: int = DEFAULT_K,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Pick documents for a text query using MMR (synchronous wrapper).

    Maximal marginal relevance (MMR) balances similarity to the query
    against diversity among the documents that are returned. This method
    initializes the store and then drives ``amax_marginal_relevance_search``
    to completion with ``asyncio.run``.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of candidate Documents fetched and handed to the
            MMR algorithm.
        lambda_mult: Number between 0 and 1 controlling diversity among the
            results; 0 corresponds to maximum diversity and 1 to minimum
            diversity. Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """

    async def _run() -> List[Document]:
        # The store must be initialized before the async search is awaited.
        await self.initialize()
        selected = await self.amax_marginal_relevance_search(
            query, k, fetch_k, lambda_mult, filter=filter, **kwargs
        )
        return selected

    return asyncio.run(_run())
@classmethod @classmethod
async def afrom_texts( async def afrom_texts(
cls, cls,