community[patch]: ElasticsearchStore: add relevance function selector (#16378)

Implement similarity function selector for ElasticsearchStore. The
scores coming back from Elasticsearch are already similarities (not
distances) and they are already normalized (see
[docs](https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params)).
Hence we leave the scores untouched and just forward them.

This fixes #11539.

However, in hybrid mode (when keyword search and vector search are
involved) Elasticsearch currently returns no scores. This PR adds an
error message around this fact. We need to think a bit more to come up
with a solution for this case.

This PR also corrects a small error in the Elasticsearch integration
test.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Max Jakob
2024-01-22 19:52:20 +01:00
committed by GitHub
parent 54f90fc6bc
commit de209af533
5 changed files with 137 additions and 33 deletions

View File

@@ -388,7 +388,6 @@ class ElasticsearchStore(VectorStore):
from langchain_community.vectorstores import ElasticsearchStore
from langchain_community.embeddings.openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
vectorstore = ElasticsearchStore(
embedding=OpenAIEmbeddings(),
index_name="langchain-demo",
@@ -693,6 +692,25 @@ class ElasticsearchStore(VectorStore):
return selected_docs
@staticmethod
def _identity_fn(score: float) -> float:
return score
def _select_relevance_score_fn(self) -> Callable[[float], float]:
"""
The 'correct' relevance function
may differ depending on a few things, including:
- the distance / similarity metric used by the VectorStore
- the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
- embedding dimensionality
- etc.
Vectorstores should define their own selection based method of relevance.
"""
# All scores from Elasticsearch are already normalized similarities:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params
return self._identity_fn
def similarity_search_with_score(
self, query: str, k: int = 4, filter: Optional[List[dict]] = None, **kwargs: Any
) -> List[Tuple[Document, float]]:
@@ -706,6 +724,9 @@ class ElasticsearchStore(VectorStore):
Returns:
List of Documents most similar to the query and score for each
"""
if isinstance(self.strategy, ApproxRetrievalStrategy) and self.strategy.hybrid:
raise ValueError("scores are currently not supported in hybrid mode")
return self._search(query=query, k=k, filter=filter, **kwargs)
def similarity_search_by_vector_with_relevance_scores(
@@ -725,6 +746,9 @@ class ElasticsearchStore(VectorStore):
Returns:
List of Documents most similar to the embedding and score for each
"""
if isinstance(self.strategy, ApproxRetrievalStrategy) and self.strategy.hybrid:
raise ValueError("scores are currently not supported in hybrid mode")
return self._search(query_vector=embedding, k=k, filter=filter, **kwargs)
def _search(