community[patch]: ElasticsearchStore: add relevance function selector (#16378)

Implement similarity function selector for ElasticsearchStore. The
scores coming back from Elasticsearch are already similarities (not
distances) and they are already normalized (see
[docs](https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params)).
Hence we leave the scores untouched and just forward them.

This fixes #11539.

However, in hybrid mode (when keyword search and vector search are
combined) Elasticsearch currently returns no scores. This PR adds an
error message for that case. A proper solution for hybrid mode still
needs more thought.

This PR also corrects a small error in the Elasticsearch integration
test.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Max Jakob
2024-01-22 19:52:20 +01:00
committed by GitHub
parent 54f90fc6bc
commit de209af533
5 changed files with 137 additions and 33 deletions

View File

@@ -157,7 +157,7 @@ class TestElasticsearch:
output = docsearch.similarity_search("foo", k=1, custom_query=assert_query)
assert output == [Document(page_content="foo")]
async def test_similarity_search_without_metadat_async(
async def test_similarity_search_without_metadata_async(
self, elasticsearch_connection: dict, index_name: str
) -> None:
"""Test end to end construction and search without metadata."""
@@ -400,7 +400,7 @@ class TestElasticsearch:
"script": {
"source": """
double value = dotProduct(params.query_vector, 'vector');
return sigmoid(1, Math.E, -value);
return sigmoid(1, Math.E, -value);
""",
"params": {
"query_vector": [
@@ -777,6 +777,44 @@ class TestElasticsearch:
)
assert output == [(Document(page_content="foo", metadata={"page": "0"}), 1.0)]
def test_elasticsearch_with_relevance_threshold(
    self, elasticsearch_connection: dict, index_name: str
) -> None:
    """Test to make sure the relevance threshold is respected.

    Indexes three texts, looks up the relevance scores of the top three
    hits for the query "foo", and then uses the score of the second-ranked
    hit as ``score_threshold`` for a ``similarity_score_threshold``
    retriever. The retriever is expected to return exactly the first- and
    second-ranked documents, i.e. the hit whose score equals the threshold
    is kept and the third-ranked hit is dropped.

    Args:
        elasticsearch_connection: Keyword arguments forwarded to
            ``ElasticsearchStore.from_texts`` to reach the test cluster.
        index_name: Name of the index to create the documents in.
    """
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    embeddings = FakeEmbeddings()

    docsearch = ElasticsearchStore.from_texts(
        index_name=index_name,
        texts=texts,
        embedding=embeddings,
        metadatas=metadatas,
        **elasticsearch_connection,
    )

    # Find a good threshold for testing
    query_string = "foo"
    embedded_query = embeddings.embed_query(query_string)
    top3 = docsearch.similarity_search_by_vector_with_relevance_scores(
        embedding=embedded_query, k=3
    )
    # Verify the result size before indexing into it, so a short result
    # fails with a clear assertion instead of an IndexError.
    assert len(top3) == 3
    similarity_of_second_ranked = top3[1][1]

    # Test threshold
    # NOTE(review): this assumes the third-ranked score is strictly lower
    # than the second-ranked one for the fake embeddings -- confirm.
    retriever = docsearch.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"score_threshold": similarity_of_second_ranked},
    )
    output = retriever.get_relevant_documents(query=query_string)

    assert output == [
        top3[0][0],
        top3[1][0],
        # third ranked is out
    ]
def test_elasticsearch_delete_ids(
self, elasticsearch_connection: dict, index_name: str
) -> None: