Refactor vector storage to correctly handle relevancy scores (#6570)

Description: This pull request aims to support generating the correct generic relevancy scores for different vector stores by refactoring the relevance score functions and their selection in the base class and subclasses of VectorStore. This is especially relevant with VectorStores that require a distance metric upon initialization. Note that many of the current implementations of `_similarity_search_with_relevance_scores` are not technically correct, as they just return `self.similarity_search_with_score(query, k, **kwargs)` without applying the relevance score function. Also includes changes associated with: https://github.com/hwchase17/langchain/pull/6564 and https://github.com/hwchase17/langchain/pull/6494. See the more in-depth discussion in the thread in #6494. Issue: https://github.com/hwchase17/langchain/issues/6526 https://github.com/hwchase17/langchain/issues/6481 https://github.com/hwchase17/langchain/issues/6346 Dependencies: None The changes include: - Properly handling score thresholding in FAISS `similarity_search_with_score_by_vector` for the corresponding distance metric. - Refactoring the `_similarity_search_with_relevance_scores` method in the base class and removing it from the subclasses for incorrectly implemented subclasses. - Adding a `_select_relevance_score_fn` method in the base class and implementing it in the subclasses to select the appropriate relevance score function based on the distance strategy. - Updating the `__init__` methods of the subclasses to set the `relevance_score_fn` attribute. - Removing the `_default_relevance_score_fn` function from the FAISS class and using the base class's `_euclidean_relevance_score_fn` instead. - Adding the `DistanceStrategy` enum to the `utils.py` file and updating the imports in the vector store classes. - Updating the tests to import the `DistanceStrategy` enum from the `utils.py` file. --------- Co-authored-by: Hanit <37485638+hanit-com@users.noreply.github.com>
2025-09-13 05:25:07 +00:00 · 2023-07-10 20:37:03 -07:00
parent bd0c6381f5
commit 5171c3bcca
18 changed files with 504 additions and 149 deletions
--- a/tests/integration_tests/vectorstores/test_chroma.py
+++ b/tests/integration_tests/vectorstores/test_chroma.py
@@ -228,3 +228,42 @@ def test_chroma_update_document() -> None:
    ]
    assert new_embedding == embedding.embed_documents([updated_content])[0]
    assert new_embedding != old_embedding
+
+
+def test_chroma_with_relevance_score() -> None:
+    """Test that Chroma relevance scores for an l2 collection are scaled to 0-1."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": str(i)} for i in range(len(texts))]
+    docsearch = Chroma.from_texts(
+        collection_name="test_collection",
+        texts=texts,
+        embedding=FakeEmbeddings(),
+        metadatas=metadatas,
+        collection_metadata={"hnsw:space": "l2"},
+    )
+    output = docsearch.similarity_search_with_relevance_scores("foo", k=3)
+    assert output == [
+        (Document(page_content="foo", metadata={"page": "0"}), 1.0),
+        (Document(page_content="bar", metadata={"page": "1"}), 0.8),
+        (Document(page_content="baz", metadata={"page": "2"}), 0.5),
+    ]
+
+
+def test_chroma_with_relevance_score_custom_normalization_fn() -> None:
+    """Test that a user-supplied relevance_score_fn is applied to every score."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": str(i)} for i in range(len(texts))]
+    docsearch = Chroma.from_texts(
+        collection_name="test_collection",
+        texts=texts,
+        embedding=FakeEmbeddings(),
+        metadatas=metadatas,
+        relevance_score_fn=lambda d: d * 0,
+        collection_metadata={"hnsw:space": "l2"},
+    )
+    output = docsearch.similarity_search_with_relevance_scores("foo", k=3)
+    assert output == [
+        (Document(page_content="foo", metadata={"page": "0"}), -0.0),
+        (Document(page_content="bar", metadata={"page": "1"}), -0.0),
+        (Document(page_content="baz", metadata={"page": "2"}), -0.0),
+    ]
--- a/tests/integration_tests/vectorstores/test_faiss.py
+++ b/tests/integration_tests/vectorstores/test_faiss.py
@@ -195,3 +195,14 @@ def test_faiss_invalid_normalize_fn() -> None:
    )
    with pytest.warns(Warning, match="scores must be between"):
        docsearch.similarity_search_with_relevance_scores("foo", k=1)
+
+
+def test_missing_normalize_score_fn() -> None:
+    """Test that distance_strategy='fake' raises ValueError on build or search."""
+    with pytest.raises(ValueError):
+        texts = ["foo", "bar", "baz"]
+        faiss_instance = FAISS.from_texts(
+            texts, FakeEmbeddings(), distance_strategy="fake"
+        )
+
+        faiss_instance.similarity_search_with_relevance_scores("foo", k=2)
--- a/tests/integration_tests/vectorstores/test_pgvector.py
+++ b/tests/integration_tests/vectorstores/test_pgvector.py
@@ -184,3 +184,70 @@ def test_pgvector_with_filter_in_set() -> None:
        (Document(page_content="foo", metadata={"page": "0"}), 0.0),
        (Document(page_content="baz", metadata={"page": "2"}), 0.0013003906671379406),
    ]
+
+
+def test_pgvector_relevance_score() -> None:
+    """Test that PGVector relevance scores are scaled to the 0-1 range."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": str(i)} for i in range(len(texts))]
+    docsearch = PGVector.from_texts(
+        texts=texts,
+        collection_name="test_collection",
+        embedding=FakeEmbeddingsWithAdaDimension(),
+        metadatas=metadatas,
+        connection_string=CONNECTION_STRING,
+        pre_delete_collection=True,
+    )
+
+    output = docsearch.similarity_search_with_relevance_scores("foo", k=3)
+    assert output == [
+        (Document(page_content="foo", metadata={"page": "0"}), 1.0),
+        (Document(page_content="bar", metadata={"page": "1"}), 0.9996744261675065),
+        (Document(page_content="baz", metadata={"page": "2"}), 0.9986996093328621),
+    ]
+
+
+def test_pgvector_retriever_search_threshold() -> None:
+    """Test that a 0.999 score threshold keeps foo and bar and drops baz."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": str(i)} for i in range(len(texts))]
+    docsearch = PGVector.from_texts(
+        texts=texts,
+        collection_name="test_collection",
+        embedding=FakeEmbeddingsWithAdaDimension(),
+        metadatas=metadatas,
+        connection_string=CONNECTION_STRING,
+        pre_delete_collection=True,
+    )
+
+    retriever = docsearch.as_retriever(
+        search_type="similarity_score_threshold",
+        search_kwargs={"k": 3, "score_threshold": 0.999},
+    )
+    output = retriever.get_relevant_documents("summer")
+    assert output == [
+        Document(page_content="foo", metadata={"page": "0"}),
+        Document(page_content="bar", metadata={"page": "1"}),
+    ]
+
+
+def test_pgvector_retriever_search_threshold_custom_normalization_fn() -> None:
+    """Test that a zeroing relevance_score_fn leaves nothing above a 0.5 threshold."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": str(i)} for i in range(len(texts))]
+    docsearch = PGVector.from_texts(
+        texts=texts,
+        collection_name="test_collection",
+        embedding=FakeEmbeddingsWithAdaDimension(),
+        metadatas=metadatas,
+        connection_string=CONNECTION_STRING,
+        pre_delete_collection=True,
+        relevance_score_fn=lambda d: d * 0,
+    )
+
+    retriever = docsearch.as_retriever(
+        search_type="similarity_score_threshold",
+        search_kwargs={"k": 3, "score_threshold": 0.5},
+    )
+    output = retriever.get_relevant_documents("foo")
+    assert output == []
--- a/tests/integration_tests/vectorstores/test_pinecone.py
+++ b/tests/integration_tests/vectorstores/test_pinecone.py
@@ -2,6 +2,7 @@ import importlib
 import os
 import time
 import uuid
+import numpy as np
 from typing import List

 import pinecone
@@ -154,3 +155,21 @@ class TestPinecone:
        time.sleep(20)
        index_stats = self.index.describe_index_stats()
        assert index_stats["total_vector_count"] == len(texts) * 2
+
+    @pytest.mark.vcr()
+    def test_relevance_score_bound(self, embedding_openai: OpenAIEmbeddings) -> None:
+        """Ensure every relevance score is in [0, 1], allowing float error at 1."""
+        texts = ["foo", "bar", "baz"]
+        metadatas = [{"page": i} for i in range(len(texts))]
+        docsearch = Pinecone.from_texts(
+            texts,
+            embedding_openai,
+            index_name=index_name,
+            metadatas=metadatas,
+        )
+        # wait for the index to be ready
+        time.sleep(20)
+        output = docsearch.similarity_search_with_relevance_scores("foo", k=3)
+        assert all(
+            (1 >= score or np.isclose(score, 1)) and score >= 0 for _, score in output
+        )
--- a/tests/integration_tests/vectorstores/test_qdrant.py
+++ b/tests/integration_tests/vectorstores/test_qdrant.py
@@ -1,6 +1,7 @@
 """Test Qdrant functionality."""
 import tempfile
 from typing import Callable, Optional
+import numpy as np

 import pytest
 from qdrant_client.http import models as rest
@@ -513,3 +514,26 @@ def test_qdrant_add_texts_stores_embeddings_as_named_vectors(vector_name: str) -
        vector_name in point.vector  # type: ignore[operator]
        for point in client.scroll(collection_name, with_vectors=True)[0]
    )
+
+
+@pytest.mark.parametrize("batch_size", [1, 64])
+@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"])
+@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"])
+def test_qdrant_similarity_search_with_relevance_scores(
+    batch_size: int, content_payload_key: str, metadata_payload_key: str
+) -> None:
+    """Test that every relevance score is in [0, 1], allowing float error at 1."""
+    texts = ["foo", "bar", "baz"]
+    docsearch = Qdrant.from_texts(
+        texts,
+        ConsistentFakeEmbeddings(),
+        location=":memory:",
+        content_payload_key=content_payload_key,
+        metadata_payload_key=metadata_payload_key,
+        batch_size=batch_size,
+    )
+    output = docsearch.similarity_search_with_relevance_scores("foo", k=3)
+
+    assert all(
+        (1 >= score or np.isclose(score, 1)) and score >= 0 for _, score in output
+    )
--- a/tests/integration_tests/vectorstores/test_singlestoredb.py
+++ b/tests/integration_tests/vectorstores/test_singlestoredb.py
@@ -5,7 +5,8 @@ import numpy as np
 import pytest

 from langchain.docstore.document import Document
-from langchain.vectorstores.singlestoredb import DistanceStrategy, SingleStoreDB
+from langchain.vectorstores.singlestoredb import SingleStoreDB
+from langchain.vectorstores.utils import DistanceStrategy
 from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings

 TEST_SINGLESTOREDB_URL = "root:pass@localhost:3306/db"