fixed similarity_search_with_score to really use a score

Updated unit test with a test for score threshold. Updated demo notebook.
2025-10-22 01:32:24 +00:00 · 2023-08-28 22:26:55 -07:00
parent 1b6947e56c
commit 8b8d2a6535
3 changed files with 67 additions and 44 deletions
--- a/libs/langchain/langchain/vectorstores/vectara.py
+++ b/libs/langchain/langchain/vectorstores/vectara.py
@@ -245,6 +245,7 @@ class Vectara(VectorStore):
        k: int = 5,
        lambda_val: float = 0.025,
        filter: Optional[str] = None,
+        score_threshold: Optional[float] = None,
        n_sentence_context: int = 2,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
@@ -258,6 +259,8 @@ class Vectara(VectorStore):
                filter can be "doc.rating > 3.0 and part.lang = 'deu'"} see
                https://docs.vectara.com/docs/search-apis/sql/filter-overview
                for more details.
+            score_threshold: minimum score threshold for the result.
+                If defined, results with score less than this value will be filtered out.
            n_sentence_context: number of sentences before/after the matching segment
                to add, defaults to 2

@@ -305,7 +308,10 @@ class Vectara(VectorStore):

        result = response.json()

-        responses = result["responseSet"][0]["response"]
+        if score_threshold:
+            responses = [r for r in result["responseSet"][0]["response"] if r["score"] > score_threshold]
+        else:
+            responses = result["responseSet"][0]["response"]
        documents = result["responseSet"][0]["document"]

        metadatas = []
@@ -316,7 +322,7 @@ class Vectara(VectorStore):
            md.update(doc_md)
            metadatas.append(md)

-        docs = [
+        docs_with_score = [
            (
                Document(
                    page_content=x["text"],
@@ -327,7 +333,7 @@ class Vectara(VectorStore):
            for x, md in zip(responses, metadatas)
        ]

-        return docs
+        return docs_with_score

    def similarity_search(
        self,
@@ -358,6 +364,7 @@ class Vectara(VectorStore):
            k=k,
            lambda_val=lambda_val,
            filter=filter,
+            score_threshold=None,
            n_sentence_context=n_sentence_context,
            **kwargs,
        )
--- a/libs/langchain/tests/integration_tests/vectorstores/test_vectara.py
+++ b/libs/langchain/tests/integration_tests/vectorstores/test_vectara.py
@@ -68,6 +68,19 @@ def test_vectara_add_documents() -> None:
    assert output2[0].page_content == "retrieval augmented generation"
    assert output2[0].metadata["abbr"] == "rag"

+    # test without filter but with similarity score
+    # this is similar to the first test, but given the score threshold
+    # we only get one result
+    output3 = docsearch.similarity_search_with_score(
+        "large language model",
+        k=2,
+        score_threshold=0.1,
+        n_sentence_context=0,
+    )
+    assert len(output3) == 1
+    assert output3[0][0].page_content == "large language model"
+    assert output3[0][0].metadata["abbr"] == "llm"
+
    for doc_id in doc_ids:
        docsearch._delete_doc(doc_id)