fixed similarity_search_with_score to really use a score

updated unit test with a test for score threshold
Updated demo notebook
This commit is contained in:
Ofer Mendelevitch
2023-08-28 22:26:55 -07:00
parent 1b6947e56c
commit 8b8d2a6535
3 changed files with 67 additions and 44 deletions

View File

@@ -245,6 +245,7 @@ class Vectara(VectorStore):
k: int = 5,
lambda_val: float = 0.025,
filter: Optional[str] = None,
score_threshold: Optional[float] = None,
n_sentence_context: int = 2,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
@@ -258,6 +259,8 @@ class Vectara(VectorStore):
filter can be "doc.rating > 3.0 and part.lang = 'deu'"; see
https://docs.vectara.com/docs/search-apis/sql/filter-overview
for more details.
score_threshold: minimum score threshold for the result.
If defined, results with score less than this value will be filtered out.
n_sentence_context: number of sentences before/after the matching segment
to add, defaults to 2
@@ -305,7 +308,10 @@ class Vectara(VectorStore):
result = response.json()
responses = result["responseSet"][0]["response"]
if score_threshold:
responses = [r for r in result["responseSet"][0]["response"] if r["score"] > score_threshold]
else:
responses = result["responseSet"][0]["response"]
documents = result["responseSet"][0]["document"]
metadatas = []
@@ -316,7 +322,7 @@ class Vectara(VectorStore):
md.update(doc_md)
metadatas.append(md)
docs = [
docs_with_score = [
(
Document(
page_content=x["text"],
@@ -327,7 +333,7 @@ class Vectara(VectorStore):
for x, md in zip(responses, metadatas)
]
return docs
return docs_with_score
def similarity_search(
self,
@@ -358,6 +364,7 @@ class Vectara(VectorStore):
k=k,
lambda_val=lambda_val,
filter=filter,
score_threshold=None,
n_sentence_context=n_sentence_context,
**kwargs,
)

View File

@@ -68,6 +68,19 @@ def test_vectara_add_documents() -> None:
assert output2[0].page_content == "retrieval augmented generation"
assert output2[0].metadata["abbr"] == "rag"
# test without filter but with a score threshold
# this is similar to the first test, but given the score threshold
# we only get one result
output3 = docsearch.similarity_search_with_score(
"large language model",
k=2,
score_threshold=0.1,
n_sentence_context=0,
)
assert len(output3) == 1
assert output3[0][0].page_content == "large language model"
assert output3[0][0].metadata["abbr"] == "llm"
for doc_id in doc_ids:
docsearch._delete_doc(doc_id)