diff --git a/langchain/vectorstores/base.py b/langchain/vectorstores/base.py index f6a37286d21..bba941c051c 100644 --- a/langchain/vectorstores/base.py +++ b/langchain/vectorstores/base.py @@ -4,7 +4,7 @@ from __future__ import annotations import asyncio from abc import ABC, abstractmethod from functools import partial -from typing import Any, Dict, Iterable, List, Optional, Type, TypeVar +from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, TypeVar from pydantic import BaseModel, Field, root_validator @@ -81,6 +81,41 @@ class VectorStore(ABC): ) -> List[Document]: """Return docs most similar to query.""" + def similarity_search_with_relevance_scores( + self, + query: str, + k: int = 4, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs and relevance scores in the range [0, 1]. + + 0 is dissimilar, 1 is most similar. + """ + docs_and_similarities = self._similarity_search_with_relevance_scores( + query, k=k, **kwargs + ) + if any( + similarity < 0.0 or similarity > 1.0 + for _, similarity in docs_and_similarities + ): + raise ValueError( + "Relevance scores must be between" + f" 0 and 1, got {docs_and_similarities}" + ) + return docs_and_similarities + + def _similarity_search_with_relevance_scores( + self, + query: str, + k: int = 4, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs and relevance scores, normalized on a scale from 0 to 1. + + 0 is dissimilar, 1 is most similar. + """ + raise NotImplementedError + async def asimilarity_search( self, query: str, k: int = 4, **kwargs: Any ) -> List[Document]: diff --git a/langchain/vectorstores/faiss.py b/langchain/vectorstores/faiss.py index 4727a555903..27f7a2ba9dd 100644 --- a/langchain/vectorstores/faiss.py +++ b/langchain/vectorstores/faiss.py @@ -1,6 +1,7 @@ """Wrapper around FAISS vector database.""" from __future__ import annotations +import math import pickle import uuid from pathlib import Path @@ -29,6 +30,20 @@ def dependable_faiss_import() -> Any: return faiss +def _default_relevance_score_fn(score: float) -> float: + """Return a similarity score on a scale [0, 1].""" + # The 'correct' relevance function + # may differ depending on a few things, including: + # - the distance / similarity metric used by the VectorStore + # - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) + # - embedding dimensionality + # - etc. + # This function converts the euclidean norm of normalized embeddings + # (0 is most similar, sqrt(2) most dissimilar) + # to a similarity function (0 to 1) + return 1.0 - score / math.sqrt(2) + + class FAISS(VectorStore): """Wrapper around FAISS vector database. @@ -48,12 +63,16 @@ class FAISS(VectorStore): index: Any, docstore: Docstore, index_to_docstore_id: Dict[int, str], + relevance_score_fn: Optional[ + Callable[[float], float] + ] = _default_relevance_score_fn, ): """Initialize with necessary components.""" self.embedding_function = embedding_function self.index = index self.docstore = docstore self.index_to_docstore_id = index_to_docstore_id + self.relevance_score_fn = relevance_score_fn def __add( self, @@ -318,7 +337,7 @@ class FAISS(VectorStore): docstore = InMemoryDocstore( {index_to_id[i]: doc for i, doc in enumerate(documents)} ) - return cls(embedding.embed_query, index, docstore, index_to_id) + return cls(embedding.embed_query, index, docstore, index_to_id, **kwargs) @classmethod def from_texts( @@ -346,7 +365,13 @@ class FAISS(VectorStore): faiss = FAISS.from_texts(texts, embeddings) """ embeddings = embedding.embed_documents(texts) - return cls.__from(texts, embeddings, embedding, metadatas, **kwargs) + return cls.__from( + texts, + embeddings, + embedding, + metadatas, + **kwargs, + ) @classmethod def from_embeddings( @@ -375,7 +400,13 @@ class FAISS(VectorStore): """ texts = [t[0] for t in text_embeddings] embeddings = [t[1] for t in text_embeddings] - return cls.__from(texts, embeddings, embedding, metadatas, **kwargs) + return cls.__from( + texts, + embeddings, + embedding, + metadatas, + **kwargs, + ) def save_local(self, folder_path: str, index_name: str = "index") -> None: """Save FAISS index, docstore, and index_to_docstore_id to disk. @@ -421,3 +452,18 @@ class FAISS(VectorStore): with open(path / "{index_name}.pkl".format(index_name=index_name), "rb") as f: docstore, index_to_docstore_id = pickle.load(f) return cls(embeddings.embed_query, index, docstore, index_to_docstore_id) + + def _similarity_search_with_relevance_scores( + self, + query: str, + k: int = 4, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs and their similarity scores on a scale from 0 to 1.""" + if self.relevance_score_fn is None: + raise ValueError( + "normalize_score_fn must be provided to" + " FAISS constructor to normalize scores" + ) + docs_and_scores = self.similarity_search_with_score(query, k=k) + return [(doc, self.relevance_score_fn(score)) for doc, score in docs_and_scores] diff --git a/tests/integration_tests/vectorstores/test_faiss.py b/tests/integration_tests/vectorstores/test_faiss.py index d1fc9c5e886..2da963cded4 100644 --- a/tests/integration_tests/vectorstores/test_faiss.py +++ b/tests/integration_tests/vectorstores/test_faiss.py @@ -1,4 +1,5 @@ """Test FAISS functionality.""" +import math import tempfile import pytest @@ -109,3 +110,37 @@ def test_faiss_local_save_load() -> None: docsearch.save_local(temp_file.name) new_docsearch = FAISS.load_local(temp_file.name, FakeEmbeddings()) assert new_docsearch.index is not None + + +def test_faiss_similarity_search_with_relevance_scores() -> None: + """Test the similarity search with normalized similarities.""" + texts = ["foo", "bar", "baz"] + docsearch = FAISS.from_texts( + texts, + FakeEmbeddings(), + normalize_score_fn=lambda score: 1.0 - score / math.sqrt(2), + ) + outputs = docsearch.similarity_search_with_relevance_scores("foo", k=1) + output, score = outputs[0] + assert output == Document(page_content="foo") + assert score == 1.0 + + +def test_faiss_invalid_normalize_fn() -> None: + """Test the similarity search with normalized similarities.""" + texts = ["foo", "bar", "baz"] + docsearch = FAISS.from_texts( + texts, FakeEmbeddings(), normalize_score_fn=lambda _: 2.0 + ) + with pytest.raises( + ValueError, match="Normalized similarity scores must be between 0 and 1" + ): + docsearch.similarity_search_with_relevance_scores("foo", k=1) + + +def test_missing_normalize_score_fn() -> None: + """Test doesn't perform similarity search without a normalize score function.""" + with pytest.raises(ValueError): + texts = ["foo", "bar", "baz"] + faiss_instance = FAISS.from_texts(texts, FakeEmbeddings()) + faiss_instance.similarity_search_with_relevance_scores("foo", k=2)