From 5171c3bccaf8642135a20e558eb8468ccbfcc682 Mon Sep 17 00:00:00 2001
From: Raymond Yuan
Date: Mon, 10 Jul 2023 20:37:03 -0700
Subject: [PATCH] Refactor vector storage to correctly handle relevancy scores
 (#6570)

Description: This pull request supports generating correct, generic relevance scores for different vector stores by refactoring the relevance score functions and their selection in the base class and subclasses of VectorStore. This is especially relevant for VectorStores that require a distance metric upon initialization.

Note that many of the current implementations of `_similarity_search_with_relevance_scores` are not technically correct, as they just return `self.similarity_search_with_score(query, k, **kwargs)` without applying the relevance score function (see the usage sketch below).

Also includes changes associated with: https://github.com/hwchase17/langchain/pull/6564 and https://github.com/hwchase17/langchain/pull/6494

See the more in-depth discussion in the thread in #6494.

Issue:
https://github.com/hwchase17/langchain/issues/6526
https://github.com/hwchase17/langchain/issues/6481
https://github.com/hwchase17/langchain/issues/6346

Dependencies: None

The changes include:
- Properly handling score thresholding in FAISS `similarity_search_with_score_by_vector` for the corresponding distance metric.
- Refactoring the `_similarity_search_with_relevance_scores` method in the base class and removing it from the subclasses that implemented it incorrectly.
- Adding a `_select_relevance_score_fn` method in the base class and implementing it in the subclasses to select the appropriate relevance score function based on the distance strategy.
- Updating the `__init__` methods of the subclasses to set the `relevance_score_fn` attribute.
- Removing the `_default_relevance_score_fn` function from the FAISS class and using the base class's `_euclidean_relevance_score_fn` instead.
- Adding the `DistanceStrategy` enum to the `utils.py` file and updating the imports in the vector store classes.
- Updating the tests to import the `DistanceStrategy` enum from the `utils.py` file.
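For illustration, a minimal usage sketch of the refactored behavior (the toy corpus, embedding model, and threshold value below are hypothetical examples, not part of this patch):

    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.vectorstores.faiss import FAISS
    from langchain.vectorstores.utils import DistanceStrategy

    # Hypothetical corpus; any Embeddings implementation would work here.
    db = FAISS.from_texts(
        ["foo", "bar", "baz"],
        OpenAIEmbeddings(),
        distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
    )

    # The base class now routes raw scores through the function returned by
    # `_select_relevance_score_fn` (here the base class's
    # `_max_inner_product_relevance_score_fn`) instead of returning them raw.
    docs_and_scores = db.similarity_search_with_relevance_scores("foo", k=2)

    # Score thresholding is metric-aware: for MAX_INNER_PRODUCT (and JACCARD)
    # larger raw scores are better, so the threshold acts as a floor
    # (operator.ge); for EUCLIDEAN_DISTANCE it remains a ceiling (operator.le).
    hits = db.similarity_search_with_score("foo", k=3, score_threshold=0.4)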
--------- Co-authored-by: Hanit <37485638+hanit-com@users.noreply.github.com> --- langchain/vectorstores/analyticdb.py | 27 +----- langchain/vectorstores/base.py | 89 ++++++++++++++++--- langchain/vectorstores/cassandra.py | 20 +---- langchain/vectorstores/chroma.py | 63 +++++++++++-- langchain/vectorstores/faiss.py | 86 +++++++++++++----- langchain/vectorstores/pgvector.py | 37 +++++++- langchain/vectorstores/pinecone.py | 33 +++++-- langchain/vectorstores/qdrant.py | 26 ++++++ langchain/vectorstores/redis.py | 46 +++++----- langchain/vectorstores/singlestoredb.py | 25 +++--- langchain/vectorstores/utils.py | 9 ++ langchain/vectorstores/weaviate.py | 29 ++---- .../vectorstores/test_chroma.py | 39 ++++++++ .../vectorstores/test_faiss.py | 11 +++ .../vectorstores/test_pgvector.py | 67 ++++++++++++++ .../vectorstores/test_pinecone.py | 19 ++++ .../vectorstores/test_qdrant.py | 24 +++++ .../vectorstores/test_singlestoredb.py | 3 +- 18 files changed, 504 insertions(+), 149 deletions(-) diff --git a/langchain/vectorstores/analyticdb.py b/langchain/vectorstores/analyticdb.py index 86917b628fa..2fed04753c4 100644 --- a/langchain/vectorstores/analyticdb.py +++ b/langchain/vectorstores/analyticdb.py @@ -3,7 +3,7 @@ from __future__ import annotations import logging import uuid -from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Type +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Type from sqlalchemy import REAL, Column, String, Table, create_engine, insert, text from sqlalchemy.dialects.postgresql import ARRAY, JSON, TEXT @@ -79,6 +79,9 @@ class AnalyticDB(VectorStore): self.engine = create_engine(self.connection_string, **_engine_args) self.create_collection() + def _select_relevance_score_fn(self) -> Callable[[float], float]: + return self._euclidean_relevance_score_fn + def create_table_if_not_exists(self) -> None: # Define the dynamic table Table( @@ -242,28 +245,6 @@ class AnalyticDB(VectorStore): ) return docs - def _similarity_search_with_relevance_scores( - self, - query: str, - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs and relevance scores in the range [0, 1]. - - 0 is dissimilar, 1 is most similar. - - Args: - query: input text - k: Number of Documents to return. Defaults to 4. - **kwargs: kwargs to be passed to similarity search. 
Should include: - score_threshold: Optional, a floating point value between 0 to 1 to - filter the resulting set of retrieved docs - - Returns: - List of Tuples of (doc, similarity_score) - """ - return self.similarity_search_with_score(query, k, **kwargs) - def similarity_search_with_score_by_vector( self, embedding: List[float], diff --git a/langchain/vectorstores/base.py b/langchain/vectorstores/base.py index 346108617d4..d529e253abf 100644 --- a/langchain/vectorstores/base.py +++ b/langchain/vectorstores/base.py @@ -3,11 +3,13 @@ from __future__ import annotations import asyncio +import math import warnings from abc import ABC, abstractmethod from functools import partial from typing import ( Any, + Callable, ClassVar, Collection, Dict, @@ -137,6 +139,81 @@ class VectorStore(ABC): ) -> List[Document]: """Return docs most similar to query.""" + @staticmethod + def _euclidean_relevance_score_fn(distance: float) -> float: + """Return a similarity score on a scale [0, 1].""" + # The 'correct' relevance function + # may differ depending on a few things, including: + # - the distance / similarity metric used by the VectorStore + # - the scale of your embeddings (OpenAI's are unit normed. Many + # others are not!) + # - embedding dimensionality + # - etc. + # This function converts the euclidean norm of normalized embeddings + # (0 is most similar, sqrt(2) most dissimilar) + # to a similarity function (0 to 1) + return 1.0 - distance / math.sqrt(2) + + @staticmethod + def _cosine_relevance_score_fn(distance: float) -> float: + """Normalize the distance to a score on a scale [0, 1].""" + + return 1.0 - distance + + @staticmethod + def _max_inner_product_relevance_score_fn(distance: float) -> float: + """Normalize the distance to a score on a scale [0, 1].""" + if distance > 0: + return 1.0 - distance + + return -1.0 * distance + + def _select_relevance_score_fn(self) -> Callable[[float], float]: + """ + The 'correct' relevance function + may differ depending on a few things, including: + - the distance / similarity metric used by the VectorStore + - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) + - embedding dimensionality + - etc. + + Vector stores should define their own method for selecting the appropriate relevance score function. + """ + raise NotImplementedError + + def similarity_search_with_score( + self, *args: Any, **kwargs: Any + ) -> List[Tuple[Document, float]]: + """Run similarity search with distance.""" + raise NotImplementedError + + def _similarity_search_with_relevance_scores( + self, + query: str, + k: int = 4, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """ + Default similarity search with relevance scores. Modify if necessary + in subclass. + Return docs and relevance scores in the range [0, 1]. + + 0 is dissimilar, 1 is most similar. + + Args: + query: input text + k: Number of Documents to return. Defaults to 4. + **kwargs: kwargs to be passed to similarity search.
Should include: + score_threshold: Optional, a floating point value between 0 and 1 to + filter the resulting set of retrieved docs + + Returns: + List of Tuples of (doc, similarity_score) + """ + relevance_score_fn = self._select_relevance_score_fn() + docs_and_scores = self.similarity_search_with_score(query, k, **kwargs) + return [(doc, relevance_score_fn(score)) for doc, score in docs_and_scores] + def similarity_search_with_relevance_scores( self, query: str, @@ -183,18 +260,6 @@ class VectorStore(ABC): ) return docs_and_similarities - def _similarity_search_with_relevance_scores( - self, - query: str, - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs and relevance scores, normalized on a scale from 0 to 1. - - 0 is dissimilar, 1 is most similar. - """ - raise NotImplementedError - async def asimilarity_search_with_relevance_scores( self, query: str, k: int = 4, **kwargs: Any ) -> List[Tuple[Document, float]]: diff --git a/langchain/vectorstores/cassandra.py b/langchain/vectorstores/cassandra.py index 6b752c76aa4..aba22f44dc4 100644 --- a/langchain/vectorstores/cassandra.py +++ b/langchain/vectorstores/cassandra.py @@ -3,7 +3,7 @@ from __future__ import annotations import typing import uuid -from typing import Any, Iterable, List, Optional, Tuple, Type, TypeVar +from typing import Any, Callable, Iterable, List, Optional, Tuple, Type, TypeVar import numpy as np @@ -77,6 +77,9 @@ class Cassandra(VectorStore): primary_key_type="TEXT", ) + def _select_relevance_score_fn(self) -> Callable[[float], float]: + return self._cosine_relevance_score_fn + def delete_collection(self) -> None: """ Just an alias for `clear` @@ -268,21 +271,6 @@ class Cassandra(VectorStore): k, ) - # Even though this is a `_`-method, - # it is apparently used by VectorSearch parent class - # in an exposed method (`similarity_search_with_relevance_scores`). - # So we implement it (hmm).
- def _similarity_search_with_relevance_scores( - self, - query: str, - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - return self.similarity_search_with_score( - query, - k, - ) - def max_marginal_relevance_search_by_vector( self, embedding: List[float], diff --git a/langchain/vectorstores/chroma.py b/langchain/vectorstores/chroma.py index c9eca713725..246d6e2d02e 100644 --- a/langchain/vectorstores/chroma.py +++ b/langchain/vectorstores/chroma.py @@ -3,7 +3,17 @@ from __future__ import annotations import logging import uuid -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Tuple, + Type, +) import numpy as np @@ -64,6 +74,7 @@ class Chroma(VectorStore): client_settings: Optional[chromadb.config.Settings] = None, collection_metadata: Optional[Dict] = None, client: Optional[chromadb.Client] = None, + relevance_score_fn: Optional[Callable[[float], float]] = None, ) -> None: """Initialize with Chroma client.""" try: @@ -100,6 +111,7 @@ class Chroma(VectorStore): else None, metadata=collection_metadata, ) + self.override_relevance_score_fn = relevance_score_fn @xor_args(("query_texts", "query_embeddings")) def __query_collection( @@ -250,13 +262,37 @@ class Chroma(VectorStore): return _results_to_docs_and_scores(results) - def _similarity_search_with_relevance_scores( - self, - query: str, - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - return self.similarity_search_with_score(query, k, **kwargs) + def _select_relevance_score_fn(self) -> Callable[[float], float]: + """ + The 'correct' relevance function + may differ depending on a few things, including: + - the distance / similarity metric used by the VectorStore + - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) + - embedding dimensionality + - etc. + """ + if self.override_relevance_score_fn: + return self.override_relevance_score_fn + + distance = "l2" + distance_key = "hnsw:space" + metadata = self._collection.metadata + + if metadata and distance_key in metadata: + distance = metadata[distance_key] + + if distance == "cosine": + return self._cosine_relevance_score_fn + elif distance == "l2": + return self._euclidean_relevance_score_fn + elif distance == "ip": + return self._max_inner_product_relevance_score_fn + else: + raise ValueError( + "No supported normalization function" + f" for distance metric of type: {distance}." + "Consider providing relevance_score_fn to Chroma constructor." + ) def max_marginal_relevance_search_by_vector( self, @@ -428,6 +464,7 @@ class Chroma(VectorStore): persist_directory: Optional[str] = None, client_settings: Optional[chromadb.config.Settings] = None, client: Optional[chromadb.Client] = None, + collection_metadata: Optional[Dict] = None, **kwargs: Any, ) -> Chroma: """Create a Chroma vectorstore from a raw documents. @@ -443,6 +480,8 @@ class Chroma(VectorStore): metadatas (Optional[List[dict]]): List of metadatas. Defaults to None. ids (Optional[List[str]]): List of document IDs. Defaults to None. client_settings (Optional[chromadb.config.Settings]): Chroma client settings + collection_metadata (Optional[Dict]): Collection configurations. + Defaults to None. Returns: Chroma: Chroma vectorstore. 
@@ -453,6 +492,8 @@ class Chroma(VectorStore): persist_directory=persist_directory, client_settings=client_settings, client=client, + collection_metadata=collection_metadata, + **kwargs, ) chroma_collection.add_texts(texts=texts, metadatas=metadatas, ids=ids) return chroma_collection @@ -467,6 +508,7 @@ class Chroma(VectorStore): persist_directory: Optional[str] = None, client_settings: Optional[chromadb.config.Settings] = None, client: Optional[chromadb.Client] = None, # Add this line + collection_metadata: Optional[Dict] = None, **kwargs: Any, ) -> Chroma: """Create a Chroma vectorstore from a list of documents. @@ -481,6 +523,9 @@ class Chroma(VectorStore): documents (List[Document]): List of documents to add to the vectorstore. embedding (Optional[Embeddings]): Embedding function. Defaults to None. client_settings (Optional[chromadb.config.Settings]): Chroma client settings + collection_metadata (Optional[Dict]): Collection configurations. + Defaults to None. + Returns: Chroma: Chroma vectorstore. """ @@ -495,6 +540,8 @@ class Chroma(VectorStore): persist_directory=persist_directory, client_settings=client_settings, client=client, + collection_metadata=collection_metadata, + **kwargs, ) def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> None: diff --git a/langchain/vectorstores/faiss.py b/langchain/vectorstores/faiss.py index bd5eca018e2..74b5d747414 100644 --- a/langchain/vectorstores/faiss.py +++ b/langchain/vectorstores/faiss.py @@ -1,10 +1,11 @@ """Wrapper around FAISS vector database.""" from __future__ import annotations -import math +import operator import os import pickle import uuid +import warnings from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple @@ -15,7 +16,7 @@ from langchain.docstore.document import Document from langchain.docstore.in_memory import InMemoryDocstore from langchain.embeddings.base import Embeddings from langchain.vectorstores.base import VectorStore -from langchain.vectorstores.utils import maximal_marginal_relevance +from langchain.vectorstores.utils import DistanceStrategy, maximal_marginal_relevance def dependable_faiss_import(no_avx2: Optional[bool] = None) -> Any: @@ -45,20 +46,6 @@ def dependable_faiss_import(no_avx2: Optional[bool] = None) -> Any: return faiss -def _default_relevance_score_fn(score: float) -> float: - """Return a similarity score on a scale [0, 1].""" - # The 'correct' relevance function - # may differ depending on a few things, including: - # - the distance / similarity metric used by the VectorStore - # - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) - # - embedding dimensionality - # - etc. - # This function converts the euclidean norm of normalized embeddings - # (0 is most similar, sqrt(2) most dissimilar) - # to a similarity function (0 to 1) - return 1.0 - score / math.sqrt(2) - - class FAISS(VectorStore): """Wrapper around FAISS vector database. 
@@ -78,16 +65,27 @@ class FAISS(VectorStore): index: Any, docstore: Docstore, index_to_docstore_id: Dict[int, str], - relevance_score_fn: Callable[[float], float] = _default_relevance_score_fn, + relevance_score_fn: Optional[Callable[[float], float]] = None, normalize_L2: bool = False, + distance_strategy: DistanceStrategy = DistanceStrategy.EUCLIDEAN_DISTANCE, ): """Initialize with necessary components.""" self.embedding_function = embedding_function self.index = index self.docstore = docstore self.index_to_docstore_id = index_to_docstore_id - self.relevance_score_fn = relevance_score_fn + self.distance_strategy = distance_strategy + self.override_relevance_score_fn = relevance_score_fn self._normalize_L2 = normalize_L2 + if ( + self.distance_strategy != DistanceStrategy.EUCLIDEAN_DISTANCE + and self._normalize_L2 + ): + warnings.warn( + "Normalizing L2 is not applicable for metric type: {strategy}".format( + strategy=self.distance_strategy + ) + ) def __add( self, @@ -227,10 +225,16 @@ class FAISS(VectorStore): score_threshold = kwargs.get("score_threshold") if score_threshold is not None: + cmp = ( + operator.ge + if self.distance_strategy + in (DistanceStrategy.MAX_INNER_PRODUCT, DistanceStrategy.JACCARD) + else operator.le + ) docs = [ (doc, similarity) for doc, similarity in docs - if similarity >= score_threshold + if cmp(similarity, score_threshold) ] return docs[:k] @@ -498,9 +502,16 @@ class FAISS(VectorStore): **kwargs: Any, ) -> FAISS: faiss = dependable_faiss_import() - index = faiss.IndexFlatL2(len(embeddings[0])) + distance_strategy = kwargs.get( + "distance_strategy", DistanceStrategy.EUCLIDEAN_DISTANCE + ) + if distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT: + index = faiss.IndexFlatIP(len(embeddings[0])) + else: + # Default to L2, currently other metric types not initialized. + index = faiss.IndexFlatL2(len(embeddings[0])) vector = np.array(embeddings, dtype=np.float32) - if normalize_L2: + if normalize_L2 and distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE: faiss.normalize_L2(vector) index.add(vector) documents = [] @@ -646,6 +657,31 @@ class FAISS(VectorStore): embeddings.embed_query, index, docstore, index_to_docstore_id, **kwargs ) + def _select_relevance_score_fn(self) -> Callable[[float], float]: + """ + The 'correct' relevance function + may differ depending on a few things, including: + - the distance / similarity metric used by the VectorStore + - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) + - embedding dimensionality + - etc. 
+ """ + if self.override_relevance_score_fn is not None: + return self.override_relevance_score_fn + + # Default strategy is to rely on distance strategy provided in + # vectorstore constructor + if self.distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT: + return self._max_inner_product_relevance_score_fn + elif self.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE: + # Default behavior is to use euclidean distance relevancy + return self._euclidean_relevance_score_fn + else: + raise ValueError( + "Unknown distance strategy, must be cosine, max_inner_product," + " or euclidean" + ) + def _similarity_search_with_relevance_scores( self, query: str, @@ -655,6 +691,12 @@ class FAISS(VectorStore): **kwargs: Any, ) -> List[Tuple[Document, float]]: """Return docs and their similarity scores on a scale from 0 to 1.""" + relevance_score_fn = self._select_relevance_score_fn() + if relevance_score_fn is None: + raise ValueError( + "normalize_score_fn must be provided to" + " FAISS constructor to normalize scores" + ) docs_and_scores = self.similarity_search_with_score( query, k=k, @@ -662,4 +704,4 @@ class FAISS(VectorStore): fetch_k=fetch_k, **kwargs, ) - return [(doc, self.relevance_score_fn(score)) for doc, score in docs_and_scores] + return [(doc, relevance_score_fn(score)) for doc, score in docs_and_scores] diff --git a/langchain/vectorstores/pgvector.py b/langchain/vectorstores/pgvector.py index 5654db5f715..e3fcb3f04a5 100644 --- a/langchain/vectorstores/pgvector.py +++ b/langchain/vectorstores/pgvector.py @@ -4,7 +4,7 @@ from __future__ import annotations import enum import logging import uuid -from typing import Any, Dict, Iterable, List, Optional, Tuple, Type +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type import sqlalchemy from sqlalchemy.dialects.postgresql import JSON, UUID @@ -121,6 +121,7 @@ class PGVector(VectorStore): distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, pre_delete_collection: bool = False, logger: Optional[logging.Logger] = None, + relevance_score_fn: Optional[Callable[[float], float]] = None, ) -> None: self.connection_string = connection_string self.embedding_function = embedding_function @@ -129,6 +130,7 @@ class PGVector(VectorStore): self._distance_strategy = distance_strategy self.pre_delete_collection = pre_delete_collection self.logger = logger or logging.getLogger(__name__) + self.override_relevance_score_fn = relevance_score_fn self.__post_init__() def __post_init__( @@ -201,6 +203,11 @@ class PGVector(VectorStore): pre_delete_collection: bool = False, **kwargs: Any, ) -> PGVector: + if ids is None: + ids = [str(uuid.uuid1()) for _ in texts] + + if not metadatas: + metadatas = [{} for _ in texts] connection_string = cls.get_connection_string(kwargs) store = cls( @@ -209,6 +216,7 @@ class PGVector(VectorStore): embedding_function=embedding, distance_strategy=distance_strategy, pre_delete_collection=pre_delete_collection, + **kwargs, ) store.add_embeddings( @@ -590,3 +598,30 @@ class PGVector(VectorStore): ) -> str: """Return connection string from database parameters.""" return f"postgresql+{driver}://{user}:{password}@{host}:{port}/{database}" + + def _select_relevance_score_fn(self) -> Callable[[float], float]: + """ + The 'correct' relevance function + may differ depending on a few things, including: + - the distance / similarity metric used by the VectorStore + - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) + - embedding dimensionality + - etc. 
+ """ + if self.override_relevance_score_fn is not None: + return self.override_relevance_score_fn + + # Default strategy is to rely on distance strategy provided + # in vectorstore constructor + if self.distance_strategy == DistanceStrategy.COSINE: + return self._cosine_relevance_score_fn + elif self.distance_strategy == DistanceStrategy.EUCLIDEAN: + return self._euclidean_relevance_score_fn + elif self.distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT: + return self._max_inner_product_relevance_score_fn + else: + raise ValueError( + "No supported normalization function" + f" for distance_strategy of {self.distance_strategy}." + "Consider providing relevance_score_fn to PGVector constructor." + ) diff --git a/langchain/vectorstores/pinecone.py b/langchain/vectorstores/pinecone.py index 2b5c5bf2ecc..baeb69f20df 100644 --- a/langchain/vectorstores/pinecone.py +++ b/langchain/vectorstores/pinecone.py @@ -10,7 +10,7 @@ import numpy as np from langchain.docstore.document import Document from langchain.embeddings.base import Embeddings from langchain.vectorstores.base import VectorStore -from langchain.vectorstores.utils import maximal_marginal_relevance +from langchain.vectorstores.utils import DistanceStrategy, maximal_marginal_relevance logger = logging.getLogger(__name__) @@ -40,6 +40,7 @@ class Pinecone(VectorStore): index: Any, embedding_function: Callable, text_key: str, + distance_strategy: Optional[DistanceStrategy] = DistanceStrategy.COSINE, ): """Initialize with Pinecone client.""" try: @@ -57,6 +58,7 @@ class Pinecone(VectorStore): self._index = index self._embedding_function = embedding_function self._text_key = text_key + self.distance_strategy = distance_strategy def add_texts( self, @@ -147,14 +149,27 @@ class Pinecone(VectorStore): ) return [doc for doc, _ in docs_and_scores] - def _similarity_search_with_relevance_scores( - self, - query: str, - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - kwargs.pop("score_threshold", None) - return self.similarity_search_with_score(query, k, **kwargs) + def _select_relevance_score_fn(self) -> Callable[[float], float]: + """ + The 'correct' relevance function + may differ depending on a few things, including: + - the distance / similarity metric used by the VectorStore + - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) + - embedding dimensionality + - etc. 
+ """ + + if self.distance_strategy == DistanceStrategy.COSINE: + return self._cosine_relevance_score_fn + elif self.distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT: + return self._max_inner_product_relevance_score_fn + elif self.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE: + return self._euclidean_relevance_score_fn + else: + raise ValueError( + "Unknown distance strategy, must be cosine, max_inner_product " + "(dot product), or euclidean" + ) def max_marginal_relevance_search_by_vector( self, diff --git a/langchain/vectorstores/qdrant.py b/langchain/vectorstores/qdrant.py index c53d46c368e..bfc4ffbd742 100644 --- a/langchain/vectorstores/qdrant.py +++ b/langchain/vectorstores/qdrant.py @@ -61,6 +61,7 @@ class Qdrant(VectorStore): embeddings: Optional[Embeddings] = None, content_payload_key: str = CONTENT_KEY, metadata_payload_key: str = METADATA_KEY, + distance_strategy: str = "COSINE", vector_name: Optional[str] = VECTOR_NAME, embedding_function: Optional[Callable] = None, # deprecated ): @@ -112,6 +113,8 @@ class Qdrant(VectorStore): self._embeddings_function = embeddings self.embeddings = None + self.distance_strategy = distance_strategy.upper() + def add_texts( self, texts: Iterable[str], @@ -419,6 +422,28 @@ class Qdrant(VectorStore): for result in results ] + def _select_relevance_score_fn(self) -> Callable[[float], float]: + """ + The 'correct' relevance function + may differ depending on a few things, including: + - the distance / similarity metric used by the VectorStore + - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) + - embedding dimensionality + - etc. + """ + + if self.distance_strategy == "COSINE": + return self._cosine_relevance_score_fn + elif self.distance_strategy == "DOT": + return self._max_inner_product_relevance_score_fn + elif self.distance_strategy == "EUCLID": + return self._euclidean_relevance_score_fn + else: + raise ValueError( + "Unknown distance strategy, must be cosine, " + "max_inner_product, or euclidean" + ) + def _similarity_search_with_relevance_scores( self, query: str, @@ -723,6 +748,7 @@ class Qdrant(VectorStore): embeddings=embedding, content_payload_key=content_payload_key, metadata_payload_key=metadata_payload_key, + distance_strategy=distance_func, vector_name=vector_name, ) diff --git a/langchain/vectorstores/redis.py b/langchain/vectorstores/redis.py index 20eecf7ee23..f166d010650 100644 --- a/langchain/vectorstores/redis.py +++ b/langchain/vectorstores/redis.py @@ -121,9 +121,8 @@ class Redis(VectorStore): content_key: str = "content", metadata_key: str = "metadata", vector_key: str = "content_vector", - relevance_score_fn: Optional[ - Callable[[float], float] - ] = _default_relevance_score, + relevance_score_fn: Optional[Callable[[float], float]] = None, + distance_metric: REDIS_DISTANCE_METRICS = "COSINE", **kwargs: Any, ): """Initialize with necessary components.""" @@ -149,11 +148,23 @@ class Redis(VectorStore): self.content_key = content_key self.metadata_key = metadata_key self.vector_key = vector_key + self.distance_metric = distance_metric self.relevance_score_fn = relevance_score_fn - def _create_index( - self, dim: int = 1536, distance_metric: REDIS_DISTANCE_METRICS = "COSINE" - ) -> None: + def _select_relevance_score_fn(self) -> Callable[[float], float]: + if self.relevance_score_fn: + return self.relevance_score_fn + + if self.distance_metric == "COSINE": + return self._cosine_relevance_score_fn + elif self.distance_metric == "IP": + return 
self._max_inner_product_relevance_score_fn + elif self.distance_metric == "L2": + return self._euclidean_relevance_score_fn + else: + return _default_relevance_score + + def _create_index(self, dim: int = 1536) -> None: try: from redis.commands.search.field import TextField, VectorField from redis.commands.search.indexDefinition import IndexDefinition, IndexType @@ -175,7 +186,7 @@ class Redis(VectorStore): { "TYPE": "FLOAT32", "DIM": dim, - "DISTANCE_METRIC": distance_metric, + "DISTANCE_METRIC": self.distance_metric, }, ), ) @@ -347,24 +358,6 @@ class Redis(VectorStore): return docs - def _similarity_search_with_relevance_scores( - self, - query: str, - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs and relevance scores, normalized on a scale from 0 to 1. - - 0 is dissimilar, 1 is most similar. - """ - if self.relevance_score_fn is None: - raise ValueError( - "relevance_score_fn must be provided to" - " Redis constructor to normalize scores" - ) - docs_and_scores = self.similarity_search_with_score(query, k=k) - return [(doc, self.relevance_score_fn(score)) for doc, score in docs_and_scores] - @classmethod def from_texts_return_keys( cls, @@ -413,6 +406,7 @@ class Redis(VectorStore): content_key=content_key, metadata_key=metadata_key, vector_key=vector_key, + distance_metric=distance_metric, **kwargs, ) @@ -420,7 +414,7 @@ class Redis(VectorStore): embeddings = embedding.embed_documents(texts) # Create the search index - instance._create_index(dim=len(embeddings[0]), distance_metric=distance_metric) + instance._create_index(dim=len(embeddings[0])) # Add data to Redis keys = instance.add_texts(texts, metadatas, embeddings) diff --git a/langchain/vectorstores/singlestoredb.py b/langchain/vectorstores/singlestoredb.py index fd10358352d..209d17cbd81 100644 --- a/langchain/vectorstores/singlestoredb.py +++ b/langchain/vectorstores/singlestoredb.py @@ -2,9 +2,18 @@ from __future__ import annotations -import enum import json -from typing import Any, ClassVar, Collection, Iterable, List, Optional, Tuple, Type +from typing import ( + Any, + Callable, + ClassVar, + Collection, + Iterable, + List, + Optional, + Tuple, + Type, +) from sqlalchemy.pool import QueuePool @@ -15,14 +24,7 @@ from langchain.callbacks.manager import ( from langchain.docstore.document import Document from langchain.embeddings.base import Embeddings from langchain.vectorstores.base import VectorStore, VectorStoreRetriever - - -class DistanceStrategy(str, enum.Enum): - """Enumerator of the Distance strategies for SingleStoreDB.""" - - EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE" - DOT_PRODUCT = "DOT_PRODUCT" - +from langchain.vectorstores.utils import DistanceStrategy DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.DOT_PRODUCT @@ -211,6 +213,9 @@ class SingleStoreDB(VectorStore): ) self._create_table() + def _select_relevance_score_fn(self) -> Callable[[float], float]: + return self._max_inner_product_relevance_score_fn + def _create_table(self: SingleStoreDB) -> None: """Create table if it doesn't exist.""" conn = self.connection_pool.connect() diff --git a/langchain/vectorstores/utils.py b/langchain/vectorstores/utils.py index aead758b3e0..a44560e3911 100644 --- a/langchain/vectorstores/utils.py +++ b/langchain/vectorstores/utils.py @@ -1,5 +1,6 @@ """Utility functions for working with vectors and vectorstores.""" +from enum import Enum from typing import List import numpy as np @@ -7,6 +8,14 @@ import numpy as np from langchain.math_utils import cosine_similarity +class DistanceStrategy(str, 
Enum): + EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE" + MAX_INNER_PRODUCT = "MAX_INNER_PRODUCT" + DOT_PRODUCT = "DOT_PRODUCT" + JACCARD = "JACCARD" + COSINE = "COSINE" + + def maximal_marginal_relevance( query_embedding: np.ndarray, embedding_list: list, diff --git a/langchain/vectorstores/weaviate.py b/langchain/vectorstores/weaviate.py index f623082e415..16d5bc15733 100644 --- a/langchain/vectorstores/weaviate.py +++ b/langchain/vectorstores/weaviate.py @@ -113,11 +113,18 @@ class Weaviate(VectorStore): self._embedding = embedding self._text_key = text_key self._query_attrs = [self._text_key] - self._relevance_score_fn = relevance_score_fn + self.relevance_score_fn = relevance_score_fn self._by_text = by_text if attributes is not None: self._query_attrs.extend(attributes) + def _select_relevance_score_fn(self) -> Callable[[float], float]: + return ( + self.relevance_score_fn + if self.relevance_score_fn + else _default_score_normalizer + ) + def add_texts( self, texts: Iterable[str], @@ -361,26 +368,6 @@ class Weaviate(VectorStore): docs_and_scores.append((Document(page_content=text, metadata=res), score)) return docs_and_scores - def _similarity_search_with_relevance_scores( - self, - query: str, - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs and relevance scores, normalized on a scale from 0 to 1. - - 0 is dissimilar, 1 is most similar. - """ - if self._relevance_score_fn is None: - raise ValueError( - "relevance_score_fn must be provided to" - " Weaviate constructor to normalize scores" - ) - docs_and_scores = self.similarity_search_with_score(query, k=k, **kwargs) - return [ - (doc, self._relevance_score_fn(score)) for doc, score in docs_and_scores - ] - @classmethod def from_texts( cls: Type[Weaviate], diff --git a/tests/integration_tests/vectorstores/test_chroma.py b/tests/integration_tests/vectorstores/test_chroma.py index f19164e5207..80be89596ee 100644 --- a/tests/integration_tests/vectorstores/test_chroma.py +++ b/tests/integration_tests/vectorstores/test_chroma.py @@ -228,3 +228,42 @@ def test_chroma_update_document() -> None: ] assert new_embedding == embedding.embed_documents([updated_content])[0] assert new_embedding != old_embedding + + +def test_chroma_with_relevance_score() -> None: + """Test to make sure the relevance score is scaled to 0-1.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = Chroma.from_texts( + collection_name="test_collection", + texts=texts, + embedding=FakeEmbeddings(), + metadatas=metadatas, + collection_metadata={"hnsw:space": "l2"}, + ) + output = docsearch.similarity_search_with_relevance_scores("foo", k=3) + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}), 1.0), + (Document(page_content="bar", metadata={"page": "1"}), 0.8), + (Document(page_content="baz", metadata={"page": "2"}), 0.5), + ] + + +def test_chroma_with_relevance_score_custom_normalization_fn() -> None: + """Test searching with relevance score and custom normalization function.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = Chroma.from_texts( + collection_name="test_collection", + texts=texts, + embedding=FakeEmbeddings(), + metadatas=metadatas, + relevance_score_fn=lambda d: d * 0, + collection_metadata={"hnsw:space": "l2"}, + ) + output = docsearch.similarity_search_with_relevance_scores("foo", k=3) + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}), -0.0), + 
(Document(page_content="bar", metadata={"page": "1"}), -0.0), + (Document(page_content="baz", metadata={"page": "2"}), -0.0), + ] diff --git a/tests/integration_tests/vectorstores/test_faiss.py b/tests/integration_tests/vectorstores/test_faiss.py index 7dc5ac5884b..e19f6e08b59 100644 --- a/tests/integration_tests/vectorstores/test_faiss.py +++ b/tests/integration_tests/vectorstores/test_faiss.py @@ -195,3 +195,14 @@ def test_faiss_invalid_normalize_fn() -> None: ) with pytest.warns(Warning, match="scores must be between"): docsearch.similarity_search_with_relevance_scores("foo", k=1) + + +def test_missing_normalize_score_fn() -> None: + """Test doesn't perform similarity search without a valid distance strategy.""" + with pytest.raises(ValueError): + texts = ["foo", "bar", "baz"] + faiss_instance = FAISS.from_texts( + texts, FakeEmbeddings(), distance_strategy="fake" + ) + + faiss_instance.similarity_search_with_relevance_scores("foo", k=2) diff --git a/tests/integration_tests/vectorstores/test_pgvector.py b/tests/integration_tests/vectorstores/test_pgvector.py index b8d3314d811..46c8f11e19a 100644 --- a/tests/integration_tests/vectorstores/test_pgvector.py +++ b/tests/integration_tests/vectorstores/test_pgvector.py @@ -184,3 +184,70 @@ def test_pgvector_with_filter_in_set() -> None: (Document(page_content="foo", metadata={"page": "0"}), 0.0), (Document(page_content="baz", metadata={"page": "2"}), 0.0013003906671379406), ] + + +def test_pgvector_relevance_score() -> None: + """Test to make sure the relevance score is scaled to 0-1.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = PGVector.from_texts( + texts=texts, + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + + output = docsearch.similarity_search_with_relevance_scores("foo", k=3) + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}), 1.0), + (Document(page_content="bar", metadata={"page": "1"}), 0.9996744261675065), + (Document(page_content="baz", metadata={"page": "2"}), 0.9986996093328621), + ] + + +def test_pgvector_retriever_search_threshold() -> None: + """Test using retriever for searching with threshold.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = PGVector.from_texts( + texts=texts, + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + + retriever = docsearch.as_retriever( + search_type="similarity_score_threshold", + search_kwargs={"k": 3, "score_threshold": 0.999}, + ) + output = retriever.get_relevant_documents("summer") + assert output == [ + Document(page_content="foo", metadata={"page": "0"}), + Document(page_content="bar", metadata={"page": "1"}), + ] + + +def test_pgvector_retriever_search_threshold_custom_normalization_fn() -> None: + """Test searching with threshold and custom normalization function""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = PGVector.from_texts( + texts=texts, + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + relevance_score_fn=lambda d: d * 0, + ) + + retriever = docsearch.as_retriever( + 
search_type="similarity_score_threshold", + search_kwargs={"k": 3, "score_threshold": 0.5}, + ) + output = retriever.get_relevant_documents("foo") + assert output == [] diff --git a/tests/integration_tests/vectorstores/test_pinecone.py b/tests/integration_tests/vectorstores/test_pinecone.py index d66ddf9a760..238f46f6ccb 100644 --- a/tests/integration_tests/vectorstores/test_pinecone.py +++ b/tests/integration_tests/vectorstores/test_pinecone.py @@ -2,6 +2,7 @@ import importlib import os import time import uuid +import numpy as np from typing import List import pinecone @@ -154,3 +155,21 @@ class TestPinecone: time.sleep(20) index_stats = self.index.describe_index_stats() assert index_stats["total_vector_count"] == len(texts) * 2 + + @pytest.mark.vcr() + def test_relevance_score_bound(self, embedding_openai: OpenAIEmbeddings) -> None: + """Ensures all relevance scores are between 0 and 1.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = Pinecone.from_texts( + texts, + embedding_openai, + index_name=index_name, + metadatas=metadatas, + ) + # wait for the index to be ready + time.sleep(20) + output = docsearch.similarity_search_with_relevance_scores("foo", k=3) + assert all( + (1 >= score or np.isclose(score, 1)) and score >= 0 for _, score in output + ) diff --git a/tests/integration_tests/vectorstores/test_qdrant.py b/tests/integration_tests/vectorstores/test_qdrant.py index 20b590ff335..de779dadf85 100644 --- a/tests/integration_tests/vectorstores/test_qdrant.py +++ b/tests/integration_tests/vectorstores/test_qdrant.py @@ -1,6 +1,7 @@ """Test Qdrant functionality.""" import tempfile from typing import Callable, Optional +import numpy as np import pytest from qdrant_client.http import models as rest @@ -513,3 +514,26 @@ def test_qdrant_add_texts_stores_embeddings_as_named_vectors(vector_name: str) - vector_name in point.vector # type: ignore[operator] for point in client.scroll(collection_name, with_vectors=True)[0] ) + + +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"]) +@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"]) +def test_qdrant_similarity_search_with_relevance_scores( + batch_size: int, content_payload_key: str, metadata_payload_key: str +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + location=":memory:", + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + batch_size=batch_size, + ) + output = docsearch.similarity_search_with_relevance_scores("foo", k=3) + + assert all( + (1 >= score or np.isclose(score, 1)) and score >= 0 for _, score in output + ) diff --git a/tests/integration_tests/vectorstores/test_singlestoredb.py b/tests/integration_tests/vectorstores/test_singlestoredb.py index f01ab1d7a49..b53ed8ba272 100644 --- a/tests/integration_tests/vectorstores/test_singlestoredb.py +++ b/tests/integration_tests/vectorstores/test_singlestoredb.py @@ -5,7 +5,8 @@ import numpy as np import pytest from langchain.docstore.document import Document -from langchain.vectorstores.singlestoredb import DistanceStrategy, SingleStoreDB +from langchain.vectorstores.singlestoredb import SingleStoreDB +from langchain.vectorstores.utils import DistanceStrategy from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings TEST_SINGLESTOREDB_URL = 
"root:pass@localhost:3306/db"