From dc9b080b9e697fb54f6e45a6f585d74a14b060d9 Mon Sep 17 00:00:00 2001 From: Prerit Das Date: Sat, 22 Apr 2023 21:55:18 -0400 Subject: [PATCH] Chroma: Catch and handle `NotEnoughElementsException` (#3368) ## Problem and Solution This PR solves #1793, which is more of a convenience for users than anything else. When using Chroma as a vectorstore, if you try to run similarity search with a `k` value that is larger than the number of documents stored in the vectorstore, Chroma will raise a `chromadb.errors.NotEnoughElementsException`. The workaround is to add a new parameter in all similarity search methods under the `Chroma` class called `find_highest_possible_k`, an optional boolean parameter that defaults to True (changes default behavior). If this parameter is set to `False`, the methods will behave exactly as they did before this PR. If the parameter is `True`, however, the method will try running similarity search with the given `k`, and if `chromadb.errors.NotEnoughElementsException` is raised, iteratively lower `k` (down to `k=1`) until the error is no longer raised. The following is an example of how this is implemented in the `Chroma.similarity_search` method. https://github.com/preritdas/langchain/blob/e0846c2bcaafa4f54a193a6a7dfa8ed46480c326/langchain/vectorstores/chroma.py#L127-L159 We add the `find_highest_possible_k` parameter as an `Optional` boolean that defaults to True. We explain it briefly in the docstring. We wrap the previous similarity search logic inside a private local function that takes `k`. If `find_highest_possible_k` is False, we return the result of calling that private function with the given `k`, retaining previous behavior. If it is True, which it is by default, we iteratively lower `k` (until it is 1) until we can find `k` documents from the Chroma vectorstore. ## Example You create a `Chroma` object from 1 document. You then run `.similarity_search()`, `.similarity_search_by_vector()`, or `.similarity_search_with_score()`. If you only pass a query, the default `k` is `4`. 
All methods would previously raise a `chromadb.errors.NotEnoughElementsException`. Now, however, all methods will return one document, the document inside the vectorstore (unless you're filtering, setting a maximum distance, etc.). ## Note I didn't find any places in the documentation to mention this change, other than the example Jupyter notebook for the Chroma vectorstore. In that notebook, there was never a cell running similarity search with parameters. If it's important to include information on altering the `find_highest_possible_k` parameter, I'll happily document it wherever. --- langchain/vectorstores/chroma.py | 74 +++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 16 deletions(-) diff --git a/langchain/vectorstores/chroma.py b/langchain/vectorstores/chroma.py index 7d29dbe5d72..19617947048 100644 --- a/langchain/vectorstores/chroma.py +++ b/langchain/vectorstores/chroma.py @@ -128,6 +128,7 @@ class Chroma(VectorStore): self, query: str, k: int = 4, + find_highest_possible_k: Optional[bool] = True, filter: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> List[Document]: @@ -136,18 +137,32 @@ class Chroma(VectorStore): Args: query (str): Query text to search for. k (int): Number of results to return. Defaults to 4. + find_highest_possible_k (Optional[bool], True): If True, will iteratively lower k + until there are enough items in the vectorstore to not raise an Error. filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. Returns: List[Document]: List of documents most similar to the query text. 
""" - docs_and_scores = self.similarity_search_with_score(query, k, filter=filter) - return [doc for doc, _ in docs_and_scores] + def _similarity_search(k: int): + docs_and_scores = self.similarity_search_with_score(query, k, filter=filter) + return [doc for doc, _ in docs_and_scores] + if not find_highest_possible_k: + return _similarity_search(k=k) + + # Iteratively lower k until an error isn't raised by Chroma + for try_k in range(k, 0, -1): + try: + return _similarity_search(k=try_k) + except chromadb.errors.NotEnoughElementsException: + continue + def similarity_search_by_vector( self, embedding: List[float], k: int = 4, + find_highest_possible_k: Optional[bool] = True, filter: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> List[Document]: @@ -155,18 +170,32 @@ class Chroma(VectorStore): Args: embedding: Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. + find_highest_possible_k (Optional[bool], True): If True, will iteratively lower k + until there are enough items in the vectorstore to not raise an Error. Returns: List of Documents most similar to the query vector. 
""" - results = self._collection.query( - query_embeddings=embedding, n_results=k, where=filter - ) - return _results_to_docs(results) + def _similarity_search(k: int): + results = self._collection.query( + query_embeddings=embedding, n_results=k, where=filter + ) + return _results_to_docs(results) + + if not find_highest_possible_k: + return _similarity_search(k=k) + + # Iteratively lower k until an error isn't raised by Chroma + for try_k in range(k, 0, -1): + try: + return _similarity_search(k=try_k) + except chromadb.errors.NotEnoughElementsException: + continue def similarity_search_with_score( self, query: str, k: int = 4, + find_highest_possible_k: Optional[bool] = True, filter: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> List[Tuple[Document, float]]: @@ -175,23 +204,36 @@ class Chroma(VectorStore): Args: query (str): Query text to search for. k (int): Number of results to return. Defaults to 4. + find_highest_possible_k (Optional[bool], True): If True, will iteratively lower k + until there are enough items in the vectorstore to not raise an Error. filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. Returns: List[Tuple[Document, float]]: List of documents most similar to the query text with distance in float. 
""" - if self._embedding_function is None: - results = self._collection.query( - query_texts=[query], n_results=k, where=filter - ) - else: - query_embedding = self._embedding_function.embed_query(query) - results = self._collection.query( - query_embeddings=[query_embedding], n_results=k, where=filter - ) + def _similarity_search(k: int): + if self._embedding_function is None: + results = self._collection.query( + query_texts=[query], n_results=k, where=filter + ) + else: + query_embedding = self._embedding_function.embed_query(query) + results = self._collection.query( + query_embeddings=[query_embedding], n_results=k, where=filter + ) - return _results_to_docs_and_scores(results) + return _results_to_docs_and_scores(results) + + if not find_highest_possible_k: + return _similarity_search(k=k) + + # Iteratively lower k until an error isn't raised by Chroma + for try_k in range(k, 0, -1): + try: + return _similarity_search(k=try_k) + except chromadb.errors.NotEnoughElementsException: + continue def max_marginal_relevance_search_by_vector( self,