Compare commits

...

1 Commits

Author SHA1 Message Date
Prerit Das
dc9b080b9e Chroma: Catch and handle NotEnoughElementsException (#3368)
## Problem and Solution

This PR solves #1793, which is more of a convenience for users than
anything else. When using Chroma as a vectorstore, if you try to run
similarity search with a `k` value that is larger than the number of
documents stored in the vectorstore, Chroma will raise a
`chromadb.errors.NotEnoughElementsException`.

The workaround is to add a new parameter, `find_highest_possible_k`, to
all similarity search methods of the `Chroma` class. It is an optional
boolean that defaults to `True` (which changes the default behavior). If
this parameter is set to `False`, the methods will behave exactly as
they did before this PR.

If the parameter is `True`, however, the method will try running
similarity search with the given `k`, and if
`chromadb.errors.NotEnoughElementsException` is raised, iteratively
lower `k` (down to `k=1`) until the error is no longer raised.

The following is an example of how this is implemented in the
`Chroma.similarity_search` method.


e0846c2bca/langchain/vectorstores/chroma.py (L127-L159)

We add the `find_highest_possible_k` parameter, typed as `Optional[bool]`
and defaulting to `True`, and explain it briefly in the docstring. We wrap
the previous similarity search logic inside a private local function that
takes `k`. If `find_highest_possible_k` is `False`, we return the result
of calling that private function with the given `k`, retaining the
previous behavior. If it is `True`, which it is by default, we
iteratively lower `k` (down to 1) until we can retrieve `k` documents
from the Chroma vectorstore.

## Example

You create a `Chroma` object from 1 document. You then run
`.similarity_search()`, `.similarity_search_by_vector()`, or
`.similarity_search_with_score()`. If you only pass a query, the default
`k` is `4`. All three methods would previously raise a
`chromadb.errors.NotEnoughElementsException`.

Now, however, all methods will return one document, the document inside
the vectorstore (unless you're filtering, setting a maximum distance,
etc.).

## Note

I didn't find any places in the documentation to mention this change,
other than the example Jupyter notebook for the Chroma vectorstore. In
that notebook, there was never a cell running similarity search with
parameters. If it's important to include information on altering the
`find_highest_possible_k` parameter, I'll happily document it wherever.
2023-04-22 18:55:18 -07:00

View File

@@ -128,6 +128,7 @@ class Chroma(VectorStore):
self,
query: str,
k: int = 4,
find_highest_possible_k: Optional[bool] = True,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Document]:
@@ -136,18 +137,32 @@ class Chroma(VectorStore):
Args:
query (str): Query text to search for.
k (int): Number of results to return. Defaults to 4.
find_highest_possible_k (Optional[bool], True): If True, will iteratively lower k
until there are enough items in the vectorstore to not raise an Error.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List[Document]: List of documents most similar to the query text.
"""
docs_and_scores = self.similarity_search_with_score(query, k, filter=filter)
return [doc for doc, _ in docs_and_scores]
def _similarity_search(k: int):
docs_and_scores = self.similarity_search_with_score(query, k, filter=filter)
return [doc for doc, _ in docs_and_scores]
if not find_highest_possible_k:
return _similarity_search(k=k)
# Iteratively lower k until an error isn't raised by Chroma
for try_k in range(k, 0, -1):
try:
return _similarity_search(k=try_k)
except chromadb.errors.NotEnoughElementsException:
continue
def similarity_search_by_vector(
self,
embedding: List[float],
k: int = 4,
find_highest_possible_k: Optional[bool] = True,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Document]:
@@ -155,18 +170,32 @@ class Chroma(VectorStore):
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
find_highest_possible_k (Optional[bool], True): If True, will iteratively lower k
until there are enough items in the vectorstore to not raise an Error.
Returns:
List of Documents most similar to the query vector.
"""
results = self._collection.query(
query_embeddings=embedding, n_results=k, where=filter
)
return _results_to_docs(results)
def _similarity_search(k: int):
results = self._collection.query(
query_embeddings=embedding, n_results=k, where=filter
)
return _results_to_docs(results)
if not find_highest_possible_k:
return _similarity_search(k=k)
# Iteratively lower k until an error isn't raised by Chroma
for try_k in range(k, 0, -1):
try:
return _similarity_search(k=try_k)
except chromadb.errors.NotEnoughElementsException:
continue
def similarity_search_with_score(
self,
query: str,
k: int = 4,
find_highest_possible_k: Optional[bool] = True,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
@@ -175,23 +204,36 @@ class Chroma(VectorStore):
Args:
query (str): Query text to search for.
k (int): Number of results to return. Defaults to 4.
find_highest_possible_k (Optional[bool], True): If True, will iteratively lower k
until there are enough items in the vectorstore to not raise an Error.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List[Tuple[Document, float]]: List of documents most similar to the query
text with distance in float.
"""
if self._embedding_function is None:
results = self._collection.query(
query_texts=[query], n_results=k, where=filter
)
else:
query_embedding = self._embedding_function.embed_query(query)
results = self._collection.query(
query_embeddings=[query_embedding], n_results=k, where=filter
)
def _similarity_search(k: int):
if self._embedding_function is None:
results = self._collection.query(
query_texts=[query], n_results=k, where=filter
)
else:
query_embedding = self._embedding_function.embed_query(query)
results = self._collection.query(
query_embeddings=[query_embedding], n_results=k, where=filter
)
return _results_to_docs_and_scores(results)
return _results_to_docs_and_scores(results)
if not find_highest_possible_k:
return _similarity_search(k=k)
# Iteratively lower k until an error isn't raised by Chroma
for try_k in range(k, 0, -1):
try:
return _similarity_search(k=try_k)
except chromadb.errors.NotEnoughElementsException:
continue
def max_marginal_relevance_search_by_vector(
self,