mirror of
https://github.com/hwchase17/langchain.git
synced 2026-04-25 01:16:55 +00:00
Chroma: Catch and handle NotEnoughElementsException (#3368)
## Problem and Solution
This PR solves #1793, which is more of a convenience for users than
anything else. When using Chroma as a vectorstore, if you try to run
similarity search with a `k` value that is larger than the number of
documents stored in the vectorstore, Chroma will raise a
`chromadb.errors.NotEnoughElementsException`.
The workaround is to add a new optional boolean parameter,
`find_highest_possible_k`, to all similarity search methods of the
`Chroma` class. It defaults to `True`, which means this PR changes the
default behavior. If this parameter is set to `False`, the methods will
behave exactly as they did before this PR.
If the parameter is `True`, however, the method will try running
similarity search with the given `k`, and if
`chromadb.errors.NotEnoughElementsException` is raised, iteratively
lower `k` (down to `k=1`) until the error is no longer raised.
The following is an example of how this is implemented in the
`Chroma.similarity_search` method.
e0846c2bca/langchain/vectorstores/chroma.py (L127-L159)
We add the `find_highest_possible_k` parameter as `Optional[bool]`,
defaulting to `True`, and explain it briefly in the docstring. We wrap the
previous similarity search logic inside a private local function that
takes `k`. If `find_highest_possible_k` is `False`, we call that private
function once with the given `k` and return its result, retaining the
previous behavior. If it is `True`, which it is by default, we lower `k`
one step at a time (down to 1) until Chroma can return `k` documents
without raising the exception.
## Example
You create a `Chroma` object from 1 document. You then run
`.similarity_search()`, `.similarity_search_by_vector()`, or
`.similarity_search_with_score()`. If you only pass a query, the default
`k` is `4`. All methods would previously raise a
`chromadb.errors.NotEnoughElementsException`.
Now, however, all methods will return one document, the document inside
the vectorstore (unless you're filtering, setting a maximum distance,
etc.).
## Note
I didn't find any places in the documentation to mention this change,
other than the example Jupyter notebook for the Chroma vectorstore. In
that notebook, there was never a cell running similarity search with
parameters. If it's important to include information on altering the
`find_highest_possible_k` parameter, I'll happily document it wherever.
This commit is contained in:
@@ -128,6 +128,7 @@ class Chroma(VectorStore):
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
find_highest_possible_k: Optional[bool] = True,
|
||||
filter: Optional[Dict[str, str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
@@ -136,18 +137,32 @@ class Chroma(VectorStore):
|
||||
Args:
|
||||
query (str): Query text to search for.
|
||||
k (int): Number of results to return. Defaults to 4.
|
||||
find_highest_possible_k (Optional[bool], True): If True, will iteratively lower k
|
||||
until there are enough items in the vectorstore to not raise an Error.
|
||||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List[Document]: List of documents most similar to the query text.
|
||||
"""
|
||||
docs_and_scores = self.similarity_search_with_score(query, k, filter=filter)
|
||||
return [doc for doc, _ in docs_and_scores]
|
||||
def _similarity_search(k: int):
|
||||
docs_and_scores = self.similarity_search_with_score(query, k, filter=filter)
|
||||
return [doc for doc, _ in docs_and_scores]
|
||||
|
||||
if not find_highest_possible_k:
|
||||
return _similarity_search(k=k)
|
||||
|
||||
# Iteratively lower k until an error isn't raised by Chroma
|
||||
for try_k in range(k, 0, -1):
|
||||
try:
|
||||
return _similarity_search(k=try_k)
|
||||
except chromadb.errors.NotEnoughElementsException:
|
||||
continue
|
||||
|
||||
def similarity_search_by_vector(
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = 4,
|
||||
find_highest_possible_k: Optional[bool] = True,
|
||||
filter: Optional[Dict[str, str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
@@ -155,18 +170,32 @@ class Chroma(VectorStore):
|
||||
Args:
|
||||
embedding: Embedding to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
find_highest_possible_k (Optional[bool], True): If True, will iteratively lower k
|
||||
until there are enough items in the vectorstore to not raise an Error.
|
||||
Returns:
|
||||
List of Documents most similar to the query vector.
|
||||
"""
|
||||
results = self._collection.query(
|
||||
query_embeddings=embedding, n_results=k, where=filter
|
||||
)
|
||||
return _results_to_docs(results)
|
||||
def _similarity_search(k: int):
|
||||
results = self._collection.query(
|
||||
query_embeddings=embedding, n_results=k, where=filter
|
||||
)
|
||||
return _results_to_docs(results)
|
||||
|
||||
if not find_highest_possible_k:
|
||||
return _similarity_search(k=k)
|
||||
|
||||
# Iteratively lower k until an error isn't raised by Chroma
|
||||
for try_k in range(k, 0, -1):
|
||||
try:
|
||||
return _similarity_search(k=try_k)
|
||||
except chromadb.errors.NotEnoughElementsException:
|
||||
continue
|
||||
|
||||
def similarity_search_with_score(
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
find_highest_possible_k: Optional[bool] = True,
|
||||
filter: Optional[Dict[str, str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
@@ -175,23 +204,36 @@ class Chroma(VectorStore):
|
||||
Args:
|
||||
query (str): Query text to search for.
|
||||
k (int): Number of results to return. Defaults to 4.
|
||||
find_highest_possible_k (Optional[bool], True): If True, will iteratively lower k
|
||||
until there are enough items in the vectorstore to not raise an Error.
|
||||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List[Tuple[Document, float]]: List of documents most similar to the query
|
||||
text with distance in float.
|
||||
"""
|
||||
if self._embedding_function is None:
|
||||
results = self._collection.query(
|
||||
query_texts=[query], n_results=k, where=filter
|
||||
)
|
||||
else:
|
||||
query_embedding = self._embedding_function.embed_query(query)
|
||||
results = self._collection.query(
|
||||
query_embeddings=[query_embedding], n_results=k, where=filter
|
||||
)
|
||||
def _similarity_search(k: int):
|
||||
if self._embedding_function is None:
|
||||
results = self._collection.query(
|
||||
query_texts=[query], n_results=k, where=filter
|
||||
)
|
||||
else:
|
||||
query_embedding = self._embedding_function.embed_query(query)
|
||||
results = self._collection.query(
|
||||
query_embeddings=[query_embedding], n_results=k, where=filter
|
||||
)
|
||||
|
||||
return _results_to_docs_and_scores(results)
|
||||
return _results_to_docs_and_scores(results)
|
||||
|
||||
if not find_highest_possible_k:
|
||||
return _similarity_search(k=k)
|
||||
|
||||
# Iteratively lower k until an error isn't raised by Chroma
|
||||
for try_k in range(k, 0, -1):
|
||||
try:
|
||||
return _similarity_search(k=try_k)
|
||||
except chromadb.errors.NotEnoughElementsException:
|
||||
continue
|
||||
|
||||
def max_marginal_relevance_search_by_vector(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user