partners[chroma]: add retrieval of embedding vectors (#28290)

This PR adds an additional method to `Chroma` to retrieve the embedding
vectors, besides the most relevant Documents. This is sometimes of use
when you need to run a postprocessing algorithm on the retrieved results
based on the vectors, which has been the case for me lately.

Example issue (discussion) requesting this change:
https://github.com/langchain-ai/langchain/discussions/20383

---------

Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
Massimiliano Pronesti 2024-11-27 16:34:02 +00:00 committed by GitHub
parent 733a6ad328
commit 83586661d6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 74 additions and 0 deletions

View File

@ -53,6 +53,17 @@ def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
]
def _results_to_docs_and_vectors(results: Any) -> List[Tuple[Document, np.ndarray]]:
return [
(Document(page_content=result[0], metadata=result[1] or {}), result[2])
for result in zip(
results["documents"][0],
results["metadatas"][0],
results["embeddings"][0],
)
]
Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]
@ -687,6 +698,51 @@ class Chroma(VectorStore):
return _results_to_docs_and_scores(results)
def similarity_search_with_vectors(
self,
query: str,
k: int = DEFAULT_K,
filter: Optional[Dict[str, str]] = None,
where_document: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Tuple[Document, np.ndarray]]:
"""Run similarity search with Chroma with vectors.
Args:
query: Query text to search for.
k: Number of results to return. Defaults to 4.
filter: Filter by metadata. Defaults to None.
where_document: dict used to filter by the documents.
E.g. {$contains: {"text": "hello"}}.
kwargs: Additional keyword arguments to pass to Chroma collection query.
Returns:
List of documents most similar to the query text and
embedding vectors for each.
"""
include = ["documents", "metadatas", "embeddings"]
if self._embedding_function is None:
results = self.__query_collection(
query_texts=[query],
n_results=k,
where=filter,
where_document=where_document,
include=include,
**kwargs,
)
else:
query_embedding = self._embedding_function.embed_query(query)
results = self.__query_collection(
query_embeddings=[query_embedding],
n_results=k,
where=filter,
where_document=where_document,
include=include,
**kwargs,
)
return _results_to_docs_and_vectors(results)
def _select_relevance_score_fn(self) -> Callable[[float], float]:
"""Select the relevance score function based on collections distance metric.

View File

@ -92,6 +92,24 @@ def test_chroma_with_metadatas_with_scores() -> None:
assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)]
def test_chroma_with_metadatas_with_vectors() -> None:
"""Test end to end construction and scored search."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": str(i)} for i in range(len(texts))]
embeddings = ConsistentFakeEmbeddings()
docsearch = Chroma.from_texts(
collection_name="test_collection",
texts=texts,
embedding=embeddings,
metadatas=metadatas,
)
vec_1 = embeddings.embed_query(texts[0])
output = docsearch.similarity_search_with_vectors("foo", k=1)
docsearch.delete_collection()
assert output[0][0] == Document(page_content="foo", metadata={"page": "0"})
assert (output[0][1] == vec_1).all()
def test_chroma_with_metadatas_with_scores_using_vector() -> None:
"""Test end to end construction and scored search, using embedding vector."""
texts = ["foo", "bar", "baz"]