docs: Chroma docstrings update (#22001)

Thank you for contributing to LangChain!

- [X] **PR title**: "docs: Chroma docstrings update"
- Where "package" is whichever of langchain, community, core,
experimental, etc. is being modified. Use "docs: ..." for purely docs
changes, "templates: ..." for template changes, "infra: ..." for CI
changes.
  - Example: "community: add foobar LLM"


- [X] **PR message**: 
    - **Description:** Added and updated Chroma docstrings
    - **Issue:** https://github.com/langchain-ai/langchain/issues/21983


- [X] **Add tests and docs**: If you're adding a new integration, please
include
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.
  - only docs


- [X] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional
ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in
langchain.

If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
Klaudia Lemiec 2024-05-22 23:45:30 +02:00 committed by GitHub
parent 28456c2c33
commit 45351d1bc6

@@ -52,7 +52,11 @@ Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]
def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
"""Row-wise cosine similarity between two equal-width matrices."""
"""Row-wise cosine similarity between two equal-width matrices.
Raises:
ValueError: If the number of columns in X and Y are not the same.
"""
if len(X) == 0 or len(Y) == 0:
return np.array([])
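For reference, a quick sketch of the documented behavior in use (not part of the diff); the import path is an assumption, since the diff does not show the file header:

```python
import numpy as np

# Import path is an assumption; the helper is defined at module level in the diff above.
from langchain_chroma.vectorstores import cosine_similarity

X = np.array([[1.0, 0.0], [0.0, 1.0]])  # two rows, width 2
Y = np.array([[1.0, 0.0], [1.0, 1.0]])  # same width as X

print(cosine_similarity(X, Y))  # 2x2 array of row-wise cosine similarities
# A Y with a different number of columns would trigger the documented ValueError.
```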
@@ -80,7 +84,21 @@ def maximal_marginal_relevance(
lambda_mult: float = 0.5,
k: int = 4,
) -> List[int]:
"""Calculate maximal marginal relevance."""
"""Calculate maximal marginal relevance.
Args:
query_embedding: Query embedding.
embedding_list: List of embeddings to select from.
lambda_mult: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
k: Number of Documents to return. Defaults to 4.
Returns:
List of indices of embeddings selected by maximal marginal relevance.
"""
if min(k, len(embedding_list)) <= 0:
return []
if query_embedding.ndim == 1:
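A small illustrative call of the MMR helper under the same import-path assumption; the vectors are toy values:

```python
import numpy as np

from langchain_chroma.vectorstores import maximal_marginal_relevance  # path is an assumption

query = np.array([1.0, 0.0])
candidates = [
    np.array([0.9, 0.1]),  # close to the query
    np.array([0.8, 0.2]),  # close to the query and to the first candidate
    np.array([0.1, 0.9]),  # dissimilar; adds diversity
]

# lambda_mult trades similarity to the query (1.0) against diversity (0.0); k picks two indices.
print(maximal_marginal_relevance(query, candidates, lambda_mult=0.5, k=2))
```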
@@ -136,8 +154,21 @@ class Chroma(VectorStore):
relevance_score_fn: Optional[Callable[[float], float]] = None,
create_collection_if_not_exists: Optional[bool] = True,
) -> None:
"""Initialize with a Chroma client."""
"""Initialize with a Chroma client.
Args:
collection_name: Name of the collection to create.
embedding_function: Embedding class object. Used to embed texts.
persist_directory: Directory to persist the collection.
client_settings: Chroma client settings
collection_metadata: Collection configurations.
client: Chroma client. Documentation:
https://docs.trychroma.com/reference/js-client#class:-chromaclient
relevance_score_fn: Function to calculate relevance score from distance.
Used only in `similarity_search_with_relevance_scores`
create_collection_if_not_exists: Whether to create collection
if it doesn't exist. Defaults to True.
"""
if client is not None:
self._client_settings = client_settings
self._client = client
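To make the new `__init__` docstring concrete, a hedged construction sketch; the `langchain_chroma` import path and the `FakeEmbeddings` stand-in are assumptions, not part of this PR:

```python
from langchain_chroma import Chroma  # import path is an assumption
from langchain_community.embeddings import FakeEmbeddings  # placeholder vectors, illustration only

db = Chroma(
    collection_name="demo_collection",
    embedding_function=FakeEmbeddings(size=32),    # any Embeddings implementation could be used
    persist_directory="./chroma_demo",             # omit to keep the data in memory
    collection_metadata={"hnsw:space": "cosine"},  # distance metric; see _select_relevance_score_fn
    create_collection_if_not_exists=True,
)
print(db.add_texts(["hello chroma"]))  # returns the generated ID(s)
```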
@@ -204,7 +235,23 @@ class Chroma(VectorStore):
where_document: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> Union[List[Document], chromadb.QueryResult]:
"""Query the chroma collection."""
"""Query the chroma collection.
Args:
query_texts: List of query texts.
query_embeddings: List of query embeddings.
n_results: Number of results to return. Defaults to 4.
where: dict used to filter results by metadata,
e.g. {"color": "red", "price": 4.20}.
where_document: dict used to filter by document contents,
e.g. {"$contains": "hello"}.
Returns:
List of `n_results` nearest neighbor embeddings for provided
query_embeddings or query_texts.
See more: https://docs.trychroma.com/reference/py-collection#query
"""
return self._collection.query(
query_texts=query_texts,
query_embeddings=query_embeddings, # type: ignore
@@ -229,12 +276,16 @@ class Chroma(VectorStore):
"""Run more images through the embeddings and add to the vectorstore.
Args:
uris List[str]: File path to the image.
metadatas (Optional[List[dict]], optional): Optional list of metadatas.
ids (Optional[List[str]], optional): Optional list of IDs.
uris: List of file paths to the images.
metadatas: Optional list of metadatas.
When querying, you can filter on this metadata.
ids: Optional list of IDs.
Returns:
List[str]: List of IDs of the added images.
List of IDs of the added images.
Raises:
ValueError: When metadata is incorrect.
"""
# Map from uris to b64 encoded strings
b64_texts = [self.encode_image(uri=uri) for uri in uris]
@@ -312,14 +363,18 @@ class Chroma(VectorStore):
"""Run more texts through the embeddings and add to the vectorstore.
Args:
texts (Iterable[str]): Texts to add to the vectorstore.
metadatas (Optional[List[dict]], optional): Optional list of metadatas.
ids (Optional[List[str]], optional): Optional list of IDs.
texts: Texts to add to the vectorstore.
metadatas: Optional list of metadatas.
When querying, you can filter on this metadata.
ids: Optional list of IDs.
Returns:
List[str]: List of IDs of the added texts.
List of IDs of the added texts.
Raises:
ValueError: When metadata is incorrect.
"""
# TODO: Handle the case where the user doesn't provide ids on the Collection
if ids is None:
ids = [str(uuid.uuid4()) for _ in texts]
embeddings = None
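A short `add_texts` sketch matching the documented parameters (values and embeddings are illustrative stand-ins):

```python
from langchain_chroma import Chroma  # import path is an assumption
from langchain_community.embeddings import FakeEmbeddings  # placeholder vectors

db = Chroma(collection_name="notes", embedding_function=FakeEmbeddings(size=32))

ids = db.add_texts(
    texts=["chroma stores embeddings", "docstrings describe the API"],
    metadatas=[{"topic": "storage"}, {"topic": "docs"}],  # filterable at query time
    ids=["note-1", "note-2"],                             # omit to have UUIDs generated
)
print(ids)  # -> ["note-1", "note-2"]
```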
@@ -391,12 +446,12 @@ class Chroma(VectorStore):
"""Run similarity search with Chroma.
Args:
query (str): Query text to search for.
k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
query: Query text to search for.
k: Number of results to return. Defaults to 4.
filter: Filter by metadata. Defaults to None.
Returns:
List[Document]: List of documents most similar to the query text.
List of documents most similar to the query text.
"""
docs_and_scores = self.similarity_search_with_score(
query, k, filter=filter, **kwargs
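An illustrative `similarity_search` call using the documented `k` and `filter` arguments:

```python
from langchain_chroma import Chroma  # import path is an assumption
from langchain_community.embeddings import FakeEmbeddings  # placeholder vectors

db = Chroma(collection_name="notes", embedding_function=FakeEmbeddings(size=32))
db.add_texts(
    ["chroma stores embeddings", "docstrings describe the API"],
    metadatas=[{"topic": "storage"}, {"topic": "docs"}],
)

# k bounds the number of hits; filter restricts the search to matching metadata.
docs = db.similarity_search("how are embeddings stored?", k=1, filter={"topic": "storage"})
print([d.page_content for d in docs])
```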
@@ -412,10 +467,14 @@ class Chroma(VectorStore):
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to embedding vector.
Args:
embedding (List[float]): Embedding to look up documents similar to.
k (int): Number of Documents to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: Filter by metadata. Defaults to None.
where_document: dict used to filter by document contents,
e.g. {"$contains": "hello"}.
Returns:
List of Documents most similar to the query vector.
"""
@@ -441,13 +500,14 @@ class Chroma(VectorStore):
Args:
embedding (List[float]): Embedding to look up documents similar to.
k (int): Number of Documents to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
k: Number of Documents to return. Defaults to 4.
filter: Filter by metadata. Defaults to None.
where_document: dict used to filter by document contents,
e.g. {"$contains": "hello"}.
Returns:
List[Tuple[Document, float]]: List of documents most similar to
the query text and cosine distance in float for each.
Lower score represents more similarity.
List of documents most similar to the query text and relevance score
in float for each. Lower score represents more similarity.
"""
results = self.__query_collection(
query_embeddings=embedding,
@@ -469,14 +529,15 @@ class Chroma(VectorStore):
"""Run similarity search with Chroma with distance.
Args:
query (str): Query text to search for.
k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
query: Query text to search for.
k: Number of results to return. Defaults to 4.
filter: Filter by metadata. Defaults to None.
where_document: dict used to filter by document contents,
e.g. {"$contains": "hello"}.
Returns:
List[Tuple[Document, float]]: List of documents most similar to
the query text and cosine distance in float for each.
Lower score represents more similarity.
List of documents most similar to the query text and
distance in float for each. Lower score represents more similarity.
"""
if self._embedding_function is None:
results = self.__query_collection(
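A sketch of the score-returning search with a `where_document` filter (filter value and texts are illustrative):

```python
from langchain_chroma import Chroma  # import path is an assumption
from langchain_community.embeddings import FakeEmbeddings  # placeholder vectors

db = Chroma(collection_name="notes", embedding_function=FakeEmbeddings(size=32))
db.add_texts(["hello world", "goodbye world"])

# Each hit comes back with its distance; lower means closer.
# where_document restricts matches to documents containing the given text.
for doc, score in db.similarity_search_with_score(
    "greeting", k=2, where_document={"$contains": "hello"}
):
    print(score, doc.page_content)
```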
@@ -499,14 +560,21 @@ class Chroma(VectorStore):
return _results_to_docs_and_scores(results)
def _select_relevance_score_fn(self) -> Callable[[float], float]:
"""Select the relevance score function based on collections distance metric.
The most similar documents will have the lowest relevance score. Default
relevance score function is euclidean distance. The distance metric must be
provided in `collection_metadata` during initialization of the Chroma object.
Example: collection_metadata={"hnsw:space": "cosine"}. Available distance
metrics are: 'cosine', 'l2' and 'ip'.
Returns:
The relevance score function.
Raises:
ValueError: If the distance metric is not supported.
"""
The 'correct' relevance function
may differ depending on a few things, including:
- the distance / similarity metric used by the VectorStore
- the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
- embedding dimensionality
- etc.
"""
if self.override_relevance_score_fn:
return self.override_relevance_score_fn
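A sketch showing how the distance metric is pinned via `collection_metadata`, as the docstring describes; scores from the placeholder embeddings are meaningless and only illustrate the call shape:

```python
from langchain_chroma import Chroma  # import path is an assumption
from langchain_community.embeddings import FakeEmbeddings  # placeholder vectors

# The metric is fixed at collection creation time and drives which
# distance-to-relevance conversion _select_relevance_score_fn picks.
db = Chroma(
    collection_name="cosine_notes",
    embedding_function=FakeEmbeddings(size=32),
    collection_metadata={"hnsw:space": "cosine"},  # "l2" (default) and "ip" are also supported
)
db.add_texts(["first note", "second note"])

for doc, relevance in db.similarity_search_with_relevance_scores("note", k=2):
    print(relevance, doc.page_content)
```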
@@ -541,18 +609,20 @@ class Chroma(VectorStore):
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
Defaults to 20.
lambda_mult: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
filter: Filter by metadata. Defaults to None.
Returns:
List of Documents selected by maximal marginal relevance.
@@ -600,10 +670,15 @@ class Chroma(VectorStore):
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
filter: Filter by metadata. Defaults to None.
where_document: dict used to filter by document contents,
e.g. {"$contains": "hello"}.
Returns:
List of Documents selected by maximal marginal relevance.
Raises:
ValueError: If the embedding function is not provided.
"""
if self._embedding_function is None:
raise ValueError(
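An illustrative MMR search call with the documented `k`, `fetch_k`, and `lambda_mult` parameters:

```python
from langchain_chroma import Chroma  # import path is an assumption
from langchain_community.embeddings import FakeEmbeddings  # placeholder vectors

db = Chroma(collection_name="notes", embedding_function=FakeEmbeddings(size=32))
db.add_texts(["chroma basics", "chroma internals", "unrelated trivia", "more trivia"])

# fetch_k candidates are pulled first, then MMR re-ranks them down to k,
# trading similarity to the query against diversity via lambda_mult.
docs = db.max_marginal_relevance_search("chroma", k=2, fetch_k=4, lambda_mult=0.5)
print([d.page_content for d in docs])
```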
@@ -611,7 +686,7 @@ class Chroma(VectorStore):
)
embedding = self._embedding_function.embed_query(query)
docs = self.max_marginal_relevance_search_by_vector(
return self.max_marginal_relevance_search_by_vector(
embedding,
k,
fetch_k,
@@ -619,7 +694,6 @@ class Chroma(VectorStore):
filter=filter,
where_document=where_document,
)
return docs
def delete_collection(self) -> None:
"""Delete the collection."""
@@ -656,6 +730,9 @@ class Chroma(VectorStore):
Can contain `"embeddings"`, `"metadatas"`, `"documents"`.
Ids are always included.
Defaults to `["metadatas", "documents"]`. Optional.
Returns:
A dict with the keys `"ids"`, `"embeddings"`, `"metadatas"`, `"documents"`.
"""
kwargs = {
"ids": ids,
@@ -674,8 +751,8 @@ class Chroma(VectorStore):
"""Update a document in the collection.
Args:
document_id (str): ID of the document to update.
document (Document): Document to update.
document_id: ID of the document to update.
document: Document to update.
"""
return self.update_documents([document_id], [document])
@@ -684,8 +761,11 @@ class Chroma(VectorStore):
"""Update a document in the collection.
Args:
ids (List[str]): List of ids of the document to update.
documents (List[Document]): List of documents to update.
ids: List of ids of the document to update.
documents: List of documents to update.
Raises:
ValueError: If the embedding function is not provided.
"""
text = [document.page_content for document in documents]
metadata = [document.metadata for document in documents]
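A short `update_documents` sketch (ids and contents are illustrative):

```python
from langchain_chroma import Chroma  # import path is an assumption
from langchain_community.embeddings import FakeEmbeddings  # placeholder vectors
from langchain_core.documents import Document

db = Chroma(collection_name="notes", embedding_function=FakeEmbeddings(size=32))
db.add_texts(["old text"], ids=["doc-1"])

# Re-embeds the new page_content and overwrites the stored text and metadata for the id.
db.update_documents(
    ids=["doc-1"],
    documents=[Document(page_content="new text", metadata={"revised": True})],
)
print(db.get(ids=["doc-1"])["documents"])  # -> ["new text"]
```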
@@ -741,14 +821,14 @@ class Chroma(VectorStore):
Otherwise, the data will be ephemeral in-memory.
Args:
texts (List[str]): List of texts to add to the collection.
collection_name (str): Name of the collection to create.
persist_directory (Optional[str]): Directory to persist the collection.
embedding (Optional[Embeddings]): Embedding function. Defaults to None.
metadatas (Optional[List[dict]]): List of metadatas. Defaults to None.
ids (Optional[List[str]]): List of document IDs. Defaults to None.
client_settings (Optional[chromadb.config.Settings]): Chroma client settings
collection_metadata (Optional[Dict]): Collection configurations.
texts: List of texts to add to the collection.
collection_name: Name of the collection to create.
persist_directory: Directory to persist the collection.
embedding: Embedding function. Defaults to None.
metadatas: List of metadatas. Defaults to None.
ids: List of document IDs. Defaults to None.
client_settings: Chroma client settings
collection_metadata: Collection configurations.
Defaults to None.
Returns:
@@ -804,13 +884,13 @@ class Chroma(VectorStore):
Otherwise, the data will be ephemeral in-memory.
Args:
collection_name (str): Name of the collection to create.
persist_directory (Optional[str]): Directory to persist the collection.
ids (Optional[List[str]]): List of document IDs. Defaults to None.
documents (List[Document]): List of documents to add to the vectorstore.
embedding (Optional[Embeddings]): Embedding function. Defaults to None.
client_settings (Optional[chromadb.config.Settings]): Chroma client settings
collection_metadata (Optional[Dict]): Collection configurations.
collection_name: Name of the collection to create.
persist_directory: Directory to persist the collection.
ids: List of document IDs. Defaults to None.
documents: List of documents to add to the vectorstore.
embedding: Embedding function. Defaults to None.
client_settings: Chroma client settings
collection_metadata: Collection configurations.
Defaults to None.
Returns:
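Finally, a combined sketch of the `from_texts` and `from_documents` classmethods documented above (names, paths, and embeddings are illustrative):

```python
from langchain_chroma import Chroma  # import path is an assumption
from langchain_community.embeddings import FakeEmbeddings  # placeholder vectors
from langchain_core.documents import Document

embeddings = FakeEmbeddings(size=32)

# from_texts: build a collection straight from raw strings (plus optional metadatas/ids).
db = Chroma.from_texts(
    texts=["first note", "second note"],
    embedding=embeddings,
    metadatas=[{"n": 1}, {"n": 2}],
    collection_name="from_texts_demo",
    persist_directory="./chroma_demo",  # omit to keep the collection in memory
)

# from_documents: the same, but starting from Document objects.
db2 = Chroma.from_documents(
    documents=[Document(page_content="first doc"), Document(page_content="second doc")],
    embedding=embeddings,
    collection_name="from_documents_demo",
)
print(db.similarity_search("note", k=1), db2.similarity_search("doc", k=1))
```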