From 5ccf8ebfacf956d9f31f684d1e4612122f60a6b1 Mon Sep 17 00:00:00 2001 From: Leonid Ganeline Date: Tue, 16 Jul 2024 09:58:11 -0700 Subject: [PATCH] core: docstrings `vectorstores` update (#24281) Added missing docstrings. Formatted docstrings to a consistent form. --------- Co-authored-by: Erick Friis --- libs/core/langchain_core/vectorstores/base.py | 205 +++++++++++++----- .../langchain_core/vectorstores/in_memory.py | 41 +++- .../core/langchain_core/vectorstores/utils.py | 32 ++- 3 files changed, 217 insertions(+), 61 deletions(-) diff --git a/libs/core/langchain_core/vectorstores/base.py b/libs/core/langchain_core/vectorstores/base.py index 90608708789..19b8b86ac81 100644 --- a/libs/core/langchain_core/vectorstores/base.py +++ b/libs/core/langchain_core/vectorstores/base.py @@ -91,6 +91,10 @@ class VectorStore(ABC): Returns: List of ids from adding the texts into the vectorstore. + + Raises: + ValueError: If the number of metadatas does not match the number of texts. + ValueError: If the number of ids does not match the number of texts. """ if type(self).upsert != VectorStore.upsert: # Import document in local scope to avoid circular imports @@ -145,7 +149,12 @@ class VectorStore(ABC): kwargs should only include parameters that are common to all documents. (e.g., timeout for indexing, retry policy, etc.) kwargs should not include ids to avoid ambiguous semantics. - Instead the ID should be provided as part of the Document object. + Instead, the ID should be provided as part of the Document object. + + Yields: + UpsertResponse: A response object that contains the list of IDs that were + successfully added or updated in the vectorstore and the list of IDs that + failed to be added or updated. .. versionadded:: 0.2.11 """ @@ -244,6 +253,11 @@ class VectorStore(ABC): kwargs should not include ids to avoid ambiguous semantics. Instead the ID should be provided as part of the Document object. 
+ Yields: + UpsertResponse: A response object that contains the list of IDs that were + successfully added or updated in the vectorstore and the list of IDs that + failed to be added or updated. + .. versionadded:: 0.2.11 """ async for batch in abatch_iterate(batch_size, items): @@ -292,7 +306,7 @@ class VectorStore(ABC): """Delete by vector ID or other criteria. Args: - ids: List of ids to delete. + ids: List of ids to delete. If None, delete all. Default is None. **kwargs: Other keyword arguments that subclasses might use. Returns: @@ -332,7 +346,7 @@ class VectorStore(ABC): # Implementations should override this method to provide an async native version. async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]: - """Get documents by their IDs. + """Async get documents by their IDs. The returned documents are expected to have the ID field set to the ID of the document in the vector store. @@ -360,10 +374,10 @@ class VectorStore(ABC): async def adelete( self, ids: Optional[List[str]] = None, **kwargs: Any ) -> Optional[bool]: - """Delete by vector ID or other criteria. + """Async delete by vector ID or other criteria. Args: - ids: List of ids to delete. + ids: List of ids to delete. If None, delete all. Default is None. **kwargs: Other keyword arguments that subclasses might use. Returns: @@ -378,15 +392,20 @@ class VectorStore(ABC): metadatas: Optional[List[dict]] = None, **kwargs: Any, ) -> List[str]: - """Run more texts through the embeddings and add to the vectorstore. + """Async run more texts through the embeddings and add to the vectorstore. Args: texts: Iterable of strings to add to the vectorstore. metadatas: Optional list of metadatas associated with the texts. + Default is None. **kwargs: vectorstore specific parameters. Returns: List of ids from adding the texts into the vectorstore. + + Raises: + ValueError: If the number of metadatas does not match the number of texts. + ValueError: If the number of ids does not match the number of texts. 
""" if type(self).aupsert != VectorStore.aupsert: # Import document in local scope to avoid circular imports @@ -435,6 +454,9 @@ class VectorStore(ABC): Returns: List of IDs of the added texts. + + Raises: + ValueError: If the number of ids does not match the number of documents. """ if type(self).upsert != VectorStore.upsert: from langchain_core.documents import Document @@ -471,13 +493,18 @@ class VectorStore(ABC): async def aadd_documents( self, documents: List[Document], **kwargs: Any ) -> List[str]: - """Run more documents through the embeddings and add to the vectorstore. + """Async run more documents through the embeddings and add to + the vectorstore. Args: documents: Documents to add to the vectorstore. + kwargs: Additional keyword arguments. Returns: List of IDs of the added texts. + + Raises: + ValueError: If the number of IDs does not match the number of documents. """ # If either upsert or aupsert has been implemented, we delegate to them! if ( @@ -516,13 +543,20 @@ class VectorStore(ABC): return await self.aadd_texts(texts, metadatas, **kwargs) def search(self, query: str, search_type: str, **kwargs: Any) -> List[Document]: - """Return docs most similar to query using specified search type. + """Return docs most similar to query using a specified search type. Args: query: Input text search_type: Type of search to perform. Can be "similarity", "mmr", or "similarity_score_threshold". **kwargs: Arguments to pass to the search method. + + Returns: + List of Documents most similar to the query. + + Raises: + ValueError: If search_type is not one of "similarity", + "mmr", or "similarity_score_threshold". """ if search_type == "similarity": return self.similarity_search(query, **kwargs) @@ -536,19 +570,27 @@ class VectorStore(ABC): else: raise ValueError( f"search_type of {search_type} not allowed. Expected " - "search_type to be 'similarity', 'similarity_score_threshold' or 'mmr'." 
+ "search_type to be 'similarity', 'similarity_score_threshold'" + " or 'mmr'." ) async def asearch( self, query: str, search_type: str, **kwargs: Any ) -> List[Document]: - """Return docs most similar to query using specified search type. + """Async return docs most similar to query using a specified search type. Args: query: Input text. search_type: Type of search to perform. Can be "similarity", "mmr", or "similarity_score_threshold". **kwargs: Arguments to pass to the search method. + + Returns: + List of Documents most similar to the query. + + Raises: + ValueError: If search_type is not one of "similarity", + "mmr", or "similarity_score_threshold". """ if search_type == "similarity": return await self.asimilarity_search(query, **kwargs) @@ -574,6 +616,7 @@ class VectorStore(ABC): Args: query: Input text. k: Number of Documents to return. Defaults to 4. + **kwargs: Arguments to pass to the search method. Returns: List of Documents most similar to the query. @@ -589,7 +632,7 @@ class VectorStore(ABC): # others are not!) # - embedding dimensionality # - etc. - # This function converts the euclidean norm of normalized embeddings + # This function converts the Euclidean norm of normalized embeddings # (0 is most similar, sqrt(2) most dissimilar) # to a similarity function (0 to 1) return 1.0 - distance / math.sqrt(2) @@ -617,7 +660,7 @@ class VectorStore(ABC): - embedding dimensionality - etc. - Vectorstores should define their own selection based method of relevance. + Vectorstores should define their own selection-based method of relevance. """ raise NotImplementedError @@ -626,18 +669,26 @@ class VectorStore(ABC): ) -> List[Tuple[Document, float]]: """Run similarity search with distance. + Args: + *args: Arguments to pass to the search method. + **kwargs: Arguments to pass to the search method. + Returns: - List of Tuples of (doc, similarity_score) + List of Tuples of (doc, similarity_score). 
""" raise NotImplementedError async def asimilarity_search_with_score( self, *args: Any, **kwargs: Any ) -> List[Tuple[Document, float]]: - """Run similarity search with distance. + """Async run similarity search with distance. + + Args: + *args: Arguments to pass to the search method. + **kwargs: Arguments to pass to the search method. Returns: - List of Tuples of (doc, similarity_score) + List of Tuples of (doc, similarity_score). """ # This is a temporary workaround to make the similarity search @@ -716,10 +767,10 @@ class VectorStore(ABC): k: Number of Documents to return. Defaults to 4. **kwargs: kwargs to be passed to similarity search. Should include: score_threshold: Optional, a floating point value between 0 to 1 to - filter the resulting set of retrieved docs + filter the resulting set of retrieved docs. Returns: - List of Tuples of (doc, similarity_score) + List of Tuples of (doc, similarity_score). """ score_threshold = kwargs.pop("score_threshold", None) @@ -754,7 +805,7 @@ class VectorStore(ABC): k: int = 4, **kwargs: Any, ) -> List[Tuple[Document, float]]: - """Return docs and relevance scores in the range [0, 1]. + """Async return docs and relevance scores in the range [0, 1]. 0 is dissimilar, 1 is most similar. @@ -798,11 +849,12 @@ class VectorStore(ABC): async def asimilarity_search( self, query: str, k: int = 4, **kwargs: Any ) -> List[Document]: - """Return docs most similar to query. + """Async return docs most similar to query. Args: query: Input text. k: Number of Documents to return. Defaults to 4. + **kwargs: Arguments to pass to the search method. Returns: List of Documents most similar to the query. @@ -821,6 +873,7 @@ class VectorStore(ABC): Args: embedding: Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. + **kwargs: Arguments to pass to the search method. Returns: List of Documents most similar to the query vector. 
@@ -830,11 +883,12 @@ class VectorStore(ABC): async def asimilarity_search_by_vector( self, embedding: List[float], k: int = 4, **kwargs: Any ) -> List[Document]: - """Return docs most similar to embedding vector. + """Async return docs most similar to embedding vector. Args: embedding: Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. + **kwargs: Arguments to pass to the search method. Returns: List of Documents most similar to the query vector. @@ -864,10 +918,13 @@ class VectorStore(ABC): query: Text to look up documents similar to. k: Number of Documents to return. Defaults to 4. fetch_k: Number of Documents to fetch to pass to MMR algorithm. + Default is 20. lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. - Defaults to 0.5. + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + **kwargs: Arguments to pass to the search method. + Returns: List of Documents selected by maximal marginal relevance. """ @@ -881,7 +938,7 @@ class VectorStore(ABC): lambda_mult: float = 0.5, **kwargs: Any, ) -> List[Document]: - """Return docs selected using the maximal marginal relevance. + """Async return docs selected using the maximal marginal relevance. Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents. @@ -890,10 +947,12 @@ class VectorStore(ABC): query: Text to look up documents similar to. k: Number of Documents to return. Defaults to 4. fetch_k: Number of Documents to fetch to pass to MMR algorithm. + Default is 20. lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. - Defaults to 0.5. 
+ of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + Returns: List of Documents selected by maximal marginal relevance. """ @@ -928,10 +987,13 @@ class VectorStore(ABC): embedding: Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. fetch_k: Number of Documents to fetch to pass to MMR algorithm. + Default is 20. lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. - Defaults to 0.5. + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + **kwargs: Arguments to pass to the search method. + Returns: List of Documents selected by maximal marginal relevance. """ @@ -945,7 +1007,7 @@ class VectorStore(ABC): lambda_mult: float = 0.5, **kwargs: Any, ) -> List[Document]: - """Return docs selected using the maximal marginal relevance. + """Async return docs selected using the maximal marginal relevance. Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents. @@ -954,10 +1016,13 @@ class VectorStore(ABC): embedding: Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. fetch_k: Number of Documents to fetch to pass to MMR algorithm. + Default is 20. lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. - Defaults to 0.5. + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + **kwargs: Arguments to pass to the search method. + Returns: List of Documents selected by maximal marginal relevance. """ @@ -983,6 +1048,10 @@ class VectorStore(ABC): Args: documents: List of Documents to add to the vectorstore. 
embedding: Embedding function to use. + **kwargs: Additional keyword arguments. + + Returns: + VectorStore: VectorStore initialized from documents and embeddings. """ texts = [d.page_content for d in documents] metadatas = [d.metadata for d in documents] @@ -995,11 +1064,15 @@ class VectorStore(ABC): embedding: Embeddings, **kwargs: Any, ) -> VST: - """Return VectorStore initialized from documents and embeddings. + """Async return VectorStore initialized from documents and embeddings. Args: documents: List of Documents to add to the vectorstore. embedding: Embedding function to use. + **kwargs: Additional keyword arguments. + + Returns: + VectorStore: VectorStore initialized from documents and embeddings. """ texts = [d.page_content for d in documents] metadatas = [d.metadata for d in documents] @@ -1018,8 +1091,13 @@ class VectorStore(ABC): Args: texts: Texts to add to the vectorstore. - metadatas: Optional list of metadatas associated with the texts. embedding: Embedding function to use. + metadatas: Optional list of metadatas associated with the texts. + Default is None. + **kwargs: Additional keyword arguments. + + Returns: + VectorStore: VectorStore initialized from texts and embeddings. """ @classmethod @@ -1030,12 +1108,17 @@ class VectorStore(ABC): metadatas: Optional[List[dict]] = None, **kwargs: Any, ) -> VST: - """Return VectorStore initialized from texts and embeddings. + """Async return VectorStore initialized from texts and embeddings. Args: texts: Texts to add to the vectorstore. - metadatas: Optional list of metadatas associated with the texts. embedding: Embedding function to use. + metadatas: Optional list of metadatas associated with the texts. + Default is None. + **kwargs: Additional keyword arguments. + + Returns: + VectorStore: VectorStore initialized from texts and embeddings. 
""" return await run_in_executor( None, cls.from_texts, texts, embedding, metadatas, **kwargs @@ -1052,19 +1135,22 @@ class VectorStore(ABC): """Return VectorStoreRetriever initialized from this VectorStore. Args: - search_type (Optional[str]): Defines the type of search that - the Retriever should perform. - Can be "similarity" (default), "mmr", or - "similarity_score_threshold". - search_kwargs (Optional[Dict]): Keyword arguments to pass to the - search function. Can include things like: - k: Amount of documents to return (Default: 4) - score_threshold: Minimum relevance threshold - for similarity_score_threshold - fetch_k: Amount of documents to pass to MMR algorithm (Default: 20) - lambda_mult: Diversity of results returned by MMR; - 1 for minimum diversity and 0 for maximum. (Default: 0.5) - filter: Filter by document metadata + **kwargs: Keyword arguments to pass to the search function. + Can include: + search_type (Optional[str]): Defines the type of search that + the Retriever should perform. + Can be "similarity" (default), "mmr", or + "similarity_score_threshold". + search_kwargs (Optional[Dict]): Keyword arguments to pass to the + search function. Can include things like: + k: Amount of documents to return (Default: 4) + score_threshold: Minimum relevance threshold + for similarity_score_threshold + fetch_k: Amount of documents to pass to MMR algorithm + (Default: 20) + lambda_mult: Diversity of results returned by MMR; + 1 for minimum diversity and 0 for maximum. (Default: 0.5) + filter: Filter by document metadata Returns: VectorStoreRetriever: Retriever class for VectorStore. @@ -1128,7 +1214,18 @@ class VectorStoreRetriever(BaseRetriever): @root_validator(pre=True) def validate_search_type(cls, values: Dict) -> Dict: - """Validate search type.""" + """Validate search type. + + Args: + values: Values to validate. + + Returns: + Values: Validated values. + + Raises: + ValueError: If search_type is not one of the allowed search types. 
+ ValueError: If score_threshold is not specified as a float value (0~1). + """ search_type = values.get("search_type", "similarity") if search_type not in cls.allowed_search_types: raise ValueError( @@ -1191,6 +1288,7 @@ class VectorStoreRetriever(BaseRetriever): Args: documents: Documents to add to the vectorstore. + **kwargs: Other keyword arguments that subclasses might use. Returns: List of IDs of the added texts. @@ -1200,10 +1298,11 @@ class VectorStoreRetriever(BaseRetriever): async def aadd_documents( self, documents: List[Document], **kwargs: Any ) -> List[str]: - """Add documents to the vectorstore. + """Async add documents to the vectorstore. Args: documents: Documents to add to the vectorstore. + **kwargs: Other keyword arguments that subclasses might use. Returns: List of IDs of the added texts. diff --git a/libs/core/langchain_core/vectorstores/in_memory.py b/libs/core/langchain_core/vectorstores/in_memory.py index deb93a5ce91..e284d0b509e 100644 --- a/libs/core/langchain_core/vectorstores/in_memory.py +++ b/libs/core/langchain_core/vectorstores/in_memory.py @@ -32,13 +32,14 @@ class InMemoryVectorStore(VectorStore): """In-memory implementation of VectorStore using a dictionary. Uses numpy to compute cosine similarity for search. - - Args: - embedding: embedding function to use. """ def __init__(self, embedding: Embeddings) -> None: - """Initialize with the given embedding function.""" + """Initialize with the given embedding function. + + Args: + embedding: embedding function to use. + """ # TODO: would be nice to change to # Dict[str, Document] at some point (will be a breaking change) self.store: Dict[str, Dict[str, Any]] = {} @@ -74,7 +75,14 @@ class InMemoryVectorStore(VectorStore): } def get_by_ids(self, ids: Sequence[str], /) -> List[Document]: - """Get documents by their ids.""" + """Get documents by their ids. + + Args: + ids: The ids of the documents to get. + + Returns: + A list of Document objects. 
+ """ documents = [] for doc_id in ids: @@ -90,6 +98,14 @@ class InMemoryVectorStore(VectorStore): return documents async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]: + """Async get documents by their ids. + + Args: + ids: The ids of the documents to get. + + Returns: + A list of Document objects. + """ return self.get_by_ids(ids) async def aadd_texts( @@ -261,6 +277,16 @@ class InMemoryVectorStore(VectorStore): def load( cls, path: str, embedding: Embeddings, **kwargs: Any ) -> "InMemoryVectorStore": + """Load a vector store from a file. + + Args: + path: The path to load the vector store from. + embedding: The embedding to use. + **kwargs: Additional arguments to pass to the constructor. + + Returns: + A VectorStore object. + """ _path: Path = Path(path) with _path.open("r") as f: store = load(json.load(f)) @@ -269,6 +295,11 @@ class InMemoryVectorStore(VectorStore): return vectorstore def dump(self, path: str) -> None: + """Dump the vector store to a file. + + Args: + path: The path to dump the vector store to. + """ _path: Path = Path(path) _path.parent.mkdir(exist_ok=True, parents=True) with _path.open("w") as f: diff --git a/libs/core/langchain_core/vectorstores/utils.py b/libs/core/langchain_core/vectorstores/utils.py index 115ca6f0f9f..5bcf756747b 100644 --- a/libs/core/langchain_core/vectorstores/utils.py +++ b/libs/core/langchain_core/vectorstores/utils.py @@ -1,6 +1,6 @@ """Internal utilities for the in memory implementation of VectorStore. -These are part of a private API and users should not used them directly +These are part of a private API, and users should not use them directly as they can change without notice. """ @@ -18,7 +18,20 @@ logger = logging.getLogger(__name__) def _cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray: - """Row-wise cosine similarity between two equal-width matrices.""" + """Row-wise cosine similarity between two equal-width matrices. + + Args: + X: A matrix of shape (n, m). 
+ Y: A matrix of shape (k, m). + + Returns: + A matrix of shape (n, k) where each element (i, j) is the cosine similarity + between the ith row of X and the jth row of Y. + + Raises: + ValueError: If X and Y do not have the same number of columns. + ImportError: If numpy is not installed. + """ try: import numpy as np except ImportError: @@ -64,7 +77,20 @@ def _maximal_marginal_relevance( lambda_mult: float = 0.5, k: int = 4, ) -> List[int]: - """Calculate maximal marginal relevance.""" + """Calculate maximal marginal relevance. + + Args: + query_embedding: The query embedding. + embedding_list: A list of embeddings. + lambda_mult: The lambda parameter for MMR. Default is 0.5. + k: The number of embeddings to return. Default is 4. + + Returns: + A list of indices of the embeddings to return. + + Raises: + ImportError: If numpy is not installed. + """ try: import numpy as np except ImportError: