core: docstrings vectorstores update (#24281)

Added missing docstrings. Formatted docstrings to a consistent form. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
2025-07-05 20:58:25 +00:00 · 2024-07-16 09:58:11 -07:00 · 2024-07-16 09:58:11 -07:00 · 5ccf8ebfac
commit 5ccf8ebfac
parent 1e9cc02ed8
3 changed files with 217 additions and 61 deletions
--- a/libs/core/langchain_core/vectorstores/base.py
+++ b/libs/core/langchain_core/vectorstores/base.py
@ -91,6 +91,10 @@ class VectorStore(ABC):

        Returns:
            List of ids from adding the texts into the vectorstore.
+
+        Raises:
+            ValueError: If the number of metadatas does not match the number of texts.
+            ValueError: If the number of ids does not match the number of texts.
        """
        if type(self).upsert != VectorStore.upsert:
            # Import document in local scope to avoid circular imports
@ -145,7 +149,12 @@ class VectorStore(ABC):
                kwargs should only include parameters that are common to all
                documents. (e.g., timeout for indexing, retry policy, etc.)
                kwargs should not include ids to avoid ambiguous semantics.
-                Instead the ID should be provided as part of the Document object.
+                Instead, the ID should be provided as part of the Document object.
+
+        Yields:
+            UpsertResponse: A response object that contains the list of IDs that were
+            successfully added or updated in the vectorstore and the list of IDs that
+            failed to be added or updated.

        .. versionadded:: 0.2.11
        """
@ -244,6 +253,11 @@ class VectorStore(ABC):
                kwargs should not include ids to avoid ambiguous semantics.
                Instead the ID should be provided as part of the Document object.

+        Yields:
+            UpsertResponse: A response object that contains the list of IDs that were
+            successfully added or updated in the vectorstore and the list of IDs that
+            failed to be added or updated.
+
        .. versionadded:: 0.2.11
        """
        async for batch in abatch_iterate(batch_size, items):
@ -292,7 +306,7 @@ class VectorStore(ABC):
        """Delete by vector ID or other criteria.

        Args:
-            ids: List of ids to delete.
+            ids: List of ids to delete. If None, delete all. Default is None.
            **kwargs: Other keyword arguments that subclasses might use.

        Returns:
@ -332,7 +346,7 @@ class VectorStore(ABC):

    # Implementations should override this method to provide an async native version.
    async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]:
-        """Get documents by their IDs.
+        """Async get documents by their IDs.

        The returned documents are expected to have the ID field set to the ID of the
        document in the vector store.
@ -360,10 +374,10 @@ class VectorStore(ABC):
    async def adelete(
        self, ids: Optional[List[str]] = None, **kwargs: Any
    ) -> Optional[bool]:
-        """Delete by vector ID or other criteria.
+        """Async delete by vector ID or other criteria.

        Args:
-            ids: List of ids to delete.
+            ids: List of ids to delete. If None, delete all. Default is None.
            **kwargs: Other keyword arguments that subclasses might use.

        Returns:
@ -378,15 +392,20 @@ class VectorStore(ABC):
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
-        """Run more texts through the embeddings and add to the vectorstore.
+        """Async run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
+                Default is None.
            **kwargs: vectorstore specific parameters.

        Returns:
            List of ids from adding the texts into the vectorstore.
+
+        Raises:
+            ValueError: If the number of metadatas does not match the number of texts.
+            ValueError: If the number of ids does not match the number of texts.
        """
        if type(self).aupsert != VectorStore.aupsert:
            # Import document in local scope to avoid circular imports
@ -435,6 +454,9 @@ class VectorStore(ABC):

        Returns:
            List of IDs of the added texts.
+
+        Raises:
+            ValueError: If the number of ids does not match the number of documents.
        """
        if type(self).upsert != VectorStore.upsert:
            from langchain_core.documents import Document
@ -471,13 +493,18 @@ class VectorStore(ABC):
    async def aadd_documents(
        self, documents: List[Document], **kwargs: Any
    ) -> List[str]:
-        """Run more documents through the embeddings and add to the vectorstore.
+        """Async run more documents through the embeddings and add to
+        the vectorstore.

        Args:
            documents: Documents to add to the vectorstore.
+            **kwargs: Additional keyword arguments.

        Returns:
            List of IDs of the added texts.
+
+        Raises:
+            ValueError: If the number of IDs does not match the number of documents.
        """
        # If either upsert or aupsert has been implemented, we delegate to them!
        if (
@ -516,13 +543,20 @@ class VectorStore(ABC):
        return await self.aadd_texts(texts, metadatas, **kwargs)

    def search(self, query: str, search_type: str, **kwargs: Any) -> List[Document]:
-        """Return docs most similar to query using specified search type.
+        """Return docs most similar to query using a specified search type.

        Args:
            query: Input text
            search_type: Type of search to perform. Can be "similarity",
                "mmr", or "similarity_score_threshold".
            **kwargs: Arguments to pass to the search method.
+
+        Returns:
+            List of Documents most similar to the query.
+
+        Raises:
+            ValueError: If search_type is not one of "similarity",
+                "mmr", or "similarity_score_threshold".
        """
        if search_type == "similarity":
            return self.similarity_search(query, **kwargs)
@ -536,19 +570,27 @@ class VectorStore(ABC):
        else:
            raise ValueError(
                f"search_type of {search_type} not allowed. Expected "
-                "search_type to be 'similarity', 'similarity_score_threshold' or 'mmr'."
+                "search_type to be 'similarity', 'similarity_score_threshold'"
+                " or 'mmr'."
            )

    async def asearch(
        self, query: str, search_type: str, **kwargs: Any
    ) -> List[Document]:
-        """Return docs most similar to query using specified search type.
+        """Async return docs most similar to query using a specified search type.

        Args:
            query: Input text.
            search_type: Type of search to perform. Can be "similarity",
                "mmr", or "similarity_score_threshold".
            **kwargs: Arguments to pass to the search method.
+
+        Returns:
+            List of Documents most similar to the query.
+
+        Raises:
+            ValueError: If search_type is not one of "similarity",
+                "mmr", or "similarity_score_threshold".
        """
        if search_type == "similarity":
            return await self.asimilarity_search(query, **kwargs)
@ -574,6 +616,7 @@ class VectorStore(ABC):
        Args:
            query: Input text.
            k: Number of Documents to return. Defaults to 4.
+            **kwargs: Arguments to pass to the search method.

        Returns:
            List of Documents most similar to the query.
@ -589,7 +632,7 @@ class VectorStore(ABC):
        #  others are not!)
        # - embedding dimensionality
        # - etc.
-        # This function converts the euclidean norm of normalized embeddings
+        # This function converts the Euclidean norm of normalized embeddings
        # (0 is most similar, sqrt(2) most dissimilar)
        # to a similarity function (0 to 1)
        return 1.0 - distance / math.sqrt(2)
@ -617,7 +660,7 @@ class VectorStore(ABC):
        - embedding dimensionality
        - etc.

-        Vectorstores should define their own selection based method of relevance.
+        Vectorstores should define their own selection-based method of relevance.
        """
        raise NotImplementedError

@ -626,18 +669,26 @@ class VectorStore(ABC):
    ) -> List[Tuple[Document, float]]:
        """Run similarity search with distance.

+        Args:
+            *args: Arguments to pass to the search method.
+            **kwargs: Arguments to pass to the search method.
+
        Returns:
-            List of Tuples of (doc, similarity_score)
+            List of Tuples of (doc, similarity_score).
        """
        raise NotImplementedError

    async def asimilarity_search_with_score(
        self, *args: Any, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
-        """Run similarity search with distance.
+        """Async run similarity search with distance.
+
+        Args:
+            *args: Arguments to pass to the search method.
+            **kwargs: Arguments to pass to the search method.

        Returns:
-            List of Tuples of (doc, similarity_score)
+            List of Tuples of (doc, similarity_score).
        """

        # This is a temporary workaround to make the similarity search
@ -716,10 +767,10 @@ class VectorStore(ABC):
            k: Number of Documents to return. Defaults to 4.
            **kwargs: kwargs to be passed to similarity search. Should include:
                score_threshold: Optional, a floating point value between 0 to 1 to
-                    filter the resulting set of retrieved docs
+                    filter the resulting set of retrieved docs.

        Returns:
-            List of Tuples of (doc, similarity_score)
+            List of Tuples of (doc, similarity_score).
        """
        score_threshold = kwargs.pop("score_threshold", None)

@ -754,7 +805,7 @@ class VectorStore(ABC):
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
-        """Return docs and relevance scores in the range [0, 1].
+        """Async return docs and relevance scores in the range [0, 1].

        0 is dissimilar, 1 is most similar.

@ -798,11 +849,12 @@ class VectorStore(ABC):
    async def asimilarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
-        """Return docs most similar to query.
+        """Async return docs most similar to query.

        Args:
            query: Input text.
            k: Number of Documents to return. Defaults to 4.
+            **kwargs: Arguments to pass to the search method.

        Returns:
            List of Documents most similar to the query.
@ -821,6 +873,7 @@ class VectorStore(ABC):
        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
+            **kwargs: Arguments to pass to the search method.

        Returns:
            List of Documents most similar to the query vector.
@ -830,11 +883,12 @@ class VectorStore(ABC):
    async def asimilarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
-        """Return docs most similar to embedding vector.
+        """Async return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
+            **kwargs: Arguments to pass to the search method.

        Returns:
            List of Documents most similar to the query vector.
@ -864,10 +918,13 @@ class VectorStore(ABC):
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+                Default is 20.
            lambda_mult: Number between 0 and 1 that determines the degree
-                        of diversity among the results with 0 corresponding
-                        to maximum diversity and 1 to minimum diversity.
-                        Defaults to 0.5.
+                of diversity among the results with 0 corresponding
+                to maximum diversity and 1 to minimum diversity.
+                Defaults to 0.5.
+            **kwargs: Arguments to pass to the search method.
+
        Returns:
            List of Documents selected by maximal marginal relevance.
        """
@ -881,7 +938,7 @@ class VectorStore(ABC):
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
-        """Return docs selected using the maximal marginal relevance.
+        """Async return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.
@ -890,10 +947,12 @@ class VectorStore(ABC):
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+                Default is 20.
            lambda_mult: Number between 0 and 1 that determines the degree
-                        of diversity among the results with 0 corresponding
-                        to maximum diversity and 1 to minimum diversity.
-                        Defaults to 0.5.
+                of diversity among the results with 0 corresponding
+                to maximum diversity and 1 to minimum diversity.
+                Defaults to 0.5.
+
        Returns:
            List of Documents selected by maximal marginal relevance.
        """
@ -928,10 +987,13 @@ class VectorStore(ABC):
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+                Default is 20.
            lambda_mult: Number between 0 and 1 that determines the degree
-                        of diversity among the results with 0 corresponding
-                        to maximum diversity and 1 to minimum diversity.
-                        Defaults to 0.5.
+                of diversity among the results with 0 corresponding
+                to maximum diversity and 1 to minimum diversity.
+                Defaults to 0.5.
+            **kwargs: Arguments to pass to the search method.
+
        Returns:
            List of Documents selected by maximal marginal relevance.
        """
@ -945,7 +1007,7 @@ class VectorStore(ABC):
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
-        """Return docs selected using the maximal marginal relevance.
+        """Async return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.
@ -954,10 +1016,13 @@ class VectorStore(ABC):
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+                Default is 20.
            lambda_mult: Number between 0 and 1 that determines the degree
-                        of diversity among the results with 0 corresponding
-                        to maximum diversity and 1 to minimum diversity.
-                        Defaults to 0.5.
+                of diversity among the results with 0 corresponding
+                to maximum diversity and 1 to minimum diversity.
+                Defaults to 0.5.
+            **kwargs: Arguments to pass to the search method.
+
        Returns:
            List of Documents selected by maximal marginal relevance.
        """
@ -983,6 +1048,10 @@ class VectorStore(ABC):
        Args:
            documents: List of Documents to add to the vectorstore.
            embedding: Embedding function to use.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            VectorStore: VectorStore initialized from documents and embeddings.
        """
        texts = [d.page_content for d in documents]
        metadatas = [d.metadata for d in documents]
@ -995,11 +1064,15 @@ class VectorStore(ABC):
        embedding: Embeddings,
        **kwargs: Any,
    ) -> VST:
-        """Return VectorStore initialized from documents and embeddings.
+        """Async return VectorStore initialized from documents and embeddings.

        Args:
            documents: List of Documents to add to the vectorstore.
            embedding: Embedding function to use.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            VectorStore: VectorStore initialized from documents and embeddings.
        """
        texts = [d.page_content for d in documents]
        metadatas = [d.metadata for d in documents]
@ -1018,8 +1091,13 @@ class VectorStore(ABC):

        Args:
            texts: Texts to add to the vectorstore.
-            metadatas: Optional list of metadatas associated with the texts.
            embedding: Embedding function to use.
+            metadatas: Optional list of metadatas associated with the texts.
+                Default is None.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            VectorStore: VectorStore initialized from texts and embeddings.
        """

    @classmethod
@ -1030,12 +1108,17 @@ class VectorStore(ABC):
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> VST:
-        """Return VectorStore initialized from texts and embeddings.
+        """Async return VectorStore initialized from texts and embeddings.

        Args:
            texts: Texts to add to the vectorstore.
-            metadatas: Optional list of metadatas associated with the texts.
            embedding: Embedding function to use.
+            metadatas: Optional list of metadatas associated with the texts.
+                Default is None.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            VectorStore: VectorStore initialized from texts and embeddings.
        """
        return await run_in_executor(
            None, cls.from_texts, texts, embedding, metadatas, **kwargs
@ -1052,19 +1135,22 @@ class VectorStore(ABC):
        """Return VectorStoreRetriever initialized from this VectorStore.

        Args:
-            search_type (Optional[str]): Defines the type of search that
-                the Retriever should perform.
-                Can be "similarity" (default), "mmr", or
-                "similarity_score_threshold".
-            search_kwargs (Optional[Dict]): Keyword arguments to pass to the
-                search function. Can include things like:
-                    k: Amount of documents to return (Default: 4)
-                    score_threshold: Minimum relevance threshold
-                        for similarity_score_threshold
-                    fetch_k: Amount of documents to pass to MMR algorithm (Default: 20)
-                    lambda_mult: Diversity of results returned by MMR;
-                        1 for minimum diversity and 0 for maximum. (Default: 0.5)
-                    filter: Filter by document metadata
+            **kwargs: Keyword arguments to pass to the search function.
+                Can include:
+                search_type (Optional[str]): Defines the type of search that
+                    the Retriever should perform.
+                    Can be "similarity" (default), "mmr", or
+                    "similarity_score_threshold".
+                search_kwargs (Optional[Dict]): Keyword arguments to pass to the
+                    search function. Can include things like:
+                        k: Amount of documents to return (Default: 4)
+                        score_threshold: Minimum relevance threshold
+                            for similarity_score_threshold
+                        fetch_k: Amount of documents to pass to MMR algorithm
+                            (Default: 20)
+                        lambda_mult: Diversity of results returned by MMR;
+                            1 for minimum diversity and 0 for maximum. (Default: 0.5)
+                        filter: Filter by document metadata

        Returns:
            VectorStoreRetriever: Retriever class for VectorStore.
@ -1128,7 +1214,18 @@ class VectorStoreRetriever(BaseRetriever):

    @root_validator(pre=True)
    def validate_search_type(cls, values: Dict) -> Dict:
-        """Validate search type."""
+        """Validate search type.
+
+        Args:
+            values: Values to validate.
+
+        Returns:
+            Dict: Validated values.
+
+        Raises:
+            ValueError: If search_type is not one of the allowed search types.
+            ValueError: If score_threshold is not specified with a float value (0~1).
+        """
        search_type = values.get("search_type", "similarity")
        if search_type not in cls.allowed_search_types:
            raise ValueError(
@ -1191,6 +1288,7 @@ class VectorStoreRetriever(BaseRetriever):

        Args:
            documents: Documents to add to the vectorstore.
+            **kwargs: Other keyword arguments that subclasses might use.

        Returns:
            List of IDs of the added texts.
@ -1200,10 +1298,11 @@ class VectorStoreRetriever(BaseRetriever):
    async def aadd_documents(
        self, documents: List[Document], **kwargs: Any
    ) -> List[str]:
-        """Add documents to the vectorstore.
+        """Async add documents to the vectorstore.

        Args:
            documents: Documents to add to the vectorstore.
+            **kwargs: Other keyword arguments that subclasses might use.

        Returns:
            List of IDs of the added texts.
--- a/libs/core/langchain_core/vectorstores/in_memory.py
+++ b/libs/core/langchain_core/vectorstores/in_memory.py
@ -32,13 +32,14 @@ class InMemoryVectorStore(VectorStore):
    """In-memory implementation of VectorStore using a dictionary.

    Uses numpy to compute cosine similarity for search.
-
-    Args:
-        embedding:  embedding function to use.
    """

    def __init__(self, embedding: Embeddings) -> None:
-        """Initialize with the given embedding function."""
+        """Initialize with the given embedding function.
+
+        Args:
+            embedding: embedding function to use.
+        """
        # TODO: would be nice to change to
        # Dict[str, Document] at some point (will be a breaking change)
        self.store: Dict[str, Dict[str, Any]] = {}
@ -74,7 +75,14 @@ class InMemoryVectorStore(VectorStore):
        }

    def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
-        """Get documents by their ids."""
+        """Get documents by their ids.
+
+        Args:
+            ids: The ids of the documents to get.
+
+        Returns:
+            A list of Document objects.
+        """
        documents = []

        for doc_id in ids:
@ -90,6 +98,14 @@ class InMemoryVectorStore(VectorStore):
        return documents

    async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]:
+        """Async get documents by their ids.
+
+        Args:
+            ids: The ids of the documents to get.
+
+        Returns:
+            A list of Document objects.
+        """
        return self.get_by_ids(ids)

    async def aadd_texts(
@ -261,6 +277,16 @@ class InMemoryVectorStore(VectorStore):
    def load(
        cls, path: str, embedding: Embeddings, **kwargs: Any
    ) -> "InMemoryVectorStore":
+        """Load a vector store from a file.
+
+        Args:
+            path: The path to load the vector store from.
+            embedding: The embedding to use.
+            **kwargs: Additional arguments to pass to the constructor.
+
+        Returns:
+            A VectorStore object.
+        """
        _path: Path = Path(path)
        with _path.open("r") as f:
            store = load(json.load(f))
@ -269,6 +295,11 @@ class InMemoryVectorStore(VectorStore):
        return vectorstore

    def dump(self, path: str) -> None:
+        """Dump the vector store to a file.
+
+        Args:
+            path: The path to dump the vector store to.
+        """
        _path: Path = Path(path)
        _path.parent.mkdir(exist_ok=True, parents=True)
        with _path.open("w") as f:
--- a/libs/core/langchain_core/vectorstores/utils.py
+++ b/libs/core/langchain_core/vectorstores/utils.py
@ -1,6 +1,6 @@
 """Internal utilities for the in memory implementation of VectorStore.

-These are part of a private API and users should not used them directly
+These are part of a private API, and users should not use them directly
 as they can change without notice.
 """

@ -18,7 +18,20 @@ logger = logging.getLogger(__name__)


 def _cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
-    """Row-wise cosine similarity between two equal-width matrices."""
+    """Row-wise cosine similarity between two equal-width matrices.
+
+    Args:
+        X: A matrix of shape (n, m).
+        Y: A matrix of shape (k, m).
+
+    Returns:
+        A matrix of shape (n, k) where each element (i, j) is the cosine similarity
+        between the ith row of X and the jth row of Y.
+
+    Raises:
+        ValueError: If X and Y do not have the same number of columns.
+        ImportError: If numpy is not installed.
+    """
    try:
        import numpy as np
    except ImportError:
@ -64,7 +77,20 @@ def _maximal_marginal_relevance(
    lambda_mult: float = 0.5,
    k: int = 4,
 ) -> List[int]:
-    """Calculate maximal marginal relevance."""
+    """Calculate maximal marginal relevance.
+
+    Args:
+        query_embedding: The query embedding.
+        embedding_list: A list of embeddings.
+        lambda_mult: The lambda parameter for MMR. Default is 0.5.
+        k: The number of embeddings to return. Default is 4.
+
+    Returns:
+        A list of indices of the embeddings to return.
+
+    Raises:
+        ImportError: If numpy is not installed.
+    """
    try:
        import numpy as np
    except ImportError: