core: docstrings indexing (#23785)

Added missing docstrings and formatted existing docstrings into a consistent form.

Parent: 30fdc2dbe7
Commit: 716a316654
@@ -4,6 +4,7 @@ This package contains helper logic to help deal with indexing data into
 a vectorstore while avoiding duplicated content and over-writing content
 if it's unchanged.
 """

 from langchain_core.indexing.api import IndexingResult, aindex, index
 from langchain_core.indexing.base import InMemoryRecordManager, RecordManager


@@ -1,4 +1,5 @@
 """Module contains logic for indexing documents into vector stores."""

 from __future__ import annotations

 import hashlib

@@ -232,8 +233,8 @@ def index(
         record_manager: Timestamped set to keep track of which documents were
             updated.
         vector_store: Vector store to index the documents into.
-        batch_size: Batch size to use when indexing.
-        cleanup: How to handle clean up of documents.
+        batch_size: Batch size to use when indexing. Default is 100.
+        cleanup: How to handle clean up of documents. Default is None.
             - Incremental: Cleans up all documents that haven't been updated AND
               that are associated with source ids that were seen
               during indexing.
@@ -246,14 +247,23 @@ def index(
               This means that users may see duplicated content during indexing.
             - None: Do not delete any documents.
         source_id_key: Optional key that helps identify the original source
-            of the document.
+            of the document. Default is None.
         cleanup_batch_size: Batch size to use when cleaning up documents.
+            Default is 1_000.
         force_update: Force update documents even if they are present in the
             record manager. Useful if you are re-indexing with updated embeddings.
+            Default is False.

     Returns:
         Indexing result which contains information about how many documents
         were added, updated, deleted, or skipped.
+
+    Raises:
+        ValueError: If cleanup mode is not one of 'incremental', 'full' or None
+        ValueError: If cleanup mode is incremental and source_id_key is None.
+        ValueError: If vectorstore does not have
+            "delete" and "add_documents" required methods.
+        ValueError: If source_id_key is not None, but is not a string or callable.
     """
     if cleanup not in {"incremental", "full", None}:
         raise ValueError(
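
The expanded `index` docstring above spells out the cleanup modes and defaults. As a rough illustration of how the documented arguments fit together, the following sketch indexes two documents with incremental cleanup; it assumes `InMemoryVectorStore` and `DeterministicFakeEmbedding` are available in your langchain-core version (any vector store exposing `delete` and `add_documents` would do):

from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, index
from langchain_core.vectorstores import InMemoryVectorStore

record_manager = InMemoryRecordManager(namespace="demo-index")
record_manager.create_schema()
vector_store = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=8))

docs = [
    Document(page_content="kittens", metadata={"source": "kitten.txt"}),
    Document(page_content="doggos", metadata={"source": "doggo.txt"}),
]

# cleanup="incremental" requires source_id_key so that stale documents from the
# same source can be removed as indexing proceeds.
result = index(docs, record_manager, vector_store, cleanup="incremental", source_id_key="source")
print(result)  # e.g. {'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}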
@@ -415,7 +425,7 @@ async def aindex(
     cleanup_batch_size: int = 1_000,
     force_update: bool = False,
 ) -> IndexingResult:
-    """Index data from the loader into the vector store.
+    """Async index data from the loader into the vector store.

     Indexing functionality uses a manager to keep track of which documents
     are in the vector store.

@@ -437,8 +447,8 @@ async def aindex(
         record_manager: Timestamped set to keep track of which documents were
             updated.
         vector_store: Vector store to index the documents into.
-        batch_size: Batch size to use when indexing.
-        cleanup: How to handle clean up of documents.
+        batch_size: Batch size to use when indexing. Default is 100.
+        cleanup: How to handle clean up of documents. Default is None.
             - Incremental: Cleans up all documents that haven't been updated AND
               that are associated with source ids that were seen
               during indexing.

@@ -450,14 +460,23 @@ async def aindex(
               This means that users may see duplicated content during indexing.
             - None: Do not delete any documents.
         source_id_key: Optional key that helps identify the original source
-            of the document.
+            of the document. Default is None.
         cleanup_batch_size: Batch size to use when cleaning up documents.
+            Default is 1_000.
         force_update: Force update documents even if they are present in the
             record manager. Useful if you are re-indexing with updated embeddings.
+            Default is False.

     Returns:
         Indexing result which contains information about how many documents
         were added, updated, deleted, or skipped.
+
+    Raises:
+        ValueError: If cleanup mode is not one of 'incremental', 'full' or None
+        ValueError: If cleanup mode is incremental and source_id_key is None.
+        ValueError: If vectorstore does not have
+            "adelete" and "aadd_documents" required methods.
+        ValueError: If source_id_key is not None, but is not a string or callable.
     """

     if cleanup not in {"incremental", "full", None}:
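
The async variant documented above mirrors the sync call. A minimal sketch, under the same assumptions as the sync example and relying on the default async `adelete` / `aadd_documents` wrappers that the base vector store provides:

import asyncio

from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, aindex
from langchain_core.vectorstores import InMemoryVectorStore

async def main() -> None:
    record_manager = InMemoryRecordManager(namespace="demo-aindex")
    await record_manager.acreate_schema()
    vector_store = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=8))
    docs = [Document(page_content="kittens", metadata={"source": "kitten.txt"})]
    # Same arguments as index(), but awaited and using the async store methods.
    result = await aindex(docs, record_manager, vector_store, cleanup="incremental", source_id_key="source")
    print(result)

asyncio.run(main())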
@@ -37,7 +37,7 @@ class RecordManager(ABC):
     2. The record manager is currently implemented separately from the
        vectorstore, which means that the overall system becomes distributed
        and may create issues with consistency. For example, writing to
-       record manager succeeds but corresponding writing to vectorstore fails.
+       record manager succeeds, but corresponding writing to vectorstore fails.
     """

     def __init__(

@@ -227,6 +227,11 @@ class InMemoryRecordManager(RecordManager):
     """An in-memory record manager for testing purposes."""

     def __init__(self, namespace: str) -> None:
+        """Initialize the in-memory record manager.
+
+        Args:
+            namespace (str): The namespace for the record manager.
+        """
         super().__init__(namespace)
         # Each key points to a dictionary
         # of {'group_id': group_id, 'updated_at': timestamp}

@@ -237,14 +242,16 @@ class InMemoryRecordManager(RecordManager):
         """In-memory schema creation is simply ensuring the structure is initialized."""

     async def acreate_schema(self) -> None:
-        """In-memory schema creation is simply ensuring the structure is initialized."""
+        """Async in-memory schema creation is simply ensuring
+        the structure is initialized.
+        """

     def get_time(self) -> float:
         """Get the current server time as a high resolution timestamp!"""
         return time.time()

     async def aget_time(self) -> float:
-        """Get the current server time as a high resolution timestamp!"""
+        """Async get the current server time as a high resolution timestamp!"""
         return self.get_time()
@@ -254,6 +261,27 @@ class InMemoryRecordManager(RecordManager):
         group_ids: Optional[Sequence[Optional[str]]] = None,
         time_at_least: Optional[float] = None,
     ) -> None:
+        """Upsert records into the database.
+
+        Args:
+            keys: A list of record keys to upsert.
+            group_ids: A list of group IDs corresponding to the keys.
+                Defaults to None.
+            time_at_least: Optional timestamp. Implementation can use this
+                to optionally verify that the timestamp IS at least this time
+                in the system that stores. Defaults to None.
+                E.g., use to validate that the time in the postgres database
+                is equal to or larger than the given timestamp, if not
+                raise an error.
+                This is meant to help prevent time-drift issues since
+                time may not be monotonically increasing!
+
+        Raises:
+            ValueError: If the length of keys doesn't match the length of group
+                ids.
+            ValueError: If time_at_least is in the future.
+        """
         if group_ids and len(keys) != len(group_ids):
             raise ValueError("Length of keys must match length of group_ids")
         for index, key in enumerate(keys):
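
A small sketch of the upsert semantics documented above, using the in-memory manager shipped in this module (group IDs are positional, one per key):

from langchain_core.indexing import InMemoryRecordManager

manager = InMemoryRecordManager(namespace="demo")
manager.create_schema()  # a no-op for the in-memory implementation
manager.update(["doc-1", "doc-2"], group_ids=["source-a", "source-a"])
print(manager.exists(["doc-1", "doc-3"]))  # expected: [True, False]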
@@ -269,12 +297,48 @@ class InMemoryRecordManager(RecordManager):
         group_ids: Optional[Sequence[Optional[str]]] = None,
         time_at_least: Optional[float] = None,
     ) -> None:
+        """Async upsert records into the database.
+
+        Args:
+            keys: A list of record keys to upsert.
+            group_ids: A list of group IDs corresponding to the keys.
+                Defaults to None.
+            time_at_least: Optional timestamp. Implementation can use this
+                to optionally verify that the timestamp IS at least this time
+                in the system that stores. Defaults to None.
+                E.g., use to validate that the time in the postgres database
+                is equal to or larger than the given timestamp, if not
+                raise an error.
+                This is meant to help prevent time-drift issues since
+                time may not be monotonically increasing!
+
+        Raises:
+            ValueError: If the length of keys doesn't match the length of group
+                ids.
+            ValueError: If time_at_least is in the future.
+        """
         self.update(keys, group_ids=group_ids, time_at_least=time_at_least)

     def exists(self, keys: Sequence[str]) -> List[bool]:
+        """Check if the provided keys exist in the database.
+
+        Args:
+            keys: A list of keys to check.
+
+        Returns:
+            A list of boolean values indicating the existence of each key.
+        """
         return [key in self.records for key in keys]

     async def aexists(self, keys: Sequence[str]) -> List[bool]:
+        """Async check if the provided keys exist in the database.
+
+        Args:
+            keys: A list of keys to check.
+
+        Returns:
+            A list of boolean values indicating the existence of each key.
+        """
         return self.exists(keys)

     def list_keys(

@@ -285,6 +349,21 @@ class InMemoryRecordManager(RecordManager):
         group_ids: Optional[Sequence[str]] = None,
         limit: Optional[int] = None,
     ) -> List[str]:
+        """List records in the database based on the provided filters.
+
+        Args:
+            before: Filter to list records updated before this time.
+                Defaults to None.
+            after: Filter to list records updated after this time.
+                Defaults to None.
+            group_ids: Filter to list records with specific group IDs.
+                Defaults to None.
+            limit: optional limit on the number of records to return.
+                Defaults to None.
+
+        Returns:
+            A list of keys for the matching records.
+        """
         result = []
         for key, data in self.records.items():
             if before and data["updated_at"] >= before:

@@ -306,14 +385,39 @@ class InMemoryRecordManager(RecordManager):
         group_ids: Optional[Sequence[str]] = None,
         limit: Optional[int] = None,
     ) -> List[str]:
+        """Async list records in the database based on the provided filters.
+
+        Args:
+            before: Filter to list records updated before this time.
+                Defaults to None.
+            after: Filter to list records updated after this time.
+                Defaults to None.
+            group_ids: Filter to list records with specific group IDs.
+                Defaults to None.
+            limit: optional limit on the number of records to return.
+                Defaults to None.
+
+        Returns:
+            A list of keys for the matching records.
+        """
         return self.list_keys(
             before=before, after=after, group_ids=group_ids, limit=limit
         )

     def delete_keys(self, keys: Sequence[str]) -> None:
+        """Delete specified records from the database.
+
+        Args:
+            keys: A list of keys to delete.
+        """
         for key in keys:
             if key in self.records:
                 del self.records[key]

     async def adelete_keys(self, keys: Sequence[str]) -> None:
+        """Async delete specified records from the database.
+
+        Args:
+            keys: A list of keys to delete.
+        """
         self.delete_keys(keys)
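
Continuing the earlier record-manager sketch, the filter arguments documented for `list_keys` and the `delete_keys` behaviour can be exercised as below. The expected outputs are assumptions based on the code shown in this diff, and the `after=` filter relies on the two `update` calls happening at distinct timestamps:

from langchain_core.indexing import InMemoryRecordManager

manager = InMemoryRecordManager(namespace="demo")
manager.update(["a", "b"], group_ids=["g1", "g2"])
cutoff = manager.get_time()
manager.update(["c"], group_ids=["g1"])

print(manager.list_keys(group_ids=["g1"]))  # expected: ["a", "c"]
print(manager.list_keys(after=cutoff))      # expected: ["c"]

manager.delete_keys(["a"])
print(manager.exists(["a", "b", "c"]))      # expected: [False, True, True]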
@@ -1,15 +1,15 @@
 """**Language Model** is a type of model that can generate text or complete
 text prompts.

 LangChain has two main classes to work with language models: **Chat Models**
 and "old-fashioned" **LLMs**.

 ## Chat Models

 Language models that use a sequence of messages as inputs and return chat messages
 as outputs (as opposed to using plain text). These are traditionally newer models (
 older models are generally LLMs, see below). Chat models support the assignment of
 distinct roles to conversation messages, helping to distinguish messages from the AI,
 users, and instructions such as system messages.

 The key abstraction for chat models is `BaseChatModel`. Implementations

@@ -23,15 +23,15 @@ https://python.langchain.com/v0.2/docs/how_to/custom_chat_model/

 ## LLMs

 Language models that takes a string as input and returns a string.
 These are traditionally older models (newer models generally are Chat Models, see below).

 Although the underlying models are string in, string out, the LangChain wrappers
 also allow these models to take messages as input. This gives them the same interface
 as Chat Models. When messages are passed in as input, they will be formatted into a
 string under the hood before being passed to the underlying model.

 To implement a custom LLM, inherit from `BaseLLM` or `LLM`.
 Please see the following guide for more information on how to implement a custom LLM:

 https://python.langchain.com/v0.2/docs/how_to/custom_llm/

@@ -39,6 +39,11 @@ if TYPE_CHECKING:

 @lru_cache(maxsize=None)  # Cache the tokenizer
 def get_tokenizer() -> Any:
+    """Get a GPT-2 tokenizer instance.
+
+    This function is cached to avoid re-loading the tokenizer
+    every time it is called.
+    """
     try:
         from transformers import GPT2TokenizerFast  # type: ignore[import]
     except ImportError:

@@ -77,7 +82,7 @@ class BaseLanguageModel(
 ):
     """Abstract base class for interfacing with language models.

-    All language model wrappers inherit from BaseLanguageModel.
+    All language model wrappers inherited from BaseLanguageModel.
     """

     cache: Union[BaseCache, bool, None] = None

@@ -108,6 +113,12 @@ class BaseLanguageModel(
         """If verbose is None, set it.

         This allows users to pass in None as verbose to access the global setting.
+
+        Args:
+            verbose: The verbosity setting to use.
+
+        Returns:
+            The verbosity setting to use.
         """
         if verbose is None:
             return _get_verbosity()

@@ -324,7 +335,7 @@ class BaseLanguageModel(
     def get_num_tokens(self, text: str) -> int:
         """Get the number of tokens present in the text.

-        Useful for checking if an input will fit in a model's context window.
+        Useful for checking if an input fits in a model's context window.

         Args:
             text: The string input to tokenize.

@@ -337,7 +348,7 @@ class BaseLanguageModel(
     def get_num_tokens_from_messages(self, messages: List[BaseMessage]) -> int:
         """Get the number of tokens in the messages.

-        Useful for checking if an input will fit in a model's context window.
+        Useful for checking if an input fits in a model's context window.

         Args:
             messages: The message inputs to tokenize.
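
The token-counting methods documented above fall back to the cached GPT-2 tokenizer returned by `get_tokenizer`. A rough sketch of that fallback; the import path is assumed from the module shown in this diff, and loading the tokenizer requires the optional `transformers` dependency:

from langchain_core.language_models.base import get_tokenizer

tokenizer = get_tokenizer()  # cached GPT-2 tokenizer; needs `transformers` installed
token_ids = tokenizer.encode("How many tokens is this sentence?")
print(len(token_ids))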
@@ -68,16 +68,31 @@ if TYPE_CHECKING:


 class LangSmithParams(TypedDict, total=False):
+    """LangSmith parameters for tracing."""
+
     ls_provider: str
+    """Provider of the model."""
     ls_model_name: str
+    """Name of the model."""
     ls_model_type: Literal["chat"]
+    """Type of the model. Should be 'chat'."""
     ls_temperature: Optional[float]
+    """Temperature for generation."""
     ls_max_tokens: Optional[int]
+    """Max tokens for generation."""
     ls_stop: Optional[List[str]]
+    """Stop words for generation."""


 def generate_from_stream(stream: Iterator[ChatGenerationChunk]) -> ChatResult:
-    """Generate from a stream."""
+    """Generate from a stream.
+
+    Args:
+        stream: Iterator of ChatGenerationChunk.
+
+    Returns:
+        ChatResult: Chat result.
+    """

     generation: Optional[ChatGenerationChunk] = None
     for chunk in stream:
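
The new `generate_from_stream` docstring describes an iterator-in, `ChatResult`-out helper. A minimal sketch of that contract, assuming the usual chunk and message types from langchain_core:

from langchain_core.language_models.chat_models import generate_from_stream
from langchain_core.messages import AIMessageChunk
from langchain_core.outputs import ChatGenerationChunk

chunks = iter(
    [
        ChatGenerationChunk(message=AIMessageChunk(content="Hello, ")),
        ChatGenerationChunk(message=AIMessageChunk(content="world!")),
    ]
)
result = generate_from_stream(chunks)  # merges the chunks into one generation
print(result.generations[0].message.content)  # expected: "Hello, world!"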
@@ -99,7 +114,14 @@ def generate_from_stream(stream: Iterator[ChatGenerationChunk]) -> ChatResult:
 async def agenerate_from_stream(
     stream: AsyncIterator[ChatGenerationChunk],
 ) -> ChatResult:
-    """Async generate from a stream."""
+    """Async generate from a stream.
+
+    Args:
+        stream: Iterator of ChatGenerationChunk.
+
+    Returns:
+        ChatResult: Chat result.
+    """

     generation: Optional[ChatGenerationChunk] = None
     async for chunk in stream:

@@ -200,7 +222,17 @@ class BaseChatModel(BaseLanguageModel[BaseMessage], ABC):

     @root_validator(pre=True)
     def raise_deprecation(cls, values: Dict) -> Dict:
-        """Raise deprecation warning if callback_manager is used."""
+        """Raise deprecation warning if callback_manager is used.
+
+        Args:
+            values (Dict): Values to validate.
+
+        Returns:
+            Dict: Validated values.
+
+        Raises:
+            DeprecationWarning: If callback_manager is used.
+        """
         if values.get("callback_manager") is not None:
             warnings.warn(
                 "callback_manager is deprecated. Please use callbacks instead.",

@@ -1,4 +1,5 @@
 """Fake ChatModel for testing purposes."""

 import asyncio
 import re
 import time
@@ -78,7 +78,20 @@ def create_base_retry_decorator(
         Union[AsyncCallbackManagerForLLMRun, CallbackManagerForLLMRun]
     ] = None,
 ) -> Callable[[Any], Any]:
-    """Create a retry decorator for a given LLM and provided list of error types."""
+    """Create a retry decorator for a given LLM and provided
+    a list of error types.
+
+    Args:
+        error_types: List of error types to retry on.
+        max_retries: Number of retries. Default is 1.
+        run_manager: Callback manager for the run. Default is None.
+
+    Returns:
+        A retry decorator.
+
+    Raises:
+        ValueError: If the cache is not set and cache is True.
+    """

     _logging = before_sleep_log(logger, logging.WARNING)

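
A short sketch of how the arguments documented for `create_base_retry_decorator` are typically used; the import path is assumed from the module shown in this diff, and the decorated function is hypothetical:

from langchain_core.language_models.llms import create_base_retry_decorator

retry = create_base_retry_decorator([TimeoutError], max_retries=3)

@retry
def flaky_provider_call() -> str:
    # A hypothetical call that may raise TimeoutError; it is retried up to
    # max_retries times with back-off before the error is re-raised.
    raise TimeoutError("provider timed out")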
@@ -141,7 +154,20 @@ def get_prompts(
     prompts: List[str],
     cache: Optional[Union[BaseCache, bool, None]] = None,
 ) -> Tuple[Dict[int, List], str, List[int], List[str]]:
-    """Get prompts that are already cached."""
+    """Get prompts that are already cached.
+
+    Args:
+        params: Dictionary of parameters.
+        prompts: List of prompts.
+        cache: Cache object. Default is None.
+
+    Returns:
+        A tuple of existing prompts, llm_string, missing prompt indexes,
+        and missing prompts.
+
+    Raises:
+        ValueError: If the cache is not set and cache is True.
+    """
     llm_string = str(sorted([(k, v) for k, v in params.items()]))
     missing_prompts = []
     missing_prompt_idxs = []
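
A rough sketch of the cache lookup that the `get_prompts` docstring describes, using the in-memory cache; the import paths and the exact return tuple are assumptions based on the code shown in this diff:

from langchain_core.caches import InMemoryCache
from langchain_core.language_models.llms import get_prompts

cache = InMemoryCache()
params = {"model_name": "my-fake-model", "temperature": 0.0}  # hypothetical params
existing, llm_string, missing_idxs, missing = get_prompts(params, ["hi", "bye"], cache)
print(missing)  # expected: ["hi", "bye"] on a cold cache; cached hits land in `existing`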
@@ -164,7 +190,20 @@ async def aget_prompts(
     prompts: List[str],
     cache: Optional[Union[BaseCache, bool, None]] = None,
 ) -> Tuple[Dict[int, List], str, List[int], List[str]]:
-    """Get prompts that are already cached. Async version."""
+    """Get prompts that are already cached. Async version.
+
+    Args:
+        params: Dictionary of parameters.
+        prompts: List of prompts.
+        cache: Cache object. Default is None.
+
+    Returns:
+        A tuple of existing prompts, llm_string, missing prompt indexes,
+        and missing prompts.
+
+    Raises:
+        ValueError: If the cache is not set and cache is True.
+    """
     llm_string = str(sorted([(k, v) for k, v in params.items()]))
     missing_prompts = []
     missing_prompt_idxs = []

@@ -189,7 +228,22 @@ def update_cache(
     new_results: LLMResult,
     prompts: List[str],
 ) -> Optional[dict]:
-    """Update the cache and get the LLM output."""
+    """Update the cache and get the LLM output.
+
+    Args:
+        cache: Cache object.
+        existing_prompts: Dictionary of existing prompts.
+        llm_string: LLM string.
+        missing_prompt_idxs: List of missing prompt indexes.
+        new_results: LLMResult object.
+        prompts: List of prompts.
+
+    Returns:
+        LLM output.
+
+    Raises:
+        ValueError: If the cache is not set and cache is True.
+    """
     llm_cache = _resolve_cache(cache)
     for i, result in enumerate(new_results.generations):
         existing_prompts[missing_prompt_idxs[i]] = result

@@ -208,7 +262,23 @@ async def aupdate_cache(
     new_results: LLMResult,
     prompts: List[str],
 ) -> Optional[dict]:
-    """Update the cache and get the LLM output. Async version"""
+    """Update the cache and get the LLM output. Async version.
+
+    Args:
+        cache: Cache object.
+        existing_prompts: Dictionary of existing prompts.
+        llm_string: LLM string.
+        missing_prompt_idxs: List of missing prompt indexes.
+        new_results: LLMResult object.
+        prompts: List of prompts.
+
+    Returns:
+        LLM output.
+
+    Raises:
+        ValueError: If the cache is not set and cache is True.
+    """
+
     llm_cache = _resolve_cache(cache)
     for i, result in enumerate(new_results.generations):
         existing_prompts[missing_prompt_idxs[i]] = result
@@ -706,6 +776,15 @@ class BaseLLM(BaseLanguageModel[str], ABC):
                 first occurrence of any of these substrings.
             callbacks: Callbacks to pass through. Used for executing additional
                 functionality, such as logging or streaming, throughout generation.
+            tags: List of tags to associate with each prompt. If provided, the length
+                of the list must match the length of the prompts list.
+            metadata: List of metadata dictionaries to associate with each prompt. If
+                provided, the length of the list must match the length of the prompts
+                list.
+            run_name: List of run names to associate with each prompt. If provided, the
+                length of the list must match the length of the prompts list.
+            run_id: List of run IDs to associate with each prompt. If provided, the
+                length of the list must match the length of the prompts list.
             **kwargs: Arbitrary additional keyword arguments. These are usually passed
                 to the model provider API call.

@@ -924,6 +1003,15 @@ class BaseLLM(BaseLanguageModel[str], ABC):
                 first occurrence of any of these substrings.
             callbacks: Callbacks to pass through. Used for executing additional
                 functionality, such as logging or streaming, throughout generation.
+            tags: List of tags to associate with each prompt. If provided, the length
+                of the list must match the length of the prompts list.
+            metadata: List of metadata dictionaries to associate with each prompt. If
+                provided, the length of the list must match the length of the prompts
+                list.
+            run_name: List of run names to associate with each prompt. If provided, the
+                length of the list must match the length of the prompts list.
+            run_id: List of run IDs to associate with each prompt. If provided, the
+                length of the list must match the length of the prompts list.
             **kwargs: Arbitrary additional keyword arguments. These are usually passed
                 to the model provider API call.

@@ -1075,7 +1163,25 @@
         metadata: Optional[Dict[str, Any]] = None,
         **kwargs: Any,
     ) -> str:
-        """Check Cache and run the LLM on the given prompt and input."""
+        """Check Cache and run the LLM on the given prompt and input.
+
+        Args:
+            prompt: The prompt to generate from.
+            stop: Stop words to use when generating. Model output is cut off at the
+                first occurrence of any of these substrings.
+            callbacks: Callbacks to pass through. Used for executing additional
+                functionality, such as logging or streaming, throughout generation.
+            tags: List of tags to associate with the prompt.
+            metadata: Metadata to associate with the prompt.
+            **kwargs: Arbitrary additional keyword arguments. These are usually passed
+                to the model provider API call.
+
+        Returns:
+            The generated text.
+
+        Raises:
+            ValueError: If the prompt is not a string.
+        """
         if not isinstance(prompt, str):
             raise ValueError(
                 "Argument `prompt` is expected to be a string. Instead found "

@@ -1190,6 +1296,9 @@
         Args:
             file_path: Path to file to save the LLM to.

+        Raises:
+            ValueError: If the file path is not a string or Path object.
+
         Example:
         .. code-block:: python

@@ -1333,7 +1442,7 @@ class LLM(BaseLLM):
         run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
         **kwargs: Any,
     ) -> LLMResult:
-        """Run the LLM on the given prompt and input."""
+        """Async run the LLM on the given prompt and input."""
         generations = []
         new_arg_supported = inspect.signature(self._acall).parameters.get("run_manager")
         for prompt in prompts: