core: docstrings indexing (#23785)

Added missing docstrings. Formatted docstrings into a consistent form.
Leonid Ganeline
2024-07-03 08:27:34 -07:00
committed by GitHub
parent 30fdc2dbe7
commit 716a316654
8 changed files with 310 additions and 33 deletions

libs/core/langchain_core/indexing/__init__.py

@@ -4,6 +4,7 @@ This package contains helper logic to help deal with indexing data into
a vectorstore while avoiding duplicated content and over-writing content
if it's unchanged.
"""
from langchain_core.indexing.api import IndexingResult, aindex, index
from langchain_core.indexing.base import InMemoryRecordManager, RecordManager

libs/core/langchain_core/indexing/api.py

@@ -1,4 +1,5 @@
"""Module contains logic for indexing documents into vector stores."""
from __future__ import annotations
import hashlib
@@ -232,8 +233,8 @@ def index(
record_manager: Timestamped set to keep track of which documents were
updated.
vector_store: Vector store to index the documents into.
- batch_size: Batch size to use when indexing.
- cleanup: How to handle clean up of documents.
+ batch_size: Batch size to use when indexing. Default is 100.
+ cleanup: How to handle clean up of documents. Default is None.
- Incremental: Cleans up all documents that haven't been updated AND
that are associated with source ids that were seen
during indexing.
@@ -246,14 +247,23 @@ def index(
This means that users may see duplicated content during indexing.
- None: Do not delete any documents.
source_id_key: Optional key that helps identify the original source
- of the document.
+ of the document. Default is None.
cleanup_batch_size: Batch size to use when cleaning up documents.
Default is 1_000.
force_update: Force update documents even if they are present in the
record manager. Useful if you are re-indexing with updated embeddings.
Default is False.
Returns:
Indexing result which contains information about how many documents
were added, updated, deleted, or skipped.
Raises:
ValueError: If cleanup mode is not one of 'incremental', 'full' or None.
ValueError: If cleanup mode is incremental and source_id_key is None.
ValueError: If vectorstore does not have
"delete" and "add_documents" required methods.
ValueError: If source_id_key is not None, but is not a string or callable.
"""
if cleanup not in {"incremental", "full", None}:
raise ValueError(
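
To make the documented contract concrete, here is a minimal usage sketch (illustrative, not part of the diff). It assumes InMemoryVectorStore and DeterministicFakeEmbedding are available in this version of langchain_core; any embeddings-backed vector store exposing "delete" and "add_documents" works the same way.

from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, index
from langchain_core.vectorstores import InMemoryVectorStore

record_manager = InMemoryRecordManager(namespace="demo/my_docs")
record_manager.create_schema()
vector_store = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=16))

docs = [
    Document(page_content="hello", metadata={"source": "a.txt"}),
    Document(page_content="world", metadata={"source": "b.txt"}),
]

# Incremental cleanup needs source_id_key: unchanged documents are skipped,
# and stale documents sharing a seen source id are deleted.
result = index(
    docs, record_manager, vector_store, cleanup="incremental", source_id_key="source"
)
print(result)  # e.g. {'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}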
@@ -415,7 +425,7 @@ async def aindex(
cleanup_batch_size: int = 1_000,
force_update: bool = False,
) -> IndexingResult:
"""Index data from the loader into the vector store.
"""Async index data from the loader into the vector store.
Indexing functionality uses a manager to keep track of which documents
are in the vector store.
@@ -437,8 +447,8 @@ async def aindex(
record_manager: Timestamped set to keep track of which documents were
updated.
vector_store: Vector store to index the documents into.
- batch_size: Batch size to use when indexing.
- cleanup: How to handle clean up of documents.
+ batch_size: Batch size to use when indexing. Default is 100.
+ cleanup: How to handle clean up of documents. Default is None.
- Incremental: Cleans up all documents that haven't been updated AND
that are associated with source ids that were seen
during indexing.
@@ -450,14 +460,23 @@ async def aindex(
This means that users may see duplicated content during indexing.
- None: Do not delete any documents.
source_id_key: Optional key that helps identify the original source
- of the document.
+ of the document. Default is None.
cleanup_batch_size: Batch size to use when cleaning up documents.
Default is 1_000.
force_update: Force update documents even if they are present in the
record manager. Useful if you are re-indexing with updated embeddings.
Default is False.
Returns:
Indexing result which contains information about how many documents
were added, updated, deleted, or skipped.
Raises:
ValueError: If cleanup mode is not one of 'incremental', 'full' or None.
ValueError: If cleanup mode is incremental and source_id_key is None.
ValueError: If vectorstore does not have
"adelete" and "aadd_documents" required methods.
ValueError: If source_id_key is not None, but is not a string or callable.
"""
if cleanup not in {"incremental", "full", None}:

libs/core/langchain_core/indexing/base.py

@@ -37,7 +37,7 @@ class RecordManager(ABC):
2. The record manager is currently implemented separately from the
vectorstore, which means that the overall system becomes distributed
and may create issues with consistency. For example, writing to
- record manager succeeds but corresponding writing to vectorstore fails.
+ record manager succeeds, but corresponding writing to vectorstore fails.
"""
def __init__(
@@ -227,6 +227,11 @@ class InMemoryRecordManager(RecordManager):
"""An in-memory record manager for testing purposes."""
def __init__(self, namespace: str) -> None:
"""Initialize the in-memory record manager.
Args:
namespace (str): The namespace for the record manager.
"""
super().__init__(namespace)
# Each key points to a dictionary
# of {'group_id': group_id, 'updated_at': timestamp}
@@ -237,14 +242,16 @@ class InMemoryRecordManager(RecordManager):
"""In-memory schema creation is simply ensuring the structure is initialized."""
async def acreate_schema(self) -> None:
"""In-memory schema creation is simply ensuring the structure is initialized."""
"""Async in-memory schema creation is simply ensuring
the structure is initialized.
"""
def get_time(self) -> float:
"""Get the current server time as a high resolution timestamp!"""
return time.time()
async def aget_time(self) -> float:
"""Get the current server time as a high resolution timestamp!"""
"""Async get the current server time as a high resolution timestamp!"""
return self.get_time()
def update(
@@ -254,6 +261,27 @@ class InMemoryRecordManager(RecordManager):
group_ids: Optional[Sequence[Optional[str]]] = None,
time_at_least: Optional[float] = None,
) -> None:
"""Upsert records into the database.
Args:
keys: A list of record keys to upsert.
group_ids: A list of group IDs corresponding to the keys.
Defaults to None.
time_at_least: Optional timestamp. Implementation can use this
to optionally verify that the timestamp IS at least this time
in the system that stores the data. Defaults to None.
E.g., use to validate that the time in the postgres database
is equal to or larger than the given timestamp, if not
raise an error.
This is meant to help prevent time-drift issues since
time may not be monotonically increasing!
Raises:
ValueError: If the length of keys doesn't match the length of group
ids.
ValueError: If time_at_least is in the future.
"""
if group_ids and len(keys) != len(group_ids):
raise ValueError("Length of keys must match length of group_ids")
for index, key in enumerate(keys):
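
A short sketch of the bookkeeping these update/exists methods implement (illustrative only; all names come from this diff):

from langchain_core.indexing import InMemoryRecordManager

manager = InMemoryRecordManager(namespace="demo")
manager.create_schema()

# Upsert two keys under one group id, e.g. two chunks from the same source file.
manager.update(["doc-1", "doc-2"], group_ids=["src-a", "src-a"])
print(manager.exists(["doc-1", "doc-3"]))  # [True, False]

# Mismatched lengths raise, as documented above.
try:
    manager.update(["doc-3"], group_ids=["src-a", "src-b"])
except ValueError as err:
    print(err)  # Length of keys must match length of group_ids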
@@ -269,12 +297,48 @@ class InMemoryRecordManager(RecordManager):
group_ids: Optional[Sequence[Optional[str]]] = None,
time_at_least: Optional[float] = None,
) -> None:
"""Async upsert records into the database.
Args:
keys: A list of record keys to upsert.
group_ids: A list of group IDs corresponding to the keys.
Defaults to None.
time_at_least: Optional timestamp. Implementation can use this
to optionally verify that the timestamp IS at least this time
in the system that stores the data. Defaults to None.
E.g., use to validate that the time in the postgres database
is equal to or larger than the given timestamp, if not
raise an error.
This is meant to help prevent time-drift issues since
time may not be monotonically increasing!
Raises:
ValueError: If the length of keys doesn't match the length of group
ids.
ValueError: If time_at_least is in the future.
"""
self.update(keys, group_ids=group_ids, time_at_least=time_at_least)
def exists(self, keys: Sequence[str]) -> List[bool]:
"""Check if the provided keys exist in the database.
Args:
keys: A list of keys to check.
Returns:
A list of boolean values indicating the existence of each key.
"""
return [key in self.records for key in keys]
async def aexists(self, keys: Sequence[str]) -> List[bool]:
"""Async check if the provided keys exist in the database.
Args:
keys: A list of keys to check.
Returns:
A list of boolean values indicating the existence of each key.
"""
return self.exists(keys)
def list_keys(
@@ -285,6 +349,21 @@ class InMemoryRecordManager(RecordManager):
group_ids: Optional[Sequence[str]] = None,
limit: Optional[int] = None,
) -> List[str]:
"""List records in the database based on the provided filters.
Args:
before: Filter to list records updated before this time.
Defaults to None.
after: Filter to list records updated after this time.
Defaults to None.
group_ids: Filter to list records with specific group IDs.
Defaults to None.
limit: Optional limit on the number of records to return.
Defaults to None.
Returns:
A list of keys for the matching records.
"""
result = []
for key, data in self.records.items():
if before and data["updated_at"] >= before:
@@ -306,14 +385,39 @@ class InMemoryRecordManager(RecordManager):
group_ids: Optional[Sequence[str]] = None,
limit: Optional[int] = None,
) -> List[str]:
"""Async list records in the database based on the provided filters.
Args:
before: Filter to list records updated before this time.
Defaults to None.
after: Filter to list records updated after this time.
Defaults to None.
group_ids: Filter to list records with specific group IDs.
Defaults to None.
limit: Optional limit on the number of records to return.
Defaults to None.
Returns:
A list of keys for the matching records.
"""
return self.list_keys(
before=before, after=after, group_ids=group_ids, limit=limit
)
def delete_keys(self, keys: Sequence[str]) -> None:
"""Delete specified records from the database.
Args:
keys: A list of keys to delete.
"""
for key in keys:
if key in self.records:
del self.records[key]
async def adelete_keys(self, keys: Sequence[str]) -> None:
"""Async delete specified records from the database.
Args:
keys: A list of keys to delete.
"""
self.delete_keys(keys)
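
Putting the listing and deletion methods together, a brief illustrative sketch (names as defined in this diff):

import time

from langchain_core.indexing import InMemoryRecordManager

manager = InMemoryRecordManager(namespace="demo")
manager.create_schema()

manager.update(["old-doc"], group_ids=["src-a"])
time.sleep(0.001)  # guard against coarse clock resolution
cutoff = manager.get_time()
time.sleep(0.001)
manager.update(["new-doc"], group_ids=["src-b"])

# Filter by update time and by group id.
print(manager.list_keys(before=cutoff))        # ['old-doc']
print(manager.list_keys(group_ids=["src-b"]))  # ['new-doc']

# Delete the stale record; unknown keys are silently ignored.
manager.delete_keys(["old-doc", "missing-key"])
print(manager.exists(["old-doc"]))  # [False]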