core: docstrings indexing (#23785)

Added missing docstrings. Formatted docstrings into a consistent form.
Leonid Ganeline
2024-07-03 08:27:34 -07:00
committed by GitHub
parent 30fdc2dbe7
commit 716a316654
8 changed files with 310 additions and 33 deletions

libs/core/langchain_core/indexing/__init__.py

@@ -4,6 +4,7 @@ This package contains helper logic to help deal with indexing data into
a vectorstore while avoiding duplicated content and over-writing content
if it's unchanged.
"""
from langchain_core.indexing.api import IndexingResult, aindex, index
from langchain_core.indexing.base import InMemoryRecordManager, RecordManager

libs/core/langchain_core/indexing/api.py

@@ -1,4 +1,5 @@
"""Module contains logic for indexing documents into vector stores."""
from __future__ import annotations
import hashlib
@@ -232,8 +233,8 @@ def index(
record_manager: Timestamped set to keep track of which documents were
updated.
vector_store: Vector store to index the documents into.
- batch_size: Batch size to use when indexing.
- cleanup: How to handle clean up of documents.
+ batch_size: Batch size to use when indexing. Default is 100.
+ cleanup: How to handle clean up of documents. Default is None.
- Incremental: Cleans up all documents that haven't been updated AND
that are associated with source ids that were seen
during indexing.
@@ -246,14 +247,23 @@ def index(
This means that users may see duplicated content during indexing.
- None: Do not delete any documents.
source_id_key: Optional key that helps identify the original source
- of the document.
+ of the document. Default is None.
cleanup_batch_size: Batch size to use when cleaning up documents.
Default is 1_000.
force_update: Force update documents even if they are present in the
record manager. Useful if you are re-indexing with updated embeddings.
Default is False.
Returns:
Indexing result which contains information about how many documents
were added, updated, deleted, or skipped.
Raises:
ValueError: If cleanup mode is not one of 'incremental', 'full' or None.
ValueError: If cleanup mode is incremental and source_id_key is None.
ValueError: If vectorstore does not have
"delete" and "add_documents" required methods.
ValueError: If source_id_key is not None, but is not a string or callable.
"""
if cleanup not in {"incremental", "full", None}:
raise ValueError(
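
To make the documented contract concrete, here is a minimal usage sketch (illustrative, not part of the diff). It assumes InMemoryVectorStore and DeterministicFakeEmbedding are available in this version of langchain_core; any embeddings-backed vector store exposing "delete" and "add_documents" works the same way.

from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, index
from langchain_core.vectorstores import InMemoryVectorStore

record_manager = InMemoryRecordManager(namespace="demo/my_docs")
record_manager.create_schema()
vector_store = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=16))

docs = [
    Document(page_content="hello", metadata={"source": "a.txt"}),
    Document(page_content="world", metadata={"source": "b.txt"}),
]

# Incremental cleanup needs source_id_key: unchanged documents are skipped,
# and stale documents sharing a seen source id are deleted.
result = index(
    docs, record_manager, vector_store, cleanup="incremental", source_id_key="source"
)
print(result)  # e.g. {'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}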
@@ -415,7 +425,7 @@ async def aindex(
cleanup_batch_size: int = 1_000,
force_update: bool = False,
) -> IndexingResult:
"""Index data from the loader into the vector store.
"""Async index data from the loader into the vector store.
Indexing functionality uses a manager to keep track of which documents
are in the vector store.
@@ -437,8 +447,8 @@ async def aindex(
record_manager: Timestamped set to keep track of which documents were
updated.
vector_store: Vector store to index the documents into.
- batch_size: Batch size to use when indexing.
- cleanup: How to handle clean up of documents.
+ batch_size: Batch size to use when indexing. Default is 100.
+ cleanup: How to handle clean up of documents. Default is None.
- Incremental: Cleans up all documents that haven't been updated AND
that are associated with source ids that were seen
during indexing.
@@ -450,14 +460,23 @@ async def aindex(
This means that users may see duplicated content during indexing.
- None: Do not delete any documents.
source_id_key: Optional key that helps identify the original source
- of the document.
+ of the document. Default is None.
cleanup_batch_size: Batch size to use when cleaning up documents.
Default is 1_000.
force_update: Force update documents even if they are present in the
record manager. Useful if you are re-indexing with updated embeddings.
Default is False.
Returns:
Indexing result which contains information about how many documents
were added, updated, deleted, or skipped.
Raises:
ValueError: If cleanup mode is not one of 'incremental', 'full' or None.
ValueError: If cleanup mode is incremental and source_id_key is None.
ValueError: If vectorstore does not have
"adelete" and "aadd_documents" required methods.
ValueError: If source_id_key is not None, but is not a string or callable.
"""
if cleanup not in {"incremental", "full", None}:

libs/core/langchain_core/indexing/base.py

@@ -37,7 +37,7 @@ class RecordManager(ABC):
2. The record manager is currently implemented separately from the
vectorstore, which means that the overall system becomes distributed
and may create issues with consistency. For example, writing to
- record manager succeeds but corresponding writing to vectorstore fails.
+ record manager succeeds, but corresponding writing to vectorstore fails.
"""
def __init__(
@@ -227,6 +227,11 @@ class InMemoryRecordManager(RecordManager):
"""An in-memory record manager for testing purposes."""
def __init__(self, namespace: str) -> None:
"""Initialize the in-memory record manager.
Args:
namespace (str): The namespace for the record manager.
"""
super().__init__(namespace)
# Each key points to a dictionary
# of {'group_id': group_id, 'updated_at': timestamp}
@@ -237,14 +242,16 @@ class InMemoryRecordManager(RecordManager):
"""In-memory schema creation is simply ensuring the structure is initialized."""
async def acreate_schema(self) -> None:
"""In-memory schema creation is simply ensuring the structure is initialized."""
"""Async in-memory schema creation is simply ensuring
the structure is initialized.
"""
def get_time(self) -> float:
"""Get the current server time as a high resolution timestamp!"""
return time.time()
async def aget_time(self) -> float:
"""Get the current server time as a high resolution timestamp!"""
"""Async get the current server time as a high resolution timestamp!"""
return self.get_time()
def update(
@@ -254,6 +261,27 @@ class InMemoryRecordManager(RecordManager):
group_ids: Optional[Sequence[Optional[str]]] = None,
time_at_least: Optional[float] = None,
) -> None:
"""Upsert records into the database.
Args:
keys: A list of record keys to upsert.
group_ids: A list of group IDs corresponding to the keys.
Defaults to None.
time_at_least: Optional timestamp. Implementation can use this
to optionally verify that the timestamp IS at least this time
in the system that stores the data. Defaults to None.
E.g., use to validate that the time in the postgres database
is equal to or larger than the given timestamp, if not
raise an error.
This is meant to help prevent time-drift issues since
time may not be monotonically increasing!
Raises:
ValueError: If the length of keys doesn't match the length of group
ids.
ValueError: If time_at_least is in the future.
"""
if group_ids and len(keys) != len(group_ids):
raise ValueError("Length of keys must match length of group_ids")
for index, key in enumerate(keys):
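
A short sketch of the bookkeeping these update/exists methods implement (illustrative only; all names come from this diff):

from langchain_core.indexing import InMemoryRecordManager

manager = InMemoryRecordManager(namespace="demo")
manager.create_schema()

# Upsert two keys under one group id, e.g. two chunks from the same source file.
manager.update(["doc-1", "doc-2"], group_ids=["src-a", "src-a"])
print(manager.exists(["doc-1", "doc-3"]))  # [True, False]

# Mismatched lengths raise, as documented above.
try:
    manager.update(["doc-3"], group_ids=["src-a", "src-b"])
except ValueError as err:
    print(err)  # Length of keys must match length of group_ids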
@@ -269,12 +297,48 @@ class InMemoryRecordManager(RecordManager):
group_ids: Optional[Sequence[Optional[str]]] = None,
time_at_least: Optional[float] = None,
) -> None:
"""Async upsert records into the database.
Args:
keys: A list of record keys to upsert.
group_ids: A list of group IDs corresponding to the keys.
Defaults to None.
time_at_least: Optional timestamp. Implementation can use this
to optionally verify that the timestamp IS at least this time
in the system that stores the data. Defaults to None.
E.g., use to validate that the time in the postgres database
is equal to or larger than the given timestamp, if not
raise an error.
This is meant to help prevent time-drift issues since
time may not be monotonically increasing!
Raises:
ValueError: If the length of keys doesn't match the length of group
ids.
ValueError: If time_at_least is in the future.
"""
self.update(keys, group_ids=group_ids, time_at_least=time_at_least)
def exists(self, keys: Sequence[str]) -> List[bool]:
"""Check if the provided keys exist in the database.
Args:
keys: A list of keys to check.
Returns:
A list of boolean values indicating the existence of each key.
"""
return [key in self.records for key in keys]
async def aexists(self, keys: Sequence[str]) -> List[bool]:
"""Async check if the provided keys exist in the database.
Args:
keys: A list of keys to check.
Returns:
A list of boolean values indicating the existence of each key.
"""
return self.exists(keys)
def list_keys(
@@ -285,6 +349,21 @@ class InMemoryRecordManager(RecordManager):
group_ids: Optional[Sequence[str]] = None,
limit: Optional[int] = None,
) -> List[str]:
"""List records in the database based on the provided filters.
Args:
before: Filter to list records updated before this time.
Defaults to None.
after: Filter to list records updated after this time.
Defaults to None.
group_ids: Filter to list records with specific group IDs.
Defaults to None.
limit: Optional limit on the number of records to return.
Defaults to None.
Returns:
A list of keys for the matching records.
"""
result = []
for key, data in self.records.items():
if before and data["updated_at"] >= before:
@@ -306,14 +385,39 @@ class InMemoryRecordManager(RecordManager):
group_ids: Optional[Sequence[str]] = None,
limit: Optional[int] = None,
) -> List[str]:
"""Async list records in the database based on the provided filters.
Args:
before: Filter to list records updated before this time.
Defaults to None.
after: Filter to list records updated after this time.
Defaults to None.
group_ids: Filter to list records with specific group IDs.
Defaults to None.
limit: Optional limit on the number of records to return.
Defaults to None.
Returns:
A list of keys for the matching records.
"""
return self.list_keys(
before=before, after=after, group_ids=group_ids, limit=limit
)
def delete_keys(self, keys: Sequence[str]) -> None:
"""Delete specified records from the database.
Args:
keys: A list of keys to delete.
"""
for key in keys:
if key in self.records:
del self.records[key]
async def adelete_keys(self, keys: Sequence[str]) -> None:
"""Async delete specified records from the database.
Args:
keys: A list of keys to delete.
"""
self.delete_keys(keys)
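
Putting the listing and deletion methods together, a brief illustrative sketch (names as defined in this diff):

import time

from langchain_core.indexing import InMemoryRecordManager

manager = InMemoryRecordManager(namespace="demo")
manager.create_schema()

manager.update(["old-doc"], group_ids=["src-a"])
time.sleep(0.001)  # guard against coarse clock resolution
cutoff = manager.get_time()
time.sleep(0.001)
manager.update(["new-doc"], group_ids=["src-b"])

# Filter by update time and by group id.
print(manager.list_keys(before=cutoff))        # ['old-doc']
print(manager.list_keys(group_ids=["src-b"]))  # ['new-doc']

# Delete the stale record; unknown keys are silently ignored.
manager.delete_keys(["old-doc", "missing-key"])
print(manager.exists(["old-doc"]))  # [False]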