diff --git a/libs/core/langchain_core/indexing/__init__.py b/libs/core/langchain_core/indexing/__init__.py index bc581297d78..3643b130410 100644 --- a/libs/core/langchain_core/indexing/__init__.py +++ b/libs/core/langchain_core/indexing/__init__.py @@ -4,6 +4,7 @@ This package contains helper logic to help deal with indexing data into a vectorstore while avoiding duplicated content and over-writing content if it's unchanged. """ + from langchain_core.indexing.api import IndexingResult, aindex, index from langchain_core.indexing.base import InMemoryRecordManager, RecordManager diff --git a/libs/core/langchain_core/indexing/api.py b/libs/core/langchain_core/indexing/api.py index 86741ae3a25..0462e0838ba 100644 --- a/libs/core/langchain_core/indexing/api.py +++ b/libs/core/langchain_core/indexing/api.py @@ -1,4 +1,5 @@ """Module contains logic for indexing documents into vector stores.""" + from __future__ import annotations import hashlib @@ -232,8 +233,8 @@ def index( record_manager: Timestamped set to keep track of which documents were updated. vector_store: Vector store to index the documents into. - batch_size: Batch size to use when indexing. - cleanup: How to handle clean up of documents. + batch_size: Batch size to use when indexing. Default is 100. + cleanup: How to handle clean up of documents. Default is None. - Incremental: Cleans up all documents that haven't been updated AND that are associated with source ids that were seen during indexing. @@ -246,14 +247,23 @@ def index( This means that users may see duplicated content during indexing. - None: Do not delete any documents. source_id_key: Optional key that helps identify the original source - of the document. + of the document. Default is None. cleanup_batch_size: Batch size to use when cleaning up documents. + Default is 1_000. force_update: Force update documents even if they are present in the record manager. Useful if you are re-indexing with updated embeddings. + Default is False. Returns: Indexing result which contains information about how many documents were added, updated, deleted, or skipped. + + Raises: + ValueError: If cleanup mode is not one of 'incremental', 'full' or None + ValueError: If cleanup mode is incremental and source_id_key is None. + ValueError: If vectorstore does not have + "delete" and "add_documents" required methods. + ValueError: If source_id_key is not None, but is not a string or callable. """ if cleanup not in {"incremental", "full", None}: raise ValueError( @@ -415,7 +425,7 @@ async def aindex( cleanup_batch_size: int = 1_000, force_update: bool = False, ) -> IndexingResult: - """Index data from the loader into the vector store. + """Async index data from the loader into the vector store. Indexing functionality uses a manager to keep track of which documents are in the vector store. @@ -437,8 +447,8 @@ async def aindex( record_manager: Timestamped set to keep track of which documents were updated. vector_store: Vector store to index the documents into. - batch_size: Batch size to use when indexing. - cleanup: How to handle clean up of documents. + batch_size: Batch size to use when indexing. Default is 100. + cleanup: How to handle clean up of documents. Default is None. - Incremental: Cleans up all documents that haven't been updated AND that are associated with source ids that were seen during indexing. @@ -450,14 +460,23 @@ async def aindex( This means that users may see duplicated content during indexing. - None: Do not delete any documents. 
source_id_key: Optional key that helps identify the original source - of the document. + of the document. Default is None. cleanup_batch_size: Batch size to use when cleaning up documents. + Default is 1_000. force_update: Force update documents even if they are present in the record manager. Useful if you are re-indexing with updated embeddings. + Default is False. Returns: Indexing result which contains information about how many documents were added, updated, deleted, or skipped. + + Raises: + ValueError: If cleanup mode is not one of 'incremental', 'full' or None + ValueError: If cleanup mode is incremental and source_id_key is None. + ValueError: If vectorstore does not have + "adelete" and "aadd_documents" required methods. + ValueError: If source_id_key is not None, but is not a string or callable. """ if cleanup not in {"incremental", "full", None}: diff --git a/libs/core/langchain_core/indexing/base.py b/libs/core/langchain_core/indexing/base.py index aae9a328246..15912358ba0 100644 --- a/libs/core/langchain_core/indexing/base.py +++ b/libs/core/langchain_core/indexing/base.py @@ -37,7 +37,7 @@ class RecordManager(ABC): 2. The record manager is currently implemented separately from the vectorstore, which means that the overall system becomes distributed and may create issues with consistency. For example, writing to - record manager succeeds but corresponding writing to vectorstore fails. + record manager succeeds, but corresponding writing to vectorstore fails. """ def __init__( @@ -227,6 +227,11 @@ class InMemoryRecordManager(RecordManager): """An in-memory record manager for testing purposes.""" def __init__(self, namespace: str) -> None: + """Initialize the in-memory record manager. + + Args: + namespace (str): The namespace for the record manager. + """ super().__init__(namespace) # Each key points to a dictionary # of {'group_id': group_id, 'updated_at': timestamp} @@ -237,14 +242,16 @@ class InMemoryRecordManager(RecordManager): """In-memory schema creation is simply ensuring the structure is initialized.""" async def acreate_schema(self) -> None: - """In-memory schema creation is simply ensuring the structure is initialized.""" + """Async in-memory schema creation is simply ensuring + the structure is initialized. + """ def get_time(self) -> float: """Get the current server time as a high resolution timestamp!""" return time.time() async def aget_time(self) -> float: - """Get the current server time as a high resolution timestamp!""" + """Async get the current server time as a high resolution timestamp!""" return self.get_time() def update( @@ -254,6 +261,27 @@ class InMemoryRecordManager(RecordManager): group_ids: Optional[Sequence[Optional[str]]] = None, time_at_least: Optional[float] = None, ) -> None: + """Upsert records into the database. + + Args: + keys: A list of record keys to upsert. + group_ids: A list of group IDs corresponding to the keys. + Defaults to None. + time_at_least: Optional timestamp. Implementation can use this + to optionally verify that the timestamp IS at least this time + in the system that stores. Defaults to None. + E.g., use to validate that the time in the postgres database + is equal to or larger than the given timestamp, if not + raise an error. + This is meant to help prevent time-drift issues since + time may not be monotonically increasing! + + Raises: + ValueError: If the length of keys doesn't match the length of group + ids. + ValueError: If time_at_least is in the future. 
+ """ + if group_ids and len(keys) != len(group_ids): raise ValueError("Length of keys must match length of group_ids") for index, key in enumerate(keys): @@ -269,12 +297,48 @@ class InMemoryRecordManager(RecordManager): group_ids: Optional[Sequence[Optional[str]]] = None, time_at_least: Optional[float] = None, ) -> None: + """Async upsert records into the database. + + Args: + keys: A list of record keys to upsert. + group_ids: A list of group IDs corresponding to the keys. + Defaults to None. + time_at_least: Optional timestamp. Implementation can use this + to optionally verify that the timestamp IS at least this time + in the system that stores. Defaults to None. + E.g., use to validate that the time in the postgres database + is equal to or larger than the given timestamp, if not + raise an error. + This is meant to help prevent time-drift issues since + time may not be monotonically increasing! + + Raises: + ValueError: If the length of keys doesn't match the length of group + ids. + ValueError: If time_at_least is in the future. + """ self.update(keys, group_ids=group_ids, time_at_least=time_at_least) def exists(self, keys: Sequence[str]) -> List[bool]: + """Check if the provided keys exist in the database. + + Args: + keys: A list of keys to check. + + Returns: + A list of boolean values indicating the existence of each key. + """ return [key in self.records for key in keys] async def aexists(self, keys: Sequence[str]) -> List[bool]: + """Async check if the provided keys exist in the database. + + Args: + keys: A list of keys to check. + + Returns: + A list of boolean values indicating the existence of each key. + """ return self.exists(keys) def list_keys( @@ -285,6 +349,21 @@ class InMemoryRecordManager(RecordManager): group_ids: Optional[Sequence[str]] = None, limit: Optional[int] = None, ) -> List[str]: + """List records in the database based on the provided filters. + + Args: + before: Filter to list records updated before this time. + Defaults to None. + after: Filter to list records updated after this time. + Defaults to None. + group_ids: Filter to list records with specific group IDs. + Defaults to None. + limit: optional limit on the number of records to return. + Defaults to None. + + Returns: + A list of keys for the matching records. + """ result = [] for key, data in self.records.items(): if before and data["updated_at"] >= before: @@ -306,14 +385,39 @@ class InMemoryRecordManager(RecordManager): group_ids: Optional[Sequence[str]] = None, limit: Optional[int] = None, ) -> List[str]: + """Async list records in the database based on the provided filters. + + Args: + before: Filter to list records updated before this time. + Defaults to None. + after: Filter to list records updated after this time. + Defaults to None. + group_ids: Filter to list records with specific group IDs. + Defaults to None. + limit: optional limit on the number of records to return. + Defaults to None. + + Returns: + A list of keys for the matching records. + """ return self.list_keys( before=before, after=after, group_ids=group_ids, limit=limit ) def delete_keys(self, keys: Sequence[str]) -> None: + """Delete specified records from the database. + + Args: + keys: A list of keys to delete. + """ for key in keys: if key in self.records: del self.records[key] async def adelete_keys(self, keys: Sequence[str]) -> None: + """Async delete specified records from the database. + + Args: + keys: A list of keys to delete. 
+ """ self.delete_keys(keys) diff --git a/libs/core/langchain_core/language_models/__init__.py b/libs/core/langchain_core/language_models/__init__.py index daa481ba2d7..4781f98cc9f 100644 --- a/libs/core/langchain_core/language_models/__init__.py +++ b/libs/core/langchain_core/language_models/__init__.py @@ -1,15 +1,15 @@ """**Language Model** is a type of model that can generate text or complete -text prompts. +text prompts. -LangChain has two main classes to work with language models: **Chat Models** +LangChain has two main classes to work with language models: **Chat Models** and "old-fashioned" **LLMs**. ## Chat Models -Language models that use a sequence of messages as inputs and return chat messages +Language models that use a sequence of messages as inputs and return chat messages as outputs (as opposed to using plain text). These are traditionally newer models ( -older models are generally LLMs, see below). Chat models support the assignment of -distinct roles to conversation messages, helping to distinguish messages from the AI, +older models are generally LLMs, see below). Chat models support the assignment of +distinct roles to conversation messages, helping to distinguish messages from the AI, users, and instructions such as system messages. The key abstraction for chat models is `BaseChatModel`. Implementations @@ -23,15 +23,15 @@ https://python.langchain.com/v0.2/docs/how_to/custom_chat_model/ ## LLMs -Language models that takes a string as input and returns a string. +Language models that takes a string as input and returns a string. These are traditionally older models (newer models generally are Chat Models, see below). -Although the underlying models are string in, string out, the LangChain wrappers -also allow these models to take messages as input. This gives them the same interface -as Chat Models. When messages are passed in as input, they will be formatted into a +Although the underlying models are string in, string out, the LangChain wrappers +also allow these models to take messages as input. This gives them the same interface +as Chat Models. When messages are passed in as input, they will be formatted into a string under the hood before being passed to the underlying model. -To implement a custom LLM, inherit from `BaseLLM` or `LLM`. +To implement a custom LLM, inherit from `BaseLLM` or `LLM`. Please see the following guide for more information on how to implement a custom LLM: https://python.langchain.com/v0.2/docs/how_to/custom_llm/ diff --git a/libs/core/langchain_core/language_models/base.py b/libs/core/langchain_core/language_models/base.py index 52bd6261773..262a3b4e548 100644 --- a/libs/core/langchain_core/language_models/base.py +++ b/libs/core/langchain_core/language_models/base.py @@ -39,6 +39,11 @@ if TYPE_CHECKING: @lru_cache(maxsize=None) # Cache the tokenizer def get_tokenizer() -> Any: + """Get a GPT-2 tokenizer instance. + + This function is cached to avoid re-loading the tokenizer + every time it is called. + """ try: from transformers import GPT2TokenizerFast # type: ignore[import] except ImportError: @@ -77,7 +82,7 @@ class BaseLanguageModel( ): """Abstract base class for interfacing with language models. - All language model wrappers inherit from BaseLanguageModel. + All language model wrappers inherited from BaseLanguageModel. """ cache: Union[BaseCache, bool, None] = None @@ -108,6 +113,12 @@ class BaseLanguageModel( """If verbose is None, set it. This allows users to pass in None as verbose to access the global setting. 
+ + Args: + verbose: The verbosity setting to use. + + Returns: + The given verbosity, or the global setting if verbose is None. """ if verbose is None: return _get_verbosity() @@ -324,7 +335,7 @@ class BaseLanguageModel( def get_num_tokens(self, text: str) -> int: """Get the number of tokens present in the text. - Useful for checking if an input will fit in a model's context window. + Useful for checking if an input fits in a model's context window. Args: text: The string input to tokenize. @@ -337,7 +348,7 @@ class BaseLanguageModel( def get_num_tokens_from_messages(self, messages: List[BaseMessage]) -> int: """Get the number of tokens in the messages. - Useful for checking if an input will fit in a model's context window. + Useful for checking if an input fits in a model's context window. Args: messages: The message inputs to tokenize. diff --git a/libs/core/langchain_core/language_models/chat_models.py b/libs/core/langchain_core/language_models/chat_models.py index 8dd6dcc7719..13c68dded92 100644 --- a/libs/core/langchain_core/language_models/chat_models.py +++ b/libs/core/langchain_core/language_models/chat_models.py @@ -68,16 +68,31 @@ if TYPE_CHECKING: class LangSmithParams(TypedDict, total=False): + """LangSmith parameters for tracing.""" + ls_provider: str + """Provider of the model.""" ls_model_name: str + """Name of the model.""" ls_model_type: Literal["chat"] + """Type of the model. Should be 'chat'.""" ls_temperature: Optional[float] + """Temperature for generation.""" ls_max_tokens: Optional[int] + """Max tokens for generation.""" ls_stop: Optional[List[str]] + """Stop words for generation.""" def generate_from_stream(stream: Iterator[ChatGenerationChunk]) -> ChatResult: - """Generate from a stream.""" + """Generate from a stream. + + Args: + stream: Iterator of ChatGenerationChunk. + + Returns: + ChatResult: Chat result. + """ generation: Optional[ChatGenerationChunk] = None for chunk in stream: @@ -99,7 +114,14 @@ def generate_from_stream(stream: Iterator[ChatGenerationChunk]) -> ChatResult: async def agenerate_from_stream( stream: AsyncIterator[ChatGenerationChunk], ) -> ChatResult: - """Async generate from a stream.""" + """Async generate from a stream. + + Args: + stream: AsyncIterator of ChatGenerationChunk. + + Returns: + ChatResult: Chat result. + """ generation: Optional[ChatGenerationChunk] = None async for chunk in stream: @@ -200,7 +222,17 @@ class BaseChatModel(BaseLanguageModel[BaseMessage], ABC): @root_validator(pre=True) def raise_deprecation(cls, values: Dict) -> Dict: - """Raise deprecation warning if callback_manager is used.""" + """Raise deprecation warning if callback_manager is used. + + Args: + values (Dict): Values to validate. + + Returns: + Dict: Validated values. + + Raises: + DeprecationWarning: If callback_manager is used. + """ if values.get("callback_manager") is not None: warnings.warn( "callback_manager is deprecated.
Please use callbacks instead.", diff --git a/libs/core/langchain_core/language_models/fake_chat_models.py b/libs/core/langchain_core/language_models/fake_chat_models.py index 1b265ec4c15..92b0e58993e 100644 --- a/libs/core/langchain_core/language_models/fake_chat_models.py +++ b/libs/core/langchain_core/language_models/fake_chat_models.py @@ -1,4 +1,5 @@ """Fake ChatModel for testing purposes.""" + import asyncio import re import time diff --git a/libs/core/langchain_core/language_models/llms.py b/libs/core/langchain_core/language_models/llms.py index e52ea156e34..1b1e014a8ef 100644 --- a/libs/core/langchain_core/language_models/llms.py +++ b/libs/core/langchain_core/language_models/llms.py @@ -78,7 +78,20 @@ def create_base_retry_decorator( Union[AsyncCallbackManagerForLLMRun, CallbackManagerForLLMRun] ] = None, ) -> Callable[[Any], Any]: - """Create a retry decorator for a given LLM and provided list of error types.""" + """Create a retry decorator for a given LLM and a provided + list of error types. + + Args: + error_types: List of error types to retry on. + max_retries: Number of retries. Default is 1. + run_manager: Callback manager for the run. Default is None. + + Returns: + A retry decorator. + + Raises: + ValueError: If the cache is not set and cache is True. + """ _logging = before_sleep_log(logger, logging.WARNING) @@ -141,7 +154,20 @@ def get_prompts( prompts: List[str], cache: Optional[Union[BaseCache, bool, None]] = None, ) -> Tuple[Dict[int, List], str, List[int], List[str]]: - """Get prompts that are already cached.""" + """Get prompts that are already cached. + + Args: + params: Dictionary of parameters. + prompts: List of prompts. + cache: Cache object. Default is None. + + Returns: + A tuple of existing prompts, llm_string, missing prompt indexes, + and missing prompts. + + Raises: + ValueError: If the cache is not set and cache is True. + """ llm_string = str(sorted([(k, v) for k, v in params.items()])) missing_prompts = [] missing_prompt_idxs = [] @@ -164,7 +190,20 @@ async def aget_prompts( prompts: List[str], cache: Optional[Union[BaseCache, bool, None]] = None, ) -> Tuple[Dict[int, List], str, List[int], List[str]]: - """Get prompts that are already cached. Async version.""" + """Get prompts that are already cached. Async version. + + Args: + params: Dictionary of parameters. + prompts: List of prompts. + cache: Cache object. Default is None. + + Returns: + A tuple of existing prompts, llm_string, missing prompt indexes, + and missing prompts. + + Raises: + ValueError: If the cache is not set and cache is True. + """ llm_string = str(sorted([(k, v) for k, v in params.items()])) missing_prompts = [] missing_prompt_idxs = [] @@ -189,7 +228,22 @@ def update_cache( new_results: LLMResult, prompts: List[str], ) -> Optional[dict]: - """Update the cache and get the LLM output.""" + """Update the cache and get the LLM output. + + Args: + cache: Cache object. + existing_prompts: Dictionary of existing prompts. + llm_string: LLM string. + missing_prompt_idxs: List of missing prompt indexes. + new_results: LLMResult object. + prompts: List of prompts. + + Returns: + LLM output. + + Raises: + ValueError: If the cache is not set and cache is True. + """ llm_cache = _resolve_cache(cache) for i, result in enumerate(new_results.generations): existing_prompts[missing_prompt_idxs[i]] = result @@ -208,7 +262,23 @@ async def aupdate_cache( new_results: LLMResult, prompts: List[str], ) -> Optional[dict]: - """Update the cache and get the LLM output.
Async version""" + """Update the cache and get the LLM output. Async version. + + Args: + cache: Cache object. + existing_prompts: Dictionary of existing prompts. + llm_string: LLM string. + missing_prompt_idxs: List of missing prompt indexes. + new_results: LLMResult object. + prompts: List of prompts. + + Returns: + LLM output. + + Raises: + ValueError: If the cache is not set and cache is True. + """ + llm_cache = _resolve_cache(cache) for i, result in enumerate(new_results.generations): existing_prompts[missing_prompt_idxs[i]] = result @@ -706,6 +776,15 @@ class BaseLLM(BaseLanguageModel[str], ABC): first occurrence of any of these substrings. callbacks: Callbacks to pass through. Used for executing additional functionality, such as logging or streaming, throughout generation. + tags: List of tags to associate with each prompt. If provided, the length + of the list must match the length of the prompts list. + metadata: List of metadata dictionaries to associate with each prompt. If + provided, the length of the list must match the length of the prompts + list. + run_name: List of run names to associate with each prompt. If provided, the + length of the list must match the length of the prompts list. + run_id: List of run IDs to associate with each prompt. If provided, the + length of the list must match the length of the prompts list. **kwargs: Arbitrary additional keyword arguments. These are usually passed to the model provider API call. @@ -924,6 +1003,15 @@ class BaseLLM(BaseLanguageModel[str], ABC): first occurrence of any of these substrings. callbacks: Callbacks to pass through. Used for executing additional functionality, such as logging or streaming, throughout generation. + tags: List of tags to associate with each prompt. If provided, the length + of the list must match the length of the prompts list. + metadata: List of metadata dictionaries to associate with each prompt. If + provided, the length of the list must match the length of the prompts + list. + run_name: List of run names to associate with each prompt. If provided, the + length of the list must match the length of the prompts list. + run_id: List of run IDs to associate with each prompt. If provided, the + length of the list must match the length of the prompts list. **kwargs: Arbitrary additional keyword arguments. These are usually passed to the model provider API call. @@ -1075,7 +1163,25 @@ class BaseLLM(BaseLanguageModel[str], ABC): metadata: Optional[Dict[str, Any]] = None, **kwargs: Any, ) -> str: - """Check Cache and run the LLM on the given prompt and input.""" + """Check Cache and run the LLM on the given prompt and input. + + Args: + prompt: The prompt to generate from. + stop: Stop words to use when generating. Model output is cut off at the + first occurrence of any of these substrings. + callbacks: Callbacks to pass through. Used for executing additional + functionality, such as logging or streaming, throughout generation. + tags: List of tags to associate with the prompt. + metadata: Metadata to associate with the prompt. + **kwargs: Arbitrary additional keyword arguments. These are usually passed + to the model provider API call. + + Returns: + The generated text. + + Raises: + ValueError: If the prompt is not a string. + """ if not isinstance(prompt, str): raise ValueError( "Argument `prompt` is expected to be a string. Instead found " @@ -1190,6 +1296,9 @@ class BaseLLM(BaseLanguageModel[str], ABC): Args: file_path: Path to file to save the LLM to. 
+ Raises: + ValueError: If the file path is not a string or Path object. + Example: .. code-block:: python @@ -1333,7 +1442,7 @@ class LLM(BaseLLM): run_manager: Optional[AsyncCallbackManagerForLLMRun] = None, **kwargs: Any, ) -> LLMResult: - """Run the LLM on the given prompt and input.""" + """Async run the LLM on the given prompt and input.""" generations = [] new_arg_supported = inspect.signature(self._acall).parameters.get("run_manager") for prompt in prompts:
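For context, here is a minimal usage sketch of the indexing API whose docstrings are expanded above. It uses `index` and `InMemoryRecordManager` from `langchain_core.indexing`; the `InMemoryVectorStore` import and the `ToyEmbeddings` class are illustrative stand-ins rather than part of this diff (any vector store that implements the required `add_documents`/`delete` methods and any `Embeddings` implementation should behave the same way):

```python
from typing import List

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.indexing import InMemoryRecordManager, index
from langchain_core.vectorstores import InMemoryVectorStore  # assumed available in your version


class ToyEmbeddings(Embeddings):
    """Deterministic stand-in embeddings, for illustration only."""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [[float(len(text)), float(text.count(" "))] for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return self.embed_documents([text])[0]


record_manager = InMemoryRecordManager(namespace="demo/my_docs")
record_manager.create_schema()  # a no-op for the in-memory implementation

vector_store = InMemoryVectorStore(embedding=ToyEmbeddings())

docs = [
    Document(page_content="alpha", metadata={"source": "a.txt"}),
    Document(page_content="beta", metadata={"source": "b.txt"}),
]

# First pass: both documents are added.
# cleanup="incremental" requires source_id_key, per the Raises section above.
result = index(docs, record_manager, vector_store,
               cleanup="incremental", source_id_key="source")
print(result)  # e.g. {'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

# Second pass with unchanged content: documents are skipped, not duplicated.
result = index(docs, record_manager, vector_store,
               cleanup="incremental", source_id_key="source")
print(result)  # e.g. {'num_added': 0, 'num_updated': 0, 'num_skipped': 2, 'num_deleted': 0}
```

The second call illustrates the deduplication behaviour described in the `index` docstring: unchanged documents already tracked by the record manager are skipped rather than re-written.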