From e731ba1e47542137a246f1f5439c2f7fd0da16a2 Mon Sep 17 00:00:00 2001 From: Mason Daugherty Date: Mon, 20 Oct 2025 18:40:19 -0400 Subject: [PATCH] style: more refs work (#33616) --- .../langchain_core/documents/transformers.py | 8 +- libs/core/langchain_core/load/dump.py | 12 +- libs/core/langchain_core/messages/utils.py | 4 +- libs/core/langchain_core/runnables/branch.py | 18 +-- .../langchain_core/vectorstores/in_memory.py | 4 +- .../langchain_mistralai/chat_models.py | 2 +- .../langchain_tests/__init__.py | 7 +- .../integration_tests/chat_models.py | 55 +++---- .../integration_tests/tools.py | 14 +- .../integration_tests/vectorstores.py | 10 +- .../langchain_tests/unit_tests/chat_models.py | 143 +++++++++--------- .../langchain_text_splitters/__init__.py | 4 +- .../langchain_text_splitters/base.py | 10 +- .../langchain_text_splitters/html.py | 100 ++++++------ .../langchain_text_splitters/json.py | 2 +- .../tests/unit_tests/test_text_splitters.py | 6 +- 16 files changed, 193 insertions(+), 206 deletions(-) diff --git a/libs/core/langchain_core/documents/transformers.py b/libs/core/langchain_core/documents/transformers.py index 4cb37470e6b..4b815464a6d 100644 --- a/libs/core/langchain_core/documents/transformers.py +++ b/libs/core/langchain_core/documents/transformers.py @@ -57,10 +57,10 @@ class BaseDocumentTransformer(ABC): """Transform a list of documents. Args: - documents: A sequence of Documents to be transformed. + documents: A sequence of `Document` objects to be transformed. Returns: - A sequence of transformed Documents. + A sequence of transformed `Document` objects. """ async def atransform_documents( @@ -69,10 +69,10 @@ class BaseDocumentTransformer(ABC): """Asynchronously transform a list of documents. Args: - documents: A sequence of Documents to be transformed. + documents: A sequence of `Document` objects to be transformed. Returns: - A sequence of transformed Documents. + A sequence of transformed `Document` objects. 
""" return await run_in_executor( None, self.transform_documents, documents, **kwargs diff --git a/libs/core/langchain_core/load/dump.py b/libs/core/langchain_core/load/dump.py index f1c07db2fe3..07bfa3844d0 100644 --- a/libs/core/langchain_core/load/dump.py +++ b/libs/core/langchain_core/load/dump.py @@ -38,7 +38,7 @@ def _dump_pydantic_models(obj: Any) -> Any: def dumps(obj: Any, *, pretty: bool = False, **kwargs: Any) -> str: - """Return a json string representation of an object. + """Return a JSON string representation of an object. Args: obj: The object to dump. @@ -47,7 +47,7 @@ def dumps(obj: Any, *, pretty: bool = False, **kwargs: Any) -> str: **kwargs: Additional arguments to pass to `json.dumps` Returns: - A json string representation of the object. + A JSON string representation of the object. Raises: ValueError: If `default` is passed as a kwarg. @@ -71,14 +71,12 @@ def dumps(obj: Any, *, pretty: bool = False, **kwargs: Any) -> str: def dumpd(obj: Any) -> Any: """Return a dict representation of an object. - !!! note - Unfortunately this function is not as efficient as it could be because it first - dumps the object to a json string and then loads it back into a dictionary. - Args: obj: The object to dump. Returns: - dictionary that can be serialized to json using json.dumps + Dictionary that can be serialized to json using `json.dumps`. """ + # Unfortunately this function is not as efficient as it could be because it first + # dumps the object to a json string and then loads it back into a dictionary. return json.loads(dumps(obj)) diff --git a/libs/core/langchain_core/messages/utils.py b/libs/core/langchain_core/messages/utils.py index c9eab08da15..267bdfeb179 100644 --- a/libs/core/langchain_core/messages/utils.py +++ b/libs/core/langchain_core/messages/utils.py @@ -439,8 +439,8 @@ def filter_messages( exclude_ids: Message IDs to exclude. exclude_tool_calls: Tool call IDs to exclude. 
Can be one of the following: - - `True`: all `AIMessage`s with tool calls and all - `ToolMessage` objects will be excluded. + - `True`: All `AIMessage` objects with tool calls and all `ToolMessage` + objects will be excluded. - a sequence of tool call IDs to exclude: - `ToolMessage` objects with the corresponding tool call ID will be excluded. diff --git a/libs/core/langchain_core/runnables/branch.py b/libs/core/langchain_core/runnables/branch.py index e773d9165ff..e7d40151f11 100644 --- a/libs/core/langchain_core/runnables/branch.py +++ b/libs/core/langchain_core/runnables/branch.py @@ -40,13 +40,13 @@ from langchain_core.runnables.utils import ( class RunnableBranch(RunnableSerializable[Input, Output]): """Runnable that selects which branch to run based on a condition. - The Runnable is initialized with a list of (condition, Runnable) pairs and + The Runnable is initialized with a list of `(condition, Runnable)` pairs and a default branch. When operating on an input, the first condition that evaluates to True is - selected, and the corresponding Runnable is run on the input. + selected, and the corresponding `Runnable` is run on the input. - If no condition evaluates to True, the default branch is run on the input. + If no condition evaluates to `True`, the default branch is run on the input. Examples: ```python @@ -65,9 +65,9 @@ class RunnableBranch(RunnableSerializable[Input, Output]): """ branches: Sequence[tuple[Runnable[Input, bool], Runnable[Input, Output]]] - """A list of (condition, Runnable) pairs.""" + """A list of `(condition, Runnable)` pairs.""" default: Runnable[Input, Output] - """A Runnable to run if no condition is met.""" + """A `Runnable` to run if no condition is met.""" def __init__( self, @@ -79,15 +79,15 @@ class RunnableBranch(RunnableSerializable[Input, Output]): ] | RunnableLike, ) -> None: - """A Runnable that runs one of two branches based on a condition. + """A `Runnable` that runs one of two branches based on a condition. 
Args: - *branches: A list of (condition, Runnable) pairs. - Defaults a Runnable to run if no condition is met. + *branches: A list of `(condition, Runnable)` pairs. + Defaults a `Runnable` to run if no condition is met. Raises: ValueError: If the number of branches is less than 2. - TypeError: If the default branch is not Runnable, Callable or Mapping. + TypeError: If the default branch is not `Runnable`, `Callable` or `Mapping`. TypeError: If a branch is not a tuple or list. ValueError: If a branch is not of length 2. """ diff --git a/libs/core/langchain_core/vectorstores/in_memory.py b/libs/core/langchain_core/vectorstores/in_memory.py index 91b5c2c243f..90c6b84ce45 100644 --- a/libs/core/langchain_core/vectorstores/in_memory.py +++ b/libs/core/langchain_core/vectorstores/in_memory.py @@ -260,7 +260,7 @@ class InMemoryVectorStore(VectorStore): ids: The ids of the documents to get. Returns: - A list of Document objects. + A list of `Document` objects. """ documents = [] @@ -284,7 +284,7 @@ class InMemoryVectorStore(VectorStore): ids: The ids of the documents to get. Returns: - A list of Document objects. + A list of `Document` objects. 
""" return self.get_by_ids(ids) diff --git a/libs/partners/mistralai/langchain_mistralai/chat_models.py b/libs/partners/mistralai/langchain_mistralai/chat_models.py index 78cba61dc32..d2bf7d50a6e 100644 --- a/libs/partners/mistralai/langchain_mistralai/chat_models.py +++ b/libs/partners/mistralai/langchain_mistralai/chat_models.py @@ -446,7 +446,7 @@ def _convert_message_to_mistral_chat_message( class ChatMistralAI(BaseChatModel): - """A chat model that uses the MistralAI API.""" + """A chat model that uses the Mistral AI API.""" # The type for client and async_client is ignored because the type is not # an Optional after the model is initialized and the model_validator diff --git a/libs/standard-tests/langchain_tests/__init__.py b/libs/standard-tests/langchain_tests/__init__.py index d2b756dde09..0d06a1c01ba 100644 --- a/libs/standard-tests/langchain_tests/__init__.py +++ b/libs/standard-tests/langchain_tests/__init__.py @@ -1,6 +1,5 @@ -"""Base Test classes for standard testing. +"""Base test classes for standard testing. -To learn how to use these classes, see the -[integration standard testing](https://python.langchain.com/docs/contributing/how_to/integrations/standard_tests/) -guide. +To learn how to use these, see the guide on +[integrating standard tests](https://docs.langchain.com/oss/python/contributing/standard-tests-langchain). """ diff --git a/libs/standard-tests/langchain_tests/integration_tests/chat_models.py b/libs/standard-tests/langchain_tests/integration_tests/chat_models.py index c9fc99d07ca..fa2f8f371e3 100644 --- a/libs/standard-tests/langchain_tests/integration_tests/chat_models.py +++ b/libs/standard-tests/langchain_tests/integration_tests/chat_models.py @@ -182,23 +182,21 @@ class ChatModelIntegrationTests(ChatModelTests): Test subclasses **must** implement the following two properties: - chat_model_class - The chat model class to test, e.g., `ChatParrotLink`. + `chat_model_class`: The chat model class to test, e.g., `ChatParrotLink`. 
- ```python - @property - def chat_model_class(self) -> Type[ChatParrotLink]: - return ChatParrotLink - ``` + ```python + @property + def chat_model_class(self) -> Type[ChatParrotLink]: + return ChatParrotLink + ``` - chat_model_params - Initialization parameters for the chat model. + `chat_model_params`: Initialization parameters for the chat model. - ```python - @property - def chat_model_params(self) -> dict: - return {"model": "bird-brain-001", "temperature": 0} - ``` + ```python + @property + def chat_model_params(self) -> dict: + return {"model": "bird-brain-001", "temperature": 0} + ``` In addition, test subclasses can control what features are tested (such as tool calling or multi-modality) by selectively overriding the following properties. @@ -266,7 +264,7 @@ class ChatModelIntegrationTests(ChatModelTests): `with_structured_output` method is overridden. If the base implementation is intended to be used, this method should be overridden. - See: https://python.langchain.com/docs/concepts/structured_outputs/ + See: https://docs.langchain.com/oss/python/langchain/structured-output ```python @property @@ -290,7 +288,7 @@ class ChatModelIntegrationTests(ChatModelTests): Boolean property indicating whether the chat model supports JSON mode in `with_structured_output`. 
- See: https://python.langchain.com/docs/concepts/structured_outputs/#json-mode + See: https://docs.langchain.com/oss/python/langchain/structured-output ```python @property @@ -324,7 +322,7 @@ class ChatModelIntegrationTests(ChatModelTests): } ``` - See https://python.langchain.com/docs/concepts/multimodality/ + See https://docs.langchain.com/oss/python/langchain/models#multimodal ```python @property @@ -349,7 +347,7 @@ class ChatModelIntegrationTests(ChatModelTests): } ``` - See https://python.langchain.com/docs/concepts/multimodality/ + See https://docs.langchain.com/oss/python/langchain/models#multimodal ```python @property @@ -374,7 +372,7 @@ class ChatModelIntegrationTests(ChatModelTests): } ``` - See https://python.langchain.com/docs/concepts/multimodality/ + See https://docs.langchain.com/oss/python/langchain/models#multimodal ```python @property @@ -399,7 +397,7 @@ class ChatModelIntegrationTests(ChatModelTests): } ``` - See https://python.langchain.com/docs/concepts/multimodality/ + See https://docs.langchain.com/oss/python/langchain/models#multimodal ```python @property @@ -420,8 +418,8 @@ class ChatModelIntegrationTests(ChatModelTests): Defaults to `True`. - `usage_metadata` is an optional dict attribute on `AIMessage`s that track input - and output tokens. + `usage_metadata` is an optional dict attribute on `AIMessage` objects that track + input and output tokens. [See more](https://python.langchain.com/api_reference/core/messages/langchain_core.messages.ai.UsageMetadata.html). ```python @@ -537,8 +535,8 @@ class ChatModelIntegrationTests(ChatModelTests): Property controlling what usage metadata details are emitted in both invoke and stream. - `usage_metadata` is an optional dict attribute on `AIMessage`s that track input - and output tokens. + `usage_metadata` is an optional dict attribute on `AIMessage` objects that track + input and output tokens. 
[See more](https://python.langchain.com/api_reference/core/messages/langchain_core.messages.ai.UsageMetadata.html). It includes optional keys `input_token_details` and `output_token_details` @@ -580,9 +578,7 @@ class ChatModelIntegrationTests(ChatModelTests): also exclude additional headers, override the default exclusions, or apply other customizations to the VCR configuration. See example below: - ```python - :caption: tests/conftest.py - + ```python title="tests/conftest.py" import pytest from langchain_tests.conftest import ( _base_vcr_config as _base_vcr_config, @@ -617,9 +613,7 @@ class ChatModelIntegrationTests(ChatModelTests): to your VCR fixture and enable this serializer in the config. See example below: - ```python - :caption: tests/conftest.py - + ```python title="tests/conftest.py" import pytest from langchain_tests.conftest import ( CustomPersister, @@ -658,7 +652,6 @@ class ChatModelIntegrationTests(ChatModelTests): def pytest_recording_configure(config: dict, vcr: VCR) -> None: vcr.register_persister(CustomPersister()) vcr.register_serializer("yaml.gz", CustomSerializer()) - ``` You can inspect the contents of the compressed cassettes (e.g., to diff --git a/libs/standard-tests/langchain_tests/integration_tests/tools.py b/libs/standard-tests/langchain_tests/integration_tests/tools.py index b485cbe7023..8c65f37dab9 100644 --- a/libs/standard-tests/langchain_tests/integration_tests/tools.py +++ b/libs/standard-tests/langchain_tests/integration_tests/tools.py @@ -12,7 +12,8 @@ class ToolsIntegrationTests(ToolsTests): def test_invoke_matches_output_schema(self, tool: BaseTool) -> None: """Test invoke matches output schema. - If invoked with a ToolCall, the tool should return a valid ToolMessage content. + If invoked with a `ToolCall`, the tool should return a valid `ToolMessage` + content. 
If you have followed the [custom tool guide](https://python.langchain.com/docs/how_to/custom_tools/), this test should always pass because ToolCall inputs are handled by the @@ -44,7 +45,8 @@ class ToolsIntegrationTests(ToolsTests): async def test_async_invoke_matches_output_schema(self, tool: BaseTool) -> None: """Test async invoke matches output schema. - If ainvoked with a ToolCall, the tool should return a valid ToolMessage content. + If ainvoked with a `ToolCall`, the tool should return a valid `ToolMessage` + content. For debugging tips, see `test_invoke_matches_output_schema`. """ @@ -68,9 +70,9 @@ class ToolsIntegrationTests(ToolsTests): assert all(isinstance(c, str | dict) for c in tool_message.content) def test_invoke_no_tool_call(self, tool: BaseTool) -> None: - """Test invoke without ToolCall. + """Test invoke without `ToolCall`. - If invoked without a ToolCall, the tool can return anything + If invoked without a `ToolCall`, the tool can return anything but it shouldn't throw an error. If this test fails, your tool may not be handling the input you defined @@ -82,9 +84,9 @@ class ToolsIntegrationTests(ToolsTests): tool.invoke(self.tool_invoke_params_example) async def test_async_invoke_no_tool_call(self, tool: BaseTool) -> None: - """Test async invoke without ToolCall. + """Test async invoke without `ToolCall`. - If ainvoked without a ToolCall, the tool can return anything + If ainvoked without a `ToolCall`, the tool can return anything but it shouldn't throw an error. For debugging tips, see `test_invoke_no_tool_call`. 
diff --git a/libs/standard-tests/langchain_tests/integration_tests/vectorstores.py b/libs/standard-tests/langchain_tests/integration_tests/vectorstores.py index 5e82d53222e..786376550f4 100644 --- a/libs/standard-tests/langchain_tests/integration_tests/vectorstores.py +++ b/libs/standard-tests/langchain_tests/integration_tests/vectorstores.py @@ -103,7 +103,7 @@ class VectorStoreIntegrationTests(BaseStandardTests): def vectorstore(self) -> VectorStore: """Get the vectorstore class to test. - The returned vectorstore should be EMPTY. + The returned vectorstore should be empty. """ @property @@ -398,7 +398,7 @@ class VectorStoreIntegrationTests(BaseStandardTests): assert documents == [] def test_add_documents_documents(self, vectorstore: VectorStore) -> None: - """Run add_documents tests. + """Run `add_documents` tests. ??? note "Troubleshooting" @@ -439,7 +439,7 @@ class VectorStoreIntegrationTests(BaseStandardTests): ) def test_add_documents_with_existing_ids(self, vectorstore: VectorStore) -> None: - """Test that add_documents with existing IDs is idempotent. + """Test that `add_documents` with existing IDs is idempotent. ??? note "Troubleshooting" @@ -754,7 +754,7 @@ class VectorStoreIntegrationTests(BaseStandardTests): async def test_add_documents_documents_async( self, vectorstore: VectorStore ) -> None: - """Run add_documents tests. + """Run `add_documents` tests. ??? note "Troubleshooting" @@ -797,7 +797,7 @@ class VectorStoreIntegrationTests(BaseStandardTests): async def test_add_documents_with_existing_ids_async( self, vectorstore: VectorStore ) -> None: - """Test that add_documents with existing IDs is idempotent. + """Test that `add_documents` with existing IDs is idempotent. ??? 
note "Troubleshooting" diff --git a/libs/standard-tests/langchain_tests/unit_tests/chat_models.py b/libs/standard-tests/langchain_tests/unit_tests/chat_models.py index fb5077604f8..ca1fe6083fd 100644 --- a/libs/standard-tests/langchain_tests/unit_tests/chat_models.py +++ b/libs/standard-tests/langchain_tests/unit_tests/chat_models.py @@ -110,7 +110,7 @@ class ChatModelTests(BaseStandardTests): @property def has_tool_calling(self) -> bool: - """(bool) whether the model supports tool calling.""" + """Whether the model supports tool calling.""" return self.chat_model_class.bind_tools is not BaseChatModel.bind_tools @property @@ -120,7 +120,7 @@ class ChatModelTests(BaseStandardTests): @property def has_tool_choice(self) -> bool: - """(bool) whether the model supports tool calling.""" + """Whether the model supports tool calling.""" bind_tools_params = inspect.signature( self.chat_model_class.bind_tools ).parameters @@ -128,7 +128,7 @@ class ChatModelTests(BaseStandardTests): @property def has_structured_output(self) -> bool: - """(bool) whether the chat model supports structured output.""" + """Whether the chat model supports structured output.""" return ( self.chat_model_class.with_structured_output is not BaseChatModel.with_structured_output @@ -136,19 +136,19 @@ class ChatModelTests(BaseStandardTests): @property def structured_output_kwargs(self) -> dict: - """If specified, additional kwargs for with_structured_output.""" + """If specified, additional kwargs for `with_structured_output`.""" return {} @property def supports_json_mode(self) -> bool: - """(bool) whether the chat model supports JSON mode.""" + """Whether the chat model supports JSON mode.""" return False @property def supports_image_inputs(self) -> bool: """Supports image inputs. - (bool) whether the chat model supports image inputs, defaults to + Whether the chat model supports image inputs, defaults to `False`. 
""" @@ -158,7 +158,7 @@ class ChatModelTests(BaseStandardTests): def supports_image_urls(self) -> bool: """Supports image inputs from URLs. - (bool) whether the chat model supports image inputs from URLs, defaults to + Whether the chat model supports image inputs from URLs, defaults to `False`. """ @@ -166,14 +166,14 @@ class ChatModelTests(BaseStandardTests): @property def supports_pdf_inputs(self) -> bool: - """(bool) whether the chat model supports PDF inputs, defaults to `False`.""" + """Whether the chat model supports PDF inputs, defaults to `False`.""" return False @property def supports_audio_inputs(self) -> bool: """Supports audio inputs. - (bool) whether the chat model supports audio inputs, defaults to `False`. + Whether the chat model supports audio inputs, defaults to `False`. """ return False @@ -182,7 +182,7 @@ class ChatModelTests(BaseStandardTests): def supports_video_inputs(self) -> bool: """Supports video inputs. - (bool) whether the chat model supports video inputs, defaults to `False`. + Whether the chat model supports video inputs, defaults to `False`. No current tests are written for this feature. @@ -193,7 +193,7 @@ class ChatModelTests(BaseStandardTests): def returns_usage_metadata(self) -> bool: """Returns usage metadata. - (bool) whether the chat model returns usage metadata on invoke and streaming + Whether the chat model returns usage metadata on invoke and streaming responses. """ @@ -201,14 +201,14 @@ class ChatModelTests(BaseStandardTests): @property def supports_anthropic_inputs(self) -> bool: - """(bool) whether the chat model supports Anthropic-style inputs.""" + """Whether the chat model supports Anthropic-style inputs.""" return False @property def supports_image_tool_message(self) -> bool: """Supports image `ToolMessage` objects. - (bool) whether the chat model supports `ToolMessage` objects that include image + Whether the chat model supports `ToolMessage` objects that include image content. 
""" @@ -218,7 +218,7 @@ class ChatModelTests(BaseStandardTests): def supports_pdf_tool_message(self) -> bool: """Supports PDF `ToolMessage` objects. - (bool) whether the chat model supports `ToolMessage` objects that include PDF + Whether the chat model supports `ToolMessage` objects that include PDF content. """ @@ -226,7 +226,7 @@ class ChatModelTests(BaseStandardTests): @property def enable_vcr_tests(self) -> bool: - """(bool) whether to enable VCR tests for the chat model. + """Whether to enable VCR tests for the chat model. !!! warning See `enable_vcr_tests` dropdown `above ` for more @@ -252,8 +252,8 @@ class ChatModelTests(BaseStandardTests): ]: """Supported usage metadata details. - (dict) what usage metadata details are emitted in invoke and stream. Only - needs to be overridden if these details are returned by the model. + What usage metadata details are emitted in invoke and stream. Only needs to be + overridden if these details are returned by the model. """ return {"invoke": [], "stream": []} @@ -290,22 +290,21 @@ class ChatModelUnitTests(ChatModelTests): Test subclasses **must** implement the following two properties: - chat_model_class - The chat model class to test, e.g., `ChatParrotLink`. + `chat_model_class`: The chat model class to test, e.g., `ChatParrotLink`. - ```python - @property - def chat_model_class(self) -> Type[ChatParrotLink]: - return ChatParrotLink - ``` - chat_model_params - Initialization parameters for the chat model. + ```python + @property + def chat_model_class(self) -> Type[ChatParrotLink]: + return ChatParrotLink + ``` - ```python - @property - def chat_model_params(self) -> dict: - return {"model": "bird-brain-001", "temperature": 0} - ``` + `chat_model_params`: Initialization parameters for the chat model. 
+ + ```python + @property + def chat_model_params(self) -> dict: + return {"model": "bird-brain-001", "temperature": 0} + ``` In addition, test subclasses can control what features are tested (such as tool calling or multi-modality) by selectively overriding the following properties. @@ -372,7 +371,7 @@ class ChatModelUnitTests(ChatModelTests): `with_structured_output` or `bind_tools` methods. If the base implementations are intended to be used, this method should be overridden. - See: https://python.langchain.com/docs/concepts/structured_outputs/ + See: https://docs.langchain.com/oss/python/langchain/structured-output ```python @property @@ -396,7 +395,7 @@ class ChatModelUnitTests(ChatModelTests): Boolean property indicating whether the chat model supports JSON mode in `with_structured_output`. - See: https://python.langchain.com/docs/concepts/structured_outputs/#json-mode + See: https://docs.langchain.com/oss/python/langchain/structured-output ```python @property @@ -430,7 +429,7 @@ class ChatModelUnitTests(ChatModelTests): } ``` - See https://python.langchain.com/docs/concepts/multimodality/ + See https://docs.langchain.com/oss/python/langchain/models#multimodal ```python @@ -456,7 +455,7 @@ class ChatModelUnitTests(ChatModelTests): } ``` - See https://python.langchain.com/docs/concepts/multimodality/ + See https://docs.langchain.com/oss/python/langchain/models#multimodal ```python @@ -482,7 +481,7 @@ class ChatModelUnitTests(ChatModelTests): } ``` - See https://python.langchain.com/docs/concepts/multimodality/ + See https://docs.langchain.com/oss/python/langchain/models#multimodal ```python @@ -508,7 +507,7 @@ class ChatModelUnitTests(ChatModelTests): } ``` - See https://python.langchain.com/docs/concepts/multimodality/ + See https://docs.langchain.com/oss/python/langchain/models#multimodal ```python @property @@ -529,7 +528,7 @@ class ChatModelUnitTests(ChatModelTests): Defaults to `True`. 
- `usage_metadata` is an optional dict attribute on `AIMessage`s that track + `usage_metadata` is an optional dict attribute on `AIMessage` objects that track input and output tokens. [See more](https://python.langchain.com/api_reference/core/messages/langchain_core.messages.ai.UsageMetadata.html). @@ -651,7 +650,7 @@ class ChatModelUnitTests(ChatModelTests): Property controlling what usage metadata details are emitted in both `invoke` and `stream`. - `usage_metadata` is an optional dict attribute on `AIMessage`s that track + `usage_metadata` is an optional dict attribute on `AIMessage` objects that track input and output tokens. [See more](https://python.langchain.com/api_reference/core/messages/langchain_core.messages.ai.UsageMetadata.html). @@ -694,9 +693,7 @@ class ChatModelUnitTests(ChatModelTests): also exclude additional headers, override the default exclusions, or apply other customizations to the VCR configuration. See example below: - ```python - :caption: tests/conftest.py - + ```python title="tests/conftest.py" import pytest from langchain_tests.conftest import ( _base_vcr_config as _base_vcr_config, @@ -731,9 +728,7 @@ class ChatModelUnitTests(ChatModelTests): to your VCR fixture and enable this serializer in the config. See example below: - ```python - :caption: tests/conftest.py - + ```python title="tests/conftest.py" import pytest from langchain_tests.conftest import ( CustomPersister, @@ -814,36 +809,37 @@ class ChatModelUnitTests(ChatModelTests): You can then commit the cassette to your repository. Subsequent test runs will use the cassette instead of making HTTP calls. - Testing initialization from environment variables - Some unit tests may require testing initialization from environment variables. - These tests can be enabled by overriding the `init_from_env_params` - property (see below): + **Testing initialization from environment variables** - ??? 
note "`init_from_env_params`" + Some unit tests may require testing initialization from environment variables. + These tests can be enabled by overriding the `init_from_env_params` + property (see below). - This property is used in unit tests to test initialization from - environment variables. It should return a tuple of three dictionaries - that specify the environment variables, additional initialization args, - and expected instance attributes to check. + ??? note "`init_from_env_params`" - Defaults to empty dicts. If not overridden, the test is skipped. + This property is used in unit tests to test initialization from + environment variables. It should return a tuple of three dictionaries + that specify the environment variables, additional initialization args, + and expected instance attributes to check. - Example: - ```python - @property - def init_from_env_params(self) -> Tuple[dict, dict, dict]: - return ( - { - "MY_API_KEY": "api_key", - }, - { - "model": "bird-brain-001", - }, - { - "my_api_key": "api_key", - }, - ) - ``` + Defaults to empty dicts. If not overridden, the test is skipped. + + Example: + ```python + @property + def init_from_env_params(self) -> Tuple[dict, dict, dict]: + return ( + { + "MY_API_KEY": "api_key", + }, + { + "model": "bird-brain-001", + }, + { + "my_api_key": "api_key", + }, + ) + ``` ''' # noqa: E501,D214 @@ -858,9 +854,8 @@ class ChatModelUnitTests(ChatModelTests): def init_from_env_params(self) -> tuple[dict, dict, dict]: """Init from env params. - (tuple) environment variables, additional initialization args, and expected - instance attributes for testing initialization from environment variables. - + Environment variables, additional initialization args, and expected instance + attributes for testing initialization from environment variables. 
""" return {}, {}, {} diff --git a/libs/text-splitters/langchain_text_splitters/__init__.py b/libs/text-splitters/langchain_text_splitters/__init__.py index 828319c852d..e6c88699bae 100644 --- a/libs/text-splitters/langchain_text_splitters/__init__.py +++ b/libs/text-splitters/langchain_text_splitters/__init__.py @@ -1,8 +1,8 @@ """**Text Splitters** are classes for splitting text. !!! note - **MarkdownHeaderTextSplitter** and **HTMLHeaderTextSplitter do not derive from - TextSplitter. + `MarkdownHeaderTextSplitter` and `HTMLHeaderTextSplitter` do not derive from + `TextSplitter`. """ from langchain_text_splitters.base import ( diff --git a/libs/text-splitters/langchain_text_splitters/base.py b/libs/text-splitters/langchain_text_splitters/base.py index 44221aa21c8..4718306ba15 100644 --- a/libs/text-splitters/langchain_text_splitters/base.py +++ b/libs/text-splitters/langchain_text_splitters/base.py @@ -60,10 +60,10 @@ class TextSplitter(BaseDocumentTransformer, ABC): chunk_overlap: Overlap in characters between chunks length_function: Function that measures the length of given chunks keep_separator: Whether to keep the separator and where to place it - in each corresponding chunk (True='start') + in each corresponding chunk `(True='start')` add_start_index: If `True`, includes chunk's start index in metadata strip_whitespace: If `True`, strips whitespace from the start and end of - every document + every document """ if chunk_size <= 0: msg = f"chunk_size must be > 0, got {chunk_size}" @@ -91,7 +91,7 @@ class TextSplitter(BaseDocumentTransformer, ABC): def create_documents( self, texts: list[str], metadatas: list[dict[Any, Any]] | None = None ) -> list[Document]: - """Create documents from a list of texts.""" + """Create a list of `Document` objects from a list of texts.""" metadatas_ = metadatas or [{}] * len(texts) documents = [] for i, text in enumerate(texts): @@ -196,7 +196,7 @@ class TextSplitter(BaseDocumentTransformer, ABC): disallowed_special: 
Literal["all"] | Collection[str] = "all", **kwargs: Any, ) -> Self: - """Text splitter that uses tiktoken encoder to count length.""" + """Text splitter that uses `tiktoken` encoder to count length.""" if not _HAS_TIKTOKEN: msg = ( "Could not import tiktoken python package. " @@ -280,7 +280,7 @@ class TokenTextSplitter(TextSplitter): Returns: A list of text chunks, where each chunk is derived from a portion - of the input text based on the tokenization and chunking rules. + of the input text based on the tokenization and chunking rules. """ def _encode(_text: str) -> list[int]: diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index fd8cc2bf32a..47596f135f4 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -84,17 +84,17 @@ def _find_all_tags( class HTMLHeaderTextSplitter: """Split HTML content into structured Documents based on specified headers. - Splits HTML content by detecting specified header tags (e.g.,

<h1>, <h2>

) and - creating hierarchical Document objects that reflect the semantic structure - of the original content. For each identified section, the splitter associates - the extracted text with metadata corresponding to the encountered headers. + Splits HTML content by detecting specified header tags and creating hierarchical + `Document` objects that reflect the semantic structure of the original content. For + each identified section, the splitter associates the extracted text with metadata + corresponding to the encountered headers. If no specified headers are found, the entire content is returned as a single - Document. This allows for flexible handling of HTML input, ensuring that + `Document`. This allows for flexible handling of HTML input, ensuring that information is organized according to its semantic headers. The splitter provides the option to return each HTML element as a separate - Document or aggregate them into semantically meaningful chunks. It also + `Document` or aggregate them into semantically meaningful chunks. It also gracefully handles multiple levels of nested headers, creating a rich, hierarchical representation of the content. @@ -151,15 +151,15 @@ class HTMLHeaderTextSplitter: """Initialize with headers to split on. Args: - headers_to_split_on: A list of (header_tag, - header_name) pairs representing the headers that define splitting - boundaries. For example, [("h1", "Header 1"), ("h2", "Header 2")] - will split content by

<h1> and <h2>

tags, assigning their textual - content to the Document metadata. + headers_to_split_on: A list of `(header_tag, + header_name)` pairs representing the headers that define splitting + boundaries. For example, `[("h1", "Header 1"), ("h2", "Header 2")]` + will split content by `h1` and `h2` tags, assigning their textual + content to the `Document` metadata. return_each_element: If `True`, every HTML element encountered (including headers, paragraphs, etc.) is returned as a separate - Document. If `False`, content under the same header hierarchy is - aggregated into fewer Documents. + `Document`. If `False`, content under the same header hierarchy is + aggregated into fewer `Document` objects. """ # Sort headers by their numeric level so that h1 < h2 < h3... self.headers_to_split_on = sorted( @@ -170,15 +170,15 @@ class HTMLHeaderTextSplitter: self.return_each_element = return_each_element def split_text(self, text: str) -> list[Document]: - """Split the given text into a list of Document objects. + """Split the given text into a list of `Document` objects. Args: text: The HTML text to split. Returns: - A list of split Document objects. Each Document contains - `page_content` holding the extracted text and `metadata` that maps - the header hierarchy to their corresponding titles. + A list of split Document objects. Each `Document` contains + `page_content` holding the extracted text and `metadata` that maps + the header hierarchy to their corresponding titles. """ return self.split_text_from_file(StringIO(text)) @@ -193,9 +193,9 @@ class HTMLHeaderTextSplitter: **kwargs: Additional keyword arguments for the request. Returns: - A list of split Document objects. Each Document contains - `page_content` holding the extracted text and `metadata` that maps - the header hierarchy to their corresponding titles. + A list of split Document objects. 
Each `Document` contains + `page_content` holding the extracted text and `metadata` that maps + the header hierarchy to their corresponding titles. Raises: requests.RequestException: If the HTTP request fails. @@ -205,15 +205,15 @@ class HTMLHeaderTextSplitter: return self.split_text(response.text) def split_text_from_file(self, file: str | IO[str]) -> list[Document]: - """Split HTML content from a file into a list of Document objects. + """Split HTML content from a file into a list of `Document` objects. Args: file: A file path or a file-like object containing HTML content. Returns: - A list of split Document objects. Each Document contains - `page_content` holding the extracted text and `metadata` that maps - the header hierarchy to their corresponding titles. + A list of split Document objects. Each `Document` contains + `page_content` holding the extracted text and `metadata` that maps + the header hierarchy to their corresponding titles. """ if isinstance(file, str): html_content = pathlib.Path(file).read_text(encoding="utf-8") @@ -339,7 +339,7 @@ class HTMLHeaderTextSplitter: class HTMLSectionSplitter: """Splitting HTML files based on specified tag and font sizes. - Requires lxml package. + Requires `lxml` package. """ def __init__( @@ -347,12 +347,12 @@ class HTMLSectionSplitter: headers_to_split_on: list[tuple[str, str]], **kwargs: Any, ) -> None: - """Create a new HTMLSectionSplitter. + """Create a new `HTMLSectionSplitter`. Args: headers_to_split_on: list of tuples of headers we want to track mapped to - (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4, - h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2"]. + (arbitrary) keys for metadata. Allowed header values: `h1`, `h2`, `h3`, + `h4`, `h5`, `h6` e.g. `[("h1", "Header 1"), ("h2", "Header 2"]`. **kwargs: Additional optional arguments for customizations. 
""" @@ -385,7 +385,7 @@ class HTMLSectionSplitter: def create_documents( self, texts: list[str], metadatas: list[dict[Any, Any]] | None = None ) -> list[Document]: - """Create documents from a list of texts.""" + """Create a list of `Document` objects from a list of texts.""" metadatas_ = metadatas or [{}] * len(texts) documents = [] for i, text in enumerate(texts): @@ -413,9 +413,9 @@ class HTMLSectionSplitter: Returns: A list of dictionaries representing sections. Each dictionary contains: - * 'header': The header text or a default title for the first section. - * 'content': The content under the header. - * 'tag_name': The name of the header tag (e.g., "h1", "h2"). + * `'header'`: The header text or a default title for the first section. + * `'content'`: The content under the header. + * `'tag_name'`: The name of the header tag (e.g., `h1`, `h2`). """ if not _HAS_BS4: msg = "Unable to import BeautifulSoup/PageElement, \ @@ -491,7 +491,7 @@ class HTMLSectionSplitter: return str(result) def split_text_from_file(self, file: StringIO) -> list[Document]: - """Split HTML content from a file into a list of Document objects. + """Split HTML content from a file into a list of `Document` objects. Args: file: A file path or a file-like object containing HTML content. @@ -524,10 +524,10 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer): structure. If chunks exceed the maximum chunk size, it uses RecursiveCharacterTextSplitter for further splitting. - The splitter preserves full HTML elements (e.g., ,
    ) and converts - links to Markdown-like links. It can also preserve images, videos, and audio - elements by converting them into Markdown format. Note that some chunks may - exceed the maximum size to maintain semantic integrity. + The splitter preserves full HTML elements and converts links to Markdown-like links. + It can also preserve images, videos, and audio elements by converting them into + Markdown format. Note that some chunks may exceed the maximum size to maintain + semantic integrity. !!! version-added "Added in version 0.3.5" @@ -584,22 +584,22 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer): """Initialize splitter. Args: - headers_to_split_on: HTML headers (e.g., "h1", "h2") + headers_to_split_on: HTML headers (e.g., `h1`, `h2`) that define content sections. max_chunk_size: Maximum size for each chunk, with allowance for exceeding this limit to preserve semantics. chunk_overlap: Number of characters to overlap between chunks to ensure contextual continuity. - separators: Delimiters used by RecursiveCharacterTextSplitter for + separators: Delimiters used by `RecursiveCharacterTextSplitter` for further splitting. - elements_to_preserve: HTML tags (e.g.,
,