text-splitters: Add ruff rule UP (pyupgrade) (#31841)

See https://docs.astral.sh/ruff/rules/#pyupgrade-up
All changes were auto-fixed, except the `typing.AbstractSet` -> `collections.abc.Set` replacement, which was applied manually.
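
For context, a minimal hypothetical sketch (not part of this commit) of the kind of rewrites the UP group produces here: PEP 585 builtin generics replace the deprecated `typing` aliases, `open()` loses its redundant `"r"` mode, and the one manual change swaps `typing.AbstractSet` for `collections.abc.Set`. Names and bodies below are illustrative only; the real changes are in the diff that follows.

```python
from __future__ import annotations

from collections.abc import Set  # was: from typing import AbstractSet (manual fix)


# Before: def split_text(self, text: str) -> List[str]:
def split_text(text: str) -> list[str]:  # UP006: List -> list
    return text.split()


# Before: headers_to_split_on: List[Tuple[str, str]]
def index_headers(headers_to_split_on: list[tuple[str, str]]) -> dict[str, str]:
    return dict(headers_to_split_on)  # UP006: Dict/Tuple -> dict/tuple


# Before: allowed_special: Union[Literal["all"], AbstractSet[str]]
def encode(allowed_special: Set[str] = frozenset()) -> list[int]:
    return []  # AbstractSet -> collections.abc.Set was the manual edit


# Before: with open(file, "r", encoding="utf-8") as f:
with open(__file__, encoding="utf-8") as f:  # UP015: drop redundant "r" mode
    _ = f.read()
```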
Christophe Bornet 2025-07-03 16:11:35 +02:00 committed by GitHub
parent 911b0b69ea
commit 802d2bf249
13 changed files with 106 additions and 115 deletions

View File

@ -3,19 +3,14 @@ from __future__ import annotations
import copy
import logging
from abc import ABC, abstractmethod
from collections.abc import Collection, Iterable, Sequence, Set
from dataclasses import dataclass
from enum import Enum
from typing import (
AbstractSet,
Any,
Callable,
Collection,
Iterable,
List,
Literal,
Optional,
Sequence,
Type,
TypeVar,
Union,
)
@ -64,12 +59,12 @@ class TextSplitter(BaseDocumentTransformer, ABC):
self._strip_whitespace = strip_whitespace
@abstractmethod
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Split text into multiple components."""
def create_documents(
self, texts: list[str], metadatas: Optional[list[dict[Any, Any]]] = None
) -> List[Document]:
) -> list[Document]:
"""Create documents from a list of texts."""
_metadatas = metadatas or [{}] * len(texts)
documents = []
@ -87,7 +82,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
documents.append(new_doc)
return documents
def split_documents(self, documents: Iterable[Document]) -> List[Document]:
def split_documents(self, documents: Iterable[Document]) -> list[Document]:
"""Split documents."""
texts, metadatas = [], []
for doc in documents:
@ -95,7 +90,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
metadatas.append(doc.metadata)
return self.create_documents(texts, metadatas=metadatas)
def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
def _join_docs(self, docs: list[str], separator: str) -> Optional[str]:
text = separator.join(docs)
if self._strip_whitespace:
text = text.strip()
@ -104,13 +99,13 @@ class TextSplitter(BaseDocumentTransformer, ABC):
else:
return text
def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
def _merge_splits(self, splits: Iterable[str], separator: str) -> list[str]:
# We now want to combine these smaller pieces into medium size
# chunks to send to the LLM.
separator_len = self._length_function(separator)
docs = []
current_doc: List[str] = []
current_doc: list[str] = []
total = 0
for d in splits:
_len = self._length_function(d)
@ -169,10 +164,10 @@ class TextSplitter(BaseDocumentTransformer, ABC):
@classmethod
def from_tiktoken_encoder(
cls: Type[TS],
cls: type[TS],
encoding_name: str = "gpt2",
model_name: Optional[str] = None,
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
allowed_special: Union[Literal["all"], Set[str]] = set(),
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
**kwargs: Any,
) -> TS:
@ -225,7 +220,7 @@ class TokenTextSplitter(TextSplitter):
self,
encoding_name: str = "gpt2",
model_name: Optional[str] = None,
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
allowed_special: Union[Literal["all"], Set[str]] = set(),
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
**kwargs: Any,
) -> None:
@ -248,7 +243,7 @@ class TokenTextSplitter(TextSplitter):
self._allowed_special = allowed_special
self._disallowed_special = disallowed_special
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Splits the input text into smaller chunks based on tokenization.
This method uses a custom tokenizer configuration to encode the input text
@ -264,7 +259,7 @@ class TokenTextSplitter(TextSplitter):
of the input text based on the tokenization and chunking rules.
"""
def _encode(_text: str) -> List[int]:
def _encode(_text: str) -> list[int]:
return self._tokenizer.encode(
_text,
allowed_special=self._allowed_special,
@ -320,15 +315,15 @@ class Tokenizer:
"""Overlap in tokens between chunks"""
tokens_per_chunk: int
"""Maximum number of tokens per chunk"""
decode: Callable[[List[int]], str]
decode: Callable[[list[int]], str]
""" Function to decode a list of token ids to a string"""
encode: Callable[[str], List[int]]
encode: Callable[[str], list[int]]
""" Function to encode a string to a list of token ids"""
def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]:
def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> list[str]:
"""Split incoming text and return chunks using tokenizer."""
splits: List[str] = []
splits: list[str] = []
input_ids = tokenizer.encode(text)
start_idx = 0
cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))

View File

@ -1,7 +1,7 @@
from __future__ import annotations
import re
from typing import Any, List, Literal, Optional, Union
from typing import Any, Literal, Optional, Union
from langchain_text_splitters.base import Language, TextSplitter
@ -17,7 +17,7 @@ class CharacterTextSplitter(TextSplitter):
self._separator = separator
self._is_separator_regex = is_separator_regex
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Split into chunks without re-inserting lookaround separators."""
# 1. Determine split pattern: raw regex or escaped literal
sep_pattern = (
@ -46,7 +46,7 @@ class CharacterTextSplitter(TextSplitter):
def _split_text_with_regex(
text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
) -> List[str]:
) -> list[str]:
# Now that we have the separator, split the text
if separator:
if keep_separator:
@ -80,7 +80,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
def __init__(
self,
separators: Optional[List[str]] = None,
separators: Optional[list[str]] = None,
keep_separator: Union[bool, Literal["start", "end"]] = True,
is_separator_regex: bool = False,
**kwargs: Any,
@ -90,7 +90,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
self._separators = separators or ["\n\n", "\n", " ", ""]
self._is_separator_regex = is_separator_regex
def _split_text(self, text: str, separators: List[str]) -> List[str]:
def _split_text(self, text: str, separators: list[str]) -> list[str]:
"""Split incoming text and return chunks."""
final_chunks = []
# Get appropriate separator to use
@ -130,7 +130,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
final_chunks.extend(merged_text)
return final_chunks
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Split the input text into smaller chunks based on predefined separators.
Args:
@ -161,7 +161,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
return cls(separators=separators, is_separator_regex=True, **kwargs)
@staticmethod
def get_separators_for_language(language: Language) -> List[str]:
def get_separators_for_language(language: Language) -> list[str]:
"""Retrieve a list of separators specific to the given language.
Args:

View File

@ -3,17 +3,13 @@ from __future__ import annotations
import copy
import pathlib
import re
from collections.abc import Iterable, Sequence
from io import StringIO
from typing import (
Any,
Callable,
Dict,
Iterable,
List,
Literal,
Optional,
Sequence,
Tuple,
TypedDict,
Union,
cast,
@ -32,7 +28,7 @@ class ElementType(TypedDict):
url: str
xpath: str
content: str
metadata: Dict[str, str]
metadata: dict[str, str]
class HTMLHeaderTextSplitter:
@ -115,7 +111,7 @@ class HTMLHeaderTextSplitter:
def __init__(
self,
headers_to_split_on: List[Tuple[str, str]],
headers_to_split_on: list[tuple[str, str]],
return_each_element: bool = False,
) -> None:
"""Initialize with headers to split on.
@ -134,7 +130,7 @@ class HTMLHeaderTextSplitter:
self.header_tags = [tag for tag, _ in self.headers_to_split_on]
self.return_each_element = return_each_element
def split_text(self, text: str) -> List[Document]:
def split_text(self, text: str) -> list[Document]:
"""Split the given text into a list of Document objects.
Args:
@ -147,7 +143,7 @@ class HTMLHeaderTextSplitter:
def split_text_from_url(
self, url: str, timeout: int = 10, **kwargs: Any
) -> List[Document]:
) -> list[Document]:
"""Fetch text content from a URL and split it into documents.
Args:
@ -166,7 +162,7 @@ class HTMLHeaderTextSplitter:
response.raise_for_status()
return self.split_text(response.text)
def split_text_from_file(self, file: Any) -> List[Document]:
def split_text_from_file(self, file: Any) -> list[Document]:
"""Split HTML content from a file into a list of Document objects.
Args:
@ -176,7 +172,7 @@ class HTMLHeaderTextSplitter:
A list of split Document objects.
"""
if isinstance(file, str):
with open(file, "r", encoding="utf-8") as f:
with open(file, encoding="utf-8") as f:
html_content = f.read()
else:
html_content = file.read()
@ -208,8 +204,8 @@ class HTMLHeaderTextSplitter:
# Dictionary of active headers:
# key = user-defined header name (e.g. "Header 1")
# value = (header_text, level, dom_depth)
active_headers: Dict[str, Tuple[str, int, int]] = {}
current_chunk: List[str] = []
active_headers: dict[str, tuple[str, int, int]] = {}
current_chunk: list[str] = []
def finalize_chunk() -> Optional[Document]:
"""Finalize the accumulated chunk into a single Document."""
@ -308,7 +304,7 @@ class HTMLSectionSplitter:
def __init__(
self,
headers_to_split_on: List[Tuple[str, str]],
headers_to_split_on: list[tuple[str, str]],
**kwargs: Any,
) -> None:
"""Create a new HTMLSectionSplitter.
@ -326,7 +322,7 @@ class HTMLSectionSplitter:
).absolute()
self.kwargs = kwargs
def split_documents(self, documents: Iterable[Document]) -> List[Document]:
def split_documents(self, documents: Iterable[Document]) -> list[Document]:
"""Split documents."""
texts, metadatas = [], []
for doc in documents:
@ -338,7 +334,7 @@ class HTMLSectionSplitter:
return text_splitter.split_documents(results)
def split_text(self, text: str) -> List[Document]:
def split_text(self, text: str) -> list[Document]:
"""Split HTML text string.
Args:
@ -364,7 +360,7 @@ class HTMLSectionSplitter:
documents.append(new_doc)
return documents
def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]:
def split_html_by_headers(self, html_doc: str) -> list[dict[str, Optional[str]]]:
"""Split an HTML document into sections based on specified header tags.
This method uses BeautifulSoup to parse the HTML content and divides it into
@ -466,7 +462,7 @@ class HTMLSectionSplitter:
result = transform(tree)
return str(result)
def split_text_from_file(self, file: Any) -> List[Document]:
def split_text_from_file(self, file: Any) -> list[Document]:
"""Split HTML content from a file into a list of Document objects.
Args:
@ -571,23 +567,23 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
def __init__(
self,
headers_to_split_on: List[Tuple[str, str]],
headers_to_split_on: list[tuple[str, str]],
*,
max_chunk_size: int = 1000,
chunk_overlap: int = 0,
separators: Optional[List[str]] = None,
elements_to_preserve: Optional[List[str]] = None,
separators: Optional[list[str]] = None,
elements_to_preserve: Optional[list[str]] = None,
preserve_links: bool = False,
preserve_images: bool = False,
preserve_videos: bool = False,
preserve_audio: bool = False,
custom_handlers: Optional[Dict[str, Callable[[Any], str]]] = None,
custom_handlers: Optional[dict[str, Callable[[Any], str]]] = None,
stopword_removal: bool = False,
stopword_lang: str = "english",
normalize_text: bool = False,
external_metadata: Optional[Dict[str, str]] = None,
allowlist_tags: Optional[List[str]] = None,
denylist_tags: Optional[List[str]] = None,
external_metadata: Optional[dict[str, str]] = None,
allowlist_tags: Optional[list[str]] = None,
denylist_tags: Optional[list[str]] = None,
preserve_parent_metadata: bool = False,
keep_separator: Union[bool, Literal["start", "end"]] = True,
):
@ -654,7 +650,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
"Could not import nltk. Please install it with 'pip install nltk'."
)
def split_text(self, text: str) -> List[Document]:
def split_text(self, text: str) -> list[Document]:
"""Splits the provided HTML text into smaller chunks based on the configuration.
Args:
@ -677,7 +673,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
def transform_documents(
self, documents: Sequence[Document], **kwargs: Any
) -> List[Document]:
) -> list[Document]:
"""Transform sequence of documents by splitting them."""
transformed = []
for doc in documents:
@ -776,7 +772,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
return text
def _process_html(self, soup: Any) -> List[Document]:
def _process_html(self, soup: Any) -> list[Document]:
"""Processes the HTML content using BeautifulSoup and splits it using headers.
Args:
@ -785,10 +781,10 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
Returns:
List[Document]: A list of Document objects containing the split content.
"""
documents: List[Document] = []
current_headers: Dict[str, str] = {}
current_content: List[str] = []
preserved_elements: Dict[str, str] = {}
documents: list[Document] = []
current_headers: dict[str, str] = {}
current_content: list[str] = []
preserved_elements: dict[str, str] = {}
placeholder_count: int = 0
def _get_element_text(element: Any) -> str:
@ -821,13 +817,13 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
elements = soup.find_all(recursive=False)
def _process_element(
element: List[Any],
documents: List[Document],
current_headers: Dict[str, str],
current_content: List[str],
preserved_elements: Dict[str, str],
element: list[Any],
documents: list[Document],
current_headers: dict[str, str],
current_content: list[str],
preserved_elements: dict[str, str],
placeholder_count: int,
) -> Tuple[List[Document], Dict[str, str], List[str], Dict[str, str], int]:
) -> tuple[list[Document], dict[str, str], list[str], dict[str, str], int]:
for elem in element:
if elem.name.lower() in ["html", "body", "div", "main"]:
children = elem.find_all(recursive=False)
@ -910,7 +906,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
def _create_documents(
self, headers: dict[str, str], content: str, preserved_elements: dict[str, str]
) -> List[Document]:
) -> list[Document]:
"""Creates Document objects from the provided headers, content, and elements.
Args:
@ -936,7 +932,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
def _further_split_chunk(
self, content: str, metadata: dict[Any, Any], preserved_elements: dict[str, str]
) -> List[Document]:
) -> list[Document]:
"""Further splits the content into smaller chunks.
Args:

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import copy
import json
from typing import Any, Dict, List, Optional
from typing import Any, Optional
from langchain_core.documents import Document
@ -123,10 +123,10 @@ class RecursiveJsonSplitter:
def split_text(
self,
json_data: Dict[str, Any],
json_data: dict[str, Any],
convert_lists: bool = False,
ensure_ascii: bool = True,
) -> List[str]:
) -> list[str]:
"""Splits JSON into a list of JSON formatted strings."""
chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)

View File

@ -1,5 +1,5 @@
import re
from typing import Any, List, Optional
from typing import Any, Optional
from langchain_text_splitters import RecursiveCharacterTextSplitter
@ -23,7 +23,7 @@ class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
def __init__(
self,
separators: Optional[List[str]] = None,
separators: Optional[list[str]] = None,
chunk_size: int = 2000,
chunk_overlap: int = 0,
**kwargs: Any,
@ -39,7 +39,7 @@ class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs)
self._separators = separators or []
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Split text into chunks.
This method splits the text into chunks by:

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import Any, List
from typing import Any
from langchain_text_splitters.base import TextSplitter
@ -30,7 +30,7 @@ class KonlpyTextSplitter(TextSplitter):
)
self.kkma = konlpy.tag.Kkma()
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Split incoming text and return chunks."""
splits = self.kkma.sentences(text)
return self._merge_splits(splits, self._separator)

View File

@ -1,7 +1,7 @@
from __future__ import annotations
import re
from typing import Any, Dict, List, Tuple, TypedDict, Union
from typing import Any, TypedDict, Union
from langchain_core.documents import Document
@ -23,7 +23,7 @@ class MarkdownHeaderTextSplitter:
def __init__(
self,
headers_to_split_on: List[Tuple[str, str]],
headers_to_split_on: list[tuple[str, str]],
return_each_line: bool = False,
strip_headers: bool = True,
):
@ -44,13 +44,13 @@ class MarkdownHeaderTextSplitter:
# Strip headers split headers from the content of the chunk
self.strip_headers = strip_headers
def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
def aggregate_lines_to_chunks(self, lines: list[LineType]) -> list[Document]:
"""Combine lines with common metadata into chunks.
Args:
lines: Line of text / associated header metadata
"""
aggregated_chunks: List[LineType] = []
aggregated_chunks: list[LineType] = []
for line in lines:
if (
@ -87,7 +87,7 @@ class MarkdownHeaderTextSplitter:
for chunk in aggregated_chunks
]
def split_text(self, text: str) -> List[Document]:
def split_text(self, text: str) -> list[Document]:
"""Split markdown file.
Args:
@ -96,14 +96,14 @@ class MarkdownHeaderTextSplitter:
# Split the input text by newline character ("\n").
lines = text.split("\n")
# Final output
lines_with_metadata: List[LineType] = []
lines_with_metadata: list[LineType] = []
# Content and metadata of the chunk currently being processed
current_content: List[str] = []
current_metadata: Dict[str, str] = {}
current_content: list[str] = []
current_metadata: dict[str, str] = {}
# Keep track of the nested header structure
# header_stack: List[Dict[str, Union[int, str]]] = []
header_stack: List[HeaderType] = []
initial_metadata: Dict[str, str] = {}
header_stack: list[HeaderType] = []
initial_metadata: dict[str, str] = {}
in_code_block = False
opening_fence = ""
@ -217,7 +217,7 @@ class MarkdownHeaderTextSplitter:
class LineType(TypedDict):
"""Line type as typed dict."""
metadata: Dict[str, str]
metadata: dict[str, str]
content: str
@ -280,7 +280,7 @@ class ExperimentalMarkdownSyntaxTextSplitter:
def __init__(
self,
headers_to_split_on: Union[List[Tuple[str, str]], None] = None,
headers_to_split_on: Union[list[tuple[str, str]], None] = None,
return_each_line: bool = False,
strip_headers: bool = True,
):
@ -300,9 +300,9 @@ class ExperimentalMarkdownSyntaxTextSplitter:
Whether to exclude headers from the resulting chunks.
Defaults to True.
"""
self.chunks: List[Document] = []
self.chunks: list[Document] = []
self.current_chunk = Document(page_content="")
self.current_header_stack: List[Tuple[int, str]] = []
self.current_header_stack: list[tuple[int, str]] = []
self.strip_headers = strip_headers
if headers_to_split_on:
self.splittable_headers = dict(headers_to_split_on)
@ -311,7 +311,7 @@ class ExperimentalMarkdownSyntaxTextSplitter:
self.return_each_line = return_each_line
def split_text(self, text: str) -> List[Document]:
def split_text(self, text: str) -> list[Document]:
"""Split the input text into structured chunks.
This method processes the input text line by line, identifying and handling
@ -382,7 +382,7 @@ class ExperimentalMarkdownSyntaxTextSplitter:
break
self.current_header_stack.append((header_depth, header_text))
def _resolve_code_chunk(self, current_line: str, raw_lines: List[str]) -> str:
def _resolve_code_chunk(self, current_line: str, raw_lines: list[str]) -> str:
chunk = current_line
while raw_lines:
raw_line = raw_lines.pop(0)

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import Any, List
from typing import Any
from langchain_text_splitters.base import TextSplitter
@ -35,7 +35,7 @@ class NLTKTextSplitter(TextSplitter):
"NLTK is not installed, please install it with `pip install nltk`."
)
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Split incoming text and return chunks."""
# First we naively split the large input into a bunch of smaller ones.
if self._use_span_tokenize:

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import Any, List, Optional, cast
from typing import Any, Optional, cast
from langchain_text_splitters.base import TextSplitter, Tokenizer, split_text_on_tokens
@ -50,7 +50,7 @@ class SentenceTransformersTokenTextSplitter(TextSplitter):
f" > maximum token limit."
)
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Splits the input text into smaller components by splitting text on tokens.
This method encodes the input text using a private `_encode` method, then
@ -65,7 +65,7 @@ class SentenceTransformersTokenTextSplitter(TextSplitter):
encoding and processing.
"""
def encode_strip_start_and_stop_token_ids(text: str) -> List[int]:
def encode_strip_start_and_stop_token_ids(text: str) -> list[int]:
return self._encode(text)[1:-1]
tokenizer = Tokenizer(

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import Any, List
from typing import Any
from langchain_text_splitters.base import TextSplitter
@ -31,7 +31,7 @@ class SpacyTextSplitter(TextSplitter):
self._separator = separator
self._strip_whitespace = strip_whitespace
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Split incoming text and return chunks."""
splits = (
s.text if self._strip_whitespace else s.text_with_ws

View File

@ -61,8 +61,8 @@ ignore_missing_imports = "True"
target-version = "py39"
[tool.ruff.lint]
select = ["E", "F", "I", "PGH003", "T201", "D"]
ignore = ["D100"]
select = ["E", "F", "I", "UP", "PGH003", "T201", "D"]
ignore = ["D100", "UP007"]
[tool.coverage.run]
omit = ["tests/*"]

View File

@ -1,7 +1,7 @@
"""Configuration for unit tests."""
from collections.abc import Sequence
from importlib import util
from typing import Dict, Sequence
import pytest
from pytest import Config, Function, Parser
@ -39,7 +39,7 @@ def pytest_collection_modifyitems(config: Config, items: Sequence[Function]) ->
"""
# Mapping from the name of a package to whether it is installed or not.
# Used to avoid repeated calls to `util.find_spec`
required_pkgs_info: Dict[str, bool] = {}
required_pkgs_info: dict[str, bool] = {}
only_extended = config.getoption("--only-extended") or False
only_core = config.getoption("--only-core") or False

View File

@ -3,7 +3,7 @@
import random
import re
import string
from typing import Any, Callable, List, Tuple
from typing import Any, Callable
import pytest
from langchain_core.documents import Document
@ -282,7 +282,7 @@ def test_create_documents_with_metadata() -> None:
],
)
def test_create_documents_with_start_index(
splitter: TextSplitter, text: str, expected_docs: List[Document]
splitter: TextSplitter, text: str, expected_docs: list[Document]
) -> None:
"""Test create documents method."""
docs = splitter.create_documents([text])
@ -333,7 +333,7 @@ def test_iterative_text_splitter_discard_separator() -> None:
]
def __test_iterative_text_splitter(chunk_size: int, keep_separator: bool) -> List[str]:
def __test_iterative_text_splitter(chunk_size: int, keep_separator: bool) -> list[str]:
chunk_size += 1 if keep_separator else 0
splitter = RecursiveCharacterTextSplitter(
@ -2224,7 +2224,7 @@ def test_haskell_code_splitter() -> None:
@pytest.fixture
@pytest.mark.requires("bs4")
def html_header_splitter_splitter_factory() -> Callable[
[List[Tuple[str, str]]], HTMLHeaderTextSplitter
[list[tuple[str, str]]], HTMLHeaderTextSplitter
]:
"""
Fixture to create an HTMLHeaderTextSplitter instance with given headers.
@ -2232,7 +2232,7 @@ def html_header_splitter_splitter_factory() -> Callable[
"""
def _create_splitter(
headers_to_split_on: List[Tuple[str, str]],
headers_to_split_on: list[tuple[str, str]],
) -> HTMLHeaderTextSplitter:
return HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
@ -2426,9 +2426,9 @@ def html_header_splitter_splitter_factory() -> Callable[
@pytest.mark.requires("bs4")
def test_html_header_text_splitter(
html_header_splitter_splitter_factory: Any,
headers_to_split_on: List[Tuple[str, str]],
headers_to_split_on: list[tuple[str, str]],
html_input: str,
expected_documents: List[Document],
expected_documents: list[Document],
test_case: str,
) -> None:
"""
@ -2582,9 +2582,9 @@ def test_html_header_text_splitter(
@pytest.mark.requires("bs4")
def test_additional_html_header_text_splitter(
html_header_splitter_splitter_factory: Any,
headers_to_split_on: List[Tuple[str, str]],
headers_to_split_on: list[tuple[str, str]],
html_content: str,
expected_output: List[Document],
expected_output: list[Document],
test_case: str,
) -> None:
"""
@ -2653,9 +2653,9 @@ def test_additional_html_header_text_splitter(
@pytest.mark.requires("bs4")
def test_html_no_headers_with_multiple_splitters(
html_header_splitter_splitter_factory: Any,
headers_to_split_on: List[Tuple[str, str]],
headers_to_split_on: list[tuple[str, str]],
html_content: str,
expected_output: List[Document],
expected_output: list[Document],
test_case: str,
) -> None:
"""
@ -3572,7 +3572,7 @@ def test_character_text_splitter_chunk_size_effect(
is_regex: bool,
text: str,
chunk_size: int,
expected: List[str],
expected: list[str],
) -> None:
splitter = CharacterTextSplitter(
separator=separator,