text-splitters: Add ruff rule UP (pyupgrade) (#31841)

See https://docs.astral.sh/ruff/rules/#pyupgrade-up
All changes were auto-fixed, except the `typing.AbstractSet` -> `collections.abc.Set` replacement, which was applied manually.
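
For context, a minimal hypothetical sketch (not part of this commit) of the kind of rewrites the UP group produces here: PEP 585 builtin generics replace the deprecated `typing` aliases, `open()` loses its redundant `"r"` mode, and the one manual change swaps `typing.AbstractSet` for `collections.abc.Set`. Names and bodies below are illustrative only; the real changes are in the diff that follows.

```python
from __future__ import annotations

from collections.abc import Set  # was: from typing import AbstractSet (manual fix)


# Before: def split_text(self, text: str) -> List[str]:
def split_text(text: str) -> list[str]:  # UP006: List -> list
    return text.split()


# Before: headers_to_split_on: List[Tuple[str, str]]
def index_headers(headers_to_split_on: list[tuple[str, str]]) -> dict[str, str]:
    return dict(headers_to_split_on)  # UP006: Dict/Tuple -> dict/tuple


# Before: allowed_special: Union[Literal["all"], AbstractSet[str]]
def encode(allowed_special: Set[str] = frozenset()) -> list[int]:
    return []  # AbstractSet -> collections.abc.Set was the manual edit


# Before: with open(file, "r", encoding="utf-8") as f:
with open(__file__, encoding="utf-8") as f:  # UP015: drop redundant "r" mode
    _ = f.read()
```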
Christophe Bornet 2025-07-03 16:11:35 +02:00 committed by GitHub
parent 911b0b69ea
commit 802d2bf249
13 changed files with 106 additions and 115 deletions

View File

@ -3,19 +3,14 @@ from __future__ import annotations
import copy
import logging
from abc import ABC, abstractmethod
from collections.abc import Collection, Iterable, Sequence, Set
from dataclasses import dataclass
from enum import Enum
from typing import (
AbstractSet,
Any,
Callable,
Collection,
Iterable,
List,
Literal,
Optional,
Sequence,
Type,
TypeVar,
Union,
)
@ -64,12 +59,12 @@ class TextSplitter(BaseDocumentTransformer, ABC):
self._strip_whitespace = strip_whitespace
@abstractmethod
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Split text into multiple components."""
def create_documents(
self, texts: list[str], metadatas: Optional[list[dict[Any, Any]]] = None
) -> List[Document]:
) -> list[Document]:
"""Create documents from a list of texts."""
_metadatas = metadatas or [{}] * len(texts)
documents = []
@ -87,7 +82,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
documents.append(new_doc)
return documents
def split_documents(self, documents: Iterable[Document]) -> List[Document]:
def split_documents(self, documents: Iterable[Document]) -> list[Document]:
"""Split documents."""
texts, metadatas = [], []
for doc in documents:
@ -95,7 +90,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
metadatas.append(doc.metadata)
return self.create_documents(texts, metadatas=metadatas)
def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
def _join_docs(self, docs: list[str], separator: str) -> Optional[str]:
text = separator.join(docs)
if self._strip_whitespace:
text = text.strip()
@ -104,13 +99,13 @@ class TextSplitter(BaseDocumentTransformer, ABC):
else:
return text
def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
def _merge_splits(self, splits: Iterable[str], separator: str) -> list[str]:
# We now want to combine these smaller pieces into medium size
# chunks to send to the LLM.
separator_len = self._length_function(separator)
docs = []
current_doc: List[str] = []
current_doc: list[str] = []
total = 0
for d in splits:
_len = self._length_function(d)
@ -169,10 +164,10 @@ class TextSplitter(BaseDocumentTransformer, ABC):
@classmethod
def from_tiktoken_encoder(
cls: Type[TS],
cls: type[TS],
encoding_name: str = "gpt2",
model_name: Optional[str] = None,
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
allowed_special: Union[Literal["all"], Set[str]] = set(),
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
**kwargs: Any,
) -> TS:
@ -225,7 +220,7 @@ class TokenTextSplitter(TextSplitter):
self,
encoding_name: str = "gpt2",
model_name: Optional[str] = None,
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
allowed_special: Union[Literal["all"], Set[str]] = set(),
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
**kwargs: Any,
) -> None:
@ -248,7 +243,7 @@ class TokenTextSplitter(TextSplitter):
self._allowed_special = allowed_special
self._disallowed_special = disallowed_special
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Splits the input text into smaller chunks based on tokenization.
This method uses a custom tokenizer configuration to encode the input text
@ -264,7 +259,7 @@ class TokenTextSplitter(TextSplitter):
of the input text based on the tokenization and chunking rules.
"""
def _encode(_text: str) -> List[int]:
def _encode(_text: str) -> list[int]:
return self._tokenizer.encode(
_text,
allowed_special=self._allowed_special,
@ -320,15 +315,15 @@ class Tokenizer:
"""Overlap in tokens between chunks"""
tokens_per_chunk: int
"""Maximum number of tokens per chunk"""
decode: Callable[[List[int]], str]
decode: Callable[[list[int]], str]
""" Function to decode a list of token ids to a string"""
encode: Callable[[str], List[int]]
encode: Callable[[str], list[int]]
""" Function to encode a string to a list of token ids"""
def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]:
def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> list[str]:
"""Split incoming text and return chunks using tokenizer."""
splits: List[str] = []
splits: list[str] = []
input_ids = tokenizer.encode(text)
start_idx = 0
cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))

View File

@ -1,7 +1,7 @@
from __future__ import annotations
import re
from typing import Any, List, Literal, Optional, Union
from typing import Any, Literal, Optional, Union
from langchain_text_splitters.base import Language, TextSplitter
@ -17,7 +17,7 @@ class CharacterTextSplitter(TextSplitter):
self._separator = separator
self._is_separator_regex = is_separator_regex
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Split into chunks without re-inserting lookaround separators."""
# 1. Determine split pattern: raw regex or escaped literal
sep_pattern = (
@ -46,7 +46,7 @@ class CharacterTextSplitter(TextSplitter):
def _split_text_with_regex(
text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
) -> List[str]:
) -> list[str]:
# Now that we have the separator, split the text
if separator:
if keep_separator:
@ -80,7 +80,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
def __init__(
self,
separators: Optional[List[str]] = None,
separators: Optional[list[str]] = None,
keep_separator: Union[bool, Literal["start", "end"]] = True,
is_separator_regex: bool = False,
**kwargs: Any,
@ -90,7 +90,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
self._separators = separators or ["\n\n", "\n", " ", ""]
self._is_separator_regex = is_separator_regex
def _split_text(self, text: str, separators: List[str]) -> List[str]:
def _split_text(self, text: str, separators: list[str]) -> list[str]:
"""Split incoming text and return chunks."""
final_chunks = []
# Get appropriate separator to use
@ -130,7 +130,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
final_chunks.extend(merged_text)
return final_chunks
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Split the input text into smaller chunks based on predefined separators.
Args:
@ -161,7 +161,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
return cls(separators=separators, is_separator_regex=True, **kwargs)
@staticmethod
def get_separators_for_language(language: Language) -> List[str]:
def get_separators_for_language(language: Language) -> list[str]:
"""Retrieve a list of separators specific to the given language.
Args:

View File

@ -3,17 +3,13 @@ from __future__ import annotations
import copy
import pathlib
import re
from collections.abc import Iterable, Sequence
from io import StringIO
from typing import (
Any,
Callable,
Dict,
Iterable,
List,
Literal,
Optional,
Sequence,
Tuple,
TypedDict,
Union,
cast,
@ -32,7 +28,7 @@ class ElementType(TypedDict):
url: str
xpath: str
content: str
metadata: Dict[str, str]
metadata: dict[str, str]
class HTMLHeaderTextSplitter:
@ -115,7 +111,7 @@ class HTMLHeaderTextSplitter:
def __init__(
self,
headers_to_split_on: List[Tuple[str, str]],
headers_to_split_on: list[tuple[str, str]],
return_each_element: bool = False,
) -> None:
"""Initialize with headers to split on.
@ -134,7 +130,7 @@ class HTMLHeaderTextSplitter:
self.header_tags = [tag for tag, _ in self.headers_to_split_on]
self.return_each_element = return_each_element
def split_text(self, text: str) -> List[Document]:
def split_text(self, text: str) -> list[Document]:
"""Split the given text into a list of Document objects.
Args:
@ -147,7 +143,7 @@ class HTMLHeaderTextSplitter:
def split_text_from_url(
self, url: str, timeout: int = 10, **kwargs: Any
) -> List[Document]:
) -> list[Document]:
"""Fetch text content from a URL and split it into documents.
Args:
@ -166,7 +162,7 @@ class HTMLHeaderTextSplitter:
response.raise_for_status()
return self.split_text(response.text)
def split_text_from_file(self, file: Any) -> List[Document]:
def split_text_from_file(self, file: Any) -> list[Document]:
"""Split HTML content from a file into a list of Document objects.
Args:
@ -176,7 +172,7 @@ class HTMLHeaderTextSplitter:
A list of split Document objects.
"""
if isinstance(file, str):
with open(file, "r", encoding="utf-8") as f:
with open(file, encoding="utf-8") as f:
html_content = f.read()
else:
html_content = file.read()
@ -208,8 +204,8 @@ class HTMLHeaderTextSplitter:
# Dictionary of active headers:
# key = user-defined header name (e.g. "Header 1")
# value = (header_text, level, dom_depth)
active_headers: Dict[str, Tuple[str, int, int]] = {}
current_chunk: List[str] = []
active_headers: dict[str, tuple[str, int, int]] = {}
current_chunk: list[str] = []
def finalize_chunk() -> Optional[Document]:
"""Finalize the accumulated chunk into a single Document."""
@ -308,7 +304,7 @@ class HTMLSectionSplitter:
def __init__(
self,
headers_to_split_on: List[Tuple[str, str]],
headers_to_split_on: list[tuple[str, str]],
**kwargs: Any,
) -> None:
"""Create a new HTMLSectionSplitter.
@ -326,7 +322,7 @@ class HTMLSectionSplitter:
).absolute()
self.kwargs = kwargs
def split_documents(self, documents: Iterable[Document]) -> List[Document]:
def split_documents(self, documents: Iterable[Document]) -> list[Document]:
"""Split documents."""
texts, metadatas = [], []
for doc in documents:
@ -338,7 +334,7 @@ class HTMLSectionSplitter:
return text_splitter.split_documents(results)
def split_text(self, text: str) -> List[Document]:
def split_text(self, text: str) -> list[Document]:
"""Split HTML text string.
Args:
@ -364,7 +360,7 @@ class HTMLSectionSplitter:
documents.append(new_doc)
return documents
def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]:
def split_html_by_headers(self, html_doc: str) -> list[dict[str, Optional[str]]]:
"""Split an HTML document into sections based on specified header tags.
This method uses BeautifulSoup to parse the HTML content and divides it into
@ -466,7 +462,7 @@ class HTMLSectionSplitter:
result = transform(tree)
return str(result)
def split_text_from_file(self, file: Any) -> List[Document]:
def split_text_from_file(self, file: Any) -> list[Document]:
"""Split HTML content from a file into a list of Document objects.
Args:
@ -571,23 +567,23 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
def __init__(
self,
headers_to_split_on: List[Tuple[str, str]],
headers_to_split_on: list[tuple[str, str]],
*,
max_chunk_size: int = 1000,
chunk_overlap: int = 0,
separators: Optional[List[str]] = None,
elements_to_preserve: Optional[List[str]] = None,
separators: Optional[list[str]] = None,
elements_to_preserve: Optional[list[str]] = None,
preserve_links: bool = False,
preserve_images: bool = False,
preserve_videos: bool = False,
preserve_audio: bool = False,
custom_handlers: Optional[Dict[str, Callable[[Any], str]]] = None,
custom_handlers: Optional[dict[str, Callable[[Any], str]]] = None,
stopword_removal: bool = False,
stopword_lang: str = "english",
normalize_text: bool = False,
external_metadata: Optional[Dict[str, str]] = None,
allowlist_tags: Optional[List[str]] = None,
denylist_tags: Optional[List[str]] = None,
external_metadata: Optional[dict[str, str]] = None,
allowlist_tags: Optional[list[str]] = None,
denylist_tags: Optional[list[str]] = None,
preserve_parent_metadata: bool = False,
keep_separator: Union[bool, Literal["start", "end"]] = True,
):
@ -654,7 +650,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
"Could not import nltk. Please install it with 'pip install nltk'."
)
def split_text(self, text: str) -> List[Document]:
def split_text(self, text: str) -> list[Document]:
"""Splits the provided HTML text into smaller chunks based on the configuration.
Args:
@ -677,7 +673,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
def transform_documents(
self, documents: Sequence[Document], **kwargs: Any
) -> List[Document]:
) -> list[Document]:
"""Transform sequence of documents by splitting them."""
transformed = []
for doc in documents:
@ -776,7 +772,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
return text
def _process_html(self, soup: Any) -> List[Document]:
def _process_html(self, soup: Any) -> list[Document]:
"""Processes the HTML content using BeautifulSoup and splits it using headers.
Args:
@ -785,10 +781,10 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
Returns:
List[Document]: A list of Document objects containing the split content.
"""
documents: List[Document] = []
current_headers: Dict[str, str] = {}
current_content: List[str] = []
preserved_elements: Dict[str, str] = {}
documents: list[Document] = []
current_headers: dict[str, str] = {}
current_content: list[str] = []
preserved_elements: dict[str, str] = {}
placeholder_count: int = 0
def _get_element_text(element: Any) -> str:
@ -821,13 +817,13 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
elements = soup.find_all(recursive=False)
def _process_element(
element: List[Any],
documents: List[Document],
current_headers: Dict[str, str],
current_content: List[str],
preserved_elements: Dict[str, str],
element: list[Any],
documents: list[Document],
current_headers: dict[str, str],
current_content: list[str],
preserved_elements: dict[str, str],
placeholder_count: int,
) -> Tuple[List[Document], Dict[str, str], List[str], Dict[str, str], int]:
) -> tuple[list[Document], dict[str, str], list[str], dict[str, str], int]:
for elem in element:
if elem.name.lower() in ["html", "body", "div", "main"]:
children = elem.find_all(recursive=False)
@ -910,7 +906,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
def _create_documents(
self, headers: dict[str, str], content: str, preserved_elements: dict[str, str]
) -> List[Document]:
) -> list[Document]:
"""Creates Document objects from the provided headers, content, and elements.
Args:
@ -936,7 +932,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
def _further_split_chunk(
self, content: str, metadata: dict[Any, Any], preserved_elements: dict[str, str]
) -> List[Document]:
) -> list[Document]:
"""Further splits the content into smaller chunks.
Args:

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import copy
import json
from typing import Any, Dict, List, Optional
from typing import Any, Optional
from langchain_core.documents import Document
@ -123,10 +123,10 @@ class RecursiveJsonSplitter:
def split_text(
self,
json_data: Dict[str, Any],
json_data: dict[str, Any],
convert_lists: bool = False,
ensure_ascii: bool = True,
) -> List[str]:
) -> list[str]:
"""Splits JSON into a list of JSON formatted strings."""
chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)

View File

@ -1,5 +1,5 @@
import re
from typing import Any, List, Optional
from typing import Any, Optional
from langchain_text_splitters import RecursiveCharacterTextSplitter
@ -23,7 +23,7 @@ class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
def __init__(
self,
separators: Optional[List[str]] = None,
separators: Optional[list[str]] = None,
chunk_size: int = 2000,
chunk_overlap: int = 0,
**kwargs: Any,
@ -39,7 +39,7 @@ class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs)
self._separators = separators or []
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Split text into chunks.
This method splits the text into chunks by:

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import Any, List
from typing import Any
from langchain_text_splitters.base import TextSplitter
@ -30,7 +30,7 @@ class KonlpyTextSplitter(TextSplitter):
)
self.kkma = konlpy.tag.Kkma()
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Split incoming text and return chunks."""
splits = self.kkma.sentences(text)
return self._merge_splits(splits, self._separator)

View File

@ -1,7 +1,7 @@
from __future__ import annotations
import re
from typing import Any, Dict, List, Tuple, TypedDict, Union
from typing import Any, TypedDict, Union
from langchain_core.documents import Document
@ -23,7 +23,7 @@ class MarkdownHeaderTextSplitter:
def __init__(
self,
headers_to_split_on: List[Tuple[str, str]],
headers_to_split_on: list[tuple[str, str]],
return_each_line: bool = False,
strip_headers: bool = True,
):
@ -44,13 +44,13 @@ class MarkdownHeaderTextSplitter:
# Strip headers split headers from the content of the chunk
self.strip_headers = strip_headers
def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
def aggregate_lines_to_chunks(self, lines: list[LineType]) -> list[Document]:
"""Combine lines with common metadata into chunks.
Args:
lines: Line of text / associated header metadata
"""
aggregated_chunks: List[LineType] = []
aggregated_chunks: list[LineType] = []
for line in lines:
if (
@ -87,7 +87,7 @@ class MarkdownHeaderTextSplitter:
for chunk in aggregated_chunks
]
def split_text(self, text: str) -> List[Document]:
def split_text(self, text: str) -> list[Document]:
"""Split markdown file.
Args:
@ -96,14 +96,14 @@ class MarkdownHeaderTextSplitter:
# Split the input text by newline character ("\n").
lines = text.split("\n")
# Final output
lines_with_metadata: List[LineType] = []
lines_with_metadata: list[LineType] = []
# Content and metadata of the chunk currently being processed
current_content: List[str] = []
current_metadata: Dict[str, str] = {}
current_content: list[str] = []
current_metadata: dict[str, str] = {}
# Keep track of the nested header structure
# header_stack: List[Dict[str, Union[int, str]]] = []
header_stack: List[HeaderType] = []
initial_metadata: Dict[str, str] = {}
header_stack: list[HeaderType] = []
initial_metadata: dict[str, str] = {}
in_code_block = False
opening_fence = ""
@ -217,7 +217,7 @@ class MarkdownHeaderTextSplitter:
class LineType(TypedDict):
"""Line type as typed dict."""
metadata: Dict[str, str]
metadata: dict[str, str]
content: str
@ -280,7 +280,7 @@ class ExperimentalMarkdownSyntaxTextSplitter:
def __init__(
self,
headers_to_split_on: Union[List[Tuple[str, str]], None] = None,
headers_to_split_on: Union[list[tuple[str, str]], None] = None,
return_each_line: bool = False,
strip_headers: bool = True,
):
@ -300,9 +300,9 @@ class ExperimentalMarkdownSyntaxTextSplitter:
Whether to exclude headers from the resulting chunks.
Defaults to True.
"""
self.chunks: List[Document] = []
self.chunks: list[Document] = []
self.current_chunk = Document(page_content="")
self.current_header_stack: List[Tuple[int, str]] = []
self.current_header_stack: list[tuple[int, str]] = []
self.strip_headers = strip_headers
if headers_to_split_on:
self.splittable_headers = dict(headers_to_split_on)
@ -311,7 +311,7 @@ class ExperimentalMarkdownSyntaxTextSplitter:
self.return_each_line = return_each_line
def split_text(self, text: str) -> List[Document]:
def split_text(self, text: str) -> list[Document]:
"""Split the input text into structured chunks.
This method processes the input text line by line, identifying and handling
@ -382,7 +382,7 @@ class ExperimentalMarkdownSyntaxTextSplitter:
break
self.current_header_stack.append((header_depth, header_text))
def _resolve_code_chunk(self, current_line: str, raw_lines: List[str]) -> str:
def _resolve_code_chunk(self, current_line: str, raw_lines: list[str]) -> str:
chunk = current_line
while raw_lines:
raw_line = raw_lines.pop(0)

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import Any, List
from typing import Any
from langchain_text_splitters.base import TextSplitter
@ -35,7 +35,7 @@ class NLTKTextSplitter(TextSplitter):
"NLTK is not installed, please install it with `pip install nltk`."
)
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Split incoming text and return chunks."""
# First we naively split the large input into a bunch of smaller ones.
if self._use_span_tokenize:

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import Any, List, Optional, cast
from typing import Any, Optional, cast
from langchain_text_splitters.base import TextSplitter, Tokenizer, split_text_on_tokens
@ -50,7 +50,7 @@ class SentenceTransformersTokenTextSplitter(TextSplitter):
f" > maximum token limit."
)
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Splits the input text into smaller components by splitting text on tokens.
This method encodes the input text using a private `_encode` method, then
@ -65,7 +65,7 @@ class SentenceTransformersTokenTextSplitter(TextSplitter):
encoding and processing.
"""
def encode_strip_start_and_stop_token_ids(text: str) -> List[int]:
def encode_strip_start_and_stop_token_ids(text: str) -> list[int]:
return self._encode(text)[1:-1]
tokenizer = Tokenizer(

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import Any, List
from typing import Any
from langchain_text_splitters.base import TextSplitter
@ -31,7 +31,7 @@ class SpacyTextSplitter(TextSplitter):
self._separator = separator
self._strip_whitespace = strip_whitespace
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> list[str]:
"""Split incoming text and return chunks."""
splits = (
s.text if self._strip_whitespace else s.text_with_ws

View File

@ -61,8 +61,8 @@ ignore_missing_imports = "True"
target-version = "py39"
[tool.ruff.lint]
select = ["E", "F", "I", "PGH003", "T201", "D"]
ignore = ["D100"]
select = ["E", "F", "I", "UP", "PGH003", "T201", "D"]
ignore = ["D100", "UP007"]
[tool.coverage.run]
omit = ["tests/*"]

View File

@ -1,7 +1,7 @@
"""Configuration for unit tests."""
from collections.abc import Sequence
from importlib import util
from typing import Dict, Sequence
import pytest
from pytest import Config, Function, Parser
@ -39,7 +39,7 @@ def pytest_collection_modifyitems(config: Config, items: Sequence[Function]) ->
"""
# Mapping from the name of a package to whether it is installed or not.
# Used to avoid repeated calls to `util.find_spec`
required_pkgs_info: Dict[str, bool] = {}
required_pkgs_info: dict[str, bool] = {}
only_extended = config.getoption("--only-extended") or False
only_core = config.getoption("--only-core") or False

View File

@ -3,7 +3,7 @@
import random
import re
import string
from typing import Any, Callable, List, Tuple
from typing import Any, Callable
import pytest
from langchain_core.documents import Document
@ -282,7 +282,7 @@ def test_create_documents_with_metadata() -> None:
],
)
def test_create_documents_with_start_index(
splitter: TextSplitter, text: str, expected_docs: List[Document]
splitter: TextSplitter, text: str, expected_docs: list[Document]
) -> None:
"""Test create documents method."""
docs = splitter.create_documents([text])
@ -333,7 +333,7 @@ def test_iterative_text_splitter_discard_separator() -> None:
]
def __test_iterative_text_splitter(chunk_size: int, keep_separator: bool) -> List[str]:
def __test_iterative_text_splitter(chunk_size: int, keep_separator: bool) -> list[str]:
chunk_size += 1 if keep_separator else 0
splitter = RecursiveCharacterTextSplitter(
@ -2224,7 +2224,7 @@ def test_haskell_code_splitter() -> None:
@pytest.fixture
@pytest.mark.requires("bs4")
def html_header_splitter_splitter_factory() -> Callable[
[List[Tuple[str, str]]], HTMLHeaderTextSplitter
[list[tuple[str, str]]], HTMLHeaderTextSplitter
]:
"""
Fixture to create an HTMLHeaderTextSplitter instance with given headers.
@ -2232,7 +2232,7 @@ def html_header_splitter_splitter_factory() -> Callable[
"""
def _create_splitter(
headers_to_split_on: List[Tuple[str, str]],
headers_to_split_on: list[tuple[str, str]],
) -> HTMLHeaderTextSplitter:
return HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
@ -2426,9 +2426,9 @@ def html_header_splitter_splitter_factory() -> Callable[
@pytest.mark.requires("bs4")
def test_html_header_text_splitter(
html_header_splitter_splitter_factory: Any,
headers_to_split_on: List[Tuple[str, str]],
headers_to_split_on: list[tuple[str, str]],
html_input: str,
expected_documents: List[Document],
expected_documents: list[Document],
test_case: str,
) -> None:
"""
@ -2582,9 +2582,9 @@ def test_html_header_text_splitter(
@pytest.mark.requires("bs4")
def test_additional_html_header_text_splitter(
html_header_splitter_splitter_factory: Any,
headers_to_split_on: List[Tuple[str, str]],
headers_to_split_on: list[tuple[str, str]],
html_content: str,
expected_output: List[Document],
expected_output: list[Document],
test_case: str,
) -> None:
"""
@ -2653,9 +2653,9 @@ def test_additional_html_header_text_splitter(
@pytest.mark.requires("bs4")
def test_html_no_headers_with_multiple_splitters(
html_header_splitter_splitter_factory: Any,
headers_to_split_on: List[Tuple[str, str]],
headers_to_split_on: list[tuple[str, str]],
html_content: str,
expected_output: List[Document],
expected_output: list[Document],
test_case: str,
) -> None:
"""
@ -3572,7 +3572,7 @@ def test_character_text_splitter_chunk_size_effect(
is_regex: bool,
text: str,
chunk_size: int,
expected: List[str],
expected: list[str],
) -> None:
splitter = CharacterTextSplitter(
separator=separator,