text-splitters: Add ruff rules FBT (#31935)

See the [flake8-boolean-trap (FBT)](https://docs.astral.sh/ruff/rules/#flake8-boolean-trap-fbt) rules in the Ruff documentation.
This commit is contained in:
Christophe Bornet 2025-07-10 00:36:58 +02:00 committed by GitHub
parent 4d9c0b0883
commit 060fc0e3c9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 37 additions and 27 deletions

View File

@ -31,9 +31,9 @@ class TextSplitter(BaseDocumentTransformer, ABC):
chunk_size: int = 4000, chunk_size: int = 4000,
chunk_overlap: int = 200, chunk_overlap: int = 200,
length_function: Callable[[str], int] = len, length_function: Callable[[str], int] = len,
keep_separator: Union[bool, Literal["start", "end"]] = False, keep_separator: Union[bool, Literal["start", "end"]] = False, # noqa: FBT001,FBT002
add_start_index: bool = False, add_start_index: bool = False, # noqa: FBT001,FBT002
strip_whitespace: bool = True, strip_whitespace: bool = True, # noqa: FBT001,FBT002
) -> None: ) -> None:
"""Create a new TextSplitter. """Create a new TextSplitter.

View File

@ -10,7 +10,10 @@ class CharacterTextSplitter(TextSplitter):
"""Splitting text that looks at characters.""" """Splitting text that looks at characters."""
def __init__( def __init__(
self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs: Any self,
separator: str = "\n\n",
is_separator_regex: bool = False, # noqa: FBT001,FBT002
**kwargs: Any,
) -> None: ) -> None:
"""Create a new TextSplitter.""" """Create a new TextSplitter."""
super().__init__(**kwargs) super().__init__(**kwargs)
@ -25,7 +28,9 @@ class CharacterTextSplitter(TextSplitter):
) )
# 2. Initial split (keep separator if requested) # 2. Initial split (keep separator if requested)
splits = _split_text_with_regex(text, sep_pattern, self._keep_separator) splits = _split_text_with_regex(
text, sep_pattern, keep_separator=self._keep_separator
)
# 3. Detect zero-width lookaround so we never re-insert it # 3. Detect zero-width lookaround so we never re-insert it
lookaround_prefixes = ("(?=", "(?<!", "(?<=", "(?!") lookaround_prefixes = ("(?=", "(?<!", "(?<=", "(?!")
@ -45,7 +50,7 @@ class CharacterTextSplitter(TextSplitter):
def _split_text_with_regex( def _split_text_with_regex(
text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]] text: str, separator: str, *, keep_separator: Union[bool, Literal["start", "end"]]
) -> list[str]: ) -> list[str]:
# Now that we have the separator, split the text # Now that we have the separator, split the text
if separator: if separator:
@ -81,8 +86,8 @@ class RecursiveCharacterTextSplitter(TextSplitter):
def __init__( def __init__(
self, self,
separators: Optional[list[str]] = None, separators: Optional[list[str]] = None,
keep_separator: Union[bool, Literal["start", "end"]] = True, keep_separator: Union[bool, Literal["start", "end"]] = True, # noqa: FBT001,FBT002
is_separator_regex: bool = False, is_separator_regex: bool = False, # noqa: FBT001,FBT002
**kwargs: Any, **kwargs: Any,
) -> None: ) -> None:
"""Create a new TextSplitter.""" """Create a new TextSplitter."""
@ -107,7 +112,9 @@ class RecursiveCharacterTextSplitter(TextSplitter):
break break
_separator = separator if self._is_separator_regex else re.escape(separator) _separator = separator if self._is_separator_regex else re.escape(separator)
splits = _split_text_with_regex(text, _separator, self._keep_separator) splits = _split_text_with_regex(
text, _separator, keep_separator=self._keep_separator
)
# Now go merging things, recursively splitting longer texts. # Now go merging things, recursively splitting longer texts.
_good_splits = [] _good_splits = []

View File

@ -112,7 +112,7 @@ class HTMLHeaderTextSplitter:
def __init__( def __init__(
self, self,
headers_to_split_on: list[tuple[str, str]], headers_to_split_on: list[tuple[str, str]],
return_each_element: bool = False, return_each_element: bool = False, # noqa: FBT001,FBT002
) -> None: ) -> None:
"""Initialize with headers to split on. """Initialize with headers to split on.
@ -744,7 +744,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
soup (Any): Parsed HTML content using BeautifulSoup. soup (Any): Parsed HTML content using BeautifulSoup.
""" """
if self._allowlist_tags: if self._allowlist_tags:
for tag in soup.find_all(True): for tag in soup.find_all(name=True):
if tag.name not in self._allowlist_tags: if tag.name not in self._allowlist_tags:
tag.decompose() tag.decompose()

View File

@ -107,7 +107,7 @@ class RecursiveJsonSplitter:
def split_json( def split_json(
self, self,
json_data: dict[str, Any], json_data: dict[str, Any],
convert_lists: bool = False, convert_lists: bool = False, # noqa: FBT001,FBT002
) -> list[dict[str, Any]]: ) -> list[dict[str, Any]]:
"""Splits JSON into a list of JSON chunks.""" """Splits JSON into a list of JSON chunks."""
if convert_lists: if convert_lists:
@ -123,8 +123,8 @@ class RecursiveJsonSplitter:
def split_text( def split_text(
self, self,
json_data: dict[str, Any], json_data: dict[str, Any],
convert_lists: bool = False, convert_lists: bool = False, # noqa: FBT001,FBT002
ensure_ascii: bool = True, ensure_ascii: bool = True, # noqa: FBT001,FBT002
) -> list[str]: ) -> list[str]:
"""Splits JSON into a list of JSON formatted strings.""" """Splits JSON into a list of JSON formatted strings."""
chunks = self.split_json(json_data=json_data, convert_lists=convert_lists) chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
@ -135,8 +135,8 @@ class RecursiveJsonSplitter:
def create_documents( def create_documents(
self, self,
texts: list[dict[str, Any]], texts: list[dict[str, Any]],
convert_lists: bool = False, convert_lists: bool = False, # noqa: FBT001,FBT002
ensure_ascii: bool = True, ensure_ascii: bool = True, # noqa: FBT001,FBT002
metadatas: Optional[list[dict[Any, Any]]] = None, metadatas: Optional[list[dict[Any, Any]]] = None,
) -> list[Document]: ) -> list[Document]:
"""Create documents from a list of json objects (Dict).""" """Create documents from a list of json objects (Dict)."""

View File

@ -24,8 +24,8 @@ class MarkdownHeaderTextSplitter:
def __init__( def __init__(
self, self,
headers_to_split_on: list[tuple[str, str]], headers_to_split_on: list[tuple[str, str]],
return_each_line: bool = False, return_each_line: bool = False, # noqa: FBT001,FBT002
strip_headers: bool = True, strip_headers: bool = True, # noqa: FBT001,FBT002
): ):
"""Create a new MarkdownHeaderTextSplitter. """Create a new MarkdownHeaderTextSplitter.
@ -279,8 +279,8 @@ class ExperimentalMarkdownSyntaxTextSplitter:
def __init__( def __init__(
self, self,
headers_to_split_on: Union[list[tuple[str, str]], None] = None, headers_to_split_on: Union[list[tuple[str, str]], None] = None,
return_each_line: bool = False, return_each_line: bool = False, # noqa: FBT001,FBT002
strip_headers: bool = True, strip_headers: bool = True, # noqa: FBT001,FBT002
): ):
"""Initialize the text splitter with header splitting and formatting options. """Initialize the text splitter with header splitting and formatting options.

View File

@ -61,7 +61,7 @@ ignore_missing_imports = "True"
target-version = "py39" target-version = "py39"
[tool.ruff.lint] [tool.ruff.lint]
select = ["E", "F", "I", "UP", "PGH003", "T201", "D"] select = ["E", "F", "FBT", "I", "UP", "PGH003", "T201", "D"]
ignore = ["D100",] ignore = ["D100",]
pyupgrade.keep-runtime-typing = true pyupgrade.keep-runtime-typing = true

View File

@ -101,7 +101,7 @@ def test_character_text_splitter_longer_words() -> None:
("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)] ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
) )
def test_character_text_splitter_keep_separator_regex( def test_character_text_splitter_keep_separator_regex(
separator: str, is_separator_regex: bool *, separator: str, is_separator_regex: bool
) -> None: ) -> None:
"""Test splitting by characters while keeping the separator """Test splitting by characters while keeping the separator
that is a regex special character. that is a regex special character.
@ -123,7 +123,7 @@ def test_character_text_splitter_keep_separator_regex(
("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)] ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
) )
def test_character_text_splitter_keep_separator_regex_start( def test_character_text_splitter_keep_separator_regex_start(
separator: str, is_separator_regex: bool *, separator: str, is_separator_regex: bool
) -> None: ) -> None:
"""Test splitting by characters while keeping the separator """Test splitting by characters while keeping the separator
that is a regex special character and placing it at the start of each chunk. that is a regex special character and placing it at the start of each chunk.
@ -145,7 +145,7 @@ def test_character_text_splitter_keep_separator_regex_start(
("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)] ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
) )
def test_character_text_splitter_keep_separator_regex_end( def test_character_text_splitter_keep_separator_regex_end(
separator: str, is_separator_regex: bool *, separator: str, is_separator_regex: bool
) -> None: ) -> None:
"""Test splitting by characters while keeping the separator """Test splitting by characters while keeping the separator
that is a regex special character and placing it at the end of each chunk. that is a regex special character and placing it at the end of each chunk.
@ -167,7 +167,7 @@ def test_character_text_splitter_keep_separator_regex_end(
("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)] ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
) )
def test_character_text_splitter_discard_separator_regex( def test_character_text_splitter_discard_separator_regex(
separator: str, is_separator_regex: bool *, separator: str, is_separator_regex: bool
) -> None: ) -> None:
"""Test splitting by characters discarding the separator """Test splitting by characters discarding the separator
that is a regex special character.""" that is a regex special character."""
@ -338,7 +338,9 @@ def test_iterative_text_splitter_discard_separator() -> None:
] ]
def __test_iterative_text_splitter(chunk_size: int, keep_separator: bool) -> list[str]: def __test_iterative_text_splitter(
*, chunk_size: int, keep_separator: bool
) -> list[str]:
chunk_size += 1 if keep_separator else 0 chunk_size += 1 if keep_separator else 0
splitter = RecursiveCharacterTextSplitter( splitter = RecursiveCharacterTextSplitter(
@ -3574,6 +3576,7 @@ def test_character_text_splitter_discard_regex_separator_on_merge() -> None:
) )
def test_character_text_splitter_chunk_size_effect( def test_character_text_splitter_chunk_size_effect(
separator: str, separator: str,
*,
is_regex: bool, is_regex: bool,
text: str, text: str,
chunk_size: int, chunk_size: int,

View File

@ -1119,7 +1119,7 @@ dev = [
{ name = "jupyter", specifier = ">=1.0.0,<2.0.0" }, { name = "jupyter", specifier = ">=1.0.0,<2.0.0" },
{ name = "setuptools", specifier = ">=67.6.1,<68.0.0" }, { name = "setuptools", specifier = ">=67.6.1,<68.0.0" },
] ]
lint = [{ name = "ruff", specifier = ">=0.11.2,<0.12.0" }] lint = [{ name = "ruff", specifier = ">=0.12.2,<0.13" }]
test = [ test = [
{ name = "blockbuster", specifier = "~=1.5.18" }, { name = "blockbuster", specifier = "~=1.5.18" },
{ name = "freezegun", specifier = ">=1.2.2,<2.0.0" }, { name = "freezegun", specifier = ">=1.2.2,<2.0.0" },