mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-14 08:56:27 +00:00
text-splitters: Add ruff rules FBT (#31935)
See [flake8-boolean-trap (FBT)](https://docs.astral.sh/ruff/rules/#flake8-boolean-trap-fbt)
This commit is contained in:
parent
4d9c0b0883
commit
060fc0e3c9
@ -31,9 +31,9 @@ class TextSplitter(BaseDocumentTransformer, ABC):
|
|||||||
chunk_size: int = 4000,
|
chunk_size: int = 4000,
|
||||||
chunk_overlap: int = 200,
|
chunk_overlap: int = 200,
|
||||||
length_function: Callable[[str], int] = len,
|
length_function: Callable[[str], int] = len,
|
||||||
keep_separator: Union[bool, Literal["start", "end"]] = False,
|
keep_separator: Union[bool, Literal["start", "end"]] = False, # noqa: FBT001,FBT002
|
||||||
add_start_index: bool = False,
|
add_start_index: bool = False, # noqa: FBT001,FBT002
|
||||||
strip_whitespace: bool = True,
|
strip_whitespace: bool = True, # noqa: FBT001,FBT002
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Create a new TextSplitter.
|
"""Create a new TextSplitter.
|
||||||
|
|
||||||
|
@ -10,7 +10,10 @@ class CharacterTextSplitter(TextSplitter):
|
|||||||
"""Splitting text that looks at characters."""
|
"""Splitting text that looks at characters."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs: Any
|
self,
|
||||||
|
separator: str = "\n\n",
|
||||||
|
is_separator_regex: bool = False, # noqa: FBT001,FBT002
|
||||||
|
**kwargs: Any,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Create a new TextSplitter."""
|
"""Create a new TextSplitter."""
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
@ -25,7 +28,9 @@ class CharacterTextSplitter(TextSplitter):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# 2. Initial split (keep separator if requested)
|
# 2. Initial split (keep separator if requested)
|
||||||
splits = _split_text_with_regex(text, sep_pattern, self._keep_separator)
|
splits = _split_text_with_regex(
|
||||||
|
text, sep_pattern, keep_separator=self._keep_separator
|
||||||
|
)
|
||||||
|
|
||||||
# 3. Detect zero-width lookaround so we never re-insert it
|
# 3. Detect zero-width lookaround so we never re-insert it
|
||||||
lookaround_prefixes = ("(?=", "(?<!", "(?<=", "(?!")
|
lookaround_prefixes = ("(?=", "(?<!", "(?<=", "(?!")
|
||||||
@ -45,7 +50,7 @@ class CharacterTextSplitter(TextSplitter):
|
|||||||
|
|
||||||
|
|
||||||
def _split_text_with_regex(
|
def _split_text_with_regex(
|
||||||
text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
|
text: str, separator: str, *, keep_separator: Union[bool, Literal["start", "end"]]
|
||||||
) -> list[str]:
|
) -> list[str]:
|
||||||
# Now that we have the separator, split the text
|
# Now that we have the separator, split the text
|
||||||
if separator:
|
if separator:
|
||||||
@ -81,8 +86,8 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
separators: Optional[list[str]] = None,
|
separators: Optional[list[str]] = None,
|
||||||
keep_separator: Union[bool, Literal["start", "end"]] = True,
|
keep_separator: Union[bool, Literal["start", "end"]] = True, # noqa: FBT001,FBT002
|
||||||
is_separator_regex: bool = False,
|
is_separator_regex: bool = False, # noqa: FBT001,FBT002
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Create a new TextSplitter."""
|
"""Create a new TextSplitter."""
|
||||||
@ -107,7 +112,9 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
|||||||
break
|
break
|
||||||
|
|
||||||
_separator = separator if self._is_separator_regex else re.escape(separator)
|
_separator = separator if self._is_separator_regex else re.escape(separator)
|
||||||
splits = _split_text_with_regex(text, _separator, self._keep_separator)
|
splits = _split_text_with_regex(
|
||||||
|
text, _separator, keep_separator=self._keep_separator
|
||||||
|
)
|
||||||
|
|
||||||
# Now go merging things, recursively splitting longer texts.
|
# Now go merging things, recursively splitting longer texts.
|
||||||
_good_splits = []
|
_good_splits = []
|
||||||
|
@ -112,7 +112,7 @@ class HTMLHeaderTextSplitter:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
headers_to_split_on: list[tuple[str, str]],
|
headers_to_split_on: list[tuple[str, str]],
|
||||||
return_each_element: bool = False,
|
return_each_element: bool = False, # noqa: FBT001,FBT002
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize with headers to split on.
|
"""Initialize with headers to split on.
|
||||||
|
|
||||||
@ -744,7 +744,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
|||||||
soup (Any): Parsed HTML content using BeautifulSoup.
|
soup (Any): Parsed HTML content using BeautifulSoup.
|
||||||
"""
|
"""
|
||||||
if self._allowlist_tags:
|
if self._allowlist_tags:
|
||||||
for tag in soup.find_all(True):
|
for tag in soup.find_all(name=True):
|
||||||
if tag.name not in self._allowlist_tags:
|
if tag.name not in self._allowlist_tags:
|
||||||
tag.decompose()
|
tag.decompose()
|
||||||
|
|
||||||
|
@ -107,7 +107,7 @@ class RecursiveJsonSplitter:
|
|||||||
def split_json(
|
def split_json(
|
||||||
self,
|
self,
|
||||||
json_data: dict[str, Any],
|
json_data: dict[str, Any],
|
||||||
convert_lists: bool = False,
|
convert_lists: bool = False, # noqa: FBT001,FBT002
|
||||||
) -> list[dict[str, Any]]:
|
) -> list[dict[str, Any]]:
|
||||||
"""Splits JSON into a list of JSON chunks."""
|
"""Splits JSON into a list of JSON chunks."""
|
||||||
if convert_lists:
|
if convert_lists:
|
||||||
@ -123,8 +123,8 @@ class RecursiveJsonSplitter:
|
|||||||
def split_text(
|
def split_text(
|
||||||
self,
|
self,
|
||||||
json_data: dict[str, Any],
|
json_data: dict[str, Any],
|
||||||
convert_lists: bool = False,
|
convert_lists: bool = False, # noqa: FBT001,FBT002
|
||||||
ensure_ascii: bool = True,
|
ensure_ascii: bool = True, # noqa: FBT001,FBT002
|
||||||
) -> list[str]:
|
) -> list[str]:
|
||||||
"""Splits JSON into a list of JSON formatted strings."""
|
"""Splits JSON into a list of JSON formatted strings."""
|
||||||
chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
|
chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
|
||||||
@ -135,8 +135,8 @@ class RecursiveJsonSplitter:
|
|||||||
def create_documents(
|
def create_documents(
|
||||||
self,
|
self,
|
||||||
texts: list[dict[str, Any]],
|
texts: list[dict[str, Any]],
|
||||||
convert_lists: bool = False,
|
convert_lists: bool = False, # noqa: FBT001,FBT002
|
||||||
ensure_ascii: bool = True,
|
ensure_ascii: bool = True, # noqa: FBT001,FBT002
|
||||||
metadatas: Optional[list[dict[Any, Any]]] = None,
|
metadatas: Optional[list[dict[Any, Any]]] = None,
|
||||||
) -> list[Document]:
|
) -> list[Document]:
|
||||||
"""Create documents from a list of json objects (Dict)."""
|
"""Create documents from a list of json objects (Dict)."""
|
||||||
|
@ -24,8 +24,8 @@ class MarkdownHeaderTextSplitter:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
headers_to_split_on: list[tuple[str, str]],
|
headers_to_split_on: list[tuple[str, str]],
|
||||||
return_each_line: bool = False,
|
return_each_line: bool = False, # noqa: FBT001,FBT002
|
||||||
strip_headers: bool = True,
|
strip_headers: bool = True, # noqa: FBT001,FBT002
|
||||||
):
|
):
|
||||||
"""Create a new MarkdownHeaderTextSplitter.
|
"""Create a new MarkdownHeaderTextSplitter.
|
||||||
|
|
||||||
@ -279,8 +279,8 @@ class ExperimentalMarkdownSyntaxTextSplitter:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
headers_to_split_on: Union[list[tuple[str, str]], None] = None,
|
headers_to_split_on: Union[list[tuple[str, str]], None] = None,
|
||||||
return_each_line: bool = False,
|
return_each_line: bool = False, # noqa: FBT001,FBT002
|
||||||
strip_headers: bool = True,
|
strip_headers: bool = True, # noqa: FBT001,FBT002
|
||||||
):
|
):
|
||||||
"""Initialize the text splitter with header splitting and formatting options.
|
"""Initialize the text splitter with header splitting and formatting options.
|
||||||
|
|
||||||
|
@ -61,7 +61,7 @@ ignore_missing_imports = "True"
|
|||||||
target-version = "py39"
|
target-version = "py39"
|
||||||
|
|
||||||
[tool.ruff.lint]
|
[tool.ruff.lint]
|
||||||
select = ["E", "F", "I", "UP", "PGH003", "T201", "D"]
|
select = ["E", "F", "FBT", "I", "UP", "PGH003", "T201", "D"]
|
||||||
ignore = ["D100",]
|
ignore = ["D100",]
|
||||||
pyupgrade.keep-runtime-typing = true
|
pyupgrade.keep-runtime-typing = true
|
||||||
|
|
||||||
|
@ -101,7 +101,7 @@ def test_character_text_splitter_longer_words() -> None:
|
|||||||
("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
|
("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
|
||||||
)
|
)
|
||||||
def test_character_text_splitter_keep_separator_regex(
|
def test_character_text_splitter_keep_separator_regex(
|
||||||
separator: str, is_separator_regex: bool
|
*, separator: str, is_separator_regex: bool
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test splitting by characters while keeping the separator
|
"""Test splitting by characters while keeping the separator
|
||||||
that is a regex special character.
|
that is a regex special character.
|
||||||
@ -123,7 +123,7 @@ def test_character_text_splitter_keep_separator_regex(
|
|||||||
("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
|
("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
|
||||||
)
|
)
|
||||||
def test_character_text_splitter_keep_separator_regex_start(
|
def test_character_text_splitter_keep_separator_regex_start(
|
||||||
separator: str, is_separator_regex: bool
|
*, separator: str, is_separator_regex: bool
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test splitting by characters while keeping the separator
|
"""Test splitting by characters while keeping the separator
|
||||||
that is a regex special character and placing it at the start of each chunk.
|
that is a regex special character and placing it at the start of each chunk.
|
||||||
@ -145,7 +145,7 @@ def test_character_text_splitter_keep_separator_regex_start(
|
|||||||
("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
|
("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
|
||||||
)
|
)
|
||||||
def test_character_text_splitter_keep_separator_regex_end(
|
def test_character_text_splitter_keep_separator_regex_end(
|
||||||
separator: str, is_separator_regex: bool
|
*, separator: str, is_separator_regex: bool
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test splitting by characters while keeping the separator
|
"""Test splitting by characters while keeping the separator
|
||||||
that is a regex special character and placing it at the end of each chunk.
|
that is a regex special character and placing it at the end of each chunk.
|
||||||
@ -167,7 +167,7 @@ def test_character_text_splitter_keep_separator_regex_end(
|
|||||||
("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
|
("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
|
||||||
)
|
)
|
||||||
def test_character_text_splitter_discard_separator_regex(
|
def test_character_text_splitter_discard_separator_regex(
|
||||||
separator: str, is_separator_regex: bool
|
*, separator: str, is_separator_regex: bool
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test splitting by characters discarding the separator
|
"""Test splitting by characters discarding the separator
|
||||||
that is a regex special character."""
|
that is a regex special character."""
|
||||||
@ -338,7 +338,9 @@ def test_iterative_text_splitter_discard_separator() -> None:
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def __test_iterative_text_splitter(chunk_size: int, keep_separator: bool) -> list[str]:
|
def __test_iterative_text_splitter(
|
||||||
|
*, chunk_size: int, keep_separator: bool
|
||||||
|
) -> list[str]:
|
||||||
chunk_size += 1 if keep_separator else 0
|
chunk_size += 1 if keep_separator else 0
|
||||||
|
|
||||||
splitter = RecursiveCharacterTextSplitter(
|
splitter = RecursiveCharacterTextSplitter(
|
||||||
@ -3574,6 +3576,7 @@ def test_character_text_splitter_discard_regex_separator_on_merge() -> None:
|
|||||||
)
|
)
|
||||||
def test_character_text_splitter_chunk_size_effect(
|
def test_character_text_splitter_chunk_size_effect(
|
||||||
separator: str,
|
separator: str,
|
||||||
|
*,
|
||||||
is_regex: bool,
|
is_regex: bool,
|
||||||
text: str,
|
text: str,
|
||||||
chunk_size: int,
|
chunk_size: int,
|
||||||
|
@ -1119,7 +1119,7 @@ dev = [
|
|||||||
{ name = "jupyter", specifier = ">=1.0.0,<2.0.0" },
|
{ name = "jupyter", specifier = ">=1.0.0,<2.0.0" },
|
||||||
{ name = "setuptools", specifier = ">=67.6.1,<68.0.0" },
|
{ name = "setuptools", specifier = ">=67.6.1,<68.0.0" },
|
||||||
]
|
]
|
||||||
lint = [{ name = "ruff", specifier = ">=0.11.2,<0.12.0" }]
|
lint = [{ name = "ruff", specifier = ">=0.12.2,<0.13" }]
|
||||||
test = [
|
test = [
|
||||||
{ name = "blockbuster", specifier = "~=1.5.18" },
|
{ name = "blockbuster", specifier = "~=1.5.18" },
|
||||||
{ name = "freezegun", specifier = ">=1.2.2,<2.0.0" },
|
{ name = "freezegun", specifier = ">=1.2.2,<2.0.0" },
|
||||||
|
Loading…
Reference in New Issue
Block a user