Mirror of https://github.com/hwchase17/langchain.git (last synced 2025-09-07 22:11:51 +00:00)
text-splitters: Add ruff rules FBT (#31935)
See [flake8-boolean-trap (FBT)](https://docs.astral.sh/ruff/rules/#flake8-boolean-trap-fbt)
This commit is contained in:
Committed by: GitHub
Parent commit: 4d9c0b0883
This commit: 060fc0e3c9
@@ -31,9 +31,9 @@ class TextSplitter(BaseDocumentTransformer, ABC):
|
||||
chunk_size: int = 4000,
|
||||
chunk_overlap: int = 200,
|
||||
length_function: Callable[[str], int] = len,
|
||||
keep_separator: Union[bool, Literal["start", "end"]] = False,
|
||||
add_start_index: bool = False,
|
||||
strip_whitespace: bool = True,
|
||||
keep_separator: Union[bool, Literal["start", "end"]] = False, # noqa: FBT001,FBT002
|
||||
add_start_index: bool = False, # noqa: FBT001,FBT002
|
||||
strip_whitespace: bool = True, # noqa: FBT001,FBT002
|
||||
) -> None:
|
||||
"""Create a new TextSplitter.
|
||||
|
||||
|
@@ -10,7 +10,10 @@ class CharacterTextSplitter(TextSplitter):
|
||||
"""Splitting text that looks at characters."""
|
||||
|
||||
def __init__(
|
||||
self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs: Any
|
||||
self,
|
||||
separator: str = "\n\n",
|
||||
is_separator_regex: bool = False, # noqa: FBT001,FBT002
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Create a new TextSplitter."""
|
||||
super().__init__(**kwargs)
|
||||
@@ -25,7 +28,9 @@ class CharacterTextSplitter(TextSplitter):
|
||||
)
|
||||
|
||||
# 2. Initial split (keep separator if requested)
|
||||
splits = _split_text_with_regex(text, sep_pattern, self._keep_separator)
|
||||
splits = _split_text_with_regex(
|
||||
text, sep_pattern, keep_separator=self._keep_separator
|
||||
)
|
||||
|
||||
# 3. Detect zero-width lookaround so we never re-insert it
|
||||
lookaround_prefixes = ("(?=", "(?<!", "(?<=", "(?!")
|
||||
@@ -45,7 +50,7 @@ class CharacterTextSplitter(TextSplitter):
|
||||
|
||||
|
||||
def _split_text_with_regex(
|
||||
text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
|
||||
text: str, separator: str, *, keep_separator: Union[bool, Literal["start", "end"]]
|
||||
) -> list[str]:
|
||||
# Now that we have the separator, split the text
|
||||
if separator:
|
||||
@@ -81,8 +86,8 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
||||
def __init__(
|
||||
self,
|
||||
separators: Optional[list[str]] = None,
|
||||
keep_separator: Union[bool, Literal["start", "end"]] = True,
|
||||
is_separator_regex: bool = False,
|
||||
keep_separator: Union[bool, Literal["start", "end"]] = True, # noqa: FBT001,FBT002
|
||||
is_separator_regex: bool = False, # noqa: FBT001,FBT002
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Create a new TextSplitter."""
|
||||
@@ -107,7 +112,9 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
||||
break
|
||||
|
||||
_separator = separator if self._is_separator_regex else re.escape(separator)
|
||||
splits = _split_text_with_regex(text, _separator, self._keep_separator)
|
||||
splits = _split_text_with_regex(
|
||||
text, _separator, keep_separator=self._keep_separator
|
||||
)
|
||||
|
||||
# Now go merging things, recursively splitting longer texts.
|
||||
_good_splits = []
|
||||
|
@@ -112,7 +112,7 @@ class HTMLHeaderTextSplitter:
|
||||
def __init__(
|
||||
self,
|
||||
headers_to_split_on: list[tuple[str, str]],
|
||||
return_each_element: bool = False,
|
||||
return_each_element: bool = False, # noqa: FBT001,FBT002
|
||||
) -> None:
|
||||
"""Initialize with headers to split on.
|
||||
|
||||
@@ -744,7 +744,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
||||
soup (Any): Parsed HTML content using BeautifulSoup.
|
||||
"""
|
||||
if self._allowlist_tags:
|
||||
for tag in soup.find_all(True):
|
||||
for tag in soup.find_all(name=True):
|
||||
if tag.name not in self._allowlist_tags:
|
||||
tag.decompose()
|
||||
|
||||
|
@@ -107,7 +107,7 @@ class RecursiveJsonSplitter:
|
||||
def split_json(
|
||||
self,
|
||||
json_data: dict[str, Any],
|
||||
convert_lists: bool = False,
|
||||
convert_lists: bool = False, # noqa: FBT001,FBT002
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Splits JSON into a list of JSON chunks."""
|
||||
if convert_lists:
|
||||
@@ -123,8 +123,8 @@ class RecursiveJsonSplitter:
|
||||
def split_text(
|
||||
self,
|
||||
json_data: dict[str, Any],
|
||||
convert_lists: bool = False,
|
||||
ensure_ascii: bool = True,
|
||||
convert_lists: bool = False, # noqa: FBT001,FBT002
|
||||
ensure_ascii: bool = True, # noqa: FBT001,FBT002
|
||||
) -> list[str]:
|
||||
"""Splits JSON into a list of JSON formatted strings."""
|
||||
chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
|
||||
@@ -135,8 +135,8 @@ class RecursiveJsonSplitter:
|
||||
def create_documents(
|
||||
self,
|
||||
texts: list[dict[str, Any]],
|
||||
convert_lists: bool = False,
|
||||
ensure_ascii: bool = True,
|
||||
convert_lists: bool = False, # noqa: FBT001,FBT002
|
||||
ensure_ascii: bool = True, # noqa: FBT001,FBT002
|
||||
metadatas: Optional[list[dict[Any, Any]]] = None,
|
||||
) -> list[Document]:
|
||||
"""Create documents from a list of json objects (Dict)."""
|
||||
|
@@ -24,8 +24,8 @@ class MarkdownHeaderTextSplitter:
|
||||
def __init__(
|
||||
self,
|
||||
headers_to_split_on: list[tuple[str, str]],
|
||||
return_each_line: bool = False,
|
||||
strip_headers: bool = True,
|
||||
return_each_line: bool = False, # noqa: FBT001,FBT002
|
||||
strip_headers: bool = True, # noqa: FBT001,FBT002
|
||||
):
|
||||
"""Create a new MarkdownHeaderTextSplitter.
|
||||
|
||||
@@ -279,8 +279,8 @@ class ExperimentalMarkdownSyntaxTextSplitter:
|
||||
def __init__(
|
||||
self,
|
||||
headers_to_split_on: Union[list[tuple[str, str]], None] = None,
|
||||
return_each_line: bool = False,
|
||||
strip_headers: bool = True,
|
||||
return_each_line: bool = False, # noqa: FBT001,FBT002
|
||||
strip_headers: bool = True, # noqa: FBT001,FBT002
|
||||
):
|
||||
"""Initialize the text splitter with header splitting and formatting options.
|
||||
|
||||
|
Reference in New Issue
Block a user