diff --git a/libs/text-splitters/langchain_text_splitters/base.py b/libs/text-splitters/langchain_text_splitters/base.py
index 8861f4c6585..22e3288a190 100644
--- a/libs/text-splitters/langchain_text_splitters/base.py
+++ b/libs/text-splitters/langchain_text_splitters/base.py
@@ -31,9 +31,9 @@ class TextSplitter(BaseDocumentTransformer, ABC):
         chunk_size: int = 4000,
         chunk_overlap: int = 200,
         length_function: Callable[[str], int] = len,
-        keep_separator: Union[bool, Literal["start", "end"]] = False,
-        add_start_index: bool = False,
-        strip_whitespace: bool = True,
+        keep_separator: Union[bool, Literal["start", "end"]] = False,  # noqa: FBT001,FBT002
+        add_start_index: bool = False,  # noqa: FBT001,FBT002
+        strip_whitespace: bool = True,  # noqa: FBT001,FBT002
     ) -> None:
         """Create a new TextSplitter.
diff --git a/libs/text-splitters/langchain_text_splitters/character.py b/libs/text-splitters/langchain_text_splitters/character.py
index 517cea5f640..bfe4653f06f 100644
--- a/libs/text-splitters/langchain_text_splitters/character.py
+++ b/libs/text-splitters/langchain_text_splitters/character.py
@@ -10,7 +10,10 @@ class CharacterTextSplitter(TextSplitter):
     """Splitting text that looks at characters."""

     def __init__(
-        self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs: Any
+        self,
+        separator: str = "\n\n",
+        is_separator_regex: bool = False,  # noqa: FBT001,FBT002
+        **kwargs: Any,
     ) -> None:
         """Create a new TextSplitter."""
         super().__init__(**kwargs)
@@ -25,7 +28,9 @@ class CharacterTextSplitter(TextSplitter):
         )

         # 2. Initial split (keep separator if requested)
-        splits = _split_text_with_regex(text, sep_pattern, self._keep_separator)
+        splits = _split_text_with_regex(
+            text, sep_pattern, keep_separator=self._keep_separator
+        )

         # 3. Detect zero-width lookaround so we never re-insert it
         lookaround_prefixes = ("(?=", "(?!", "(?<=", "(?<!")
@@ ... @@
 def _split_text_with_regex(
-    text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
+    text: str, separator: str, *, keep_separator: Union[bool, Literal["start", "end"]]
 ) -> list[str]:
     # Now that we have the separator, split the text
     if separator:
@@ -81,8 +86,8 @@ class RecursiveCharacterTextSplitter(TextSplitter):
     def __init__(
         self,
         separators: Optional[list[str]] = None,
-        keep_separator: Union[bool, Literal["start", "end"]] = True,
-        is_separator_regex: bool = False,
+        keep_separator: Union[bool, Literal["start", "end"]] = True,  # noqa: FBT001,FBT002
+        is_separator_regex: bool = False,  # noqa: FBT001,FBT002
         **kwargs: Any,
     ) -> None:
         """Create a new TextSplitter."""
@@ -107,7 +112,9 @@ class RecursiveCharacterTextSplitter(TextSplitter):
                 break
         _separator = separator if self._is_separator_regex else re.escape(separator)
-        splits = _split_text_with_regex(text, _separator, self._keep_separator)
+        splits = _split_text_with_regex(
+            text, _separator, keep_separator=self._keep_separator
+        )

         # Now go merging things, recursively splitting longer texts.
         _good_splits = []
diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py
index b8586028928..0553c6787ed 100644
--- a/libs/text-splitters/langchain_text_splitters/html.py
+++ b/libs/text-splitters/langchain_text_splitters/html.py
@@ -112,7 +112,7 @@ class HTMLHeaderTextSplitter:
     def __init__(
         self,
         headers_to_split_on: list[tuple[str, str]],
-        return_each_element: bool = False,
+        return_each_element: bool = False,  # noqa: FBT001,FBT002
     ) -> None:
         """Initialize with headers to split on.
@@ -744,7 +744,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
             soup (Any): Parsed HTML content using BeautifulSoup.
""" if self._allowlist_tags: - for tag in soup.find_all(True): + for tag in soup.find_all(name=True): if tag.name not in self._allowlist_tags: tag.decompose() diff --git a/libs/text-splitters/langchain_text_splitters/json.py b/libs/text-splitters/langchain_text_splitters/json.py index a5c9385febd..8d600da8771 100644 --- a/libs/text-splitters/langchain_text_splitters/json.py +++ b/libs/text-splitters/langchain_text_splitters/json.py @@ -107,7 +107,7 @@ class RecursiveJsonSplitter: def split_json( self, json_data: dict[str, Any], - convert_lists: bool = False, + convert_lists: bool = False, # noqa: FBT001,FBT002 ) -> list[dict[str, Any]]: """Splits JSON into a list of JSON chunks.""" if convert_lists: @@ -123,8 +123,8 @@ class RecursiveJsonSplitter: def split_text( self, json_data: dict[str, Any], - convert_lists: bool = False, - ensure_ascii: bool = True, + convert_lists: bool = False, # noqa: FBT001,FBT002 + ensure_ascii: bool = True, # noqa: FBT001,FBT002 ) -> list[str]: """Splits JSON into a list of JSON formatted strings.""" chunks = self.split_json(json_data=json_data, convert_lists=convert_lists) @@ -135,8 +135,8 @@ class RecursiveJsonSplitter: def create_documents( self, texts: list[dict[str, Any]], - convert_lists: bool = False, - ensure_ascii: bool = True, + convert_lists: bool = False, # noqa: FBT001,FBT002 + ensure_ascii: bool = True, # noqa: FBT001,FBT002 metadatas: Optional[list[dict[Any, Any]]] = None, ) -> list[Document]: """Create documents from a list of json objects (Dict).""" diff --git a/libs/text-splitters/langchain_text_splitters/markdown.py b/libs/text-splitters/langchain_text_splitters/markdown.py index 3d60a9f7269..80d3600a2aa 100644 --- a/libs/text-splitters/langchain_text_splitters/markdown.py +++ b/libs/text-splitters/langchain_text_splitters/markdown.py @@ -24,8 +24,8 @@ class MarkdownHeaderTextSplitter: def __init__( self, headers_to_split_on: list[tuple[str, str]], - return_each_line: bool = False, - strip_headers: bool = True, + return_each_line: bool = False, # noqa: FBT001,FBT002 + strip_headers: bool = True, # noqa: FBT001,FBT002 ): """Create a new MarkdownHeaderTextSplitter. @@ -279,8 +279,8 @@ class ExperimentalMarkdownSyntaxTextSplitter: def __init__( self, headers_to_split_on: Union[list[tuple[str, str]], None] = None, - return_each_line: bool = False, - strip_headers: bool = True, + return_each_line: bool = False, # noqa: FBT001,FBT002 + strip_headers: bool = True, # noqa: FBT001,FBT002 ): """Initialize the text splitter with header splitting and formatting options. 
diff --git a/libs/text-splitters/pyproject.toml b/libs/text-splitters/pyproject.toml
index 5f56c328c23..086f0492d8b 100644
--- a/libs/text-splitters/pyproject.toml
+++ b/libs/text-splitters/pyproject.toml
@@ -61,7 +61,7 @@ ignore_missing_imports = "True"
 target-version = "py39"

 [tool.ruff.lint]
-select = ["E", "F", "I", "UP", "PGH003", "T201", "D"]
+select = ["E", "F", "FBT", "I", "UP", "PGH003", "T201", "D"]
 ignore = ["D100",]

 pyupgrade.keep-runtime-typing = true
diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
index 1d3304d1b84..f155dd10ab8 100644
--- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py
+++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
@@ -101,7 +101,7 @@ def test_character_text_splitter_longer_words() -> None:
     ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
 )
 def test_character_text_splitter_keep_separator_regex(
-    separator: str, is_separator_regex: bool
+    *, separator: str, is_separator_regex: bool
 ) -> None:
     """Test splitting by characters while keeping the separator
     that is a regex special character.
@@ -123,7 +123,7 @@ def test_character_text_splitter_keep_separator_regex(
     ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
 )
 def test_character_text_splitter_keep_separator_regex_start(
-    separator: str, is_separator_regex: bool
+    *, separator: str, is_separator_regex: bool
 ) -> None:
     """Test splitting by characters while keeping the separator
     that is a regex special character and placing it at the start of each chunk.
@@ -145,7 +145,7 @@ def test_character_text_splitter_keep_separator_regex_start(
     ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
 )
 def test_character_text_splitter_keep_separator_regex_end(
-    separator: str, is_separator_regex: bool
+    *, separator: str, is_separator_regex: bool
 ) -> None:
     """Test splitting by characters while keeping the separator
     that is a regex special character and placing it at the end of each chunk.
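
Note on the test hunks above and below: the leading `*` makes `separator` and `is_separator_regex` keyword-only, which is safe because pytest injects parametrized values by parameter name rather than by position. A standalone sketch of the same pattern, as a hypothetical test not taken from the suite:

import re

import pytest


@pytest.mark.parametrize(
    ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
)
def test_split_on_dot(*, separator: str, is_separator_regex: bool) -> None:
    # pytest binds parametrized values to parameter names, so marking the
    # parameters keyword-only changes nothing about how the test runs.
    pattern = separator if is_separator_regex else re.escape(separator)
    assert re.split(pattern, "a.b") == ["a", "b"]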
@@ -167,7 +167,7 @@ def test_character_text_splitter_keep_separator_regex_end(
     ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
 )
 def test_character_text_splitter_discard_separator_regex(
-    separator: str, is_separator_regex: bool
+    *, separator: str, is_separator_regex: bool
 ) -> None:
     """Test splitting by characters discarding the separator
     that is a regex special character."""
@@ -338,7 +338,9 @@ def test_iterative_text_splitter_discard_separator() -> None:
     ]


-def __test_iterative_text_splitter(chunk_size: int, keep_separator: bool) -> list[str]:
+def __test_iterative_text_splitter(
+    *, chunk_size: int, keep_separator: bool
+) -> list[str]:
     chunk_size += 1 if keep_separator else 0
     splitter = RecursiveCharacterTextSplitter(
@@ -3574,6 +3576,7 @@ def test_character_text_splitter_discard_regex_separator_on_merge() -> None:
 )
 def test_character_text_splitter_chunk_size_effect(
     separator: str,
+    *,
     is_regex: bool,
     text: str,
     chunk_size: int,
diff --git a/libs/text-splitters/uv.lock b/libs/text-splitters/uv.lock
index 18c97b7ea38..b6eb2a2bd92 100644
--- a/libs/text-splitters/uv.lock
+++ b/libs/text-splitters/uv.lock
@@ -1119,7 +1119,7 @@ dev = [
     { name = "jupyter", specifier = ">=1.0.0,<2.0.0" },
     { name = "setuptools", specifier = ">=67.6.1,<68.0.0" },
 ]
-lint = [{ name = "ruff", specifier = ">=0.11.2,<0.12.0" }]
+lint = [{ name = "ruff", specifier = ">=0.12.2,<0.13" }]
 test = [
     { name = "blockbuster", specifier = "~=1.5.18" },
     { name = "freezegun", specifier = ">=1.2.2,<2.0.0" },
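
End to end, none of this changes the public API: the constructors keep their boolean parameters and defaults, with the FBT warnings suppressed via `# noqa` rather than a breaking keyword-only change. A usage sketch against the signatures above, with illustrative separator and chunk-size values:

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Constructor signature is unchanged by this diff; keep_separator still
# accepts a bool or the literals "start"/"end", defaulting to True.
splitter = RecursiveCharacterTextSplitter(
    separators=[". "],
    chunk_size=25,
    chunk_overlap=0,
    keep_separator="end",
)
print(splitter.split_text("First sentence. Second sentence. Third sentence."))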