text-splitters: add pydocstyle linting (#28127)

As in #23188, enabled Google-style docstring checks by turning on
`pydocstyle` linting in the `text-splitters` package. Each resulting
linting error was then addressed case by case: some were resolved, some
suppressed, some ignored, and missing docstrings were added.

Fixes one of the checklist items from #25154, similar to #25939 in the
`core` package. Ran `make format`, `make lint`, and `make test` from the
root of the `text-splitters` package to confirm that no issues remain.
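
For context, a minimal sketch of what "resolved" versus "suppressed" looks like under the Google convention. The functions below are hypothetical and are not taken from this diff:

```python
from typing import List


# Resolved: a Google-style docstring that satisfies the pydocstyle "D" rules.
def aggregate_lines(lines: List[str]) -> List[str]:
    """Combine lines with common metadata into chunks.

    Args:
        lines: Lines of text to aggregate.

    Returns:
        The aggregated chunks, one string per chunk.
    """
    return ["\n".join(lines)] if lines else []


# Suppressed: silence the missing-docstring rule for this one function.
def legacy_helper(lines: List[str]) -> List[str]:  # noqa: D103
    return lines
```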

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
Author: Ankit Dangi
Date: 2024-12-08 22:01:03 -08:00
Committed by: GitHub
Parent commit: b53f07bfb9
Commit: 90f162efb6
9 changed files with 194 additions and 27 deletions


@@ -45,7 +45,8 @@ class MarkdownHeaderTextSplitter:
         self.strip_headers = strip_headers
 
     def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
-        """Combine lines with common metadata into chunks
+        """Combine lines with common metadata into chunks.
+
         Args:
             lines: Line of text / associated header metadata
         """
@@ -87,10 +88,11 @@ class MarkdownHeaderTextSplitter:
         ]
 
     def split_text(self, text: str) -> List[Document]:
-        """Split markdown file
-        Args:
-            text: Markdown file"""
+        """Split markdown file.
+
+        Args:
+            text: Markdown file
+        """
         # Split the input text by newline character ("\n").
         lines = text.split("\n")
         # Final output
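
For readers unfamiliar with the class, a brief usage sketch of the `split_text` method documented above; the header mapping and sample markdown are illustrative:

```python
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Map markdown header prefixes to the metadata keys they should populate.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2")]
splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

md = "# Title\n\nIntro text.\n\n## Section\n\nSection text."
docs = splitter.split_text(md)
for doc in docs:
    # Each Document carries the header values it falls under as metadata.
    print(doc.metadata, doc.page_content)
```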
@@ -225,8 +227,7 @@ class HeaderType(TypedDict):
 class ExperimentalMarkdownSyntaxTextSplitter:
-    """
-    An experimental text splitter for handling Markdown syntax.
+    """An experimental text splitter for handling Markdown syntax.
 
     This splitter aims to retain the exact whitespace of the original text while
     extracting structured metadata, such as headers. It is a re-implementation of the
@@ -280,6 +281,22 @@ class ExperimentalMarkdownSyntaxTextSplitter:
         return_each_line: bool = False,
         strip_headers: bool = True,
     ):
+        """Initialize the text splitter with header splitting and formatting options.
+
+        This constructor sets up the required configuration for splitting text into
+        chunks based on specified headers and formatting preferences.
+
+        Args:
+            headers_to_split_on (Union[List[Tuple[str, str]], None]):
+                A list of tuples, where each tuple contains a header tag (e.g., "h1")
+                and its corresponding metadata key. If None, default headers are used.
+            return_each_line (bool):
+                Whether to return each line as an individual chunk.
+                Defaults to False, which aggregates lines into larger chunks.
+            strip_headers (bool):
+                Whether to exclude headers from the resulting chunks.
+                Defaults to True.
+        """
         self.chunks: List[Document] = []
         self.current_chunk = Document(page_content="")
         self.current_header_stack: List[Tuple[int, str]] = []
@@ -292,6 +309,21 @@ class ExperimentalMarkdownSyntaxTextSplitter:
         self.return_each_line = return_each_line
 
     def split_text(self, text: str) -> List[Document]:
+        """Split the input text into structured chunks.
+
+        This method processes the input text line by line, identifying and handling
+        specific patterns such as headers, code blocks, and horizontal rules to
+        split it into structured chunks based on headers, code blocks, and
+        horizontal rules.
+
+        Args:
+            text (str): The input text to be split into chunks.
+
+        Returns:
+            List[Document]: A list of `Document` objects representing the structured
+                chunks of the input text. If `return_each_line` is enabled, each line
+                is returned as a separate `Document`.
+        """
         raw_lines = text.splitlines(keepends=True)
 
         while raw_lines:
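
To make the new docstrings concrete, a small usage sketch of `ExperimentalMarkdownSyntaxTextSplitter`; the sample markdown is illustrative and the default `headers_to_split_on` is assumed:

```python
from langchain_text_splitters import ExperimentalMarkdownSyntaxTextSplitter

# With no headers_to_split_on passed, the splitter's default header set is used.
splitter = ExperimentalMarkdownSyntaxTextSplitter(strip_headers=False)

text = "# Guide\n\nSome prose.\n\n---\n\n## Details\n\nMore prose.\n"
chunks = splitter.split_text(text)

for chunk in chunks:
    # Each chunk carries header metadata and keeps the original whitespace.
    print(chunk.metadata)
    print(repr(chunk.page_content))
```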