text-splitters: add pydocstyle linting (#28127)

As in #23188, this change turns on Google-style docstring checks by enabling
`pydocstyle` linting in the `text-splitters` package. Each resulting
linting error was handled case by case (ignored, resolved, or suppressed),
and missing docstrings were added.

Fixes one of the checklist items from #25154, similar to #25939 for the
`core` package. Ran `make format`, `make lint`, and `make test` from the
root of the `text-splitters` package to confirm that no issues were found.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Ankit Dangi
2024-12-08 22:01:03 -08:00
committed by GitHub
parent b53f07bfb9
commit 90f162efb6
9 changed files with 194 additions and 27 deletions

View File

@@ -21,8 +21,8 @@ class ElementType(TypedDict):
class HTMLHeaderTextSplitter:
"""
Splitting HTML files based on specified headers.
"""Splitting HTML files based on specified headers.
Requires lxml package.
"""
@@ -46,7 +46,7 @@ class HTMLHeaderTextSplitter:
def aggregate_elements_to_chunks(
self, elements: List[ElementType]
) -> List[Document]:
"""Combine elements with common metadata into chunks
"""Combine elements with common metadata into chunks.
Args:
elements: HTML element content with associated identifying info and metadata
@@ -72,7 +72,7 @@ class HTMLHeaderTextSplitter:
]
def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]:
"""Split HTML from web URL
"""Split HTML from web URL.
Args:
url: web URL
@@ -83,7 +83,7 @@ class HTMLHeaderTextSplitter:
return self.split_text_from_file(BytesIO(r.content))
def split_text(self, text: str) -> List[Document]:
"""Split HTML text string
"""Split HTML text string.
Args:
text: HTML text
@@ -91,7 +91,7 @@ class HTMLHeaderTextSplitter:
return self.split_text_from_file(StringIO(text))
def split_text_from_file(self, file: Any) -> List[Document]:
"""Split HTML file
"""Split HTML file.
Args:
file: HTML file
@@ -166,8 +166,8 @@ class HTMLHeaderTextSplitter:
class HTMLSectionSplitter:
"""
Splitting HTML files based on specified tag and font sizes.
"""Splitting HTML files based on specified tag and font sizes.
Requires lxml package.
"""
@@ -186,6 +186,8 @@ class HTMLSectionSplitter:
xslt_path: path to xslt file for document transformation.
Uses a default if not passed.
Needed for HTML content that uses different formats and layouts.
**kwargs (Any): Additional optional arguments for customizations.
"""
self.headers_to_split_on = dict(headers_to_split_on)
@@ -210,7 +212,7 @@ class HTMLSectionSplitter:
return text_splitter.split_documents(results)
def split_text(self, text: str) -> List[Document]:
"""Split HTML text string
"""Split HTML text string.
Args:
text: HTML text
@@ -236,6 +238,23 @@ class HTMLSectionSplitter:
return documents
def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]:
"""Split an HTML document into sections based on specified header tags.
This method uses BeautifulSoup to parse the HTML content and divides it into
sections based on headers defined in `headers_to_split_on`. Each section
contains the header text, content under the header, and the tag name.
Args:
html_doc (str): The HTML document to be split into sections.
Returns:
List[Dict[str, Optional[str]]]: A list of dictionaries representing
sections.
Each dictionary contains:
- 'header': The header text or a default title for the first section.
- 'content': The content under the header.
- 'tag_name': The name of the header tag (e.g., "h1", "h2").
"""
try:
from bs4 import BeautifulSoup, PageElement # type: ignore[import-untyped]
except ImportError as e:
@@ -259,7 +278,7 @@ class HTMLSectionSplitter:
section_content: List = []
else:
current_header = header_element.text.strip()
current_header_tag = header_element.name
current_header_tag = header_element.name # type: ignore[attr-defined]
section_content = []
for element in header_element.next_elements:
if i + 1 < len(headers) and element == headers[i + 1]:
@@ -280,6 +299,18 @@ class HTMLSectionSplitter:
return sections
def convert_possible_tags_to_header(self, html_content: str) -> str:
"""Convert specific HTML tags to headers using an XSLT transformation.
This method uses an XSLT file to transform the HTML content, converting
certain tags into headers for easier parsing. If no XSLT path is provided,
the HTML content is returned unchanged.
Args:
html_content (str): The HTML content to be transformed.
Returns:
str: The transformed HTML content as a string.
"""
if self.xslt_path is None:
return html_content
@@ -299,7 +330,7 @@ class HTMLSectionSplitter:
return str(result)
def split_text_from_file(self, file: Any) -> List[Document]:
"""Split HTML file
"""Split HTML file.
Args:
file: HTML file