mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-06 13:33:37 +00:00
text-splitters: add pydocstyle linting (#28127)
As seen in #23188, turned on Google-style docstrings by enabling `pydocstyle` linting in the `text-splitters` package. Each resulting linting error was handled in one of three ways: ignored, resolved, or suppressed; missing docstrings were also added. Fixes one of the checklist items from #25154, similar to #25939 in the `core` package. Ran `make format`, `make lint` and `make test` from the root of the `text-splitters` package to ensure no issues were found. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
@@ -21,8 +21,8 @@ class ElementType(TypedDict):
|
||||
|
||||
|
||||
class HTMLHeaderTextSplitter:
|
||||
"""
|
||||
Splitting HTML files based on specified headers.
|
||||
"""Splitting HTML files based on specified headers.
|
||||
|
||||
Requires lxml package.
|
||||
"""
|
||||
|
||||
@@ -46,7 +46,7 @@ class HTMLHeaderTextSplitter:
|
||||
def aggregate_elements_to_chunks(
|
||||
self, elements: List[ElementType]
|
||||
) -> List[Document]:
|
||||
"""Combine elements with common metadata into chunks
|
||||
"""Combine elements with common metadata into chunks.
|
||||
|
||||
Args:
|
||||
elements: HTML element content with associated identifying info and metadata
|
||||
@@ -72,7 +72,7 @@ class HTMLHeaderTextSplitter:
|
||||
]
|
||||
|
||||
def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]:
|
||||
"""Split HTML from web URL
|
||||
"""Split HTML from web URL.
|
||||
|
||||
Args:
|
||||
url: web URL
|
||||
@@ -83,7 +83,7 @@ class HTMLHeaderTextSplitter:
|
||||
return self.split_text_from_file(BytesIO(r.content))
|
||||
|
||||
def split_text(self, text: str) -> List[Document]:
|
||||
"""Split HTML text string
|
||||
"""Split HTML text string.
|
||||
|
||||
Args:
|
||||
text: HTML text
|
||||
@@ -91,7 +91,7 @@ class HTMLHeaderTextSplitter:
|
||||
return self.split_text_from_file(StringIO(text))
|
||||
|
||||
def split_text_from_file(self, file: Any) -> List[Document]:
|
||||
"""Split HTML file
|
||||
"""Split HTML file.
|
||||
|
||||
Args:
|
||||
file: HTML file
|
||||
@@ -166,8 +166,8 @@ class HTMLHeaderTextSplitter:
|
||||
|
||||
|
||||
class HTMLSectionSplitter:
|
||||
"""
|
||||
Splitting HTML files based on specified tag and font sizes.
|
||||
"""Splitting HTML files based on specified tag and font sizes.
|
||||
|
||||
Requires lxml package.
|
||||
"""
|
||||
|
||||
@@ -186,6 +186,8 @@ class HTMLSectionSplitter:
|
||||
xslt_path: path to xslt file for document transformation.
|
||||
Uses a default if not passed.
|
||||
Needed for HTML content that uses different formats and layouts.
|
||||
**kwargs (Any): Additional optional arguments for customizations.
|
||||
|
||||
"""
|
||||
self.headers_to_split_on = dict(headers_to_split_on)
|
||||
|
||||
@@ -210,7 +212,7 @@ class HTMLSectionSplitter:
|
||||
return text_splitter.split_documents(results)
|
||||
|
||||
def split_text(self, text: str) -> List[Document]:
|
||||
"""Split HTML text string
|
||||
"""Split HTML text string.
|
||||
|
||||
Args:
|
||||
text: HTML text
|
||||
@@ -236,6 +238,23 @@ class HTMLSectionSplitter:
|
||||
return documents
|
||||
|
||||
def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]:
|
||||
"""Split an HTML document into sections based on specified header tags.
|
||||
|
||||
This method uses BeautifulSoup to parse the HTML content and divides it into
|
||||
sections based on headers defined in `headers_to_split_on`. Each section
|
||||
contains the header text, content under the header, and the tag name.
|
||||
|
||||
Args:
|
||||
html_doc (str): The HTML document to be split into sections.
|
||||
|
||||
Returns:
|
||||
List[Dict[str, Optional[str]]]: A list of dictionaries representing
|
||||
sections.
|
||||
Each dictionary contains:
|
||||
- 'header': The header text or a default title for the first section.
|
||||
- 'content': The content under the header.
|
||||
- 'tag_name': The name of the header tag (e.g., "h1", "h2").
|
||||
"""
|
||||
try:
|
||||
from bs4 import BeautifulSoup, PageElement # type: ignore[import-untyped]
|
||||
except ImportError as e:
|
||||
@@ -259,7 +278,7 @@ class HTMLSectionSplitter:
|
||||
section_content: List = []
|
||||
else:
|
||||
current_header = header_element.text.strip()
|
||||
current_header_tag = header_element.name
|
||||
current_header_tag = header_element.name # type: ignore[attr-defined]
|
||||
section_content = []
|
||||
for element in header_element.next_elements:
|
||||
if i + 1 < len(headers) and element == headers[i + 1]:
|
||||
@@ -280,6 +299,18 @@ class HTMLSectionSplitter:
|
||||
return sections
|
||||
|
||||
def convert_possible_tags_to_header(self, html_content: str) -> str:
|
||||
"""Convert specific HTML tags to headers using an XSLT transformation.
|
||||
|
||||
This method uses an XSLT file to transform the HTML content, converting
|
||||
certain tags into headers for easier parsing. If no XSLT path is provided,
|
||||
the HTML content is returned unchanged.
|
||||
|
||||
Args:
|
||||
html_content (str): The HTML content to be transformed.
|
||||
|
||||
Returns:
|
||||
str: The transformed HTML content as a string.
|
||||
"""
|
||||
if self.xslt_path is None:
|
||||
return html_content
|
||||
|
||||
@@ -299,7 +330,7 @@ class HTMLSectionSplitter:
|
||||
return str(result)
|
||||
|
||||
def split_text_from_file(self, file: Any) -> List[Document]:
|
||||
"""Split HTML file
|
||||
"""Split HTML file.
|
||||
|
||||
Args:
|
||||
file: HTML file
|
||||
|
Reference in New Issue
Block a user