mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-19 13:23:35 +00:00
(text-splitters): Small Fix in _process_html
for HTMLSemanticPreservingSplitter to properly extract the metadata. (#29215)
- **Description:** Include `main` in the list of elements whose child elements need to be processed for splitting the HTML. - **Issue:** #29184
This commit is contained in:
parent
4867fe7ac8
commit
288613d361
@ -696,7 +696,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
||||
placeholder_count: int,
|
||||
) -> Tuple[List[Document], Dict[str, str], List[str], Dict[str, str], int]:
|
||||
for elem in element:
|
||||
if elem.name.lower() in ["html", "body", "div"]:
|
||||
if elem.name.lower() in ["html", "body", "div", "main"]:
|
||||
children = elem.find_all(recursive=False)
|
||||
(
|
||||
documents,
|
||||
|
Loading…
Reference in New Issue
Block a user