mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-19 21:33:51 +00:00
(text-splitters): Small Fix in _process_html
for HTMLSemanticPreservingSplitter to properly extract the metadata. (#29215)
- **Description:** Include `main` in the list of elements whose child elements need to be processed when splitting the HTML. - **Issue:** #29184
This commit is contained in:
parent
4867fe7ac8
commit
288613d361
@@ -696,7 +696,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
|||||||
placeholder_count: int,
|
placeholder_count: int,
|
||||||
) -> Tuple[List[Document], Dict[str, str], List[str], Dict[str, str], int]:
|
) -> Tuple[List[Document], Dict[str, str], List[str], Dict[str, str], int]:
|
||||||
for elem in element:
|
for elem in element:
|
||||||
if elem.name.lower() in ["html", "body", "div"]:
|
if elem.name.lower() in ["html", "body", "div", "main"]:
|
||||||
children = elem.find_all(recursive=False)
|
children = elem.find_all(recursive=False)
|
||||||
(
|
(
|
||||||
documents,
|
documents,
|
||||||
|
Loading…
Reference in New Issue
Block a user