From 288613d36180553605b0169de63d5cf9f2323dd9 Mon Sep 17 00:00:00 2001 From: Mohammad Mohtashim <45242107+keenborder786@users.noreply.github.com> Date: Wed, 15 Jan 2025 20:18:06 +0500 Subject: [PATCH] (text-splitters): Small Fix in `_process_html` for HTMLSemanticPreservingSplitter to properly extract the metadata. (#29215) - **Description:** Include `main` in the list of elements whose child elements need to be processed for splitting the HTML. - **Issue:** #29184 --- libs/text-splitters/langchain_text_splitters/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 212a9abaa7b..3613937d999 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -696,7 +696,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer): placeholder_count: int, ) -> Tuple[List[Document], Dict[str, str], List[str], Dict[str, str], int]: for elem in element: - if elem.name.lower() in ["html", "body", "div"]: + if elem.name.lower() in ["html", "body", "div", "main"]: children = elem.find_all(recursive=False) ( documents,