From 288613d36180553605b0169de63d5cf9f2323dd9 Mon Sep 17 00:00:00 2001
From: Mohammad Mohtashim <45242107+keenborder786@users.noreply.github.com>
Date: Wed, 15 Jan 2025 20:18:06 +0500
Subject: [PATCH] (text-splitters): Small Fix in `_process_html` for
 HTMLSemanticPreservingSplitter to properly extract the metadata. (#29215)

- **Description:** Include `main` in the list of elements whose child
elements need to be processed when splitting the HTML.
- **Issue:** #29184
---
 libs/text-splitters/langchain_text_splitters/html.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py
index 212a9abaa7b..3613937d999 100644
--- a/libs/text-splitters/langchain_text_splitters/html.py
+++ b/libs/text-splitters/langchain_text_splitters/html.py
@@ -696,7 +696,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
             placeholder_count: int,
         ) -> Tuple[List[Document], Dict[str, str], List[str], Dict[str, str], int]:
             for elem in element:
-                if elem.name.lower() in ["html", "body", "div"]:
+                if elem.name.lower() in ["html", "body", "div", "main"]:
                     children = elem.find_all(recursive=False)
                     (
                         documents,