From 622bb0575137d883c785e776e1120541a3b286c3 Mon Sep 17 00:00:00 2001 From: tanwirahmad Date: Thu, 24 Jul 2025 17:09:03 +0300 Subject: [PATCH] fix(langchain): class HTMLSemanticPreservingSplitter ignores the text inside the div tag (#32213) **Description:** The splitter now also collects the direct text of the "html", "body", "div", and "main" nodes, when they have any, instead of dropping it. **Issue:** Fixes #32206. --- libs/text-splitters/langchain_text_splitters/html.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 0553c6787ed..1f7c30bc132 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -842,6 +842,10 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer): preserved_elements, placeholder_count, ) + content = " ".join(elem.find_all(string=True, recursive=False)) + if content: + content = self._normalize_and_clean_text(content) + current_content.append(content) continue if elem.name in [h[0] for h in self._headers_to_split_on]: