fix(langchain): class HTMLSemanticPreservingSplitter ignores the text inside the div tag (#32213)

**Description:** `HTMLSemanticPreservingSplitter` now collects the text that
appears directly inside the "html", "body", "div", and "main" nodes, if they
contain any, instead of ignoring it.

**Issue:** Fixes #32206.
This commit is contained in:
tanwirahmad 2025-07-24 17:09:03 +03:00 committed by GitHub
parent 56dde3ade3
commit 622bb05751
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -842,6 +842,10 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
preserved_elements,
placeholder_count,
)
content = " ".join(elem.find_all(string=True, recursive=False))
if content:
content = self._normalize_and_clean_text(content)
current_content.append(content)
continue
if elem.name in [h[0] for h in self._headers_to_split_on]: