mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-09 13:00:34 +00:00
fix(langchain): class HTMLSemanticPreservingSplitter ignores the text inside the div tag (#32213)
**Description:** We now collect the direct text of the "html", "body", "div", and "main" nodes, when they contain any. **Issue:** Fixes #32206.
This commit is contained in:
parent
56dde3ade3
commit
622bb05751
@ -842,6 +842,10 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
|||||||
preserved_elements,
|
preserved_elements,
|
||||||
placeholder_count,
|
placeholder_count,
|
||||||
)
|
)
|
||||||
|
content = " ".join(elem.find_all(string=True, recursive=False))
|
||||||
|
if content:
|
||||||
|
content = self._normalize_and_clean_text(content)
|
||||||
|
current_content.append(content)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if elem.name in [h[0] for h in self._headers_to_split_on]:
|
if elem.name in [h[0] for h in self._headers_to_split_on]:
|
||||||
|
Loading…
Reference in New Issue
Block a user