mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-01 09:04:03 +00:00
fix(langchain): class HTMLSemanticPreservingSplitter ignores the text inside the div tag (#32213)
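**Description:** The splitter now collects the text that sits directly inside "html", "body", "div", and "main" nodes (when there is any), normalizes it, and adds it to the current content instead of ignoring it. **Issue:** Fixes #32206.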
This commit is contained in:
parent
56dde3ade3
commit
622bb05751
@@ -842,6 +842,10 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
||||
preserved_elements,
|
||||
placeholder_count,
|
||||
)
|
||||
content = " ".join(elem.find_all(string=True, recursive=False))
|
||||
if content:
|
||||
content = self._normalize_and_clean_text(content)
|
||||
current_content.append(content)
|
||||
continue
|
||||
|
||||
if elem.name in [h[0] for h in self._headers_to_split_on]:
|
||||
|
Loading…
Reference in New Issue
Block a user