mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-28 06:48:50 +00:00
chore: update branch with changes from master (#32277)
Co-authored-by: Maxime Grenu <69890511+cluster2600@users.noreply.github.com> Co-authored-by: Claude <claude@anthropic.com> Co-authored-by: Claude <noreply@anthropic.com> Co-authored-by: jmaillefaud <jonathan.maillefaud@evooq.ch> Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com> Co-authored-by: tanwirahmad <tanwirahmad@users.noreply.github.com> Co-authored-by: Christophe Bornet <cbornet@hotmail.com> Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> Co-authored-by: niceg <79145285+growmuye@users.noreply.github.com> Co-authored-by: Chaitanya varma <varmac301@gmail.com> Co-authored-by: dishaprakash <57954147+dishaprakash@users.noreply.github.com> Co-authored-by: Chester Curme <chester.curme@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Kanav Bansal <13186335+bansalkanav@users.noreply.github.com> Co-authored-by: Aleksandr Filippov <71711753+alex-feel@users.noreply.github.com> Co-authored-by: Alex Feel <afilippov@spotware.com>
This commit is contained in:
@@ -107,6 +107,7 @@ class HTMLHeaderTextSplitter:
|
||||
# content="Conclusion"
|
||||
# - Document with metadata={"Main Topic": "Conclusion"} and
|
||||
# content="Final thoughts."
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -562,6 +563,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
||||
preserve_images=True,
|
||||
custom_handlers={"iframe": custom_iframe_extractor}
|
||||
)
|
||||
|
||||
""" # noqa: E501, D214
|
||||
|
||||
def __init__(
|
||||
@@ -842,6 +844,10 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
||||
preserved_elements,
|
||||
placeholder_count,
|
||||
)
|
||||
content = " ".join(elem.find_all(string=True, recursive=False))
|
||||
if content:
|
||||
content = self._normalize_and_clean_text(content)
|
||||
current_content.append(content)
|
||||
continue
|
||||
|
||||
if elem.name in [h[0] for h in self._headers_to_split_on]:
|
||||
|
Reference in New Issue
Block a user