From 95ee69a301621ba4ea23db752777c936ceef1426 Mon Sep 17 00:00:00 2001
From: i-w-a <65731397+i-w-a@users.noreply.github.com>
Date: Wed, 24 Jan 2024 11:20:29 +0900
Subject: [PATCH] langchain[patch]: In HTMLHeaderTextSplitter set default
 encoding to utf-8 (#16372)

- **Description:** The HTMLHeaderTextSplitter class now explicitly
specifies utf-8 encoding when the split_text_from_file method
constructs the lxml HTMLParser.
- **Issue:** Prevents garbled characters caused by differences in the
encoding of HTML files (this mainly affects non-English text; I noticed
the problem with Japanese).
- **Dependencies:** No dependencies.
- **Twitter handle:** @i_w__a
---
 libs/langchain/langchain/text_splitter.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libs/langchain/langchain/text_splitter.py b/libs/langchain/langchain/text_splitter.py
index cd6204adfc8..c4ece253204 100644
--- a/libs/langchain/langchain/text_splitter.py
+++ b/libs/langchain/langchain/text_splitter.py
@@ -598,7 +598,9 @@ class HTMLHeaderTextSplitter:
                 "Unable to import lxml, please install with `pip install lxml`."
             ) from e
         # use lxml library to parse html document and return xml ElementTree
-        parser = etree.HTMLParser()
+        # Explicitly specifying utf-8 encoding lets non-English
+        # html files be processed without garbled characters
+        parser = etree.HTMLParser(encoding="utf-8")
         tree = etree.parse(file, parser)
 
         # document transformation for "structure-aware" chunking is handled with xsl.