From 95ee69a301621ba4ea23db752777c936ceef1426 Mon Sep 17 00:00:00 2001 From: i-w-a <65731397+i-w-a@users.noreply.github.com> Date: Wed, 24 Jan 2024 11:20:29 +0900 Subject: [PATCH] langchain[patch]: In HTMLHeaderTextSplitter set default encoding to utf-8 (#16372) - **Description:** The HTMLHeaderTextSplitter class now explicitly specifies utf-8 encoding where the split_text_from_file method creates the lxml HTMLParser. - **Issue:** Prevents garbled characters caused by differences in the encoding of HTML files (English text is unaffected; I noticed the problem with Japanese in particular). - **Dependencies:** No dependencies. - **Twitter handle:** @i_w__a --- libs/langchain/langchain/text_splitter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libs/langchain/langchain/text_splitter.py b/libs/langchain/langchain/text_splitter.py index cd6204adfc8..c4ece253204 100644 --- a/libs/langchain/langchain/text_splitter.py +++ b/libs/langchain/langchain/text_splitter.py @@ -598,7 +598,9 @@ class HTMLHeaderTextSplitter: "Unable to import lxml, please install with `pip install lxml`." ) from e # use lxml library to parse html document and return xml ElementTree - parser = etree.HTMLParser() + # Explicitly encoding in utf-8 allows non-English + # html files to be processed without garbled characters + parser = etree.HTMLParser(encoding="utf-8") tree = etree.parse(file, parser) # document transformation for "structure-aware" chunking is handled with xsl.