feat #4479: TextLoader auto detect encoding and improved exceptions (#4927)

# TextLoader auto detect encoding and enhanced exception handling

- Add an option to enable encoding detection on `TextLoader`.
- The detection is done using `chardet`.
- Loading tries each detected encoding in order of confidence and raises an exception if none of them succeeds.

### New Dependencies:

- `chardet`

Fixes #4479

## Before submitting

## Who can review?

Community members can review the PR once tests pass. Tag maintainers/contributors who might be interested:

- @eyurtsev

---------

Co-authored-by: blob42 <spike@w530>
2026-01-29 21:30:18 +00:00 · 2023-05-18 09:55:14 -04:00
parent 8c28ad6dac
commit e46202829f
8 changed files with 457 additions and 23 deletions
--- a/tests/unit_tests/document_loaders/test_detect_encoding.py
+++ b/tests/unit_tests/document_loaders/test_detect_encoding.py
@@ -0,0 +1,41 @@
+from pathlib import Path
+
+import pytest
+
+from langchain.document_loaders import DirectoryLoader, TextLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+
+
+@pytest.mark.requires("chardet")
+def test_loader_detect_encoding() -> None:
+    """Loading the examples dir must fail without autodetect and pass with it."""
+    path = Path(__file__).parent.parent / "examples"
+    files = path.glob("**/*.txt")  # lazy generator; consumed by the final assert
+    loader = DirectoryLoader(str(path), glob="**/*.txt", loader_cls=TextLoader)
+    loader_detect_encoding = DirectoryLoader(
+        str(path),
+        glob="**/*.txt",
+        loader_kwargs={"autodetect_encoding": True},
+        loader_cls=TextLoader,
+    )
+
+    with pytest.raises((UnicodeDecodeError, RuntimeError)):
+        loader.load()  # loader built without autodetect_encoding: must raise
+
+    docs = loader_detect_encoding.load()
+    assert len(docs) == len(list(files))  # one loaded doc per matched .txt file
+
+
+@pytest.mark.skip(reason="slow test")
+@pytest.mark.requires("chardet")
+def test_loader_detect_encoding_timeout(tmpdir: str) -> None:
+    path = Path(tmpdir)
+    file_path = str(path / "blob.txt")
+    # 2 MB blob of NUL bytes, used as the input for encoding detection
+    with open(file_path, "wb") as f:
+        f.write(b"\x00" * 2_000_000)
+
+    with pytest.raises(TimeoutError):
+        detect_file_encodings(file_path, timeout=1)  # expected to time out
+
+    detect_file_encodings(file_path, timeout=10)  # expected to finish in time
--- a/tests/unit_tests/examples/example-non-utf8.txt
+++ b/tests/unit_tests/examples/example-non-utf8.txt
@@ -0,0 +1 @@
+<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>-<2D><> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
--- a/tests/unit_tests/examples/example-utf8.txt
+++ b/tests/unit_tests/examples/example-utf8.txt
@@ -0,0 +1,6 @@
+Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
+incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
+nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
+fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
+culpa qui officia deserunt mollit anim id est laborum.
				`@@ -0,0 +1 @@`
				`<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>-<2D><> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>`