mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-25 13:07:58 +00:00
# TextLoader: auto-detect encoding and enhanced exception handling

- Add an option to enable encoding detection on `TextLoader`.
- The detection is done using `chardet`.
- The loading is done by trying all detected encodings in order of confidence; if none of them succeeds, an exception is raised.

### New Dependencies:

- `chardet`

Fixes #4479

## Before submitting

<!-- If you're adding a new integration, include an integration test and an example notebook showing its use! -->

## Who can review?

Community members can review the PR once tests pass. Tag maintainers/contributors who might be interested:

- @eyurtsev

---------

Co-authored-by: blob42 <spike@w530>
This commit is contained in:
41
tests/unit_tests/document_loaders/test_detect_encoding.py
Normal file
41
tests/unit_tests/document_loaders/test_detect_encoding.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.document_loaders import DirectoryLoader, TextLoader
|
||||
from langchain.document_loaders.helpers import detect_file_encodings
|
||||
|
||||
|
||||
@pytest.mark.requires("chardet")
def test_loader_detect_encoding() -> None:
    """Compare TextLoader with and without encoding autodetection.

    Two directory loaders are built over the same ``examples`` folder and
    the same ``**/*.txt`` pattern. The one using TextLoader defaults is
    expected to raise ``UnicodeDecodeError`` or ``RuntimeError``; the one
    configured with ``autodetect_encoding=True`` is expected to return one
    document per matching file.
    """
    examples_dir = Path(__file__).parent.parent / "examples"
    pattern = "**/*.txt"
    # Lazy generator: it is only walked when the final count is computed.
    txt_files = examples_dir.glob(pattern)

    strict_loader = DirectoryLoader(
        str(examples_dir), glob=pattern, loader_cls=TextLoader
    )
    detecting_loader = DirectoryLoader(
        str(examples_dir),
        glob=pattern,
        loader_kwargs={"autodetect_encoding": True},
        loader_cls=TextLoader,
    )

    # Without autodetection the load is expected to fail on decoding.
    with pytest.raises((UnicodeDecodeError, RuntimeError)):
        strict_loader.load()

    loaded_docs = detecting_loader.load()
    expected_count = len(list(txt_files))
    assert len(loaded_docs) == expected_count
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="slow test")
@pytest.mark.requires("chardet")
def test_loader_detect_encoding_timeout(tmpdir: str) -> None:
    """Exercise the ``timeout`` argument of ``detect_file_encodings``.

    A file of 2,000,000 NUL bytes is written into the temporary directory.
    Detection with ``timeout=1`` is expected to raise ``TimeoutError``;
    the same call with ``timeout=10`` is expected to return without raising.
    """
    blob_file = Path(tmpdir) / "blob.txt"
    # 2mb binary blob
    blob_file.write_bytes(b"\x00" * 2_000_000)
    blob_path = str(blob_file)

    with pytest.raises(TimeoutError):
        detect_file_encodings(blob_path, timeout=1)

    # A larger budget must let detection finish; any exception fails the test.
    detect_file_encodings(blob_path, timeout=10)
|
1
tests/unit_tests/examples/example-non-utf8.txt
Normal file
1
tests/unit_tests/examples/example-non-utf8.txt
Normal file
@@ -0,0 +1 @@
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>-<2D><> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
6
tests/unit_tests/examples/example-utf8.txt
Normal file
6
tests/unit_tests/examples/example-utf8.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
|
||||
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
|
||||
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
|
||||
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
|
||||
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
|
||||
culpa qui officia deserunt mollit anim id est laborum.
|
Reference in New Issue
Block a user