feat #4479: TextLoader auto detect encoding and improved exceptions (#4927)

# TextLoader encoding auto-detection and enhanced exception handling

- Add an option to enable encoding detection on `TextLoader`.
- Detection is performed with `chardet`.
- Loading tries each detected encoding in order of confidence and
raises an exception if none of them succeeds.

### New Dependencies:
- `chardet`

Fixes #4479 

## Before submitting

<!-- If you're adding a new integration, include an integration test and
an example notebook showing its use! -->

## Who can review?

Community members can review the PR once tests pass. Tag
maintainers/contributors who might be interested:

- @eyurtsev

---------

Co-authored-by: blob42 <spike@w530>
This commit is contained in:
Eugene Yurtsev
2023-05-18 09:55:14 -04:00
committed by GitHub
parent 8c28ad6dac
commit e46202829f
8 changed files with 457 additions and 23 deletions

View File

@@ -89,6 +89,8 @@ gql = {version = "^3.4.1", optional = true}
pandas = {version = "^2.0.1", optional = true}
telethon = {version = "^1.28.5", optional = true}
zep-python = {version="^0.25", optional=true}
chardet = {version="^5.1.0", optional=true}
[tool.poetry.group.docs.dependencies]
autodoc_pydantic = "^1.8.0"
@@ -156,6 +158,7 @@ ruff = "^0.0.249"
types-toml = "^0.10.8.1"
types-redis = "^4.3.21.6"
black = "^23.1.0"
types-chardet = "^5.0.4.6"
[tool.poetry.group.typing.dependencies]
mypy = "^0.991"
@@ -174,6 +177,7 @@ setuptools = "^67.6.1"
llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
qdrant = ["qdrant-client"]
openai = ["openai", "tiktoken"]
text_helpers = ["chardet"]
cohere = ["cohere"]
in_memory_store = ["docarray"]
hnswlib = ["docarray", "protobuf", "hnswlib"]
@@ -185,6 +189,7 @@ all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "
# merge-conflicts
extended_testing = [
"beautifulsoup4",
"chardet",
"jq",
"pdfminer.six",
"pypdf",