mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-27 08:58:48 +00:00
# TextLoader auto detect encoding and enhanced exception handling - Add an option to enable encoding detection on `TextLoader`. - The detection is done using `chardet` - The loading is done by trying all detected encodings by order of confidence or raise an exception otherwise. ### New Dependencies: - `chardet` Fixes #4479 ## Before submitting <!-- If you're adding a new integration, include an integration test and an example notebook showing its use! --> ## Who can review? Community members can review the PR once tests pass. Tag maintainers/contributors who might be interested: - @eyurtsev --------- Co-authored-by: blob42 <spike@w530>
This commit is contained in:
parent
8c28ad6dac
commit
e46202829f
@ -12,7 +12,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 1,
|
||||||
"id": "019d8520",
|
"id": "019d8520",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -151,7 +151,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 15,
|
||||||
"id": "81c92da3",
|
"id": "81c92da3",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -210,7 +210,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 14,
|
||||||
"id": "c558bd73",
|
"id": "c558bd73",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -259,13 +259,292 @@
|
|||||||
"len(docs)"
|
"len(docs)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "6411a0cb",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Auto detect file encodings with TextLoader\n",
|
||||||
|
"\n",
|
||||||
|
"In this example we will see some strategies that can be useful when loading a big list of arbitrary files from a directory using the `TextLoader` class.\n",
|
||||||
|
"\n",
|
||||||
|
"First to illustrate the problem, let's try to load multiple text with arbitrary encodings."
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 16,
|
||||||
"id": "6a91a0bc",
|
"id": "2c787a69",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": []
|
"source": [
|
||||||
|
"path = '../../../../../tests/integration_tests/examples'\n",
|
||||||
|
"loader = DirectoryLoader(path, glob=\"**/*.txt\", loader_cls=TextLoader)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "e9001e12",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### A. Default Behavior"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"id": "b1e88c31",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800000; text-decoration-color: #800000\">╭─────────────────────────────── </span><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">Traceback </span><span style=\"color: #bf7f7f; text-decoration-color: #bf7f7f; font-weight: bold\">(most recent call last)</span><span style=\"color: #800000; text-decoration-color: #800000\"> ────────────────────────────────╮</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/data/source/langchain/langchain/document_loaders/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">text.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">29</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">load</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">26 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ </span>text = <span style=\"color: #808000; text-decoration-color: #808000\">\"\"</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">27 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">with</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">open</span>(<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.file_path, encoding=<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.encoding) <span style=\"color: #0000ff; text-decoration-color: #0000ff\">as</span> f: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">28 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">try</span>: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>29 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ </span>text = f.read() <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">30 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">except</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">UnicodeDecodeError</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">as</span> e: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">31 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.autodetect_encoding: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">32 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ </span>detected_encodings = <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.detect_file_encodings() <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/home/spike/.pyenv/versions/3.9.11/lib/python3.9/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">codecs.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">322</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">decode</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 319 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">def</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">decode</span>(<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>, <span style=\"color: #00ffff; text-decoration-color: #00ffff\">input</span>, final=<span style=\"color: #0000ff; text-decoration-color: #0000ff\">False</span>): <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 320 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"># decode input (taking the buffer into account)</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 321 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ </span>data = <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.buffer + <span style=\"color: #00ffff; text-decoration-color: #00ffff\">input</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span> 322 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ </span>(result, consumed) = <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>._buffer_decode(data, <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.errors, final) <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 323 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"># keep undecoded input until the next call</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 324 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ </span><span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.buffer = data[consumed:] <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 325 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">return</span> result <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">╰──────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
|
||||||
|
"<span style=\"color: #ff0000; text-decoration-color: #ff0000; font-weight: bold\">UnicodeDecodeError: </span><span style=\"color: #008000; text-decoration-color: #008000\">'utf-8'</span> codec can't decode byte <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0xca</span> in position <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>: invalid continuation byte\n",
|
||||||
|
"\n",
|
||||||
|
"<span style=\"font-style: italic\">The above exception was the direct cause of the following exception:</span>\n",
|
||||||
|
"\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">╭─────────────────────────────── </span><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">Traceback </span><span style=\"color: #bf7f7f; text-decoration-color: #bf7f7f; font-weight: bold\">(most recent call last)</span><span style=\"color: #800000; text-decoration-color: #800000\"> ────────────────────────────────╮</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\"><module></span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">1</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>1 loader.load() <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">2 </span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/data/source/langchain/langchain/document_loaders/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">directory.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">84</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">load</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">81 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.silent_errors: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">82 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ │ </span>logger.warning(e) <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">83 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">else</span>: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>84 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">raise</span> e <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">85 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">finally</span>: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">86 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> pbar: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">87 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ │ </span>pbar.update(<span style=\"color: #0000ff; text-decoration-color: #0000ff\">1</span>) <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/data/source/langchain/langchain/document_loaders/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">directory.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">78</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">load</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">75 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> i.is_file(): <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">76 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> _is_visible(i.relative_to(p)) <span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">or</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.load_hidden: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">77 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">try</span>: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>78 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ </span>sub_docs = <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.loader_cls(<span style=\"color: #00ffff; text-decoration-color: #00ffff\">str</span>(i), **<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.loader_kwargs).load() <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">79 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ </span>docs.extend(sub_docs) <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">80 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">except</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">Exception</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">as</span> e: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">81 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.silent_errors: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/data/source/langchain/langchain/document_loaders/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">text.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">44</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">load</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">41 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">except</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">UnicodeDecodeError</span>: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">42 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">continue</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">43 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">else</span>: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>44 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">raise</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">RuntimeError</span>(<span style=\"color: #808000; text-decoration-color: #808000\">f\"Error loading {</span><span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.file_path<span style=\"color: #808000; text-decoration-color: #808000\">}\"</span>) <span style=\"color: #0000ff; text-decoration-color: #0000ff\">from</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff; text-decoration: underline\">e</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">45 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">except</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">Exception</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">as</span> e: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">46 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">raise</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">RuntimeError</span>(<span style=\"color: #808000; text-decoration-color: #808000\">f\"Error loading {</span><span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.file_path<span style=\"color: #808000; text-decoration-color: #808000\">}\"</span>) <span style=\"color: #0000ff; text-decoration-color: #0000ff\">from</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff; text-decoration: underline\">e</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">47 </span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000\">╰──────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
|
||||||
|
"<span style=\"color: #ff0000; text-decoration-color: #ff0000; font-weight: bold\">RuntimeError: </span>Error loading ..<span style=\"color: #800080; text-decoration-color: #800080\">/../../../../tests/integration_tests/examples/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">example-non-utf8.txt</span>\n",
|
||||||
|
"</pre>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mtext.py\u001b[0m:\u001b[94m29\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m26 \u001b[0m\u001b[2m│ │ \u001b[0mtext = \u001b[33m\"\u001b[0m\u001b[33m\"\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m27 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mwith\u001b[0m \u001b[96mopen\u001b[0m(\u001b[96mself\u001b[0m.file_path, encoding=\u001b[96mself\u001b[0m.encoding) \u001b[94mas\u001b[0m f: \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m28 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m29 \u001b[2m│ │ │ │ \u001b[0mtext = f.read() \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m30 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mUnicodeDecodeError\u001b[0m \u001b[94mas\u001b[0m e: \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m31 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.autodetect_encoding: \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m32 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mdetected_encodings = \u001b[96mself\u001b[0m.detect_file_encodings() \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2;33m/home/spike/.pyenv/versions/3.9.11/lib/python3.9/\u001b[0m\u001b[1;33mcodecs.py\u001b[0m:\u001b[94m322\u001b[0m in \u001b[92mdecode\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m 319 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92mdecode\u001b[0m(\u001b[96mself\u001b[0m, \u001b[96minput\u001b[0m, final=\u001b[94mFalse\u001b[0m): \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m 320 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# decode input (taking the buffer into account)\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m 321 \u001b[0m\u001b[2m│ │ \u001b[0mdata = \u001b[96mself\u001b[0m.buffer + \u001b[96minput\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 322 \u001b[2m│ │ \u001b[0m(result, consumed) = \u001b[96mself\u001b[0m._buffer_decode(data, \u001b[96mself\u001b[0m.errors, final) \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m 323 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# keep undecoded input until the next call\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m 324 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.buffer = data[consumed:] \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m 325 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m result \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n",
|
||||||
|
"\u001b[1;91mUnicodeDecodeError: \u001b[0m\u001b[32m'utf-8'\u001b[0m codec can't decode byte \u001b[1;36m0xca\u001b[0m in position \u001b[1;36m0\u001b[0m: invalid continuation byte\n",
|
||||||
|
"\n",
|
||||||
|
"\u001b[3mThe above exception was the direct cause of the following exception:\u001b[0m\n",
|
||||||
|
"\n",
|
||||||
|
"\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m in \u001b[92m<module>\u001b[0m:\u001b[94m1\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1 loader.load() \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m2 \u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mdirectory.py\u001b[0m:\u001b[94m84\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m81 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.silent_errors: \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m82 \u001b[0m\u001b[2m│ │ │ │ │ │ │ \u001b[0mlogger.warning(e) \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m83 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m84 \u001b[2m│ │ │ │ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m e \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m85 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mfinally\u001b[0m: \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m86 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m pbar: \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m87 \u001b[0m\u001b[2m│ │ │ │ │ │ │ \u001b[0mpbar.update(\u001b[94m1\u001b[0m) \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mdirectory.py\u001b[0m:\u001b[94m78\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m75 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mif\u001b[0m i.is_file(): \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m76 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mif\u001b[0m _is_visible(i.relative_to(p)) \u001b[95mor\u001b[0m \u001b[96mself\u001b[0m.load_hidden: \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m77 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m78 \u001b[2m│ │ │ │ │ │ \u001b[0msub_docs = \u001b[96mself\u001b[0m.loader_cls(\u001b[96mstr\u001b[0m(i), **\u001b[96mself\u001b[0m.loader_kwargs).load() \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m79 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0mdocs.extend(sub_docs) \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m80 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mException\u001b[0m \u001b[94mas\u001b[0m e: \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m81 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.silent_errors: \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mtext.py\u001b[0m:\u001b[94m44\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m41 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mUnicodeDecodeError\u001b[0m: \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m42 \u001b[0m\u001b[2m│ │ │ │ │ │ │ \u001b[0m\u001b[94mcontinue\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m43 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m44 \u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mRuntimeError\u001b[0m(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mError loading \u001b[0m\u001b[33m{\u001b[0m\u001b[96mself\u001b[0m.file_path\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[94mfrom\u001b[0m \u001b[4;96me\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m45 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mException\u001b[0m \u001b[94mas\u001b[0m e: \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m46 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mRuntimeError\u001b[0m(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mError loading \u001b[0m\u001b[33m{\u001b[0m\u001b[96mself\u001b[0m.file_path\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[94mfrom\u001b[0m \u001b[4;96me\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m│\u001b[0m \u001b[2m47 \u001b[0m \u001b[31m│\u001b[0m\n",
|
||||||
|
"\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n",
|
||||||
|
"\u001b[1;91mRuntimeError: \u001b[0mError loading ..\u001b[35m/../../../../tests/integration_tests/examples/\u001b[0m\u001b[95mexample-non-utf8.txt\u001b[0m\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "da554f9a",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The file `example-non-utf8.txt` uses a different encoding the `load()` function fails with a helpful message indicating which file failed decoding. \n",
|
||||||
|
"\n",
|
||||||
|
"With the default behavior of `TextLoader` any failure to load any of the documents will fail the whole loading process and no documents are loaded. "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "eb844b7e",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### B. Silent fail\n",
|
||||||
|
"\n",
|
||||||
|
"We can pass the parameter `silent_errors` to the `DirectoryLoader` to skip the files which could not be loaded and continue the load process."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 30,
|
||||||
|
"id": "5314ec39",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Error loading ../../../../../tests/integration_tests/examples/example-non-utf8.txt\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"loader = DirectoryLoader(path, glob=\"**/*.txt\", loader_cls=TextLoader, silent_errors=True)\n",
|
||||||
|
"docs = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 35,
|
||||||
|
"id": "216337e5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['../../../../../tests/integration_tests/examples/whatsapp_chat.txt',\n",
|
||||||
|
" '../../../../../tests/integration_tests/examples/example-utf8.txt']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 35,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc_sources = [doc.metadata['source'] for doc in docs]\n",
|
||||||
|
"doc_sources"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "4cba0e53",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### C. Auto detect encodings\n",
|
||||||
|
"\n",
|
||||||
|
"We can also ask `TextLoader` to auto detect the file encoding before failing, by passing the `autodetect_encoding` to the loader class."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 37,
|
||||||
|
"id": "96d527d2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"text_loader_kwargs={'autodetect_encoding': True}\n",
|
||||||
|
"loader = DirectoryLoader(path, glob=\"**/*.txt\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
|
||||||
|
"docs = loader.load()\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 38,
|
||||||
|
"id": "f1a136a5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['../../../../../tests/integration_tests/examples/example-non-utf8.txt',\n",
|
||||||
|
" '../../../../../tests/integration_tests/examples/whatsapp_chat.txt',\n",
|
||||||
|
" '../../../../../tests/integration_tests/examples/example-utf8.txt']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 38,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc_sources = [doc.metadata['source'] for doc in docs]\n",
|
||||||
|
"doc_sources"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
@ -284,7 +563,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.6"
|
"version": "3.9.11"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
37
langchain/document_loaders/helpers.py
Normal file
37
langchain/document_loaders/helpers.py
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
"""Document loader helpers."""
|
||||||
|
|
||||||
|
import concurrent.futures
|
||||||
|
from typing import List, NamedTuple, Optional, cast
|
||||||
|
|
||||||
|
|
||||||
|
class FileEncoding(NamedTuple):
|
||||||
|
encoding: Optional[str]
|
||||||
|
confidence: float
|
||||||
|
language: Optional[str]
|
||||||
|
|
||||||
|
|
||||||
|
def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
|
||||||
|
"""Try to detect the file encoding.
|
||||||
|
|
||||||
|
Returns a list of `FileEncoding` tuples with the detected encodings ordered
|
||||||
|
by confidence.
|
||||||
|
"""
|
||||||
|
import chardet
|
||||||
|
|
||||||
|
def read_and_detect(file_path: str) -> List[dict]:
|
||||||
|
with open(file_path, "rb") as f:
|
||||||
|
rawdata = f.read()
|
||||||
|
return cast(List[dict], chardet.detect_all(rawdata))
|
||||||
|
|
||||||
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||||
|
future = executor.submit(read_and_detect, file_path)
|
||||||
|
try:
|
||||||
|
encodings = future.result(timeout=timeout)
|
||||||
|
except concurrent.futures.TimeoutError:
|
||||||
|
raise TimeoutError(
|
||||||
|
f"Timeout reached while detecting encoding for {file_path}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if all(encoding["encoding"] is None for encoding in encodings):
|
||||||
|
raise RuntimeError(f"Could not detect encoding for {file_path}")
|
||||||
|
return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None]
|
@ -1,20 +1,59 @@
|
|||||||
|
import logging
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
from langchain.document_loaders.helpers import detect_file_encodings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class TextLoader(BaseLoader):
|
class TextLoader(BaseLoader):
|
||||||
"""Load text files."""
|
"""Load text files.
|
||||||
|
|
||||||
def __init__(self, file_path: str, encoding: Optional[str] = None):
|
|
||||||
|
Args:
|
||||||
|
file_path: Path to the file to load.
|
||||||
|
|
||||||
|
encoding: File encoding to use. If `None`, the file will be loaded
|
||||||
|
with the default system encoding.
|
||||||
|
|
||||||
|
autodetect_encoding: Whether to try to autodetect the file encoding
|
||||||
|
if the specified encoding fails.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
file_path: str,
|
||||||
|
encoding: Optional[str] = None,
|
||||||
|
autodetect_encoding: bool = False,
|
||||||
|
):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
|
self.autodetect_encoding = autodetect_encoding
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load from file path."""
|
"""Load from file path."""
|
||||||
with open(self.file_path, encoding=self.encoding) as f:
|
text = ""
|
||||||
text = f.read()
|
try:
|
||||||
|
with open(self.file_path, encoding=self.encoding) as f:
|
||||||
|
text = f.read()
|
||||||
|
except UnicodeDecodeError as e:
|
||||||
|
if self.autodetect_encoding:
|
||||||
|
detected_encodings = detect_file_encodings(self.file_path)
|
||||||
|
for encoding in detected_encodings:
|
||||||
|
logger.debug("Trying encoding: ", encoding.encoding)
|
||||||
|
try:
|
||||||
|
with open(self.file_path, encoding=encoding.encoding) as f:
|
||||||
|
text = f.read()
|
||||||
|
break
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"Error loading {self.file_path}") from e
|
||||||
|
except Exception as e:
|
||||||
|
raise RuntimeError(f"Error loading {self.file_path}") from e
|
||||||
|
|
||||||
metadata = {"source": self.file_path}
|
metadata = {"source": self.file_path}
|
||||||
return [Document(page_content=text, metadata=metadata)]
|
return [Document(page_content=text, metadata=metadata)]
|
||||||
|
50
poetry.lock
generated
50
poetry.lock
generated
@ -1022,6 +1022,18 @@ files = [
|
|||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
pycparser = "*"
|
pycparser = "*"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "chardet"
|
||||||
|
version = "5.1.0"
|
||||||
|
description = "Universal encoding detector for Python 3"
|
||||||
|
category = "main"
|
||||||
|
optional = true
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
files = [
|
||||||
|
{file = "chardet-5.1.0-py3-none-any.whl", hash = "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"},
|
||||||
|
{file = "chardet-5.1.0.tar.gz", hash = "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "charset-normalizer"
|
name = "charset-normalizer"
|
||||||
version = "3.1.0"
|
version = "3.1.0"
|
||||||
@ -1510,13 +1522,13 @@ files = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "deeplake"
|
name = "deeplake"
|
||||||
version = "3.4.4"
|
version = "3.5.0"
|
||||||
description = "Activeloop Deep Lake"
|
description = "Activeloop Deep Lake"
|
||||||
category = "main"
|
category = "main"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "*"
|
python-versions = "*"
|
||||||
files = [
|
files = [
|
||||||
{file = "deeplake-3.4.4.tar.gz", hash = "sha256:7d044a89862fe6aa4a93abc7db90090be7ecfc611169916e3975e0cd187be1ac"},
|
{file = "deeplake-3.5.0.tar.gz", hash = "sha256:ae640f75b1fec4eed9598a4e8d6b80e7243af87d9f39f0d34f01ce2c6f7c194f"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@ -3138,13 +3150,13 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "jcloud"
|
name = "jcloud"
|
||||||
version = "0.2.8"
|
version = "0.2.9"
|
||||||
description = "Simplify deploying and managing Jina projects on Jina Cloud"
|
description = "Simplify deploying and managing Jina projects on Jina Cloud"
|
||||||
category = "main"
|
category = "main"
|
||||||
optional = true
|
optional = true
|
||||||
python-versions = "*"
|
python-versions = "*"
|
||||||
files = [
|
files = [
|
||||||
{file = "jcloud-0.2.8.tar.gz", hash = "sha256:cdd30c85c0a857573651ebc329f52a8de9e43c3e0f276dc85975914006295639"},
|
{file = "jcloud-0.2.9.tar.gz", hash = "sha256:eaf8da685f8907e153ff752e6a4b945aeff548b8d94dfd650a035d8450ed547e"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@ -3297,14 +3309,14 @@ websockets = ["websockets"]
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "jina-hubble-sdk"
|
name = "jina-hubble-sdk"
|
||||||
version = "0.36.0"
|
version = "0.37.1"
|
||||||
description = "SDK for Hubble API at Jina AI."
|
description = "SDK for Hubble API at Jina AI."
|
||||||
category = "main"
|
category = "main"
|
||||||
optional = true
|
optional = true
|
||||||
python-versions = ">=3.7.0"
|
python-versions = ">=3.7.0"
|
||||||
files = [
|
files = [
|
||||||
{file = "jina-hubble-sdk-0.36.0.tar.gz", hash = "sha256:ba1a72c7a5c14963fdad9af1ff4c3bba26a03ddcced08111bd11a95b153249ec"},
|
{file = "jina-hubble-sdk-0.37.1.tar.gz", hash = "sha256:5b6bd9e13f97c8c77be822e9ae49f87a0f16ef8195011f25db2552006a5ca2a0"},
|
||||||
{file = "jina_hubble_sdk-0.36.0-py3-none-any.whl", hash = "sha256:56db142147d7c72142ed1b6505020c3b4d8070b1603d07ff796e56d491dde294"},
|
{file = "jina_hubble_sdk-0.37.1-py3-none-any.whl", hash = "sha256:bc54a60ed120508e231fbed28b0fff394c288312d2ffa865c7865c03dbbbb502"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@ -6608,6 +6620,7 @@ files = [
|
|||||||
{file = "pylance-0.4.12-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:2b86fb8dccc03094c0db37bef0d91bda60e8eb0d1eddf245c6971450c8d8a53f"},
|
{file = "pylance-0.4.12-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:2b86fb8dccc03094c0db37bef0d91bda60e8eb0d1eddf245c6971450c8d8a53f"},
|
||||||
{file = "pylance-0.4.12-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bc82914b13204187d673b5f3d45f93219c38a0e9d0542ba251074f639669789"},
|
{file = "pylance-0.4.12-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bc82914b13204187d673b5f3d45f93219c38a0e9d0542ba251074f639669789"},
|
||||||
{file = "pylance-0.4.12-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a4bcce77f99ecd4cbebbadb01e58d5d8138d40eb56bdcdbc3b20b0475e7a472"},
|
{file = "pylance-0.4.12-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a4bcce77f99ecd4cbebbadb01e58d5d8138d40eb56bdcdbc3b20b0475e7a472"},
|
||||||
|
{file = "pylance-0.4.12-cp38-abi3-win_amd64.whl", hash = "sha256:9616931c5300030adb9626d22515710a127d1e46a46737a7a0f980b52f13627c"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@ -9367,6 +9380,18 @@ dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2
|
|||||||
doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"]
|
doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"]
|
||||||
test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
|
test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "types-chardet"
|
||||||
|
version = "5.0.4.6"
|
||||||
|
description = "Typing stubs for chardet"
|
||||||
|
category = "dev"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
files = [
|
||||||
|
{file = "types-chardet-5.0.4.6.tar.gz", hash = "sha256:caf4c74cd13ccfd8b3313c314aba943b159de562a2573ed03137402b2bb37818"},
|
||||||
|
{file = "types_chardet-5.0.4.6-py3-none-any.whl", hash = "sha256:ea832d87e798abf1e4dfc73767807c2b7fee35d0003ae90348aea4ae00fb004d"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "types-pyopenssl"
|
name = "types-pyopenssl"
|
||||||
version = "23.1.0.3"
|
version = "23.1.0.3"
|
||||||
@ -9762,14 +9787,14 @@ files = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "weaviate-client"
|
name = "weaviate-client"
|
||||||
version = "3.18.0"
|
version = "3.19.1"
|
||||||
description = "A python native weaviate client"
|
description = "A python native weaviate client"
|
||||||
category = "main"
|
category = "main"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
{file = "weaviate-client-3.18.0.tar.gz", hash = "sha256:423a526518a32505c5293328e5f252e6cbbf20e4b3124733f70d10fc0d6823c9"},
|
{file = "weaviate-client-3.19.1.tar.gz", hash = "sha256:8528926b0b545225ab75583481d67cccf9494d2dc01cb62f1165a8f187b41ebb"},
|
||||||
{file = "weaviate_client-3.18.0-py3-none-any.whl", hash = "sha256:42b324286a4b4436317e5d2c6ba48c07da6cf01518efdd47ee097e7a8cc7584c"},
|
{file = "weaviate_client-3.19.1-py3-none-any.whl", hash = "sha256:6b4aae86cd955543fcc6328bc1462fbbae053dc50f7b6822287b05b98fec0d27"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@ -10309,14 +10334,15 @@ all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api
|
|||||||
azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"]
|
azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"]
|
||||||
cohere = ["cohere"]
|
cohere = ["cohere"]
|
||||||
embeddings = ["sentence-transformers"]
|
embeddings = ["sentence-transformers"]
|
||||||
extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "jq", "lxml", "pandas", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "telethon", "tqdm", "zep-python"]
|
extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "chardet", "jq", "lxml", "pandas", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "telethon", "tqdm", "zep-python"]
|
||||||
hnswlib = ["docarray", "hnswlib", "protobuf"]
|
hnswlib = ["docarray", "hnswlib", "protobuf"]
|
||||||
in-memory-store = ["docarray"]
|
in-memory-store = ["docarray"]
|
||||||
llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
|
llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
|
||||||
openai = ["openai", "tiktoken"]
|
openai = ["openai", "tiktoken"]
|
||||||
qdrant = ["qdrant-client"]
|
qdrant = ["qdrant-client"]
|
||||||
|
text-helpers = ["chardet"]
|
||||||
|
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = ">=3.8.1,<4.0"
|
python-versions = ">=3.8.1,<4.0"
|
||||||
content-hash = "055d65314e800e0731b086471d357e5fb6bbd265ee9fa2bd3762470152cc3b85"
|
content-hash = "2b19b9deca7f83ca14af1f7bc7808bbe7873a91ce4c95381eaad8ea84fe04c0b"
|
||||||
|
@ -89,6 +89,8 @@ gql = {version = "^3.4.1", optional = true}
|
|||||||
pandas = {version = "^2.0.1", optional = true}
|
pandas = {version = "^2.0.1", optional = true}
|
||||||
telethon = {version = "^1.28.5", optional = true}
|
telethon = {version = "^1.28.5", optional = true}
|
||||||
zep-python = {version="^0.25", optional=true}
|
zep-python = {version="^0.25", optional=true}
|
||||||
|
chardet = {version="^5.1.0", optional=true}
|
||||||
|
|
||||||
|
|
||||||
[tool.poetry.group.docs.dependencies]
|
[tool.poetry.group.docs.dependencies]
|
||||||
autodoc_pydantic = "^1.8.0"
|
autodoc_pydantic = "^1.8.0"
|
||||||
@ -156,6 +158,7 @@ ruff = "^0.0.249"
|
|||||||
types-toml = "^0.10.8.1"
|
types-toml = "^0.10.8.1"
|
||||||
types-redis = "^4.3.21.6"
|
types-redis = "^4.3.21.6"
|
||||||
black = "^23.1.0"
|
black = "^23.1.0"
|
||||||
|
types-chardet = "^5.0.4.6"
|
||||||
|
|
||||||
[tool.poetry.group.typing.dependencies]
|
[tool.poetry.group.typing.dependencies]
|
||||||
mypy = "^0.991"
|
mypy = "^0.991"
|
||||||
@ -174,6 +177,7 @@ setuptools = "^67.6.1"
|
|||||||
llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
|
llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
|
||||||
qdrant = ["qdrant-client"]
|
qdrant = ["qdrant-client"]
|
||||||
openai = ["openai", "tiktoken"]
|
openai = ["openai", "tiktoken"]
|
||||||
|
text_helpers = ["chardet"]
|
||||||
cohere = ["cohere"]
|
cohere = ["cohere"]
|
||||||
in_memory_store = ["docarray"]
|
in_memory_store = ["docarray"]
|
||||||
hnswlib = ["docarray", "protobuf", "hnswlib"]
|
hnswlib = ["docarray", "protobuf", "hnswlib"]
|
||||||
@ -185,6 +189,7 @@ all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "
|
|||||||
# merge-conflicts
|
# merge-conflicts
|
||||||
extended_testing = [
|
extended_testing = [
|
||||||
"beautifulsoup4",
|
"beautifulsoup4",
|
||||||
|
"chardet",
|
||||||
"jq",
|
"jq",
|
||||||
"pdfminer.six",
|
"pdfminer.six",
|
||||||
"pypdf",
|
"pypdf",
|
||||||
|
41
tests/unit_tests/document_loaders/test_detect_encoding.py
Normal file
41
tests/unit_tests/document_loaders/test_detect_encoding.py
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from langchain.document_loaders import DirectoryLoader, TextLoader
|
||||||
|
from langchain.document_loaders.helpers import detect_file_encodings
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("chardet")
|
||||||
|
def test_loader_detect_encoding() -> None:
|
||||||
|
"""Test text loader."""
|
||||||
|
path = Path(__file__).parent.parent / "examples"
|
||||||
|
files = path.glob("**/*.txt")
|
||||||
|
loader = DirectoryLoader(str(path), glob="**/*.txt", loader_cls=TextLoader)
|
||||||
|
loader_detect_encoding = DirectoryLoader(
|
||||||
|
str(path),
|
||||||
|
glob="**/*.txt",
|
||||||
|
loader_kwargs={"autodetect_encoding": True},
|
||||||
|
loader_cls=TextLoader,
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises((UnicodeDecodeError, RuntimeError)):
|
||||||
|
loader.load()
|
||||||
|
|
||||||
|
docs = loader_detect_encoding.load()
|
||||||
|
assert len(docs) == len(list(files))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="slow test")
|
||||||
|
@pytest.mark.requires("chardet")
|
||||||
|
def test_loader_detect_encoding_timeout(tmpdir: str) -> None:
|
||||||
|
path = Path(tmpdir)
|
||||||
|
file_path = str(path / "blob.txt")
|
||||||
|
# 2mb binary blob
|
||||||
|
with open(file_path, "wb") as f:
|
||||||
|
f.write(b"\x00" * 2_000_000)
|
||||||
|
|
||||||
|
with pytest.raises(TimeoutError):
|
||||||
|
detect_file_encodings(file_path, timeout=1)
|
||||||
|
|
||||||
|
detect_file_encodings(file_path, timeout=10)
|
1
tests/unit_tests/examples/example-non-utf8.txt
Normal file
1
tests/unit_tests/examples/example-non-utf8.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
Какие-то кракозябры
|
6
tests/unit_tests/examples/example-utf8.txt
Normal file
6
tests/unit_tests/examples/example-utf8.txt
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
|
||||||
|
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
|
||||||
|
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
|
||||||
|
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
|
||||||
|
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
|
||||||
|
culpa qui officia deserunt mollit anim id est laborum.
|
Loading…
Reference in New Issue
Block a user