mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-27 08:58:48 +00:00
# TextLoader auto detect encoding and enhanced exception handling - Add an option to enable encoding detection on `TextLoader`. - The detection is done using `chardet` - The loading is done by trying all detected encodings by order of confidence or raise an exception otherwise. ### New Dependencies: - `chardet` Fixes #4479 ## Before submitting <!-- If you're adding a new integration, include an integration test and an example notebook showing its use! --> ## Who can review? Community members can review the PR once tests pass. Tag maintainers/contributors who might be interested: - @eyurtsev --------- Co-authored-by: blob42 <spike@w530>
This commit is contained in:
parent
8c28ad6dac
commit
e46202829f
@ -12,7 +12,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 1,
|
||||
"id": "019d8520",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -151,7 +151,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 15,
|
||||
"id": "81c92da3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -210,7 +210,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 14,
|
||||
"id": "c558bd73",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -259,13 +259,292 @@
|
||||
"len(docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6411a0cb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Auto detect file encodings with TextLoader\n",
|
||||
"\n",
|
||||
"In this example we will see some strategies that can be useful when loading a big list of arbitrary files from a directory using the `TextLoader` class.\n",
|
||||
"\n",
|
||||
"First to illustrate the problem, let's try to load multiple text with arbitrary encodings."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6a91a0bc",
|
||||
"execution_count": 16,
|
||||
"id": "2c787a69",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"path = '../../../../../tests/integration_tests/examples'\n",
|
||||
"loader = DirectoryLoader(path, glob=\"**/*.txt\", loader_cls=TextLoader)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e9001e12",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### A. Default Behavior"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "b1e88c31",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800000; text-decoration-color: #800000\">╭─────────────────────────────── </span><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">Traceback </span><span style=\"color: #bf7f7f; text-decoration-color: #bf7f7f; font-weight: bold\">(most recent call last)</span><span style=\"color: #800000; text-decoration-color: #800000\"> ────────────────────────────────╮</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/data/source/langchain/langchain/document_loaders/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">text.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">29</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">load</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">26 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ </span>text = <span style=\"color: #808000; text-decoration-color: #808000\">\"\"</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">27 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">with</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">open</span>(<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.file_path, encoding=<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.encoding) <span style=\"color: #0000ff; text-decoration-color: #0000ff\">as</span> f: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">28 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">try</span>: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>29 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ </span>text = f.read() <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">30 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">except</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">UnicodeDecodeError</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">as</span> e: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">31 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.autodetect_encoding: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">32 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ </span>detected_encodings = <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.detect_file_encodings() <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/home/spike/.pyenv/versions/3.9.11/lib/python3.9/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">codecs.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">322</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">decode</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 319 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">def</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">decode</span>(<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>, <span style=\"color: #00ffff; text-decoration-color: #00ffff\">input</span>, final=<span style=\"color: #0000ff; text-decoration-color: #0000ff\">False</span>): <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 320 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"># decode input (taking the buffer into account)</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 321 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ </span>data = <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.buffer + <span style=\"color: #00ffff; text-decoration-color: #00ffff\">input</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span> 322 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ </span>(result, consumed) = <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>._buffer_decode(data, <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.errors, final) <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 323 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"># keep undecoded input until the next call</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 324 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ </span><span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.buffer = data[consumed:] <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 325 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">return</span> result <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">╰──────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
|
||||
"<span style=\"color: #ff0000; text-decoration-color: #ff0000; font-weight: bold\">UnicodeDecodeError: </span><span style=\"color: #008000; text-decoration-color: #008000\">'utf-8'</span> codec can't decode byte <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0xca</span> in position <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>: invalid continuation byte\n",
|
||||
"\n",
|
||||
"<span style=\"font-style: italic\">The above exception was the direct cause of the following exception:</span>\n",
|
||||
"\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">╭─────────────────────────────── </span><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">Traceback </span><span style=\"color: #bf7f7f; text-decoration-color: #bf7f7f; font-weight: bold\">(most recent call last)</span><span style=\"color: #800000; text-decoration-color: #800000\"> ────────────────────────────────╮</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\"><module></span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">1</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>1 loader.load() <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">2 </span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/data/source/langchain/langchain/document_loaders/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">directory.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">84</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">load</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">81 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.silent_errors: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">82 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ │ </span>logger.warning(e) <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">83 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">else</span>: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>84 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">raise</span> e <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">85 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">finally</span>: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">86 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> pbar: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">87 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ │ </span>pbar.update(<span style=\"color: #0000ff; text-decoration-color: #0000ff\">1</span>) <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/data/source/langchain/langchain/document_loaders/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">directory.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">78</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">load</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">75 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> i.is_file(): <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">76 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> _is_visible(i.relative_to(p)) <span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">or</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.load_hidden: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">77 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">try</span>: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>78 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ </span>sub_docs = <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.loader_cls(<span style=\"color: #00ffff; text-decoration-color: #00ffff\">str</span>(i), **<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.loader_kwargs).load() <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">79 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ </span>docs.extend(sub_docs) <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">80 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">except</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">Exception</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">as</span> e: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">81 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.silent_errors: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/data/source/langchain/langchain/document_loaders/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">text.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">44</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">load</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">41 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">except</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">UnicodeDecodeError</span>: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">42 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">continue</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">43 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">else</span>: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>44 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">raise</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">RuntimeError</span>(<span style=\"color: #808000; text-decoration-color: #808000\">f\"Error loading {</span><span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.file_path<span style=\"color: #808000; text-decoration-color: #808000\">}\"</span>) <span style=\"color: #0000ff; text-decoration-color: #0000ff\">from</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff; text-decoration: underline\">e</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">45 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">except</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">Exception</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">as</span> e: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">46 </span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">raise</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">RuntimeError</span>(<span style=\"color: #808000; text-decoration-color: #808000\">f\"Error loading {</span><span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.file_path<span style=\"color: #808000; text-decoration-color: #808000\">}\"</span>) <span style=\"color: #0000ff; text-decoration-color: #0000ff\">from</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff; text-decoration: underline\">e</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">47 </span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
|
||||
"<span style=\"color: #800000; text-decoration-color: #800000\">╰──────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
|
||||
"<span style=\"color: #ff0000; text-decoration-color: #ff0000; font-weight: bold\">RuntimeError: </span>Error loading ..<span style=\"color: #800080; text-decoration-color: #800080\">/../../../../tests/integration_tests/examples/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">example-non-utf8.txt</span>\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mtext.py\u001b[0m:\u001b[94m29\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m26 \u001b[0m\u001b[2m│ │ \u001b[0mtext = \u001b[33m\"\u001b[0m\u001b[33m\"\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m27 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mwith\u001b[0m \u001b[96mopen\u001b[0m(\u001b[96mself\u001b[0m.file_path, encoding=\u001b[96mself\u001b[0m.encoding) \u001b[94mas\u001b[0m f: \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m28 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m29 \u001b[2m│ │ │ │ \u001b[0mtext = f.read() \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m30 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mUnicodeDecodeError\u001b[0m \u001b[94mas\u001b[0m e: \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m31 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.autodetect_encoding: \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m32 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mdetected_encodings = \u001b[96mself\u001b[0m.detect_file_encodings() \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2;33m/home/spike/.pyenv/versions/3.9.11/lib/python3.9/\u001b[0m\u001b[1;33mcodecs.py\u001b[0m:\u001b[94m322\u001b[0m in \u001b[92mdecode\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m 319 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92mdecode\u001b[0m(\u001b[96mself\u001b[0m, \u001b[96minput\u001b[0m, final=\u001b[94mFalse\u001b[0m): \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m 320 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# decode input (taking the buffer into account)\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m 321 \u001b[0m\u001b[2m│ │ \u001b[0mdata = \u001b[96mself\u001b[0m.buffer + \u001b[96minput\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 322 \u001b[2m│ │ \u001b[0m(result, consumed) = \u001b[96mself\u001b[0m._buffer_decode(data, \u001b[96mself\u001b[0m.errors, final) \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m 323 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# keep undecoded input until the next call\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m 324 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.buffer = data[consumed:] \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m 325 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m result \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n",
|
||||
"\u001b[1;91mUnicodeDecodeError: \u001b[0m\u001b[32m'utf-8'\u001b[0m codec can't decode byte \u001b[1;36m0xca\u001b[0m in position \u001b[1;36m0\u001b[0m: invalid continuation byte\n",
|
||||
"\n",
|
||||
"\u001b[3mThe above exception was the direct cause of the following exception:\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m in \u001b[92m<module>\u001b[0m:\u001b[94m1\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1 loader.load() \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m2 \u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mdirectory.py\u001b[0m:\u001b[94m84\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m81 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.silent_errors: \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m82 \u001b[0m\u001b[2m│ │ │ │ │ │ │ \u001b[0mlogger.warning(e) \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m83 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m84 \u001b[2m│ │ │ │ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m e \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m85 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mfinally\u001b[0m: \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m86 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m pbar: \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m87 \u001b[0m\u001b[2m│ │ │ │ │ │ │ \u001b[0mpbar.update(\u001b[94m1\u001b[0m) \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mdirectory.py\u001b[0m:\u001b[94m78\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m75 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mif\u001b[0m i.is_file(): \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m76 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mif\u001b[0m _is_visible(i.relative_to(p)) \u001b[95mor\u001b[0m \u001b[96mself\u001b[0m.load_hidden: \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m77 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m78 \u001b[2m│ │ │ │ │ │ \u001b[0msub_docs = \u001b[96mself\u001b[0m.loader_cls(\u001b[96mstr\u001b[0m(i), **\u001b[96mself\u001b[0m.loader_kwargs).load() \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m79 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0mdocs.extend(sub_docs) \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m80 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mException\u001b[0m \u001b[94mas\u001b[0m e: \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m81 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.silent_errors: \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mtext.py\u001b[0m:\u001b[94m44\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m41 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mUnicodeDecodeError\u001b[0m: \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m42 \u001b[0m\u001b[2m│ │ │ │ │ │ │ \u001b[0m\u001b[94mcontinue\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m43 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m44 \u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mRuntimeError\u001b[0m(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mError loading \u001b[0m\u001b[33m{\u001b[0m\u001b[96mself\u001b[0m.file_path\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[94mfrom\u001b[0m \u001b[4;96me\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m45 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mException\u001b[0m \u001b[94mas\u001b[0m e: \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m46 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mRuntimeError\u001b[0m(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mError loading \u001b[0m\u001b[33m{\u001b[0m\u001b[96mself\u001b[0m.file_path\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[94mfrom\u001b[0m \u001b[4;96me\u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m│\u001b[0m \u001b[2m47 \u001b[0m \u001b[31m│\u001b[0m\n",
|
||||
"\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n",
|
||||
"\u001b[1;91mRuntimeError: \u001b[0mError loading ..\u001b[35m/../../../../tests/integration_tests/examples/\u001b[0m\u001b[95mexample-non-utf8.txt\u001b[0m\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "da554f9a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The file `example-non-utf8.txt` uses a different encoding the `load()` function fails with a helpful message indicating which file failed decoding. \n",
|
||||
"\n",
|
||||
"With the default behavior of `TextLoader` any failure to load any of the documents will fail the whole loading process and no documents are loaded. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "eb844b7e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### B. Silent fail\n",
|
||||
"\n",
|
||||
"We can pass the parameter `silent_errors` to the `DirectoryLoader` to skip the files which could not be loaded and continue the load process."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"id": "5314ec39",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error loading ../../../../../tests/integration_tests/examples/example-non-utf8.txt\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader = DirectoryLoader(path, glob=\"**/*.txt\", loader_cls=TextLoader, silent_errors=True)\n",
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"id": "216337e5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['../../../../../tests/integration_tests/examples/whatsapp_chat.txt',\n",
|
||||
" '../../../../../tests/integration_tests/examples/example-utf8.txt']"
|
||||
]
|
||||
},
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"doc_sources = [doc.metadata['source'] for doc in docs]\n",
|
||||
"doc_sources"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4cba0e53",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### C. Auto detect encodings\n",
|
||||
"\n",
|
||||
"We can also ask `TextLoader` to auto detect the file encoding before failing, by passing the `autodetect_encoding` to the loader class."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"id": "96d527d2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text_loader_kwargs={'autodetect_encoding': True}\n",
|
||||
"loader = DirectoryLoader(path, glob=\"**/*.txt\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
|
||||
"docs = loader.load()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 38,
|
||||
"id": "f1a136a5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['../../../../../tests/integration_tests/examples/example-non-utf8.txt',\n",
|
||||
" '../../../../../tests/integration_tests/examples/whatsapp_chat.txt',\n",
|
||||
" '../../../../../tests/integration_tests/examples/example-utf8.txt']"
|
||||
]
|
||||
},
|
||||
"execution_count": 38,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"doc_sources = [doc.metadata['source'] for doc in docs]\n",
|
||||
"doc_sources"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@ -284,7 +563,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
"version": "3.9.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
37
langchain/document_loaders/helpers.py
Normal file
37
langchain/document_loaders/helpers.py
Normal file
@ -0,0 +1,37 @@
|
||||
"""Document loader helpers."""
|
||||
|
||||
import concurrent.futures
|
||||
from typing import List, NamedTuple, Optional, cast
|
||||
|
||||
|
||||
class FileEncoding(NamedTuple):
|
||||
encoding: Optional[str]
|
||||
confidence: float
|
||||
language: Optional[str]
|
||||
|
||||
|
||||
def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
|
||||
"""Try to detect the file encoding.
|
||||
|
||||
Returns a list of `FileEncoding` tuples with the detected encodings ordered
|
||||
by confidence.
|
||||
"""
|
||||
import chardet
|
||||
|
||||
def read_and_detect(file_path: str) -> List[dict]:
|
||||
with open(file_path, "rb") as f:
|
||||
rawdata = f.read()
|
||||
return cast(List[dict], chardet.detect_all(rawdata))
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
future = executor.submit(read_and_detect, file_path)
|
||||
try:
|
||||
encodings = future.result(timeout=timeout)
|
||||
except concurrent.futures.TimeoutError:
|
||||
raise TimeoutError(
|
||||
f"Timeout reached while detecting encoding for {file_path}"
|
||||
)
|
||||
|
||||
if all(encoding["encoding"] is None for encoding in encodings):
|
||||
raise RuntimeError(f"Could not detect encoding for {file_path}")
|
||||
return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None]
|
@ -1,20 +1,59 @@
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.helpers import detect_file_encodings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextLoader(BaseLoader):
|
||||
"""Load text files."""
|
||||
"""Load text files.
|
||||
|
||||
def __init__(self, file_path: str, encoding: Optional[str] = None):
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to load.
|
||||
|
||||
encoding: File encoding to use. If `None`, the file will be loaded
|
||||
with the default system encoding.
|
||||
|
||||
autodetect_encoding: Whether to try to autodetect the file encoding
|
||||
if the specified encoding fails.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
encoding: Optional[str] = None,
|
||||
autodetect_encoding: bool = False,
|
||||
):
|
||||
"""Initialize with file path."""
|
||||
self.file_path = file_path
|
||||
self.encoding = encoding
|
||||
self.autodetect_encoding = autodetect_encoding
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load from file path."""
|
||||
text = ""
|
||||
try:
|
||||
with open(self.file_path, encoding=self.encoding) as f:
|
||||
text = f.read()
|
||||
except UnicodeDecodeError as e:
|
||||
if self.autodetect_encoding:
|
||||
detected_encodings = detect_file_encodings(self.file_path)
|
||||
for encoding in detected_encodings:
|
||||
logger.debug("Trying encoding: ", encoding.encoding)
|
||||
try:
|
||||
with open(self.file_path, encoding=encoding.encoding) as f:
|
||||
text = f.read()
|
||||
break
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
else:
|
||||
raise RuntimeError(f"Error loading {self.file_path}") from e
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Error loading {self.file_path}") from e
|
||||
|
||||
metadata = {"source": self.file_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
|
50
poetry.lock
generated
50
poetry.lock
generated
@ -1022,6 +1022,18 @@ files = [
|
||||
[package.dependencies]
|
||||
pycparser = "*"
|
||||
|
||||
[[package]]
|
||||
name = "chardet"
|
||||
version = "5.1.0"
|
||||
description = "Universal encoding detector for Python 3"
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "chardet-5.1.0-py3-none-any.whl", hash = "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"},
|
||||
{file = "chardet-5.1.0.tar.gz", hash = "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "charset-normalizer"
|
||||
version = "3.1.0"
|
||||
@ -1510,13 +1522,13 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "deeplake"
|
||||
version = "3.4.4"
|
||||
version = "3.5.0"
|
||||
description = "Activeloop Deep Lake"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "deeplake-3.4.4.tar.gz", hash = "sha256:7d044a89862fe6aa4a93abc7db90090be7ecfc611169916e3975e0cd187be1ac"},
|
||||
{file = "deeplake-3.5.0.tar.gz", hash = "sha256:ae640f75b1fec4eed9598a4e8d6b80e7243af87d9f39f0d34f01ce2c6f7c194f"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@ -3138,13 +3150,13 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec
|
||||
|
||||
[[package]]
|
||||
name = "jcloud"
|
||||
version = "0.2.8"
|
||||
version = "0.2.9"
|
||||
description = "Simplify deploying and managing Jina projects on Jina Cloud"
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "jcloud-0.2.8.tar.gz", hash = "sha256:cdd30c85c0a857573651ebc329f52a8de9e43c3e0f276dc85975914006295639"},
|
||||
{file = "jcloud-0.2.9.tar.gz", hash = "sha256:eaf8da685f8907e153ff752e6a4b945aeff548b8d94dfd650a035d8450ed547e"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@ -3297,14 +3309,14 @@ websockets = ["websockets"]
|
||||
|
||||
[[package]]
|
||||
name = "jina-hubble-sdk"
|
||||
version = "0.36.0"
|
||||
version = "0.37.1"
|
||||
description = "SDK for Hubble API at Jina AI."
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = ">=3.7.0"
|
||||
files = [
|
||||
{file = "jina-hubble-sdk-0.36.0.tar.gz", hash = "sha256:ba1a72c7a5c14963fdad9af1ff4c3bba26a03ddcced08111bd11a95b153249ec"},
|
||||
{file = "jina_hubble_sdk-0.36.0-py3-none-any.whl", hash = "sha256:56db142147d7c72142ed1b6505020c3b4d8070b1603d07ff796e56d491dde294"},
|
||||
{file = "jina-hubble-sdk-0.37.1.tar.gz", hash = "sha256:5b6bd9e13f97c8c77be822e9ae49f87a0f16ef8195011f25db2552006a5ca2a0"},
|
||||
{file = "jina_hubble_sdk-0.37.1-py3-none-any.whl", hash = "sha256:bc54a60ed120508e231fbed28b0fff394c288312d2ffa865c7865c03dbbbb502"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@ -6608,6 +6620,7 @@ files = [
|
||||
{file = "pylance-0.4.12-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:2b86fb8dccc03094c0db37bef0d91bda60e8eb0d1eddf245c6971450c8d8a53f"},
|
||||
{file = "pylance-0.4.12-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bc82914b13204187d673b5f3d45f93219c38a0e9d0542ba251074f639669789"},
|
||||
{file = "pylance-0.4.12-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a4bcce77f99ecd4cbebbadb01e58d5d8138d40eb56bdcdbc3b20b0475e7a472"},
|
||||
{file = "pylance-0.4.12-cp38-abi3-win_amd64.whl", hash = "sha256:9616931c5300030adb9626d22515710a127d1e46a46737a7a0f980b52f13627c"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@ -9367,6 +9380,18 @@ dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2
|
||||
doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"]
|
||||
test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "types-chardet"
|
||||
version = "5.0.4.6"
|
||||
description = "Typing stubs for chardet"
|
||||
category = "dev"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "types-chardet-5.0.4.6.tar.gz", hash = "sha256:caf4c74cd13ccfd8b3313c314aba943b159de562a2573ed03137402b2bb37818"},
|
||||
{file = "types_chardet-5.0.4.6-py3-none-any.whl", hash = "sha256:ea832d87e798abf1e4dfc73767807c2b7fee35d0003ae90348aea4ae00fb004d"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "types-pyopenssl"
|
||||
version = "23.1.0.3"
|
||||
@ -9762,14 +9787,14 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "weaviate-client"
|
||||
version = "3.18.0"
|
||||
version = "3.19.1"
|
||||
description = "A python native weaviate client"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "weaviate-client-3.18.0.tar.gz", hash = "sha256:423a526518a32505c5293328e5f252e6cbbf20e4b3124733f70d10fc0d6823c9"},
|
||||
{file = "weaviate_client-3.18.0-py3-none-any.whl", hash = "sha256:42b324286a4b4436317e5d2c6ba48c07da6cf01518efdd47ee097e7a8cc7584c"},
|
||||
{file = "weaviate-client-3.19.1.tar.gz", hash = "sha256:8528926b0b545225ab75583481d67cccf9494d2dc01cb62f1165a8f187b41ebb"},
|
||||
{file = "weaviate_client-3.19.1-py3-none-any.whl", hash = "sha256:6b4aae86cd955543fcc6328bc1462fbbae053dc50f7b6822287b05b98fec0d27"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@ -10309,14 +10334,15 @@ all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api
|
||||
azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"]
|
||||
cohere = ["cohere"]
|
||||
embeddings = ["sentence-transformers"]
|
||||
extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "jq", "lxml", "pandas", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "telethon", "tqdm", "zep-python"]
|
||||
extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "chardet", "jq", "lxml", "pandas", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "telethon", "tqdm", "zep-python"]
|
||||
hnswlib = ["docarray", "hnswlib", "protobuf"]
|
||||
in-memory-store = ["docarray"]
|
||||
llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
|
||||
openai = ["openai", "tiktoken"]
|
||||
qdrant = ["qdrant-client"]
|
||||
text-helpers = ["chardet"]
|
||||
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
content-hash = "055d65314e800e0731b086471d357e5fb6bbd265ee9fa2bd3762470152cc3b85"
|
||||
content-hash = "2b19b9deca7f83ca14af1f7bc7808bbe7873a91ce4c95381eaad8ea84fe04c0b"
|
||||
|
@ -89,6 +89,8 @@ gql = {version = "^3.4.1", optional = true}
|
||||
pandas = {version = "^2.0.1", optional = true}
|
||||
telethon = {version = "^1.28.5", optional = true}
|
||||
zep-python = {version="^0.25", optional=true}
|
||||
chardet = {version="^5.1.0", optional=true}
|
||||
|
||||
|
||||
[tool.poetry.group.docs.dependencies]
|
||||
autodoc_pydantic = "^1.8.0"
|
||||
@ -156,6 +158,7 @@ ruff = "^0.0.249"
|
||||
types-toml = "^0.10.8.1"
|
||||
types-redis = "^4.3.21.6"
|
||||
black = "^23.1.0"
|
||||
types-chardet = "^5.0.4.6"
|
||||
|
||||
[tool.poetry.group.typing.dependencies]
|
||||
mypy = "^0.991"
|
||||
@ -174,6 +177,7 @@ setuptools = "^67.6.1"
|
||||
llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
|
||||
qdrant = ["qdrant-client"]
|
||||
openai = ["openai", "tiktoken"]
|
||||
text_helpers = ["chardet"]
|
||||
cohere = ["cohere"]
|
||||
in_memory_store = ["docarray"]
|
||||
hnswlib = ["docarray", "protobuf", "hnswlib"]
|
||||
@ -185,6 +189,7 @@ all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "
|
||||
# merge-conflicts
|
||||
extended_testing = [
|
||||
"beautifulsoup4",
|
||||
"chardet",
|
||||
"jq",
|
||||
"pdfminer.six",
|
||||
"pypdf",
|
||||
|
41
tests/unit_tests/document_loaders/test_detect_encoding.py
Normal file
41
tests/unit_tests/document_loaders/test_detect_encoding.py
Normal file
@ -0,0 +1,41 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.document_loaders import DirectoryLoader, TextLoader
|
||||
from langchain.document_loaders.helpers import detect_file_encodings
|
||||
|
||||
|
||||
@pytest.mark.requires("chardet")
|
||||
def test_loader_detect_encoding() -> None:
|
||||
"""Test text loader."""
|
||||
path = Path(__file__).parent.parent / "examples"
|
||||
files = path.glob("**/*.txt")
|
||||
loader = DirectoryLoader(str(path), glob="**/*.txt", loader_cls=TextLoader)
|
||||
loader_detect_encoding = DirectoryLoader(
|
||||
str(path),
|
||||
glob="**/*.txt",
|
||||
loader_kwargs={"autodetect_encoding": True},
|
||||
loader_cls=TextLoader,
|
||||
)
|
||||
|
||||
with pytest.raises((UnicodeDecodeError, RuntimeError)):
|
||||
loader.load()
|
||||
|
||||
docs = loader_detect_encoding.load()
|
||||
assert len(docs) == len(list(files))
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="slow test")
|
||||
@pytest.mark.requires("chardet")
|
||||
def test_loader_detect_encoding_timeout(tmpdir: str) -> None:
|
||||
path = Path(tmpdir)
|
||||
file_path = str(path / "blob.txt")
|
||||
# 2mb binary blob
|
||||
with open(file_path, "wb") as f:
|
||||
f.write(b"\x00" * 2_000_000)
|
||||
|
||||
with pytest.raises(TimeoutError):
|
||||
detect_file_encodings(file_path, timeout=1)
|
||||
|
||||
detect_file_encodings(file_path, timeout=10)
|
1
tests/unit_tests/examples/example-non-utf8.txt
Normal file
1
tests/unit_tests/examples/example-non-utf8.txt
Normal file
@ -0,0 +1 @@
|
||||
Какие-то кракозябры
|
6
tests/unit_tests/examples/example-utf8.txt
Normal file
6
tests/unit_tests/examples/example-utf8.txt
Normal file
@ -0,0 +1,6 @@
|
||||
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
|
||||
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
|
||||
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
|
||||
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
|
||||
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
|
||||
culpa qui officia deserunt mollit anim id est laborum.
|
Loading…
Reference in New Issue
Block a user