mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-20 05:43:55 +00:00
directory loader improvements (#1162)
This commit is contained in:
parent
05a05bcb04
commit
65cc81c479
@ -6,7 +6,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# Directory Loader\n",
|
"# Directory Loader\n",
|
||||||
"This covers how to use the DirectoryLoader to load all documents in a directory. Under the hood, this uses the [UnstructuredLoader](./unstructured_file.ipynb)"
|
"This covers how to use the DirectoryLoader to load all documents in a directory. Under the hood, this uses the [UnstructuredFileLoader](./unstructured_file.ipynb) by default."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -29,7 +29,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 2,
|
||||||
"id": "891fe56f",
|
"id": "891fe56f",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -39,7 +39,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": 3,
|
||||||
"id": "addfe9cf",
|
"id": "addfe9cf",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -49,7 +49,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": 4,
|
||||||
"id": "b042086d",
|
"id": "b042086d",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -59,7 +59,67 @@
|
|||||||
"1"
|
"1"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 9,
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(docs)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "c5652850",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Change loader class\n",
|
||||||
|
"By default this uses the UnstructuredFileLoader class. However, you can easily switch to a different loader class with the `loader_cls` argument."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "81c92da3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import TextLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "ab38ee36",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = DirectoryLoader('../', glob=\"**/*.md\", loader_cls=TextLoader)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "25c8740f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "38337763",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -71,7 +131,7 @@
|
|||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "cbc8256b",
|
"id": "984c8429",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": []
|
"source": []
|
||||||
|
@ -1,19 +1,42 @@
|
|||||||
"""Loading logic for loading documents from a directory."""
|
"""Loading logic for loading documents from a directory."""
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List, Type, Union
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
from langchain.document_loaders.text import TextLoader
|
||||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
|
||||||
|
# Loader classes accepted as the type of DirectoryLoader's `loader_cls` argument.
FILE_LOADER_TYPE = Union[Type[UnstructuredFileLoader], Type[TextLoader]]
|
||||||
|
# Name the logger after the module's import name (the standard logging
# convention) instead of its filesystem path, so the name is a dotted
# identifier rather than an absolute file path.
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_visible(p: Path) -> bool:
    """Return True if no component of ``p`` starts with a dot.

    Args:
        p: Path to check. Every entry of ``p.parts`` is examined, so a
            dot-prefixed directory anywhere in the path hides the file.

    Returns:
        False if any path component begins with ".", True otherwise
        (including for a path that has no components).
    """
    return not any(part.startswith(".") for part in p.parts)
|
||||||
|
|
||||||
|
|
||||||
class DirectoryLoader(BaseLoader):
|
class DirectoryLoader(BaseLoader):
|
||||||
"""Loading logic for loading documents from a directory."""
|
"""Loading logic for loading documents from a directory."""
|
||||||
|
|
||||||
def __init__(self, path: str, glob: str = "**/*"):
|
def __init__(
|
||||||
|
self,
|
||||||
|
path: str,
|
||||||
|
glob: str = "**/[!.]*",
|
||||||
|
silent_errors: bool = False,
|
||||||
|
load_hidden: bool = False,
|
||||||
|
loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader,
|
||||||
|
):
|
||||||
"""Initialize with path to directory and how to glob over it."""
|
"""Initialize with path to directory and how to glob over it."""
|
||||||
self.path = path
|
self.path = path
|
||||||
self.glob = glob
|
self.glob = glob
|
||||||
|
self.load_hidden = load_hidden
|
||||||
|
self.loader_cls = loader_cls
|
||||||
|
self.silent_errors = silent_errors
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load documents."""
|
"""Load documents."""
|
||||||
@ -21,6 +44,13 @@ class DirectoryLoader(BaseLoader):
|
|||||||
docs = []
|
docs = []
|
||||||
for i in p.glob(self.glob):
|
for i in p.glob(self.glob):
|
||||||
if i.is_file():
|
if i.is_file():
|
||||||
sub_docs = UnstructuredFileLoader(str(i)).load()
|
if _is_visible(i.relative_to(p)) or self.load_hidden:
|
||||||
docs.extend(sub_docs)
|
try:
|
||||||
|
sub_docs = self.loader_cls(str(i)).load()
|
||||||
|
docs.extend(sub_docs)
|
||||||
|
except Exception as e:
|
||||||
|
if self.silent_errors:
|
||||||
|
logger.warning(e)
|
||||||
|
else:
|
||||||
|
raise e
|
||||||
return docs
|
return docs
|
||||||
|
Loading…
Reference in New Issue
Block a user