mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-02 01:23:07 +00:00
Allow to specify a custom loader for GcsFileLoader (#8868)
Co-authored-by: Leonid Kuligin <kuligin@google.com>
This commit is contained in:
parent
ff44fe4e16
commit
b52a3785c9
@ -73,13 +73,27 @@
|
||||
"loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "41c8a46f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to use an alternative loader, you can provide a custom function, for example:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eba3002d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"from langchain.document_loaders import PyPDFLoader\n",
|
||||
"def load_pdf(file_path):\n",
|
||||
" return PyPDFLoader(file_path)\n",
|
||||
"\n",
|
||||
"loader = GCSFileLoader(project_name=\"aist\", bucket=\"testing-hwc\", blob=\"fake.pdf\", loader_func=load_pdf)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
@ -1,5 +1,5 @@
|
||||
"""Loading logic for loading documents from a GCS directory."""
|
||||
from typing import List
|
||||
from typing import Callable, List, Optional
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
@ -9,17 +9,27 @@ from langchain.document_loaders.gcs_file import GCSFileLoader
|
||||
class GCSDirectoryLoader(BaseLoader):
|
||||
"""Loads Documents from GCS."""
|
||||
|
||||
def __init__(self, project_name: str, bucket: str, prefix: str = ""):
|
||||
def __init__(
|
||||
self,
|
||||
project_name: str,
|
||||
bucket: str,
|
||||
prefix: str = "",
|
||||
loader_func: Optional[Callable[[str], BaseLoader]] = None,
|
||||
):
|
||||
"""Initialize with bucket and key name.
|
||||
|
||||
Args:
|
||||
project_name: The name of the project for the GCS bucket.
|
||||
bucket: The name of the GCS bucket.
|
||||
prefix: The prefix of the GCS bucket.
|
||||
loader_func: A loader function that instantiates a loader based on a
|
||||
file_path argument. If nothing is provided, the GCSFileLoader
|
||||
would use its default loader.
|
||||
"""
|
||||
self.project_name = project_name
|
||||
self.bucket = bucket
|
||||
self.prefix = prefix
|
||||
self._loader_func = loader_func
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load documents."""
|
||||
@ -37,6 +47,8 @@ class GCSDirectoryLoader(BaseLoader):
|
||||
# intermediate directories on the fly
|
||||
if blob.name.endswith("/"):
|
||||
continue
|
||||
loader = GCSFileLoader(self.project_name, self.bucket, blob.name)
|
||||
loader = GCSFileLoader(
|
||||
self.project_name, self.bucket, blob.name, loader_func=self._loader_func
|
||||
)
|
||||
docs.extend(loader.load())
|
||||
return docs
|
||||
|
@ -1,7 +1,7 @@
|
||||
"""Load documents from a GCS file."""
|
||||
import os
|
||||
import tempfile
|
||||
from typing import List
|
||||
from typing import Callable, List, Optional
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
@ -11,18 +11,42 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
class GCSFileLoader(BaseLoader):
|
||||
"""Load Documents from a GCS file."""
|
||||
|
||||
def __init__(self, project_name: str, bucket: str, blob: str):
|
||||
def __init__(
|
||||
self,
|
||||
project_name: str,
|
||||
bucket: str,
|
||||
blob: str,
|
||||
loader_func: Optional[Callable[[str], BaseLoader]] = None,
|
||||
):
|
||||
"""Initialize with bucket and key name.
|
||||
|
||||
Args:
|
||||
project_name: The name of the project to load
|
||||
bucket: The name of the GCS bucket.
|
||||
blob: The name of the GCS blob to load.
|
||||
loader_func: A loader function that instantiates a loader based on a
|
||||
file_path argument. If nothing is provided, the
|
||||
UnstructuredFileLoader is used.
|
||||
|
||||
Examples:
|
||||
To use an alternative PDF loader:
|
||||
>> from langchain.document_loaders import PyPDFLoader
|
||||
>> loader = GCSFileLoader(..., loader_func=PyPDFLoader)
|
||||
|
||||
To use UnstructuredFileLoader with additional arguments:
|
||||
>> loader = GCSFileLoader(...,
|
||||
>> loader_func=lambda x: UnstructuredFileLoader(x, mode="elements"))
|
||||
|
||||
"""
|
||||
self.bucket = bucket
|
||||
self.blob = blob
|
||||
self.project_name = project_name
|
||||
|
||||
def default_loader_func(file_path: str) -> BaseLoader:
|
||||
return UnstructuredFileLoader(file_path)
|
||||
|
||||
self._loader_func = loader_func if loader_func else default_loader_func
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load documents."""
|
||||
try:
|
||||
@ -44,5 +68,5 @@ class GCSFileLoader(BaseLoader):
|
||||
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
||||
# Download the file to a destination
|
||||
blob.download_to_filename(file_path)
|
||||
loader = UnstructuredFileLoader(file_path)
|
||||
loader = self._loader_func(file_path)
|
||||
return loader.load()
|
||||
|
Loading…
Reference in New Issue
Block a user