mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-05 04:38:26 +00:00
Support GCS Objects with `/` in GCS Loaders (#3356)
This fixes the same problem as #1517, but for GCS.

### Problem

When loading GCS objects with `/` in the object key (e.g. `folder/some-document.txt`) using `GCSFileLoader`, each object is downloaded into a temporary directory and saved as a file. This errors out when the object's parent directory does not exist within the temporary directory.

### What this PR does

Creates the parent directories based on the object key. This also works with deeply nested keys, e.g. `folder/subfolder/some-document.txt`.
This commit is contained in:
parent
a4d85f7fd5
commit
e6c1c32aff
@@ -27,6 +27,10 @@ class GCSDirectoryLoader(BaseLoader):
|
||||
client = storage.Client(project=self.project_name)
|
||||
docs = []
|
||||
for blob in client.list_blobs(self.bucket, prefix=self.prefix):
|
||||
# we shall just skip directories since GCSFileLoader creates
|
||||
# intermediate directories on the fly
|
||||
if blob.name.endswith("/"):
|
||||
continue
|
||||
loader = GCSFileLoader(self.project_name, self.bucket, blob.name)
|
||||
docs.extend(loader.load())
|
||||
return docs
|
||||
|
@@ -1,4 +1,5 @@
|
||||
"""Loading logic for loading documents from a GCS file."""
|
||||
import os
|
||||
import tempfile
|
||||
from typing import List
|
||||
|
||||
@@ -34,6 +35,7 @@ class GCSFileLoader(BaseLoader):
|
||||
blob = bucket.blob(self.blob)
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
file_path = f"{temp_dir}/{self.blob}"
|
||||
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
||||
# Download the file to a destination
|
||||
blob.download_to_filename(file_path)
|
||||
loader = UnstructuredFileLoader(file_path)
|
||||
|
Loading…
Reference in New Issue
Block a user