mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-27 08:58:48 +00:00
community[patch]: Fix BasePDFLoader suffix for s3 presigned urls (#18844)
BasePDFLoader does not derive the file suffix correctly when given an S3 presigned URL: the suffix is taken from the full URL, so the URL's query string ends up in the temporary file name. This fix detects S3 presigned URLs and builds the temporary file name from the URL path instead, preventing errors such as `OSError: [Errno 36] File name too long`. No additional dependencies are required.
This commit is contained in:
parent
ddaf9de169
commit
a7f63d8cb4
@ -1,6 +1,7 @@
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import tempfile
|
import tempfile
|
||||||
import time
|
import time
|
||||||
from abc import ABC
|
from abc import ABC
|
||||||
@ -96,6 +97,8 @@ class BasePDFLoader(BaseLoader, ABC):
|
|||||||
if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
|
if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
|
||||||
self.temp_dir = tempfile.TemporaryDirectory()
|
self.temp_dir = tempfile.TemporaryDirectory()
|
||||||
_, suffix = os.path.splitext(self.file_path)
|
_, suffix = os.path.splitext(self.file_path)
|
||||||
|
if self._is_s3_presigned_url(self.file_path):
|
||||||
|
suffix = urlparse(self.file_path).path.split("/")[-1]
|
||||||
temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
|
temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
|
||||||
self.web_path = self.file_path
|
self.web_path = self.file_path
|
||||||
if not self._is_s3_url(self.file_path):
|
if not self._is_s3_url(self.file_path):
|
||||||
@ -133,6 +136,15 @@ class BasePDFLoader(BaseLoader, ABC):
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _is_s3_presigned_url(url: str) -> bool:
|
||||||
|
"""Check if the url is a presigned S3 url."""
|
||||||
|
try:
|
||||||
|
result = urlparse(url)
|
||||||
|
return bool(re.search(r"\.s3\.amazonaws\.com$", result.netloc))
|
||||||
|
except ValueError:
|
||||||
|
return False
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def source(self) -> str:
|
def source(self) -> str:
|
||||||
return self.web_path if self.web_path is not None else self.file_path
|
return self.web_path if self.web_path is not None else self.file_path
|
||||||
|
Loading…
Reference in New Issue
Block a user