mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 16:43:35 +00:00
community[patch]: Fix BasePDFLoader suffix for s3 presigned urls (#18844)
BasePDFLoader doesn't parse the suffix of the file correctly when parsing S3 presigned URLs. This fix enables the proper detection and parsing of S3 presigned URLs to prevent errors such as `OSError: [Errno 36] File name too long`. No additional dependencies required.
This commit is contained in:
parent
ddaf9de169
commit
a7f63d8cb4
@ -1,6 +1,7 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
import time
|
||||
from abc import ABC
|
||||
@ -96,6 +97,8 @@ class BasePDFLoader(BaseLoader, ABC):
|
||||
if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
|
||||
self.temp_dir = tempfile.TemporaryDirectory()
|
||||
_, suffix = os.path.splitext(self.file_path)
|
||||
if self._is_s3_presigned_url(self.file_path):
|
||||
suffix = urlparse(self.file_path).path.split("/")[-1]
|
||||
temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
|
||||
self.web_path = self.file_path
|
||||
if not self._is_s3_url(self.file_path):
|
||||
@ -133,6 +136,15 @@ class BasePDFLoader(BaseLoader, ABC):
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
@staticmethod
def _is_s3_presigned_url(url: str) -> bool:
    """Check if the url is a presigned S3 url.

    Only the host part of the URL is inspected; the query string is not
    checked for signature parameters. A URL is accepted when its host is a
    virtual-hosted-style S3 endpoint, i.e. it ends with one of:

    * ``.s3.amazonaws.com`` (legacy global endpoint)
    * ``.s3.<region>.amazonaws.com`` / ``.s3-<region>.amazonaws.com``
    * ``.s3.dualstack.<region>.amazonaws.com``
    * any of the above under ``.amazonaws.com.cn``

    Args:
        url: The URL (or arbitrary string) to test.

    Returns:
        True if the host looks like a virtual-hosted-style S3 endpoint,
        False otherwise (including when the URL cannot be parsed).
    """
    try:
        # ``hostname`` drops userinfo and port and lower-cases the host, so
        # "Bucket.S3.amazonaws.com:443" is handled like the plain form.
        host = urlparse(url).hostname or ""
    except ValueError:
        return False
    # Label separators are unambiguous and the repetition is bounded, so
    # matching stays linear even on adversarial input.
    pattern = (
        r"\.s3(?:-[a-z0-9-]+)?(?:\.[a-z0-9-]+){0,2}"
        r"\.amazonaws\.com(?:\.cn)?$"
    )
    return bool(re.search(pattern, host))
|
||||
|
||||
@property
def source(self) -> str:
    """Return the origin of the document.

    The remote location (``web_path``) is returned when it is set;
    otherwise the value of ``file_path`` is returned.
    """
    remote = self.web_path
    if remote is None:
        return self.file_path
    return remote
|
||||
|
Loading…
Reference in New Issue
Block a user