community[patch]: Fix BasePDFLoader suffix for s3 presigned urls (#18844)

BasePDFLoader doesn't parse the suffix of the file correctly when
parsing S3 presigned urls. This fix enables the proper detection and
parsing of S3 presigned URLs to prevent errors such as `OSError: [Errno
36] File name too long`.
No additional dependencies required.
This commit is contained in:
Pol Ruiz Farre 2024-03-11 01:58:51 +01:00 committed by GitHub
parent ddaf9de169
commit a7f63d8cb4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,6 +1,7 @@
import json
import logging
import os
import re
import tempfile
import time
from abc import ABC
@ -96,6 +97,8 @@ class BasePDFLoader(BaseLoader, ABC):
if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
self.temp_dir = tempfile.TemporaryDirectory()
_, suffix = os.path.splitext(self.file_path)
if self._is_s3_presigned_url(self.file_path):
suffix = urlparse(self.file_path).path.split("/")[-1]
temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
self.web_path = self.file_path
if not self._is_s3_url(self.file_path):
@ -133,6 +136,15 @@ class BasePDFLoader(BaseLoader, ABC):
except ValueError:
return False
@staticmethod
def _is_s3_presigned_url(url: str) -> bool:
"""Check if the url is a presigned S3 url."""
try:
result = urlparse(url)
return bool(re.search(r"\.s3\.amazonaws\.com$", result.netloc))
except ValueError:
return False
@property
def source(self) -> str:
return self.web_path if self.web_path is not None else self.file_path