community[patch]: Fix BasePDFLoader suffix for s3 presigned urls (#18844)

`BasePDFLoader` does not derive the file suffix correctly when it is given an S3 presigned URL. This fix detects S3 presigned URLs and takes the suffix from the last segment of the URL path instead, which prevents errors such as `OSError: [Errno 36] File name too long`. No additional dependencies are required.
2025-06-26 16:43:35 +00:00 · 2024-03-11 01:58:51 +01:00 · 2024-03-11 01:58:51 +01:00 · a7f63d8cb4
commit a7f63d8cb4
parent ddaf9de169
1 changed files with 12 additions and 0 deletions
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@ -1,6 +1,7 @@
 import json
 import logging
 import os
+import re
 import tempfile
 import time
 from abc import ABC
@ -96,6 +97,8 @@ class BasePDFLoader(BaseLoader, ABC):
        if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
            self.temp_dir = tempfile.TemporaryDirectory()
            _, suffix = os.path.splitext(self.file_path)
+            if self._is_s3_presigned_url(self.file_path):
+                suffix = urlparse(self.file_path).path.split("/")[-1]
            temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
            self.web_path = self.file_path
            if not self._is_s3_url(self.file_path):
@ -133,6 +136,15 @@ class BasePDFLoader(BaseLoader, ABC):
        except ValueError:
            return False

+    @staticmethod
+    def _is_s3_presigned_url(url: str) -> bool:
+        """Check whether ``url`` points at a ``*.s3.amazonaws.com`` host.
+
+        Only the network location is examined: the query string is not
+        inspected, so the presence of signature parameters is not verified.
+        A netloc that does not end in exactly ``.s3.amazonaws.com`` (for
+        example ``bucket.s3.us-west-2.amazonaws.com``, or a netloc with an
+        explicit port) does not match the pattern.
+
+        Args:
+            url: The URL string to test.
+
+        Returns:
+            True if the parsed netloc ends with ``.s3.amazonaws.com``; False
+            otherwise, including when ``urlparse`` raises ``ValueError``.
+        """
+        try:
+            result = urlparse(url)
+            # Suffix match on the netloc only; path and query are ignored.
+            return bool(re.search(r"\.s3\.amazonaws\.com$", result.netloc))
+        except ValueError:
+            return False
+
    @property
    def source(self) -> str:
        return self.web_path if self.web_path is not None else self.file_path