Add headers for accessing PDF file URLs (#10370)

- Description: Add an optional `headers` parameter to the PDF loaders; it is passed to the GET request used to download a PDF from a web URL
  - Tag maintainer: @hwchase17 

 make format, make lint, make test

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Michael Kim 2023-09-14 08:09:38 +09:00 committed by GitHub
parent a34510536d
commit 2dc3c64386
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -6,7 +6,7 @@ import time
from abc import ABC from abc import ABC
from io import StringIO from io import StringIO
from pathlib import Path from pathlib import Path
from typing import Any, Iterator, List, Mapping, Optional, Sequence, Union from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Union
from urllib.parse import urlparse from urllib.parse import urlparse
import requests import requests
@ -62,14 +62,20 @@ class UnstructuredPDFLoader(UnstructuredFileLoader):
class BasePDFLoader(BaseLoader, ABC): class BasePDFLoader(BaseLoader, ABC):
"""Base Loader class for `PDF` files. """Base Loader class for `PDF` files.
Defaults to check for local file, but if the file is a web path, it will download it If the file is a web path, it will download it to a temporary file, use it, then
to a temporary file, use it, then clean up the temporary file after completion clean up the temporary file after completion.
""" """
def __init__(self, file_path: str): def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
"""Initialize with a file path.""" """Initialize with a file path.
Args:
file_path: Either a local, S3 or web path to a PDF file.
headers: Headers to use for GET request to download a file from a web path.
"""
self.file_path = file_path self.file_path = file_path
self.web_path = None self.web_path = None
self.headers = headers
if "~" in self.file_path: if "~" in self.file_path:
self.file_path = os.path.expanduser(self.file_path) self.file_path = os.path.expanduser(self.file_path)
@ -78,18 +84,15 @@ class BasePDFLoader(BaseLoader, ABC):
self.temp_dir = tempfile.TemporaryDirectory() self.temp_dir = tempfile.TemporaryDirectory()
_, suffix = os.path.splitext(self.file_path) _, suffix = os.path.splitext(self.file_path)
temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}") temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
if self._is_s3_url(self.file_path): self.web_path = self.file_path
self.web_path = self.file_path if not self._is_s3_url(self.file_path):
else: r = requests.get(self.file_path, headers=self.headers)
r = requests.get(self.file_path)
if r.status_code != 200: if r.status_code != 200:
raise ValueError( raise ValueError(
"Check the url of your file; returned status code %s" "Check the url of your file; returned status code %s"
% r.status_code % r.status_code
) )
self.web_path = self.file_path
with open(temp_pdf, mode="wb") as f: with open(temp_pdf, mode="wb") as f:
f.write(r.content) f.write(r.content)
self.file_path = str(temp_pdf) self.file_path = str(temp_pdf)
@ -138,7 +141,10 @@ class PyPDFLoader(BasePDFLoader):
""" """
def __init__( def __init__(
self, file_path: str, password: Optional[Union[str, bytes]] = None self,
file_path: str,
password: Optional[Union[str, bytes]] = None,
headers: Optional[Dict] = None,
) -> None: ) -> None:
"""Initialize with a file path.""" """Initialize with a file path."""
try: try:
@ -148,7 +154,7 @@ class PyPDFLoader(BasePDFLoader):
"pypdf package not found, please install it with " "`pip install pypdf`" "pypdf package not found, please install it with " "`pip install pypdf`"
) )
self.parser = PyPDFParser(password=password) self.parser = PyPDFParser(password=password)
super().__init__(file_path) super().__init__(file_path, headers=headers)
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load given path as pages.""" """Load given path as pages."""
@ -165,9 +171,9 @@ class PyPDFLoader(BasePDFLoader):
class PyPDFium2Loader(BasePDFLoader): class PyPDFium2Loader(BasePDFLoader):
"""Load `PDF` using `pypdfium2` and chunks at character level.""" """Load `PDF` using `pypdfium2` and chunks at character level."""
def __init__(self, file_path: str): def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
"""Initialize with a file path.""" """Initialize with a file path."""
super().__init__(file_path) super().__init__(file_path, headers=headers)
self.parser = PyPDFium2Parser() self.parser = PyPDFium2Parser()
def load(self) -> List[Document]: def load(self) -> List[Document]:
@ -230,7 +236,7 @@ class PyPDFDirectoryLoader(BaseLoader):
class PDFMinerLoader(BasePDFLoader): class PDFMinerLoader(BasePDFLoader):
"""Load `PDF` files using `PDFMiner`.""" """Load `PDF` files using `PDFMiner`."""
def __init__(self, file_path: str) -> None: def __init__(self, file_path: str, *, headers: Optional[Dict] = None) -> None:
"""Initialize with file path.""" """Initialize with file path."""
try: try:
from pdfminer.high_level import extract_text # noqa:F401 from pdfminer.high_level import extract_text # noqa:F401
@ -240,7 +246,7 @@ class PDFMinerLoader(BasePDFLoader):
"`pip install pdfminer.six`" "`pip install pdfminer.six`"
) )
super().__init__(file_path) super().__init__(file_path, headers=headers)
self.parser = PDFMinerParser() self.parser = PDFMinerParser()
def load(self) -> List[Document]: def load(self) -> List[Document]:
@ -258,7 +264,7 @@ class PDFMinerLoader(BasePDFLoader):
class PDFMinerPDFasHTMLLoader(BasePDFLoader): class PDFMinerPDFasHTMLLoader(BasePDFLoader):
"""Load `PDF` files as HTML content using `PDFMiner`.""" """Load `PDF` files as HTML content using `PDFMiner`."""
def __init__(self, file_path: str): def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
"""Initialize with a file path.""" """Initialize with a file path."""
try: try:
from pdfminer.high_level import extract_text_to_fp # noqa:F401 from pdfminer.high_level import extract_text_to_fp # noqa:F401
@ -268,7 +274,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
"`pip install pdfminer.six`" "`pip install pdfminer.six`"
) )
super().__init__(file_path) super().__init__(file_path, headers=headers)
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load file.""" """Load file."""
@ -292,7 +298,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
class PyMuPDFLoader(BasePDFLoader): class PyMuPDFLoader(BasePDFLoader):
"""Load `PDF` files using `PyMuPDF`.""" """Load `PDF` files using `PyMuPDF`."""
def __init__(self, file_path: str) -> None: def __init__(self, file_path: str, *, headers: Optional[Dict] = None) -> None:
"""Initialize with a file path.""" """Initialize with a file path."""
try: try:
import fitz # noqa:F401 import fitz # noqa:F401
@ -302,7 +308,7 @@ class PyMuPDFLoader(BasePDFLoader):
"`pip install pymupdf`" "`pip install pymupdf`"
) )
super().__init__(file_path) super().__init__(file_path, headers=headers)
def load(self, **kwargs: Optional[Any]) -> List[Document]: def load(self, **kwargs: Optional[Any]) -> List[Document]:
"""Load file.""" """Load file."""
@ -335,19 +341,19 @@ class MathpixPDFLoader(BasePDFLoader):
should_clean_pdf: a flag to clean the PDF file. Default is False. should_clean_pdf: a flag to clean the PDF file. Default is False.
**kwargs: additional keyword arguments. **kwargs: additional keyword arguments.
""" """
super().__init__(file_path)
self.mathpix_api_key = get_from_dict_or_env( self.mathpix_api_key = get_from_dict_or_env(
kwargs, "mathpix_api_key", "MATHPIX_API_KEY" kwargs, "mathpix_api_key", "MATHPIX_API_KEY"
) )
self.mathpix_api_id = get_from_dict_or_env( self.mathpix_api_id = get_from_dict_or_env(
kwargs, "mathpix_api_id", "MATHPIX_API_ID" kwargs, "mathpix_api_id", "MATHPIX_API_ID"
) )
super().__init__(file_path, **kwargs)
self.processed_file_format = processed_file_format self.processed_file_format = processed_file_format
self.max_wait_time_seconds = max_wait_time_seconds self.max_wait_time_seconds = max_wait_time_seconds
self.should_clean_pdf = should_clean_pdf self.should_clean_pdf = should_clean_pdf
@property @property
def headers(self) -> dict: def _mathpix_headers(self) -> Dict[str, str]:
return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key} return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key}
@property @property
@ -363,7 +369,7 @@ class MathpixPDFLoader(BasePDFLoader):
with open(self.file_path, "rb") as f: with open(self.file_path, "rb") as f:
files = {"file": f} files = {"file": f}
response = requests.post( response = requests.post(
self.url, headers=self.headers, files=files, data=self.data self.url, headers=self._mathpix_headers, files=files, data=self.data
) )
response_data = response.json() response_data = response.json()
if "pdf_id" in response_data: if "pdf_id" in response_data:
@ -441,6 +447,7 @@ class PDFPlumberLoader(BasePDFLoader):
file_path: str, file_path: str,
text_kwargs: Optional[Mapping[str, Any]] = None, text_kwargs: Optional[Mapping[str, Any]] = None,
dedupe: bool = False, dedupe: bool = False,
headers: Optional[Dict] = None,
) -> None: ) -> None:
"""Initialize with a file path.""" """Initialize with a file path."""
try: try:
@ -451,7 +458,7 @@ class PDFPlumberLoader(BasePDFLoader):
"`pip install pdfplumber`" "`pip install pdfplumber`"
) )
super().__init__(file_path) super().__init__(file_path, headers=headers)
self.text_kwargs = text_kwargs or {} self.text_kwargs = text_kwargs or {}
self.dedupe = dedupe self.dedupe = dedupe
@ -493,6 +500,7 @@ class AmazonTextractPDFLoader(BasePDFLoader):
credentials_profile_name: Optional[str] = None, credentials_profile_name: Optional[str] = None,
region_name: Optional[str] = None, region_name: Optional[str] = None,
endpoint_url: Optional[str] = None, endpoint_url: Optional[str] = None,
headers: Optional[Dict] = None,
) -> None: ) -> None:
"""Initialize the loader. """Initialize the loader.
@ -507,7 +515,7 @@ class AmazonTextractPDFLoader(BasePDFLoader):
endpoint_url: endpoint url for the textract service (Optional) endpoint_url: endpoint url for the textract service (Optional)
""" """
super().__init__(file_path) super().__init__(file_path, headers=headers)
try: try:
import textractcaller as tc # noqa: F401 import textractcaller as tc # noqa: F401
@ -608,7 +616,11 @@ class DocumentIntelligenceLoader(BasePDFLoader):
"""Loads a PDF with Azure Document Intelligence""" """Loads a PDF with Azure Document Intelligence"""
def __init__( def __init__(
self, file_path: str, client: Any, model: str = "prebuilt-document" self,
file_path: str,
client: Any,
model: str = "prebuilt-document",
headers: Optional[Dict] = None,
) -> None: ) -> None:
""" """
Initialize the object for file processing with Azure Document Intelligence Initialize the object for file processing with Azure Document Intelligence
@ -638,7 +650,7 @@ class DocumentIntelligenceLoader(BasePDFLoader):
""" """
self.parser = DocumentIntelligenceParser(client=client, model=model) self.parser = DocumentIntelligenceParser(client=client, model=model)
super().__init__(file_path) super().__init__(file_path, headers=headers)
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load given path as pages.""" """Load given path as pages."""