community[patch]: Refactoring PDF loaders: 01 prepare (#29062)

- **Refactoring PDF loaders step 1**: "community: Refactoring PDF
loaders to standardize approaches"

- **Description:** Declare `CloudBlobLoader` in `__init__.py`. `file_path` is
now typed as `Union[str, PurePath]` everywhere.
- **Twitter handle:** pprados

This is one part of a larger Pull Request (PR) that is too large to be
submitted all at once.
This specific part focuses on preparing the update of all parsers.

For more details, see [PR
28970](https://github.com/langchain-ai/langchain/pull/28970).

@eyurtsev it's the start of a PR series.
This commit is contained in:
Philippe PRADOS 2025-01-07 17:00:04 +01:00 committed by GitHub
parent a49448a7c9
commit 2921597c71
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 90 additions and 85 deletions

View File

@ -87,6 +87,7 @@ if TYPE_CHECKING:
from langchain_community.document_loaders.blob_loaders import ( from langchain_community.document_loaders.blob_loaders import (
Blob, Blob,
BlobLoader, BlobLoader,
CloudBlobLoader,
FileSystemBlobLoader, FileSystemBlobLoader,
YoutubeAudioLoader, YoutubeAudioLoader,
) )
@ -574,6 +575,7 @@ _module_lookup = {
"CSVLoader": "langchain_community.document_loaders.csv_loader", "CSVLoader": "langchain_community.document_loaders.csv_loader",
"CassandraLoader": "langchain_community.document_loaders.cassandra", "CassandraLoader": "langchain_community.document_loaders.cassandra",
"ChatGPTLoader": "langchain_community.document_loaders.chatgpt", "ChatGPTLoader": "langchain_community.document_loaders.chatgpt",
"CloudBlobLoader": "langchain_community.document_loaders.blob_loaders",
"CoNLLULoader": "langchain_community.document_loaders.conllu", "CoNLLULoader": "langchain_community.document_loaders.conllu",
"CollegeConfidentialLoader": "langchain_community.document_loaders.college_confidential", # noqa: E501 "CollegeConfidentialLoader": "langchain_community.document_loaders.college_confidential", # noqa: E501
"ConcurrentLoader": "langchain_community.document_loaders.concurrent", "ConcurrentLoader": "langchain_community.document_loaders.concurrent",
@ -781,6 +783,7 @@ __all__ = [
"CSVLoader", "CSVLoader",
"CassandraLoader", "CassandraLoader",
"ChatGPTLoader", "ChatGPTLoader",
"CloudBlobLoader",
"CoNLLULoader", "CoNLLULoader",
"CollegeConfidentialLoader", "CollegeConfidentialLoader",
"ConcurrentLoader", "ConcurrentLoader",

View File

@ -6,7 +6,6 @@ import warnings
from typing import ( from typing import (
TYPE_CHECKING, TYPE_CHECKING,
Any, Any,
Dict,
Iterable, Iterable,
Iterator, Iterator,
Mapping, Mapping,
@ -23,15 +22,13 @@ from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.blob_loaders import Blob
if TYPE_CHECKING: if TYPE_CHECKING:
import fitz.fitz import fitz
import pdfminer.layout import pdfminer
import pdfplumber.page import pdfplumber
import pypdf._page import pypdf
import pypdfium2._helpers.page import pypdfium2
from pypdf import PageObject
from textractor.data.text_linearization_config import TextLinearizationConfig from textractor.data.text_linearization_config import TextLinearizationConfig
_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"] _PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
_PDF_FILTER_WITHOUT_LOSS = [ _PDF_FILTER_WITHOUT_LOSS = [
"LZWDecode", "LZWDecode",
@ -90,7 +87,7 @@ class PyPDFParser(BaseBlobParser):
extract_images: bool = False, extract_images: bool = False,
*, *,
extraction_mode: str = "plain", extraction_mode: str = "plain",
extraction_kwargs: Optional[Dict[str, Any]] = None, extraction_kwargs: Optional[dict[str, Any]] = None,
): ):
self.password = password self.password = password
self.extract_images = extract_images self.extract_images = extract_images
@ -107,7 +104,7 @@ class PyPDFParser(BaseBlobParser):
"`pip install pypdf`" "`pip install pypdf`"
) )
def _extract_text_from_page(page: "PageObject") -> str: def _extract_text_from_page(page: pypdf.PageObject) -> str:
""" """
Extract text from image given the version of pypdf. Extract text from image given the version of pypdf.
""" """
@ -126,12 +123,13 @@ class PyPDFParser(BaseBlobParser):
Document( Document(
page_content=_extract_text_from_page(page=page) page_content=_extract_text_from_page(page=page)
+ self._extract_images_from_page(page), + self._extract_images_from_page(page),
metadata={"source": blob.source, "page": page_number}, # type: ignore[attr-defined] metadata={"source": blob.source, "page": page_number},
# type: ignore[attr-defined]
) )
for page_number, page in enumerate(pdf_reader.pages) for page_number, page in enumerate(pdf_reader.pages)
] ]
def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str: def _extract_images_from_page(self, page: pypdf.PageObject) -> str:
"""Extract images from page and get the text with RapidOCR.""" """Extract images from page and get the text with RapidOCR."""
if not self.extract_images or "/XObject" not in page["/Resources"].keys(): # type: ignore[attr-defined] if not self.extract_images or "/XObject" not in page["/Resources"].keys(): # type: ignore[attr-defined]
return "" return ""
@ -307,9 +305,7 @@ class PyMuPDFParser(BaseBlobParser):
for page in doc for page in doc
] ]
def _get_page_content( def _get_page_content(self, doc: fitz.Document, page: fitz.Page, blob: Blob) -> str:
self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob
) -> str:
""" """
Get the text of the page using PyMuPDF and RapidOCR and issue a warning Get the text of the page using PyMuPDF and RapidOCR and issue a warning
if it is empty. if it is empty.
@ -327,7 +323,7 @@ class PyMuPDFParser(BaseBlobParser):
return content return content
def _extract_metadata( def _extract_metadata(
self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob self, doc: fitz.Document, page: fitz.Page, blob: Blob
) -> dict: ) -> dict:
"""Extract metadata from the document and page.""" """Extract metadata from the document and page."""
return dict( return dict(
@ -344,9 +340,7 @@ class PyMuPDFParser(BaseBlobParser):
}, },
) )
def _extract_images_from_page( def _extract_images_from_page(self, doc: fitz.Document, page: fitz.Page) -> str:
self, doc: fitz.fitz.Document, page: fitz.fitz.Page
) -> str:
"""Extract images from page and get the text with RapidOCR.""" """Extract images from page and get the text with RapidOCR."""
if not self.extract_images: if not self.extract_images:
return "" return ""
@ -558,7 +552,7 @@ class AmazonTextractPDFParser(BaseBlobParser):
textract_features: Optional[Sequence[int]] = None, textract_features: Optional[Sequence[int]] = None,
client: Optional[Any] = None, client: Optional[Any] = None,
*, *,
linearization_config: Optional["TextLinearizationConfig"] = None, linearization_config: Optional[TextLinearizationConfig] = None,
) -> None: ) -> None:
"""Initializes the parser. """Initializes the parser.

View File

@ -6,17 +6,17 @@ import tempfile
import time import time
from abc import ABC from abc import ABC
from io import StringIO from io import StringIO
from pathlib import Path from pathlib import Path, PurePath
from typing import ( from typing import (
TYPE_CHECKING, TYPE_CHECKING,
Any, Any,
Dict, BinaryIO,
Iterator, Iterator,
List,
Mapping, Mapping,
Optional, Optional,
Sequence, Sequence,
Union, Union,
cast,
) )
from urllib.parse import urlparse from urllib.parse import urlparse
@ -68,7 +68,7 @@ class UnstructuredPDFLoader(UnstructuredFileLoader):
https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf
""" """
def _get_elements(self) -> List: def _get_elements(self) -> list:
from unstructured.partition.pdf import partition_pdf from unstructured.partition.pdf import partition_pdf
return partition_pdf(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type] return partition_pdf(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type]
@ -81,7 +81,9 @@ class BasePDFLoader(BaseLoader, ABC):
clean up the temporary file after completion. clean up the temporary file after completion.
""" """
def __init__(self, file_path: Union[str, Path], *, headers: Optional[Dict] = None): def __init__(
self, file_path: Union[str, PurePath], *, headers: Optional[dict] = None
):
"""Initialize with a file path. """Initialize with a file path.
Args: Args:
@ -154,7 +156,7 @@ class BasePDFLoader(BaseLoader, ABC):
class OnlinePDFLoader(BasePDFLoader): class OnlinePDFLoader(BasePDFLoader):
"""Load online `PDF`.""" """Load online `PDF`."""
def load(self) -> List[Document]: def load(self) -> list[Document]:
"""Load documents.""" """Load documents."""
loader = UnstructuredPDFLoader(str(self.file_path)) loader = UnstructuredPDFLoader(str(self.file_path))
return loader.load() return loader.load()
@ -223,13 +225,13 @@ class PyPDFLoader(BasePDFLoader):
def __init__( def __init__(
self, self,
file_path: str, file_path: Union[str, PurePath],
password: Optional[Union[str, bytes]] = None, password: Optional[Union[str, bytes]] = None,
headers: Optional[Dict] = None, headers: Optional[dict] = None,
extract_images: bool = False, extract_images: bool = False,
*, *,
extraction_mode: str = "plain", extraction_mode: str = "plain",
extraction_kwargs: Optional[Dict] = None, extraction_kwargs: Optional[dict] = None,
) -> None: ) -> None:
"""Initialize with a file path.""" """Initialize with a file path."""
try: try:
@ -262,9 +264,9 @@ class PyPDFium2Loader(BasePDFLoader):
def __init__( def __init__(
self, self,
file_path: str, file_path: Union[str, PurePath],
*, *,
headers: Optional[Dict] = None, headers: Optional[dict] = None,
extract_images: bool = False, extract_images: bool = False,
): ):
"""Initialize with a file path.""" """Initialize with a file path."""
@ -290,7 +292,7 @@ class PyPDFDirectoryLoader(BaseLoader):
def __init__( def __init__(
self, self,
path: Union[str, Path], path: Union[str, PurePath],
glob: str = "**/[!.]*.pdf", glob: str = "**/[!.]*.pdf",
silent_errors: bool = False, silent_errors: bool = False,
load_hidden: bool = False, load_hidden: bool = False,
@ -308,7 +310,7 @@ class PyPDFDirectoryLoader(BaseLoader):
def _is_visible(path: Path) -> bool: def _is_visible(path: Path) -> bool:
return not any(part.startswith(".") for part in path.parts) return not any(part.startswith(".") for part in path.parts)
def load(self) -> List[Document]: def load(self) -> list[Document]:
p = Path(self.path) p = Path(self.path)
docs = [] docs = []
items = p.rglob(self.glob) if self.recursive else p.glob(self.glob) items = p.rglob(self.glob) if self.recursive else p.glob(self.glob)
@ -334,9 +336,9 @@ class PDFMinerLoader(BasePDFLoader):
def __init__( def __init__(
self, self,
file_path: str, file_path: Union[str, PurePath],
*, *,
headers: Optional[Dict] = None, headers: Optional[dict] = None,
extract_images: bool = False, extract_images: bool = False,
concatenate_pages: bool = True, concatenate_pages: bool = True,
) -> None: ) -> None:
@ -374,7 +376,9 @@ class PDFMinerLoader(BasePDFLoader):
class PDFMinerPDFasHTMLLoader(BasePDFLoader): class PDFMinerPDFasHTMLLoader(BasePDFLoader):
"""Load `PDF` files as HTML content using `PDFMiner`.""" """Load `PDF` files as HTML content using `PDFMiner`."""
def __init__(self, file_path: str, *, headers: Optional[Dict] = None): def __init__(
self, file_path: Union[str, PurePath], *, headers: Optional[dict] = None
):
"""Initialize with a file path.""" """Initialize with a file path."""
try: try:
from pdfminer.high_level import extract_text_to_fp # noqa:F401 from pdfminer.high_level import extract_text_to_fp # noqa:F401
@ -395,14 +399,14 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
output_string = StringIO() output_string = StringIO()
with open_filename(self.file_path, "rb") as fp: with open_filename(self.file_path, "rb") as fp:
extract_text_to_fp( extract_text_to_fp(
fp, cast(BinaryIO, fp),
output_string, output_string,
codec="", codec="",
laparams=LAParams(), laparams=LAParams(),
output_type="html", output_type="html",
) )
metadata = { metadata = {
"source": self.file_path if self.web_path is None else self.web_path "source": str(self.file_path) if self.web_path is None else self.web_path
} }
yield Document(page_content=output_string.getvalue(), metadata=metadata) yield Document(page_content=output_string.getvalue(), metadata=metadata)
@ -412,9 +416,9 @@ class PyMuPDFLoader(BasePDFLoader):
def __init__( def __init__(
self, self,
file_path: str, file_path: Union[str, PurePath],
*, *,
headers: Optional[Dict] = None, headers: Optional[dict] = None,
extract_images: bool = False, extract_images: bool = False,
**kwargs: Any, **kwargs: Any,
) -> None: ) -> None:
@ -447,7 +451,7 @@ class PyMuPDFLoader(BasePDFLoader):
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
yield from parser.lazy_parse(blob) yield from parser.lazy_parse(blob)
def load(self, **kwargs: Any) -> List[Document]: def load(self, **kwargs: Any) -> list[Document]:
return list(self._lazy_load(**kwargs)) return list(self._lazy_load(**kwargs))
def lazy_load(self) -> Iterator[Document]: def lazy_load(self) -> Iterator[Document]:
@ -461,11 +465,11 @@ class MathpixPDFLoader(BasePDFLoader):
def __init__( def __init__(
self, self,
file_path: str, file_path: Union[str, PurePath],
processed_file_format: str = "md", processed_file_format: str = "md",
max_wait_time_seconds: int = 500, max_wait_time_seconds: int = 500,
should_clean_pdf: bool = False, should_clean_pdf: bool = False,
extra_request_data: Optional[Dict[str, Any]] = None, extra_request_data: Optional[dict[str, Any]] = None,
**kwargs: Any, **kwargs: Any,
) -> None: ) -> None:
"""Initialize with a file path. """Initialize with a file path.
@ -499,7 +503,7 @@ class MathpixPDFLoader(BasePDFLoader):
self.should_clean_pdf = should_clean_pdf self.should_clean_pdf = should_clean_pdf
@property @property
def _mathpix_headers(self) -> Dict[str, str]: def _mathpix_headers(self) -> dict[str, str]:
return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key} return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key}
@property @property
@ -515,7 +519,7 @@ class MathpixPDFLoader(BasePDFLoader):
return {"options_json": json.dumps(options)} return {"options_json": json.dumps(options)}
def send_pdf(self) -> str: def send_pdf(self) -> str:
with open(self.file_path, "rb") as f: with open(str(self.file_path), "rb") as f:
files = {"file": f} files = {"file": f}
response = requests.post( response = requests.post(
self.url, headers=self._mathpix_headers, files=files, data=self.data self.url, headers=self._mathpix_headers, files=files, data=self.data
@ -562,7 +566,7 @@ class MathpixPDFLoader(BasePDFLoader):
# This indicates an error with the PDF processing # This indicates an error with the PDF processing
raise ValueError("Unable to retrieve PDF from Mathpix") raise ValueError("Unable to retrieve PDF from Mathpix")
else: else:
print(f"Status: {status}, waiting for processing to complete") # noqa: T201 logger.info("Status: %s, waiting for processing to complete", status)
time.sleep(5) time.sleep(5)
raise TimeoutError raise TimeoutError
@ -572,8 +576,7 @@ class MathpixPDFLoader(BasePDFLoader):
response = requests.get(url, headers=self._mathpix_headers) response = requests.get(url, headers=self._mathpix_headers)
return response.content.decode("utf-8") return response.content.decode("utf-8")
@staticmethod def clean_pdf(self, contents: str) -> str:
def clean_pdf(contents: str) -> str:
"""Clean the PDF file. """Clean the PDF file.
Args: Args:
@ -596,7 +599,7 @@ class MathpixPDFLoader(BasePDFLoader):
) )
return contents return contents
def load(self) -> List[Document]: def load(self) -> list[Document]:
pdf_id = self.send_pdf() pdf_id = self.send_pdf()
contents = self.get_processed_pdf(pdf_id) contents = self.get_processed_pdf(pdf_id)
if self.should_clean_pdf: if self.should_clean_pdf:
@ -610,10 +613,10 @@ class PDFPlumberLoader(BasePDFLoader):
def __init__( def __init__(
self, self,
file_path: str, file_path: Union[str, PurePath],
text_kwargs: Optional[Mapping[str, Any]] = None, text_kwargs: Optional[Mapping[str, Any]] = None,
dedupe: bool = False, dedupe: bool = False,
headers: Optional[Dict] = None, headers: Optional[dict] = None,
extract_images: bool = False, extract_images: bool = False,
) -> None: ) -> None:
"""Initialize with a file path.""" """Initialize with a file path."""
@ -630,7 +633,7 @@ class PDFPlumberLoader(BasePDFLoader):
self.dedupe = dedupe self.dedupe = dedupe
self.extract_images = extract_images self.extract_images = extract_images
def load(self) -> List[Document]: def load(self) -> list[Document]:
"""Load file.""" """Load file."""
parser = PDFPlumberParser( parser = PDFPlumberParser(
@ -669,13 +672,13 @@ class AmazonTextractPDFLoader(BasePDFLoader):
def __init__( def __init__(
self, self,
file_path: str, file_path: Union[str, PurePath],
textract_features: Optional[Sequence[str]] = None, textract_features: Optional[Sequence[str]] = None,
client: Optional[Any] = None, client: Optional[Any] = None,
credentials_profile_name: Optional[str] = None, credentials_profile_name: Optional[str] = None,
region_name: Optional[str] = None, region_name: Optional[str] = None,
endpoint_url: Optional[str] = None, endpoint_url: Optional[str] = None,
headers: Optional[Dict] = None, headers: Optional[dict] = None,
*, *,
linearization_config: Optional["TextLinearizationConfig"] = None, linearization_config: Optional["TextLinearizationConfig"] = None,
) -> None: ) -> None:
@ -743,7 +746,7 @@ class AmazonTextractPDFLoader(BasePDFLoader):
linearization_config=linearization_config, linearization_config=linearization_config,
) )
def load(self) -> List[Document]: def load(self) -> list[Document]:
"""Load given path as pages.""" """Load given path as pages."""
return list(self.lazy_load()) return list(self.lazy_load())
@ -758,7 +761,7 @@ class AmazonTextractPDFLoader(BasePDFLoader):
if self.web_path and self._is_s3_url(self.web_path): if self.web_path and self._is_s3_url(self.web_path):
blob = Blob(path=self.web_path) # type: ignore[call-arg] # type: ignore[misc] blob = Blob(path=self.web_path) # type: ignore[call-arg] # type: ignore[misc]
else: else:
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] blob = Blob.from_path(self.file_path)
if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1: if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1:
raise ValueError( raise ValueError(
f"the file {blob.path} is a multi-page document, \ f"the file {blob.path} is a multi-page document, \
@ -792,7 +795,9 @@ class AmazonTextractPDFLoader(BasePDFLoader):
elif blob.mimetype in ["image/png", "image/jpeg"]: # type: ignore[attr-defined] elif blob.mimetype in ["image/png", "image/jpeg"]: # type: ignore[attr-defined]
return 1 return 1
else: else:
raise ValueError(f"unsupported mime type: {blob.mimetype}") # type: ignore[attr-defined] raise ValueError( # type: ignore[attr-defined]
f"unsupported mime type: {blob.mimetype}"
)
class DedocPDFLoader(DedocBaseLoader): class DedocPDFLoader(DedocBaseLoader):
@ -887,7 +892,7 @@ class DedocPDFLoader(DedocBaseLoader):
from dedoc.utils.langchain import make_manager_pdf_config from dedoc.utils.langchain import make_manager_pdf_config
return make_manager_pdf_config( return make_manager_pdf_config(
file_path=self.file_path, file_path=str(self.file_path),
parsing_params=self.parsing_parameters, parsing_params=self.parsing_parameters,
split=self.split, split=self.split,
) )
@ -898,10 +903,10 @@ class DocumentIntelligenceLoader(BasePDFLoader):
def __init__( def __init__(
self, self,
file_path: str, file_path: Union[str, PurePath],
client: Any, client: Any,
model: str = "prebuilt-document", model: str = "prebuilt-document",
headers: Optional[Dict] = None, headers: Optional[dict] = None,
) -> None: ) -> None:
""" """
Initialize the object for file processing with Azure Document Intelligence Initialize the object for file processing with Azure Document Intelligence
@ -930,10 +935,10 @@ class DocumentIntelligenceLoader(BasePDFLoader):
... ) ... )
""" """
self.parser = DocumentIntelligenceParser(client=client, model=model)
super().__init__(file_path, headers=headers) super().__init__(file_path, headers=headers)
self.parser = DocumentIntelligenceParser(client=client, model=model)
def load(self) -> List[Document]: def load(self) -> list[Document]:
"""Load given path as pages.""" """Load given path as pages."""
return list(self.lazy_load()) return list(self.lazy_load())
@ -964,7 +969,7 @@ class ZeroxPDFLoader(BasePDFLoader):
def __init__( def __init__(
self, self,
file_path: Union[str, Path], file_path: Union[str, PurePath],
model: str = "gpt-4o-mini", model: str = "gpt-4o-mini",
**zerox_kwargs: Any, **zerox_kwargs: Any,
) -> None: ) -> None:
@ -1005,7 +1010,7 @@ class ZeroxPDFLoader(BasePDFLoader):
# Directly call asyncio.run to execute zerox synchronously # Directly call asyncio.run to execute zerox synchronously
zerox_output = asyncio.run( zerox_output = asyncio.run(
zerox(file_path=self.file_path, model=self.model, **self.zerox_kwargs) zerox(file_path=str(self.file_path), model=self.model, **self.zerox_kwargs)
) )
# Convert zerox output to Document instances and yield them # Convert zerox output to Document instances and yield them

View File

@ -61,7 +61,7 @@ def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) ->
assert metadata["source"] == str(LAYOUT_PARSER_PAPER_PDF) assert metadata["source"] == str(LAYOUT_PARSER_PAPER_PDF)
if splits_by_page: if splits_by_page:
assert metadata["page"] == 0 assert int(metadata["page"]) == 0
def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False) -> None: def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False) -> None:

View File

@ -1,3 +1,4 @@
import os
from pathlib import Path from pathlib import Path
from typing import Sequence, Union from typing import Sequence, Union
@ -17,7 +18,7 @@ from langchain_community.document_loaders import (
def test_unstructured_pdf_loader_elements_mode() -> None: def test_unstructured_pdf_loader_elements_mode() -> None:
"""Test unstructured loader with various modes.""" """Test unstructured loader with various modes."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf" file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = UnstructuredPDFLoader(str(file_path), mode="elements") loader = UnstructuredPDFLoader(file_path, mode="elements")
docs = loader.load() docs = loader.load()
assert len(docs) == 2 assert len(docs) == 2
@ -26,7 +27,7 @@ def test_unstructured_pdf_loader_elements_mode() -> None:
def test_unstructured_pdf_loader_paged_mode() -> None: def test_unstructured_pdf_loader_paged_mode() -> None:
"""Test unstructured loader with various modes.""" """Test unstructured loader with various modes."""
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = UnstructuredPDFLoader(str(file_path), mode="paged") loader = UnstructuredPDFLoader(file_path, mode="paged")
docs = loader.load() docs = loader.load()
assert len(docs) == 16 assert len(docs) == 16
@ -35,7 +36,7 @@ def test_unstructured_pdf_loader_paged_mode() -> None:
def test_unstructured_pdf_loader_default_mode() -> None: def test_unstructured_pdf_loader_default_mode() -> None:
"""Test unstructured loader.""" """Test unstructured loader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf" file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = UnstructuredPDFLoader(str(file_path)) loader = UnstructuredPDFLoader(file_path)
docs = loader.load() docs = loader.load()
assert len(docs) == 1 assert len(docs) == 1
@ -44,26 +45,26 @@ def test_unstructured_pdf_loader_default_mode() -> None:
def test_pdfminer_loader() -> None: def test_pdfminer_loader() -> None:
"""Test PDFMiner loader.""" """Test PDFMiner loader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf" file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = PDFMinerLoader(str(file_path)) loader = PDFMinerLoader(file_path)
docs = loader.load() docs = loader.load()
assert len(docs) == 1 assert len(docs) == 1
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PDFMinerLoader(str(file_path)) loader = PDFMinerLoader(file_path)
docs = loader.load() docs = loader.load()
assert len(docs) == 1 assert len(docs) == 1
# Verify that concatenating pages parameter works # Verify that concatenating pages parameter works
file_path = Path(__file__).parent.parent / "examples/hello.pdf" file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = PDFMinerLoader(str(file_path), concatenate_pages=True) loader = PDFMinerLoader(file_path, concatenate_pages=True)
docs = loader.load() docs = loader.load()
assert len(docs) == 1 assert len(docs) == 1
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PDFMinerLoader(str(file_path), concatenate_pages=False) loader = PDFMinerLoader(file_path, concatenate_pages=False)
docs = loader.load() docs = loader.load()
assert len(docs) == 16 assert len(docs) == 16
@ -72,13 +73,13 @@ def test_pdfminer_loader() -> None:
def test_pdfminer_pdf_as_html_loader() -> None: def test_pdfminer_pdf_as_html_loader() -> None:
"""Test PDFMinerPDFasHTMLLoader.""" """Test PDFMinerPDFasHTMLLoader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf" file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = PDFMinerPDFasHTMLLoader(str(file_path)) loader = PDFMinerPDFasHTMLLoader(file_path)
docs = loader.load() docs = loader.load()
assert len(docs) == 1 assert len(docs) == 1
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PDFMinerPDFasHTMLLoader(str(file_path)) loader = PDFMinerPDFasHTMLLoader(file_path)
docs = loader.load() docs = loader.load()
assert len(docs) == 1 assert len(docs) == 1
@ -87,13 +88,13 @@ def test_pdfminer_pdf_as_html_loader() -> None:
def test_pypdfium2_loader() -> None: def test_pypdfium2_loader() -> None:
"""Test PyPDFium2Loader.""" """Test PyPDFium2Loader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf" file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = PyPDFium2Loader(str(file_path)) loader = PyPDFium2Loader(file_path)
docs = loader.load() docs = loader.load()
assert len(docs) == 1 assert len(docs) == 1
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PyPDFium2Loader(str(file_path)) loader = PyPDFium2Loader(file_path)
docs = loader.load() docs = loader.load()
assert len(docs) == 16 assert len(docs) == 16
@ -102,13 +103,13 @@ def test_pypdfium2_loader() -> None:
def test_pymupdf_loader() -> None: def test_pymupdf_loader() -> None:
"""Test PyMuPDF loader.""" """Test PyMuPDF loader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf" file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = PyMuPDFLoader(str(file_path)) loader = PyMuPDFLoader(file_path)
docs = loader.load() docs = loader.load()
assert len(docs) == 1 assert len(docs) == 1
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PyMuPDFLoader(str(file_path)) loader = PyMuPDFLoader(file_path)
docs = loader.load() docs = loader.load()
assert len(docs) == 16 assert len(docs) == 16
@ -123,20 +124,21 @@ def test_pymupdf_loader() -> None:
assert len(docs) == 1 assert len(docs) == 1
@pytest.mark.skipif(
not os.environ.get("MATHPIX_API_KEY"), reason="Mathpix API key not found"
)
def test_mathpix_loader() -> None: def test_mathpix_loader() -> None:
file_path = Path(__file__).parent.parent / "examples/hello.pdf" file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = MathpixPDFLoader(str(file_path)) loader = MathpixPDFLoader(file_path)
docs = loader.load() docs = loader.load()
assert len(docs) == 1 assert len(docs) == 1
print(docs[0].page_content) # noqa: T201
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = MathpixPDFLoader(str(file_path)) loader = MathpixPDFLoader(file_path)
docs = loader.load() docs = loader.load()
assert len(docs) == 1 assert len(docs) == 1
print(docs[0].page_content) # noqa: T201
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -187,8 +189,8 @@ def test_mathpix_loader() -> None:
1, 1,
False, False,
), ),
(str(Path(__file__).parent.parent / "examples/hello.pdf"), ["FORMS"], 1, False), (Path(__file__).parent.parent / "examples/hello.pdf", ["FORMS"], 1, False),
(str(Path(__file__).parent.parent / "examples/hello.pdf"), [], 1, False), (Path(__file__).parent.parent / "examples/hello.pdf", [], 1, False),
( (
"s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf", "s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf",
["FORMS", "TABLES", "LAYOUT"], ["FORMS", "TABLES", "LAYOUT"],
@ -222,7 +224,7 @@ def test_amazontextract_loader(
@pytest.mark.skip(reason="Requires AWS credentials to run") @pytest.mark.skip(reason="Requires AWS credentials to run")
def test_amazontextract_loader_failures() -> None: def test_amazontextract_loader_failures() -> None:
# 2-page PDF local file system # 2-page PDF local file system
two_page_pdf = str( two_page_pdf = (
Path(__file__).parent.parent / "examples/multi-page-forms-sample-2-page.pdf" Path(__file__).parent.parent / "examples/multi-page-forms-sample-2-page.pdf"
) )
loader = AmazonTextractPDFLoader(two_page_pdf) loader = AmazonTextractPDFLoader(two_page_pdf)

View File

@ -43,6 +43,7 @@ EXPECTED_ALL = [
"CassandraLoader", "CassandraLoader",
"CSVLoader", "CSVLoader",
"ChatGPTLoader", "ChatGPTLoader",
"CloudBlobLoader",
"CoNLLULoader", "CoNLLULoader",
"CollegeConfidentialLoader", "CollegeConfidentialLoader",
"ConcurrentLoader", "ConcurrentLoader",