mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 16:43:35 +00:00
community[patch]: Refactoring PDF loaders: 01 prepare (#29062)
- **Refactoring PDF loaders step 1**: "community: Refactoring PDF loaders to standardize approaches" - **Description:** Declare CloudBlobLoader in __init__.py. file_path is now typed as Union[str, PurePath] everywhere. - **Twitter handle:** pprados This is one part of a larger Pull Request (PR) that is too large to be submitted all at once. This specific part focuses on preparing the update of all parsers. For more details, see [PR 28970](https://github.com/langchain-ai/langchain/pull/28970). @eyurtsev it's the start of a PR series.
This commit is contained in:
parent
a49448a7c9
commit
2921597c71
@ -87,6 +87,7 @@ if TYPE_CHECKING:
|
||||
from langchain_community.document_loaders.blob_loaders import (
|
||||
Blob,
|
||||
BlobLoader,
|
||||
CloudBlobLoader,
|
||||
FileSystemBlobLoader,
|
||||
YoutubeAudioLoader,
|
||||
)
|
||||
@ -574,6 +575,7 @@ _module_lookup = {
|
||||
"CSVLoader": "langchain_community.document_loaders.csv_loader",
|
||||
"CassandraLoader": "langchain_community.document_loaders.cassandra",
|
||||
"ChatGPTLoader": "langchain_community.document_loaders.chatgpt",
|
||||
"CloudBlobLoader": "langchain_community.document_loaders.blob_loaders",
|
||||
"CoNLLULoader": "langchain_community.document_loaders.conllu",
|
||||
"CollegeConfidentialLoader": "langchain_community.document_loaders.college_confidential", # noqa: E501
|
||||
"ConcurrentLoader": "langchain_community.document_loaders.concurrent",
|
||||
@ -781,6 +783,7 @@ __all__ = [
|
||||
"CSVLoader",
|
||||
"CassandraLoader",
|
||||
"ChatGPTLoader",
|
||||
"CloudBlobLoader",
|
||||
"CoNLLULoader",
|
||||
"CollegeConfidentialLoader",
|
||||
"ConcurrentLoader",
|
||||
|
@ -6,7 +6,6 @@ import warnings
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
Iterable,
|
||||
Iterator,
|
||||
Mapping,
|
||||
@ -23,15 +22,13 @@ from langchain_community.document_loaders.base import BaseBlobParser
|
||||
from langchain_community.document_loaders.blob_loaders import Blob
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import fitz.fitz
|
||||
import pdfminer.layout
|
||||
import pdfplumber.page
|
||||
import pypdf._page
|
||||
import pypdfium2._helpers.page
|
||||
from pypdf import PageObject
|
||||
import fitz
|
||||
import pdfminer
|
||||
import pdfplumber
|
||||
import pypdf
|
||||
import pypdfium2
|
||||
from textractor.data.text_linearization_config import TextLinearizationConfig
|
||||
|
||||
|
||||
_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
|
||||
_PDF_FILTER_WITHOUT_LOSS = [
|
||||
"LZWDecode",
|
||||
@ -90,7 +87,7 @@ class PyPDFParser(BaseBlobParser):
|
||||
extract_images: bool = False,
|
||||
*,
|
||||
extraction_mode: str = "plain",
|
||||
extraction_kwargs: Optional[Dict[str, Any]] = None,
|
||||
extraction_kwargs: Optional[dict[str, Any]] = None,
|
||||
):
|
||||
self.password = password
|
||||
self.extract_images = extract_images
|
||||
@ -107,7 +104,7 @@ class PyPDFParser(BaseBlobParser):
|
||||
"`pip install pypdf`"
|
||||
)
|
||||
|
||||
def _extract_text_from_page(page: "PageObject") -> str:
|
||||
def _extract_text_from_page(page: pypdf.PageObject) -> str:
|
||||
"""
|
||||
Extract text from image given the version of pypdf.
|
||||
"""
|
||||
@ -126,12 +123,13 @@ class PyPDFParser(BaseBlobParser):
|
||||
Document(
|
||||
page_content=_extract_text_from_page(page=page)
|
||||
+ self._extract_images_from_page(page),
|
||||
metadata={"source": blob.source, "page": page_number}, # type: ignore[attr-defined]
|
||||
metadata={"source": blob.source, "page": page_number},
|
||||
# type: ignore[attr-defined]
|
||||
)
|
||||
for page_number, page in enumerate(pdf_reader.pages)
|
||||
]
|
||||
|
||||
def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
|
||||
def _extract_images_from_page(self, page: pypdf.PageObject) -> str:
|
||||
"""Extract images from page and get the text with RapidOCR."""
|
||||
if not self.extract_images or "/XObject" not in page["/Resources"].keys(): # type: ignore[attr-defined]
|
||||
return ""
|
||||
@ -307,9 +305,7 @@ class PyMuPDFParser(BaseBlobParser):
|
||||
for page in doc
|
||||
]
|
||||
|
||||
def _get_page_content(
|
||||
self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob
|
||||
) -> str:
|
||||
def _get_page_content(self, doc: fitz.Document, page: fitz.Page, blob: Blob) -> str:
|
||||
"""
|
||||
Get the text of the page using PyMuPDF and RapidOCR and issue a warning
|
||||
if it is empty.
|
||||
@ -327,7 +323,7 @@ class PyMuPDFParser(BaseBlobParser):
|
||||
return content
|
||||
|
||||
def _extract_metadata(
|
||||
self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob
|
||||
self, doc: fitz.Document, page: fitz.Page, blob: Blob
|
||||
) -> dict:
|
||||
"""Extract metadata from the document and page."""
|
||||
return dict(
|
||||
@ -344,9 +340,7 @@ class PyMuPDFParser(BaseBlobParser):
|
||||
},
|
||||
)
|
||||
|
||||
def _extract_images_from_page(
|
||||
self, doc: fitz.fitz.Document, page: fitz.fitz.Page
|
||||
) -> str:
|
||||
def _extract_images_from_page(self, doc: fitz.Document, page: fitz.Page) -> str:
|
||||
"""Extract images from page and get the text with RapidOCR."""
|
||||
if not self.extract_images:
|
||||
return ""
|
||||
@ -558,7 +552,7 @@ class AmazonTextractPDFParser(BaseBlobParser):
|
||||
textract_features: Optional[Sequence[int]] = None,
|
||||
client: Optional[Any] = None,
|
||||
*,
|
||||
linearization_config: Optional["TextLinearizationConfig"] = None,
|
||||
linearization_config: Optional[TextLinearizationConfig] = None,
|
||||
) -> None:
|
||||
"""Initializes the parser.
|
||||
|
||||
|
@ -6,17 +6,17 @@ import tempfile
|
||||
import time
|
||||
from abc import ABC
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from pathlib import Path, PurePath
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
BinaryIO,
|
||||
Iterator,
|
||||
List,
|
||||
Mapping,
|
||||
Optional,
|
||||
Sequence,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from urllib.parse import urlparse
|
||||
|
||||
@ -68,7 +68,7 @@ class UnstructuredPDFLoader(UnstructuredFileLoader):
|
||||
https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf
|
||||
"""
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
def _get_elements(self) -> list:
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
|
||||
return partition_pdf(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type]
|
||||
@ -81,7 +81,9 @@ class BasePDFLoader(BaseLoader, ABC):
|
||||
clean up the temporary file after completion.
|
||||
"""
|
||||
|
||||
def __init__(self, file_path: Union[str, Path], *, headers: Optional[Dict] = None):
|
||||
def __init__(
|
||||
self, file_path: Union[str, PurePath], *, headers: Optional[dict] = None
|
||||
):
|
||||
"""Initialize with a file path.
|
||||
|
||||
Args:
|
||||
@ -154,7 +156,7 @@ class BasePDFLoader(BaseLoader, ABC):
|
||||
class OnlinePDFLoader(BasePDFLoader):
|
||||
"""Load online `PDF`."""
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
def load(self) -> list[Document]:
|
||||
"""Load documents."""
|
||||
loader = UnstructuredPDFLoader(str(self.file_path))
|
||||
return loader.load()
|
||||
@ -223,13 +225,13 @@ class PyPDFLoader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, PurePath],
|
||||
password: Optional[Union[str, bytes]] = None,
|
||||
headers: Optional[Dict] = None,
|
||||
headers: Optional[dict] = None,
|
||||
extract_images: bool = False,
|
||||
*,
|
||||
extraction_mode: str = "plain",
|
||||
extraction_kwargs: Optional[Dict] = None,
|
||||
extraction_kwargs: Optional[dict] = None,
|
||||
) -> None:
|
||||
"""Initialize with a file path."""
|
||||
try:
|
||||
@ -262,9 +264,9 @@ class PyPDFium2Loader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, PurePath],
|
||||
*,
|
||||
headers: Optional[Dict] = None,
|
||||
headers: Optional[dict] = None,
|
||||
extract_images: bool = False,
|
||||
):
|
||||
"""Initialize with a file path."""
|
||||
@ -290,7 +292,7 @@ class PyPDFDirectoryLoader(BaseLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: Union[str, Path],
|
||||
path: Union[str, PurePath],
|
||||
glob: str = "**/[!.]*.pdf",
|
||||
silent_errors: bool = False,
|
||||
load_hidden: bool = False,
|
||||
@ -308,7 +310,7 @@ class PyPDFDirectoryLoader(BaseLoader):
|
||||
def _is_visible(path: Path) -> bool:
|
||||
return not any(part.startswith(".") for part in path.parts)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
def load(self) -> list[Document]:
|
||||
p = Path(self.path)
|
||||
docs = []
|
||||
items = p.rglob(self.glob) if self.recursive else p.glob(self.glob)
|
||||
@ -334,9 +336,9 @@ class PDFMinerLoader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, PurePath],
|
||||
*,
|
||||
headers: Optional[Dict] = None,
|
||||
headers: Optional[dict] = None,
|
||||
extract_images: bool = False,
|
||||
concatenate_pages: bool = True,
|
||||
) -> None:
|
||||
@ -374,7 +376,9 @@ class PDFMinerLoader(BasePDFLoader):
|
||||
class PDFMinerPDFasHTMLLoader(BasePDFLoader):
|
||||
"""Load `PDF` files as HTML content using `PDFMiner`."""
|
||||
|
||||
def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
|
||||
def __init__(
|
||||
self, file_path: Union[str, PurePath], *, headers: Optional[dict] = None
|
||||
):
|
||||
"""Initialize with a file path."""
|
||||
try:
|
||||
from pdfminer.high_level import extract_text_to_fp # noqa:F401
|
||||
@ -395,14 +399,14 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
|
||||
output_string = StringIO()
|
||||
with open_filename(self.file_path, "rb") as fp:
|
||||
extract_text_to_fp(
|
||||
fp,
|
||||
cast(BinaryIO, fp),
|
||||
output_string,
|
||||
codec="",
|
||||
laparams=LAParams(),
|
||||
output_type="html",
|
||||
)
|
||||
metadata = {
|
||||
"source": self.file_path if self.web_path is None else self.web_path
|
||||
"source": str(self.file_path) if self.web_path is None else self.web_path
|
||||
}
|
||||
yield Document(page_content=output_string.getvalue(), metadata=metadata)
|
||||
|
||||
@ -412,9 +416,9 @@ class PyMuPDFLoader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, PurePath],
|
||||
*,
|
||||
headers: Optional[Dict] = None,
|
||||
headers: Optional[dict] = None,
|
||||
extract_images: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
@ -447,7 +451,7 @@ class PyMuPDFLoader(BasePDFLoader):
|
||||
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
|
||||
yield from parser.lazy_parse(blob)
|
||||
|
||||
def load(self, **kwargs: Any) -> List[Document]:
|
||||
def load(self, **kwargs: Any) -> list[Document]:
|
||||
return list(self._lazy_load(**kwargs))
|
||||
|
||||
def lazy_load(self) -> Iterator[Document]:
|
||||
@ -461,11 +465,11 @@ class MathpixPDFLoader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, PurePath],
|
||||
processed_file_format: str = "md",
|
||||
max_wait_time_seconds: int = 500,
|
||||
should_clean_pdf: bool = False,
|
||||
extra_request_data: Optional[Dict[str, Any]] = None,
|
||||
extra_request_data: Optional[dict[str, Any]] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Initialize with a file path.
|
||||
@ -499,7 +503,7 @@ class MathpixPDFLoader(BasePDFLoader):
|
||||
self.should_clean_pdf = should_clean_pdf
|
||||
|
||||
@property
|
||||
def _mathpix_headers(self) -> Dict[str, str]:
|
||||
def _mathpix_headers(self) -> dict[str, str]:
|
||||
return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key}
|
||||
|
||||
@property
|
||||
@ -515,7 +519,7 @@ class MathpixPDFLoader(BasePDFLoader):
|
||||
return {"options_json": json.dumps(options)}
|
||||
|
||||
def send_pdf(self) -> str:
|
||||
with open(self.file_path, "rb") as f:
|
||||
with open(str(self.file_path), "rb") as f:
|
||||
files = {"file": f}
|
||||
response = requests.post(
|
||||
self.url, headers=self._mathpix_headers, files=files, data=self.data
|
||||
@ -562,7 +566,7 @@ class MathpixPDFLoader(BasePDFLoader):
|
||||
# This indicates an error with the PDF processing
|
||||
raise ValueError("Unable to retrieve PDF from Mathpix")
|
||||
else:
|
||||
print(f"Status: {status}, waiting for processing to complete") # noqa: T201
|
||||
logger.info("Status: %s, waiting for processing to complete", status)
|
||||
time.sleep(5)
|
||||
raise TimeoutError
|
||||
|
||||
@ -572,8 +576,7 @@ class MathpixPDFLoader(BasePDFLoader):
|
||||
response = requests.get(url, headers=self._mathpix_headers)
|
||||
return response.content.decode("utf-8")
|
||||
|
||||
@staticmethod
|
||||
def clean_pdf(contents: str) -> str:
|
||||
def clean_pdf(self, contents: str) -> str:
|
||||
"""Clean the PDF file.
|
||||
|
||||
Args:
|
||||
@ -596,7 +599,7 @@ class MathpixPDFLoader(BasePDFLoader):
|
||||
)
|
||||
return contents
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
def load(self) -> list[Document]:
|
||||
pdf_id = self.send_pdf()
|
||||
contents = self.get_processed_pdf(pdf_id)
|
||||
if self.should_clean_pdf:
|
||||
@ -610,10 +613,10 @@ class PDFPlumberLoader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, PurePath],
|
||||
text_kwargs: Optional[Mapping[str, Any]] = None,
|
||||
dedupe: bool = False,
|
||||
headers: Optional[Dict] = None,
|
||||
headers: Optional[dict] = None,
|
||||
extract_images: bool = False,
|
||||
) -> None:
|
||||
"""Initialize with a file path."""
|
||||
@ -630,7 +633,7 @@ class PDFPlumberLoader(BasePDFLoader):
|
||||
self.dedupe = dedupe
|
||||
self.extract_images = extract_images
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
def load(self) -> list[Document]:
|
||||
"""Load file."""
|
||||
|
||||
parser = PDFPlumberParser(
|
||||
@ -669,13 +672,13 @@ class AmazonTextractPDFLoader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, PurePath],
|
||||
textract_features: Optional[Sequence[str]] = None,
|
||||
client: Optional[Any] = None,
|
||||
credentials_profile_name: Optional[str] = None,
|
||||
region_name: Optional[str] = None,
|
||||
endpoint_url: Optional[str] = None,
|
||||
headers: Optional[Dict] = None,
|
||||
headers: Optional[dict] = None,
|
||||
*,
|
||||
linearization_config: Optional["TextLinearizationConfig"] = None,
|
||||
) -> None:
|
||||
@ -743,7 +746,7 @@ class AmazonTextractPDFLoader(BasePDFLoader):
|
||||
linearization_config=linearization_config,
|
||||
)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
def load(self) -> list[Document]:
|
||||
"""Load given path as pages."""
|
||||
return list(self.lazy_load())
|
||||
|
||||
@ -758,7 +761,7 @@ class AmazonTextractPDFLoader(BasePDFLoader):
|
||||
if self.web_path and self._is_s3_url(self.web_path):
|
||||
blob = Blob(path=self.web_path) # type: ignore[call-arg] # type: ignore[misc]
|
||||
else:
|
||||
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
|
||||
blob = Blob.from_path(self.file_path)
|
||||
if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1:
|
||||
raise ValueError(
|
||||
f"the file {blob.path} is a multi-page document, \
|
||||
@ -792,7 +795,9 @@ class AmazonTextractPDFLoader(BasePDFLoader):
|
||||
elif blob.mimetype in ["image/png", "image/jpeg"]: # type: ignore[attr-defined]
|
||||
return 1
|
||||
else:
|
||||
raise ValueError(f"unsupported mime type: {blob.mimetype}") # type: ignore[attr-defined]
|
||||
raise ValueError( # type: ignore[attr-defined]
|
||||
f"unsupported mime type: {blob.mimetype}"
|
||||
)
|
||||
|
||||
|
||||
class DedocPDFLoader(DedocBaseLoader):
|
||||
@ -887,7 +892,7 @@ class DedocPDFLoader(DedocBaseLoader):
|
||||
from dedoc.utils.langchain import make_manager_pdf_config
|
||||
|
||||
return make_manager_pdf_config(
|
||||
file_path=self.file_path,
|
||||
file_path=str(self.file_path),
|
||||
parsing_params=self.parsing_parameters,
|
||||
split=self.split,
|
||||
)
|
||||
@ -898,10 +903,10 @@ class DocumentIntelligenceLoader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, PurePath],
|
||||
client: Any,
|
||||
model: str = "prebuilt-document",
|
||||
headers: Optional[Dict] = None,
|
||||
headers: Optional[dict] = None,
|
||||
) -> None:
|
||||
"""
|
||||
Initialize the object for file processing with Azure Document Intelligence
|
||||
@ -930,10 +935,10 @@ class DocumentIntelligenceLoader(BasePDFLoader):
|
||||
... )
|
||||
"""
|
||||
|
||||
self.parser = DocumentIntelligenceParser(client=client, model=model)
|
||||
super().__init__(file_path, headers=headers)
|
||||
self.parser = DocumentIntelligenceParser(client=client, model=model)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
def load(self) -> list[Document]:
|
||||
"""Load given path as pages."""
|
||||
return list(self.lazy_load())
|
||||
|
||||
@ -964,7 +969,7 @@ class ZeroxPDFLoader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
file_path: Union[str, PurePath],
|
||||
model: str = "gpt-4o-mini",
|
||||
**zerox_kwargs: Any,
|
||||
) -> None:
|
||||
@ -1005,7 +1010,7 @@ class ZeroxPDFLoader(BasePDFLoader):
|
||||
|
||||
# Directly call asyncio.run to execute zerox synchronously
|
||||
zerox_output = asyncio.run(
|
||||
zerox(file_path=self.file_path, model=self.model, **self.zerox_kwargs)
|
||||
zerox(file_path=str(self.file_path), model=self.model, **self.zerox_kwargs)
|
||||
)
|
||||
|
||||
# Convert zerox output to Document instances and yield them
|
||||
|
@ -61,7 +61,7 @@ def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) ->
|
||||
assert metadata["source"] == str(LAYOUT_PARSER_PAPER_PDF)
|
||||
|
||||
if splits_by_page:
|
||||
assert metadata["page"] == 0
|
||||
assert int(metadata["page"]) == 0
|
||||
|
||||
|
||||
def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False) -> None:
|
||||
|
@ -1,3 +1,4 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Sequence, Union
|
||||
|
||||
@ -17,7 +18,7 @@ from langchain_community.document_loaders import (
|
||||
def test_unstructured_pdf_loader_elements_mode() -> None:
|
||||
"""Test unstructured loader with various modes."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = UnstructuredPDFLoader(str(file_path), mode="elements")
|
||||
loader = UnstructuredPDFLoader(file_path, mode="elements")
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 2
|
||||
@ -26,7 +27,7 @@ def test_unstructured_pdf_loader_elements_mode() -> None:
|
||||
def test_unstructured_pdf_loader_paged_mode() -> None:
|
||||
"""Test unstructured loader with various modes."""
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = UnstructuredPDFLoader(str(file_path), mode="paged")
|
||||
loader = UnstructuredPDFLoader(file_path, mode="paged")
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 16
|
||||
@ -35,7 +36,7 @@ def test_unstructured_pdf_loader_paged_mode() -> None:
|
||||
def test_unstructured_pdf_loader_default_mode() -> None:
|
||||
"""Test unstructured loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = UnstructuredPDFLoader(str(file_path))
|
||||
loader = UnstructuredPDFLoader(file_path)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
@ -44,26 +45,26 @@ def test_unstructured_pdf_loader_default_mode() -> None:
|
||||
def test_pdfminer_loader() -> None:
|
||||
"""Test PDFMiner loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PDFMinerLoader(str(file_path))
|
||||
loader = PDFMinerLoader(file_path)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PDFMinerLoader(str(file_path))
|
||||
loader = PDFMinerLoader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
|
||||
# Verify that concatenating pages parameter works
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PDFMinerLoader(str(file_path), concatenate_pages=True)
|
||||
loader = PDFMinerLoader(file_path, concatenate_pages=True)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PDFMinerLoader(str(file_path), concatenate_pages=False)
|
||||
loader = PDFMinerLoader(file_path, concatenate_pages=False)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 16
|
||||
@ -72,13 +73,13 @@ def test_pdfminer_loader() -> None:
|
||||
def test_pdfminer_pdf_as_html_loader() -> None:
|
||||
"""Test PDFMinerPDFasHTMLLoader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PDFMinerPDFasHTMLLoader(str(file_path))
|
||||
loader = PDFMinerPDFasHTMLLoader(file_path)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PDFMinerPDFasHTMLLoader(str(file_path))
|
||||
loader = PDFMinerPDFasHTMLLoader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
@ -87,13 +88,13 @@ def test_pdfminer_pdf_as_html_loader() -> None:
|
||||
def test_pypdfium2_loader() -> None:
|
||||
"""Test PyPDFium2Loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PyPDFium2Loader(str(file_path))
|
||||
loader = PyPDFium2Loader(file_path)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PyPDFium2Loader(str(file_path))
|
||||
loader = PyPDFium2Loader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 16
|
||||
@ -102,13 +103,13 @@ def test_pypdfium2_loader() -> None:
|
||||
def test_pymupdf_loader() -> None:
|
||||
"""Test PyMuPDF loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PyMuPDFLoader(str(file_path))
|
||||
loader = PyMuPDFLoader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PyMuPDFLoader(str(file_path))
|
||||
loader = PyMuPDFLoader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 16
|
||||
@ -123,20 +124,21 @@ def test_pymupdf_loader() -> None:
|
||||
assert len(docs) == 1
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not os.environ.get("MATHPIX_API_KEY"), reason="Mathpix API key not found"
|
||||
)
|
||||
def test_mathpix_loader() -> None:
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = MathpixPDFLoader(str(file_path))
|
||||
loader = MathpixPDFLoader(file_path)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
print(docs[0].page_content) # noqa: T201
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = MathpixPDFLoader(str(file_path))
|
||||
loader = MathpixPDFLoader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
print(docs[0].page_content) # noqa: T201
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -187,8 +189,8 @@ def test_mathpix_loader() -> None:
|
||||
1,
|
||||
False,
|
||||
),
|
||||
(str(Path(__file__).parent.parent / "examples/hello.pdf"), ["FORMS"], 1, False),
|
||||
(str(Path(__file__).parent.parent / "examples/hello.pdf"), [], 1, False),
|
||||
(Path(__file__).parent.parent / "examples/hello.pdf", ["FORMS"], 1, False),
|
||||
(Path(__file__).parent.parent / "examples/hello.pdf", [], 1, False),
|
||||
(
|
||||
"s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf",
|
||||
["FORMS", "TABLES", "LAYOUT"],
|
||||
@ -222,7 +224,7 @@ def test_amazontextract_loader(
|
||||
@pytest.mark.skip(reason="Requires AWS credentials to run")
|
||||
def test_amazontextract_loader_failures() -> None:
|
||||
# 2-page PDF local file system
|
||||
two_page_pdf = str(
|
||||
two_page_pdf = (
|
||||
Path(__file__).parent.parent / "examples/multi-page-forms-sample-2-page.pdf"
|
||||
)
|
||||
loader = AmazonTextractPDFLoader(two_page_pdf)
|
||||
|
@ -43,6 +43,7 @@ EXPECTED_ALL = [
|
||||
"CassandraLoader",
|
||||
"CSVLoader",
|
||||
"ChatGPTLoader",
|
||||
"CloudBlobLoader",
|
||||
"CoNLLULoader",
|
||||
"CollegeConfidentialLoader",
|
||||
"ConcurrentLoader",
|
||||
|
Loading…
Reference in New Issue
Block a user