mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-25 04:49:17 +00:00
community[patch]: Refactoring PDF loaders: 01 prepare (#29062)
- **Refactoring PDF loaders step 1**: "community: Refactoring PDF loaders to standardize approaches"
- **Description:** Declare `CloudBlobLoader` in `__init__.py`. `file_path` is now typed as `Union[str, PurePath]` everywhere.
- **Twitter handle:** pprados

This is one part of a larger Pull Request (PR) that is too large to be submitted all at once. This specific part focuses on preparing the update of all parsers. For more details, see [PR 28970](https://github.com/langchain-ai/langchain/pull/28970). @eyurtsev this is the start of a PR series.
This commit is contained in:
@@ -61,7 +61,7 @@ def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) ->
|
||||
assert metadata["source"] == str(LAYOUT_PARSER_PAPER_PDF)
|
||||
|
||||
if splits_by_page:
|
||||
assert metadata["page"] == 0
|
||||
assert int(metadata["page"]) == 0
|
||||
|
||||
|
||||
def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False) -> None:
|
||||
|
@@ -1,3 +1,4 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Sequence, Union
|
||||
|
||||
@@ -17,7 +18,7 @@ from langchain_community.document_loaders import (
|
||||
def test_unstructured_pdf_loader_elements_mode() -> None:
|
||||
"""Test unstructured loader with various modes."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = UnstructuredPDFLoader(str(file_path), mode="elements")
|
||||
loader = UnstructuredPDFLoader(file_path, mode="elements")
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 2
|
||||
@@ -26,7 +27,7 @@ def test_unstructured_pdf_loader_elements_mode() -> None:
|
||||
def test_unstructured_pdf_loader_paged_mode() -> None:
|
||||
"""Test unstructured loader with various modes."""
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = UnstructuredPDFLoader(str(file_path), mode="paged")
|
||||
loader = UnstructuredPDFLoader(file_path, mode="paged")
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 16
|
||||
@@ -35,7 +36,7 @@ def test_unstructured_pdf_loader_paged_mode() -> None:
|
||||
def test_unstructured_pdf_loader_default_mode() -> None:
|
||||
"""Test unstructured loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = UnstructuredPDFLoader(str(file_path))
|
||||
loader = UnstructuredPDFLoader(file_path)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
@@ -44,26 +45,26 @@ def test_unstructured_pdf_loader_default_mode() -> None:
|
||||
def test_pdfminer_loader() -> None:
|
||||
"""Test PDFMiner loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PDFMinerLoader(str(file_path))
|
||||
loader = PDFMinerLoader(file_path)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PDFMinerLoader(str(file_path))
|
||||
loader = PDFMinerLoader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
|
||||
# Verify that concatenating pages parameter works
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PDFMinerLoader(str(file_path), concatenate_pages=True)
|
||||
loader = PDFMinerLoader(file_path, concatenate_pages=True)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PDFMinerLoader(str(file_path), concatenate_pages=False)
|
||||
loader = PDFMinerLoader(file_path, concatenate_pages=False)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 16
|
||||
@@ -72,13 +73,13 @@ def test_pdfminer_loader() -> None:
|
||||
def test_pdfminer_pdf_as_html_loader() -> None:
|
||||
"""Test PDFMinerPDFasHTMLLoader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PDFMinerPDFasHTMLLoader(str(file_path))
|
||||
loader = PDFMinerPDFasHTMLLoader(file_path)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PDFMinerPDFasHTMLLoader(str(file_path))
|
||||
loader = PDFMinerPDFasHTMLLoader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
@@ -87,13 +88,13 @@ def test_pdfminer_pdf_as_html_loader() -> None:
|
||||
def test_pypdfium2_loader() -> None:
|
||||
"""Test PyPDFium2Loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PyPDFium2Loader(str(file_path))
|
||||
loader = PyPDFium2Loader(file_path)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PyPDFium2Loader(str(file_path))
|
||||
loader = PyPDFium2Loader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 16
|
||||
@@ -102,13 +103,13 @@ def test_pypdfium2_loader() -> None:
|
||||
def test_pymupdf_loader() -> None:
|
||||
"""Test PyMuPDF loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PyMuPDFLoader(str(file_path))
|
||||
loader = PyMuPDFLoader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PyMuPDFLoader(str(file_path))
|
||||
loader = PyMuPDFLoader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 16
|
||||
@@ -123,20 +124,21 @@ def test_pymupdf_loader() -> None:
|
||||
assert len(docs) == 1
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not os.environ.get("MATHPIX_API_KEY"), reason="Mathpix API key not found"
|
||||
)
|
||||
def test_mathpix_loader() -> None:
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = MathpixPDFLoader(str(file_path))
|
||||
loader = MathpixPDFLoader(file_path)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
print(docs[0].page_content) # noqa: T201
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = MathpixPDFLoader(str(file_path))
|
||||
loader = MathpixPDFLoader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
print(docs[0].page_content) # noqa: T201
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -187,8 +189,8 @@ def test_mathpix_loader() -> None:
|
||||
1,
|
||||
False,
|
||||
),
|
||||
(str(Path(__file__).parent.parent / "examples/hello.pdf"), ["FORMS"], 1, False),
|
||||
(str(Path(__file__).parent.parent / "examples/hello.pdf"), [], 1, False),
|
||||
(Path(__file__).parent.parent / "examples/hello.pdf", ["FORMS"], 1, False),
|
||||
(Path(__file__).parent.parent / "examples/hello.pdf", [], 1, False),
|
||||
(
|
||||
"s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf",
|
||||
["FORMS", "TABLES", "LAYOUT"],
|
||||
@@ -222,7 +224,7 @@ def test_amazontextract_loader(
|
||||
@pytest.mark.skip(reason="Requires AWS credentials to run")
|
||||
def test_amazontextract_loader_failures() -> None:
|
||||
# 2-page PDF local file system
|
||||
two_page_pdf = str(
|
||||
two_page_pdf = (
|
||||
Path(__file__).parent.parent / "examples/multi-page-forms-sample-2-page.pdf"
|
||||
)
|
||||
loader = AmazonTextractPDFLoader(two_page_pdf)
|
||||
|
@@ -43,6 +43,7 @@ EXPECTED_ALL = [
|
||||
"CassandraLoader",
|
||||
"CSVLoader",
|
||||
"ChatGPTLoader",
|
||||
"CloudBlobLoader",
|
||||
"CoNLLULoader",
|
||||
"CollegeConfidentialLoader",
|
||||
"ConcurrentLoader",
|
||||
|
Reference in New Issue
Block a user