mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-21 14:18:52 +00:00
Clean up tests for pdf parsers (#4595)
# Organize tests for pdf parsers: clean up the tests, remove duplicate tests, and convert them to unit tests.
This commit is contained in:
parent
70fd7cda14
commit
09587a3201
@ -93,12 +93,20 @@ class PyPDFium2Parser(BaseBlobParser):
|
|||||||
"""Lazily parse the blob."""
|
"""Lazily parse the blob."""
|
||||||
import pypdfium2
|
import pypdfium2
|
||||||
|
|
||||||
with blob.as_bytes_io() as f:
|
# pypdfium2 is really finicky with respect to closing things,
|
||||||
pdf_reader = pypdfium2.PdfDocument(f)
|
# if done incorrectly, it causes segfaults.
|
||||||
for page_number, page in enumerate(pdf_reader):
|
with blob.as_bytes_io() as file_path:
|
||||||
content = page.get_textpage().get_text_range()
|
pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
|
||||||
metadata = {"source": blob.source, "page": page_number}
|
try:
|
||||||
yield Document(page_content=content, metadata=metadata)
|
for page_number, page in enumerate(pdf_reader):
|
||||||
|
text_page = page.get_textpage()
|
||||||
|
content = text_page.get_text_range()
|
||||||
|
text_page.close()
|
||||||
|
page.close()
|
||||||
|
metadata = {"source": blob.source, "page": page_number}
|
||||||
|
yield Document(page_content=content, metadata=metadata)
|
||||||
|
finally:
|
||||||
|
pdf_reader.close()
|
||||||
|
|
||||||
|
|
||||||
class PDFPlumberParser(BaseBlobParser):
|
class PDFPlumberParser(BaseBlobParser):
|
||||||
|
592
poetry.lock
generated
592
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -83,6 +83,8 @@ docarray = {version="^0.31.0", optional=true}
|
|||||||
protobuf = {version="3.19", optional=true}
|
protobuf = {version="3.19", optional=true}
|
||||||
hnswlib = {version="^0.7.0", optional=true}
|
hnswlib = {version="^0.7.0", optional=true}
|
||||||
lxml = {version = "^4.9.2", optional = true}
|
lxml = {version = "^4.9.2", optional = true}
|
||||||
|
pymupdf = {version = "^1.22.3", optional = true}
|
||||||
|
pypdfium2 = {version = "^4.10.0", optional = true}
|
||||||
|
|
||||||
|
|
||||||
[tool.poetry.group.docs.dependencies]
|
[tool.poetry.group.docs.dependencies]
|
||||||
@ -177,6 +179,8 @@ extended_testing = [
|
|||||||
"jq",
|
"jq",
|
||||||
"pdfminer.six",
|
"pdfminer.six",
|
||||||
"pypdf",
|
"pypdf",
|
||||||
|
"pymupdf",
|
||||||
|
"pypdfium2",
|
||||||
"tqdm",
|
"tqdm",
|
||||||
"lxml",
|
"lxml",
|
||||||
]
|
]
|
||||||
|
@ -7,6 +7,8 @@ from langchain.document_loaders.base import BaseBlobParser
|
|||||||
from langchain.document_loaders.blob_loaders import Blob
|
from langchain.document_loaders.blob_loaders import Blob
|
||||||
from langchain.document_loaders.parsers.pdf import (
|
from langchain.document_loaders.parsers.pdf import (
|
||||||
PDFMinerParser,
|
PDFMinerParser,
|
||||||
|
PyMuPDFParser,
|
||||||
|
PyPDFium2Parser,
|
||||||
PyPDFParser,
|
PyPDFParser,
|
||||||
)
|
)
|
||||||
from tests.data import HELLO_PDF, LAYOUT_PARSER_PAPER_PDF
|
from tests.data import HELLO_PDF, LAYOUT_PARSER_PAPER_PDF
|
||||||
@ -62,3 +64,16 @@ def test_pdfminer_parser() -> None:
|
|||||||
"""Test PDFMiner parser."""
|
"""Test PDFMiner parser."""
|
||||||
# Does not follow defaults to split by page.
|
# Does not follow defaults to split by page.
|
||||||
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
|
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("fitz") # package is PyMuPDF
|
||||||
|
def test_pymupdf_loader() -> None:
|
||||||
|
"""Test PyMuPDF loader."""
|
||||||
|
_assert_with_parser(PyMuPDFParser())
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("pypdfium2")
|
||||||
|
def test_pypdfium2_parser() -> None:
|
||||||
|
"""Test PyPDFium2 parser."""
|
||||||
|
# Uses the default _assert_with_parser settings (no splits_by_page override).
|
||||||
|
_assert_with_parser(PyPDFium2Parser())
|
||||||
|
Loading…
Reference in New Issue
Block a user