mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-25 08:03:39 +00:00
community[minor]: 05 - Refactoring PyPDFium2 parser (#29625)
This is one part of a larger Pull Request (PR) that is too large to be submitted all at once. This specific part focuses on updating the PyPDFium2 parser. For more details, see https://github.com/langchain-ai/langchain/pull/28970.
This commit is contained in:
parent
723031d548
commit
beb75b2150
File diff suppressed because it is too large
Load Diff
@ -1158,50 +1158,216 @@ class PyMuPDFParser(BaseBlobParser):
|
|||||||
|
|
||||||
|
|
||||||
class PyPDFium2Parser(BaseBlobParser):
|
class PyPDFium2Parser(BaseBlobParser):
|
||||||
"""Parse `PDF` with `PyPDFium2`."""
|
"""Parse a blob from a PDF using `PyPDFium2` library.
|
||||||
|
|
||||||
def __init__(self, extract_images: bool = False) -> None:
|
This class provides methods to parse a blob from a PDF document, supporting various
|
||||||
"""Initialize the parser."""
|
configurations such as handling password-protected PDFs, extracting images, and
|
||||||
|
defining extraction mode.
|
||||||
|
It integrates the 'PyPDFium2' library for PDF processing and offers synchronous
|
||||||
|
blob parsing.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
Setup:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
pip install -U langchain-community pypdfium2
|
||||||
|
|
||||||
|
Load a blob from a PDF file:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
from langchain_core.documents.base import Blob
|
||||||
|
|
||||||
|
blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
|
||||||
|
|
||||||
|
Instantiate the parser:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
from langchain_community.document_loaders.parsers import PyPDFium2Parser
|
||||||
|
|
||||||
|
parser = PyPDFium2Parser(
|
||||||
|
# password=None,
|
||||||
|
mode="page",
|
||||||
|
pages_delimiter="\n\f",
|
||||||
|
# extract_images = True,
|
||||||
|
# images_parser = RapidOCRBlobParser(),
|
||||||
|
)
|
||||||
|
|
||||||
|
Lazily parse the blob:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
docs = []
|
||||||
|
docs_lazy = parser.lazy_parse(blob)
|
||||||
|
|
||||||
|
for doc in docs_lazy:
|
||||||
|
docs.append(doc)
|
||||||
|
print(docs[0].page_content[:100])
|
||||||
|
print(docs[0].metadata)
|
||||||
|
"""
|
||||||
|
|
||||||
|
# PyPDFium2 is not thread safe.
|
||||||
|
# See https://pypdfium2.readthedocs.io/en/stable/python_api.html#thread-incompatibility
|
||||||
|
_lock = threading.Lock()
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
extract_images: bool = False,
|
||||||
|
*,
|
||||||
|
password: Optional[str] = None,
|
||||||
|
mode: Literal["single", "page"] = "page",
|
||||||
|
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
|
||||||
|
images_parser: Optional[BaseImageBlobParser] = None,
|
||||||
|
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
|
||||||
|
) -> None:
|
||||||
|
"""Initialize a parser based on PyPDFium2.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
password: Optional password for opening encrypted PDFs.
|
||||||
|
mode: The extraction mode, either "single" for the entire document or "page"
|
||||||
|
for page-wise extraction.
|
||||||
|
pages_delimiter: A string delimiter to separate pages in single-mode
|
||||||
|
extraction.
|
||||||
|
extract_images: Whether to extract images from the PDF.
|
||||||
|
images_parser: Optional image blob parser.
|
||||||
|
images_inner_format: The format for the parsed output.
|
||||||
|
- "text" = return the content as is
|
||||||
|
- "markdown-img" = wrap the content into an image markdown link, w/ link
|
||||||
|
pointing to (`![body](#)`)
|
||||||
|
- "html-img" = wrap the content as the `alt` text of an `<img>` tag and link to
|
||||||
|
(`<img alt="{body}" src="#"/>`)
|
||||||
|
extraction_mode: “plain” for legacy functionality, “layout” for experimental
|
||||||
|
layout mode functionality
|
||||||
|
extraction_kwargs: Optional additional parameters for the extraction
|
||||||
|
process.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
This method does not directly return data. Use the `parse` or `lazy_parse`
|
||||||
|
methods to retrieve parsed documents with content and metadata.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If the mode is not "single" or "page".
|
||||||
|
"""
|
||||||
|
super().__init__()
|
||||||
|
if mode not in ["single", "page"]:
|
||||||
|
raise ValueError("mode must be single or page")
|
||||||
|
self.extract_images = extract_images
|
||||||
|
if extract_images and not images_parser:
|
||||||
|
images_parser = RapidOCRBlobParser()
|
||||||
|
self.images_parser = images_parser
|
||||||
|
self.images_inner_format = images_inner_format
|
||||||
|
self.password = password
|
||||||
|
self.mode = mode
|
||||||
|
self.pages_delimiter = pages_delimiter
|
||||||
|
|
||||||
|
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
|
||||||
|
"""
|
||||||
|
Lazily parse the blob.
|
||||||
|
Insert image, if possible, between two paragraphs.
|
||||||
|
In this way, a paragraph can be continued on the next page.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
blob: The blob to parse.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ImportError: If the `pypdfium2` package is not found.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
An iterator over the parsed documents.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
import pypdfium2 # noqa:F401
|
import pypdfium2
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"pypdfium2 package not found, please install it with"
|
"pypdfium2 package not found, please install it with"
|
||||||
" `pip install pypdfium2`"
|
" `pip install pypdfium2`"
|
||||||
)
|
)
|
||||||
self.extract_images = extract_images
|
|
||||||
|
|
||||||
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
|
|
||||||
"""Lazily parse the blob."""
|
|
||||||
import pypdfium2
|
|
||||||
|
|
||||||
# pypdfium2 is really finicky with respect to closing things,
|
# pypdfium2 is really finicky with respect to closing things,
|
||||||
# if done incorrectly creates seg faults.
|
# if done incorrectly creates seg faults.
|
||||||
with blob.as_bytes_io() as file_path: # type: ignore[attr-defined]
|
with PyPDFium2Parser._lock:
|
||||||
pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
|
with blob.as_bytes_io() as file_path: # type: ignore[attr-defined]
|
||||||
try:
|
pdf_reader = None
|
||||||
for page_number, page in enumerate(pdf_reader):
|
try:
|
||||||
text_page = page.get_textpage()
|
pdf_reader = pypdfium2.PdfDocument(
|
||||||
content = text_page.get_text_range()
|
file_path, password=self.password, autoclose=True
|
||||||
text_page.close()
|
)
|
||||||
content += "\n" + self._extract_images_from_page(page)
|
full_content = []
|
||||||
page.close()
|
|
||||||
metadata = {"source": blob.source, "page": page_number} # type: ignore[attr-defined]
|
doc_metadata = _purge_metadata(pdf_reader.get_metadata_dict())
|
||||||
yield Document(page_content=content, metadata=metadata)
|
doc_metadata["source"] = blob.source
|
||||||
finally:
|
doc_metadata["total_pages"] = len(pdf_reader)
|
||||||
pdf_reader.close()
|
|
||||||
|
for page_number, page in enumerate(pdf_reader):
|
||||||
|
text_page = page.get_textpage()
|
||||||
|
text_from_page = "\n".join(
|
||||||
|
text_page.get_text_range().splitlines()
|
||||||
|
) # Replace \r\n
|
||||||
|
text_page.close()
|
||||||
|
image_from_page = self._extract_images_from_page(page)
|
||||||
|
all_text = _merge_text_and_extras(
|
||||||
|
[image_from_page], text_from_page
|
||||||
|
).strip()
|
||||||
|
page.close()
|
||||||
|
|
||||||
|
if self.mode == "page":
|
||||||
|
# For legacy compatibility, add the last '\n'
|
||||||
|
if not all_text.endswith("\n"):
|
||||||
|
all_text += "\n"
|
||||||
|
yield Document(
|
||||||
|
page_content=all_text,
|
||||||
|
metadata=_validate_metadata(
|
||||||
|
{
|
||||||
|
**doc_metadata,
|
||||||
|
"page": page_number,
|
||||||
|
}
|
||||||
|
),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
full_content.append(all_text)
|
||||||
|
|
||||||
|
if self.mode == "single":
|
||||||
|
yield Document(
|
||||||
|
page_content=self.pages_delimiter.join(full_content),
|
||||||
|
metadata=_validate_metadata(doc_metadata),
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
if pdf_reader:
|
||||||
|
pdf_reader.close()
|
||||||
|
|
||||||
def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> str:
|
def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> str:
|
||||||
"""Extract images from page and get the text with RapidOCR."""
|
"""Extract images from a PDF page and get the text using `images_parser`.
|
||||||
if not self.extract_images:
|
|
||||||
|
Args:
|
||||||
|
page: The page object from which to extract images.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The extracted text from the images on the page.
|
||||||
|
"""
|
||||||
|
if not self.images_parser:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
import pypdfium2.raw as pdfium_c
|
import pypdfium2.raw as pdfium_c
|
||||||
|
|
||||||
images = list(page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,)))
|
images = list(page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,)))
|
||||||
|
if not images:
|
||||||
images = list(map(lambda x: x.get_bitmap().to_numpy(), images))
|
return ""
|
||||||
return extract_from_images_with_rapidocr(images)
|
str_images = []
|
||||||
|
for image in images:
|
||||||
|
image_bytes = io.BytesIO()
|
||||||
|
np_image = image.get_bitmap().to_numpy()
|
||||||
|
if np_image.size < 3:
|
||||||
|
continue
|
||||||
|
numpy.save(image_bytes, image.get_bitmap().to_numpy())
|
||||||
|
blob = Blob.from_data(image_bytes.getvalue(), mime_type="application/x-npy")
|
||||||
|
text_from_image = next(self.images_parser.lazy_parse(blob)).page_content
|
||||||
|
str_images.append(
|
||||||
|
_format_inner_image(blob, text_from_image, self.images_inner_format)
|
||||||
|
)
|
||||||
|
image.close()
|
||||||
|
return _FORMAT_IMAGE_STR.format(image_text=_JOIN_IMAGES.join(str_images))
|
||||||
|
|
||||||
|
|
||||||
class PDFPlumberParser(BaseBlobParser):
|
class PDFPlumberParser(BaseBlobParser):
|
||||||
|
@ -308,25 +308,116 @@ class PyPDFLoader(BasePDFLoader):
|
|||||||
|
|
||||||
|
|
||||||
class PyPDFium2Loader(BasePDFLoader):
|
class PyPDFium2Loader(BasePDFLoader):
|
||||||
"""Load `PDF` using `pypdfium2` and chunks at character level."""
|
"""Load and parse a PDF file using the `pypdfium2` library.
|
||||||
|
|
||||||
|
This class provides methods to load and parse PDF documents, supporting various
|
||||||
|
configurations such as handling password-protected files, extracting images, and
|
||||||
|
defining extraction mode.
|
||||||
|
It integrates the `pypdfium2` library for PDF processing and offers both
|
||||||
|
synchronous and asynchronous document loading.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
Setup:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
pip install -U langchain-community pypdfium2
|
||||||
|
|
||||||
|
Instantiate the loader:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
from langchain_community.document_loaders import PyPDFium2Loader
|
||||||
|
|
||||||
|
loader = PyPDFium2Loader(
|
||||||
|
file_path = "./example_data/layout-parser-paper.pdf",
|
||||||
|
# headers = None,
|
||||||
|
# password = None,
|
||||||
|
mode = "single",
|
||||||
|
pages_delimiter = "\n\f",
|
||||||
|
# extract_images = True,
|
||||||
|
# images_parser = RapidOCRBlobParser(),
|
||||||
|
)
|
||||||
|
|
||||||
|
Lazy load documents:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
docs = []
|
||||||
|
docs_lazy = loader.lazy_load()
|
||||||
|
|
||||||
|
for doc in docs_lazy:
|
||||||
|
docs.append(doc)
|
||||||
|
print(docs[0].page_content[:100])
|
||||||
|
print(docs[0].metadata)
|
||||||
|
|
||||||
|
Load documents asynchronously:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
docs = await loader.aload()
|
||||||
|
print(docs[0].page_content[:100])
|
||||||
|
print(docs[0].metadata)
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
file_path: Union[str, PurePath],
|
file_path: Union[str, PurePath],
|
||||||
*,
|
*,
|
||||||
headers: Optional[dict] = None,
|
mode: Literal["single", "page"] = "page",
|
||||||
|
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
|
||||||
|
password: Optional[str] = None,
|
||||||
extract_images: bool = False,
|
extract_images: bool = False,
|
||||||
|
images_parser: Optional[BaseImageBlobParser] = None,
|
||||||
|
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
|
||||||
|
headers: Optional[dict] = None,
|
||||||
):
|
):
|
||||||
"""Initialize with a file path."""
|
"""Initialize with a file path.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: The path to the PDF file to be loaded.
|
||||||
|
headers: Optional headers to use for GET request to download a file from a
|
||||||
|
web path.
|
||||||
|
password: Optional password for opening encrypted PDFs.
|
||||||
|
mode: The extraction mode, either "single" for the entire document or "page"
|
||||||
|
for page-wise extraction.
|
||||||
|
pages_delimiter: A string delimiter to separate pages in single-mode
|
||||||
|
extraction.
|
||||||
|
extract_images: Whether to extract images from the PDF.
|
||||||
|
images_parser: Optional image blob parser.
|
||||||
|
images_inner_format: The format for the parsed output.
|
||||||
|
- "text" = return the content as is
|
||||||
|
- "markdown-img" = wrap the content into an image markdown link, w/ link
|
||||||
|
pointing to (`![body](#)`)
|
||||||
|
- "html-img" = wrap the content as the `alt` text of an `<img>` tag and link to
|
||||||
|
(`<img alt="{body}" src="#"/>`)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
This class does not directly return data. Use the `load`, `lazy_load` or
|
||||||
|
`aload` methods to retrieve parsed documents with content and metadata.
|
||||||
|
"""
|
||||||
super().__init__(file_path, headers=headers)
|
super().__init__(file_path, headers=headers)
|
||||||
self.parser = PyPDFium2Parser(extract_images=extract_images)
|
self.parser = PyPDFium2Parser(
|
||||||
|
mode=mode,
|
||||||
|
password=password,
|
||||||
|
extract_images=extract_images,
|
||||||
|
images_parser=images_parser,
|
||||||
|
images_inner_format=images_inner_format,
|
||||||
|
pages_delimiter=pages_delimiter,
|
||||||
|
)
|
||||||
|
|
||||||
def lazy_load(
|
def lazy_load(
|
||||||
self,
|
self,
|
||||||
) -> Iterator[Document]:
|
) -> Iterator[Document]:
|
||||||
"""Lazy load given path as pages."""
|
"""
|
||||||
|
Lazy load given path as pages.
|
||||||
|
Insert image, if possible, between two paragraphs.
|
||||||
|
In this way, a paragraph can be continued on the next page.
|
||||||
|
"""
|
||||||
if self.web_path:
|
if self.web_path:
|
||||||
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) # type: ignore[attr-defined]
|
blob = Blob.from_data( # type: ignore[attr-defined]
|
||||||
|
open(self.file_path, "rb").read(), path=self.web_path
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
|
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
|
||||||
yield from self.parser.parse(blob)
|
yield from self.parser.parse(blob)
|
||||||
|
@ -12,7 +12,6 @@ from langchain_community.document_loaders.blob_loaders import Blob
|
|||||||
from langchain_community.document_loaders.parsers import (
|
from langchain_community.document_loaders.parsers import (
|
||||||
BaseImageBlobParser,
|
BaseImageBlobParser,
|
||||||
PDFPlumberParser,
|
PDFPlumberParser,
|
||||||
PyPDFium2Parser,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@ -96,12 +95,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False)
|
|||||||
assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
|
assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
|
||||||
|
|
||||||
|
|
||||||
def test_pypdfium2_parser() -> None:
|
|
||||||
"""Test PyPDFium2 parser."""
|
|
||||||
# Does not follow defaults to split by page.
|
|
||||||
_assert_with_parser(PyPDFium2Parser())
|
|
||||||
|
|
||||||
|
|
||||||
def test_pdfplumber_parser() -> None:
|
def test_pdfplumber_parser() -> None:
|
||||||
"""Test PDFPlumber parser."""
|
"""Test PDFPlumber parser."""
|
||||||
_assert_with_parser(PDFPlumberParser())
|
_assert_with_parser(PDFPlumberParser())
|
||||||
@ -109,11 +102,6 @@ def test_pdfplumber_parser() -> None:
|
|||||||
_assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
|
_assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
|
||||||
|
|
||||||
|
|
||||||
def test_extract_images_text_from_pdf_pypdfium2parser() -> None:
|
|
||||||
"""Test extract image from pdf and recognize text with rapid ocr - PyPDFium2Parser""" # noqa: E501
|
|
||||||
_assert_with_parser(PyPDFium2Parser(extract_images=True))
|
|
||||||
|
|
||||||
|
|
||||||
class EmptyImageBlobParser(BaseImageBlobParser):
|
class EmptyImageBlobParser(BaseImageBlobParser):
|
||||||
def _analyze_image(self, img: "Image") -> str:
|
def _analyze_image(self, img: "Image") -> str:
|
||||||
return "Hello world"
|
return "Hello world"
|
||||||
@ -128,6 +116,7 @@ class EmptyImageBlobParser(BaseImageBlobParser):
|
|||||||
[
|
[
|
||||||
("PDFMinerParser", {}),
|
("PDFMinerParser", {}),
|
||||||
("PyMuPDFParser", {}),
|
("PyMuPDFParser", {}),
|
||||||
|
("PyPDFium2Parser", {}),
|
||||||
("PyPDFParser", {"extraction_mode": "plain"}),
|
("PyPDFParser", {"extraction_mode": "plain"}),
|
||||||
("PyPDFParser", {"extraction_mode": "layout"}),
|
("PyPDFParser", {"extraction_mode": "layout"}),
|
||||||
],
|
],
|
||||||
@ -157,6 +146,7 @@ def test_mode_and_extract_images_variations(
|
|||||||
[
|
[
|
||||||
("PDFMinerParser", {}),
|
("PDFMinerParser", {}),
|
||||||
("PyMuPDFParser", {}),
|
("PyMuPDFParser", {}),
|
||||||
|
("PyPDFium2Parser", {}),
|
||||||
("PyPDFParser", {"extraction_mode": "plain"}),
|
("PyPDFParser", {"extraction_mode": "plain"}),
|
||||||
("PyPDFParser", {"extraction_mode": "layout"}),
|
("PyPDFParser", {"extraction_mode": "layout"}),
|
||||||
],
|
],
|
||||||
|
@ -9,7 +9,6 @@ from langchain_community.document_loaders import (
|
|||||||
AmazonTextractPDFLoader,
|
AmazonTextractPDFLoader,
|
||||||
MathpixPDFLoader,
|
MathpixPDFLoader,
|
||||||
PDFMinerPDFasHTMLLoader,
|
PDFMinerPDFasHTMLLoader,
|
||||||
PyPDFium2Loader,
|
|
||||||
UnstructuredPDFLoader,
|
UnstructuredPDFLoader,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -56,21 +55,6 @@ def test_pdfminer_pdf_as_html_loader() -> None:
|
|||||||
assert len(docs) == 1
|
assert len(docs) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_pypdfium2_loader() -> None:
|
|
||||||
"""Test PyPDFium2Loader."""
|
|
||||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
|
||||||
loader = PyPDFium2Loader(file_path)
|
|
||||||
docs = loader.load()
|
|
||||||
|
|
||||||
assert len(docs) == 1
|
|
||||||
|
|
||||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
|
||||||
loader = PyPDFium2Loader(file_path)
|
|
||||||
|
|
||||||
docs = loader.load()
|
|
||||||
assert len(docs) == 16
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
not os.environ.get("MATHPIX_API_KEY"), reason="Mathpix API key not found"
|
not os.environ.get("MATHPIX_API_KEY"), reason="Mathpix API key not found"
|
||||||
)
|
)
|
||||||
@ -184,6 +168,7 @@ def test_amazontextract_loader_failures() -> None:
|
|||||||
[
|
[
|
||||||
("PDFMinerLoader", {}),
|
("PDFMinerLoader", {}),
|
||||||
("PyMuPDFLoader", {}),
|
("PyMuPDFLoader", {}),
|
||||||
|
("PyPDFium2Loader", {}),
|
||||||
("PyPDFLoader", {}),
|
("PyPDFLoader", {}),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@ -206,8 +191,6 @@ def test_standard_parameters(
|
|||||||
images_parser=None,
|
images_parser=None,
|
||||||
images_inner_format="text",
|
images_inner_format="text",
|
||||||
password=None,
|
password=None,
|
||||||
extract_tables=None,
|
|
||||||
extract_tables_settings=None,
|
|
||||||
)
|
)
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
assert len(docs) == 16
|
assert len(docs) == 16
|
||||||
|
@ -9,10 +9,7 @@ import pytest
|
|||||||
import langchain_community.document_loaders.parsers as pdf_parsers
|
import langchain_community.document_loaders.parsers as pdf_parsers
|
||||||
from langchain_community.document_loaders.base import BaseBlobParser
|
from langchain_community.document_loaders.base import BaseBlobParser
|
||||||
from langchain_community.document_loaders.blob_loaders import Blob
|
from langchain_community.document_loaders.blob_loaders import Blob
|
||||||
from langchain_community.document_loaders.parsers.pdf import (
|
from langchain_community.document_loaders.parsers.pdf import _merge_text_and_extras
|
||||||
PyPDFium2Parser,
|
|
||||||
_merge_text_and_extras,
|
|
||||||
)
|
|
||||||
|
|
||||||
_THIS_DIR = Path(__file__).parents[3]
|
_THIS_DIR = Path(__file__).parents[3]
|
||||||
|
|
||||||
@ -74,19 +71,13 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
|
|||||||
assert int(metadata["page"]) == 0
|
assert int(metadata["page"]) == 0
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.requires("pypdfium2")
|
|
||||||
def test_pypdfium2_parser() -> None:
|
|
||||||
"""Test PyPDFium2 parser."""
|
|
||||||
# Does not follow defaults to split by page.
|
|
||||||
_assert_with_parser(PyPDFium2Parser())
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"parser_factory,require,params",
|
"parser_factory,require,params",
|
||||||
[
|
[
|
||||||
("PDFMinerParser", "pdfminer", {"splits_by_page": False}),
|
("PDFMinerParser", "pdfminer", {"splits_by_page": False}),
|
||||||
("PyMuPDFParser", "pymupdf", {}),
|
("PyMuPDFParser", "pymupdf", {}),
|
||||||
("PyPDFParser", "pypdf", {}),
|
("PyPDFParser", "pypdf", {}),
|
||||||
|
("PyPDFium2Parser", "pypdfium2", {}),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_parsers(
|
def test_parsers(
|
||||||
|
Loading…
Reference in New Issue
Block a user