Mirror of https://github.com/hwchase17/langchain.git, synced 2025-08-26 13:01:55 +00:00
Refactor ZeroxPDFLoader
parent 33354f984f
commit cf242093c2
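A minimal usage sketch of the refactored loader, as introduced in the diff below. The PDF path is hypothetical, and zerox needs the API key for the chosen vision model (e.g. OPENAI_API_KEY for the default "gpt-4o-mini") set in the environment:

```python
# Hedged sketch of the post-refactor API: ZeroxPDFLoader now delegates to
# ZeroxPDFParser, which drives py-zerox (vision LLM -> Markdown per page).
from langchain_community.document_loaders import ZeroxPDFLoader

loader = ZeroxPDFLoader(
    "./example_data/layout-parser-paper.pdf",  # hypothetical local path
    mode="page",           # "page" = one Document per page, "single" = merged output
    model="gpt-4o-mini",   # any vision-capable model id accepted by litellm
)
for doc in loader.lazy_load():
    print(doc.metadata.get("page"), doc.page_content[:80])
```

With `mode="single"`, the per-page texts are joined with `pages_delimiter` into a single Document instead.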
File diff suppressed because it is too large
@@ -41,7 +41,7 @@ jq>=1.4.1,<2
jsonschema>1
keybert>=0.8.5
langchain_openai>=0.2.1
litellm>=1.30,<=1.39.5
litellm>=1.30
lxml>=4.9.3,<6.0
markdownify>=0.11.6,<0.12
motor>=3.3.1,<4

@@ -62,6 +62,7 @@ pandas>=2.0.1,<3
pdfminer-six==20231228
pdfplumber>=0.11
pgvector>=0.1.6,<0.2
pillow>=10.4
playwright>=1.48.0,<2
praw>=7.7.1,<8
premai>=0.3.25,<0.4,!=0.3.100
@@ -360,6 +360,7 @@ if TYPE_CHECKING:
        PyPDFium2Loader,
        PyPDFLoader,
        UnstructuredPDFLoader,
        ZeroxPDFLoader,
    )
    from langchain_community.document_loaders.pebblo import (
        PebbloSafeLoader,

@@ -732,6 +733,7 @@ _module_lookup = {
    "YoutubeAudioLoader": "langchain_community.document_loaders.blob_loaders",
    "YoutubeLoader": "langchain_community.document_loaders.youtube",
    "YuqueLoader": "langchain_community.document_loaders.yuque",
    "ZeroxPDFLoader": "langchain_community.document_loaders.pdf",
}

@@ -940,4 +942,5 @@ __all__ = [
    "YoutubeAudioLoader",
    "YoutubeLoader",
    "YuqueLoader",
    "ZeroxPDFLoader",
]
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
        PyMuPDFParser,
        PyPDFium2Parser,
        PyPDFParser,
        ZeroxPDFParser,
    )
    from langchain_community.document_loaders.parsers.vsdx import (
        VsdxParser,

@@ -55,6 +56,7 @@ _module_lookup = {
    "RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
    "TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
    "VsdxParser": "langchain_community.document_loaders.parsers.vsdx",
    "ZeroxPDFParser": "langchain_community.document_loaders.parsers.pdf",
}

@@ -82,4 +84,5 @@ __all__ = [
    "RapidOCRBlobParser",
    "TesseractBlobParser",
    "VsdxParser",
    "ZeroxPDFParser",
]
@@ -2,6 +2,7 @@

from __future__ import annotations

import asyncio
import html
import io
import logging
@@ -9,11 +10,12 @@ import threading
import warnings
from datetime import datetime
from pathlib import Path
from tempfile import TemporaryDirectory
from tempfile import NamedTemporaryFile, TemporaryDirectory
from typing import (
    TYPE_CHECKING,
    Any,
    BinaryIO,
    Dict,
    Iterable,
    Iterator,
    Literal,
@@ -28,9 +30,14 @@ from urllib.parse import urlparse
import numpy
import numpy as np
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import (
    LLMImageBlobParser,
    TesseractBlobParser,
)
from langchain_community.document_loaders.parsers.images import (
    BaseImageBlobParser,
    RapidOCRBlobParser,
@@ -1464,6 +1471,326 @@ class PDFPlumberParser(BaseBlobParser):
        return extract_from_images_with_rapidocr(images)


class ZeroxPDFParser(BaseBlobParser):
    """Parse a blob from a PDF using `py-zerox` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs and extracting images.
    It integrates the 'py-zerox' library for PDF processing and offers synchronous blob
    parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community py-zerox

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import ZeroxPDFParser

            parser = ZeroxPDFParser(
                # password = None,
                mode = "single",
                pages_delimiter = "\n\f",
                # extract_images = True,
                # images_to_text = convert_images_to_text_with_tesseract(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    """

    warnings.filterwarnings(
        "ignore",
        module=r"^pyzerox.models.modellitellm$",
        message=r"\s*Custom system prompt was provided which.*",
    )
    _warn_images_to_text = False
    _warn_creator = False
    _map_extract_tables: Dict[Literal["markdown", "html", None], str] = {
        "markdown": "",
        "html": "But, use html syntax for convert all tables. ",
    }
    _map_extract_images = {
        RapidOCRBlobParser: "",
        TesseractBlobParser: "",
        LLMImageBlobParser: "If you come across a picture, "
        "diagram or other illustration, "
        "describe it. ",
    }
    _prompt = (
        "Convert the following PDF page to markdown. "
        "{prompt_tables}"
        "{prompt_images}"
        "Remove the header, footer and page number. "
        "Return only the markdown with no explanation text. "
        "Do not exclude any content from the page. "
    )
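    # For illustration: with extract_tables="html" and an LLMImageBlobParser,
    # formatting _prompt with the two maps above produces:
    #   "Convert the following PDF page to markdown. But, use html syntax for
    #   convert all tables. If you come across a picture, diagram or other
    #   illustration, describe it. Remove the header, footer and page number.
    #   Return only the markdown with no explanation text. Do not exclude any
    #   content from the page. "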
    def __init__(
        self,
        mode: Literal["single", "page"] = "page",
        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
        images_parser: Optional[BaseImageBlobParser] = None,
        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
        extract_tables: Union[Literal["markdown", "html"], None] = "markdown",
        cleanup: bool = True,
        concurrency: int = 10,
        maintain_format: bool = False,
        model: str = "gpt-4o-mini",
        custom_system_prompt: Optional[str] = None,
        select_pages: Optional[Union[int, Iterable[int]]] = None,
        **zerox_kwargs: dict[str, Any],
    ):
        """
        Initialize the parser with arguments to be passed to the zerox function.
        Make sure to set necessary environment variables such as API key, endpoint, etc.
        Check zerox documentation for list of necessary environment variables for
        any given model.

        Args:
            mode: The extraction mode, either "single" for the entire document or "page"
                for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                  pointing to (`![body](#)`)
                - "html-img" = wrap the content as the `alt` text of an image tag and link to
                  (`<img alt="{body}" src="#"/>`)
            model:
                Vision capable model to use. Defaults to "gpt-4o-mini".
                Hosted models are passed in format "<provider>/<model>"
                Examples: "azure/gpt-4o-mini", "vertex_ai/gemini-1.5-flash-001"
                See more details in zerox documentation.
            cleanup:
                Whether to clean up the temporary files after processing, defaults
                to True
            concurrency:
                The number of concurrent processes to run, defaults to 10
            maintain_format:
                Whether to maintain the format from the previous page, defaults to False
            model:
                The model to use for generating completions, defaults to "gpt-4o-mini".
                Note - Refer: https://docs.litellm.ai/docs/providers to pass correct
                model name as according to provider it might be different from actual
                name.
            output_dir:
                The directory to save the markdown output, defaults to None
            temp_dir:
                The directory to store temporary files, defaults to some named folder
                in system's temp directory. If already exists, the contents will be
                deleted before zerox uses it.
            custom_system_prompt:
                The system prompt to use for the model, this overrides the default
                system prompt of zerox. Generally it is not required unless you want
                some specific behaviour. When set, it will raise a friendly warning,
                defaults to None
            select_pages:
                Pages to process, can be a single page number or an iterable of page
                numbers, defaults to None
            **zerox_kwargs:
                Arguments specific to the zerox function.
        """
if mode not in ["single", "page"]:
|
||||
raise ValueError("mode must be single or page")
|
||||
if extract_tables not in ["markdown", "html", None]:
|
||||
logger.warning("extract_tables must be markdown or html")
|
||||
extract_tables = "markdown"
|
||||
if not images_parser:
|
||||
images_parser = RapidOCRBlobParser()
|
||||
self.mode = mode
|
||||
self.pages_delimiter = pages_delimiter
|
||||
self.images_parser = images_parser
|
||||
self.images_inner_format = images_inner_format
|
||||
self.extract_tables = extract_tables
|
||||
|
||||
self.cleanup = cleanup
|
||||
self.concurrency = concurrency
|
||||
self.maintain_format = maintain_format
|
||||
self.model = model
|
||||
if not custom_system_prompt:
|
||||
custom_system_prompt = ZeroxPDFParser._prompt
|
||||
self.custom_system_prompt = custom_system_prompt
|
||||
self.select_pages = select_pages
|
||||
self.zerox_kwargs = zerox_kwargs
|
||||
|
||||
@staticmethod
|
||||
def _is_valid_url(url: str) -> bool:
|
||||
"""Check if the url is valid."""
|
||||
parsed = urlparse(url)
|
||||
return bool(parsed.netloc) and bool(parsed.scheme)
|
||||
|
||||
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
|
||||
"""Lazily parse the blob.
|
||||
|
||||
Args:
|
||||
blob: The blob to parse.
|
||||
|
||||
Raises:
|
||||
ImportError: If the `py-zerox` package is not installed.
|
||||
|
||||
Yields:
|
||||
An iterator over the parsed documents.
|
||||
"""
|
||||
try:
|
||||
from pyzerox import zerox
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Could not import pyzerox python package. "
|
||||
"Please install it with `pip install py-zerox`."
|
||||
)
|
||||
temp_file = None
|
||||
try:
|
||||
if not ZeroxPDFParser._is_valid_url(str(blob.path)):
|
||||
temp_file = NamedTemporaryFile()
|
||||
with open(temp_file.name, "wb") as f:
|
||||
f.write(blob.as_bytes())
|
||||
file_path = temp_file.name
|
||||
else:
|
||||
file_path = str(blob.path)
|
||||
|
||||
with blob.as_bytes_io() as pdf_file_obj:
|
||||
doc_metadata = _purge_metadata(self._get_metadata(pdf_file_obj))
|
||||
|
||||
doc_metadata["source"] = blob.source or blob.path
|
||||
zerox_prompt = self.custom_system_prompt
|
||||
|
||||
if not zerox_prompt and self.images_parser or self.extract_tables:
|
||||
prompt_tables = ZeroxPDFParser._map_extract_tables[self.extract_tables]
|
||||
clazz = self.images_parser.__class__
|
||||
if clazz in ZeroxPDFParser._map_extract_images:
|
||||
prompt_images = ZeroxPDFParser._map_extract_images[clazz]
|
||||
else:
|
||||
if not ZeroxPDFParser._warn_creator:
|
||||
ZeroxPDFParser._warn_creator = True
|
||||
logger.warning("images_parser can not be simulated")
|
||||
prompt_images = ""
|
||||
zerox_prompt = PromptTemplate.from_template(
|
||||
self.custom_system_prompt
|
||||
).format(prompt_tables=prompt_tables, prompt_images=prompt_images)
|
||||
zerox_output = asyncio.run(
|
||||
zerox(
|
||||
file_path=str(file_path),
|
||||
model=self.model,
|
||||
cleanup=self.cleanup,
|
||||
concurrency=self.concurrency,
|
||||
maintain_format=self.maintain_format,
|
||||
custom_system_prompt=zerox_prompt,
|
||||
select_pages=self.select_pages,
|
||||
**self.zerox_kwargs,
|
||||
)
|
||||
)
|
||||
|
||||
# Convert zerox output to Document instances and yield them
|
||||
if len(zerox_output.pages) > 0:
|
||||
doc_metadata = _purge_metadata(
|
||||
{
|
||||
"producer": "ZeroxPDF",
|
||||
"creator": "ZeroxPDF",
|
||||
"creationdate": "",
|
||||
}
|
||||
| doc_metadata
|
||||
| {
|
||||
"total_pages": zerox_output.pages[-1].page,
|
||||
"num_pages": zerox_output.pages[-1].page, # Deprecated
|
||||
}
|
||||
)
|
||||
single_texts = []
|
||||
for page in zerox_output.pages:
|
||||
text_from_page = page.content
|
||||
images_from_page = "" # FIXME
|
||||
all_text = _merge_text_and_extras(
|
||||
[images_from_page], text_from_page
|
||||
)
|
||||
if self.mode == "page":
|
||||
yield Document(
|
||||
page_content=all_text,
|
||||
metadata=_validate_metadata(
|
||||
doc_metadata | {"page": page.page - 1}
|
||||
),
|
||||
)
|
||||
else:
|
||||
single_texts.append(all_text)
|
||||
if self.mode == "single":
|
||||
yield Document(
|
||||
page_content=self.pages_delimiter.join(single_texts),
|
||||
metadata=_validate_metadata(doc_metadata),
|
||||
)
|
||||
finally:
|
||||
if temp_file:
|
||||
temp_file.close()
|
||||
|
||||
def _get_metadata(
|
||||
self,
|
||||
fp: BinaryIO,
|
||||
password: str = "",
|
||||
caching: bool = True,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Extract metadata from a PDF file.
|
||||
|
||||
Args:
|
||||
fp: The file pointer to the PDF file.
|
||||
password: The password for the PDF file, if encrypted. Defaults to an empty
|
||||
string.
|
||||
caching: Whether to cache the PDF structure. Defaults to True.
|
||||
|
||||
Returns:
|
||||
Metadata of the PDF file.
|
||||
"""
|
||||
from pdfminer.pdfpage import PDFDocument, PDFPage, PDFParser
|
||||
|
||||
# Create a PDF parser object associated with the file object.
|
||||
parser = PDFParser(fp)
|
||||
# Create a PDF document object that stores the document structure.
|
||||
doc = PDFDocument(parser, password=password, caching=caching)
|
||||
metadata = {}
|
||||
|
||||
for info in doc.info:
|
||||
metadata.update(info)
|
||||
for k, v in metadata.items():
|
||||
try:
|
||||
metadata[k] = PDFMinerParser.resolve_and_decode(v)
|
||||
except Exception as e: # pragma: nocover
|
||||
# This metadata value could not be parsed. Instead of failing the PDF
|
||||
# read, treat it as a warning only if `strict_metadata=False`.
|
||||
logger.warning(
|
||||
'[WARNING] Metadata key "%s" could not be parsed due to '
|
||||
"exception: %s",
|
||||
k,
|
||||
str(e),
|
||||
)
|
||||
|
||||
# Count number of pages.
|
||||
metadata["total_pages"] = len(list(PDFPage.create_pages(doc)))
|
||||
|
||||
return metadata
|
||||
|
||||
|
||||
class AmazonTextractPDFParser(BaseBlobParser):
|
||||
"""Send `PDF` files to `Amazon Textract` and parse them.
|
||||
|
||||
|
@@ -11,6 +11,7 @@ from typing import (
    TYPE_CHECKING,
    Any,
    BinaryIO,
    Iterable,
    Iterator,
    Literal,
    Mapping,
@@ -38,6 +39,7 @@ from langchain_community.document_loaders.parsers.pdf import (
    PyMuPDFParser,
    PyPDFium2Parser,
    PyPDFParser,
    ZeroxPDFParser,
)
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader

@@ -1353,72 +1355,177 @@ class DocumentIntelligenceLoader(BasePDFLoader):


class ZeroxPDFLoader(BasePDFLoader):
    """Document loader utilizing Zerox library:
    """Load and parse a PDF file using 'py-zerox' library.
    https://github.com/getomni-ai/zerox

    Zerox converts PDF document to series of images (page-wise) and
    This class provides methods to load and parse PDF documents, supporting various
    configurations such as handling password-protected files, extracting tables,
    extracting images, and defining extraction mode. It integrates the `py-zerox`
    library for PDF processing and offers both synchronous and asynchronous document
    loading.

    Zerox converts PDF document to serties of images (page-wise) and
    uses vision-capable LLM model to generate Markdown representation.

    Zerox utilizes anyc operations. Therefore when using this loader
    Zerox utilizes async operations. Therefore when using this loader
    inside Jupyter Notebook (or any environment running async)
    you will need to:
    ```python
    import nest_asyncio
    nest_asyncio.apply()
    ```

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community py-zerox

        Instantiate the loader:

        .. code-block:: python

            from langchain_community.document_loaders import ZeroxPDFLoader

            loader = ZeroxPDFLoader(
                file_path = "./example_data/layout-parser-paper.pdf",
                # headers = None
                # password = None,
                mode = "single",
                pages_delimiter = "\n\f",
                # extract_images = True,
                # images_to_text = convert_images_to_text_with_tesseract(),
                # extract_tables = "markdown",
                # extract_tables_settings = None,
            )

        Lazy load documents:

        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        Load documents asynchronously:

        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    """

    def __init__(
        self,
        file_path: Union[str, PurePath],
        file_path: Union[str, Path],
        *,
        headers: Optional[dict] = None,
        mode: Literal["single", "page"] = "page",
        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
        images_parser: Optional[BaseImageBlobParser] = None,
        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
        extract_tables: Union[Literal["markdown", "html"], None] = "markdown",
        cleanup: bool = True,
        concurrency: int = 10,
        maintain_format: bool = False,
        model: str = "gpt-4o-mini",
        **zerox_kwargs: Any,
        custom_system_prompt: Optional[str] = None,
        select_pages: Optional[Union[int, Iterable[int]]] = None,
        **zerox_kwargs: dict[str, Any],
    ) -> None:
        super().__init__(file_path=file_path)
        """Initialize the parser with arguments to be passed to the zerox function.
        """
        Initialize the loader with arguments to be passed to the zerox function.
        Make sure to set necessary environment variables such as API key, endpoint, etc.
        Check zerox documentation for list of necessary environment variables for
        any given model.

        Args:
            file_path:
                Path or url of the pdf file
            file_path: The path to the PDF file to be loaded.
            headers: Optional headers to use for GET request to download a file from a
                web path.
            password: Optional password for opening encrypted PDFs.
            mode: The extraction mode, either "single" for the entire document or "page"
                for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                  pointing to (`![body](#)`)
                - "html-img" = wrap the content as the `alt` text of an image tag and link to
                  (`<img alt="{body}" src="#"/>`)
            extract_tables: Whether to extract tables in a specific format, such as
                "csv", "markdown", or "html".
            extract_tables_settings: Optional dictionary of settings for customizing
                table extraction.
            cleanup:
                Whether to clean up the temporary files after processing, defaults
                to True
            concurrency:
                The number of concurrent processes to run, defaults to 10
            maintain_format:
                Whether to maintain the format from the previous page, defaults to False
            model:
                Vision capable model to use. Defaults to "gpt-4o-mini".
                Hosted models are passed in format "<provider>/<model>"
                Examples: "azure/gpt-4o-mini", "vertex_ai/gemini-1.5-flash-001"
                See more details in zerox documentation.
            **zerox_kwargs:
                The model to use for generating completions, defaults to "gpt-4o-mini".
                Note - Refer: https://docs.litellm.ai/docs/providers to pass correct
                model name as according to provider it might be different from
                actual name.
            output_dir:
                The directory to save the markdown output, defaults to None
            temp_dir:
                The directory to store temporary files, defaults to some named folder
                in system's temp directory. If already exists, the contents will be
                deleted before zerox uses it.
            custom_system_prompt:
                The system prompt to use for the model, this overrides the default
                system prompt of zerox. Generally it is not required unless you want
                some specific behaviour. When set, it will raise a friendly warning,
                defaults to None
            select_pages:
                Pages to process, can be a single page number or an iterable of page
                numbers, defaults to None
            **kwargs:
                Arguments specific to the zerox function.
                see detailed list of arguments here in zerox repository:
                https://github.com/getomni-ai/zerox/blob/main/py_zerox/pyzerox/core/zerox.py#L25
        """  # noqa: E501
        self.zerox_kwargs = zerox_kwargs
        self.model = model
        """
        super().__init__(file_path, headers=headers)
        self.parser = ZeroxPDFParser(
            mode=mode,
            pages_delimiter=pages_delimiter,
            images_parser=images_parser,
            images_inner_format=images_inner_format,
            extract_tables=extract_tables,
            cleanup=cleanup,
            concurrency=concurrency,
            maintain_format=maintain_format,
            model=model,
            custom_system_prompt=custom_system_prompt,
            select_pages=select_pages,
            **zerox_kwargs,
        )

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load pages."""
        import asyncio
        """
        Loads documents from pdf utilizing zerox library:
        https://github.com/getomni-ai/zerox

        from pyzerox import zerox

        # Directly call asyncio.run to execute zerox synchronously
        zerox_output = asyncio.run(
            zerox(file_path=str(self.file_path), model=self.model, **self.zerox_kwargs)
        )

        # Convert zerox output to Document instances and yield them
        if len(zerox_output.pages) > 0:
            num_pages = zerox_output.pages[-1].page
            for page in zerox_output.pages:
                yield Document(
                    page_content=page.content,
                    metadata={
                        "source": self.source,
                        "page": page.page,
                        "num_pages": num_pages,
                    },
        Returns:
            Iterator[Document]: An iterator over parsed Document instances.
        """
        """Lazy load given path as pages."""
        if self.web_path:
            blob = Blob.from_data(  # type: ignore[attr-defined]
                open(self.file_path, "rb").read(), path=self.web_path
            )
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.lazy_parse(blob)


# Legacy: only for backwards compatibility. Use PyPDFLoader instead
@@ -5,6 +5,7 @@ from pathlib import Path
from typing import TYPE_CHECKING, Iterator

import pytest
from PIL.Image import Image

import langchain_community.document_loaders.parsers as pdf_parsers
from langchain_community.document_loaders.base import BaseBlobParser

@@ -119,6 +120,7 @@ class EmptyImageBlobParser(BaseImageBlobParser):
        ("PyPDFium2Parser", {}),
        ("PyPDFParser", {"extraction_mode": "plain"}),
        ("PyPDFParser", {"extraction_mode": "layout"}),
        ("ZeroxPDFParser", {}),
    ],
)
@pytest.mark.requires("pillow")

@@ -128,6 +130,11 @@ def test_mode_and_extract_images_variations(
    mode: str,
    image_parser: BaseImageBlobParser,
) -> None:
    if parser_factory == "ZeroxPDFParser":
        try:
            import pyzerox  # noqa: F401
        except ImportError:
            pytest.skip("pyzerox is valid only with Python +3.11")
    _test_matrix(
        parser_factory,
        params,

@@ -149,6 +156,7 @@ def test_mode_and_extract_images_variations(
        ("PyPDFium2Parser", {}),
        ("PyPDFParser", {"extraction_mode": "plain"}),
        ("PyPDFParser", {"extraction_mode": "layout"}),
        ("ZeroxPDFParser", {}),
    ],
)
@pytest.mark.requires("pillow")

@@ -157,6 +165,11 @@ def test_mode_and_image_formats_variations(
    params: dict,
    images_inner_format: str,
) -> None:
    if parser_factory == "ZeroxPDFParser":
        try:
            import pyzerox  # noqa: F401
        except ImportError:
            pytest.skip("pyzerox is valid only with Python +3.11")
    mode = "single"
    image_parser = EmptyImageBlobParser()

@@ -246,6 +259,7 @@ def _test_matrix(
    "parser_factory,params",
    [
        ("PyMuPDFParser", {}),
        ("ZeroxPDFParser", {}),
    ],
)
def test_parser_with_table(

@@ -254,6 +268,12 @@ def test_parser_with_table(
    mode: str,
    extract_tables: str,
) -> None:
    if parser_factory == "ZeroxPDFParser":
        try:
            import pyzerox  # noqa: F401
        except ImportError:
            pytest.skip("pyzerox is valid only with Python +3.11")

    from PIL.Image import Image

    from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
@@ -170,12 +170,18 @@ def test_amazontextract_loader_failures() -> None:
        ("PyMuPDFLoader", {}),
        ("PyPDFium2Loader", {}),
        ("PyPDFLoader", {}),
        ("ZeroxPDFLoader", {}),
    ],
)
def test_standard_parameters(
    parser_factory: str,
    params: dict,
) -> None:
    if parser_factory == "ZeroxPDFLoader":
        try:
            import pyzerox  # noqa: F401
        except ImportError:
            pytest.skip("pyzerox is valid only with Python +3.11")
    loader_class = getattr(pdf_loaders, parser_factory)

    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
@@ -9,7 +9,9 @@ import pytest
import langchain_community.document_loaders.parsers as pdf_parsers
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import _merge_text_and_extras
from langchain_community.document_loaders.parsers.pdf import (
    _merge_text_and_extras,
)

_THIS_DIR = Path(__file__).parents[3]

@@ -75,6 +77,7 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
    "parser_factory,require,params",
    [
        ("PDFMinerParser", "pdfminer", {"splits_by_page": False}),
        ("PDFPlumberParser", "pdfplumber", {}),
        ("PyMuPDFParser", "pymupdf", {}),
        ("PyPDFParser", "pypdf", {}),
        ("PyPDFium2Parser", "pypdfium2", {}),
@@ -20,4 +20,5 @@ def test_parsers_public_api_correct() -> None:
        "RapidOCRBlobParser",
        "TesseractBlobParser",
        "VsdxParser",
        "ZeroxPDFParser",
    }
@@ -200,6 +200,7 @@ EXPECTED_ALL = [
    "YoutubeAudioLoader",
    "YoutubeLoader",
    "YuqueLoader",
    "ZeroxPDFLoader",
]
@@ -16,6 +16,7 @@ if TYPE_CHECKING:
        PyMuPDFParser,
        PyPDFium2Parser,
        PyPDFParser,
        ZeroxPDFParser,
    )

# Create a way to dynamically look up deprecated imports.

@@ -34,6 +35,7 @@ DEPRECATED_LOOKUP = {
    "PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
    "PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
    "PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
    "ZeroxPDFParser": "langchain_community.document_loaders.parsers.pdf",
}

_import_attribute = create_importer(__package__, deprecated_lookups=DEPRECATED_LOOKUP)

@@ -55,4 +57,5 @@ __all__ = [
    "PyMuPDFParser",
    "PyPDFium2Parser",
    "PyPDFParser",
    "ZeroxPDFParser",
]
@@ -14,4 +14,5 @@ def test_parsers_public_api_correct() -> None:
        "PyMuPDFParser",
        "PyPDFium2Parser",
        "PDFPlumberParser",
        "ZeroxPDFParser",
    }
@@ -96,7 +96,7 @@ def test_no_more_changes_to_proxy_community() -> None:
    # most cases.
    hash_ += len(str(sorted(deprecated_lookup.items())))

    evil_magic_number = 38620
    evil_magic_number = 38692

    assert hash_ == evil_magic_number, (
        "If you're triggering this test, you're likely adding a new import "