mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-12 21:11:43 +00:00
community: ZeroxPDFLoader (#27800)
# OCR-based PDF loader This implements a [Zerox](https://github.com/getomni-ai/zerox) PDF document loader. Zerox uses a simple but very powerful (though slower and more costly) approach to parsing PDF documents: it converts the PDF to a series of images and passes them to a vision model, requesting the contents in Markdown. It is especially suitable for complex PDFs that are not parsed well by other alternatives. ## Example use: ```python from langchain_community.document_loaders.pdf import ZeroxPDFLoader os.environ["OPENAI_API_KEY"] = "" ## your-api-key model = "gpt-4o-mini" ## openai model pdf_url = "https://assets.ctfassets.net/f1df9zr7wr1a/soP1fjvG1Wu66HJhu3FBS/034d6ca48edb119ae77dec5ce01a8612/OpenAI_Sacra_Teardown.pdf" loader = ZeroxPDFLoader(file_path=pdf_url, model=model) docs = loader.load() ``` The Zerox library supports a wide range of providers/models. See the Zerox documentation for details. - **Dependencies:** `zerox` - **Twitter handle:** @martintriska1 If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. --------- Co-authored-by: Erick Friis <erickfriis@gmail.com>
This commit is contained in:
@@ -945,5 +945,82 @@ class DocumentIntelligenceLoader(BasePDFLoader):
|
||||
yield from self.parser.parse(blob)
|
||||
|
||||
|
||||
class ZeroxPDFLoader(BasePDFLoader):
    """Document loader utilizing the Zerox library.

    https://github.com/getomni-ai/zerox

    Zerox converts a PDF document to a series of images (page-wise) and
    uses a vision-capable LLM to generate a Markdown representation.

    Zerox utilizes async operations. Therefore when using this loader
    inside Jupyter Notebook (or any environment already running an event
    loop) you will need to:

    ```python
    import nest_asyncio
    nest_asyncio.apply()
    ```
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        model: str = "gpt-4o-mini",
        **zerox_kwargs: Any,
    ) -> None:
        """Initialize the loader with arguments to be passed to the zerox function.

        Make sure to set necessary environment variables such as API key, endpoint, etc.
        Check zerox documentation for the list of necessary environment variables for
        any given model.

        Args:
            file_path:
                Path or url of the pdf file
            model:
                Vision capable model to use. Defaults to "gpt-4o-mini".
                Hosted models are passed in format "<provider>/<model>"
                Examples: "azure/gpt-4o-mini", "vertex_ai/gemini-1.5-flash-001"
                See more details in zerox documentation.
            **zerox_kwargs:
                Arguments specific to the zerox function.
                See the detailed list of arguments in the zerox repository:
                https://github.com/getomni-ai/zerox/blob/main/py_zerox/pyzerox/core/zerox.py#L25
        """  # noqa: E501
        # The docstring must be the first statement of the function body;
        # placed after this call it would be a discarded string expression.
        super().__init__(file_path=file_path)
        self.zerox_kwargs = zerox_kwargs
        self.model = model

    def lazy_load(self) -> Iterator[Document]:
        """Load documents from the pdf utilizing the zerox library.

        https://github.com/getomni-ai/zerox

        The whole file is processed by a single zerox call before the first
        Document is yielded. Each yielded Document carries the page content
        and the metadata keys "source", "page" and "num_pages".

        Returns:
            Iterator[Document]: An iterator over parsed Document instances.
        """
        # Imported lazily so the optional zerox dependency is only required
        # when this loader is actually used.
        import asyncio

        from pyzerox import zerox

        # zerox is a coroutine function; asyncio.run drives it to completion
        # synchronously (see the class docstring for environments that
        # already run an event loop).
        zerox_output = asyncio.run(
            zerox(file_path=self.file_path, model=self.model, **self.zerox_kwargs)
        )

        pages = zerox_output.pages
        if not pages:
            # Nothing was parsed: yield no documents.
            return

        # "num_pages" is taken from the page number reported for the last
        # returned page, not from the number of returned pages.
        num_pages = pages[-1].page
        for page in pages:
            yield Document(
                page_content=page.content,
                metadata={
                    # NOTE(review): self.source is presumably provided by
                    # BasePDFLoader — confirm against the base class.
                    "source": self.source,
                    "page": page.page,
                    "num_pages": num_pages,
                },
            )
|
||||
|
||||
|
||||
# Legacy alias, kept only for backwards compatibility. Use PyPDFLoader instead.
|
||||
PagedPDFSplitter = PyPDFLoader
|
||||
|
Reference in New Issue
Block a user