Propose PDFRouterParser and Loader

This commit is contained in:
Philippe Prados 2025-04-15 16:11:54 +02:00
parent ed5c4805f6
commit 83563989d5
4 changed files with 178 additions and 3 deletions

View File

@ -59,7 +59,7 @@ openapi-pydantic>=0.3.2,<0.4
oracle-ads>=2.9.1,<3 oracle-ads>=2.9.1,<3
oracledb>=2.2.0,<3 oracledb>=2.2.0,<3
pandas>=2.0.1,<3 pandas>=2.0.1,<3
pdfminer-six==20231228 pdfminer-six==20250327
pdfplumber>=0.11 pdfplumber>=0.11
pgvector>=0.1.6,<0.2 pgvector>=0.1.6,<0.2
playwright>=1.48.0,<2 playwright>=1.48.0,<2

View File

@ -2,6 +2,7 @@
from __future__ import annotations from __future__ import annotations
import re
import html import html
import io import io
import logging import logging
@ -1668,3 +1669,91 @@ class DocumentIntelligenceParser(BaseBlobParser):
docs = self._generate_docs(blob, result) docs = self._generate_docs(blob, result)
yield from docs yield from docs
class PDFRouterParser(BaseBlobParser):
    """Route each PDF to a parser chosen from its metadata or first page.

    The routes are a list of ``(name, matchers, parser)`` tuples, where
    ``matchers`` maps a metadata key to a regular expression (a string or a
    compiled pattern). A route matches when every one of its patterns is found
    (``re.search``) in the corresponding metadata value. The special key
    ``"page1"`` is matched against the text extracted from the first page.

    Routes are evaluated in order and the first matching route is used, so
    order them from most to least specific. A route with an empty ``matchers``
    dictionary matches every PDF; add ``("default", {}, parser)`` at the end
    to catch all remaining PDFs. If no route matches, nothing is yielded.

    This is similar in spirit to `MimeTypeBasedParser`, but the routing is
    done on the content of the PDF file.

    Sample:

    ```python
    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
    from langchain_community.document_loaders.parsers import PDFPlumberParser

    routes = [
        # Name, keys with regex, parser
        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
         PyMuPDFParser()),
        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1": "Hello"},
         PDFPlumberParser()),
        ("default", {}, PyPDFium2Parser()),
    ]
    parser = PDFRouterParser(routes)
    docs = list(parser.lazy_parse(Blob.from_path(filename)))
    ```
    """

    def __init__(
        self,
        routes: list[
            tuple[
                str,
                dict[str, Union[re.Pattern, str]],
                BaseBlobParser,
            ]
        ],
        *,
        password: Optional[str] = None,
    ):
        """Initialize the router.

        Args:
            routes: Ordered list of ``(name, matchers, parser)`` tuples.
                String patterns are compiled once here with ``re.compile``.
            password: Optional password handed to ``pypdf.PdfReader`` when the
                PDF is opened for routing.

        Raises:
            ImportError: If the ``pypdf`` package is not installed.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError as exc:
            raise ImportError(
                "pypdf package not found, please install it with `pip install pypdf`"
            ) from exc
        super().__init__()
        self.password = password
        # Normalize every pattern to a compiled regex so matching is uniform.
        self.routes = [
            (
                name,
                {
                    key: re.compile(pattern) if isinstance(pattern, str) else pattern
                    for key, pattern in matchers.items()
                },
                parser,
            )
            for name, matchers, parser in routes
        ]

    def _select_route(
        self, metadata: dict[str, Any]
    ) -> Optional[tuple[str, BaseBlobParser]]:
        """Return ``(name, parser)`` of the first matching route, or ``None``."""
        for name, matchers, parser in self.routes:
            # A missing (or None) key never matches; values are stringified so
            # that non-string metadata does not make ``search`` raise.
            if all(
                metadata.get(key) is not None and pattern.search(str(metadata[key]))
                for key, pattern in matchers.items()
            ):
                return name, parser
        return None

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob with the parser of the first matching route.

        Each yielded document gets the name of the selected route stored under
        the ``"router"`` metadata key. Nothing is yielded if no route matches.

        Raises:
            ImportError: If the ``pypdf`` package is not installed.
        """
        try:
            from pypdf import PdfReader
        except ImportError as exc:
            raise ImportError(
                "pypdf package not found, please install it with `pip install pypdf`"
            ) from exc

        # Only the routing information is read here; the stream is closed
        # before the selected parser re-opens the blob on its own.
        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
            with PdfReader(pdf_file_obj, password=self.password) as reader:
                # ``reader.metadata`` is None when the PDF has no info dictionary.
                metadata = _purge_metadata(
                    cast(dict[str, Any], reader.metadata or {})
                )
                # A PDF without pages has no first page to inspect.
                metadata["page1"] = (
                    reader.pages[0].extract_text() if len(reader.pages) > 0 else ""
                )

        selected = self._select_route(metadata)
        if selected is None:
            return
        name, parser = selected
        for doc in parser.lazy_parse(blob):
            doc.metadata["router"] = name
            yield doc

View File

@ -22,6 +22,8 @@ from typing import (
from urllib.parse import urlparse from urllib.parse import urlparse
import requests import requests
from langchain_core.document_loaders import BaseBlobParser
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_core.utils import get_from_dict_or_env from langchain_core.utils import get_from_dict_or_env
@ -37,7 +39,7 @@ from langchain_community.document_loaders.parsers.pdf import (
PDFPlumberParser, PDFPlumberParser,
PyMuPDFParser, PyMuPDFParser,
PyPDFium2Parser, PyPDFium2Parser,
PyPDFParser, PyPDFParser, PDFRouterParser,
) )
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
@ -1423,3 +1425,61 @@ class ZeroxPDFLoader(BasePDFLoader):
# Legacy: only for backwards compatibility. Use PyPDFLoader instead # Legacy: only for backwards compatibility. Use PyPDFLoader instead
PagedPDFSplitter = PyPDFLoader PagedPDFSplitter = PyPDFLoader
class PDFRouterLoader(BasePDFLoader):
    """Load a PDF with a parser chosen from its metadata or first page.

    The routes are a list of ``(name, matchers, parser)`` tuples, where
    ``matchers`` maps a metadata key to a regular expression. The special key
    ``"page1"`` is searched in the first page with a regexp.
    Put the routes in the correct order, as the first matching route is used.
    Add a default route ``("default", {}, parser)`` at the end to catch all
    PDFs. The routing itself is delegated to `PDFRouterParser`.

    Sample:

    ```python
    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
    from langchain_community.document_loaders.parsers import PDFPlumberParser

    routes = [
        # Name, keys with regex, parser
        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
         PyMuPDFParser()),
        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1": "Hello"},
         PDFPlumberParser()),
        ("default", {}, PyPDFium2Parser()),
    ]
    # ``routes`` is keyword-only.
    loader = PDFRouterLoader(filename, routes=routes)
    loader.load()
    ```
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        *,
        routes: list[
            tuple[
                str,
                dict[str, Union[re.Pattern, str]],
                BaseBlobParser,
            ]
        ],
        password: Optional[str] = None,
    ):
        """Initialize with a file path.

        Args:
            file_path: Path of the PDF, forwarded to ``BasePDFLoader``.
            routes: Ordered routing table, forwarded to ``PDFRouterParser``.
            password: Optional PDF password, forwarded to ``PDFRouterParser``.
        """
        super().__init__(file_path)
        self.parser = PDFRouterParser(routes, password=password)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily yield the documents produced by the routed parser."""
        if self.web_path:
            # Read the bytes from ``self.file_path`` but label the blob with
            # the web path. Presumably the base loader stored a local copy of
            # the remote file there -- confirm against BasePDFLoader.
            with open(self.file_path, "rb") as pdf_file:
                data = pdf_file.read()
            blob = Blob.from_data(data, path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.lazy_parse(blob)

View File

@ -11,8 +11,10 @@ from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import ( from langchain_community.document_loaders.parsers import (
BaseImageBlobParser, BaseImageBlobParser,
PDFPlumberParser, PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser,
) )
from langchain_community.document_loaders.parsers.pdf import PDFRouterParser, \
PDFMinerParser
if TYPE_CHECKING: if TYPE_CHECKING:
from PIL.Image import Image from PIL.Image import Image
@ -312,3 +314,27 @@ def test_parser_with_table(
**params, **params,
) )
_std_assert_with_parser(parser) _std_assert_with_parser(parser)
def test_parser_router_parse() -> None:
    """Run the standard parser assertions against a four-route PDFRouterParser."""
    parser_mode = "single"

    # Routes are listed from most specific to the catch-all default.
    microsoft_route = (
        "Microsoft",
        {"producer": "Microsoft", "creator": "Microsoft"},
        PyMuPDFParser(mode=parser_mode),
    )
    libreoffice_route = (
        "LibreOffice",
        {"producer": "LibreOffice"},
        PDFMinerParser(mode=parser_mode),
    )
    xdvipdfmx_route = (
        "Xdvipdfmx",
        {"producer": "xdvipdfmx.*", "page1": "Hello"},
        PDFMinerParser(mode=parser_mode),
    )
    default_route = ("default", {}, PyPDFium2Parser(mode=parser_mode))

    router = PDFRouterParser(
        routes=[microsoft_route, libreoffice_route, xdvipdfmx_route, default_route]
    )
    _assert_with_parser(router, splits_by_page=False)