Propose PDFRouterParser and Loader

This commit is contained in:
Philippe Prados 2025-04-15 16:39:13 +02:00
parent b5221f2476
commit 007180d990
5 changed files with 98 additions and 79 deletions

View File

@ -29,6 +29,7 @@ if TYPE_CHECKING:
from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser,
PDFPlumberParser,
PDFRouterParser,
PyMuPDFParser,
PyPDFium2Parser,
PyPDFParser,
@ -51,6 +52,7 @@ _module_lookup = {
"PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
"PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
"PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
"PDFRouterParser": "langchain_community.document_loaders.parsers.pdf",
"PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
"RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
"TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
@ -76,6 +78,7 @@ __all__ = [
"OpenAIWhisperParser",
"PDFMinerParser",
"PDFPlumberParser",
"PDFRouterParser",
"PyMuPDFParser",
"PyPDFParser",
"PyPDFium2Parser",

View File

@ -2,10 +2,10 @@
from __future__ import annotations
import re
import html
import io
import logging
import re
import threading
import warnings
from datetime import datetime
@ -1670,6 +1670,7 @@ class DocumentIntelligenceParser(BaseBlobParser):
yield from docs
class PDFRouterParser(BaseBlobParser):
"""
Load PDFs using different parsers based on the metadata of the PDF
@ -1700,15 +1701,17 @@ class PDFRouterParser(BaseBlobParser):
```
"""
def __init__(
self,
routes: list[
Routes = Sequence[
tuple[
str,
dict[str, Union[re.Pattern, str]],
Mapping[str, Union[re.Pattern, str]],
BaseBlobParser,
]
],
]
def __init__(
self,
routes: Routes,
*,
password: Optional[str] = None,
):
@ -1736,7 +1739,8 @@ class PDFRouterParser(BaseBlobParser):
import pypdf # noqa:F401
except ImportError:
raise ImportError(
"pypdf package not found, please install it with `pip install pypdf.six`"
"pypdf package not found, please install it with "
"`pip install pypdf.six`"
)
from pypdf import PdfReader

View File

@ -22,8 +22,6 @@ from typing import (
from urllib.parse import urlparse
import requests
from langchain_core.document_loaders import BaseBlobParser
from langchain_core.documents import Document
from langchain_core.utils import get_from_dict_or_env
@ -39,7 +37,7 @@ from langchain_community.document_loaders.parsers.pdf import (
PDFPlumberParser,
PyMuPDFParser,
PyPDFium2Parser,
PyPDFParser, PDFRouterParser,
PyPDFParser,
)
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
@ -1426,60 +1424,61 @@ class ZeroxPDFLoader(BasePDFLoader):
# Legacy: only for backwards compatibility. Use PyPDFLoader instead
PagedPDFSplitter = PyPDFLoader
class PDFRouterLoader(BasePDFLoader):
    """Load a PDF with a parser chosen from its metadata or first-page text.

    The routes are defined as a list of ``(name, patterns, parser)`` tuples:
    a route name, a dictionary mapping metadata keys to regex patterns
    (plain strings or compiled patterns), and the parser to use.
    The special key ``"page1"`` searches the first page with a regexp.

    Order matters: the first matching route is used, so list the most
    specific routes first and add a default route
    ``("default", {}, parser)`` at the end to catch all PDFs.

    Sample:
    ```python
    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
    from langchain_community.document_loaders.parsers import PDFPlumberParser

    routes = [
        # Name, keys with regex, parser
        (
            "Microsoft",
            {"producer": "Microsoft", "creator": "Microsoft"},
            PyMuPDFParser(),
        ),
        ("LibreOffice", {"producer": "LibreOffice"}, PDFPlumberParser()),
        (
            "Xdvipdfmx",
            {"producer": "xdvipdfmx.*", "page1": "Hello"},
            PDFPlumberParser(),
        ),
        ("default", {}, PyPDFium2Parser()),
    ]
    # `routes` is keyword-only.
    loader = PDFRouterLoader(filename, routes=routes)
    loader.load()
    ```
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        *,
        # `Union[re.Pattern, str]` rather than `re.Pattern | str`: the `|`
        # form is evaluated when the function is defined and raises
        # TypeError on Python < 3.10.
        routes: list[
            tuple[
                str,
                dict[str, Union[re.Pattern, str]],
                BaseBlobParser,
            ]
        ],
        password: Optional[str] = None,
    ):
        """Initialize with a file path.

        Args:
            file_path: Path of the PDF, forwarded to ``BasePDFLoader``.
            routes: Ordered ``(name, patterns, parser)`` tuples, forwarded to
                ``PDFRouterParser``.
            password: Optional password, forwarded to ``PDFRouterParser``.
        """
        super().__init__(file_path)
        self.parser = PDFRouterParser(routes, password=password)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily yield the documents produced by the routed parser."""
        if self.web_path:
            # Read the bytes from `file_path` but tag the blob with the web
            # path. The context manager closes the handle deterministically.
            with open(self.file_path, "rb") as pdf_file:
                blob = Blob.from_data(  # type: ignore[attr-defined]
                    pdf_file.read(), path=self.web_path
                )
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.lazy_parse(blob)
# class PDFRouterLoader(BasePDFLoader):
# """
# Load PDFs using different parsers based on the metadata of the PDF
# or the body of the first page.
# The routes are defined as a list of tuples, where each tuple contains
# the name, a dictionary of metadata and regex pattern and the parser to use.
# The special key "page1" is to search in the first page with a regexp.
# Use the route in the correct order, as the first matching route is used.
# Add a default route ("default", {}, parser) at the end to catch all PDFs.
#
# Sample:
# ```python
# from langchain_community.document_loaders import PyPDFLoader
# from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
# from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
# from langchain_community.document_loaders.parsers import PDFPlumberParser
# routes = [
# # Name, keys with regex, parser
# ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
# PyMuPDFParser()),
# ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
# ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"},
# PDFPlumberParser()),
# ("default", {}, PyPDFium2Parser())
# ]
# loader = PDFRouterLoader(filename, routes=routes)
# loader.load()
# ```
# """
#
# def __init__(
# self,
# file_path: Union[str, Path],
# *,
# routes: list[
# tuple[
# str,
# dict[str, Union[re.Pattern, str]],
# BaseBlobParser,
# ]
# ],
# password: Optional[str] = None,
# ):
# """Initialize with a file path."""
# super().__init__(file_path)
# self.parser = PDFRouterParser(routes, password=password)
#
# def lazy_load(
# self,
# ) -> Iterator[Document]:
# if self.web_path:
# blob = Blob.from_data(open(self.file_path, "rb").read(),
# path=self.web_path) # type: ignore[attr-defined]
# else:
# blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
# yield from self.parser.lazy_parse(blob)
# FIXME

View File

@ -2,7 +2,7 @@
import re
from pathlib import Path
from typing import TYPE_CHECKING, Iterator
from typing import TYPE_CHECKING, Iterator, Literal, Union, cast
import pytest
@ -11,10 +11,12 @@ from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import (
BaseImageBlobParser,
PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser,
PDFMinerParser,
PDFPlumberParser,
PDFRouterParser,
PyMuPDFParser,
PyPDFium2Parser,
)
from langchain_community.document_loaders.parsers.pdf import PDFRouterParser, \
PDFMinerParser
if TYPE_CHECKING:
from PIL.Image import Image
@ -315,9 +317,15 @@ def test_parser_with_table(
)
_std_assert_with_parser(parser)
def test_parser_router_parse() -> None:
mode = "single"
routes = [
mode: Literal["single"] = "single"
routes: PDFRouterParser.Routes = [
(
"Xdvipdfmx",
{"producer": re.compile(r"xdvipdfmx.*"), "page1": "Hello"},
PDFMinerParser(mode=mode),
),
(
"Microsoft",
{"producer": "Microsoft", "creator": "Microsoft"},
@ -331,10 +339,14 @@ def test_parser_router_parse() -> None:
PDFMinerParser(mode=mode),
),
(
"Xdvipdfmx",
{"producer": "xdvipdfmx.*", "page1": "Hello"},
PDFMinerParser(mode=mode),
"default",
cast(dict[str, Union[re.Pattern, str]], dict()),
PyPDFium2Parser(mode=mode),
),
("default", {}, PyPDFium2Parser(mode=mode)),
]
_assert_with_parser(PDFRouterParser(routes=routes), splits_by_page=False)
_assert_with_parser(
PDFRouterParser(
routes=routes,
),
splits_by_page=False,
)

View File

@ -14,6 +14,7 @@ def test_parsers_public_api_correct() -> None:
"OpenAIWhisperParser",
"PyPDFParser",
"PDFMinerParser",
"PDFRouterParser",
"PyMuPDFParser",
"PyPDFium2Parser",
"PDFPlumberParser",