Propose PDFRouterParser and Loader

This commit is contained in:
Philippe Prados 2025-04-15 16:39:13 +02:00
parent b5221f2476
commit 007180d990
5 changed files with 98 additions and 79 deletions

View File

@ -29,6 +29,7 @@ if TYPE_CHECKING:
from langchain_community.document_loaders.parsers.pdf import ( from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser, PDFMinerParser,
PDFPlumberParser, PDFPlumberParser,
PDFRouterParser,
PyMuPDFParser, PyMuPDFParser,
PyPDFium2Parser, PyPDFium2Parser,
PyPDFParser, PyPDFParser,
@ -51,6 +52,7 @@ _module_lookup = {
"PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf", "PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
"PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf", "PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
"PyPDFParser": "langchain_community.document_loaders.parsers.pdf", "PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
"PDFRouterParser": "langchain_community.document_loaders.parsers.pdf",
"PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf", "PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
"RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images", "RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
"TesseractBlobParser": "langchain_community.document_loaders.parsers.images", "TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
@ -76,6 +78,7 @@ __all__ = [
"OpenAIWhisperParser", "OpenAIWhisperParser",
"PDFMinerParser", "PDFMinerParser",
"PDFPlumberParser", "PDFPlumberParser",
"PDFRouterParser",
"PyMuPDFParser", "PyMuPDFParser",
"PyPDFParser", "PyPDFParser",
"PyPDFium2Parser", "PyPDFium2Parser",

View File

@ -2,10 +2,10 @@
from __future__ import annotations from __future__ import annotations
import re
import html import html
import io import io
import logging import logging
import re
import threading import threading
import warnings import warnings
from datetime import datetime from datetime import datetime
@ -1670,6 +1670,7 @@ class DocumentIntelligenceParser(BaseBlobParser):
yield from docs yield from docs
class PDFRouterParser(BaseBlobParser): class PDFRouterParser(BaseBlobParser):
""" """
Load PDFs using different parsers based on the metadata of the PDF Load PDFs using different parsers based on the metadata of the PDF
@ -1700,15 +1701,17 @@ class PDFRouterParser(BaseBlobParser):
``` ```
""" """
Routes = Sequence[
tuple[
str,
Mapping[str, Union[re.Pattern, str]],
BaseBlobParser,
]
]
def __init__( def __init__(
self, self,
routes: list[ routes: Routes,
tuple[
str,
dict[str, Union[re.Pattern, str]],
BaseBlobParser,
]
],
*, *,
password: Optional[str] = None, password: Optional[str] = None,
): ):
@ -1736,7 +1739,8 @@ class PDFRouterParser(BaseBlobParser):
import pypdf # noqa:F401 import pypdf # noqa:F401
except ImportError: except ImportError:
raise ImportError( raise ImportError(
"pypdf package not found, please install it with `pip install pypdf.six`" "pypdf package not found, please install it with "
"`pip install pypdf.six`"
) )
from pypdf import PdfReader from pypdf import PdfReader

View File

@ -22,8 +22,6 @@ from typing import (
from urllib.parse import urlparse from urllib.parse import urlparse
import requests import requests
from langchain_core.document_loaders import BaseBlobParser
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_core.utils import get_from_dict_or_env from langchain_core.utils import get_from_dict_or_env
@ -39,7 +37,7 @@ from langchain_community.document_loaders.parsers.pdf import (
PDFPlumberParser, PDFPlumberParser,
PyMuPDFParser, PyMuPDFParser,
PyPDFium2Parser, PyPDFium2Parser,
PyPDFParser, PDFRouterParser, PyPDFParser,
) )
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
@ -1426,60 +1424,61 @@ class ZeroxPDFLoader(BasePDFLoader):
# Legacy: only for backwards compatibility. Use PyPDFLoader instead # Legacy: only for backwards compatibility. Use PyPDFLoader instead
PagedPDFSplitter = PyPDFLoader PagedPDFSplitter = PyPDFLoader
class PDFRouterLoader(BasePDFLoader):
"""
Load PDFs using different parsers based on the metadata of the PDF
or the body of the first page.
The routes are defined as a list of tuples, where each tuple contains
the name, a dictionary of metadata and regex pattern and the parser to use.
The special key "page1" is to search in the first page with a regexp.
Use the route in the correct order, as the first matching route is used.
Add a default route ("default", {}, parser) at the end to catch all PDFs.
Sample: # class PDFRouterLoader(BasePDFLoader):
```python # """
from langchain_community.document_loaders import PyPDFLoader # Load PDFs using different parsers based on the metadata of the PDF
from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser # or the body of the first page.
from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser # The routes are defined as a list of tuples, where each tuple contains
from langchain_community.document_loaders.parsers import PDFPlumberParser # the name, a dictionary of metadata and regex pattern and the parser to use.
routes = [ # The special key "page1" is to search in the first page with a regexp.
# Name, keys with regex, parser # Use the route in the correct order, as the first matching route is used.
("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"}, # Add a default route ("default", {}, parser) at the end to catch all PDFs.
PyMuPDFParser()), #
("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()), # Sample:
("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()), # ```python
("default", {}, PyPDFium2Parser()) # from langchain_community.document_loaders import PyPDFLoader
] # from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
loader = PDFRouterLoader(filename, routes) # from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
loader.load() # from langchain_community.document_loaders.parsers import PDFPlumberParser
``` # routes = [
""" # # Name, keys with regex, parser
# ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
def __init__( # PyMuPDFParser()),
self, # ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
file_path: Union[str, Path], # ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"},
*, # PDFPlumberParser()),
routes: list[ # ("defautl", {}, PyPDFium2Parser())
tuple[ # ]
str, # loader = PDFRouterLoader(filename, routes)
dict[str, Union[re.Pattern | str]], # loader.load()
BaseBlobParser, # ```
] # """
], #
password: Optional[str] = None, # def __init__(
): # self,
"""Initialize with a file path.""" # file_path: Union[str, Path],
super().__init__(file_path) # *,
self.parser = PDFRouterParser(routes, password=password) # routes: list[
# tuple[
# str,
def lazy_load( # dict[str, Union[re.Pattern, str]],
self, # BaseBlobParser,
) -> Iterator[Document]: # ]
if self.web_path: # ],
blob = Blob.from_data( # password: Optional[str] = None,
open(self.file_path, "rb").read(), path=self.web_path # ):
) # type: ignore[attr-defined] # """Initialize with a file path."""
else: # super().__init__(file_path)
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] # self.parser = PDFRouterParser(routes, password=password)
yield from self.parser.lazy_parse(blob) #
# def lazy_load(
# self,
# ) -> Iterator[Document]:
# if self.web_path:
# blob = Blob.from_data(open(self.file_path, "rb").read(),
# path=self.web_path) # type: ignore[attr-defined]
# else:
# blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
# yield from self.parser.lazy_parse(blob)
# FIXME

View File

@ -2,7 +2,7 @@
import re import re
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Iterator from typing import TYPE_CHECKING, Iterator, Literal, Union, cast
import pytest import pytest
@ -11,10 +11,12 @@ from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import ( from langchain_community.document_loaders.parsers import (
BaseImageBlobParser, BaseImageBlobParser,
PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser, PDFMinerParser,
PDFPlumberParser,
PDFRouterParser,
PyMuPDFParser,
PyPDFium2Parser,
) )
from langchain_community.document_loaders.parsers.pdf import PDFRouterParser, \
PDFMinerParser
if TYPE_CHECKING: if TYPE_CHECKING:
from PIL.Image import Image from PIL.Image import Image
@ -315,9 +317,15 @@ def test_parser_with_table(
) )
_std_assert_with_parser(parser) _std_assert_with_parser(parser)
def test_parser_router_parse() -> None: def test_parser_router_parse() -> None:
mode = "single" mode: Literal["single"] = "single"
routes = [ routes: PDFRouterParser.Routes = [
(
"Xdvipdfmx",
{"producer": re.compile(r"xdvipdfmx.*"), "page1": "Hello"},
PDFMinerParser(mode=mode),
),
( (
"Microsoft", "Microsoft",
{"producer": "Microsoft", "creator": "Microsoft"}, {"producer": "Microsoft", "creator": "Microsoft"},
@ -331,10 +339,14 @@ def test_parser_router_parse() -> None:
PDFMinerParser(mode=mode), PDFMinerParser(mode=mode),
), ),
( (
"Xdvipdfmx", "default",
{"producer": "xdvipdfmx.*", "page1": "Hello"}, cast(dict[str, Union[re.Pattern, str]], dict()),
PDFMinerParser(mode=mode), PyPDFium2Parser(mode=mode),
), ),
("default", {}, PyPDFium2Parser(mode=mode)),
] ]
_assert_with_parser(PDFRouterParser(routes=routes), splits_by_page=False) _assert_with_parser(
PDFRouterParser(
routes=routes,
),
splits_by_page=False,
)

View File

@ -14,6 +14,7 @@ def test_parsers_public_api_correct() -> None:
"OpenAIWhisperParser", "OpenAIWhisperParser",
"PyPDFParser", "PyPDFParser",
"PDFMinerParser", "PDFMinerParser",
"PDFRouterParser",
"PyMuPDFParser", "PyMuPDFParser",
"PyPDFium2Parser", "PyPDFium2Parser",
"PDFPlumberParser", "PDFPlumberParser",