Propose PDFRouterParser and Loader

2025-08-22 19:08:40 +00:00 · 2025-04-15 16:11:54 +02:00 · 2025-04-15 16:11:54 +02:00 · 83563989d5
commit 83563989d5
parent ed5c4805f6
4 changed files with 178 additions and 3 deletions
--- a/libs/community/extended_testing_deps.txt
+++ b/libs/community/extended_testing_deps.txt
@ -59,7 +59,7 @@ openapi-pydantic>=0.3.2,<0.4
 oracle-ads>=2.9.1,<3
 oracledb>=2.2.0,<3
 pandas>=2.0.1,<3
-pdfminer-six==20231228
+pdfminer-six==20250327
 pdfplumber>=0.11
 pgvector>=0.1.6,<0.2
 playwright>=1.48.0,<2
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@ -2,6 +2,7 @@
 from __future__ import annotations
 import re
 import html
 import io
 import logging
@ -1668,3 +1669,91 @@ class DocumentIntelligenceParser(BaseBlobParser):
            docs = self._generate_docs(blob, result)
            yield from docs
 class PDFRouterParser(BaseBlobParser):
    """Delegate the parsing of a PDF to one of several parsers.

    The parser to use is selected from the metadata of the PDF and/or from the
    text of its first page, both read with ``pypdf``.

    Routes are given as a list of ``(name, patterns, parser)`` tuples:

    - ``name`` is stored in ``doc.metadata["router"]`` of every document
      produced through that route.
    - ``patterns`` maps a metadata key to a regular expression (a string or a
      compiled pattern). The special key ``"page1"`` is matched against the
      text of the first page. Patterns are applied with ``search``, so they
      may match anywhere in the value. A route matches when *all* of its
      patterns match; an empty dictionary always matches.
    - ``parser`` is the ``BaseBlobParser`` the blob is delegated to.

    Routes are evaluated in order and the first matching route wins, so put
    the most specific routes first and finish with a catch-all route
    ``("default", {}, parser)``. Without a matching route no document is
    produced (a warning is logged).

    This is similar to `MimeTypeBasedParser`, but the choice is driven by the
    content of the PDF file.

    Sample:
    ```python
    from langchain_community.document_loaders.blob_loaders import Blob
    from langchain_community.document_loaders.parsers import PDFPlumberParser
    from langchain_community.document_loaders.parsers.pdf import (
        PDFRouterParser,
        PyMuPDFParser,
        PyPDFium2Parser,
    )

    routes = [
        # Name, keys with regex, parser
        (
            "Microsoft",
            {"producer": "Microsoft", "creator": "Microsoft"},
            PyMuPDFParser(),
        ),
        ("LibreOffice", {"producer": "LibreOffice"}, PDFPlumberParser()),
        (
            "Xdvipdfmx",
            {"producer": "xdvipdfmx.*", "page1": "Hello"},
            PDFPlumberParser(),
        ),
        ("default", {}, PyPDFium2Parser()),
    ]
    parser = PDFRouterParser(routes)
    docs = list(parser.lazy_parse(Blob.from_path(filename)))
    ```
    """

    def __init__(
        self,
        routes: list[
            tuple[
                str,
                dict[str, Union[re.Pattern, str]],
                BaseBlobParser,
            ]
        ],
        *,
        password: Optional[str] = None,
    ):
        """Initialize the router.

        Args:
            routes: Ordered list of ``(name, patterns, parser)`` tuples. String
                patterns are compiled once here.
            password: Optional password handed to ``pypdf.PdfReader`` when the
                PDF is inspected to choose a route.

        Raises:
            ImportError: If ``pypdf`` is not installed.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError:
            raise ImportError(
                "pypdf package not found, please install it with `pip install pypdf`"
            )
        super().__init__()
        self.password = password
        # Compile string patterns up front so lazy_parse only ever deals with
        # compiled patterns.
        self.routes = [
            (
                name,
                {
                    key: re.compile(pattern) if isinstance(pattern, str) else pattern
                    for key, pattern in matchs.items()
                },
                parser,
            )
            for name, matchs, parser in routes
        ]

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob with the first route that matches it.

        Args:
            blob: The PDF to inspect and then hand over to the selected parser.

        Yields:
            The documents produced by the selected parser, each with the name
            of the route added under the ``"router"`` metadata key. Nothing is
            yielded when no route matches.

        Raises:
            ImportError: If ``pypdf`` is not installed.
        """
        try:
            from pypdf import PdfReader
        except ImportError:
            raise ImportError(
                "pypdf package not found, please install it with `pip install pypdf`"
            )

        # Only the metadata and the first page are needed to pick a route, so
        # the reader and the stream are released before delegating: the
        # selected parser opens the blob again by itself.
        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
            with PdfReader(pdf_file_obj, password=self.password) as reader:
                # A PDF may carry no metadata at all.
                metadata = _purge_metadata(
                    cast(dict[str, Any], reader.metadata or {})
                )
                # A PDF without pages has no first page to search in.
                metadata["page1"] = (
                    reader.pages[0].extract_text() if len(reader.pages) else ""
                )

        selected = None
        for name, match, parser in self.routes:
            # Metadata values are not guaranteed to be strings, hence str().
            if all(
                key in metadata and pattern.search(str(metadata[key]))
                for key, pattern in match.items()
            ):
                selected = (name, parser)
                break

        if selected is None:
            logging.getLogger(__name__).warning(
                "No route matches this PDF, no document is produced. "
                'Add a default route ("default", {}, parser) to catch all PDFs.'
            )
            return

        route_name, route_parser = selected
        for doc in route_parser.lazy_parse(blob):
            doc.metadata["router"] = route_name
            yield doc
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@ -22,6 +22,8 @@ from typing import (
 from urllib.parse import urlparse
 import requests
 from langchain_core.document_loaders import BaseBlobParser
 from langchain_core.documents import Document
 from langchain_core.utils import get_from_dict_or_env
@ -37,7 +39,7 @@ from langchain_community.document_loaders.parsers.pdf import (
    PDFPlumberParser,
    PyMuPDFParser,
    PyPDFium2Parser,
-    PyPDFParser,
+    PyPDFParser, PDFRouterParser,
 )
 from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
@ -1423,3 +1425,61 @@ class ZeroxPDFLoader(BasePDFLoader):
 # Legacy: only for backwards compatibility. Use PyPDFLoader instead
 PagedPDFSplitter = PyPDFLoader
 class PDFRouterLoader(BasePDFLoader):
    """Load a PDF with a parser chosen from the content of the file.

    The parser is selected by a `PDFRouterParser` from the metadata of the PDF
    and/or the text of its first page.

    Routes are given as a list of ``(name, patterns, parser)`` tuples, where
    ``patterns`` maps a metadata key to a regular expression. The special key
    ``"page1"`` searches the first page with a regexp. Routes are tried in
    order and the first matching route is used, so add a default route
    ``("default", {}, parser)`` at the end to catch all PDFs.

    Sample:
    ```python
    from langchain_community.document_loaders.parsers import PDFPlumberParser
    from langchain_community.document_loaders.parsers.pdf import (
        PyMuPDFParser,
        PyPDFium2Parser,
    )
    from langchain_community.document_loaders.pdf import PDFRouterLoader

    routes = [
        # Name, keys with regex, parser
        (
            "Microsoft",
            {"producer": "Microsoft", "creator": "Microsoft"},
            PyMuPDFParser(),
        ),
        ("LibreOffice", {"producer": "LibreOffice"}, PDFPlumberParser()),
        (
            "Xdvipdfmx",
            {"producer": "xdvipdfmx.*", "page1": "Hello"},
            PDFPlumberParser(),
        ),
        ("default", {}, PyPDFium2Parser()),
    ]
    # `routes` is keyword-only.
    loader = PDFRouterLoader(filename, routes=routes)
    loader.load()
    ```
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        *,
        routes: list[
            tuple[
                str,
                dict[str, Union[re.Pattern, str]],
                BaseBlobParser,
            ]
        ],
        password: Optional[str] = None,
    ):
        """Initialize with a file path.

        Args:
            file_path: Location of the PDF, forwarded to ``BasePDFLoader``.
            routes: Ordered list of ``(name, patterns, parser)`` tuples,
                forwarded to `PDFRouterParser`.
            password: Optional password, forwarded to `PDFRouterParser`.
        """
        super().__init__(file_path)
        self.parser = PDFRouterParser(routes, password=password)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily load the PDF through the routed parser.

        Yields:
            The documents produced by ``self.parser`` for this file.
        """
        if self.web_path:
            # NOTE(review): presumably BasePDFLoader stored a local copy of the
            # remote file in `file_path`; the blob keeps `web_path` as its path.
            # Read through a context manager so the handle is always closed.
            with open(self.file_path, "rb") as pdf_file:
                data = pdf_file.read()
            blob = Blob.from_data(
                data, path=self.web_path
            )  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.lazy_parse(blob)
--- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@ -11,8 +11,10 @@ from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.parsers import (
    BaseImageBlobParser,
-    PDFPlumberParser,
+    PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser,
 )
 from langchain_community.document_loaders.parsers.pdf import PDFRouterParser, \
    PDFMinerParser
 if TYPE_CHECKING:
    from PIL.Image import Image
@ -312,3 +314,27 @@ def test_parser_with_table(
        **params,
    )
    _std_assert_with_parser(parser)
 def test_parser_router_parse() -> None:
    """Run the shared parser assertions against a four-route `PDFRouterParser`."""
    parse_mode = "single"

    # Most specific routes first; the empty pattern dict of the last route
    # makes it the catch-all.
    microsoft_route = (
        "Microsoft",
        {"producer": "Microsoft", "creator": "Microsoft"},
        PyMuPDFParser(mode=parse_mode),
    )
    libreoffice_route = (
        "LibreOffice",
        {"producer": "LibreOffice"},
        PDFMinerParser(mode=parse_mode),
    )
    xdvipdfmx_route = (
        "Xdvipdfmx",
        {"producer": "xdvipdfmx.*", "page1": "Hello"},
        PDFMinerParser(mode=parse_mode),
    )
    default_route = ("default", {}, PyPDFium2Parser(mode=parse_mode))

    router = PDFRouterParser(
        routes=[microsoft_route, libreoffice_route, xdvipdfmx_route, default_route]
    )
    _assert_with_parser(router, splits_by_page=False)