Propose PDFRouterParser and Loader

2025-08-21 02:17:12 +00:00 · 2025-04-15 16:39:13 +02:00 · 2025-04-15 16:39:13 +02:00 · 007180d990
commit 007180d990
parent b5221f2476
5 changed files with 98 additions and 79 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/__init__.py
+++ b/libs/community/langchain_community/document_loaders/parsers/__init__.py
@ -29,6 +29,7 @@ if TYPE_CHECKING:
    from langchain_community.document_loaders.parsers.pdf import (
        PDFMinerParser,
        PDFPlumberParser,
+        PDFRouterParser,
        PyMuPDFParser,
        PyPDFium2Parser,
        PyPDFParser,
@ -51,6 +52,7 @@ _module_lookup = {
    "PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
    "PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
    "PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
+    "PDFRouterParser": "langchain_community.document_loaders.parsers.pdf",
    "PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
    "RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
    "TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
@ -76,6 +78,7 @@ __all__ = [
    "OpenAIWhisperParser",
    "PDFMinerParser",
    "PDFPlumberParser",
+    "PDFRouterParser",
    "PyMuPDFParser",
    "PyPDFParser",
    "PyPDFium2Parser",
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@ -2,10 +2,10 @@

 from __future__ import annotations

-import re
 import html
 import io
 import logging
+import re
 import threading
 import warnings
 from datetime import datetime
@ -1670,6 +1670,7 @@ class DocumentIntelligenceParser(BaseBlobParser):

            yield from docs

+
 class PDFRouterParser(BaseBlobParser):
    """
    Load PDFs using different parsers based on the metadata of the PDF
@ -1700,15 +1701,17 @@ class PDFRouterParser(BaseBlobParser):
    ```
    """

-    def __init__(
-        self,
-        routes: list[
+    Routes = Sequence[
        tuple[
            str,
-                dict[str, Union[re.Pattern, str]],
+            Mapping[str, Union[re.Pattern, str]],
            BaseBlobParser,
        ]
-        ],
+    ]
+
+    def __init__(
+        self,
+        routes: Routes,
        *,
        password: Optional[str] = None,
    ):
@ -1736,7 +1739,8 @@ class PDFRouterParser(BaseBlobParser):
            import pypdf  # noqa:F401
        except ImportError:
            raise ImportError(
-                "pypdf package not found, please install it with `pip install pypdf.six`"
+                "pypdf package not found, please install it with "
+                "`pip install pypdf.six`"
            )
        from pypdf import PdfReader

--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@ -22,8 +22,6 @@ from typing import (
 from urllib.parse import urlparse

 import requests
-
-from langchain_core.document_loaders import BaseBlobParser
 from langchain_core.documents import Document
 from langchain_core.utils import get_from_dict_or_env

@ -39,7 +37,7 @@ from langchain_community.document_loaders.parsers.pdf import (
    PDFPlumberParser,
    PyMuPDFParser,
    PyPDFium2Parser,
-    PyPDFParser, PDFRouterParser,
+    PyPDFParser,
 )
 from langchain_community.document_loaders.unstructured import UnstructuredFileLoader

@ -1426,60 +1424,61 @@ class ZeroxPDFLoader(BasePDFLoader):
 # Legacy: only for backwards compatibility. Use PyPDFLoader instead
 PagedPDFSplitter = PyPDFLoader

-class PDFRouterLoader(BasePDFLoader):
-    """
-    Load PDFs using different parsers based on the metadata of the PDF
-    or the body of the first page.
-    The routes are defined as a list of tuples, where each tuple contains
-    the name, a dictionary of metadata and regex pattern and the parser to use.
-    The special key "page1" is to search in the first page with a regexp.
-    Use the route in the correct order, as the first matching route is used.
-    Add a default route ("default", {}, parser) at the end to catch all PDFs.

-    Sample:
-    ```python
-    from langchain_community.document_loaders import PyPDFLoader
-    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
-    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
-    from langchain_community.document_loaders.parsers import PDFPlumberParser
-    routes = [
-        # Name, keys with regex, parser
-        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
-        PyMuPDFParser()),
-        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
-        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()),
-        ("defautl", {}, PyPDFium2Parser())
-    ]
-    loader = PDFRouterLoader(filename, routes)
-    loader.load()
-    ```
-    """
-
-    def __init__(
-        self,
-        file_path: Union[str, Path],
-        *,
-        routes: list[
-            tuple[
-                str,
-                dict[str, Union[re.Pattern | str]],
-                BaseBlobParser,
-            ]
-        ],
-        password: Optional[str] = None,
-    ):
-        """Initialize with a file path."""
-        super().__init__(file_path)
-        self.parser = PDFRouterParser(routes, password=password)
-
-
-    def lazy_load(
-        self,
-    ) -> Iterator[Document]:
-        if self.web_path:
-            blob = Blob.from_data(
-                open(self.file_path, "rb").read(), path=self.web_path
-            )  # type: ignore[attr-defined]
-        else:
-            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
-        yield from self.parser.lazy_parse(blob)
+# class PDFRouterLoader(BasePDFLoader):
+#     """
+#     Load PDFs using different parsers based on the metadata of the PDF
+#     or the body of the first page.
+#     The routes are defined as a list of tuples, where each tuple contains
+#     the name, a dictionary of metadata and regex pattern and the parser to use.
+#     The special key "page1" is to search in the first page with a regexp.
+#     Use the route in the correct order, as the first matching route is used.
+#     Add a default route ("default", {}, parser) at the end to catch all PDFs.
+#
+#     Sample:
+#     ```python
+#     from langchain_community.document_loaders.pdf import PDFRouterLoader
+#     from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
+#     from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
+#     from langchain_community.document_loaders.parsers import PDFPlumberParser
+#     routes = [
+#         # Name, keys with regex, parser
+#         ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
+#         PyMuPDFParser()),
+#         ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
+#         ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"},
+#         PDFPlumberParser()),
+#         ("default", {}, PyPDFium2Parser())
+#     ]
+#     loader = PDFRouterLoader(filename, routes=routes)
+#     loader.load()
+#     ```
+#     """
+#
+#     def __init__(
+#         self,
+#         file_path: Union[str, Path],
+#         *,
+#         routes: list[
+#             tuple[
+#                 str,
+#                 dict[str, Union[re.Pattern, str]],
+#                 BaseBlobParser,
+#             ]
+#         ],
+#         password: Optional[str] = None,
+#     ):
+#         """Initialize with a file path."""
+#         super().__init__(file_path)
+#         self.parser = PDFRouterParser(routes, password=password)
+#
+#     def lazy_load(
+#         self,
+#     ) -> Iterator[Document]:
+#         if self.web_path:
+#             blob = Blob.from_data(open(self.file_path, "rb").read(),
+#             path=self.web_path)  # type: ignore[attr-defined]
+#         else:
+#             blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
+#         yield from self.parser.lazy_parse(blob)
+# FIXME: re-enable PDFRouterLoader (and its removed imports) or delete this block.
--- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@ -2,7 +2,7 @@

 import re
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterator
+from typing import TYPE_CHECKING, Iterator, Literal, Union, cast

 import pytest

@ -11,10 +11,12 @@ from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.parsers import (
    BaseImageBlobParser,
-    PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser,
+    PDFMinerParser,
+    PDFPlumberParser,
+    PDFRouterParser,
+    PyMuPDFParser,
+    PyPDFium2Parser,
 )
-from langchain_community.document_loaders.parsers.pdf import PDFRouterParser, \
-    PDFMinerParser

 if TYPE_CHECKING:
    from PIL.Image import Image
@ -315,9 +317,15 @@ def test_parser_with_table(
    )
    _std_assert_with_parser(parser)

+
 def test_parser_router_parse() -> None:
-    mode = "single"
-    routes = [
+    mode: Literal["single"] = "single"
+    routes: PDFRouterParser.Routes = [
+        (
+            "Xdvipdfmx",
+            {"producer": re.compile(r"xdvipdfmx.*"), "page1": "Hello"},
+            PDFMinerParser(mode=mode),
+        ),
        (
            "Microsoft",
            {"producer": "Microsoft", "creator": "Microsoft"},
@ -331,10 +339,14 @@ def test_parser_router_parse() -> None:
            PDFMinerParser(mode=mode),
        ),
        (
-            "Xdvipdfmx",
-            {"producer": "xdvipdfmx.*", "page1": "Hello"},
-            PDFMinerParser(mode=mode),
+            "default",
+            cast(dict[str, Union[re.Pattern, str]], dict()),
+            PyPDFium2Parser(mode=mode),
        ),
-        ("default", {}, PyPDFium2Parser(mode=mode)),
    ]
-    _assert_with_parser(PDFRouterParser(routes=routes), splits_by_page=False)
+    _assert_with_parser(
+        PDFRouterParser(
+            routes=routes,
+        ),
+        splits_by_page=False,
+    )
--- a/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py
+++ b/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py
@ -14,6 +14,7 @@ def test_parsers_public_api_correct() -> None:
        "OpenAIWhisperParser",
        "PyPDFParser",
        "PDFMinerParser",
+        "PDFRouterParser",
        "PyMuPDFParser",
        "PyPDFium2Parser",
        "PDFPlumberParser",