Propose PDFRouterParser and Loader

2025-08-22 02:45:49 +00:00 · 2025-04-15 16:39:13 +02:00 · 2025-04-15 16:39:13 +02:00 · 007180d990
commit 007180d990
parent b5221f2476
5 changed files with 98 additions and 79 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/__init__.py
+++ b/libs/community/langchain_community/document_loaders/parsers/__init__.py
@@ -29,6 +29,7 @@ if TYPE_CHECKING:
    from langchain_community.document_loaders.parsers.pdf import (
        PDFMinerParser,
        PDFPlumberParser,
        PDFRouterParser,
        PyMuPDFParser,
        PyPDFium2Parser,
        PyPDFParser,
@@ -51,6 +52,7 @@ _module_lookup = {
    "PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
    "PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
    "PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
    "PDFRouterParser": "langchain_community.document_loaders.parsers.pdf",
    "PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
    "RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
    "TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
@@ -76,6 +78,7 @@ __all__ = [
    "OpenAIWhisperParser",
    "PDFMinerParser",
    "PDFPlumberParser",
    "PDFRouterParser",
    "PyMuPDFParser",
    "PyPDFParser",
    "PyPDFium2Parser",
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -2,10 +2,10 @@
 from __future__ import annotations
 import re
 import html
 import io
 import logging
 import re
 import threading
 import warnings
 from datetime import datetime
@@ -1670,6 +1670,7 @@ class DocumentIntelligenceParser(BaseBlobParser):
            yield from docs
 class PDFRouterParser(BaseBlobParser):
    """
    Load PDFs using different parsers based on the metadata of the PDF
@@ -1700,15 +1701,17 @@ class PDFRouterParser(BaseBlobParser):
    ```
    """
    Routes = Sequence[
        tuple[
            str,
            Mapping[str, Union[re.Pattern, str]],
            BaseBlobParser,
        ]
    ]
    def __init__(
        self,
-        routes: list[
+        routes: Routes,
            tuple[
                str,
                dict[str, Union[re.Pattern, str]],
                BaseBlobParser,
            ]
        ],
        *,
        password: Optional[str] = None,
    ):
@@ -1736,7 +1739,8 @@ class PDFRouterParser(BaseBlobParser):
            import pypdf  # noqa:F401
        except ImportError:
            raise ImportError(
-                "pypdf package not found, please install it with `pip install pypdf.six`"
+                "pypdf package not found, please install it with "
                "`pip install pypdf.six`"
            )
        from pypdf import PdfReader
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@@ -22,8 +22,6 @@ from typing import (
 from urllib.parse import urlparse
 import requests
 from langchain_core.document_loaders import BaseBlobParser
 from langchain_core.documents import Document
 from langchain_core.utils import get_from_dict_or_env
@@ -39,7 +37,7 @@ from langchain_community.document_loaders.parsers.pdf import (
    PDFPlumberParser,
    PyMuPDFParser,
    PyPDFium2Parser,
-    PyPDFParser, PDFRouterParser,
+    PyPDFParser,
 )
 from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
@@ -1426,60 +1424,61 @@ class ZeroxPDFLoader(BasePDFLoader):
 # Legacy: only for backwards compatibility. Use PyPDFLoader instead
 PagedPDFSplitter = PyPDFLoader
 class PDFRouterLoader(BasePDFLoader):
    """
    Load PDFs using different parsers based on the metadata of the PDF
    or the body of the first page.
    The routes are defined as a list of tuples, where each tuple contains
    the name, a dictionary of metadata and regex pattern and the parser to use.
    The special key "page1" is to search in the first page with a regexp.
    Use the route in the correct order, as the first matching route is used.
    Add a default route ("default", {}, parser) at the end to catch all PDFs.
-    Sample:
+# class PDFRouterLoader(BasePDFLoader):
-    ```python
+#     """
-    from langchain_community.document_loaders import PyPDFLoader
+#     Load PDFs using different parsers based on the metadata of the PDF
-    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
+#     or the body of the first page.
-    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
+#     The routes are defined as a list of tuples, where each tuple contains
-    from langchain_community.document_loaders.parsers import PDFPlumberParser
+#     the name, a dictionary of metadata and regex pattern and the parser to use.
-    routes = [
+#     The special key "page1" is to search in the first page with a regexp.
-        # Name, keys with regex, parser
+#     Use the route in the correct order, as the first matching route is used.
-        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
+#     Add a default route ("default", {}, parser) at the end to catch all PDFs.
-        PyMuPDFParser()),
+#
-        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
+#     Sample:
-        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()),
+#     ```python
-        ("defautl", {}, PyPDFium2Parser())
+#     from langchain_community.document_loaders import PyPDFLoader
-    ]
+#     from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
-    loader = PDFRouterLoader(filename, routes)
+#     from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
-    loader.load()
+#     from langchain_community.document_loaders.parsers import PDFPlumberParser
-    ```
+#     routes = [
-    """
+#         # Name, keys with regex, parser
-
+#         ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
-    def __init__(
+#         PyMuPDFParser()),
-        self,
+#         ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
-        file_path: Union[str, Path],
+#         ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"},
-        *,
+#         PDFPlumberParser()),
-        routes: list[
+#         ("default", {}, PyPDFium2Parser())
-            tuple[
+#     ]
-                str,
+#     loader = PDFRouterLoader(filename, routes)
-                dict[str, Union[re.Pattern | str]],
+#     loader.load()
-                BaseBlobParser,
+#     ```
-            ]
+#     """
-        ],
+#
-        password: Optional[str] = None,
+#     def __init__(
-    ):
+#         self,
-        """Initialize with a file path."""
+#         file_path: Union[str, Path],
-        super().__init__(file_path)
+#         *,
-        self.parser = PDFRouterParser(routes, password=password)
+#         routes: list[
-
+#             tuple[
-
+#                 str,
-    def lazy_load(
+#                 dict[str, Union[re.Pattern, str]],
-        self,
+#                 BaseBlobParser,
-    ) -> Iterator[Document]:
+#             ]
-        if self.web_path:
+#         ],
-            blob = Blob.from_data(
+#         password: Optional[str] = None,
-                open(self.file_path, "rb").read(), path=self.web_path
+#     ):
-            )  # type: ignore[attr-defined]
+#         """Initialize with a file path."""
-        else:
+#         super().__init__(file_path)
-            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
+#         self.parser = PDFRouterParser(routes, password=password)
-        yield from self.parser.lazy_parse(blob)
+#
 #     def lazy_load(
 #         self,
 #     ) -> Iterator[Document]:
 #         if self.web_path:
 #             blob = Blob.from_data(open(self.file_path, "rb").read(),
 #             path=self.web_path)  # type: ignore[attr-defined]
 #         else:
 #             blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
 #         yield from self.parser.lazy_parse(blob)
 # FIXME
--- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -2,7 +2,7 @@
 import re
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterator
+from typing import TYPE_CHECKING, Iterator, Literal, Union, cast
 import pytest
@@ -11,10 +11,12 @@ from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.parsers import (
    BaseImageBlobParser,
-    PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser,
+    PDFMinerParser,
    PDFPlumberParser,
    PDFRouterParser,
    PyMuPDFParser,
    PyPDFium2Parser,
 )
 from langchain_community.document_loaders.parsers.pdf import PDFRouterParser, \
    PDFMinerParser
 if TYPE_CHECKING:
    from PIL.Image import Image
@@ -315,9 +317,15 @@ def test_parser_with_table(
    )
    _std_assert_with_parser(parser)
 def test_parser_router_parse() -> None:
-    mode = "single"
+    mode: Literal["single"] = "single"
-    routes = [
+    routes: PDFRouterParser.Routes = [
        (
            "Xdvipdfmx",
            {"producer": re.compile(r"xdvipdfmx.*"), "page1": "Hello"},
            PDFMinerParser(mode=mode),
        ),
        (
            "Microsoft",
            {"producer": "Microsoft", "creator": "Microsoft"},
@@ -331,10 +339,14 @@ def test_parser_router_parse() -> None:
            PDFMinerParser(mode=mode),
        ),
        (
-            "Xdvipdfmx",
+            "default",
-            {"producer": "xdvipdfmx.*", "page1": "Hello"},
+            cast(dict[str, Union[re.Pattern, str]], dict()),
-            PDFMinerParser(mode=mode),
+            PyPDFium2Parser(mode=mode),
        ),
        ("default", {}, PyPDFium2Parser(mode=mode)),
    ]
-    _assert_with_parser(PDFRouterParser(routes=routes), splits_by_page=False)
+    _assert_with_parser(
        PDFRouterParser(
            routes=routes,
        ),
        splits_by_page=False,
    )
--- a/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py
+++ b/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py
@@ -14,6 +14,7 @@ def test_parsers_public_api_correct() -> None:
        "OpenAIWhisperParser",
        "PyPDFParser",
        "PDFMinerParser",
        "PDFRouterParser",
        "PyMuPDFParser",
        "PyPDFium2Parser",
        "PDFPlumberParser",