From 007180d990dd1362edf5574d2da8fe04f7139dbc Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Tue, 15 Apr 2025 16:39:13 +0200 Subject: [PATCH] Propose PDFRouterParser and Loader --- .../document_loaders/parsers/__init__.py | 3 + .../document_loaders/parsers/pdf.py | 22 ++-- .../document_loaders/pdf.py | 117 +++++++++--------- .../parsers/test_pdf_parsers.py | 34 +++-- .../parsers/test_public_api.py | 1 + 5 files changed, 98 insertions(+), 79 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/__init__.py b/libs/community/langchain_community/document_loaders/parsers/__init__.py index 9712718e197..a2c8c2a9c25 100644 --- a/libs/community/langchain_community/document_loaders/parsers/__init__.py +++ b/libs/community/langchain_community/document_loaders/parsers/__init__.py @@ -29,6 +29,7 @@ if TYPE_CHECKING: from langchain_community.document_loaders.parsers.pdf import ( PDFMinerParser, PDFPlumberParser, + PDFRouterParser, PyMuPDFParser, PyPDFium2Parser, PyPDFParser, @@ -51,6 +52,7 @@ _module_lookup = { "PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf", "PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf", "PyPDFParser": "langchain_community.document_loaders.parsers.pdf", + "PDFRouterParser": "langchain_community.document_loaders.parsers.pdf", "PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf", "RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images", "TesseractBlobParser": "langchain_community.document_loaders.parsers.images", @@ -76,6 +78,7 @@ __all__ = [ "OpenAIWhisperParser", "PDFMinerParser", "PDFPlumberParser", + "PDFRouterParser", "PyMuPDFParser", "PyPDFParser", "PyPDFium2Parser", diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 0d7a00a6ad7..eb2674cf96d 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ 
b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -2,10 +2,10 @@ from __future__ import annotations -import re import html import io import logging +import re import threading import warnings from datetime import datetime @@ -1670,6 +1670,7 @@ class DocumentIntelligenceParser(BaseBlobParser): yield from docs + class PDFRouterParser(BaseBlobParser): """ Load PDFs using different parsers based on the metadata of the PDF @@ -1700,15 +1701,17 @@ class PDFRouterParser(BaseBlobParser): ``` """ + Routes = Sequence[ + tuple[ + str, + Mapping[str, Union[re.Pattern, str]], + BaseBlobParser, + ] + ] + def __init__( self, - routes: list[ - tuple[ - str, - dict[str, Union[re.Pattern, str]], - BaseBlobParser, - ] - ], + routes: Routes, *, password: Optional[str] = None, ): @@ -1736,7 +1739,8 @@ class PDFRouterParser(BaseBlobParser): import pypdf # noqa:F401 except ImportError: raise ImportError( - "pypdf package not found, please install it with `pip install pypdf.six`" + "pypdf package not found, please install it with " + "`pip install pypdf.six`" ) from pypdf import PdfReader diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index e5c7523431e..ca3a9432656 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -22,8 +22,6 @@ from typing import ( from urllib.parse import urlparse import requests - -from langchain_core.document_loaders import BaseBlobParser from langchain_core.documents import Document from langchain_core.utils import get_from_dict_or_env @@ -39,7 +37,7 @@ from langchain_community.document_loaders.parsers.pdf import ( PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser, - PyPDFParser, PDFRouterParser, + PyPDFParser, ) from langchain_community.document_loaders.unstructured import UnstructuredFileLoader @@ -1426,60 +1424,61 @@ class ZeroxPDFLoader(BasePDFLoader): # Legacy: only 
for backwards compatibility. Use PyPDFLoader instead PagedPDFSplitter = PyPDFLoader -class PDFRouterLoader(BasePDFLoader): - """ - Load PDFs using different parsers based on the metadata of the PDF - or the body of the first page. - The routes are defined as a list of tuples, where each tuple contains - the name, a dictionary of metadata and regex pattern and the parser to use. - The special key "page1" is to search in the first page with a regexp. - Use the route in the correct order, as the first matching route is used. - Add a default route ("default", {}, parser) at the end to catch all PDFs. - Sample: - ```python - from langchain_community.document_loaders import PyPDFLoader - from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser - from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser - from langchain_community.document_loaders.parsers import PDFPlumberParser - routes = [ - # Name, keys with regex, parser - ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"}, - PyMuPDFParser()), - ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()), - ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()), - ("defautl", {}, PyPDFium2Parser()) - ] - loader = PDFRouterLoader(filename, routes) - loader.load() - ``` - """ - - def __init__( - self, - file_path: Union[str, Path], - *, - routes: list[ - tuple[ - str, - dict[str, Union[re.Pattern | str]], - BaseBlobParser, - ] - ], - password: Optional[str] = None, - ): - """Initialize with a file path.""" - super().__init__(file_path) - self.parser = PDFRouterParser(routes, password=password) - - - def lazy_load( - self, - ) -> Iterator[Document]: - if self.web_path: - blob = Blob.from_data( - open(self.file_path, "rb").read(), path=self.web_path - ) # type: ignore[attr-defined] - else: - blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] - yield from self.parser.lazy_parse(blob) +# class PDFRouterLoader(BasePDFLoader): +# 
""" +# Load PDFs using different parsers based on the metadata of the PDF +# or the body of the first page. +# The routes are defined as a list of tuples, where each tuple contains +# the name, a dictionary of metadata and regex pattern and the parser to use. +# The special key "page1" is to search in the first page with a regexp. +# Use the route in the correct order, as the first matching route is used. +# Add a default route ("default", {}, parser) at the end to catch all PDFs. +# +# Sample: +# ```python +# from langchain_community.document_loaders import PyPDFLoader +# from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser +# from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser +# from langchain_community.document_loaders.parsers import PDFPlumberParser +# routes = [ +# # Name, keys with regex, parser +# ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"}, +# PyMuPDFParser()), +# ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()), +# ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, +# PDFPlumberParser()), +# ("default", {}, PyPDFium2Parser()) +# ] +# loader = PDFRouterLoader(filename, routes=routes) +# loader.load() +# ``` +# """ +# +# def __init__( +# self, +# file_path: Union[str, Path], +# *, +# routes: list[ +# tuple[ +# str, +# dict[str, Union[re.Pattern, str]], +# BaseBlobParser, +# ] +# ], +# password: Optional[str] = None, +# ): +# """Initialize with a file path.""" +# super().__init__(file_path) +# self.parser = PDFRouterParser(routes, password=password) +# +# def lazy_load( +# self, +# ) -> Iterator[Document]: +# if self.web_path: +# blob = Blob.from_data(open(self.file_path, "rb").read(), +# path=self.web_path) # type: ignore[attr-defined] +# else: +# blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] +# yield from self.parser.lazy_parse(blob) +# FIXME diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py 
b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py index da658f2805a..48076f7e6ef 100644 --- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py +++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py @@ -2,7 +2,7 @@ import re from pathlib import Path -from typing import TYPE_CHECKING, Iterator +from typing import TYPE_CHECKING, Iterator, Literal, Union, cast import pytest @@ -11,10 +11,12 @@ from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.parsers import ( BaseImageBlobParser, - PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser, + PDFMinerParser, + PDFPlumberParser, + PDFRouterParser, + PyMuPDFParser, + PyPDFium2Parser, ) -from langchain_community.document_loaders.parsers.pdf import PDFRouterParser, \ - PDFMinerParser if TYPE_CHECKING: from PIL.Image import Image @@ -315,9 +317,15 @@ def test_parser_with_table( ) _std_assert_with_parser(parser) + def test_parser_router_parse() -> None: - mode = "single" - routes = [ + mode: Literal["single"] = "single" + routes: PDFRouterParser.Routes = [ + ( + "Xdvipdfmx", + {"producer": re.compile(r"xdvipdfmx.*"), "page1": "Hello"}, + PDFMinerParser(mode=mode), + ), ( "Microsoft", {"producer": "Microsoft", "creator": "Microsoft"}, @@ -331,10 +339,14 @@ def test_parser_router_parse() -> None: PDFMinerParser(mode=mode), ), ( - "Xdvipdfmx", - {"producer": "xdvipdfmx.*", "page1": "Hello"}, - PDFMinerParser(mode=mode), + "default", + cast(dict[str, Union[re.Pattern, str]], dict()), + PyPDFium2Parser(mode=mode), ), - ("default", {}, PyPDFium2Parser(mode=mode)), ] - _assert_with_parser(PDFRouterParser(routes=routes), splits_by_page=False) + _assert_with_parser( + PDFRouterParser( + routes=routes, + ), + splits_by_page=False, + ) diff --git 
a/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py b/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py index edb5d1a35d8..d73d450671d 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py @@ -14,6 +14,7 @@ def test_parsers_public_api_correct() -> None: "OpenAIWhisperParser", "PyPDFParser", "PDFMinerParser", + "PDFRouterParser", "PyMuPDFParser", "PyPDFium2Parser", "PDFPlumberParser",