From 83563989d59184894360966a20cde599cdfac80c Mon Sep 17 00:00:00 2001
From: Philippe Prados
Date: Tue, 15 Apr 2025 16:11:54 +0200
Subject: [PATCH] Propose PDFRouterParser and Loader

---
 libs/community/extended_testing_deps.txt      |   2 +-
 .../document_loaders/parsers/pdf.py           | 115 ++++++++++++++++++
 .../document_loaders/pdf.py                   |  73 +++++++++++
 .../parsers/test_pdf_parsers.py               |  32 +++++
 4 files changed, 221 insertions(+), 1 deletion(-)

diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt
index 6ad4f43e867..8f071a9336f 100644
--- a/libs/community/extended_testing_deps.txt
+++ b/libs/community/extended_testing_deps.txt
@@ -59,7 +59,7 @@ openapi-pydantic>=0.3.2,<0.4
 oracle-ads>=2.9.1,<3
 oracledb>=2.2.0,<3
 pandas>=2.0.1,<3
-pdfminer-six==20231228
+pdfminer-six==20250327
 pdfplumber>=0.11
 pgvector>=0.1.6,<0.2
 playwright>=1.48.0,<2
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
index 46ad300be96..0d7a00a6ad7 100644
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import re
 import html
 import io
 import logging
@@ -1668,3 +1669,117 @@ class DocumentIntelligenceParser(BaseBlobParser):
             docs = self._generate_docs(blob, result)
 
             yield from docs
+
+
+class PDFRouterParser(BaseBlobParser):
+    """Route a PDF to a parser selected from its metadata or first page.
+
+    The routes are defined as a list of tuples, where each tuple contains
+    the name, a dictionary of metadata keys and regex patterns, and the
+    parser to use.
+    The special key "page1" searches the text of the first page with a regexp.
+    Declare the routes in the correct order, as the first matching route is
+    used. A route matches when all of its patterns are found.
+    Add a default route ("default", {}, parser) at the end to catch all PDFs;
+    when no route matches, no document is produced.
+
+    This code is similar to `MimeTypeBasedParser`, but on the content of the
+    PDF file.
+
+    Sample:
+    ```python
+    from langchain_community.document_loaders.parsers import PDFPlumberParser
+    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
+    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
+    from langchain_community.document_loaders.pdf import PDFRouterLoader
+    routes = [
+        # Name, keys with regex, parser
+        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
+         PyMuPDFParser()),
+        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
+        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1": "Hello"},
+         PDFPlumberParser()),
+        ("default", {}, PyPDFium2Parser())
+    ]
+    loader = PDFRouterLoader(filename, routes=routes)
+    loader.load()
+    ```
+    """
+
+    def __init__(
+        self,
+        routes: list[
+            tuple[
+                str,
+                dict[str, Union[re.Pattern, str]],
+                BaseBlobParser,
+            ]
+        ],
+        *,
+        password: Optional[str] = None,
+    ):
+        """Initialize the router.
+
+        Args:
+            routes: Ordered list of (name, patterns, parser) tuples. Patterns
+                given as strings are compiled with `re.compile`.
+            password: Optional password used to open the PDF with pypdf.
+
+        Raises:
+            ImportError: If the `pypdf` package is not installed.
+        """
+        try:
+            import pypdf  # noqa:F401
+        except ImportError:
+            raise ImportError(
+                "pypdf package not found, please install it with `pip install pypdf`"
+            )
+        super().__init__()
+        self.password = password
+        # Compile the string patterns once, so that matching only calls `search`.
+        self.routes = [
+            (
+                name,
+                {
+                    key: re.compile(pattern) if isinstance(pattern, str) else pattern
+                    for key, pattern in patterns.items()
+                },
+                parser,
+            )
+            for name, patterns, parser in routes
+        ]
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
+        """Lazily parse the blob with the parser of the first matching route.
+
+        The name of the selected route is stored in the "router" metadata of
+        each document. Nothing is yielded when no route matches.
+        """
+        try:
+            from pypdf import PdfReader
+        except ImportError:
+            raise ImportError(
+                "pypdf package not found, please install it with `pip install pypdf`"
+            )
+
+        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
+            with PdfReader(pdf_file_obj, password=self.password) as reader:
+                # `reader.metadata` is None when the PDF carries no metadata.
+                metadata = _purge_metadata(
+                    cast(dict[str, Any], reader.metadata or {})
+                )
+                # A PDF without any page has no first page to search in.
+                pages = reader.pages
+                metadata["page1"] = pages[0].extract_text() if len(pages) else ""
+
+        for name, patterns, parser in self.routes:
+            # Metadata values may not be strings, so convert before searching.
+            if all(
+                key in metadata and pattern.search(str(metadata[key]))
+                for key, pattern in patterns.items()
+            ):
+                for doc in parser.lazy_parse(blob):
+                    doc.metadata["router"] = name
+                    yield doc
+                # Only the first matching route is used.
+                return
diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
index b9e57b19ff1..e5c7523431e 100644
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@@ -22,6 +22,7 @@ from typing import (
 from urllib.parse import urlparse
 
 import requests
+from langchain_core.document_loaders import BaseBlobParser
 from langchain_core.documents import Document
 from langchain_core.utils import get_from_dict_or_env
 
@@ -37,7 +38,8 @@ from langchain_community.document_loaders.parsers.pdf import (
     PDFPlumberParser,
+    PDFRouterParser,
     PyMuPDFParser,
     PyPDFium2Parser,
     PyPDFParser,
 )
 from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
 
@@ -1423,3 +1425,74 @@ class ZeroxPDFLoader(BasePDFLoader):
 
 # Legacy: only for backwards compatibility. Use PyPDFLoader instead
 PagedPDFSplitter = PyPDFLoader
+
+
+class PDFRouterLoader(BasePDFLoader):
+    """Load a PDF with a parser selected from its metadata or first page.
+
+    The routes are defined as a list of tuples, where each tuple contains
+    the name, a dictionary of metadata keys and regex patterns, and the
+    parser to use.
+    The special key "page1" searches the text of the first page with a regexp.
+    Declare the routes in the correct order, as the first matching route is
+    used.
+    Add a default route ("default", {}, parser) at the end to catch all PDFs.
+
+    Sample:
+    ```python
+    from langchain_community.document_loaders.parsers import PDFPlumberParser
+    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
+    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
+    from langchain_community.document_loaders.pdf import PDFRouterLoader
+    routes = [
+        # Name, keys with regex, parser
+        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
+         PyMuPDFParser()),
+        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
+        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1": "Hello"},
+         PDFPlumberParser()),
+        ("default", {}, PyPDFium2Parser())
+    ]
+    loader = PDFRouterLoader(filename, routes=routes)
+    loader.load()
+    ```
+    """
+
+    def __init__(
+        self,
+        file_path: Union[str, Path],
+        *,
+        routes: list[
+            tuple[
+                str,
+                dict[str, Union[re.Pattern, str]],
+                BaseBlobParser,
+            ]
+        ],
+        password: Optional[str] = None,
+    ):
+        """Initialize with a file path.
+
+        Args:
+            file_path: Path of the PDF file to load.
+            routes: Ordered list of (name, patterns, parser) tuples, passed to
+                `PDFRouterParser`.
+            password: Optional password, passed to `PDFRouterParser`.
+        """
+        super().__init__(file_path)
+        self.parser = PDFRouterParser(routes, password=password)
+
+    def lazy_load(
+        self,
+    ) -> Iterator[Document]:
+        """Lazily load the documents produced by the selected parser."""
+        if self.web_path:
+            # Read the local file, but keep `web_path` as the path of the blob.
+            # The file is closed as soon as its content is read.
+            with open(self.file_path, "rb") as pdf_file:
+                blob = Blob.from_data(  # type: ignore[attr-defined]
+                    pdf_file.read(), path=self.web_path
+                )
+        else:
+            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
+        yield from self.parser.lazy_parse(blob)
diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
index 1137dd79f2e..da658f2805a 100644
--- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -11,8 +11,14 @@ from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.parsers import (
     BaseImageBlobParser,
     PDFPlumberParser,
+    PyMuPDFParser,
+    PyPDFium2Parser,
+)
+from langchain_community.document_loaders.parsers.pdf import (
+    PDFMinerParser,
+    PDFRouterParser,
 )
 
 if TYPE_CHECKING:
     from PIL.Image import Image
@@ -312,3 +318,29 @@ def test_parser_with_table(
         **params,
     )
     _std_assert_with_parser(parser)
+
+
+def test_parser_router_parse() -> None:
+    """Run the shared parser assertions through a `PDFRouterParser`."""
+    mode = "single"
+    routes = [
+        (
+            "Microsoft",
+            {"producer": "Microsoft", "creator": "Microsoft"},
+            PyMuPDFParser(mode=mode),
+        ),
+        (
+            "LibreOffice",
+            {
+                "producer": "LibreOffice",
+            },
+            PDFMinerParser(mode=mode),
+        ),
+        (
+            "Xdvipdfmx",
+            {"producer": "xdvipdfmx.*", "page1": "Hello"},
+            PDFMinerParser(mode=mode),
+        ),
+        ("default", {}, PyPDFium2Parser(mode=mode)),
+    ]
+    _assert_with_parser(PDFRouterParser(routes=routes), splits_by_page=False)