mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-21 02:17:12 +00:00
Propose PDFRouterParser and Loader
This commit is contained in:
parent
b5221f2476
commit
007180d990
@ -29,6 +29,7 @@ if TYPE_CHECKING:
|
||||
from langchain_community.document_loaders.parsers.pdf import (
|
||||
PDFMinerParser,
|
||||
PDFPlumberParser,
|
||||
PDFRouterParser,
|
||||
PyMuPDFParser,
|
||||
PyPDFium2Parser,
|
||||
PyPDFParser,
|
||||
@ -51,6 +52,7 @@ _module_lookup = {
|
||||
"PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
|
||||
"PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
|
||||
"PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
|
||||
"PDFRouterParser": "langchain_community.document_loaders.parsers.pdf",
|
||||
"PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
|
||||
"RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
|
||||
"TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
|
||||
@ -76,6 +78,7 @@ __all__ = [
|
||||
"OpenAIWhisperParser",
|
||||
"PDFMinerParser",
|
||||
"PDFPlumberParser",
|
||||
"PDFRouterParser",
|
||||
"PyMuPDFParser",
|
||||
"PyPDFParser",
|
||||
"PyPDFium2Parser",
|
||||
|
@ -2,10 +2,10 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import html
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
import threading
|
||||
import warnings
|
||||
from datetime import datetime
|
||||
@ -1670,6 +1670,7 @@ class DocumentIntelligenceParser(BaseBlobParser):
|
||||
|
||||
yield from docs
|
||||
|
||||
|
||||
class PDFRouterParser(BaseBlobParser):
|
||||
"""
|
||||
Load PDFs using different parsers based on the metadata of the PDF
|
||||
@ -1700,15 +1701,17 @@ class PDFRouterParser(BaseBlobParser):
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
routes: list[
|
||||
Routes = Sequence[
|
||||
tuple[
|
||||
str,
|
||||
dict[str, Union[re.Pattern, str]],
|
||||
Mapping[str, Union[re.Pattern, str]],
|
||||
BaseBlobParser,
|
||||
]
|
||||
],
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
routes: Routes,
|
||||
*,
|
||||
password: Optional[str] = None,
|
||||
):
|
||||
@ -1736,7 +1739,8 @@ class PDFRouterParser(BaseBlobParser):
|
||||
import pypdf # noqa:F401
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"pypdf package not found, please install it with `pip install pypdf.six`"
|
||||
"pypdf package not found, please install it with "
|
||||
"`pip install pypdf.six`"
|
||||
)
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
@ -22,8 +22,6 @@ from typing import (
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
|
||||
from langchain_core.document_loaders import BaseBlobParser
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.utils import get_from_dict_or_env
|
||||
|
||||
@ -39,7 +37,7 @@ from langchain_community.document_loaders.parsers.pdf import (
|
||||
PDFPlumberParser,
|
||||
PyMuPDFParser,
|
||||
PyPDFium2Parser,
|
||||
PyPDFParser, PDFRouterParser,
|
||||
PyPDFParser,
|
||||
)
|
||||
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
@ -1426,60 +1424,61 @@ class ZeroxPDFLoader(BasePDFLoader):
|
||||
# Legacy: only for backwards compatibility. Use PyPDFLoader instead
|
||||
PagedPDFSplitter = PyPDFLoader
|
||||
|
||||
class PDFRouterLoader(BasePDFLoader):
    """Load a PDF with a parser selected from its metadata or first page.

    This loader is a thin wrapper that builds a ``PDFRouterParser`` from the
    given routes and feeds it a blob made from the file.

    The routes are defined as a list of tuples, where each tuple contains
    the name, a dictionary of metadata and regex pattern and the parser to use.
    The special key "page1" is to search in the first page with a regexp.
    Use the route in the correct order, as the first matching route is used.
    Add a default route ("default", {}, parser) at the end to catch all PDFs.

    Sample:
    ```python
    from langchain_community.document_loaders import PyPDFLoader
    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
    from langchain_community.document_loaders.parsers import PDFPlumberParser
    routes = [
        # Name, keys with regex, parser
        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
         PyMuPDFParser()),
        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"},
         PDFPlumberParser()),
        ("default", {}, PyPDFium2Parser())
    ]
    loader = PDFRouterLoader(filename, routes=routes)
    loader.load()
    ```
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        *,
        routes: list[
            tuple[
                str,
                dict[str, Union[re.Pattern, str]],
                BaseBlobParser,
            ]
        ],
        password: Optional[str] = None,
    ):
        """Initialize with a file path.

        Args:
            file_path: Location of the PDF; handed to ``BasePDFLoader``.
            routes: Ordered ``(name, {key: pattern}, parser)`` tuples that are
                passed unchanged to ``PDFRouterParser``. Keyword-only.
            password: Optional password forwarded to ``PDFRouterParser``.
        """
        super().__init__(file_path)
        self.parser = PDFRouterParser(routes, password=password)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily yield the documents produced by the router parser.

        Yields:
            The ``Document`` objects returned by ``self.parser.lazy_parse``.
        """
        if self.web_path:
            # The bytes come from ``self.file_path`` while the blob is labelled
            # with ``self.web_path`` (presumably file_path is the local copy of
            # the remote file made by BasePDFLoader -- confirm).
            # The context manager closes the handle deterministically instead
            # of leaving it to the garbage collector.
            with open(self.file_path, "rb") as pdf_file:
                data = pdf_file.read()
            blob = Blob.from_data(
                data, path=self.web_path
            )  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.lazy_parse(blob)
|
||||
# class PDFRouterLoader(BasePDFLoader):
|
||||
# """
|
||||
# Load PDFs using different parsers based on the metadata of the PDF
|
||||
# or the body of the first page.
|
||||
# The routes are defined as a list of tuples, where each tuple contains
|
||||
# the name, a dictionary of metadata and regex pattern and the parser to use.
|
||||
# The special key "page1" is to search in the first page with a regexp.
|
||||
# Use the route in the correct order, as the first matching route is used.
|
||||
# Add a default route ("default", {}, parser) at the end to catch all PDFs.
|
||||
#
|
||||
# Sample:
|
||||
# ```python
|
||||
# from langchain_community.document_loaders import PyPDFLoader
|
||||
# from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
|
||||
# from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
|
||||
# from langchain_community.document_loaders.parsers import PDFPlumberParser
|
||||
# routes = [
|
||||
# # Name, keys with regex, parser
|
||||
# ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
|
||||
# PyMuPDFParser()),
|
||||
# ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
|
||||
# ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"},
|
||||
# PDFPlumberParser()),
|
||||
# ("default", {}, PyPDFium2Parser())
|
||||
# ]
|
||||
# loader = PDFRouterLoader(filename, routes=routes)
|
||||
# loader.load()
|
||||
# ```
|
||||
# """
|
||||
#
|
||||
# def __init__(
|
||||
# self,
|
||||
# file_path: Union[str, Path],
|
||||
# *,
|
||||
# routes: list[
|
||||
# tuple[
|
||||
# str,
|
||||
# dict[str, Union[re.Pattern, str]],
|
||||
# BaseBlobParser,
|
||||
# ]
|
||||
# ],
|
||||
# password: Optional[str] = None,
|
||||
# ):
|
||||
# """Initialize with a file path."""
|
||||
# super().__init__(file_path)
|
||||
# self.parser = PDFRouterParser(routes, password=password)
|
||||
#
|
||||
# def lazy_load(
|
||||
# self,
|
||||
# ) -> Iterator[Document]:
|
||||
# if self.web_path:
|
||||
# blob = Blob.from_data(open(self.file_path, "rb").read(),
|
||||
# path=self.web_path) # type: ignore[attr-defined]
|
||||
# else:
|
||||
# blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
|
||||
# yield from self.parser.lazy_parse(blob)
|
||||
# FIXME: PDFRouterLoader above is commented out; re-enable it or remove it.
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Iterator
|
||||
from typing import TYPE_CHECKING, Iterator, Literal, Union, cast
|
||||
|
||||
import pytest
|
||||
|
||||
@ -11,10 +11,12 @@ from langchain_community.document_loaders.base import BaseBlobParser
|
||||
from langchain_community.document_loaders.blob_loaders import Blob
|
||||
from langchain_community.document_loaders.parsers import (
|
||||
BaseImageBlobParser,
|
||||
PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser,
|
||||
PDFMinerParser,
|
||||
PDFPlumberParser,
|
||||
PDFRouterParser,
|
||||
PyMuPDFParser,
|
||||
PyPDFium2Parser,
|
||||
)
|
||||
from langchain_community.document_loaders.parsers.pdf import PDFRouterParser, \
|
||||
PDFMinerParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from PIL.Image import Image
|
||||
@ -315,9 +317,15 @@ def test_parser_with_table(
|
||||
)
|
||||
_std_assert_with_parser(parser)
|
||||
|
||||
|
||||
def test_parser_router_parse() -> None:
|
||||
mode = "single"
|
||||
routes = [
|
||||
mode: Literal["single"] = "single"
|
||||
routes: PDFRouterParser.Routes = [
|
||||
(
|
||||
"Xdvipdfmx",
|
||||
{"producer": re.compile(r"xdvipdfmx.*"), "page1": "Hello"},
|
||||
PDFMinerParser(mode=mode),
|
||||
),
|
||||
(
|
||||
"Microsoft",
|
||||
{"producer": "Microsoft", "creator": "Microsoft"},
|
||||
@ -331,10 +339,14 @@ def test_parser_router_parse() -> None:
|
||||
PDFMinerParser(mode=mode),
|
||||
),
|
||||
(
|
||||
"Xdvipdfmx",
|
||||
{"producer": "xdvipdfmx.*", "page1": "Hello"},
|
||||
PDFMinerParser(mode=mode),
|
||||
"default",
|
||||
cast(dict[str, Union[re.Pattern, str]], dict()),
|
||||
PyPDFium2Parser(mode=mode),
|
||||
),
|
||||
("default", {}, PyPDFium2Parser(mode=mode)),
|
||||
]
|
||||
_assert_with_parser(PDFRouterParser(routes=routes), splits_by_page=False)
|
||||
_assert_with_parser(
|
||||
PDFRouterParser(
|
||||
routes=routes,
|
||||
),
|
||||
splits_by_page=False,
|
||||
)
|
||||
|
@ -14,6 +14,7 @@ def test_parsers_public_api_correct() -> None:
|
||||
"OpenAIWhisperParser",
|
||||
"PyPDFParser",
|
||||
"PDFMinerParser",
|
||||
"PDFRouterParser",
|
||||
"PyMuPDFParser",
|
||||
"PyPDFium2Parser",
|
||||
"PDFPlumberParser",
|
||||
|
Loading…
Reference in New Issue
Block a user