mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-22 02:45:49 +00:00
Propose PDFRouterParser and Loader
This commit is contained in:
parent
b5221f2476
commit
007180d990
@ -29,6 +29,7 @@ if TYPE_CHECKING:
|
|||||||
from langchain_community.document_loaders.parsers.pdf import (
|
from langchain_community.document_loaders.parsers.pdf import (
|
||||||
PDFMinerParser,
|
PDFMinerParser,
|
||||||
PDFPlumberParser,
|
PDFPlumberParser,
|
||||||
|
PDFRouterParser,
|
||||||
PyMuPDFParser,
|
PyMuPDFParser,
|
||||||
PyPDFium2Parser,
|
PyPDFium2Parser,
|
||||||
PyPDFParser,
|
PyPDFParser,
|
||||||
@ -51,6 +52,7 @@ _module_lookup = {
|
|||||||
"PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
|
"PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
|
||||||
"PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
|
"PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
|
||||||
"PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
|
"PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
|
||||||
|
"PDFRouterParser": "langchain_community.document_loaders.parsers.pdf",
|
||||||
"PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
|
"PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
|
||||||
"RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
|
"RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
|
||||||
"TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
|
"TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
|
||||||
@ -76,6 +78,7 @@ __all__ = [
|
|||||||
"OpenAIWhisperParser",
|
"OpenAIWhisperParser",
|
||||||
"PDFMinerParser",
|
"PDFMinerParser",
|
||||||
"PDFPlumberParser",
|
"PDFPlumberParser",
|
||||||
|
"PDFRouterParser",
|
||||||
"PyMuPDFParser",
|
"PyMuPDFParser",
|
||||||
"PyPDFParser",
|
"PyPDFParser",
|
||||||
"PyPDFium2Parser",
|
"PyPDFium2Parser",
|
||||||
|
@ -2,10 +2,10 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import re
|
|
||||||
import html
|
import html
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
import threading
|
import threading
|
||||||
import warnings
|
import warnings
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@ -1670,6 +1670,7 @@ class DocumentIntelligenceParser(BaseBlobParser):
|
|||||||
|
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
|
|
||||||
class PDFRouterParser(BaseBlobParser):
|
class PDFRouterParser(BaseBlobParser):
|
||||||
"""
|
"""
|
||||||
Load PDFs using different parsers based on the metadata of the PDF
|
Load PDFs using different parsers based on the metadata of the PDF
|
||||||
@ -1700,15 +1701,17 @@ class PDFRouterParser(BaseBlobParser):
|
|||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
Routes = Sequence[
|
||||||
self,
|
|
||||||
routes: list[
|
|
||||||
tuple[
|
tuple[
|
||||||
str,
|
str,
|
||||||
dict[str, Union[re.Pattern, str]],
|
Mapping[str, Union[re.Pattern, str]],
|
||||||
BaseBlobParser,
|
BaseBlobParser,
|
||||||
]
|
]
|
||||||
],
|
]
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
routes: Routes,
|
||||||
*,
|
*,
|
||||||
password: Optional[str] = None,
|
password: Optional[str] = None,
|
||||||
):
|
):
|
||||||
@ -1736,7 +1739,8 @@ class PDFRouterParser(BaseBlobParser):
|
|||||||
import pypdf # noqa:F401
|
import pypdf # noqa:F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"pypdf package not found, please install it with `pip install pypdf.six`"
|
"pypdf package not found, please install it with "
|
||||||
|
"`pip install pypdf.six`"
|
||||||
)
|
)
|
||||||
from pypdf import PdfReader
|
from pypdf import PdfReader
|
||||||
|
|
||||||
|
@ -22,8 +22,6 @@ from typing import (
|
|||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from langchain_core.document_loaders import BaseBlobParser
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from langchain_core.utils import get_from_dict_or_env
|
from langchain_core.utils import get_from_dict_or_env
|
||||||
|
|
||||||
@ -39,7 +37,7 @@ from langchain_community.document_loaders.parsers.pdf import (
|
|||||||
PDFPlumberParser,
|
PDFPlumberParser,
|
||||||
PyMuPDFParser,
|
PyMuPDFParser,
|
||||||
PyPDFium2Parser,
|
PyPDFium2Parser,
|
||||||
PyPDFParser, PDFRouterParser,
|
PyPDFParser,
|
||||||
)
|
)
|
||||||
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
|
||||||
@ -1426,60 +1424,61 @@ class ZeroxPDFLoader(BasePDFLoader):
|
|||||||
# Legacy: only for backwards compatibility. Use PyPDFLoader instead
|
# Legacy: only for backwards compatibility. Use PyPDFLoader instead
|
||||||
PagedPDFSplitter = PyPDFLoader
|
PagedPDFSplitter = PyPDFLoader
|
||||||
|
|
||||||
class PDFRouterLoader(BasePDFLoader):
|
|
||||||
"""
|
|
||||||
Load PDFs using different parsers based on the metadata of the PDF
|
|
||||||
or the body of the first page.
|
|
||||||
The routes are defined as a list of tuples, where each tuple contains
|
|
||||||
the name, a dictionary of metadata and regex pattern and the parser to use.
|
|
||||||
The special key "page1" is to search in the first page with a regexp.
|
|
||||||
Use the route in the correct order, as the first matching route is used.
|
|
||||||
Add a default route ("default", {}, parser) at the end to catch all PDFs.
|
|
||||||
|
|
||||||
Sample:
|
# class PDFRouterLoader(BasePDFLoader):
|
||||||
```python
|
# """
|
||||||
from langchain_community.document_loaders import PyPDFLoader
|
# Load PDFs using different parsers based on the metadata of the PDF
|
||||||
from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
|
# or the body of the first page.
|
||||||
from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
|
# The routes are defined as a list of tuples, where each tuple contains
|
||||||
from langchain_community.document_loaders.parsers import PDFPlumberParser
|
# the name, a dictionary of metadata and regex pattern and the parser to use.
|
||||||
routes = [
|
# The special key "page1" applies its regex to the text of the first page.
|
||||||
# Name, keys with regex, parser
|
# List the routes in priority order, as the first matching route is used.
|
||||||
("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
|
# Add a default route ("default", {}, parser) at the end to catch all PDFs.
|
||||||
PyMuPDFParser()),
|
#
|
||||||
("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
|
# Sample:
|
||||||
("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()),
|
# ```python
|
||||||
("defautl", {}, PyPDFium2Parser())
|
# from langchain_community.document_loaders import PyPDFLoader
|
||||||
]
|
# from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
|
||||||
loader = PDFRouterLoader(filename, routes)
|
# from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
|
||||||
loader.load()
|
# from langchain_community.document_loaders.parsers import PDFPlumberParser
|
||||||
```
|
# routes = [
|
||||||
"""
|
# # Name, keys with regex, parser
|
||||||
|
# ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
|
||||||
def __init__(
|
# PyMuPDFParser()),
|
||||||
self,
|
# ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
|
||||||
file_path: Union[str, Path],
|
# ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"},
|
||||||
*,
|
# PDFPlumberParser()),
|
||||||
routes: list[
|
# ("default", {}, PyPDFium2Parser())
|
||||||
tuple[
|
# ]
|
||||||
str,
|
# loader = PDFRouterLoader(filename, routes)
|
||||||
dict[str, Union[re.Pattern | str]],
|
# loader.load()
|
||||||
BaseBlobParser,
|
# ```
|
||||||
]
|
# """
|
||||||
],
|
#
|
||||||
password: Optional[str] = None,
|
# def __init__(
|
||||||
):
|
# self,
|
||||||
"""Initialize with a file path."""
|
# file_path: Union[str, Path],
|
||||||
super().__init__(file_path)
|
# *,
|
||||||
self.parser = PDFRouterParser(routes, password=password)
|
# routes: list[
|
||||||
|
# tuple[
|
||||||
|
# str,
|
||||||
def lazy_load(
|
# dict[str, Union[re.Pattern, str]],
|
||||||
self,
|
# BaseBlobParser,
|
||||||
) -> Iterator[Document]:
|
# ]
|
||||||
if self.web_path:
|
# ],
|
||||||
blob = Blob.from_data(
|
# password: Optional[str] = None,
|
||||||
open(self.file_path, "rb").read(), path=self.web_path
|
# ):
|
||||||
) # type: ignore[attr-defined]
|
# """Initialize with a file path."""
|
||||||
else:
|
# super().__init__(file_path)
|
||||||
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
|
# self.parser = PDFRouterParser(routes, password=password)
|
||||||
yield from self.parser.lazy_parse(blob)
|
#
|
||||||
|
# def lazy_load(
|
||||||
|
# self,
|
||||||
|
# ) -> Iterator[Document]:
|
||||||
|
# if self.web_path:
|
||||||
|
# blob = Blob.from_data(open(self.file_path, "rb").read(),
|
||||||
|
# path=self.web_path) # type: ignore[attr-defined]
|
||||||
|
# else:
|
||||||
|
# blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
|
||||||
|
# yield from self.parser.lazy_parse(blob)
|
||||||
|
# FIXME
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Iterator
|
from typing import TYPE_CHECKING, Iterator, Literal, Union, cast
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@ -11,10 +11,12 @@ from langchain_community.document_loaders.base import BaseBlobParser
|
|||||||
from langchain_community.document_loaders.blob_loaders import Blob
|
from langchain_community.document_loaders.blob_loaders import Blob
|
||||||
from langchain_community.document_loaders.parsers import (
|
from langchain_community.document_loaders.parsers import (
|
||||||
BaseImageBlobParser,
|
BaseImageBlobParser,
|
||||||
PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser,
|
PDFMinerParser,
|
||||||
|
PDFPlumberParser,
|
||||||
|
PDFRouterParser,
|
||||||
|
PyMuPDFParser,
|
||||||
|
PyPDFium2Parser,
|
||||||
)
|
)
|
||||||
from langchain_community.document_loaders.parsers.pdf import PDFRouterParser, \
|
|
||||||
PDFMinerParser
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
@ -315,9 +317,15 @@ def test_parser_with_table(
|
|||||||
)
|
)
|
||||||
_std_assert_with_parser(parser)
|
_std_assert_with_parser(parser)
|
||||||
|
|
||||||
|
|
||||||
def test_parser_router_parse() -> None:
|
def test_parser_router_parse() -> None:
|
||||||
mode = "single"
|
mode: Literal["single"] = "single"
|
||||||
routes = [
|
routes: PDFRouterParser.Routes = [
|
||||||
|
(
|
||||||
|
"Xdvipdfmx",
|
||||||
|
{"producer": re.compile(r"xdvipdfmx.*"), "page1": "Hello"},
|
||||||
|
PDFMinerParser(mode=mode),
|
||||||
|
),
|
||||||
(
|
(
|
||||||
"Microsoft",
|
"Microsoft",
|
||||||
{"producer": "Microsoft", "creator": "Microsoft"},
|
{"producer": "Microsoft", "creator": "Microsoft"},
|
||||||
@ -331,10 +339,14 @@ def test_parser_router_parse() -> None:
|
|||||||
PDFMinerParser(mode=mode),
|
PDFMinerParser(mode=mode),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"Xdvipdfmx",
|
"default",
|
||||||
{"producer": "xdvipdfmx.*", "page1": "Hello"},
|
cast(dict[str, Union[re.Pattern, str]], dict()),
|
||||||
PDFMinerParser(mode=mode),
|
PyPDFium2Parser(mode=mode),
|
||||||
),
|
),
|
||||||
("default", {}, PyPDFium2Parser(mode=mode)),
|
|
||||||
]
|
]
|
||||||
_assert_with_parser(PDFRouterParser(routes=routes), splits_by_page=False)
|
_assert_with_parser(
|
||||||
|
PDFRouterParser(
|
||||||
|
routes=routes,
|
||||||
|
),
|
||||||
|
splits_by_page=False,
|
||||||
|
)
|
||||||
|
@ -14,6 +14,7 @@ def test_parsers_public_api_correct() -> None:
|
|||||||
"OpenAIWhisperParser",
|
"OpenAIWhisperParser",
|
||||||
"PyPDFParser",
|
"PyPDFParser",
|
||||||
"PDFMinerParser",
|
"PDFMinerParser",
|
||||||
|
"PDFRouterParser",
|
||||||
"PyMuPDFParser",
|
"PyMuPDFParser",
|
||||||
"PyPDFium2Parser",
|
"PyPDFium2Parser",
|
||||||
"PDFPlumberParser",
|
"PDFPlumberParser",
|
||||||
|
Loading…
Reference in New Issue
Block a user