mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-22 10:59:22 +00:00
Propose PDFRouterParser and Loader
This commit is contained in:
parent
ed5c4805f6
commit
83563989d5
@ -59,7 +59,7 @@ openapi-pydantic>=0.3.2,<0.4
|
||||
oracle-ads>=2.9.1,<3
|
||||
oracledb>=2.2.0,<3
|
||||
pandas>=2.0.1,<3
|
||||
pdfminer-six==20231228
|
||||
pdfminer-six==20250327
|
||||
pdfplumber>=0.11
|
||||
pgvector>=0.1.6,<0.2
|
||||
playwright>=1.48.0,<2
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import html
|
||||
import io
|
||||
import logging
|
||||
@ -1668,3 +1669,91 @@ class DocumentIntelligenceParser(BaseBlobParser):
|
||||
docs = self._generate_docs(blob, result)
|
||||
|
||||
yield from docs
|
||||
|
||||
class PDFRouterParser(BaseBlobParser):
    """
    Load PDFs using different parsers based on the metadata of the PDF
    or the body of the first page.

    The routes are defined as a list of tuples, where each tuple contains
    the name, a dictionary of metadata keys and regex patterns, and the parser
    to use. Every pattern of a route must match (with `re.search`) for the
    route to be selected.
    The special key "page1" searches the text of the first page with a regex.
    Declare the routes in the correct order, as the first matching route is used.
    Add a default route ("default", {}, parser) at the end to catch all PDFs;
    if no route matches, no document is produced.
    This code is similar to `MimeTypeBasedParser`, but on the content of the
    PDF file.

    Each document produced by the selected parser gets a "router" metadata
    entry holding the name of the route that was used.

    Sample:
    ```python
    from langchain_community.document_loaders import PyPDFLoader
    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
    from langchain_community.document_loaders.parsers import PDFPlumberParser
    routes = [
        # Name, keys with regex, parser
        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
         PyMuPDFParser()),
        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()),
        ("default", {}, PyPDFium2Parser())
    ]
    loader = PDFRouterLoader(filename, routes=routes)
    loader.load()
    ```
    """

    def __init__(
        self,
        routes: list[
            tuple[
                str,
                dict[str, Union[re.Pattern, str]],
                BaseBlobParser,
            ]
        ],
        *,
        password: Optional[str] = None,
    ):
        """Initialize the router.

        Args:
            routes: Ordered list of ``(name, patterns, parser)`` tuples.
                ``patterns`` maps a metadata key (or the special key
                ``"page1"``) to a regex, given either as a string or as an
                already compiled pattern.
            password: Optional password forwarded to ``pypdf.PdfReader`` when
                inspecting the PDF.

        Raises:
            ImportError: If ``pypdf`` is not installed.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError:
            raise ImportError(
                "pypdf package not found, please install it with `pip install pypdf`"
            )
        super().__init__()
        self.password = password
        # Compile string patterns once here so that lazy_parse only has to
        # call `.search` on ready-made pattern objects.
        self.routes = [
            (
                name,
                {
                    key: re.compile(pattern) if isinstance(pattern, str) else pattern
                    for key, pattern in patterns.items()
                },
                parser,
            )
            for name, patterns, parser in routes
        ]

    @staticmethod
    def _route_matches(
        metadata: dict[str, Any], patterns: dict[str, re.Pattern]
    ) -> bool:
        """Return True when every pattern finds a match in its metadata value.

        A missing (or ``None``) value never matches. Other values are coerced
        with ``str()`` so that non-string metadata does not make
        ``Pattern.search`` raise ``TypeError``. An empty ``patterns`` dict
        matches everything (this is what makes a default route work).
        """
        for key, pattern in patterns.items():
            value = metadata.get(key)
            if value is None or not pattern.search(str(value)):
                return False
        return True

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob with the parser of the first matching route.

        The PDF is opened once with ``pypdf`` to read its metadata and the
        text of its first page; the blob is then handed to the selected
        parser. Nothing is yielded when no route matches.

        Raises:
            ImportError: If ``pypdf`` is not installed.
        """
        try:
            from pypdf import PdfReader
        except ImportError:
            raise ImportError(
                "pypdf package not found, please install it with `pip install pypdf`"
            )

        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
            with PdfReader(pdf_file_obj, password=self.password) as reader:
                # `reader.metadata` is None when the PDF carries no document
                # information; fall back to an empty dict in that case.
                metadata = _purge_metadata(
                    cast(dict[str, Any], reader.metadata or {})
                )
                # A PDF without pages has no first page to inspect.
                page1 = reader.pages[0].extract_text() if len(reader.pages) else ""
                metadata["page1"] = page1

        # The inspection reader is closed before delegating: the selected
        # parser opens the blob on its own.
        for name, patterns, parser in self.routes:
            if self._route_matches(metadata, patterns):
                break
        else:
            # No route matched (and no default route was declared).
            return

        for doc in parser.lazy_parse(blob):
            doc.metadata["router"] = name
            yield doc
|
||||
|
@ -22,6 +22,8 @@ from typing import (
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
|
||||
from langchain_core.document_loaders import BaseBlobParser
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.utils import get_from_dict_or_env
|
||||
|
||||
@ -37,7 +39,7 @@ from langchain_community.document_loaders.parsers.pdf import (
|
||||
PDFPlumberParser,
|
||||
PyMuPDFParser,
|
||||
PyPDFium2Parser,
|
||||
PyPDFParser,
|
||||
PyPDFParser, PDFRouterParser,
|
||||
)
|
||||
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
@ -1423,3 +1425,61 @@ class ZeroxPDFLoader(BasePDFLoader):
|
||||
|
||||
# Legacy: only for backwards compatibility. Use PyPDFLoader instead
|
||||
PagedPDFSplitter = PyPDFLoader
|
||||
|
||||
class PDFRouterLoader(BasePDFLoader):
    """
    Load PDFs using different parsers based on the metadata of the PDF
    or the body of the first page.

    The routes are defined as a list of tuples, where each tuple contains
    the name, a dictionary of metadata keys and regex patterns, and the parser
    to use.
    The special key "page1" searches the text of the first page with a regex.
    Declare the routes in the correct order, as the first matching route is used.
    Add a default route ("default", {}, parser) at the end to catch all PDFs.

    Note that `routes` is a keyword-only argument.

    Sample:
    ```python
    from langchain_community.document_loaders import PyPDFLoader
    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
    from langchain_community.document_loaders.parsers import PDFPlumberParser
    routes = [
        # Name, keys with regex, parser
        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
         PyMuPDFParser()),
        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()),
        ("default", {}, PyPDFium2Parser())
    ]
    loader = PDFRouterLoader(filename, routes=routes)
    loader.load()
    ```
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        *,
        routes: list[
            tuple[
                str,
                dict[str, Union[re.Pattern, str]],
                BaseBlobParser,
            ]
        ],
        password: Optional[str] = None,
    ):
        """Initialize with a file path.

        Args:
            file_path: Path of the PDF to load, forwarded to `BasePDFLoader`.
            routes: Ordered list of ``(name, patterns, parser)`` tuples,
                forwarded to `PDFRouterParser`.
            password: Optional PDF password, forwarded to `PDFRouterParser`.
        """
        super().__init__(file_path)
        self.parser = PDFRouterParser(routes, password=password)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily load the PDF through the routing parser.

        When `self.web_path` is set, the bytes are read from `self.file_path`
        and the blob is labelled with the web path; otherwise the blob is
        built directly from `self.file_path`.
        NOTE(review): `web_path` is presumably populated by `BasePDFLoader`
        for remote files that were downloaded locally — confirm.
        """
        if self.web_path:
            # Use a context manager so the file handle is closed right after
            # the bytes are read instead of being left to the garbage collector.
            with open(self.file_path, "rb") as pdf_file:
                data = pdf_file.read()
            blob = Blob.from_data(data, path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.lazy_parse(blob)
|
||||
|
@ -11,8 +11,10 @@ from langchain_community.document_loaders.base import BaseBlobParser
|
||||
from langchain_community.document_loaders.blob_loaders import Blob
|
||||
from langchain_community.document_loaders.parsers import (
|
||||
BaseImageBlobParser,
|
||||
PDFPlumberParser,
|
||||
PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser,
|
||||
)
|
||||
from langchain_community.document_loaders.parsers.pdf import PDFRouterParser, \
|
||||
PDFMinerParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from PIL.Image import Image
|
||||
@ -312,3 +314,27 @@ def test_parser_with_table(
|
||||
**params,
|
||||
)
|
||||
_std_assert_with_parser(parser)
|
||||
|
||||
def test_parser_router_parse() -> None:
    """Run the shared parser assertions against a `PDFRouterParser`.

    Four routes are declared, every underlying parser being built with
    ``mode="single"``; the last one is a catch-all default route.
    """
    mode = "single"

    # Routes are evaluated in order; each tuple is (name, patterns, parser).
    microsoft_route = (
        "Microsoft",
        {"producer": "Microsoft", "creator": "Microsoft"},
        PyMuPDFParser(mode=mode),
    )
    libreoffice_route = (
        "LibreOffice",
        {"producer": "LibreOffice"},
        PDFMinerParser(mode=mode),
    )
    xdvipdfmx_route = (
        "Xdvipdfmx",
        {"producer": "xdvipdfmx.*", "page1": "Hello"},
        PDFMinerParser(mode=mode),
    )
    default_route = ("default", {}, PyPDFium2Parser(mode=mode))

    routes = [
        microsoft_route,
        libreoffice_route,
        xdvipdfmx_route,
        default_route,
    ]
    router_parser = PDFRouterParser(routes=routes)
    _assert_with_parser(router_parser, splits_by_page=False)
|
||||
|
Loading…
Reference in New Issue
Block a user