mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-22 19:08:40 +00:00
Propose PDFRouterParser and Loader
This commit is contained in:
parent
ed5c4805f6
commit
83563989d5
@ -59,7 +59,7 @@ openapi-pydantic>=0.3.2,<0.4
|
|||||||
oracle-ads>=2.9.1,<3
|
oracle-ads>=2.9.1,<3
|
||||||
oracledb>=2.2.0,<3
|
oracledb>=2.2.0,<3
|
||||||
pandas>=2.0.1,<3
|
pandas>=2.0.1,<3
|
||||||
pdfminer-six==20231228
|
pdfminer-six==20250327
|
||||||
pdfplumber>=0.11
|
pdfplumber>=0.11
|
||||||
pgvector>=0.1.6,<0.2
|
pgvector>=0.1.6,<0.2
|
||||||
playwright>=1.48.0,<2
|
playwright>=1.48.0,<2
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
import html
|
import html
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
@ -1668,3 +1669,91 @@ class DocumentIntelligenceParser(BaseBlobParser):
|
|||||||
docs = self._generate_docs(blob, result)
|
docs = self._generate_docs(blob, result)
|
||||||
|
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
|
class PDFRouterParser(BaseBlobParser):
    """
    Load PDFs using different parsers based on the metadata of the PDF
    or the body of the first page.
    The routes are defined as a list of tuples, where each tuple contains
    the name, a dictionary of metadata and regex pattern and the parser to use.
    The special key "page1" is to search in the first page with a regexp.
    Use the route in the correct order, as the first matching route is used.
    Add a default route ("default", {}, parser) at the end to catch all PDFs.

    This code is similar to `MimeTypeBasedParser`, but on the content of the PDF file.

    Sample:
    ```python
    from langchain_community.document_loaders import PyPDFLoader
    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
    from langchain_community.document_loaders.parsers import PDFPlumberParser
    routes = [
        # Name, keys with regex, parser
        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
        PyMuPDFParser()),
        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()),
        ("default", {}, PyPDFium2Parser())
    ]
    loader = PDFRouterLoader(filename, routes)
    loader.load()
    ```
    """

    def __init__(
        self,
        routes: list[
            tuple[
                str,
                dict[str, Union[re.Pattern, str]],
                BaseBlobParser,
            ]
        ],
        *,
        password: Optional[str] = None,
    ):
        """Initialize the router.

        Args:
            routes: Ordered list of ``(name, matches, parser)`` tuples. Each
                value in ``matches`` may be a string (compiled to a regex here)
                or an already-compiled ``re.Pattern``. The first route whose
                patterns all match is used.
            password: Optional password for encrypted PDF files.

        Raises:
            ImportError: If the ``pypdf`` package is not installed.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError:
            raise ImportError(
                "pypdf package not found, please install it with `pip install pypdf`"
            )
        super().__init__()
        self.password = password
        # Normalize once: compile every string pattern so lazy_parse can call
        # .search() on each value without re-checking its type per blob.
        new_routes = []
        for name, matchs, parser in routes:
            new_matchs = {}
            for k, v in matchs.items():
                if isinstance(v, str):
                    v = re.compile(v)
                new_matchs[k] = v
            new_routes.append((name, new_matchs, parser))
        self.routes = new_routes

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob with the first matching route's parser.

        The PDF metadata (plus the extracted text of the first page under the
        key ``"page1"``) is matched against each route in order; documents are
        yielded from the matching parser with ``metadata["router"]`` set to the
        route name. If no route matches, nothing is yielded.

        Raises:
            ImportError: If the ``pypdf`` package is not installed.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError:
            raise ImportError(
                "pypdf package not found, please install it with `pip install pypdf`"
            )
        from pypdf import PdfReader

        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
            with PdfReader(pdf_file_obj, password=self.password) as reader:
                metadata = _purge_metadata(cast(dict[str, Any], reader.metadata))
                page1 = reader.pages[0].extract_text()
                metadata["page1"] = page1
                find = False
                for name, match, parser in self.routes:
                    # A route matches only if *every* key is present in the
                    # metadata and its pattern is found in the value
                    # (for-else: the else runs when no key failed).
                    for k, p in match.items():
                        if k not in metadata or not p.search(metadata[k]):
                            break
                    else:
                        find = True
                        break
                if find:
                    for doc in parser.lazy_parse(blob):
                        doc.metadata["router"] = name
                        yield doc
||||||
|
@ -22,6 +22,8 @@ from typing import (
|
|||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
from langchain_core.document_loaders import BaseBlobParser
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from langchain_core.utils import get_from_dict_or_env
|
from langchain_core.utils import get_from_dict_or_env
|
||||||
|
|
||||||
@ -37,7 +39,7 @@ from langchain_community.document_loaders.parsers.pdf import (
|
|||||||
PDFPlumberParser,
|
PDFPlumberParser,
|
||||||
PyMuPDFParser,
|
PyMuPDFParser,
|
||||||
PyPDFium2Parser,
|
PyPDFium2Parser,
|
||||||
PyPDFParser,
|
PyPDFParser, PDFRouterParser,
|
||||||
)
|
)
|
||||||
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
|
||||||
@ -1423,3 +1425,61 @@ class ZeroxPDFLoader(BasePDFLoader):
|
|||||||
|
|
||||||
# Legacy: only for backwards compatibility. Use PyPDFLoader instead
|
# Legacy: only for backwards compatibility. Use PyPDFLoader instead
|
||||||
PagedPDFSplitter = PyPDFLoader
|
PagedPDFSplitter = PyPDFLoader
|
||||||
|
|
||||||
|
class PDFRouterLoader(BasePDFLoader):
    """
    Load PDFs using different parsers based on the metadata of the PDF
    or the body of the first page.
    The routes are defined as a list of tuples, where each tuple contains
    the name, a dictionary of metadata and regex pattern and the parser to use.
    The special key "page1" is to search in the first page with a regexp.
    Use the route in the correct order, as the first matching route is used.
    Add a default route ("default", {}, parser) at the end to catch all PDFs.

    Sample:
    ```python
    from langchain_community.document_loaders import PyPDFLoader
    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
    from langchain_community.document_loaders.parsers import PDFPlumberParser
    routes = [
        # Name, keys with regex, parser
        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
        PyMuPDFParser()),
        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()),
        ("default", {}, PyPDFium2Parser())
    ]
    loader = PDFRouterLoader(filename, routes)
    loader.load()
    ```
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        *,
        routes: list[
            tuple[
                str,
                dict[str, Union[re.Pattern, str]],
                BaseBlobParser,
            ]
        ],
        password: Optional[str] = None,
    ):
        """Initialize with a file path.

        Args:
            file_path: Path (or URL, handled by ``BasePDFLoader``) of the PDF.
            routes: Ordered ``(name, matches, parser)`` tuples; see
                ``PDFRouterParser``.
            password: Optional password for encrypted PDF files.
        """
        super().__init__(file_path)
        self.parser = PDFRouterParser(routes, password=password)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily load documents, routing the file to the matching parser."""
        if self.web_path:
            # Keep the original URL as the blob path for downstream metadata;
            # use a context manager so the file handle is always closed.
            with open(self.file_path, "rb") as f:
                blob = Blob.from_data(
                    f.read(), path=self.web_path
                )  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.lazy_parse(blob)
|
||||||
|
@ -11,8 +11,10 @@ from langchain_community.document_loaders.base import BaseBlobParser
|
|||||||
from langchain_community.document_loaders.blob_loaders import Blob
|
from langchain_community.document_loaders.blob_loaders import Blob
|
||||||
from langchain_community.document_loaders.parsers import (
|
from langchain_community.document_loaders.parsers import (
|
||||||
BaseImageBlobParser,
|
BaseImageBlobParser,
|
||||||
PDFPlumberParser,
|
PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser,
|
||||||
)
|
)
|
||||||
|
from langchain_community.document_loaders.parsers.pdf import PDFRouterParser, \
|
||||||
|
PDFMinerParser
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
@ -312,3 +314,27 @@ def test_parser_with_table(
|
|||||||
**params,
|
**params,
|
||||||
)
|
)
|
||||||
_std_assert_with_parser(parser)
|
_std_assert_with_parser(parser)
|
||||||
|
|
||||||
|
def test_parser_router_parse() -> None:
    """Check that a sample PDF is routed by metadata and parsed correctly."""
    mode = "single"
    # Each route is (name, {metadata key -> regex}, parser); the first route
    # whose patterns all match wins, so the catch-all comes last.
    routes = [
        (
            "Microsoft",
            {"producer": "Microsoft", "creator": "Microsoft"},
            PyMuPDFParser(mode=mode),
        ),
        ("LibreOffice", {"producer": "LibreOffice"}, PDFMinerParser(mode=mode)),
        (
            "Xdvipdfmx",
            {"producer": "xdvipdfmx.*", "page1": "Hello"},
            PDFMinerParser(mode=mode),
        ),
        ("default", {}, PyPDFium2Parser(mode=mode)),
    ]
    parser = PDFRouterParser(routes=routes)
    _assert_with_parser(parser, splits_by_page=False)
|
||||||
|
Loading…
Reference in New Issue
Block a user