community[minor]: 04 - Refactoring PDFMiner parser (#29526)

This is one part of a larger Pull Request (PR) that is too large to be submitted all at once. This specific part focuses on updating the PDFMiner parser. For more details, see [PR 28970](https://github.com/langchain-ai/langchain/pull/28970).

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
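For users, the most visible change in this refactor is that `concatenate_pages` is deprecated in favor of the standard `mode` and `pages_delimiter` parameters. A minimal migration sketch (the example file name is hypothetical; the parameter mapping follows the deprecation handling introduced in this diff):

```python
from langchain_community.document_loaders import PDFMinerLoader

# Before: concatenate_pages=True  -> one Document for the whole file
#         concatenate_pages=False -> one Document per page
# After:  mode="single"           -> one Document for the whole file
#         mode="page"             -> one Document per page
loader = PDFMinerLoader(
    "./example_data/layout-parser-paper.pdf",
    mode="page",  # replaces concatenate_pages=False
)
docs = loader.load()
```

Passing `concatenate_pages` still works, but it now logs a deprecation warning and is translated internally to `mode="single"` or `mode="page"`.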
parent 4460d20ba9
commit 6ff0d5c807
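Image handling is reworked as well: instead of a bare `extract_images=True` flag hard-wired to RapidOCR, the parser and loader now accept an `images_parser` (any `BaseImageBlobParser`) plus an `images_inner_format`. A sketch of the new surface, assuming `RapidOCRBlobParser` is exported from `langchain_community.document_loaders.parsers` (per this diff it remains the fallback when only `extract_images=True` is given):

```python
from langchain_core.documents.base import Blob
from langchain_community.document_loaders.parsers import (
    PDFMinerParser,
    RapidOCRBlobParser,
)

blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
parser = PDFMinerParser(
    mode="single",
    images_parser=RapidOCRBlobParser(),  # OCR each embedded image
    images_inner_format="markdown-img",  # inline the OCR text as ![...](#)
)
docs = list(parser.lazy_parse(blob))
```

The diffs below update the parser, the `PDFMinerLoader`, the test suites, and the extended test dependencies.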
@@ -59,7 +59,7 @@ openapi-pydantic>=0.3.2,<0.4
 oracle-ads>=2.9.1,<3
 oracledb>=2.2.0,<3
 pandas>=2.0.1,<3
-pdfminer-six>=20221105,<20240706
+pdfminer-six==20231228
 pdfplumber>=0.11
 pgvector>=0.1.6,<0.2
 playwright>=1.48.0,<2
@@ -104,3 +104,4 @@ mlflow[genai]>=2.14.0
 databricks-sdk>=0.30.0
 websocket>=0.2.1,<1
 writer-sdk>=1.2.0
+unstructured[pdf]>=0.15
@@ -8,9 +8,12 @@ import logging
 import threading
 import warnings
 from datetime import datetime
+from pathlib import Path
+from tempfile import TemporaryDirectory
 from typing import (
     TYPE_CHECKING,
     Any,
+    BinaryIO,
     Iterable,
     Iterator,
     Literal,
@@ -34,7 +37,6 @@ from langchain_community.document_loaders.parsers.images import (
 )

 if TYPE_CHECKING:
-    import pdfminer
     import pdfplumber
     import pymupdf
     import pypdf
@@ -273,7 +275,6 @@ class PyPDFParser(BaseBlobParser):
                 # password = None,
                 mode = "single",
                 pages_delimiter = "\n\f",
                 # extract_images = True,
                 # images_parser = TesseractBlobParser(),
             )
-
@@ -464,120 +465,335 @@ class PyPDFParser(BaseBlobParser):


 class PDFMinerParser(BaseBlobParser):
-    """Parse `PDF` using `PDFMiner`."""
+    """Parse a blob from a PDF using the `pdfminer.six` library.
+
+    This class provides methods to parse a blob from a PDF document, supporting various
+    configurations such as handling password-protected PDFs, extracting images, and
+    defining extraction mode.
+    It integrates the `pdfminer.six` library for PDF processing and offers synchronous
+    blob parsing.
+
+    Examples:
+        Setup:
+
+        .. code-block:: bash
+
+            pip install -U langchain-community pdfminer.six pillow
+
+        Load a blob from a PDF file:
+
+        .. code-block:: python
+
+            from langchain_core.documents.base import Blob
+
+            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
+
+        Instantiate the parser:
+
+        .. code-block:: python
+
+            from langchain_community.document_loaders.parsers import PDFMinerParser
+
+            parser = PDFMinerParser(
+                # password = None,
+                mode = "single",
+                pages_delimiter = "\n\f",
+                # extract_images = True,
+                # images_parser = TesseractBlobParser(),
+            )
+
+        Lazily parse the blob:
+
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = parser.lazy_parse(blob)
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+    """
+
+    _warn_concatenate_pages = False

-    def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True):
+    def __init__(
+        self,
+        extract_images: bool = False,
+        *,
+        password: Optional[str] = None,
+        mode: Literal["single", "page"] = "single",
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
+        images_parser: Optional[BaseImageBlobParser] = None,
+        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
+        concatenate_pages: Optional[bool] = None,
+    ):
         """Initialize a parser based on PDFMiner.

         Args:
+            password: Optional password for opening encrypted PDFs.
+            mode: Extraction mode to use. Either "single" or "page" for page-wise
+                extraction.
+            pages_delimiter: A string delimiter to separate pages in single-mode
+                extraction.
             extract_images: Whether to extract images from PDF.
-            concatenate_pages: If True, concatenate all PDF pages into a single
-                document. Otherwise, return one document per page.
+            images_parser: Optional image blob parser.
+            images_inner_format: The format for the parsed output.
+                - "text" = return the content as is
+                - "markdown-img" = wrap the content into an image markdown link,
+                  with the link pointing to (`![body](#)`)
+                - "html-img" = wrap the content as the `alt` text of an image tag
+                  and link to (`<img alt="{body}" src="#"/>`)
+            concatenate_pages: Deprecated. If True, concatenate all PDF pages
+                into a single document. Otherwise, return one document per page.
+
+        Returns:
+            This method does not directly return data. Use the `parse` or `lazy_parse`
+            methods to retrieve parsed documents with content and metadata.
+
+        Raises:
+            ValueError: If the `mode` is not "single" or "page".
+
+        Warnings:
+            The `concatenate_pages` parameter is deprecated. Use `mode='single' or
+            'page'` instead.
         """
+        super().__init__()
+        if mode not in ["single", "page"]:
+            raise ValueError("mode must be single or page")
+        if extract_images and not images_parser:
+            images_parser = RapidOCRBlobParser()
         self.extract_images = extract_images
-        self.concatenate_pages = concatenate_pages
+        self.images_parser = images_parser
+        self.images_inner_format = images_inner_format
+        self.password = password
+        self.mode = mode
+        self.pages_delimiter = pages_delimiter
+        if concatenate_pages is not None:
+            if not PDFMinerParser._warn_concatenate_pages:
+                PDFMinerParser._warn_concatenate_pages = True
+                logger.warning(
+                    "`concatenate_pages` parameter is deprecated. "
+                    "Use `mode='single' or 'page'` instead."
+                )
+            self.mode = "single" if concatenate_pages else "page"

-    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
-        """Lazily parse the blob."""
-
-        if not self.extract_images:
-            try:
-                from pdfminer.high_level import extract_text
-            except ImportError:
-                raise ImportError(
-                    "`pdfminer` package not found, please install it with "
-                    "`pip install pdfminer.six`"
-                )
-
-            with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
-                if self.concatenate_pages:
-                    text = extract_text(pdf_file_obj)
-                    metadata = {"source": blob.source}  # type: ignore[attr-defined]
-                    yield Document(page_content=text, metadata=metadata)
-                else:
-                    from pdfminer.pdfpage import PDFPage
-
-                    pages = PDFPage.get_pages(pdf_file_obj)
-                    for i, _ in enumerate(pages):
-                        text = extract_text(pdf_file_obj, page_numbers=[i])
-                        metadata = {"source": blob.source, "page": str(i)}  # type: ignore[attr-defined]
-                        yield Document(page_content=text, metadata=metadata)
-        else:
-            import io
-
-            from pdfminer.converter import PDFPageAggregator, TextConverter
-            from pdfminer.layout import LAParams
-
-            text_io = io.StringIO()
-            with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
-                pages = PDFPage.get_pages(pdf_file_obj)
-                rsrcmgr = PDFResourceManager()
-                device_for_text = TextConverter(rsrcmgr, text_io, laparams=LAParams())
-                device_for_image = PDFPageAggregator(rsrcmgr, laparams=LAParams())
-                interpreter_for_text = PDFPageInterpreter(rsrcmgr, device_for_text)
-                interpreter_for_image = PDFPageInterpreter(rsrcmgr, device_for_image)
-                for i, page in enumerate(pages):
-                    interpreter_for_text.process_page(page)
-                    interpreter_for_image.process_page(page)
-                    content = text_io.getvalue() + self._extract_images_from_page(
-                        device_for_image.get_result()
-                    )
-                    text_io.truncate(0)
-                    text_io.seek(0)
-                    metadata = {"source": blob.source, "page": str(i)}  # type: ignore[attr-defined]
-                    yield Document(page_content=content, metadata=metadata)
-
-    def _extract_images_from_page(self, page: pdfminer.layout.LTPage) -> str:
-        """Extract images from page and get the text with RapidOCR."""
-        import pdfminer
-
-        def get_image(layout_object: Any) -> Any:
-            if isinstance(layout_object, pdfminer.layout.LTImage):
-                return layout_object
-            if isinstance(layout_object, pdfminer.layout.LTContainer):
-                for child in layout_object:
-                    return get_image(child)
-            else:
-                return None
-
-        images = []
-
-        for img in filter(bool, map(get_image, page)):
-            img_filter = img.stream["Filter"]
-            if isinstance(img_filter, list):
-                filter_names = [f.name for f in img_filter]
-            else:
-                filter_names = [img_filter.name]
-
-            without_loss = any(
-                name in _PDF_FILTER_WITHOUT_LOSS for name in filter_names
-            )
-            with_loss = any(name in _PDF_FILTER_WITH_LOSS for name in filter_names)
-            non_matching = {name for name in filter_names} - {
-                *_PDF_FILTER_WITHOUT_LOSS,
-                *_PDF_FILTER_WITH_LOSS,
-            }
-
-            if without_loss and with_loss:
-                warnings.warn(
-                    "Image has both lossy and lossless filters. Defaulting to lossless"
-                )
-
-            if non_matching:
-                warnings.warn(f"Unknown PDF Filter(s): {non_matching}")
-
-            if without_loss:
-                images.append(
-                    np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape(
-                        img.stream["Height"], img.stream["Width"], -1
-                    )
-                )
-            elif with_loss:
-                images.append(img.stream.get_data())
-
-        return extract_from_images_with_rapidocr(images)
+    @staticmethod
+    def decode_text(s: Union[bytes, str]) -> str:
+        """
+        Decodes a PDFDocEncoding string to Unicode.
+        Adds py3 compatibility to pdfminer's version.
+
+        Args:
+            s: The string to decode.
+
+        Returns:
+            str: The decoded Unicode string.
+        """
+        from pdfminer.utils import PDFDocEncoding
+
+        if isinstance(s, bytes) and s.startswith(b"\xfe\xff"):
+            return str(s[2:], "utf-16be", "ignore")
+        try:
+            ords = (ord(c) if isinstance(c, str) else c for c in s)
+            return "".join(PDFDocEncoding[o] for o in ords)
+        except IndexError:
+            return str(s)
+
+    @staticmethod
+    def resolve_and_decode(obj: Any) -> Any:
+        """
+        Recursively resolve the metadata values.
+
+        Args:
+            obj: The object to resolve and decode. It can be of any type.
+
+        Returns:
+            The resolved and decoded object.
+        """
+        from pdfminer.psparser import PSLiteral
+
+        if hasattr(obj, "resolve"):
+            obj = obj.resolve()
+        if isinstance(obj, list):
+            return list(map(PDFMinerParser.resolve_and_decode, obj))
+        elif isinstance(obj, PSLiteral):
+            return PDFMinerParser.decode_text(obj.name)
+        elif isinstance(obj, (str, bytes)):
+            return PDFMinerParser.decode_text(obj)
+        elif isinstance(obj, dict):
+            for k, v in obj.items():
+                obj[k] = PDFMinerParser.resolve_and_decode(v)
+            return obj
+
+        return obj
+
+    def _get_metadata(
+        self,
+        fp: BinaryIO,
+        password: str = "",
+        caching: bool = True,
+    ) -> dict[str, Any]:
+        """
+        Extract metadata from a PDF file.
+
+        Args:
+            fp: The file pointer to the PDF file.
+            password: The password for the PDF file, if encrypted. Defaults to an empty
+                string.
+            caching: Whether to cache the PDF structure. Defaults to True.
+
+        Returns:
+            Metadata of the PDF file.
+        """
+        from pdfminer.pdfpage import PDFDocument, PDFPage, PDFParser
+
+        # Create a PDF parser object associated with the file object.
+        parser = PDFParser(fp)
+        # Create a PDF document object that stores the document structure.
+        doc = PDFDocument(parser, password=password, caching=caching)
+        metadata = {}
+
+        for info in doc.info:
+            metadata.update(info)
+        for k, v in metadata.items():
+            try:
+                metadata[k] = PDFMinerParser.resolve_and_decode(v)
+            except Exception as e:  # pragma: nocover
+                # This metadata value could not be parsed. Instead of failing the PDF
+                # read, treat it as a warning only if `strict_metadata=False`.
+                logger.warning(
+                    '[WARNING] Metadata key "%s" could not be parsed due to '
+                    "exception: %s",
+                    k,
+                    str(e),
+                )
+
+        # Count number of pages.
+        metadata["total_pages"] = len(list(PDFPage.create_pages(doc)))
+
+        return metadata
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
+        """
+        Lazily parse the blob.
+        Insert image, if possible, between two paragraphs.
+        In this way, a paragraph can be continued on the next page.
+
+        Args:
+            blob: The blob to parse.
+
+        Raises:
+            ImportError: If the `pdfminer.six` or `pillow` package is not found.
+
+        Yields:
+            An iterator over the parsed documents.
+        """
+        try:
+            import pdfminer
+            from pdfminer.converter import PDFLayoutAnalyzer
+            from pdfminer.layout import (
+                LAParams,
+                LTContainer,
+                LTImage,
+                LTItem,
+                LTPage,
+                LTText,
+                LTTextBox,
+            )
+            from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
+            from pdfminer.pdfpage import PDFPage
+
+            if int(pdfminer.__version__) < 20201018:
+                raise ImportError(
+                    "This parser is tested with pdfminer.six version 20201018 or "
+                    "later. Remove pdfminer, and install pdfminer.six with "
+                    "`pip uninstall pdfminer && pip install pdfminer.six`."
+                )
+        except ImportError:
+            raise ImportError(
+                "pdfminer package not found, please install it "
+                "with `pip install pdfminer.six`"
+            )
+
+        with blob.as_bytes_io() as pdf_file_obj, TemporaryDirectory() as tempdir:
+            pages = PDFPage.get_pages(pdf_file_obj, password=self.password or "")
+            rsrcmgr = PDFResourceManager()
+            doc_metadata = _purge_metadata(
+                self._get_metadata(pdf_file_obj, password=self.password or "")
+            )
+            doc_metadata["source"] = blob.source
+
+            class Visitor(PDFLayoutAnalyzer):
+                def __init__(
+                    self,
+                    rsrcmgr: PDFResourceManager,
+                    pageno: int = 1,
+                    laparams: Optional[LAParams] = None,
+                ) -> None:
+                    super().__init__(rsrcmgr, pageno=pageno, laparams=laparams)
+
+                def receive_layout(me, ltpage: LTPage) -> None:
+                    def render(item: LTItem) -> None:
+                        if isinstance(item, LTContainer):
+                            for child in item:
+                                render(child)
+                        elif isinstance(item, LTText):
+                            text_io.write(item.get_text())
+                        if isinstance(item, LTTextBox):
+                            text_io.write("\n")
+                        elif isinstance(item, LTImage):
+                            if self.images_parser:
+                                from pdfminer.image import ImageWriter
+
+                                image_writer = ImageWriter(tempdir)
+                                filename = image_writer.export_image(item)
+                                blob = Blob.from_path(Path(tempdir) / filename)
+                                blob.metadata["source"] = "#"
+                                image_text = next(
+                                    self.images_parser.lazy_parse(blob)
+                                ).page_content
+
+                                text_io.write(
+                                    _format_inner_image(
+                                        blob, image_text, self.images_inner_format
+                                    )
+                                )
+                        else:
+                            pass
+
+                    render(ltpage)
+
+            text_io = io.StringIO()
+            visitor_for_all = PDFPageInterpreter(
+                rsrcmgr, Visitor(rsrcmgr, laparams=LAParams())
+            )
+            all_content = []
+            for i, page in enumerate(pages):
+                text_io.truncate(0)
+                text_io.seek(0)
+                visitor_for_all.process_page(page)
+
+                all_text = text_io.getvalue()
+                # For legacy compatibility, strip() the text.
+                all_text = all_text.strip()
+                if self.mode == "page":
+                    text_io.truncate(0)
+                    text_io.seek(0)
+                    yield Document(
+                        page_content=all_text,
+                        metadata=_validate_metadata(doc_metadata | {"page": i}),
+                    )
+                else:
+                    if all_text.endswith("\f"):
+                        all_text = all_text[:-1]
+                    all_content.append(all_text)
+            if self.mode == "single":
+                # Add pages_delimiter between pages
+                document_content = self.pages_delimiter.join(all_content)
+                yield Document(
+                    page_content=document_content,
+                    metadata=_validate_metadata(doc_metadata),
+                )


 class PyMuPDFParser(BaseBlobParser):
@@ -614,7 +830,6 @@ class PyMuPDFParser(BaseBlobParser):
                 # password = None,
                 mode = "single",
                 pages_delimiter = "\n\f",
                 # extract_images = True,
                 # images_parser = TesseractBlobParser(),
                 # extract_tables="markdown",
                 # extract_tables_settings=None,
@@ -473,45 +473,122 @@ class PyPDFDirectoryLoader(BaseLoader):


 class PDFMinerLoader(BasePDFLoader):
-    """Load `PDF` files using `PDFMiner`."""
+    """Load and parse a PDF file using the `pdfminer.six` library.
+
+    This class provides methods to load and parse PDF documents, supporting various
+    configurations such as handling password-protected files, extracting images, and
+    defining extraction mode. It integrates the `pdfminer.six` library for PDF
+    processing and offers both synchronous and asynchronous document loading.
+
+    Examples:
+        Setup:
+
+        .. code-block:: bash
+
+            pip install -U langchain-community pdfminer.six
+
+        Instantiate the loader:
+
+        .. code-block:: python
+
+            from langchain_community.document_loaders import PDFMinerLoader
+
+            loader = PDFMinerLoader(
+                file_path = "./example_data/layout-parser-paper.pdf",
+                # headers = None
+                # password = None,
+                mode = "single",
+                pages_delimiter = "\n\f",
+                # extract_images = True,
+                # images_parser = TesseractBlobParser(),
+            )
+
+        Lazy load documents:
+
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        Load documents asynchronously:
+
+        .. code-block:: python
+
+            docs = await loader.aload()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+    """

     def __init__(
         self,
         file_path: Union[str, PurePath],
         *,
-        headers: Optional[dict] = None,
-        extract_images: bool = False,
-        concatenate_pages: bool = True,
+        password: Optional[str] = None,
+        mode: Literal["single", "page"] = "single",
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
+        extract_images: bool = False,
+        images_parser: Optional[BaseImageBlobParser] = None,
+        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
+        headers: Optional[dict] = None,
+        concatenate_pages: Optional[bool] = None,
     ) -> None:
-        """Initialize with file path.
+        """Initialize with a file path.

         Args:
-            extract_images: Whether to extract images from PDF.
-            concatenate_pages: If True, concatenate all PDF pages into a single
-                document. Otherwise, return one document per page.
-        """
-        try:
-            from pdfminer.high_level import extract_text  # noqa:F401
-        except ImportError:
-            raise ImportError(
-                "`pdfminer` package not found, please install it with "
-                "`pip install pdfminer.six`"
-            )
+            file_path: The path to the PDF file to be loaded.
+            headers: Optional headers to use for GET request to download a file from a
+                web path.
+            password: Optional password for opening encrypted PDFs.
+            mode: The extraction mode, either "single" for the entire document or
+                "page" for page-wise extraction.
+            pages_delimiter: A string delimiter to separate pages in single-mode
+                extraction.
+            extract_images: Whether to extract images from the PDF.
+            images_parser: Optional image blob parser.
+            images_inner_format: The format for the parsed output.
+                - "text" = return the content as is
+                - "markdown-img" = wrap the content into an image markdown link,
+                  with the link pointing to (`![body](#)`)
+                - "html-img" = wrap the content as the `alt` text of an image tag
+                  and link to (`<img alt="{body}" src="#"/>`)
+            concatenate_pages: Deprecated. If True, concatenate all PDF pages into
+                a single document. Otherwise, return one document per page.
+
+        Returns:
+            This method does not directly return data. Use the `load`, `lazy_load` or
+            `aload` methods to retrieve parsed documents with content and metadata.
+        """
         super().__init__(file_path, headers=headers)
         self.parser = PDFMinerParser(
-            extract_images=extract_images, concatenate_pages=concatenate_pages
+            password=password,
+            extract_images=extract_images,
+            images_parser=images_parser,
+            concatenate_pages=concatenate_pages,
+            mode=mode,
+            pages_delimiter=pages_delimiter,
+            images_inner_format=images_inner_format,
         )

     def lazy_load(
         self,
     ) -> Iterator[Document]:
-        """Lazily load documents."""
+        """
+        Lazy load given path as pages.
+        Insert image, if possible, between two paragraphs.
+        In this way, a paragraph can be continued on the next page.
+        """
         if self.web_path:
-            blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)  # type: ignore[attr-defined]
+            blob = Blob.from_data(  # type: ignore[attr-defined]
+                open(self.file_path, "rb").read(), path=self.web_path
+            )
         else:
             blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
-        yield from self.parser.parse(blob)
+        yield from self.parser.lazy_parse(blob)


 class PDFMinerPDFasHTMLLoader(BasePDFLoader):
@@ -11,7 +11,6 @@ from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.parsers import (
     BaseImageBlobParser,
-    PDFMinerParser,
     PDFPlumberParser,
     PyPDFium2Parser,
 )
@@ -97,12 +96,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False)
     assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]


-def test_pdfminer_parser() -> None:
-    """Test PDFMiner parser."""
-    # Does not follow defaults to split by page.
-    _assert_with_parser(PDFMinerParser(), splits_by_page=False)
-
-
 def test_pypdfium2_parser() -> None:
     """Test PyPDFium2 parser."""
     # Does not follow defaults to split by page.
@@ -116,11 +109,6 @@ def test_pdfplumber_parser() -> None:
     _assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)


-def test_extract_images_text_from_pdf_pdfminerparser() -> None:
-    """Test extract image from pdf and recognize text with rapid ocr - PDFMinerParser"""
-    _assert_with_parser(PDFMinerParser(extract_images=True))
-
-
 def test_extract_images_text_from_pdf_pypdfium2parser() -> None:
     """Test extract image from pdf and recognize text with rapid ocr - PyPDFium2Parser"""  # noqa: E501
     _assert_with_parser(PyPDFium2Parser(extract_images=True))
@@ -138,6 +126,7 @@ class EmptyImageBlobParser(BaseImageBlobParser):
 @pytest.mark.parametrize(
     "parser_factory,params",
     [
+        ("PDFMinerParser", {}),
         ("PyMuPDFParser", {}),
         ("PyPDFParser", {"extraction_mode": "plain"}),
         ("PyPDFParser", {"extraction_mode": "layout"}),
@@ -166,6 +155,7 @@ def test_mode_and_extract_images_variations(
 @pytest.mark.parametrize(
     "parser_factory,params",
     [
+        ("PDFMinerParser", {}),
         ("PyMuPDFParser", {}),
         ("PyPDFParser", {"extraction_mode": "plain"}),
         ("PyPDFParser", {"extraction_mode": "layout"}),
@@ -8,7 +8,6 @@ import langchain_community.document_loaders as pdf_loaders
 from langchain_community.document_loaders import (
     AmazonTextractPDFLoader,
     MathpixPDFLoader,
-    PDFMinerLoader,
     PDFMinerPDFasHTMLLoader,
     PyPDFium2Loader,
     UnstructuredPDFLoader,
@@ -42,34 +41,6 @@ def test_unstructured_pdf_loader_default_mode() -> None:
     assert len(docs) == 1


-def test_pdfminer_loader() -> None:
-    """Test PDFMiner loader."""
-    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
-    loader = PDFMinerLoader(file_path)
-    docs = loader.load()
-
-    assert len(docs) == 1
-
-    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
-    loader = PDFMinerLoader(file_path)
-
-    docs = loader.load()
-    assert len(docs) == 1
-
-    # Verify that concatenating pages parameter works
-    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
-    loader = PDFMinerLoader(file_path, concatenate_pages=True)
-    docs = loader.load()
-
-    assert len(docs) == 1
-
-    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
-    loader = PDFMinerLoader(file_path, concatenate_pages=False)
-
-    docs = loader.load()
-    assert len(docs) == 16
-
-
 def test_pdfminer_pdf_as_html_loader() -> None:
     """Test PDFMinerPDFasHTMLLoader."""
     file_path = Path(__file__).parent.parent / "examples/hello.pdf"
@@ -211,6 +182,7 @@ def test_amazontextract_loader_failures() -> None:
 @pytest.mark.parametrize(
     "parser_factory,params",
     [
+        ("PDFMinerLoader", {}),
         ("PyMuPDFLoader", {}),
         ("PyPDFLoader", {}),
     ],
@@ -234,6 +206,8 @@ def test_standard_parameters(
         images_parser=None,
         images_inner_format="text",
+        password=None,
         extract_tables=None,
         extract_tables_settings=None,
     )
     docs = loader.load()
+    assert len(docs) == 16
@@ -10,7 +10,6 @@ import langchain_community.document_loaders.parsers as pdf_parsers
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.parsers.pdf import (
-    PDFMinerParser,
     PyPDFium2Parser,
     _merge_text_and_extras,
 )
@@ -75,13 +74,6 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
     assert int(metadata["page"]) == 0


-@pytest.mark.requires("pdfminer")
-def test_pdfminer_parser() -> None:
-    """Test PDFMiner parser."""
-    # Does not follow defaults to split by page.
-    _assert_with_parser(PDFMinerParser(), splits_by_page=False)
-
-
 @pytest.mark.requires("pypdfium2")
 def test_pypdfium2_parser() -> None:
     """Test PyPDFium2 parser."""
@@ -92,6 +84,7 @@ def test_pypdfium2_parser() -> None:
 @pytest.mark.parametrize(
     "parser_factory,require,params",
     [
+        ("PDFMinerParser", "pdfminer", {"splits_by_page": False}),
         ("PyMuPDFParser", "pymupdf", {}),
         ("PyPDFParser", "pypdf", {}),
     ],