mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-30 19:49:09 +00:00
Feature: pdfplumber PDF loader with BaseBlobParser (#4552)
# Feature: pdfplumber PDF loader with BaseBlobParser * Adds pdfplumber as a PDF loader * Adds pdfplumber as a blob parser.
This commit is contained in:
parent
b6e3ac17c4
commit
cd3f9865f3
@ -97,7 +97,7 @@
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdin",
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"OpenAI API Key: ········\n"
|
||||
@ -673,6 +673,68 @@
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "45bb0415",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using pdfplumber\n",
|
||||
"\n",
|
||||
"Like PyMuPDF, the output Documents contain detailed metadata about the PDF and its pages, and returns one document per page."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "aefa758d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import PDFPlumberLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "049e9d9a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = PDFPlumberLoader(\"example_data/layout-parser-paper.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "a8610efa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "8132e551",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(page_content='LayoutParser: A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1 Allen Institute for AI\\n1202 shannons@allenai.org\\n2 Brown University\\nruochen zhang@brown.edu\\n3 Harvard University\\nnuJ {melissadell,jacob carlson}@fas.harvard.edu\\n4 University of Washington\\nbcgl@cs.washington.edu\\n12 5 University of Waterloo\\nw422li@uwaterloo.ca\\n]VC.sc[\\nAbstract. Recentadvancesindocumentimageanalysis(DIA)havebeen\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomescouldbeeasilydeployedinproductionandextendedforfurther\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\n2v84351.3012:viXra portantinnovationsbyawideaudience.Thoughtherehavebeenon-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopmentindisciplineslikenaturallanguageprocessingandcomputer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademicresearchacross awiderangeof disciplinesinthesocialsciences\\nand humanities. This paper introduces LayoutParser, an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. The core LayoutParser library comes with a set of simple and\\nintuitiveinterfacesforapplyingandcustomizingDLmodelsforlayoutde-\\ntection,characterrecognition,andmanyotherdocumentprocessingtasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io.\\nKeywords: DocumentImageAnalysis·DeepLearning·LayoutAnalysis\\n· Character Recognition · Open Source library · Toolkit.\\n1 Introduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocumentimageanalysis(DIA)tasksincludingdocumentimageclassification[11,', metadata={'source': 'example_data/layout-parser-paper.pdf', 'file_path': 'example_data/layout-parser-paper.pdf', 'page': 1, 'total_pages': 16, 'Author': '', 'CreationDate': 'D:20210622012710Z', 'Creator': 'LaTeX with hyperref', 'Keywords': '', 'ModDate': 'D:20210622012710Z', 'PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'Producer': 'pdfTeX-1.40.21', 'Subject': '', 'Title': '', 'Trapped': 'False'})"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@ -698,7 +760,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.3"
|
||||
"version": "3.9.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -60,6 +60,7 @@ from langchain.document_loaders.pdf import (
|
||||
OnlinePDFLoader,
|
||||
PDFMinerLoader,
|
||||
PDFMinerPDFasHTMLLoader,
|
||||
PDFPlumberLoader,
|
||||
PyMuPDFLoader,
|
||||
PyPDFDirectoryLoader,
|
||||
PyPDFium2Loader,
|
||||
@ -166,6 +167,7 @@ __all__ = [
|
||||
"OutlookMessageLoader",
|
||||
"PDFMinerLoader",
|
||||
"PDFMinerPDFasHTMLLoader",
|
||||
"PDFPlumberLoader",
|
||||
"PagedPDFSplitter",
|
||||
"PlaywrightURLLoader",
|
||||
"PyMuPDFLoader",
|
||||
|
@ -1,8 +1,15 @@
|
||||
from langchain.document_loaders.parsers.pdf import (
|
||||
PDFMinerParser,
|
||||
PDFPlumberParser,
|
||||
PyMuPDFParser,
|
||||
PyPDFium2Parser,
|
||||
PyPDFParser,
|
||||
)
|
||||
|
||||
__all__ = ["PyPDFParser", "PDFMinerParser", "PyMuPDFParser", "PyPDFium2Parser"]
|
||||
__all__ = [
|
||||
"PyPDFParser",
|
||||
"PDFMinerParser",
|
||||
"PyMuPDFParser",
|
||||
"PyPDFium2Parser",
|
||||
"PDFPlumberParser",
|
||||
]
|
||||
|
@ -99,3 +99,42 @@ class PyPDFium2Parser(BaseBlobParser):
|
||||
content = page.get_textpage().get_text_range()
|
||||
metadata = {"source": blob.source, "page": page_number}
|
||||
yield Document(page_content=content, metadata=metadata)
|
||||
|
||||
|
||||
class PDFPlumberParser(BaseBlobParser):
|
||||
"""Parse PDFs with PDFPlumber."""
|
||||
|
||||
def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
|
||||
"""Initialize the parser.
|
||||
|
||||
Args:
|
||||
text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
|
||||
"""
|
||||
self.text_kwargs = text_kwargs or {}
|
||||
|
||||
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
|
||||
"""Lazily parse the blob."""
|
||||
import pdfplumber
|
||||
|
||||
with blob.as_bytes_io() as file_path:
|
||||
doc = pdfplumber.open(file_path) # open document
|
||||
|
||||
yield from [
|
||||
Document(
|
||||
page_content=page.extract_text(**self.text_kwargs),
|
||||
metadata=dict(
|
||||
{
|
||||
"source": blob.source,
|
||||
"file_path": blob.source,
|
||||
"page": page.page_number,
|
||||
"total_pages": len(doc.pages),
|
||||
},
|
||||
**{
|
||||
k: doc.metadata[k]
|
||||
for k in doc.metadata
|
||||
if type(doc.metadata[k]) in [str, int]
|
||||
},
|
||||
),
|
||||
)
|
||||
for page in doc.pages
|
||||
]
|
||||
|
@ -7,7 +7,7 @@ import time
|
||||
from abc import ABC
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterator, List, Optional
|
||||
from typing import Any, Iterator, List, Mapping, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
@ -17,6 +17,7 @@ from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.blob_loaders import Blob
|
||||
from langchain.document_loaders.parsers.pdf import (
|
||||
PDFMinerParser,
|
||||
PDFPlumberParser,
|
||||
PyMuPDFParser,
|
||||
PyPDFium2Parser,
|
||||
PyPDFParser,
|
||||
@ -362,3 +363,29 @@ class MathpixPDFLoader(BasePDFLoader):
|
||||
contents = self.clean_pdf(contents)
|
||||
metadata = {"source": self.source, "file_path": self.source}
|
||||
return [Document(page_content=contents, metadata=metadata)]
|
||||
|
||||
|
||||
class PDFPlumberLoader(BasePDFLoader):
|
||||
"""Loader that uses pdfplumber to load PDF files."""
|
||||
|
||||
def __init__(
|
||||
self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None
|
||||
) -> None:
|
||||
"""Initialize with file path."""
|
||||
try:
|
||||
import pdfplumber # noqa:F401
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"pdfplumber package not found, please install it with "
|
||||
"`pip install pdfplumber`"
|
||||
)
|
||||
|
||||
super().__init__(file_path)
|
||||
self.text_kwargs = text_kwargs or {}
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load file."""
|
||||
|
||||
parser = PDFPlumberParser(text_kwargs=self.text_kwargs)
|
||||
blob = Blob.from_path(self.file_path)
|
||||
return parser.parse(blob)
|
||||
|
@ -6,6 +6,7 @@ from langchain.document_loaders.base import BaseBlobParser
|
||||
from langchain.document_loaders.blob_loaders import Blob
|
||||
from langchain.document_loaders.parsers.pdf import (
|
||||
PDFMinerParser,
|
||||
PDFPlumberParser,
|
||||
PyMuPDFParser,
|
||||
PyPDFium2Parser,
|
||||
PyPDFParser,
|
||||
@ -78,3 +79,8 @@ def test_pypdfium2_parser() -> None:
|
||||
"""Test PyPDFium2 parser."""
|
||||
# Does not follow defaults to split by page.
|
||||
_assert_with_parser(PyPDFium2Parser())
|
||||
|
||||
|
||||
def test_pdfplumber_parser() -> None:
|
||||
"""Test PDFPlumber parser."""
|
||||
_assert_with_parser(PDFPlumberParser())
|
||||
|
@ -8,4 +8,5 @@ def test_parsers_public_api_correct() -> None:
|
||||
"PDFMinerParser",
|
||||
"PyMuPDFParser",
|
||||
"PyPDFium2Parser",
|
||||
"PDFPlumberParser",
|
||||
}
|
||||
|
@ -8,4 +8,5 @@ def test_parsers_public_api_correct() -> None:
|
||||
"PDFMinerParser",
|
||||
"PyMuPDFParser",
|
||||
"PyPDFium2Parser",
|
||||
"PDFPlumberParser",
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user