Feature: pdfplumber PDF loader with BaseBlobParser (#4552)

# Feature: pdfplumber PDF loader with BaseBlobParser

* Adds pdfplumber as a PDF loader.
* Adds pdfplumber as a blob parser.
This commit is contained in:
Lester Yang
2023-05-15 21:47:02 +08:00
committed by GitHub
parent b6e3ac17c4
commit cd3f9865f3
8 changed files with 149 additions and 4 deletions

View File

@@ -6,6 +6,7 @@ from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.pdf import (
PDFMinerParser,
PDFPlumberParser,
PyMuPDFParser,
PyPDFium2Parser,
PyPDFParser,
@@ -78,3 +79,8 @@ def test_pypdfium2_parser() -> None:
"""Test PyPDFium2 parser."""
# Does not follow defaults to split by page.
_assert_with_parser(PyPDFium2Parser())
def test_pdfplumber_parser() -> None:
    """Run the shared parser assertions against PDFPlumberParser."""
    # Constructed with no arguments, i.e. the parser's default configuration.
    parser = PDFPlumberParser()
    _assert_with_parser(parser)

View File

@@ -8,4 +8,5 @@ def test_parsers_public_api_correct() -> None:
"PDFMinerParser",
"PyMuPDFParser",
"PyPDFium2Parser",
"PDFPlumberParser",
}

View File

@@ -8,4 +8,5 @@ def test_parsers_public_api_correct() -> None:
"PDFMinerParser",
"PyMuPDFParser",
"PyPDFium2Parser",
"PDFPlumberParser",
}