Add Mathpix pdf loader (#3727)

Inspiration:
https://twitter.com/danielgross/status/1651695062307274754?s=46&t=1zHLap5WG4I_kQPPjfW9fA

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
Davis Chase
2023-04-28 20:11:22 -07:00
committed by GitHub
parent 37ed6f2177
commit 220a7076ac
4 changed files with 167 additions and 9 deletions

View File

@@ -6,6 +6,7 @@ from langchain.document_loaders import (
PyMuPDFLoader,
UnstructuredPDFLoader,
)
from langchain.document_loaders.pdf import MathpixPDFLoader
def test_unstructured_pdf_loader() -> None:
@@ -69,3 +70,19 @@ def test_pymupdf_loader() -> None:
assert loader.web_path == web_path
assert loader.file_path != web_path
assert len(docs) == 1
def test_mathpix_loader() -> None:
    """Check that MathpixPDFLoader yields one document per example PDF.

    Each example file is loaded in turn; the loader must return exactly one
    document, whose text is printed so it can be inspected in the test output.
    """
    examples_root = Path(__file__).parent.parent
    relative_names = (
        "examples/hello.pdf",
        "examples/layout-parser-paper.pdf",
    )
    for relative_name in relative_names:
        pdf_path = examples_root / relative_name
        documents = MathpixPDFLoader(str(pdf_path)).load()
        assert len(documents) == 1
        print(documents[0].page_content)