mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-16 23:13:31 +00:00
Add Mathpix pdf loader (#3727)
Inspiration: https://twitter.com/danielgross/status/1651695062307274754?s=46&t=1zHLap5WG4I_kQPPjfW9fA
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
@@ -6,6 +6,7 @@ from langchain.document_loaders import (
|
||||
PyMuPDFLoader,
|
||||
UnstructuredPDFLoader,
|
||||
)
|
||||
from langchain.document_loaders.pdf import MathpixPDFLoader
|
||||
|
||||
|
||||
def test_unstructured_pdf_loader() -> None:
|
||||
@@ -69,3 +70,19 @@ def test_pymupdf_loader() -> None:
|
||||
assert loader.web_path == web_path
|
||||
assert loader.file_path != web_path
|
||||
assert len(docs) == 1
|
||||
|
||||
|
||||
def test_mathpix_loader() -> None:
    """Check that MathpixPDFLoader yields exactly one document per example PDF.

    Two example files are loaded in turn; for each one the loader must
    return a single document, whose page content is then printed.
    """
    # Both example files are resolved relative to the directory two levels
    # above this test module.
    base_dir = Path(__file__).parent.parent

    for relative_pdf in ("examples/hello.pdf", "examples/layout-parser-paper.pdf"):
        pdf_path = base_dir / relative_pdf
        # The loader is given the path as a plain string.
        documents = MathpixPDFLoader(str(pdf_path)).load()

        assert len(documents) == 1
        print(documents[0].page_content)
|
||||
|
Reference in New Issue
Block a user