mirror of
				https://github.com/hwchase17/langchain.git
				synced 2025-10-22 17:50:03 +00:00 
			
		
		
		
	thanks @jerrytigerxu for the addition! --------- Co-authored-by: Jere Xu <jtxu2008@gmail.com> Co-authored-by: jerrytigerxu <jere.tiger.xu@gmailc.om>
		
			
				
	
	
		
			121 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			121 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from pathlib import Path
 | |
| 
 | |
| from langchain.document_loaders import (
 | |
|     MathpixPDFLoader,
 | |
|     PDFMinerLoader,
 | |
|     PDFMinerPDFasHTMLLoader,
 | |
|     PyMuPDFLoader,
 | |
|     PyPDFium2Loader,
 | |
|     PyPDFLoader,
 | |
|     UnstructuredPDFLoader,
 | |
| )
 | |
| 
 | |
| 
 | |
def test_unstructured_pdf_loader() -> None:
    """Load ``examples/hello.pdf`` with UnstructuredPDFLoader and expect one document."""
    # The examples directory sits next to this test file's parent package.
    pdf_path = Path(__file__).parent.parent / "examples/hello.pdf"
    documents = UnstructuredPDFLoader(str(pdf_path)).load()

    assert len(documents) == 1
 | |
| 
 | |
| 
 | |
def test_pdfminer_loader() -> None:
    """Load both example PDFs with PDFMinerLoader; each is expected to give one document."""
    base_dir = Path(__file__).parent.parent

    # Same check for the short and the multi-page sample, in the original order.
    for relative_name in ("examples/hello.pdf", "examples/layout-parser-paper.pdf"):
        pdf_loader = PDFMinerLoader(str(base_dir / relative_name))
        documents = pdf_loader.load()
        assert len(documents) == 1
 | |
| 
 | |
| 
 | |
def test_pdfminer_pdf_as_html_loader() -> None:
    """Load both example PDFs with PDFMinerPDFasHTMLLoader; expect one document each."""
    examples_root = Path(__file__).parent.parent

    hello_pdf = examples_root / "examples/hello.pdf"
    hello_docs = PDFMinerPDFasHTMLLoader(str(hello_pdf)).load()
    assert len(hello_docs) == 1

    paper_pdf = examples_root / "examples/layout-parser-paper.pdf"
    paper_docs = PDFMinerPDFasHTMLLoader(str(paper_pdf)).load()
    assert len(paper_docs) == 1
 | |
| 
 | |
| 
 | |
def test_pypdf_loader() -> None:
    """Load both example PDFs with PyPDFLoader and check the document counts.

    ``hello.pdf`` is expected to produce 1 document and
    ``layout-parser-paper.pdf`` 16 documents.
    """
    base_dir = Path(__file__).parent.parent
    expectations = (
        ("examples/hello.pdf", 1),
        ("examples/layout-parser-paper.pdf", 16),
    )

    for relative_name, expected_count in expectations:
        pdf_loader = PyPDFLoader(str(base_dir / relative_name))
        documents = pdf_loader.load()
        assert len(documents) == expected_count
 | |
| 
 | |
| 
 | |
def test_pypdfium2_loader() -> None:
    """Load both example PDFs with PyPDFium2Loader and check the document counts.

    ``hello.pdf`` is expected to produce 1 document and
    ``layout-parser-paper.pdf`` 16 documents.
    """
    examples_root = Path(__file__).parent.parent

    single_page = examples_root / "examples/hello.pdf"
    single_page_docs = PyPDFium2Loader(str(single_page)).load()
    assert len(single_page_docs) == 1

    multi_page = examples_root / "examples/layout-parser-paper.pdf"
    multi_page_docs = PyPDFium2Loader(str(multi_page)).load()
    assert len(multi_page_docs) == 16
 | |
| 
 | |
| 
 | |
def test_pymupdf_loader() -> None:
    """Exercise PyMuPDFLoader on two local PDFs and one remote PDF.

    Local files: ``hello.pdf`` should give 1 document and
    ``layout-parser-paper.pdf`` 16, with ``web_path`` left as ``None``.
    Remote URL: ``web_path`` should equal the URL while ``file_path`` differs
    from it, and 1 document is expected.
    """
    examples_root = Path(__file__).parent.parent

    # Local, single-document sample.
    hello_loader = PyMuPDFLoader(str(examples_root / "examples/hello.pdf"))
    hello_docs = hello_loader.load()
    assert len(hello_docs) == 1

    # Local, multi-document sample; a local path must not populate web_path.
    paper_loader = PyMuPDFLoader(
        str(examples_root / "examples/layout-parser-paper.pdf")
    )
    paper_docs = paper_loader.load()
    assert len(paper_docs) == 16
    assert paper_loader.web_path is None

    # Remote sample (needs network access).
    web_path = "https://people.sc.fsu.edu/~jpeterson/hello_world.pdf"
    remote_loader = PyMuPDFLoader(web_path)
    remote_docs = remote_loader.load()
    assert remote_loader.web_path == web_path
    assert remote_loader.file_path != web_path
    assert len(remote_docs) == 1
 | |
| 
 | |
| 
 | |
def test_mathpix_loader() -> None:
    """Load both example PDFs with MathpixPDFLoader; expect one document each.

    The page content of each loaded document is printed after its check.
    """
    base_dir = Path(__file__).parent.parent

    for relative_name in ("examples/hello.pdf", "examples/layout-parser-paper.pdf"):
        pdf_loader = MathpixPDFLoader(str(base_dir / relative_name))
        documents = pdf_loader.load()

        assert len(documents) == 1
        print(documents[0].page_content)
 |