DB-GPT/dbgpt/rag/knowledge/tests/test_pdf.py
2024-01-14 21:01:37 +08:00

38 lines
1.1 KiB
Python

from unittest.mock import MagicMock, mock_open, patch
import pytest
from dbgpt.rag.knowledge.pdf import PDFKnowledge
# Test data for the fake PDF: one (page text, page number) pair per page.
# The text is what each mocked page's ``extract_text()`` returns; the number
# is the value the test expects in the document's ``metadata["page"]``.
MOCK_PDF_PAGES = [
    ("This is the content of the first page.", 0),
    ("This is the content of the second page.", 1),
]
@pytest.fixture
def mock_pdf_open_and_reader():
    """Patch ``builtins.open`` and ``pypdf.PdfReader`` while a test runs.

    A fake reader is built whose ``pages`` list holds one mocked page per
    entry in ``MOCK_PDF_PAGES``; calling ``extract_text()`` on a page returns
    that entry's text. Both patches are undone when the fixture finishes.

    Yields:
        The mock that stands in for ``pypdf.PdfReader``; calling it returns
        the fake reader.
    """
    # Build the fake pages first so the reader can be assembled in one step.
    fake_pages = []
    for page_text, _page_number in MOCK_PDF_PAGES:
        fake_pages.append(
            MagicMock(extract_text=MagicMock(return_value=page_text))
        )

    fake_reader = MagicMock()
    fake_reader.pages = fake_pages

    # ``open`` is patched too, so no real file has to exist on disk.
    with patch("builtins.open", mock_open()), patch(
        "pypdf.PdfReader", return_value=fake_reader
    ) as reader_cls:
        yield reader_cls
def test_load_from_pdf(mock_pdf_open_and_reader):
    """Check that ``PDFKnowledge._load`` returns one document per mocked page.

    Each returned document must contain the text of its page and carry the
    source file path and the page number in its metadata.
    """
    pdf_path = "test_document.pdf"
    loaded_docs = PDFKnowledge(file_path=pdf_path)._load()

    # Check the count first: ``zip`` below would otherwise silently ignore
    # any missing or surplus documents.
    assert len(loaded_docs) == len(MOCK_PDF_PAGES)

    for doc, (expected_text, expected_page) in zip(loaded_docs, MOCK_PDF_PAGES):
        assert expected_text in doc.content
        assert doc.metadata["source"] == pdf_path
        assert doc.metadata["page"] == expected_page
#