mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-09-14 05:31:40 +00:00
refactor: RAG Refactor (#985)
Co-authored-by: Aralhi <xiaoping0501@gmail.com> Co-authored-by: csunny <cfqsunny@163.com>
This commit is contained in:
0
dbgpt/rag/knowledge/tests/__init__.py
Normal file
0
dbgpt/rag/knowledge/tests/__init__.py
Normal file
31
dbgpt/rag/knowledge/tests/test_csv.py
Normal file
31
dbgpt/rag/knowledge/tests/test_csv.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, mock_open, patch
|
||||
|
||||
from dbgpt.rag.knowledge.csv import CSVKnowledge
|
||||
|
||||
# CSV text returned by the mocked ``open``: one header row and three data rows.
MOCK_CSV_DATA = "\n".join(
    (
        "id,name,age",
        "1,John Doe,30",
        "2,Jane Smith,25",
        "3,Bob Johnson,40",
    )
)
|
||||
|
||||
|
||||
@pytest.fixture
def mock_file_open():
    """Patch ``builtins.open`` so reads return ``MOCK_CSV_DATA``; yield the mock."""
    fake_open = mock_open(read_data=MOCK_CSV_DATA)
    with patch("builtins.open", fake_open) as patched_open:
        yield patched_open
|
||||
|
||||
|
||||
@pytest.fixture
def mock_csv_dict_reader():
    """Replace ``csv.DictReader`` with a mock that returns three canned rows."""
    canned_rows = [
        {"id": "1", "name": "John Doe", "age": "30"},
        {"id": "2", "name": "Jane Smith", "age": "25"},
        {"id": "3", "name": "Bob Johnson", "age": "40"},
    ]
    reader_cls = MagicMock()
    with patch("csv.DictReader", reader_cls) as patched_reader:
        # The return value is a single iterator object, so the rows can be
        # consumed only once across all calls to the patched class.
        patched_reader.return_value = iter(canned_rows)
        yield patched_reader
|
||||
|
||||
|
||||
def test_load_from_csv(mock_file_open, mock_csv_dict_reader):
    """Loading the mocked CSV should produce three documents."""
    csv_knowledge = CSVKnowledge(file_path="test_data.csv", source_column="name")
    loaded = csv_knowledge._load()
    assert len(loaded) == 3
|
28
dbgpt/rag/knowledge/tests/test_docx.py
Normal file
28
dbgpt/rag/knowledge/tests/test_docx.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from dbgpt.rag.knowledge.docx import DocxKnowledge
|
||||
|
||||
|
||||
@pytest.fixture
def mock_docx_document():
    """Patch ``docx.Document`` to return a fake document with two paragraphs."""
    paragraph_texts = (
        "This is the first paragraph.",
        "This is the second paragraph.",
    )
    fake_document = MagicMock()
    fake_document.paragraphs = [MagicMock(text=text) for text in paragraph_texts]
    with patch("docx.Document", return_value=fake_document):
        yield fake_document
|
||||
|
||||
|
||||
def test_load_from_docx(mock_docx_document):
    """The paragraphs should be joined by newlines into a single document."""
    docx_path = "test_document.docx"
    documents = DocxKnowledge(file_path=docx_path)._load()

    assert len(documents) == 1
    only_doc = documents[0]
    assert (
        only_doc.content
        == "This is the first paragraph.\nThis is the second paragraph."
    )
    assert only_doc.metadata["source"] == docx_path
|
45
dbgpt/rag/knowledge/tests/test_html.py
Normal file
45
dbgpt/rag/knowledge/tests/test_html.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import pytest
|
||||
from unittest.mock import mock_open, patch
|
||||
|
||||
from dbgpt.rag.knowledge.html import HTMLKnowledge
|
||||
|
||||
# Minimal HTML page, as bytes, returned by the mocked ``open``.
MOCK_HTML_CONTENT = (
    b"\n"
    b"<html>\n"
    b"<head>\n"
    b"<title>Test HTML</title>\n"
    b"</head>\n"
    b"<body>\n"
    b"<p>This is a paragraph.</p>\n"
    b"</body>\n"
    b"</html>\n"
)
|
||||
|
||||
# Canned return value for the patched ``chardet.detect``.
MOCK_CHARDET_RESULT = {
    "encoding": "utf-8",
    "confidence": 0.99,
}
|
||||
|
||||
|
||||
@pytest.fixture
def mock_file_open():
    """Patch ``builtins.open`` so reads return ``MOCK_HTML_CONTENT``; yield the mock."""
    fake_open = mock_open(read_data=MOCK_HTML_CONTENT)
    with patch("builtins.open", fake_open, create=True) as patched_open:
        yield patched_open
|
||||
|
||||
|
||||
@pytest.fixture
def mock_chardet_detect():
    """Patch ``chardet.detect`` to return ``MOCK_CHARDET_RESULT``; yield the mock."""
    detect_patcher = patch("chardet.detect", return_value=MOCK_CHARDET_RESULT)
    with detect_patcher as detect_mock:
        yield detect_mock
|
||||
|
||||
|
||||
def test_load_from_html(mock_file_open, mock_chardet_detect):
    """Check the loaded HTML document and the calls made to the mocks."""
    html_path = "test_document.html"
    documents = HTMLKnowledge(file_path=html_path)._load()

    assert len(documents) == 1
    first_doc = documents[0]
    assert "This is a paragraph." in first_doc.content
    assert first_doc.metadata["source"] == html_path

    # The file must be opened exactly once, in binary mode.
    mock_file_open.assert_called_once_with(html_path, "rb")

    mock_chardet_detect.assert_called_once()
|
28
dbgpt/rag/knowledge/tests/test_markdown.py
Normal file
28
dbgpt/rag/knowledge/tests/test_markdown.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import pytest
|
||||
from unittest.mock import mock_open, patch
|
||||
|
||||
from dbgpt.rag.knowledge.markdown import MarkdownKnowledge
|
||||
|
||||
# Markdown text returned by the mocked ``open``: two headed sections.
MOCK_MARKDOWN_DATA = (
    "# Header 1\n"
    "This is some text under header 1.\n"
    "\n"
    "## Header 2\n"
    "This is some text under header 2.\n"
)
|
||||
|
||||
|
||||
@pytest.fixture
def mock_file_open():
    """Patch ``builtins.open`` so reads return ``MOCK_MARKDOWN_DATA``; yield the mock."""
    fake_open = mock_open(read_data=MOCK_MARKDOWN_DATA)
    with patch("builtins.open", fake_open) as patched_open:
        yield patched_open
|
||||
|
||||
|
||||
def test_load_from_markdown(mock_file_open):
    """The whole Markdown file should load unchanged as a single document."""
    markdown_path = "test_document.md"
    documents = MarkdownKnowledge(file_path=markdown_path)._load()

    assert len(documents) == 1
    only_doc = documents[0]
    assert only_doc.content == MOCK_MARKDOWN_DATA
    assert only_doc.metadata["source"] == markdown_path
|
36
dbgpt/rag/knowledge/tests/test_pdf.py
Normal file
36
dbgpt/rag/knowledge/tests/test_pdf.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, patch, mock_open
|
||||
|
||||
from dbgpt.rag.knowledge.pdf import PDFKnowledge
|
||||
|
||||
# (page text, zero-based page index) pairs describing the fake PDF.
MOCK_PDF_PAGES = [
    (page_text, page_index)
    for page_index, page_text in enumerate(
        (
            "This is the content of the first page.",
            "This is the content of the second page.",
        )
    )
]
|
||||
|
||||
|
||||
@pytest.fixture
def mock_pdf_open_and_reader():
    """Patch ``builtins.open`` and ``pypdf.PdfReader``; yield the reader mock.

    The patched ``PdfReader`` returns a fake reader whose ``pages`` each return
    the matching text from ``MOCK_PDF_PAGES`` when ``extract_text()`` is called.
    """
    fake_pages = []
    for page_text, _page_index in MOCK_PDF_PAGES:
        fake_pages.append(MagicMock(extract_text=MagicMock(return_value=page_text)))

    fake_reader = MagicMock()
    fake_reader.pages = fake_pages

    open_patch = patch("builtins.open", mock_open())
    reader_patch = patch("pypdf.PdfReader", return_value=fake_reader)
    with open_patch, reader_patch as reader_cls:
        yield reader_cls
|
||||
|
||||
|
||||
def test_load_from_pdf(mock_pdf_open_and_reader):
    """Each mocked page should load as one document with matching metadata."""
    pdf_path = "test_document.pdf"
    documents = PDFKnowledge(file_path=pdf_path)._load()

    assert len(documents) == len(MOCK_PDF_PAGES)
    for document, (page_text, page_index) in zip(documents, MOCK_PDF_PAGES):
        assert page_text in document.content
        assert document.metadata["source"] == pdf_path
        assert document.metadata["page"] == page_index
|
||||
|
||||
#
|
37
dbgpt/rag/knowledge/tests/test_txt.py
Normal file
37
dbgpt/rag/knowledge/tests/test_txt.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import pytest
|
||||
from unittest.mock import mock_open, patch
|
||||
|
||||
from dbgpt.rag.knowledge.txt import TXTKnowledge
|
||||
|
||||
# Two-line text payload, as bytes, returned by the mocked ``open``.
MOCK_TXT_CONTENT = b"\n".join(
    (
        b"Sample text content for testing.",
        b"Another line of text.",
    )
)
|
||||
|
||||
# Canned return value for the patched ``chardet.detect``.
MOCK_CHARDET_RESULT = {
    "encoding": "utf-8",
    "confidence": 0.99,
}
|
||||
|
||||
|
||||
@pytest.fixture
def mock_file_open():
    """Patch ``builtins.open`` so reads return ``MOCK_TXT_CONTENT``; yield the mock."""
    fake_open = mock_open(read_data=MOCK_TXT_CONTENT)
    with patch("builtins.open", fake_open, create=True) as patched_open:
        yield patched_open
|
||||
|
||||
|
||||
@pytest.fixture
def mock_chardet_detect():
    """Patch ``chardet.detect`` to return ``MOCK_CHARDET_RESULT``; yield the mock."""
    detect_patcher = patch("chardet.detect", return_value=MOCK_CHARDET_RESULT)
    with detect_patcher as detect_mock:
        yield detect_mock
|
||||
|
||||
|
||||
def test_load_from_txt(mock_file_open, mock_chardet_detect):
    """Check the loaded text document and the calls made to the mocks."""
    txt_path = "test_document.txt"
    documents = TXTKnowledge(file_path=txt_path)._load()

    assert len(documents) == 1
    first_doc = documents[0]
    assert "Sample text content for testing." in first_doc.content
    assert first_doc.metadata["source"] == txt_path

    # The file must be opened exactly once, in binary mode.
    mock_file_open.assert_called_once_with(txt_path, "rb")

    mock_chardet_detect.assert_called_once()
|
Reference in New Issue
Block a user