refactor: RAG Refactor (#985)

Co-authored-by: Aralhi <xiaoping0501@gmail.com>
Co-authored-by: csunny <cfqsunny@163.com>
This commit is contained in:
Aries-ckt
2024-01-03 09:45:26 +08:00
committed by GitHub
parent 90775aad50
commit 9ad70a2961
206 changed files with 5766 additions and 2419 deletions

View File

View File

@@ -0,0 +1,31 @@
import pytest
from unittest.mock import MagicMock, mock_open, patch
from dbgpt.rag.knowledge.csv import CSVKnowledge
# CSV text used as the read data of the mocked ``open``: a header row
# (id, name, age) followed by three records.
MOCK_CSV_DATA = "id,name,age\n1,John Doe,30\n2,Jane Smith,25\n3,Bob Johnson,40"
@pytest.fixture
def mock_file_open():
    """Patch ``builtins.open`` with a ``mock_open`` serving ``MOCK_CSV_DATA``.

    Yields the mock that stands in for ``open``; the patch is undone when the
    fixture is torn down.
    """
    fake_open = mock_open(read_data=MOCK_CSV_DATA)
    with patch("builtins.open", fake_open) as patched_open:
        yield patched_open
@pytest.fixture
def mock_csv_dict_reader():
    """Patch ``csv.DictReader`` with a mock returning three fixed rows.

    Yields the mock that replaces ``csv.DictReader``. Its return value is a
    single iterator over the rows, so the rows can be consumed only once per
    test.
    """
    rows = [
        {"id": "1", "name": "John Doe", "age": "30"},
        {"id": "2", "name": "Jane Smith", "age": "25"},
        {"id": "3", "name": "Bob Johnson", "age": "40"},
    ]
    reader_factory = MagicMock()
    reader_factory.return_value = iter(rows)
    with patch("csv.DictReader", reader_factory):
        yield reader_factory
def test_load_from_csv(mock_file_open, mock_csv_dict_reader):
    """Check that loading the mocked CSV file produces three documents."""
    expected_count = 3
    csv_knowledge = CSVKnowledge(file_path="test_data.csv", source_column="name")
    loaded = csv_knowledge._load()
    assert len(loaded) == expected_count

View File

@@ -0,0 +1,28 @@
import pytest
from unittest.mock import MagicMock, patch
from dbgpt.rag.knowledge.docx import DocxKnowledge
@pytest.fixture
def mock_docx_document():
    """Patch ``docx.Document`` to return a mock document with two paragraphs.

    Yields the mock document whose ``paragraphs`` attribute holds two mock
    paragraphs, each exposing a fixed ``text`` value.
    """
    paragraph_texts = (
        "This is the first paragraph.",
        "This is the second paragraph.",
    )
    fake_document = MagicMock()
    fake_document.paragraphs = [MagicMock(text=text) for text in paragraph_texts]
    with patch("docx.Document", return_value=fake_document):
        yield fake_document
def test_load_from_docx(mock_docx_document):
    """Check the document loaded from the mocked DOCX file.

    Expects exactly one document whose content is the two paragraph texts
    joined by a newline and whose ``source`` metadata is the file path.
    """
    docx_path = "test_document.docx"
    expected_content = "This is the first paragraph.\nThis is the second paragraph."
    loaded = DocxKnowledge(file_path=docx_path)._load()
    assert len(loaded) == 1
    only_document = loaded[0]
    assert only_document.content == expected_content
    assert only_document.metadata["source"] == docx_path

View File

@@ -0,0 +1,45 @@
import pytest
from unittest.mock import mock_open, patch
from dbgpt.rag.knowledge.html import HTMLKnowledge
# Small HTML page, as bytes, used as the read data of the mocked ``open``.
MOCK_HTML_CONTENT = b"""
<html>
<head>
<title>Test HTML</title>
</head>
<body>
<p>This is a paragraph.</p>
</body>
</html>
"""
# Canned return value for the patched ``chardet.detect``.
MOCK_CHARDET_RESULT = {"encoding": "utf-8", "confidence": 0.99}
@pytest.fixture
def mock_file_open():
    """Patch ``builtins.open`` with a ``mock_open`` serving ``MOCK_HTML_CONTENT``.

    Yields the mock that stands in for ``open`` so the test can inspect how
    it was called.
    """
    html_reader = mock_open(read_data=MOCK_HTML_CONTENT)
    with patch("builtins.open", html_reader, create=True) as patched_open:
        yield patched_open
@pytest.fixture
def mock_chardet_detect():
    """Patch ``chardet.detect`` so it returns ``MOCK_CHARDET_RESULT``.

    Yields the mock replacing ``chardet.detect``.
    """
    detect_patcher = patch("chardet.detect", return_value=MOCK_CHARDET_RESULT)
    with detect_patcher as fake_detect:
        yield fake_detect
def test_load_from_html(mock_file_open, mock_chardet_detect):
    """Check the document loaded from the mocked HTML file.

    Expects one document containing the paragraph text, with ``source``
    metadata equal to the file path. Also expects the file to be opened
    exactly once in ``"rb"`` mode and ``chardet.detect`` to be called once.
    """
    html_path = "test_document.html"
    loaded = HTMLKnowledge(file_path=html_path)._load()
    assert len(loaded) == 1
    page = loaded[0]
    assert "This is a paragraph." in page.content
    assert page.metadata["source"] == html_path
    mock_file_open.assert_called_once_with(html_path, "rb")
    mock_chardet_detect.assert_called_once()

View File

@@ -0,0 +1,28 @@
import pytest
from unittest.mock import mock_open, patch
from dbgpt.rag.knowledge.markdown import MarkdownKnowledge
# Markdown text used as the read data of the mocked ``open``: two headers,
# each followed by one line of body text.
MOCK_MARKDOWN_DATA = """# Header 1
This is some text under header 1.
## Header 2
This is some text under header 2.
"""
@pytest.fixture
def mock_file_open():
    """Patch ``builtins.open`` with a ``mock_open`` serving ``MOCK_MARKDOWN_DATA``.

    Yields the mock that stands in for ``open``.
    """
    markdown_reader = mock_open(read_data=MOCK_MARKDOWN_DATA)
    with patch("builtins.open", markdown_reader) as patched_open:
        yield patched_open
# Define the test function.
def test_load_from_markdown(mock_file_open):
    """Check the document loaded from the mocked Markdown file.

    Expects one document whose content equals ``MOCK_MARKDOWN_DATA`` and
    whose ``source`` metadata is the file path.
    """
    markdown_path = "test_document.md"
    loaded = MarkdownKnowledge(file_path=markdown_path)._load()
    assert len(loaded) == 1
    only_document = loaded[0]
    assert only_document.content == MOCK_MARKDOWN_DATA
    assert only_document.metadata["source"] == markdown_path

View File

@@ -0,0 +1,36 @@
import pytest
from unittest.mock import MagicMock, patch, mock_open
from dbgpt.rag.knowledge.pdf import PDFKnowledge
# One (text, page) pair per mocked PDF page: the text is what the mocked
# page's ``extract_text`` returns, and the number is the value the test
# expects in the document's ``page`` metadata.
MOCK_PDF_PAGES = [
("This is the content of the first page.", 0),
("This is the content of the second page.", 1),
]
@pytest.fixture
def mock_pdf_open_and_reader():
    """Patch ``builtins.open`` and ``pypdf.PdfReader`` for the PDF test.

    The patched ``PdfReader`` returns a mock reader whose ``pages`` list has
    one mock page per ``MOCK_PDF_PAGES`` entry; each page's ``extract_text``
    returns that entry's text. Yields the mock replacing ``pypdf.PdfReader``.
    """
    fake_pages = []
    for page_text, _page_number in MOCK_PDF_PAGES:
        fake_pages.append(MagicMock(extract_text=MagicMock(return_value=page_text)))
    fake_reader = MagicMock()
    fake_reader.pages = fake_pages
    # ``open`` is patched first, then ``PdfReader``, matching the nesting order.
    with patch("builtins.open", mock_open()), patch(
        "pypdf.PdfReader", return_value=fake_reader
    ) as reader_cls:
        yield reader_cls
def test_load_from_pdf(mock_pdf_open_and_reader):
    """Check the documents loaded from the mocked PDF file.

    Expects one document per ``MOCK_PDF_PAGES`` entry, each containing that
    entry's text, with ``source`` metadata equal to the file path and
    ``page`` metadata equal to the entry's page number.
    """
    pdf_path = "test_document.pdf"
    loaded = PDFKnowledge(file_path=pdf_path)._load()
    assert len(loaded) == len(MOCK_PDF_PAGES)
    # Lengths were just asserted equal, so zip pairs every document.
    for page_doc, (expected_text, expected_page) in zip(loaded, MOCK_PDF_PAGES):
        assert expected_text in page_doc.content
        assert page_doc.metadata["source"] == pdf_path
        assert page_doc.metadata["page"] == expected_page
#

View File

@@ -0,0 +1,37 @@
import pytest
from unittest.mock import mock_open, patch
from dbgpt.rag.knowledge.txt import TXTKnowledge
# Two lines of text, as bytes, used as the read data of the mocked ``open``.
MOCK_TXT_CONTENT = b"Sample text content for testing.\nAnother line of text."
# Canned return value for the patched ``chardet.detect``.
MOCK_CHARDET_RESULT = {"encoding": "utf-8", "confidence": 0.99}
@pytest.fixture
def mock_file_open():
    """Patch ``builtins.open`` with a ``mock_open`` serving ``MOCK_TXT_CONTENT``.

    Yields the mock that stands in for ``open`` so the test can inspect how
    it was called.
    """
    text_reader = mock_open(read_data=MOCK_TXT_CONTENT)
    with patch("builtins.open", text_reader, create=True) as patched_open:
        yield patched_open
@pytest.fixture
def mock_chardet_detect():
    """Patch ``chardet.detect`` so it returns ``MOCK_CHARDET_RESULT``.

    Yields the mock replacing ``chardet.detect``.
    """
    detect_patcher = patch("chardet.detect", return_value=MOCK_CHARDET_RESULT)
    with detect_patcher as fake_detect:
        yield fake_detect
# Define the test function.
def test_load_from_txt(mock_file_open, mock_chardet_detect):
    """Check the document loaded from the mocked text file.

    Expects one document containing the first sample line, with ``source``
    metadata equal to the file path. Also expects the file to be opened
    exactly once in ``"rb"`` mode and ``chardet.detect`` to be called once.
    """
    txt_path = "test_document.txt"
    loaded = TXTKnowledge(file_path=txt_path)._load()
    assert len(loaded) == 1
    only_document = loaded[0]
    assert "Sample text content for testing." in only_document.content
    assert only_document.metadata["source"] == txt_path
    mock_file_open.assert_called_once_with(txt_path, "rb")
    mock_chardet_detect.assert_called_once()