refactor: RAG Refactor (#985)

Co-authored-by: Aralhi <xiaoping0501@gmail.com>
Co-authored-by: csunny <cfqsunny@163.com>
This commit is contained in:
Aries-ckt
2024-01-03 09:45:26 +08:00
committed by GitHub
parent 90775aad50
commit 9ad70a2961
206 changed files with 5766 additions and 2419 deletions

View File

141
dbgpt/rag/knowledge/base.py Normal file
View File

@@ -0,0 +1,141 @@
from abc import abstractmethod, ABC
from enum import Enum
from typing import Optional, Any, List
from dbgpt.rag.chunk import Document
from dbgpt.rag.text_splitter.text_splitter import (
RecursiveCharacterTextSplitter,
MarkdownHeaderTextSplitter,
ParagraphTextSplitter,
CharacterTextSplitter,
PageTextSplitter,
SeparatorTextSplitter,
)
class DocumentType(Enum):
    """Supported document file types; each member's value is the file extension."""

    PDF = "pdf"
    CSV = "csv"
    MARKDOWN = "md"
    PPTX = "pptx"
    DOCX = "docx"
    TXT = "txt"
    HTML = "html"
class KnowledgeType(Enum):
    """Kinds of knowledge sources: local documents, web URLs, or raw text."""

    DOCUMENT = "DOCUMENT"
    URL = "URL"
    TEXT = "TEXT"

    @property
    def type(self):
        """Return the enum class of concrete document types."""
        return DocumentType

    @classmethod
    def get_by_value(cls, value):
        """Return the member whose value equals ``value``.

        Raises:
            ValueError: when no member carries that value.
        """
        found = next((member for member in cls if member.value == value), None)
        if found is None:
            raise ValueError(f"{value} is not a valid value for {cls.__name__}")
        return found
class ChunkStrategy(Enum):
    """chunk strategy

    Each member is a 4-tuple: (splitter class, user-tunable parameter specs,
    short alias, human-readable description).
    """

    CHUNK_BY_SIZE = (
        RecursiveCharacterTextSplitter,
        [
            {"param_name": "chunk_size", "param_type": "int", "default_value": 512},
            {"param_name": "chunk_overlap", "param_type": "int", "default_value": 50},
        ],
        "chunk size",
        "split document by chunk size",
    )
    CHUNK_BY_PAGE = (PageTextSplitter, [], "page", "split document by page")
    CHUNK_BY_PARAGRAPH = (
        ParagraphTextSplitter,
        [{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
        "paragraph",
        "split document by paragraph",
    )
    CHUNK_BY_SEPARATOR = (
        SeparatorTextSplitter,
        [{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
        "separator",
        "split document by separator",
    )
    CHUNK_BY_MARKDOWN_HEADER = (
        MarkdownHeaderTextSplitter,
        [],
        "markdown header",
        "split document by markdown header",
    )

    def __init__(self, splitter_class, parameters, alias, description):
        # Enum members with tuple values: unpack the tuple into named
        # attributes for convenient access.
        self.splitter_class = splitter_class
        self.parameters = parameters
        self.alias = alias
        self.description = description

    def match(self, *args, **kwargs):
        # Instantiate this strategy's splitter; value[0] is the same object
        # as self.splitter_class.
        return self.value[0](*args, **kwargs)
class Knowledge(ABC):
    """Abstract base for knowledge sources (documents, URLs, raw text).

    Subclasses implement :meth:`_load` to produce ``Document`` objects and may
    override the chunk-strategy hooks to advertise how their content should be
    split.
    """

    def __init__(
        self,
        path: Optional[str] = None,
        knowledge_type: Optional[KnowledgeType] = None,
        data_loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Knowledge arguments.

        Args:
            path: (Optional[str]) path or location of the knowledge source
            knowledge_type: (Optional[KnowledgeType]) type of knowledge
            data_loader: (Optional[Any]) external loader used by subclasses
        """
        self._path = path
        self._type = knowledge_type
        self._data_loader = data_loader

    def load(self):
        """Load documents via :meth:`_load`, then run :meth:`_postprocess`."""
        documents = self._load()
        return self._postprocess(documents)

    @classmethod
    def type(cls) -> KnowledgeType:
        """Get knowledge type; subclasses return their KnowledgeType."""

    @classmethod
    def document_type(cls) -> Any:
        """Get document type; None unless the subclass maps to a file type."""
        return None

    def _postprocess(self, docs: List[Document]) -> List[Document]:
        """Post process knowledge from data_loader; identity by default."""
        return docs

    @abstractmethod
    def _load(self):
        """Load raw documents from the underlying source."""

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        """Chunk strategies supported by this knowledge type.

        NOTE(review): the original class defined ``support_chunk_strategy``
        twice; the later duplicate (an un-decorated instance method) silently
        shadowed this documented classmethod. The duplicate was removed so the
        documented full strategy list is what callers actually get.
        """
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_PAGE,
            ChunkStrategy.CHUNK_BY_PARAGRAPH,
            ChunkStrategy.CHUNK_BY_MARKDOWN_HEADER,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        """Fallback strategy when the caller does not choose one."""
        return ChunkStrategy.CHUNK_BY_SIZE

View File

@@ -0,0 +1,87 @@
from typing import Optional, Any, List
import csv
from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
KnowledgeType,
Knowledge,
ChunkStrategy,
DocumentType,
)
class CSVKnowledge(Knowledge):
    """CSV Knowledge: loads each CSV row as one Document."""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
        source_column: Optional[str] = None,
        encoding: Optional[str] = "utf-8",
        loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize csv with Knowledge arguments.

        Args:
            file_path: (Optional[str]) file path
            knowledge_type: (KnowledgeType) knowledge type
            source_column: (Optional[str]) column whose value becomes the
                document's ``source`` metadata; the file path is used when
                not given
            encoding: (Optional[str]) csv encoding
            loader: (Optional[Any]) loader
        """
        self._path = file_path
        self._type = knowledge_type
        self._loader = loader
        self._encoding = encoding
        self._source_column = source_column

    def _load(self) -> List[Document]:
        """Load csv document from loader, or parse the file row by row."""
        if self._loader:
            documents = self._loader.load()
        else:
            docs = []
            with open(self._path, newline="", encoding=self._encoding) as csvfile:
                csv_reader = csv.DictReader(csvfile)
                for i, row in enumerate(csv_reader):
                    # Render "key: value" lines, skipping cells where the
                    # header or value is missing.
                    strs = [
                        f"{k.strip()}: {v.strip()}"
                        for k, v in row.items()
                        if k is not None and v is not None
                    ]
                    content = "\n".join(strs)
                    try:
                        source = (
                            row[self._source_column]
                            if self._source_column is not None
                            else self._path
                        )
                    except KeyError as e:
                        # Chain the cause so the missing-column failure is
                        # traceable in the raised ValueError.
                        raise ValueError(
                            f"Source column '{self._source_column}' not found in CSV file."
                        ) from e
                    metadata = {"source": source, "row": i}
                    docs.append(Document(content=content, metadata=metadata))
            return docs
        return [Document.langchain2doc(lc_document) for lc_document in documents]

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        """Chunk strategies supported for CSV content."""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        """Default to size-based chunking."""
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        """CSV files are DOCUMENT knowledge."""
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        """File type is CSV."""
        return DocumentType.CSV

View File

@@ -0,0 +1,72 @@
from typing import Optional, Any, List
from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
KnowledgeType,
Knowledge,
ChunkStrategy,
DocumentType,
)
import docx
class DocxKnowledge(Knowledge):
    """Docx Knowledge: reads a .docx file into a single Document."""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: Any = KnowledgeType.DOCUMENT,
        encoding: Optional[str] = "utf-8",
        loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Knowledge arguments.

        Args:
            file_path: (Optional[str]) file path
            knowledge_type: (KnowledgeType) knowledge type
            encoding: (Optional[str]) encoding
                NOTE(review): stored but not read within this class — confirm
                whether callers rely on the attribute.
            loader: (Optional[Any]) loader
        """
        self._path = file_path
        self._type = knowledge_type
        self._loader = loader
        self._encoding = encoding

    def _load(self) -> List[Document]:
        """Load docx document from loader"""
        if self._loader:
            lc_documents = self._loader.load()
            return [Document.langchain2doc(lc_doc) for lc_doc in lc_documents]
        # Join every paragraph's text into one newline-separated document.
        document = docx.Document(self._path)
        paragraphs = [paragraph.text for paragraph in document.paragraphs]
        return [
            Document(
                content="\n".join(paragraphs),
                metadata={"source": self._path},
            )
        ]

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        """Chunk strategies supported for docx content."""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_PARAGRAPH,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        """Default to size-based chunking."""
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        """Docx files are DOCUMENT knowledge."""
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        """File type is DOCX."""
        return DocumentType.DOCX

View File

@@ -0,0 +1,143 @@
from typing import Optional
from typing import List
from dbgpt.rag.knowledge.base import KnowledgeType, Knowledge
from dbgpt.rag.knowledge.string import StringKnowledge
from dbgpt.rag.knowledge.url import URLKnowledge
class KnowledgeFactory:
"""Knowledge Factory to create knowledge from file path and url"""
def __init__(
self,
file_path: Optional[str] = None,
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
):
"""Initialize with Knowledge Factory arguments.
Args:
param file_path: path of the file to convert
param knowledge_type: type of knowledge
"""
self._file_path = file_path
self._knowledge_type = knowledge_type
@classmethod
def create(
cls,
datasource: Optional[str] = None,
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
):
"""create knowledge from file path, url or text
Args:
datasource: path of the file to convert
knowledge_type: type of knowledge
Example:
.. code-block:: python
>>> from dbgpt.rag.knowledge.factory import KnowledgeFactory
>>> url_knowlege = KnowledgeFactory.create(datasource="https://www.baidu.com", knowledge_type=KnowledgeType.URL)
>>> doc_knowlege = KnowledgeFactory.create(datasource="path/to/document.pdf", knowledge_type=KnowledgeType.DOCUMENT)
"""
match knowledge_type:
case KnowledgeType.DOCUMENT:
return cls.from_file_path(
file_path=datasource, knowledge_type=knowledge_type
)
case KnowledgeType.URL:
return cls.from_url(url=datasource, knowledge_type=knowledge_type)
case KnowledgeType.TEXT:
return cls.from_text(text=datasource, knowledge_type=knowledge_type)
case _:
raise Exception(f"Unsupported knowledge type '{knowledge_type}'")
@classmethod
def from_file_path(
cls,
file_path: Optional[str] = None,
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
) -> Knowledge:
"""Create knowledge from path
Args:
param file_path: path of the file to convert
param knowledge_type: type of knowledge
Example:
.. code-block:: python
>>> from dbgpt.rag.knowledge.factory import KnowledgeFactory
>>> doc_knowlege = KnowledgeFactory.create(datasource="path/to/document.pdf", knowledge_type=KnowledgeType.DOCUMENT)
"""
factory = cls(file_path=file_path, knowledge_type=knowledge_type)
return factory._select_document_knowledge(
file_path=file_path, knowledge_type=knowledge_type
)
@staticmethod
def from_url(
url: Optional[str] = None,
knowledge_type: Optional[KnowledgeType] = KnowledgeType.URL,
) -> Knowledge:
"""Create knowledge from url
Args:
param url: url of the file to convert
param knowledge_type: type of knowledge
Example:
.. code-block:: python
>>> from dbgpt.rag.knowledge.factory import KnowledgeFactory
>>> url_knowlege = KnowledgeFactory.create(datasource="https://www.baidu.com", knowledge_type=KnowledgeType.URL)
"""
return URLKnowledge(
url=url,
knowledge_type=knowledge_type,
)
@staticmethod
def from_text(
text: str = None,
knowledge_type: Optional[KnowledgeType] = KnowledgeType.TEXT,
) -> Knowledge:
"""Create knowledge from text
Args:
param text: text to convert
param knowledge_type: type of knowledge
"""
return StringKnowledge(
text=text,
knowledge_type=knowledge_type,
)
def _select_document_knowledge(self, **kwargs):
"""Select document knowledge from file path"""
extension = self._file_path.rsplit(".", 1)[-1]
knowledge_classes = self._get_knowledge_subclasses()
implementation = None
for cls in knowledge_classes:
if cls.document_type() and cls.document_type().value == extension:
implementation = cls(**kwargs)
if implementation is None:
raise Exception(f"Unsupported knowledge document type '{extension}'")
return implementation
@classmethod
def all_types(cls):
"""get all knowledge types"""
return [knowledge.type().value for knowledge in cls._get_knowledge_subclasses()]
@classmethod
def subclasses(cls):
"""get all knowledge subclasses"""
return cls._get_knowledge_subclasses()
@staticmethod
def _get_knowledge_subclasses() -> List[Knowledge]:
"""get all knowledge subclasses"""
from dbgpt.rag.knowledge.base import Knowledge
from dbgpt.rag.knowledge.pdf import PDFKnowledge
from dbgpt.rag.knowledge.docx import DocxKnowledge
from dbgpt.rag.knowledge.markdown import MarkdownKnowledge
from dbgpt.rag.knowledge.csv import CSVKnowledge
from dbgpt.rag.knowledge.txt import TXTKnowledge
from dbgpt.rag.knowledge.pptx import PPTXKnowledge
from dbgpt.rag.knowledge.html import HTMLKnowledge
from dbgpt.rag.knowledge.url import URLKnowledge
from dbgpt.rag.knowledge.string import StringKnowledge
return Knowledge.__subclasses__()

View File

@@ -0,0 +1,84 @@
from typing import Optional, Any, List
import chardet
from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
Knowledge,
KnowledgeType,
ChunkStrategy,
DocumentType,
)
class HTMLKnowledge(Knowledge):
    """HTML Knowledge: reads an HTML file with charset auto-detection."""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
        loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Knowledge arguments.

        Args:
            file_path: (Optional[str]) file path
            knowledge_type: (KnowledgeType) knowledge type
            loader: (Optional[Any]) loader
        """
        self._path = file_path
        self._type = knowledge_type
        self._loader = loader

    def _load(self) -> List[Document]:
        """Load html document from loader"""
        if self._loader:
            lc_documents = self._loader.load()
            return [Document.langchain2doc(lc_doc) for lc_doc in lc_documents]
        with open(self._path, "rb") as f:
            raw_text = f.read()
            # Detect the charset; fall back to utf-8 when detection fails.
            detected = chardet.detect(raw_text)
            encoding = detected["encoding"]
            if encoding is None:
                encoding = "utf-8"
            text = raw_text.decode(encoding)
            return [Document(content=text, metadata={"source": self._path})]

    def _postprocess(self, documents: List[Document]):
        """Strip markup from each document, leaving plain single-line text."""
        for idx, doc in enumerate(documents):
            import markdown

            rendered = markdown.markdown(doc.content)
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(rendered, "html.parser")
            for tag in soup(["!doctype", "meta", "i.fa"]):
                tag.extract()
            plain = soup.get_text()
            documents[idx].content = plain.replace("\n", " ")
        return documents

    @classmethod
    def support_chunk_strategy(cls):
        """Chunk strategies supported for HTML content."""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        """Default to size-based chunking."""
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        """HTML files are DOCUMENT knowledge."""
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        """File type is HTML."""
        return DocumentType.HTML

View File

View File

@@ -0,0 +1,65 @@
from typing import Optional, Any, List
from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
KnowledgeType,
Knowledge,
ChunkStrategy,
DocumentType,
)
class MarkdownKnowledge(Knowledge):
    """Markdown Knowledge: reads a markdown file into a single Document."""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
        encoding: Optional[str] = "utf-8",
        loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Knowledge arguments.

        Args:
            file_path: (Optional[str]) file path
            knowledge_type: (KnowledgeType) knowledge type
            encoding: (Optional[str]) encoding
            loader: (Optional[Any]) loader
        """
        self._path = file_path
        self._type = knowledge_type
        self._loader = loader
        self._encoding = encoding

    def _load(self) -> List[Document]:
        """Load markdown document from loader"""
        if self._loader:
            lc_documents = self._loader.load()
            return [Document.langchain2doc(lc_doc) for lc_doc in lc_documents]
        # Undecodable bytes are dropped (errors="ignore") rather than raising.
        with open(self._path, encoding=self._encoding, errors="ignore") as f:
            markdown_text = f.read()
            return [
                Document(content=markdown_text, metadata={"source": self._path})
            ]

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        """Chunk strategies supported for markdown content."""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_MARKDOWN_HEADER,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        """Default to header-aware chunking for markdown."""
        return ChunkStrategy.CHUNK_BY_MARKDOWN_HEADER

    @classmethod
    def type(cls) -> KnowledgeType:
        """Markdown files are DOCUMENT knowledge."""
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        """File type is MARKDOWN."""
        return DocumentType.MARKDOWN

View File

@@ -0,0 +1,88 @@
from typing import Optional, Any, List
from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
Knowledge,
KnowledgeType,
ChunkStrategy,
DocumentType,
)
class PDFKnowledge(Knowledge):
    """PDF Knowledge: loads one Document per PDF page via pypdf."""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
        loader: Optional[Any] = None,
        language: Optional[str] = "zh",
        **kwargs: Any,
    ) -> None:
        """Initialize with PDF Knowledge arguments.

        Args:
            file_path: (Optional[str]) file path
            knowledge_type: (KnowledgeType) knowledge type
            loader: (Optional[Any]) loader
            language: (Optional[str]) language hint
                NOTE(review): stored but no longer consulted in ``_load`` —
                the original only used it to compute per-line statistics that
                were never applied; confirm no caller reads ``_language``
                expecting filtering behavior.
        """
        self._path = file_path
        self._type = knowledge_type
        self._loader = loader
        self._language = language

    def _load(self) -> List[Document]:
        """Load pdf document from loader, or extract text page by page."""
        if self._loader:
            documents = self._loader.load()
        else:
            import pypdf

            pages = []
            documents = []
            with open(self._path, "rb") as file:
                reader = pypdf.PdfReader(file)
                for page_num, page in enumerate(reader.pages):
                    pages.append((page.extract_text(), page_num))
            for page_text, page_num in pages:
                # Re-join through splitlines() to normalize line endings.
                # NOTE(review): the original also computed per-line word and
                # digit lists here but never used them; that dead code was
                # removed without changing the emitted text.
                content = "\n".join(page_text.splitlines())
                metadata = {"source": self._path, "page": page_num}
                documents.append(Document(content=content, metadata=metadata))
            return documents
        return [Document.langchain2doc(lc_document) for lc_document in documents]

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        """Chunk strategies supported for PDF content."""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_PAGE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        """Default to size-based chunking."""
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        """PDF files are DOCUMENT knowledge."""
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        """File type is PDF."""
        return DocumentType.PDF

View File

@@ -0,0 +1,72 @@
from typing import Optional, Any, List
from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
Knowledge,
KnowledgeType,
ChunkStrategy,
DocumentType,
)
class PPTXKnowledge(Knowledge):
    """PPTX Knowledge: loads one Document per slide."""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
        loader: Optional[Any] = None,
        language: Optional[str] = "zh",
        **kwargs: Any,
    ) -> None:
        """Initialize with PPTX Knowledge arguments.

        Args:
            file_path: (Optional[str]) file path
            knowledge_type: (KnowledgeType) knowledge type
            loader: (Optional[Any]) loader
            language: (Optional[str]) language hint
                NOTE(review): stored but not read within this class.
        """
        self._path = file_path
        self._type = knowledge_type
        self._loader = loader
        self._language = language

    def _load(self) -> List[Document]:
        """Load pptx document from loader, or read slide text via python-pptx."""
        if self._loader:
            lc_documents = self._loader.load()
            return [Document.langchain2doc(lc_doc) for lc_doc in lc_documents]
        from pptx import Presentation

        presentation = Presentation(self._path)
        docs = []
        for slide in presentation.slides:
            # Concatenate the text of every shape that carries text.
            texts = [
                shape.text
                for shape in slide.shapes
                if hasattr(shape, "text") and shape.text
            ]
            docs.append(
                Document(
                    content="".join(texts), metadata={"source": slide.slide_id}
                )
            )
        return docs

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        """Chunk strategies supported for pptx content."""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_PAGE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        """Default to size-based chunking."""
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        """PPTX files are DOCUMENT knowledge."""
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        """File type is PPTX."""
        return DocumentType.PPTX

View File

@@ -0,0 +1,48 @@
from typing import Optional, Any, List
from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import KnowledgeType, Knowledge, ChunkStrategy
class StringKnowledge(Knowledge):
    """String Knowledge: wraps an in-memory text string."""

    def __init__(
        self,
        text: Optional[str] = None,
        knowledge_type: KnowledgeType = KnowledgeType.TEXT,
        encoding: Optional[str] = "utf-8",
        loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Knowledge arguments.

        Args:
            text: (str) raw text content
            knowledge_type: (KnowledgeType) knowledge type
            encoding: (Optional[str]) encoding
            loader: (Optional[Any]) loader
        """
        self._text = text
        self._type = knowledge_type
        self._loader = loader
        self._encoding = encoding

    def _load(self) -> List[Document]:
        """Wrap the raw text in a single Document."""
        metadata = {"source": "raw text"}
        docs = [Document(content=self._text, metadata=metadata)]
        return docs

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        """Chunk strategies supported for raw text."""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        """Default to size-based chunking.

        NOTE(review): the original method took ``cls`` but lacked the
        ``@classmethod`` decorator; added it for consistency with
        ``support_chunk_strategy`` and all sibling knowledge classes.
        """
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls):
        """Raw strings are TEXT knowledge."""
        return KnowledgeType.TEXT

View File

View File

@@ -0,0 +1,31 @@
import pytest
from unittest.mock import MagicMock, mock_open, patch
from dbgpt.rag.knowledge.csv import CSVKnowledge
# In-memory CSV fixture data: a header row plus three records.
MOCK_CSV_DATA = "id,name,age\n1,John Doe,30\n2,Jane Smith,25\n3,Bob Johnson,40"


@pytest.fixture
def mock_file_open():
    # Patch builtins.open so CSVKnowledge reads MOCK_CSV_DATA instead of disk.
    with patch("builtins.open", mock_open(read_data=MOCK_CSV_DATA)) as mock_file:
        yield mock_file


@pytest.fixture
def mock_csv_dict_reader():
    # Patch csv.DictReader to yield pre-parsed row dicts.
    with patch("csv.DictReader", MagicMock()) as mock_csv:
        mock_csv.return_value = iter(
            [
                {"id": "1", "name": "John Doe", "age": "30"},
                {"id": "2", "name": "Jane Smith", "age": "25"},
                {"id": "3", "name": "Bob Johnson", "age": "40"},
            ]
        )
        yield mock_csv


def test_load_from_csv(mock_file_open, mock_csv_dict_reader):
    # Each CSV row should become one Document.
    knowledge = CSVKnowledge(file_path="test_data.csv", source_column="name")
    documents = knowledge._load()
    assert len(documents) == 3

View File

@@ -0,0 +1,28 @@
import pytest
from unittest.mock import MagicMock, patch
from dbgpt.rag.knowledge.docx import DocxKnowledge
@pytest.fixture
def mock_docx_document():
    # Fake python-docx Document exposing two paragraphs of canned text.
    mock_document = MagicMock()
    mock_document.paragraphs = [
        MagicMock(text="This is the first paragraph."),
        MagicMock(text="This is the second paragraph."),
    ]
    with patch("docx.Document", return_value=mock_document):
        yield mock_document


def test_load_from_docx(mock_docx_document):
    file_path = "test_document.docx"
    knowledge = DocxKnowledge(file_path=file_path)
    documents = knowledge._load()
    # All paragraphs are joined with newlines into a single Document.
    assert len(documents) == 1
    assert (
        documents[0].content
        == "This is the first paragraph.\nThis is the second paragraph."
    )
    assert documents[0].metadata["source"] == file_path

View File

@@ -0,0 +1,45 @@
import pytest
from unittest.mock import mock_open, patch
from dbgpt.rag.knowledge.html import HTMLKnowledge
# Raw bytes of a minimal HTML page used as the mocked file content.
MOCK_HTML_CONTENT = b"""
<html>
<head>
<title>Test HTML</title>
</head>
<body>
<p>This is a paragraph.</p>
</body>
</html>
"""
# Canned chardet result so decoding is deterministic in the test.
MOCK_CHARDET_RESULT = {"encoding": "utf-8", "confidence": 0.99}


@pytest.fixture
def mock_file_open():
    # Patch builtins.open so HTMLKnowledge reads MOCK_HTML_CONTENT.
    with patch(
        "builtins.open", mock_open(read_data=MOCK_HTML_CONTENT), create=True
    ) as mock_file:
        yield mock_file


@pytest.fixture
def mock_chardet_detect():
    # Patch chardet.detect to return the canned encoding result.
    with patch("chardet.detect", return_value=MOCK_CHARDET_RESULT) as mock_detect:
        yield mock_detect


def test_load_from_html(mock_file_open, mock_chardet_detect):
    file_path = "test_document.html"
    knowledge = HTMLKnowledge(file_path=file_path)
    documents = knowledge._load()
    # The raw HTML becomes a single Document; _postprocess is not exercised here.
    assert len(documents) == 1
    assert "This is a paragraph." in documents[0].content
    assert documents[0].metadata["source"] == file_path
    mock_file_open.assert_called_once_with(file_path, "rb")
    mock_chardet_detect.assert_called_once()

View File

@@ -0,0 +1,28 @@
import pytest
from unittest.mock import mock_open, patch
from dbgpt.rag.knowledge.markdown import MarkdownKnowledge
# Markdown fixture data with two header levels.
MOCK_MARKDOWN_DATA = """# Header 1
This is some text under header 1.
## Header 2
This is some text under header 2.
"""


@pytest.fixture
def mock_file_open():
    # Patch builtins.open so MarkdownKnowledge reads MOCK_MARKDOWN_DATA.
    with patch("builtins.open", mock_open(read_data=MOCK_MARKDOWN_DATA)) as mock_file:
        yield mock_file


# Define the test function
def test_load_from_markdown(mock_file_open):
    file_path = "test_document.md"
    knowledge = MarkdownKnowledge(file_path=file_path)
    documents = knowledge._load()
    # The whole file becomes one Document with the path as its source.
    assert len(documents) == 1
    assert documents[0].content == MOCK_MARKDOWN_DATA
    assert documents[0].metadata["source"] == file_path

View File

@@ -0,0 +1,36 @@
import pytest
from unittest.mock import MagicMock, patch, mock_open
from dbgpt.rag.knowledge.pdf import PDFKnowledge
# (page text, page number) pairs the mocked PdfReader will expose.
MOCK_PDF_PAGES = [
    ("This is the content of the first page.", 0),
    ("This is the content of the second page.", 1),
]


@pytest.fixture
def mock_pdf_open_and_reader():
    # Fake file handle plus a PdfReader whose pages return canned text.
    mock_pdf_file = mock_open()
    mock_reader = MagicMock()
    mock_reader.pages = [
        MagicMock(extract_text=MagicMock(return_value=page[0]))
        for page in MOCK_PDF_PAGES
    ]
    with patch("builtins.open", mock_pdf_file):
        with patch("pypdf.PdfReader", return_value=mock_reader) as mock:
            yield mock


def test_load_from_pdf(mock_pdf_open_and_reader):
    file_path = "test_document.pdf"
    knowledge = PDFKnowledge(file_path=file_path)
    documents = knowledge._load()
    # One Document per mocked page, with matching text and page metadata.
    assert len(documents) == len(MOCK_PDF_PAGES)
    for i, document in enumerate(documents):
        assert MOCK_PDF_PAGES[i][0] in document.content
        assert document.metadata["source"] == file_path
        assert document.metadata["page"] == MOCK_PDF_PAGES[i][1]

View File

@@ -0,0 +1,37 @@
import pytest
from unittest.mock import mock_open, patch
from dbgpt.rag.knowledge.txt import TXTKnowledge
# Raw bytes used as the mocked text-file content.
MOCK_TXT_CONTENT = b"Sample text content for testing.\nAnother line of text."
# Canned chardet result so decoding is deterministic in the test.
MOCK_CHARDET_RESULT = {"encoding": "utf-8", "confidence": 0.99}


@pytest.fixture
def mock_file_open():
    # Patch builtins.open so TXTKnowledge reads MOCK_TXT_CONTENT.
    with patch(
        "builtins.open", mock_open(read_data=MOCK_TXT_CONTENT), create=True
    ) as mock_file:
        yield mock_file


@pytest.fixture
def mock_chardet_detect():
    # Patch chardet.detect to return the canned encoding result.
    with patch("chardet.detect", return_value=MOCK_CHARDET_RESULT) as mock_detect:
        yield mock_detect


# Define the test function
def test_load_from_txt(mock_file_open, mock_chardet_detect):
    file_path = "test_document.txt"
    knowledge = TXTKnowledge(file_path=file_path)
    documents = knowledge._load()
    # The decoded file becomes one Document with the path as its source.
    assert len(documents) == 1
    assert "Sample text content for testing." in documents[0].content
    assert documents[0].metadata["source"] == file_path
    mock_file_open.assert_called_once_with(file_path, "rb")
    mock_chardet_detect.assert_called_once()

View File

@@ -0,0 +1,68 @@
from typing import Optional, Any, List
import chardet
from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
Knowledge,
KnowledgeType,
ChunkStrategy,
DocumentType,
)
class TXTKnowledge(Knowledge):
    """TXT Knowledge: reads a plain-text file with charset auto-detection."""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
        loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Knowledge arguments.

        Args:
            file_path: (Optional[str]) file path
            knowledge_type: (KnowledgeType) knowledge type
            loader: (Optional[Any]) loader
        """
        self._path = file_path
        self._type = knowledge_type
        self._loader = loader

    def _load(self) -> List[Document]:
        """Load txt document from loader"""
        if self._loader:
            lc_documents = self._loader.load()
            return [Document.langchain2doc(lc_doc) for lc_doc in lc_documents]
        with open(self._path, "rb") as f:
            raw_text = f.read()
            # Detect the charset; fall back to utf-8 when detection fails.
            detected = chardet.detect(raw_text)
            encoding = detected["encoding"]
            if encoding is None:
                encoding = "utf-8"
            text = raw_text.decode(encoding)
            return [Document(content=text, metadata={"source": self._path})]

    @classmethod
    def support_chunk_strategy(cls):
        """Chunk strategies supported for plain text."""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        """Default to size-based chunking."""
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        """TXT files are DOCUMENT knowledge."""
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        """File type is TXT."""
        return DocumentType.TXT

View File

@@ -0,0 +1,55 @@
from typing import Optional, Any, List
from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import KnowledgeType, Knowledge, ChunkStrategy
class URLKnowledge(Knowledge):
    """URL Knowledge: fetches a web page as Documents."""

    def __init__(
        self,
        url: Optional[str] = None,
        knowledge_type: KnowledgeType = KnowledgeType.URL,
        source_column: Optional[str] = None,
        encoding: Optional[str] = "utf-8",
        loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Knowledge arguments.

        Args:
            url: (Optional[str]) url to fetch
            knowledge_type: (KnowledgeType) knowledge type
            source_column: (Optional[str]) source column
            encoding: (Optional[str]) encoding
            loader: (Optional[Any]) loader
        """
        self._path = url
        self._type = knowledge_type
        self._loader = loader
        self._encoding = encoding
        self._source_column = source_column

    def _load(self) -> List[Document]:
        """Fetch URL document from loader"""
        if self._loader:
            lc_documents = self._loader.load()
        else:
            from langchain.document_loaders import WebBaseLoader

            lc_documents = WebBaseLoader(web_path=self._path).load()
        return [Document.langchain2doc(lc_doc) for lc_doc in lc_documents]

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        """Chunk strategies supported for URL content."""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        """Default to size-based chunking."""
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls):
        """Web pages are URL knowledge."""
        return KnowledgeType.URL