mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-09-15 14:11:14 +00:00
refactor: RAG Refactor (#985)
Co-authored-by: Aralhi <xiaoping0501@gmail.com>
Co-authored-by: csunny <cfqsunny@163.com>
0
dbgpt/rag/knowledge/__init__.py
Normal file
141
dbgpt/rag/knowledge/base.py
Normal file
@@ -0,0 +1,141 @@
from abc import abstractmethod, ABC
from enum import Enum
from typing import Optional, Any, List

from dbgpt.rag.chunk import Document
from dbgpt.rag.text_splitter.text_splitter import (
    RecursiveCharacterTextSplitter,
    MarkdownHeaderTextSplitter,
    ParagraphTextSplitter,
    CharacterTextSplitter,
    PageTextSplitter,
    SeparatorTextSplitter,
)


class DocumentType(Enum):
    PDF = "pdf"
    CSV = "csv"
    MARKDOWN = "md"
    PPTX = "pptx"
    DOCX = "docx"
    TXT = "txt"
    HTML = "html"


class KnowledgeType(Enum):
    DOCUMENT = "DOCUMENT"
    URL = "URL"
    TEXT = "TEXT"

    @property
    def type(self):
        return DocumentType

    @classmethod
    def get_by_value(cls, value):
        """Get Enum member by value"""
        for member in cls:
            if member.value == value:
                return member
        raise ValueError(f"{value} is not a valid value for {cls.__name__}")


class ChunkStrategy(Enum):
    """Chunk strategy: (splitter class, parameters, alias, description)"""

    CHUNK_BY_SIZE = (
        RecursiveCharacterTextSplitter,
        [
            {"param_name": "chunk_size", "param_type": "int", "default_value": 512},
            {"param_name": "chunk_overlap", "param_type": "int", "default_value": 50},
        ],
        "chunk size",
        "split document by chunk size",
    )
    CHUNK_BY_PAGE = (PageTextSplitter, [], "page", "split document by page")
    CHUNK_BY_PARAGRAPH = (
        ParagraphTextSplitter,
        [{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
        "paragraph",
        "split document by paragraph",
    )
    CHUNK_BY_SEPARATOR = (
        SeparatorTextSplitter,
        [{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
        "separator",
        "split document by separator",
    )
    CHUNK_BY_MARKDOWN_HEADER = (
        MarkdownHeaderTextSplitter,
        [],
        "markdown header",
        "split document by markdown header",
    )

    def __init__(self, splitter_class, parameters, alias, description):
        self.splitter_class = splitter_class
        self.parameters = parameters
        self.alias = alias
        self.description = description

    def match(self, *args, **kwargs):
        """Instantiate the text splitter bound to this strategy."""
        return self.splitter_class(*args, **kwargs)


class Knowledge(ABC):
    def __init__(
        self,
        path: Optional[str] = None,
        knowledge_type: Optional[KnowledgeType] = None,
        data_loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Knowledge arguments."""
        self._path = path
        self._type = knowledge_type
        self._data_loader = data_loader

    def load(self):
        """Load knowledge from the data loader and post-process the documents."""
        documents = self._load()
        return self._postprocess(documents)

    @classmethod
    def type(cls) -> KnowledgeType:
        """Get knowledge type"""

    @classmethod
    def document_type(cls) -> Any:
        """Get document type"""
        return None

    def _postprocess(self, docs: List[Document]) -> List[Document]:
        """Post-process documents returned by the data loader"""
        return docs

    @abstractmethod
    def _load(self):
        """Load raw documents from the data loader"""

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        """Return the chunk strategies this knowledge type supports"""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_PAGE,
            ChunkStrategy.CHUNK_BY_PARAGRAPH,
            ChunkStrategy.CHUNK_BY_MARKDOWN_HEADER,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    def default_chunk_strategy(self) -> ChunkStrategy:
        return ChunkStrategy.CHUNK_BY_SIZE
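A concrete source only has to implement _load(); load() wraps it with _postprocess, and a ChunkStrategy member hands back its splitter via match(). A minimal usage sketch, not part of this commit: the NoteKnowledge class and its sample notes are hypothetical, and the splitter parameters are the ones declared on CHUNK_BY_SIZE above.

from typing import List

from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import Knowledge, KnowledgeType, ChunkStrategy


class NoteKnowledge(Knowledge):
    """Hypothetical in-memory source illustrating the Knowledge contract."""

    def __init__(self, notes: List[str], **kwargs):
        self._notes = notes
        self._type = KnowledgeType.TEXT

    def _load(self) -> List[Document]:
        # Each note becomes one Document; load() runs _postprocess on the result.
        return [Document(content=n, metadata={"source": "note"}) for n in self._notes]

    @classmethod
    def type(cls) -> KnowledgeType:
        return KnowledgeType.TEXT


docs = NoteKnowledge(notes=["first note", "second note"]).load()
# Assuming the splitter accepts the parameters declared on the strategy.
splitter = ChunkStrategy.CHUNK_BY_SIZE.match(chunk_size=512, chunk_overlap=50)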
87
dbgpt/rag/knowledge/csv.py
Normal file
@@ -0,0 +1,87 @@
from typing import Optional, Any, List
import csv

from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
    KnowledgeType,
    Knowledge,
    ChunkStrategy,
    DocumentType,
)


class CSVKnowledge(Knowledge):
    """CSV Knowledge"""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
        source_column: Optional[str] = None,
        encoding: Optional[str] = "utf-8",
        loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize csv with Knowledge arguments.
        Args:
            file_path:(Optional[str]) file path
            knowledge_type:(KnowledgeType) knowledge type
            source_column:(Optional[str]) source column
            encoding:(Optional[str]) csv encoding
            loader:(Optional[Any]) loader
        """
        self._path = file_path
        self._type = knowledge_type
        self._loader = loader
        self._encoding = encoding
        self._source_column = source_column

    def _load(self) -> List[Document]:
        """Load csv document from loader"""
        if self._loader:
            documents = self._loader.load()
        else:
            docs = []
            with open(self._path, newline="", encoding=self._encoding) as csvfile:
                csv_reader = csv.DictReader(csvfile)
                for i, row in enumerate(csv_reader):
                    strs = []
                    for k, v in row.items():
                        if k is None or v is None:
                            continue
                        strs.append(f"{k.strip()}: {v.strip()}")
                    content = "\n".join(strs)
                    try:
                        source = (
                            row[self._source_column]
                            if self._source_column is not None
                            else self._path
                        )
                    except KeyError:
                        raise ValueError(
                            f"Source column '{self._source_column}' not found in CSV file."
                        )
                    metadata = {"source": source, "row": i}
                    doc = Document(content=content, metadata=metadata)
                    docs.append(doc)

            return docs
        return [Document.langchain2doc(lc_document) for lc_document in documents]

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        return DocumentType.CSV
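A usage sketch for the loader above, not part of this commit; the file path is a placeholder, and source_column redirects each row's metadata["source"] to that column.

from dbgpt.rag.knowledge.csv import CSVKnowledge

# Hypothetical path; each CSV row becomes one Document of "key: value" lines.
knowledge = CSVKnowledge(file_path="test_data.csv", source_column="name")
documents = knowledge.load()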
72
dbgpt/rag/knowledge/docx.py
Normal file
@@ -0,0 +1,72 @@
from typing import Optional, Any, List

import docx

from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
    KnowledgeType,
    Knowledge,
    ChunkStrategy,
    DocumentType,
)


class DocxKnowledge(Knowledge):
    """Docx Knowledge"""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: Any = KnowledgeType.DOCUMENT,
        encoding: Optional[str] = "utf-8",
        loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Knowledge arguments.
        Args:
            file_path:(Optional[str]) file path
            knowledge_type:(KnowledgeType) knowledge type
            encoding:(Optional[str]) document encoding
            loader:(Optional[Any]) loader
        """
        self._path = file_path
        self._type = knowledge_type
        self._loader = loader
        self._encoding = encoding

    def _load(self) -> List[Document]:
        """Load docx document from loader"""
        if self._loader:
            documents = self._loader.load()
        else:
            docs = []
            doc = docx.Document(self._path)
            # Concatenate all paragraphs into a single document.
            content = [para.text for para in doc.paragraphs]
            docs.append(
                Document(content="\n".join(content), metadata={"source": self._path})
            )
            return docs
        return [Document.langchain2doc(lc_document) for lc_document in documents]

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_PARAGRAPH,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        return DocumentType.DOCX
143
dbgpt/rag/knowledge/factory.py
Normal file
@@ -0,0 +1,143 @@
from typing import List, Optional

from dbgpt.rag.knowledge.base import KnowledgeType, Knowledge
from dbgpt.rag.knowledge.string import StringKnowledge
from dbgpt.rag.knowledge.url import URLKnowledge


class KnowledgeFactory:
    """Knowledge Factory to create knowledge from file path, url or text"""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
    ):
        """Initialize with Knowledge Factory arguments.
        Args:
            param file_path: path of the file to convert
            param knowledge_type: type of knowledge
        """
        self._file_path = file_path
        self._knowledge_type = knowledge_type

    @classmethod
    def create(
        cls,
        datasource: Optional[str] = None,
        knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
    ):
        """Create knowledge from file path, url or text
        Args:
            datasource: path of the file to convert
            knowledge_type: type of knowledge
        Example:
            .. code-block:: python

                >>> from dbgpt.rag.knowledge.factory import KnowledgeFactory
                >>> url_knowledge = KnowledgeFactory.create(datasource="https://www.baidu.com", knowledge_type=KnowledgeType.URL)
                >>> doc_knowledge = KnowledgeFactory.create(datasource="path/to/document.pdf", knowledge_type=KnowledgeType.DOCUMENT)
        """
        match knowledge_type:
            case KnowledgeType.DOCUMENT:
                return cls.from_file_path(
                    file_path=datasource, knowledge_type=knowledge_type
                )
            case KnowledgeType.URL:
                return cls.from_url(url=datasource, knowledge_type=knowledge_type)
            case KnowledgeType.TEXT:
                return cls.from_text(text=datasource, knowledge_type=knowledge_type)
            case _:
                raise Exception(f"Unsupported knowledge type '{knowledge_type}'")

    @classmethod
    def from_file_path(
        cls,
        file_path: Optional[str] = None,
        knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
    ) -> Knowledge:
        """Create knowledge from path
        Args:
            param file_path: path of the file to convert
            param knowledge_type: type of knowledge
        Example:
            .. code-block:: python

                >>> from dbgpt.rag.knowledge.factory import KnowledgeFactory
                >>> doc_knowledge = KnowledgeFactory.create(datasource="path/to/document.pdf", knowledge_type=KnowledgeType.DOCUMENT)
        """
        factory = cls(file_path=file_path, knowledge_type=knowledge_type)
        return factory._select_document_knowledge(
            file_path=file_path, knowledge_type=knowledge_type
        )

    @staticmethod
    def from_url(
        url: Optional[str] = None,
        knowledge_type: Optional[KnowledgeType] = KnowledgeType.URL,
    ) -> Knowledge:
        """Create knowledge from url
        Args:
            param url: url of the file to convert
            param knowledge_type: type of knowledge
        Example:
            .. code-block:: python

                >>> from dbgpt.rag.knowledge.factory import KnowledgeFactory
                >>> url_knowledge = KnowledgeFactory.create(datasource="https://www.baidu.com", knowledge_type=KnowledgeType.URL)
        """
        return URLKnowledge(
            url=url,
            knowledge_type=knowledge_type,
        )

    @staticmethod
    def from_text(
        text: str = None,
        knowledge_type: Optional[KnowledgeType] = KnowledgeType.TEXT,
    ) -> Knowledge:
        """Create knowledge from text
        Args:
            param text: text to convert
            param knowledge_type: type of knowledge
        """
        return StringKnowledge(
            text=text,
            knowledge_type=knowledge_type,
        )

    def _select_document_knowledge(self, **kwargs):
        """Select a document knowledge implementation by file extension"""
        extension = self._file_path.rsplit(".", 1)[-1]
        knowledge_classes = self._get_knowledge_subclasses()
        implementation = None
        for cls in knowledge_classes:
            if cls.document_type() and cls.document_type().value == extension:
                implementation = cls(**kwargs)
        if implementation is None:
            raise Exception(f"Unsupported knowledge document type '{extension}'")
        return implementation

    @classmethod
    def all_types(cls):
        """Get all knowledge types"""
        return [knowledge.type().value for knowledge in cls._get_knowledge_subclasses()]

    @classmethod
    def subclasses(cls):
        """Get all knowledge subclasses"""
        return cls._get_knowledge_subclasses()

    @staticmethod
    def _get_knowledge_subclasses() -> List[Knowledge]:
        """Import every implementation so it registers as a Knowledge subclass"""
        from dbgpt.rag.knowledge.base import Knowledge
        from dbgpt.rag.knowledge.pdf import PDFKnowledge
        from dbgpt.rag.knowledge.docx import DocxKnowledge
        from dbgpt.rag.knowledge.markdown import MarkdownKnowledge
        from dbgpt.rag.knowledge.csv import CSVKnowledge
        from dbgpt.rag.knowledge.txt import TXTKnowledge
        from dbgpt.rag.knowledge.pptx import PPTXKnowledge
        from dbgpt.rag.knowledge.html import HTMLKnowledge
        from dbgpt.rag.knowledge.url import URLKnowledge
        from dbgpt.rag.knowledge.string import StringKnowledge

        return Knowledge.__subclasses__()
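The factory dispatches on the file extension via DocumentType, so callers never name a concrete class. A usage sketch mirroring the docstring examples above; the paths and URL are placeholders.

from dbgpt.rag.knowledge.base import KnowledgeType
from dbgpt.rag.knowledge.factory import KnowledgeFactory

# Extension "pdf" selects PDFKnowledge via document_type().
doc_knowledge = KnowledgeFactory.create(
    datasource="path/to/document.pdf", knowledge_type=KnowledgeType.DOCUMENT
)
url_knowledge = KnowledgeFactory.create(
    datasource="https://www.baidu.com", knowledge_type=KnowledgeType.URL
)
# KnowledgeType values of every registered Knowledge subclass.
print(KnowledgeFactory.all_types())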
84
dbgpt/rag/knowledge/html.py
Normal file
@@ -0,0 +1,84 @@
from typing import Optional, Any, List

import chardet

from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
    Knowledge,
    KnowledgeType,
    ChunkStrategy,
    DocumentType,
)


class HTMLKnowledge(Knowledge):
    """HTML Knowledge"""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
        loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Knowledge arguments.
        Args:
            file_path:(Optional[str]) file path
            knowledge_type:(KnowledgeType) knowledge type
            loader:(Optional[Any]) loader
        """
        self._path = file_path
        self._type = knowledge_type
        self._loader = loader

    def _load(self) -> List[Document]:
        """Load html document from loader"""
        if self._loader:
            documents = self._loader.load()
        else:
            with open(self._path, "rb") as f:
                raw_text = f.read()
                # Detect the file encoding, falling back to utf-8.
                result = chardet.detect(raw_text)
                if result["encoding"] is None:
                    text = raw_text.decode("utf-8")
                else:
                    text = raw_text.decode(result["encoding"])
            metadata = {"source": self._path}
            return [Document(content=text, metadata=metadata)]

        return [Document.langchain2doc(lc_document) for lc_document in documents]

    def _postprocess(self, documents: List[Document]):
        import markdown
        from bs4 import BeautifulSoup

        for i, d in enumerate(documents):
            content = markdown.markdown(d.content)
            soup = BeautifulSoup(content, "html.parser")
            for tag in soup(["!doctype", "meta", "i.fa"]):
                tag.extract()
            documents[i].content = soup.get_text().replace("\n", " ")
        return documents

    @classmethod
    def support_chunk_strategy(cls):
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        return DocumentType.HTML
0
dbgpt/rag/knowledge/json.py
Normal file
65
dbgpt/rag/knowledge/markdown.py
Normal file
@@ -0,0 +1,65 @@
from typing import Optional, Any, List

from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
    KnowledgeType,
    Knowledge,
    ChunkStrategy,
    DocumentType,
)


class MarkdownKnowledge(Knowledge):
    """Markdown Knowledge"""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
        encoding: Optional[str] = "utf-8",
        loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Knowledge arguments.
        Args:
            file_path:(Optional[str]) file path
            knowledge_type:(KnowledgeType) knowledge type
            encoding:(Optional[str]) encoding
            loader:(Optional[Any]) loader
        """
        self._path = file_path
        self._type = knowledge_type
        self._loader = loader
        self._encoding = encoding

    def _load(self) -> List[Document]:
        """Load markdown document from loader"""
        if self._loader:
            documents = self._loader.load()
        else:
            with open(self._path, encoding=self._encoding, errors="ignore") as f:
                markdown_text = f.read()
                metadata = {"source": self._path}
                documents = [Document(content=markdown_text, metadata=metadata)]
                return documents
        return [Document.langchain2doc(lc_document) for lc_document in documents]

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_MARKDOWN_HEADER,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        return ChunkStrategy.CHUNK_BY_MARKDOWN_HEADER

    @classmethod
    def type(cls) -> KnowledgeType:
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        return DocumentType.MARKDOWN
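Markdown is the one source that defaults to header-based chunking. A usage sketch, not part of this commit; the path is a placeholder, and match() is called bare on the assumption that the empty parameter list declared on CHUNK_BY_MARKDOWN_HEADER means its splitter needs no arguments.

from dbgpt.rag.knowledge.markdown import MarkdownKnowledge

knowledge = MarkdownKnowledge(file_path="README.md")
documents = knowledge.load()
# default_chunk_strategy() is CHUNK_BY_MARKDOWN_HEADER; match() builds its splitter.
splitter = knowledge.default_chunk_strategy().match()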
88
dbgpt/rag/knowledge/pdf.py
Normal file
@@ -0,0 +1,88 @@
from typing import Optional, Any, List

from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
    Knowledge,
    KnowledgeType,
    ChunkStrategy,
    DocumentType,
)


class PDFKnowledge(Knowledge):
    """PDF Knowledge"""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
        loader: Optional[Any] = None,
        language: Optional[str] = "zh",
        **kwargs: Any,
    ) -> None:
        """Initialize with PDF Knowledge arguments.
        Args:
            file_path:(Optional[str]) file path
            knowledge_type:(KnowledgeType) knowledge type
            loader:(Optional[Any]) loader
            language:(Optional[str]) document language, e.g. "zh" or "en"
        """
        self._path = file_path
        self._type = knowledge_type
        self._loader = loader
        self._language = language

    def _load(self) -> List[Document]:
        """Load pdf document from loader"""
        if self._loader:
            documents = self._loader.load()
        else:
            import pypdf

            pages = []
            documents = []
            with open(self._path, "rb") as file:
                reader = pypdf.PdfReader(file)
                for page_num in range(len(reader.pages)):
                    page = reader.pages[page_num]
                    pages.append((page.extract_text(), page_num))

            # One Document per page, keeping the page number in the metadata.
            for page, page_num in pages:
                cleaned_lines = page.splitlines()
                page = "\n".join(cleaned_lines)
                metadata = {"source": self._path, "page": page_num}
                document = Document(content=page, metadata=metadata)
                documents.append(document)
            return documents
        return [Document.langchain2doc(lc_document) for lc_document in documents]

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_PAGE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        return DocumentType.PDF
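Because each page becomes its own Document with a page entry in the metadata, CHUNK_BY_PAGE stays meaningful downstream. A usage sketch, not part of this commit; the path is a placeholder.

from dbgpt.rag.knowledge.pdf import PDFKnowledge

knowledge = PDFKnowledge(file_path="path/to/document.pdf")
for doc in knowledge.load():
    # metadata carries the originating file and the zero-based page number.
    print(doc.metadata["page"], len(doc.content))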
72
dbgpt/rag/knowledge/pptx.py
Normal file
@@ -0,0 +1,72 @@
from typing import Optional, Any, List

from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
    Knowledge,
    KnowledgeType,
    ChunkStrategy,
    DocumentType,
)


class PPTXKnowledge(Knowledge):
    """PPTX Knowledge"""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
        loader: Optional[Any] = None,
        language: Optional[str] = "zh",
        **kwargs: Any,
    ) -> None:
        """Initialize with PPTX Knowledge arguments.
        Args:
            file_path:(Optional[str]) file path
            knowledge_type:(KnowledgeType) knowledge type
            loader:(Optional[Any]) loader
        """
        self._path = file_path
        self._type = knowledge_type
        self._loader = loader
        self._language = language

    def _load(self) -> List[Document]:
        """Load pptx document from loader"""
        if self._loader:
            documents = self._loader.load()
        else:
            from pptx import Presentation

            pr = Presentation(self._path)
            docs = []
            # One Document per slide, concatenating the text of every shape.
            for slide in pr.slides:
                content = ""
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text:
                        content += shape.text
                docs.append(
                    Document(content=content, metadata={"source": slide.slide_id})
                )
            return docs
        return [Document.langchain2doc(lc_document) for lc_document in documents]

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_PAGE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        return DocumentType.PPTX
48
dbgpt/rag/knowledge/string.py
Normal file
@@ -0,0 +1,48 @@
from typing import Optional, Any, List

from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import KnowledgeType, Knowledge, ChunkStrategy


class StringKnowledge(Knowledge):
    """String Knowledge"""

    def __init__(
        self,
        text: str = None,
        knowledge_type: KnowledgeType = KnowledgeType.TEXT,
        encoding: Optional[str] = "utf-8",
        loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Knowledge arguments.
        Args:
            text:(str) raw text
            knowledge_type:(KnowledgeType) knowledge type
            encoding:(Optional[str]) text encoding
            loader:(Optional[Any]) loader
        """
        self._text = text
        self._type = knowledge_type
        self._loader = loader
        self._encoding = encoding

    def _load(self) -> List[Document]:
        """Load raw text as a single document"""
        metadata = {"source": "raw text"}
        docs = [Document(content=self._text, metadata=metadata)]
        return docs

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls):
        return KnowledgeType.TEXT
0
dbgpt/rag/knowledge/tests/__init__.py
Normal file
31
dbgpt/rag/knowledge/tests/test_csv.py
Normal file
@@ -0,0 +1,31 @@
import pytest
from unittest.mock import MagicMock, mock_open, patch

from dbgpt.rag.knowledge.csv import CSVKnowledge

MOCK_CSV_DATA = "id,name,age\n1,John Doe,30\n2,Jane Smith,25\n3,Bob Johnson,40"


@pytest.fixture
def mock_file_open():
    with patch("builtins.open", mock_open(read_data=MOCK_CSV_DATA)) as mock_file:
        yield mock_file


@pytest.fixture
def mock_csv_dict_reader():
    with patch("csv.DictReader", MagicMock()) as mock_csv:
        mock_csv.return_value = iter(
            [
                {"id": "1", "name": "John Doe", "age": "30"},
                {"id": "2", "name": "Jane Smith", "age": "25"},
                {"id": "3", "name": "Bob Johnson", "age": "40"},
            ]
        )
        yield mock_csv


def test_load_from_csv(mock_file_open, mock_csv_dict_reader):
    knowledge = CSVKnowledge(file_path="test_data.csv", source_column="name")
    documents = knowledge._load()
    assert len(documents) == 3
28
dbgpt/rag/knowledge/tests/test_docx.py
Normal file
@@ -0,0 +1,28 @@
import pytest
from unittest.mock import MagicMock, patch

from dbgpt.rag.knowledge.docx import DocxKnowledge


@pytest.fixture
def mock_docx_document():
    mock_document = MagicMock()
    mock_document.paragraphs = [
        MagicMock(text="This is the first paragraph."),
        MagicMock(text="This is the second paragraph."),
    ]
    with patch("docx.Document", return_value=mock_document):
        yield mock_document


def test_load_from_docx(mock_docx_document):
    file_path = "test_document.docx"
    knowledge = DocxKnowledge(file_path=file_path)
    documents = knowledge._load()

    assert len(documents) == 1
    assert (
        documents[0].content
        == "This is the first paragraph.\nThis is the second paragraph."
    )
    assert documents[0].metadata["source"] == file_path
45
dbgpt/rag/knowledge/tests/test_html.py
Normal file
@@ -0,0 +1,45 @@
import pytest
from unittest.mock import mock_open, patch

from dbgpt.rag.knowledge.html import HTMLKnowledge

MOCK_HTML_CONTENT = b"""
<html>
<head>
<title>Test HTML</title>
</head>
<body>
<p>This is a paragraph.</p>
</body>
</html>
"""

MOCK_CHARDET_RESULT = {"encoding": "utf-8", "confidence": 0.99}


@pytest.fixture
def mock_file_open():
    with patch(
        "builtins.open", mock_open(read_data=MOCK_HTML_CONTENT), create=True
    ) as mock_file:
        yield mock_file


@pytest.fixture
def mock_chardet_detect():
    with patch("chardet.detect", return_value=MOCK_CHARDET_RESULT) as mock_detect:
        yield mock_detect


def test_load_from_html(mock_file_open, mock_chardet_detect):
    file_path = "test_document.html"
    knowledge = HTMLKnowledge(file_path=file_path)
    documents = knowledge._load()

    assert len(documents) == 1
    assert "This is a paragraph." in documents[0].content
    assert documents[0].metadata["source"] == file_path

    mock_file_open.assert_called_once_with(file_path, "rb")
    mock_chardet_detect.assert_called_once()
28
dbgpt/rag/knowledge/tests/test_markdown.py
Normal file
@@ -0,0 +1,28 @@
import pytest
from unittest.mock import mock_open, patch

from dbgpt.rag.knowledge.markdown import MarkdownKnowledge

MOCK_MARKDOWN_DATA = """# Header 1
This is some text under header 1.

## Header 2
This is some text under header 2.
"""


@pytest.fixture
def mock_file_open():
    with patch("builtins.open", mock_open(read_data=MOCK_MARKDOWN_DATA)) as mock_file:
        yield mock_file


# Define the test function
def test_load_from_markdown(mock_file_open):
    file_path = "test_document.md"
    knowledge = MarkdownKnowledge(file_path=file_path)
    documents = knowledge._load()

    assert len(documents) == 1
    assert documents[0].content == MOCK_MARKDOWN_DATA
    assert documents[0].metadata["source"] == file_path
36
dbgpt/rag/knowledge/tests/test_pdf.py
Normal file
@@ -0,0 +1,36 @@
import pytest
from unittest.mock import MagicMock, patch, mock_open

from dbgpt.rag.knowledge.pdf import PDFKnowledge

MOCK_PDF_PAGES = [
    ("This is the content of the first page.", 0),
    ("This is the content of the second page.", 1),
]


@pytest.fixture
def mock_pdf_open_and_reader():
    mock_pdf_file = mock_open()
    mock_reader = MagicMock()
    mock_reader.pages = [
        MagicMock(extract_text=MagicMock(return_value=page[0]))
        for page in MOCK_PDF_PAGES
    ]
    with patch("builtins.open", mock_pdf_file):
        with patch("pypdf.PdfReader", return_value=mock_reader) as mock:
            yield mock


def test_load_from_pdf(mock_pdf_open_and_reader):
    file_path = "test_document.pdf"
    knowledge = PDFKnowledge(file_path=file_path)
    documents = knowledge._load()

    assert len(documents) == len(MOCK_PDF_PAGES)
    for i, document in enumerate(documents):
        assert MOCK_PDF_PAGES[i][0] in document.content
        assert document.metadata["source"] == file_path
        assert document.metadata["page"] == MOCK_PDF_PAGES[i][1]
37
dbgpt/rag/knowledge/tests/test_txt.py
Normal file
@@ -0,0 +1,37 @@
import pytest
from unittest.mock import mock_open, patch

from dbgpt.rag.knowledge.txt import TXTKnowledge

MOCK_TXT_CONTENT = b"Sample text content for testing.\nAnother line of text."

MOCK_CHARDET_RESULT = {"encoding": "utf-8", "confidence": 0.99}


@pytest.fixture
def mock_file_open():
    with patch(
        "builtins.open", mock_open(read_data=MOCK_TXT_CONTENT), create=True
    ) as mock_file:
        yield mock_file


@pytest.fixture
def mock_chardet_detect():
    with patch("chardet.detect", return_value=MOCK_CHARDET_RESULT) as mock_detect:
        yield mock_detect


# Define the test function
def test_load_from_txt(mock_file_open, mock_chardet_detect):
    file_path = "test_document.txt"
    knowledge = TXTKnowledge(file_path=file_path)
    documents = knowledge._load()

    assert len(documents) == 1
    assert "Sample text content for testing." in documents[0].content
    assert documents[0].metadata["source"] == file_path

    mock_file_open.assert_called_once_with(file_path, "rb")
    mock_chardet_detect.assert_called_once()
68
dbgpt/rag/knowledge/txt.py
Normal file
@@ -0,0 +1,68 @@
from typing import Optional, Any, List

import chardet

from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
    Knowledge,
    KnowledgeType,
    ChunkStrategy,
    DocumentType,
)


class TXTKnowledge(Knowledge):
    """TXT Knowledge"""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
        loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Knowledge arguments.
        Args:
            file_path:(Optional[str]) file path
            knowledge_type:(KnowledgeType) knowledge type
            loader:(Optional[Any]) loader
        """
        self._path = file_path
        self._type = knowledge_type
        self._loader = loader

    def _load(self) -> List[Document]:
        """Load txt document from loader"""
        if self._loader:
            documents = self._loader.load()
        else:
            with open(self._path, "rb") as f:
                raw_text = f.read()
                result = chardet.detect(raw_text)
                if result["encoding"] is None:
                    text = raw_text.decode("utf-8")
                else:
                    text = raw_text.decode(result["encoding"])
            metadata = {"source": self._path}
            return [Document(content=text, metadata=metadata)]

        return [Document.langchain2doc(lc_document) for lc_document in documents]

    @classmethod
    def support_chunk_strategy(cls):
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        return DocumentType.TXT
55
dbgpt/rag/knowledge/url.py
Normal file
@@ -0,0 +1,55 @@
from typing import Optional, Any, List

from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import KnowledgeType, Knowledge, ChunkStrategy


class URLKnowledge(Knowledge):
    """URL Knowledge"""

    def __init__(
        self,
        url: Optional[str] = None,
        knowledge_type: KnowledgeType = KnowledgeType.URL,
        source_column: Optional[str] = None,
        encoding: Optional[str] = "utf-8",
        loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Knowledge arguments.
        Args:
            url:(Optional[str]) url
            knowledge_type:(KnowledgeType) knowledge type
            source_column:(Optional[str]) source column
            encoding:(Optional[str]) page encoding
            loader:(Optional[Any]) loader
        """
        self._path = url
        self._type = knowledge_type
        self._loader = loader
        self._encoding = encoding
        self._source_column = source_column

    def _load(self) -> List[Document]:
        """Fetch URL document from loader"""
        if self._loader:
            documents = self._loader.load()
        else:
            from langchain.document_loaders import WebBaseLoader

            web_reader = WebBaseLoader(web_path=self._path)
            documents = web_reader.load()
        return [Document.langchain2doc(lc_document) for lc_document in documents]

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls):
        return KnowledgeType.URL
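URL sources delegate fetching to langchain's WebBaseLoader and convert the result with Document.langchain2doc. A usage sketch, not part of this commit; the URL is a placeholder, and langchain must be installed for the fetch to work.

from dbgpt.rag.knowledge.url import URLKnowledge

knowledge = URLKnowledge(url="https://www.baidu.com")
documents = knowledge.load()  # fetches the page over HTTP via WebBaseLoader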