feat(rag): Support RAG SDK (#1322)

This commit is contained in:
Fangyin Cheng
2024-03-22 15:36:57 +08:00
committed by GitHub
parent e65732d6e4
commit 8a17099dd2
69 changed files with 1332 additions and 558 deletions

View File

@@ -1,23 +1,50 @@
"""Module Of Knowledge."""
from .base import ChunkStrategy, Knowledge, KnowledgeType # noqa: F401
from .csv import CSVKnowledge # noqa: F401
from .docx import DocxKnowledge # noqa: F401
from .factory import KnowledgeFactory # noqa: F401
from .html import HTMLKnowledge # noqa: F401
from .markdown import MarkdownKnowledge # noqa: F401
from .pdf import PDFKnowledge # noqa: F401
from .pptx import PPTXKnowledge # noqa: F401
from .string import StringKnowledge # noqa: F401
from .txt import TXTKnowledge # noqa: F401
from .url import URLKnowledge # noqa: F401
from typing import Any, Dict
__ALL__ = [
_MODULE_CACHE: Dict[str, Any] = {}
def __getattr__(name: str):
    """Resolve a public knowledge name lazily on attribute access.

    The submodule that defines ``name`` is imported only when the attribute
    is requested; the resolved object is stored in ``_MODULE_CACHE`` so later
    requests for the same name return the cached object.

    Args:
        name(str): attribute name requested on this package

    Raises:
        AttributeError: if ``name`` is not one of the lazily exported names
    """
    # Lazy load
    import importlib

    try:
        return _MODULE_CACHE[name]
    except KeyError:
        pass

    # Exported name -> submodule (relative to this package) that defines it.
    submodule_by_name = {
        "KnowledgeFactory": "factory",
        "Knowledge": "base",
        "KnowledgeType": "base",
        "ChunkStrategy": "base",
        "CSVKnowledge": "csv",
        "DatasourceKnowledge": "datasource",
        "DocxKnowledge": "docx",
        "HTMLKnowledge": "html",
        "MarkdownKnowledge": "markdown",
        "PDFKnowledge": "pdf",
        "PPTXKnowledge": "pptx",
        "StringKnowledge": "string",
        "TXTKnowledge": "txt",
        "URLKnowledge": "url",
    }

    if name not in submodule_by_name:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    relative_path = "." + submodule_by_name[name]
    resolved = getattr(importlib.import_module(relative_path, __name__), name)
    _MODULE_CACHE[name] = resolved
    return resolved
__all__ = [
"KnowledgeFactory",
"Knowledge",
"KnowledgeType",
"ChunkStrategy",
"CSVKnowledge",
"DatasourceKnowledge",
"DocxKnowledge",
"HTMLKnowledge",
"MarkdownKnowledge",

View File

@@ -25,6 +25,7 @@ class DocumentType(Enum):
DOCX = "docx"
TXT = "txt"
HTML = "html"
DATASOURCE = "datasource"
class KnowledgeType(Enum):

View File

@@ -0,0 +1,57 @@
"""Datasource Knowledge."""
from typing import Any, List, Optional
from dbgpt.core import Document
from dbgpt.datasource import BaseConnector
from ..summary.rdbms_db_summary import _parse_db_summary
from .base import ChunkStrategy, DocumentType, Knowledge, KnowledgeType
class DatasourceKnowledge(Knowledge):
    """Datasource Knowledge.

    Builds documents from the summaries that ``_parse_db_summary`` produces
    for a datasource connector.
    """

    def __init__(
        self,
        connector: BaseConnector,
        summary_template: str = "{table_name}({columns})",
        knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
        **kwargs: Any,
    ) -> None:
        """Create Datasource Knowledge with Knowledge arguments.

        Args:
            connector(BaseConnector): datasource connector passed to
                ``_parse_db_summary`` when the knowledge is loaded
            summary_template(str, optional): template passed to
                ``_parse_db_summary`` together with the connector
                (presumably filled with the table name and its columns;
                verify against ``_parse_db_summary``)
            knowledge_type(KnowledgeType, optional): knowledge type
            **kwargs(Any): extra arguments forwarded to ``Knowledge.__init__``
        """
        self._connector = connector
        self._summary_template = summary_template
        super().__init__(knowledge_type=knowledge_type, **kwargs)

    def _load(self) -> List[Document]:
        """Load datasource documents from the connector.

        Returns:
            List[Document]: one document per summary returned by
                ``_parse_db_summary``, each tagged with
                ``{"source": "database"}`` metadata
        """
        return [
            Document(content=table_summary, metadata={"source": "database"})
            for table_summary in _parse_db_summary(
                self._connector, self._summary_template
            )
        ]

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        """Return support chunk strategy."""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def type(cls) -> KnowledgeType:
        """Knowledge type of Datasource."""
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        """Return document type."""
        return DocumentType.DATASOURCE

View File

@@ -156,6 +156,7 @@ class KnowledgeFactory:
"""Get all knowledge subclasses."""
from dbgpt.rag.knowledge.base import Knowledge # noqa: F401
from dbgpt.rag.knowledge.csv import CSVKnowledge # noqa: F401
from dbgpt.rag.knowledge.datasource import DatasourceKnowledge # noqa: F401
from dbgpt.rag.knowledge.docx import DocxKnowledge # noqa: F401
from dbgpt.rag.knowledge.html import HTMLKnowledge # noqa: F401
from dbgpt.rag.knowledge.markdown import MarkdownKnowledge # noqa: F401