refactor: RAG Refactor (#985)

Co-authored-by: Aralhi <xiaoping0501@gmail.com>
Co-authored-by: csunny <cfqsunny@163.com>
This commit is contained in:
Aries-ckt
2024-01-03 09:45:26 +08:00
committed by GitHub
parent 90775aad50
commit 9ad70a2961
206 changed files with 5766 additions and 2419 deletions

View File

@@ -0,0 +1,68 @@
from typing import Optional, Any, List
import chardet
from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
Knowledge,
KnowledgeType,
ChunkStrategy,
DocumentType,
)
class TXTKnowledge(Knowledge):
    """Knowledge source backed by a plain-text (TXT) file."""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
        loader: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Knowledge arguments.

        Args:
            file_path:(Optional[str]) path of the text file to read
            knowledge_type:(KnowledgeType) knowledge type
            loader:(Optional[Any]) optional loader object; when truthy, its
                ``load()`` result is used instead of reading ``file_path``
            **kwargs: accepted for call compatibility; not used here
        """
        self._path = file_path
        self._type = knowledge_type
        self._loader = loader

    def _load(self) -> List[Document]:
        """Load the txt document, from the loader if set, else from the file.

        Returns:
            List[Document]: the loader's items converted with
            ``Document.langchain2doc``, or a single ``Document`` holding the
            decoded file content with ``{"source": file_path}`` metadata.

        Raises:
            ValueError: if neither a loader nor a file path was provided.
        """
        if self._loader:
            documents = self._loader.load()
            return [Document.langchain2doc(lc_document) for lc_document in documents]

        if self._path is None:
            # open(None) would fail with an opaque TypeError; say what is wrong.
            raise ValueError("file_path is required when no loader is provided")

        # Read raw bytes so the encoding can be detected before decoding; the
        # file handle is released before the detection work starts.
        with open(self._path, "rb") as f:
            raw_text = f.read()

        text = self._decode_txt_bytes(raw_text)
        return [Document(content=text, metadata={"source": self._path})]

    @staticmethod
    def _decode_txt_bytes(raw_text: bytes) -> str:
        """Decode bytes with the chardet-detected encoding, else UTF-8.

        If the detected encoding cannot decode the data (or names a codec
        Python does not provide), strict UTF-8 is tried before giving up; when
        that also fails the original error is re-raised.
        """
        encoding = chardet.detect(raw_text)["encoding"]
        if encoding is None:
            return raw_text.decode("utf-8")
        try:
            return raw_text.decode(encoding)
        except (UnicodeDecodeError, LookupError) as detect_err:
            # Detection is only a guess; a strict UTF-8 decode that succeeds
            # is a safe recovery, otherwise surface the original failure.
            try:
                return raw_text.decode("utf-8")
            except UnicodeDecodeError:
                raise detect_err from None

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        """Return the chunk strategies supported for TXT knowledge."""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        """Return the default chunk strategy (chunk by size)."""
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        """Return the knowledge type (``KnowledgeType.DOCUMENT``)."""
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        """Return the document type (``DocumentType.TXT``)."""
        return DocumentType.TXT