[Feature] ChatKnowledge Support Excel Document (#1524)

Co-authored-by: 无剑 <zhuzhenchao@changeme.com>
This commit is contained in:
tonyzhu 2024-05-16 16:23:51 +08:00 committed by GitHub
parent a9087c3853
commit a7d62eaeb5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 118 additions and 0 deletions

View File

@ -27,6 +27,7 @@ def __getattr__(name: str):
"StringKnowledge": "string",
"TXTKnowledge": "txt",
"URLKnowledge": "url",
"ExcelKnowledge": "xlsx",
}
if name in _LIBS:
@ -53,4 +54,5 @@ __all__ = [
"StringKnowledge",
"TXTKnowledge",
"URLKnowledge",
"ExcelKnowledge",
]

View File

@ -26,6 +26,7 @@ class DocumentType(Enum):
TXT = "txt"
HTML = "html"
DATASOURCE = "datasource"
EXCEL = "xlsx"
class KnowledgeType(Enum):

View File

@ -0,0 +1,114 @@
"""Excel Knowledge."""
from typing import Any, Dict, List, Optional, Union
import pandas as pd
from dbgpt.core import Document
from dbgpt.rag.knowledge.base import (
ChunkStrategy,
DocumentType,
Knowledge,
KnowledgeType,
)
class ExcelKnowledge(Knowledge):
    """Excel Knowledge.

    Turns every row of every sheet of an Excel workbook into one ``Document``
    whose content is a newline-separated list of ``"<column>: <value>"`` pairs.
    """

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
        source_column: Optional[str] = None,
        encoding: Optional[str] = "utf-8",
        loader: Optional[Any] = None,
        metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
        **kwargs: Any,
    ) -> None:
        """Create xlsx Knowledge with Knowledge arguments.

        Args:
            file_path(str, optional): path of the Excel workbook to load
            knowledge_type(KnowledgeType, optional): knowledge type
            source_column(str, optional): name of the column whose value is
                stored as the ``source`` metadata of each row; when omitted
                the file path is used instead
            encoding(str, optional): stored on the instance only; the Excel
                loading code in this class does not read it
            loader(Any, optional): custom loader; when given, its ``load()``
                result is used instead of parsing the file with pandas
            metadata(dict, optional): extra metadata merged into the metadata
                of every produced document
        """
        super().__init__(
            path=file_path,
            knowledge_type=knowledge_type,
            data_loader=loader,
            metadata=metadata,
            **kwargs,
        )
        self._encoding = encoding
        self._source_column = source_column

    @staticmethod
    def _row_to_content(row: Any) -> str:
        """Render one sheet row as newline-separated ``column: value`` lines.

        Cells without a value are skipped. pandas reports empty cells as
        NaN/NaT rather than ``None``, so ``pd.isna`` is required here; a plain
        ``is None`` test would let the literal text ``"nan"`` into the content.
        """
        lines = []
        for column_name, column_value in row.items():
            if column_name is None or pd.isna(column_value):
                continue
            name = str(column_name).strip()
            value = str(column_value).strip()
            lines.append(f"{name}: {value}")
        return "\n".join(lines)

    def _load(self) -> List[Document]:
        """Load Excel documents from the custom loader or from the file.

        Returns:
            List[Document]: one document per non-empty row of every sheet, or
                the converted documents of the custom loader when one is set.

        Raises:
            ValueError: if no file path is set, or if ``source_column`` is not
                a column of a sheet that contains rows.
        """
        if self._loader:
            # A custom loader is expected to yield LangChain-style documents
            # (they are converted with Document.langchain2doc).
            documents = self._loader.load()
            return [Document.langchain2doc(lc_doc) for lc_doc in documents]

        if not self._path:
            raise ValueError("file path is required")

        docs: List[Document] = []
        # Use the workbook as a context manager so the underlying file handle
        # is released even when parsing a sheet raises.
        with pd.ExcelFile(self._path) as excel_file:
            for sheet_name in excel_file.sheet_names:
                df = excel_file.parse(sheet_name)
                for index, row in df.iterrows():
                    content = self._row_to_content(row)
                    if not content:
                        # A fully blank row carries no knowledge; skip it
                        # instead of emitting an empty document.
                        continue
                    if self._source_column is not None:
                        try:
                            source = row[self._source_column]
                        except KeyError as exc:
                            raise ValueError(
                                f"Source column '{self._source_column}' not in "
                                f"Excel file."
                            ) from exc
                    else:
                        source = self._path
                    # NOTE(review): "row" is the per-sheet row index, so the
                    # same value can occur once for every sheet.
                    metadata: Dict[str, Any] = {"source": source, "row": index}
                    if self._metadata:
                        metadata.update(self._metadata)  # type: ignore
                    docs.append(Document(content=content, metadata=metadata))
        return docs

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        """Return support chunk strategy."""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        """Return default chunk strategy."""
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        """Knowledge type of Excel."""
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        """Return document type."""
        return DocumentType.EXCEL

View File

@ -170,6 +170,7 @@ class KnowledgeFactory:
from dbgpt.rag.knowledge.csv import CSVKnowledge # noqa: F401
from dbgpt.rag.knowledge.datasource import DatasourceKnowledge # noqa: F401
from dbgpt.rag.knowledge.docx import DocxKnowledge # noqa: F401
from dbgpt.rag.knowledge.excel import ExcelKnowledge # noqa: F401
from dbgpt.rag.knowledge.html import HTMLKnowledge # noqa: F401
from dbgpt.rag.knowledge.markdown import MarkdownKnowledge # noqa: F401
from dbgpt.rag.knowledge.pdf import PDFKnowledge # noqa: F401