[Feature] ChatKnowledge Support Excel Document (#1524)

Co-authored-by: 无剑 <zhuzhenchao@changeme.com>
2025-07-22 20:01:46 +00:00 · 2024-05-16 16:23:51 +08:00 · 2024-05-16 16:23:51 +08:00 · a7d62eaeb5
commit a7d62eaeb5
parent a9087c3853
4 changed files with 118 additions and 0 deletions
--- a/dbgpt/rag/knowledge/init.py
+++ b/dbgpt/rag/knowledge/init.py
@ -27,6 +27,7 @@ def __getattr__(name: str):
        "StringKnowledge": "string",
        "TXTKnowledge": "txt",
        "URLKnowledge": "url",
+        "ExcelKnowledge": "xlsx",
    }

    if name in _LIBS:
@ -53,4 +54,5 @@ __all__ = [
    "StringKnowledge",
    "TXTKnowledge",
    "URLKnowledge",
+    "ExcelKnowledge",
 ]
--- a/dbgpt/rag/knowledge/base.py
+++ b/dbgpt/rag/knowledge/base.py
@ -26,6 +26,7 @@ class DocumentType(Enum):
    TXT = "txt"
    HTML = "html"
    DATASOURCE = "datasource"
+    EXCEL = "xlsx"


 class KnowledgeType(Enum):
--- a/dbgpt/rag/knowledge/excel.py
+++ b/dbgpt/rag/knowledge/excel.py
@ -0,0 +1,114 @@
+"""Excel Knowledge."""
+from typing import Any, Dict, List, Optional, Union
+
+import pandas as pd
+
+from dbgpt.core import Document
+from dbgpt.rag.knowledge.base import (
+    ChunkStrategy,
+    DocumentType,
+    Knowledge,
+    KnowledgeType,
+)
+
+
+class ExcelKnowledge(Knowledge):
+    """Excel Knowledge."""
+
+    def __init__(
+        self,
+        file_path: Optional[str] = None,
+        knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
+        source_column: Optional[str] = None,
+        encoding: Optional[str] = "utf-8",
+        loader: Optional[Any] = None,
+        metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Create xlsx Knowledge with Knowledge arguments.
+
+        Args:
+            file_path(str,  optional): file path
+            knowledge_type(KnowledgeType, optional): knowledge type
+            source_column(str, optional): source column
+            encoding(str, optional): csv encoding
+            loader(Any, optional): loader
+        """
+        super().__init__(
+            path=file_path,
+            knowledge_type=knowledge_type,
+            data_loader=loader,
+            metadata=metadata,
+            **kwargs,
+        )
+        self._encoding = encoding
+        self._source_column = source_column
+
+    def _load(self) -> List[Document]:
+        """Load csv document from loader."""
+        if self._loader:
+            documents = self._loader.load()
+        else:
+            docs = []
+            if not self._path:
+                raise ValueError("file path is required")
+
+            excel_file = pd.ExcelFile(self._path)
+            sheet_names = excel_file.sheet_names
+            for sheet_name in sheet_names:
+                df = excel_file.parse(sheet_name)
+                for index, row in df.iterrows():
+                    strs = []
+                    for column_name, column_value in row.items():
+                        if column_name is None or column_value is None:
+                            continue
+
+                        column_name = str(column_name)
+                        column_value = str(column_value)
+                        strs.append(f"{column_name.strip()}: {column_value.strip()}")
+
+                    content = "\n".join(strs)
+                    try:
+                        source = (
+                            row[self._source_column]
+                            if self._source_column is not None
+                            else self._path
+                        )
+                    except KeyError:
+                        raise ValueError(
+                            f"Source column '{self._source_column}' not in CSV "
+                            f"file."
+                        )
+
+                    metadata = {"source": source, "row": index}
+                    if self._metadata:
+                        metadata.update(self._metadata)  # type: ignore
+                    doc = Document(content=content, metadata=metadata)
+                    docs.append(doc)
+
+            return docs
+
+        return [Document.langchain2doc(lc_document) for lc_document in documents]
+
+    @classmethod
+    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
+        """Return support chunk strategy."""
+        return [
+            ChunkStrategy.CHUNK_BY_SIZE,
+            ChunkStrategy.CHUNK_BY_SEPARATOR,
+        ]
+
+    @classmethod
+    def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
+        return ChunkStrategy.CHUNK_BY_SIZE
+
+    @classmethod
+    def type(cls) -> KnowledgeType:
+        """Knowledge type of CSV."""
+        return KnowledgeType.DOCUMENT
+
+    @classmethod
+    def document_type(cls) -> DocumentType:
+        """Return document type."""
+        return DocumentType.EXCEL
--- a/dbgpt/rag/knowledge/factory.py
+++ b/dbgpt/rag/knowledge/factory.py
@ -170,6 +170,7 @@ class KnowledgeFactory:
        from dbgpt.rag.knowledge.csv import CSVKnowledge  # noqa: F401
        from dbgpt.rag.knowledge.datasource import DatasourceKnowledge  # noqa: F401
        from dbgpt.rag.knowledge.docx import DocxKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.excel import ExcelKnowledge  # noqa: F401
        from dbgpt.rag.knowledge.html import HTMLKnowledge  # noqa: F401
        from dbgpt.rag.knowledge.markdown import MarkdownKnowledge  # noqa: F401
        from dbgpt.rag.knowledge.pdf import PDFKnowledge  # noqa: F401