diff --git a/dbgpt/rag/knowledge/__init__.py b/dbgpt/rag/knowledge/__init__.py
index 0208cdf6b..5d1aff23f 100644
--- a/dbgpt/rag/knowledge/__init__.py
+++ b/dbgpt/rag/knowledge/__init__.py
@@ -27,6 +27,7 @@ def __getattr__(name: str):
         "StringKnowledge": "string",
         "TXTKnowledge": "txt",
         "URLKnowledge": "url",
+        "ExcelKnowledge": "xlsx",
     }
 
     if name in _LIBS:
@@ -53,4 +54,5 @@ __all__ = [
     "StringKnowledge",
     "TXTKnowledge",
     "URLKnowledge",
+    "ExcelKnowledge",
 ]
diff --git a/dbgpt/rag/knowledge/base.py b/dbgpt/rag/knowledge/base.py
index 50355e25d..892c2db4f 100644
--- a/dbgpt/rag/knowledge/base.py
+++ b/dbgpt/rag/knowledge/base.py
@@ -26,6 +26,7 @@ class DocumentType(Enum):
     TXT = "txt"
     HTML = "html"
     DATASOURCE = "datasource"
+    EXCEL = "xlsx"
 
 
 class KnowledgeType(Enum):
diff --git a/dbgpt/rag/knowledge/excel.py b/dbgpt/rag/knowledge/excel.py
new file mode 100644
index 000000000..e5dcacfcb
--- /dev/null
+++ b/dbgpt/rag/knowledge/excel.py
@@ -0,0 +1,126 @@
+"""Excel Knowledge."""
+from typing import Any, Dict, List, Optional, Union
+
+import pandas as pd
+
+from dbgpt.core import Document
+from dbgpt.rag.knowledge.base import (
+    ChunkStrategy,
+    DocumentType,
+    Knowledge,
+    KnowledgeType,
+)
+
+
+class ExcelKnowledge(Knowledge):
+    """Excel Knowledge.
+
+    Each row of each sheet in the workbook becomes one Document whose content is
+    a newline-separated list of ``column: value`` pairs.
+    """
+
+    def __init__(
+        self,
+        file_path: Optional[str] = None,
+        knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
+        source_column: Optional[str] = None,
+        encoding: Optional[str] = "utf-8",
+        loader: Optional[Any] = None,
+        metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Create xlsx Knowledge with Knowledge arguments.
+
+        Args:
+            file_path(str, optional): path of the Excel file
+            knowledge_type(KnowledgeType, optional): knowledge type
+            source_column(str, optional): column whose value becomes the
+                ``source`` metadata of a row; the file path is used when unset
+            encoding(str, optional): stored on the instance; not read by the
+                Excel loading code in this class
+            loader(Any, optional): loader used instead of reading the file
+                with pandas
+            metadata(dict, optional): extra metadata merged into the metadata
+                of every row read by this class
+        """
+        super().__init__(
+            path=file_path,
+            knowledge_type=knowledge_type,
+            data_loader=loader,
+            metadata=metadata,
+            **kwargs,
+        )
+        self._encoding = encoding
+        self._source_column = source_column
+
+    def _load(self) -> List[Document]:
+        """Load Excel documents from the loader or from the file itself.
+
+        Raises:
+            ValueError: if no file path is set, or if ``source_column`` is not
+                a column of a sheet that has rows.
+        """
+        if self._loader:
+            documents = self._loader.load()
+            return [Document.langchain2doc(lc_document) for lc_document in documents]
+
+        if not self._path:
+            raise ValueError("file path is required")
+
+        docs = []
+        # The context manager guarantees the workbook handle is closed, even when a
+        # row fails to convert.
+        with pd.ExcelFile(self._path) as excel_file:
+            for sheet_name in excel_file.sheet_names:
+                df = excel_file.parse(sheet_name)
+                for index, row in df.iterrows():
+                    docs.append(self._row_to_document(index, row))
+        return docs
+
+    def _row_to_document(self, index: Any, row: pd.Series) -> Document:
+        """Convert one sheet row into a Document."""
+        strs = []
+        for column_name, column_value in row.items():
+            # pandas reads empty cells as NaN/NaT rather than None.
+            if column_name is None or pd.isna(column_value):
+                continue
+            strs.append(f"{str(column_name).strip()}: {str(column_value).strip()}")
+        content = "\n".join(strs)
+
+        if self._source_column is None:
+            source = self._path
+        else:
+            try:
+                source = row[self._source_column]
+            except KeyError as err:
+                raise ValueError(
+                    f"Source column '{self._source_column}' not in Excel file."
+                ) from err
+
+        metadata = {"source": source, "row": index}
+        if self._metadata:
+            metadata.update(self._metadata)  # type: ignore
+        return Document(content=content, metadata=metadata)
+
+    @classmethod
+    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
+        """Return support chunk strategy."""
+        return [
+            ChunkStrategy.CHUNK_BY_SIZE,
+            ChunkStrategy.CHUNK_BY_SEPARATOR,
+        ]
+
+    @classmethod
+    def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
+        return ChunkStrategy.CHUNK_BY_SIZE
+
+    @classmethod
+    def type(cls) -> KnowledgeType:
+        """Knowledge type of Excel."""
+        return KnowledgeType.DOCUMENT
+
+    @classmethod
+    def document_type(cls) -> DocumentType:
+        """Return document type."""
+        return DocumentType.EXCEL
diff --git a/dbgpt/rag/knowledge/factory.py b/dbgpt/rag/knowledge/factory.py
index 3a06bbe9f..ef40410f7 100644
--- a/dbgpt/rag/knowledge/factory.py
+++ b/dbgpt/rag/knowledge/factory.py
@@ -170,6 +170,7 @@ class KnowledgeFactory:
         from dbgpt.rag.knowledge.csv import CSVKnowledge  # noqa: F401
         from dbgpt.rag.knowledge.datasource import DatasourceKnowledge  # noqa: F401
         from dbgpt.rag.knowledge.docx import DocxKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.excel import ExcelKnowledge  # noqa: F401
         from dbgpt.rag.knowledge.html import HTMLKnowledge  # noqa: F401
         from dbgpt.rag.knowledge.markdown import MarkdownKnowledge  # noqa: F401
         from dbgpt.rag.knowledge.pdf import PDFKnowledge  # noqa: F401