DB-GPT/dbgpt/rag/knowledge/factory.py
tonyzhu a7d62eaeb5
[Feature] ChatKnowledge Support Excel Document (#1524)
Co-authored-by: 无剑 <zhuzhenchao@changeme.com>
2024-05-16 16:23:51 +08:00

183 lines
6.5 KiB
Python

"""Knowledge Factory to create knowledge from file path and url."""
from typing import Dict, List, Optional, Type, Union
from dbgpt.rag.knowledge.base import Knowledge, KnowledgeType
from dbgpt.rag.knowledge.string import StringKnowledge
from dbgpt.rag.knowledge.url import URLKnowledge
class KnowledgeFactory:
    """Knowledge Factory to create knowledge from file path, url or text."""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
        metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
    ):
        """Create Knowledge Factory with file path and knowledge type.

        Args:
            file_path(str, optional): file path
            knowledge_type(KnowledgeType, optional): knowledge type
            metadata(dict, optional): metadata kept on the factory instance
        """
        self._file_path = file_path
        self._knowledge_type = knowledge_type
        self._metadata = metadata

    @classmethod
    def create(
        cls,
        datasource: str = "",
        knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
        metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
    ) -> Knowledge:
        """Create knowledge from file path, url or text.

        Dispatches on ``knowledge_type`` to ``from_file_path``, ``from_url`` or
        ``from_text``.

        Args:
            datasource: file path, url or raw text, depending on knowledge_type
            knowledge_type: type of knowledge
            metadata: optional metadata forwarded for DOCUMENT and TEXT
                knowledge; it is not forwarded for URL knowledge

        Returns:
            Knowledge: the knowledge object built by the selected constructor

        Raises:
            ValueError: if knowledge_type is not DOCUMENT, URL or TEXT

        Examples:
            .. code-block:: python

                from dbgpt.rag.knowledge.factory import KnowledgeFactory

                url_knowledge = KnowledgeFactory.create(
                    datasource="https://www.baidu.com",
                    knowledge_type=KnowledgeType.URL,
                )
                doc_knowledge = KnowledgeFactory.create(
                    datasource="path/to/document.pdf",
                    knowledge_type=KnowledgeType.DOCUMENT,
                )
        """
        if knowledge_type == KnowledgeType.DOCUMENT:
            return cls.from_file_path(
                file_path=datasource,
                knowledge_type=knowledge_type,
                metadata=metadata,
            )
        if knowledge_type == KnowledgeType.URL:
            # from_url takes no metadata parameter, so metadata is dropped here.
            return cls.from_url(url=datasource, knowledge_type=knowledge_type)
        if knowledge_type == KnowledgeType.TEXT:
            return cls.from_text(
                text=datasource, knowledge_type=knowledge_type, metadata=metadata
            )
        raise ValueError(f"Unsupported knowledge type '{knowledge_type}'")

    @classmethod
    def from_file_path(
        cls,
        file_path: str = "",
        knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
        metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
    ) -> Knowledge:
        """Create knowledge from path.

        The concrete Knowledge class is chosen from the extension of
        ``file_path``.

        Args:
            file_path: path of the file to convert
            knowledge_type: type of knowledge
            metadata: optional metadata passed to the knowledge constructor

        Returns:
            Knowledge: instance of the class registered for the extension

        Raises:
            ValueError: if no knowledge class handles the file extension

        Examples:
            .. code-block:: python

                from dbgpt.rag.knowledge.factory import KnowledgeFactory

                doc_knowledge = KnowledgeFactory.from_file_path(
                    file_path="path/to/document.pdf",
                    knowledge_type=KnowledgeType.DOCUMENT,
                )
        """
        factory = cls(file_path=file_path, knowledge_type=knowledge_type)
        return factory._select_document_knowledge(
            file_path=file_path, knowledge_type=knowledge_type, metadata=metadata
        )

    @staticmethod
    def from_url(
        url: str = "",
        knowledge_type: KnowledgeType = KnowledgeType.URL,
    ) -> Knowledge:
        """Create knowledge from url.

        Args:
            url: url of the page to convert
            knowledge_type: type of knowledge

        Returns:
            Knowledge: a URLKnowledge built from the url

        Examples:
            .. code-block:: python

                from dbgpt.rag.knowledge.factory import KnowledgeFactory

                url_knowledge = KnowledgeFactory.from_url(
                    url="https://www.baidu.com",
                    knowledge_type=KnowledgeType.URL,
                )
        """
        return URLKnowledge(
            url=url,
            knowledge_type=knowledge_type,
        )

    @staticmethod
    def from_text(
        text: str = "",
        knowledge_type: KnowledgeType = KnowledgeType.TEXT,
        metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
    ) -> Knowledge:
        """Create knowledge from text.

        Args:
            text: text to convert
            knowledge_type: type of knowledge
            metadata: optional metadata passed to the knowledge constructor

        Returns:
            Knowledge: a StringKnowledge wrapping the text
        """
        return StringKnowledge(
            text=text,
            knowledge_type=knowledge_type,
            metadata=metadata,
        )

    def _select_document_knowledge(self, **kwargs) -> Knowledge:
        """Select document knowledge from file path.

        The text after the last "." of the factory's file path is compared,
        ignoring case, with the ``document_type().value`` of every known
        Knowledge subclass. The matching class is instantiated with ``kwargs``.

        Raises:
            ValueError: if the file path is missing or no class matches
        """
        # A missing path is treated like an empty one so the caller gets the
        # "unsupported" error below instead of an AttributeError on None.
        extension = (self._file_path or "").rsplit(".", 1)[-1]
        wanted = extension.lower()

        selected = None
        for knowledge_cls in self._get_knowledge_subclasses():
            doc_type = knowledge_cls.document_type()
            if doc_type and str(doc_type.value).lower() == wanted:
                # Remember the class only; if several match, the last one wins.
                selected = knowledge_cls

        if selected is None:
            raise ValueError(f"Unsupported knowledge document type '{extension}'")
        # Instantiate once, after selection, rather than once per match.
        return selected(**kwargs)

    @classmethod
    def all_types(cls):
        """Get the ``type().value`` of every known knowledge subclass."""
        return [
            knowledge.type().value for knowledge in cls._get_knowledge_subclasses()
        ]

    @classmethod
    def subclasses(cls) -> List["Type[Knowledge]"]:
        """Get all knowledge subclasses."""
        return cls._get_knowledge_subclasses()

    @staticmethod
    def _get_knowledge_subclasses() -> List["Type[Knowledge]"]:
        """Get all direct subclasses of Knowledge.

        The imports below are deliberate: a class is only listed by
        ``__subclasses__()`` once the module defining it has been imported.
        """
        from dbgpt.rag.knowledge.base import Knowledge  # noqa: F401
        from dbgpt.rag.knowledge.csv import CSVKnowledge  # noqa: F401
        from dbgpt.rag.knowledge.datasource import DatasourceKnowledge  # noqa: F401
        from dbgpt.rag.knowledge.docx import DocxKnowledge  # noqa: F401
        from dbgpt.rag.knowledge.excel import ExcelKnowledge  # noqa: F401
        from dbgpt.rag.knowledge.html import HTMLKnowledge  # noqa: F401
        from dbgpt.rag.knowledge.markdown import MarkdownKnowledge  # noqa: F401
        from dbgpt.rag.knowledge.pdf import PDFKnowledge  # noqa: F401
        from dbgpt.rag.knowledge.pptx import PPTXKnowledge  # noqa: F401
        from dbgpt.rag.knowledge.string import StringKnowledge  # noqa: F401
        from dbgpt.rag.knowledge.txt import TXTKnowledge  # noqa: F401
        from dbgpt.rag.knowledge.url import URLKnowledge  # noqa: F401

        # __subclasses__() returns direct subclasses only.
        return Knowledge.__subclasses__()