mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-08-04 01:50:08 +00:00
183 lines
6.5 KiB
Python
183 lines
6.5 KiB
Python
"""Knowledge Factory to create knowledge from file path and url."""
|
|
from typing import Dict, List, Optional, Type, Union
|
|
|
|
from dbgpt.rag.knowledge.base import Knowledge, KnowledgeType
|
|
from dbgpt.rag.knowledge.string import StringKnowledge
|
|
from dbgpt.rag.knowledge.url import URLKnowledge
|
|
|
|
|
|
class KnowledgeFactory:
|
|
"""Knowledge Factory to create knowledge from file path and url."""
|
|
|
|
def __init__(
|
|
self,
|
|
file_path: Optional[str] = None,
|
|
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
|
|
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
|
):
|
|
"""Create Knowledge Factory with file path and knowledge type.
|
|
|
|
Args:
|
|
file_path(str, optional): file path
|
|
knowledge_type(KnowledgeType, optional): knowledge type
|
|
"""
|
|
self._file_path = file_path
|
|
self._knowledge_type = knowledge_type
|
|
self._metadata = metadata
|
|
|
|
@classmethod
|
|
def create(
|
|
cls,
|
|
datasource: str = "",
|
|
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
|
|
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
|
):
|
|
"""Create knowledge from file path, url or text.
|
|
|
|
Args:
|
|
datasource: path of the file to convert
|
|
knowledge_type: type of knowledge
|
|
metadata: Optional[Dict[str, Union[str, List[str]]]]
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from dbgpt.rag.knowledge.factory import KnowledgeFactory
|
|
|
|
url_knowlege = KnowledgeFactory.create(
|
|
datasource="https://www.baidu.com", knowledge_type=KnowledgeType.URL
|
|
)
|
|
doc_knowlege = KnowledgeFactory.create(
|
|
datasource="path/to/document.pdf",
|
|
knowledge_type=KnowledgeType.DOCUMENT,
|
|
)
|
|
|
|
"""
|
|
match knowledge_type:
|
|
case KnowledgeType.DOCUMENT:
|
|
return cls.from_file_path(
|
|
file_path=datasource,
|
|
knowledge_type=knowledge_type,
|
|
metadata=metadata,
|
|
)
|
|
case KnowledgeType.URL:
|
|
return cls.from_url(url=datasource, knowledge_type=knowledge_type)
|
|
case KnowledgeType.TEXT:
|
|
return cls.from_text(
|
|
text=datasource, knowledge_type=knowledge_type, metadata=metadata
|
|
)
|
|
case _:
|
|
raise Exception(f"Unsupported knowledge type '{knowledge_type}'")
|
|
|
|
@classmethod
|
|
def from_file_path(
|
|
cls,
|
|
file_path: str = "",
|
|
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
|
|
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
|
) -> Knowledge:
|
|
"""Create knowledge from path.
|
|
|
|
Args:
|
|
param file_path: path of the file to convert
|
|
param knowledge_type: type of knowledge
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from dbgpt.rag.knowledge.factory import KnowledgeFactory
|
|
|
|
doc_knowlege = KnowledgeFactory.create(
|
|
datasource="path/to/document.pdf",
|
|
knowledge_type=KnowledgeType.DOCUMENT,
|
|
)
|
|
|
|
"""
|
|
factory = cls(file_path=file_path, knowledge_type=knowledge_type)
|
|
return factory._select_document_knowledge(
|
|
file_path=file_path, knowledge_type=knowledge_type, metadata=metadata
|
|
)
|
|
|
|
@staticmethod
|
|
def from_url(
|
|
url: str = "",
|
|
knowledge_type: KnowledgeType = KnowledgeType.URL,
|
|
) -> Knowledge:
|
|
"""Create knowledge from url.
|
|
|
|
Args:
|
|
param url: url of the file to convert
|
|
param knowledge_type: type of knowledge
|
|
|
|
Examples:
|
|
.. code-block:: python
|
|
|
|
from dbgpt.rag.knowledge.factory import KnowledgeFactory
|
|
|
|
url_knowlege = KnowledgeFactory.create(
|
|
datasource="https://www.baidu.com", knowledge_type=KnowledgeType.URL
|
|
)
|
|
"""
|
|
return URLKnowledge(
|
|
url=url,
|
|
knowledge_type=knowledge_type,
|
|
)
|
|
|
|
@staticmethod
|
|
def from_text(
|
|
text: str = "",
|
|
knowledge_type: KnowledgeType = KnowledgeType.TEXT,
|
|
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
|
) -> Knowledge:
|
|
"""Create knowledge from text.
|
|
|
|
Args:
|
|
param text: text to convert
|
|
param knowledge_type: type of knowledge
|
|
"""
|
|
return StringKnowledge(
|
|
text=text,
|
|
knowledge_type=knowledge_type,
|
|
metadata=metadata,
|
|
)
|
|
|
|
def _select_document_knowledge(self, **kwargs):
|
|
"""Select document knowledge from file path."""
|
|
extension = self._file_path.rsplit(".", 1)[-1]
|
|
knowledge_classes = self._get_knowledge_subclasses()
|
|
implementation = None
|
|
for cls in knowledge_classes:
|
|
if cls.document_type() and cls.document_type().value == extension:
|
|
implementation = cls(**kwargs)
|
|
if implementation is None:
|
|
raise Exception(f"Unsupported knowledge document type '{extension}'")
|
|
return implementation
|
|
|
|
@classmethod
|
|
def all_types(cls):
|
|
"""Get all knowledge types."""
|
|
return [knowledge.type().value for knowledge in cls._get_knowledge_subclasses()]
|
|
|
|
@classmethod
|
|
def subclasses(cls) -> List["Type[Knowledge]"]:
|
|
"""Get all knowledge subclasses."""
|
|
return cls._get_knowledge_subclasses()
|
|
|
|
@staticmethod
|
|
def _get_knowledge_subclasses() -> List["Type[Knowledge]"]:
|
|
"""Get all knowledge subclasses."""
|
|
from dbgpt.rag.knowledge.base import Knowledge # noqa: F401
|
|
from dbgpt.rag.knowledge.csv import CSVKnowledge # noqa: F401
|
|
from dbgpt.rag.knowledge.datasource import DatasourceKnowledge # noqa: F401
|
|
from dbgpt.rag.knowledge.docx import DocxKnowledge # noqa: F401
|
|
from dbgpt.rag.knowledge.excel import ExcelKnowledge # noqa: F401
|
|
from dbgpt.rag.knowledge.html import HTMLKnowledge # noqa: F401
|
|
from dbgpt.rag.knowledge.markdown import MarkdownKnowledge # noqa: F401
|
|
from dbgpt.rag.knowledge.pdf import PDFKnowledge # noqa: F401
|
|
from dbgpt.rag.knowledge.pptx import PPTXKnowledge # noqa: F401
|
|
from dbgpt.rag.knowledge.string import StringKnowledge # noqa: F401
|
|
from dbgpt.rag.knowledge.txt import TXTKnowledge # noqa: F401
|
|
from dbgpt.rag.knowledge.url import URLKnowledge # noqa: F401
|
|
|
|
return Knowledge.__subclasses__()
|