Mirror of https://github.com/csunny/DB-GPT.git
chore: Add pylint for DB-GPT rag lib (#1267)

dbgpt/rag/knowledge/__init__.py

@@ -0,0 +1,29 @@
+"""Module Of Knowledge."""
+
+from .base import ChunkStrategy, Knowledge, KnowledgeType  # noqa: F401
+from .csv import CSVKnowledge  # noqa: F401
+from .docx import DocxKnowledge  # noqa: F401
+from .factory import KnowledgeFactory  # noqa: F401
+from .html import HTMLKnowledge  # noqa: F401
+from .markdown import MarkdownKnowledge  # noqa: F401
+from .pdf import PDFKnowledge  # noqa: F401
+from .pptx import PPTXKnowledge  # noqa: F401
+from .string import StringKnowledge  # noqa: F401
+from .txt import TXTKnowledge  # noqa: F401
+from .url import URLKnowledge  # noqa: F401
+
+__ALL__ = [
+    "KnowledgeFactory",
+    "Knowledge",
+    "KnowledgeType",
+    "ChunkStrategy",
+    "CSVKnowledge",
+    "DocxKnowledge",
+    "HTMLKnowledge",
+    "MarkdownKnowledge",
+    "PDFKnowledge",
+    "PPTXKnowledge",
+    "StringKnowledge",
+    "TXTKnowledge",
+    "URLKnowledge",
+]
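
The new package __init__ re-exports every knowledge implementation, so callers can import from dbgpt.rag.knowledge directly rather than from each submodule. A minimal sketch of that import style, using only names re-exported above (an illustration, not part of the commit):

# Sketch: importing through the package-level re-exports added above.
# Assumes dbgpt is installed with this module layout.
from dbgpt.rag.knowledge import ChunkStrategy, KnowledgeFactory, KnowledgeType

print(KnowledgeType.DOCUMENT.value)       # "DOCUMENT"
print(ChunkStrategy.CHUNK_BY_SIZE.alias)  # "chunk size", set in base.py below
print(KnowledgeFactory.all_types())       # e.g. ["DOCUMENT", "URL", "TEXT", ...]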

dbgpt/rag/knowledge/base.py

@@ -1,19 +1,23 @@
 """Module for Knowledge Base."""
 
 from abc import ABC, abstractmethod
 from enum import Enum
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Tuple, Type
 
 from dbgpt.rag.chunk import Document
 from dbgpt.rag.text_splitter.text_splitter import (
     CharacterTextSplitter,
     MarkdownHeaderTextSplitter,
     PageTextSplitter,
     ParagraphTextSplitter,
     RecursiveCharacterTextSplitter,
     SeparatorTextSplitter,
     TextSplitter,
 )
 
 
 class DocumentType(Enum):
     """Document Type Enum."""
 
     PDF = "pdf"
     CSV = "csv"
     MARKDOWN = "md"
@@ -24,27 +28,40 @@ class DocumentType(Enum):
 
 
 class KnowledgeType(Enum):
     """Knowledge Type Enum."""
 
     DOCUMENT = "DOCUMENT"
     URL = "URL"
     TEXT = "TEXT"
 
     @property
     def type(self):
         """Get type."""
         return DocumentType
 
     @classmethod
-    def get_by_value(cls, value):
-        """Get Enum member by value"""
+    def get_by_value(cls, value) -> "KnowledgeType":
+        """Get Enum member by value.
+
+        Args:
+            value(any): value
+
+        Returns:
+            KnowledgeType: Enum member
+        """
         for member in cls:
             if member.value == value:
                 return member
         raise ValueError(f"{value} is not a valid value for {cls.__name__}")
 
 
-class ChunkStrategy(Enum):
-    """chunk strategy"""
-
-    CHUNK_BY_SIZE = (
+_STRATEGY_ENUM_TYPE = Tuple[Type[TextSplitter], List, str, str]
+
+
+class ChunkStrategy(Enum):
+    """Chunk Strategy Enum."""
+
+    CHUNK_BY_SIZE: _STRATEGY_ENUM_TYPE = (
         RecursiveCharacterTextSplitter,
         [
             {
@@ -63,8 +80,13 @@ class ChunkStrategy(Enum):
         "chunk size",
         "split document by chunk size",
     )
-    CHUNK_BY_PAGE = (PageTextSplitter, [], "page", "split document by page")
-    CHUNK_BY_PARAGRAPH = (
+    CHUNK_BY_PAGE: _STRATEGY_ENUM_TYPE = (
+        PageTextSplitter,
+        [],
+        "page",
+        "split document by page",
+    )
+    CHUNK_BY_PARAGRAPH: _STRATEGY_ENUM_TYPE = (
         ParagraphTextSplitter,
         [
             {
@@ -77,7 +99,7 @@ class ChunkStrategy(Enum):
         "paragraph",
         "split document by paragraph",
     )
-    CHUNK_BY_SEPARATOR = (
+    CHUNK_BY_SEPARATOR: _STRATEGY_ENUM_TYPE = (
         SeparatorTextSplitter,
         [
             {
@@ -90,13 +112,14 @@ class ChunkStrategy(Enum):
                 "param_name": "enable_merge",
                 "param_type": "boolean",
                 "default_value": False,
-                "description": "Whether to merge according to the chunk_size after splitting by the separator.",
+                "description": "Whether to merge according to the chunk_size after "
+                "splitting by the separator.",
             },
         ],
         "separator",
         "split document by separator",
     )
-    CHUNK_BY_MARKDOWN_HEADER = (
+    CHUNK_BY_MARKDOWN_HEADER: _STRATEGY_ENUM_TYPE = (
         MarkdownHeaderTextSplitter,
         [],
         "markdown header",
@@ -104,24 +127,26 @@ class ChunkStrategy(Enum):
     )
 
     def __init__(self, splitter_class, parameters, alias, description):
+        """Create a new ChunkStrategy with the given splitter_class."""
         self.splitter_class = splitter_class
        self.parameters = parameters
         self.alias = alias
         self.description = description
 
-    def match(self, *args, **kwargs):
+    def match(self, *args, **kwargs) -> TextSplitter:
+        """Match and build splitter."""
         kwargs = {k: v for k, v in kwargs.items() if v is not None}
         return self.value[0](*args, **kwargs)
 
 
 class Knowledge(ABC):
-    type: KnowledgeType = None
+    """Knowledge Base Class."""
 
     def __init__(
         self,
         path: Optional[str] = None,
         knowledge_type: Optional[KnowledgeType] = None,
-        data_loader: Optional = None,
+        data_loader: Optional[Any] = None,
         **kwargs: Any,
     ) -> None:
         """Initialize with Knowledge arguments."""
@@ -130,30 +155,31 @@ class Knowledge(ABC):
         self._data_loader = data_loader
 
     def load(self):
-        """Load knowledge from data_loader"""
+        """Load knowledge from data_loader."""
         documents = self._load()
         return self._postprocess(documents)
 
     @classmethod
     @abstractmethod
     def type(cls) -> KnowledgeType:
-        """Get knowledge type"""
+        """Get knowledge type."""
 
     @classmethod
     def document_type(cls) -> Any:
-        """Get document type"""
+        """Get document type."""
         return None
 
     def _postprocess(self, docs: List[Document]) -> List[Document]:
-        """Post process knowledge from data_loader"""
+        """Post process knowledge from data_loader."""
         return docs
 
     @abstractmethod
     def _load(self):
-        """Preprocess knowledge from data_loader"""
+        """Preprocess knowledge from data_loader."""
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
-        """support chunk strategy"""
+        """Return supported chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_PAGE,
@@ -162,11 +188,11 @@ class Knowledge(ABC):
             ChunkStrategy.CHUNK_BY_SEPARATOR,
         ]
 
-    def default_chunk_strategy(self) -> ChunkStrategy:
-        return ChunkStrategy.CHUNK_BY_SIZE
-
-    def support_chunk_strategy(self):
-        return [
-            ChunkStrategy.CHUNK_BY_SIZE,
-            ChunkStrategy.CHUNK_BY_SEPARATOR,
-        ]
+    @classmethod
+    def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy.
+
+        Returns:
+            ChunkStrategy: default chunk strategy
+        """
+        return ChunkStrategy.CHUNK_BY_SIZE
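
Each ChunkStrategy member stores a (splitter_class, parameters, alias, description) tuple, and match() drops None keyword arguments before instantiating the splitter class; KnowledgeType.get_by_value() resolves a member from its string value. A rough usage sketch based on those definitions; the chunk_size/chunk_overlap keywords forwarded to RecursiveCharacterTextSplitter are assumptions about that splitter's constructor, which this diff does not show:

# Sketch based on the KnowledgeType/ChunkStrategy definitions above.
from dbgpt.rag.knowledge.base import ChunkStrategy, KnowledgeType

# Resolve an enum member from its value; unknown values raise ValueError.
kt = KnowledgeType.get_by_value("DOCUMENT")
assert kt is KnowledgeType.DOCUMENT

# match() filters out None kwargs and calls self.value[0], here
# RecursiveCharacterTextSplitter; the keyword names below are assumptions.
splitter = ChunkStrategy.CHUNK_BY_SIZE.match(chunk_size=512, chunk_overlap=50)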

dbgpt/rag/knowledge/csv.py

@@ -1,3 +1,4 @@
+"""CSV Knowledge."""
 import csv
 from typing import Any, List, Optional
 
@@ -11,7 +12,7 @@ from dbgpt.rag.knowledge.base import (
 
 
 class CSVKnowledge(Knowledge):
-    """CSV Knowledge"""
+    """CSV Knowledge."""
 
     def __init__(
         self,
@@ -22,13 +23,14 @@ class CSVKnowledge(Knowledge):
         loader: Optional[Any] = None,
         **kwargs: Any,
     ) -> None:
-        """Initialize csv with Knowledge arguments.
+        """Create CSV Knowledge with Knowledge arguments.
 
         Args:
-            file_path:(Optional[str]) file path
-            knowledge_type:(KnowledgeType) knowledge type
-            source_column:(Optional[str]) source column
-            encoding:(Optional[str]) csv encoding
-            loader:(Optional[Any]) loader
+            file_path(str, optional): file path
+            knowledge_type(KnowledgeType, optional): knowledge type
+            source_column(str, optional): source column
+            encoding(str, optional): csv encoding
+            loader(Any, optional): loader
         """
         self._path = file_path
         self._type = knowledge_type
@@ -37,11 +39,13 @@ class CSVKnowledge(Knowledge):
         self._source_column = source_column
 
     def _load(self) -> List[Document]:
-        """Load csv document from loader"""
+        """Load csv document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
             docs = []
+            if not self._path:
+                raise ValueError("file path is required")
             with open(self._path, newline="", encoding=self._encoding) as csvfile:
                 csv_reader = csv.DictReader(csvfile)
                 for i, row in enumerate(csv_reader):
@@ -59,7 +63,8 @@ class CSVKnowledge(Knowledge):
                         )
                     except KeyError:
                         raise ValueError(
-                            f"Source column '{self._source_column}' not found in CSV file."
+                            f"Source column '{self._source_column}' not found in CSV "
+                            f"file."
                         )
                     metadata = {"source": source, "row": i}
                     doc = Document(content=content, metadata=metadata)
@@ -70,6 +75,7 @@ class CSVKnowledge(Knowledge):
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
+        """Return support chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_SEPARATOR,
@@ -77,12 +83,15 @@ class CSVKnowledge(Knowledge):
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
         return ChunkStrategy.CHUNK_BY_SIZE
 
     @classmethod
     def type(cls) -> KnowledgeType:
+        """Knowledge type of CSV."""
         return KnowledgeType.DOCUMENT
 
     @classmethod
     def document_type(cls) -> DocumentType:
+        """Return document type."""
         return DocumentType.CSV
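
CSVKnowledge._load() either delegates to an injected loader or reads the file itself with csv.DictReader, emitting one Document per row with source/row metadata. A small usage sketch; the file path and source column below are hypothetical:

# Sketch: one Document per CSV row, per the _load() logic above.
# "data/example.csv" and the "url" source column are hypothetical; if the
# source column is missing, _load() raises ValueError.
from dbgpt.rag.knowledge.csv import CSVKnowledge

knowledge = CSVKnowledge(file_path="data/example.csv", source_column="url")
documents = knowledge.load()
for doc in documents:
    print(doc.metadata)  # {"source": ..., "row": i}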

dbgpt/rag/knowledge/docx.py

@@ -1,3 +1,4 @@
+"""Docx Knowledge."""
 from typing import Any, List, Optional
 
 import docx
@@ -12,7 +13,7 @@ from dbgpt.rag.knowledge.base import (
 
 
 class DocxKnowledge(Knowledge):
-    """Docx Knowledge"""
+    """Docx Knowledge."""
 
     def __init__(
         self,
@@ -22,12 +23,13 @@ class DocxKnowledge(Knowledge):
         loader: Optional[Any] = None,
         **kwargs: Any,
     ) -> None:
-        """Initialize with Knowledge arguments.
+        """Create Docx Knowledge with Knowledge arguments.
 
         Args:
-            file_path:(Optional[str]) file path
-            knowledge_type:(KnowledgeType) knowledge type
-            encoding:(Optional[str]) csv encoding
-            loader:(Optional[Any]) loader
+            file_path(str, optional): file path
+            knowledge_type(KnowledgeType, optional): knowledge type
+            encoding(str, optional): csv encoding
+            loader(Any, optional): loader
         """
         self._path = file_path
         self._type = knowledge_type
@@ -35,7 +37,7 @@ class DocxKnowledge(Knowledge):
         self._encoding = encoding
 
     def _load(self) -> List[Document]:
-        """Load docx document from loader"""
+        """Load docx document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
@@ -54,6 +56,7 @@ class DocxKnowledge(Knowledge):
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
+        """Return support chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_PARAGRAPH,
@@ -62,12 +65,15 @@ class DocxKnowledge(Knowledge):
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
         return ChunkStrategy.CHUNK_BY_SIZE
 
     @classmethod
     def type(cls) -> KnowledgeType:
+        """Return knowledge type."""
         return KnowledgeType.DOCUMENT
 
     @classmethod
     def document_type(cls) -> DocumentType:
+        """Return document type."""
         return DocumentType.DOCX

dbgpt/rag/knowledge/factory.py

@@ -1,4 +1,5 @@
-from typing import List, Optional
+"""Knowledge Factory to create knowledge from file path and url."""
+from typing import List, Optional, Type
 
 from dbgpt.rag.knowledge.base import Knowledge, KnowledgeType
 from dbgpt.rag.knowledge.string import StringKnowledge
@@ -6,17 +7,18 @@ from dbgpt.rag.knowledge.url import URLKnowledge
 
 
 class KnowledgeFactory:
-    """Knowledge Factory to create knowledge from file path and url"""
+    """Knowledge Factory to create knowledge from file path and url."""
 
     def __init__(
         self,
         file_path: Optional[str] = None,
         knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
     ):
-        """Initialize with Knowledge Factory arguments.
+        """Create Knowledge Factory with file path and knowledge type.
 
         Args:
-            param file_path: path of the file to convert
-            param knowledge_type: type of knowledge
+            file_path(str, optional): file path
+            knowledge_type(KnowledgeType, optional): knowledge type
         """
         self._file_path = file_path
         self._knowledge_type = knowledge_type
@@ -24,16 +26,16 @@ class KnowledgeFactory:
     @classmethod
     def create(
         cls,
-        datasource: Optional[str] = None,
-        knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
+        datasource: str = "",
+        knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
     ):
-        """create knowledge from file path, url or text
+        """Create knowledge from file path, url or text.
 
         Args:
            datasource: path of the file to convert
            knowledge_type: type of knowledge
 
         Examples:
 
            .. code-block:: python
 
                from dbgpt.rag.knowledge.factory import KnowledgeFactory
@@ -62,17 +64,16 @@ class KnowledgeFactory:
     @classmethod
     def from_file_path(
         cls,
-        file_path: Optional[str] = None,
+        file_path: str = "",
         knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
     ) -> Knowledge:
-        """Create knowledge from path
+        """Create knowledge from path.
 
         Args:
            param file_path: path of the file to convert
            param knowledge_type: type of knowledge
 
        Examples:
 
            .. code-block:: python
 
                from dbgpt.rag.knowledge.factory import KnowledgeFactory
@@ -81,7 +82,6 @@ class KnowledgeFactory:
                    datasource="path/to/document.pdf",
                    knowledge_type=KnowledgeType.DOCUMENT,
                )
-
         """
         factory = cls(file_path=file_path, knowledge_type=knowledge_type)
         return factory._select_document_knowledge(
@@ -90,17 +90,16 @@ class KnowledgeFactory:
 
     @staticmethod
     def from_url(
-        url: Optional[str] = None,
-        knowledge_type: Optional[KnowledgeType] = KnowledgeType.URL,
+        url: str = "",
+        knowledge_type: KnowledgeType = KnowledgeType.URL,
     ) -> Knowledge:
-        """Create knowledge from url
+        """Create knowledge from url.
 
         Args:
            param url: url of the file to convert
            param knowledge_type: type of knowledge
 
        Examples:
 
            .. code-block:: python
 
                from dbgpt.rag.knowledge.factory import KnowledgeFactory
@@ -108,7 +107,6 @@ class KnowledgeFactory:
                url_knowlege = KnowledgeFactory.create(
                    datasource="https://www.baidu.com", knowledge_type=KnowledgeType.URL
                )
-
         """
         return URLKnowledge(
             url=url,
@@ -117,10 +115,11 @@ class KnowledgeFactory:
 
     @staticmethod
     def from_text(
-        text: str = None,
-        knowledge_type: Optional[KnowledgeType] = KnowledgeType.TEXT,
+        text: str = "",
+        knowledge_type: KnowledgeType = KnowledgeType.TEXT,
     ) -> Knowledge:
-        """Create knowledge from text
+        """Create knowledge from text.
 
         Args:
            param text: text to convert
            param knowledge_type: type of knowledge
@@ -131,7 +130,7 @@ class KnowledgeFactory:
         )
 
     def _select_document_knowledge(self, **kwargs):
-        """Select document knowledge from file path"""
+        """Select document knowledge from file path."""
         extension = self._file_path.rsplit(".", 1)[-1]
         knowledge_classes = self._get_knowledge_subclasses()
         implementation = None
@@ -144,26 +143,26 @@ class KnowledgeFactory:
 
     @classmethod
     def all_types(cls):
-        """get all knowledge types"""
+        """Get all knowledge types."""
         return [knowledge.type().value for knowledge in cls._get_knowledge_subclasses()]
 
     @classmethod
-    def subclasses(cls):
-        """get all knowledge subclasses"""
+    def subclasses(cls) -> List["Type[Knowledge]"]:
+        """Get all knowledge subclasses."""
         return cls._get_knowledge_subclasses()
 
     @staticmethod
-    def _get_knowledge_subclasses() -> List[Knowledge]:
-        """get all knowledge subclasses"""
-        from dbgpt.rag.knowledge.base import Knowledge
-        from dbgpt.rag.knowledge.csv import CSVKnowledge
-        from dbgpt.rag.knowledge.docx import DocxKnowledge
-        from dbgpt.rag.knowledge.html import HTMLKnowledge
-        from dbgpt.rag.knowledge.markdown import MarkdownKnowledge
-        from dbgpt.rag.knowledge.pdf import PDFKnowledge
-        from dbgpt.rag.knowledge.pptx import PPTXKnowledge
-        from dbgpt.rag.knowledge.string import StringKnowledge
-        from dbgpt.rag.knowledge.txt import TXTKnowledge
-        from dbgpt.rag.knowledge.url import URLKnowledge
+    def _get_knowledge_subclasses() -> List["Type[Knowledge]"]:
+        """Get all knowledge subclasses."""
+        from dbgpt.rag.knowledge.base import Knowledge  # noqa: F401
+        from dbgpt.rag.knowledge.csv import CSVKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.docx import DocxKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.html import HTMLKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.markdown import MarkdownKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.pdf import PDFKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.pptx import PPTXKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.string import StringKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.txt import TXTKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.url import URLKnowledge  # noqa: F401
 
         return Knowledge.__subclasses__()
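
The factory resolves an implementation from the file extension: _select_document_knowledge() takes the suffix after the last dot and checks it against the candidate classes returned by Knowledge.__subclasses__(). The matching loop's body is outside this diff's hunks, so the comparison against document_type().value below is an assumption about how that matching works; a sketch of the equivalent lookup:

# Sketch of the extension-based dispatch described above. The comparison
# against document_type().value is an assumption; the matching loop's body is
# not shown in this diff.
from dbgpt.rag.knowledge.factory import KnowledgeFactory


def guess_knowledge_class(file_path: str):
    """Return the Knowledge subclass whose document type matches the extension."""
    extension = file_path.rsplit(".", 1)[-1]
    for knowledge_cls in KnowledgeFactory.subclasses():
        doc_type = knowledge_cls.document_type()
        if doc_type is not None and doc_type.value == extension:
            return knowledge_cls
    return None


print(guess_knowledge_class("report.pdf"))  # expected: PDFKnowledge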

dbgpt/rag/knowledge/html.py

@@ -1,3 +1,4 @@
+"""HTML Knowledge."""
 from typing import Any, List, Optional
 
 import chardet
@@ -12,7 +13,7 @@ from dbgpt.rag.knowledge.base import (
 
 
 class HTMLKnowledge(Knowledge):
-    """HTML Knowledge"""
+    """HTML Knowledge."""
 
     def __init__(
         self,
@@ -21,21 +22,24 @@ class HTMLKnowledge(Knowledge):
         loader: Optional[Any] = None,
         **kwargs: Any,
     ) -> None:
-        """Initialize with Knowledge arguments.
+        """Create HTML Knowledge with Knowledge arguments.
 
         Args:
-            file_path:(Optional[str]) file path
-            knowledge_type:(KnowledgeType) knowledge type
-            loader:(Optional[Any]) loader
+            file_path(str, optional): file path
+            knowledge_type(KnowledgeType, optional): knowledge type
+            loader(Any, optional): loader
         """
         self._path = file_path
         self._type = knowledge_type
         self._loader = loader
 
     def _load(self) -> List[Document]:
-        """Load html document from loader"""
+        """Load html document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
+            if not self._path:
+                raise ValueError("file path is required")
             with open(self._path, "rb") as f:
                 raw_text = f.read()
                 result = chardet.detect(raw_text)
@@ -49,10 +53,9 @@ class HTMLKnowledge(Knowledge):
         return [Document.langchain2doc(lc_document) for lc_document in documents]
 
     def _postprocess(self, documents: List[Document]):
-        i = 0
-        for d in documents:
-            import markdown
+        import markdown
 
+        for i, d in enumerate(documents):
             content = markdown.markdown(d.content)
             from bs4 import BeautifulSoup
 
@@ -61,11 +64,11 @@ class HTMLKnowledge(Knowledge):
                 tag.extract()
             documents[i].content = soup.get_text()
             documents[i].content = documents[i].content.replace("\n", " ")
-            i += 1
         return documents
 
     @classmethod
     def support_chunk_strategy(cls):
+        """Return support chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_SEPARATOR,
@@ -73,12 +76,15 @@ class HTMLKnowledge(Knowledge):
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
         return ChunkStrategy.CHUNK_BY_SIZE
 
     @classmethod
     def type(cls) -> KnowledgeType:
+        """Return knowledge type."""
         return KnowledgeType.DOCUMENT
 
     @classmethod
     def document_type(cls) -> DocumentType:
+        """Return document type."""
         return DocumentType.HTML
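
HTMLKnowledge._postprocess() now hoists the markdown import out of the loop and uses enumerate instead of a manual counter: it renders each document with markdown.markdown, strips unwanted tags via BeautifulSoup, and flattens newlines. A standalone sketch of the same idea; the diff does not show which tags are extract()-ed, so removing script/style here is an assumption:

# Standalone sketch of the post-processing idea in HTMLKnowledge._postprocess:
# render markdown to HTML, drop unwanted tags, keep plain text on one line.
# Extracting <script>/<style> specifically is an assumption; the diff does not
# show which tags the original code removes.
import markdown
from bs4 import BeautifulSoup


def flatten_markdown(text: str) -> str:
    html = markdown.markdown(text)
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style"]):
        tag.extract()
    return soup.get_text().replace("\n", " ")


print(flatten_markdown("# Title\n\nSome *body* text."))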

dbgpt/rag/knowledge/json.py

@@ -0,0 +1 @@
+"""Knowledge JSON."""

dbgpt/rag/knowledge/markdown.py

@@ -1,3 +1,4 @@
+"""Markdown Knowledge."""
 from typing import Any, List, Optional
 
 from dbgpt.rag.chunk import Document
@@ -10,7 +11,7 @@ from dbgpt.rag.knowledge.base import (
 
 
 class MarkdownKnowledge(Knowledge):
-    """Markdown Knowledge"""
+    """Markdown Knowledge."""
 
     def __init__(
         self,
@@ -20,12 +21,13 @@ class MarkdownKnowledge(Knowledge):
         loader: Optional[Any] = None,
         **kwargs: Any,
     ) -> None:
-        """Initialize with Knowledge arguments.
+        """Create Markdown Knowledge with Knowledge arguments.
 
         Args:
-            file_path:(Optional[str]) file path
-            knowledge_type:(KnowledgeType) knowledge type
-            encoding:(Optional[str]) encoding
-            loader:(Optional[Any]) loader
+            file_path(str, optional): file path
+            knowledge_type(KnowledgeType, optional): knowledge type
+            encoding(str, optional): csv encoding
+            loader(Any, optional): loader
         """
         self._path = file_path
         self._type = knowledge_type
@@ -33,10 +35,12 @@ class MarkdownKnowledge(Knowledge):
         self._encoding = encoding
 
     def _load(self) -> List[Document]:
-        """Load markdown document from loader"""
+        """Load markdown document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
+            if not self._path:
+                raise ValueError("file path is required")
             with open(self._path, encoding=self._encoding, errors="ignore") as f:
                 markdown_text = f.read()
                 metadata = {"source": self._path}
@@ -46,6 +50,7 @@ class MarkdownKnowledge(Knowledge):
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
+        """Return support chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_MARKDOWN_HEADER,
@@ -54,12 +59,15 @@ class MarkdownKnowledge(Knowledge):
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
         return ChunkStrategy.CHUNK_BY_MARKDOWN_HEADER
 
     @classmethod
     def type(cls) -> KnowledgeType:
+        """Return knowledge type."""
         return KnowledgeType.DOCUMENT
 
     @classmethod
     def document_type(cls) -> DocumentType:
+        """Return document type."""
         return DocumentType.MARKDOWN

dbgpt/rag/knowledge/pdf.py

@@ -1,3 +1,4 @@
+"""PDF Knowledge."""
 from typing import Any, List, Optional
 
 from dbgpt.rag.chunk import Document
@@ -10,21 +11,23 @@ from dbgpt.rag.knowledge.base import (
 
 
 class PDFKnowledge(Knowledge):
-    """PDF Knowledge"""
+    """PDF Knowledge."""
 
     def __init__(
         self,
         file_path: Optional[str] = None,
         knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
-        loader: Optional = None,
+        loader: Optional[Any] = None,
         language: Optional[str] = "zh",
         **kwargs: Any,
     ) -> None:
-        """Initialize with PDF Knowledge arguments.
+        """Create PDF Knowledge with Knowledge arguments.
 
         Args:
-            file_path:(Optional[str]) file path
-            knowledge_type:(KnowledgeType) knowledge type
-            loader:(Optional[Any]) loader
+            file_path(str, optional): file path
+            knowledge_type(KnowledgeType, optional): knowledge type
+            loader(Any, optional): loader
+            language(str, optional): language
         """
         self._path = file_path
         self._type = knowledge_type
@@ -32,7 +35,7 @@ class PDFKnowledge(Knowledge):
         self._language = language
 
     def _load(self) -> List[Document]:
-        """Load pdf document from loader"""
+        """Load pdf document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
@@ -40,11 +43,13 @@ class PDFKnowledge(Knowledge):
 
             pages = []
             documents = []
+            if not self._path:
+                raise ValueError("file path is required")
             with open(self._path, "rb") as file:
                 reader = pypdf.PdfReader(file)
                 for page_num in range(len(reader.pages)):
-                    page = reader.pages[page_num]
-                    pages.append((page.extract_text(), page_num))
+                    _page = reader.pages[page_num]
+                    pages.append((_page.extract_text(), page_num))
 
             # cleaned_pages = []
             for page, page_num in pages:
@@ -53,10 +58,9 @@ class PDFKnowledge(Knowledge):
                 cleaned_lines = []
                 for line in lines:
                     if self._language == "en":
-                        words = list(line)
+                        words = list(line)  # noqa: F841
                     else:
-                        words = line.split()
-                        digits = [word for word in words if any(i.isdigit() for i in word)]
+                        words = line.split()  # noqa: F841
                     cleaned_lines.append(line)
                 page = "\n".join(cleaned_lines)
                 # cleaned_pages.append(page)
@@ -69,6 +73,7 @@ class PDFKnowledge(Knowledge):
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
+        """Return support chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_PAGE,
@@ -77,12 +82,15 @@ class PDFKnowledge(Knowledge):
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
         return ChunkStrategy.CHUNK_BY_SIZE
 
     @classmethod
     def type(cls) -> KnowledgeType:
+        """Return knowledge type."""
         return KnowledgeType.DOCUMENT
 
     @classmethod
     def document_type(cls) -> DocumentType:
+        """Document type of PDF."""
         return DocumentType.PDF
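
PDFKnowledge reads the file page by page with pypdf and also lists CHUNK_BY_PAGE among its strategies. A short usage sketch; the path is hypothetical, and instantiating PageTextSplitter with no arguments is an assumption based on the empty parameter list declared for CHUNK_BY_PAGE in base.py:

# Sketch: page-oriented loading and splitting. "docs/manual.pdf" is hypothetical.
from dbgpt.rag.knowledge.base import ChunkStrategy
from dbgpt.rag.knowledge.pdf import PDFKnowledge

knowledge = PDFKnowledge(file_path="docs/manual.pdf", language="en")
documents = knowledge.load()  # Documents built from the extracted pages
# CHUNK_BY_PAGE declares no parameters, so match() is called bare here.
splitter = ChunkStrategy.CHUNK_BY_PAGE.match()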

dbgpt/rag/knowledge/pptx.py

@@ -1,3 +1,4 @@
+"""PPTX Knowledge."""
 from typing import Any, List, Optional
 
 from dbgpt.rag.chunk import Document
@@ -10,17 +11,18 @@ from dbgpt.rag.knowledge.base import (
 
 
 class PPTXKnowledge(Knowledge):
-    """PPTX Knowledge"""
+    """PPTX Knowledge."""
 
     def __init__(
         self,
         file_path: Optional[str] = None,
         knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
-        loader: Optional = None,
+        loader: Optional[Any] = None,
         language: Optional[str] = "zh",
         **kwargs: Any,
     ) -> None:
-        """Initialize with PDF Knowledge arguments.
+        """Create PPTX knowledge with PDF Knowledge arguments.
 
         Args:
             file_path:(Optional[str]) file path
             knowledge_type:(KnowledgeType) knowledge type
@@ -32,7 +34,7 @@ class PPTXKnowledge(Knowledge):
         self._language = language
 
     def _load(self) -> List[Document]:
-        """Load pdf document from loader"""
+        """Load pdf document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
@@ -53,6 +55,11 @@ class PPTXKnowledge(Knowledge):
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
+        """Return support chunk strategy.
+
+        Returns:
+            List[ChunkStrategy]: support chunk strategy
+        """
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_PAGE,
@@ -61,12 +68,27 @@ class PPTXKnowledge(Knowledge):
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy.
+
+        Returns:
+            ChunkStrategy: default chunk strategy
+        """
         return ChunkStrategy.CHUNK_BY_SIZE
 
     @classmethod
     def type(cls) -> KnowledgeType:
+        """Knowledge type of PPTX.
+
+        Returns:
+            KnowledgeType: knowledge type
+        """
         return KnowledgeType.DOCUMENT
 
     @classmethod
     def document_type(cls) -> DocumentType:
+        """Document type of PPTX.
+
+        Returns:
+            DocumentType: document type
+        """
         return DocumentType.PPTX

dbgpt/rag/knowledge/string.py

@@ -1,3 +1,4 @@
+"""String Knowledge."""
 from typing import Any, List, Optional
 
 from dbgpt.rag.chunk import Document
@@ -5,22 +6,23 @@ from dbgpt.rag.knowledge.base import ChunkStrategy, Knowledge, KnowledgeType
 
 
 class StringKnowledge(Knowledge):
-    """String Knowledge"""
+    """String Knowledge."""
 
     def __init__(
         self,
-        text: str = None,
+        text: str = "",
         knowledge_type: KnowledgeType = KnowledgeType.TEXT,
         encoding: Optional[str] = "utf-8",
         loader: Optional[Any] = None,
         **kwargs: Any,
     ) -> None:
-        """Initialize with Knowledge arguments.
+        """Create String knowledge parameters.
 
         Args:
-            text:(str) text
-            knowledge_type:(KnowledgeType) knowledge type
-            encoding:(encoding) csv encoding
-            loader:(loader) loader
+            text(str): text
+            knowledge_type(KnowledgeType): knowledge type
+            encoding(str): encoding
+            loader(Any): loader
         """
         self._text = text
         self._type = knowledge_type
@@ -28,21 +30,25 @@ class StringKnowledge(Knowledge):
         self._encoding = encoding
 
     def _load(self) -> List[Document]:
-        """load raw text from loader"""
+        """Load raw text from loader."""
         metadata = {"source": "raw text"}
         docs = [Document(content=self._text, metadata=metadata)]
         return docs
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
+        """Return support chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_SEPARATOR,
         ]
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
         return ChunkStrategy.CHUNK_BY_SIZE
 
     @classmethod
     def type(cls):
+        """Return knowledge type."""
         return KnowledgeType.TEXT
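
StringKnowledge simply wraps raw text in a single Document with {"source": "raw text"} metadata. A minimal sketch:

# Minimal sketch of StringKnowledge: one Document wrapping the raw text.
from dbgpt.rag.knowledge.string import StringKnowledge

knowledge = StringKnowledge(text="DB-GPT is a private LLM data framework.")
docs = knowledge.load()
print(docs[0].metadata)  # {'source': 'raw text'}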

dbgpt/rag/knowledge/txt.py

@@ -1,3 +1,4 @@
+"""TXT Knowledge."""
 from typing import Any, List, Optional
 
 import chardet
@@ -12,7 +13,7 @@ from dbgpt.rag.knowledge.base import (
 
 
 class TXTKnowledge(Knowledge):
-    """TXT Knowledge"""
+    """TXT Knowledge."""
 
     def __init__(
         self,
@@ -21,21 +22,24 @@ class TXTKnowledge(Knowledge):
         loader: Optional[Any] = None,
         **kwargs: Any,
     ) -> None:
-        """Initialize with Knowledge arguments.
+        """Create TXT Knowledge with Knowledge arguments.
 
         Args:
-            file_path:(Optional[str]) file path
-            knowledge_type:(KnowledgeType) knowledge type
-            loader:(Optional[Any]) loader
+            file_path(str, optional): file path
+            knowledge_type(KnowledgeType, optional): knowledge type
+            loader(Any, optional): loader
         """
         self._path = file_path
         self._type = knowledge_type
         self._loader = loader
 
     def _load(self) -> List[Document]:
-        """Load txt document from loader"""
+        """Load txt document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
+            if not self._path:
+                raise ValueError("file path is required")
             with open(self._path, "rb") as f:
                 raw_text = f.read()
                 result = chardet.detect(raw_text)
@@ -50,6 +54,7 @@ class TXTKnowledge(Knowledge):
 
     @classmethod
     def support_chunk_strategy(cls):
+        """Return support chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_SEPARATOR,
@@ -57,12 +62,15 @@ class TXTKnowledge(Knowledge):
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
         return ChunkStrategy.CHUNK_BY_SIZE
 
     @classmethod
     def type(cls) -> KnowledgeType:
+        """Return knowledge type."""
         return KnowledgeType.DOCUMENT
 
     @classmethod
     def document_type(cls) -> DocumentType:
+        """Return document type."""
         return DocumentType.TXT

dbgpt/rag/knowledge/url.py

@@ -1,3 +1,4 @@
+"""URL Knowledge."""
 from typing import Any, List, Optional
 
 from dbgpt.rag.chunk import Document
@@ -5,22 +6,25 @@ from dbgpt.rag.knowledge.base import ChunkStrategy, Knowledge, KnowledgeType
 
 
 class URLKnowledge(Knowledge):
     """URL Knowledge."""
 
     def __init__(
         self,
-        url: Optional[str] = None,
+        url: str = "",
         knowledge_type: KnowledgeType = KnowledgeType.URL,
         source_column: Optional[str] = None,
         encoding: Optional[str] = "utf-8",
         loader: Optional[Any] = None,
         **kwargs: Any,
     ) -> None:
-        """Initialize with Knowledge arguments.
+        """Create URL Knowledge with Knowledge arguments.
 
         Args:
-            url:(Optional[str]) url
-            knowledge_type:(KnowledgeType) knowledge type
-            source_column:(Optional[str]) source column
-            encoding:(Optional[str]) csv encoding
-            loader:(Optional[Any]) loader
+            url(str, optional): url
+            knowledge_type(KnowledgeType, optional): knowledge type
+            source_column(str, optional): source column
+            encoding(str, optional): csv encoding
+            loader(Any, optional): loader
         """
         self._path = url
         self._type = knowledge_type
@@ -29,7 +33,7 @@ class URLKnowledge(Knowledge):
         self._source_column = source_column
 
     def _load(self) -> List[Document]:
-        """Fetch URL document from loader"""
+        """Fetch URL document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
@@ -41,6 +45,7 @@ class URLKnowledge(Knowledge):
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
+        """Return support chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_SEPARATOR,
@@ -48,8 +53,10 @@ class URLKnowledge(Knowledge):
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
         return ChunkStrategy.CHUNK_BY_SIZE
 
     @classmethod
     def type(cls):
+        """Return knowledge type."""
         return KnowledgeType.URL
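
URLKnowledge stores the url in self._path and defers fetching to a loader; KnowledgeFactory wires this up when knowledge_type is URL. A usage sketch mirroring the docstring example in factory.py above; how the default loader fetches the page is not shown in this diff:

# Sketch mirroring the KnowledgeFactory docstring example above.
from dbgpt.rag.knowledge.base import KnowledgeType
from dbgpt.rag.knowledge.factory import KnowledgeFactory

url_knowledge = KnowledgeFactory.create(
    datasource="https://www.baidu.com", knowledge_type=KnowledgeType.URL
)
documents = url_knowledge.load()  # fetched via the configured loader (not shown here)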