Mirror of https://github.com/csunny/DB-GPT.git
chore: Add pylint for DB-GPT rag lib (#1267)

dbgpt/rag/knowledge/__init__.py

@@ -0,0 +1,29 @@
+"""Module Of Knowledge."""
+
+from .base import ChunkStrategy, Knowledge, KnowledgeType  # noqa: F401
+from .csv import CSVKnowledge  # noqa: F401
+from .docx import DocxKnowledge  # noqa: F401
+from .factory import KnowledgeFactory  # noqa: F401
+from .html import HTMLKnowledge  # noqa: F401
+from .markdown import MarkdownKnowledge  # noqa: F401
+from .pdf import PDFKnowledge  # noqa: F401
+from .pptx import PPTXKnowledge  # noqa: F401
+from .string import StringKnowledge  # noqa: F401
+from .txt import TXTKnowledge  # noqa: F401
+from .url import URLKnowledge  # noqa: F401
+
+__ALL__ = [
+    "KnowledgeFactory",
+    "Knowledge",
+    "KnowledgeType",
+    "ChunkStrategy",
+    "CSVKnowledge",
+    "DocxKnowledge",
+    "HTMLKnowledge",
+    "MarkdownKnowledge",
+    "PDFKnowledge",
+    "PPTXKnowledge",
+    "StringKnowledge",
+    "TXTKnowledge",
+    "URLKnowledge",
+]
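
The new package __init__ re-exports every knowledge implementation, so callers can import from dbgpt.rag.knowledge directly rather than from each submodule. A minimal sketch of that import style, using only names re-exported above (an illustration, not part of the commit):

# Sketch: importing through the package-level re-exports added above.
# Assumes dbgpt is installed with this module layout.
from dbgpt.rag.knowledge import ChunkStrategy, KnowledgeFactory, KnowledgeType

print(KnowledgeType.DOCUMENT.value)       # "DOCUMENT"
print(ChunkStrategy.CHUNK_BY_SIZE.alias)  # "chunk size", set in base.py below
print(KnowledgeFactory.all_types())       # e.g. ["DOCUMENT", "URL", "TEXT", ...]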

dbgpt/rag/knowledge/base.py

@@ -1,19 +1,23 @@
 """Module for Knowledge Base."""
 
 from abc import ABC, abstractmethod
 from enum import Enum
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Tuple, Type
 
 from dbgpt.rag.chunk import Document
 from dbgpt.rag.text_splitter.text_splitter import (
     CharacterTextSplitter,
     MarkdownHeaderTextSplitter,
     PageTextSplitter,
     ParagraphTextSplitter,
     RecursiveCharacterTextSplitter,
     SeparatorTextSplitter,
     TextSplitter,
 )
 
 
 class DocumentType(Enum):
     """Document Type Enum."""
 
     PDF = "pdf"
     CSV = "csv"
     MARKDOWN = "md"
@@ -24,27 +28,40 @@ class DocumentType(Enum):
 
 
 class KnowledgeType(Enum):
     """Knowledge Type Enum."""
 
     DOCUMENT = "DOCUMENT"
     URL = "URL"
     TEXT = "TEXT"
 
     @property
     def type(self):
         """Get type."""
         return DocumentType
 
     @classmethod
-    def get_by_value(cls, value):
-        """Get Enum member by value"""
+    def get_by_value(cls, value) -> "KnowledgeType":
+        """Get Enum member by value.
+
+        Args:
+            value(any): value
+
+        Returns:
+            KnowledgeType: Enum member
+        """
         for member in cls:
             if member.value == value:
                 return member
         raise ValueError(f"{value} is not a valid value for {cls.__name__}")
 
 
-class ChunkStrategy(Enum):
-    """chunk strategy"""
-
-    CHUNK_BY_SIZE = (
+_STRATEGY_ENUM_TYPE = Tuple[Type[TextSplitter], List, str, str]
+
+
+class ChunkStrategy(Enum):
+    """Chunk Strategy Enum."""
+
+    CHUNK_BY_SIZE: _STRATEGY_ENUM_TYPE = (
         RecursiveCharacterTextSplitter,
         [
             {
@@ -63,8 +80,13 @@ class ChunkStrategy(Enum):
         "chunk size",
         "split document by chunk size",
     )
-    CHUNK_BY_PAGE = (PageTextSplitter, [], "page", "split document by page")
-    CHUNK_BY_PARAGRAPH = (
+    CHUNK_BY_PAGE: _STRATEGY_ENUM_TYPE = (
+        PageTextSplitter,
+        [],
+        "page",
+        "split document by page",
+    )
+    CHUNK_BY_PARAGRAPH: _STRATEGY_ENUM_TYPE = (
         ParagraphTextSplitter,
         [
             {
@@ -77,7 +99,7 @@ class ChunkStrategy(Enum):
         "paragraph",
         "split document by paragraph",
     )
-    CHUNK_BY_SEPARATOR = (
+    CHUNK_BY_SEPARATOR: _STRATEGY_ENUM_TYPE = (
         SeparatorTextSplitter,
         [
             {
@@ -90,13 +112,14 @@ class ChunkStrategy(Enum):
                 "param_name": "enable_merge",
                 "param_type": "boolean",
                 "default_value": False,
-                "description": "Whether to merge according to the chunk_size after splitting by the separator.",
+                "description": "Whether to merge according to the chunk_size after "
+                "splitting by the separator.",
             },
         ],
         "separator",
         "split document by separator",
     )
-    CHUNK_BY_MARKDOWN_HEADER = (
+    CHUNK_BY_MARKDOWN_HEADER: _STRATEGY_ENUM_TYPE = (
         MarkdownHeaderTextSplitter,
         [],
         "markdown header",
@@ -104,24 +127,26 @@ class ChunkStrategy(Enum):
     )
 
     def __init__(self, splitter_class, parameters, alias, description):
+        """Create a new ChunkStrategy with the given splitter_class."""
         self.splitter_class = splitter_class
        self.parameters = parameters
         self.alias = alias
         self.description = description
 
-    def match(self, *args, **kwargs):
+    def match(self, *args, **kwargs) -> TextSplitter:
+        """Match and build splitter."""
         kwargs = {k: v for k, v in kwargs.items() if v is not None}
         return self.value[0](*args, **kwargs)
 
 
 class Knowledge(ABC):
-    type: KnowledgeType = None
+    """Knowledge Base Class."""
 
     def __init__(
         self,
         path: Optional[str] = None,
         knowledge_type: Optional[KnowledgeType] = None,
-        data_loader: Optional = None,
+        data_loader: Optional[Any] = None,
         **kwargs: Any,
     ) -> None:
         """Initialize with Knowledge arguments."""
@@ -130,30 +155,31 @@ class Knowledge(ABC):
         self._data_loader = data_loader
 
     def load(self):
-        """Load knowledge from data_loader"""
+        """Load knowledge from data_loader."""
         documents = self._load()
         return self._postprocess(documents)
 
     @classmethod
     @abstractmethod
     def type(cls) -> KnowledgeType:
-        """Get knowledge type"""
+        """Get knowledge type."""
 
     @classmethod
     def document_type(cls) -> Any:
-        """Get document type"""
+        """Get document type."""
         return None
 
     def _postprocess(self, docs: List[Document]) -> List[Document]:
-        """Post process knowledge from data_loader"""
+        """Post process knowledge from data_loader."""
         return docs
 
     @abstractmethod
     def _load(self):
-        """Preprocess knowledge from data_loader"""
+        """Preprocess knowledge from data_loader."""
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
-        """support chunk strategy"""
+        """Return supported chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_PAGE,
@@ -162,11 +188,11 @@ class Knowledge(ABC):
             ChunkStrategy.CHUNK_BY_SEPARATOR,
         ]
 
-    def default_chunk_strategy(self) -> ChunkStrategy:
-        return ChunkStrategy.CHUNK_BY_SIZE
-
-    def support_chunk_strategy(self):
-        return [
-            ChunkStrategy.CHUNK_BY_SIZE,
-            ChunkStrategy.CHUNK_BY_SEPARATOR,
-        ]
+    @classmethod
+    def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy.
+
+        Returns:
+            ChunkStrategy: default chunk strategy
+        """
+        return ChunkStrategy.CHUNK_BY_SIZE
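
Each ChunkStrategy member stores a (splitter_class, parameters, alias, description) tuple, and match() drops None keyword arguments before instantiating the splitter class; KnowledgeType.get_by_value() resolves a member from its string value. A rough usage sketch based on those definitions; the chunk_size/chunk_overlap keywords forwarded to RecursiveCharacterTextSplitter are assumptions about that splitter's constructor, which this diff does not show:

# Sketch based on the KnowledgeType/ChunkStrategy definitions above.
from dbgpt.rag.knowledge.base import ChunkStrategy, KnowledgeType

# Resolve an enum member from its value; unknown values raise ValueError.
kt = KnowledgeType.get_by_value("DOCUMENT")
assert kt is KnowledgeType.DOCUMENT

# match() filters out None kwargs and calls self.value[0], here
# RecursiveCharacterTextSplitter; the keyword names below are assumptions.
splitter = ChunkStrategy.CHUNK_BY_SIZE.match(chunk_size=512, chunk_overlap=50)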

dbgpt/rag/knowledge/csv.py

@@ -1,3 +1,4 @@
+"""CSV Knowledge."""
 import csv
 from typing import Any, List, Optional
 
@@ -11,7 +12,7 @@ from dbgpt.rag.knowledge.base import (
 
 
 class CSVKnowledge(Knowledge):
-    """CSV Knowledge"""
+    """CSV Knowledge."""
 
     def __init__(
         self,
@@ -22,13 +23,14 @@ class CSVKnowledge(Knowledge):
         loader: Optional[Any] = None,
         **kwargs: Any,
     ) -> None:
-        """Initialize csv with Knowledge arguments.
+        """Create CSV Knowledge with Knowledge arguments.
 
         Args:
-            file_path:(Optional[str]) file path
-            knowledge_type:(KnowledgeType) knowledge type
-            source_column:(Optional[str]) source column
-            encoding:(Optional[str]) csv encoding
-            loader:(Optional[Any]) loader
+            file_path(str, optional): file path
+            knowledge_type(KnowledgeType, optional): knowledge type
+            source_column(str, optional): source column
+            encoding(str, optional): csv encoding
+            loader(Any, optional): loader
         """
         self._path = file_path
         self._type = knowledge_type
@@ -37,11 +39,13 @@ class CSVKnowledge(Knowledge):
         self._source_column = source_column
 
     def _load(self) -> List[Document]:
-        """Load csv document from loader"""
+        """Load csv document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
             docs = []
+            if not self._path:
+                raise ValueError("file path is required")
             with open(self._path, newline="", encoding=self._encoding) as csvfile:
                 csv_reader = csv.DictReader(csvfile)
                 for i, row in enumerate(csv_reader):
@@ -59,7 +63,8 @@ class CSVKnowledge(Knowledge):
                         )
                     except KeyError:
                         raise ValueError(
-                            f"Source column '{self._source_column}' not found in CSV file."
+                            f"Source column '{self._source_column}' not found in CSV "
+                            f"file."
                         )
                     metadata = {"source": source, "row": i}
                     doc = Document(content=content, metadata=metadata)
@@ -70,6 +75,7 @@ class CSVKnowledge(Knowledge):
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
+        """Return support chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_SEPARATOR,
@@ -77,12 +83,15 @@ class CSVKnowledge(Knowledge):
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
         return ChunkStrategy.CHUNK_BY_SIZE
 
     @classmethod
     def type(cls) -> KnowledgeType:
+        """Knowledge type of CSV."""
         return KnowledgeType.DOCUMENT
 
     @classmethod
     def document_type(cls) -> DocumentType:
+        """Return document type."""
         return DocumentType.CSV
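
CSVKnowledge._load() either delegates to an injected loader or reads the file itself with csv.DictReader, emitting one Document per row with source/row metadata. A small usage sketch; the file path and source column below are hypothetical:

# Sketch: one Document per CSV row, per the _load() logic above.
# "data/example.csv" and the "url" source column are hypothetical; if the
# source column is missing, _load() raises ValueError.
from dbgpt.rag.knowledge.csv import CSVKnowledge

knowledge = CSVKnowledge(file_path="data/example.csv", source_column="url")
documents = knowledge.load()
for doc in documents:
    print(doc.metadata)  # {"source": ..., "row": i}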

dbgpt/rag/knowledge/docx.py

@@ -1,3 +1,4 @@
+"""Docx Knowledge."""
 from typing import Any, List, Optional
 
 import docx
@@ -12,7 +13,7 @@ from dbgpt.rag.knowledge.base import (
 
 
 class DocxKnowledge(Knowledge):
-    """Docx Knowledge"""
+    """Docx Knowledge."""
 
     def __init__(
         self,
@@ -22,12 +23,13 @@ class DocxKnowledge(Knowledge):
         loader: Optional[Any] = None,
         **kwargs: Any,
     ) -> None:
-        """Initialize with Knowledge arguments.
+        """Create Docx Knowledge with Knowledge arguments.
 
         Args:
-            file_path:(Optional[str]) file path
-            knowledge_type:(KnowledgeType) knowledge type
-            encoding:(Optional[str]) csv encoding
-            loader:(Optional[Any]) loader
+            file_path(str, optional): file path
+            knowledge_type(KnowledgeType, optional): knowledge type
+            encoding(str, optional): csv encoding
+            loader(Any, optional): loader
         """
         self._path = file_path
         self._type = knowledge_type
@@ -35,7 +37,7 @@ class DocxKnowledge(Knowledge):
         self._encoding = encoding
 
     def _load(self) -> List[Document]:
-        """Load docx document from loader"""
+        """Load docx document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
@@ -54,6 +56,7 @@ class DocxKnowledge(Knowledge):
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
+        """Return support chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_PARAGRAPH,
@@ -62,12 +65,15 @@ class DocxKnowledge(Knowledge):
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
         return ChunkStrategy.CHUNK_BY_SIZE
 
     @classmethod
     def type(cls) -> KnowledgeType:
+        """Return knowledge type."""
         return KnowledgeType.DOCUMENT
 
     @classmethod
     def document_type(cls) -> DocumentType:
+        """Return document type."""
         return DocumentType.DOCX

dbgpt/rag/knowledge/factory.py

@@ -1,4 +1,5 @@
-from typing import List, Optional
+"""Knowledge Factory to create knowledge from file path and url."""
+from typing import List, Optional, Type
 
 from dbgpt.rag.knowledge.base import Knowledge, KnowledgeType
 from dbgpt.rag.knowledge.string import StringKnowledge
@@ -6,17 +7,18 @@ from dbgpt.rag.knowledge.url import URLKnowledge
 
 
 class KnowledgeFactory:
-    """Knowledge Factory to create knowledge from file path and url"""
+    """Knowledge Factory to create knowledge from file path and url."""
 
     def __init__(
         self,
         file_path: Optional[str] = None,
         knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
     ):
-        """Initialize with Knowledge Factory arguments.
+        """Create Knowledge Factory with file path and knowledge type.
 
         Args:
-            param file_path: path of the file to convert
-            param knowledge_type: type of knowledge
+            file_path(str, optional): file path
+            knowledge_type(KnowledgeType, optional): knowledge type
         """
         self._file_path = file_path
         self._knowledge_type = knowledge_type
@@ -24,16 +26,16 @@ class KnowledgeFactory:
     @classmethod
     def create(
         cls,
-        datasource: Optional[str] = None,
-        knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
+        datasource: str = "",
+        knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
     ):
-        """create knowledge from file path, url or text
+        """Create knowledge from file path, url or text.
 
         Args:
            datasource: path of the file to convert
            knowledge_type: type of knowledge
 
         Examples:
 
            .. code-block:: python
 
                from dbgpt.rag.knowledge.factory import KnowledgeFactory
@@ -62,17 +64,16 @@ class KnowledgeFactory:
     @classmethod
     def from_file_path(
         cls,
-        file_path: Optional[str] = None,
+        file_path: str = "",
         knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
     ) -> Knowledge:
-        """Create knowledge from path
+        """Create knowledge from path.
 
         Args:
            param file_path: path of the file to convert
            param knowledge_type: type of knowledge
 
        Examples:
 
            .. code-block:: python
 
                from dbgpt.rag.knowledge.factory import KnowledgeFactory
@@ -81,7 +82,6 @@ class KnowledgeFactory:
                    datasource="path/to/document.pdf",
                    knowledge_type=KnowledgeType.DOCUMENT,
                )
-
         """
         factory = cls(file_path=file_path, knowledge_type=knowledge_type)
         return factory._select_document_knowledge(
@@ -90,17 +90,16 @@ class KnowledgeFactory:
 
     @staticmethod
     def from_url(
-        url: Optional[str] = None,
-        knowledge_type: Optional[KnowledgeType] = KnowledgeType.URL,
+        url: str = "",
+        knowledge_type: KnowledgeType = KnowledgeType.URL,
     ) -> Knowledge:
-        """Create knowledge from url
+        """Create knowledge from url.
 
         Args:
            param url: url of the file to convert
            param knowledge_type: type of knowledge
 
        Examples:
 
            .. code-block:: python
 
                from dbgpt.rag.knowledge.factory import KnowledgeFactory
@@ -108,7 +107,6 @@ class KnowledgeFactory:
                url_knowlege = KnowledgeFactory.create(
                    datasource="https://www.baidu.com", knowledge_type=KnowledgeType.URL
                )
-
         """
         return URLKnowledge(
             url=url,
@@ -117,10 +115,11 @@ class KnowledgeFactory:
 
     @staticmethod
     def from_text(
-        text: str = None,
-        knowledge_type: Optional[KnowledgeType] = KnowledgeType.TEXT,
+        text: str = "",
+        knowledge_type: KnowledgeType = KnowledgeType.TEXT,
     ) -> Knowledge:
-        """Create knowledge from text
+        """Create knowledge from text.
 
         Args:
            param text: text to convert
            param knowledge_type: type of knowledge
@@ -131,7 +130,7 @@ class KnowledgeFactory:
         )
 
     def _select_document_knowledge(self, **kwargs):
-        """Select document knowledge from file path"""
+        """Select document knowledge from file path."""
         extension = self._file_path.rsplit(".", 1)[-1]
         knowledge_classes = self._get_knowledge_subclasses()
         implementation = None
@@ -144,26 +143,26 @@ class KnowledgeFactory:
 
     @classmethod
     def all_types(cls):
-        """get all knowledge types"""
+        """Get all knowledge types."""
         return [knowledge.type().value for knowledge in cls._get_knowledge_subclasses()]
 
     @classmethod
-    def subclasses(cls):
-        """get all knowledge subclasses"""
+    def subclasses(cls) -> List["Type[Knowledge]"]:
+        """Get all knowledge subclasses."""
         return cls._get_knowledge_subclasses()
 
     @staticmethod
-    def _get_knowledge_subclasses() -> List[Knowledge]:
-        """get all knowledge subclasses"""
-        from dbgpt.rag.knowledge.base import Knowledge
-        from dbgpt.rag.knowledge.csv import CSVKnowledge
-        from dbgpt.rag.knowledge.docx import DocxKnowledge
-        from dbgpt.rag.knowledge.html import HTMLKnowledge
-        from dbgpt.rag.knowledge.markdown import MarkdownKnowledge
-        from dbgpt.rag.knowledge.pdf import PDFKnowledge
-        from dbgpt.rag.knowledge.pptx import PPTXKnowledge
-        from dbgpt.rag.knowledge.string import StringKnowledge
-        from dbgpt.rag.knowledge.txt import TXTKnowledge
-        from dbgpt.rag.knowledge.url import URLKnowledge
+    def _get_knowledge_subclasses() -> List["Type[Knowledge]"]:
+        """Get all knowledge subclasses."""
+        from dbgpt.rag.knowledge.base import Knowledge  # noqa: F401
+        from dbgpt.rag.knowledge.csv import CSVKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.docx import DocxKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.html import HTMLKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.markdown import MarkdownKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.pdf import PDFKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.pptx import PPTXKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.string import StringKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.txt import TXTKnowledge  # noqa: F401
+        from dbgpt.rag.knowledge.url import URLKnowledge  # noqa: F401
 
         return Knowledge.__subclasses__()
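
The factory resolves an implementation from the file extension: _select_document_knowledge() takes the suffix after the last dot and checks it against the candidate classes returned by Knowledge.__subclasses__(). The matching loop's body is outside this diff's hunks, so the comparison against document_type().value below is an assumption about how that matching works; a sketch of the equivalent lookup:

# Sketch of the extension-based dispatch described above. The comparison
# against document_type().value is an assumption; the matching loop's body is
# not shown in this diff.
from dbgpt.rag.knowledge.factory import KnowledgeFactory


def guess_knowledge_class(file_path: str):
    """Return the Knowledge subclass whose document type matches the extension."""
    extension = file_path.rsplit(".", 1)[-1]
    for knowledge_cls in KnowledgeFactory.subclasses():
        doc_type = knowledge_cls.document_type()
        if doc_type is not None and doc_type.value == extension:
            return knowledge_cls
    return None


print(guess_knowledge_class("report.pdf"))  # expected: PDFKnowledge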

dbgpt/rag/knowledge/html.py

@@ -1,3 +1,4 @@
+"""HTML Knowledge."""
 from typing import Any, List, Optional
 
 import chardet
@@ -12,7 +13,7 @@ from dbgpt.rag.knowledge.base import (
 
 
 class HTMLKnowledge(Knowledge):
-    """HTML Knowledge"""
+    """HTML Knowledge."""
 
     def __init__(
         self,
@@ -21,21 +22,24 @@ class HTMLKnowledge(Knowledge):
         loader: Optional[Any] = None,
         **kwargs: Any,
     ) -> None:
-        """Initialize with Knowledge arguments.
+        """Create HTML Knowledge with Knowledge arguments.
 
         Args:
-            file_path:(Optional[str]) file path
-            knowledge_type:(KnowledgeType) knowledge type
-            loader:(Optional[Any]) loader
+            file_path(str, optional): file path
+            knowledge_type(KnowledgeType, optional): knowledge type
+            loader(Any, optional): loader
         """
         self._path = file_path
         self._type = knowledge_type
         self._loader = loader
 
     def _load(self) -> List[Document]:
-        """Load html document from loader"""
+        """Load html document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
+            if not self._path:
+                raise ValueError("file path is required")
             with open(self._path, "rb") as f:
                 raw_text = f.read()
                 result = chardet.detect(raw_text)
@@ -49,10 +53,9 @@ class HTMLKnowledge(Knowledge):
         return [Document.langchain2doc(lc_document) for lc_document in documents]
 
     def _postprocess(self, documents: List[Document]):
-        i = 0
-        for d in documents:
-            import markdown
+        import markdown
 
+        for i, d in enumerate(documents):
             content = markdown.markdown(d.content)
             from bs4 import BeautifulSoup
 
@@ -61,11 +64,11 @@ class HTMLKnowledge(Knowledge):
                 tag.extract()
             documents[i].content = soup.get_text()
             documents[i].content = documents[i].content.replace("\n", " ")
-            i += 1
         return documents
 
     @classmethod
     def support_chunk_strategy(cls):
+        """Return support chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_SEPARATOR,
@@ -73,12 +76,15 @@ class HTMLKnowledge(Knowledge):
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
         return ChunkStrategy.CHUNK_BY_SIZE
 
     @classmethod
     def type(cls) -> KnowledgeType:
+        """Return knowledge type."""
         return KnowledgeType.DOCUMENT
 
     @classmethod
     def document_type(cls) -> DocumentType:
+        """Return document type."""
         return DocumentType.HTML
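
HTMLKnowledge._postprocess() now hoists the markdown import out of the loop and uses enumerate instead of a manual counter: it renders each document with markdown.markdown, strips unwanted tags via BeautifulSoup, and flattens newlines. A standalone sketch of the same idea; the diff does not show which tags are extract()-ed, so removing script/style here is an assumption:

# Standalone sketch of the post-processing idea in HTMLKnowledge._postprocess:
# render markdown to HTML, drop unwanted tags, keep plain text on one line.
# Extracting <script>/<style> specifically is an assumption; the diff does not
# show which tags the original code removes.
import markdown
from bs4 import BeautifulSoup


def flatten_markdown(text: str) -> str:
    html = markdown.markdown(text)
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style"]):
        tag.extract()
    return soup.get_text().replace("\n", " ")


print(flatten_markdown("# Title\n\nSome *body* text."))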

dbgpt/rag/knowledge/json.py

@@ -0,0 +1 @@
+"""Knowledge JSON."""

dbgpt/rag/knowledge/markdown.py

@@ -1,3 +1,4 @@
+"""Markdown Knowledge."""
 from typing import Any, List, Optional
 
 from dbgpt.rag.chunk import Document
@@ -10,7 +11,7 @@ from dbgpt.rag.knowledge.base import (
 
 
 class MarkdownKnowledge(Knowledge):
-    """Markdown Knowledge"""
+    """Markdown Knowledge."""
 
     def __init__(
         self,
@@ -20,12 +21,13 @@ class MarkdownKnowledge(Knowledge):
         loader: Optional[Any] = None,
         **kwargs: Any,
     ) -> None:
-        """Initialize with Knowledge arguments.
+        """Create Markdown Knowledge with Knowledge arguments.
 
         Args:
-            file_path:(Optional[str]) file path
-            knowledge_type:(KnowledgeType) knowledge type
-            encoding:(Optional[str]) encoding
-            loader:(Optional[Any]) loader
+            file_path(str, optional): file path
+            knowledge_type(KnowledgeType, optional): knowledge type
+            encoding(str, optional): csv encoding
+            loader(Any, optional): loader
         """
         self._path = file_path
         self._type = knowledge_type
@@ -33,10 +35,12 @@ class MarkdownKnowledge(Knowledge):
         self._encoding = encoding
 
     def _load(self) -> List[Document]:
-        """Load markdown document from loader"""
+        """Load markdown document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
+            if not self._path:
+                raise ValueError("file path is required")
             with open(self._path, encoding=self._encoding, errors="ignore") as f:
                 markdown_text = f.read()
                 metadata = {"source": self._path}
@@ -46,6 +50,7 @@ class MarkdownKnowledge(Knowledge):
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
+        """Return support chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_MARKDOWN_HEADER,
@@ -54,12 +59,15 @@ class MarkdownKnowledge(Knowledge):
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
         return ChunkStrategy.CHUNK_BY_MARKDOWN_HEADER
 
     @classmethod
     def type(cls) -> KnowledgeType:
+        """Return knowledge type."""
         return KnowledgeType.DOCUMENT
 
     @classmethod
     def document_type(cls) -> DocumentType:
+        """Return document type."""
         return DocumentType.MARKDOWN

dbgpt/rag/knowledge/pdf.py

@@ -1,3 +1,4 @@
+"""PDF Knowledge."""
 from typing import Any, List, Optional
 
 from dbgpt.rag.chunk import Document
@@ -10,21 +11,23 @@ from dbgpt.rag.knowledge.base import (
 
 
 class PDFKnowledge(Knowledge):
-    """PDF Knowledge"""
+    """PDF Knowledge."""
 
     def __init__(
         self,
         file_path: Optional[str] = None,
         knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
-        loader: Optional = None,
+        loader: Optional[Any] = None,
         language: Optional[str] = "zh",
         **kwargs: Any,
     ) -> None:
-        """Initialize with PDF Knowledge arguments.
+        """Create PDF Knowledge with Knowledge arguments.
 
         Args:
-            file_path:(Optional[str]) file path
-            knowledge_type:(KnowledgeType) knowledge type
-            loader:(Optional[Any]) loader
+            file_path(str, optional): file path
+            knowledge_type(KnowledgeType, optional): knowledge type
+            loader(Any, optional): loader
+            language(str, optional): language
         """
         self._path = file_path
         self._type = knowledge_type
@@ -32,7 +35,7 @@ class PDFKnowledge(Knowledge):
         self._language = language
 
     def _load(self) -> List[Document]:
-        """Load pdf document from loader"""
+        """Load pdf document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
@@ -40,11 +43,13 @@ class PDFKnowledge(Knowledge):
 
             pages = []
             documents = []
+            if not self._path:
+                raise ValueError("file path is required")
             with open(self._path, "rb") as file:
                 reader = pypdf.PdfReader(file)
                 for page_num in range(len(reader.pages)):
-                    page = reader.pages[page_num]
-                    pages.append((page.extract_text(), page_num))
+                    _page = reader.pages[page_num]
+                    pages.append((_page.extract_text(), page_num))
 
             # cleaned_pages = []
             for page, page_num in pages:
@@ -53,10 +58,9 @@ class PDFKnowledge(Knowledge):
                 cleaned_lines = []
                 for line in lines:
                     if self._language == "en":
-                        words = list(line)
+                        words = list(line)  # noqa: F841
                     else:
-                        words = line.split()
-                        digits = [word for word in words if any(i.isdigit() for i in word)]
+                        words = line.split()  # noqa: F841
                     cleaned_lines.append(line)
                 page = "\n".join(cleaned_lines)
                 # cleaned_pages.append(page)
@@ -69,6 +73,7 @@ class PDFKnowledge(Knowledge):
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
+        """Return support chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_PAGE,
@@ -77,12 +82,15 @@ class PDFKnowledge(Knowledge):
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
         return ChunkStrategy.CHUNK_BY_SIZE
 
     @classmethod
     def type(cls) -> KnowledgeType:
+        """Return knowledge type."""
         return KnowledgeType.DOCUMENT
 
     @classmethod
     def document_type(cls) -> DocumentType:
+        """Document type of PDF."""
         return DocumentType.PDF
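
PDFKnowledge reads the file page by page with pypdf and also lists CHUNK_BY_PAGE among its strategies. A short usage sketch; the path is hypothetical, and instantiating PageTextSplitter with no arguments is an assumption based on the empty parameter list declared for CHUNK_BY_PAGE in base.py:

# Sketch: page-oriented loading and splitting. "docs/manual.pdf" is hypothetical.
from dbgpt.rag.knowledge.base import ChunkStrategy
from dbgpt.rag.knowledge.pdf import PDFKnowledge

knowledge = PDFKnowledge(file_path="docs/manual.pdf", language="en")
documents = knowledge.load()  # Documents built from the extracted pages
# CHUNK_BY_PAGE declares no parameters, so match() is called bare here.
splitter = ChunkStrategy.CHUNK_BY_PAGE.match()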

dbgpt/rag/knowledge/pptx.py

@@ -1,3 +1,4 @@
+"""PPTX Knowledge."""
 from typing import Any, List, Optional
 
 from dbgpt.rag.chunk import Document
@@ -10,17 +11,18 @@ from dbgpt.rag.knowledge.base import (
 
 
 class PPTXKnowledge(Knowledge):
-    """PPTX Knowledge"""
+    """PPTX Knowledge."""
 
     def __init__(
         self,
         file_path: Optional[str] = None,
         knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
-        loader: Optional = None,
+        loader: Optional[Any] = None,
         language: Optional[str] = "zh",
         **kwargs: Any,
     ) -> None:
-        """Initialize with PDF Knowledge arguments.
+        """Create PPTX knowledge with PDF Knowledge arguments.
 
         Args:
             file_path:(Optional[str]) file path
             knowledge_type:(KnowledgeType) knowledge type
@@ -32,7 +34,7 @@ class PPTXKnowledge(Knowledge):
         self._language = language
 
     def _load(self) -> List[Document]:
-        """Load pdf document from loader"""
+        """Load pdf document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
@@ -53,6 +55,11 @@ class PPTXKnowledge(Knowledge):
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
+        """Return support chunk strategy.
+
+        Returns:
+            List[ChunkStrategy]: support chunk strategy
+        """
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_PAGE,
@@ -61,12 +68,27 @@ class PPTXKnowledge(Knowledge):
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy.
+
+        Returns:
+            ChunkStrategy: default chunk strategy
+        """
         return ChunkStrategy.CHUNK_BY_SIZE
 
     @classmethod
     def type(cls) -> KnowledgeType:
+        """Knowledge type of PPTX.
+
+        Returns:
+            KnowledgeType: knowledge type
+        """
         return KnowledgeType.DOCUMENT
 
     @classmethod
     def document_type(cls) -> DocumentType:
+        """Document type of PPTX.
+
+        Returns:
+            DocumentType: document type
+        """
         return DocumentType.PPTX

dbgpt/rag/knowledge/string.py

@@ -1,3 +1,4 @@
+"""String Knowledge."""
 from typing import Any, List, Optional
 
 from dbgpt.rag.chunk import Document
@@ -5,22 +6,23 @@ from dbgpt.rag.knowledge.base import ChunkStrategy, Knowledge, KnowledgeType
 
 
 class StringKnowledge(Knowledge):
-    """String Knowledge"""
+    """String Knowledge."""
 
     def __init__(
         self,
-        text: str = None,
+        text: str = "",
         knowledge_type: KnowledgeType = KnowledgeType.TEXT,
         encoding: Optional[str] = "utf-8",
         loader: Optional[Any] = None,
         **kwargs: Any,
     ) -> None:
-        """Initialize with Knowledge arguments.
+        """Create String knowledge parameters.
 
         Args:
-            text:(str) text
-            knowledge_type:(KnowledgeType) knowledge type
-            encoding:(encoding) csv encoding
-            loader:(loader) loader
+            text(str): text
+            knowledge_type(KnowledgeType): knowledge type
+            encoding(str): encoding
+            loader(Any): loader
         """
         self._text = text
         self._type = knowledge_type
@@ -28,21 +30,25 @@ class StringKnowledge(Knowledge):
         self._encoding = encoding
 
     def _load(self) -> List[Document]:
-        """load raw text from loader"""
+        """Load raw text from loader."""
         metadata = {"source": "raw text"}
         docs = [Document(content=self._text, metadata=metadata)]
         return docs
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
+        """Return support chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_SEPARATOR,
         ]
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
         return ChunkStrategy.CHUNK_BY_SIZE
 
     @classmethod
     def type(cls):
+        """Return knowledge type."""
         return KnowledgeType.TEXT
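
StringKnowledge simply wraps raw text in a single Document with {"source": "raw text"} metadata. A minimal sketch:

# Minimal sketch of StringKnowledge: one Document wrapping the raw text.
from dbgpt.rag.knowledge.string import StringKnowledge

knowledge = StringKnowledge(text="DB-GPT is a private LLM data framework.")
docs = knowledge.load()
print(docs[0].metadata)  # {'source': 'raw text'}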

dbgpt/rag/knowledge/txt.py

@@ -1,3 +1,4 @@
+"""TXT Knowledge."""
 from typing import Any, List, Optional
 
 import chardet
@@ -12,7 +13,7 @@ from dbgpt.rag.knowledge.base import (
 
 
 class TXTKnowledge(Knowledge):
-    """TXT Knowledge"""
+    """TXT Knowledge."""
 
     def __init__(
         self,
@@ -21,21 +22,24 @@ class TXTKnowledge(Knowledge):
         loader: Optional[Any] = None,
         **kwargs: Any,
     ) -> None:
-        """Initialize with Knowledge arguments.
+        """Create TXT Knowledge with Knowledge arguments.
 
         Args:
-            file_path:(Optional[str]) file path
-            knowledge_type:(KnowledgeType) knowledge type
-            loader:(Optional[Any]) loader
+            file_path(str, optional): file path
+            knowledge_type(KnowledgeType, optional): knowledge type
+            loader(Any, optional): loader
         """
         self._path = file_path
         self._type = knowledge_type
         self._loader = loader
 
     def _load(self) -> List[Document]:
-        """Load txt document from loader"""
+        """Load txt document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
+            if not self._path:
+                raise ValueError("file path is required")
             with open(self._path, "rb") as f:
                 raw_text = f.read()
                 result = chardet.detect(raw_text)
@@ -50,6 +54,7 @@ class TXTKnowledge(Knowledge):
 
     @classmethod
     def support_chunk_strategy(cls):
+        """Return support chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_SEPARATOR,
@@ -57,12 +62,15 @@ class TXTKnowledge(Knowledge):
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
         return ChunkStrategy.CHUNK_BY_SIZE
 
     @classmethod
     def type(cls) -> KnowledgeType:
+        """Return knowledge type."""
         return KnowledgeType.DOCUMENT
 
     @classmethod
     def document_type(cls) -> DocumentType:
+        """Return document type."""
         return DocumentType.TXT

dbgpt/rag/knowledge/url.py

@@ -1,3 +1,4 @@
+"""URL Knowledge."""
 from typing import Any, List, Optional
 
 from dbgpt.rag.chunk import Document
@@ -5,22 +6,25 @@ from dbgpt.rag.knowledge.base import ChunkStrategy, Knowledge, KnowledgeType
 
 
 class URLKnowledge(Knowledge):
     """URL Knowledge."""
 
     def __init__(
         self,
-        url: Optional[str] = None,
+        url: str = "",
         knowledge_type: KnowledgeType = KnowledgeType.URL,
         source_column: Optional[str] = None,
         encoding: Optional[str] = "utf-8",
         loader: Optional[Any] = None,
         **kwargs: Any,
     ) -> None:
-        """Initialize with Knowledge arguments.
+        """Create URL Knowledge with Knowledge arguments.
 
         Args:
-            url:(Optional[str]) url
-            knowledge_type:(KnowledgeType) knowledge type
-            source_column:(Optional[str]) source column
-            encoding:(Optional[str]) csv encoding
-            loader:(Optional[Any]) loader
+            url(str, optional): url
+            knowledge_type(KnowledgeType, optional): knowledge type
+            source_column(str, optional): source column
+            encoding(str, optional): csv encoding
+            loader(Any, optional): loader
         """
         self._path = url
         self._type = knowledge_type
@@ -29,7 +33,7 @@ class URLKnowledge(Knowledge):
         self._source_column = source_column
 
     def _load(self) -> List[Document]:
-        """Fetch URL document from loader"""
+        """Fetch URL document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
@@ -41,6 +45,7 @@ class URLKnowledge(Knowledge):
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
+        """Return support chunk strategy."""
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_SEPARATOR,
@@ -48,8 +53,10 @@ class URLKnowledge(Knowledge):
 
     @classmethod
     def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy."""
         return ChunkStrategy.CHUNK_BY_SIZE
 
     @classmethod
     def type(cls):
+        """Return knowledge type."""
         return KnowledgeType.URL
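
URLKnowledge stores the url in self._path and defers fetching to a loader; KnowledgeFactory wires this up when knowledge_type is URL. A usage sketch mirroring the docstring example in factory.py above; how the default loader fetches the page is not shown in this diff:

# Sketch mirroring the KnowledgeFactory docstring example above.
from dbgpt.rag.knowledge.base import KnowledgeType
from dbgpt.rag.knowledge.factory import KnowledgeFactory

url_knowledge = KnowledgeFactory.create(
    datasource="https://www.baidu.com", knowledge_type=KnowledgeType.URL
)
documents = url_knowledge.load()  # fetched via the configured loader (not shown here)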