chore: Add pylint for DB-GPT rag lib (#1267)

This commit is contained in:
Fangyin Cheng
2024-03-07 23:27:43 +08:00
committed by GitHub
parent aaaf34db17
commit 7446817340
70 changed files with 1135 additions and 587 deletions

View File

@@ -0,0 +1,29 @@
"""Module Of Knowledge."""
from .base import ChunkStrategy, Knowledge, KnowledgeType # noqa: F401
from .csv import CSVKnowledge # noqa: F401
from .docx import DocxKnowledge # noqa: F401
from .factory import KnowledgeFactory # noqa: F401
from .html import HTMLKnowledge # noqa: F401
from .markdown import MarkdownKnowledge # noqa: F401
from .pdf import PDFKnowledge # noqa: F401
from .pptx import PPTXKnowledge # noqa: F401
from .string import StringKnowledge # noqa: F401
from .txt import TXTKnowledge # noqa: F401
from .url import URLKnowledge # noqa: F401
# Public API of the knowledge package.
# Python only honours the lowercase ``__all__`` when resolving
# ``from package import *``; an uppercase ``__ALL__`` is just an ordinary
# module variable and has no effect on what is exported.
__all__ = [
    "KnowledgeFactory",
    "Knowledge",
    "KnowledgeType",
    "ChunkStrategy",
    "CSVKnowledge",
    "DocxKnowledge",
    "HTMLKnowledge",
    "MarkdownKnowledge",
    "PDFKnowledge",
    "PPTXKnowledge",
    "StringKnowledge",
    "TXTKnowledge",
    "URLKnowledge",
]

# Backward-compatible alias for any code that read the old attribute name.
__ALL__ = __all__

View File

@@ -1,19 +1,23 @@
"""Module for Knowledge Base."""
from abc import ABC, abstractmethod
from enum import Enum
from typing import Any, List, Optional
from typing import Any, List, Optional, Tuple, Type
from dbgpt.rag.chunk import Document
from dbgpt.rag.text_splitter.text_splitter import (
CharacterTextSplitter,
MarkdownHeaderTextSplitter,
PageTextSplitter,
ParagraphTextSplitter,
RecursiveCharacterTextSplitter,
SeparatorTextSplitter,
TextSplitter,
)
class DocumentType(Enum):
"""Document Type Enum."""
PDF = "pdf"
CSV = "csv"
MARKDOWN = "md"
@@ -24,27 +28,40 @@ class DocumentType(Enum):
class KnowledgeType(Enum):
"""Knowledge Type Enum."""
DOCUMENT = "DOCUMENT"
URL = "URL"
TEXT = "TEXT"
@property
def type(self):
"""Get type."""
return DocumentType
@classmethod
def get_by_value(cls, value):
"""Get Enum member by value"""
def get_by_value(cls, value) -> "KnowledgeType":
"""Get Enum member by value.
Args:
value(any): value
Returns:
KnowledgeType: Enum member
"""
for member in cls:
if member.value == value:
return member
raise ValueError(f"{value} is not a valid value for {cls.__name__}")
class ChunkStrategy(Enum):
"""chunk strategy"""
_STRATEGY_ENUM_TYPE = Tuple[Type[TextSplitter], List, str, str]
CHUNK_BY_SIZE = (
class ChunkStrategy(Enum):
"""Chunk Strategy Enum."""
CHUNK_BY_SIZE: _STRATEGY_ENUM_TYPE = (
RecursiveCharacterTextSplitter,
[
{
@@ -63,8 +80,13 @@ class ChunkStrategy(Enum):
"chunk size",
"split document by chunk size",
)
CHUNK_BY_PAGE = (PageTextSplitter, [], "page", "split document by page")
CHUNK_BY_PARAGRAPH = (
CHUNK_BY_PAGE: _STRATEGY_ENUM_TYPE = (
PageTextSplitter,
[],
"page",
"split document by page",
)
CHUNK_BY_PARAGRAPH: _STRATEGY_ENUM_TYPE = (
ParagraphTextSplitter,
[
{
@@ -77,7 +99,7 @@ class ChunkStrategy(Enum):
"paragraph",
"split document by paragraph",
)
CHUNK_BY_SEPARATOR = (
CHUNK_BY_SEPARATOR: _STRATEGY_ENUM_TYPE = (
SeparatorTextSplitter,
[
{
@@ -90,13 +112,14 @@ class ChunkStrategy(Enum):
"param_name": "enable_merge",
"param_type": "boolean",
"default_value": False,
"description": "Whether to merge according to the chunk_size after splitting by the separator.",
"description": "Whether to merge according to the chunk_size after "
"splitting by the separator.",
},
],
"separator",
"split document by separator",
)
CHUNK_BY_MARKDOWN_HEADER = (
CHUNK_BY_MARKDOWN_HEADER: _STRATEGY_ENUM_TYPE = (
MarkdownHeaderTextSplitter,
[],
"markdown header",
@@ -104,24 +127,26 @@ class ChunkStrategy(Enum):
)
def __init__(self, splitter_class, parameters, alias, description):
"""Create a new ChunkStrategy with the given splitter_class."""
self.splitter_class = splitter_class
self.parameters = parameters
self.alias = alias
self.description = description
def match(self, *args, **kwargs):
def match(self, *args, **kwargs) -> TextSplitter:
"""Match and build splitter."""
kwargs = {k: v for k, v in kwargs.items() if v is not None}
return self.value[0](*args, **kwargs)
class Knowledge(ABC):
type: KnowledgeType = None
"""Knowledge Base Class."""
def __init__(
self,
path: Optional[str] = None,
knowledge_type: Optional[KnowledgeType] = None,
data_loader: Optional = None,
data_loader: Optional[Any] = None,
**kwargs: Any,
) -> None:
"""Initialize with Knowledge arguments."""
@@ -130,30 +155,31 @@ class Knowledge(ABC):
self._data_loader = data_loader
def load(self):
"""Load knowledge from data_loader"""
"""Load knowledge from data_loader."""
documents = self._load()
return self._postprocess(documents)
@classmethod
@abstractmethod
def type(cls) -> KnowledgeType:
"""Get knowledge type"""
"""Get knowledge type."""
@classmethod
def document_type(cls) -> Any:
"""Get document type"""
"""Get document type."""
return None
def _postprocess(self, docs: List[Document]) -> List[Document]:
"""Post process knowledge from data_loader"""
"""Post process knowledge from data_loader."""
return docs
@abstractmethod
def _load(self):
"""Preprocess knowledge from data_loader"""
"""Preprocess knowledge from data_loader."""
@classmethod
def support_chunk_strategy(cls) -> List[ChunkStrategy]:
"""support chunk strategy"""
"""Return supported chunk strategy."""
return [
ChunkStrategy.CHUNK_BY_SIZE,
ChunkStrategy.CHUNK_BY_PAGE,
@@ -162,11 +188,11 @@ class Knowledge(ABC):
ChunkStrategy.CHUNK_BY_SEPARATOR,
]
def default_chunk_strategy(self) -> ChunkStrategy:
return ChunkStrategy.CHUNK_BY_SIZE
@classmethod
def default_chunk_strategy(cls) -> ChunkStrategy:
"""Return default chunk strategy.
def support_chunk_strategy(self):
return [
ChunkStrategy.CHUNK_BY_SIZE,
ChunkStrategy.CHUNK_BY_SEPARATOR,
]
Returns:
ChunkStrategy: default chunk strategy
"""
return ChunkStrategy.CHUNK_BY_SIZE

View File

@@ -1,3 +1,4 @@
"""CSV Knowledge."""
import csv
from typing import Any, List, Optional
@@ -11,7 +12,7 @@ from dbgpt.rag.knowledge.base import (
class CSVKnowledge(Knowledge):
"""CSV Knowledge"""
"""CSV Knowledge."""
def __init__(
self,
@@ -22,13 +23,14 @@ class CSVKnowledge(Knowledge):
loader: Optional[Any] = None,
**kwargs: Any,
) -> None:
"""Initialize csv with Knowledge arguments.
"""Create CSV Knowledge with Knowledge arguments.
Args:
file_path:(Optional[str]) file path
knowledge_type:(KnowledgeType) knowledge type
source_column:(Optional[str]) source column
encoding:(Optional[str]) csv encoding
loader:(Optional[Any]) loader
file_path(str, optional): file path
knowledge_type(KnowledgeType, optional): knowledge type
source_column(str, optional): source column
encoding(str, optional): csv encoding
loader(Any, optional): loader
"""
self._path = file_path
self._type = knowledge_type
@@ -37,11 +39,13 @@ class CSVKnowledge(Knowledge):
self._source_column = source_column
def _load(self) -> List[Document]:
"""Load csv document from loader"""
"""Load csv document from loader."""
if self._loader:
documents = self._loader.load()
else:
docs = []
if not self._path:
raise ValueError("file path is required")
with open(self._path, newline="", encoding=self._encoding) as csvfile:
csv_reader = csv.DictReader(csvfile)
for i, row in enumerate(csv_reader):
@@ -59,7 +63,8 @@ class CSVKnowledge(Knowledge):
)
except KeyError:
raise ValueError(
f"Source column '{self._source_column}' not found in CSV file."
f"Source column '{self._source_column}' not found in CSV "
f"file."
)
metadata = {"source": source, "row": i}
doc = Document(content=content, metadata=metadata)
@@ -70,6 +75,7 @@ class CSVKnowledge(Knowledge):
@classmethod
def support_chunk_strategy(cls) -> List[ChunkStrategy]:
"""Return support chunk strategy."""
return [
ChunkStrategy.CHUNK_BY_SIZE,
ChunkStrategy.CHUNK_BY_SEPARATOR,
@@ -77,12 +83,15 @@ class CSVKnowledge(Knowledge):
@classmethod
def default_chunk_strategy(cls) -> ChunkStrategy:
"""Return default chunk strategy."""
return ChunkStrategy.CHUNK_BY_SIZE
@classmethod
def type(cls) -> KnowledgeType:
"""Knowledge type of CSV."""
return KnowledgeType.DOCUMENT
@classmethod
def document_type(cls) -> DocumentType:
"""Return document type."""
return DocumentType.CSV

View File

@@ -1,3 +1,4 @@
"""Docx Knowledge."""
from typing import Any, List, Optional
import docx
@@ -12,7 +13,7 @@ from dbgpt.rag.knowledge.base import (
class DocxKnowledge(Knowledge):
"""Docx Knowledge"""
"""Docx Knowledge."""
def __init__(
self,
@@ -22,12 +23,13 @@ class DocxKnowledge(Knowledge):
loader: Optional[Any] = None,
**kwargs: Any,
) -> None:
"""Initialize with Knowledge arguments.
"""Create Docx Knowledge with Knowledge arguments.
Args:
file_path:(Optional[str]) file path
knowledge_type:(KnowledgeType) knowledge type
encoding:(Optional[str]) csv encoding
loader:(Optional[Any]) loader
file_path(str, optional): file path
knowledge_type(KnowledgeType, optional): knowledge type
encoding(str, optional): file encoding
loader(Any, optional): loader
"""
self._path = file_path
self._type = knowledge_type
@@ -35,7 +37,7 @@ class DocxKnowledge(Knowledge):
self._encoding = encoding
def _load(self) -> List[Document]:
"""Load docx document from loader"""
"""Load docx document from loader."""
if self._loader:
documents = self._loader.load()
else:
@@ -54,6 +56,7 @@ class DocxKnowledge(Knowledge):
@classmethod
def support_chunk_strategy(cls) -> List[ChunkStrategy]:
"""Return support chunk strategy."""
return [
ChunkStrategy.CHUNK_BY_SIZE,
ChunkStrategy.CHUNK_BY_PARAGRAPH,
@@ -62,12 +65,15 @@ class DocxKnowledge(Knowledge):
@classmethod
def default_chunk_strategy(cls) -> ChunkStrategy:
"""Return default chunk strategy."""
return ChunkStrategy.CHUNK_BY_SIZE
@classmethod
def type(cls) -> KnowledgeType:
"""Return knowledge type."""
return KnowledgeType.DOCUMENT
@classmethod
def document_type(cls) -> DocumentType:
"""Return document type."""
return DocumentType.DOCX

View File

@@ -1,4 +1,5 @@
from typing import List, Optional
"""Knowledge Factory to create knowledge from file path and url."""
from typing import List, Optional, Type
from dbgpt.rag.knowledge.base import Knowledge, KnowledgeType
from dbgpt.rag.knowledge.string import StringKnowledge
@@ -6,17 +7,18 @@ from dbgpt.rag.knowledge.url import URLKnowledge
class KnowledgeFactory:
"""Knowledge Factory to create knowledge from file path and url"""
"""Knowledge Factory to create knowledge from file path and url."""
def __init__(
self,
file_path: Optional[str] = None,
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
):
"""Initialize with Knowledge Factory arguments.
"""Create Knowledge Factory with file path and knowledge type.
Args:
param file_path: path of the file to convert
param knowledge_type: type of knowledge
file_path(str, optional): file path
knowledge_type(KnowledgeType, optional): knowledge type
"""
self._file_path = file_path
self._knowledge_type = knowledge_type
@@ -24,16 +26,16 @@ class KnowledgeFactory:
@classmethod
def create(
cls,
datasource: Optional[str] = None,
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
datasource: str = "",
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
):
"""create knowledge from file path, url or text
"""Create knowledge from file path, url or text.
Args:
datasource: path of the file to convert
knowledge_type: type of knowledge
Examples:
.. code-block:: python
from dbgpt.rag.knowledge.factory import KnowledgeFactory
@@ -62,17 +64,16 @@ class KnowledgeFactory:
@classmethod
def from_file_path(
cls,
file_path: Optional[str] = None,
file_path: str = "",
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
) -> Knowledge:
"""Create knowledge from path
"""Create knowledge from path.
Args:
param file_path: path of the file to convert
param knowledge_type: type of knowledge
Examples:
.. code-block:: python
from dbgpt.rag.knowledge.factory import KnowledgeFactory
@@ -81,7 +82,6 @@ class KnowledgeFactory:
datasource="path/to/document.pdf",
knowledge_type=KnowledgeType.DOCUMENT,
)
"""
factory = cls(file_path=file_path, knowledge_type=knowledge_type)
return factory._select_document_knowledge(
@@ -90,17 +90,16 @@ class KnowledgeFactory:
@staticmethod
def from_url(
url: Optional[str] = None,
knowledge_type: Optional[KnowledgeType] = KnowledgeType.URL,
url: str = "",
knowledge_type: KnowledgeType = KnowledgeType.URL,
) -> Knowledge:
"""Create knowledge from url
"""Create knowledge from url.
Args:
param url: url of the file to convert
param knowledge_type: type of knowledge
Examples:
.. code-block:: python
from dbgpt.rag.knowledge.factory import KnowledgeFactory
@@ -108,7 +107,6 @@ class KnowledgeFactory:
url_knowledge = KnowledgeFactory.create(
datasource="https://www.baidu.com", knowledge_type=KnowledgeType.URL
)
"""
return URLKnowledge(
url=url,
@@ -117,10 +115,11 @@ class KnowledgeFactory:
@staticmethod
def from_text(
text: str = None,
knowledge_type: Optional[KnowledgeType] = KnowledgeType.TEXT,
text: str = "",
knowledge_type: KnowledgeType = KnowledgeType.TEXT,
) -> Knowledge:
"""Create knowledge from text
"""Create knowledge from text.
Args:
param text: text to convert
param knowledge_type: type of knowledge
@@ -131,7 +130,7 @@ class KnowledgeFactory:
)
def _select_document_knowledge(self, **kwargs):
"""Select document knowledge from file path"""
"""Select document knowledge from file path."""
extension = self._file_path.rsplit(".", 1)[-1]
knowledge_classes = self._get_knowledge_subclasses()
implementation = None
@@ -144,26 +143,26 @@ class KnowledgeFactory:
@classmethod
def all_types(cls):
"""get all knowledge types"""
"""Get all knowledge types."""
return [knowledge.type().value for knowledge in cls._get_knowledge_subclasses()]
@classmethod
def subclasses(cls):
"""get all knowledge subclasses"""
def subclasses(cls) -> List["Type[Knowledge]"]:
"""Get all knowledge subclasses."""
return cls._get_knowledge_subclasses()
@staticmethod
def _get_knowledge_subclasses() -> List[Knowledge]:
"""get all knowledge subclasses"""
from dbgpt.rag.knowledge.base import Knowledge
from dbgpt.rag.knowledge.csv import CSVKnowledge
from dbgpt.rag.knowledge.docx import DocxKnowledge
from dbgpt.rag.knowledge.html import HTMLKnowledge
from dbgpt.rag.knowledge.markdown import MarkdownKnowledge
from dbgpt.rag.knowledge.pdf import PDFKnowledge
from dbgpt.rag.knowledge.pptx import PPTXKnowledge
from dbgpt.rag.knowledge.string import StringKnowledge
from dbgpt.rag.knowledge.txt import TXTKnowledge
from dbgpt.rag.knowledge.url import URLKnowledge
def _get_knowledge_subclasses() -> List["Type[Knowledge]"]:
"""Get all knowledge subclasses."""
from dbgpt.rag.knowledge.base import Knowledge # noqa: F401
from dbgpt.rag.knowledge.csv import CSVKnowledge # noqa: F401
from dbgpt.rag.knowledge.docx import DocxKnowledge # noqa: F401
from dbgpt.rag.knowledge.html import HTMLKnowledge # noqa: F401
from dbgpt.rag.knowledge.markdown import MarkdownKnowledge # noqa: F401
from dbgpt.rag.knowledge.pdf import PDFKnowledge # noqa: F401
from dbgpt.rag.knowledge.pptx import PPTXKnowledge # noqa: F401
from dbgpt.rag.knowledge.string import StringKnowledge # noqa: F401
from dbgpt.rag.knowledge.txt import TXTKnowledge # noqa: F401
from dbgpt.rag.knowledge.url import URLKnowledge # noqa: F401
return Knowledge.__subclasses__()

View File

@@ -1,3 +1,4 @@
"""HTML Knowledge."""
from typing import Any, List, Optional
import chardet
@@ -12,7 +13,7 @@ from dbgpt.rag.knowledge.base import (
class HTMLKnowledge(Knowledge):
"""HTML Knowledge"""
"""HTML Knowledge."""
def __init__(
self,
@@ -21,21 +22,24 @@ class HTMLKnowledge(Knowledge):
loader: Optional[Any] = None,
**kwargs: Any,
) -> None:
"""Initialize with Knowledge arguments.
"""Create HTML Knowledge with Knowledge arguments.
Args:
file_path:(Optional[str]) file path
knowledge_type:(KnowledgeType) knowledge type
loader:(Optional[Any]) loader
file_path(str, optional): file path
knowledge_type(KnowledgeType, optional): knowledge type
loader(Any, optional): loader
"""
self._path = file_path
self._type = knowledge_type
self._loader = loader
def _load(self) -> List[Document]:
"""Load html document from loader"""
"""Load html document from loader."""
if self._loader:
documents = self._loader.load()
else:
if not self._path:
raise ValueError("file path is required")
with open(self._path, "rb") as f:
raw_text = f.read()
result = chardet.detect(raw_text)
@@ -49,10 +53,9 @@ class HTMLKnowledge(Knowledge):
return [Document.langchain2doc(lc_document) for lc_document in documents]
def _postprocess(self, documents: List[Document]):
i = 0
for d in documents:
import markdown
import markdown
for i, d in enumerate(documents):
content = markdown.markdown(d.content)
from bs4 import BeautifulSoup
@@ -61,11 +64,11 @@ class HTMLKnowledge(Knowledge):
tag.extract()
documents[i].content = soup.get_text()
documents[i].content = documents[i].content.replace("\n", " ")
i += 1
return documents
@classmethod
def support_chunk_strategy(cls):
"""Return support chunk strategy."""
return [
ChunkStrategy.CHUNK_BY_SIZE,
ChunkStrategy.CHUNK_BY_SEPARATOR,
@@ -73,12 +76,15 @@ class HTMLKnowledge(Knowledge):
@classmethod
def default_chunk_strategy(cls) -> ChunkStrategy:
"""Return default chunk strategy."""
return ChunkStrategy.CHUNK_BY_SIZE
@classmethod
def type(cls) -> KnowledgeType:
"""Return knowledge type."""
return KnowledgeType.DOCUMENT
@classmethod
def document_type(cls) -> DocumentType:
"""Return document type."""
return DocumentType.HTML

View File

@@ -0,0 +1 @@
"""Knowledge JSON."""

View File

@@ -1,3 +1,4 @@
"""Markdown Knowledge."""
from typing import Any, List, Optional
from dbgpt.rag.chunk import Document
@@ -10,7 +11,7 @@ from dbgpt.rag.knowledge.base import (
class MarkdownKnowledge(Knowledge):
"""Markdown Knowledge"""
"""Markdown Knowledge."""
def __init__(
self,
@@ -20,12 +21,13 @@ class MarkdownKnowledge(Knowledge):
loader: Optional[Any] = None,
**kwargs: Any,
) -> None:
"""Initialize with Knowledge arguments.
"""Create Markdown Knowledge with Knowledge arguments.
Args:
file_path:(Optional[str]) file path
knowledge_type:(KnowledgeType) knowledge type
encoding:(Optional[str]) encoding
loader:(Optional[Any]) loader
file_path(str, optional): file path
knowledge_type(KnowledgeType, optional): knowledge type
encoding(str, optional): file encoding
loader(Any, optional): loader
"""
self._path = file_path
self._type = knowledge_type
@@ -33,10 +35,12 @@ class MarkdownKnowledge(Knowledge):
self._encoding = encoding
def _load(self) -> List[Document]:
"""Load markdown document from loader"""
"""Load markdown document from loader."""
if self._loader:
documents = self._loader.load()
else:
if not self._path:
raise ValueError("file path is required")
with open(self._path, encoding=self._encoding, errors="ignore") as f:
markdown_text = f.read()
metadata = {"source": self._path}
@@ -46,6 +50,7 @@ class MarkdownKnowledge(Knowledge):
@classmethod
def support_chunk_strategy(cls) -> List[ChunkStrategy]:
"""Return support chunk strategy."""
return [
ChunkStrategy.CHUNK_BY_SIZE,
ChunkStrategy.CHUNK_BY_MARKDOWN_HEADER,
@@ -54,12 +59,15 @@ class MarkdownKnowledge(Knowledge):
@classmethod
def default_chunk_strategy(cls) -> ChunkStrategy:
"""Return default chunk strategy."""
return ChunkStrategy.CHUNK_BY_MARKDOWN_HEADER
@classmethod
def type(cls) -> KnowledgeType:
"""Return knowledge type."""
return KnowledgeType.DOCUMENT
@classmethod
def document_type(cls) -> DocumentType:
"""Return document type."""
return DocumentType.MARKDOWN

View File

@@ -1,3 +1,4 @@
"""PDF Knowledge."""
from typing import Any, List, Optional
from dbgpt.rag.chunk import Document
@@ -10,21 +11,23 @@ from dbgpt.rag.knowledge.base import (
class PDFKnowledge(Knowledge):
"""PDF Knowledge"""
"""PDF Knowledge."""
def __init__(
self,
file_path: Optional[str] = None,
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
loader: Optional = None,
loader: Optional[Any] = None,
language: Optional[str] = "zh",
**kwargs: Any,
) -> None:
"""Initialize with PDF Knowledge arguments.
"""Create PDF Knowledge with Knowledge arguments.
Args:
file_path:(Optional[str]) file path
knowledge_type:(KnowledgeType) knowledge type
loader:(Optional[Any]) loader
file_path(str, optional): file path
knowledge_type(KnowledgeType, optional): knowledge type
loader(Any, optional): loader
language(str, optional): language
"""
self._path = file_path
self._type = knowledge_type
@@ -32,7 +35,7 @@ class PDFKnowledge(Knowledge):
self._language = language
def _load(self) -> List[Document]:
"""Load pdf document from loader"""
"""Load pdf document from loader."""
if self._loader:
documents = self._loader.load()
else:
@@ -40,11 +43,13 @@ class PDFKnowledge(Knowledge):
pages = []
documents = []
if not self._path:
raise ValueError("file path is required")
with open(self._path, "rb") as file:
reader = pypdf.PdfReader(file)
for page_num in range(len(reader.pages)):
page = reader.pages[page_num]
pages.append((page.extract_text(), page_num))
_page = reader.pages[page_num]
pages.append((_page.extract_text(), page_num))
# cleaned_pages = []
for page, page_num in pages:
@@ -53,10 +58,9 @@ class PDFKnowledge(Knowledge):
cleaned_lines = []
for line in lines:
if self._language == "en":
words = list(line)
words = list(line) # noqa: F841
else:
words = line.split()
digits = [word for word in words if any(i.isdigit() for i in word)]
words = line.split() # noqa: F841
cleaned_lines.append(line)
page = "\n".join(cleaned_lines)
# cleaned_pages.append(page)
@@ -69,6 +73,7 @@ class PDFKnowledge(Knowledge):
@classmethod
def support_chunk_strategy(cls) -> List[ChunkStrategy]:
"""Return support chunk strategy."""
return [
ChunkStrategy.CHUNK_BY_SIZE,
ChunkStrategy.CHUNK_BY_PAGE,
@@ -77,12 +82,15 @@ class PDFKnowledge(Knowledge):
@classmethod
def default_chunk_strategy(cls) -> ChunkStrategy:
"""Return default chunk strategy."""
return ChunkStrategy.CHUNK_BY_SIZE
@classmethod
def type(cls) -> KnowledgeType:
"""Return knowledge type."""
return KnowledgeType.DOCUMENT
@classmethod
def document_type(cls) -> DocumentType:
"""Document type of PDF."""
return DocumentType.PDF

View File

@@ -1,3 +1,4 @@
"""PPTX Knowledge."""
from typing import Any, List, Optional
from dbgpt.rag.chunk import Document
@@ -10,17 +11,18 @@ from dbgpt.rag.knowledge.base import (
class PPTXKnowledge(Knowledge):
"""PPTX Knowledge"""
"""PPTX Knowledge."""
def __init__(
self,
file_path: Optional[str] = None,
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
loader: Optional = None,
loader: Optional[Any] = None,
language: Optional[str] = "zh",
**kwargs: Any,
) -> None:
"""Initialize with PDF Knowledge arguments.
"""Create PPTX Knowledge with Knowledge arguments.
Args:
file_path:(Optional[str]) file path
knowledge_type:(KnowledgeType) knowledge type
@@ -32,7 +34,7 @@ class PPTXKnowledge(Knowledge):
self._language = language
def _load(self) -> List[Document]:
"""Load pdf document from loader"""
"""Load pptx document from loader."""
if self._loader:
documents = self._loader.load()
else:
@@ -53,6 +55,11 @@ class PPTXKnowledge(Knowledge):
@classmethod
def support_chunk_strategy(cls) -> List[ChunkStrategy]:
"""Return support chunk strategy.
Returns:
List[ChunkStrategy]: support chunk strategy
"""
return [
ChunkStrategy.CHUNK_BY_SIZE,
ChunkStrategy.CHUNK_BY_PAGE,
@@ -61,12 +68,27 @@ class PPTXKnowledge(Knowledge):
@classmethod
def default_chunk_strategy(cls) -> ChunkStrategy:
"""Return default chunk strategy.
Returns:
ChunkStrategy: default chunk strategy
"""
return ChunkStrategy.CHUNK_BY_SIZE
@classmethod
def type(cls) -> KnowledgeType:
"""Knowledge type of PPTX.
Returns:
KnowledgeType: knowledge type
"""
return KnowledgeType.DOCUMENT
@classmethod
def document_type(cls) -> DocumentType:
"""Document type of PPTX.
Returns:
DocumentType: document type
"""
return DocumentType.PPTX

View File

@@ -1,3 +1,4 @@
"""String Knowledge."""
from typing import Any, List, Optional
from dbgpt.rag.chunk import Document
@@ -5,22 +6,23 @@ from dbgpt.rag.knowledge.base import ChunkStrategy, Knowledge, KnowledgeType
class StringKnowledge(Knowledge):
"""String Knowledge"""
"""String Knowledge."""
def __init__(
self,
text: str = None,
text: str = "",
knowledge_type: KnowledgeType = KnowledgeType.TEXT,
encoding: Optional[str] = "utf-8",
loader: Optional[Any] = None,
**kwargs: Any,
) -> None:
"""Initialize with Knowledge arguments.
"""Create String knowledge parameters.
Args:
text:(str) text
knowledge_type:(KnowledgeType) knowledge type
encoding:(encoding) csv encoding
loader:(loader) loader
text(str): text
knowledge_type(KnowledgeType): knowledge type
encoding(str): encoding
loader(Any): loader
"""
self._text = text
self._type = knowledge_type
@@ -28,21 +30,25 @@ class StringKnowledge(Knowledge):
self._encoding = encoding
def _load(self) -> List[Document]:
"""load raw text from loader"""
"""Load raw text from loader."""
metadata = {"source": "raw text"}
docs = [Document(content=self._text, metadata=metadata)]
return docs
@classmethod
def support_chunk_strategy(cls) -> List[ChunkStrategy]:
"""Return support chunk strategy."""
return [
ChunkStrategy.CHUNK_BY_SIZE,
ChunkStrategy.CHUNK_BY_SEPARATOR,
]
@classmethod
def default_chunk_strategy(cls) -> ChunkStrategy:
"""Return default chunk strategy."""
return ChunkStrategy.CHUNK_BY_SIZE
@classmethod
def type(cls):
"""Return knowledge type."""
return KnowledgeType.TEXT

View File

@@ -1,3 +1,4 @@
"""TXT Knowledge."""
from typing import Any, List, Optional
import chardet
@@ -12,7 +13,7 @@ from dbgpt.rag.knowledge.base import (
class TXTKnowledge(Knowledge):
"""TXT Knowledge"""
"""TXT Knowledge."""
def __init__(
self,
@@ -21,21 +22,24 @@ class TXTKnowledge(Knowledge):
loader: Optional[Any] = None,
**kwargs: Any,
) -> None:
"""Initialize with Knowledge arguments.
"""Create TXT Knowledge with Knowledge arguments.
Args:
file_path:(Optional[str]) file path
knowledge_type:(KnowledgeType) knowledge type
loader:(Optional[Any]) loader
file_path(str, optional): file path
knowledge_type(KnowledgeType, optional): knowledge type
loader(Any, optional): loader
"""
self._path = file_path
self._type = knowledge_type
self._loader = loader
def _load(self) -> List[Document]:
"""Load txt document from loader"""
"""Load txt document from loader."""
if self._loader:
documents = self._loader.load()
else:
if not self._path:
raise ValueError("file path is required")
with open(self._path, "rb") as f:
raw_text = f.read()
result = chardet.detect(raw_text)
@@ -50,6 +54,7 @@ class TXTKnowledge(Knowledge):
@classmethod
def support_chunk_strategy(cls):
"""Return support chunk strategy."""
return [
ChunkStrategy.CHUNK_BY_SIZE,
ChunkStrategy.CHUNK_BY_SEPARATOR,
@@ -57,12 +62,15 @@ class TXTKnowledge(Knowledge):
@classmethod
def default_chunk_strategy(cls) -> ChunkStrategy:
"""Return default chunk strategy."""
return ChunkStrategy.CHUNK_BY_SIZE
@classmethod
def type(cls) -> KnowledgeType:
"""Return knowledge type."""
return KnowledgeType.DOCUMENT
@classmethod
def document_type(cls) -> DocumentType:
"""Return document type."""
return DocumentType.TXT

View File

@@ -1,3 +1,4 @@
"""URL Knowledge."""
from typing import Any, List, Optional
from dbgpt.rag.chunk import Document
@@ -5,22 +6,25 @@ from dbgpt.rag.knowledge.base import ChunkStrategy, Knowledge, KnowledgeType
class URLKnowledge(Knowledge):
"""URL Knowledge."""
def __init__(
self,
url: Optional[str] = None,
url: str = "",
knowledge_type: KnowledgeType = KnowledgeType.URL,
source_column: Optional[str] = None,
encoding: Optional[str] = "utf-8",
loader: Optional[Any] = None,
**kwargs: Any,
) -> None:
"""Initialize with Knowledge arguments.
"""Create URL Knowledge with Knowledge arguments.
Args:
url:(Optional[str]) url
knowledge_type:(KnowledgeType) knowledge type
source_column:(Optional[str]) source column
encoding:(Optional[str]) csv encoding
loader:(Optional[Any]) loader
url(str, optional): url
knowledge_type(KnowledgeType, optional): knowledge type
source_column(str, optional): source column
encoding(str, optional): encoding
loader(Any, optional): loader
"""
self._path = url
self._type = knowledge_type
@@ -29,7 +33,7 @@ class URLKnowledge(Knowledge):
self._source_column = source_column
def _load(self) -> List[Document]:
"""Fetch URL document from loader"""
"""Fetch URL document from loader."""
if self._loader:
documents = self._loader.load()
else:
@@ -41,6 +45,7 @@ class URLKnowledge(Knowledge):
@classmethod
def support_chunk_strategy(cls) -> List[ChunkStrategy]:
"""Return support chunk strategy."""
return [
ChunkStrategy.CHUNK_BY_SIZE,
ChunkStrategy.CHUNK_BY_SEPARATOR,
@@ -48,8 +53,10 @@ class URLKnowledge(Knowledge):
@classmethod
def default_chunk_strategy(cls) -> ChunkStrategy:
"""Return default chunk strategy."""
return ChunkStrategy.CHUNK_BY_SIZE
@classmethod
def type(cls):
"""Return knowledge type."""
return KnowledgeType.URL