DB-GPT/dbgpt/rag/knowledge/html.py

98 lines
3.0 KiB
Python

"""HTML Knowledge."""
from typing import Any, Dict, List, Optional, Union
import chardet
from dbgpt.core import Document
from dbgpt.rag.knowledge.base import (
ChunkStrategy,
DocumentType,
Knowledge,
KnowledgeType,
)
class HTMLKnowledge(Knowledge):
    """HTML Knowledge.

    Reads an HTML file (or delegates to a caller-supplied loader) and, in the
    post-processing step, reduces each document to plain text.
    """

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
        loader: Optional[Any] = None,
        metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
        **kwargs: Any,
    ) -> None:
        """Create HTML Knowledge with Knowledge arguments.

        Args:
            file_path(str, optional): file path
            knowledge_type(KnowledgeType, optional): knowledge type
            loader(Any, optional): loader
            metadata(Dict, optional): extra metadata, forwarded to the base
                class
            **kwargs: forwarded unchanged to the base class constructor
        """
        super().__init__(
            path=file_path,
            knowledge_type=knowledge_type,
            data_loader=loader,
            metadata=metadata,
            **kwargs,
        )

    def _load(self) -> List[Document]:
        """Load html document from loader.

        When a loader is set, its ``load()`` result is converted with
        ``Document.langchain2doc``. Otherwise the file at ``self._path`` is
        read as bytes and decoded with the encoding ``chardet`` detects,
        falling back to UTF-8 when detection yields no encoding.

        Returns:
            List[Document]: the loaded documents.

        Raises:
            ValueError: if no loader is set and no file path was given.
        """
        if self._loader:
            documents = self._loader.load()
            return [Document.langchain2doc(lc_document) for lc_document in documents]

        if not self._path:
            raise ValueError("file path is required")

        # Read bytes so the encoding can be sniffed before decoding.
        with open(self._path, "rb") as f:
            raw_text = f.read()

        encoding = chardet.detect(raw_text)["encoding"]
        if encoding is None:
            encoding = "utf-8"
        text = raw_text.decode(encoding)

        metadata = {"source": self._path}
        if self._metadata:
            # Caller-supplied metadata wins over the default "source" entry.
            metadata.update(self._metadata)  # type: ignore
        return [Document(content=text, metadata=metadata)]

    def _postprocess(self, documents: List[Document]) -> List[Document]:
        """Reduce each document's content to single-line plain text.

        The content is passed through ``markdown.markdown``, parsed with
        BeautifulSoup, stripped of ``meta`` tags and ``i.fa`` icon elements,
        converted to text, and has its newlines replaced by spaces.

        Args:
            documents(List[Document]): documents to clean; modified in place.

        Returns:
            List[Document]: the same list that was passed in.
        """
        # Imported lazily (as before) so the optional dependencies are only
        # needed when HTML is actually processed; hoisted out of the loop.
        import markdown
        from bs4 import BeautifulSoup

        for doc in documents:
            content = markdown.markdown(doc.content)
            soup = BeautifulSoup(content, "html.parser")
            for tag in soup.find_all(["!doctype", "meta"]):
                tag.extract()
            # find_all() matches tag *names* only, so the CSS selector "i.fa"
            # never matched there; select() is what understands selectors.
            for tag in soup.select("i.fa"):
                tag.extract()
            doc.content = soup.get_text().replace("\n", " ")
        return documents

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        """Return support chunk strategy."""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        """Return default chunk strategy."""
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        """Return knowledge type."""
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        """Return document type."""
        return DocumentType.HTML