mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-08-06 10:54:29 +00:00
105 lines
3.3 KiB
Python
105 lines
3.3 KiB
Python
"""Docx Knowledge."""
|
|
from typing import Any, Dict, List, Optional, Union
|
|
|
|
import docx
|
|
from docx.opc.oxml import parse_xml
|
|
from docx.opc.pkgreader import _SerializedRelationship, _SerializedRelationships
|
|
|
|
from dbgpt.core import Document
|
|
from dbgpt.rag.knowledge.base import (
|
|
ChunkStrategy,
|
|
DocumentType,
|
|
Knowledge,
|
|
KnowledgeType,
|
|
)
|
|
|
|
|
|
def load_from_xml_v2(base_uri, rels_item_xml):
    """Return a |_SerializedRelationships| built from *rels_item_xml*.

    Intended as a drop-in replacement for
    ``_SerializedRelationships.load_from_xml``. It differs in that any
    relationship whose ``target_ref`` is ``"../NULL"`` or ``"NULL"`` is left
    out of the result.
    NOTE(review): presumably these are dangling targets written by some docx
    producers that would otherwise break loading -- confirm.

    Args:
        base_uri: base URI handed unchanged to each ``_SerializedRelationship``.
        rels_item_xml: serialized relationships XML, or ``None``.

    Returns:
        A ``_SerializedRelationships`` collection; it is empty when
        *rels_item_xml* is ``None``.
    """
    relationships = _SerializedRelationships()
    if rels_item_xml is None:
        # Nothing to parse: hand back the empty collection.
        return relationships

    root = parse_xml(rels_item_xml)
    relationships._srels.extend(
        _SerializedRelationship(base_uri, node)
        for node in root.Relationship_lst
        if node.target_ref not in ("../NULL", "NULL")
    )
    return relationships
|
|
|
|
|
|
class DocxKnowledge(Knowledge):
    """Docx Knowledge.

    Loads a ``.docx`` file into a single ``Document`` whose content is the
    text of the document's paragraphs joined by newlines, or delegates to a
    caller-supplied loader when one is given.
    """

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: Any = KnowledgeType.DOCUMENT,
        encoding: Optional[str] = "utf-8",
        loader: Optional[Any] = None,
        metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
        **kwargs: Any,
    ) -> None:
        """Create Docx Knowledge with Knowledge arguments.

        Args:
            file_path(str, optional): file path
            knowledge_type(KnowledgeType, optional): knowledge type
            encoding(str, optional): docx encoding; stored on the instance
                but not read by the loading code in this class
            loader(Any, optional): loader
            metadata(Dict[str, Union[str, List[str]]], optional): extra
                metadata forwarded to the base class
            **kwargs: additional arguments forwarded to the base class
        """
        super().__init__(
            path=file_path,
            knowledge_type=knowledge_type,
            data_loader=loader,
            metadata=metadata,
            **kwargs,
        )
        self._encoding = encoding

    def _load(self) -> List[Document]:
        """Load docx document from loader.

        When a custom loader is set, its ``load()`` results are converted
        with ``Document.langchain2doc``. Otherwise the file at ``self._path``
        is opened with python-docx and returned as one ``Document`` holding
        the newline-joined text of ``doc.paragraphs``.

        Returns:
            List[Document]: the loaded documents.
        """
        if self._loader:
            documents = self._loader.load()
            return [Document.langchain2doc(lc_document) for lc_document in documents]

        # Swap in the tolerant relationship parser before opening the file so
        # that "NULL" relationship targets are skipped while loading.
        # Note that this replaces the python-docx attribute process-wide.
        _SerializedRelationships.load_from_xml = load_from_xml_v2  # type: ignore
        doc = docx.Document(self._path)

        # Read ``doc.paragraphs`` exactly once: the property builds a fresh
        # list on every access, so indexing it inside a loop is quadratic.
        content = "\n".join(para.text for para in doc.paragraphs)

        metadata = {"source": self._path}
        if self._metadata:
            # Caller-supplied metadata may override the default "source".
            metadata.update(self._metadata)  # type: ignore
        return [Document(content=content, metadata=metadata)]

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        """Return support chunk strategy."""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_PARAGRAPH,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        """Return default chunk strategy."""
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        """Return knowledge type."""
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        """Return document type."""
        return DocumentType.DOCX
|