DB-GPT/dbgpt/rag/knowledge/docx.py

"""Docx Knowledge."""
from typing import Any, Dict, List, Optional, Union

import docx
from docx.opc.oxml import parse_xml
from docx.opc.pkgreader import _SerializedRelationship, _SerializedRelationships

from dbgpt.core import Document
from dbgpt.rag.knowledge.base import (
    ChunkStrategy,
    DocumentType,
    Knowledge,
    KnowledgeType,
)


def load_from_xml_v2(base_uri, rels_item_xml):
    """Return a |_SerializedRelationships| instance loaded from *rels_item_xml*.

    The returned collection is empty if *rels_item_xml* is |None|. Relationship
    entries whose target is "NULL" or "../NULL" are skipped, since python-docx
    cannot resolve them.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            # Skip broken relationship entries that point at a "NULL" target.
            if rel_elm.target_ref in ("../NULL", "NULL"):
                continue
            srels._srels.append(_SerializedRelationship(base_uri, rel_elm))
    return srels
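
# Note: some .docx files contain relationship entries whose target is "NULL" or
# "../NULL"; stock python-docx raises while resolving them, so DocxKnowledge
# patches the loader before opening the file. A minimal sketch of applying the
# same patch outside this module (the file path below is hypothetical):
#
#     _SerializedRelationships.load_from_xml = load_from_xml_v2
#     document = docx.Document("report_with_null_rels.docx")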


class DocxKnowledge(Knowledge):
    """Docx Knowledge."""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: Any = KnowledgeType.DOCUMENT,
        encoding: Optional[str] = "utf-8",
        loader: Optional[Any] = None,
        metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
        **kwargs: Any,
    ) -> None:
        """Create Docx Knowledge with Knowledge arguments.

        Args:
            file_path(str, optional): file path
            knowledge_type(KnowledgeType, optional): knowledge type
            encoding(str, optional): docx file encoding
            loader(Any, optional): loader
            metadata(Dict, optional): metadata attached to the loaded documents
        """
        super().__init__(
            path=file_path,
            knowledge_type=knowledge_type,
            data_loader=loader,
            metadata=metadata,
            **kwargs,
        )
        self._encoding = encoding

    def _load(self) -> List[Document]:
        """Load docx document from loader."""
        if self._loader:
            documents = self._loader.load()
        else:
            docs = []
            # Patch python-docx so relationship entries with "NULL" targets are
            # skipped instead of raising while the document is opened.
            _SerializedRelationships.load_from_xml = load_from_xml_v2  # type: ignore
            doc = docx.Document(self._path)
            content = []
            for para in doc.paragraphs:
                content.append(para.text)
            metadata = {"source": self._path}
            if self._metadata:
                metadata.update(self._metadata)  # type: ignore
            docs.append(Document(content="\n".join(content), metadata=metadata))
            return docs
        return [Document.langchain2doc(lc_document) for lc_document in documents]

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        """Return the supported chunk strategies."""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_PARAGRAPH,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        """Return the default chunk strategy."""
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        """Return knowledge type."""
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        """Return document type."""
        return DocumentType.DOCX
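
# Illustrative usage (a minimal sketch; the file path is hypothetical, and the
# public ``load()`` helper is assumed to come from the ``Knowledge`` base class):
#
#     knowledge = DocxKnowledge(file_path="./docs/report.docx")
#     documents = knowledge.load()
#     for doc in documents:
#         print(doc.metadata["source"], len(doc.content))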