diff --git a/dbgpt/rag/knowledge/docx.py b/dbgpt/rag/knowledge/docx.py
index b10276347..7c1ecaa9f 100644
--- a/dbgpt/rag/knowledge/docx.py
+++ b/dbgpt/rag/knowledge/docx.py
@@ -2,6 +2,8 @@
 from typing import Any, Dict, List, Optional, Union
 
 import docx
+from docx.opc.oxml import parse_xml
+from docx.opc.pkgreader import _SerializedRelationship, _SerializedRelationships
 
 from dbgpt.core import Document
 from dbgpt.rag.knowledge.base import (
@@ -12,6 +14,33 @@ from dbgpt.rag.knowledge.base import (
 )
 
 
+def load_from_xml_v2(base_uri, rels_item_xml):
+    """Build a |_SerializedRelationships| collection from *rels_item_xml*.
+
+    Each entry of the parsed XML's ``Relationship_lst`` is wrapped in a
+    |_SerializedRelationship| and stored, except entries whose ``target_ref``
+    is ``"NULL"`` or ``"../NULL"``, which are left out.
+
+    Args:
+        base_uri: Forwarded unchanged to every |_SerializedRelationship|.
+        rels_item_xml: Relationships XML to parse, or |None|.
+
+    Returns:
+        The |_SerializedRelationships| instance; empty when *rels_item_xml*
+        is |None|.
+    """
+    collection = _SerializedRelationships()
+    if rels_item_xml is None:
+        return collection
+
+    # Relationship targets that must not end up in the collection.
+    skipped_targets = ("../NULL", "NULL")
+    for element in parse_xml(rels_item_xml).Relationship_lst:
+        if element.target_ref not in skipped_targets:
+            collection._srels.append(_SerializedRelationship(base_uri, element))
+    return collection
+
+
 class DocxKnowledge(Knowledge):
     """Docx Knowledge."""
 
@@ -47,8 +76,10 @@ class DocxKnowledge(Knowledge):
             documents = self._loader.load()
         else:
             docs = []
+            _SerializedRelationships.load_from_xml = load_from_xml_v2  # type: ignore
             doc = docx.Document(self._path)
             content = []
+
             for i in range(len(doc.paragraphs)):
                 para = doc.paragraphs[i]
                 text = para.text