From 630d644cc4ac7ffbe42aaf7b915af37e28f03da2 Mon Sep 17 00:00:00 2001
From: yz <115013613+iCanDoAllThingszz@users.noreply.github.com>
Date: Thu, 17 Oct 2024 13:21:41 +0800
Subject: [PATCH] Fixed the error caused by file format when docx loading files (#2061)

Co-authored-by: zhaoyu93
Co-authored-by: aries_ckt <916701291@qq.com>
---
 dbgpt/rag/knowledge/docx.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/dbgpt/rag/knowledge/docx.py b/dbgpt/rag/knowledge/docx.py
index b10276347..7c1ecaa9f 100644
--- a/dbgpt/rag/knowledge/docx.py
+++ b/dbgpt/rag/knowledge/docx.py
@@ -2,6 +2,8 @@
 from typing import Any, Dict, List, Optional, Union
 
 import docx
+from docx.opc.oxml import parse_xml
+from docx.opc.pkgreader import _SerializedRelationship, _SerializedRelationships
 
 from dbgpt.core import Document
 from dbgpt.rag.knowledge.base import (
@@ -12,6 +14,21 @@ from dbgpt.rag.knowledge.base import (
 )
 
 
+def load_from_xml_v2(base_uri, rels_item_xml):
+    """Return |_SerializedRelationships| loaded from *rels_item_xml*.
+
+    Targets "NULL" and "../NULL" are skipped; empty if *rels_item_xml* is |None|.
+    """
+    srels = _SerializedRelationships()
+    if rels_item_xml is not None:
+        rels_elm = parse_xml(rels_item_xml)
+        for rel_elm in rels_elm.Relationship_lst:
+            if rel_elm.target_ref in ("../NULL", "NULL"):
+                continue
+            srels._srels.append(_SerializedRelationship(base_uri, rel_elm))
+    return srels
+
+
 class DocxKnowledge(Knowledge):
     """Docx Knowledge."""
 
@@ -47,8 +64,10 @@ class DocxKnowledge(Knowledge):
             documents = self._loader.load()
         else:
             docs = []
+            _SerializedRelationships.load_from_xml = load_from_xml_v2  # type: ignore
             doc = docx.Document(self._path)
             content = []
+
             for i in range(len(doc.paragraphs)):
                 para = doc.paragraphs[i]
                 text = para.text