Fixed the error caused by the file format when loading docx files (#2061)

Co-authored-by: zhaoyu93 <zhaoyu93@meituan.com> Co-authored-by: aries_ckt <916701291@qq.com>
2025-07-22 11:51:42 +00:00 · 2024-10-17 13:21:41 +08:00 · 2024-10-17 13:21:41 +08:00 · 630d644cc4
commit 630d644cc4
parent 4cde891c11
1 changed files with 19 additions and 0 deletions
--- a/dbgpt/rag/knowledge/docx.py
+++ b/dbgpt/rag/knowledge/docx.py
@ -2,6 +2,8 @@
 from typing import Any, Dict, List, Optional, Union
 import docx
 from docx.opc.oxml import parse_xml
 from docx.opc.pkgreader import _SerializedRelationship, _SerializedRelationships
 from dbgpt.core import Document
 from dbgpt.rag.knowledge.base import (
@ -12,6 +14,21 @@ from dbgpt.rag.knowledge.base import (
 )
 def load_from_xml_v2(base_uri, rels_item_xml):
    """Build a |_SerializedRelationships| collection from relationship XML.

    Every ``Relationship`` element found in *rels_item_xml* is wrapped in a
    |_SerializedRelationship| (bound to *base_uri*) and added to the
    collection, except elements whose ``target_ref`` is ``"../NULL"`` or
    ``"NULL"``, which are skipped.

    An empty collection is returned if *rels_item_xml* is |None|.
    """
    relationships = _SerializedRelationships()
    if rels_item_xml is None:
        # Nothing to parse: hand back the empty collection as-is.
        return relationships

    root = parse_xml(rels_item_xml)
    # NOTE(review): the NULL targets are presumably placeholder references
    # that break loading of some documents -- confirm against python-docx.
    relationships._srels.extend(
        _SerializedRelationship(base_uri, node)
        for node in root.Relationship_lst
        if node.target_ref not in ("../NULL", "NULL")
    )
    return relationships
 class DocxKnowledge(Knowledge):
    """Docx Knowledge."""
@ -47,8 +64,10 @@ class DocxKnowledge(Knowledge):
            documents = self._loader.load()
        else:
            docs = []
            _SerializedRelationships.load_from_xml = load_from_xml_v2   # type: ignore
            doc = docx.Document(self._path)
            content = []
            for i in range(len(doc.paragraphs)):
                para = doc.paragraphs[i]
                text = para.text