Fixed the error caused by the file format when loading docx files (#2061)

Co-authored-by: zhaoyu93 <zhaoyu93@meituan.com>
Co-authored-by: aries_ckt <916701291@qq.com>
This commit is contained in:
yz 2024-10-17 13:21:41 +08:00 committed by GitHub
parent 4cde891c11
commit 630d644cc4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -2,6 +2,8 @@
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
import docx import docx
from docx.opc.oxml import parse_xml
from docx.opc.pkgreader import _SerializedRelationship, _SerializedRelationships
from dbgpt.core import Document from dbgpt.core import Document
from dbgpt.rag.knowledge.base import ( from dbgpt.rag.knowledge.base import (
@ -12,6 +14,21 @@ from dbgpt.rag.knowledge.base import (
) )
def load_from_xml_v2(base_uri, rels_item_xml):
    """Build a |_SerializedRelationships| collection from *rels_item_xml*.

    Relationship elements whose ``target_ref`` is ``"../NULL"`` or ``"NULL"``
    are left out of the collection. An empty collection is returned when
    *rels_item_xml* is |None|.
    """
    collection = _SerializedRelationships()
    if rels_item_xml is None:
        return collection

    # Targets that mark a placeholder relationship to be skipped.
    skipped_targets = ("../NULL", "NULL")
    root = parse_xml(rels_item_xml)
    collection._srels.extend(
        _SerializedRelationship(base_uri, element)
        for element in root.Relationship_lst
        if element.target_ref not in skipped_targets
    )
    return collection
class DocxKnowledge(Knowledge): class DocxKnowledge(Knowledge):
"""Docx Knowledge.""" """Docx Knowledge."""
@ -47,8 +64,10 @@ class DocxKnowledge(Knowledge):
documents = self._loader.load() documents = self._loader.load()
else: else:
docs = [] docs = []
_SerializedRelationships.load_from_xml = load_from_xml_v2 # type: ignore
doc = docx.Document(self._path) doc = docx.Document(self._path)
content = [] content = []
for i in range(len(doc.paragraphs)): for i in range(len(doc.paragraphs)):
para = doc.paragraphs[i] para = doc.paragraphs[i]
text = para.text text = para.text