mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-07-22 20:01:46 +00:00
Fixed the error caused by file format when docx loading files (#2061)
Co-authored-by: zhaoyu93 <zhaoyu93@meituan.com> Co-authored-by: aries_ckt <916701291@qq.com>
This commit is contained in:
parent
4cde891c11
commit
630d644cc4
@ -2,6 +2,8 @@
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
import docx
|
||||
from docx.opc.oxml import parse_xml
|
||||
from docx.opc.pkgreader import _SerializedRelationship, _SerializedRelationships
|
||||
|
||||
from dbgpt.core import Document
|
||||
from dbgpt.rag.knowledge.base import (
|
||||
@ -12,6 +14,21 @@ from dbgpt.rag.knowledge.base import (
|
||||
)
|
||||
|
||||
|
||||
def load_from_xml_v2(base_uri, rels_item_xml):
    """Build a |_SerializedRelationships| collection from *rels_item_xml*.

    Every relationship element found in *rels_item_xml* is wrapped in a
    |_SerializedRelationship| bound to *base_uri*, except those whose target
    reference is ``"../NULL"`` or ``"NULL"``, which are left out. An empty
    collection is returned if *rels_item_xml* is |None|.
    """
    relationships = _SerializedRelationships()
    if rels_item_xml is None:
        # Nothing to parse: hand back the empty collection.
        return relationships

    root = parse_xml(rels_item_xml)
    for element in root.Relationship_lst:
        # NOTE(review): "NULL" targets are presumably the placeholder entries
        # that made some docx files fail to load -- confirm against the
        # failing documents.
        if element.target_ref not in ("../NULL", "NULL"):
            relationships._srels.append(
                _SerializedRelationship(base_uri, element)
            )
    return relationships
|
||||
|
||||
|
||||
class DocxKnowledge(Knowledge):
|
||||
"""Docx Knowledge."""
|
||||
|
||||
@ -47,8 +64,10 @@ class DocxKnowledge(Knowledge):
|
||||
documents = self._loader.load()
|
||||
else:
|
||||
docs = []
|
||||
_SerializedRelationships.load_from_xml = load_from_xml_v2 # type: ignore
|
||||
doc = docx.Document(self._path)
|
||||
content = []
|
||||
|
||||
for i in range(len(doc.paragraphs)):
|
||||
para = doc.paragraphs[i]
|
||||
text = para.text
|
||||
|
Loading…
Reference in New Issue
Block a user