mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-07-22 11:51:42 +00:00
Fixed the error caused by file format when docx loading files (#2061)
Co-authored-by: zhaoyu93 <zhaoyu93@meituan.com> Co-authored-by: aries_ckt <916701291@qq.com>
This commit is contained in:
parent
4cde891c11
commit
630d644cc4
@ -2,6 +2,8 @@
|
|||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Any, Dict, List, Optional, Union
|
||||||
|
|
||||||
import docx
|
import docx
|
||||||
|
from docx.opc.oxml import parse_xml
|
||||||
|
from docx.opc.pkgreader import _SerializedRelationship, _SerializedRelationships
|
||||||
|
|
||||||
from dbgpt.core import Document
|
from dbgpt.core import Document
|
||||||
from dbgpt.rag.knowledge.base import (
|
from dbgpt.rag.knowledge.base import (
|
||||||
@ -12,6 +14,21 @@ from dbgpt.rag.knowledge.base import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_from_xml_v2(base_uri, rels_item_xml):
    """Build a |_SerializedRelationships| collection from *rels_item_xml*.

    Every relationship element found in the parsed XML is wrapped in a
    |_SerializedRelationship| and added to the collection, except elements
    whose ``target_ref`` is ``"../NULL"`` or ``"NULL"``, which are left out.
    The collection is returned empty when *rels_item_xml* is |None|.
    """
    collection = _SerializedRelationships()
    if rels_item_xml is None:
        # Nothing to parse: hand back the empty collection.
        return collection

    # NOTE(review): presumably these targets are placeholders that point at
    # no real package part -- confirm against the files that triggered the fix.
    skipped_targets = ("../NULL", "NULL")

    root = parse_xml(rels_item_xml)
    for node in root.Relationship_lst:
        if node.target_ref not in skipped_targets:
            collection._srels.append(_SerializedRelationship(base_uri, node))
    return collection
|
||||||
|
|
||||||
|
|
||||||
class DocxKnowledge(Knowledge):
|
class DocxKnowledge(Knowledge):
|
||||||
"""Docx Knowledge."""
|
"""Docx Knowledge."""
|
||||||
|
|
||||||
@ -47,8 +64,10 @@ class DocxKnowledge(Knowledge):
|
|||||||
documents = self._loader.load()
|
documents = self._loader.load()
|
||||||
else:
|
else:
|
||||||
docs = []
|
docs = []
|
||||||
|
_SerializedRelationships.load_from_xml = load_from_xml_v2 # type: ignore
|
||||||
doc = docx.Document(self._path)
|
doc = docx.Document(self._path)
|
||||||
content = []
|
content = []
|
||||||
|
|
||||||
for i in range(len(doc.paragraphs)):
|
for i in range(len(doc.paragraphs)):
|
||||||
para = doc.paragraphs[i]
|
para = doc.paragraphs[i]
|
||||||
text = para.text
|
text = para.text
|
||||||
|
Loading…
Reference in New Issue
Block a user