From 45fbcafbf68c31da751a606b6ec4799ee69551e0 Mon Sep 17 00:00:00 2001 From: aries_ckt <916701291@qq.com> Date: Mon, 24 Jul 2023 17:37:21 +0800 Subject: [PATCH] fix: word embedding update 1. use Docx2txtLoader to replace UnstructuredWordDocumentLoader --- pilot/embedding_engine/word_embedding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot/embedding_engine/word_embedding.py b/pilot/embedding_engine/word_embedding.py index 24f1d0386..55a2b5079 100644 --- a/pilot/embedding_engine/word_embedding.py +++ b/pilot/embedding_engine/word_embedding.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- from typing import List, Optional -from langchain.document_loaders import UnstructuredWordDocumentLoader from langchain.schema import Document +from langchain.document_loaders import Docx2txtLoader from langchain.text_splitter import ( SpacyTextSplitter, RecursiveCharacterTextSplitter, @@ -36,7 +36,7 @@ class WordEmbedding(SourceEmbedding): def read(self): """Load from word path.""" if self.source_reader is None: - self.source_reader = UnstructuredWordDocumentLoader(self.file_path) + self.source_reader = Docx2txtLoader(self.file_path) if self.text_splitter is None: try: self.text_splitter = SpacyTextSplitter(