# Patch summary (commentary placed before the first `diff --git` header).
#
# Purpose: switch the PDF loader from the project-local
# UnstructuredPaddlePDFLoader (pilot.source_embedding.pdf_loader) to
# PyPDFLoader imported from langchain.document_loaders.
#
# Files touched:
#   * pilot/source_embedding/knowledge_embedding.py
#       - adds PyPDFLoader to the existing langchain.document_loaders import
#       - removes the pilot.source_embedding.pdf_loader import
#       - in the `.pdf` branch, builds `loader = PyPDFLoader(filename)`
#   * pilot/source_embedding/pdf_embedding.py
#       - adds `from langchain.document_loaders import PyPDFLoader`
#       - removes the pilot.source_embedding.pdf_loader import
#       - PDFEmbedding.read() builds `loader = PyPDFLoader(self.file_path)`
#
# Unchanged in both call sites: documents are still produced by
# loader.load_and_split(CHNDocumentSplitter(pdf=True,
# sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)).
#
# NOTE(review): the pdf_embedding.py hunk adds a commented-out copy of the old
#   loader call; consider dropping that added line (the hunk's new-side line
#   count would then need adjusting).
# NOTE(review): knowledge_embedding.py imports `markdown` from
#   langchain.document_loaders and a later `import markdown` appears in the
#   context lines, which looks like it rebinds the same name - confirm which
#   one is intended.
# NOTE(review): presumably PyPDFLoader needs an extra PDF parsing package at
#   runtime - verify against the langchain documentation and the project's
#   requirements.
# NOTE(review): the line breaks of this patch appear to have been collapsed;
#   it likely will not apply as stored - regenerate it from the commit if it
#   must be applied.
diff --git a/pilot/source_embedding/knowledge_embedding.py b/pilot/source_embedding/knowledge_embedding.py index 93fa185a6..2f313a35a 100644 --- a/pilot/source_embedding/knowledge_embedding.py +++ b/pilot/source_embedding/knowledge_embedding.py @@ -1,7 +1,7 @@ import os from bs4 import BeautifulSoup -from langchain.document_loaders import TextLoader, markdown +from langchain.document_loaders import TextLoader, markdown, PyPDFLoader from langchain.embeddings import HuggingFaceEmbeddings from pilot.configs.config import Config @@ -12,7 +12,6 @@ from pilot.source_embedding.markdown_embedding import MarkdownEmbedding from pilot.source_embedding.pdf_embedding import PDFEmbedding import markdown -from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader from pilot.vector_store.connector import VectorStoreConnector CFG = Config() @@ -89,7 +88,7 @@ class KnowledgeEmbedding: docs[i].page_content = docs[i].page_content.replace("\n", " ") i += 1 elif filename.lower().endswith(".pdf"): - loader = UnstructuredPaddlePDFLoader(filename) + loader = PyPDFLoader(filename) textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE) docs = loader.load_and_split(textsplitter) i = 0 diff --git a/pilot/source_embedding/pdf_embedding.py b/pilot/source_embedding/pdf_embedding.py index a8749695b..75d17c4c6 100644 --- a/pilot/source_embedding/pdf_embedding.py +++ b/pilot/source_embedding/pdf_embedding.py @@ -2,12 +2,12 @@ # -*- coding: utf-8 -*- from typing import List +from langchain.document_loaders import PyPDFLoader from langchain.schema import Document from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE from pilot.source_embedding import SourceEmbedding, register from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter -from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader class PDFEmbedding(SourceEmbedding): @@ -23,7 +23,8 @@ class PDFEmbedding(SourceEmbedding): @register def read(self): 
"""Load from pdf path.""" - loader = UnstructuredPaddlePDFLoader(self.file_path) + # loader = UnstructuredPaddlePDFLoader(self.file_path) + loader = PyPDFLoader(self.file_path) textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE) return loader.load_and_split(textsplitter)