update: PDF loader

This commit is contained in:
aries-ckt 2023-05-23 22:43:07 +08:00
parent 0b92066bf5
commit 926c971691
2 changed files with 5 additions and 5 deletions

View File

@@ -1,7 +1,7 @@
import os import os
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from langchain.document_loaders import TextLoader, markdown from langchain.document_loaders import TextLoader, markdown, PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings from langchain.embeddings import HuggingFaceEmbeddings
from pilot.configs.config import Config from pilot.configs.config import Config
@@ -12,7 +12,6 @@ from pilot.source_embedding.markdown_embedding import MarkdownEmbedding
from pilot.source_embedding.pdf_embedding import PDFEmbedding from pilot.source_embedding.pdf_embedding import PDFEmbedding
import markdown import markdown
from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
from pilot.vector_store.connector import VectorStoreConnector from pilot.vector_store.connector import VectorStoreConnector
CFG = Config() CFG = Config()
@@ -89,7 +88,7 @@ class KnowledgeEmbedding:
docs[i].page_content = docs[i].page_content.replace("\n", " ") docs[i].page_content = docs[i].page_content.replace("\n", " ")
i += 1 i += 1
elif filename.lower().endswith(".pdf"): elif filename.lower().endswith(".pdf"):
loader = UnstructuredPaddlePDFLoader(filename) loader = PyPDFLoader(filename)
textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE) textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
docs = loader.load_and_split(textsplitter) docs = loader.load_and_split(textsplitter)
i = 0 i = 0

View File

@@ -2,12 +2,12 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from typing import List from typing import List
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document from langchain.schema import Document
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.source_embedding import SourceEmbedding, register from pilot.source_embedding import SourceEmbedding, register
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
class PDFEmbedding(SourceEmbedding): class PDFEmbedding(SourceEmbedding):
@@ -23,7 +23,8 @@ class PDFEmbedding(SourceEmbedding):
@register @register
def read(self): def read(self):
"""Load from pdf path.""" """Load from pdf path."""
loader = UnstructuredPaddlePDFLoader(self.file_path) # loader = UnstructuredPaddlePDFLoader(self.file_path)
loader = PyPDFLoader(self.file_path)
textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE) textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
return loader.load_and_split(textsplitter) return loader.load_and_split(textsplitter)