update:PDF loader

This commit is contained in:
aries-ckt 2023-05-23 22:43:07 +08:00
parent 0b92066bf5
commit 926c971691
2 changed files with 5 additions and 5 deletions

View File

@ -1,7 +1,7 @@
import os
from bs4 import BeautifulSoup
from langchain.document_loaders import TextLoader, markdown
from langchain.document_loaders import TextLoader, markdown, PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from pilot.configs.config import Config
@ -12,7 +12,6 @@ from pilot.source_embedding.markdown_embedding import MarkdownEmbedding
from pilot.source_embedding.pdf_embedding import PDFEmbedding
import markdown
from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
from pilot.vector_store.connector import VectorStoreConnector
CFG = Config()
@ -89,7 +88,7 @@ class KnowledgeEmbedding:
docs[i].page_content = docs[i].page_content.replace("\n", " ")
i += 1
elif filename.lower().endswith(".pdf"):
loader = UnstructuredPaddlePDFLoader(filename)
loader = PyPDFLoader(filename)
textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
docs = loader.load_and_split(textsplitter)
i = 0

View File

@ -2,12 +2,12 @@
# -*- coding: utf-8 -*-
from typing import List
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.source_embedding import SourceEmbedding, register
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
class PDFEmbedding(SourceEmbedding):
@ -23,7 +23,8 @@ class PDFEmbedding(SourceEmbedding):
@register
def read(self):
    """Load the PDF at ``self.file_path`` and split it into chunks.

    The file is read with ``PyPDFLoader`` and split by a
    ``CHNDocumentSplitter`` built with ``pdf=True`` and
    ``sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE``.

    Returns:
        Whatever ``loader.load_and_split(textsplitter)`` yields
        (presumably a list of langchain ``Document`` objects -- confirm
        against the installed langchain version).
    """
    # Only PyPDFLoader is instantiated: the earlier
    # UnstructuredPaddlePDFLoader instance was overwritten before it was
    # ever used, so constructing it was dead work.
    loader = PyPDFLoader(self.file_path)
    textsplitter = CHNDocumentSplitter(
        pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
    )
    return loader.load_and_split(textsplitter)