Merge branch 'dev' of https://github.com/csunny/DB-GPT into llm_fxp

This commit is contained in:
csunny 2023-06-08 20:07:44 +08:00
commit eae38931fa

View File

@ -4,10 +4,10 @@ from typing import List
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import SpacyTextSplitter
from pilot.configs.config import Config
from pilot.source_embedding import SourceEmbedding, register
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
CFG = Config()
@ -24,11 +24,11 @@ class PDFEmbedding(SourceEmbedding):
@register
def read(self):
"""Load from pdf path."""
# loader = UnstructuredPaddlePDFLoader(self.file_path)
loader = PyPDFLoader(self.file_path)
textsplitter = CHNDocumentSplitter(
pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
)
# textsplitter = CHNDocumentSplitter(
# pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
# )
textsplitter = SpacyTextSplitter(pipeline='zh_core_web_sm', chunk_size=1000, chunk_overlap=200)
return loader.load_and_split(textsplitter)
@register