mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-08-19 00:37:34 +00:00
Merge branch 'dev' of https://github.com/csunny/DB-GPT into llm_fxp
This commit is contained in:
commit
eae38931fa
@ -4,10 +4,10 @@ from typing import List
|
||||
|
||||
from langchain.document_loaders import PyPDFLoader
|
||||
from langchain.schema import Document
|
||||
from langchain.text_splitter import SpacyTextSplitter
|
||||
|
||||
from pilot.configs.config import Config
|
||||
from pilot.source_embedding import SourceEmbedding, register
|
||||
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
|
||||
|
||||
CFG = Config()
|
||||
|
||||
@ -24,11 +24,11 @@ class PDFEmbedding(SourceEmbedding):
|
||||
@register
|
||||
def read(self):
|
||||
"""Load from pdf path."""
|
||||
# loader = UnstructuredPaddlePDFLoader(self.file_path)
|
||||
loader = PyPDFLoader(self.file_path)
|
||||
textsplitter = CHNDocumentSplitter(
|
||||
pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
|
||||
)
|
||||
# textsplitter = CHNDocumentSplitter(
|
||||
# pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
|
||||
# )
|
||||
textsplitter = SpacyTextSplitter(pipeline='zh_core_web_sm', chunk_size=1000, chunk_overlap=200)
|
||||
return loader.load_and_split(textsplitter)
|
||||
|
||||
@register
|
||||
|
Loading…
Reference in New Issue
Block a user