mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-08-19 08:47:32 +00:00
Merge branch 'dev' of https://github.com/csunny/DB-GPT into llm_fxp
This commit is contained in:
commit
eae38931fa
@ -4,10 +4,10 @@ from typing import List
|
|||||||
|
|
||||||
from langchain.document_loaders import PyPDFLoader
|
from langchain.document_loaders import PyPDFLoader
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
|
from langchain.text_splitter import SpacyTextSplitter
|
||||||
|
|
||||||
from pilot.configs.config import Config
|
from pilot.configs.config import Config
|
||||||
from pilot.source_embedding import SourceEmbedding, register
|
from pilot.source_embedding import SourceEmbedding, register
|
||||||
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
|
|
||||||
|
|
||||||
CFG = Config()
|
CFG = Config()
|
||||||
|
|
||||||
@ -24,11 +24,11 @@ class PDFEmbedding(SourceEmbedding):
|
|||||||
@register
|
@register
|
||||||
def read(self):
|
def read(self):
|
||||||
"""Load from pdf path."""
|
"""Load from pdf path."""
|
||||||
# loader = UnstructuredPaddlePDFLoader(self.file_path)
|
|
||||||
loader = PyPDFLoader(self.file_path)
|
loader = PyPDFLoader(self.file_path)
|
||||||
textsplitter = CHNDocumentSplitter(
|
# textsplitter = CHNDocumentSplitter(
|
||||||
pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
|
# pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
|
||||||
)
|
# )
|
||||||
|
textsplitter = SpacyTextSplitter(pipeline='zh_core_web_sm', chunk_size=1000, chunk_overlap=200)
|
||||||
return loader.load_and_split(textsplitter)
|
return loader.load_and_split(textsplitter)
|
||||||
|
|
||||||
@register
|
@register
|
||||||
|
Loading…
Reference in New Issue
Block a user