mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-09-29 13:34:14 +00:00
update:knowledge env
This commit is contained in:
@@ -5,19 +5,20 @@ from typing import List
|
||||
from langchain.document_loaders import PyPDFLoader
|
||||
from langchain.schema import Document
|
||||
|
||||
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
|
||||
from pilot.configs.config import Config
|
||||
from pilot.source_embedding import SourceEmbedding, register
|
||||
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
|
||||
|
||||
CFG = Config()
|
||||
|
||||
|
||||
class PDFEmbedding(SourceEmbedding):
|
||||
"""pdf embedding for read pdf document."""
|
||||
|
||||
def __init__(self, file_path, model_name, vector_store_config, encoding):
|
||||
def __init__(self, file_path, vector_store_config, encoding):
|
||||
"""Initialize with pdf path."""
|
||||
super().__init__(file_path, model_name, vector_store_config)
|
||||
super().__init__(file_path, vector_store_config)
|
||||
self.file_path = file_path
|
||||
self.model_name = model_name
|
||||
self.vector_store_config = vector_store_config
|
||||
self.encoding = encoding
|
||||
|
||||
@@ -27,7 +28,7 @@ class PDFEmbedding(SourceEmbedding):
|
||||
# loader = UnstructuredPaddlePDFLoader(self.file_path)
|
||||
loader = PyPDFLoader(self.file_path)
|
||||
textsplitter = CHNDocumentSplitter(
|
||||
pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
|
||||
pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
|
||||
)
|
||||
return loader.load_and_split(textsplitter)
|
||||
|
||||
|
Reference in New Issue
Block a user