update: knowledge env

This commit is contained in:
aries-ckt
2023-06-05 18:08:55 +08:00
parent f2f28fee42
commit e29fa37cde
18 changed files with 43 additions and 80 deletions

View File

@@ -5,19 +5,20 @@ from typing import List
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.configs.config import Config
from pilot.source_embedding import SourceEmbedding, register
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
CFG = Config()
class PDFEmbedding(SourceEmbedding):
"""pdf embedding for read pdf document."""
def __init__(self, file_path, model_name, vector_store_config, encoding):
def __init__(self, file_path, vector_store_config, encoding):
"""Initialize with pdf path."""
super().__init__(file_path, model_name, vector_store_config)
super().__init__(file_path, vector_store_config)
self.file_path = file_path
self.model_name = model_name
self.vector_store_config = vector_store_config
self.encoding = encoding
@@ -27,7 +28,7 @@ class PDFEmbedding(SourceEmbedding):
# loader = UnstructuredPaddlePDFLoader(self.file_path)
loader = PyPDFLoader(self.file_path)
textsplitter = CHNDocumentSplitter(
pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
)
return loader.load_and_split(textsplitter)