From c0115061c26af771fd21bdc993c8032920b03638 Mon Sep 17 00:00:00 2001 From: aries-ckt <916701291@qq.com> Date: Wed, 17 May 2023 23:03:51 +0800 Subject: [PATCH] update:pdf split chunk --- pilot/server/webserver.py | 5 ++++- pilot/source_embedding/pdf_embedding.py | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pilot/server/webserver.py b/pilot/server/webserver.py index ad9f66b61..a76cf3400 100644 --- a/pilot/server/webserver.py +++ b/pilot/server/webserver.py @@ -254,6 +254,8 @@ def http_bot(state, mode, sql_mode, db_selector, temperature, max_new_tokens, re result = prompt_template.format(context="\n".join(context), question=query) state.messages[-2][1] = result prompt = state.get_prompt() + print("prompt length:" + str(len(prompt))) + if len(prompt) > 4000: logger.info("prompt length greater than 4000, rebuild") context = context[:2000] @@ -264,7 +266,8 @@ def http_bot(state, mode, sql_mode, db_selector, temperature, max_new_tokens, re result = prompt_template.format(context="\n".join(context), question=query) state.messages[-2][1] = result prompt = state.get_prompt() - print(len(prompt)) + print("new prompt length:" + str(len(prompt))) + state.messages[-2][1] = query skip_echo_len = len(prompt.replace("", " ")) + 1 diff --git a/pilot/source_embedding/pdf_embedding.py b/pilot/source_embedding/pdf_embedding.py index e162aefd8..557637c5a 100644 --- a/pilot/source_embedding/pdf_embedding.py +++ b/pilot/source_embedding/pdf_embedding.py @@ -6,6 +6,7 @@ from langchain.document_loaders import PyPDFLoader from langchain.schema import Document from pilot.source_embedding import SourceEmbedding, register +from pilot.source_embedding.chinese_text_splitter import ChineseTextSplitter class PDFEmbedding(SourceEmbedding): @@ -22,7 +23,8 @@ class PDFEmbedding(SourceEmbedding): def read(self): """Load from pdf path.""" loader = PyPDFLoader(self.file_path) - return loader.load() + textsplitter = ChineseTextSplitter(pdf=True, sentence_size=100) + return
loader.load_and_split(textsplitter) @register def data_process(self, documents: List[Document]):