From 8dd25815e14ff29e881048ffc1a8477072335b64 Mon Sep 17 00:00:00 2001 From: aries-ckt <916701291@qq.com> Date: Thu, 1 Jun 2023 21:28:25 +0800 Subject: [PATCH 1/2] fix:url embedding --- pilot/scene/chat_knowledge/custom/prompt.py | 2 +- pilot/scene/chat_knowledge/default/prompt.py | 6 +++++- pilot/scene/chat_knowledge/url/prompt.py | 19 +++++++++++++++---- pilot/source_embedding/url_embedding.py | 2 +- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/pilot/scene/chat_knowledge/custom/prompt.py b/pilot/scene/chat_knowledge/custom/prompt.py index c3153c819..110250221 100644 --- a/pilot/scene/chat_knowledge/custom/prompt.py +++ b/pilot/scene/chat_knowledge/custom/prompt.py @@ -31,7 +31,7 @@ prompt = PromptTemplate( template_scene=ChatScene.ChatNewKnowledge.value, input_variables=["context", "question"], response_format=None, - template_define=None, + template_define=PROMPT_SCENE_DEFINE, template=_DEFAULT_TEMPLATE, stream_out=PROMPT_NEED_NEED_STREAM_OUT, output_parser=NormalChatOutputParser( diff --git a/pilot/scene/chat_knowledge/default/prompt.py b/pilot/scene/chat_knowledge/default/prompt.py index 51d2419d5..0526be69b 100644 --- a/pilot/scene/chat_knowledge/default/prompt.py +++ b/pilot/scene/chat_knowledge/default/prompt.py @@ -11,6 +11,10 @@ from pilot.scene.chat_normal.out_parser import NormalChatOutputParser CFG = Config() +PROMPT_SCENE_DEFINE = """A chat between a curious user and an artificial intelligence assistant, who very familiar with database related knowledge. + The assistant gives helpful, detailed, professional and polite answers to the user's questions. """ + + _DEFAULT_TEMPLATE = """ 基于以下已知的信息, 专业、简要的回答用户的问题, 如果无法从提供的内容中获取答案, 请说: "知识库中提供的内容不足以回答此问题" 禁止胡乱编造。 已知内容: @@ -28,7 +32,7 @@ prompt = PromptTemplate( template_scene=ChatScene.ChatKnowledge.value, input_variables=["context", "question"], response_format=None, - template_define=None, + template_define=PROMPT_SCENE_DEFINE, template=_DEFAULT_TEMPLATE, stream_out=PROMPT_NEED_NEED_STREAM_OUT, output_parser=NormalChatOutputParser( diff --git a/pilot/scene/chat_knowledge/url/prompt.py b/pilot/scene/chat_knowledge/url/prompt.py index 20a69d8b2..96e1ee520 100644 --- a/pilot/scene/chat_knowledge/url/prompt.py +++ b/pilot/scene/chat_knowledge/url/prompt.py @@ -11,10 +11,21 @@ from pilot.scene.chat_normal.out_parser import NormalChatOutputParser CFG = Config() -_DEFAULT_TEMPLATE = """ Based on the known information, provide professional and concise answers to the user's questions. If the answer cannot be obtained from the provided content, please say: 'The information provided in the knowledge base is not sufficient to answer this question.' Fabrication is prohibited.。 - known information: +PROMPT_SCENE_DEFINE = """A chat between a curious user and an artificial intelligence assistant, who very familiar with database related knowledge. + The assistant gives helpful, detailed, professional and polite answers to the user's questions. """ + + +# _DEFAULT_TEMPLATE = """ Based on the known information, provide professional and concise answers to the user's questions. If the answer cannot be obtained from the provided content, please say: 'The information provided in the knowledge base is not sufficient to answer this question.' Fabrication is prohibited.。 +# known information: +# {context} +# question: +# {question} +# """ +_DEFAULT_TEMPLATE = """ 基于以下已知的信息, 专业、简要的回答用户的问题, + 如果无法从提供的内容中获取答案, 请说: "知识库中提供的内容不足以回答此问题" 禁止胡乱编造。 + 已知内容: {context} - question: + 问题: {question} """ @@ -27,7 +38,7 @@ prompt = PromptTemplate( template_scene=ChatScene.ChatUrlKnowledge.value, input_variables=["context", "question"], response_format=None, - template_define=None, + template_define=PROMPT_SCENE_DEFINE, template=_DEFAULT_TEMPLATE, stream_out=PROMPT_NEED_NEED_STREAM_OUT, output_parser=NormalChatOutputParser( diff --git a/pilot/source_embedding/url_embedding.py b/pilot/source_embedding/url_embedding.py index e74defa80..774f6e852 100644 --- a/pilot/source_embedding/url_embedding.py +++ b/pilot/source_embedding/url_embedding.py @@ -23,7 +23,7 @@ class URLEmbedding(SourceEmbedding): """Load from url path.""" loader = WebBaseLoader(web_path=self.file_path) text_splitor = CharacterTextSplitter( - chunk_size=1000, chunk_overlap=20, length_function=len + chunk_size=100, chunk_overlap=20, length_function=len ) return loader.load_and_split(text_splitor) From 1d432e4d297c3d0e9f779bfdb8d8aaae2041220d Mon Sep 17 00:00:00 2001 From: aries-ckt <916701291@qq.com> Date: Thu, 1 Jun 2023 22:07:33 +0800 Subject: [PATCH 2/2] fix:url embedding --- pilot/out_parser/base.py | 6 ++++-- pilot/scene/chat_knowledge/url/prompt.py | 2 +- pilot/source_embedding/url_embedding.py | 18 +++++++++++++----- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/pilot/out_parser/base.py b/pilot/out_parser/base.py index 72868cf4d..46a7dde8b 100644 --- a/pilot/out_parser/base.py +++ b/pilot/out_parser/base.py @@ -53,9 +53,11 @@ class BaseOutputParser(ABC): """ if data["error_code"] == 0: if "vicuna" in CFG.LLM_MODEL: - output = data["text"][skip_echo_len + 11:].strip() + # output = data["text"][skip_echo_len + 11:].strip() + output = data["text"][skip_echo_len:].strip() elif "guanaco" in CFG.LLM_MODEL: - output = data["text"][skip_echo_len + 14:].replace("", "").strip() + # output = data["text"][skip_echo_len + 14:].replace("", "").strip() + output = data["text"][skip_echo_len:].replace("", "").strip() else: output = data["text"].strip() diff --git a/pilot/scene/chat_knowledge/url/prompt.py b/pilot/scene/chat_knowledge/url/prompt.py index 96e1ee520..38d5dfe35 100644 --- a/pilot/scene/chat_knowledge/url/prompt.py +++ b/pilot/scene/chat_knowledge/url/prompt.py @@ -11,7 +11,7 @@ from pilot.scene.chat_normal.out_parser import NormalChatOutputParser CFG = Config() -PROMPT_SCENE_DEFINE = """A chat between a curious user and an artificial intelligence assistant, who very familiar with database related knowledge. +PROMPT_SCENE_DEFINE = """A chat between a curious human and an artificial intelligence assistant, who very familiar with database related knowledge. The assistant gives helpful, detailed, professional and polite answers to the user's questions. """ diff --git a/pilot/source_embedding/url_embedding.py b/pilot/source_embedding/url_embedding.py index 774f6e852..7acfaf961 100644 --- a/pilot/source_embedding/url_embedding.py +++ b/pilot/source_embedding/url_embedding.py @@ -5,9 +5,12 @@ from langchain.document_loaders import WebBaseLoader from langchain.schema import Document from langchain.text_splitter import CharacterTextSplitter +from pilot.configs.config import Config +from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE from pilot.source_embedding import SourceEmbedding, register +from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter - +CFG = Config() class URLEmbedding(SourceEmbedding): """url embedding for read url document.""" @@ -22,10 +25,15 @@ class URLEmbedding(SourceEmbedding): def read(self): """Load from url path.""" loader = WebBaseLoader(web_path=self.file_path) - text_splitor = CharacterTextSplitter( - chunk_size=100, chunk_overlap=20, length_function=len - ) - return loader.load_and_split(text_splitor) + if CFG.LANGUAGE == "en": + text_splitter = CharacterTextSplitter( + chunk_size=KNOWLEDGE_CHUNK_SPLIT_SIZE, chunk_overlap=20, length_function=len + ) + else: + text_splitter = CHNDocumentSplitter( + pdf=True, sentence_size=1000 + ) + return loader.load_and_split(text_splitter) @register def data_process(self, documents: List[Document]):