From 8dd25815e14ff29e881048ffc1a8477072335b64 Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Thu, 1 Jun 2023 21:28:25 +0800
Subject: [PATCH 1/2] fix:url embedding

---
 pilot/scene/chat_knowledge/custom/prompt.py  |  2 +-
 pilot/scene/chat_knowledge/default/prompt.py |  6 +++++-
 pilot/scene/chat_knowledge/url/prompt.py     | 19 +++++++++++++++----
 pilot/source_embedding/url_embedding.py      |  2 +-
 4 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/pilot/scene/chat_knowledge/custom/prompt.py b/pilot/scene/chat_knowledge/custom/prompt.py
index c3153c819..110250221 100644
--- a/pilot/scene/chat_knowledge/custom/prompt.py
+++ b/pilot/scene/chat_knowledge/custom/prompt.py
@@ -31,7 +31,7 @@ prompt = PromptTemplate(
     template_scene=ChatScene.ChatNewKnowledge.value,
     input_variables=["context", "question"],
     response_format=None,
-    template_define=None,
+    template_define=PROMPT_SCENE_DEFINE,
     template=_DEFAULT_TEMPLATE,
     stream_out=PROMPT_NEED_NEED_STREAM_OUT,
     output_parser=NormalChatOutputParser(
diff --git a/pilot/scene/chat_knowledge/default/prompt.py b/pilot/scene/chat_knowledge/default/prompt.py
index 51d2419d5..0526be69b 100644
--- a/pilot/scene/chat_knowledge/default/prompt.py
+++ b/pilot/scene/chat_knowledge/default/prompt.py
@@ -11,6 +11,10 @@ from pilot.scene.chat_normal.out_parser import NormalChatOutputParser
 
 CFG = Config()
 
+PROMPT_SCENE_DEFINE = """A chat between a curious user and an artificial intelligence assistant, who is very familiar with database related knowledge. 
+    The assistant gives helpful, detailed, professional and polite answers to the user's questions. """
+
+
 _DEFAULT_TEMPLATE = """ 基于以下已知的信息, 专业、简要的回答用户的问题,
             如果无法从提供的内容中获取答案, 请说: "知识库中提供的内容不足以回答此问题" 禁止胡乱编造。 
             已知内容: 
@@ -28,7 +32,7 @@ prompt = PromptTemplate(
     template_scene=ChatScene.ChatKnowledge.value,
     input_variables=["context", "question"],
     response_format=None,
-    template_define=None,
+    template_define=PROMPT_SCENE_DEFINE,
     template=_DEFAULT_TEMPLATE,
     stream_out=PROMPT_NEED_NEED_STREAM_OUT,
     output_parser=NormalChatOutputParser(
diff --git a/pilot/scene/chat_knowledge/url/prompt.py b/pilot/scene/chat_knowledge/url/prompt.py
index 20a69d8b2..96e1ee520 100644
--- a/pilot/scene/chat_knowledge/url/prompt.py
+++ b/pilot/scene/chat_knowledge/url/prompt.py
@@ -11,10 +11,21 @@ from pilot.scene.chat_normal.out_parser import NormalChatOutputParser
 
 CFG = Config()
 
-_DEFAULT_TEMPLATE = """ Based on the known information, provide professional and concise answers to the user's questions. If the answer cannot be obtained from the provided content, please say: 'The information provided in the knowledge base is not sufficient to answer this question.' Fabrication is prohibited.。 
-            known information: 
+PROMPT_SCENE_DEFINE = """A chat between a curious user and an artificial intelligence assistant, who very familiar with database related knowledge. 
+    The assistant gives helpful, detailed, professional and polite answers to the user's questions. """
+
+
+# _DEFAULT_TEMPLATE = """ Based on the known information, provide professional and concise answers to the user's questions. If the answer cannot be obtained from the provided content, please say: 'The information provided in the knowledge base is not sufficient to answer this question.' Fabrication is prohibited.。
+#             known information:
+#             {context}
+#             question:
+#             {question}
+# """
+_DEFAULT_TEMPLATE = """ 基于以下已知的信息, 专业、简要的回答用户的问题,
+            如果无法从提供的内容中获取答案, 请说: "知识库中提供的内容不足以回答此问题" 禁止胡乱编造。 
+            已知内容: 
             {context}
-            question:
+            问题:
             {question}
 """
 
@@ -27,7 +38,7 @@ prompt = PromptTemplate(
     template_scene=ChatScene.ChatUrlKnowledge.value,
     input_variables=["context", "question"],
     response_format=None,
-    template_define=None,
+    template_define=PROMPT_SCENE_DEFINE,
     template=_DEFAULT_TEMPLATE,
     stream_out=PROMPT_NEED_NEED_STREAM_OUT,
     output_parser=NormalChatOutputParser(
diff --git a/pilot/source_embedding/url_embedding.py b/pilot/source_embedding/url_embedding.py
index e74defa80..774f6e852 100644
--- a/pilot/source_embedding/url_embedding.py
+++ b/pilot/source_embedding/url_embedding.py
@@ -23,7 +23,7 @@ class URLEmbedding(SourceEmbedding):
         """Load from url path."""
         loader = WebBaseLoader(web_path=self.file_path)
         text_splitor = CharacterTextSplitter(
-            chunk_size=1000, chunk_overlap=20, length_function=len
+            chunk_size=100, chunk_overlap=20, length_function=len
         )
         return loader.load_and_split(text_splitor)
 

From 1d432e4d297c3d0e9f779bfdb8d8aaae2041220d Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Thu, 1 Jun 2023 22:07:33 +0800
Subject: [PATCH 2/2] fix:url embedding

---
 pilot/out_parser/base.py                 |  6 ++++--
 pilot/scene/chat_knowledge/url/prompt.py |  2 +-
 pilot/source_embedding/url_embedding.py  | 18 +++++++++++++-----
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/pilot/out_parser/base.py b/pilot/out_parser/base.py
index 72868cf4d..46a7dde8b 100644
--- a/pilot/out_parser/base.py
+++ b/pilot/out_parser/base.py
@@ -53,9 +53,11 @@ class BaseOutputParser(ABC):
         """
         if data["error_code"] == 0:
             if "vicuna" in CFG.LLM_MODEL:
-                output = data["text"][skip_echo_len + 11:].strip()
+                # output = data["text"][skip_echo_len + 11:].strip()
+                output = data["text"][skip_echo_len:].strip()
             elif "guanaco" in CFG.LLM_MODEL:
-                output = data["text"][skip_echo_len + 14:].replace("<s>", "").strip()
+                # output = data["text"][skip_echo_len + 14:].replace("<s>", "").strip()
+                output = data["text"][skip_echo_len:].replace("<s>", "").strip()
             else:
                 output = data["text"].strip()
 
diff --git a/pilot/scene/chat_knowledge/url/prompt.py b/pilot/scene/chat_knowledge/url/prompt.py
index 96e1ee520..38d5dfe35 100644
--- a/pilot/scene/chat_knowledge/url/prompt.py
+++ b/pilot/scene/chat_knowledge/url/prompt.py
@@ -11,7 +11,7 @@ from pilot.scene.chat_normal.out_parser import NormalChatOutputParser
 
 CFG = Config()
 
-PROMPT_SCENE_DEFINE = """A chat between a curious user and an artificial intelligence assistant, who very familiar with database related knowledge. 
+PROMPT_SCENE_DEFINE = """A chat between a curious human and an artificial intelligence assistant, who is very familiar with database related knowledge. 
     The assistant gives helpful, detailed, professional and polite answers to the user's questions. """
 
 
diff --git a/pilot/source_embedding/url_embedding.py b/pilot/source_embedding/url_embedding.py
index 774f6e852..7acfaf961 100644
--- a/pilot/source_embedding/url_embedding.py
+++ b/pilot/source_embedding/url_embedding.py
@@ -5,9 +5,12 @@ from langchain.document_loaders import WebBaseLoader
 from langchain.schema import Document
 from langchain.text_splitter import CharacterTextSplitter
 
+from pilot.configs.config import Config
+from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
 from pilot.source_embedding import SourceEmbedding, register
+from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
 
-
+CFG = Config()
 class URLEmbedding(SourceEmbedding):
     """url embedding for read url document."""
 
@@ -22,10 +25,15 @@ class URLEmbedding(SourceEmbedding):
     def read(self):
         """Load from url path."""
         loader = WebBaseLoader(web_path=self.file_path)
-        text_splitor = CharacterTextSplitter(
-            chunk_size=100, chunk_overlap=20, length_function=len
-        )
-        return loader.load_and_split(text_splitor)
+        if CFG.LANGUAGE == "en":
+            text_splitter = CharacterTextSplitter(
+                chunk_size=KNOWLEDGE_CHUNK_SPLIT_SIZE, chunk_overlap=20, length_function=len
+            )
+        else:
+            text_splitter = CHNDocumentSplitter(
+                pdf=True, sentence_size=1000
+            )
+        return loader.load_and_split(text_splitter)
 
     @register
     def data_process(self, documents: List[Document]):