From e29fa37cde7d62bd200bfa82a29e23435c38d64c Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Mon, 5 Jun 2023 18:08:55 +0800
Subject: [PATCH] update:knowledge env

---
 .env.template                                 |  8 +++-
 pilot/configs/config.py                       |  4 +-
 pilot/scene/chat_knowledge/custom/chat.py     |  3 +-
 pilot/scene/chat_knowledge/default/chat.py    |  3 +-
 pilot/scene/chat_knowledge/url/chat.py        |  3 +-
 pilot/server/vectordb_qa.py                   |  6 ++-
 pilot/server/webserver.py                     |  1 -
 pilot/source_embedding/csv_embedding.py       |  4 +-
 pilot/source_embedding/knowledge_embedding.py |  1 -
 pilot/source_embedding/markdown_embedding.py  | 11 +++---
 pilot/source_embedding/pdf_embedding.py       | 11 +++---
 pilot/source_embedding/source_embedding.py    |  2 -
 pilot/source_embedding/string_embedding.py    |  5 +--
 pilot/source_embedding/url_embedding.py       |  7 ++--
 pilot/source_embedding/word_embedding.py      | 11 +++---
 pilot/summary/db_summary_client.py            | 39 ++-----------------
 pilot/vector_store/file_loader.py             |  2 -
 tools/knowlege_init.py                        |  2 -
 18 files changed, 43 insertions(+), 80 deletions(-)

diff --git a/.env.template b/.env.template
index 3e8ae536b..234b12738 100644
--- a/.env.template
+++ b/.env.template
@@ -28,8 +28,12 @@ MAX_POSITION_EMBEDDINGS=4096
 
 # FAST_LLM_MODEL=chatglm-6b
 
-### EMBEDDINGS
-## EMBEDDING_MODEL - Model to use for creating embeddings
+#*******************************************************************#
+#**                      EMBEDDING SETTINGS                       **#
+#*******************************************************************#
+EMBEDDING_MODEL=text2vec
+KNOWLEDGE_CHUNK_SIZE=500
+KNOWLEDGE_SEARCH_TOP_SIZE=5
 ## EMBEDDING_TOKENIZER - Tokenizer to use for chunking large inputs
 ## EMBEDDING_TOKEN_LIMIT - Chunk size limit for large inputs
 # EMBEDDING_MODEL=all-MiniLM-L6-v2

diff --git a/pilot/configs/config.py b/pilot/configs/config.py
index 6f6271477..c4458eaf7 100644
--- a/pilot/configs/config.py
+++ b/pilot/configs/config.py
@@ -148,8 +148,8 @@ class Config(metaclass=Singleton):
 
         ### EMBEDDING Configuration
         self.EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text2vec")
-        self.KNOWLEDGE_CHUNK_SIZE = os.getenv("KNOWLEDGE_CHUNK_SIZE", 100)
-        self.KNOWLEDGE_SEARCH_TOP_SIZE = os.getenv("KNOWLEDGE_SEARCH_TOP_SIZE", 10)
+        self.KNOWLEDGE_CHUNK_SIZE = int(os.getenv("KNOWLEDGE_CHUNK_SIZE", 500))
+        self.KNOWLEDGE_SEARCH_TOP_SIZE = int(os.getenv("KNOWLEDGE_SEARCH_TOP_SIZE", 10))
 
         ### SUMMARY_CONFIG Configuration
         self.SUMMARY_CONFIG = os.getenv("SUMMARY_CONFIG", "VECTOR")

diff --git a/pilot/scene/chat_knowledge/custom/chat.py b/pilot/scene/chat_knowledge/custom/chat.py
index 8fc0f3d82..a56b2a098 100644
--- a/pilot/scene/chat_knowledge/custom/chat.py
+++ b/pilot/scene/chat_knowledge/custom/chat.py
@@ -14,7 +14,6 @@ from pilot.configs.model_config import (
     KNOWLEDGE_UPLOAD_ROOT_PATH,
     LLM_MODEL_CONFIG,
     LOGDIR,
-    VECTOR_SEARCH_TOP_K,
 )
 
 from pilot.scene.chat_knowledge.custom.prompt import prompt
@@ -52,7 +51,7 @@ class ChatNewKnowledge(BaseChat):
 
     def generate_input_values(self):
         docs = self.knowledge_embedding_client.similar_search(
-            self.current_user_input, VECTOR_SEARCH_TOP_K
+            self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE
         )
         context = [d.page_content for d in docs]
         context = context[:2000]

diff --git a/pilot/scene/chat_knowledge/default/chat.py b/pilot/scene/chat_knowledge/default/chat.py
index 1087ee2c0..325b03783 100644
--- a/pilot/scene/chat_knowledge/default/chat.py
+++ b/pilot/scene/chat_knowledge/default/chat.py
@@ -14,7 +14,6 @@ from pilot.configs.model_config import (
     KNOWLEDGE_UPLOAD_ROOT_PATH,
     LLM_MODEL_CONFIG,
     LOGDIR,
-    VECTOR_SEARCH_TOP_K,
 )
 
 from pilot.scene.chat_knowledge.default.prompt import prompt
@@ -48,7 +47,7 @@ class ChatDefaultKnowledge(BaseChat):
 
     def generate_input_values(self):
         docs = self.knowledge_embedding_client.similar_search(
-            self.current_user_input, VECTOR_SEARCH_TOP_K
+            self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE
         )
         context = [d.page_content for d in docs]
         context = context[:2000]
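
[Review note] The int() coercion in the config.py hunk above is the load-bearing
part of this change: os.getenv() returns a set environment value as a string, so
without the cast CFG.KNOWLEDGE_CHUNK_SIZE would reach the text splitters as
"500" rather than 500. A minimal sketch of the difference:

    import os

    os.environ["KNOWLEDGE_CHUNK_SIZE"] = "500"

    raw = os.getenv("KNOWLEDGE_CHUNK_SIZE", 500)           # -> "500" (str)
    coerced = int(os.getenv("KNOWLEDGE_CHUNK_SIZE", 500))  # -> 500 (int)
    assert isinstance(raw, str) and isinstance(coerced, int)

Note also that the in-code default for KNOWLEDGE_CHUNK_SIZE moves from 100 to
500, matching the new .env.template value.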
diff --git a/pilot/scene/chat_knowledge/url/chat.py b/pilot/scene/chat_knowledge/url/chat.py
index 2634dc80d..88dc7ad0b 100644
--- a/pilot/scene/chat_knowledge/url/chat.py
+++ b/pilot/scene/chat_knowledge/url/chat.py
@@ -14,7 +14,6 @@ from pilot.configs.model_config import (
     KNOWLEDGE_UPLOAD_ROOT_PATH,
     LLM_MODEL_CONFIG,
     LOGDIR,
-    VECTOR_SEARCH_TOP_K,
 )
 
 from pilot.scene.chat_knowledge.url.prompt import prompt
@@ -56,7 +55,7 @@ class ChatUrlKnowledge(BaseChat):
 
     def generate_input_values(self):
         docs = self.knowledge_embedding_client.similar_search(
-            self.current_user_input, VECTOR_SEARCH_TOP_K
+            self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE
         )
         context = [d.page_content for d in docs]
         context = context[:2000]

diff --git a/pilot/server/vectordb_qa.py b/pilot/server/vectordb_qa.py
index 9faae5eb8..2a09e6a98 100644
--- a/pilot/server/vectordb_qa.py
+++ b/pilot/server/vectordb_qa.py
@@ -3,12 +3,14 @@
 
 from langchain.prompts import PromptTemplate
 
-from pilot.configs.model_config import VECTOR_SEARCH_TOP_K
+from pilot.configs.config import Config
 from pilot.conversation import conv_qa_prompt_template, conv_db_summary_templates
 from pilot.logs import logger
 from pilot.model.llm_out.vicuna_llm import VicunaLLM
 from pilot.vector_store.file_loader import KnownLedge2Vector
 
+CFG = Config()
+
 
 class KnownLedgeBaseQA:
     def __init__(self) -> None:
@@ -22,7 +24,7 @@ class KnownLedgeBaseQA:
         )
 
         retriever = self.vector_store.as_retriever(
-            search_kwargs={"k": VECTOR_SEARCH_TOP_K}
+            search_kwargs={"k": CFG.KNOWLEDGE_SEARCH_TOP_SIZE}
         )
         docs = retriever.get_relevant_documents(query=query)
 

diff --git a/pilot/server/webserver.py b/pilot/server/webserver.py
index f7655fd7d..239fc5d9e 100644
--- a/pilot/server/webserver.py
+++ b/pilot/server/webserver.py
@@ -634,7 +634,6 @@ def knowledge_embedding_store(vs_id, files):
         knowledge_embedding_client = KnowledgeEmbedding(
             file_path=os.path.join(KNOWLEDGE_UPLOAD_ROOT_PATH, vs_id, filename),
             model_name=LLM_MODEL_CONFIG["text2vec"],
-            local_persist=False,
             vector_store_config={
                 "vector_store_name": vector_store_name["vs_name"],
                 "vector_store_path": KNOWLEDGE_UPLOAD_ROOT_PATH,

diff --git a/pilot/source_embedding/csv_embedding.py b/pilot/source_embedding/csv_embedding.py
index 8b2e25ff3..0e69574b4 100644
--- a/pilot/source_embedding/csv_embedding.py
+++ b/pilot/source_embedding/csv_embedding.py
@@ -12,14 +12,12 @@ class CSVEmbedding(SourceEmbedding):
     def __init__(
         self,
         file_path,
-        model_name,
         vector_store_config,
         embedding_args: Optional[Dict] = None,
     ):
         """Initialize with csv path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
         self.file_path = file_path
-        self.model_name = model_name
         self.vector_store_config = vector_store_config
         self.embedding_args = embedding_args
 

diff --git a/pilot/source_embedding/knowledge_embedding.py b/pilot/source_embedding/knowledge_embedding.py
index c81953ffc..1e072c861 100644
--- a/pilot/source_embedding/knowledge_embedding.py
+++ b/pilot/source_embedding/knowledge_embedding.py
@@ -64,7 +64,6 @@ class KnowledgeEmbedding:
         knowledge_class, knowledge_args = KnowledgeEmbeddingType[extension]
         embedding = knowledge_class(
             self.file_path,
-            model_name=self.model_name,
             vector_store_config=self.vector_store_config,
             **knowledge_args,
         )
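
[Review note] KnowledgeEmbedding keeps its own model_name parameter (webserver.py
above still passes LLM_MODEL_CONFIG["text2vec"]), but it no longer forwards it to
the per-type embedders, which now pull everything from vector_store_config. A
hedged caller sketch, with hypothetical file and store names:

    from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH, LLM_MODEL_CONFIG
    from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding

    client = KnowledgeEmbedding(
        file_path=f"{KNOWLEDGE_UPLOAD_ROOT_PATH}/demo/notes.md",  # hypothetical file
        model_name=LLM_MODEL_CONFIG["text2vec"],
        vector_store_config={
            "vector_store_name": "demo",  # hypothetical store name
            "vector_store_path": KNOWLEDGE_UPLOAD_ROOT_PATH,
        },
    )
    client.knowledge_embedding()  # assumption: the ingest entry point has this name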
diff --git a/pilot/source_embedding/markdown_embedding.py b/pilot/source_embedding/markdown_embedding.py
index da974c366..e2851d122 100644
--- a/pilot/source_embedding/markdown_embedding.py
+++ b/pilot/source_embedding/markdown_embedding.py
@@ -8,20 +8,21 @@ from bs4 import BeautifulSoup
 from langchain.document_loaders import TextLoader
 from langchain.schema import Document
 
-from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
+from pilot.configs.config import Config
 from pilot.source_embedding import SourceEmbedding, register
 from pilot.source_embedding.EncodeTextLoader import EncodeTextLoader
 from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
 
+CFG = Config()
+
 
 class MarkdownEmbedding(SourceEmbedding):
     """markdown embedding for read markdown document."""
 
-    def __init__(self, file_path, model_name, vector_store_config):
+    def __init__(self, file_path, vector_store_config):
         """Initialize with markdown path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
         self.file_path = file_path
-        self.model_name = model_name
         self.vector_store_config = vector_store_config
         # self.encoding = encoding
 
@@ -30,7 +31,7 @@ class MarkdownEmbedding(SourceEmbedding):
         """Load from markdown path."""
         loader = EncodeTextLoader(self.file_path)
         text_splitter = CHNDocumentSplitter(
-            pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
+            pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
         )
         return loader.load_and_split(text_splitter)
 

diff --git a/pilot/source_embedding/pdf_embedding.py b/pilot/source_embedding/pdf_embedding.py
index 55f3783f3..6eced03f3 100644
--- a/pilot/source_embedding/pdf_embedding.py
+++ b/pilot/source_embedding/pdf_embedding.py
@@ -5,19 +5,20 @@ from typing import List
 
 from langchain.document_loaders import PyPDFLoader
 from langchain.schema import Document
 
-from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
+from pilot.configs.config import Config
 from pilot.source_embedding import SourceEmbedding, register
 from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
 
+CFG = Config()
+
 
 class PDFEmbedding(SourceEmbedding):
     """pdf embedding for read pdf document."""
 
-    def __init__(self, file_path, model_name, vector_store_config, encoding):
+    def __init__(self, file_path, vector_store_config, encoding):
         """Initialize with pdf path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
         self.file_path = file_path
-        self.model_name = model_name
         self.vector_store_config = vector_store_config
         self.encoding = encoding
 
@@ -27,7 +28,7 @@
         # loader = UnstructuredPaddlePDFLoader(self.file_path)
         loader = PyPDFLoader(self.file_path)
         textsplitter = CHNDocumentSplitter(
-            pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
+            pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
         )
         return loader.load_and_split(textsplitter)
 

diff --git a/pilot/source_embedding/source_embedding.py b/pilot/source_embedding/source_embedding.py
index 7db92ea9b..50c7044f9 100644
--- a/pilot/source_embedding/source_embedding.py
+++ b/pilot/source_embedding/source_embedding.py
@@ -23,13 +23,11 @@ class SourceEmbedding(ABC):
     def __init__(
         self,
         file_path,
-        model_name,
         vector_store_config,
         embedding_args: Optional[Dict] = None,
     ):
         """Initialize with Loader url, model_name, vector_store_config"""
         self.file_path = file_path
-        self.model_name = model_name
         self.vector_store_config = vector_store_config
         self.embedding_args = embedding_args
         self.embeddings = vector_store_config["embeddings"]
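
[Review note] With model_name gone from SourceEmbedding.__init__, the embedding
object itself now arrives through vector_store_config["embeddings"] (the last
context line above). Any new document type only needs the two-argument super()
call. A sketch of a hypothetical subclass following the same pattern as the
classes in this patch:

    from typing import List

    from langchain.schema import Document

    from pilot.source_embedding import SourceEmbedding, register


    class TxtEmbedding(SourceEmbedding):  # hypothetical, not part of this patch
        """txt embedding for read plain-text document."""

        def __init__(self, file_path, vector_store_config):
            """Initialize with txt path."""
            super().__init__(file_path, vector_store_config)
            self.file_path = file_path
            self.vector_store_config = vector_store_config

        @register
        def read(self) -> List[Document]:
            """Load from txt path."""
            with open(self.file_path, encoding="utf-8") as f:
                return [Document(page_content=f.read())]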
diff --git a/pilot/source_embedding/string_embedding.py b/pilot/source_embedding/string_embedding.py
index b4d7b1228..a1d18ee82 100644
--- a/pilot/source_embedding/string_embedding.py
+++ b/pilot/source_embedding/string_embedding.py
@@ -8,11 +8,10 @@ from pilot import SourceEmbedding, register
 class StringEmbedding(SourceEmbedding):
     """string embedding for read string document."""
 
-    def __init__(self, file_path, model_name, vector_store_config):
+    def __init__(self, file_path, vector_store_config):
         """Initialize with pdf path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
         self.file_path = file_path
-        self.model_name = model_name
         self.vector_store_config = vector_store_config
 
     @register

diff --git a/pilot/source_embedding/url_embedding.py b/pilot/source_embedding/url_embedding.py
index 39224a9f4..a315e6e45 100644
--- a/pilot/source_embedding/url_embedding.py
+++ b/pilot/source_embedding/url_embedding.py
@@ -16,11 +16,10 @@ CFG = Config()
 class URLEmbedding(SourceEmbedding):
     """url embedding for read url document."""
 
-    def __init__(self, file_path, model_name, vector_store_config):
+    def __init__(self, file_path, vector_store_config):
         """Initialize with url path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
         self.file_path = file_path
-        self.model_name = model_name
         self.vector_store_config = vector_store_config
 
     @register
@@ -29,7 +28,7 @@
         loader = WebBaseLoader(web_path=self.file_path)
         if CFG.LANGUAGE == "en":
             text_splitter = CharacterTextSplitter(
-                chunk_size=KNOWLEDGE_CHUNK_SPLIT_SIZE,
+                chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
                 chunk_overlap=20,
                 length_function=len,
             )

diff --git a/pilot/source_embedding/word_embedding.py b/pilot/source_embedding/word_embedding.py
index 5dd2f0199..1f30f241c 100644
--- a/pilot/source_embedding/word_embedding.py
+++ b/pilot/source_embedding/word_embedding.py
@@ -5,19 +5,20 @@ from typing import List
 
 from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
 from langchain.schema import Document
 
-from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
+from pilot.configs.config import Config
 from pilot.source_embedding import SourceEmbedding, register
 from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
 
+CFG = Config()
+
 
 class WordEmbedding(SourceEmbedding):
     """word embedding for read word document."""
 
-    def __init__(self, file_path, model_name, vector_store_config):
+    def __init__(self, file_path, vector_store_config):
         """Initialize with word path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
         self.file_path = file_path
-        self.model_name = model_name
         self.vector_store_config = vector_store_config
 
     @register
@@ -25,7 +26,7 @@
         """Load from word path."""
         loader = UnstructuredWordDocumentLoader(self.file_path)
         textsplitter = CHNDocumentSplitter(
-            pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
+            pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
        )
         return loader.load_and_split(textsplitter)
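
[Review note] Each converted module creates a module-level CFG = Config() at
import time. Config is declared with metaclass=Singleton (visible in the
config.py hunk header above), so these repeated constructions all yield one
shared instance and the environment is read once. A minimal sketch of that
pattern, under the assumption the project's Singleton follows the classic
recipe:

    class Singleton(type):
        _instances = {}

        def __call__(cls, *args, **kwargs):
            # Hand back the first instance ever built for this class.
            if cls not in cls._instances:
                cls._instances[cls] = super().__call__(*args, **kwargs)
            return cls._instances[cls]


    class Config(metaclass=Singleton):
        pass


    assert Config() is Config()

One consequence worth noting: environment variables must be set before the
first Config() call anywhere in the process, or the cached instance keeps the
old values.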
diff --git a/pilot/summary/db_summary_client.py b/pilot/summary/db_summary_client.py
index c5bfcc718..3dfbede72 100644
--- a/pilot/summary/db_summary_client.py
+++ b/pilot/summary/db_summary_client.py
@@ -34,24 +34,21 @@ class DBSummaryClient:
             "embeddings": embeddings,
         }
         embedding = StringEmbedding(
-            db_summary_client.get_summery(),
-            LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
-            vector_store_config,
+            file_path=db_summary_client.get_summery(),
+            vector_store_config=vector_store_config,
         )
         if not embedding.vector_name_exist():
             if CFG.SUMMARY_CONFIG == "FAST":
                 for vector_table_info in db_summary_client.get_summery():
                     embedding = StringEmbedding(
                         vector_table_info,
-                        LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
                         vector_store_config,
                     )
                     embedding.source_embedding()
             else:
                 embedding = StringEmbedding(
-                    db_summary_client.get_summery(),
-                    LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
-                    vector_store_config,
+                    file_path=db_summary_client.get_summery(),
+                    vector_store_config=vector_store_config,
                 )
                 embedding.source_embedding()
         for (
@@ -64,7 +61,7 @@ class DBSummaryClient:
                 }
                 embedding = StringEmbedding(
                     table_summary,
-                    LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
                     table_vector_store_config,
                 )
                 embedding.source_embedding()
@@ -124,30 +120,3 @@ def _get_llm_response(query, db_input, dbsummary):
     )
     res = chat.nostream_call()
     return json.loads(res)["table"]
-
-
-# if __name__ == "__main__":
-#     # summary = DBSummaryClient.get_similar_tables("db_test", "查询在线用户的购物车", 10)
-#
-#     text= """Based on the input "查询在线聊天的用户好友" and the known database information, the tables involved in the user input are "chat_users" and "friends".
-#     Response:
-#
-#     {
-#         "table": ["chat_users"]
-#     }"""
-#     text = text.rstrip().replace("\n","")
-#     start = text.find("{")
-#     end = text.find("}") + 1
-#
-#     # 从字符串中截取出JSON数据
-#     json_str = text[start:end]
-#
-#     # 将JSON数据转换为Python中的字典类型
-#     data = json.loads(json_str)
-#     # pattern = r'{\s*"table"\s*:\s*\[[^\]]*\]\s*}'
-#     # match = re.search(pattern, text)
-#     # if match:
-#     #     json_string = match.group(0)
-#     #     # 将JSON字符串转换为Python对象
-#     #     json_obj = json.loads(json_string)
-#     # print(summary)

diff --git a/pilot/vector_store/file_loader.py b/pilot/vector_store/file_loader.py
index c42eda7a6..cca027324 100644
--- a/pilot/vector_store/file_loader.py
+++ b/pilot/vector_store/file_loader.py
@@ -17,7 +17,6 @@ from langchain.vectorstores import Chroma
 from pilot.configs.model_config import (
     DATASETS_DIR,
     LLM_MODEL_CONFIG,
-    VECTOR_SEARCH_TOP_K,
     VECTORE_PATH,
 )
 
@@ -41,7 +40,6 @@ class KnownLedge2Vector:
 
     embeddings: object = None
     model_name = LLM_MODEL_CONFIG["sentence-transforms"]
-    top_k: int = VECTOR_SEARCH_TOP_K
 
     def __init__(self, model_name=None) -> None:
         if not model_name:

diff --git a/tools/knowlege_init.py b/tools/knowlege_init.py
index e886e4d85..ff13865b4 100644
--- a/tools/knowlege_init.py
+++ b/tools/knowlege_init.py
@@ -10,7 +10,6 @@ from pilot.configs.config import Config
 from pilot.configs.model_config import (
     DATASETS_DIR,
     LLM_MODEL_CONFIG,
-    VECTOR_SEARCH_TOP_K,
 )
 from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding
 
@@ -19,7 +18,6 @@ CFG = Config()
 
 class LocalKnowledgeInit:
     embeddings: object = None
-    top_k: int = VECTOR_SEARCH_TOP_K
 
     def __init__(self, vector_store_config) -> None:
         self.vector_store_config = vector_store_config
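
[Review note] End to end, the two knobs introduced in .env.template now drive
both chunking and retrieval everywhere VECTOR_SEARCH_TOP_K and
KNOWLEDGE_CHUNK_SPLIT_SIZE used to be imported. A hedged usage sketch (the
values are examples, not the shipped defaults):

    import os

    # Must happen before the first Config() anywhere in the process.
    os.environ["KNOWLEDGE_CHUNK_SIZE"] = "1000"    # sentence size for the splitters
    os.environ["KNOWLEDGE_SEARCH_TOP_SIZE"] = "3"  # k for similar_search / retriever

    from pilot.configs.config import Config

    CFG = Config()
    print(CFG.KNOWLEDGE_CHUNK_SIZE, CFG.KNOWLEDGE_SEARCH_TOP_SIZE)  # 1000 3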