From e29fa37cde7d62bd200bfa82a29e23435c38d64c Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Mon, 5 Jun 2023 18:08:55 +0800
Subject: [PATCH] update:knowledge env

---
 .env.template                                 |  8 +++-
 pilot/configs/config.py                       |  4 +-
 pilot/scene/chat_knowledge/custom/chat.py     |  3 +-
 pilot/scene/chat_knowledge/default/chat.py    |  3 +-
 pilot/scene/chat_knowledge/url/chat.py        |  3 +-
 pilot/server/vectordb_qa.py                   |  6 ++-
 pilot/server/webserver.py                     |  1 -
 pilot/source_embedding/csv_embedding.py       |  4 +-
 pilot/source_embedding/knowledge_embedding.py |  1 -
 pilot/source_embedding/markdown_embedding.py  | 11 +++---
 pilot/source_embedding/pdf_embedding.py       | 11 +++---
 pilot/source_embedding/source_embedding.py    |  2 -
 pilot/source_embedding/string_embedding.py    |  5 +--
 pilot/source_embedding/url_embedding.py       |  7 ++--
 pilot/source_embedding/word_embedding.py      | 11 +++---
 pilot/summary/db_summary_client.py            | 39 ++-----------------
 pilot/vector_store/file_loader.py             |  2 -
 tools/knowlege_init.py                        |  2 -
 18 files changed, 43 insertions(+), 80 deletions(-)

diff --git a/.env.template b/.env.template
index 3e8ae536b..234b12738 100644
--- a/.env.template
+++ b/.env.template
@@ -28,8 +28,12 @@ MAX_POSITION_EMBEDDINGS=4096
 
 # FAST_LLM_MODEL=chatglm-6b
 
-### EMBEDDINGS
-## EMBEDDING_MODEL - Model to use for creating embeddings
+#*******************************************************************#
+#**                      EMBEDDING SETTINGS                       **#
+#*******************************************************************#
+EMBEDDING_MODEL=text2vec
+KNOWLEDGE_CHUNK_SIZE=500
+KNOWLEDGE_SEARCH_TOP_SIZE=5
 ## EMBEDDING_TOKENIZER - Tokenizer to use for chunking large inputs
 ## EMBEDDING_TOKEN_LIMIT - Chunk size limit for large inputs
 # EMBEDDING_MODEL=all-MiniLM-L6-v2

diff --git a/pilot/configs/config.py b/pilot/configs/config.py
index 6f6271477..c4458eaf7 100644
--- a/pilot/configs/config.py
+++ b/pilot/configs/config.py
@@ -148,8 +148,8 @@ class Config(metaclass=Singleton):
 
         ### EMBEDDING Configuration
         self.EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text2vec")
-        self.KNOWLEDGE_CHUNK_SIZE = os.getenv("KNOWLEDGE_CHUNK_SIZE", 100)
-        self.KNOWLEDGE_SEARCH_TOP_SIZE = os.getenv("KNOWLEDGE_SEARCH_TOP_SIZE", 10)
+        self.KNOWLEDGE_CHUNK_SIZE = int(os.getenv("KNOWLEDGE_CHUNK_SIZE", 500))
+        self.KNOWLEDGE_SEARCH_TOP_SIZE = int(os.getenv("KNOWLEDGE_SEARCH_TOP_SIZE", 10))
 
         ### SUMMARY_CONFIG Configuration
         self.SUMMARY_CONFIG = os.getenv("SUMMARY_CONFIG", "VECTOR")

diff --git a/pilot/scene/chat_knowledge/custom/chat.py b/pilot/scene/chat_knowledge/custom/chat.py
index 8fc0f3d82..a56b2a098 100644
--- a/pilot/scene/chat_knowledge/custom/chat.py
+++ b/pilot/scene/chat_knowledge/custom/chat.py
@@ -14,7 +14,6 @@ from pilot.configs.model_config import (
     KNOWLEDGE_UPLOAD_ROOT_PATH,
     LLM_MODEL_CONFIG,
     LOGDIR,
-    VECTOR_SEARCH_TOP_K,
 )
 
 from pilot.scene.chat_knowledge.custom.prompt import prompt
@@ -52,7 +51,7 @@ class ChatNewKnowledge(BaseChat):
 
     def generate_input_values(self):
         docs = self.knowledge_embedding_client.similar_search(
-            self.current_user_input, VECTOR_SEARCH_TOP_K
+            self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE
         )
         context = [d.page_content for d in docs]
         context = context[:2000]

diff --git a/pilot/scene/chat_knowledge/default/chat.py b/pilot/scene/chat_knowledge/default/chat.py
index 1087ee2c0..325b03783 100644
--- a/pilot/scene/chat_knowledge/default/chat.py
+++ b/pilot/scene/chat_knowledge/default/chat.py
@@ -14,7 +14,6 @@ from pilot.configs.model_config import (
     KNOWLEDGE_UPLOAD_ROOT_PATH,
     LLM_MODEL_CONFIG,
     LOGDIR,
-    VECTOR_SEARCH_TOP_K,
 )
 
 from pilot.scene.chat_knowledge.default.prompt import prompt
@@ -48,7 +47,7 @@ class ChatDefaultKnowledge(BaseChat):
 
     def generate_input_values(self):
         docs = self.knowledge_embedding_client.similar_search(
-            self.current_user_input, VECTOR_SEARCH_TOP_K
+            self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE
         )
         context = [d.page_content for d in docs]
         context = context[:2000]
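
[Review note] The int() coercion in the config.py hunk above is the load-bearing
part of this change: os.getenv() returns a set environment value as a string, so
without the cast CFG.KNOWLEDGE_CHUNK_SIZE would reach the text splitters as
"500" rather than 500. A minimal sketch of the difference:

    import os

    os.environ["KNOWLEDGE_CHUNK_SIZE"] = "500"

    raw = os.getenv("KNOWLEDGE_CHUNK_SIZE", 500)           # -> "500" (str)
    coerced = int(os.getenv("KNOWLEDGE_CHUNK_SIZE", 500))  # -> 500 (int)
    assert isinstance(raw, str) and isinstance(coerced, int)

Note also that the in-code default for KNOWLEDGE_CHUNK_SIZE moves from 100 to
500, matching the new .env.template value.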
diff --git a/pilot/scene/chat_knowledge/url/chat.py b/pilot/scene/chat_knowledge/url/chat.py
index 2634dc80d..88dc7ad0b 100644
--- a/pilot/scene/chat_knowledge/url/chat.py
+++ b/pilot/scene/chat_knowledge/url/chat.py
@@ -14,7 +14,6 @@ from pilot.configs.model_config import (
     KNOWLEDGE_UPLOAD_ROOT_PATH,
     LLM_MODEL_CONFIG,
     LOGDIR,
-    VECTOR_SEARCH_TOP_K,
 )
 
 from pilot.scene.chat_knowledge.url.prompt import prompt
@@ -56,7 +55,7 @@ class ChatUrlKnowledge(BaseChat):
 
     def generate_input_values(self):
         docs = self.knowledge_embedding_client.similar_search(
-            self.current_user_input, VECTOR_SEARCH_TOP_K
+            self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE
         )
         context = [d.page_content for d in docs]
         context = context[:2000]

diff --git a/pilot/server/vectordb_qa.py b/pilot/server/vectordb_qa.py
index 9faae5eb8..2a09e6a98 100644
--- a/pilot/server/vectordb_qa.py
+++ b/pilot/server/vectordb_qa.py
@@ -3,12 +3,14 @@
 
 from langchain.prompts import PromptTemplate
 
-from pilot.configs.model_config import VECTOR_SEARCH_TOP_K
+from pilot.configs.config import Config
 from pilot.conversation import conv_qa_prompt_template, conv_db_summary_templates
 from pilot.logs import logger
 from pilot.model.llm_out.vicuna_llm import VicunaLLM
 from pilot.vector_store.file_loader import KnownLedge2Vector
 
+CFG = Config()
+
 
 class KnownLedgeBaseQA:
     def __init__(self) -> None:
@@ -22,7 +24,7 @@ class KnownLedgeBaseQA:
         )
 
         retriever = self.vector_store.as_retriever(
-            search_kwargs={"k": VECTOR_SEARCH_TOP_K}
+            search_kwargs={"k": CFG.KNOWLEDGE_SEARCH_TOP_SIZE}
         )
         docs = retriever.get_relevant_documents(query=query)
 

diff --git a/pilot/server/webserver.py b/pilot/server/webserver.py
index f7655fd7d..239fc5d9e 100644
--- a/pilot/server/webserver.py
+++ b/pilot/server/webserver.py
@@ -634,7 +634,6 @@ def knowledge_embedding_store(vs_id, files):
         knowledge_embedding_client = KnowledgeEmbedding(
             file_path=os.path.join(KNOWLEDGE_UPLOAD_ROOT_PATH, vs_id, filename),
             model_name=LLM_MODEL_CONFIG["text2vec"],
-            local_persist=False,
             vector_store_config={
                 "vector_store_name": vector_store_name["vs_name"],
                 "vector_store_path": KNOWLEDGE_UPLOAD_ROOT_PATH,

diff --git a/pilot/source_embedding/csv_embedding.py b/pilot/source_embedding/csv_embedding.py
index 8b2e25ff3..0e69574b4 100644
--- a/pilot/source_embedding/csv_embedding.py
+++ b/pilot/source_embedding/csv_embedding.py
@@ -12,14 +12,12 @@ class CSVEmbedding(SourceEmbedding):
     def __init__(
         self,
         file_path,
-        model_name,
         vector_store_config,
         embedding_args: Optional[Dict] = None,
     ):
         """Initialize with csv path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
         self.file_path = file_path
-        self.model_name = model_name
         self.vector_store_config = vector_store_config
         self.embedding_args = embedding_args
 

diff --git a/pilot/source_embedding/knowledge_embedding.py b/pilot/source_embedding/knowledge_embedding.py
index c81953ffc..1e072c861 100644
--- a/pilot/source_embedding/knowledge_embedding.py
+++ b/pilot/source_embedding/knowledge_embedding.py
@@ -64,7 +64,6 @@ class KnowledgeEmbedding:
         knowledge_class, knowledge_args = KnowledgeEmbeddingType[extension]
         embedding = knowledge_class(
             self.file_path,
-            model_name=self.model_name,
             vector_store_config=self.vector_store_config,
             **knowledge_args,
         )
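
[Review note] KnowledgeEmbedding keeps its own model_name parameter (webserver.py
above still passes LLM_MODEL_CONFIG["text2vec"]), but it no longer forwards it to
the per-type embedders, which now pull everything from vector_store_config. A
hedged caller sketch, with hypothetical file and store names:

    from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH, LLM_MODEL_CONFIG
    from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding

    client = KnowledgeEmbedding(
        file_path=f"{KNOWLEDGE_UPLOAD_ROOT_PATH}/demo/notes.md",  # hypothetical file
        model_name=LLM_MODEL_CONFIG["text2vec"],
        vector_store_config={
            "vector_store_name": "demo",  # hypothetical store name
            "vector_store_path": KNOWLEDGE_UPLOAD_ROOT_PATH,
        },
    )
    client.knowledge_embedding()  # assumption: the ingest entry point has this name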
diff --git a/pilot/source_embedding/markdown_embedding.py b/pilot/source_embedding/markdown_embedding.py
index da974c366..e2851d122 100644
--- a/pilot/source_embedding/markdown_embedding.py
+++ b/pilot/source_embedding/markdown_embedding.py
@@ -8,20 +8,21 @@ from bs4 import BeautifulSoup
 from langchain.document_loaders import TextLoader
 from langchain.schema import Document
 
-from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
+from pilot.configs.config import Config
 from pilot.source_embedding import SourceEmbedding, register
 from pilot.source_embedding.EncodeTextLoader import EncodeTextLoader
 from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
 
+CFG = Config()
+
 
 class MarkdownEmbedding(SourceEmbedding):
     """markdown embedding for read markdown document."""
 
-    def __init__(self, file_path, model_name, vector_store_config):
+    def __init__(self, file_path, vector_store_config):
         """Initialize with markdown path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
         self.file_path = file_path
-        self.model_name = model_name
         self.vector_store_config = vector_store_config
         # self.encoding = encoding
 
@@ -30,7 +31,7 @@ class MarkdownEmbedding(SourceEmbedding):
         """Load from markdown path."""
         loader = EncodeTextLoader(self.file_path)
         text_splitter = CHNDocumentSplitter(
-            pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
+            pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
         )
         return loader.load_and_split(text_splitter)
 

diff --git a/pilot/source_embedding/pdf_embedding.py b/pilot/source_embedding/pdf_embedding.py
index 55f3783f3..6eced03f3 100644
--- a/pilot/source_embedding/pdf_embedding.py
+++ b/pilot/source_embedding/pdf_embedding.py
@@ -5,19 +5,20 @@ from typing import List
 
 from langchain.document_loaders import PyPDFLoader
 from langchain.schema import Document
 
-from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
+from pilot.configs.config import Config
 from pilot.source_embedding import SourceEmbedding, register
 from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
 
+CFG = Config()
+
 
 class PDFEmbedding(SourceEmbedding):
     """pdf embedding for read pdf document."""
 
-    def __init__(self, file_path, model_name, vector_store_config, encoding):
+    def __init__(self, file_path, vector_store_config, encoding):
         """Initialize with pdf path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
         self.file_path = file_path
-        self.model_name = model_name
         self.vector_store_config = vector_store_config
         self.encoding = encoding
 
@@ -27,7 +28,7 @@
         # loader = UnstructuredPaddlePDFLoader(self.file_path)
         loader = PyPDFLoader(self.file_path)
         textsplitter = CHNDocumentSplitter(
-            pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
+            pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
         )
         return loader.load_and_split(textsplitter)
 

diff --git a/pilot/source_embedding/source_embedding.py b/pilot/source_embedding/source_embedding.py
index 7db92ea9b..50c7044f9 100644
--- a/pilot/source_embedding/source_embedding.py
+++ b/pilot/source_embedding/source_embedding.py
@@ -23,13 +23,11 @@ class SourceEmbedding(ABC):
     def __init__(
         self,
         file_path,
-        model_name,
         vector_store_config,
         embedding_args: Optional[Dict] = None,
     ):
         """Initialize with Loader url, model_name, vector_store_config"""
         self.file_path = file_path
-        self.model_name = model_name
         self.vector_store_config = vector_store_config
         self.embedding_args = embedding_args
         self.embeddings = vector_store_config["embeddings"]
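
[Review note] With model_name gone from SourceEmbedding.__init__, the embedding
object itself now arrives through vector_store_config["embeddings"] (the last
context line above). Any new document type only needs the two-argument super()
call. A sketch of a hypothetical subclass following the same pattern as the
classes in this patch:

    from typing import List

    from langchain.schema import Document

    from pilot.source_embedding import SourceEmbedding, register


    class TxtEmbedding(SourceEmbedding):  # hypothetical, not part of this patch
        """txt embedding for read plain-text document."""

        def __init__(self, file_path, vector_store_config):
            """Initialize with txt path."""
            super().__init__(file_path, vector_store_config)
            self.file_path = file_path
            self.vector_store_config = vector_store_config

        @register
        def read(self) -> List[Document]:
            """Load from txt path."""
            with open(self.file_path, encoding="utf-8") as f:
                return [Document(page_content=f.read())]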
diff --git a/pilot/source_embedding/string_embedding.py b/pilot/source_embedding/string_embedding.py
index b4d7b1228..a1d18ee82 100644
--- a/pilot/source_embedding/string_embedding.py
+++ b/pilot/source_embedding/string_embedding.py
@@ -8,11 +8,10 @@ from pilot import SourceEmbedding, register
 class StringEmbedding(SourceEmbedding):
     """string embedding for read string document."""
 
-    def __init__(self, file_path, model_name, vector_store_config):
+    def __init__(self, file_path, vector_store_config):
         """Initialize with pdf path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
         self.file_path = file_path
-        self.model_name = model_name
         self.vector_store_config = vector_store_config
 
     @register

diff --git a/pilot/source_embedding/url_embedding.py b/pilot/source_embedding/url_embedding.py
index 39224a9f4..a315e6e45 100644
--- a/pilot/source_embedding/url_embedding.py
+++ b/pilot/source_embedding/url_embedding.py
@@ -16,11 +16,10 @@ CFG = Config()
 class URLEmbedding(SourceEmbedding):
     """url embedding for read url document."""
 
-    def __init__(self, file_path, model_name, vector_store_config):
+    def __init__(self, file_path, vector_store_config):
         """Initialize with url path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
         self.file_path = file_path
-        self.model_name = model_name
         self.vector_store_config = vector_store_config
 
     @register
@@ -29,7 +28,7 @@
         loader = WebBaseLoader(web_path=self.file_path)
         if CFG.LANGUAGE == "en":
             text_splitter = CharacterTextSplitter(
-                chunk_size=KNOWLEDGE_CHUNK_SPLIT_SIZE,
+                chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
                 chunk_overlap=20,
                 length_function=len,
             )

diff --git a/pilot/source_embedding/word_embedding.py b/pilot/source_embedding/word_embedding.py
index 5dd2f0199..1f30f241c 100644
--- a/pilot/source_embedding/word_embedding.py
+++ b/pilot/source_embedding/word_embedding.py
@@ -5,19 +5,20 @@ from typing import List
 
 from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
 from langchain.schema import Document
 
-from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
+from pilot.configs.config import Config
 from pilot.source_embedding import SourceEmbedding, register
 from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
 
+CFG = Config()
+
 
 class WordEmbedding(SourceEmbedding):
     """word embedding for read word document."""
 
-    def __init__(self, file_path, model_name, vector_store_config):
+    def __init__(self, file_path, vector_store_config):
         """Initialize with word path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
         self.file_path = file_path
-        self.model_name = model_name
         self.vector_store_config = vector_store_config
 
     @register
@@ -25,7 +26,7 @@
         """Load from word path."""
         loader = UnstructuredWordDocumentLoader(self.file_path)
         textsplitter = CHNDocumentSplitter(
-            pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
+            pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
        )
         return loader.load_and_split(textsplitter)
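
[Review note] Each converted module creates a module-level CFG = Config() at
import time. Config is declared with metaclass=Singleton (visible in the
config.py hunk header above), so these repeated constructions all yield one
shared instance and the environment is read once. A minimal sketch of that
pattern, under the assumption the project's Singleton follows the classic
recipe:

    class Singleton(type):
        _instances = {}

        def __call__(cls, *args, **kwargs):
            # Hand back the first instance ever built for this class.
            if cls not in cls._instances:
                cls._instances[cls] = super().__call__(*args, **kwargs)
            return cls._instances[cls]


    class Config(metaclass=Singleton):
        pass


    assert Config() is Config()

One consequence worth noting: environment variables must be set before the
first Config() call anywhere in the process, or the cached instance keeps the
old values.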
diff --git a/pilot/summary/db_summary_client.py b/pilot/summary/db_summary_client.py
index c5bfcc718..3dfbede72 100644
--- a/pilot/summary/db_summary_client.py
+++ b/pilot/summary/db_summary_client.py
@@ -34,24 +34,21 @@ class DBSummaryClient:
             "embeddings": embeddings,
         }
         embedding = StringEmbedding(
-            db_summary_client.get_summery(),
-            LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
-            vector_store_config,
+            file_path=db_summary_client.get_summery(),
+            vector_store_config=vector_store_config,
         )
         if not embedding.vector_name_exist():
             if CFG.SUMMARY_CONFIG == "FAST":
                 for vector_table_info in db_summary_client.get_summery():
                     embedding = StringEmbedding(
                         vector_table_info,
-                        LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
                         vector_store_config,
                     )
                     embedding.source_embedding()
             else:
                 embedding = StringEmbedding(
-                    db_summary_client.get_summery(),
-                    LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
-                    vector_store_config,
+                    file_path=db_summary_client.get_summery(),
+                    vector_store_config=vector_store_config,
                 )
                 embedding.source_embedding()
         for (
@@ -64,7 +61,7 @@ class DBSummaryClient:
                 }
                 embedding = StringEmbedding(
                     table_summary,
-                    LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
                     table_vector_store_config,
                 )
                 embedding.source_embedding()
@@ -124,30 +120,3 @@ def _get_llm_response(query, db_input, dbsummary):
     )
     res = chat.nostream_call()
     return json.loads(res)["table"]
-
-
-# if __name__ == "__main__":
-#     # summary = DBSummaryClient.get_similar_tables("db_test", "查询在线用户的购物车", 10)
-#
-#     text= """Based on the input "查询在线聊天的用户好友" and the known database information, the tables involved in the user input are "chat_users" and "friends".
-#     Response:
-#
-#     {
-#         "table": ["chat_users"]
-#     }"""
-#     text = text.rstrip().replace("\n","")
-#     start = text.find("{")
-#     end = text.find("}") + 1
-#
-#     # 从字符串中截取出JSON数据
-#     json_str = text[start:end]
-#
-#     # 将JSON数据转换为Python中的字典类型
-#     data = json.loads(json_str)
-#     # pattern = r'{\s*"table"\s*:\s*\[[^\]]*\]\s*}'
-#     # match = re.search(pattern, text)
-#     # if match:
-#     #     json_string = match.group(0)
-#     #     # 将JSON字符串转换为Python对象
-#     #     json_obj = json.loads(json_string)
-#     # print(summary)

diff --git a/pilot/vector_store/file_loader.py b/pilot/vector_store/file_loader.py
index c42eda7a6..cca027324 100644
--- a/pilot/vector_store/file_loader.py
+++ b/pilot/vector_store/file_loader.py
@@ -17,7 +17,6 @@ from langchain.vectorstores import Chroma
 from pilot.configs.model_config import (
     DATASETS_DIR,
     LLM_MODEL_CONFIG,
-    VECTOR_SEARCH_TOP_K,
     VECTORE_PATH,
 )
 
@@ -41,7 +40,6 @@ class KnownLedge2Vector:
 
     embeddings: object = None
     model_name = LLM_MODEL_CONFIG["sentence-transforms"]
-    top_k: int = VECTOR_SEARCH_TOP_K
 
     def __init__(self, model_name=None) -> None:
         if not model_name:

diff --git a/tools/knowlege_init.py b/tools/knowlege_init.py
index e886e4d85..ff13865b4 100644
--- a/tools/knowlege_init.py
+++ b/tools/knowlege_init.py
@@ -10,7 +10,6 @@ from pilot.configs.config import Config
 from pilot.configs.model_config import (
     DATASETS_DIR,
     LLM_MODEL_CONFIG,
-    VECTOR_SEARCH_TOP_K,
 )
 from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding
 
@@ -19,7 +18,6 @@ CFG = Config()
 
 class LocalKnowledgeInit:
     embeddings: object = None
-    top_k: int = VECTOR_SEARCH_TOP_K
 
     def __init__(self, vector_store_config) -> None:
         self.vector_store_config = vector_store_config
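
[Review note] End to end, the two knobs introduced in .env.template now drive
both chunking and retrieval everywhere VECTOR_SEARCH_TOP_K and
KNOWLEDGE_CHUNK_SPLIT_SIZE used to be imported. A hedged usage sketch (the
values are examples, not the shipped defaults):

    import os

    # Must happen before the first Config() anywhere in the process.
    os.environ["KNOWLEDGE_CHUNK_SIZE"] = "1000"    # sentence size for the splitters
    os.environ["KNOWLEDGE_SEARCH_TOP_SIZE"] = "3"  # k for similar_search / retriever

    from pilot.configs.config import Config

    CFG = Config()
    print(CFG.KNOWLEDGE_CHUNK_SIZE, CFG.KNOWLEDGE_SEARCH_TOP_SIZE)  # 1000 3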