update:knowledge env

2025-08-03 09:34:04 +00:00 · 2023-06-05 18:08:55 +08:00 · 2023-06-05 18:08:55 +08:00 · e29fa37cde
commit e29fa37cde
parent f2f28fee42
18 changed files with 43 additions and 80 deletions
--- a/.env.template
+++ b/.env.template
@@ -28,8 +28,12 @@ MAX_POSITION_EMBEDDINGS=4096
 # FAST_LLM_MODEL=chatglm-6b


-### EMBEDDINGS
-## EMBEDDING_MODEL       - Model to use for creating embeddings
+#*******************************************************************#
+#**                         EMBEDDING SETTINGS                    **#
+#*******************************************************************#
+EMBEDDING_MODEL=text2vec
+KNOWLEDGE_CHUNK_SIZE=500
+KNOWLEDGE_SEARCH_TOP_SIZE=5
 ## EMBEDDING_TOKENIZER   - Tokenizer to use for chunking large inputs
 ## EMBEDDING_TOKEN_LIMIT - Chunk size limit for large inputs
 # EMBEDDING_MODEL=all-MiniLM-L6-v2
--- a/pilot/configs/config.py
+++ b/pilot/configs/config.py
@@ -148,8 +148,8 @@ class Config(metaclass=Singleton):

        ### EMBEDDING Configuration
        self.EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text2vec")
-        self.KNOWLEDGE_CHUNK_SIZE = os.getenv("KNOWLEDGE_CHUNK_SIZE", 100)
-        self.KNOWLEDGE_SEARCH_TOP_SIZE = os.getenv("KNOWLEDGE_SEARCH_TOP_SIZE", 10)
+        self.KNOWLEDGE_CHUNK_SIZE = int(os.getenv("KNOWLEDGE_CHUNK_SIZE", 500))
+        self.KNOWLEDGE_SEARCH_TOP_SIZE = int(os.getenv("KNOWLEDGE_SEARCH_TOP_SIZE", 10))
        ### SUMMARY_CONFIG Configuration
        self.SUMMARY_CONFIG = os.getenv("SUMMARY_CONFIG", "VECTOR")

--- a/pilot/scene/chat_knowledge/custom/chat.py
+++ b/pilot/scene/chat_knowledge/custom/chat.py
@@ -14,7 +14,6 @@ from pilot.configs.model_config import (
    KNOWLEDGE_UPLOAD_ROOT_PATH,
    LLM_MODEL_CONFIG,
    LOGDIR,
-    VECTOR_SEARCH_TOP_K,
 )

 from pilot.scene.chat_knowledge.custom.prompt import prompt
@@ -52,7 +51,7 @@ class ChatNewKnowledge(BaseChat):

    def generate_input_values(self):
        docs = self.knowledge_embedding_client.similar_search(
-            self.current_user_input, VECTOR_SEARCH_TOP_K
+            self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE
        )
        context = [d.page_content for d in docs]
        context = context[:2000]
--- a/pilot/scene/chat_knowledge/default/chat.py
+++ b/pilot/scene/chat_knowledge/default/chat.py
@@ -14,7 +14,6 @@ from pilot.configs.model_config import (
    KNOWLEDGE_UPLOAD_ROOT_PATH,
    LLM_MODEL_CONFIG,
    LOGDIR,
-    VECTOR_SEARCH_TOP_K,
 )

 from pilot.scene.chat_knowledge.default.prompt import prompt
@@ -48,7 +47,7 @@ class ChatDefaultKnowledge(BaseChat):

    def generate_input_values(self):
        docs = self.knowledge_embedding_client.similar_search(
-            self.current_user_input, VECTOR_SEARCH_TOP_K
+            self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE
        )
        context = [d.page_content for d in docs]
        context = context[:2000]
--- a/pilot/scene/chat_knowledge/url/chat.py
+++ b/pilot/scene/chat_knowledge/url/chat.py
@@ -14,7 +14,6 @@ from pilot.configs.model_config import (
    KNOWLEDGE_UPLOAD_ROOT_PATH,
    LLM_MODEL_CONFIG,
    LOGDIR,
-    VECTOR_SEARCH_TOP_K,
 )

 from pilot.scene.chat_knowledge.url.prompt import prompt
@@ -56,7 +55,7 @@ class ChatUrlKnowledge(BaseChat):

    def generate_input_values(self):
        docs = self.knowledge_embedding_client.similar_search(
-            self.current_user_input, VECTOR_SEARCH_TOP_K
+            self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE
        )
        context = [d.page_content for d in docs]
        context = context[:2000]
--- a/pilot/server/vectordb_qa.py
+++ b/pilot/server/vectordb_qa.py
@@ -3,12 +3,14 @@

 from langchain.prompts import PromptTemplate

-from pilot.configs.model_config import VECTOR_SEARCH_TOP_K
+from pilot.configs.config import Config
 from pilot.conversation import conv_qa_prompt_template, conv_db_summary_templates
 from pilot.logs import logger
 from pilot.model.llm_out.vicuna_llm import VicunaLLM
 from pilot.vector_store.file_loader import KnownLedge2Vector

+CFG = Config()
+

 class KnownLedgeBaseQA:
    def __init__(self) -> None:
@@ -22,7 +24,7 @@ class KnownLedgeBaseQA:
        )

        retriever = self.vector_store.as_retriever(
-            search_kwargs={"k": VECTOR_SEARCH_TOP_K}
+            search_kwargs={"k": CFG.KNOWLEDGE_SEARCH_TOP_SIZE}
        )
        docs = retriever.get_relevant_documents(query=query)

--- a/pilot/server/webserver.py
+++ b/pilot/server/webserver.py
@@ -634,7 +634,6 @@ def knowledge_embedding_store(vs_id, files):
        knowledge_embedding_client = KnowledgeEmbedding(
            file_path=os.path.join(KNOWLEDGE_UPLOAD_ROOT_PATH, vs_id, filename),
            model_name=LLM_MODEL_CONFIG["text2vec"],
-            local_persist=False,
            vector_store_config={
                "vector_store_name": vector_store_name["vs_name"],
                "vector_store_path": KNOWLEDGE_UPLOAD_ROOT_PATH,
--- a/pilot/source_embedding/csv_embedding.py
+++ b/pilot/source_embedding/csv_embedding.py
@@ -12,14 +12,12 @@ class CSVEmbedding(SourceEmbedding):
    def __init__(
        self,
        file_path,
-        model_name,
        vector_store_config,
        embedding_args: Optional[Dict] = None,
    ):
        """Initialize with csv path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
        self.file_path = file_path
-        self.model_name = model_name
        self.vector_store_config = vector_store_config
        self.embedding_args = embedding_args

--- a/pilot/source_embedding/knowledge_embedding.py
+++ b/pilot/source_embedding/knowledge_embedding.py
@@ -64,7 +64,6 @@ class KnowledgeEmbedding:
            knowledge_class, knowledge_args = KnowledgeEmbeddingType[extension]
            embedding = knowledge_class(
                self.file_path,
-                model_name=self.model_name,
                vector_store_config=self.vector_store_config,
                **knowledge_args,
            )
--- a/pilot/source_embedding/markdown_embedding.py
+++ b/pilot/source_embedding/markdown_embedding.py
@@ -8,20 +8,21 @@ from bs4 import BeautifulSoup
 from langchain.document_loaders import TextLoader
 from langchain.schema import Document

-from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
+from pilot.configs.config import Config
 from pilot.source_embedding import SourceEmbedding, register
 from pilot.source_embedding.EncodeTextLoader import EncodeTextLoader
 from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter

+CFG = Config()
+

 class MarkdownEmbedding(SourceEmbedding):
    """markdown embedding for read markdown document."""

-    def __init__(self, file_path, model_name, vector_store_config):
+    def __init__(self, file_path, vector_store_config):
        """Initialize with markdown path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
        self.file_path = file_path
-        self.model_name = model_name
        self.vector_store_config = vector_store_config
        # self.encoding = encoding

@@ -30,7 +31,7 @@ class MarkdownEmbedding(SourceEmbedding):
        """Load from markdown path."""
        loader = EncodeTextLoader(self.file_path)
        text_splitter = CHNDocumentSplitter(
-            pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
+            pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
        )
        return loader.load_and_split(text_splitter)

--- a/pilot/source_embedding/pdf_embedding.py
+++ b/pilot/source_embedding/pdf_embedding.py
@@ -5,19 +5,20 @@ from typing import List
 from langchain.document_loaders import PyPDFLoader
 from langchain.schema import Document

-from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
+from pilot.configs.config import Config
 from pilot.source_embedding import SourceEmbedding, register
 from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter

+CFG = Config()
+

 class PDFEmbedding(SourceEmbedding):
    """pdf embedding for read pdf document."""

-    def __init__(self, file_path, model_name, vector_store_config, encoding):
+    def __init__(self, file_path, vector_store_config, encoding):
        """Initialize with pdf path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
        self.file_path = file_path
-        self.model_name = model_name
        self.vector_store_config = vector_store_config
        self.encoding = encoding

@@ -27,7 +28,7 @@ class PDFEmbedding(SourceEmbedding):
        # loader = UnstructuredPaddlePDFLoader(self.file_path)
        loader = PyPDFLoader(self.file_path)
        textsplitter = CHNDocumentSplitter(
-            pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
+            pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
        )
        return loader.load_and_split(textsplitter)

--- a/pilot/source_embedding/source_embedding.py
+++ b/pilot/source_embedding/source_embedding.py
@@ -23,13 +23,11 @@ class SourceEmbedding(ABC):
    def __init__(
        self,
        file_path,
-        model_name,
        vector_store_config,
        embedding_args: Optional[Dict] = None,
    ):
        """Initialize with Loader url, model_name, vector_store_config"""
        self.file_path = file_path
-        self.model_name = model_name
        self.vector_store_config = vector_store_config
        self.embedding_args = embedding_args
        self.embeddings = vector_store_config["embeddings"]
--- a/pilot/source_embedding/string_embedding.py
+++ b/pilot/source_embedding/string_embedding.py
@@ -8,11 +8,10 @@ from pilot import SourceEmbedding, register
 class StringEmbedding(SourceEmbedding):
    """string embedding for read string document."""

-    def __init__(self, file_path, model_name, vector_store_config):
+    def __init__(self, file_path, vector_store_config):
        """Initialize with pdf path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
        self.file_path = file_path
-        self.model_name = model_name
        self.vector_store_config = vector_store_config

    @register
--- a/pilot/source_embedding/url_embedding.py
+++ b/pilot/source_embedding/url_embedding.py
@@ -16,11 +16,10 @@ CFG = Config()
 class URLEmbedding(SourceEmbedding):
    """url embedding for read url document."""

-    def __init__(self, file_path, model_name, vector_store_config):
+    def __init__(self, file_path, vector_store_config):
        """Initialize with url path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
        self.file_path = file_path
-        self.model_name = model_name
        self.vector_store_config = vector_store_config

    @register
@@ -29,7 +28,7 @@ class URLEmbedding(SourceEmbedding):
        loader = WebBaseLoader(web_path=self.file_path)
        if CFG.LANGUAGE == "en":
            text_splitter = CharacterTextSplitter(
-                chunk_size=KNOWLEDGE_CHUNK_SPLIT_SIZE,
+                chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
                chunk_overlap=20,
                length_function=len,
            )
--- a/pilot/source_embedding/word_embedding.py
+++ b/pilot/source_embedding/word_embedding.py
@@ -5,19 +5,20 @@ from typing import List
 from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
 from langchain.schema import Document

-from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
+from pilot.configs.config import Config
 from pilot.source_embedding import SourceEmbedding, register
 from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter

+CFG = Config()
+

 class WordEmbedding(SourceEmbedding):
    """word embedding for read word document."""

-    def __init__(self, file_path, model_name, vector_store_config):
+    def __init__(self, file_path, vector_store_config):
        """Initialize with word path."""
-        super().__init__(file_path, model_name, vector_store_config)
+        super().__init__(file_path, vector_store_config)
        self.file_path = file_path
-        self.model_name = model_name
        self.vector_store_config = vector_store_config

    @register
@@ -25,7 +26,7 @@ class WordEmbedding(SourceEmbedding):
        """Load from word path."""
        loader = UnstructuredWordDocumentLoader(self.file_path)
        textsplitter = CHNDocumentSplitter(
-            pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
+            pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
        )
        return loader.load_and_split(textsplitter)

--- a/pilot/summary/db_summary_client.py
+++ b/pilot/summary/db_summary_client.py
@@ -34,24 +34,21 @@ class DBSummaryClient:
            "embeddings": embeddings,
        }
        embedding = StringEmbedding(
-            db_summary_client.get_summery(),
-            LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
-            vector_store_config,
+            file_path=db_summary_client.get_summery(),
+            vector_store_config=vector_store_config,
        )
        if not embedding.vector_name_exist():
            if CFG.SUMMARY_CONFIG == "FAST":
                for vector_table_info in db_summary_client.get_summery():
                    embedding = StringEmbedding(
                        vector_table_info,
-                        LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
                        vector_store_config,
                    )
                    embedding.source_embedding()
            else:
                embedding = StringEmbedding(
-                    db_summary_client.get_summery(),
-                    LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
-                    vector_store_config,
+                    file_path=db_summary_client.get_summery(),
+                    vector_store_config=vector_store_config,
                )
                embedding.source_embedding()
            for (
@@ -64,7 +61,6 @@ class DBSummaryClient:
                }
                embedding = StringEmbedding(
                    table_summary,
-                    LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
                    table_vector_store_config,
                )
                embedding.source_embedding()
@@ -124,30 +120,3 @@ def _get_llm_response(query, db_input, dbsummary):
    )
    res = chat.nostream_call()
    return json.loads(res)["table"]
-
-
-# if __name__ == "__main__":
-#     # summary = DBSummaryClient.get_similar_tables("db_test", "查询在线用户的购物车", 10)
-#
-#     text= """Based on the input "查询在线聊天的用户好友" and the known database information, the tables involved in the user input are "chat_users" and "friends".
-# Response:
-#
-# {
-#    "table": ["chat_users"]
-# }"""
-#     text = text.rstrip().replace("\n","")
-#     start = text.find("{")
-#     end = text.find("}") + 1
-#
-#     # 从字符串中截取出JSON数据
-#     json_str = text[start:end]
-#
-#     # 将JSON数据转换为Python中的字典类型
-#     data = json.loads(json_str)
-#     # pattern = r'{s*"table"s*:s*[[^]]*]s*}'
-#     # match = re.search(pattern, text)
-#     # if match:
-#     #     json_string = match.group(0)
-#     #     # 将JSON字符串转换为Python对象
-#     #     json_obj = json.loads(json_string)
-#     # print(summary)
--- a/pilot/vector_store/file_loader.py
+++ b/pilot/vector_store/file_loader.py
@@ -17,7 +17,6 @@ from langchain.vectorstores import Chroma
 from pilot.configs.model_config import (
    DATASETS_DIR,
    LLM_MODEL_CONFIG,
-    VECTOR_SEARCH_TOP_K,
    VECTORE_PATH,
 )

@@ -41,7 +40,6 @@ class KnownLedge2Vector:

    embeddings: object = None
    model_name = LLM_MODEL_CONFIG["sentence-transforms"]
-    top_k: int = VECTOR_SEARCH_TOP_K

    def __init__(self, model_name=None) -> None:
        if not model_name:
--- a/tools/knowlege_init.py
+++ b/tools/knowlege_init.py
@@ -10,7 +10,6 @@ from pilot.configs.config import Config
 from pilot.configs.model_config import (
    DATASETS_DIR,
    LLM_MODEL_CONFIG,
-    VECTOR_SEARCH_TOP_K,
 )
 from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding

@@ -19,7 +18,6 @@ CFG = Config()

 class LocalKnowledgeInit:
    embeddings: object = None
-    top_k: int = VECTOR_SEARCH_TOP_K

    def __init__(self, vector_store_config) -> None:
        self.vector_store_config = vector_store_config