From 336ba1e042debb1cf7e26cbaba22768f2ef7b1c2 Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Fri, 19 May 2023 21:17:39 +0800
Subject: [PATCH 01/15] update: knowledge load script
---
pilot/configs/model_config.py | 9 +++--
pilot/server/webserver.py | 1 +
.../source_embedding/chn_document_splitter.py | 24 ++----------
pilot/source_embedding/knowledge_embedding.py | 18 ++++++---
pilot/source_embedding/markdown_embedding.py | 3 +-
pilot/source_embedding/pdf_embedding.py | 7 ++--
pilot/source_embedding/search_milvus.py | 2 +-
pilot/vector_store/milvus_store.py | 37 ++++++++++---------
tools/knowlege_init.py | 1 -
9 files changed, 49 insertions(+), 53 deletions(-)
diff --git a/pilot/configs/model_config.py b/pilot/configs/model_config.py
index faa93227f..da68ab332 100644
--- a/pilot/configs/model_config.py
+++ b/pilot/configs/model_config.py
@@ -21,15 +21,17 @@ LLM_MODEL_CONFIG = {
"flan-t5-base": os.path.join(MODEL_PATH, "flan-t5-base"),
"vicuna-13b": os.path.join(MODEL_PATH, "vicuna-13b"),
"text2vec": os.path.join(MODEL_PATH, "text2vec-large-chinese"),
+ "text2vec-base": os.path.join(MODEL_PATH, "text2vec-base-chinese"),
"sentence-transforms": os.path.join(MODEL_PATH, "all-MiniLM-L6-v2")
}
-VECTOR_SEARCH_TOP_K = 3
+VECTOR_SEARCH_TOP_K = 20
LLM_MODEL = "vicuna-13b"
LIMIT_MODEL_CONCURRENCY = 5
MAX_POSITION_EMBEDDINGS = 4096
-VICUNA_MODEL_SERVER = "http://121.41.227.141:8000"
+# VICUNA_MODEL_SERVER = "http://121.41.227.141:8000"
+VICUNA_MODEL_SERVER = "http://120.79.27.110:8000"
# Load model config
ISLOAD_8BIT = True
@@ -44,4 +46,5 @@ DB_SETTINGS = {
}
VS_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vs_store")
-KNOWLEDGE_UPLOAD_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
\ No newline at end of file
+KNOWLEDGE_UPLOAD_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
+KNOWLEDGE_CHUNK_SPLIT_SIZE = 100
diff --git a/pilot/server/webserver.py b/pilot/server/webserver.py
index 07d94b773..25940a437 100644
--- a/pilot/server/webserver.py
+++ b/pilot/server/webserver.py
@@ -499,6 +499,7 @@ def build_single_model_ui():
files = gr.File(label="添加文件",
file_types=[".txt", ".md", ".docx", ".pdf"],
file_count="multiple",
+ allow_flagged_uploads=True,
show_label=False
)
diff --git a/pilot/source_embedding/chn_document_splitter.py b/pilot/source_embedding/chn_document_splitter.py
index 090a6af56..10a77aeca 100644
--- a/pilot/source_embedding/chn_document_splitter.py
+++ b/pilot/source_embedding/chn_document_splitter.py
@@ -9,33 +9,17 @@ class CHNDocumentSplitter(CharacterTextSplitter):
self.pdf = pdf
self.sentence_size = sentence_size
- # def split_text_version2(self, text: str) -> List[str]:
- # if self.pdf:
- # text = re.sub(r"\n{3,}", "\n", text)
- # text = re.sub('\s', ' ', text)
- # text = text.replace("\n\n", "")
- # sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del :;
- # sent_list = []
- # for ele in sent_sep_pattern.split(text):
- # if sent_sep_pattern.match(ele) and sent_list:
- # sent_list[-1] += ele
- # elif ele:
- # sent_list.append(ele)
- # return sent_list
-
def split_text(self, text: str) -> List[str]:
if self.pdf:
text = re.sub(r"\n{3,}", r"\n", text)
text = re.sub('\s', " ", text)
text = re.sub("\n\n", "", text)
- text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text) # single-character sentence terminators
- text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text) # English ellipsis
- text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text) # Chinese ellipsis
+ text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text)
+ text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)
+ text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text)
text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r'\1\n\2', text)
- # a closing quote ends a sentence only when a terminator precedes it, so the "\n" goes after the quote; the rules above deliberately keep quotes attached
- text = text.rstrip() # drop any extra trailing "\n" at the end of the paragraph
- # many rule sets also treat the semicolon as a terminator; it is ignored here, as are dashes and English double quotes; adjust if you need them
+ text = text.rstrip()
ls = [i for i in text.split("\n") if i]
for ele in ls:
if len(ele) > self.sentence_size:
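For reference, a minimal sketch of how the splitter above behaves; the sample sentence and expected output are illustrative assumptions, not project test data:

```python
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter

# pdf=True enables the extra newline/whitespace cleanup shown above.
splitter = CHNDocumentSplitter(pdf=True, sentence_size=100)
chunks = splitter.split_text("第一句。第二句!最后一句?")
# Each terminator (。!? etc.) gets a "\n" appended after it, so the text
# splits into one chunk per sentence; chunks longer than sentence_size are
# split further by the logic that follows in this method.
print(chunks)  # expected: ['第一句。', '第二句!', '最后一句?']
```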
diff --git a/pilot/source_embedding/knowledge_embedding.py b/pilot/source_embedding/knowledge_embedding.py
index 594723b6e..08d962908 100644
--- a/pilot/source_embedding/knowledge_embedding.py
+++ b/pilot/source_embedding/knowledge_embedding.py
@@ -4,13 +4,15 @@ from bs4 import BeautifulSoup
from langchain.document_loaders import PyPDFLoader, TextLoader, markdown
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
-from pilot.configs.model_config import DATASETS_DIR
+from pilot.configs.model_config import DATASETS_DIR, KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
from pilot.source_embedding.csv_embedding import CSVEmbedding
from pilot.source_embedding.markdown_embedding import MarkdownEmbedding
from pilot.source_embedding.pdf_embedding import PDFEmbedding
import markdown
+from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
+
class KnowledgeEmbedding:
def __init__(self, file_path, model_name, vector_store_config, local_persist=True):
@@ -63,7 +65,7 @@ class KnowledgeEmbedding:
print("directly return vector store")
vector_store = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
else:
- print(vector_name + "is new vector store, knowledge begin load...")
+ print(vector_name + " is new vector store, knowledge begin load...")
documents = self._load_knownlege(self.file_path)
vector_store = Chroma.from_documents(documents=documents,
embedding=self.embeddings,
@@ -88,7 +90,7 @@ class KnowledgeEmbedding:
def _load_file(self, filename):
if filename.lower().endswith(".md"):
loader = TextLoader(filename)
- text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=100)
+ text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
docs = loader.load_and_split(text_splitter)
i = 0
for d in docs:
@@ -100,11 +102,15 @@ class KnowledgeEmbedding:
docs[i].page_content = docs[i].page_content.replace("\n", " ")
i += 1
elif filename.lower().endswith(".pdf"):
- loader = PyPDFLoader(filename)
- textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=100)
+ loader = UnstructuredPaddlePDFLoader(filename)
+ textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
docs = loader.load_and_split(textsplitter)
+ i = 0
+ for d in docs:
+ docs[i].page_content = d.page_content.replace("\n", " ").replace("�", "")
+ i += 1
else:
loader = TextLoader(filename)
- text_splitor = CHNDocumentSplitter(sentence_size=100)
+ text_splitor = CHNDocumentSplitter(sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
docs = loader.load_and_split(text_splitor)
return docs
\ No newline at end of file
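As a rough sketch, this is how one PDF flows through the `_load_file` path above (the input file name is a placeholder):

```python
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader

loader = UnstructuredPaddlePDFLoader("sample.pdf")  # placeholder file name
splitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
docs = loader.load_and_split(splitter)
# Mirror the cleanup above: drop newlines and OCR replacement characters.
for d in docs:
    d.page_content = d.page_content.replace("\n", " ").replace("�", "")
```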
diff --git a/pilot/source_embedding/markdown_embedding.py b/pilot/source_embedding/markdown_embedding.py
index fee9504b6..834226f75 100644
--- a/pilot/source_embedding/markdown_embedding.py
+++ b/pilot/source_embedding/markdown_embedding.py
@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
from langchain.document_loaders import TextLoader
from langchain.schema import Document
import markdown
+from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.source_embedding import SourceEmbedding, register
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
@@ -26,7 +27,7 @@ class MarkdownEmbedding(SourceEmbedding):
def read(self):
"""Load from markdown path."""
loader = TextLoader(self.file_path)
- text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=100)
+ text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
return loader.load_and_split(text_splitter)
@register
diff --git a/pilot/source_embedding/pdf_embedding.py b/pilot/source_embedding/pdf_embedding.py
index bd0ae3aba..a8749695b 100644
--- a/pilot/source_embedding/pdf_embedding.py
+++ b/pilot/source_embedding/pdf_embedding.py
@@ -2,11 +2,12 @@
# -*- coding: utf-8 -*-
from typing import List
-from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
+from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.source_embedding import SourceEmbedding, register
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
+from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
class PDFEmbedding(SourceEmbedding):
@@ -22,8 +23,8 @@ class PDFEmbedding(SourceEmbedding):
@register
def read(self):
"""Load from pdf path."""
- loader = PyPDFLoader(self.file_path)
- textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=100)
+ loader = UnstructuredPaddlePDFLoader(self.file_path)
+ textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
return loader.load_and_split(textsplitter)
@register
diff --git a/pilot/source_embedding/search_milvus.py b/pilot/source_embedding/search_milvus.py
index 18f93d1d3..ec0aa6813 100644
--- a/pilot/source_embedding/search_milvus.py
+++ b/pilot/source_embedding/search_milvus.py
@@ -50,7 +50,7 @@
#
# # text_embeddings = Text2Vectors()
# mivuls = MilvusStore(cfg={"url": "127.0.0.1", "port": "19530", "alias": "default", "table_name": "test_k"})
-#
+#
# mivuls.insert(["textc","tezt2"])
# print("success")
# ct
diff --git a/pilot/vector_store/milvus_store.py b/pilot/vector_store/milvus_store.py
index 1f07c969e..eda0b4e38 100644
--- a/pilot/vector_store/milvus_store.py
+++ b/pilot/vector_store/milvus_store.py
@@ -1,6 +1,7 @@
-
+from langchain.embeddings import HuggingFaceEmbeddings
from pymilvus import DataType, FieldSchema, CollectionSchema, connections, Collection
+from pilot.configs.model_config import LLM_MODEL_CONFIG
from pilot.vector_store.vector_store_base import VectorStoreBase
@@ -9,7 +10,7 @@ class MilvusStore(VectorStoreBase):
"""Construct a milvus memory storage connection.
Args:
- cfg (Config): Auto-GPT global config.
+ cfg (Config): MilvusStore global config.
"""
# self.configure(cfg)
@@ -71,21 +72,21 @@ class MilvusStore(VectorStoreBase):
self.index_params,
index_name="vector",
)
+ info = self.collection.describe()
self.collection.load()
- # def add(self, data) -> str:
- # """Add an embedding of data into milvus.
- #
- # Args:
- # data (str): The raw text to construct embedding index.
- #
- # Returns:
- # str: log.
- # """
- # embedding = get_ada_embedding(data)
- # result = self.collection.insert([[embedding], [data]])
- # _text = (
- # "Inserting data into memory at primary key: "
- # f"{result.primary_keys[0]}:\n data: {data}"
- # )
- # return _text
\ No newline at end of file
+ def insert(self, text) -> str:
+ """Add an embedding of data into milvus.
+ Args:
+ text (str): The raw text to construct embedding index.
+ Returns:
+ str: log.
+ """
+ # embedding = get_ada_embedding(data)
+ embeddings = HuggingFaceEmbeddings(model_name=LLM_MODEL_CONFIG["sentence-transforms"])
+ result = self.collection.insert([embeddings.embed_documents(text), text])
+ _text = (
+ "Inserting data into memory at primary key: "
+ f"{result.primary_keys[0]}:\n data: {text}"
+ )
+ return _text
\ No newline at end of file
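A rough usage sketch for the insert() method above, assuming a Milvus instance is reachable on localhost (the connection values and texts are illustrative):

```python
from pilot.vector_store.milvus_store import MilvusStore

# At this point the constructor still expects a "table_name" key (see above).
store = MilvusStore(cfg={"url": "127.0.0.1", "port": "19530",
                         "table_name": "test_k"})
# insert() embeds the texts with the configured sentence-transformers model
# and writes [vectors, raw texts] into the collection, returning a log line.
print(store.insert(["text one", "text two"]))
```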
diff --git a/tools/knowlege_init.py b/tools/knowlege_init.py
index bc827953d..e9ecad49a 100644
--- a/tools/knowlege_init.py
+++ b/tools/knowlege_init.py
@@ -41,5 +41,4 @@ if __name__ == "__main__":
append_mode = args.append
kv = LocalKnowledgeInit()
vector_store = kv.knowledge_persist(file_path=DATASETS_DIR, vector_name=vector_name, append_mode=append_mode)
- docs = vector_store.similarity_search("小明",1)
print("your knowledge embedding success...")
\ No newline at end of file
From e871df20f5ddaf7f7116245fdb18dad7056c2c3b Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Fri, 19 May 2023 21:35:35 +0800
Subject: [PATCH 02/15] update: knowledge load script
---
README.md | 19 +++++
README.zh.md | 233 +++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 252 insertions(+)
create mode 100644 README.zh.md
diff --git a/README.md b/README.md
index 54a8d0de7..bfc906c2c 100644
--- a/README.md
+++ b/README.md
@@ -103,6 +103,25 @@ As the knowledge base is currently the most significant user demand scenario, we
2. Custom addition of knowledge bases
3. Various usage scenarios such as constructing knowledge bases through plugin capabilities and web crawling. Users only need to organize the knowledge documents, and they can use our existing capabilities to build the knowledge base required for the large model.
+Create your own knowledge base:
+
+1. Place personal knowledge files or folders in the pilot/datasets directory.
+
+2. Run the knowledge-loading script in the tools directory.
+
+```
+python tools/knowlege_init.py
+
+--vector_name: your vector store name (default: default)
+--append: append mode, True: append, False: do not append (default: False)
+
+```
+
+3. Add the knowledge base in the interface by entering its name (enter "default" if none was specified); you can then use it for Q&A over your knowledge base.
+
+Note that the default vector model is text2vec-large-chinese (a large model; if your machine's resources are limited, text2vec-base-chinese is recommended), so make sure to download the model and place it in the models directory.
+
+
### LLMs Management
In the underlying large model integration, we have designed an open interface that supports integration with various large models. At the same time, we have a very strict control and evaluation mechanism for the effectiveness of the integrated models. In terms of accuracy, the integrated models need to align with the capability of ChatGPT at a level of 85% or higher. We use higher standards to select models, hoping to save users the cumbersome testing and evaluation process in the process of use.
diff --git a/README.zh.md b/README.zh.md
new file mode 100644
index 000000000..3675f3496
--- /dev/null
+++ b/README.zh.md
@@ -0,0 +1,233 @@
+# DB-GPT 
+
+[English](README.md)
+
+[Star History](https://star-history.com/#csunny/DB-GPT)
+
+## What is DB-GPT?
+As large models are released and iterated on, they are becoming more and more intelligent, yet using them raises serious data security and privacy challenges. When we draw on large-model capabilities, our private data and environment must remain in our own hands, fully controllable, with no data privacy leaks or security risks. For this reason we started the DB-GPT project, building a complete private large-model solution for all database-based scenarios. Because it supports local deployment, this solution is not limited to standalone private environments; it can also be deployed and isolated per business module, making large-model capabilities absolutely private, secure, and controllable.
+
+DB-GPT is an open-source experimental GPT project built around databases. It uses localized GPT large models to interact with your data and environment, with no risk of data leakage: 100% private, 100% secure.
+
+
+## Features
+
+We have already released a number of key features; the capabilities currently available are listed below.
+- SQL language capabilities
+  - SQL generation
+  - SQL diagnosis
+- Private-domain Q&A and data processing
+  - Database knowledge Q&A
+  - Data processing
+- Plugin model
+  - Custom plugins for executing tasks, with native support for Auto-GPT plugins, for example:
+  - Automatic SQL execution and retrieval of query results
+  - Automatic crawling to learn new knowledge
+- Unified vector storage/indexing of the knowledge base
+  - Support for unstructured data, including PDF, Markdown, CSV, and web URLs
+
+## Demo
+
+The demo runs on an RTX 4090 GPU; see the [video on YouTube](https://www.youtube.com/watch?v=1PWI6F89LPo).
+### Runtime environment demo
+
+
+
+
+
+
+
+
+
+### SQL generation
+
+1. Generate table-creation statements
+
+
+
+
+
+2. Generate runnable SQL
+First select the target database; the model then generates SQL from that database's schema information. A successful run looks like the demo below:
+
+
+
+
+
+3. Automatically analyze and execute the SQL, and output the run results
+
+
+
+
+
+### Database Q&A
+
+
+
+
+
+
+1. Q&A over the default built-in knowledge base
+
+
+
+
+
+2. Add your own knowledge base
+
+
+
+
+
+3. Crawl data from the web and learn from it
+- TODO
+
+## Architecture
+DB-GPT builds its large-model runtime on [FastChat](https://github.com/lm-sys/FastChat) and provides vicuna as the base large language model. On top of this, we provide private-domain knowledge-base Q&A through LangChain. We also support a plugin mode, with native support for Auto-GPT plugins by design.
+
+The overall DB-GPT architecture is shown in the figure below:
+
+
+
+
+
+The core capabilities consist of the following parts.
+1. Knowledge base: private-domain knowledge-base Q&A.
+2. Large-model management: an operating environment for large models, based on FastChat.
+3. Unified vector storage and indexing: a unified way to store and index all kinds of data.
+4. Connection module: connects different modules and data sources to enable data flow and interaction.
+5. Agents and plugins: an agent and plugin mechanism that lets users customize and extend the system's behavior.
+6. Automatic prompt generation and optimization: automatically generates and optimizes high-quality prompts to make the system respond more efficiently.
+7. Multi-platform product interfaces: supports a variety of clients, such as web, mobile, and desktop applications.
+
+Each module is briefly introduced below:
+
+### Knowledge base
+As the knowledge base is currently the scenario with the greatest user demand, we natively support building and processing knowledge bases. This project also provides several knowledge-base management strategies, such as:
+1. A default built-in knowledge base
+2. Custom addition of knowledge bases
+3. Building knowledge bases through plugin-based crawling, among other usage scenarios.
+
+Users only need to organize their knowledge documents, and our existing capabilities will build the knowledge base the large model needs.
+
+Build your own knowledge base:
+
+1. Put personal knowledge files or folders into the pilot/datasets directory.
+
+2. Run the knowledge-loading script in the tools directory.
+
+```
+python tools/knowlege_init.py
+
+--vector_name: your vector store name (default: default)
+--append: append mode, True: append, False: do not append (default: False)
+
+```
+3. Add a knowledge base in the UI by entering its name (enter default if none was specified); you can then do Q&A over your knowledge base.
+
+Note that the default vector model is text2vec-large-chinese (a fairly large model; if your machine's resources are limited, text2vec-base-chinese is recommended), so make sure to download the model and place it in the models directory.
+
+
+### Large-model management
+For the underlying large-model integration, an open interface is designed to support connecting a variety of large models. We also apply a very strict control and review mechanism to the quality of integrated models: compared with ChatGPT, a model's accuracy must reach at least 85% capability alignment. We select models against this higher bar, hoping to spare users the tedious testing and evaluation work.
+
+### Unified vector storage and indexing
+To make managing vectorized knowledge easier, we have built in multiple vector storage engines, from the in-memory Chroma to the distributed Milvus, so you can choose a storage engine to fit your scenario. Vector storage is the cornerstone of enhanced AI capability; as the intermediate language between humans and large language models, vectors play a very important role in this project.
+
+### Connection module
+To interact with users' private environments more conveniently, the project provides a connection module that can connect to databases, Excel, knowledge bases, and many other environments, enabling information and data exchange.
+
+### Agents and plugins
+Agent and plugin capabilities determine whether a large model can act autonomously. This project natively supports a plugin mode in which the large model can complete goals automatically. To make the most of the community, the plugins used in this project natively support the Auto-GPT plugin ecosystem, meaning Auto-GPT plugins can run directly in our project.
+
+### Automatic prompt generation and optimization
+Prompts are a very important part of interacting with large models and, to a degree, determine the quality and accuracy of a model's answers. In this project we automatically optimize prompts based on user input and usage scenario, making large language models simpler and more efficient to use.
+
+### Multi-platform product interfaces
+TODO: On the client side we will provide multi-platform product interfaces, covering PC, mobile, command line, Slack, and other modes.
+
+
+## Installation
+### 1. Hardware requirements
+Because the project delivers more than 85% of ChatGPT's capability, it has certain hardware requirements. Overall, though, it can be deployed and used on consumer-grade GPUs. The specific requirements are as follows:
+| GPU model | VRAM  | Performance                                                        |
+| --------- | ----- | ------------------------------------------------------------------ |
+| RTX 4090  | 24 GB | Smooth conversational inference, no lag                            |
+| RTX 3090  | 24 GB | Smooth conversational inference, slight lag, but better than V100  |
+| V100      | 16 GB | Conversational inference works, with noticeable lag                |
+### 2. Installing DB-GPT
+
+This project depends on a local MySQL database service, which you need to install; we recommend installing it directly with Docker.
+```
+docker run --name=mysql -p 3306:3306 -e MYSQL_ROOT_PASSWORD=aa12345678 -dit mysql:latest
+```
+For the vector database we use the in-memory Chroma database by default, so no extra installation is needed. If you need to connect another store, follow our tutorial to install and configure it. The whole DB-GPT installation uses a miniconda3 virtual environment. Create the virtual environment and install the Python dependencies:
+
+```
+python>=3.10
+conda create -n dbgpt_env python=3.10
+conda activate dbgpt_env
+pip install -r requirements.txt
+
+```
+
+### 3. Running the large model
+
+The base model can be merged following the [Vicuna](https://github.com/lm-sys/FastChat/blob/main/README.md#model-weights) weight-merging tutorial.
+If that step is difficult, you can instead use the model from [this link](https://huggingface.co/Tribbiani/vicuna-7b).
+
+Run the model service:
+```
+cd pilot/server
+python llmserver.py
+```
+
+Run the Gradio web UI:
+
+```bash
+$ python webserver.py
+```
+Note: before starting the webserver, edit MODEL_SERVER = "http://127.0.0.1:8000" in the .env file and set the address to your own server.
+
+## Usage
+
+We provide a Gradio user interface through which you can use DB-GPT. We have also prepared the following reference articles introducing the project's code and principles.
+1. [LLM in Action Series (1): Combining Langchain and Vicuna in Practice](https://zhuanlan.zhihu.com/p/628750042)
+2. [LLM in Action Series (2): DB-GPT Deployment Guide on Alibaba Cloud](https://zhuanlan.zhihu.com/p/629467580)
+3. [LLM in Action Series (3): DB-GPT Plugin Model, Principles and Usage](https://zhuanlan.zhihu.com/p/629623125)
+
+## Acknowledgements
+
+This project's achievements are owed to the technical community, especially the following projects.
+
+- [FastChat](https://github.com/lm-sys/FastChat) for the chat service
+- [vicuna-13b](https://huggingface.co/Tribbiani/vicuna-13b) as the base model
+- [langchain](https://github.com/hwchase17/langchain) as the tool chain
+- [Auto-GPT](https://github.com/Significant-Gravitas/Auto-GPT) as the universal plugin template
+- [Hugging Face](https://huggingface.co/) for large-model management
+- [Chroma](https://github.com/chroma-core/chroma) for vector storage
+- [Milvus](https://milvus.io/) for distributed vector storage
+- [ChatGLM](https://github.com/THUDM/ChatGLM-6B) as a base model
+- [llama-index](https://github.com/jerryjliu/llama_index) for enhancing database knowledge via [In-Context Learning](https://arxiv.org/abs/2301.00234) over existing knowledge bases
+
+
+
+## Contributors
+
+| [csunny](https://github.com/csunny) | [xudafeng](https://github.com/xudafeng) | [明天](https://github.com/yhjun1026) | [Aries-ckt](https://github.com/Aries-ckt) | [thebigbone](https://github.com/thebigbone) |
+| :---: | :---: | :---: | :---: | :---: |
+
+
+This project follows the git-contributor [spec](https://github.com/xudafeng/git-contributor), auto updated at `Sun May 14 2023 23:02:43 GMT+0800`.
+
+
+
+This is a sophisticated and innovative tool for databases, and our project is under intensive development, with new features released continuously. If you run into a concrete problem while using it, please first open an issue on the project; if needed, contact us via the WeChat account below and I will do my best to help. Everyone is also warmly welcome to take part in building the project.
+
+
+
+
+
+## Licence
+
+The MIT License (MIT)
From 98d50b1b98c93f8cc5d22f69da2cd0bf99466a03 Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Fri, 19 May 2023 22:04:20 +0800
Subject: [PATCH 03/15] update: readme knowledge init
---
README.md | 36 ++++++++++++++++++------------------
README.zh.md | 32 ++++++++++++++++----------------
2 files changed, 34 insertions(+), 34 deletions(-)
diff --git a/README.md b/README.md
index bfc906c2c..cd7aaff9b 100644
--- a/README.md
+++ b/README.md
@@ -103,24 +103,6 @@ As the knowledge base is currently the most significant user demand scenario, we
2. Custom addition of knowledge bases
3. Various usage scenarios such as constructing knowledge bases through plugin capabilities and web crawling. Users only need to organize the knowledge documents, and they can use our existing capabilities to build the knowledge base required for the large model.
-Create your own knowledge base:
-
-1. Place personal knowledge files or folders in the pilot/datasets directory.
-
-2. Run the knowledge-loading script in the tools directory.
-
-```
-python tools/knowlege_init.py
-
---vector_name: your vector store name (default: default)
---append: append mode, True: append, False: do not append (default: False)
-
-```
-
-3. Add the knowledge base in the interface by entering its name (enter "default" if none was specified); you can then use it for Q&A over your knowledge base.
-
-Note that the default vector model is text2vec-large-chinese (a large model; if your machine's resources are limited, text2vec-base-chinese is recommended), so make sure to download the model and place it in the models directory.
-
### LLMs Management
@@ -191,9 +173,27 @@ $ python pilot/server/webserver.py
Notice: the webserver needs to connect to the llmserver, so you need to change the .env file: set MODEL_SERVER = "http://127.0.0.1:8000" to your address. It's very important.
## Usage Instructions
+
We provide a Gradio user interface, through which you can use DB-GPT. Additionally, we have prepared several reference articles (written in Chinese) that introduce the code and principles related to our project.
- [LLM Practical In Action Series (1) — Combined Langchain-Vicuna Application Practical](https://medium.com/@cfqcsunny/llm-practical-in-action-series-1-combined-langchain-vicuna-application-practical-701cd0413c9f)
+#### Create your own knowledge repository:
+
+1. Place personal knowledge files or folders in the pilot/datasets directory.
+
+2. Run the knowledge-loading script in the tools directory.
+
+```
+python tools/knowlege_init.py
+
+--vector_name: your vector store name (default: default)
+--append: append mode, True: append, False: do not append (default: False)
+
+```
+
+3. Add the knowledge repository in the interface by entering its name (enter "default" if none was specified); you can then use it for Q&A based on your knowledge base.
+
+Note that the default vector model is text2vec-large-chinese (a large model; if your machine's resources are limited, text2vec-base-chinese is recommended), so make sure to download the model and place it in the models directory.
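+Once the repository is initialized, it can also be queried from Python. A minimal, non-authoritative sketch (the "default" store name and the query string are assumptions):
+
+```python
+from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH, LLM_MODEL_CONFIG
+from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding
+
+# Build a client against an existing store, mirroring how webserver.py queries it.
+client = KnowledgeEmbedding(
+    file_path="",
+    model_name=LLM_MODEL_CONFIG["text2vec"],
+    local_persist=False,
+    vector_store_config={"vector_store_name": "default",
+                         "vector_store_path": KNOWLEDGE_UPLOAD_ROOT_PATH})
+docs = client.similar_search("your question", 10)
+```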
## Acknowledgement
The achievements of this project are thanks to the technical community, especially the following projects:
diff --git a/README.zh.md b/README.zh.md
index 3675f3496..2c6ecca43 100644
--- a/README.zh.md
+++ b/README.zh.md
@@ -110,22 +110,6 @@ DB-GPT builds its large-model runtime on [FastChat](https://github.com/lm-sys/FastChat)
Users only need to organize their knowledge documents, and our existing capabilities will build the knowledge base the large model needs.
-Build your own knowledge base:
-
-1. Put personal knowledge files or folders into the pilot/datasets directory.
-
-2. Run the knowledge-loading script in the tools directory.
-
-```
-python tools/knowlege_init.py
-
---vector_name: your vector store name (default: default)
---append: append mode, True: append, False: do not append (default: False)
-
-```
-3. Add a knowledge base in the UI by entering its name (enter default if none was specified); you can then do Q&A over your knowledge base.
-
-Note that the default vector model is text2vec-large-chinese (a fairly large model; if your machine's resources are limited, text2vec-base-chinese is recommended), so make sure to download the model and place it in the models directory.
### Large-model management
@@ -196,6 +180,22 @@ $ python webserver.py
2. [LLM in Action Series (2): DB-GPT Deployment Guide on Alibaba Cloud](https://zhuanlan.zhihu.com/p/629467580)
3. [LLM in Action Series (3): DB-GPT Plugin Model, Principles and Usage](https://zhuanlan.zhihu.com/p/629623125)
+#### Build your own knowledge base:
+
+1. Put personal knowledge files or folders into the pilot/datasets directory.
+
+2. Run the knowledge-loading script in the tools directory.
+
+```
+python tools/knowlege_init.py
+
+--vector_name: your vector store name (default: default)
+--append: append mode, True: append, False: do not append (default: False)
+
+```
+3. Add a knowledge base in the UI by entering its name (enter default if none was specified); you can then do Q&A over your knowledge base.
+
+Note that the default vector model is text2vec-large-chinese (a fairly large model; if your machine's resources are limited, text2vec-base-chinese is recommended), so make sure to download the model and place it in the models directory.
## Acknowledgements
This project's achievements are owed to the technical community, especially the following projects.
From 6747d877ccf75e758eac9c9c6a005ecd6ed7c460 Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Sun, 21 May 2023 16:29:00 +0800
Subject: [PATCH 04/15] feature: add milvus store
---
pilot/configs/model_config.py | 2 +
pilot/server/webserver.py | 15 +-
pilot/source_embedding/knowledge_embedding.py | 49 +++--
pilot/source_embedding/source_embedding.py | 38 +++-
pilot/vector_store/milvus_store.py | 206 ++++++++++++++++--
requirements.txt | 1 +
tools/knowlege_init.py | 24 +-
7 files changed, 277 insertions(+), 58 deletions(-)
diff --git a/pilot/configs/model_config.py b/pilot/configs/model_config.py
index da68ab332..0f9cef937 100644
--- a/pilot/configs/model_config.py
+++ b/pilot/configs/model_config.py
@@ -48,3 +48,5 @@ DB_SETTINGS = {
VS_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vs_store")
KNOWLEDGE_UPLOAD_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
KNOWLEDGE_CHUNK_SPLIT_SIZE = 100
+VECTOR_STORE_TYPE = "milvus"
+VECTOR_STORE_CONFIG = {"url": "127.0.0.1", "port": "19530"}
diff --git a/pilot/server/webserver.py b/pilot/server/webserver.py
index 25940a437..bcf8f6385 100644
--- a/pilot/server/webserver.py
+++ b/pilot/server/webserver.py
@@ -19,7 +19,8 @@ from langchain import PromptTemplate
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(ROOT_PATH)
-from pilot.configs.model_config import DB_SETTINGS, KNOWLEDGE_UPLOAD_ROOT_PATH, LLM_MODEL_CONFIG, VECTOR_SEARCH_TOP_K
+from pilot.configs.model_config import DB_SETTINGS, KNOWLEDGE_UPLOAD_ROOT_PATH, LLM_MODEL_CONFIG, VECTOR_SEARCH_TOP_K, \
+ VECTOR_STORE_CONFIG
from pilot.server.vectordb_qa import KnownLedgeBaseQA
from pilot.connections.mysql import MySQLOperator
from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding
@@ -267,12 +268,16 @@ def http_bot(state, mode, sql_mode, db_selector, temperature, max_new_tokens, re
skip_echo_len = len(prompt.replace("</s>", " ")) + 1
if mode == conversation_types["custome"] and not db_selector:
- persist_dir = os.path.join(KNOWLEDGE_UPLOAD_ROOT_PATH, vector_store_name["vs_name"] + ".vectordb")
- print("vector store path: ", persist_dir)
+ # persist_dir = os.path.join(KNOWLEDGE_UPLOAD_ROOT_PATH, vector_store_name["vs_name"])
+ print("vector store type: ", VECTOR_STORE_CONFIG)
+ print("vector store name: ", vector_store_name["vs_name"])
+ vector_store_config = VECTOR_STORE_CONFIG
+ vector_store_config["vector_store_name"] = vector_store_name["vs_name"]
+ vector_store_config["text_field"] = "content"
+ vector_store_config["vector_store_path"] = KNOWLEDGE_UPLOAD_ROOT_PATH
knowledge_embedding_client = KnowledgeEmbedding(file_path="", model_name=LLM_MODEL_CONFIG["text2vec"],
local_persist=False,
- vector_store_config={"vector_store_name": vector_store_name["vs_name"],
- "vector_store_path": KNOWLEDGE_UPLOAD_ROOT_PATH})
+ vector_store_config=vector_store_config)
query = state.messages[-2][1]
docs = knowledge_embedding_client.similar_search(query, VECTOR_SEARCH_TOP_K)
context = [d.page_content for d in docs]
diff --git a/pilot/source_embedding/knowledge_embedding.py b/pilot/source_embedding/knowledge_embedding.py
index 08d962908..63d6c2121 100644
--- a/pilot/source_embedding/knowledge_embedding.py
+++ b/pilot/source_embedding/knowledge_embedding.py
@@ -1,7 +1,7 @@
import os
from bs4 import BeautifulSoup
-from langchain.document_loaders import PyPDFLoader, TextLoader, markdown
+from langchain.document_loaders import TextLoader, markdown
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from pilot.configs.model_config import DATASETS_DIR, KNOWLEDGE_CHUNK_SPLIT_SIZE
@@ -12,6 +12,7 @@ from pilot.source_embedding.pdf_embedding import PDFEmbedding
import markdown
from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
+from pilot.vector_store.milvus_store import MilvusStore
class KnowledgeEmbedding:
@@ -20,7 +21,7 @@ class KnowledgeEmbedding:
self.file_path = file_path
self.model_name = model_name
self.vector_store_config = vector_store_config
- self.vector_store_type = "default"
+ self.file_type = "default"
self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
self.local_persist = local_persist
if not self.local_persist:
@@ -42,7 +43,7 @@ class KnowledgeEmbedding:
elif self.file_path.endswith(".csv"):
embedding = CSVEmbedding(file_path=self.file_path, model_name=self.model_name,
vector_store_config=self.vector_store_config)
- elif self.vector_store_type == "default":
+ elif self.file_type == "default":
embedding = MarkdownEmbedding(file_path=self.file_path, model_name=self.model_name, vector_store_config=self.vector_store_config)
return embedding
@@ -52,25 +53,33 @@ class KnowledgeEmbedding:
def knowledge_persist_initialization(self, append_mode):
vector_name = self.vector_store_config["vector_store_name"]
- persist_dir = os.path.join(self.vector_store_config["vector_store_path"], vector_name + ".vectordb")
- print("vector db path: ", persist_dir)
- if os.path.exists(persist_dir):
- if append_mode:
- print("append knowledge return vector store")
- new_documents = self._load_knownlege(self.file_path)
- vector_store = Chroma.from_documents(documents=new_documents,
+ documents = self._load_knownlege(self.file_path)
+ if self.vector_store_config["vector_store_type"] == "Chroma":
+ persist_dir = os.path.join(self.vector_store_config["vector_store_path"], vector_name + ".vectordb")
+ print("vector db path: ", persist_dir)
+ if os.path.exists(persist_dir):
+ if append_mode:
+ print("append knowledge return vector store")
+ new_documents = self._load_knownlege(self.file_path)
+ vector_store = Chroma.from_documents(documents=new_documents,
+ embedding=self.embeddings,
+ persist_directory=persist_dir)
+ else:
+ print("directly return vector store")
+ vector_store = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
+ else:
+ print(vector_name + " is new vector store, knowledge begin load...")
+ vector_store = Chroma.from_documents(documents=documents,
embedding=self.embeddings,
persist_directory=persist_dir)
- else:
- print("directly return vector store")
- vector_store = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
- else:
- print(vector_name + " is new vector store, knowledge begin load...")
- documents = self._load_knownlege(self.file_path)
- vector_store = Chroma.from_documents(documents=documents,
- embedding=self.embeddings,
- persist_directory=persist_dir)
- vector_store.persist()
+ vector_store.persist()
+
+ elif self.vector_store_config["vector_store_type"] == "milvus":
+ vector_store = MilvusStore({"url": self.vector_store_config["url"],
+ "port": self.vector_store_config["port"],
+ "embedding": self.embeddings})
+ vector_store.init_schema_and_load(vector_name, documents)
+
return vector_store
def _load_knownlege(self, path):
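A hedged sketch of driving the rewritten persist path at this point in the series; the Milvus address and store name are assumptions, and a Milvus server is assumed to be running:

```python
from pilot.configs.model_config import DATASETS_DIR, LLM_MODEL_CONFIG
from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding

ke = KnowledgeEmbedding(
    file_path=DATASETS_DIR,
    model_name=LLM_MODEL_CONFIG["text2vec"],
    vector_store_config={"vector_store_type": "milvus",
                         "url": "127.0.0.1", "port": "19530",
                         "vector_store_name": "default"})
# Loads the dataset, embeds it, and writes it into the chosen backend.
vector_store = ke.knowledge_persist_initialization(append_mode=False)
```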
diff --git a/pilot/source_embedding/source_embedding.py b/pilot/source_embedding/source_embedding.py
index 66bc97b6d..a253e4d78 100644
--- a/pilot/source_embedding/source_embedding.py
+++ b/pilot/source_embedding/source_embedding.py
@@ -5,9 +5,14 @@ from abc import ABC, abstractmethod
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
+from langchain.vectorstores import Milvus
from typing import List, Optional, Dict
+
+from pilot.configs.model_config import VECTOR_STORE_TYPE, VECTOR_STORE_CONFIG
+from pilot.vector_store.milvus_store import MilvusStore
+
registered_methods = []
@@ -29,9 +34,20 @@ class SourceEmbedding(ABC):
self.vector_store_config = vector_store_config
self.embedding_args = embedding_args
self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
- persist_dir = os.path.join(self.vector_store_config["vector_store_path"],
- self.vector_store_config["vector_store_name"] + ".vectordb")
- self.vector_store_client = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
+
+ if VECTOR_STORE_TYPE == "milvus":
+ print(VECTOR_STORE_CONFIG)
+ if self.vector_store_config.get("text_field") is None:
+ self.vector_store_client = MilvusStore({"url": VECTOR_STORE_CONFIG["url"],
+ "port": VECTOR_STORE_CONFIG["port"],
+ "embedding": self.embeddings})
+ else:
+ self.vector_store_client = Milvus(embedding_function=self.embeddings, collection_name=self.vector_store_config["vector_store_name"], text_field="content",
+ connection_args={"host": VECTOR_STORE_CONFIG["url"], "port": VECTOR_STORE_CONFIG["port"]})
+ else:
+ persist_dir = os.path.join(self.vector_store_config["vector_store_path"],
+ self.vector_store_config["vector_store_name"] + ".vectordb")
+ self.vector_store_client = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
@abstractmethod
@register
@@ -54,10 +70,18 @@ class SourceEmbedding(ABC):
@register
def index_to_store(self, docs):
"""index to vector store"""
- persist_dir = os.path.join(self.vector_store_config["vector_store_path"],
- self.vector_store_config["vector_store_name"] + ".vectordb")
- self.vector_store = Chroma.from_documents(docs, self.embeddings, persist_directory=persist_dir)
- self.vector_store.persist()
+
+ if VECTOR_STORE_TYPE == "chroma":
+ persist_dir = os.path.join(self.vector_store_config["vector_store_path"],
+ self.vector_store_config["vector_store_name"] + ".vectordb")
+ self.vector_store = Chroma.from_documents(docs, self.embeddings, persist_directory=persist_dir)
+ self.vector_store.persist()
+
+ elif VECTOR_STORE_TYPE == "milvus":
+ self.vector_store = MilvusStore({"url": VECTOR_STORE_CONFIG["url"],
+ "port": VECTOR_STORE_CONFIG["port"],
+ "embedding": self.embeddings})
+ self.vector_store.init_schema_and_load(self.vector_store_config["vector_store_name"], docs)
@register
def similar_search(self, doc, topk):
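The "text_field" branch above queries an existing collection through langchain's Milvus wrapper; a standalone sketch (host, port, and collection name are illustrative):

```python
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Milvus
from pilot.configs.model_config import LLM_MODEL_CONFIG

embeddings = HuggingFaceEmbeddings(model_name=LLM_MODEL_CONFIG["text2vec"])
store = Milvus(embedding_function=embeddings,
               collection_name="default",
               text_field="content",
               connection_args={"host": "127.0.0.1", "port": "19530"})
print(store.similarity_search("query text", k=5))
```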
diff --git a/pilot/vector_store/milvus_store.py b/pilot/vector_store/milvus_store.py
index eda0b4e38..6b06dcf00 100644
--- a/pilot/vector_store/milvus_store.py
+++ b/pilot/vector_store/milvus_store.py
@@ -1,31 +1,35 @@
+from typing import List, Optional, Iterable
+
from langchain.embeddings import HuggingFaceEmbeddings
from pymilvus import DataType, FieldSchema, CollectionSchema, connections, Collection
-from pilot.configs.model_config import LLM_MODEL_CONFIG
from pilot.vector_store.vector_store_base import VectorStoreBase
class MilvusStore(VectorStoreBase):
- def __init__(self, cfg: {}) -> None:
- """Construct a milvus memory storage connection.
+ def __init__(self, ctx: {}) -> None:
+ """init a milvus storage connection.
Args:
- cfg (Config): MilvusStore global config.
+ ctx ({}): MilvusStore global config.
"""
# self.configure(cfg)
connect_kwargs = {}
self.uri = None
- self.uri = cfg["url"]
- self.port = cfg["port"]
- self.username = cfg.get("username", None)
- self.password = cfg.get("password", None)
- self.collection_name = cfg["table_name"]
- self.password = cfg.get("secure", None)
+ self.uri = ctx["url"]
+ self.port = ctx["port"]
+ self.username = ctx.get("username", None)
+ self.password = ctx.get("password", None)
+ self.collection_name = ctx.get("table_name", None)
+ self.secure = ctx.get("secure", None)
+ self.model_config = ctx.get("model_config", None)
+ self.embedding = ctx.get("embedding", None)
+ self.fields = []
# use HNSW by default.
self.index_params = {
- "metric_type": "IP",
+ "metric_type": "L2",
"index_type": "HNSW",
"params": {"M": 8, "efConstruction": 64},
}
@@ -39,20 +43,144 @@ class MilvusStore(VectorStoreBase):
connect_kwargs["password"] = self.password
connections.connect(
- **connect_kwargs,
host=self.uri or "127.0.0.1",
port=self.port or "19530",
alias="default"
# secure=self.secure,
)
+ if self.collection_name is not None:
+ self.col = Collection(self.collection_name)
+ schema = self.col.schema
+ for x in schema.fields:
+ self.fields.append(x.name)
+ if x.auto_id:
+ self.fields.remove(x.name)
+ if x.is_primary:
+ self.primary_field = x.name
+ if x.dtype == DataType.FLOAT_VECTOR or x.dtype == DataType.BINARY_VECTOR:
+ self.vector_field = x.name
- self.init_schema()
+
+ # self.init_schema()
+ # self.init_collection_schema()
+
+ def init_schema_and_load(self, vector_name, documents):
+ """Create a Milvus collection, indexes it with HNSW, load document.
+ Args:
+ documents (List[str]): Text to insert.
+ vector_name (Embeddings): your collection name.
+ Returns:
+ VectorStore: The MilvusStore vector store.
+ """
+ try:
+ from pymilvus import (
+ Collection,
+ CollectionSchema,
+ DataType,
+ FieldSchema,
+ connections,
+ )
+ from pymilvus.orm.types import infer_dtype_bydata
+ except ImportError:
+ raise ValueError(
+ "Could not import pymilvus python package. "
+ "Please install it with `pip install pymilvus`."
+ )
+ # Connect to Milvus instance
+ if not connections.has_connection("default"):
+ connections.connect(
+ host=self.uri or "127.0.0.1",
+ port=self.port or "19530",
+ alias="default"
+ # secure=self.secure,
+ )
+ texts = [d.page_content for d in documents]
+ metadatas = [d.metadata for d in documents]
+ embeddings = self.embedding.embed_query(texts[0])
+ dim = len(embeddings)
+ # Field names used in the collection schema
+ primary_field = "pk_id"
+ vector_field = "vector"
+ text_field = "content"
+ self.text_field = text_field
+ collection_name = vector_name
+ fields = []
+ # Determine metadata schema
+ # if metadatas:
+ # # Check if all metadata keys line up
+ # key = metadatas[0].keys()
+ # for x in metadatas:
+ # if key != x.keys():
+ # raise ValueError(
+ # "Mismatched metadata. "
+ # "Make sure all metadata has the same keys and datatype."
+ # )
+ # # Create FieldSchema for each entry in singular metadata.
+ # for key, value in metadatas[0].items():
+ # # Infer the corresponding datatype of the metadata
+ # dtype = infer_dtype_bydata(value)
+ # if dtype == DataType.UNKNOWN:
+ # raise ValueError(f"Unrecognized datatype for {key}.")
+ # elif dtype == DataType.VARCHAR:
+ # # Find out max length text based metadata
+ # max_length = 0
+ # for subvalues in metadatas:
+ # max_length = max(max_length, len(subvalues[key]))
+ # fields.append(
+ # FieldSchema(key, DataType.VARCHAR, max_length=max_length + 1)
+ # )
+ # else:
+ # fields.append(FieldSchema(key, dtype))
+
+ # Find out max length of texts
+ max_length = 0
+ for y in texts:
+ max_length = max(max_length, len(y))
+ # Create the text field
+ fields.append(
+ FieldSchema(text_field, DataType.VARCHAR, max_length=max_length + 1)
+ )
+ # Create the primary key field
+ fields.append(
+ FieldSchema(primary_field, DataType.INT64, is_primary=True, auto_id=True)
+ )
+ # Create the vector field
+ fields.append(FieldSchema(vector_field, DataType.FLOAT_VECTOR, dim=dim))
+ # Create the schema for the collection
+ schema = CollectionSchema(fields)
+ # Create the collection
+ collection = Collection(collection_name, schema)
+ self.col = collection
+ # Index parameters for the collection
+ index = self.index_params
+ # Create the index
+ collection.create_index(vector_field, index)
+ # Create the VectorStore
+ # milvus = cls(
+ # embedding,
+ # kwargs.get("connection_args", {"port": 19530}),
+ # collection_name,
+ # text_field,
+ # )
+ # Add the texts.
+ schema = collection.schema
+ for x in schema.fields:
+ self.fields.append(x.name)
+ if x.auto_id:
+ self.fields.remove(x.name)
+ if x.is_primary:
+ self.primary_field = x.name
+ if x.dtype == DataType.FLOAT_VECTOR or x.dtype == DataType.BINARY_VECTOR:
+ self.vector_field = x.name
+ self._add_texts(texts, metadatas)
+
+ return self.collection_name
def init_schema(self) -> None:
"""Initialize collection in milvus database."""
fields = [
FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
- FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=384),
+ FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=self.model_config["dim"]),
FieldSchema(name="raw_text", dtype=DataType.VARCHAR, max_length=65535),
]
@@ -75,7 +203,7 @@ class MilvusStore(VectorStoreBase):
info = self.collection.describe()
self.collection.load()
- def insert(self, text) -> str:
+ def insert(self, text, model_config) -> str:
"""Add an embedding of data into milvus.
Args:
text (str): The raw text to construct embedding index.
@@ -83,10 +211,54 @@ class MilvusStore(VectorStoreBase):
str: log.
"""
# embedding = get_ada_embedding(data)
- embeddings = HuggingFaceEmbeddings(model_name=LLM_MODEL_CONFIG["sentence-transforms"])
+ embeddings = HuggingFaceEmbeddings(model_name=self.model_config["model_name"])
result = self.collection.insert([embeddings.embed_documents(text), text])
_text = (
"Inserting data into memory at primary key: "
f"{result.primary_keys[0]}:\n data: {text}"
)
- return _text
\ No newline at end of file
+ return _text
+
+ def _add_texts(
+ self,
+ texts: Iterable[str],
+ metadatas: Optional[List[dict]] = None,
+ partition_name: Optional[str] = None,
+ timeout: Optional[int] = None,
+ ) -> List[str]:
+ """Insert text data into Milvus.
+ Args:
+ texts (Iterable[str]): The text being embedded and inserted.
+ metadatas (Optional[List[dict]], optional): The metadata that
+ corresponds to each insert. Defaults to None.
+ partition_name (str, optional): The partition of the collection
+ to insert data into. Defaults to None.
+ timeout: specified timeout.
+
+ Returns:
+ List[str]: The resulting keys for each inserted element.
+ """
+ insert_dict: Any = {self.text_field: list(texts)}
+ try:
+ insert_dict[self.vector_field] = self.embedding.embed_documents(
+ list(texts)
+ )
+ except NotImplementedError:
+ insert_dict[self.vector_field] = [
+ self.embedding.embed_query(x) for x in texts
+ ]
+ # Collect the metadata into the insert dict.
+ if len(self.fields) > 2 and metadatas is not None:
+ for d in metadatas:
+ for key, value in d.items():
+ if key in self.fields:
+ insert_dict.setdefault(key, []).append(value)
+ # Convert dict to list of lists for insertion
+ insert_list = [insert_dict[x] for x in self.fields]
+ # Insert into the collection.
+ res = self.col.insert(
+ insert_list, partition_name=partition_name, timeout=timeout
+ )
+ # Flush to make sure newly inserted is immediately searchable.
+ self.col.flush()
+ return res.primary_keys
diff --git a/requirements.txt b/requirements.txt
index eac927c3d..ba31d0d04 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -60,6 +60,7 @@ gTTS==2.3.1
langchain
nltk
python-dotenv==1.0.0
+pymilvus
# Testing dependencies
pytest
diff --git a/tools/knowlege_init.py b/tools/knowlege_init.py
index e9ecad49a..fdc754e05 100644
--- a/tools/knowlege_init.py
+++ b/tools/knowlege_init.py
@@ -2,8 +2,10 @@
# -*- coding: utf-8 -*-
import argparse
-from pilot.configs.model_config import DATASETS_DIR, LLM_MODEL_CONFIG, VECTOR_SEARCH_TOP_K, \
- KNOWLEDGE_UPLOAD_ROOT_PATH
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import Milvus
+
+from pilot.configs.model_config import DATASETS_DIR, LLM_MODEL_CONFIG, VECTOR_SEARCH_TOP_K, VECTOR_STORE_CONFIG
from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding
@@ -12,15 +14,15 @@ class LocalKnowledgeInit:
model_name = LLM_MODEL_CONFIG["text2vec"]
top_k: int = VECTOR_SEARCH_TOP_K
- def __init__(self) -> None:
- pass
+ def __init__(self, vector_store_config) -> None:
+ self.vector_store_config = vector_store_config
- def knowledge_persist(self, file_path, vector_name, append_mode):
+ def knowledge_persist(self, file_path, append_mode):
""" knowledge persist """
kv = KnowledgeEmbedding(
file_path=file_path,
model_name=LLM_MODEL_CONFIG["text2vec"],
- vector_store_config= {"vector_store_name":vector_name, "vector_store_path": KNOWLEDGE_UPLOAD_ROOT_PATH})
+ vector_store_config= self.vector_store_config)
vector_store = kv.knowledge_persist_initialization(append_mode)
return vector_store
@@ -34,11 +36,15 @@ class LocalKnowledgeInit:
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument("--vector_name", type=str, default="default")
+ parser.add_argument("--vector_name", type=str, default="keting")
parser.add_argument("--append", type=bool, default=False)
+ parser.add_argument("--store_type", type=str, default="Chroma")
args = parser.parse_args()
vector_name = args.vector_name
append_mode = args.append
- kv = LocalKnowledgeInit()
- vector_store = kv.knowledge_persist(file_path=DATASETS_DIR, vector_name=vector_name, append_mode=append_mode)
+ store_type = args.store_type
+ vector_store_config = {"url": VECTOR_STORE_CONFIG["url"], "port": VECTOR_STORE_CONFIG["port"], "vector_store_name":vector_name, "vector_store_type":store_type}
+ print(vector_store_config)
+ kv = LocalKnowledgeInit(vector_store_config=vector_store_config)
+ vector_store = kv.knowledge_persist(file_path=DATASETS_DIR, append_mode=append_mode)
print("your knowledge embedding success...")
\ No newline at end of file
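A hedged sketch of calling the reworked initializer programmatically instead of via argparse; the store name is an assumption, and the "milvus" backend assumes a server at the address in VECTOR_STORE_CONFIG:

```python
from pilot.configs.model_config import DATASETS_DIR, VECTOR_STORE_CONFIG
from tools.knowlege_init import LocalKnowledgeInit

config = {"url": VECTOR_STORE_CONFIG["url"],
          "port": VECTOR_STORE_CONFIG["port"],
          "vector_store_name": "default",
          "vector_store_type": "milvus"}
kv = LocalKnowledgeInit(vector_store_config=config)
vector_store = kv.knowledge_persist(file_path=DATASETS_DIR, append_mode=False)
```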
From 0297fa425b8c7c34eafd1d33a255bd1055979a6c Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Sun, 21 May 2023 16:40:18 +0800
Subject: [PATCH 05/15] feature: add milvus store
---
pilot/configs/model_config.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pilot/configs/model_config.py b/pilot/configs/model_config.py
index 0f9cef937..c63187d03 100644
--- a/pilot/configs/model_config.py
+++ b/pilot/configs/model_config.py
@@ -48,5 +48,5 @@ DB_SETTINGS = {
VS_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vs_store")
KNOWLEDGE_UPLOAD_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
KNOWLEDGE_CHUNK_SPLIT_SIZE = 100
-VECTOR_STORE_TYPE = "milvus"
+VECTOR_STORE_TYPE = "Chroma"
VECTOR_STORE_CONFIG = {"url": "127.0.0.1", "port": "19530"}
From 983a00f53a2d7aff3935b208615fbd770baebc13 Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Tue, 23 May 2023 10:50:43 +0800
Subject: [PATCH 06/15] feature: vector store connector
---
pilot/configs/model_config.py | 4 +-
pilot/source_embedding/knowledge_embedding.py | 37 +---
pilot/source_embedding/source_embedding.py | 41 +---
pilot/vector_store/chroma_store.py | 30 +++
pilot/vector_store/connector.py | 22 +++
pilot/vector_store/milvus_store.py | 185 +++++++++++++-----
pilot/vector_store/vector_store_base.py | 8 +-
tools/knowlege_init.py | 10 +-
8 files changed, 209 insertions(+), 128 deletions(-)
create mode 100644 pilot/vector_store/chroma_store.py
create mode 100644 pilot/vector_store/connector.py
diff --git a/pilot/configs/model_config.py b/pilot/configs/model_config.py
index 7c4928304..6e32daefc 100644
--- a/pilot/configs/model_config.py
+++ b/pilot/configs/model_config.py
@@ -48,5 +48,7 @@ VECTOR_SEARCH_TOP_K = 10
VS_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vs_store")
KNOWLEDGE_UPLOAD_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
KNOWLEDGE_CHUNK_SPLIT_SIZE = 100
-VECTOR_STORE_TYPE = "milvus"
+# vector db type: currently Chroma and Milvus are supported
+VECTOR_STORE_TYPE = "Milvus"
+# vector db config
VECTOR_STORE_CONFIG = {"url": "127.0.0.1", "port": "19530"}
diff --git a/pilot/source_embedding/knowledge_embedding.py b/pilot/source_embedding/knowledge_embedding.py
index 63d6c2121..85db5ab02 100644
--- a/pilot/source_embedding/knowledge_embedding.py
+++ b/pilot/source_embedding/knowledge_embedding.py
@@ -3,8 +3,7 @@ import os
from bs4 import BeautifulSoup
from langchain.document_loaders import TextLoader, markdown
from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.vectorstores import Chroma
-from pilot.configs.model_config import DATASETS_DIR, KNOWLEDGE_CHUNK_SPLIT_SIZE
+from pilot.configs.model_config import DATASETS_DIR, KNOWLEDGE_CHUNK_SPLIT_SIZE, VECTOR_STORE_TYPE
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
from pilot.source_embedding.csv_embedding import CSVEmbedding
from pilot.source_embedding.markdown_embedding import MarkdownEmbedding
@@ -12,7 +11,7 @@ from pilot.source_embedding.pdf_embedding import PDFEmbedding
import markdown
from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
-from pilot.vector_store.milvus_store import MilvusStore
+from pilot.vector_store.connector import VectorStoreConnector
class KnowledgeEmbedding:
@@ -23,6 +22,7 @@ class KnowledgeEmbedding:
self.vector_store_config = vector_store_config
self.file_type = "default"
self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
+ self.vector_store_config["embeddings"] = self.embeddings
self.local_persist = local_persist
if not self.local_persist:
self.knowledge_embedding_client = self.init_knowledge_embedding()
@@ -52,35 +52,10 @@ class KnowledgeEmbedding:
return self.knowledge_embedding_client.similar_search(text, topk)
def knowledge_persist_initialization(self, append_mode):
- vector_name = self.vector_store_config["vector_store_name"]
documents = self._load_knownlege(self.file_path)
- if self.vector_store_config["vector_store_type"] == "Chroma":
- persist_dir = os.path.join(self.vector_store_config["vector_store_path"], vector_name + ".vectordb")
- print("vector db path: ", persist_dir)
- if os.path.exists(persist_dir):
- if append_mode:
- print("append knowledge return vector store")
- new_documents = self._load_knownlege(self.file_path)
- vector_store = Chroma.from_documents(documents=new_documents,
- embedding=self.embeddings,
- persist_directory=persist_dir)
- else:
- print("directly return vector store")
- vector_store = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
- else:
- print(vector_name + " is new vector store, knowledge begin load...")
- vector_store = Chroma.from_documents(documents=documents,
- embedding=self.embeddings,
- persist_directory=persist_dir)
- vector_store.persist()
-
- elif self.vector_store_config["vector_store_type"] == "milvus":
- vector_store = MilvusStore({"url": self.vector_store_config["url"],
- "port": self.vector_store_config["port"],
- "embedding": self.embeddings})
- vector_store.init_schema_and_load(vector_name, documents)
-
- return vector_store
+ self.vector_client = VectorStoreConnector(VECTOR_STORE_TYPE, self.vector_store_config)
+ self.vector_client.load_document(documents)
+ return self.vector_client
def _load_knownlege(self, path):
docments = []
diff --git a/pilot/source_embedding/source_embedding.py b/pilot/source_embedding/source_embedding.py
index a253e4d78..ddefd4f1e 100644
--- a/pilot/source_embedding/source_embedding.py
+++ b/pilot/source_embedding/source_embedding.py
@@ -1,17 +1,11 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-import os
from abc import ABC, abstractmethod
from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.vectorstores import Chroma
-from langchain.vectorstores import Milvus
-
from typing import List, Optional, Dict
-
-
-from pilot.configs.model_config import VECTOR_STORE_TYPE, VECTOR_STORE_CONFIG
-from pilot.vector_store.milvus_store import MilvusStore
+from pilot.configs.model_config import VECTOR_STORE_TYPE
+from pilot.vector_store.connector import VectorStoreConnector
registered_methods = []
@@ -35,19 +29,8 @@ class SourceEmbedding(ABC):
self.embedding_args = embedding_args
self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
- if VECTOR_STORE_TYPE == "milvus":
- print(VECTOR_STORE_CONFIG)
- if self.vector_store_config.get("text_field") is None:
- self.vector_store_client = MilvusStore({"url": VECTOR_STORE_CONFIG["url"],
- "port": VECTOR_STORE_CONFIG["port"],
- "embedding": self.embeddings})
- else:
- self.vector_store_client = Milvus(embedding_function=self.embeddings, collection_name=self.vector_store_config["vector_store_name"], text_field="content",
- connection_args={"host": VECTOR_STORE_CONFIG["url"], "port": VECTOR_STORE_CONFIG["port"]})
- else:
- persist_dir = os.path.join(self.vector_store_config["vector_store_path"],
- self.vector_store_config["vector_store_name"] + ".vectordb")
- self.vector_store_client = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
+ vector_store_config["embeddings"] = self.embeddings
+ self.vector_client = VectorStoreConnector(VECTOR_STORE_TYPE, vector_store_config)
@abstractmethod
@register
@@ -70,24 +53,12 @@ class SourceEmbedding(ABC):
@register
def index_to_store(self, docs):
"""index to vector store"""
-
- if VECTOR_STORE_TYPE == "chroma":
- persist_dir = os.path.join(self.vector_store_config["vector_store_path"],
- self.vector_store_config["vector_store_name"] + ".vectordb")
- self.vector_store = Chroma.from_documents(docs, self.embeddings, persist_directory=persist_dir)
- self.vector_store.persist()
-
- elif VECTOR_STORE_TYPE == "milvus":
- self.vector_store = MilvusStore({"url": VECTOR_STORE_CONFIG["url"],
- "port": VECTOR_STORE_CONFIG["port"],
- "embedding": self.embeddings})
- self.vector_store.init_schema_and_load(self.vector_store_config["vector_store_name"], docs)
+ self.vector_client.load_document(docs)
@register
def similar_search(self, doc, topk):
"""vector store similarity_search"""
-
- return self.vector_store_client.similarity_search(doc, topk)
+ return self.vector_client.similar_search(doc, topk)
def source_embedding(self):
if 'read' in registered_methods:
diff --git a/pilot/vector_store/chroma_store.py b/pilot/vector_store/chroma_store.py
new file mode 100644
index 000000000..9a91659f1
--- /dev/null
+++ b/pilot/vector_store/chroma_store.py
@@ -0,0 +1,30 @@
+import os
+
+from langchain.vectorstores import Chroma
+
+from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH
+from pilot.logs import logger
+from pilot.vector_store.vector_store_base import VectorStoreBase
+
+
+class ChromaStore(VectorStoreBase):
+ """chroma database"""
+
+ def __init__(self, ctx: {}) -> None:
+ self.ctx = ctx
+ self.embeddings = ctx["embeddings"]
+ self.persist_dir = os.path.join(KNOWLEDGE_UPLOAD_ROOT_PATH,
+ ctx["vector_store_name"] + ".vectordb")
+ self.vector_store_client = Chroma(persist_directory=self.persist_dir, embedding_function=self.embeddings)
+
+ def similar_search(self, text, topk) -> None:
+ logger.info("ChromaStore similar search")
+ return self.vector_store_client.similarity_search(text, topk)
+
+ def load_document(self, documents):
+ logger.info("ChromaStore load document")
+ texts = [doc.page_content for doc in documents]
+ metadatas = [doc.metadata for doc in documents]
+ self.vector_store_client.add_texts(texts=texts, metadatas=metadatas)
+ self.vector_store_client.persist()
+
diff --git a/pilot/vector_store/connector.py b/pilot/vector_store/connector.py
new file mode 100644
index 000000000..003415712
--- /dev/null
+++ b/pilot/vector_store/connector.py
@@ -0,0 +1,22 @@
+from pilot.vector_store.chroma_store import ChromaStore
+from pilot.vector_store.milvus_store import MilvusStore
+
+connector = {
+ "Chroma": ChromaStore,
+ "Milvus": MilvusStore
+ }
+
+
+class VectorStoreConnector:
+ """ vector store connector, can connect different vector db provided load document api and similar search api
+ """
+ def __init__(self, vector_store_type, ctx: {}) -> None:
+ self.ctx = ctx
+ self.connector_class = connector[vector_store_type]
+ self.client = self.connector_class(ctx)
+
+ def load_document(self, docs):
+ self.client.load_document(docs)
+
+ def similar_search(self, docs, topk):
+ return self.client.similar_search(docs, topk)
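A short usage sketch of the connector above; the store name, document, and query text are illustrative assumptions:

```python
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from pilot.configs.model_config import LLM_MODEL_CONFIG
from pilot.vector_store.connector import VectorStoreConnector

ctx = {"vector_store_name": "default",
       "embeddings": HuggingFaceEmbeddings(model_name=LLM_MODEL_CONFIG["text2vec"])}
client = VectorStoreConnector("Chroma", ctx)  # "Milvus" selects MilvusStore
client.load_document([Document(page_content="hello vector store",
                               metadata={"source": "inline"})])
print(client.similar_search("hello", 1))
```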
diff --git a/pilot/vector_store/milvus_store.py b/pilot/vector_store/milvus_store.py
index 5204e6b11..1c6d6bdbc 100644
--- a/pilot/vector_store/milvus_store.py
+++ b/pilot/vector_store/milvus_store.py
@@ -1,12 +1,14 @@
-from typing import List, Optional, Iterable
+from typing import List, Optional, Iterable, Tuple, Any
-from langchain.embeddings import HuggingFaceEmbeddings
-from pymilvus import DataType, FieldSchema, CollectionSchema, connections, Collection
+from pymilvus import connections, Collection, DataType
+from pilot.configs.model_config import VECTOR_STORE_CONFIG
+from langchain.docstore.document import Document
from pilot.vector_store.vector_store_base import VectorStoreBase
class MilvusStore(VectorStoreBase):
+ """Milvus database"""
def __init__(self, ctx: {}) -> None:
"""init a milvus storage connection.
@@ -17,14 +19,13 @@ class MilvusStore(VectorStoreBase):
connect_kwargs = {}
self.uri = None
- self.uri = ctx["url"]
- self.port = ctx["port"]
+ self.uri = ctx.get("url", VECTOR_STORE_CONFIG["url"])
+ self.port = ctx.get("port", VECTOR_STORE_CONFIG["port"])
self.username = ctx.get("username", None)
self.password = ctx.get("password", None)
- self.collection_name = ctx.get("table_name", None)
+ self.collection_name = ctx.get("vector_store_name", None)
self.secure = ctx.get("secure", None)
- self.model_config = ctx.get("model_config", None)
- self.embedding = ctx.get("embedding", None)
+ self.embedding = ctx.get("embeddings", None)
self.fields = []
# use HNSW by default.
@@ -33,6 +34,20 @@ class MilvusStore(VectorStoreBase):
"index_type": "HNSW",
"params": {"M": 8, "efConstruction": 64},
}
+ # default search params for each index type.
+ self.index_params_map = {
+ "IVF_FLAT": {"params": {"nprobe": 10}},
+ "IVF_SQ8": {"params": {"nprobe": 10}},
+ "IVF_PQ": {"params": {"nprobe": 10}},
+ "HNSW": {"params": {"ef": 10}},
+ "RHNSW_FLAT": {"params": {"ef": 10}},
+ "RHNSW_SQ": {"params": {"ef": 10}},
+ "RHNSW_PQ": {"params": {"ef": 10}},
+ "IVF_HNSW": {"params": {"nprobe": 10, "ef": 10}},
+ "ANNOY": {"params": {"search_k": 10}},
+ }
+
+ self.text_field = "content"
if (self.username is None) != (self.password is None):
raise ValueError(
@@ -48,21 +63,6 @@ class MilvusStore(VectorStoreBase):
alias="default"
# secure=self.secure,
)
- if self.collection_name is not None:
- self.col = Collection(self.collection_name)
- schema = self.col.schema
- for x in schema.fields:
- self.fields.append(x.name)
- if x.auto_id:
- self.fields.remove(x.name)
- if x.is_primary:
- self.primary_field = x.name
- if x.dtype == DataType.FLOAT_VECTOR or x.dtype == DataType.BINARY_VECTOR:
- self.vector_field = x.name
-
-
- # self.init_schema()
- # self.init_collection_schema()
def init_schema_and_load(self, vector_name, documents):
"""Create a Milvus collection, indexes it with HNSW, load document.
@@ -86,7 +86,6 @@ class MilvusStore(VectorStoreBase):
"Could not import pymilvus python package. "
"Please install it with `pip install pymilvus`."
)
- # Connect to Milvus instance
if not connections.has_connection("default"):
connections.connect(
host=self.uri or "127.0.0.1",
@@ -140,11 +139,11 @@ class MilvusStore(VectorStoreBase):
fields.append(
FieldSchema(text_field, DataType.VARCHAR, max_length=max_length + 1)
)
- # Create the primary key field
+ # create the primary key field
fields.append(
FieldSchema(primary_field, DataType.INT64, is_primary=True, auto_id=True)
)
- # Create the vector field
+ # create the vector field
fields.append(FieldSchema(vector_field, DataType.FLOAT_VECTOR, dim=dim))
# Create the schema for the collection
schema = CollectionSchema(fields)
@@ -176,32 +175,44 @@ class MilvusStore(VectorStoreBase):
return self.collection_name
- def init_schema(self) -> None:
- """Initialize collection in milvus database."""
- fields = [
- FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
- FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=self.model_config["dim"]),
- FieldSchema(name="raw_text", dtype=DataType.VARCHAR, max_length=65535),
- ]
-
- # create collection if not exist and load it.
- self.schema = CollectionSchema(fields, "db-gpt memory storage")
- self.collection = Collection(self.collection_name, self.schema)
- self.index_params = {
- "metric_type": "IP",
- "index_type": "HNSW",
- "params": {"M": 8, "efConstruction": 64},
- }
- # create index if not exist.
- if not self.collection.has_index():
- self.collection.release()
- self.collection.create_index(
- "vector",
- self.index_params,
- index_name="vector",
- )
- info = self.collection.describe()
- self.collection.load()
+ # def init_schema(self) -> None:
+ # """Initialize collection in milvus database."""
+ # fields = [
+ # FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
+ # FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=self.model_config["dim"]),
+ # FieldSchema(name="raw_text", dtype=DataType.VARCHAR, max_length=65535),
+ # ]
+ #
+ # # create collection if not exist and load it.
+ # self.schema = CollectionSchema(fields, "db-gpt memory storage")
+ # self.collection = Collection(self.collection_name, self.schema)
+ # self.index_params_map = {
+ # "IVF_FLAT": {"params": {"nprobe": 10}},
+ # "IVF_SQ8": {"params": {"nprobe": 10}},
+ # "IVF_PQ": {"params": {"nprobe": 10}},
+ # "HNSW": {"params": {"ef": 10}},
+ # "RHNSW_FLAT": {"params": {"ef": 10}},
+ # "RHNSW_SQ": {"params": {"ef": 10}},
+ # "RHNSW_PQ": {"params": {"ef": 10}},
+ # "IVF_HNSW": {"params": {"nprobe": 10, "ef": 10}},
+ # "ANNOY": {"params": {"search_k": 10}},
+ # }
+ #
+ # self.index_params = {
+ # "metric_type": "IP",
+ # "index_type": "HNSW",
+ # "params": {"M": 8, "efConstruction": 64},
+ # }
+ # # create index if not exist.
+ # if not self.collection.has_index():
+ # self.collection.release()
+ # self.collection.create_index(
+ # "vector",
+ # self.index_params,
+ # index_name="vector",
+ # )
+ # info = self.collection.describe()
+ # self.collection.load()
# def insert(self, text, model_config) -> str:
# """Add an embedding of data into milvus.
@@ -226,7 +237,7 @@ class MilvusStore(VectorStoreBase):
partition_name: Optional[str] = None,
timeout: Optional[int] = None,
) -> List[str]:
- """Insert text data into Milvus.
+ """add text data into Milvus.
Args:
texts (Iterable[str]): The text being embedded and inserted.
metadatas (Optional[List[dict]], optional): The metadata that
@@ -259,6 +270,72 @@ class MilvusStore(VectorStoreBase):
res = self.col.insert(
insert_list, partition_name=partition_name, timeout=timeout
)
- # Flush to make sure newly inserted is immediately searchable.
+        # flush so newly inserted data is immediately searchable.
self.col.flush()
return res.primary_keys
+
+ def load_document(self, documents) -> None:
+ """load document in vector database."""
+ self.init_schema_and_load(self.collection_name, documents)
+
+    def similar_search(self, text, topk):
+ self.col = Collection(self.collection_name)
+ schema = self.col.schema
+ for x in schema.fields:
+ self.fields.append(x.name)
+ if x.auto_id:
+ self.fields.remove(x.name)
+ if x.is_primary:
+ self.primary_field = x.name
+ if x.dtype == DataType.FLOAT_VECTOR or x.dtype == DataType.BINARY_VECTOR:
+ self.vector_field = x.name
+ _, docs_and_scores = self._search(text, topk)
+ return [doc for doc, _, _ in docs_and_scores]
+
+ def _search(
+ self,
+ query: str,
+ k: int = 4,
+ param: Optional[dict] = None,
+ expr: Optional[str] = None,
+ partition_names: Optional[List[str]] = None,
+ round_decimal: int = -1,
+ timeout: Optional[int] = None,
+ **kwargs: Any,
+ ) -> Tuple[List[float], List[Tuple[Document, Any, Any]]]:
+ self.col.load()
+ # use default index params.
+ if param is None:
+ index_type = self.col.indexes[0].params["index_type"]
+ param = self.index_params_map[index_type]
+ # query text embedding.
+ data = [self.embedding.embed_query(query)]
+ # Determine result metadata fields.
+ output_fields = self.fields[:]
+ output_fields.remove(self.vector_field)
+ # milvus search.
+ res = self.col.search(
+ data,
+ self.vector_field,
+ param,
+ k,
+ expr=expr,
+ output_fields=output_fields,
+ partition_names=partition_names,
+ round_decimal=round_decimal,
+ timeout=timeout,
+ **kwargs,
+ )
+ # Organize results.
+ ret = []
+ for result in res[0]:
+ meta = {x: result.entity.get(x) for x in output_fields}
+ ret.append(
+ (
+ Document(page_content=meta.pop(self.text_field), metadata=meta),
+ result.distance,
+ result.id,
+ )
+ )
+
+ return data[0], ret
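
For reference, a raw-pymilvus sketch of what similar_search/_search do, assuming a collection named "my_kb" created by init_schema_and_load with a "vector" field, a "content" text field, and an HNSW index (all names assumptions for illustration):

```python
# Hedged sketch: a raw pymilvus query equivalent to MilvusStore._search.
from langchain.embeddings import HuggingFaceEmbeddings
from pymilvus import Collection, connections

connections.connect(host="127.0.0.1", port="19530")
embedding = HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")

col = Collection("my_kb")  # assumed existing collection
col.load()
res = col.search(
    [embedding.embed_query("what is milvus?")],  # query vector
    "vector",                                    # anns field
    {"params": {"ef": 10}},                      # HNSW default from index_params_map
    5,                                           # top-k
    output_fields=["content"],
)
for hit in res[0]:
    print(hit.distance, hit.entity.get("content"))
```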
diff --git a/pilot/vector_store/vector_store_base.py b/pilot/vector_store/vector_store_base.py
index 818730f0f..b483b3116 100644
--- a/pilot/vector_store/vector_store_base.py
+++ b/pilot/vector_store/vector_store_base.py
@@ -2,8 +2,14 @@ from abc import ABC, abstractmethod
class VectorStoreBase(ABC):
+ """base class for vector store database"""
@abstractmethod
- def init_schema(self) -> None:
+    def load_document(self, documents) -> None:
+        """Load documents into the vector store."""
+ pass
+
+ @abstractmethod
+    def similar_search(self, text, topk):
+        """Similarity search in the vector store."""
pass
\ No newline at end of file
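
The slimmed-down two-method contract is easy to satisfy; an illustrative in-memory implementation (substring matching stands in for real embedding similarity):

```python
# Illustrative in-memory implementation of the VectorStoreBase contract.
from pilot.vector_store.vector_store_base import VectorStoreBase


class InMemoryStore(VectorStoreBase):
    def __init__(self, ctx: dict) -> None:
        self.docs = []

    def load_document(self, documents) -> None:
        self.docs.extend(documents)

    def similar_search(self, text, topk):
        # naive substring match; a real backend would compare embeddings
        hits = [d for d in self.docs if text in d.page_content]
        return hits[:topk]
```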
diff --git a/tools/knowlege_init.py b/tools/knowlege_init.py
index 60010e4de..23ca33a80 100644
--- a/tools/knowlege_init.py
+++ b/tools/knowlege_init.py
@@ -2,10 +2,8 @@
# -*- coding: utf-8 -*-
import argparse
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.vectorstores import Milvus
-
-from pilot.configs.model_config import DATASETS_DIR, LLM_MODEL_CONFIG, VECTOR_SEARCH_TOP_K, VECTOR_STORE_CONFIG
+from pilot.configs.model_config import DATASETS_DIR, LLM_MODEL_CONFIG, VECTOR_SEARCH_TOP_K, VECTOR_STORE_CONFIG, \
+ VECTOR_STORE_TYPE
from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding
@@ -42,8 +40,8 @@ if __name__ == "__main__":
args = parser.parse_args()
vector_name = args.vector_name
append_mode = args.append
- store_type = args.store_type
- vector_store_config = {"url": VECTOR_STORE_CONFIG["url"], "port": VECTOR_STORE_CONFIG["port"], "vector_store_name":vector_name, "vector_store_type":store_type}
+ store_type = VECTOR_STORE_TYPE
+ vector_store_config = {"url": VECTOR_STORE_CONFIG["url"], "port": VECTOR_STORE_CONFIG["port"], "vector_store_name":vector_name}
print(vector_store_config)
kv = LocalKnowledgeInit(vector_store_config=vector_store_config)
vector_store = kv.knowledge_persist(file_path=DATASETS_DIR, append_mode=append_mode)
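
The same initialization can be driven programmatically; a sketch mirroring the __main__ block above (the store name "default" is an assumption):

```python
# Sketch based on tools/knowlege_init.py as patched above.
from pilot.configs.model_config import DATASETS_DIR, VECTOR_STORE_CONFIG
from tools.knowlege_init import LocalKnowledgeInit

vector_store_config = {
    "url": VECTOR_STORE_CONFIG["url"],
    "port": VECTOR_STORE_CONFIG["port"],
    "vector_store_name": "default",  # assumed store name
}
kv = LocalKnowledgeInit(vector_store_config=vector_store_config)
kv.knowledge_persist(file_path=DATASETS_DIR, append_mode=False)
```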
From ef64935145a0e1d117d1b7d7f0276c049f85d0d5 Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Tue, 23 May 2023 22:06:07 +0800
Subject: [PATCH 07/15] update:vector store config
---
pilot/configs/config.py | 8 ++++++
pilot/configs/model_config.py | 6 +----
pilot/source_embedding/knowledge_embedding.py | 5 +++-
pilot/source_embedding/source_embedding.py | 6 +++--
pilot/vector_store/milvus_store.py | 26 ++++++-------------
5 files changed, 25 insertions(+), 26 deletions(-)
diff --git a/pilot/configs/config.py b/pilot/configs/config.py
index b914390f7..e9ec2bd48 100644
--- a/pilot/configs/config.py
+++ b/pilot/configs/config.py
@@ -109,6 +109,14 @@ class Config(metaclass=Singleton):
self.MODEL_SERVER = os.getenv("MODEL_SERVER", "http://127.0.0.1" + ":" + str(self.MODEL_PORT))
self.ISLOAD_8BIT = os.getenv("ISLOAD_8BIT", "True") == "True"
+ ### Vector Store Configuration
+ self.VECTOR_STORE_TYPE = os.getenv("VECTOR_STORE_TYPE", "Chroma")
+ self.MILVUS_URL = os.getenv("MILVUS_URL", "127.0.0.1")
+ self.MILVUS_PORT = os.getenv("MILVUS_PORT", "19530")
+ self.MILVUS_USERNAME = os.getenv("MILVUS_USERNAME", None)
+ self.MILVUS_PASSWORD = os.getenv("MILVUS_PASSWORD", None)
+
+
def set_debug_mode(self, value: bool) -> None:
"""Set the debug mode value"""
self.debug_mode = value
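
These settings follow the existing os.getenv pattern, so a deployment can switch stores without touching code; a sketch (note Config is a Singleton, so the variables must be set before the first Config() call):

```python
# Sketch: switching to Milvus purely via environment variables.
import os

os.environ["VECTOR_STORE_TYPE"] = "Milvus"
os.environ["MILVUS_URL"] = "10.0.0.5"  # assumed host
os.environ["MILVUS_PORT"] = "19530"

from pilot.configs.config import Config

cfg = Config()
assert cfg.VECTOR_STORE_TYPE == "Milvus"
assert cfg.MILVUS_URL == "10.0.0.5"
```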
diff --git a/pilot/configs/model_config.py b/pilot/configs/model_config.py
index 6e32daefc..ebd8513e4 100644
--- a/pilot/configs/model_config.py
+++ b/pilot/configs/model_config.py
@@ -47,8 +47,4 @@ ISDEBUG = False
VECTOR_SEARCH_TOP_K = 10
VS_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vs_store")
KNOWLEDGE_UPLOAD_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
-KNOWLEDGE_CHUNK_SPLIT_SIZE = 100
-#vector db type, now provided Chroma and Milvus
-VECTOR_STORE_TYPE = "Milvus"
-#vector db config
-VECTOR_STORE_CONFIG = {"url": "127.0.0.1", "port": "19530"}
+KNOWLEDGE_CHUNK_SPLIT_SIZE = 100
\ No newline at end of file
diff --git a/pilot/source_embedding/knowledge_embedding.py b/pilot/source_embedding/knowledge_embedding.py
index 85db5ab02..cb1fcb504 100644
--- a/pilot/source_embedding/knowledge_embedding.py
+++ b/pilot/source_embedding/knowledge_embedding.py
@@ -3,6 +3,8 @@ import os
from bs4 import BeautifulSoup
from langchain.document_loaders import TextLoader, markdown
from langchain.embeddings import HuggingFaceEmbeddings
+
+from pilot.configs.config import Config
from pilot.configs.model_config import DATASETS_DIR, KNOWLEDGE_CHUNK_SPLIT_SIZE, VECTOR_STORE_TYPE
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
from pilot.source_embedding.csv_embedding import CSVEmbedding
@@ -13,6 +15,7 @@ import markdown
from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
from pilot.vector_store.connector import VectorStoreConnector
+CFG = Config()
class KnowledgeEmbedding:
def __init__(self, file_path, model_name, vector_store_config, local_persist=True):
@@ -53,7 +56,7 @@ class KnowledgeEmbedding:
def knowledge_persist_initialization(self, append_mode):
documents = self._load_knownlege(self.file_path)
- self.vector_client = VectorStoreConnector(VECTOR_STORE_TYPE, self.vector_store_config)
+ self.vector_client = VectorStoreConnector(CFG.VECTOR_STORE_TYPE, self.vector_store_config)
self.vector_client.load_document(documents)
return self.vector_client
diff --git a/pilot/source_embedding/source_embedding.py b/pilot/source_embedding/source_embedding.py
index ddefd4f1e..a84282009 100644
--- a/pilot/source_embedding/source_embedding.py
+++ b/pilot/source_embedding/source_embedding.py
@@ -4,10 +4,12 @@ from abc import ABC, abstractmethod
from langchain.embeddings import HuggingFaceEmbeddings
from typing import List, Optional, Dict
-from pilot.configs.model_config import VECTOR_STORE_TYPE
+
+from pilot.configs.config import Config
from pilot.vector_store.connector import VectorStoreConnector
registered_methods = []
+CFG = Config()
def register(method):
@@ -30,7 +32,7 @@ class SourceEmbedding(ABC):
self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
vector_store_config["embeddings"] = self.embeddings
- self.vector_client = VectorStoreConnector(VECTOR_STORE_TYPE, vector_store_config)
+ self.vector_client = VectorStoreConnector(CFG.VECTOR_STORE_TYPE, vector_store_config)
@abstractmethod
@register
diff --git a/pilot/vector_store/milvus_store.py b/pilot/vector_store/milvus_store.py
index 1c6d6bdbc..a61027850 100644
--- a/pilot/vector_store/milvus_store.py
+++ b/pilot/vector_store/milvus_store.py
@@ -2,11 +2,12 @@ from typing import List, Optional, Iterable, Tuple, Any
from pymilvus import connections, Collection, DataType
-from pilot.configs.model_config import VECTOR_STORE_CONFIG
from langchain.docstore.document import Document
+
+from pilot.configs.config import Config
from pilot.vector_store.vector_store_base import VectorStoreBase
-
+CFG = Config()
class MilvusStore(VectorStoreBase):
"""Milvus database"""
def __init__(self, ctx: {}) -> None:
@@ -18,11 +19,10 @@ class MilvusStore(VectorStoreBase):
# self.configure(cfg)
connect_kwargs = {}
- self.uri = None
- self.uri = ctx.get("url", VECTOR_STORE_CONFIG["url"])
- self.port = ctx.get("port", VECTOR_STORE_CONFIG["port"])
- self.username = ctx.get("username", None)
- self.password = ctx.get("password", None)
+ self.uri = CFG.MILVUS_URL
+ self.port = CFG.MILVUS_PORT
+ self.username = CFG.MILVUS_USERNAME
+ self.password = CFG.MILVUS_PASSWORD
self.collection_name = ctx.get("vector_store_name", None)
self.secure = ctx.get("secure", None)
self.embedding = ctx.get("embeddings", None)
@@ -238,16 +238,6 @@ class MilvusStore(VectorStoreBase):
timeout: Optional[int] = None,
) -> List[str]:
"""add text data into Milvus.
- Args:
- texts (Iterable[str]): The text being embedded and inserted.
- metadatas (Optional[List[dict]], optional): The metadata that
- corresponds to each insert. Defaults to None.
- partition_name (str, optional): The partition of the collection
- to insert data into. Defaults to None.
- timeout: specified timeout.
-
- Returns:
- List[str]: The resulting keys for each inserted element.
"""
insert_dict: Any = {self.text_field: list(texts)}
try:
@@ -279,6 +269,7 @@ class MilvusStore(VectorStoreBase):
self.init_schema_and_load(self.collection_name, documents)
     def similar_search(self, text, topk):
+ """similar_search in vector database."""
self.col = Collection(self.collection_name)
schema = self.col.schema
for x in schema.fields:
@@ -326,7 +317,6 @@ class MilvusStore(VectorStoreBase):
timeout=timeout,
**kwargs,
)
- # Organize results.
ret = []
for result in res[0]:
meta = {x: result.entity.get(x) for x in output_fields}
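
With connection details now read from Config, the ctx passed to MilvusStore shrinks to store-specific keys; a sketch assuming a reachable Milvus instance and the text2vec embedding model:

```python
# Sketch: MilvusStore now picks up MILVUS_URL/MILVUS_PORT from Config().
from langchain.embeddings import HuggingFaceEmbeddings

from pilot.vector_store.milvus_store import MilvusStore

embeddings = HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")
store = MilvusStore({"vector_store_name": "my_kb", "embeddings": embeddings})
docs = store.similar_search("what is milvus?", 5)  # assumes the collection exists
```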
From 74c1f1f7e13be6dec5a772aed84bd435df2521a0 Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Tue, 23 May 2023 22:09:22 +0800
Subject: [PATCH 08/15] update:vector store config
---
pilot/server/webserver.py | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/pilot/server/webserver.py b/pilot/server/webserver.py
index 270eff67f..44c027bdf 100644
--- a/pilot/server/webserver.py
+++ b/pilot/server/webserver.py
@@ -19,8 +19,7 @@ from langchain import PromptTemplate
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(ROOT_PATH)
-from pilot.configs.model_config import DB_SETTINGS, KNOWLEDGE_UPLOAD_ROOT_PATH, LLM_MODEL_CONFIG, VECTOR_SEARCH_TOP_K, \
- VECTOR_STORE_CONFIG
+from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH, LLM_MODEL_CONFIG, VECTOR_SEARCH_TOP_K
from pilot.server.vectordb_qa import KnownLedgeBaseQA
from pilot.connections.mysql import MySQLOperator
from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding
@@ -268,10 +267,8 @@ def http_bot(state, mode, sql_mode, db_selector, temperature, max_new_tokens, re
     skip_echo_len = len(prompt.replace("</s>", " ")) + 1
if mode == conversation_types["custome"] and not db_selector:
- # persist_dir = os.path.join(KNOWLEDGE_UPLOAD_ROOT_PATH, vector_store_name["vs_name"])
- print("vector store type: ", VECTOR_STORE_CONFIG)
print("vector store name: ", vector_store_name["vs_name"])
- vector_store_config = VECTOR_STORE_CONFIG
+        vector_store_config = {}
vector_store_config["vector_store_name"] = vector_store_name["vs_name"]
vector_store_config["text_field"] = "content"
vector_store_config["vector_store_path"] = KNOWLEDGE_UPLOAD_ROOT_PATH
From b0d3d02d205e2ce78a664a1a2bb8d0050a07dca9 Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Tue, 23 May 2023 22:10:15 +0800
Subject: [PATCH 09/15] update:vector store config
---
pilot/source_embedding/knowledge_embedding.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pilot/source_embedding/knowledge_embedding.py b/pilot/source_embedding/knowledge_embedding.py
index cb1fcb504..93fa185a6 100644
--- a/pilot/source_embedding/knowledge_embedding.py
+++ b/pilot/source_embedding/knowledge_embedding.py
@@ -5,7 +5,7 @@ from langchain.document_loaders import TextLoader, markdown
from langchain.embeddings import HuggingFaceEmbeddings
from pilot.configs.config import Config
-from pilot.configs.model_config import DATASETS_DIR, KNOWLEDGE_CHUNK_SPLIT_SIZE, VECTOR_STORE_TYPE
+from pilot.configs.model_config import DATASETS_DIR, KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
from pilot.source_embedding.csv_embedding import CSVEmbedding
from pilot.source_embedding.markdown_embedding import MarkdownEmbedding
From 24d6762d962b53a963b11284345b5a53effa7cb2 Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Tue, 23 May 2023 22:16:08 +0800
Subject: [PATCH 10/15] update:vector store config
---
pilot/server/webserver.py | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/pilot/server/webserver.py b/pilot/server/webserver.py
index 44c027bdf..1ac32ab26 100644
--- a/pilot/server/webserver.py
+++ b/pilot/server/webserver.py
@@ -268,10 +268,8 @@ def http_bot(state, mode, sql_mode, db_selector, temperature, max_new_tokens, re
if mode == conversation_types["custome"] and not db_selector:
print("vector store name: ", vector_store_name["vs_name"])
-        vector_store_config = {}
- vector_store_config["vector_store_name"] = vector_store_name["vs_name"]
- vector_store_config["text_field"] = "content"
- vector_store_config["vector_store_path"] = KNOWLEDGE_UPLOAD_ROOT_PATH
+ vector_store_config = {"vector_store_name": vector_store_name["vs_name"], "text_field": "content",
+ "vector_store_path": KNOWLEDGE_UPLOAD_ROOT_PATH}
knowledge_embedding_client = KnowledgeEmbedding(file_path="", model_name=LLM_MODEL_CONFIG["text2vec"],
local_persist=False,
vector_store_config=vector_store_config)
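
Spelled out, the corrected call path is equivalent to the following sketch, where "my_kb" stands in for vector_store_name["vs_name"]:

```python
# Sketch of the corrected http_bot call path; "my_kb" is a placeholder.
from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH, LLM_MODEL_CONFIG
from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding

vector_store_config = {
    "vector_store_name": "my_kb",
    "text_field": "content",
    "vector_store_path": KNOWLEDGE_UPLOAD_ROOT_PATH,
}
client = KnowledgeEmbedding(
    file_path="",
    model_name=LLM_MODEL_CONFIG["text2vec"],
    local_persist=False,
    vector_store_config=vector_store_config,
)
```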
From 0b92066bf5ad918a207cc3a9c54e85b5de2fc9d4 Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Tue, 23 May 2023 22:39:56 +0800
Subject: [PATCH 11/15] update:requirements
---
requirements.txt | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/requirements.txt b/requirements.txt
index 29e792451..f9becba90 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -61,7 +61,9 @@ gTTS==2.3.1
langchain
nltk
python-dotenv==1.0.0
-pymilvus
+pymilvus==2.2.1
+paddle==2.2
+paddleocr==2.6.1.3
vcrpy
chromadb
markdown2
From 926c9716915ccedaa72883be3f749e9e51abb793 Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Tue, 23 May 2023 22:43:07 +0800
Subject: [PATCH 12/15] update:PDF loader
---
pilot/source_embedding/knowledge_embedding.py | 5 ++---
pilot/source_embedding/pdf_embedding.py | 5 +++--
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/pilot/source_embedding/knowledge_embedding.py b/pilot/source_embedding/knowledge_embedding.py
index 93fa185a6..2f313a35a 100644
--- a/pilot/source_embedding/knowledge_embedding.py
+++ b/pilot/source_embedding/knowledge_embedding.py
@@ -1,7 +1,7 @@
import os
from bs4 import BeautifulSoup
-from langchain.document_loaders import TextLoader, markdown
+from langchain.document_loaders import TextLoader, markdown, PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from pilot.configs.config import Config
@@ -12,7 +12,6 @@ from pilot.source_embedding.markdown_embedding import MarkdownEmbedding
from pilot.source_embedding.pdf_embedding import PDFEmbedding
import markdown
-from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
from pilot.vector_store.connector import VectorStoreConnector
CFG = Config()
@@ -89,7 +88,7 @@ class KnowledgeEmbedding:
docs[i].page_content = docs[i].page_content.replace("\n", " ")
i += 1
elif filename.lower().endswith(".pdf"):
- loader = UnstructuredPaddlePDFLoader(filename)
+ loader = PyPDFLoader(filename)
textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
docs = loader.load_and_split(textsplitter)
i = 0
diff --git a/pilot/source_embedding/pdf_embedding.py b/pilot/source_embedding/pdf_embedding.py
index a8749695b..75d17c4c6 100644
--- a/pilot/source_embedding/pdf_embedding.py
+++ b/pilot/source_embedding/pdf_embedding.py
@@ -2,12 +2,12 @@
# -*- coding: utf-8 -*-
from typing import List
+from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.source_embedding import SourceEmbedding, register
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
-from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
class PDFEmbedding(SourceEmbedding):
@@ -23,7 +23,8 @@ class PDFEmbedding(SourceEmbedding):
@register
def read(self):
"""Load from pdf path."""
- loader = UnstructuredPaddlePDFLoader(self.file_path)
+ # loader = UnstructuredPaddlePDFLoader(self.file_path)
+ loader = PyPDFLoader(self.file_path)
textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
return loader.load_and_split(textsplitter)
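
A short sketch of the new PDF path (the input file name is hypothetical): PyPDFLoader yields one document per page, which CHNDocumentSplitter then re-chunks on Chinese sentence boundaries:

```python
# Sketch of the PyPDFLoader + CHNDocumentSplitter pipeline introduced above.
from langchain.document_loaders import PyPDFLoader

from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter

loader = PyPDFLoader("docs/example.pdf")  # hypothetical input file
splitter = CHNDocumentSplitter(pdf=True, sentence_size=100)  # KNOWLEDGE_CHUNK_SPLIT_SIZE
docs = loader.load_and_split(splitter)
print(len(docs), docs[0].page_content[:80])
```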
From 2bf7ba827b250ac7cc7d407c562aeb2cfef13a64 Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Tue, 23 May 2023 22:59:16 +0800
Subject: [PATCH 13/15] update:requirements
---
requirements.txt | 2 --
1 file changed, 2 deletions(-)
diff --git a/requirements.txt b/requirements.txt
index f9becba90..aea4f00e0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -62,8 +62,6 @@ langchain
nltk
python-dotenv==1.0.0
pymilvus==2.2.1
-paddle==2.2
-paddleocr==2.6.1.3
vcrpy
chromadb
markdown2
From d599a48cbc7a75a0707ac9c6146f64dc826d10e5 Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Tue, 23 May 2023 23:50:12 +0800
Subject: [PATCH 14/15] update:env template
---
.env.template | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/.env.template b/.env.template
index d809a362b..159299f71 100644
--- a/.env.template
+++ b/.env.template
@@ -81,3 +81,14 @@ DENYLISTED_PLUGINS=
#*******************************************************************#
# CHAT_MESSAGES_ENABLED - Enable chat messages (Default: False)
# CHAT_MESSAGES_ENABLED=False
+
+
+#*******************************************************************#
+#** VECTOR STORE SETTINGS **#
+#*******************************************************************#
+VECTOR_STORE_TYPE=Chroma
+MILVUS_URL=127.0.0.1
+MILVUS_PORT=19530
+#MILVUS_USERNAME
+#MILVUS_PASSWORD
+#MILVUS_SECURE=
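
These variables can be read back with python-dotenv, which is already pinned in requirements.txt; a sketch:

```python
# Sketch: loading the new vector store settings from .env.
import os

from dotenv import load_dotenv

load_dotenv()
print(os.getenv("VECTOR_STORE_TYPE", "Chroma"))
print(os.getenv("MILVUS_URL", "127.0.0.1"), os.getenv("MILVUS_PORT", "19530"))
```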
From e6339b06ad1879bb9d032d7721b08ec774cdd277 Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Tue, 23 May 2023 23:52:48 +0800
Subject: [PATCH 15/15] update:env template
---
.env.template | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/.env.template b/.env.template
index 159299f71..3fe762e73 100644
--- a/.env.template
+++ b/.env.template
@@ -87,8 +87,8 @@ DENYLISTED_PLUGINS=
#** VECTOR STORE SETTINGS **#
#*******************************************************************#
VECTOR_STORE_TYPE=Chroma
-MILVUS_URL=127.0.0.1
-MILVUS_PORT=19530
+#MILVUS_URL=127.0.0.1
+#MILVUS_PORT=19530
#MILVUS_USERNAME
#MILVUS_PASSWORD
#MILVUS_SECURE=