feat: knowledge_init.py supports multiple knowledge file paths and skips some errors in knowledge embedding

This commit is contained in:
FangYin Cheng
2023-07-24 18:39:33 +08:00
parent bd93f28812
commit 426a364c37
2 changed files with 32 additions and 7 deletions

View File

@@ -31,6 +31,8 @@ python tools/knowledge_init.py
``` ```
Optionally, you can run the `python tools/knowledge_init.py -h` command to see more usage options.
3. Add the knowledge repository in the interface by entering the name of your knowledge repository (if not specified, enter "default") so you can use it for Q&A based on your knowledge base. 3. Add the knowledge repository in the interface by entering the name of your knowledge repository (if not specified, enter "default") so you can use it for Q&A based on your knowledge base.
Note that the default vector model used is text2vec-large-chinese (which is a large model, so if your computer's hardware is not powerful enough, it is recommended to use text2vec-base-chinese instead). Therefore, ensure that you download the model and place it in the models directory. Note that the default vector model used is text2vec-large-chinese (which is a large model, so if your computer's hardware is not powerful enough, it is recommended to use text2vec-base-chinese instead). Therefore, ensure that you download the model and place it in the models directory.

View File

@@ -3,6 +3,7 @@
import argparse import argparse
import os import os
import sys import sys
import traceback
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
@@ -31,7 +32,7 @@ class LocalKnowledgeInit:
self.vector_store_config = vector_store_config self.vector_store_config = vector_store_config
self.model_name = LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL] self.model_name = LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL]
def knowledge_persist(self, file_path): def knowledge_persist(self, file_path: str, skip_wrong_doc: bool = False):
"""knowledge persist""" """knowledge persist"""
docs = [] docs = []
embedding_engine = None embedding_engine = None
@@ -44,9 +45,18 @@ class LocalKnowledgeInit:
model_name=self.model_name, model_name=self.model_name,
vector_store_config=self.vector_store_config, vector_store_config=self.vector_store_config,
) )
embedding_engine = ke.init_knowledge_embedding() try:
doc = ke.read() embedding_engine = ke.init_knowledge_embedding()
docs.extend(doc) doc = ke.read()
docs.extend(doc)
except Exception as e:
error_msg = traceback.format_exc()
if skip_wrong_doc:
print(
f"Warning: document file {filename} embedding error, skip it, error message: {error_msg}"
)
else:
raise e
embedding_engine.index_to_store(docs) embedding_engine.index_to_store(docs)
print(f"""begin create {self.vector_store_config["vector_store_name"]} space""") print(f"""begin create {self.vector_store_config["vector_store_name"]} space""")
try: try:
@@ -64,11 +74,24 @@ class LocalKnowledgeInit:
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() # TODO https://github.com/csunny/DB-GPT/issues/354
parser.add_argument("--vector_name", type=str, default="default") parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--vector_name", type=str, default="default", help="Your vector store name"
)
parser.add_argument(
"--file_path", type=str, default=DATASETS_DIR, help="Your document path"
)
parser.add_argument(
"--skip_wrong_doc", type=bool, default=False, help="Skip wrong document"
)
args = parser.parse_args() args = parser.parse_args()
vector_name = args.vector_name vector_name = args.vector_name
store_type = CFG.VECTOR_STORE_TYPE store_type = CFG.VECTOR_STORE_TYPE
file_path = args.file_path
skip_wrong_doc = args.skip_wrong_doc
vector_store_config = { vector_store_config = {
"vector_store_name": vector_name, "vector_store_name": vector_name,
"vector_store_type": CFG.VECTOR_STORE_TYPE, "vector_store_type": CFG.VECTOR_STORE_TYPE,
@@ -76,5 +99,5 @@ if __name__ == "__main__":
} }
print(vector_store_config) print(vector_store_config)
kv = LocalKnowledgeInit(vector_store_config=vector_store_config) kv = LocalKnowledgeInit(vector_store_config=vector_store_config)
kv.knowledge_persist(file_path=DATASETS_DIR) kv.knowledge_persist(file_path=file_path, skip_wrong_doc=skip_wrong_doc)
print("your knowledge embedding success...") print("your knowledge embedding success...")