diff --git a/docs/modules/knownledge.md b/docs/modules/knownledge.md index c108920b2..cd42922ed 100644 --- a/docs/modules/knownledge.md +++ b/docs/modules/knownledge.md @@ -31,6 +31,8 @@ python tools/knowledge_init.py ``` +Optionally, run `python tools/knowledge_init.py -h` to see all available command-line options. + 3.Add the knowledge repository in the interface by entering the name of your knowledge repository (if not specified, enter "default") so you can use it for Q&A based on your knowledge base. Note that the default vector model used is text2vec-large-chinese (which is a large model, so if your personal computer configuration is not enough, it is recommended to use text2vec-base-chinese). Therefore, ensure that you download the model and place it in the models directory. \ No newline at end of file diff --git a/tools/knowledge_init.py b/tools/knowledge_init.py index c442de8c9..aca2baf1d 100644 --- a/tools/knowledge_init.py +++ b/tools/knowledge_init.py @@ -3,6 +3,7 @@ import argparse import os import sys +import traceback sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) @@ -31,7 +32,7 @@ class LocalKnowledgeInit: self.vector_store_config = vector_store_config self.model_name = LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL] - def knowledge_persist(self, file_path): + def knowledge_persist(self, file_path: str, skip_wrong_doc: bool = False): """knowledge persist""" docs = [] embedding_engine = None @@ -44,9 +45,18 @@ class LocalKnowledgeInit: model_name=self.model_name, vector_store_config=self.vector_store_config, ) - embedding_engine = ke.init_knowledge_embedding() - doc = ke.read() - docs.extend(doc) + try: + embedding_engine = ke.init_knowledge_embedding() + doc = ke.read() + docs.extend(doc) + except Exception as e: + error_msg = traceback.format_exc() + if skip_wrong_doc: + print( + f"Warning: document file {filename} embedding error, skip it, error message: {error_msg}" + ) + else: + raise e embedding_engine.index_to_store(docs) 
print(f"""begin create {self.vector_store_config["vector_store_name"]} space""") try: @@ -64,11 +74,24 @@ class LocalKnowledgeInit: if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--vector_name", type=str, default="default") + # TODO https://github.com/csunny/DB-GPT/issues/354 + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "--vector_name", type=str, default="default", help="Your vector store name" + ) + parser.add_argument( + "--file_path", type=str, default=DATASETS_DIR, help="Your document path" + ) + parser.add_argument( + "--skip_wrong_doc", action="store_true", help="Skip wrong document" + ) args = parser.parse_args() vector_name = args.vector_name store_type = CFG.VECTOR_STORE_TYPE + file_path = args.file_path + skip_wrong_doc = args.skip_wrong_doc vector_store_config = { "vector_store_name": vector_name, "vector_store_type": CFG.VECTOR_STORE_TYPE, @@ -76,5 +99,5 @@ if __name__ == "__main__": } print(vector_store_config) kv = LocalKnowledgeInit(vector_store_config=vector_store_config) - kv.knowledge_persist(file_path=DATASETS_DIR) + kv.knowledge_persist(file_path=file_path, skip_wrong_doc=skip_wrong_doc) print("your knowledge embedding success...")