feat: knowledge_init.py supports multiple knowledge file paths and skips some errors in knowledge embedding

This commit is contained in:
FangYin Cheng
2023-07-24 18:39:33 +08:00
parent bd93f28812
commit 426a364c37
2 changed files with 32 additions and 7 deletions

View File

@@ -31,6 +31,8 @@ python tools/knowledge_init.py
```
Optionally, you can run `python tools/knowledge_init.py -h` command to see more usage.
3. Add the knowledge repository in the interface by entering the name of your knowledge repository (if you did not specify one, enter "default"); you can then use it for Q&A based on your knowledge base.
Note that the default vector model is text2vec-large-chinese. It is a large model, so if your computer does not have enough resources to run it, text2vec-base-chinese is recommended instead. In either case, make sure you download the model and place it in the models directory.

View File

@@ -3,6 +3,7 @@
import argparse
import os
import sys
import traceback
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
@@ -31,7 +32,7 @@ class LocalKnowledgeInit:
self.vector_store_config = vector_store_config
self.model_name = LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL]
def knowledge_persist(self, file_path):
def knowledge_persist(self, file_path: str, skip_wrong_doc: bool = False):
"""knowledge persist"""
docs = []
embedding_engine = None
@@ -44,9 +45,18 @@ class LocalKnowledgeInit:
model_name=self.model_name,
vector_store_config=self.vector_store_config,
)
embedding_engine = ke.init_knowledge_embedding()
doc = ke.read()
docs.extend(doc)
try:
embedding_engine = ke.init_knowledge_embedding()
doc = ke.read()
docs.extend(doc)
except Exception as e:
error_msg = traceback.format_exc()
if skip_wrong_doc:
print(
f"Warning: document file {filename} embedding error, skip it, error message: {error_msg}"
)
else:
raise e
embedding_engine.index_to_store(docs)
print(f"""begin create {self.vector_store_config["vector_store_name"]} space""")
try:
@@ -64,11 +74,24 @@ class LocalKnowledgeInit:
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--vector_name", type=str, default="default")
# TODO https://github.com/csunny/DB-GPT/issues/354
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--vector_name", type=str, default="default", help="Your vector store name"
)
parser.add_argument(
"--file_path", type=str, default=DATASETS_DIR, help="Your document path"
)
parser.add_argument(
"--skip_wrong_doc", type=bool, default=False, help="Skip wrong document"
)
args = parser.parse_args()
vector_name = args.vector_name
store_type = CFG.VECTOR_STORE_TYPE
file_path = args.file_path
skip_wrong_doc = args.skip_wrong_doc
vector_store_config = {
"vector_store_name": vector_name,
"vector_store_type": CFG.VECTOR_STORE_TYPE,
@@ -76,5 +99,5 @@ if __name__ == "__main__":
}
print(vector_store_config)
kv = LocalKnowledgeInit(vector_store_config=vector_store_config)
kv.knowledge_persist(file_path=DATASETS_DIR)
kv.knowledge_persist(file_path=file_path, skip_wrong_doc=skip_wrong_doc)
print("your knowledge embedding success...")