feat: knowledge_init.py supports multiple knowledge file paths and can skip some errors in knowledge embedding

This commit is contained in:
FangYin Cheng
2023-07-24 18:39:33 +08:00
parent bd93f28812
commit 426a364c37
2 changed files with 32 additions and 7 deletions

View File

@@ -3,6 +3,7 @@
import argparse
import os
import sys
import traceback
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
@@ -31,7 +32,7 @@ class LocalKnowledgeInit:
self.vector_store_config = vector_store_config
self.model_name = LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL]
def knowledge_persist(self, file_path):
def knowledge_persist(self, file_path: str, skip_wrong_doc: bool = False):
"""knowledge persist"""
docs = []
embedding_engine = None
@@ -44,9 +45,18 @@ class LocalKnowledgeInit:
model_name=self.model_name,
vector_store_config=self.vector_store_config,
)
embedding_engine = ke.init_knowledge_embedding()
doc = ke.read()
docs.extend(doc)
try:
embedding_engine = ke.init_knowledge_embedding()
doc = ke.read()
docs.extend(doc)
except Exception as e:
error_msg = traceback.format_exc()
if skip_wrong_doc:
print(
f"Warning: document file {filename} embedding error, skip it, error message: {error_msg}"
)
else:
raise e
embedding_engine.index_to_store(docs)
print(f"""begin create {self.vector_store_config["vector_store_name"]} space""")
try:
@@ -64,11 +74,24 @@ class LocalKnowledgeInit:
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--vector_name", type=str, default="default")
# TODO https://github.com/csunny/DB-GPT/issues/354
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--vector_name", type=str, default="default", help="Your vector store name"
)
parser.add_argument(
"--file_path", type=str, default=DATASETS_DIR, help="Your document path"
)
parser.add_argument(
"--skip_wrong_doc", type=bool, default=False, help="Skip wrong document"
)
args = parser.parse_args()
vector_name = args.vector_name
store_type = CFG.VECTOR_STORE_TYPE
file_path = args.file_path
skip_wrong_doc = args.skip_wrong_doc
vector_store_config = {
"vector_store_name": vector_name,
"vector_store_type": CFG.VECTOR_STORE_TYPE,
@@ -76,5 +99,5 @@ if __name__ == "__main__":
}
print(vector_store_config)
kv = LocalKnowledgeInit(vector_store_config=vector_store_config)
kv.knowledge_persist(file_path=DATASETS_DIR)
kv.knowledge_persist(file_path=file_path, skip_wrong_doc=skip_wrong_doc)
print("your knowledge embedding success...")