mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-09-03 18:17:45 +00:00
feat: knowledge_init.py support multiple knowledge file path and skip some error in knowledge embedding
This commit is contained in:
@@ -31,6 +31,8 @@ python tools/knowledge_init.py
|
|||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Optionally, you can run the `python tools/knowledge_init.py -h` command to see more usage options.
|
||||||
|
|
||||||
3. Add the knowledge repository in the interface by entering the name of your knowledge repository (if not specified, enter "default") so you can use it for Q&A based on your knowledge base.
|
3. Add the knowledge repository in the interface by entering the name of your knowledge repository (if not specified, enter "default") so you can use it for Q&A based on your knowledge base.
|
||||||
|
|
||||||
Note that the default vector model is text2vec-large-chinese. This is a large model, so if your computer's hardware is not powerful enough, it is recommended to use text2vec-base-chinese instead. In either case, make sure you download the model and place it in the models directory.
|
Note that the default vector model is text2vec-large-chinese. This is a large model, so if your computer's hardware is not powerful enough, it is recommended to use text2vec-base-chinese instead. In either case, make sure you download the model and place it in the models directory.
|
@@ -3,6 +3,7 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import traceback
|
||||||
|
|
||||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
|
||||||
|
|
||||||
@@ -31,7 +32,7 @@ class LocalKnowledgeInit:
|
|||||||
self.vector_store_config = vector_store_config
|
self.vector_store_config = vector_store_config
|
||||||
self.model_name = LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL]
|
self.model_name = LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL]
|
||||||
|
|
||||||
def knowledge_persist(self, file_path):
|
def knowledge_persist(self, file_path: str, skip_wrong_doc: bool = False):
|
||||||
"""knowledge persist"""
|
"""knowledge persist"""
|
||||||
docs = []
|
docs = []
|
||||||
embedding_engine = None
|
embedding_engine = None
|
||||||
@@ -44,9 +45,18 @@ class LocalKnowledgeInit:
|
|||||||
model_name=self.model_name,
|
model_name=self.model_name,
|
||||||
vector_store_config=self.vector_store_config,
|
vector_store_config=self.vector_store_config,
|
||||||
)
|
)
|
||||||
embedding_engine = ke.init_knowledge_embedding()
|
try:
|
||||||
doc = ke.read()
|
embedding_engine = ke.init_knowledge_embedding()
|
||||||
docs.extend(doc)
|
doc = ke.read()
|
||||||
|
docs.extend(doc)
|
||||||
|
except Exception as e:
|
||||||
|
error_msg = traceback.format_exc()
|
||||||
|
if skip_wrong_doc:
|
||||||
|
print(
|
||||||
|
f"Warning: document file {filename} embedding error, skip it, error message: {error_msg}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise e
|
||||||
embedding_engine.index_to_store(docs)
|
embedding_engine.index_to_store(docs)
|
||||||
print(f"""begin create {self.vector_store_config["vector_store_name"]} space""")
|
print(f"""begin create {self.vector_store_config["vector_store_name"]} space""")
|
||||||
try:
|
try:
|
||||||
@@ -64,11 +74,24 @@ class LocalKnowledgeInit:
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser()
|
# TODO https://github.com/csunny/DB-GPT/issues/354
|
||||||
parser.add_argument("--vector_name", type=str, default="default")
|
parser = argparse.ArgumentParser(
|
||||||
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--vector_name", type=str, default="default", help="Your vector store name"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--file_path", type=str, default=DATASETS_DIR, help="Your document path"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--skip_wrong_doc", type=bool, default=False, help="Skip wrong document"
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
vector_name = args.vector_name
|
vector_name = args.vector_name
|
||||||
store_type = CFG.VECTOR_STORE_TYPE
|
store_type = CFG.VECTOR_STORE_TYPE
|
||||||
|
file_path = args.file_path
|
||||||
|
skip_wrong_doc = args.skip_wrong_doc
|
||||||
vector_store_config = {
|
vector_store_config = {
|
||||||
"vector_store_name": vector_name,
|
"vector_store_name": vector_name,
|
||||||
"vector_store_type": CFG.VECTOR_STORE_TYPE,
|
"vector_store_type": CFG.VECTOR_STORE_TYPE,
|
||||||
@@ -76,5 +99,5 @@ if __name__ == "__main__":
|
|||||||
}
|
}
|
||||||
print(vector_store_config)
|
print(vector_store_config)
|
||||||
kv = LocalKnowledgeInit(vector_store_config=vector_store_config)
|
kv = LocalKnowledgeInit(vector_store_config=vector_store_config)
|
||||||
kv.knowledge_persist(file_path=DATASETS_DIR)
|
kv.knowledge_persist(file_path=file_path, skip_wrong_doc=skip_wrong_doc)
|
||||||
print("your knowledge embedding success...")
|
print("your knowledge embedding success...")
|
||||||
|
Reference in New Issue
Block a user