feat: Command-line tool with knowledge repository initialization

This commit is contained in:
FangYin Cheng 2023-09-01 18:21:22 +08:00
parent d42afb50a7
commit e5bbd0bd86
9 changed files with 153 additions and 257 deletions

View File

@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: DB-GPT 0.3.0\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-07-31 17:04+0800\n"
"POT-Creation-Date: 2023-09-01 18:16+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: zh_CN\n"
@ -19,11 +19,11 @@ msgstr ""
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
#: ../../modules/knownledge.md:1 b18cf12f806941f3b9d1c13b52d0dfe5
#: ../../modules/knownledge.md:1 ba585bf3ba464c32a156d308f39e65dc
msgid "Knownledge"
msgstr "知识"
#: ../../modules/knownledge.md:3 3fe78b30d3994e4484df41a677614eb2
#: ../../modules/knownledge.md:3 bc5d67c51b004ff8b2d1bbca17fd4aa7
msgid ""
"As the knowledge base is currently the most significant user demand "
"scenario, we natively support the construction and processing of "
@ -31,15 +31,15 @@ msgid ""
"base management strategies in this project, such as:"
msgstr "由于知识库是当前用户需求最显著的场景,我们原生支持知识库的构建和处理。同时,我们还在本项目中提供了多种知识库管理策略,如:"
#: ../../modules/knownledge.md:4 17b2485a12744b5587655201be50e023
#: ../../modules/knownledge.md:4 519f2686500340d191ad5a91eabc7676
msgid "Default built-in knowledge base"
msgstr "默认内置知识库"
#: ../../modules/knownledge.md:5 e137d9916a0a4a0681dbfed5d5a5065f
#: ../../modules/knownledge.md:5 93a25018fc144dfe98fcea0755f2ea94
msgid "Custom addition of knowledge bases"
msgstr "自定义新增知识库"
#: ../../modules/knownledge.md:6 0bca133996d4435b84245f2b53f43d72
#: ../../modules/knownledge.md:6 37359e14b2464b2c9fc4e5621755bb0d
msgid ""
"Various usage scenarios such as constructing knowledge bases through "
"plugin capabilities and web crawling. Users only need to organize the "
@ -47,51 +47,52 @@ msgid ""
"the knowledge base required for the large model."
msgstr "各种使用场景,例如通过插件功能和爬虫构建知识库。用户只需要组织知识文档,并且他们可以使用我们现有的功能来构建大型模型所需的知识库。"
#: ../../modules/knownledge.md:9 7355eed198514efc8e3bc178039b0251
#: ../../modules/knownledge.md:9 656fcb11886546df9e058227d94481b3
msgid "Create your own knowledge repository"
msgstr "创建你自己的知识库"
#: ../../modules/knownledge.md:11 96e0276a5d3047fea5410e9b33c33308
#: ../../modules/knownledge.md:11 37fc3ae2cfe044f8ac61de484bf0653d
msgid ""
"1.Place personal knowledge files or folders in the pilot/datasets "
"directory."
msgstr "1.将个人知识文件或文件夹放在pilot/datasets目录中。"
#: ../../modules/knownledge.md:13 8762c0a463094c19924cdd4b7b1b1ede
#: ../../modules/knownledge.md:13 a675d90485834690bfca68b41a10c085
msgid ""
"We currently support many document formats: txt, pdf, md, html, doc, ppt,"
" and url."
msgstr "当前支持多种文档格式: txt, pdf, md, html, doc, ppt 和 url"
#: ../../modules/knownledge.md:15 752a5e7c623a49439ecf1ce8e6ccca7d
#: ../../modules/knownledge.md:15 f2c25b0536ff4b3191e13f3020e883a6
msgid "before execution:"
msgstr "在执行之前"
#: ../../modules/knownledge.md:22 fbef967557a94b938f8e47497bb43c20
#: ../../modules/knownledge.md:22 65427906c8b54cd699a07ed482251c83
msgid ""
"2.Update your .env, set your vector store type, VECTOR_STORE_TYPE=Chroma "
"(now only support Chroma and Milvus, if you set Milvus, please set "
"MILVUS_URL and MILVUS_PORT)"
msgstr "2.更新你的.env设置你的向量存储类型VECTOR_STORE_TYPE=Chroma(现在只支持Chroma和Milvus如果你设置了Milvus请设置MILVUS_URL和MILVUS_PORT)"
#: ../../modules/knownledge.md:25 f1745e2f17864711a636ecbdd6cb9833
msgid "2.Run the knowledge repository script in the tools directory."
msgstr "2.在tools目录执行知识入库脚本"
#: ../../modules/knownledge.md:25 287ae6ee51cc4b668d99e48b81147d3f
msgid "2.Run the knowledge repository initialization command"
msgstr "2.执行知识库初始化命令"
#: ../../modules/knownledge.md:34 5ae832d038b245a29a9e089f9e169cb0
#: ../../modules/knownledge.md:31 1fe0ac58d8354c7fba782901cb0673d8
msgid ""
"Optionally, you can run `python tools/knowledge_init.py -h` command to "
"see more usage."
msgstr ""
"Optionally, you can run `dbgpt knowledge load --help` command to see more"
" usage."
msgstr "另外,你可以运行 `dbgpt knowledge load --help` 命令来查看更多的用法"
#: ../../modules/knownledge.md:36 4aab02276dfd41819dbd218ecc608326
#: ../../modules/knownledge.md:33 e1607e330195470f9087bd4ffbc6d45d
msgid ""
"3.Add the knowledge repository in the interface by entering the name of "
"your knowledge repository (if not specified, enter \"default\") so you "
"can use it for Q&A based on your knowledge base."
msgstr "3.在界面上新增知识库,输入你的知识库名(如果没有指定,输入\"default\"),这样就可以基于你的知识库进行问答"
#: ../../modules/knownledge.md:38 f990c8495c994aa1beb040ede6b2329a
#: ../../modules/knownledge.md:35 0614e35ccfba42ea9e63881cb481815e
msgid ""
"Note that the default vector model used is text2vec-large-chinese (which "
"is a large model, so if your personal computer configuration is not "
@ -101,3 +102,9 @@ msgstr ""
"注意这里默认向量模型是text2vec-large-chinese(模型比较大如果个人电脑配置不够建议采用text2vec-base-"
"chinese),因此确保需要将模型download下来放到models目录中。"
#~ msgid ""
#~ "Optionally, you can run `python "
#~ "tools/knowledge_init.py -h` command to see "
#~ "more usage."
#~ msgstr ""

View File

@ -22,16 +22,13 @@ python -m spacy download zh_core_web_sm
2.Update your .env, set your vector store type, VECTOR_STORE_TYPE=Chroma
(now only support Chroma and Milvus, if you set Milvus, please set MILVUS_URL and MILVUS_PORT)
2.Run the knowledge repository script in the tools directory.
```
python tools/knowledge_init.py
--vector_name : your vector store name default_value:default
2.Run the knowledge repository initialization command
```bash
dbgpt knowledge load
```
Optionally, you can run `python tools/knowledge_init.py -h` command to see more usage.
Optionally, you can run `dbgpt knowledge load --help` command to see more usage.
3.Add the knowledge repository in the interface by entering the name of your knowledge repository (if not specified, enter "default") so you can use it for Q&A based on your knowledge base.

View File

@ -1,5 +1,6 @@
import click
import functools
import logging
from pilot.model.controller.registry import ModelRegistryClient
from pilot.model.base import WorkerApplyType
@ -13,6 +14,8 @@ from pilot.utils.parameter_utils import EnvArgumentParser
MODEL_CONTROLLER_ADDRESS = "http://127.0.0.1:8000"
logger = logging.getLogger("dbgpt_cli")
@click.group("model")
@click.option(

View File

@ -8,6 +8,15 @@ sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)
logging.basicConfig(
level=logging.WARNING,
encoding="utf-8",
format="%(asctime)s | %(name)s | %(levelname)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("dbgpt_cli")
@click.group()
@click.option(
@ -19,8 +28,7 @@ sys.path.append(
)
@click.version_option()
def cli(log_level: str):
# TODO not working now
logging.basicConfig(level=log_level, encoding="utf-8")
logger.setLevel(logging.getLevelName(log_level.upper()))
def add_command_alias(command, name: str, hidden: bool = False, parent_group=None):
@ -73,6 +81,13 @@ try:
except ImportError as e:
logging.warning(f"Integrating dbgpt model command line tool failed: {e}")
try:
from pilot.server.knowledge._cli.knowledge_cli import knowledge_cli_group
add_command_alias(knowledge_cli_group, name="knowledge", parent_group=cli)
except ImportError as e:
logging.warning(f"Integrating dbgpt knowledge command line tool failed: {e}")
def main():
return cli()

View File

View File

@ -0,0 +1,83 @@
import click
import logging
from pilot.configs.model_config import DATASETS_DIR
API_ADDRESS: str = "http://127.0.0.1:5000"
logger = logging.getLogger("dbgpt_cli")
@click.group("knowledge")
@click.option(
    "--address",
    required=False,
    type=str,
    default=API_ADDRESS,
    show_default=True,
    help="Address of the Api server.",
)
def knowledge_cli_group(address: str):
    """Knowledge command line tool"""
    # The docstring above is what Click prints as the group's help text,
    # so it is intentionally kept short.
    #
    # The chosen address is published through the module-level
    # API_ADDRESS so that sub-commands of this group read the value the
    # user supplied via `--address` (or the default when it is omitted).
    global API_ADDRESS
    API_ADDRESS = address
@knowledge_cli_group.command()
@click.option(
    "--vector_name",
    required=False,
    type=str,
    default="default",
    show_default=True,
    help="Your vector store name",
)
@click.option(
    "--vector_store_type",
    required=False,
    type=str,
    default="Chroma",
    show_default=True,
    help="Vector store type",
)
@click.option(
    "--local_doc_dir",
    required=False,
    type=str,
    default=DATASETS_DIR,
    show_default=True,
    help="Your document directory",
)
@click.option(
    "--skip_wrong_doc",
    required=False,
    type=bool,
    default=False,
    show_default=True,
    help="Skip wrong document",
)
@click.option(
    "--max_workers",
    required=False,
    type=int,
    default=None,
    help="The maximum number of threads that can be used to upload document",
)
def load(
    vector_name: str,
    vector_store_type: str,
    local_doc_dir: str,
    skip_wrong_doc: bool,
    max_workers: int,
):
    """Load your local knowledge to DB-GPT"""
    # NOTE: the docstring above is the text Click shows for
    # `dbgpt knowledge load --help`; keep it a single user-facing sentence
    # and document implementation details in comments instead.
    #
    # Parameters (all filled in by the Click options declared above):
    #   vector_name       - name of the vector store ("default" if omitted).
    #   vector_store_type - vector store type ("Chroma" if omitted).
    #   local_doc_dir     - document directory (DATASETS_DIR if omitted).
    #   skip_wrong_doc    - boolean forwarded to knowledge_init (False if
    #                       omitted).
    #   max_workers       - upload thread limit; None when the option is not
    #                       given, and passed through unchanged.

    # The client module is imported only when the command actually runs
    # (presumably to keep `dbgpt --help` light - confirm with the author).
    from pilot.server.knowledge._cli.knowledge_client import knowledge_init

    # API_ADDRESS is the module-level value set by the parent `knowledge`
    # group callback from its `--address` option. Arguments are passed
    # positionally, in the same order as before.
    knowledge_init(
        API_ADDRESS,
        vector_name,
        vector_store_type,
        local_doc_dir,
        skip_wrong_doc,
        max_workers,
    )

View File

@ -1,6 +1,7 @@
import os
import requests
import json
import logging
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
@ -18,10 +19,12 @@ from pilot.server.knowledge.request.request import DocumentSyncRequest
from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
HTTP_HEADERS = {"Content-Type": "application/json"}
logger = logging.getLogger("dbgpt_cli")
class ApiClient:
def __init__(self, api_address: str) -> None:
self.api_address = api_address
@ -40,9 +43,9 @@ class ApiClient:
def _post(self, url: str, data=None):
if not isinstance(data, dict):
data = data.__dict__
response = requests.post(
urljoin(self.api_address, url), data=json.dumps(data), headers=HTTP_HEADERS
)
url = urljoin(self.api_address, url)
logger.debug(f"Send request to {url}, data: {data}")
response = requests.post(url, data=json.dumps(data), headers=HTTP_HEADERS)
return self._handle_response(response)
@ -55,7 +58,7 @@ class KnowledgeApiClient(ApiClient):
return self._post("/knowledge/space/add", data=request)
except Exception as e:
if "have already named" in str(e):
print(f"Warning: you have already named {request.name}")
logger.warn(f"you have already named {request.name}")
else:
raise e
@ -98,7 +101,6 @@ def knowledge_init(
vector_store_type: str,
local_doc_dir: str,
skip_wrong_doc: bool,
verbose: bool,
max_workers: int = None,
):
client = KnowledgeApiClient(api_address)
@ -109,9 +111,9 @@ def knowledge_init(
space.owner = "DB-GPT"
# Create space
print(f"Create space: {space}")
logger.info(f"Create space: {space}")
client.space_add(space)
print("Create space successfully")
logger.info("Create space successfully")
space_list = client.space_list(KnowledgeSpaceRequest(name=space.name))
if len(space_list) != 1:
raise Exception(f"List space {space.name} error")
@ -121,13 +123,13 @@ def knowledge_init(
def upload(filename: str):
try:
print(f"Begin upload document: {filename} to {space.name}")
logger.info(f"Begin upload document: {filename} to {space.name}")
return client.document_upload(
space.name, filename, KnowledgeType.DOCUMENT.value, filename
)
except Exception as e:
if skip_wrong_doc:
print(f"Warning: {str(e)}")
logger.warn(f"Warning: {str(e)}")
else:
raise e
@ -140,7 +142,7 @@ def knowledge_init(
doc_ids = [r.result() for r in as_completed(tasks)]
doc_ids = list(filter(lambda x: x, doc_ids))
if not doc_ids:
print("Warning: no document to sync")
logger.warn("Warning: no document to sync")
return
print(f"Begin sync document: {doc_ids}")
logger.info(f"Begin sync document: {doc_ids}")
client.document_sync(space.name, DocumentSyncRequest(doc_ids=doc_ids))

View File

@ -1,122 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DB-GPT command line tools.
You can use it for some background management:
- Lots of knowledge document initialization.
- Load the data into the database.
- Show server status
- ...
Maybe move this to pilot module and append to console_scripts in the future.
"""
import sys
import click
import os
sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
)
from pilot.configs.model_config import DATASETS_DIR
API_ADDRESS: str = "http://127.0.0.1:5000"
@click.group()
@click.option(
"--api_address",
required=False,
default="http://127.0.0.1:5000",
type=str,
help="Api server address",
)
@click.version_option()
def cli(api_address: str):
global API_ADDRESS
API_ADDRESS = api_address
@cli.command()
@click.option(
"--vector_name",
required=False,
type=str,
default="default",
help="Your vector store name",
)
@click.option(
"--vector_store_type",
required=False,
type=str,
default="Chroma",
help="Vector store type",
)
@click.option(
"--local_doc_dir",
required=False,
type=str,
default=DATASETS_DIR,
help="Your document directory",
)
@click.option(
"--skip_wrong_doc",
required=False,
type=bool,
default=False,
help="Skip wrong document",
)
@click.option(
"--max_workers",
required=False,
type=int,
default=None,
help="The maximum number of threads that can be used to upload document",
)
@click.option(
"-v",
"--verbose",
required=False,
is_flag=True,
hidden=True,
help="Show debuggging information.",
)
def knowledge(
vector_name: str,
vector_store_type: str,
local_doc_dir: str,
skip_wrong_doc: bool,
max_workers: int,
verbose: bool,
):
"""Knowledge command line tool"""
from tools.cli.knowledge_client import knowledge_init
knowledge_init(
API_ADDRESS,
vector_name,
vector_store_type,
local_doc_dir,
skip_wrong_doc,
verbose,
max_workers,
)
# knowledge command
cli.add_command(knowledge)
# TODO add more command
def main():
return cli()
if __name__ == "__main__":
main()
raise Exception(
"The functionality of this script has been moved to the command line tool `dbgpt`. For details on usage, please execute the command `dbgpt --help`."
)

View File

@ -1,103 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import os
import sys
import traceback
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
from pilot.embedding_engine.knowledge_type import KnowledgeType
from pilot.server.knowledge.service import KnowledgeService
from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
from pilot.configs.config import Config
from pilot.configs.model_config import (
DATASETS_DIR,
LLM_MODEL_CONFIG,
KNOWLEDGE_UPLOAD_ROOT_PATH,
)
from pilot.embedding_engine.embedding_engine import EmbeddingEngine
knowledge_space_service = KnowledgeService()
CFG = Config()
class LocalKnowledgeInit:
embeddings: object = None
def __init__(self, vector_store_config) -> None:
self.vector_store_config = vector_store_config
self.model_name = LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL]
def knowledge_persist(self, file_path: str, skip_wrong_doc: bool = False):
"""knowledge persist"""
docs = []
embedding_engine = None
for root, _, files in os.walk(file_path, topdown=False):
for file in files:
filename = os.path.join(root, file)
ke = EmbeddingEngine(
knowledge_source=filename,
knowledge_type=KnowledgeType.DOCUMENT.value,
model_name=self.model_name,
vector_store_config=self.vector_store_config,
)
try:
embedding_engine = ke.init_knowledge_embedding()
doc = ke.read()
docs.extend(doc)
except Exception as e:
error_msg = traceback.format_exc()
if skip_wrong_doc:
print(
f"Warning: document file {filename} embedding error, skip it, error message: {error_msg}"
)
else:
raise e
embedding_engine.index_to_store(docs)
print(f"""begin create {self.vector_store_config["vector_store_name"]} space""")
try:
space = KnowledgeSpaceRequest
space.name = self.vector_store_config["vector_store_name"]
space.desc = "knowledge_init.py"
space.vector_type = CFG.VECTOR_STORE_TYPE
space.owner = "DB-GPT"
knowledge_space_service.create_knowledge_space(space)
except Exception as e:
if "have already named" in str(e):
print(f"Warning: you have already named {space.name}")
else:
raise e
if __name__ == "__main__":
# TODO https://github.com/csunny/DB-GPT/issues/354
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
raise Exception(
"The functionality of this script has been moved to the command line tool `dbgpt`. For details on usage, please execute the command `dbgpt --help`."
)
parser.add_argument(
"--vector_name", type=str, default="default", help="Your vector store name"
)
parser.add_argument(
"--file_path", type=str, default=DATASETS_DIR, help="Your document path"
)
parser.add_argument(
"--skip_wrong_doc", type=bool, default=False, help="Skip wrong document"
)
args = parser.parse_args()
vector_name = args.vector_name
store_type = CFG.VECTOR_STORE_TYPE
file_path = args.file_path
skip_wrong_doc = args.skip_wrong_doc
vector_store_config = {
"vector_store_name": vector_name,
"vector_store_type": CFG.VECTOR_STORE_TYPE,
"chroma_persist_path": KNOWLEDGE_UPLOAD_ROOT_PATH,
}
print(vector_store_config)
kv = LocalKnowledgeInit(vector_store_config=vector_store_config)
kv.knowledge_persist(file_path=file_path, skip_wrong_doc=skip_wrong_doc)
print("your knowledge embedding success...")