mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-07-25 13:06:53 +00:00
feat: Command-line tool with knowledge repository initialization
This commit is contained in:
parent
d42afb50a7
commit
e5bbd0bd86
@ -8,7 +8,7 @@ msgid ""
|
|||||||
msgstr ""
|
msgstr ""
|
||||||
"Project-Id-Version: DB-GPT 0.3.0\n"
|
"Project-Id-Version: DB-GPT 0.3.0\n"
|
||||||
"Report-Msgid-Bugs-To: \n"
|
"Report-Msgid-Bugs-To: \n"
|
||||||
"POT-Creation-Date: 2023-07-31 17:04+0800\n"
|
"POT-Creation-Date: 2023-09-01 18:16+0800\n"
|
||||||
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
"Language: zh_CN\n"
|
"Language: zh_CN\n"
|
||||||
@ -19,11 +19,11 @@ msgstr ""
|
|||||||
"Content-Transfer-Encoding: 8bit\n"
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
"Generated-By: Babel 2.12.1\n"
|
"Generated-By: Babel 2.12.1\n"
|
||||||
|
|
||||||
#: ../../modules/knownledge.md:1 b18cf12f806941f3b9d1c13b52d0dfe5
|
#: ../../modules/knownledge.md:1 ba585bf3ba464c32a156d308f39e65dc
|
||||||
msgid "Knownledge"
|
msgid "Knownledge"
|
||||||
msgstr "知识"
|
msgstr "知识"
|
||||||
|
|
||||||
#: ../../modules/knownledge.md:3 3fe78b30d3994e4484df41a677614eb2
|
#: ../../modules/knownledge.md:3 bc5d67c51b004ff8b2d1bbca17fd4aa7
|
||||||
msgid ""
|
msgid ""
|
||||||
"As the knowledge base is currently the most significant user demand "
|
"As the knowledge base is currently the most significant user demand "
|
||||||
"scenario, we natively support the construction and processing of "
|
"scenario, we natively support the construction and processing of "
|
||||||
@ -31,15 +31,15 @@ msgid ""
|
|||||||
"base management strategies in this project, such as:"
|
"base management strategies in this project, such as:"
|
||||||
msgstr "由于知识库是当前用户需求最显著的场景,我们原生支持知识库的构建和处理。同时,我们还在本项目中提供了多种知识库管理策略,如:"
|
msgstr "由于知识库是当前用户需求最显著的场景,我们原生支持知识库的构建和处理。同时,我们还在本项目中提供了多种知识库管理策略,如:"
|
||||||
|
|
||||||
#: ../../modules/knownledge.md:4 17b2485a12744b5587655201be50e023
|
#: ../../modules/knownledge.md:4 519f2686500340d191ad5a91eabc7676
|
||||||
msgid "Default built-in knowledge base"
|
msgid "Default built-in knowledge base"
|
||||||
msgstr "默认内置知识库"
|
msgstr "默认内置知识库"
|
||||||
|
|
||||||
#: ../../modules/knownledge.md:5 e137d9916a0a4a0681dbfed5d5a5065f
|
#: ../../modules/knownledge.md:5 93a25018fc144dfe98fcea0755f2ea94
|
||||||
msgid "Custom addition of knowledge bases"
|
msgid "Custom addition of knowledge bases"
|
||||||
msgstr "自定义新增知识库"
|
msgstr "自定义新增知识库"
|
||||||
|
|
||||||
#: ../../modules/knownledge.md:6 0bca133996d4435b84245f2b53f43d72
|
#: ../../modules/knownledge.md:6 37359e14b2464b2c9fc4e5621755bb0d
|
||||||
msgid ""
|
msgid ""
|
||||||
"Various usage scenarios such as constructing knowledge bases through "
|
"Various usage scenarios such as constructing knowledge bases through "
|
||||||
"plugin capabilities and web crawling. Users only need to organize the "
|
"plugin capabilities and web crawling. Users only need to organize the "
|
||||||
@ -47,51 +47,52 @@ msgid ""
|
|||||||
"the knowledge base required for the large model."
|
"the knowledge base required for the large model."
|
||||||
msgstr "各种使用场景,例如通过插件功能和爬虫构建知识库。用户只需要组织知识文档,并且他们可以使用我们现有的功能来构建大型模型所需的知识库。"
|
msgstr "各种使用场景,例如通过插件功能和爬虫构建知识库。用户只需要组织知识文档,并且他们可以使用我们现有的功能来构建大型模型所需的知识库。"
|
||||||
|
|
||||||
#: ../../modules/knownledge.md:9 7355eed198514efc8e3bc178039b0251
|
#: ../../modules/knownledge.md:9 656fcb11886546df9e058227d94481b3
|
||||||
msgid "Create your own knowledge repository"
|
msgid "Create your own knowledge repository"
|
||||||
msgstr "创建你自己的知识库"
|
msgstr "创建你自己的知识库"
|
||||||
|
|
||||||
#: ../../modules/knownledge.md:11 96e0276a5d3047fea5410e9b33c33308
|
#: ../../modules/knownledge.md:11 37fc3ae2cfe044f8ac61de484bf0653d
|
||||||
msgid ""
|
msgid ""
|
||||||
"1.Place personal knowledge files or folders in the pilot/datasets "
|
"1.Place personal knowledge files or folders in the pilot/datasets "
|
||||||
"directory."
|
"directory."
|
||||||
msgstr "1.将个人知识文件或文件夹放在pilot/datasets目录中。"
|
msgstr "1.将个人知识文件或文件夹放在pilot/datasets目录中。"
|
||||||
|
|
||||||
#: ../../modules/knownledge.md:13 8762c0a463094c19924cdd4b7b1b1ede
|
#: ../../modules/knownledge.md:13 a675d90485834690bfca68b41a10c085
|
||||||
msgid ""
|
msgid ""
|
||||||
"We currently support many document formats: txt, pdf, md, html, doc, ppt,"
|
"We currently support many document formats: txt, pdf, md, html, doc, ppt,"
|
||||||
" and url."
|
" and url."
|
||||||
msgstr "当前支持txt, pdf, md, doc, ppt, html文档格式"
|
msgstr "当前支持txt, pdf, md, doc, ppt, html文档格式"
|
||||||
|
|
||||||
#: ../../modules/knownledge.md:15 752a5e7c623a49439ecf1ce8e6ccca7d
|
#: ../../modules/knownledge.md:15 f2c25b0536ff4b3191e13f3020e883a6
|
||||||
msgid "before execution:"
|
msgid "before execution:"
|
||||||
msgstr "在执行之前"
|
msgstr "在执行之前"
|
||||||
|
|
||||||
#: ../../modules/knownledge.md:22 fbef967557a94b938f8e47497bb43c20
|
#: ../../modules/knownledge.md:22 65427906c8b54cd699a07ed482251c83
|
||||||
msgid ""
|
msgid ""
|
||||||
"2.Update your .env, set your vector store type, VECTOR_STORE_TYPE=Chroma "
|
"2.Update your .env, set your vector store type, VECTOR_STORE_TYPE=Chroma "
|
||||||
"(now only support Chroma and Milvus, if you set Milvus, please set "
|
"(now only support Chroma and Milvus, if you set Milvus, please set "
|
||||||
"MILVUS_URL and MILVUS_PORT)"
|
"MILVUS_URL and MILVUS_PORT)"
|
||||||
msgstr "2.更新你的.env,设置你的向量存储类型,VECTOR_STORE_TYPE=Chroma(现在只支持Chroma和Milvus,如果你设置了Milvus,请设置MILVUS_URL和MILVUS_PORT)"
|
msgstr "2.更新你的.env,设置你的向量存储类型,VECTOR_STORE_TYPE=Chroma(现在只支持Chroma和Milvus,如果你设置了Milvus,请设置MILVUS_URL和MILVUS_PORT)"
|
||||||
|
|
||||||
#: ../../modules/knownledge.md:25 f1745e2f17864711a636ecbdd6cb9833
|
#: ../../modules/knownledge.md:25 287ae6ee51cc4b668d99e48b81147d3f
|
||||||
msgid "2.Run the knowledge repository script in the tools directory."
|
#, fuzzy
|
||||||
msgstr "2.在tools目录执行知识入库脚本"
|
msgid "2.Run the knowledge repository initialization command"
|
||||||
|
msgstr "2.执行知识入库命令"
|
||||||
|
|
||||||
#: ../../modules/knownledge.md:34 5ae832d038b245a29a9e089f9e169cb0
|
#: ../../modules/knownledge.md:31 1fe0ac58d8354c7fba782901cb0673d8
|
||||||
msgid ""
|
msgid ""
|
||||||
"Optionally, you can run `python tools/knowledge_init.py -h` command to "
|
"Optionally, you can run `dbgpt knowledge load --help` command to see more"
|
||||||
"see more usage."
|
" usage."
|
||||||
msgstr ""
|
msgstr "另外,你可以运行 `dbgpt knowledge load --help` 命令来查看更多的用法"
|
||||||
|
|
||||||
#: ../../modules/knownledge.md:36 4aab02276dfd41819dbd218ecc608326
|
#: ../../modules/knownledge.md:33 e1607e330195470f9087bd4ffbc6d45d
|
||||||
msgid ""
|
msgid ""
|
||||||
"3.Add the knowledge repository in the interface by entering the name of "
|
"3.Add the knowledge repository in the interface by entering the name of "
|
||||||
"your knowledge repository (if not specified, enter \"default\") so you "
|
"your knowledge repository (if not specified, enter \"default\") so you "
|
||||||
"can use it for Q&A based on your knowledge base."
|
"can use it for Q&A based on your knowledge base."
|
||||||
msgstr "如果选择新增知识库,在界面上新增知识库输入你的知识库名"
|
msgstr "如果选择新增知识库,在界面上新增知识库输入你的知识库名"
|
||||||
|
|
||||||
#: ../../modules/knownledge.md:38 f990c8495c994aa1beb040ede6b2329a
|
#: ../../modules/knownledge.md:35 0614e35ccfba42ea9e63881cb481815e
|
||||||
msgid ""
|
msgid ""
|
||||||
"Note that the default vector model used is text2vec-large-chinese (which "
|
"Note that the default vector model used is text2vec-large-chinese (which "
|
||||||
"is a large model, so if your personal computer configuration is not "
|
"is a large model, so if your personal computer configuration is not "
|
||||||
@ -101,3 +102,9 @@ msgstr ""
|
|||||||
"注意,这里默认向量模型是text2vec-large-chinese(模型比较大,如果个人电脑配置不够建议采用text2vec-base-"
|
"注意,这里默认向量模型是text2vec-large-chinese(模型比较大,如果个人电脑配置不够建议采用text2vec-base-"
|
||||||
"chinese),因此确保需要将模型download下来放到models目录中。"
|
"chinese),因此确保需要将模型download下来放到models目录中。"
|
||||||
|
|
||||||
|
#~ msgid ""
|
||||||
|
#~ "Optionally, you can run `python "
|
||||||
|
#~ "tools/knowledge_init.py -h` command to see "
|
||||||
|
#~ "more usage."
|
||||||
|
#~ msgstr ""
|
||||||
|
|
||||||
|
@ -22,16 +22,13 @@ python -m spacy download zh_core_web_sm
|
|||||||
2.Update your .env, set your vector store type, VECTOR_STORE_TYPE=Chroma
|
2.Update your .env, set your vector store type, VECTOR_STORE_TYPE=Chroma
|
||||||
(now only support Chroma and Milvus, if you set Milvus, please set MILVUS_URL and MILVUS_PORT)
|
(now only support Chroma and Milvus, if you set Milvus, please set MILVUS_URL and MILVUS_PORT)
|
||||||
|
|
||||||
2.Run the knowledge repository script in the tools directory.
|
2.Run the knowledge repository initialization command
|
||||||
|
|
||||||
```
|
|
||||||
python tools/knowledge_init.py
|
|
||||||
|
|
||||||
--vector_name : your vector store name default_value:default
|
|
||||||
|
|
||||||
|
```bash
|
||||||
|
dbgpt knowledge load
|
||||||
```
|
```
|
||||||
|
|
||||||
Optionally, you can run `python tools/knowledge_init.py -h` command to see more usage.
|
Optionally, you can run `dbgpt knowledge load --help` command to see more usage.
|
||||||
|
|
||||||
3.Add the knowledge repository in the interface by entering the name of your knowledge repository (if not specified, enter "default") so you can use it for Q&A based on your knowledge base.
|
3.Add the knowledge repository in the interface by entering the name of your knowledge repository (if not specified, enter "default") so you can use it for Q&A based on your knowledge base.
|
||||||
|
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import click
|
import click
|
||||||
import functools
|
import functools
|
||||||
|
import logging
|
||||||
|
|
||||||
from pilot.model.controller.registry import ModelRegistryClient
|
from pilot.model.controller.registry import ModelRegistryClient
|
||||||
from pilot.model.base import WorkerApplyType
|
from pilot.model.base import WorkerApplyType
|
||||||
@ -13,6 +14,8 @@ from pilot.utils.parameter_utils import EnvArgumentParser
|
|||||||
|
|
||||||
MODEL_CONTROLLER_ADDRESS = "http://127.0.0.1:8000"
|
MODEL_CONTROLLER_ADDRESS = "http://127.0.0.1:8000"
|
||||||
|
|
||||||
|
logger = logging.getLogger("dbgpt_cli")
|
||||||
|
|
||||||
|
|
||||||
@click.group("model")
|
@click.group("model")
|
||||||
@click.option(
|
@click.option(
|
||||||
|
@ -8,6 +8,15 @@ sys.path.append(
|
|||||||
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.WARNING,
|
||||||
|
encoding="utf-8",
|
||||||
|
format="%(asctime)s | %(name)s | %(levelname)s | %(message)s",
|
||||||
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger("dbgpt_cli")
|
||||||
|
|
||||||
|
|
||||||
@click.group()
|
@click.group()
|
||||||
@click.option(
|
@click.option(
|
||||||
@ -19,8 +28,7 @@ sys.path.append(
|
|||||||
)
|
)
|
||||||
@click.version_option()
|
@click.version_option()
|
||||||
def cli(log_level: str):
|
def cli(log_level: str):
|
||||||
# TODO not working now
|
logger.setLevel(logging.getLevelName(log_level.upper()))
|
||||||
logging.basicConfig(level=log_level, encoding="utf-8")
|
|
||||||
|
|
||||||
|
|
||||||
def add_command_alias(command, name: str, hidden: bool = False, parent_group=None):
|
def add_command_alias(command, name: str, hidden: bool = False, parent_group=None):
|
||||||
@ -73,6 +81,13 @@ try:
|
|||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
logging.warning(f"Integrating dbgpt model command line tool failed: {e}")
|
logging.warning(f"Integrating dbgpt model command line tool failed: {e}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
from pilot.server.knowledge._cli.knowledge_cli import knowledge_cli_group
|
||||||
|
|
||||||
|
add_command_alias(knowledge_cli_group, name="knowledge", parent_group=cli)
|
||||||
|
except ImportError as e:
|
||||||
|
logging.warning(f"Integrating dbgpt knowledge command line tool failed: {e}")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
return cli()
|
return cli()
|
||||||
|
0
pilot/server/knowledge/_cli/__init__.py
Normal file
0
pilot/server/knowledge/_cli/__init__.py
Normal file
83
pilot/server/knowledge/_cli/knowledge_cli.py
Normal file
83
pilot/server/knowledge/_cli/knowledge_cli.py
Normal file
@ -0,0 +1,83 @@
|
|||||||
|
import click
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from pilot.configs.model_config import DATASETS_DIR
|
||||||
|
|
||||||
|
API_ADDRESS: str = "http://127.0.0.1:5000"
|
||||||
|
|
||||||
|
logger = logging.getLogger("dbgpt_cli")
|
||||||
|
|
||||||
|
|
||||||
|
@click.group("knowledge")
|
||||||
|
@click.option(
|
||||||
|
"--address",
|
||||||
|
type=str,
|
||||||
|
default=API_ADDRESS,
|
||||||
|
required=False,
|
||||||
|
show_default=True,
|
||||||
|
help=("Address of the Api server."),
|
||||||
|
)
|
||||||
|
def knowledge_cli_group(address: str):
|
||||||
|
"""Knowledge command line tool"""
|
||||||
|
global API_ADDRESS
|
||||||
|
API_ADDRESS = address
|
||||||
|
|
||||||
|
|
||||||
|
@knowledge_cli_group.command()
|
||||||
|
@click.option(
|
||||||
|
"--vector_name",
|
||||||
|
required=False,
|
||||||
|
type=str,
|
||||||
|
default="default",
|
||||||
|
show_default=True,
|
||||||
|
help="Your vector store name",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--vector_store_type",
|
||||||
|
required=False,
|
||||||
|
type=str,
|
||||||
|
default="Chroma",
|
||||||
|
show_default=True,
|
||||||
|
help="Vector store type",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--local_doc_dir",
|
||||||
|
required=False,
|
||||||
|
type=str,
|
||||||
|
default=DATASETS_DIR,
|
||||||
|
show_default=True,
|
||||||
|
help="Your document directory",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--skip_wrong_doc",
|
||||||
|
required=False,
|
||||||
|
type=bool,
|
||||||
|
default=False,
|
||||||
|
show_default=True,
|
||||||
|
help="Skip wrong document",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--max_workers",
|
||||||
|
required=False,
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="The maximum number of threads that can be used to upload document",
|
||||||
|
)
|
||||||
|
def load(
|
||||||
|
vector_name: str,
|
||||||
|
vector_store_type: str,
|
||||||
|
local_doc_dir: str,
|
||||||
|
skip_wrong_doc: bool,
|
||||||
|
max_workers: int,
|
||||||
|
):
|
||||||
|
"""Load you local knowledge to DB-GPT"""
|
||||||
|
from pilot.server.knowledge._cli.knowledge_client import knowledge_init
|
||||||
|
|
||||||
|
knowledge_init(
|
||||||
|
API_ADDRESS,
|
||||||
|
vector_name,
|
||||||
|
vector_store_type,
|
||||||
|
local_doc_dir,
|
||||||
|
skip_wrong_doc,
|
||||||
|
max_workers,
|
||||||
|
)
|
@ -1,6 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
import requests
|
import requests
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
|
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
@ -18,10 +19,12 @@ from pilot.server.knowledge.request.request import DocumentSyncRequest
|
|||||||
|
|
||||||
from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
|
from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
|
||||||
|
|
||||||
|
|
||||||
HTTP_HEADERS = {"Content-Type": "application/json"}
|
HTTP_HEADERS = {"Content-Type": "application/json"}
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger("dbgpt_cli")
|
||||||
|
|
||||||
|
|
||||||
class ApiClient:
|
class ApiClient:
|
||||||
def __init__(self, api_address: str) -> None:
|
def __init__(self, api_address: str) -> None:
|
||||||
self.api_address = api_address
|
self.api_address = api_address
|
||||||
@ -40,9 +43,9 @@ class ApiClient:
|
|||||||
def _post(self, url: str, data=None):
|
def _post(self, url: str, data=None):
|
||||||
if not isinstance(data, dict):
|
if not isinstance(data, dict):
|
||||||
data = data.__dict__
|
data = data.__dict__
|
||||||
response = requests.post(
|
url = urljoin(self.api_address, url)
|
||||||
urljoin(self.api_address, url), data=json.dumps(data), headers=HTTP_HEADERS
|
logger.debug(f"Send request to {url}, data: {data}")
|
||||||
)
|
response = requests.post(url, data=json.dumps(data), headers=HTTP_HEADERS)
|
||||||
return self._handle_response(response)
|
return self._handle_response(response)
|
||||||
|
|
||||||
|
|
||||||
@ -55,7 +58,7 @@ class KnowledgeApiClient(ApiClient):
|
|||||||
return self._post("/knowledge/space/add", data=request)
|
return self._post("/knowledge/space/add", data=request)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if "have already named" in str(e):
|
if "have already named" in str(e):
|
||||||
print(f"Warning: you have already named {request.name}")
|
logger.warn(f"you have already named {request.name}")
|
||||||
else:
|
else:
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
@ -98,7 +101,6 @@ def knowledge_init(
|
|||||||
vector_store_type: str,
|
vector_store_type: str,
|
||||||
local_doc_dir: str,
|
local_doc_dir: str,
|
||||||
skip_wrong_doc: bool,
|
skip_wrong_doc: bool,
|
||||||
verbose: bool,
|
|
||||||
max_workers: int = None,
|
max_workers: int = None,
|
||||||
):
|
):
|
||||||
client = KnowledgeApiClient(api_address)
|
client = KnowledgeApiClient(api_address)
|
||||||
@ -109,9 +111,9 @@ def knowledge_init(
|
|||||||
space.owner = "DB-GPT"
|
space.owner = "DB-GPT"
|
||||||
|
|
||||||
# Create space
|
# Create space
|
||||||
print(f"Create space: {space}")
|
logger.info(f"Create space: {space}")
|
||||||
client.space_add(space)
|
client.space_add(space)
|
||||||
print("Create space successfully")
|
logger.info("Create space successfully")
|
||||||
space_list = client.space_list(KnowledgeSpaceRequest(name=space.name))
|
space_list = client.space_list(KnowledgeSpaceRequest(name=space.name))
|
||||||
if len(space_list) != 1:
|
if len(space_list) != 1:
|
||||||
raise Exception(f"List space {space.name} error")
|
raise Exception(f"List space {space.name} error")
|
||||||
@ -121,13 +123,13 @@ def knowledge_init(
|
|||||||
|
|
||||||
def upload(filename: str):
|
def upload(filename: str):
|
||||||
try:
|
try:
|
||||||
print(f"Begin upload document: {filename} to {space.name}")
|
logger.info(f"Begin upload document: {filename} to {space.name}")
|
||||||
return client.document_upload(
|
return client.document_upload(
|
||||||
space.name, filename, KnowledgeType.DOCUMENT.value, filename
|
space.name, filename, KnowledgeType.DOCUMENT.value, filename
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if skip_wrong_doc:
|
if skip_wrong_doc:
|
||||||
print(f"Warning: {str(e)}")
|
logger.warn(f"Warning: {str(e)}")
|
||||||
else:
|
else:
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
@ -140,7 +142,7 @@ def knowledge_init(
|
|||||||
doc_ids = [r.result() for r in as_completed(tasks)]
|
doc_ids = [r.result() for r in as_completed(tasks)]
|
||||||
doc_ids = list(filter(lambda x: x, doc_ids))
|
doc_ids = list(filter(lambda x: x, doc_ids))
|
||||||
if not doc_ids:
|
if not doc_ids:
|
||||||
print("Warning: no document to sync")
|
logger.warn("Warning: no document to sync")
|
||||||
return
|
return
|
||||||
print(f"Begin sync document: {doc_ids}")
|
logger.info(f"Begin sync document: {doc_ids}")
|
||||||
client.document_sync(space.name, DocumentSyncRequest(doc_ids=doc_ids))
|
client.document_sync(space.name, DocumentSyncRequest(doc_ids=doc_ids))
|
@ -1,122 +1,7 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
"""
|
|
||||||
DB-GPT command line tools.
|
|
||||||
|
|
||||||
You can use it for some background management:
|
|
||||||
- Lots of knowledge document initialization.
|
|
||||||
- Load the data into the database.
|
|
||||||
- Show server status
|
|
||||||
- ...
|
|
||||||
|
|
||||||
|
|
||||||
Maybe move this to pilot module and append to console_scripts in the future.
|
|
||||||
|
|
||||||
"""
|
|
||||||
import sys
|
|
||||||
import click
|
|
||||||
import os
|
|
||||||
|
|
||||||
sys.path.append(
|
|
||||||
os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
from pilot.configs.model_config import DATASETS_DIR
|
|
||||||
|
|
||||||
|
|
||||||
API_ADDRESS: str = "http://127.0.0.1:5000"
|
|
||||||
|
|
||||||
|
|
||||||
@click.group()
|
|
||||||
@click.option(
|
|
||||||
"--api_address",
|
|
||||||
required=False,
|
|
||||||
default="http://127.0.0.1:5000",
|
|
||||||
type=str,
|
|
||||||
help="Api server address",
|
|
||||||
)
|
|
||||||
@click.version_option()
|
|
||||||
def cli(api_address: str):
|
|
||||||
global API_ADDRESS
|
|
||||||
API_ADDRESS = api_address
|
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
|
||||||
@click.option(
|
|
||||||
"--vector_name",
|
|
||||||
required=False,
|
|
||||||
type=str,
|
|
||||||
default="default",
|
|
||||||
help="Your vector store name",
|
|
||||||
)
|
|
||||||
@click.option(
|
|
||||||
"--vector_store_type",
|
|
||||||
required=False,
|
|
||||||
type=str,
|
|
||||||
default="Chroma",
|
|
||||||
help="Vector store type",
|
|
||||||
)
|
|
||||||
@click.option(
|
|
||||||
"--local_doc_dir",
|
|
||||||
required=False,
|
|
||||||
type=str,
|
|
||||||
default=DATASETS_DIR,
|
|
||||||
help="Your document directory",
|
|
||||||
)
|
|
||||||
@click.option(
|
|
||||||
"--skip_wrong_doc",
|
|
||||||
required=False,
|
|
||||||
type=bool,
|
|
||||||
default=False,
|
|
||||||
help="Skip wrong document",
|
|
||||||
)
|
|
||||||
@click.option(
|
|
||||||
"--max_workers",
|
|
||||||
required=False,
|
|
||||||
type=int,
|
|
||||||
default=None,
|
|
||||||
help="The maximum number of threads that can be used to upload document",
|
|
||||||
)
|
|
||||||
@click.option(
|
|
||||||
"-v",
|
|
||||||
"--verbose",
|
|
||||||
required=False,
|
|
||||||
is_flag=True,
|
|
||||||
hidden=True,
|
|
||||||
help="Show debuggging information.",
|
|
||||||
)
|
|
||||||
def knowledge(
|
|
||||||
vector_name: str,
|
|
||||||
vector_store_type: str,
|
|
||||||
local_doc_dir: str,
|
|
||||||
skip_wrong_doc: bool,
|
|
||||||
max_workers: int,
|
|
||||||
verbose: bool,
|
|
||||||
):
|
|
||||||
"""Knowledge command line tool"""
|
|
||||||
from tools.cli.knowledge_client import knowledge_init
|
|
||||||
|
|
||||||
knowledge_init(
|
|
||||||
API_ADDRESS,
|
|
||||||
vector_name,
|
|
||||||
vector_store_type,
|
|
||||||
local_doc_dir,
|
|
||||||
skip_wrong_doc,
|
|
||||||
verbose,
|
|
||||||
max_workers,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# knowledge command
|
|
||||||
cli.add_command(knowledge)
|
|
||||||
# TODO add more command
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
return cli()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
raise Exception(
|
||||||
|
"The functionality of this script has been moved to the command line tool `dbgpt`. For details on usage, please execute the command `dbgpt --help`."
|
||||||
|
)
|
||||||
|
@ -1,103 +1,7 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import traceback
|
|
||||||
|
|
||||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
|
|
||||||
|
|
||||||
from pilot.embedding_engine.knowledge_type import KnowledgeType
|
|
||||||
from pilot.server.knowledge.service import KnowledgeService
|
|
||||||
from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
|
|
||||||
|
|
||||||
|
|
||||||
from pilot.configs.config import Config
|
|
||||||
from pilot.configs.model_config import (
|
|
||||||
DATASETS_DIR,
|
|
||||||
LLM_MODEL_CONFIG,
|
|
||||||
KNOWLEDGE_UPLOAD_ROOT_PATH,
|
|
||||||
)
|
|
||||||
from pilot.embedding_engine.embedding_engine import EmbeddingEngine
|
|
||||||
|
|
||||||
knowledge_space_service = KnowledgeService()
|
|
||||||
|
|
||||||
CFG = Config()
|
|
||||||
|
|
||||||
|
|
||||||
class LocalKnowledgeInit:
|
|
||||||
embeddings: object = None
|
|
||||||
|
|
||||||
def __init__(self, vector_store_config) -> None:
|
|
||||||
self.vector_store_config = vector_store_config
|
|
||||||
self.model_name = LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL]
|
|
||||||
|
|
||||||
def knowledge_persist(self, file_path: str, skip_wrong_doc: bool = False):
|
|
||||||
"""knowledge persist"""
|
|
||||||
docs = []
|
|
||||||
embedding_engine = None
|
|
||||||
for root, _, files in os.walk(file_path, topdown=False):
|
|
||||||
for file in files:
|
|
||||||
filename = os.path.join(root, file)
|
|
||||||
ke = EmbeddingEngine(
|
|
||||||
knowledge_source=filename,
|
|
||||||
knowledge_type=KnowledgeType.DOCUMENT.value,
|
|
||||||
model_name=self.model_name,
|
|
||||||
vector_store_config=self.vector_store_config,
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
embedding_engine = ke.init_knowledge_embedding()
|
|
||||||
doc = ke.read()
|
|
||||||
docs.extend(doc)
|
|
||||||
except Exception as e:
|
|
||||||
error_msg = traceback.format_exc()
|
|
||||||
if skip_wrong_doc:
|
|
||||||
print(
|
|
||||||
f"Warning: document file {filename} embedding error, skip it, error message: {error_msg}"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
raise e
|
|
||||||
embedding_engine.index_to_store(docs)
|
|
||||||
print(f"""begin create {self.vector_store_config["vector_store_name"]} space""")
|
|
||||||
try:
|
|
||||||
space = KnowledgeSpaceRequest
|
|
||||||
space.name = self.vector_store_config["vector_store_name"]
|
|
||||||
space.desc = "knowledge_init.py"
|
|
||||||
space.vector_type = CFG.VECTOR_STORE_TYPE
|
|
||||||
space.owner = "DB-GPT"
|
|
||||||
knowledge_space_service.create_knowledge_space(space)
|
|
||||||
except Exception as e:
|
|
||||||
if "have already named" in str(e):
|
|
||||||
print(f"Warning: you have already named {space.name}")
|
|
||||||
else:
|
|
||||||
raise e
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# TODO https://github.com/csunny/DB-GPT/issues/354
|
raise Exception(
|
||||||
parser = argparse.ArgumentParser(
|
"The functionality of this script has been moved to the command line tool `dbgpt`. For details on usage, please execute the command `dbgpt --help`."
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
|
||||||
"--vector_name", type=str, default="default", help="Your vector store name"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--file_path", type=str, default=DATASETS_DIR, help="Your document path"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--skip_wrong_doc", type=bool, default=False, help="Skip wrong document"
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
vector_name = args.vector_name
|
|
||||||
store_type = CFG.VECTOR_STORE_TYPE
|
|
||||||
file_path = args.file_path
|
|
||||||
skip_wrong_doc = args.skip_wrong_doc
|
|
||||||
vector_store_config = {
|
|
||||||
"vector_store_name": vector_name,
|
|
||||||
"vector_store_type": CFG.VECTOR_STORE_TYPE,
|
|
||||||
"chroma_persist_path": KNOWLEDGE_UPLOAD_ROOT_PATH,
|
|
||||||
}
|
|
||||||
print(vector_store_config)
|
|
||||||
kv = LocalKnowledgeInit(vector_store_config=vector_store_config)
|
|
||||||
kv.knowledge_persist(file_path=file_path, skip_wrong_doc=skip_wrong_doc)
|
|
||||||
print("your knowledge embedding success...")
|
|
||||||
|
Loading…
Reference in New Issue
Block a user