doc:update knowledge api

This commit is contained in:
aries_ckt 2023-07-10 17:00:45 +08:00
parent 56b32ab094
commit 6ff7ef9da4
6 changed files with 149 additions and 46 deletions

View File

@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: DB-GPT 0.3.0\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-06-14 15:12+0800\n"
"POT-Creation-Date: 2023-07-10 16:59+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: zh_CN\n"
@ -19,12 +19,12 @@ msgstr ""
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
#: ../../modules/knowledge.rst:2 ../../modules/knowledge.rst:30
#: e98ef6095fc54f8f8dc045cfa1733dc2
#: ../../modules/knowledge.rst:2 ../../modules/knowledge.rst:84
#: ca36c0ca545c4d70b51fe811a3e7caca
msgid "Knowledge"
msgstr "知识"
#: ../../modules/knowledge.rst:4 51340dd2758e42ee8e96c3935de53438
#: ../../modules/knowledge.rst:4 37818bc0ace74e008a52dbd838898c87
#, fuzzy
msgid ""
"As the knowledge base is currently the most significant user demand "
@ -32,55 +32,64 @@ msgid ""
"knowledge bases. At the same time, we also provide multiple knowledge "
"base management strategies in this project, such as pdf knowledge,md "
"knowledge, txt knowledge, word knowledge, ppt knowledge:"
msgstr "由于知识库是当前用户需求最显著的场景,我们原生支持知识库的构建和处理。同时,我们还在本项目中提供了多种知识库管理策略,如:pdf,md "
", txt, word, ppt"
msgstr ""
"由于知识库是当前用户需求最显著的场景,我们原生支持知识库的构建和处理。同时,我们还在本项目中提供了多种知识库管理策略,如:pdf, md, "
"txt, word, ppt"
#: ../../modules/knowledge.rst:7 25eeb187843a4d9baa4d0c0a404eec65
#: ../../modules/knowledge.rst:6 ../../modules/knowledge.rst:13
#: c92bd129bf5043fd9d6224d245cc9a55
#, fuzzy
msgid ""
"We currently support many document formats: raw text, txt, pdf, md, html,"
" doc, ppt, and url."
msgstr "当前支持txt, pdf, md, html, doc, ppt, url文档格式"
#: ../../modules/knowledge.rst:9 eec1169fea7a4a669433c347a4d929a2
msgid "**Create your own knowledge repository**"
msgstr "创建你自己的知识库"
#: ../../modules/knowledge.rst:9 bed8a8f08c194ff59a31dc53f67561c1
msgid ""
"1.Place personal knowledge files or folders in the pilot/datasets "
"directory."
msgstr "1.将个人知识文件或文件夹放在pilot/datasets目录中。"
#: ../../modules/knowledge.rst:11 20565959da0842aa9f5bb3fe8fb37e10
msgid "1.prepare"
msgstr "准备"
#: ../../modules/knowledge.rst:11 6e03e1a2799a432f8319c3aaf33e2867
msgid ""
"We currently support many document formats: txt, pdf, md, html, doc, ppt,"
" and url."
msgstr "当前支持txt, pdf, md, html, doc, ppt, url文档格式"
#: ../../modules/knowledge.rst:15 515555d13e7548deb596d80ea1514bb2
msgid "before execution:"
msgstr "在执行之前请先执行:"
#: ../../modules/knowledge.rst:13 883ebf16fe7f4e1fbc73ef7430104e79
msgid "before execution: python -m spacy download zh_core_web_sm"
msgstr "在执行之前请先执行python -m spacy download zh_core_web_sm"
#: ../../modules/knowledge.rst:15 59f4bfa8c1064391919ce2af69f2d4c9
#: ../../modules/knowledge.rst:21 8b790c0c37114dfc8eda4863af9314b4
msgid ""
"2.Update your .env, set your vector store type, VECTOR_STORE_TYPE=Chroma "
"(now only support Chroma and Milvus, if you set Milvus, please set "
"MILVUS_URL and MILVUS_PORT)"
msgstr "2.更新你的.env设置你的向量存储类型VECTOR_STORE_TYPE=Chroma(现在只支持Chroma和Milvus如果你设置了Milvus请设置MILVUS_URL和MILVUS_PORT)"
#: ../../modules/knowledge.rst:18 be600a4d93094045b78a43307dfc8f5f
#: ../../modules/knowledge.rst:24 058fa57484a64756ab2650b46f4b33bf
msgid ""
"3.init Url Type EmbeddingEngine api and embedding your document into "
"vector store in your code."
msgstr "初始化 Url类型 EmbeddingEngine api 将url文档embedding向量化到向量数据库 "
#: ../../modules/knowledge.rst:40 5f255b96abd346479ab3c371393e47dc
#, fuzzy
msgid "2.Run the knowledge repository script in the tools directory."
msgstr "3.在tools目录执行知识入库脚本"
#: ../../modules/knowledge.rst:20 b27eddbbf6c74993a6653575f57fff18
msgid ""
"python tools/knowledge_init.py note : --vector_name : your vector store "
"name default_value:default"
"4.init Document Type EmbeddingEngine api and embedding your document into"
" vector store in your code. Document type can be .txt, .pdf, .md, .doc, "
".ppt."
msgstr ""
"初始化 文档型类型 EmbeddingEngine api 将文档embedding向量化到向量数据库(文档可以是.txt, .pdf, "
".md, .html, .doc, .ppt)"
#: ../../modules/knowledge.rst:23 f32dc12aedc94ffc8fee77a4b6e0ec88
#: ../../modules/knowledge.rst:57 d8c85ba7714749269714b03857738f70
msgid ""
"3.Add the knowledge repository in the interface by entering the name of "
"your knowledge repository (if not specified, enter \"default\") so you "
"can use it for Q&A based on your knowledge base."
msgstr "如果选择新增知识库,在界面上新增知识库输入你的知识库名"
"5.init TEXT Type EmbeddingEngine api and embedding your document into "
"vector store in your code."
msgstr "初始化TEXT类型 EmbeddingEngine api 将文档embedding向量化到向量数据库"
#: ../../modules/knowledge.rst:25 5b1412c8beb24784bd2a93fe5c487b7b
#: ../../modules/knowledge.rst:73 c59e4650d57e44ae8d967768dddf908a
msgid "4.similar search based on your knowledge base. ::"
msgstr "在知识库进行相似性搜索"
#: ../../modules/knowledge.rst:79 f500fcdc791c4286b411819ae9ab3dc6
msgid ""
"Note that the default vector model used is text2vec-large-chinese (which "
"is a large model, so if your personal computer configuration is not "
@ -90,9 +99,35 @@ msgstr ""
"注意这里默认向量模型是text2vec-large-chinese(模型比较大如果个人电脑配置不够建议采用text2vec-base-"
"chinese),因此确保需要将模型download下来放到models目录中。"
#: ../../modules/knowledge.rst:27 67773e32b01c48628c80b6fab8c90146
#: ../../modules/knowledge.rst:81 62a5e10a19844ba9955113f5c78cb460
msgid ""
"`pdf_embedding <./knowledge/pdf_embedding.html>`_: supported pdf "
"embedding."
msgstr "`pdf_embedding <./knowledge/pdf_embedding.html>`_: supported pdf embedding."
#~ msgid "before execution: python -m spacy download zh_core_web_sm"
#~ msgstr "在执行之前请先执行python -m spacy download zh_core_web_sm"
#~ msgid "2.Run the knowledge repository script in the tools directory."
#~ msgstr "3.在tools目录执行知识入库脚本"
#~ msgid ""
#~ "python tools/knowledge_init.py note : "
#~ "--vector_name : your vector store name"
#~ " default_value:default"
#~ msgstr ""
#~ msgid ""
#~ "3.Add the knowledge repository in the"
#~ " interface by entering the name of"
#~ " your knowledge repository (if not "
#~ "specified, enter \"default\") so you can"
#~ " use it for Q&A based on your"
#~ " knowledge base."
#~ msgstr "如果选择新增知识库,在界面上新增知识库输入你的知识库名"
#~ msgid ""
#~ "1.Place personal knowledge files or "
#~ "folders in the pilot/datasets directory."
#~ msgstr "1.将个人知识文件或文件夹放在pilot/datasets目录中。"

View File

@ -3,24 +3,78 @@ Knowledge
| As the knowledge base is currently the most significant user demand scenario, we natively support the construction and processing of knowledge bases. At the same time, we also provide multiple knowledge base management strategies in this project, such as pdf knowledge,md knowledge, txt knowledge, word knowledge, ppt knowledge:
We currently support many document formats: raw text, txt, pdf, md, html, doc, ppt, and url.
**Create your own knowledge repository**
1.Place personal knowledge files or folders in the pilot/datasets directory.
1.prepare
We currently support many document formats: txt, pdf, md, html, doc, ppt, and url.
We currently support many document formats: raw text, txt, pdf, md, html, doc, ppt, and url.
before execution: python -m spacy download zh_core_web_sm
before execution:
::
python -m spacy download zh_core_web_sm
2.Update your .env, set your vector store type, VECTOR_STORE_TYPE=Chroma
(now only support Chroma and Milvus, if you set Milvus, please set MILVUS_URL and MILVUS_PORT)
2.Run the knowledge repository script in the tools directory.
3.init Url Type EmbeddingEngine api and embedding your document into vector store in your code.
python tools/knowledge_init.py
note : --vector_name : your vector store name default_value:default
::
3.Add the knowledge repository in the interface by entering the name of your knowledge repository (if not specified, enter "default") so you can use it for Q&A based on your knowledge base.
url = "https://db-gpt.readthedocs.io/en/latest/getting_started/getting_started.html"
embedding_model = "text2vec"
vector_store_config = {
"vector_store_name": your_name,
}
embedding_engine = EmbeddingEngine(
knowledge_source=url,
knowledge_type=KnowledgeType.URL.value,
model_name=embedding_model,
vector_store_config=vector_store_config)
embedding_engine.knowledge_embedding()
4.init Document Type EmbeddingEngine api and embedding your document into vector store in your code.
Document type can be .txt, .pdf, .md, .doc, .ppt.
::
document_path = "your_path/test.md"
embedding_model = "text2vec"
vector_store_config = {
"vector_store_name": your_name,
}
embedding_engine = EmbeddingEngine(
knowledge_source=document_path,
knowledge_type=KnowledgeType.DOCUMENT.value,
model_name=embedding_model,
vector_store_config=vector_store_config)
embedding_engine.knowledge_embedding()
5.init TEXT Type EmbeddingEngine api and embedding your document into vector store in your code.
::
raw_text = "a long passage"
embedding_model = "text2vec"
vector_store_config = {
"vector_store_name": your_name,
}
embedding_engine = EmbeddingEngine(
knowledge_source=raw_text,
knowledge_type=KnowledgeType.TEXT.value,
model_name=embedding_model,
vector_store_config=vector_store_config)
embedding_engine.knowledge_embedding()
6.similar search based on your knowledge base.
::
query = "please introduce the oceanbase"
topk = 5
docs = embedding_engine.similar_search(query, topk)
Note that the default vector model used is text2vec-large-chinese (which is a large model, so if your personal computer configuration is not enough, it is recommended to use text2vec-base-chinese). Therefore, ensure that you download the model and place it in the models directory.

View File

@ -37,7 +37,7 @@ vector_store_config = {
query = "your query"
embedding_engine = EmbeddingEngine(file_path="", model_name=LLM_MODEL_CONFIG["text2vec"], vector_store_config=vector_store_config)
embedding_engine = EmbeddingEngine(knowledge_source=url, knowledge_type=KnowledgeType.URL.value, model_name=embedding_model, vector_store_config=vector_store_config)
embedding_engine.similar_search(query, 10)
```

View File

@ -1,3 +1,4 @@
from pilot.embedding_engine import SourceEmbedding, register
from pilot.embedding_engine import EmbeddingEngine, KnowledgeType
__all__ = ["SourceEmbedding", "register"]
__all__ = ["SourceEmbedding", "register", "EmbeddingEngine", "KnowledgeType"]

View File

@ -1,4 +1,5 @@
from pilot.embedding_engine.source_embedding import SourceEmbedding, register
from pilot.embedding_engine.embedding_engine import EmbeddingEngine
from pilot.embedding_engine.knowledge_type import KnowledgeType
__all__ = ["SourceEmbedding", "register", "EmbeddingEngine"]
__all__ = ["SourceEmbedding", "register", "EmbeddingEngine", "KnowledgeType"]

View File

@ -0,0 +1,12 @@
from pilot import EmbeddingEngine, KnowledgeType
# Example: embed the content of a web page into a vector store.

# Page to ingest, and the embedding model name handed to the engine.
url = "https://db-gpt.readthedocs.io/en/latest/getting_started/getting_started.html"
embedding_model = "text2vec"

# The store name is the URL with every ":" removed.
vector_store_config = {"vector_store_name": url.replace(":", "")}

embedding_engine = EmbeddingEngine(
    knowledge_source=url,
    knowledge_type=KnowledgeType.URL.value,
    model_name=embedding_model,
    vector_store_config=vector_store_config,
)

# Embed the URL content into the vector store.
embedding_engine.knowledge_embedding()