mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-07-23 20:26:15 +00:00
doc:update knowledge api
This commit is contained in:
parent
56b32ab094
commit
6ff7ef9da4
@ -8,7 +8,7 @@ msgid ""
|
||||
msgstr ""
|
||||
"Project-Id-Version: DB-GPT 0.3.0\n"
|
||||
"Report-Msgid-Bugs-To: \n"
|
||||
"POT-Creation-Date: 2023-06-14 15:12+0800\n"
|
||||
"POT-Creation-Date: 2023-07-10 16:59+0800\n"
|
||||
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||
"Language: zh_CN\n"
|
||||
@ -19,12 +19,12 @@ msgstr ""
|
||||
"Content-Transfer-Encoding: 8bit\n"
|
||||
"Generated-By: Babel 2.12.1\n"
|
||||
|
||||
#: ../../modules/knowledge.rst:2 ../../modules/knowledge.rst:30
|
||||
#: e98ef6095fc54f8f8dc045cfa1733dc2
|
||||
#: ../../modules/knowledge.rst:2 ../../modules/knowledge.rst:84
|
||||
#: ca36c0ca545c4d70b51fe811a3e7caca
|
||||
msgid "Knowledge"
|
||||
msgstr "知识"
|
||||
|
||||
#: ../../modules/knowledge.rst:4 51340dd2758e42ee8e96c3935de53438
|
||||
#: ../../modules/knowledge.rst:4 37818bc0ace74e008a52dbd838898c87
|
||||
#, fuzzy
|
||||
msgid ""
|
||||
"As the knowledge base is currently the most significant user demand "
|
||||
@ -32,55 +32,64 @@ msgid ""
|
||||
"knowledge bases. At the same time, we also provide multiple knowledge "
|
||||
"base management strategies in this project, such as pdf knowledge,md "
|
||||
"knowledge, txt knowledge, word knowledge, ppt knowledge:"
|
||||
msgstr "由于知识库是当前用户需求最显著的场景,我们原生支持知识库的构建和处理。同时,我们还在本项目中提供了多种知识库管理策略,如:pdf,md "
|
||||
", txt, word, ppt"
|
||||
msgstr ""
|
||||
"由于知识库是当前用户需求最显著的场景,我们原生支持知识库的构建和处理。同时,我们还在本项目中提供了多种知识库管理策略,如:pdf,md , "
|
||||
"txt, word, ppt"
|
||||
|
||||
#: ../../modules/knowledge.rst:7 25eeb187843a4d9baa4d0c0a404eec65
|
||||
#: ../../modules/knowledge.rst:6 ../../modules/knowledge.rst:13
|
||||
#: c92bd129bf5043fd9d6224d245cc9a55
|
||||
#, fuzzy
|
||||
msgid ""
|
||||
"We currently support many document formats: raw text, txt, pdf, md, html,"
|
||||
" doc, ppt, and url."
|
||||
msgstr "当前支持txt, pdf, md, html, doc, ppt, url文档格式"
|
||||
|
||||
#: ../../modules/knowledge.rst:9 eec1169fea7a4a669433c347a4d929a2
|
||||
msgid "**Create your own knowledge repository**"
|
||||
msgstr "创建你自己的知识库"
|
||||
|
||||
#: ../../modules/knowledge.rst:9 bed8a8f08c194ff59a31dc53f67561c1
|
||||
msgid ""
|
||||
"1.Place personal knowledge files or folders in the pilot/datasets "
|
||||
"directory."
|
||||
msgstr "1.将个人知识文件或文件夹放在pilot/datasets目录中。"
|
||||
#: ../../modules/knowledge.rst:11 20565959da0842aa9f5bb3fe8fb37e10
|
||||
msgid "1.prepare"
|
||||
msgstr "准备"
|
||||
|
||||
#: ../../modules/knowledge.rst:11 6e03e1a2799a432f8319c3aaf33e2867
|
||||
msgid ""
|
||||
"We currently support many document formats: txt, pdf, md, html, doc, ppt,"
|
||||
" and url."
|
||||
msgstr "当前支持txt, pdf, md, html, doc, ppt, url文档格式"
|
||||
#: ../../modules/knowledge.rst:15 515555d13e7548deb596d80ea1514bb2
|
||||
msgid "before execution:"
|
||||
msgstr ""
|
||||
|
||||
#: ../../modules/knowledge.rst:13 883ebf16fe7f4e1fbc73ef7430104e79
|
||||
msgid "before execution: python -m spacy download zh_core_web_sm"
|
||||
msgstr "在执行之前请先执行python -m spacy download zh_core_web_sm"
|
||||
|
||||
#: ../../modules/knowledge.rst:15 59f4bfa8c1064391919ce2af69f2d4c9
|
||||
#: ../../modules/knowledge.rst:21 8b790c0c37114dfc8eda4863af9314b4
|
||||
msgid ""
|
||||
"2.Update your .env, set your vector store type, VECTOR_STORE_TYPE=Chroma "
|
||||
"(now only support Chroma and Milvus, if you set Milvus, please set "
|
||||
"MILVUS_URL and MILVUS_PORT)"
|
||||
msgstr "2.更新你的.env,设置你的向量存储类型,VECTOR_STORE_TYPE=Chroma(现在只支持Chroma和Milvus,如果你设置了Milvus,请设置MILVUS_URL和MILVUS_PORT)"
|
||||
|
||||
#: ../../modules/knowledge.rst:18 be600a4d93094045b78a43307dfc8f5f
|
||||
#: ../../modules/knowledge.rst:24 058fa57484a64756ab2650b46f4b33bf
|
||||
msgid ""
|
||||
"3.init Url Type EmbeddingEngine api and embedding your document into "
|
||||
"vector store in your code."
|
||||
msgstr "初始化 Url类型 EmbeddingEngine api, 将url文档embedding向量化到向量数据库 "
|
||||
|
||||
#: ../../modules/knowledge.rst:40 5f255b96abd346479ab3c371393e47dc
|
||||
#, fuzzy
|
||||
msgid "2.Run the knowledge repository script in the tools directory."
|
||||
msgstr "3.在tools目录执行知识入库脚本"
|
||||
|
||||
#: ../../modules/knowledge.rst:20 b27eddbbf6c74993a6653575f57fff18
|
||||
msgid ""
|
||||
"python tools/knowledge_init.py note : --vector_name : your vector store "
|
||||
"name default_value:default"
|
||||
"4.init Document Type EmbeddingEngine api and embedding your document into"
|
||||
" vector store in your code. Document type can be .txt, .pdf, .md, .doc, "
|
||||
".ppt."
|
||||
msgstr ""
|
||||
"初始化 文档型类型 EmbeddingEngine api, 将文档embedding向量化到向量数据库(文档可以是.txt, .pdf, "
|
||||
".md, .html, .doc, .ppt)"
|
||||
|
||||
#: ../../modules/knowledge.rst:23 f32dc12aedc94ffc8fee77a4b6e0ec88
|
||||
#: ../../modules/knowledge.rst:57 d8c85ba7714749269714b03857738f70
|
||||
msgid ""
|
||||
"3.Add the knowledge repository in the interface by entering the name of "
|
||||
"your knowledge repository (if not specified, enter \"default\") so you "
|
||||
"can use it for Q&A based on your knowledge base."
|
||||
msgstr "如果选择新增知识库,在界面上新增知识库输入你的知识库名"
|
||||
"5.init TEXT Type EmbeddingEngine api and embedding your document into "
|
||||
"vector store in your code."
|
||||
msgstr "初始化TEXT类型 EmbeddingEngine api, 将文档embedding向量化到向量数据库"
|
||||
|
||||
#: ../../modules/knowledge.rst:25 5b1412c8beb24784bd2a93fe5c487b7b
|
||||
#: ../../modules/knowledge.rst:73 c59e4650d57e44ae8d967768dddf908a
|
||||
msgid "4.similar search based on your knowledge base. ::"
|
||||
msgstr "在知识库进行相似性搜索"
|
||||
|
||||
#: ../../modules/knowledge.rst:79 f500fcdc791c4286b411819ae9ab3dc6
|
||||
msgid ""
|
||||
"Note that the default vector model used is text2vec-large-chinese (which "
|
||||
"is a large model, so if your personal computer configuration is not "
|
||||
@ -90,9 +99,35 @@ msgstr ""
|
||||
"注意,这里默认向量模型是text2vec-large-chinese(模型比较大,如果个人电脑配置不够建议采用text2vec-base-"
|
||||
"chinese),因此确保需要将模型download下来放到models目录中。"
|
||||
|
||||
#: ../../modules/knowledge.rst:27 67773e32b01c48628c80b6fab8c90146
|
||||
#: ../../modules/knowledge.rst:81 62a5e10a19844ba9955113f5c78cb460
|
||||
msgid ""
|
||||
"`pdf_embedding <./knowledge/pdf_embedding.html>`_: supported pdf "
|
||||
"embedding."
|
||||
msgstr "`pdf_embedding <./knowledge/pdf_embedding.html>`_: supported pdf embedding."
|
||||
|
||||
#~ msgid "before execution: python -m spacy download zh_core_web_sm"
|
||||
#~ msgstr "在执行之前请先执行python -m spacy download zh_core_web_sm"
|
||||
|
||||
#~ msgid "2.Run the knowledge repository script in the tools directory."
|
||||
#~ msgstr "3.在tools目录执行知识入库脚本"
|
||||
|
||||
#~ msgid ""
|
||||
#~ "python tools/knowledge_init.py note : "
|
||||
#~ "--vector_name : your vector store name"
|
||||
#~ " default_value:default"
|
||||
#~ msgstr ""
|
||||
|
||||
#~ msgid ""
|
||||
#~ "3.Add the knowledge repository in the"
|
||||
#~ " interface by entering the name of"
|
||||
#~ " your knowledge repository (if not "
|
||||
#~ "specified, enter \"default\") so you can"
|
||||
#~ " use it for Q&A based on your"
|
||||
#~ " knowledge base."
|
||||
#~ msgstr "如果选择新增知识库,在界面上新增知识库输入你的知识库名"
|
||||
|
||||
#~ msgid ""
|
||||
#~ "1.Place personal knowledge files or "
|
||||
#~ "folders in the pilot/datasets directory."
|
||||
#~ msgstr "1.将个人知识文件或文件夹放在pilot/datasets目录中。"
|
||||
|
||||
|
@ -3,24 +3,78 @@ Knowledge
|
||||
|
||||
| As the knowledge base is currently the most significant user demand scenario, we natively support the construction and processing of knowledge bases. At the same time, we also provide multiple knowledge base management strategies in this project, such as pdf knowledge,md knowledge, txt knowledge, word knowledge, ppt knowledge:
|
||||
|
||||
We currently support many document formats: raw text, txt, pdf, md, html, doc, ppt, and url.
|
||||
|
||||
|
||||
**Create your own knowledge repository**
|
||||
|
||||
1.Place personal knowledge files or folders in the pilot/datasets directory.
|
||||
1.prepare
|
||||
|
||||
We currently support many document formats: txt, pdf, md, html, doc, ppt, and url.
|
||||
We currently support many document formats: raw text, txt, pdf, md, html, doc, ppt, and url.
|
||||
|
||||
before execution: python -m spacy download zh_core_web_sm
|
||||
before execution:
|
||||
|
||||
::
|
||||
|
||||
python -m spacy download zh_core_web_sm
|
||||
|
||||
2.Update your .env, set your vector store type, VECTOR_STORE_TYPE=Chroma
|
||||
(currently only Chroma and Milvus are supported; if you use Milvus, also set MILVUS_URL and MILVUS_PORT)
|
||||
|
||||
2.Run the knowledge repository script in the tools directory.
|
||||
3.Initialize the URL-type EmbeddingEngine API and embed your document into the vector store in your code.
|
||||
|
||||
python tools/knowledge_init.py
|
||||
note : --vector_name : your vector store name default_value:default
|
||||
::
|
||||
|
||||
3.Add the knowledge repository in the interface by entering the name of your knowledge repository (if not specified, enter "default") so you can use it for Q&A based on your knowledge base.
|
||||
url = "https://db-gpt.readthedocs.io/en/latest/getting_started/getting_started.html"
|
||||
embedding_model = "text2vec"
|
||||
vector_store_config = {
|
||||
"vector_store_name": your_name,
|
||||
}
|
||||
embedding_engine = EmbeddingEngine(
|
||||
knowledge_source=url,
|
||||
knowledge_type=KnowledgeType.URL.value,
|
||||
model_name=embedding_model,
|
||||
vector_store_config=vector_store_config)
|
||||
embedding_engine.knowledge_embedding()
|
||||
|
||||
4.Initialize the Document-type EmbeddingEngine API and embed your document into the vector store in your code.
|
||||
Document type can be .txt, .pdf, .md, .doc, .ppt.
|
||||
|
||||
::
|
||||
|
||||
document_path = "your_path/test.md"
|
||||
embedding_model = "text2vec"
|
||||
vector_store_config = {
|
||||
"vector_store_name": your_name,
|
||||
}
|
||||
embedding_engine = EmbeddingEngine(
|
||||
knowledge_source=document_path,
|
||||
knowledge_type=KnowledgeType.DOCUMENT.value,
|
||||
model_name=embedding_model,
|
||||
vector_store_config=vector_store_config)
|
||||
embedding_engine.knowledge_embedding()
|
||||
|
||||
5.Initialize the TEXT-type EmbeddingEngine API and embed your raw text into the vector store in your code.
|
||||
|
||||
::
|
||||
|
||||
raw_text = "a long passage"
|
||||
embedding_model = "text2vec"
|
||||
vector_store_config = {
|
||||
"vector_store_name": your_name,
|
||||
}
|
||||
embedding_engine = EmbeddingEngine(
|
||||
knowledge_source=raw_text,
|
||||
knowledge_type=KnowledgeType.TEXT.value,
|
||||
model_name=embedding_model,
|
||||
vector_store_config=vector_store_config)
|
||||
embedding_engine.knowledge_embedding()
|
||||
|
||||
6.Run a similarity search against your knowledge base.
|
||||
::
|
||||
query = "please introduce the oceanbase"
|
||||
topk = 5
|
||||
docs = embedding_engine.similar_search(query, topk)
|
||||
|
||||
Note that the default vector model is text2vec-large-chinese (a large model; if your computer's resources are insufficient, text2vec-base-chinese is recommended instead). Make sure you download the model and place it in the models directory.
|
||||
|
||||
|
@ -37,7 +37,7 @@ vector_store_config = {
|
||||
|
||||
query = "your query"
|
||||
|
||||
embedding_engine = EmbeddingEngine(file_path="", model_name=LLM_MODEL_CONFIG["text2vec"], vector_store_config=vector_store_config)
|
||||
embedding_engine = EmbeddingEngine(knowledge_source=url, knowledge_type=KnowledgeType.URL.value, model_name=embedding_model, vector_store_config=vector_store_config)
|
||||
|
||||
embedding_engine.similar_search(query, 10)
|
||||
```
|
@ -1,3 +1,4 @@
|
||||
from pilot.embedding_engine import SourceEmbedding, register
|
||||
from pilot.embedding_engine import EmbeddingEngine, KnowledgeType
|
||||
|
||||
__all__ = ["SourceEmbedding", "register"]
|
||||
__all__ = ["SourceEmbedding", "register", "EmbeddingEngine", "KnowledgeType"]
|
||||
|
@ -1,4 +1,5 @@
|
||||
from pilot.embedding_engine.source_embedding import SourceEmbedding, register
|
||||
from pilot.embedding_engine.embedding_engine import EmbeddingEngine
|
||||
from pilot.embedding_engine.knowledge_type import KnowledgeType
|
||||
|
||||
__all__ = ["SourceEmbedding", "register", "EmbeddingEngine"]
|
||||
__all__ = ["SourceEmbedding", "register", "EmbeddingEngine", "KnowledgeType"]
|
||||
|
12
tests/unit/embedding_engine/test_url_embedding.py
Normal file
12
tests/unit/embedding_engine/test_url_embedding.py
Normal file
@ -0,0 +1,12 @@
|
||||
from pilot import EmbeddingEngine, KnowledgeType
|
||||
|
||||
url = "https://db-gpt.readthedocs.io/en/latest/getting_started/getting_started.html"
|
||||
embedding_model = "text2vec"
|
||||
vector_store_config = {
|
||||
"vector_store_name": url.replace(":", ""),
|
||||
}
|
||||
embedding_engine = EmbeddingEngine(knowledge_source=url, knowledge_type=KnowledgeType.URL.value, model_name=embedding_model, vector_store_config=vector_store_config)
|
||||
|
||||
# embedding url content to vector store
|
||||
embedding_engine.knowledge_embedding()
|
||||
|
Loading…
Reference in New Issue
Block a user