diff --git a/docs/locales/zh_CN/LC_MESSAGES/modules/knowledge.po b/docs/locales/zh_CN/LC_MESSAGES/modules/knowledge.po index ecdde43b6..0e2bd4b8f 100644 --- a/docs/locales/zh_CN/LC_MESSAGES/modules/knowledge.po +++ b/docs/locales/zh_CN/LC_MESSAGES/modules/knowledge.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: DB-GPT 0.3.0\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-06-14 15:12+0800\n" +"POT-Creation-Date: 2023-07-10 16:59+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: zh_CN\n" @@ -19,12 +19,12 @@ msgstr "" "Content-Transfer-Encoding: 8bit\n" "Generated-By: Babel 2.12.1\n" -#: ../../modules/knowledge.rst:2 ../../modules/knowledge.rst:30 -#: e98ef6095fc54f8f8dc045cfa1733dc2 +#: ../../modules/knowledge.rst:2 ../../modules/knowledge.rst:84 +#: ca36c0ca545c4d70b51fe811a3e7caca msgid "Knowledge" msgstr "知识" -#: ../../modules/knowledge.rst:4 51340dd2758e42ee8e96c3935de53438 +#: ../../modules/knowledge.rst:4 37818bc0ace74e008a52dbd838898c87 #, fuzzy msgid "" "As the knowledge base is currently the most significant user demand " @@ -32,55 +32,64 @@ msgid "" "knowledge bases. At the same time, we also provide multiple knowledge " "base management strategies in this project, such as pdf knowledge,md " "knowledge, txt knowledge, word knowledge, ppt knowledge:" -msgstr "由于知识库是当前用户需求最显著的场景,我们原生支持知识库的构建和处理。同时,我们还在本项目中提供了多种知识库管理策略,如:pdf,md " -", txt, word, ppt" +msgstr "" +"由于知识库是当前用户需求最显著的场景,我们原生支持知识库的构建和处理。同时,我们还在本项目中提供了多种知识库管理策略,如:pdf,md , " +"txt, word, ppt" -#: ../../modules/knowledge.rst:7 25eeb187843a4d9baa4d0c0a404eec65 +#: ../../modules/knowledge.rst:6 ../../modules/knowledge.rst:13 +#: c92bd129bf5043fd9d6224d245cc9a55 +#, fuzzy +msgid "" +"We currently support many document formats: raw text, txt, pdf, md, html," +" doc, ppt, and url." 
+msgstr "当前支持raw text, txt, pdf, md, html, doc, ppt, url文档格式" + +#: ../../modules/knowledge.rst:9 eec1169fea7a4a669433c347a4d929a2 msgid "**Create your own knowledge repository**" msgstr "创建你自己的知识库" -#: ../../modules/knowledge.rst:9 bed8a8f08c194ff59a31dc53f67561c1 -msgid "" -"1.Place personal knowledge files or folders in the pilot/datasets " -"directory." -msgstr "1.将个人知识文件或文件夹放在pilot/datasets目录中。" +#: ../../modules/knowledge.rst:11 20565959da0842aa9f5bb3fe8fb37e10 +msgid "1.prepare" +msgstr "1.准备" -#: ../../modules/knowledge.rst:11 6e03e1a2799a432f8319c3aaf33e2867 -msgid "" -"We currently support many document formats: txt, pdf, md, html, doc, ppt," -" and url." -msgstr "当前支持txt, pdf, md, html, doc, ppt, url文档格式" +#: ../../modules/knowledge.rst:15 515555d13e7548deb596d80ea1514bb2 +msgid "before execution:" +msgstr "在执行之前请先执行:" -#: ../../modules/knowledge.rst:13 883ebf16fe7f4e1fbc73ef7430104e79 -msgid "before execution: python -m spacy download zh_core_web_sm" -msgstr "在执行之前请先执行python -m spacy download zh_core_web_sm" - -#: ../../modules/knowledge.rst:15 59f4bfa8c1064391919ce2af69f2d4c9 +#: ../../modules/knowledge.rst:21 8b790c0c37114dfc8eda4863af9314b4 msgid "" "2.Update your .env, set your vector store type, VECTOR_STORE_TYPE=Chroma " "(now only support Chroma and Milvus, if you set Milvus, please set " "MILVUS_URL and MILVUS_PORT)" msgstr "2.更新你的.env,设置你的向量存储类型,VECTOR_STORE_TYPE=Chroma(现在只支持Chroma和Milvus,如果你设置了Milvus,请设置MILVUS_URL和MILVUS_PORT)" -#: ../../modules/knowledge.rst:18 be600a4d93094045b78a43307dfc8f5f +#: ../../modules/knowledge.rst:24 058fa57484a64756ab2650b46f4b33bf +msgid "" +"3.init Url Type EmbeddingEngine api and embedding your document into " +"vector store in your code." +msgstr "3.初始化 Url类型 EmbeddingEngine api, 将url文档embedding向量化到向量数据库" + +#: ../../modules/knowledge.rst:40 5f255b96abd346479ab3c371393e47dc #, fuzzy -msgid "2.Run the knowledge repository script in the tools directory." 
-msgstr "3.在tools目录执行知识入库脚本" - -#: ../../modules/knowledge.rst:20 b27eddbbf6c74993a6653575f57fff18 msgid "" -"python tools/knowledge_init.py note : --vector_name : your vector store " -"name default_value:default" +"4.init Document Type EmbeddingEngine api and embedding your document into" +" vector store in your code. Document type can be .txt, .pdf, .md, .doc, " +".ppt." msgstr "" +"4.初始化 文档类型 EmbeddingEngine api, 将文档embedding向量化到向量数据库(文档可以是.txt, .pdf, " +".md, .html, .doc, .ppt)" -#: ../../modules/knowledge.rst:23 f32dc12aedc94ffc8fee77a4b6e0ec88 +#: ../../modules/knowledge.rst:57 d8c85ba7714749269714b03857738f70 msgid "" -"3.Add the knowledge repository in the interface by entering the name of " -"your knowledge repository (if not specified, enter \"default\") so you " -"can use it for Q&A based on your knowledge base." -msgstr "如果选择新增知识库,在界面上新增知识库输入你的知识库名" +"5.init TEXT Type EmbeddingEngine api and embedding your document into " +"vector store in your code." +msgstr "5.初始化 TEXT类型 EmbeddingEngine api, 将文档embedding向量化到向量数据库" -#: ../../modules/knowledge.rst:25 5b1412c8beb24784bd2a93fe5c487b7b +#: ../../modules/knowledge.rst:73 c59e4650d57e44ae8d967768dddf908a +msgid "4.similar search based on your knowledge base. ::" +msgstr "6.在知识库进行相似性搜索" + +#: ../../modules/knowledge.rst:79 f500fcdc791c4286b411819ae9ab3dc6 msgid "" "Note that the default vector model used is text2vec-large-chinese (which " "is a large model, so if your personal computer configuration is not " @@ -90,9 +99,35 @@ msgstr "" "注意,这里默认向量模型是text2vec-large-chinese(模型比较大,如果个人电脑配置不够建议采用text2vec-base-" "chinese),因此确保需要将模型download下来放到models目录中。" -#: ../../modules/knowledge.rst:27 67773e32b01c48628c80b6fab8c90146 +#: ../../modules/knowledge.rst:81 62a5e10a19844ba9955113f5c78cb460 msgid "" "`pdf_embedding <./knowledge/pdf_embedding.html>`_: supported pdf " "embedding." msgstr "pdf_embedding <./knowledge/pdf_embedding.html>`_: supported pdf embedding." 
+#~ msgid "before execution: python -m spacy download zh_core_web_sm" +#~ msgstr "在执行之前请先执行python -m spacy download zh_core_web_sm" + +#~ msgid "2.Run the knowledge repository script in the tools directory." +#~ msgstr "3.在tools目录执行知识入库脚本" + +#~ msgid "" +#~ "python tools/knowledge_init.py note : " +#~ "--vector_name : your vector store name" +#~ " default_value:default" +#~ msgstr "" + +#~ msgid "" +#~ "3.Add the knowledge repository in the" +#~ " interface by entering the name of" +#~ " your knowledge repository (if not " +#~ "specified, enter \"default\") so you can" +#~ " use it for Q&A based on your" +#~ " knowledge base." +#~ msgstr "如果选择新增知识库,在界面上新增知识库输入你的知识库名" + +#~ msgid "" +#~ "1.Place personal knowledge files or " +#~ "folders in the pilot/datasets directory." +#~ msgstr "1.将个人知识文件或文件夹放在pilot/datasets目录中。" + diff --git a/docs/modules/knowledge.rst b/docs/modules/knowledge.rst index 72b97af98..313df1512 100644 --- a/docs/modules/knowledge.rst +++ b/docs/modules/knowledge.rst @@ -3,24 +3,78 @@ Knowledge | As the knowledge base is currently the most significant user demand scenario, we natively support the construction and processing of knowledge bases. At the same time, we also provide multiple knowledge base management strategies in this project, such as pdf knowledge,md knowledge, txt knowledge, word knowledge, ppt knowledge: +We currently support many document formats: raw text, txt, pdf, md, html, doc, ppt, and url. + **Create your own knowledge repository** -1.Place personal knowledge files or folders in the pilot/datasets directory. +1.prepare -We currently support many document formats: txt, pdf, md, html, doc, ppt, and url. +We currently support many document formats: raw text, txt, pdf, md, html, doc, ppt, and url. 
-before execution: python -m spacy download zh_core_web_sm +before execution: + +:: + + python -m spacy download zh_core_web_sm 2.Update your .env, set your vector store type, VECTOR_STORE_TYPE=Chroma (now only support Chroma and Milvus, if you set Milvus, please set MILVUS_URL and MILVUS_PORT) -2.Run the knowledge repository script in the tools directory. +3.init Url Type EmbeddingEngine api and embedding your document into vector store in your code. -python tools/knowledge_init.py -note : --vector_name : your vector store name default_value:default +:: -3.Add the knowledge repository in the interface by entering the name of your knowledge repository (if not specified, enter "default") so you can use it for Q&A based on your knowledge base. + url = "https://db-gpt.readthedocs.io/en/latest/getting_started/getting_started.html" + embedding_model = "text2vec" + vector_store_config = { + "vector_store_name": your_name, + } + embedding_engine = EmbeddingEngine( + knowledge_source=url, + knowledge_type=KnowledgeType.URL.value, + model_name=embedding_model, + vector_store_config=vector_store_config) + embedding_engine.knowledge_embedding() + +4.init Document Type EmbeddingEngine api and embedding your document into vector store in your code. +Document type can be .txt, .pdf, .md, .doc, .ppt. + +:: + + document_path = "your_path/test.md" + embedding_model = "text2vec" + vector_store_config = { + "vector_store_name": your_name, + } + embedding_engine = EmbeddingEngine( + knowledge_source=document_path, + knowledge_type=KnowledgeType.DOCUMENT.value, + model_name=embedding_model, + vector_store_config=vector_store_config) + embedding_engine.knowledge_embedding() + +5.init TEXT Type EmbeddingEngine api and embedding your document into vector store in your code. 
+ +:: + + raw_text = "a long passage" + embedding_model = "text2vec" + vector_store_config = { + "vector_store_name": your_name, + } + embedding_engine = EmbeddingEngine( + knowledge_source=raw_text, + knowledge_type=KnowledgeType.TEXT.value, + model_name=embedding_model, + vector_store_config=vector_store_config) + embedding_engine.knowledge_embedding() + +4.similar search based on your knowledge base. +:: + query = "please introduce the oceanbase" + topk = 5 + docs = embedding_engine.similar_search(query, topk) Note that the default vector model used is text2vec-large-chinese (which is a large model, so if your personal computer configuration is not enough, it is recommended to use text2vec-base-chinese). Therefore, ensure that you download the model and place it in the models directory. diff --git a/docs/use_cases/knownledge_based_qa.md b/docs/use_cases/knownledge_based_qa.md index 0e2731ec0..2cafed421 100644 --- a/docs/use_cases/knownledge_based_qa.md +++ b/docs/use_cases/knownledge_based_qa.md @@ -37,7 +37,7 @@ vector_store_config = { query = "your query" -embedding_engine = EmbeddingEngine(file_path="", model_name=LLM_MODEL_CONFIG["text2vec"], vector_store_config=vector_store_config) +embedding_engine = EmbeddingEngine(knowledge_source=url, knowledge_type=KnowledgeType.URL.value, model_name=embedding_model, vector_store_config=vector_store_config) embedding_engine.similar_search(query, 10) ``` \ No newline at end of file diff --git a/pilot/__init__.py b/pilot/__init__.py index 86aa3585f..d207a5bc3 100644 --- a/pilot/__init__.py +++ b/pilot/__init__.py @@ -1,3 +1,4 @@ from pilot.embedding_engine import SourceEmbedding, register +from pilot.embedding_engine import EmbeddingEngine, KnowledgeType -__all__ = ["SourceEmbedding", "register"] +__all__ = ["SourceEmbedding", "register", "EmbeddingEngine", "KnowledgeType"] diff --git a/pilot/embedding_engine/__init__.py b/pilot/embedding_engine/__init__.py index dbece4638..c12543a1f 100644 --- 
a/pilot/embedding_engine/__init__.py +++ b/pilot/embedding_engine/__init__.py @@ -1,4 +1,5 @@ from pilot.embedding_engine.source_embedding import SourceEmbedding, register from pilot.embedding_engine.embedding_engine import EmbeddingEngine +from pilot.embedding_engine.knowledge_type import KnowledgeType -__all__ = ["SourceEmbedding", "register", "EmbeddingEngine"] +__all__ = ["SourceEmbedding", "register", "EmbeddingEngine", "KnowledgeType"] diff --git a/tests/unit/embedding_engine/test_url_embedding.py b/tests/unit/embedding_engine/test_url_embedding.py new file mode 100644 index 000000000..b281e1004 --- /dev/null +++ b/tests/unit/embedding_engine/test_url_embedding.py @@ -0,0 +1,12 @@ +from pilot import EmbeddingEngine, KnowledgeType + +url = "https://db-gpt.readthedocs.io/en/latest/getting_started/getting_started.html" +embedding_model = "text2vec" +vector_store_config = { + "vector_store_name": url.replace(":", ""), + } +embedding_engine = EmbeddingEngine(knowledge_source=url, knowledge_type=KnowledgeType.URL.value, model_name=embedding_model, vector_store_config=vector_store_config) + +# embedding url content to vector store +embedding_engine.knowledge_embedding() +