mirror of https://github.com/hwchase17/langchain.git
Add Vearch vectorstore (#9846)
---------
Co-authored-by: zhanghexian1 <zhanghexian1@jd.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
e93240f023
commit
62fa2bc518
413 docs/extras/integrations/vectorstores/vearch.ipynb Normal file
@ -0,0 +1,413 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/export/anaconda3/envs/langchainGLM6B/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"INFO 2023-08-28 18:26:07,485-1d: \n",
"loading model config\n",
"llm device: cuda\n",
"embedding device: cuda\n",
"dir: /data/zhx/zhx/langchain-ChatGLM_new\n",
"flagging username: e2fc35b8e87c4de18d692e951a5f7c46\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading checkpoint shards: 100%|██████████| 7/7 [00:06<00:00, 1.01it/s]\n"
]
}
],
"source": [
"import os, sys, torch\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel\n",
"from langchain import HuggingFacePipeline, ConversationChain\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain.vectorstores.vearch import VearchDb\n",
"from langchain.document_loaders import TextLoader\n",
"from langchain.prompts import PromptTemplate\n",
"from langchain.chains import RetrievalQA\n",
"from langchain.embeddings.huggingface import HuggingFaceEmbeddings\n",
"\n",
"# your local model path\n",
"model_path = \"/data/zhx/zhx/langchain-ChatGLM_new/chatglm2-6b\"\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n",
"model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda(0)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Human: 你好!\n",
"ChatGLM:你好👋!我是人工智能助手 ChatGLM2-6B,很高兴见到你,欢迎问我任何问题。\n",
"\n",
"Human: 你知道凌波微步吗,你知道都有谁学会了吗?\n",
"ChatGLM:凌波微步是一种步伐,最早出自于《倚天屠龙记》。在小说中,灭绝师太曾因与练习凌波微步的杨过的恩怨纠葛,而留下了一部经书,内容是记载凌波微步的起源和作用。后来,凌波微步便成为杨过和小龙女的感情象征。在现实生活中,凌波微步是一句口号,是清华大学学生社团“模型社”的社训。\n",
"\n"
]
}
],
"source": [
"query = \"你好!\"\n",
"response, history = model.chat(tokenizer, query, history=[])\n",
"print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n",
"query = \"你知道凌波微步吗,你知道都有谁学会了吗?\"\n",
"response, history = model.chat(tokenizer, query, history=history)\n",
"print(f\"Human: {query}\\nChatGLM:{response}\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO 2023-08-28 18:27:36,037-1d: Load pretrained SentenceTransformer: /data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese\n",
"WARNING 2023-08-28 18:27:36,038-1d: No sentence-transformers model found with name /data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese. Creating a new one with MEAN pooling.\n",
"INFO 2023-08-28 18:27:38,936-1d: Use pytorch device: cuda\n"
]
}
],
"source": [
"# Add your local knowledge files\n",
"file_path = \"/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt\"  # your local file path\n",
"loader = TextLoader(file_path, encoding=\"utf-8\")\n",
"documents = loader.load()\n",
"\n",
"# split the text into chunks and embed them\n",
"text_splitter = RecursiveCharacterTextSplitter(\n",
"    chunk_size=500, chunk_overlap=100)\n",
"texts = text_splitter.split_documents(documents)\n",
"\n",
"# your embedding model path\n",
"embedding_path = '/data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese'\n",
"embeddings = HuggingFaceEmbeddings(model_name=embedding_path)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Batches: 100%|██████████| 1/1 [00:00<00:00, 4.56it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"['7aae36236f784105a0004d8ff3c7c3ad', '7e495d4e5962497db2080e84d52e75ed', '9a640124fc324a8abb0eaa31acb638b7']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# first add your documents into the vearch vectorstore\n",
"vearch_db = VearchDb.from_documents(texts, embeddings, table_name=\"your_table_name\", metadata_path=\"/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/your_table_name\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Batches: 100%|██████████| 1/1 [00:00<00:00, 22.49it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"####################第1段相关文档####################\n",
"\n",
"午饭过后,段誉又练“凌波微步”,走一步,吸一口气,走第二步时将气呼出,六十四卦走完,四肢全无麻痹之感,料想呼吸顺畅,便无害处。第二次再走时连走两步吸一口气,再走两步始行呼出。这“凌波微步”是以动功修习内功,脚步踏遍六十四卦一个周天,内息自然而然地也转了一个周天。因此他每走一遍,内力便有一分进益。\n",
"\n",
"这般练了几天,“凌波微步”已走得颇为纯熟,不须再数呼吸,纵然疾行,气息也已无所窒滞。心意既畅,跨步时渐渐想到《洛神赋》中那些与“凌波微步”有关的句子:“仿佛兮若轻云之蔽月,飘飘兮若流风之回雪”,“竦轻躯以鹤立,若将飞而未翔”,“体迅飞凫,飘忽若神”,“动无常则,若危若安。进止难期,若往若还”。\n",
"\n",
"\n",
"\n",
"百度简介\n",
"\n",
"凌波微步是「逍遥派」独门轻功身法,精妙异常。\n",
"\n",
"凌波微步乃是一门极上乘的轻功,所以列于卷轴之末,以易经八八六十四卦为基础,使用者按特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。此步法精妙异常,原是要待人练成「北冥神功」,吸人内力,自身内力已【颇为深厚】之后再练。\n",
"\n",
"####################第2段相关文档####################\n",
"\n",
"《天龙八部》第五回 微步縠纹生\n",
"\n",
"卷轴中此外诸种经脉修习之法甚多,皆是取人内力的法门,段誉虽自语宽解,总觉习之有违本性,单是贪多务得,便非好事,当下暂不理会。\n",
"\n",
"卷到卷轴末端,又见到了“凌波微步”那四字,登时便想起《洛神赋》中那些句子来:“凌波微步,罗袜生尘……转眄流精,光润玉颜。含辞未吐,气若幽兰。华容婀娜,令我忘餐。”曹子建那些千古名句,在脑海中缓缓流过:“秾纤得衷,修短合度,肩若削成,腰如约素。延颈秀项,皓质呈露。芳泽无加,铅华弗御。云髻峨峨,修眉连娟。丹唇外朗,皓齿内鲜。明眸善睐,靥辅承权。瑰姿艳逸,仪静体闲。柔情绰态,媚于语言……”这些句子用在木婉清身上,“这话倒也有理”;但如用之于神仙姊姊,只怕更为适合。想到神仙姊姊的姿容体态,“皎若太阳升朝霞,灼若芙蓉出绿波”,但觉依她吩咐行事,实为人生至乐,心想:“我先来练这‘凌波微步’,此乃逃命之妙法,非害人之手段也,练之有百利而无一害。”\n",
"\n",
"####################第3段相关文档####################\n",
"\n",
"《天龙八部》第二回 玉壁月华明\n",
"\n",
"再展帛卷,长卷上源源皆是裸女画像,或立或卧,或现前胸,或见后背。人像的面容都是一般,但或喜或愁,或含情凝眸,或轻嗔薄怒,神情各异。一共有三十六幅图像,每幅像上均有颜色细线,注明穴道部位及练功法诀。\n",
"\n",
"帛卷尽处题着“凌波微步”四字,其后绘的是无数足印,注明“妇妹”、“无妄”等等字样,尽是《易经》中的方位。段誉前几日还正全心全意地钻研《易经》,一见到这些名称,登时精神大振,便似遇到故交良友一般。只见足印密密麻麻,不知有几千百个,自一个足印至另一个足印均有绿线贯串,线上绘有箭头,最后写着一行字道:“步法神妙,保身避敌,待积内力,再取敌命。”\n",
"\n",
"段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\n",
"卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。\n",
"\n",
"********ChatGLM:凌波微步是一种轻功身法,属于逍遥派独门轻功。它以《易经》中的六十四卦为基础,按照特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。凌波微步精妙异常,可以让人内力相助,自身内力颇为深厚之后再练。《天龙八部》第五回中有描述。\n",
"\n"
]
}
],
"source": [
"query = \"你知道凌波微步吗,你知道都有谁会凌波微步?\"\n",
"res = vearch_db.similarity_search(query, 3)\n",
"for idx, tmp in enumerate(res):\n",
"    print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n",
"\n",
"# combine your local knowledge and the query\n",
"context = \"\".join([tmp.page_content for tmp in res])\n",
"new_query = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context} \\n 回答用户这个问题:{query}\\n\\n\"\n",
"response, history = model.chat(tokenizer, new_query, history=[])\n",
"print(f\"********ChatGLM:{response}\\n\")\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Human: 你知道vearch是什么吗?\n",
"ChatGLM:是的,我知道 Vearch。Vearch 是一种矩阵分解 technique,用于将矩阵分解为若干个不可约矩阵的乘积。它是由 Linus Torvalds 开发的,旨在提高 Linux 内核中矩阵操作的性能。\n",
"\n",
"Vearch 可以通过使用特殊的操作来对矩阵进行操作,从而避免了使用昂贵的矩阵操作库。它也被广泛用于其他操作系统中,如 FreeBSD 和 Solaris。\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Batches: 100%|██████████| 1/1 [00:00<00:00, 31.59it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"['04bc84fff5074b7b8990441e92e6df07', 'e221906153bb4e03bc7095dadea144de', '126034ba51934093920d8732860f340b']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"text/plain": [
"['04bc84fff5074b7b8990441e92e6df07',\n",
" 'e221906153bb4e03bc7095dadea144de',\n",
" '126034ba51934093920d8732860f340b']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query = \"你知道vearch是什么吗?\"\n",
"response, history = model.chat(tokenizer, query, history=history)\n",
"print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n",
"\n",
"vearch_info = [\"Vearch 是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量,可用于基于个人知识库的大模型应用\",\n",
"               \"Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库\",\n",
"               \"vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装\"]\n",
"vearch_source = [{'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'}, {'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'}, {'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'}]\n",
"vearch_db.add_texts(vearch_info, vearch_source)\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Batches: 100%|██████████| 1/1 [00:00<00:00, 25.57it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"####################第1段相关文档####################\n",
"\n",
"Vearch 是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量,可用于基于个人知识库的大模型应用\n",
"\n",
"####################第2段相关文档####################\n",
"\n",
"Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库\n",
"\n",
"####################第3段相关文档####################\n",
"\n",
"vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装\n",
"\n",
"***************ChatGLM:是的,Varch是一个向量数据库,旨在存储和快速搜索模型embedding后的向量。它支持OpenAI、Llama和ChatGLM等模型,并可以直接通过pip安装。Varch是一个基于C语言和Go语言开发的项目,并提供了Python接口。\n",
"\n"
]
}
],
"source": [
"query3 = \"你知道vearch是什么吗?\"\n",
"res1 = vearch_db.similarity_search(query3, 3)\n",
"for idx, tmp in enumerate(res1):\n",
"    print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n",
"\n",
"context1 = \"\".join([tmp.page_content for tmp in res1])\n",
"new_query1 = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context1} \\n 回答用户这个问题:{query3}\\n\\n\"\n",
"response, history = model.chat(tokenizer, new_query1, history=[])\n",
"\n",
"print(f\"***************ChatGLM:{response}\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"delete docid True\n",
"Human: 你知道vearch是什么吗?\n",
"ChatGLM:Vearch是一种高分子化合物,也称为聚合物、高分子材料或合成材料。它是由重复单元组成的大型聚合物,通常由一些重复单元组成,这些单元在聚合过程中结合在一起形成一个连续的高分子链。\n",
"\n",
"Vearch具有许多独特的性质,例如高强度、高刚性、耐磨、耐腐蚀、耐高温等。它们通常用于制造各种应用,例如塑料制品、橡胶、纤维、建筑材料等。\n",
"\n",
"after delete docid to query again: {}\n",
"get existed docid {'7aae36236f784105a0004d8ff3c7c3ad': Document(page_content='《天龙八部》第二回 玉壁月华明\\n\\n再展帛卷,长卷上源源皆是裸女画像,或立或卧,或现前胸,或见后背。人像的面容都是一般,但或喜或愁,或含情凝眸,或轻嗔薄怒,神情各异。一共有三十六幅图像,每幅像上均有颜色细线,注明穴道部位及练功法诀。\\n\\n帛卷尽处题着“凌波微步”四字,其后绘的是无数足印,注明“妇妹”、“无妄”等等字样,尽是《易经》中的方位。段誉前几日还正全心全意地钻研《易经》,一见到这些名称,登时精神大振,便似遇到故交良友一般。只见足印密密麻麻,不知有几千百个,自一个足印至另一个足印均有绿线贯串,线上绘有箭头,最后写着一行字道:“步法神妙,保身避敌,待积内力,再取敌命。”\\n\\n段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\\n卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), '7e495d4e5962497db2080e84d52e75ed': Document(page_content='《天龙八部》第五回 微步縠纹生\\n\\n卷轴中此外诸种经脉修习之法甚多,皆是取人内力的法门,段誉虽自语宽解,总觉习之有违本性,单是贪多务得,便非好事,当下暂不理会。\\n\\n卷到卷轴末端,又见到了“凌波微步”那四字,登时便想起《洛神赋》中那些句子来:“凌波微步,罗袜生尘……转眄流精,光润玉颜。含辞未吐,气若幽兰。华容婀娜,令我忘餐。”曹子建那些千古名句,在脑海中缓缓流过:“秾纤得衷,修短合度,肩若削成,腰如约素。延颈秀项,皓质呈露。芳泽无加,铅华弗御。云髻峨峨,修眉连娟。丹唇外朗,皓齿内鲜。明眸善睐,靥辅承权。瑰姿艳逸,仪静体闲。柔情绰态,媚于语言……”这些句子用在木婉清身上,“这话倒也有理”;但如用之于神仙姊姊,只怕更为适合。想到神仙姊姊的姿容体态,“皎若太阳升朝霞,灼若芙蓉出绿波”,但觉依她吩咐行事,实为人生至乐,心想:“我先来练这‘凌波微步’,此乃逃命之妙法,非害人之手段也,练之有百利而无一害。”', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'})}\n"
]
}
],
"source": [
"# delete and get need you to maintain the docids yourself\n",
"# your docids\n",
"res_d = vearch_db.delete(['04bc84fff5074b7b8990441e92e6df07', 'e221906153bb4e03bc7095dadea144de', '126034ba51934093920d8732860f340b'])\n",
"print(\"delete docid\", res_d)\n",
"query = \"你知道vearch是什么吗?\"\n",
"response, history = model.chat(tokenizer, query, history=[])\n",
"print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n",
"get_id_doc = vearch_db.get(['04bc84fff5074b7b8990441e92e6df07'])\n",
"print(\"after delete docid to query again:\", get_id_doc)\n",
"get_delete_doc = vearch_db.get(['7aae36236f784105a0004d8ff3c7c3ad', '7e495d4e5962497db2080e84d52e75ed'])\n",
"print(\"get existed docid\", get_delete_doc)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.12 ('langchainGLM6B')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "1fd24e7ef183310e43cbf656d21568350c6a30580b6df7fe3b34654b3770f74d"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
15 docs/integrations/vearch.md Normal file
@ -0,0 +1,15 @@
# Vearch

Vearch is a scalable distributed system for efficient similarity search of deep learning vectors.

## Installation and Setup

The Vearch Python SDK enables Vearch to be used locally. It can be installed with `pip install vearch`.

## Vectorstore

Vearch can also be used as a vectorstore; most details are in [this notebook](docs/extras/integrations/vectorstores/vearch.ipynb).

```python
from langchain.vectorstores.vearch import VearchDb
```
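For orientation, here is a minimal end-to-end sketch condensed from the notebook in this commit; the knowledge file, embedding model name, table name, and `metadata_path` below are placeholders, not fixed values:

```python
from langchain.document_loaders import TextLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.vearch import VearchDb

# Load a local knowledge file and split it into overlapping chunks.
documents = TextLoader("knowledge.txt", encoding="utf-8").load()
texts = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=100
).split_documents(documents)

# Embed the chunks and build the store; Vearch persists the table
# schema and logs under metadata_path.
embeddings = HuggingFaceEmbeddings(model_name="your-embedding-model")
vearch_db = VearchDb.from_documents(
    texts,
    embeddings,
    table_name="demo_table",
    metadata_path="./vearch_meta",
)

# Retrieve the 3 chunks most similar to a query.
docs = vearch_db.similarity_search("your question", 3)
```

Since `metadata_path` is where the engine writes its schema and log files, point it at a writable directory; `VearchDb.load_local(embedding, table_name, metadata_path)` can reopen the same table later.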
401 libs/langchain/langchain/vectorstores/vearch.py Normal file
@ -0,0 +1,401 @@
from __future__ import annotations

import os
import time
import uuid
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type

import numpy as np

from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VectorStore

if TYPE_CHECKING:
    import vearch

DEFAULT_TOPN = 4


class VearchDb(VectorStore):
    _DEFAULT_TABLE_NAME = "langchain_vearch"

    def __init__(
        self,
        embedding_function: Embeddings,
        table_name: str = _DEFAULT_TABLE_NAME,
        metadata_path: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize vearch vector store."""
        try:
            import vearch
        except ImportError:
            raise ValueError(
                "Could not import vearch python package. "
                "Please install it with `pip install vearch`."
            )

        if metadata_path is None:
            metadata_path = os.getcwd().replace("\\", "/")
        if not os.path.isdir(metadata_path):
            os.makedirs(metadata_path)
        log_path = os.path.join(metadata_path, "log")
        if not os.path.isdir(log_path):
            os.makedirs(log_path)
        self.vearch_engine = vearch.Engine(metadata_path, log_path)

        if not table_name:
            table_name = self._DEFAULT_TABLE_NAME
            table_name += "_"
            table_name += str(uuid.uuid4()).split("-")[-1]
        self.using_table_name = table_name
        self.using_metapath = metadata_path
        self.embedding_func = embedding_function

    @property
    def embeddings(self) -> Optional[Embeddings]:
        return self.embedding_func

    @classmethod
    def from_documents(
        cls: Type[VearchDb],
        documents: List[Document],
        embedding: Embeddings,
        table_name: str = "langchain_vearch",
        metadata_path: Optional[str] = None,
        **kwargs: Any,
    ) -> VearchDb:
        """Return Vearch VectorStore."""
        texts = [d.page_content for d in documents]
        metadatas = [d.metadata for d in documents]

        return cls.from_texts(
            texts=texts,
            embedding=embedding,
            metadatas=metadatas,
            table_name=table_name,
            metadata_path=metadata_path,
            **kwargs,
        )

    @classmethod
    def from_texts(
        cls: Type[VearchDb],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        table_name: str = _DEFAULT_TABLE_NAME,
        metadata_path: Optional[str] = None,
        **kwargs: Any,
    ) -> VearchDb:
        """Return Vearch VectorStore."""
        vearch_db = cls(
            embedding_function=embedding,
            table_name=table_name,
            metadata_path=metadata_path,
        )
        vearch_db.add_texts(texts=texts, metadatas=metadatas)
        return vearch_db

    def _create_table(
        self,
        dim: int = 1024,
        filed_list: List[dict] = [
            {"filed": "text", "type": "str"},
            {"filed": "metadata", "type": "str"},
        ],
    ) -> int:
        """Create a VectorStore table.

        Args:
            dim: dimension of the vectors.
            filed_list: the fields you want to store.
        Return:
            code, 0 for success, non-zero for failure.
        """
        # vearch is only imported under TYPE_CHECKING at module level and
        # lazily inside __init__, so import it again here for runtime use.
        import vearch

        type_dict = {"int": vearch.dataType.INT, "str": vearch.dataType.STRING}
        engine_info = {
            "index_size": 10000,
            "retrieval_type": "IVFPQ",
            "retrieval_param": {"ncentroids": 2048, "nsubvector": 32},
        }
        fields = [
            vearch.GammaFieldInfo(fi["filed"], type_dict[fi["type"]])
            for fi in filed_list
        ]
        vector_field = vearch.GammaVectorInfo(
            name="text_embedding",
            type=vearch.dataType.VECTOR,
            is_index=True,
            dimension=dim,
            model_id="",
            store_type="MemoryOnly",
            store_param={"cache_size": 10000},
            has_source=False,
        )
        response_code = self.vearch_engine.create_table(
            engine_info,
            name=self.using_table_name,
            fields=fields,
            vector_field=vector_field,
        )
        return response_code

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """
        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        embeddings = None
        if self.embedding_func is not None:
            embeddings = self.embedding_func.embed_documents(list(texts))
        table_path = os.path.join(
            self.using_metapath, self.using_table_name + ".schema"
        )
        if not os.path.exists(table_path):
            if embeddings is None:
                raise ValueError("embeddings is None")
            dim = len(embeddings[0])
            response_code = self._create_table(dim)
            if response_code:
                raise ValueError("create table failed!!!")
        if embeddings is not None and metadatas is not None:
            doc_items = []
            for text, metadata, embed in zip(texts, metadatas, embeddings):
                profiles: dict[str, Any] = {}
                profiles["text"] = text
                profiles["metadata"] = metadata["source"]
                profiles["text_embedding"] = embed
                doc_items.append(profiles)

            docid = self.vearch_engine.add(doc_items)
            t_time = 0
            # adding is asynchronous: wait (up to ~3s) until every id is back
            while len(docid) != len(embeddings):
                time.sleep(0.5)
                if t_time > 6:
                    break
                t_time += 1
            self.vearch_engine.dump()
        return docid

    def _load(self) -> None:
        """Load the vearch engine."""
        self.vearch_engine.load()

    @classmethod
    def load_local(
        cls,
        embedding: Embeddings,
        table_name: str = _DEFAULT_TABLE_NAME,
        metadata_path: Optional[str] = None,
        **kwargs: Any,
    ) -> VearchDb:
        """Load the specified local table.

        Returns:
            A VearchDb wrapping the loaded table.
        """
        if not metadata_path:
            raise ValueError("No metadata path!!!")
        if not table_name:
            raise ValueError("No table name!!!")
        table_path = os.path.join(metadata_path, table_name + ".schema")
        if not os.path.exists(table_path):
            raise ValueError("vearch vectorbase table not exist!!!")
        vearch_db = cls(
            embedding_function=embedding,
            table_name=table_name,
            metadata_path=metadata_path,
        )
        vearch_db._load()
        return vearch_db

    def similarity_search(
        self,
        query: str,
        k: int = DEFAULT_TOPN,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to query."""
        if self.vearch_engine is None:
            raise ValueError("Vearch engine is None!!!")
        if self.embedding_func is None:
            raise ValueError("embedding_func is None!!!")
        embeddings = self.embedding_func.embed_query(query)
        docs = self.similarity_search_by_vector(embeddings, k)
        return docs

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = DEFAULT_TOPN,
        **kwargs: Any,
    ) -> List[Document]:
        """Return the k most similar documents to the given embedding.

        Args:
            embedding: embedding vector of the query.
            k: the number of most similar documents to return.
        Returns:
            The k most similar documents to the query vector.
        """
        query_data = {
            "vector": [
                {
                    "field": "text_embedding",
                    "feature": np.array(embedding),
                }
            ],
            "fields": [],
            "is_brute_search": 1,
            "retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20},
            "topn": k,
        }
        query_result = self.vearch_engine.search(query_data)
        docs = []
        for item in query_result[0]["result_items"]:
            content = ""
            meta_data = {}
            for item_key in item:
                if item_key == "text":
                    content = item[item_key]
                    continue
                if item_key == "metadata":
                    meta_data["source"] = item[item_key]
                    continue
            docs.append(Document(page_content=content, metadata=meta_data))
        return docs

    def similarity_search_with_score(
        self,
        query: str,
        k: int = DEFAULT_TOPN,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return the k most similar documents to the query, with scores.

        Args:
            query: text of the query.
            k: the number of most similar documents to return.
        Returns:
            Tuples of (document, score) for the k most similar documents.
            0 is dissimilar, 1 is the most similar.
        """
        if self.embedding_func is None:
            raise ValueError("embedding_func is None!!!")
        embeddings = self.embedding_func.embed_query(query)
        query_data = {
            "vector": [
                {
                    "field": "text_embedding",
                    "feature": np.array(embeddings),
                }
            ],
            "fields": [],
            "is_brute_search": 1,
            "retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20},
            "topn": k,
        }
        query_result = self.vearch_engine.search(query_data)
        results: List[Tuple[Document, float]] = []
        for item in query_result[0]["result_items"]:
            content = ""
            meta_data = {}
            for item_key in item:
                if item_key == "text":
                    content = item[item_key]
                    continue
                if item_key == "metadata":
                    meta_data["source"] = item[item_key]
                    continue
                if item_key == "score":
                    score = item[item_key]
                    continue
            tmp_res = (Document(page_content=content, metadata=meta_data), score)
            results.append(tmp_res)
        return results

    def _similarity_search_with_relevance_scores(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        return self.similarity_search_with_score(query, k, **kwargs)

    def delete(
        self,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Optional[bool]:
        """Delete the documents which have the specified ids.

        Args:
            ids: The ids of the embedding vectors.
            **kwargs: Other keyword arguments that subclasses might use.
        Returns:
            Optional[bool]: True if deletion is successful.
            False otherwise, None if not implemented.
        """
        if self.vearch_engine is None:
            raise ValueError("Vearch Engine is None!!!")
        ret: Optional[bool] = None
        tmp_res = []
        if ids is None or len(ids) == 0:
            return ret
        for _id in ids:
            ret = self.vearch_engine.del_doc(_id)
            tmp_res.append(ret)
        # del_doc returns 0 per id on success
        ret = all(i == 0 for i in tmp_res)
        return ret

    def get(
        self,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Dict[str, Document]:
        """Return docs according to the given ids.

        Args:
            ids: The ids of the embedding vectors.
        Returns:
            Documents which satisfy the input conditions, keyed by docid.
        """
        if self.vearch_engine is None:
            raise ValueError("vearch engine is None!!!")
        results: Dict[str, Document] = {}
        if ids is None or len(ids) == 0:
            return results
        for _id in ids:
            docs_detail = self.vearch_engine.get_doc_by_id(_id)
            if docs_detail == {}:
                continue

            content = ""
            meta_info = {}
            for field in docs_detail:
                if field == "text":
                    content = docs_detail[field]
                    continue
                elif field == "metadata":
                    meta_info["source"] = docs_detail[field]
                    continue
            results[docs_detail["_id"]] = Document(
                page_content=content, metadata=meta_info
            )
        return results
97 tests/integration_tests/vectorstores/test_vearch.py Normal file
@ -0,0 +1,97 @@
from langchain.docstore.document import Document
from langchain.vectorstores.vearch import VearchDb

from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings


def test_vearch() -> None:
    """Test end to end: create a vearch table, store vectors in it and search."""
    texts = [
        "Vearch 是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量,可用于基于个人知识库的大模型应用",
        "Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库",
        "vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装",
    ]
    metadatas = [
        {
            "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt"
        },
        {
            "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt"
        },
        {
            "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt"
        },
    ]
    vearch_db = VearchDb.from_texts(
        texts=texts,
        embedding=FakeEmbeddings(),
        metadatas=metadatas,
        table_name="test_vearch",
        metadata_path="./",
    )
    result = vearch_db.similarity_search(
        "Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库", 1
    )
    assert result == [
        Document(
            page_content="Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库",
            metadata={
                "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt"
            },
        )
    ]


def test_vearch_add_texts() -> None:
    """Test end to end adding of texts."""
    texts = [
        "Vearch 是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量,可用于基于个人知识库的大模型应用",
        "Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库",
        "vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装",
    ]

    metadatas = [
        {
            "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt"
        },
        {
            "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt"
        },
        {
            "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt"
        },
    ]
    vearch_db = VearchDb.from_texts(
        texts=texts,
        embedding=FakeEmbeddings(),
        metadatas=metadatas,
        table_name="test_vearch",
        metadata_path="./",
    )

    # metadatas must be a list of dicts, one per added text
    vearch_db.add_texts(
        texts=["Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库"],
        metadatas=[
            {
                "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt"
            }
        ],
    )
    result = vearch_db.similarity_search(
        "Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库", 2
    )

    assert result == [
        Document(
            page_content="Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库",
            metadata={
                "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt"
            },
        ),
        Document(
            page_content="Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库",
            metadata={
                "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt"
            },
        ),
    ]