From b95084b89ff4ee85d0a1b6be2dffe1fdc2dbeb2b Mon Sep 17 00:00:00 2001 From: aries-ckt <916701291@qq.com> Date: Mon, 19 Jun 2023 09:56:54 +0800 Subject: [PATCH 1/3] feat: integrate Weaviate vector database in DB-GPT. 1.Weaviate default schema update 2.Weaviate database config 3.requirement --- pilot/configs/config.py | 3 + pilot/scene/chat_knowledge/custom/chat.py | 2 +- pilot/vector_store/connector.py | 5 +- pilot/vector_store/weaviate_store.py | 81 ++++++++++++----------- requirements.txt | 3 +- 5 files changed, 53 insertions(+), 41 deletions(-) diff --git a/pilot/configs/config.py b/pilot/configs/config.py index 9e6542db9..1c4a52c35 100644 --- a/pilot/configs/config.py +++ b/pilot/configs/config.py @@ -150,6 +150,9 @@ class Config(metaclass=Singleton): self.MILVUS_USERNAME = os.getenv("MILVUS_USERNAME", None) self.MILVUS_PASSWORD = os.getenv("MILVUS_PASSWORD", None) + self.WEAVIATE_URL = os.getenv("WEAVIATE_URL", "http://127.0.0.1:8080") + + # QLoRA self.QLoRA = os.getenv("QUANTIZE_QLORA", "True") diff --git a/pilot/scene/chat_knowledge/custom/chat.py b/pilot/scene/chat_knowledge/custom/chat.py index a56b2a098..f9f27f603 100644 --- a/pilot/scene/chat_knowledge/custom/chat.py +++ b/pilot/scene/chat_knowledge/custom/chat.py @@ -53,7 +53,7 @@ class ChatNewKnowledge(BaseChat): docs = self.knowledge_embedding_client.similar_search( self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE ) - context = [d.page_content for d in docs] + context = [d["page_content"] for d in docs] context = context[:2000] input_values = {"context": context, "question": self.current_user_input} return input_values diff --git a/pilot/vector_store/connector.py b/pilot/vector_store/connector.py index 6672d3d23..8ba6df253 100644 --- a/pilot/vector_store/connector.py +++ b/pilot/vector_store/connector.py @@ -1,8 +1,9 @@ from pilot.vector_store.chroma_store import ChromaStore -# from pilot.vector_store.milvus_store import MilvusStore +from pilot.vector_store.milvus_store import MilvusStore +from 
pilot.vector_store.weaviate_store import WeaviateStore -connector = {"Chroma": ChromaStore, "Milvus": None} +connector = {"Chroma": ChromaStore, "Milvus": MilvusStore, "Weaviate": WeaviateStore} class VectorStoreConnector: diff --git a/pilot/vector_store/weaviate_store.py b/pilot/vector_store/weaviate_store.py index e208dde35..f4e35bcfd 100644 --- a/pilot/vector_store/weaviate_store.py +++ b/pilot/vector_store/weaviate_store.py @@ -2,15 +2,19 @@ import os import json import weaviate from langchain.vectorstores import Weaviate + +from pilot.configs.config import Config from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH from pilot.logs import logger from pilot.vector_store.vector_store_base import VectorStoreBase +CFG = Config() + class WeaviateStore(VectorStoreBase): """Weaviate database""" - def __init__(self, ctx: dict, weaviate_url: str) -> None: + def __init__(self, ctx: dict) -> None: """Initialize with Weaviate client.""" try: import weaviate @@ -21,9 +25,11 @@ class WeaviateStore(VectorStoreBase): ) self.ctx = ctx - self.weaviate_url = weaviate_url + self.weaviate_url = CFG.WEAVIATE_URL + self.embedding = ctx.get("embeddings", None) + self.vector_name = ctx["vector_store_name"] self.persist_dir = os.path.join( - KNOWLEDGE_UPLOAD_ROOT_PATH, ctx["vector_store_name"] + ".vectordb" + KNOWLEDGE_UPLOAD_ROOT_PATH, self.vector_name + ".vectordb" ) self.vector_store_client = weaviate.Client(self.weaviate_url) @@ -31,26 +37,26 @@ class WeaviateStore(VectorStoreBase): def similar_search(self, text: str, topk: int) -> None: """Perform similar search in Weaviate""" logger.info("Weaviate similar search") - nearText = { - "concepts": [text], - "distance": 0.75, # prior to v1.14 use "certainty" instead of "distance" - } + # nearText = { + # "concepts": [text], + # "distance": 0.75, # prior to v1.14 use "certainty" instead of "distance" + # } + # vector = self.embedding.embed_query(text) response = ( - self.vector_store_client.query.get("Document", ["metadata", 
"text"]) - .with_near_vector({"vector": nearText}) + self.vector_store_client.query.get(self.vector_name, ["metadata", "page_content"]) + # .with_near_vector({"vector": vector}) .with_limit(topk) - .with_additional(["distance"]) .do() ) - - return json.dumps(response, indent=2) + docs = response['data']['Get'][list(response['data']['Get'].keys())[0]] + return docs def vector_name_exists(self) -> bool: """Check if a vector name exists for a given class in Weaviate. Returns: bool: True if the vector name exists, False otherwise. """ - if self.vector_store_client.schema.get("Document"): + if self.vector_store_client.schema.get(self.vector_name): return True return False @@ -62,39 +68,39 @@ class WeaviateStore(VectorStoreBase): schema = { "classes": [ { - "class": "Document", + "class": self.vector_name, "description": "A document with metadata and text", - "moduleConfig": { - "text2vec-transformers": { - "poolingStrategy": "masked_mean", - "vectorizeClassName": False, - } - }, + # "moduleConfig": { + # "text2vec-transformers": { + # "poolingStrategy": "masked_mean", + # "vectorizeClassName": False, + # } + # }, "properties": [ { "dataType": ["text"], - "moduleConfig": { - "text2vec-transformers": { - "skip": False, - "vectorizePropertyName": False, - } - }, + # "moduleConfig": { + # "text2vec-transformers": { + # "skip": False, + # "vectorizePropertyName": False, + # } + # }, "description": "Metadata of the document", "name": "metadata", }, { "dataType": ["text"], - "moduleConfig": { - "text2vec-transformers": { - "skip": False, - "vectorizePropertyName": False, - } - }, + # "moduleConfig": { + # "text2vec-transformers": { + # "skip": False, + # "vectorizePropertyName": False, + # } + # }, "description": "Text content of the document", - "name": "text", - }, + "name": "page_content", + } ], - "vectorizer": "text2vec-transformers", + # "vectorizer": "text2vec-transformers", } ] } @@ -114,6 +120,7 @@ class WeaviateStore(VectorStoreBase): # Batch import all documents for 
i in range(len(texts)): - properties = {"metadata": metadatas[i], "text": texts[i]} + properties = {"metadata": metadatas[i]['source'], "page_content": texts[i]} - self.vector_store_client.batch.add_data_object(properties, "Document") + self.vector_store_client.batch.add_data_object(data_object=properties, class_name=self.vector_name) + self.vector_store_client.batch.flush() diff --git a/requirements.txt b/requirements.txt index 555592f98..594d5bfae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -59,12 +59,13 @@ nltk python-dotenv==1.0.0 # pymilvus==2.2.1 vcrpy -chromadb +chromadb==0.3.22 markdown2 colorama playsound distro pypdf +weaviate-client # Testing dependencies pytest From 05a74d89cdc6cb46795c643321b1405b29854cf4 Mon Sep 17 00:00:00 2001 From: aries-ckt <916701291@qq.com> Date: Mon, 19 Jun 2023 16:44:18 +0800 Subject: [PATCH 2/3] fix: Weaviate document format. 1.similar search: docs format 2.conf SUMMARY_CONFIG --- pilot/configs/config.py | 2 +- pilot/scene/chat_knowledge/custom/chat.py | 3 ++- pilot/vector_store/weaviate_store.py | 20 ++++++++++++++++---- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/pilot/configs/config.py b/pilot/configs/config.py index 1c4a52c35..9b915275a 100644 --- a/pilot/configs/config.py +++ b/pilot/configs/config.py @@ -161,7 +161,7 @@ class Config(metaclass=Singleton): self.KNOWLEDGE_CHUNK_SIZE = int(os.getenv("KNOWLEDGE_CHUNK_SIZE", 100)) self.KNOWLEDGE_SEARCH_TOP_SIZE = int(os.getenv("KNOWLEDGE_SEARCH_TOP_SIZE", 5)) ### SUMMARY_CONFIG Configuration - self.SUMMARY_CONFIG = os.getenv("SUMMARY_CONFIG", "VECTOR") + self.SUMMARY_CONFIG = os.getenv("SUMMARY_CONFIG", "FAST") def set_debug_mode(self, value: bool) -> None: """Set the debug mode value""" diff --git a/pilot/scene/chat_knowledge/custom/chat.py b/pilot/scene/chat_knowledge/custom/chat.py index f9f27f603..f6582d343 100644 --- a/pilot/scene/chat_knowledge/custom/chat.py +++ b/pilot/scene/chat_knowledge/custom/chat.py @@ -53,7 +53,8 @@ class 
ChatNewKnowledge(BaseChat): docs = self.knowledge_embedding_client.similar_search( self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE ) - context = [d["page_content"] for d in docs] + context = [d.page_content for d in docs] + self.metadata = [d.metadata for d in docs] context = context[:2000] input_values = {"context": context, "question": self.current_user_input} return input_values diff --git a/pilot/vector_store/weaviate_store.py b/pilot/vector_store/weaviate_store.py index f4e35bcfd..2dc4d2f1f 100644 --- a/pilot/vector_store/weaviate_store.py +++ b/pilot/vector_store/weaviate_store.py @@ -1,7 +1,9 @@ import os import json import weaviate +from langchain.schema import Document from langchain.vectorstores import Weaviate +from weaviate.exceptions import WeaviateBaseError from pilot.configs.config import Config from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH @@ -48,7 +50,13 @@ class WeaviateStore(VectorStoreBase): .with_limit(topk) .do() ) - docs = response['data']['Get'][list(response['data']['Get'].keys())[0]] + res = response['data']['Get'][list(response['data']['Get'].keys())[0]] + docs = [] + for r in res: + docs.append(Document( + page_content=r['page_content'], + metadata={"metadata": r['metadata']}, + )) return docs def vector_name_exists(self) -> bool: @@ -56,9 +64,13 @@ class WeaviateStore(VectorStoreBase): Returns: bool: True if the vector name exists, False otherwise. """ - if self.vector_store_client.schema.get(self.vector_name): - return True - return False + try: + if self.vector_store_client.schema.get(self.vector_name): + return True + return False + except WeaviateBaseError as e: + logger.error("vector_name_exists error", e.message) + return False def _default_schema(self) -> None: """ From 6b0ab59201300d705c397b0bdcb3d7af010e636a Mon Sep 17 00:00:00 2001 From: aries-ckt <916701291@qq.com> Date: Mon, 19 Jun 2023 16:58:24 +0800 Subject: [PATCH 3/3] style: code format. 
code style format --- pilot/configs/config.py | 1 - pilot/vector_store/weaviate_store.py | 30 ++++++++++++++++++---------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/pilot/configs/config.py b/pilot/configs/config.py index 9b915275a..594b8b4ae 100644 --- a/pilot/configs/config.py +++ b/pilot/configs/config.py @@ -152,7 +152,6 @@ class Config(metaclass=Singleton): self.WEAVIATE_URL = os.getenv("WEAVIATE_URL", "http://127.0.0.1:8080") - # QLoRA self.QLoRA = os.getenv("QUANTIZE_QLORA", "True") diff --git a/pilot/vector_store/weaviate_store.py b/pilot/vector_store/weaviate_store.py index 2dc4d2f1f..fc5455672 100644 --- a/pilot/vector_store/weaviate_store.py +++ b/pilot/vector_store/weaviate_store.py @@ -45,18 +45,21 @@ class WeaviateStore(VectorStoreBase): # } # vector = self.embedding.embed_query(text) response = ( - self.vector_store_client.query.get(self.vector_name, ["metadata", "page_content"]) + self.vector_store_client.query.get( + self.vector_name, ["metadata", "page_content"] + ) # .with_near_vector({"vector": vector}) - .with_limit(topk) - .do() + .with_limit(topk).do() ) - res = response['data']['Get'][list(response['data']['Get'].keys())[0]] + res = response["data"]["Get"][list(response["data"]["Get"].keys())[0]] docs = [] for r in res: - docs.append(Document( - page_content=r['page_content'], - metadata={"metadata": r['metadata']}, - )) + docs.append( + Document( + page_content=r["page_content"], + metadata={"metadata": r["metadata"]}, + ) + ) return docs def vector_name_exists(self) -> bool: @@ -110,7 +113,7 @@ class WeaviateStore(VectorStoreBase): # }, "description": "Text content of the document", "name": "page_content", - } + }, ], # "vectorizer": "text2vec-transformers", } @@ -132,7 +135,12 @@ class WeaviateStore(VectorStoreBase): # Batch import all documents for i in range(len(texts)): - properties = {"metadata": metadatas[i]['source'], "page_content": texts[i]} + properties = { + "metadata": metadatas[i]["source"], + 
"page_content": texts[i], + } - self.vector_store_client.batch.add_data_object(data_object=properties, class_name=self.vector_name) + self.vector_store_client.batch.add_data_object( + data_object=properties, class_name=self.vector_name + ) self.vector_store_client.batch.flush()