feat: integrate Weaviate vector database in DB-GPT.

1. Weaviate default schema update
2. Weaviate database config
3. Requirements update (add weaviate-client, pin chromadb)
This commit is contained in:
aries-ckt 2023-06-19 09:56:54 +08:00
parent 3aa8917088
commit 8299f5e0fa
5 changed files with 53 additions and 41 deletions

View File

@ -150,6 +150,9 @@ class Config(metaclass=Singleton):
self.MILVUS_USERNAME = os.getenv("MILVUS_USERNAME", None) self.MILVUS_USERNAME = os.getenv("MILVUS_USERNAME", None)
self.MILVUS_PASSWORD = os.getenv("MILVUS_PASSWORD", None) self.MILVUS_PASSWORD = os.getenv("MILVUS_PASSWORD", None)
self.WEAVIATE_URL = os.getenv("WEAVIATE_URL", "http://127.0.0.1:8080")
# QLoRA # QLoRA
self.QLoRA = os.getenv("QUANTIZE_QLORA", "True") self.QLoRA = os.getenv("QUANTIZE_QLORA", "True")

View File

@ -53,7 +53,7 @@ class ChatNewKnowledge(BaseChat):
docs = self.knowledge_embedding_client.similar_search( docs = self.knowledge_embedding_client.similar_search(
self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE
) )
context = [d.page_content for d in docs] context = [d["page_content"] for d in docs]
context = context[:2000] context = context[:2000]
input_values = {"context": context, "question": self.current_user_input} input_values = {"context": context, "question": self.current_user_input}
return input_values return input_values

View File

@ -1,8 +1,9 @@
from pilot.vector_store.chroma_store import ChromaStore from pilot.vector_store.chroma_store import ChromaStore
# from pilot.vector_store.milvus_store import MilvusStore from pilot.vector_store.milvus_store import MilvusStore
from pilot.vector_store.weaviate_store import WeaviateStore
connector = {"Chroma": ChromaStore, "Milvus": None} connector = {"Chroma": ChromaStore, "Milvus": MilvusStore, "Weaviate": WeaviateStore}
class VectorStoreConnector: class VectorStoreConnector:

View File

@ -2,15 +2,19 @@ import os
import json import json
import weaviate import weaviate
from langchain.vectorstores import Weaviate from langchain.vectorstores import Weaviate
from pilot.configs.config import Config
from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH
from pilot.logs import logger from pilot.logs import logger
from pilot.vector_store.vector_store_base import VectorStoreBase from pilot.vector_store.vector_store_base import VectorStoreBase
CFG = Config()
class WeaviateStore(VectorStoreBase): class WeaviateStore(VectorStoreBase):
"""Weaviate database""" """Weaviate database"""
def __init__(self, ctx: dict, weaviate_url: str) -> None: def __init__(self, ctx: dict) -> None:
"""Initialize with Weaviate client.""" """Initialize with Weaviate client."""
try: try:
import weaviate import weaviate
@ -21,9 +25,11 @@ class WeaviateStore(VectorStoreBase):
) )
self.ctx = ctx self.ctx = ctx
self.weaviate_url = weaviate_url self.weaviate_url = CFG.WEAVIATE_URL
self.embedding = ctx.get("embeddings", None)
self.vector_name = ctx["vector_store_name"]
self.persist_dir = os.path.join( self.persist_dir = os.path.join(
KNOWLEDGE_UPLOAD_ROOT_PATH, ctx["vector_store_name"] + ".vectordb" KNOWLEDGE_UPLOAD_ROOT_PATH, self.vector_name + ".vectordb"
) )
self.vector_store_client = weaviate.Client(self.weaviate_url) self.vector_store_client = weaviate.Client(self.weaviate_url)
@ -31,26 +37,26 @@ class WeaviateStore(VectorStoreBase):
def similar_search(self, text: str, topk: int) -> None: def similar_search(self, text: str, topk: int) -> None:
"""Perform similar search in Weaviate""" """Perform similar search in Weaviate"""
logger.info("Weaviate similar search") logger.info("Weaviate similar search")
nearText = { # nearText = {
"concepts": [text], # "concepts": [text],
"distance": 0.75, # prior to v1.14 use "certainty" instead of "distance" # "distance": 0.75, # prior to v1.14 use "certainty" instead of "distance"
} # }
# vector = self.embedding.embed_query(text)
response = ( response = (
self.vector_store_client.query.get("Document", ["metadata", "text"]) self.vector_store_client.query.get(self.vector_name, ["metadata", "page_content"])
.with_near_vector({"vector": nearText}) # .with_near_vector({"vector": vector})
.with_limit(topk) .with_limit(topk)
.with_additional(["distance"])
.do() .do()
) )
docs = response['data']['Get'][list(response['data']['Get'].keys())[0]]
return json.dumps(response, indent=2) return docs
def vector_name_exists(self) -> bool: def vector_name_exists(self) -> bool:
"""Check if a vector name exists for a given class in Weaviate. """Check if a vector name exists for a given class in Weaviate.
Returns: Returns:
bool: True if the vector name exists, False otherwise. bool: True if the vector name exists, False otherwise.
""" """
if self.vector_store_client.schema.get("Document"): if self.vector_store_client.schema.get(self.vector_name):
return True return True
return False return False
@ -62,39 +68,39 @@ class WeaviateStore(VectorStoreBase):
schema = { schema = {
"classes": [ "classes": [
{ {
"class": "Document", "class": self.vector_name,
"description": "A document with metadata and text", "description": "A document with metadata and text",
"moduleConfig": { # "moduleConfig": {
"text2vec-transformers": { # "text2vec-transformers": {
"poolingStrategy": "masked_mean", # "poolingStrategy": "masked_mean",
"vectorizeClassName": False, # "vectorizeClassName": False,
} # }
}, # },
"properties": [ "properties": [
{ {
"dataType": ["text"], "dataType": ["text"],
"moduleConfig": { # "moduleConfig": {
"text2vec-transformers": { # "text2vec-transformers": {
"skip": False, # "skip": False,
"vectorizePropertyName": False, # "vectorizePropertyName": False,
} # }
}, # },
"description": "Metadata of the document", "description": "Metadata of the document",
"name": "metadata", "name": "metadata",
}, },
{ {
"dataType": ["text"], "dataType": ["text"],
"moduleConfig": { # "moduleConfig": {
"text2vec-transformers": { # "text2vec-transformers": {
"skip": False, # "skip": False,
"vectorizePropertyName": False, # "vectorizePropertyName": False,
} # }
}, # },
"description": "Text content of the document", "description": "Text content of the document",
"name": "text", "name": "page_content",
}, }
], ],
"vectorizer": "text2vec-transformers", # "vectorizer": "text2vec-transformers",
} }
] ]
} }
@ -114,6 +120,7 @@ class WeaviateStore(VectorStoreBase):
# Batch import all documents # Batch import all documents
for i in range(len(texts)): for i in range(len(texts)):
properties = {"metadata": metadatas[i], "text": texts[i]} properties = {"metadata": metadatas[i]['source'], "page_content": texts[i]}
self.vector_store_client.batch.add_data_object(properties, "Document") self.vector_store_client.batch.add_data_object(data_object=properties, class_name=self.vector_name)
self.vector_store_client.batch.flush()

View File

@ -59,12 +59,13 @@ nltk
python-dotenv==1.0.0 python-dotenv==1.0.0
# pymilvus==2.2.1 # pymilvus==2.2.1
vcrpy vcrpy
chromadb chromadb==0.3.22
markdown2 markdown2
colorama colorama
playsound playsound
distro distro
pypdf pypdf
weaviate-client
# Testing dependencies # Testing dependencies
pytest pytest