mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-08-07 03:14:42 +00:00
- add Weaviate database (#222)
This contribution aims to enhance the functionality of the DB-GPT repository by integrating support for the Weaviate database. Weaviate is a vector database that provides advanced indexing and search capabilities for textual data. By incorporating Weaviate into the DB-GPT repository, users will have access to efficient storage, retrieval, and similarity search features for their text-based data. Proposed Changes: 1. Implement WeaviateStore Class: - Create a new class, "WeaviateStore," that extends the existing vector store functionality in the DB-GPT repository. - The WeaviateStore class will serve as a wrapper around the Weaviate database and provide methods for data loading, searching, and vectorization. - The class will utilize the Weaviate Python client library for seamless integration with the Weaviate database. 2. Schema Definition: - Define the schema for the Weaviate database to support the required data structure in the DB-GPT repository. - The schema will include a "Document" class with properties for metadata and text. - The "metadata" property will store the metadata associated with each document. - The "text" property will store the textual content of each document. 3. Data Loading: - Implement a method within the WeaviateStore class to load documents into the Weaviate database. - Iterate through the documents in the DB-GPT repository and extract the necessary metadata and text. - Use the Weaviate Python client to add each document to the Weaviate database, mapping the metadata and text to the corresponding properties defined in the schema. 4. Similar Search: - Implement a method within the WeaviateStore class to perform a similarity search in the Weaviate database based on a given text query. - Utilize Weaviate's vector-based search capabilities to find documents similar to the provided text query. - Return the relevant documents along with additional information such as distance or relevance scores.
5. Vector Name Existence: - Implement a method within the WeaviateStore class to check if a vector name exists for a given class in the Weaviate database. - The method will query the Weaviate database's schema and determine if the vector name exists for the specified class.
This commit is contained in:
commit
fba6efa8c1
108
pilot/vector_store/weaviate_store.py
Normal file
108
pilot/vector_store/weaviate_store.py
Normal file
@ -0,0 +1,108 @@
|
||||
import os
|
||||
import json
|
||||
import weaviate
|
||||
from langchain.vectorstores import Weaviate
|
||||
from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH
|
||||
from pilot.logs import logger
|
||||
from pilot.vector_store.vector_store_base import VectorStoreBase
|
||||
|
||||
|
||||
class WeaviateStore(VectorStoreBase):
    """Vector store backed by a Weaviate instance.

    Documents are stored as objects of the Weaviate class ``Document``, which
    carries two properties: ``metadata`` and ``text``.
    """

    # Name of the Weaviate class that every method of this store reads/writes.
    _CLASS_NAME = "Document"

    def __init__(self, ctx: dict, weaviate_url: str) -> None:
        """Create the Weaviate client used by this store.

        Args:
            ctx: Store configuration; ``ctx["vector_store_name"]`` is read to
                build ``persist_dir``.
            weaviate_url: URL handed to ``weaviate.Client``.

        Raises:
            ValueError: If the ``weaviate`` package cannot be imported.
            KeyError: If ``ctx`` has no ``"vector_store_name"`` entry.
        """
        try:
            import weaviate
        except ImportError as exc:
            # ValueError is kept (rather than ImportError) because that is the
            # exception type existing callers may already be catching.
            raise ValueError(
                "Could not import weaviate python package. "
                "Please install it with `pip install weaviate-client`."
            ) from exc

        self.ctx = ctx
        self.weaviate_url = weaviate_url
        self.persist_dir = os.path.join(
            KNOWLEDGE_UPLOAD_ROOT_PATH, ctx["vector_store_name"] + ".vectordb"
        )
        self.vector_store_client = weaviate.Client(self.weaviate_url)

    def similar_search(self, text: str, topk: int) -> str:
        """Run a nearText similarity query against the ``Document`` class.

        Args:
            text: Query text, used as the single nearText concept.
            topk: Maximum number of results requested from Weaviate.

        Returns:
            The query response serialized as an indented JSON string.
        """
        logger.info("Weaviate similar search")
        near_text = {
            "concepts": [text],
            "distance": 0.75,  # prior to v1.14 use "certainty" instead of "distance"
        }
        # A concepts/distance filter is a nearText filter, so it must go
        # through with_near_text(); with_near_vector() expects a numeric
        # vector and cannot interpret this dict.
        response = (
            self.vector_store_client.query.get(
                self._CLASS_NAME, ["metadata", "text"]
            )
            .with_near_text(near_text)
            .with_limit(topk)
            .with_additional(["distance"])
            .do()
        )
        return json.dumps(response, indent=2)

    def vector_name_exists(self) -> bool:
        """Check whether the ``Document`` class exists in the Weaviate schema.

        Returns:
            bool: True if the schema lookup returns a non-empty definition,
            False if it is empty or Weaviate answers with an unexpected
            status code.
        """
        try:
            return bool(self.vector_store_client.schema.get(self._CLASS_NAME))
        except weaviate.exceptions.UnexpectedStatusCodeException:
            # The client raises instead of returning a falsy value when the
            # class lookup does not answer with HTTP 200.
            logger.info(f"Weaviate class {self._CLASS_NAME} does not exist")
            return False

    def _default_schema(self) -> None:
        """Create the Weaviate schema holding the ``Document`` class.

        The class has two text properties, ``metadata`` and ``text``, and is
        configured to use the ``text2vec-transformers`` vectorizer.
        """
        # Both properties share the same vectorizer module settings.
        property_module_config = {
            "text2vec-transformers": {
                "skip": False,
                "vectorizePropertyName": False,
            }
        }
        schema = {
            "classes": [
                {
                    "class": self._CLASS_NAME,
                    "description": "A document with metadata and text",
                    "moduleConfig": {
                        "text2vec-transformers": {
                            "poolingStrategy": "masked_mean",
                            "vectorizeClassName": False,
                        }
                    },
                    "properties": [
                        {
                            "dataType": ["text"],
                            "moduleConfig": property_module_config,
                            "description": "Metadata of the document",
                            "name": "metadata",
                        },
                        {
                            "dataType": ["text"],
                            "moduleConfig": property_module_config,
                            "description": "Text content of the document",
                            "name": "text",
                        },
                    ],
                    "vectorizer": "text2vec-transformers",
                }
            ]
        }

        # Create the schema in Weaviate
        self.vector_store_client.schema.create(schema)

    def load_document(self, documents: list) -> None:
        """Batch-import documents into the ``Document`` class.

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``
                attributes; each one becomes one Weaviate data object.
        """
        logger.info("Weaviate load document")
        if not documents:
            # Nothing to import; avoid opening an empty batch.
            return

        with self.vector_store_client.batch as batch:
            batch.batch_size = 100

            for doc in documents:
                # NOTE(review): _default_schema declares "metadata" as text,
                # while doc.metadata is forwarded unchanged here - confirm the
                # caller supplies a value Weaviate accepts for that property.
                properties = {
                    "metadata": doc.metadata,
                    "text": doc.page_content,
                }
                batch.add_data_object(properties, self._CLASS_NAME)
|
Loading…
Reference in New Issue
Block a user