- add Weaviate database

This commit is contained in:
Younis Bashir 2023-06-15 02:51:36 +03:00
parent 3c9fffe815
commit c9768a0948

View File

@ -0,0 +1,108 @@
import os
import json
import weaviate
from langchain.vectorstores import Weaviate
from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH
from pilot.logs import logger
from pilot.vector_store.vector_store_base import VectorStoreBase
class WeaviateStore(VectorStoreBase):
"""Weaviate database"""
def __init__(self, ctx: dict, weaviate_url: str) -> None:
"""Initialize with Weaviate client."""
try:
import weaviate
except ImportError:
raise ValueError(
"Could not import weaviate python package. "
"Please install it with `pip install weaviate-client`."
)
self.ctx = ctx
self.weaviate_url = weaviate_url
self.persist_dir = os.path.join(
KNOWLEDGE_UPLOAD_ROOT_PATH, ctx["vector_store_name"] + ".vectordb"
)
self.vector_store_client = weaviate.Client(
self.weaviate_url
)
def similar_search(self, text: str, topk: int) -> None:
"""Perform similar search in Weaviate"""
logger.info("Weaviate similar search")
nearText = {
"concepts": [text],
"distance": 0.75, # prior to v1.14 use "certainty" instead of "distance"
}
response = (self.vector_store_client.query.get("Document", ["metadata", "text"]).with_near_vector(
{"vector": nearText}).with_limit(topk).with_additional(["distance"]).do())
return json.dumps(response, indent=2)
def vector_name_exists(self) -> bool:
"""Check if a vector name exists for a given class in Weaviate.
Returns:
bool: True if the vector name exists, False otherwise.
"""
if self.vector_store_client.schema.get("Document"):
return True
return False
def _default_schema(self) -> None:
"""
Create the schema for Weaviate with a Document class containing metadata and text properties.
"""
schema = {
"classes": [
{
"class": "Document",
"description": "A document with metadata and text",
"moduleConfig": {"text2vec-transformers": {"poolingStrategy": "masked_mean", "vectorizeClassName": False}
},
"properties": [
{
"dataType": ["text"],
"moduleConfig": {
"text2vec-transformers": {"skip": False, "vectorizePropertyName": False}},
"description": "Metadata of the document",
"name": "metadata"
},
{
"dataType": ["text"],
"moduleConfig": {
"text2vec-transformers": {"skip": False, "vectorizePropertyName": False}},
"description": "Text content of the document",
"name": "text"
}
],
"vectorizer": "text2vec-transformers"
}
]
}
# Create the schema in Weaviate
self.vector_store_client.schema.create(schema)
def load_document(self, documents: list) -> None:
"""Load documents into Weaviate"""
logger.info("Weaviate load document")
texts = [doc.page_content for doc in documents]
metadatas = [doc.metadata for doc in documents]
# Import data
with self.vector_store_client.batch as batch:
batch.batch_size = 100
# Batch import all documents
for i in range(len(texts)):
properties = {
"metadata": metadatas[i],
"text": texts[i]
}
self.vector_store_client.batch.add_data_object(
properties, "Document")