From c9768a0948722f3123af3a009a05da08affc00aa Mon Sep 17 00:00:00 2001 From: Younis Bashir Date: Thu, 15 Jun 2023 02:51:36 +0300 Subject: [PATCH] - add Weaviate database --- pilot/vector_store/weaviate_store.py | 108 +++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 pilot/vector_store/weaviate_store.py diff --git a/pilot/vector_store/weaviate_store.py b/pilot/vector_store/weaviate_store.py new file mode 100644 index 000000000..b461122db --- /dev/null +++ b/pilot/vector_store/weaviate_store.py @@ -0,0 +1,108 @@ +import os +import json +import weaviate +from langchain.vectorstores import Weaviate +from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH +from pilot.logs import logger +from pilot.vector_store.vector_store_base import VectorStoreBase + + +class WeaviateStore(VectorStoreBase): + """Weaviate database""" + + def __init__(self, ctx: dict, weaviate_url: str) -> None: + """Initialize with Weaviate client.""" + try: + import weaviate + except ImportError: + raise ValueError( + "Could not import weaviate python package. " + "Please install it with `pip install weaviate-client`." + ) + + self.ctx = ctx + self.weaviate_url = weaviate_url + self.persist_dir = os.path.join( + KNOWLEDGE_UPLOAD_ROOT_PATH, ctx["vector_store_name"] + ".vectordb" + ) + + self.vector_store_client = weaviate.Client( + self.weaviate_url + ) + + def similar_search(self, text: str, topk: int) -> None: + """Perform similar search in Weaviate""" + logger.info("Weaviate similar search") + nearText = { + "concepts": [text], + "distance": 0.75, # prior to v1.14 use "certainty" instead of "distance" + } + response = (self.vector_store_client.query.get("Document", ["metadata", "text"]).with_near_vector( + {"vector": nearText}).with_limit(topk).with_additional(["distance"]).do()) + + return json.dumps(response, indent=2) + + def vector_name_exists(self) -> bool: + """Check if a vector name exists for a given class in Weaviate. + Returns: + bool: True if the vector name exists, False otherwise. + """ + if self.vector_store_client.schema.get("Document"): + return True + return False + + def _default_schema(self) -> None: + """ + Create the schema for Weaviate with a Document class containing metadata and text properties. + """ + + schema = { + "classes": [ + { + "class": "Document", + "description": "A document with metadata and text", + "moduleConfig": {"text2vec-transformers": {"poolingStrategy": "masked_mean", "vectorizeClassName": False} + }, + "properties": [ + { + "dataType": ["text"], + "moduleConfig": { + "text2vec-transformers": {"skip": False, "vectorizePropertyName": False}}, + "description": "Metadata of the document", + "name": "metadata" + }, + { + "dataType": ["text"], + "moduleConfig": { + "text2vec-transformers": {"skip": False, "vectorizePropertyName": False}}, + "description": "Text content of the document", + "name": "text" + } + ], + "vectorizer": "text2vec-transformers" + } + ] + } + + # Create the schema in Weaviate + self.vector_store_client.schema.create(schema) + + def load_document(self, documents: list) -> None: + """Load documents into Weaviate""" + logger.info("Weaviate load document") + texts = [doc.page_content for doc in documents] + metadatas = [doc.metadata for doc in documents] + + # Import data + with self.vector_store_client.batch as batch: + batch.batch_size = 100 + + # Batch import all documents + for i in range(len(texts)): + properties = { + "metadata": metadatas[i], + "text": texts[i] + } + + self.vector_store_client.batch.add_data_object( + properties, "Document")