fix: Weaviate document format. (#245)

1.similar search: docs format
2.conf SUMMARY_CONFIG
This commit is contained in:
magic.chen 2023-06-19 17:02:52 +08:00 committed by GitHub
commit 6218990a1f
5 changed files with 76 additions and 44 deletions

View File

@ -150,6 +150,8 @@ class Config(metaclass=Singleton):
self.MILVUS_USERNAME = os.getenv("MILVUS_USERNAME", None) self.MILVUS_USERNAME = os.getenv("MILVUS_USERNAME", None)
self.MILVUS_PASSWORD = os.getenv("MILVUS_PASSWORD", None) self.MILVUS_PASSWORD = os.getenv("MILVUS_PASSWORD", None)
self.WEAVIATE_URL = os.getenv("WEAVIATE_URL", "http://127.0.0.1:8080")
# QLoRA # QLoRA
self.QLoRA = os.getenv("QUANTIZE_QLORA", "True") self.QLoRA = os.getenv("QUANTIZE_QLORA", "True")
@ -158,7 +160,7 @@ class Config(metaclass=Singleton):
self.KNOWLEDGE_CHUNK_SIZE = int(os.getenv("KNOWLEDGE_CHUNK_SIZE", 100)) self.KNOWLEDGE_CHUNK_SIZE = int(os.getenv("KNOWLEDGE_CHUNK_SIZE", 100))
self.KNOWLEDGE_SEARCH_TOP_SIZE = int(os.getenv("KNOWLEDGE_SEARCH_TOP_SIZE", 5)) self.KNOWLEDGE_SEARCH_TOP_SIZE = int(os.getenv("KNOWLEDGE_SEARCH_TOP_SIZE", 5))
### SUMMARY_CONFIG Configuration ### SUMMARY_CONFIG Configuration
self.SUMMARY_CONFIG = os.getenv("SUMMARY_CONFIG", "VECTOR") self.SUMMARY_CONFIG = os.getenv("SUMMARY_CONFIG", "FAST")
def set_debug_mode(self, value: bool) -> None: def set_debug_mode(self, value: bool) -> None:
"""Set the debug mode value""" """Set the debug mode value"""

View File

@ -54,6 +54,7 @@ class ChatNewKnowledge(BaseChat):
self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE
) )
context = [d.page_content for d in docs] context = [d.page_content for d in docs]
self.metadata = [d.metadata for d in docs]
context = context[:2000] context = context[:2000]
input_values = {"context": context, "question": self.current_user_input} input_values = {"context": context, "question": self.current_user_input}
return input_values return input_values

View File

@ -1,8 +1,9 @@
from pilot.vector_store.chroma_store import ChromaStore from pilot.vector_store.chroma_store import ChromaStore
# from pilot.vector_store.milvus_store import MilvusStore from pilot.vector_store.milvus_store import MilvusStore
from pilot.vector_store.weaviate_store import WeaviateStore
connector = {"Chroma": ChromaStore, "Milvus": None} connector = {"Chroma": ChromaStore, "Milvus": MilvusStore, "Weaviate": WeaviateStore}
class VectorStoreConnector: class VectorStoreConnector:

View File

@ -1,16 +1,22 @@
import os import os
import json import json
import weaviate import weaviate
from langchain.schema import Document
from langchain.vectorstores import Weaviate from langchain.vectorstores import Weaviate
from weaviate.exceptions import WeaviateBaseError
from pilot.configs.config import Config
from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH
from pilot.logs import logger from pilot.logs import logger
from pilot.vector_store.vector_store_base import VectorStoreBase from pilot.vector_store.vector_store_base import VectorStoreBase
CFG = Config()
class WeaviateStore(VectorStoreBase): class WeaviateStore(VectorStoreBase):
"""Weaviate database""" """Weaviate database"""
def __init__(self, ctx: dict, weaviate_url: str) -> None: def __init__(self, ctx: dict) -> None:
"""Initialize with Weaviate client.""" """Initialize with Weaviate client."""
try: try:
import weaviate import weaviate
@ -21,9 +27,11 @@ class WeaviateStore(VectorStoreBase):
) )
self.ctx = ctx self.ctx = ctx
self.weaviate_url = weaviate_url self.weaviate_url = CFG.WEAVIATE_URL
self.embedding = ctx.get("embeddings", None)
self.vector_name = ctx["vector_store_name"]
self.persist_dir = os.path.join( self.persist_dir = os.path.join(
KNOWLEDGE_UPLOAD_ROOT_PATH, ctx["vector_store_name"] + ".vectordb" KNOWLEDGE_UPLOAD_ROOT_PATH, self.vector_name + ".vectordb"
) )
self.vector_store_client = weaviate.Client(self.weaviate_url) self.vector_store_client = weaviate.Client(self.weaviate_url)
@ -31,28 +39,41 @@ class WeaviateStore(VectorStoreBase):
def similar_search(self, text: str, topk: int) -> None: def similar_search(self, text: str, topk: int) -> None:
"""Perform similar search in Weaviate""" """Perform similar search in Weaviate"""
logger.info("Weaviate similar search") logger.info("Weaviate similar search")
nearText = { # nearText = {
"concepts": [text], # "concepts": [text],
"distance": 0.75, # prior to v1.14 use "certainty" instead of "distance" # "distance": 0.75, # prior to v1.14 use "certainty" instead of "distance"
} # }
# vector = self.embedding.embed_query(text)
response = ( response = (
self.vector_store_client.query.get("Document", ["metadata", "text"]) self.vector_store_client.query.get(
.with_near_vector({"vector": nearText}) self.vector_name, ["metadata", "page_content"]
.with_limit(topk) )
.with_additional(["distance"]) # .with_near_vector({"vector": vector})
.do() .with_limit(topk).do()
) )
res = response["data"]["Get"][list(response["data"]["Get"].keys())[0]]
return json.dumps(response, indent=2) docs = []
for r in res:
docs.append(
Document(
page_content=r["page_content"],
metadata={"metadata": r["metadata"]},
)
)
return docs
def vector_name_exists(self) -> bool: def vector_name_exists(self) -> bool:
"""Check if a vector name exists for a given class in Weaviate. """Check if a vector name exists for a given class in Weaviate.
Returns: Returns:
bool: True if the vector name exists, False otherwise. bool: True if the vector name exists, False otherwise.
""" """
if self.vector_store_client.schema.get("Document"): try:
return True if self.vector_store_client.schema.get(self.vector_name):
return False return True
return False
except WeaviateBaseError as e:
logger.error("vector_name_exists error", e.message)
return False
def _default_schema(self) -> None: def _default_schema(self) -> None:
""" """
@ -62,39 +83,39 @@ class WeaviateStore(VectorStoreBase):
schema = { schema = {
"classes": [ "classes": [
{ {
"class": "Document", "class": self.vector_name,
"description": "A document with metadata and text", "description": "A document with metadata and text",
"moduleConfig": { # "moduleConfig": {
"text2vec-transformers": { # "text2vec-transformers": {
"poolingStrategy": "masked_mean", # "poolingStrategy": "masked_mean",
"vectorizeClassName": False, # "vectorizeClassName": False,
} # }
}, # },
"properties": [ "properties": [
{ {
"dataType": ["text"], "dataType": ["text"],
"moduleConfig": { # "moduleConfig": {
"text2vec-transformers": { # "text2vec-transformers": {
"skip": False, # "skip": False,
"vectorizePropertyName": False, # "vectorizePropertyName": False,
} # }
}, # },
"description": "Metadata of the document", "description": "Metadata of the document",
"name": "metadata", "name": "metadata",
}, },
{ {
"dataType": ["text"], "dataType": ["text"],
"moduleConfig": { # "moduleConfig": {
"text2vec-transformers": { # "text2vec-transformers": {
"skip": False, # "skip": False,
"vectorizePropertyName": False, # "vectorizePropertyName": False,
} # }
}, # },
"description": "Text content of the document", "description": "Text content of the document",
"name": "text", "name": "page_content",
}, },
], ],
"vectorizer": "text2vec-transformers", # "vectorizer": "text2vec-transformers",
} }
] ]
} }
@ -114,6 +135,12 @@ class WeaviateStore(VectorStoreBase):
# Batch import all documents # Batch import all documents
for i in range(len(texts)): for i in range(len(texts)):
properties = {"metadata": metadatas[i], "text": texts[i]} properties = {
"metadata": metadatas[i]["source"],
"page_content": texts[i],
}
self.vector_store_client.batch.add_data_object(properties, "Document") self.vector_store_client.batch.add_data_object(
data_object=properties, class_name=self.vector_name
)
self.vector_store_client.batch.flush()

View File

@ -59,12 +59,13 @@ nltk
python-dotenv==1.0.0 python-dotenv==1.0.0
# pymilvus==2.2.1 # pymilvus==2.2.1
vcrpy vcrpy
chromadb chromadb=0.3.22
markdown2 markdown2
colorama colorama
playsound playsound
distro distro
pypdf pypdf
weaviate-client
# Testing dependencies # Testing dependencies
pytest pytest