mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-08-13 22:15:35 +00:00
fix: Weaviate document format. (#245)
1. Similar search: return results as Document objects. 2. Config: change the SUMMARY_CONFIG default.
This commit is contained in:
commit
6218990a1f
@ -150,6 +150,8 @@ class Config(metaclass=Singleton):
|
|||||||
self.MILVUS_USERNAME = os.getenv("MILVUS_USERNAME", None)
|
self.MILVUS_USERNAME = os.getenv("MILVUS_USERNAME", None)
|
||||||
self.MILVUS_PASSWORD = os.getenv("MILVUS_PASSWORD", None)
|
self.MILVUS_PASSWORD = os.getenv("MILVUS_PASSWORD", None)
|
||||||
|
|
||||||
|
self.WEAVIATE_URL = os.getenv("WEAVIATE_URL", "http://127.0.0.1:8080")
|
||||||
|
|
||||||
# QLoRA
|
# QLoRA
|
||||||
self.QLoRA = os.getenv("QUANTIZE_QLORA", "True")
|
self.QLoRA = os.getenv("QUANTIZE_QLORA", "True")
|
||||||
|
|
||||||
@ -158,7 +160,7 @@ class Config(metaclass=Singleton):
|
|||||||
self.KNOWLEDGE_CHUNK_SIZE = int(os.getenv("KNOWLEDGE_CHUNK_SIZE", 100))
|
self.KNOWLEDGE_CHUNK_SIZE = int(os.getenv("KNOWLEDGE_CHUNK_SIZE", 100))
|
||||||
self.KNOWLEDGE_SEARCH_TOP_SIZE = int(os.getenv("KNOWLEDGE_SEARCH_TOP_SIZE", 5))
|
self.KNOWLEDGE_SEARCH_TOP_SIZE = int(os.getenv("KNOWLEDGE_SEARCH_TOP_SIZE", 5))
|
||||||
### SUMMARY_CONFIG Configuration
|
### SUMMARY_CONFIG Configuration
|
||||||
self.SUMMARY_CONFIG = os.getenv("SUMMARY_CONFIG", "VECTOR")
|
self.SUMMARY_CONFIG = os.getenv("SUMMARY_CONFIG", "FAST")
|
||||||
|
|
||||||
def set_debug_mode(self, value: bool) -> None:
|
def set_debug_mode(self, value: bool) -> None:
|
||||||
"""Set the debug mode value"""
|
"""Set the debug mode value"""
|
||||||
|
@ -54,6 +54,7 @@ class ChatNewKnowledge(BaseChat):
|
|||||||
self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE
|
self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE
|
||||||
)
|
)
|
||||||
context = [d.page_content for d in docs]
|
context = [d.page_content for d in docs]
|
||||||
|
self.metadata = [d.metadata for d in docs]
|
||||||
context = context[:2000]
|
context = context[:2000]
|
||||||
input_values = {"context": context, "question": self.current_user_input}
|
input_values = {"context": context, "question": self.current_user_input}
|
||||||
return input_values
|
return input_values
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
from pilot.vector_store.chroma_store import ChromaStore
|
from pilot.vector_store.chroma_store import ChromaStore
|
||||||
|
|
||||||
# from pilot.vector_store.milvus_store import MilvusStore
|
from pilot.vector_store.milvus_store import MilvusStore
|
||||||
|
from pilot.vector_store.weaviate_store import WeaviateStore
|
||||||
|
|
||||||
connector = {"Chroma": ChromaStore, "Milvus": None}
|
connector = {"Chroma": ChromaStore, "Milvus": MilvusStore, "Weaviate": WeaviateStore}
|
||||||
|
|
||||||
|
|
||||||
class VectorStoreConnector:
|
class VectorStoreConnector:
|
||||||
|
@ -1,16 +1,22 @@
|
|||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
import weaviate
|
import weaviate
|
||||||
|
from langchain.schema import Document
|
||||||
from langchain.vectorstores import Weaviate
|
from langchain.vectorstores import Weaviate
|
||||||
|
from weaviate.exceptions import WeaviateBaseError
|
||||||
|
|
||||||
|
from pilot.configs.config import Config
|
||||||
from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH
|
from pilot.configs.model_config import KNOWLEDGE_UPLOAD_ROOT_PATH
|
||||||
from pilot.logs import logger
|
from pilot.logs import logger
|
||||||
from pilot.vector_store.vector_store_base import VectorStoreBase
|
from pilot.vector_store.vector_store_base import VectorStoreBase
|
||||||
|
|
||||||
|
CFG = Config()
|
||||||
|
|
||||||
|
|
||||||
class WeaviateStore(VectorStoreBase):
|
class WeaviateStore(VectorStoreBase):
|
||||||
"""Weaviate database"""
|
"""Weaviate database"""
|
||||||
|
|
||||||
def __init__(self, ctx: dict, weaviate_url: str) -> None:
|
def __init__(self, ctx: dict) -> None:
|
||||||
"""Initialize with Weaviate client."""
|
"""Initialize with Weaviate client."""
|
||||||
try:
|
try:
|
||||||
import weaviate
|
import weaviate
|
||||||
@ -21,9 +27,11 @@ class WeaviateStore(VectorStoreBase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.ctx = ctx
|
self.ctx = ctx
|
||||||
self.weaviate_url = weaviate_url
|
self.weaviate_url = CFG.WEAVIATE_URL
|
||||||
|
self.embedding = ctx.get("embeddings", None)
|
||||||
|
self.vector_name = ctx["vector_store_name"]
|
||||||
self.persist_dir = os.path.join(
|
self.persist_dir = os.path.join(
|
||||||
KNOWLEDGE_UPLOAD_ROOT_PATH, ctx["vector_store_name"] + ".vectordb"
|
KNOWLEDGE_UPLOAD_ROOT_PATH, self.vector_name + ".vectordb"
|
||||||
)
|
)
|
||||||
|
|
||||||
self.vector_store_client = weaviate.Client(self.weaviate_url)
|
self.vector_store_client = weaviate.Client(self.weaviate_url)
|
||||||
@ -31,28 +39,41 @@ class WeaviateStore(VectorStoreBase):
|
|||||||
def similar_search(self, text: str, topk: int) -> None:
|
def similar_search(self, text: str, topk: int) -> None:
|
||||||
"""Perform similar search in Weaviate"""
|
"""Perform similar search in Weaviate"""
|
||||||
logger.info("Weaviate similar search")
|
logger.info("Weaviate similar search")
|
||||||
nearText = {
|
# nearText = {
|
||||||
"concepts": [text],
|
# "concepts": [text],
|
||||||
"distance": 0.75, # prior to v1.14 use "certainty" instead of "distance"
|
# "distance": 0.75, # prior to v1.14 use "certainty" instead of "distance"
|
||||||
}
|
# }
|
||||||
|
# vector = self.embedding.embed_query(text)
|
||||||
response = (
|
response = (
|
||||||
self.vector_store_client.query.get("Document", ["metadata", "text"])
|
self.vector_store_client.query.get(
|
||||||
.with_near_vector({"vector": nearText})
|
self.vector_name, ["metadata", "page_content"]
|
||||||
.with_limit(topk)
|
)
|
||||||
.with_additional(["distance"])
|
# .with_near_vector({"vector": vector})
|
||||||
.do()
|
.with_limit(topk).do()
|
||||||
)
|
)
|
||||||
|
res = response["data"]["Get"][list(response["data"]["Get"].keys())[0]]
|
||||||
return json.dumps(response, indent=2)
|
docs = []
|
||||||
|
for r in res:
|
||||||
|
docs.append(
|
||||||
|
Document(
|
||||||
|
page_content=r["page_content"],
|
||||||
|
metadata={"metadata": r["metadata"]},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return docs
|
||||||
|
|
||||||
def vector_name_exists(self) -> bool:
|
def vector_name_exists(self) -> bool:
|
||||||
"""Check if a vector name exists for a given class in Weaviate.
|
"""Check if a vector name exists for a given class in Weaviate.
|
||||||
Returns:
|
Returns:
|
||||||
bool: True if the vector name exists, False otherwise.
|
bool: True if the vector name exists, False otherwise.
|
||||||
"""
|
"""
|
||||||
if self.vector_store_client.schema.get("Document"):
|
try:
|
||||||
return True
|
if self.vector_store_client.schema.get(self.vector_name):
|
||||||
return False
|
return True
|
||||||
|
return False
|
||||||
|
except WeaviateBaseError as e:
|
||||||
|
logger.error("vector_name_exists error", e.message)
|
||||||
|
return False
|
||||||
|
|
||||||
def _default_schema(self) -> None:
|
def _default_schema(self) -> None:
|
||||||
"""
|
"""
|
||||||
@ -62,39 +83,39 @@ class WeaviateStore(VectorStoreBase):
|
|||||||
schema = {
|
schema = {
|
||||||
"classes": [
|
"classes": [
|
||||||
{
|
{
|
||||||
"class": "Document",
|
"class": self.vector_name,
|
||||||
"description": "A document with metadata and text",
|
"description": "A document with metadata and text",
|
||||||
"moduleConfig": {
|
# "moduleConfig": {
|
||||||
"text2vec-transformers": {
|
# "text2vec-transformers": {
|
||||||
"poolingStrategy": "masked_mean",
|
# "poolingStrategy": "masked_mean",
|
||||||
"vectorizeClassName": False,
|
# "vectorizeClassName": False,
|
||||||
}
|
# }
|
||||||
},
|
# },
|
||||||
"properties": [
|
"properties": [
|
||||||
{
|
{
|
||||||
"dataType": ["text"],
|
"dataType": ["text"],
|
||||||
"moduleConfig": {
|
# "moduleConfig": {
|
||||||
"text2vec-transformers": {
|
# "text2vec-transformers": {
|
||||||
"skip": False,
|
# "skip": False,
|
||||||
"vectorizePropertyName": False,
|
# "vectorizePropertyName": False,
|
||||||
}
|
# }
|
||||||
},
|
# },
|
||||||
"description": "Metadata of the document",
|
"description": "Metadata of the document",
|
||||||
"name": "metadata",
|
"name": "metadata",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"dataType": ["text"],
|
"dataType": ["text"],
|
||||||
"moduleConfig": {
|
# "moduleConfig": {
|
||||||
"text2vec-transformers": {
|
# "text2vec-transformers": {
|
||||||
"skip": False,
|
# "skip": False,
|
||||||
"vectorizePropertyName": False,
|
# "vectorizePropertyName": False,
|
||||||
}
|
# }
|
||||||
},
|
# },
|
||||||
"description": "Text content of the document",
|
"description": "Text content of the document",
|
||||||
"name": "text",
|
"name": "page_content",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"vectorizer": "text2vec-transformers",
|
# "vectorizer": "text2vec-transformers",
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@ -114,6 +135,12 @@ class WeaviateStore(VectorStoreBase):
|
|||||||
|
|
||||||
# Batch import all documents
|
# Batch import all documents
|
||||||
for i in range(len(texts)):
|
for i in range(len(texts)):
|
||||||
properties = {"metadata": metadatas[i], "text": texts[i]}
|
properties = {
|
||||||
|
"metadata": metadatas[i]["source"],
|
||||||
|
"page_content": texts[i],
|
||||||
|
}
|
||||||
|
|
||||||
self.vector_store_client.batch.add_data_object(properties, "Document")
|
self.vector_store_client.batch.add_data_object(
|
||||||
|
data_object=properties, class_name=self.vector_name
|
||||||
|
)
|
||||||
|
self.vector_store_client.batch.flush()
|
||||||
|
@ -59,12 +59,13 @@ nltk
|
|||||||
python-dotenv==1.0.0
|
python-dotenv==1.0.0
|
||||||
# pymilvus==2.2.1
|
# pymilvus==2.2.1
|
||||||
vcrpy
|
vcrpy
|
||||||
chromadb
|
chromadb==0.3.22
|
||||||
markdown2
|
markdown2
|
||||||
colorama
|
colorama
|
||||||
playsound
|
playsound
|
||||||
distro
|
distro
|
||||||
pypdf
|
pypdf
|
||||||
|
weaviate-client
|
||||||
|
|
||||||
# Testing dependencies
|
# Testing dependencies
|
||||||
pytest
|
pytest
|
||||||
|
Loading…
Reference in New Issue
Block a user