mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-09-17 07:00:15 +00:00
feature:vector store connector
This commit is contained in:
@@ -3,8 +3,7 @@ import os
|
||||
from bs4 import BeautifulSoup
|
||||
from langchain.document_loaders import TextLoader, markdown
|
||||
from langchain.embeddings import HuggingFaceEmbeddings
|
||||
from langchain.vectorstores import Chroma
|
||||
from pilot.configs.model_config import DATASETS_DIR, KNOWLEDGE_CHUNK_SPLIT_SIZE
|
||||
from pilot.configs.model_config import DATASETS_DIR, KNOWLEDGE_CHUNK_SPLIT_SIZE, VECTOR_STORE_TYPE
|
||||
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
|
||||
from pilot.source_embedding.csv_embedding import CSVEmbedding
|
||||
from pilot.source_embedding.markdown_embedding import MarkdownEmbedding
|
||||
@@ -12,7 +11,7 @@ from pilot.source_embedding.pdf_embedding import PDFEmbedding
|
||||
import markdown
|
||||
|
||||
from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
|
||||
from pilot.vector_store.milvus_store import MilvusStore
|
||||
from pilot.vector_store.connector import VectorStoreConnector
|
||||
|
||||
|
||||
class KnowledgeEmbedding:
|
||||
@@ -23,6 +22,7 @@ class KnowledgeEmbedding:
|
||||
self.vector_store_config = vector_store_config
|
||||
self.file_type = "default"
|
||||
self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
|
||||
self.vector_store_config["embeddings"] = self.embeddings
|
||||
self.local_persist = local_persist
|
||||
if not self.local_persist:
|
||||
self.knowledge_embedding_client = self.init_knowledge_embedding()
|
||||
@@ -52,35 +52,10 @@ class KnowledgeEmbedding:
|
||||
return self.knowledge_embedding_client.similar_search(text, topk)
|
||||
|
||||
def knowledge_persist_initialization(self, append_mode):
|
||||
vector_name = self.vector_store_config["vector_store_name"]
|
||||
documents = self._load_knownlege(self.file_path)
|
||||
if self.vector_store_config["vector_store_type"] == "Chroma":
|
||||
persist_dir = os.path.join(self.vector_store_config["vector_store_path"], vector_name + ".vectordb")
|
||||
print("vector db path: ", persist_dir)
|
||||
if os.path.exists(persist_dir):
|
||||
if append_mode:
|
||||
print("append knowledge return vector store")
|
||||
new_documents = self._load_knownlege(self.file_path)
|
||||
vector_store = Chroma.from_documents(documents=new_documents,
|
||||
embedding=self.embeddings,
|
||||
persist_directory=persist_dir)
|
||||
else:
|
||||
print("directly return vector store")
|
||||
vector_store = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
|
||||
else:
|
||||
print(vector_name + " is new vector store, knowledge begin load...")
|
||||
vector_store = Chroma.from_documents(documents=documents,
|
||||
embedding=self.embeddings,
|
||||
persist_directory=persist_dir)
|
||||
vector_store.persist()
|
||||
|
||||
elif self.vector_store_config["vector_store_type"] == "milvus":
|
||||
vector_store = MilvusStore({"url": self.vector_store_config["url"],
|
||||
"port": self.vector_store_config["port"],
|
||||
"embedding": self.embeddings})
|
||||
vector_store.init_schema_and_load(vector_name, documents)
|
||||
|
||||
return vector_store
|
||||
self.vector_client = VectorStoreConnector(VECTOR_STORE_TYPE, self.vector_store_config)
|
||||
self.vector_client.load_document(documents)
|
||||
return self.vector_client
|
||||
|
||||
def _load_knownlege(self, path):
|
||||
docments = []
|
||||
|
@@ -1,17 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from langchain.embeddings import HuggingFaceEmbeddings
|
||||
from langchain.vectorstores import Chroma
|
||||
from langchain.vectorstores import Milvus
|
||||
|
||||
from typing import List, Optional, Dict
|
||||
|
||||
|
||||
from pilot.configs.model_config import VECTOR_STORE_TYPE, VECTOR_STORE_CONFIG
|
||||
from pilot.vector_store.milvus_store import MilvusStore
|
||||
from pilot.configs.model_config import VECTOR_STORE_TYPE
|
||||
from pilot.vector_store.connector import VectorStoreConnector
|
||||
|
||||
registered_methods = []
|
||||
|
||||
@@ -35,19 +29,8 @@ class SourceEmbedding(ABC):
|
||||
self.embedding_args = embedding_args
|
||||
self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
|
||||
|
||||
if VECTOR_STORE_TYPE == "milvus":
|
||||
print(VECTOR_STORE_CONFIG)
|
||||
if self.vector_store_config.get("text_field") is None:
|
||||
self.vector_store_client = MilvusStore({"url": VECTOR_STORE_CONFIG["url"],
|
||||
"port": VECTOR_STORE_CONFIG["port"],
|
||||
"embedding": self.embeddings})
|
||||
else:
|
||||
self.vector_store_client = Milvus(embedding_function=self.embeddings, collection_name=self.vector_store_config["vector_store_name"], text_field="content",
|
||||
connection_args={"host": VECTOR_STORE_CONFIG["url"], "port": VECTOR_STORE_CONFIG["port"]})
|
||||
else:
|
||||
persist_dir = os.path.join(self.vector_store_config["vector_store_path"],
|
||||
self.vector_store_config["vector_store_name"] + ".vectordb")
|
||||
self.vector_store_client = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
|
||||
vector_store_config["embeddings"] = self.embeddings
|
||||
self.vector_client = VectorStoreConnector(VECTOR_STORE_TYPE, vector_store_config)
|
||||
|
||||
@abstractmethod
|
||||
@register
|
||||
@@ -70,24 +53,12 @@ class SourceEmbedding(ABC):
|
||||
@register
|
||||
def index_to_store(self, docs):
|
||||
"""index to vector store"""
|
||||
|
||||
if VECTOR_STORE_TYPE == "chroma":
|
||||
persist_dir = os.path.join(self.vector_store_config["vector_store_path"],
|
||||
self.vector_store_config["vector_store_name"] + ".vectordb")
|
||||
self.vector_store = Chroma.from_documents(docs, self.embeddings, persist_directory=persist_dir)
|
||||
self.vector_store.persist()
|
||||
|
||||
elif VECTOR_STORE_TYPE == "milvus":
|
||||
self.vector_store = MilvusStore({"url": VECTOR_STORE_CONFIG["url"],
|
||||
"port": VECTOR_STORE_CONFIG["port"],
|
||||
"embedding": self.embeddings})
|
||||
self.vector_store.init_schema_and_load(self.vector_store_config["vector_store_name"], docs)
|
||||
self.vector_client.load_document(docs)
|
||||
|
||||
@register
|
||||
def similar_search(self, doc, topk):
|
||||
"""vector store similarity_search"""
|
||||
|
||||
return self.vector_store_client.similarity_search(doc, topk)
|
||||
return self.vector_client.similar_search(doc, topk)
|
||||
|
||||
def source_embedding(self):
|
||||
if 'read' in registered_methods:
|
||||
|
Reference in New Issue
Block a user