feature:add milvus store

This commit is contained in:
aries-ckt
2023-05-21 16:29:00 +08:00
parent 98d50b1b98
commit 6747d877cc
7 changed files with 277 additions and 58 deletions

View File

@@ -1,7 +1,7 @@
import os
from bs4 import BeautifulSoup
from langchain.document_loaders import PyPDFLoader, TextLoader, markdown
from langchain.document_loaders import TextLoader, markdown
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from pilot.configs.model_config import DATASETS_DIR, KNOWLEDGE_CHUNK_SPLIT_SIZE
@@ -12,6 +12,7 @@ from pilot.source_embedding.pdf_embedding import PDFEmbedding
import markdown
from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
from pilot.vector_store.milvus_store import MilvusStore
class KnowledgeEmbedding:
@@ -20,7 +21,7 @@ class KnowledgeEmbedding:
self.file_path = file_path
self.model_name = model_name
self.vector_store_config = vector_store_config
self.vector_store_type = "default"
self.file_type = "default"
self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
self.local_persist = local_persist
if not self.local_persist:
@@ -42,7 +43,7 @@ class KnowledgeEmbedding:
elif self.file_path.endswith(".csv"):
embedding = CSVEmbedding(file_path=self.file_path, model_name=self.model_name,
vector_store_config=self.vector_store_config)
elif self.vector_store_type == "default":
elif self.file_type == "default":
embedding = MarkdownEmbedding(file_path=self.file_path, model_name=self.model_name, vector_store_config=self.vector_store_config)
return embedding
@@ -52,25 +53,33 @@ class KnowledgeEmbedding:
def knowledge_persist_initialization(self, append_mode):
vector_name = self.vector_store_config["vector_store_name"]
persist_dir = os.path.join(self.vector_store_config["vector_store_path"], vector_name + ".vectordb")
print("vector db path: ", persist_dir)
if os.path.exists(persist_dir):
if append_mode:
print("append knowledge return vector store")
new_documents = self._load_knownlege(self.file_path)
vector_store = Chroma.from_documents(documents=new_documents,
documents = self._load_knownlege(self.file_path)
if self.vector_store_config["vector_store_type"] == "Chroma":
persist_dir = os.path.join(self.vector_store_config["vector_store_path"], vector_name + ".vectordb")
print("vector db path: ", persist_dir)
if os.path.exists(persist_dir):
if append_mode:
print("append knowledge return vector store")
new_documents = self._load_knownlege(self.file_path)
vector_store = Chroma.from_documents(documents=new_documents,
embedding=self.embeddings,
persist_directory=persist_dir)
else:
print("directly return vector store")
vector_store = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
else:
print(vector_name + " is new vector store, knowledge begin load...")
vector_store = Chroma.from_documents(documents=documents,
embedding=self.embeddings,
persist_directory=persist_dir)
else:
print("directly return vector store")
vector_store = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
else:
print(vector_name + " is new vector store, knowledge begin load...")
documents = self._load_knownlege(self.file_path)
vector_store = Chroma.from_documents(documents=documents,
embedding=self.embeddings,
persist_directory=persist_dir)
vector_store.persist()
vector_store.persist()
elif self.vector_store_config["vector_store_type"] == "milvus":
vector_store = MilvusStore({"url": self.vector_store_config["url"],
"port": self.vector_store_config["port"],
"embedding": self.embeddings})
vector_store.init_schema_and_load(vector_name, documents)
return vector_store
def _load_knownlege(self, path):

View File

@@ -5,9 +5,14 @@ from abc import ABC, abstractmethod
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.vectorstores import Milvus
from typing import List, Optional, Dict
from pilot.configs.model_config import VECTOR_STORE_TYPE, VECTOR_STORE_CONFIG
from pilot.vector_store.milvus_store import MilvusStore
registered_methods = []
@@ -29,9 +34,20 @@ class SourceEmbedding(ABC):
self.vector_store_config = vector_store_config
self.embedding_args = embedding_args
self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
persist_dir = os.path.join(self.vector_store_config["vector_store_path"],
self.vector_store_config["vector_store_name"] + ".vectordb")
self.vector_store_client = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
if VECTOR_STORE_TYPE == "milvus":
print(VECTOR_STORE_CONFIG)
if self.vector_store_config.get("text_field") is None:
self.vector_store_client = MilvusStore({"url": VECTOR_STORE_CONFIG["url"],
"port": VECTOR_STORE_CONFIG["port"],
"embedding": self.embeddings})
else:
self.vector_store_client = Milvus(embedding_function=self.embeddings, collection_name=self.vector_store_config["vector_store_name"], text_field="content",
connection_args={"host": VECTOR_STORE_CONFIG["url"], "port": VECTOR_STORE_CONFIG["port"]})
else:
persist_dir = os.path.join(self.vector_store_config["vector_store_path"],
self.vector_store_config["vector_store_name"] + ".vectordb")
self.vector_store_client = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
@abstractmethod
@register
@@ -54,10 +70,18 @@ class SourceEmbedding(ABC):
@register
def index_to_store(self, docs):
"""index to vector store"""
persist_dir = os.path.join(self.vector_store_config["vector_store_path"],
self.vector_store_config["vector_store_name"] + ".vectordb")
self.vector_store = Chroma.from_documents(docs, self.embeddings, persist_directory=persist_dir)
self.vector_store.persist()
if VECTOR_STORE_TYPE == "chroma":
persist_dir = os.path.join(self.vector_store_config["vector_store_path"],
self.vector_store_config["vector_store_name"] + ".vectordb")
self.vector_store = Chroma.from_documents(docs, self.embeddings, persist_directory=persist_dir)
self.vector_store.persist()
elif VECTOR_STORE_TYPE == "milvus":
self.vector_store = MilvusStore({"url": VECTOR_STORE_CONFIG["url"],
"port": VECTOR_STORE_CONFIG["port"],
"embedding": self.embeddings})
self.vector_store.init_schema_and_load(self.vector_store_config["vector_store_name"], docs)
@register
def similar_search(self, doc, topk):