feature: vector store connector

This commit is contained in:
aries-ckt
2023-05-23 10:50:43 +08:00
parent b70cb8076d
commit 983a00f53a
8 changed files with 209 additions and 128 deletions

View File

@@ -3,8 +3,7 @@ import os
from bs4 import BeautifulSoup
from langchain.document_loaders import TextLoader, markdown
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from pilot.configs.model_config import DATASETS_DIR, KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.configs.model_config import DATASETS_DIR, KNOWLEDGE_CHUNK_SPLIT_SIZE, VECTOR_STORE_TYPE
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
from pilot.source_embedding.csv_embedding import CSVEmbedding
from pilot.source_embedding.markdown_embedding import MarkdownEmbedding
@@ -12,7 +11,7 @@ from pilot.source_embedding.pdf_embedding import PDFEmbedding
import markdown
from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
from pilot.vector_store.milvus_store import MilvusStore
from pilot.vector_store.connector import VectorStoreConnector
class KnowledgeEmbedding:
@@ -23,6 +22,7 @@ class KnowledgeEmbedding:
self.vector_store_config = vector_store_config
self.file_type = "default"
self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
self.vector_store_config["embeddings"] = self.embeddings
self.local_persist = local_persist
if not self.local_persist:
self.knowledge_embedding_client = self.init_knowledge_embedding()
@@ -52,35 +52,10 @@ class KnowledgeEmbedding:
return self.knowledge_embedding_client.similar_search(text, topk)
# Load the documents found at self.file_path and hand them to a vector store,
# returning the store/client object that received them.
# NOTE(review): this span has no indentation and appears to interleave two
# versions of the method (an inline Chroma/Milvus implementation followed by a
# VectorStoreConnector-based one), as a diff rendered without +/- markers
# would. Confirm against the repository which statements are live before
# changing any of them.
def knowledge_persist_initialization(self, append_mode):
vector_name = self.vector_store_config["vector_store_name"]
documents = self._load_knownlege(self.file_path)
# Chroma path: the backend is chosen from the per-instance config dict, and the
# store directory is <vector_store_path>/<vector_store_name>.vectordb.
if self.vector_store_config["vector_store_type"] == "Chroma":
persist_dir = os.path.join(self.vector_store_config["vector_store_path"], vector_name + ".vectordb")
print("vector db path: ", persist_dir)
if os.path.exists(persist_dir):
# Existing directory + append_mode: the same file_path is loaded a second
# time and passed to Chroma.from_documents with the existing directory.
if append_mode:
print("append knowledge return vector store")
new_documents = self._load_knownlege(self.file_path)
vector_store = Chroma.from_documents(documents=new_documents,
embedding=self.embeddings,
persist_directory=persist_dir)
# Existing directory without append_mode: open the store as-is; the
# documents loaded above are not passed to it.
else:
print("directly return vector store")
vector_store = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
# No directory yet: build the store from the loaded documents.
else:
print(vector_name + " is new vector store, knowledge begin load...")
vector_store = Chroma.from_documents(documents=documents,
embedding=self.embeddings,
persist_directory=persist_dir)
# NOTE(review): with indentation lost it is unclear whether persist() runs
# for every Chroma path or only for the newly created store -- confirm.
vector_store.persist()
# Milvus path: connection details are read from the same config dict, then the
# collection named vector_name is initialised and loaded with the documents.
elif self.vector_store_config["vector_store_type"] == "milvus":
vector_store = MilvusStore({"url": self.vector_store_config["url"],
"port": self.vector_store_config["port"],
"embedding": self.embeddings})
vector_store.init_schema_and_load(vector_name, documents)
return vector_store
# Connector-based variant: the backend is chosen by the module-level
# VECTOR_STORE_TYPE rather than the config's "vector_store_type" key; the
# connector is kept on self.vector_client, loaded with the documents, and
# returned.
self.vector_client = VectorStoreConnector(VECTOR_STORE_TYPE, self.vector_store_config)
self.vector_client.load_document(documents)
return self.vector_client
def _load_knownlege(self, path):
docments = []

View File

@@ -1,17 +1,11 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
from abc import ABC, abstractmethod
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.vectorstores import Milvus
from typing import List, Optional, Dict
from pilot.configs.model_config import VECTOR_STORE_TYPE, VECTOR_STORE_CONFIG
from pilot.vector_store.milvus_store import MilvusStore
from pilot.configs.model_config import VECTOR_STORE_TYPE
from pilot.vector_store.connector import VectorStoreConnector
# Module-level list of method names; source_embedding() tests it for
# membership (e.g. 'read'). How it gets populated is not visible in this view
# -- presumably by the `register` decorator; confirm.
registered_methods = []
@@ -35,19 +29,8 @@ class SourceEmbedding(ABC):
self.embedding_args = embedding_args
self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
if VECTOR_STORE_TYPE == "milvus":
print(VECTOR_STORE_CONFIG)
if self.vector_store_config.get("text_field") is None:
self.vector_store_client = MilvusStore({"url": VECTOR_STORE_CONFIG["url"],
"port": VECTOR_STORE_CONFIG["port"],
"embedding": self.embeddings})
else:
self.vector_store_client = Milvus(embedding_function=self.embeddings, collection_name=self.vector_store_config["vector_store_name"], text_field="content",
connection_args={"host": VECTOR_STORE_CONFIG["url"], "port": VECTOR_STORE_CONFIG["port"]})
else:
persist_dir = os.path.join(self.vector_store_config["vector_store_path"],
self.vector_store_config["vector_store_name"] + ".vectordb")
self.vector_store_client = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
vector_store_config["embeddings"] = self.embeddings
self.vector_client = VectorStoreConnector(VECTOR_STORE_TYPE, vector_store_config)
@abstractmethod
@register
@@ -70,24 +53,12 @@ class SourceEmbedding(ABC):
# Write the given documents into the configured vector store.
# NOTE(review): this span has no indentation and appears to interleave two
# versions of the body (inline chroma/milvus branches, then a single
# self.vector_client call), as a diff rendered without +/- markers would.
# Confirm which statements are live before editing.
@register
def index_to_store(self, docs):
"""index to vector store"""
# chroma branch: build a store from docs under
# <vector_store_path>/<vector_store_name>.vectordb and persist it.
if VECTOR_STORE_TYPE == "chroma":
persist_dir = os.path.join(self.vector_store_config["vector_store_path"],
self.vector_store_config["vector_store_name"] + ".vectordb")
self.vector_store = Chroma.from_documents(docs, self.embeddings, persist_directory=persist_dir)
self.vector_store.persist()
# milvus branch: connection details come from the module-level
# VECTOR_STORE_CONFIG; the collection name comes from the instance config.
elif VECTOR_STORE_TYPE == "milvus":
self.vector_store = MilvusStore({"url": VECTOR_STORE_CONFIG["url"],
"port": VECTOR_STORE_CONFIG["port"],
"embedding": self.embeddings})
self.vector_store.init_schema_and_load(self.vector_store_config["vector_store_name"], docs)
# Connector-based variant: delegate loading to self.vector_client.
self.vector_client.load_document(docs)
# Run a similarity search for `doc` against the vector store, passing `topk`
# through to the underlying client, and return that client's result.
# NOTE(review): two consecutive return statements follow. Read literally, the
# second is unreachable; more likely they are the old and new line of a diff
# rendered without +/- markers (self.vector_store_client.similarity_search
# versus self.vector_client.similar_search). Confirm which one is live.
@register
def similar_search(self, doc, topk):
"""vector store similarity_search"""
return self.vector_store_client.similarity_search(doc, topk)
return self.vector_client.similar_search(doc, topk)
def source_embedding(self):
if 'read' in registered_methods: