diff --git a/pilot/source_embedding/source_embedding.py b/pilot/source_embedding/source_embedding.py
index acbf82a73..94e48e79e 100644
--- a/pilot/source_embedding/source_embedding.py
+++ b/pilot/source_embedding/source_embedding.py
@@ -2,9 +2,6 @@
 # -*- coding: utf-8 -*-
 from abc import ABC, abstractmethod
 from typing import Dict, List, Optional
-
-from langchain.embeddings import HuggingFaceEmbeddings
-
 from pilot.configs.config import Config
 from pilot.vector_store.connector import VectorStoreConnector
@@ -35,9 +32,7 @@ class SourceEmbedding(ABC):
         self.model_name = model_name
         self.vector_store_config = vector_store_config
         self.embedding_args = embedding_args
-        self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
-
-        vector_store_config["embeddings"] = self.embeddings
+        self.embeddings = vector_store_config["embeddings"]
         self.vector_client = VectorStoreConnector(
             CFG.VECTOR_STORE_TYPE, vector_store_config
         )
diff --git a/pilot/vector_store/milvus_store.py b/pilot/vector_store/milvus_store.py
index dea33e61d..e50f367f3 100644
--- a/pilot/vector_store/milvus_store.py
+++ b/pilot/vector_store/milvus_store.py
@@ -6,6 +6,7 @@ from pymilvus import Collection, DataType, connections, utility
 from pilot.configs.config import Config
 from pilot.vector_store.vector_store_base import VectorStoreBase
+
 CFG = Config()
@@ -107,6 +108,7 @@ class MilvusStore(VectorStoreBase):
            self.col = Collection(
                self.collection_name, using=self.alias
            )
+           self.fields = []
            for x in self.col.schema.fields:
                self.fields.append(x.name)
                if x.auto_id:
@@ -131,7 +133,7 @@ class MilvusStore(VectorStoreBase):
                max_length = max(max_length, len(y))
            # Create the text field
            fields.append(
-               FieldSchema(text_field, DataType.VARCHAR, max_length=max_length + 1)
+               FieldSchema(text_field, DataType.VARCHAR, max_length=max_length + 100)
            )
            # primary key field
            fields.append(
@@ -248,7 +250,11 @@ class MilvusStore(VectorStoreBase):
     def load_document(self, documents) -> None:
         """load document in vector database."""
-        self.init_schema_and_load(self.collection_name, documents)
+        batch_size = 500
+        batched_list = [documents[i:i + batch_size] for i in range(0, len(documents), batch_size)]
+        # docs = []
+        for doc_batch in batched_list:
+            self.init_schema_and_load(self.collection_name, doc_batch)
 
     def similar_search(self, text, topk) -> None:
         """similar_search in vector database."""
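
Note on the source_embedding.py hunk: SourceEmbedding no longer builds its own HuggingFaceEmbeddings; the caller is now expected to inject an embeddings object through vector_store_config["embeddings"]. A minimal sketch of the new wiring, assuming a HuggingFaceEmbeddings instance and an illustrative model name (only the "embeddings" key is dictated by this diff):

    # Sketch only: the caller owns embedding construction and injects it via the config dict.
    from langchain.embeddings import HuggingFaceEmbeddings

    # The concrete model name below is an assumption for illustration.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store_config = {"embeddings": embeddings}  # read by SourceEmbedding.__init__ after this change

    # A concrete SourceEmbedding subclass constructed with this config shares the single
    # embeddings instance with the store selected by CFG.VECTOR_STORE_TYPE (e.g. MilvusStore),
    # instead of instantiating HuggingFaceEmbeddings once per embedding source.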