diff --git a/docs/modules/vector.rst b/docs/modules/vector.rst new file mode 100644 index 000000000..d425a027d --- /dev/null +++ b/docs/modules/vector.rst @@ -0,0 +1,23 @@ +VectorConnector +--------------- + +**VectorConnector Introduction** + +A vector knowledge base is a method of mapping words in a language to a high-dimensional vector space. In the vector space, each word is represented as a vector that contains many numerical features, which represent the relationship between the word and other words. This mapping is a clustering technique, and the semantic relationship between words can be calculated by computing the differences between their vectors in the vector space. Vector knowledge bases can be used for natural language processing tasks such as sentiment analysis, text classification, and machine translation. Common vector knowledge bases include Word2Vec, GloVe, and FastText. The training of these vector knowledge bases usually requires a large corpus and significant computing resources to complete. + +VectorConnector is a vector database connection adapter that allows you to connect to different vector databases and abstracts away the implementation differences and underlying details of the different vector stores. For example, it can be used to connect to databases such as Milvus, Chroma, Elasticsearch, and Weaviate. + +DB-GPT VectorConnector currently supports the Milvus and Chroma vector databases: + +- `chroma <./vector/chroma/chroma.html>`_: the supported Chroma vector database. +- `milvus <./vector/milvus/milvus.html>`_: the supported Milvus vector database. + + +.. 
toctree:: + :maxdepth: 2 + :caption: VectorConnector + :name: chroma + :hidden: + + ./vector/chroma/chroma.md + ./vector/milvus/milvus.md \ No newline at end of file diff --git a/docs/modules/vector/chroma/chroma.md b/docs/modules/vector/chroma/chroma.md new file mode 100644 index 000000000..7c9f41eea --- /dev/null +++ b/docs/modules/vector/chroma/chroma.md @@ -0,0 +1,50 @@ +ChromaStore +================================== +ChromaStore is one implementation of the Chroma vector database in VectorConnector. + +It inherits from VectorStoreBase and implements similar_search(), vector_name_exists(), and load_document(). +``` +class ChromaStore(VectorStoreBase): + """chroma database""" + + def __init__(self, ctx: {}) -> None: + self.ctx = ctx + self.embeddings = ctx["embeddings"] + self.persist_dir = os.path.join( + KNOWLEDGE_UPLOAD_ROOT_PATH, ctx["vector_store_name"] + ".vectordb" + ) + self.vector_store_client = Chroma( + persist_directory=self.persist_dir, embedding_function=self.embeddings + ) +``` + +similar_search() + +``` + def similar_search(self, text, topk) -> None: + logger.info("ChromaStore similar search") + return self.vector_store_client.similarity_search(text, topk) + +``` + +vector_name_exists() + +``` + def vector_name_exists(self): + return ( + os.path.exists(self.persist_dir) and len(os.listdir(self.persist_dir)) > 0 + ) + +``` + +load_document() + +``` + def load_document(self, documents): + logger.info("ChromaStore load document") + texts = [doc.page_content for doc in documents] + metadatas = [doc.metadata for doc in documents] + self.vector_store_client.add_texts(texts=texts, metadatas=metadatas) + self.vector_store_client.persist() +``` + diff --git a/docs/modules/vector/milvus/milvus.md b/docs/modules/vector/milvus/milvus.md new file mode 100644 index 000000000..9cb89314c --- /dev/null +++ b/docs/modules/vector/milvus/milvus.md @@ -0,0 +1,76 @@ +MilvusStore +================================== +MilvusStore is one implementation of the Milvus vector 
database in VectorConnector. + +[Tutorial on how to create a Milvus instance](https://milvus.io/docs/install_standalone-docker.md) + +It inherits from VectorStoreBase and implements similar_search(), vector_name_exists(), and load_document(). +``` +class MilvusStore(VectorStoreBase): + """Milvus database""" + + def __init__(self, ctx: {}) -> None: + """init a milvus storage connection. + + Args: + ctx ({}): MilvusStore global config. + """ + # self.configure(cfg) + + connect_kwargs = {} + self.uri = CFG.MILVUS_URL + self.port = CFG.MILVUS_PORT + self.username = CFG.MILVUS_USERNAME + self.password = CFG.MILVUS_PASSWORD + self.collection_name = ctx.get("vector_store_name", None) + self.secure = ctx.get("secure", None) + self.embedding = ctx.get("embeddings", None) + self.fields = [] + self.alias = "default" + ) +``` + +similar_search() + +``` + def similar_search(self, text, topk) -> None: + """similar_search in vector database.""" + self.col = Collection(self.collection_name) + schema = self.col.schema + for x in schema.fields: + self.fields.append(x.name) + if x.auto_id: + self.fields.remove(x.name) + if x.is_primary: + self.primary_field = x.name + if x.dtype == DataType.FLOAT_VECTOR or x.dtype == DataType.BINARY_VECTOR: + self.vector_field = x.name + _, docs_and_scores = self._search(text, topk) + return [doc for doc, _, _ in docs_and_scores] + +``` + +vector_name_exists() + +``` + def vector_name_exists(self): + """is vector store name exist.""" + return utility.has_collection(self.collection_name) + +``` + +load_document() + +``` + def load_document(self, documents) -> None: + """load document in vector database.""" + # self.init_schema_and_load(self.collection_name, documents) + batch_size = 500 + batched_list = [ + documents[i : i + batch_size] for i in range(0, len(documents), batch_size) + ] + # docs = [] + for doc_batch in batched_list: + self.init_schema_and_load(self.collection_name, doc_batch) +``` +