From 365319a86c218f9518d3edb17a23e86cc3cfb988 Mon Sep 17 00:00:00 2001
From: chenketing
Date: Wed, 10 May 2023 20:58:35 +0800
Subject: [PATCH] embedding

---
 .gitignore                                 |   1 +
 pilot/__init__.py                          |  12 ++
 pilot/source_embedding/Text2Vectors.py     |  29 +++++
 pilot/source_embedding/__init__.py         |  12 ++
 pilot/source_embedding/chroma_test.py      |  14 +++
 pilot/source_embedding/pdf_embedding.py    |  57 +++++++++
 pilot/source_embedding/search_milvus.py    |  53 ++++++++
 pilot/source_embedding/source_embedding.py | 112 ++++++++++++++++++
 pilot/source_embedding/text_to_vector.py   |  36 ++++++
 pilot/source_embedding/url_embedding.py    | 110 +++++++++++++++++
 10 files changed, 436 insertions(+)
 create mode 100644 pilot/source_embedding/Text2Vectors.py
 create mode 100644 pilot/source_embedding/__init__.py
 create mode 100644 pilot/source_embedding/chroma_test.py
 create mode 100644 pilot/source_embedding/pdf_embedding.py
 create mode 100644 pilot/source_embedding/search_milvus.py
 create mode 100644 pilot/source_embedding/source_embedding.py
 create mode 100644 pilot/source_embedding/text_to_vector.py
 create mode 100644 pilot/source_embedding/url_embedding.py

diff --git a/.gitignore b/.gitignore
index 07be74f26..1df2f6556 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@ __pycache__/
 # C extensions
 *.so
 
+.idea
 .vscode
 # Distribution / packaging
 .Python
diff --git a/pilot/__init__.py b/pilot/__init__.py
--- a/pilot/__init__.py
+++ b/pilot/__init__.py
@@ -1 +1,13 @@
 __version__ = "0.0.1"
+
+from pilot.source_embedding import SourceEmbedding, register
+from pilot.source_embedding import TextToVector
+from pilot.source_embedding import Text2Vectors
+
+
+__all__ = [
+    "SourceEmbedding",
+    "TextToVector",
+    "Text2Vectors",
+    "register",
+]
diff --git a/pilot/source_embedding/Text2Vectors.py b/pilot/source_embedding/Text2Vectors.py
new file mode 100644
--- /dev/null
+++ b/pilot/source_embedding/Text2Vectors.py
@@ -0,0 +1,29 @@
+"""LangChain ``Embeddings`` adapter backed by a HuggingFace model."""
+from typing import List
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+import torch
+
+from langchain.embeddings.base import Embeddings
+
+from pilot.source_embedding.text_to_vector import DEFAULT_MODEL_NAME, load_embeddings
+
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+class Text2Vectors(Embeddings):
+    """Embed documents and queries with a shared HuggingFace model."""
+
+    def __init__(self, model_name: str = DEFAULT_MODEL_NAME):
+        """Store the checkpoint to use; it is loaded lazily on first use."""
+        self.model_name = model_name
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Embed search docs, returning one vector per text."""
+        return load_embeddings(self.model_name).embed_documents(list(texts))
+
+    def embed_query(self, text: str) -> List[float]:
+        """Embed a single query text."""
+        # Do not route a bare string through embed_documents(): it would be
+        # treated as a sequence of one-character texts.
+        return load_embeddings(self.model_name).embed_query(text)
diff --git a/pilot/source_embedding/__init__.py b/pilot/source_embedding/__init__.py
new file mode 100644
index 000000000..a44cea0a5
--- /dev/null
+++ b/pilot/source_embedding/__init__.py
@@ -0,0 +1,12 @@
+from pilot.source_embedding.source_embedding import SourceEmbedding
+from pilot.source_embedding.source_embedding import register
+from pilot.source_embedding.text_to_vector import TextToVector
+from pilot.source_embedding.Text2Vectors import Text2Vectors
+
+
+__all__ = [
+    "SourceEmbedding",
+    "TextToVector",
+    "Text2Vectors",
+    "register"
+]
\ No newline at end of file
diff --git a/pilot/source_embedding/chroma_test.py b/pilot/source_embedding/chroma_test.py
new file mode 100644
index 000000000..d250f4dde
--- /dev/null
+++ b/pilot/source_embedding/chroma_test.py
@@ -0,0 +1,14 @@
+from langchain.document_loaders import UnstructuredFileLoader
+from langchain.text_splitter import CharacterTextSplitter
+
+from pilot import TextToVector
+
+path="/Users/chenketing/Downloads/OceanBase-数据库-V4.1.0-OceanBase-介绍.pdf"
+
+
+loader = UnstructuredFileLoader(path)
+text_splitor = CharacterTextSplitter()
+docs = loader.load_and_split(text_splitor)
+
+
+# doc["vector"] = TextToVector.textToVector(doc["content"])[0]
diff --git a/pilot/source_embedding/pdf_embedding.py b/pilot/source_embedding/pdf_embedding.py
new file mode 100644
--- /dev/null
+++ b/pilot/source_embedding/pdf_embedding.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import json
+import os
+
+from bs4 import BeautifulSoup
+from langchain.document_loaders import UnstructuredFileLoader, UnstructuredPDFLoader
+from langchain.vectorstores import Milvus, Chroma
+from pymilvus import connections
+
+from pilot.server.vicuna_server import embeddings
+from pilot.source_embedding.text_to_vector import TextToVector
+# from vector_store import ESVectorStore
+
+from pilot.source_embedding import SourceEmbedding, register
+
+
+class PDFEmbedding(SourceEmbedding):
+    """Embedding pipeline that reads a PDF file and indexes it in Chroma."""
+
+    def __init__(self, file_path, model_name, vector_store_config):
+        """Initialize with the PDF path, model_name and vector_store_config."""
+        super().__init__(file_path, model_name, vector_store_config)
+        self.file_path = file_path
+
+    @register
+    def read(self):
+        """Load every element of the PDF at ``self.file_path``."""
+        # loader = UnstructuredFileLoader(self.file_path)
+        loader = UnstructuredPDFLoader(self.file_path, mode="elements")
+        # Keep the whole list: the later steps iterate over the documents,
+        # and returning only element [0] dropped the rest of the file.
+        return loader.load()
+
+    @register
+    def text_to_vector(self, docs):
+        """Return ``docs`` unchanged; vectors are computed while indexing.
+
+        ``Chroma.from_documents`` embeds the documents itself, and the loader's
+        ``Document`` objects do not support the ``doc["vector"] = ...`` item
+        assignment performed by the base class.
+        """
+        return docs
+
+    @register
+    def index_to_store(self, docs):
+        """Index ``docs`` into a Chroma vector store and return the store."""
+
+        # vector_db = Milvus.add_texts(
+        #     docs,
+        #     embeddings,
+        #     connection_args={"host": "127.0.0.1", "port": "19530"},
+        # )
+        # NOTE(review): assumes ``embeddings`` from vicuna_server is a LangChain
+        # Embeddings object - confirm. Build the store once; a second
+        # from_documents() call would embed every document again.
+        return Chroma.from_documents(docs, embeddings)
diff --git a/pilot/source_embedding/search_milvus.py b/pilot/source_embedding/search_milvus.py
new file mode 100644
index 000000000..181ca630d
--- /dev/null
+++ b/pilot/source_embedding/search_milvus.py
@@ -0,0 +1,53 @@
+from langchain.vectorstores import Milvus
+from pymilvus import Collection,utility
+from pymilvus import connections, DataType, FieldSchema, CollectionSchema
+from pilot.source_embedding.Text2Vectors import Text2Vectors
+
+# milvus = connections.connect(
+#     alias="default",
+#     host='localhost',
+#     port="19530"
+# )
+# collection = Collection("book")
+
+
+# Get an existing collection.
+# collection.load()
+#
+# search_params = {"metric_type": "L2", "params": {}, "offset": 5}
+#
+# results = collection.search(
+#     data=[[0.1, 0.2]],
+#     anns_field="book_intro",
+#     param=search_params,
+#     limit=10,
+#     expr=None,
+#     output_fields=['book_id'],
+#     consistency_level="Strong"
+# )
+#
+# # get the IDs of all returned hits
+# results[0].ids
+#
+# # get the distances to the query vector from all returned hits
+# results[0].distances
+#
+# # get the value of an output field specified in the search request.
+# # vector fields are not supported yet.
+# hit = results[0][0]
+# hit.entity.get('title')
+
+milvus = connections.connect(
+    alias="default",
+    host='localhost',
+    port="19530"
+)
+data = ["aaa", "bbb"]
+text_embeddings = Text2Vectors()
+mivuls = Milvus(collection_name='document', embedding_function= text_embeddings, connection_args={"host": "127.0.0.1", "port": "19530", "alias":"default"}, text_field="")
+
+mivuls.from_texts(texts=data, embedding=text_embeddings)
+#     docs,
+#     embedding=embeddings,
+#     connection_args={"host": "127.0.0.1", "port": "19530", "alias": "default"}
+# )
\ No newline at end of file
diff --git a/pilot/source_embedding/source_embedding.py b/pilot/source_embedding/source_embedding.py
new file mode 100644
--- /dev/null
+++ b/pilot/source_embedding/source_embedding.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from abc import ABC, abstractmethod
+
+from pymilvus import connections, FieldSchema, DataType, CollectionSchema
+
+from pilot.source_embedding.text_to_vector import TextToVector
+
+from typing import List
+
+
+registered_methods = []
+
+
+def register(method):
+    """Record ``method.__name__`` as a pipeline step; return it unchanged."""
+    registered_methods.append(method.__name__)
+    return method
+
+
+class SourceEmbedding(ABC):
+    """Base class for the data source embedding pipeline.
+
+    The pipeline covers data read, data process, data split, data to vector
+    and indexing the vectors into a vector store. Implementations should
+    override the steps they need.
+    """
+
+    def __init__(self, yuque_path, model_name, vector_store_config):
+        """Initialize with YuqueLoader url, model_name, vector_store_config"""
+        self.yuque_path = yuque_path
+        self.model_name = model_name
+        self.vector_store_config = vector_store_config
+
+    # Sub-classes should implement this method
+    # as return list(self.lazy_load()).
+    # This method returns a List which is materialized in memory.
+    @abstractmethod
+    @register
+    def read(self) -> List[ABC]:
+        """read datasource into document objects."""
+
+    @register
+    def data_process(self, text):
+        """pre process data."""
+
+    @register
+    def text_split(self, text):
+        """text split chunk"""
+
+    @register
+    def text_to_vector(self, docs):
+        """Add a ``"vector"`` entry embedding each doc's ``"content"``."""
+        for doc in docs:
+            doc["vector"] = TextToVector.textToVector(doc["content"])[0]
+        return docs
+
+    @register
+    def index_to_store(self, docs=None):
+        """Connect to Milvus and build the collection schema.
+
+        ``docs`` is accepted because ``source_embedding`` passes the pipeline
+        data to this step; this base implementation does not insert it.
+
+        Returns:
+            The ``CollectionSchema`` with a ``doc_id`` primary key and a
+            ``doc_vector`` field of ``vector_store_config["dim"]`` dimensions.
+        """
+        connections.connect(
+            alias="default",
+            host='localhost',
+            port="19530"
+        )
+        doc_id = FieldSchema(
+            name="doc_id",
+            dtype=DataType.INT64,
+            is_primary=True,
+        )
+        doc_vector = FieldSchema(
+            name="doc_vector",
+            dtype=DataType.FLOAT_VECTOR,
+            dim=self.vector_store_config["dim"]
+        )
+        return CollectionSchema(
+            fields=[doc_id, doc_vector],
+            description=self.vector_store_config["description"]
+        )
+
+    def source_embedding(self):
+        """Run the registered pipeline steps in order.
+
+        A step that returns ``None`` leaves the current data unchanged, so the
+        no-op base implementations do not discard what ``read`` produced.
+
+        Returns:
+            The result of ``index_to_store``, or the pipeline data when that
+            step is not registered.
+        """
+        data = None
+        if 'read' in registered_methods:
+            data = self.read()
+        # 'data_process' is the registered name; the previous check looked
+        # for 'process', which never matched and has no method behind it.
+        for step in ('data_process', 'text_split', 'text_to_vector'):
+            if step in registered_methods:
+                result = getattr(self, step)(data)
+                if result is not None:
+                    data = result
+        if 'index_to_store' in registered_methods:
+            return self.index_to_store(data)
+        return data
diff --git a/pilot/source_embedding/text_to_vector.py b/pilot/source_embedding/text_to_vector.py
new file mode 100644
--- /dev/null
+++ b/pilot/source_embedding/text_to_vector.py
@@ -0,0 +1,36 @@
+"""Turn text into embedding vectors with a HuggingFace model."""
+import os
+from functools import lru_cache
+
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+import torch
+
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Checkpoint loaded by default. Set EMBEDDING_MODEL_PATH to use another copy;
+# the fallback is the path this module originally hard-coded.
+DEFAULT_MODEL_NAME = os.environ.get(
+    "EMBEDDING_MODEL_PATH",
+    "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2",
+)
+
+
+@lru_cache(maxsize=None)
+def load_embeddings(model_name=DEFAULT_MODEL_NAME):
+    """Load ``model_name`` once and reuse the instance on later calls."""
+    return HuggingFaceEmbeddings(model_name=model_name)
+
+
+class TextToVector:
+    """Static helpers that embed one text or a list of texts."""
+
+    @staticmethod
+    def textToVector(text):
+        """Return a one-element list holding the vector for ``text``."""
+        return load_embeddings(DEFAULT_MODEL_NAME).embed_documents([text])
+
+    @staticmethod
+    def textlist_to_vector(textlist):
+        """Return one embedding vector per entry of ``textlist``."""
+        return load_embeddings(DEFAULT_MODEL_NAME).embed_documents(textlist)
diff --git a/pilot/source_embedding/url_embedding.py b/pilot/source_embedding/url_embedding.py
new file mode 100644
--- /dev/null
+++ b/pilot/source_embedding/url_embedding.py
@@ -0,0 +1,110 @@
+from random import random
+
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.vectorstores import Milvus
+from langchain.document_loaders import WebBaseLoader
+from langchain.text_splitter import CharacterTextSplitter
+from pymilvus import connections, DataType, FieldSchema, CollectionSchema
+from pymilvus import Collection
+
+
+
+from pilot.source_embedding.text_to_vector import TextToVector
+
+
+loader = WebBaseLoader([
+    "https://milvus.io/docs/overview.md",
+])
+
+docs = loader.load()
+
+# Split the documents into smaller chunks
+# text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
+# docs = text_splitter.split_documents(docs)
+
+embeddings = TextToVector.textToVector(docs[0].page_content)
+
+milvus = connections.connect(
+    alias="default",
+    host='localhost',
+    port="19530"
+)
+
+# collection = Collection("test_book")
+
+
+
+# data = [{"doc_id": 11011, "content": 11011, "title": 11011, "vector": embeddings[0]}]
+# # collection = Collection("document")
+#
+# # collection.insert(data=data)
+# entities = [
+#     {
+#         'doc_id': d['doc_id'],
+#         'vector': d['vector'],
+#         'content': d['content'],
+#         'title': d['titlseae'],
+#         "type": DataType.FLOAT_VECTOR
+#     } for d in data
+# ]
+#
+# milvus.insert(collection_name="document", entities=entities)
+# print("success")
+# Define the fields of the collection
+# fields = [
+#     FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR),
+#     FieldSchema(name="age", dtype=DataType.INT32),
+#     FieldSchema(name="gender", dtype=DataType.STRING),
+#     FieldSchema(name="id", dtype=DataType.INT64)  # add the primary key field
+# ]
+
+# The schema must be defined before Collection() below can reference it.
+book_id = FieldSchema(
+    name="book_id",
+    dtype=DataType.INT64,
+    is_primary=True,
+)
+# book_name holds strings ("test1", ...), so it is a VARCHAR, not a vector.
+book_name = FieldSchema(
+    name="book_name",
+    dtype=DataType.VARCHAR,
+    max_length=200,
+)
+word_count = FieldSchema(
+    name="word_count",
+    dtype=DataType.INT64,
+)
+book_intro = FieldSchema(
+    name="book_intro",
+    dtype=DataType.FLOAT_VECTOR,
+    dim=2
+)
+schema = CollectionSchema(
+    fields=[book_id, book_name, word_count, book_intro],
+    description="Test book search"
+)
+collection_name = "test_book"
+
+collection = Collection(
+    name=collection_name,
+    schema=schema,
+    using='default',
+    shards_num=2
+    )
+# Insert data
+# entities = [[
+#     {"book_id": 30, "book_intro": [0.1, 0.2], "word_count": 1},
+#     {"book_id": 25, "book_intro": [0.1, 0.2], "word_count": 2},
+#     {"book_id": 40, "book_intro": [0.1, 0.2], "word_count": 3}
+# ]]
+
+entities = [[30, 25, 40], ["test1", "test2", "test3"], [1, 2, 3], [[0.1, 0.2], [0.1, 0.2], [0.1, 0.2]]]
+
+collection.insert(entities)
+print("success")
+
+# vector_store = Milvus.from_documents(
+#     docs,
+#     embedding=embeddings,
+#     connection_args={"host": "127.0.0.1", "port": "19530", "alias": "default"}
+# )