diff --git a/examples/knowledge_embedding/pdf_embedding_test.py b/examples/knowledge_embedding/pdf_embedding_test.py
new file mode 100644
index 000000000..03aa35d23
--- /dev/null
+++ b/examples/knowledge_embedding/pdf_embedding_test.py
@@ -0,0 +1,10 @@
+from pilot.source_embedding.pdf_embedding import PDFEmbedding
+
+path = "xxx.pdf"
+model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"
+vector_store_path = "/pilot/source_embedding/"
+
+
+pdf_embedding = PDFEmbedding(file_path=path, model_name=model_name, vector_store_config={"vector_store_name": "ob", "vector_store_path": vector_store_path})
+pdf_embedding.source_embedding()
+print("success")
\ No newline at end of file
diff --git a/pilot/source_embedding/Text2Vectors.py b/pilot/source_embedding/Text2Vectors.py
deleted file mode 100644
index 99b03bc75..000000000
--- a/pilot/source_embedding/Text2Vectors.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from typing import List
-from langchain.embeddings.huggingface import HuggingFaceEmbeddings
-import torch
-
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-from langchain.embeddings.base import Embeddings
-
-
-
-class Text2Vectors(Embeddings):
-    def embed_documents(self, texts: List[str]) -> List[List[float]]:
-        """Embed search docs."""
-
-    def embed_query(self, text: str) -> List[float]:
-        hfemb = HuggingFaceEmbeddings(model_name="/Users/chenketing/Desktop/project/all-MiniLM-L6-v2")
-        return hfemb.embed_documents(text)[0]
\ No newline at end of file
diff --git a/pilot/source_embedding/chroma_test.py b/pilot/source_embedding/chroma_test.py
deleted file mode 100644
index d250f4dde..000000000
--- a/pilot/source_embedding/chroma_test.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from langchain.document_loaders import UnstructuredFileLoader
-from langchain.text_splitter import CharacterTextSplitter
-
-from pilot import TextToVector
-
-path="/Users/chenketing/Downloads/OceanBase-数据库-V4.1.0-OceanBase-介绍.pdf"
-
-
-loader = UnstructuredFileLoader(path)
-text_splitor = CharacterTextSplitter()
-docs = loader.load_and_split(text_splitor)
-
-
-# doc["vector"] = TextToVector.textToVector(doc["content"])[0]
diff --git a/pilot/source_embedding/search_milvus.py b/pilot/source_embedding/search_milvus.py
index 181ca630d..a7898f183 100644
--- a/pilot/source_embedding/search_milvus.py
+++ b/pilot/source_embedding/search_milvus.py
@@ -1,7 +1,6 @@
 from langchain.vectorstores import Milvus
 from pymilvus import Collection,utility
 from pymilvus import connections, DataType, FieldSchema, CollectionSchema
-from pilot.source_embedding.Text2Vectors import Text2Vectors
 
 # milvus = connections.connect(
 #     alias="default",
@@ -43,7 +42,7 @@ milvus = connections.connect(
     port="19530"
 )
 data = ["aaa", "bbb"]
-text_embeddings = Text2Vectors()
+# text_embeddings = Text2Vectors()
 mivuls = Milvus(collection_name='document', embedding_function= text_embeddings, connection_args={"host": "127.0.0.1", "port": "19530", "alias":"default"}, text_field="")
 mivuls.from_texts(texts=data, embedding=text_embeddings)
 
diff --git a/pilot/source_embedding/text_to_vector.py b/pilot/source_embedding/text_to_vector.py
deleted file mode 100644
index 1f8183f91..000000000
--- a/pilot/source_embedding/text_to_vector.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from langchain.embeddings.huggingface import HuggingFaceEmbeddings
-import torch
-
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-
-class TextToVector:
-
-    @staticmethod
-    def textToVector(text):
-        hfemb = HuggingFaceEmbeddings(model_name="/Users/chenketing/Desktop/project/all-MiniLM-L6-v2")
-        return hfemb.embed_documents([text])
-
-    @staticmethod
-    def textlist_to_vector(textlist):
-        hfemb = HuggingFaceEmbeddings(model_name="/Users/chenketing/Desktop/project/all-MiniLM-L6-v2")
-        return hfemb.embed_documents(textlist)
\ No newline at end of file
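
Note: with Text2Vectors.py and text_to_vector.py deleted and the assignment in
pilot/source_embedding/search_milvus.py commented out, that scratch script now
references an undefined text_embeddings. Below is a minimal sketch (not part of
this diff) of one way to restore it with langchain's HuggingFaceEmbeddings,
which is what both deleted classes delegated to; it assumes the same local
all-MiniLM-L6-v2 path used elsewhere in this change and mirrors the existing
Milvus call in search_milvus.py.

    from langchain.embeddings.huggingface import HuggingFaceEmbeddings
    from langchain.vectorstores import Milvus

    # Assumed local model path, copied from the deleted helpers; swap in any
    # sentence-transformers model available on the target machine.
    model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"
    text_embeddings = HuggingFaceEmbeddings(model_name=model_name)

    data = ["aaa", "bbb"]
    # Same constructor arguments as the existing search_milvus.py scratch code.
    mivuls = Milvus(
        collection_name="document",
        embedding_function=text_embeddings,
        connection_args={"host": "127.0.0.1", "port": "19530", "alias": "default"},
        text_field="",
    )
    mivuls.from_texts(texts=data, embedding=text_embeddings)

Using HuggingFaceEmbeddings directly keeps the embedding behaviour of the
removed wrappers without reintroducing an extra class.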