feature:pdf embedding test

This commit is contained in:
chenketing 2023-05-11 20:28:15 +08:00
parent 0c241dfaad
commit ed855df01d
5 changed files with 11 additions and 51 deletions

View File

@ -0,0 +1,10 @@
from pilot.source_embedding.pdf_embedding import PDFEmbedding
# Smoke test for PDFEmbedding: build an embedding object for one PDF, run
# source_embedding() on it, and print "success" if no exception was raised.
# NOTE(review): "xxx.pdf" looks like a placeholder and model_name is a
# developer-local path -- adjust both before running.
path = "xxx.pdf"
model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"
vector_store_path = "/pilot/source_embedding/"
pdf_embedding = PDFEmbedding(
    file_path=path,
    model_name=model_name,
    vector_store_config={
        "vector_store_name": "ob",
        # Pass the configured directory itself; the previous version passed
        # the literal text "vector_store_path" instead of the variable.
        "vector_store_path": vector_store_path,
    },
)
pdf_embedding.source_embedding()
print("success")

View File

@ -1,17 +0,0 @@
from typing import List
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
from langchain.embeddings.base import Embeddings
class Text2Vectors(Embeddings):
    """Embeddings implementation backed by a HuggingFaceEmbeddings model.

    The underlying model is created on first use and then reused, instead of
    being rebuilt on every call.
    """

    # Path used when no model_name is given (kept from the original code).
    DEFAULT_MODEL_NAME = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"

    def __init__(self, model_name: str = DEFAULT_MODEL_NAME) -> None:
        """Remember which model to load; the model itself is loaded lazily.

        Args:
            model_name: Name or local path handed to HuggingFaceEmbeddings.
        """
        self.model_name = model_name
        self._model = None

    def _get_model(self) -> HuggingFaceEmbeddings:
        """Return the shared HuggingFaceEmbeddings instance, creating it once."""
        if self._model is None:
            self._model = HuggingFaceEmbeddings(model_name=self.model_name)
        return self._model

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed search docs.

        Args:
            texts: The documents to embed.

        Returns:
            One embedding vector per input text; an empty list for no input.
        """
        if not texts:
            return []
        return self._get_model().embed_documents(list(texts))

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string.

        Args:
            text: The query text.

        Returns:
            The embedding vector of ``text``.
        """
        # Wrap the query in a list: embed_documents expects a list of texts,
        # and passing a bare string would not embed the query as one text.
        return self._get_model().embed_documents([text])[0]

View File

@ -1,14 +0,0 @@
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from pilot import TextToVector
# Load one local PDF with UnstructuredFileLoader and split it into chunks
# using a CharacterTextSplitter with its default settings.
path = "/Users/chenketing/Downloads/OceanBase-数据库-V4.1.0-OceanBase-介绍.pdf"
text_splitor = CharacterTextSplitter()
loader = UnstructuredFileLoader(path)
docs = loader.load_and_split(text_splitter=text_splitor)
# NOTE: the chunks in `docs` are not embedded here; an earlier sketch assigned
# TextToVector.textToVector(doc["content"])[0] to doc["vector"].

View File

@ -1,7 +1,6 @@
from langchain.vectorstores import Milvus
from pymilvus import Collection,utility
from pymilvus import connections, DataType, FieldSchema, CollectionSchema
from pilot.source_embedding.Text2Vectors import Text2Vectors
# milvus = connections.connect(
# alias="default",
@ -43,7 +42,7 @@ milvus = connections.connect(
port="19530"
)
# Demo: store two sample strings in the Milvus collection "document" on
# 127.0.0.1:19530, using `text_embeddings` as the embedding function.
data = ["aaa", "bbb"]
text_embeddings = Text2Vectors()
# NOTE(review): this appears to be a diff view showing both the old assignment
# (above) and its commented-out replacement (below). If only the commented
# form remains in the file, `text_embeddings` is undefined at the Milvus(...)
# call -- confirm.
# text_embeddings = Text2Vectors()
# NOTE(review): text_field is an empty string here -- confirm this is intended.
mivuls = Milvus(collection_name='document', embedding_function= text_embeddings, connection_args={"host": "127.0.0.1", "port": "19530", "alias":"default"}, text_field="")
# NOTE(review): the value returned by from_texts is discarded -- presumably
# it returns the populated store; verify against the Milvus wrapper.
mivuls.from_texts(texts=data, embedding=text_embeddings)

View File

@ -1,18 +0,0 @@
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
class TextToVector:
    """Static helpers that turn text into vectors via HuggingFaceEmbeddings.

    Models are cached per model path, so repeated calls reuse the already
    constructed HuggingFaceEmbeddings object instead of rebuilding it.
    """

    # Path used when the caller does not pass model_name (the original value).
    DEFAULT_MODEL_NAME = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"

    # model_name -> HuggingFaceEmbeddings instance, filled on first use.
    _models = {}

    @staticmethod
    def _get_model(model_name):
        """Return the cached model for ``model_name``, creating it if needed."""
        cache = TextToVector._models
        if model_name not in cache:
            cache[model_name] = HuggingFaceEmbeddings(model_name=model_name)
        return cache[model_name]

    @staticmethod
    def textToVector(text, model_name=DEFAULT_MODEL_NAME):
        """Embed a single text.

        Args:
            text: The text to embed.
            model_name: Name or local path handed to HuggingFaceEmbeddings.

        Returns:
            A list holding exactly one embedding vector (the result of
            embedding ``[text]``), as in the original implementation.
        """
        return TextToVector.textlist_to_vector([text], model_name=model_name)

    @staticmethod
    def textlist_to_vector(textlist, model_name=DEFAULT_MODEL_NAME):
        """Embed a list of texts.

        Args:
            textlist: The texts to embed.
            model_name: Name or local path handed to HuggingFaceEmbeddings.

        Returns:
            The value of ``embed_documents(textlist)``: one vector per text.
        """
        return TextToVector._get_model(model_name).embed_documents(textlist)