feature: PDF embedding test

This commit is contained in:
chenketing 2023-05-11 20:28:15 +08:00
parent 0c241dfaad
commit ed855df01d
5 changed files with 11 additions and 51 deletions

View File

@ -0,0 +1,10 @@
from pilot.source_embedding.pdf_embedding import PDFEmbedding
# Manual smoke test: build a PDFEmbedding for one PDF and run its
# source_embedding() step, printing "success" if no exception is raised.
# NOTE(review): "xxx.pdf" looks like a placeholder — point it at a real file.
path = "xxx.pdf"
# Local model directory passed through to PDFEmbedding as model_name.
model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"
vector_store_path = "/pilot/source_embedding/"
pdf_embedding = PDFEmbedding(
    file_path=path,
    model_name=model_name,
    vector_store_config={
        "vector_store_name": "ob",
        # Pass the configured directory itself, not the literal text
        # "vector_store_path".
        "vector_store_path": vector_store_path,
    },
)
pdf_embedding.source_embedding()
print("success")

View File

@ -1,17 +0,0 @@
from typing import List
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
from langchain.embeddings.base import Embeddings
class Text2Vectors(Embeddings):
    """Embeddings implementation that delegates to a HuggingFaceEmbeddings model.

    The underlying model is created on first use and reused afterwards, so
    repeated calls do not construct a new HuggingFaceEmbeddings each time.
    """

    # Model location handed to HuggingFaceEmbeddings when none is supplied.
    DEFAULT_MODEL_NAME = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"

    def __init__(self, model_name: str = DEFAULT_MODEL_NAME) -> None:
        """Remember which model to use; the model itself is built lazily.

        Args:
            model_name: Value forwarded as ``model_name`` to
                HuggingFaceEmbeddings.
        """
        self._model_name = model_name
        self._hfemb = None

    def _get_model(self):
        """Return the shared HuggingFaceEmbeddings, creating it if needed."""
        if self._hfemb is None:
            self._hfemb = HuggingFaceEmbeddings(model_name=self._model_name)
        return self._hfemb

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed search docs.

        Args:
            texts: The documents to embed.

        Returns:
            One embedding vector per input text, in the same order.
        """
        return self._get_model().embed_documents(list(texts))

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string.

        Args:
            text: The query to embed.

        Returns:
            The embedding vector for ``text``.
        """
        # Wrap the query in a list: embed_documents takes a list of texts,
        # and a bare str would be iterated character by character.
        return self._get_model().embed_documents([text])[0]

View File

@ -1,14 +0,0 @@
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from pilot import TextToVector
# Load one local PDF with UnstructuredFileLoader and split it into documents
# using a default-configured CharacterTextSplitter.
path = "/Users/chenketing/Downloads/OceanBase-数据库-V4.1.0-OceanBase-介绍.pdf"
loader = UnstructuredFileLoader(file_path=path)
text_splitor = CharacterTextSplitter()
docs = loader.load_and_split(text_splitter=text_splitor)
# Per-document embedding is currently disabled:
# doc["vector"] = TextToVector.textToVector(doc["content"])[0]

View File

@ -1,7 +1,6 @@
from langchain.vectorstores import Milvus from langchain.vectorstores import Milvus
from pymilvus import Collection,utility from pymilvus import Collection,utility
from pymilvus import connections, DataType, FieldSchema, CollectionSchema from pymilvus import connections, DataType, FieldSchema, CollectionSchema
from pilot.source_embedding.Text2Vectors import Text2Vectors
# milvus = connections.connect( # milvus = connections.connect(
# alias="default", # alias="default",
@ -43,7 +42,7 @@ milvus = connections.connect(
port="19530" port="19530"
) )
data = ["aaa", "bbb"] data = ["aaa", "bbb"]
text_embeddings = Text2Vectors() # text_embeddings = Text2Vectors()
mivuls = Milvus(collection_name='document', embedding_function= text_embeddings, connection_args={"host": "127.0.0.1", "port": "19530", "alias":"default"}, text_field="") mivuls = Milvus(collection_name='document', embedding_function= text_embeddings, connection_args={"host": "127.0.0.1", "port": "19530", "alias":"default"}, text_field="")
mivuls.from_texts(texts=data, embedding=text_embeddings) mivuls.from_texts(texts=data, embedding=text_embeddings)

View File

@ -1,18 +0,0 @@
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
class TextToVector:
    """Static helpers that embed text through a HuggingFaceEmbeddings model.

    The model is built once, on first use, and shared by every later call
    instead of being constructed again inside each helper.
    """

    # Model location handed to HuggingFaceEmbeddings.
    MODEL_NAME = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"
    # Cached model instance; stays None until the first embedding request.
    _hfemb = None

    @classmethod
    def _get_model(cls):
        """Return the cached HuggingFaceEmbeddings, creating it if needed."""
        if cls._hfemb is None:
            cls._hfemb = HuggingFaceEmbeddings(model_name=cls.MODEL_NAME)
        return cls._hfemb

    @staticmethod
    def textToVector(text):
        """Embed one text.

        Args:
            text: The text to embed.

        Returns:
            The result of embedding ``[text]``: a list holding one vector.
        """
        return TextToVector.textlist_to_vector([text])

    @staticmethod
    def textlist_to_vector(textlist):
        """Embed several texts.

        Args:
            textlist: The texts to embed.

        Returns:
            Whatever ``HuggingFaceEmbeddings.embed_documents`` returns for
            ``textlist`` (one vector per text).
        """
        return TextToVector._get_model().embed_documents(textlist)