mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-07-23 04:12:13 +00:00
feature:pdf embedding test
This commit is contained in:
parent
0c241dfaad
commit
ed855df01d
10
examples/knowledge_embedding/pdf_embedding_test.py
Normal file
10
examples/knowledge_embedding/pdf_embedding_test.py
Normal file
@ -0,0 +1,10 @@
|
||||
from pilot.source_embedding.pdf_embedding import PDFEmbedding
|
||||
|
||||
# Manual smoke test for PDFEmbedding: build the embedder for one PDF, run
# source_embedding() on it, and print "success" if nothing raised.

# PDF to embed; placeholder value, replace with a real file before running.
path = "xxx.pdf"
# Local directory holding the embedding model checkpoint.
model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"
# Directory handed to the vector store through vector_store_config.
vector_store_path = "/pilot/source_embedding/"

pdf_embedding = PDFEmbedding(
    file_path=path,
    model_name=model_name,
    vector_store_config={
        "vector_store_name": "ob",
        # Pass the configured directory itself, not the variable's name as a
        # string; otherwise the setting above is never used.
        "vector_store_path": vector_store_path,
    },
)

pdf_embedding.source_embedding()

print("success")
|
@ -1,17 +0,0 @@
|
||||
from typing import List
|
||||
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
|
||||
import torch
|
||||
|
||||
|
||||
# Select the torch device name: "cuda" when torch reports CUDA as available,
# otherwise "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
|
||||
from langchain.embeddings.base import Embeddings
|
||||
|
||||
|
||||
|
||||
class Text2Vectors(Embeddings):
    """Embeddings implementation that delegates to a HuggingFaceEmbeddings model."""

    # Model checkpoint handed to HuggingFaceEmbeddings(model_name=...).
    MODEL_NAME = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"

    # Created on first use and then reused, so the model is not rebuilt on
    # every embed call.
    _model = None

    @classmethod
    def _get_model(cls):
        """Return the shared HuggingFaceEmbeddings instance, creating it on first use."""
        if cls._model is None:
            cls._model = HuggingFaceEmbeddings(model_name=cls.MODEL_NAME)
        return cls._model

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed search docs.

        Args:
            texts: The documents to embed.

        Returns:
            One embedding vector per input text, in the same order. An empty
            input yields an empty list without touching the model.
        """
        if not texts:
            return []
        return self._get_model().embed_documents(list(texts))

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string.

        Args:
            text: The query to embed.

        Returns:
            The embedding vector for ``text``.
        """
        # Wrap the query in a list: embed_documents expects a list of texts,
        # and a bare string would be treated as a sequence of characters.
        return self._get_model().embed_documents([text])[0]
|
@ -1,14 +0,0 @@
|
||||
from langchain.document_loaders import UnstructuredFileLoader
|
||||
from langchain.text_splitter import CharacterTextSplitter
|
||||
|
||||
from pilot import TextToVector
|
||||
|
||||
# PDF that this script loads and splits into chunks.
path = "/Users/chenketing/Downloads/OceanBase-数据库-V4.1.0-OceanBase-介绍.pdf"

# Build the splitter (default settings) and the loader for the file, then let
# the loader read the document and split it with that splitter.
text_splitor = CharacterTextSplitter()
loader = UnstructuredFileLoader(path)
docs = loader.load_and_split(text_splitor)

# Disabled leftover: per-document vectorization is not performed here.
# doc["vector"] = TextToVector.textToVector(doc["content"])[0]
|
@ -1,7 +1,6 @@
|
||||
from langchain.vectorstores import Milvus
|
||||
from pymilvus import Collection,utility
|
||||
from pymilvus import connections, DataType, FieldSchema, CollectionSchema
|
||||
from pilot.source_embedding.Text2Vectors import Text2Vectors
|
||||
|
||||
# milvus = connections.connect(
|
||||
# alias="default",
|
||||
@ -43,7 +42,7 @@ milvus = connections.connect(
|
||||
port="19530"
|
||||
)
|
||||
# Sample texts to push into the vector store.
data = ["aaa", "bbb"]

# Embedding implementation used both by the store and by the insert call.
text_embeddings = Text2Vectors()

# Vector store bound to the 'document' collection on the local Milvus server.
mivuls = Milvus(
    collection_name='document',
    embedding_function=text_embeddings,
    connection_args=dict(host="127.0.0.1", port="19530", alias="default"),
    text_field="",
)

# NOTE(review): from_texts is called on the instance and its return value is
# discarded; confirm whether an instance-level insert (e.g. add_texts) was
# intended instead.
mivuls.from_texts(texts=data, embedding=text_embeddings)
|
||||
|
@ -1,18 +0,0 @@
|
||||
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
|
||||
import torch
|
||||
|
||||
|
||||
# Default to the CPU and switch to CUDA only when torch reports it as available.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
|
||||
|
||||
|
||||
class TextToVector:
    """Static helpers that turn text into embedding vectors via HuggingFaceEmbeddings."""

    # Model checkpoint handed to HuggingFaceEmbeddings(model_name=...); kept in
    # one place so both helpers always use the same model.
    MODEL_NAME = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"

    # Created on first use and then reused, so the model is not rebuilt on
    # every call.
    _model = None

    @classmethod
    def _get_model(cls):
        """Return the shared HuggingFaceEmbeddings instance, creating it on first use."""
        if cls._model is None:
            cls._model = HuggingFaceEmbeddings(model_name=cls.MODEL_NAME)
        return cls._model

    @staticmethod
    def textToVector(text):
        """Embed a single text.

        Args:
            text: The text to embed.

        Returns:
            The result of embedding ``[text]``: a list holding one vector.
        """
        return TextToVector.textlist_to_vector([text])

    @staticmethod
    def textlist_to_vector(textlist):
        """Embed a list of texts.

        Args:
            textlist: The texts to embed.

        Returns:
            Whatever ``HuggingFaceEmbeddings.embed_documents`` returns for
            ``textlist`` (one vector per text).
        """
        return TextToVector._get_model().embed_documents(textlist)
|
Loading…
Reference in New Issue
Block a user