feature: URL and CSV embedding

This commit is contained in:
chenketing 2023-05-11 23:48:56 +08:00
parent ed855df01d
commit d42a9f3bd1
7 changed files with 86 additions and 108 deletions

View File

@ -0,0 +1,13 @@
from pilot.source_embedding.csv_embedding import CSVEmbedding
# Manual smoke test: embed a local CSV file and write it to the vector store.
path = "/Users/chenketing/Downloads/vectors.csv"
model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"
vector_store_path = "/pilot/source_embedding/"
csv_embedding = CSVEmbedding(
    file_path=path,
    model_name=model_name,
    vector_store_config={
        # NOTE(review): "url" looks copied from a URL example; confirm the
        # intended store name for CSV data.
        "vector_store_name": "url",
        # Pass the configured directory, not the literal text
        # "vector_store_path" (the variable above was otherwise unused).
        "vector_store_path": vector_store_path,
    },
)
csv_embedding.source_embedding()
print("success")

View File

@ -0,0 +1,10 @@
from pilot.source_embedding.url_embedding import URLEmbedding
# Manual smoke test: embed a web page and write it to the vector store.
path = "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023"
model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"
vector_store_path = "/pilot/source_embedding/"
url_embedding = URLEmbedding(
    file_path=path,
    model_name=model_name,
    vector_store_config={
        "vector_store_name": "url",
        # Pass the configured directory, not the literal text
        # "vector_store_path" (the variable above was otherwise unused).
        "vector_store_path": vector_store_path,
    },
)
url_embedding.source_embedding()
print("success")

View File

@ -1,11 +1,7 @@
from pilot.source_embedding import (SourceEmbedding, register) from pilot.source_embedding import (SourceEmbedding, register)
from pilot.source_embedding import TextToVector
from pilot.source_embedding import Text2Vectors
__all__ = [ __all__ = [
"SourceEmbedding", "SourceEmbedding",
"TextToVector",
"Text2Vectors",
"register" "register"
] ]

View File

@ -1,12 +1,8 @@
from pilot.source_embedding.source_embedding import SourceEmbedding from pilot.source_embedding.source_embedding import SourceEmbedding
from pilot.source_embedding.source_embedding import register from pilot.source_embedding.source_embedding import register
from pilot.source_embedding.text_to_vector import TextToVector
from pilot.source_embedding.Text2Vectors import Text2Vectors
__all__ = [ __all__ = [
"SourceEmbedding", "SourceEmbedding",
"TextToVector",
"Text2Vectors",
"register" "register"
] ]

View File

@ -0,0 +1,33 @@
from typing import List, Optional, Dict
from pilot.source_embedding import SourceEmbedding, register
from langchain.document_loaders import CSVLoader
from langchain.schema import Document
class CSVEmbedding(SourceEmbedding):
    """Source embedding that reads its documents from a CSV file."""

    def __init__(
        self,
        file_path,
        model_name,
        vector_store_config,
        embedding_args: Optional[Dict] = None,
    ):
        """Initialize with the CSV path and the embedding configuration.

        Args:
            file_path: Path of the CSV file handed to ``CSVLoader``.
            model_name: Embedding model name or path, stored as-is.
            vector_store_config: Vector store settings, stored as-is.
            embedding_args: Optional extra embedding arguments, stored as-is.
        """
        # Let the base class store model_name, vector_store_config and
        # embedding_args instead of duplicating its assignments here.
        super().__init__(file_path, model_name, vector_store_config, embedding_args)
        self.file_path = file_path

    @register
    def read(self) -> List[Document]:
        """Load the documents from the CSV file at ``self.file_path``."""
        loader = CSVLoader(file_path=self.file_path)
        return loader.load()

    @register
    def data_process(self, documents: List[Document]) -> List[Document]:
        """Remove newline characters from each document's content.

        The documents are modified in place and the same list is returned.
        """
        for document in documents:
            document.page_content = document.page_content.replace("\n", "")
        return documents

View File

@ -6,8 +6,7 @@ from abc import ABC, abstractmethod
from langchain.embeddings import HuggingFaceEmbeddings from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma from langchain.vectorstores import Chroma
from typing import List from typing import List, Optional, Dict
registered_methods = [] registered_methods = []
@ -23,11 +22,12 @@ class SourceEmbedding(ABC):
Implementations should implement the method Implementations should implement the method
""" """
def __init__(self, yuque_path, model_name, vector_store_config): def __init__(self, yuque_path, model_name, vector_store_config, embedding_args: Optional[Dict] = None):
"""Initialize with YuqueLoader url, model_name, vector_store_config""" """Initialize with YuqueLoader url, model_name, vector_store_config"""
self.yuque_path = yuque_path self.yuque_path = yuque_path
self.model_name = model_name self.model_name = model_name
self.vector_store_config = vector_store_config self.vector_store_config = vector_store_config
self.embedding_args = embedding_args
@abstractmethod @abstractmethod
@register @register

View File

@ -1,108 +1,38 @@
from random import random from typing import List
from pilot.source_embedding import SourceEmbedding, register
from langchain.embeddings.openai import OpenAIEmbeddings from bs4 import BeautifulSoup
from langchain.vectorstores import Milvus
from langchain.document_loaders import WebBaseLoader from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter from langchain.schema import Document
from pymilvus import connections, DataType, FieldSchema, CollectionSchema
from pymilvus import Collection
from pilot.source_embedding.text_to_vector import TextToVector class URLEmbedding(SourceEmbedding):
"""url embedding for read url document."""
def __init__(self, file_path, model_name, vector_store_config):
"""Initialize with url path."""
self.file_path = file_path
self.model_name = model_name
self.vector_store_config = vector_store_config
loader = WebBaseLoader([ @register
"https://milvus.io/docs/overview.md", def read(self):
]) """Load from url path."""
loader = WebBaseLoader(web_path=self.file_path)
return loader.load()
docs = loader.load() @register
def data_process(self, documents: List[Document]):
# Split the documents into smaller chunks i = 0
# text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0) for d in documents:
# docs = text_splitter.split_documents(docs) content = d.page_content.replace("\n", "")
soup = BeautifulSoup(content, 'html.parser')
embeddings = TextToVector.textToVector(docs[0].page_content) for tag in soup(['!doctype', 'meta']):
tag.extract()
milvus = connections.connect( documents[i].page_content = soup.get_text()
alias="default", i += 1
host='localhost', return documents
port="19530"
)
# collection = Collection("test_book")
# data = [{"doc_id": 11011, "content": 11011, "title": 11011, "vector": embeddings[0]}]
# # collection = Collection("document")
#
# # collection.insert(data=data)
# entities = [
# {
# 'doc_id': d['doc_id'],
# 'vector': d['vector'],
# 'content': d['content'],
# 'title': d['titlseae'],
# "type": DataType.FLOAT_VECTOR
# } for d in data
# ]
#
# milvus.insert(collection_name="document", entities=entities)
# print("success")
# 定义集合的字段
# fields = [
# FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR),
# FieldSchema(name="age", dtype=DataType.INT32),
# FieldSchema(name="gender", dtype=DataType.STRING),
# FieldSchema(name="id", dtype=DataType.INT64) # 添加主键字段
# ]
# book_id = FieldSchema(
# name="book_id",
# dtype=DataType.INT64,
# is_primary=True,
# )
# book_name = FieldSchema(
# name="book_name",
# dtype=DataType.BINARY_VECTOR,
# max_length=200,
# )
# word_count = FieldSchema(
# name="word_count",
# dtype=DataType.INT64,
# )
# book_intro = FieldSchema(
# name="book_intro",
# dtype=DataType.FLOAT_VECTOR,
# dim=2
# )
# schema = CollectionSchema(
# fields=[book_id, book_name, word_count, book_intro],
# description="Test book search"
# )
collection_name = "test_book"
collection = Collection(
name=collection_name,
schema=schema,
using='default',
shards_num=2
)
# 插入数据
# entities = [[
# {"book_id": 30, "book_intro": [0.1, 0.2], "word_count": 1},
# {"book_id": 25, "book_intro": [0.1, 0.2], "word_count": 2},
# {"book_id": 40, "book_intro": [0.1, 0.2], "word_count": 3}
# ]]
entities = [[30, 25, 40], ["test1", "test2", "test3"], [1, 2, 3], [[0.1, 0.2], [0.1, 0.2], [0.1, 0.2]]]
collection.insert(entities)
print("success")
# vector_store = Milvus.from_documents(
# docs,
# embedding=embeddings,
# connection_args={"host": "127.0.0.1", "port": "19530", "alias": "default"}
# )