mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-07-25 13:06:53 +00:00
feature:url,csv embedding
This commit is contained in:
parent
ed855df01d
commit
d42a9f3bd1
13
examples/knowledge_embedding/csv_embedding_test.py
Normal file
13
examples/knowledge_embedding/csv_embedding_test.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
|
||||||
|
|
||||||
|
# Example script: embed a local CSV file into a vector store via CSVEmbedding.
from pilot.source_embedding.csv_embedding import CSVEmbedding

# Machine-specific locations; adjust them to your environment before running.
path = "/Users/chenketing/Downloads/vectors.csv"
model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"
vector_store_path = "/pilot/source_embedding/"

# Pass the vector_store_path variable itself. The previous version passed the
# literal string "vector_store_path", so the directory configured above was
# never handed to the embedding.
csv_embedding = CSVEmbedding(
    file_path=path,
    model_name=model_name,
    vector_store_config={
        "vector_store_name": "url",
        "vector_store_path": vector_store_path,
    },
)
csv_embedding.source_embedding()
print("success")
|
10
examples/knowledge_embedding/url_embedding_test.py
Normal file
10
examples/knowledge_embedding/url_embedding_test.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
# Example script: embed the text of a web page into a vector store via URLEmbedding.
from pilot.source_embedding.url_embedding import URLEmbedding

# Page to embed plus machine-specific locations; adjust before running.
path = "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023"
model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"
vector_store_path = "/pilot/source_embedding/"

# Pass the vector_store_path variable itself. The previous version passed the
# literal string "vector_store_path", so the directory configured above was
# never handed to the embedding.
url_embedding = URLEmbedding(
    file_path=path,
    model_name=model_name,
    vector_store_config={
        "vector_store_name": "url",
        "vector_store_path": vector_store_path,
    },
)
url_embedding.source_embedding()
print("success")
|
@ -1,11 +1,7 @@
|
|||||||
from pilot.source_embedding import (SourceEmbedding, register)
|
from pilot.source_embedding import (SourceEmbedding, register)
|
||||||
from pilot.source_embedding import TextToVector
|
|
||||||
from pilot.source_embedding import Text2Vectors
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"SourceEmbedding",
|
"SourceEmbedding",
|
||||||
"TextToVector",
|
|
||||||
"Text2Vectors",
|
|
||||||
"register"
|
"register"
|
||||||
]
|
]
|
@ -1,12 +1,8 @@
|
|||||||
from pilot.source_embedding.source_embedding import SourceEmbedding
|
from pilot.source_embedding.source_embedding import SourceEmbedding
|
||||||
from pilot.source_embedding.source_embedding import register
|
from pilot.source_embedding.source_embedding import register
|
||||||
from pilot.source_embedding.text_to_vector import TextToVector
|
|
||||||
from pilot.source_embedding.Text2Vectors import Text2Vectors
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"SourceEmbedding",
|
"SourceEmbedding",
|
||||||
"TextToVector",
|
|
||||||
"Text2Vectors",
|
|
||||||
"register"
|
"register"
|
||||||
]
|
]
|
33
pilot/source_embedding/csv_embedding.py
Normal file
33
pilot/source_embedding/csv_embedding.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
from typing import List, Optional, Dict
|
||||||
|
from pilot.source_embedding import SourceEmbedding, register
|
||||||
|
|
||||||
|
from langchain.document_loaders import CSVLoader
|
||||||
|
from langchain.schema import Document
|
||||||
|
|
||||||
|
|
||||||
|
class CSVEmbedding(SourceEmbedding):
    """Source embedding that reads its documents from a CSV file."""

    def __init__(self, file_path, model_name, vector_store_config, embedding_args: Optional[Dict] = None):
        """Store the CSV path and the embedding configuration.

        Args:
            file_path: Path of the CSV file passed to ``CSVLoader`` in ``read``.
            model_name: Embedding model name or path; stored as given.
            vector_store_config: Vector store settings; stored as given.
            embedding_args: Optional extra arguments; stored as given.
        """
        # NOTE(review): SourceEmbedding.__init__ is not invoked here, so any
        # attribute it alone sets is absent on this instance -- confirm intended.
        self.file_path = file_path
        self.model_name = model_name
        self.vector_store_config = vector_store_config
        self.embedding_args = embedding_args

    @register
    def read(self):
        """Load the CSV at ``self.file_path`` and return the loader's documents."""
        loader = CSVLoader(file_path=self.file_path)
        return loader.load()

    @register
    def data_process(self, documents: List[Document]):
        """Remove newline characters from each document's page content.

        The documents are modified in place and the same list is returned.
        """
        # Each item is the very object stored in the list, so assigning through
        # the loop variable is enough; no separate index counter is needed.
        for document in documents:
            document.page_content = document.page_content.replace("\n", "")
        return documents
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -6,8 +6,7 @@ from abc import ABC, abstractmethod
|
|||||||
from langchain.embeddings import HuggingFaceEmbeddings
|
from langchain.embeddings import HuggingFaceEmbeddings
|
||||||
from langchain.vectorstores import Chroma
|
from langchain.vectorstores import Chroma
|
||||||
|
|
||||||
from typing import List
|
from typing import List, Optional, Dict
|
||||||
|
|
||||||
|
|
||||||
registered_methods = []
|
registered_methods = []
|
||||||
|
|
||||||
@ -23,11 +22,12 @@ class SourceEmbedding(ABC):
|
|||||||
Implementations should implement the method
|
Implementations should implement the method
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, yuque_path, model_name, vector_store_config):
|
def __init__(self, yuque_path, model_name, vector_store_config, embedding_args: Optional[Dict] = None):
|
||||||
"""Initialize with YuqueLoader url, model_name, vector_store_config"""
|
"""Initialize with YuqueLoader url, model_name, vector_store_config"""
|
||||||
self.yuque_path = yuque_path
|
self.yuque_path = yuque_path
|
||||||
self.model_name = model_name
|
self.model_name = model_name
|
||||||
self.vector_store_config = vector_store_config
|
self.vector_store_config = vector_store_config
|
||||||
|
self.embedding_args = embedding_args
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
@register
|
@register
|
||||||
|
@ -1,108 +1,38 @@
|
|||||||
from random import random
|
from typing import List
|
||||||
|
from pilot.source_embedding import SourceEmbedding, register
|
||||||
|
|
||||||
from langchain.embeddings.openai import OpenAIEmbeddings
|
from bs4 import BeautifulSoup
|
||||||
from langchain.vectorstores import Milvus
|
|
||||||
from langchain.document_loaders import WebBaseLoader
|
from langchain.document_loaders import WebBaseLoader
|
||||||
from langchain.text_splitter import CharacterTextSplitter
|
from langchain.schema import Document
|
||||||
from pymilvus import connections, DataType, FieldSchema, CollectionSchema
|
|
||||||
from pymilvus import Collection
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
from pilot.source_embedding.text_to_vector import TextToVector
|
class URLEmbedding(SourceEmbedding):
|
||||||
|
"""url embedding for read url document."""
|
||||||
|
|
||||||
|
def __init__(self, file_path, model_name, vector_store_config):
|
||||||
|
"""Initialize with url path."""
|
||||||
|
self.file_path = file_path
|
||||||
|
self.model_name = model_name
|
||||||
|
self.vector_store_config = vector_store_config
|
||||||
|
|
||||||
loader = WebBaseLoader([
|
@register
|
||||||
"https://milvus.io/docs/overview.md",
|
def read(self):
|
||||||
])
|
"""Load from url path."""
|
||||||
|
loader = WebBaseLoader(web_path=self.file_path)
|
||||||
|
return loader.load()
|
||||||
|
|
||||||
docs = loader.load()
|
@register
|
||||||
|
def data_process(self, documents: List[Document]):
|
||||||
# Split the documents into smaller chunks
|
i = 0
|
||||||
# text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
|
for d in documents:
|
||||||
# docs = text_splitter.split_documents(docs)
|
content = d.page_content.replace("\n", "")
|
||||||
|
soup = BeautifulSoup(content, 'html.parser')
|
||||||
embeddings = TextToVector.textToVector(docs[0].page_content)
|
for tag in soup(['!doctype', 'meta']):
|
||||||
|
tag.extract()
|
||||||
milvus = connections.connect(
|
documents[i].page_content = soup.get_text()
|
||||||
alias="default",
|
i += 1
|
||||||
host='localhost',
|
return documents
|
||||||
port="19530"
|
|
||||||
)
|
|
||||||
|
|
||||||
# collection = Collection("test_book")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# data = [{"doc_id": 11011, "content": 11011, "title": 11011, "vector": embeddings[0]}]
|
|
||||||
# # collection = Collection("document")
|
|
||||||
#
|
|
||||||
# # collection.insert(data=data)
|
|
||||||
# entities = [
|
|
||||||
# {
|
|
||||||
# 'doc_id': d['doc_id'],
|
|
||||||
# 'vector': d['vector'],
|
|
||||||
# 'content': d['content'],
|
|
||||||
# 'title': d['titlseae'],
|
|
||||||
# "type": DataType.FLOAT_VECTOR
|
|
||||||
# } for d in data
|
|
||||||
# ]
|
|
||||||
#
|
|
||||||
# milvus.insert(collection_name="document", entities=entities)
|
|
||||||
# print("success")
|
|
||||||
# 定义集合的字段
|
|
||||||
# fields = [
|
|
||||||
# FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR),
|
|
||||||
# FieldSchema(name="age", dtype=DataType.INT32),
|
|
||||||
# FieldSchema(name="gender", dtype=DataType.STRING),
|
|
||||||
# FieldSchema(name="id", dtype=DataType.INT64) # 添加主键字段
|
|
||||||
# ]
|
|
||||||
|
|
||||||
# book_id = FieldSchema(
|
|
||||||
# name="book_id",
|
|
||||||
# dtype=DataType.INT64,
|
|
||||||
# is_primary=True,
|
|
||||||
# )
|
|
||||||
# book_name = FieldSchema(
|
|
||||||
# name="book_name",
|
|
||||||
# dtype=DataType.BINARY_VECTOR,
|
|
||||||
# max_length=200,
|
|
||||||
# )
|
|
||||||
# word_count = FieldSchema(
|
|
||||||
# name="word_count",
|
|
||||||
# dtype=DataType.INT64,
|
|
||||||
# )
|
|
||||||
# book_intro = FieldSchema(
|
|
||||||
# name="book_intro",
|
|
||||||
# dtype=DataType.FLOAT_VECTOR,
|
|
||||||
# dim=2
|
|
||||||
# )
|
|
||||||
# schema = CollectionSchema(
|
|
||||||
# fields=[book_id, book_name, word_count, book_intro],
|
|
||||||
# description="Test book search"
|
|
||||||
# )
|
|
||||||
collection_name = "test_book"
|
|
||||||
|
|
||||||
collection = Collection(
|
|
||||||
name=collection_name,
|
|
||||||
schema=schema,
|
|
||||||
using='default',
|
|
||||||
shards_num=2
|
|
||||||
)
|
|
||||||
# 插入数据
|
|
||||||
# entities = [[
|
|
||||||
# {"book_id": 30, "book_intro": [0.1, 0.2], "word_count": 1},
|
|
||||||
# {"book_id": 25, "book_intro": [0.1, 0.2], "word_count": 2},
|
|
||||||
# {"book_id": 40, "book_intro": [0.1, 0.2], "word_count": 3}
|
|
||||||
# ]]
|
|
||||||
|
|
||||||
entities = [[30, 25, 40], ["test1", "test2", "test3"], [1, 2, 3], [[0.1, 0.2], [0.1, 0.2], [0.1, 0.2]]]
|
|
||||||
|
|
||||||
collection.insert(entities)
|
|
||||||
print("success")
|
|
||||||
|
|
||||||
# vector_store = Milvus.from_documents(
|
|
||||||
# docs,
|
|
||||||
# embedding=embeddings,
|
|
||||||
# connection_args={"host": "127.0.0.1", "port": "19530", "alias": "default"}
|
|
||||||
# )
|
|
Loading…
Reference in New Issue
Block a user