From d42a9f3bd147745e285338f5637b928f18f1eff9 Mon Sep 17 00:00:00 2001 From: chenketing Date: Thu, 11 May 2023 23:48:56 +0800 Subject: [PATCH] feature:url,csv embedding --- .../knowledge_embedding/csv_embedding_test.py | 13 ++ .../knowledge_embedding/url_embedding_test.py | 10 ++ pilot/__init__.py | 4 - pilot/source_embedding/__init__.py | 4 - pilot/source_embedding/csv_embedding.py | 33 +++++ pilot/source_embedding/source_embedding.py | 6 +- pilot/source_embedding/url_embedding.py | 124 ++++-------------- 7 files changed, 86 insertions(+), 108 deletions(-) create mode 100644 examples/knowledge_embedding/csv_embedding_test.py create mode 100644 examples/knowledge_embedding/url_embedding_test.py create mode 100644 pilot/source_embedding/csv_embedding.py diff --git a/examples/knowledge_embedding/csv_embedding_test.py b/examples/knowledge_embedding/csv_embedding_test.py new file mode 100644 index 000000000..76d95229e --- /dev/null +++ b/examples/knowledge_embedding/csv_embedding_test.py @@ -0,0 +1,13 @@ + + +from pilot.source_embedding.csv_embedding import CSVEmbedding + +# path = "/Users/chenketing/Downloads/share_ireserve双写数据异常2.xlsx" +path = "/Users/chenketing/Downloads/vectors.csv" +model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2" +vector_store_path = "/pilot/source_embedding/" + + +pdf_embedding = CSVEmbedding(file_path=path, model_name=model_name, vector_store_config={"vector_store_name": "url", "vector_store_path": "vector_store_path"}) +pdf_embedding.source_embedding() +print("success") \ No newline at end of file diff --git a/examples/knowledge_embedding/url_embedding_test.py b/examples/knowledge_embedding/url_embedding_test.py new file mode 100644 index 000000000..fea12d45f --- /dev/null +++ b/examples/knowledge_embedding/url_embedding_test.py @@ -0,0 +1,10 @@ +from pilot.source_embedding.url_embedding import URLEmbedding + +path = "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023" +model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2" +vector_store_path = "/pilot/source_embedding/" + + +pdf_embedding = URLEmbedding(file_path=path, model_name=model_name, vector_store_config={"vector_store_name": "url", "vector_store_path": "vector_store_path"}) +pdf_embedding.source_embedding() +print("success") \ No newline at end of file diff --git a/pilot/__init__.py b/pilot/__init__.py index f75747d5c..a1531040e 100644 --- a/pilot/__init__.py +++ b/pilot/__init__.py @@ -1,11 +1,7 @@ from pilot.source_embedding import (SourceEmbedding, register) -from pilot.source_embedding import TextToVector -from pilot.source_embedding import Text2Vectors __all__ = [ "SourceEmbedding", - "TextToVector", - "Text2Vectors", "register" ] \ No newline at end of file diff --git a/pilot/source_embedding/__init__.py b/pilot/source_embedding/__init__.py index a44cea0a5..9d1e74a31 100644 --- a/pilot/source_embedding/__init__.py +++ b/pilot/source_embedding/__init__.py @@ -1,12 +1,8 @@ from pilot.source_embedding.source_embedding import SourceEmbedding from pilot.source_embedding.source_embedding import register -from pilot.source_embedding.text_to_vector import TextToVector -from pilot.source_embedding.Text2Vectors import Text2Vectors __all__ = [ "SourceEmbedding", - "TextToVector", - "Text2Vectors", "register" ] \ No newline at end of file diff --git a/pilot/source_embedding/csv_embedding.py b/pilot/source_embedding/csv_embedding.py new file mode 100644 index 000000000..db73ae7e5 --- /dev/null +++ b/pilot/source_embedding/csv_embedding.py @@ -0,0 +1,33 @@ +from typing import List, Optional, Dict +from pilot.source_embedding import SourceEmbedding, register + +from langchain.document_loaders import CSVLoader +from langchain.schema import Document + + +class CSVEmbedding(SourceEmbedding): + """csv embedding for read csv document.""" + + def __init__(self, file_path, model_name, vector_store_config, embedding_args: Optional[Dict] = None): + """Initialize with csv path.""" + self.file_path = file_path + self.model_name = model_name + self.vector_store_config = vector_store_config + self.embedding_args = embedding_args + + @register + def read(self): + """Load from csv path.""" + loader = CSVLoader(file_path=self.file_path) + return loader.load() + + @register + def data_process(self, documents: List[Document]): + i = 0 + for d in documents: + documents[i].page_content = d.page_content.replace("\n", "") + i += 1 + return documents + + + diff --git a/pilot/source_embedding/source_embedding.py b/pilot/source_embedding/source_embedding.py index ec66e302c..82d91c0ba 100644 --- a/pilot/source_embedding/source_embedding.py +++ b/pilot/source_embedding/source_embedding.py @@ -6,8 +6,7 @@ from abc import ABC, abstractmethod from langchain.embeddings import HuggingFaceEmbeddings from langchain.vectorstores import Chroma -from typing import List - +from typing import List, Optional, Dict registered_methods = [] @@ -23,11 +22,12 @@ class SourceEmbedding(ABC): Implementations should implement the method """ - def __init__(self, yuque_path, model_name, vector_store_config): + def __init__(self, yuque_path, model_name, vector_store_config, embedding_args: Optional[Dict] = None): """Initialize with YuqueLoader url, model_name, vector_store_config""" self.yuque_path = yuque_path self.model_name = model_name self.vector_store_config = vector_store_config + self.embedding_args = embedding_args @abstractmethod @register diff --git a/pilot/source_embedding/url_embedding.py b/pilot/source_embedding/url_embedding.py index cca2f6bfe..5fa29e0d2 100644 --- a/pilot/source_embedding/url_embedding.py +++ b/pilot/source_embedding/url_embedding.py @@ -1,108 +1,38 @@ -from random import random +from typing import List +from pilot.source_embedding import SourceEmbedding, register -from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.vectorstores import Milvus +from bs4 import BeautifulSoup from langchain.document_loaders import WebBaseLoader -from langchain.text_splitter import CharacterTextSplitter -from pymilvus import connections, DataType, FieldSchema, CollectionSchema -from pymilvus import Collection +from langchain.schema import Document -from pilot.source_embedding.text_to_vector import TextToVector +class URLEmbedding(SourceEmbedding): + """url embedding for read url document.""" + def __init__(self, file_path, model_name, vector_store_config): + """Initialize with url path.""" + self.file_path = file_path + self.model_name = model_name + self.vector_store_config = vector_store_config -loader = WebBaseLoader([ - "https://milvus.io/docs/overview.md", -]) + @register + def read(self): + """Load from url path.""" + loader = WebBaseLoader(web_path=self.file_path) + return loader.load() -docs = loader.load() - -# Split the documents into smaller chunks -# text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0) -# docs = text_splitter.split_documents(docs) - -embeddings = TextToVector.textToVector(docs[0].page_content) - -milvus = connections.connect( - alias="default", - host='localhost', - port="19530" -) - -# collection = Collection("test_book") + @register + def data_process(self, documents: List[Document]): + i = 0 + for d in documents: + content = d.page_content.replace("\n", "") + soup = BeautifulSoup(content, 'html.parser') + for tag in soup(['!doctype', 'meta']): + tag.extract() + documents[i].page_content = soup.get_text() + i += 1 + return documents -# data = [{"doc_id": 11011, "content": 11011, "title": 11011, "vector": embeddings[0]}] -# # collection = Collection("document") -# -# # collection.insert(data=data) -# entities = [ -# { -# 'doc_id': d['doc_id'], -# 'vector': d['vector'], -# 'content': d['content'], -# 'title': d['titlseae'], -# "type": DataType.FLOAT_VECTOR -# } for d in data -# ] -# -# milvus.insert(collection_name="document", entities=entities) -# print("success") -# 定义集合的字段 -# fields = [ -# FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR), -# FieldSchema(name="age", dtype=DataType.INT32), -# FieldSchema(name="gender", dtype=DataType.STRING), -# FieldSchema(name="id", dtype=DataType.INT64) # 添加主键字段 -# ] - -# book_id = FieldSchema( -# name="book_id", -# dtype=DataType.INT64, -# is_primary=True, -# ) -# book_name = FieldSchema( -# name="book_name", -# dtype=DataType.BINARY_VECTOR, -# max_length=200, -# ) -# word_count = FieldSchema( -# name="word_count", -# dtype=DataType.INT64, -# ) -# book_intro = FieldSchema( -# name="book_intro", -# dtype=DataType.FLOAT_VECTOR, -# dim=2 -# ) -# schema = CollectionSchema( -# fields=[book_id, book_name, word_count, book_intro], -# description="Test book search" -# ) -collection_name = "test_book" - -collection = Collection( - name=collection_name, - schema=schema, - using='default', - shards_num=2 - ) -# 插入数据 -# entities = [[ -# {"book_id": 30, "book_intro": [0.1, 0.2], "word_count": 1}, -# {"book_id": 25, "book_intro": [0.1, 0.2], "word_count": 2}, -# {"book_id": 40, "book_intro": [0.1, 0.2], "word_count": 3} -# ]] - -entities = [[30, 25, 40], ["test1", "test2", "test3"], [1, 2, 3], [[0.1, 0.2], [0.1, 0.2], [0.1, 0.2]]] - -collection.insert(entities) -print("success") - -# vector_store = Milvus.from_documents( -# docs, -# embedding=embeddings, -# connection_args={"host": "127.0.0.1", "port": "19530", "alias": "default"} -# ) \ No newline at end of file