From d42a9f3bd147745e285338f5637b928f18f1eff9 Mon Sep 17 00:00:00 2001
From: chenketing <chenketing.ckt@antgroup.com>
Date: Thu, 11 May 2023 23:48:56 +0800
Subject: [PATCH] feature:url,csv embedding

---
 .../knowledge_embedding/csv_embedding_test.py |  13 ++
 .../knowledge_embedding/url_embedding_test.py |  10 ++
 pilot/__init__.py                             |   4 -
 pilot/source_embedding/__init__.py            |   4 -
 pilot/source_embedding/csv_embedding.py       |  33 +++++
 pilot/source_embedding/source_embedding.py    |   6 +-
 pilot/source_embedding/url_embedding.py       | 124 ++++--------------
 7 files changed, 86 insertions(+), 108 deletions(-)
 create mode 100644 examples/knowledge_embedding/csv_embedding_test.py
 create mode 100644 examples/knowledge_embedding/url_embedding_test.py
 create mode 100644 pilot/source_embedding/csv_embedding.py

diff --git a/examples/knowledge_embedding/csv_embedding_test.py b/examples/knowledge_embedding/csv_embedding_test.py
new file mode 100644
index 000000000..76d95229e
--- /dev/null
+++ b/examples/knowledge_embedding/csv_embedding_test.py
@@ -0,0 +1,13 @@
+
+
+from pilot.source_embedding.csv_embedding import CSVEmbedding
+
+# Example inputs for a local run; adjust these paths to your own environment.
+path = "/Users/chenketing/Downloads/vectors.csv"
+model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"
+vector_store_path = "/pilot/source_embedding/"
+# The config must carry the path value above, not the variable's name as a string.
+vector_store_config = {"vector_store_name": "url", "vector_store_path": vector_store_path}
+csv_embedding = CSVEmbedding(file_path=path, model_name=model_name, vector_store_config=vector_store_config)
+csv_embedding.source_embedding()
+print("success")
\ No newline at end of file
diff --git a/examples/knowledge_embedding/url_embedding_test.py b/examples/knowledge_embedding/url_embedding_test.py
new file mode 100644
index 000000000..fea12d45f
--- /dev/null
+++ b/examples/knowledge_embedding/url_embedding_test.py
@@ -0,0 +1,10 @@
+from pilot.source_embedding.url_embedding import URLEmbedding
+
+path = "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023"
+model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"
+vector_store_path = "/pilot/source_embedding/"
+# The config must carry the path value above, not the variable's name as a string.
+vector_store_config = {"vector_store_name": "url", "vector_store_path": vector_store_path}
+url_embedding = URLEmbedding(file_path=path, model_name=model_name, vector_store_config=vector_store_config)
+url_embedding.source_embedding()
+print("success")
\ No newline at end of file
diff --git a/pilot/__init__.py b/pilot/__init__.py
index f75747d5c..a1531040e 100644
--- a/pilot/__init__.py
+++ b/pilot/__init__.py
@@ -1,11 +1,7 @@
 from pilot.source_embedding import (SourceEmbedding, register)
-from pilot.source_embedding import TextToVector
-from pilot.source_embedding import Text2Vectors
 
 
 __all__ = [
     "SourceEmbedding",
-    "TextToVector",
-    "Text2Vectors",
     "register"
 ]
\ No newline at end of file
diff --git a/pilot/source_embedding/__init__.py b/pilot/source_embedding/__init__.py
index a44cea0a5..9d1e74a31 100644
--- a/pilot/source_embedding/__init__.py
+++ b/pilot/source_embedding/__init__.py
@@ -1,12 +1,8 @@
 from pilot.source_embedding.source_embedding import SourceEmbedding
 from pilot.source_embedding.source_embedding import register
-from pilot.source_embedding.text_to_vector import TextToVector
-from pilot.source_embedding.Text2Vectors import Text2Vectors
 
 
 __all__ = [
     "SourceEmbedding",
-    "TextToVector",
-    "Text2Vectors",
     "register"
 ]
\ No newline at end of file
diff --git a/pilot/source_embedding/csv_embedding.py b/pilot/source_embedding/csv_embedding.py
new file mode 100644
index 000000000..db73ae7e5
--- /dev/null
+++ b/pilot/source_embedding/csv_embedding.py
@@ -0,0 +1,33 @@
+from typing import List, Optional, Dict
+from pilot.source_embedding import SourceEmbedding, register
+
+from langchain.document_loaders import CSVLoader
+from langchain.schema import Document
+
+
+class CSVEmbedding(SourceEmbedding):
+    """Source embedding for CSV files: loads the file as documents and cleans their text."""
+
+    def __init__(self, file_path, model_name, vector_store_config, embedding_args: Optional[Dict] = None):
+        """Store the csv path, model name, vector store config and optional embedding args."""
+        self.file_path = file_path
+        self.model_name = model_name
+        self.vector_store_config = vector_store_config
+        self.embedding_args = embedding_args
+
+    @register
+    def read(self):
+        """Load the file at ``self.file_path`` with ``CSVLoader`` and return its documents."""
+        loader = CSVLoader(file_path=self.file_path)
+        return loader.load()
+
+    @register
+    def data_process(self, documents: List[Document]):
+        """Remove newline characters from each document's page content in place; return the list."""
+        for document in documents:
+            # Mutate the element directly; no parallel index counter is needed.
+            document.page_content = document.page_content.replace("\n", "")
+        return documents
+
+
+
diff --git a/pilot/source_embedding/source_embedding.py b/pilot/source_embedding/source_embedding.py
index ec66e302c..82d91c0ba 100644
--- a/pilot/source_embedding/source_embedding.py
+++ b/pilot/source_embedding/source_embedding.py
@@ -6,8 +6,7 @@ from abc import ABC, abstractmethod
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import Chroma
 
-from typing import List
-
+from typing import List, Optional, Dict
 
 registered_methods = []
 
@@ -23,11 +22,12 @@ class SourceEmbedding(ABC):
     Implementations should implement the  method
     """
 
-    def __init__(self, yuque_path, model_name, vector_store_config):
+    def __init__(self, yuque_path, model_name, vector_store_config, embedding_args: Optional[Dict] = None):
         """Initialize with YuqueLoader url, model_name, vector_store_config"""
         self.yuque_path = yuque_path
         self.model_name = model_name
         self.vector_store_config = vector_store_config
+        self.embedding_args = embedding_args
 
     @abstractmethod
     @register
diff --git a/pilot/source_embedding/url_embedding.py b/pilot/source_embedding/url_embedding.py
index cca2f6bfe..5fa29e0d2 100644
--- a/pilot/source_embedding/url_embedding.py
+++ b/pilot/source_embedding/url_embedding.py
@@ -1,108 +1,38 @@
-from random import random
+from typing import List
+from pilot.source_embedding import SourceEmbedding, register
 
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.vectorstores import Milvus
+from bs4 import BeautifulSoup
 from langchain.document_loaders import WebBaseLoader
-from langchain.text_splitter import CharacterTextSplitter
-from pymilvus import connections, DataType, FieldSchema, CollectionSchema
-from pymilvus import Collection
+from langchain.schema import Document
 
 
 
-from pilot.source_embedding.text_to_vector import TextToVector
+class URLEmbedding(SourceEmbedding):
+    """url embedding for read url document."""
 
+    def __init__(self, file_path, model_name, vector_store_config, embedding_args=None):
+        """Store the url, model name, vector store config and optional embedding args."""
+        self.file_path, self.model_name = file_path, model_name
+        self.vector_store_config = vector_store_config
+        self.embedding_args = embedding_args  # same optional attribute as SourceEmbedding/CSVEmbedding
 
-loader = WebBaseLoader([
-    "https://milvus.io/docs/overview.md",
-])
+    @register
+    def read(self):
+        """Fetch ``self.file_path`` with ``WebBaseLoader`` and return the loaded documents."""
+        documents = WebBaseLoader(web_path=self.file_path).load()
+        return documents
 
-docs = loader.load()
-
-# Split the documents into smaller chunks
-# text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
-# docs = text_splitter.split_documents(docs)
-
-embeddings = TextToVector.textToVector(docs[0].page_content)
-
-milvus = connections.connect(
-  alias="default",
-  host='localhost',
-  port="19530"
-)
-
-# collection = Collection("test_book")
+    @register
+    def data_process(self, documents: List[Document]):
+        """Strip newlines and tags matched by the '!doctype'/'meta' filter; keep each page's text."""
+        for document in documents:
+            # Newlines are stripped before parsing, so the extracted text has none.
+            flattened = document.page_content.replace("\n", "")
+            soup = BeautifulSoup(flattened, 'html.parser')
+            for tag in soup(['!doctype', 'meta']):
+                tag.extract()
+            document.page_content = soup.get_text()
+        return documents
 
 
 
-# data = [{"doc_id": 11011, "content": 11011, "title": 11011,  "vector": embeddings[0]}]
-# # collection = Collection("document")
-#
-# # collection.insert(data=data)
-# entities = [
-#     {
-#         'doc_id': d['doc_id'],
-#         'vector': d['vector'],
-#         'content': d['content'],
-#         'title': d['titlseae'],
-#         "type": DataType.FLOAT_VECTOR
-#     } for d in data
-# ]
-#
-# milvus.insert(collection_name="document", entities=entities)
-# print("success")
-# 定义集合的字段
-# fields = [
-#     FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR),
-#     FieldSchema(name="age", dtype=DataType.INT32),
-#     FieldSchema(name="gender", dtype=DataType.STRING),
-#     FieldSchema(name="id", dtype=DataType.INT64)  # 添加主键字段
-# ]
-
-# book_id = FieldSchema(
-#   name="book_id",
-#   dtype=DataType.INT64,
-#   is_primary=True,
-# )
-# book_name = FieldSchema(
-#   name="book_name",
-#   dtype=DataType.BINARY_VECTOR,
-#   max_length=200,
-# )
-# word_count = FieldSchema(
-#   name="word_count",
-#   dtype=DataType.INT64,
-# )
-# book_intro = FieldSchema(
-#   name="book_intro",
-#   dtype=DataType.FLOAT_VECTOR,
-#   dim=2
-# )
-# schema = CollectionSchema(
-#   fields=[book_id, book_name, word_count, book_intro],
-#   description="Test book search"
-# )
-collection_name = "test_book"
-
-collection = Collection(
-    name=collection_name,
-    schema=schema,
-    using='default',
-    shards_num=2
-    )
-# 插入数据
-# entities = [[
-#     {"book_id": 30, "book_intro": [0.1, 0.2], "word_count": 1},
-#     {"book_id": 25, "book_intro": [0.1, 0.2], "word_count": 2},
-#     {"book_id": 40, "book_intro": [0.1, 0.2], "word_count": 3}
-# ]]
-
-entities = [[30, 25, 40], ["test1", "test2", "test3"], [1, 2, 3], [[0.1, 0.2], [0.1, 0.2], [0.1, 0.2]]]
-
-collection.insert(entities)
-print("success")
-
-# vector_store = Milvus.from_documents(
-#     docs,
-#     embedding=embeddings,
-#     connection_args={"host": "127.0.0.1", "port": "19530", "alias": "default"}
-# )
\ No newline at end of file