feature:url,csv embedding

2025-07-24 12:45:45 +00:00 · 2023-05-11 23:48:56 +08:00 · 2023-05-11 23:48:56 +08:00 · d42a9f3bd1
commit d42a9f3bd1
parent ed855df01d
7 changed files with 86 additions and 108 deletions
--- a/examples/knowledge_embedding/csv_embedding_test.py
+++ b/examples/knowledge_embedding/csv_embedding_test.py
@ -0,0 +1,13 @@
+
+
+from pilot.source_embedding.csv_embedding import CSVEmbedding
+
+# Example script: embed a local CSV file into a vector store.
+# The paths below are machine-specific; adjust them before running.
+path = "/Users/chenketing/Downloads/vectors.csv"
+model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"
+vector_store_path = "/pilot/source_embedding/"
+
+
+csv_embedding = CSVEmbedding(
+    file_path=path,
+    model_name=model_name,
+    # Use the directory configured above as the store location.
+    vector_store_config={"vector_store_name": "url", "vector_store_path": vector_store_path},
+)
+csv_embedding.source_embedding()
+print("success")
--- a/examples/knowledge_embedding/url_embedding_test.py
+++ b/examples/knowledge_embedding/url_embedding_test.py
@ -0,0 +1,10 @@
+from pilot.source_embedding.url_embedding import URLEmbedding
+
+# Example script: embed the content of a web page into a vector store.
+# The model and store paths below are machine-specific; adjust them before running.
+path = "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023"
+model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"
+vector_store_path = "/pilot/source_embedding/"
+
+
+url_embedding = URLEmbedding(
+    file_path=path,
+    model_name=model_name,
+    # Use the directory configured above as the store location.
+    vector_store_config={"vector_store_name": "url", "vector_store_path": vector_store_path},
+)
+url_embedding.source_embedding()
+print("success")
--- a/pilot/init.py
+++ b/pilot/init.py
@ -1,11 +1,7 @@
 from pilot.source_embedding import (SourceEmbedding, register)
-from pilot.source_embedding import TextToVector
-from pilot.source_embedding import Text2Vectors


 __all__ = [
    "SourceEmbedding",
-    "TextToVector",
-    "Text2Vectors",
    "register"
 ]
--- a/pilot/source_embedding/init.py
+++ b/pilot/source_embedding/init.py
@ -1,12 +1,8 @@
 from pilot.source_embedding.source_embedding import SourceEmbedding
 from pilot.source_embedding.source_embedding import register
-from pilot.source_embedding.text_to_vector import TextToVector
-from pilot.source_embedding.Text2Vectors import Text2Vectors


 __all__ = [
    "SourceEmbedding",
-    "TextToVector",
-    "Text2Vectors",
    "register"
 ]
--- a/pilot/source_embedding/csv_embedding.py
+++ b/pilot/source_embedding/csv_embedding.py
@ -0,0 +1,33 @@
+from typing import List, Optional, Dict
+from pilot.source_embedding import SourceEmbedding, register
+
+from langchain.document_loaders import CSVLoader
+from langchain.schema import Document
+
+
+class CSVEmbedding(SourceEmbedding):
+    """Source embedding whose documents are loaded from a CSV file."""
+
+    def __init__(self, file_path, model_name, vector_store_config, embedding_args: Optional[Dict] = None):
+        """Store the CSV path, model name, vector store config and optional embedding args."""
+        self.embedding_args = embedding_args
+        self.vector_store_config = vector_store_config
+        self.model_name = model_name
+        self.file_path = file_path
+
+    @register
+    def read(self):
+        """Return the documents a CSVLoader produces for ``self.file_path``."""
+        return CSVLoader(file_path=self.file_path).load()
+
+    @register
+    def data_process(self, documents: List[Document]):
+        """Remove every newline from each document's page content, in place.
+
+        Returns the same ``documents`` object that was passed in.
+        """
+        for position, doc in enumerate(documents):
+            documents[position].page_content = doc.page_content.replace("\n", "")
+        return documents
+
+
+
--- a/pilot/source_embedding/source_embedding.py
+++ b/pilot/source_embedding/source_embedding.py
@ -6,8 +6,7 @@ from abc import ABC, abstractmethod
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import Chroma

-from typing import List
-
+from typing import List, Optional, Dict

 registered_methods = []

@ -23,11 +22,12 @@ class SourceEmbedding(ABC):
    Implementations should implement the  method
    """

-    def __init__(self, yuque_path, model_name, vector_store_config):
+    def __init__(self, yuque_path, model_name, vector_store_config, embedding_args: Optional[Dict] = None):
        """Initialize with YuqueLoader url, model_name, vector_store_config"""
        self.yuque_path = yuque_path
        self.model_name = model_name
        self.vector_store_config = vector_store_config
+        self.embedding_args = embedding_args

    @abstractmethod
    @register
--- a/pilot/source_embedding/url_embedding.py
+++ b/pilot/source_embedding/url_embedding.py
@ -1,108 +1,38 @@
-from random import random
+from typing import List
+from pilot.source_embedding import SourceEmbedding, register

-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.vectorstores import Milvus
+from bs4 import BeautifulSoup
 from langchain.document_loaders import WebBaseLoader
-from langchain.text_splitter import CharacterTextSplitter
-from pymilvus import connections, DataType, FieldSchema, CollectionSchema
-from pymilvus import Collection
+from langchain.schema import Document



-from pilot.source_embedding.text_to_vector import TextToVector
+class URLEmbedding(SourceEmbedding):
+    """url embedding for read url document."""

+    def __init__(self, file_path, model_name, vector_store_config, embedding_args=None):
+        """Initialize with the url to load, the model name and the vector store config.
+
+        ``embedding_args`` is an optional dict of extra embedding settings; it is
+        stored unchanged and defaults to None, so existing three-argument callers
+        keep working.
+        """
+        self.file_path = file_path
+        self.model_name = model_name
+        self.vector_store_config = vector_store_config
+        self.embedding_args = embedding_args

-loader = WebBaseLoader([
-    "https://milvus.io/docs/overview.md",
-])
+    @register
+    def read(self):
+        """Return the documents a WebBaseLoader produces for ``self.file_path``."""
+        return WebBaseLoader(web_path=self.file_path).load()

-docs = loader.load()
-
-# Split the documents into smaller chunks
-# text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
-# docs = text_splitter.split_documents(docs)
-
-embeddings = TextToVector.textToVector(docs[0].page_content)
-
-milvus = connections.connect(
-  alias="default",
-  host='localhost',
-  port="19530"
-)
-
-# collection = Collection("test_book")
+    @register
+    def data_process(self, documents: List[Document]):
+        """Reduce each document's page content to plain text, in place.
+
+        Newlines are removed first, the result is parsed with the
+        'html.parser' backend, any '!doctype' / 'meta' matches are extracted,
+        and the remaining text replaces the page content. Returns the same
+        ``documents`` object that was passed in.
+        """
+        for position, doc in enumerate(documents):
+            flattened = doc.page_content.replace("\n", "")
+            parsed = BeautifulSoup(flattened, 'html.parser')
+            for node in parsed(['!doctype', 'meta']):
+                node.extract()
+            documents[position].page_content = parsed.get_text()
+        return documents



-# data = [{"doc_id": 11011, "content": 11011, "title": 11011,  "vector": embeddings[0]}]
-# # collection = Collection("document")
-#
-# # collection.insert(data=data)
-# entities = [
-#     {
-#         'doc_id': d['doc_id'],
-#         'vector': d['vector'],
-#         'content': d['content'],
-#         'title': d['titlseae'],
-#         "type": DataType.FLOAT_VECTOR
-#     } for d in data
-# ]
-#
-# milvus.insert(collection_name="document", entities=entities)
-# print("success")
-# 定义集合的字段
-# fields = [
-#     FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR),
-#     FieldSchema(name="age", dtype=DataType.INT32),
-#     FieldSchema(name="gender", dtype=DataType.STRING),
-#     FieldSchema(name="id", dtype=DataType.INT64)  # 添加主键字段
-# ]
-
-# book_id = FieldSchema(
-#   name="book_id",
-#   dtype=DataType.INT64,
-#   is_primary=True,
-# )
-# book_name = FieldSchema(
-#   name="book_name",
-#   dtype=DataType.BINARY_VECTOR,
-#   max_length=200,
-# )
-# word_count = FieldSchema(
-#   name="word_count",
-#   dtype=DataType.INT64,
-# )
-# book_intro = FieldSchema(
-#   name="book_intro",
-#   dtype=DataType.FLOAT_VECTOR,
-#   dim=2
-# )
-# schema = CollectionSchema(
-#   fields=[book_id, book_name, word_count, book_intro],
-#   description="Test book search"
-# )
-collection_name = "test_book"
-
-collection = Collection(
-    name=collection_name,
-    schema=schema,
-    using='default',
-    shards_num=2
-    )
-# 插入数据
-# entities = [[
-#     {"book_id": 30, "book_intro": [0.1, 0.2], "word_count": 1},
-#     {"book_id": 25, "book_intro": [0.1, 0.2], "word_count": 2},
-#     {"book_id": 40, "book_intro": [0.1, 0.2], "word_count": 3}
-# ]]
-
-entities = [[30, 25, 40], ["test1", "test2", "test3"], [1, 2, 3], [[0.1, 0.2], [0.1, 0.2], [0.1, 0.2]]]
-
-collection.insert(entities)
-print("success")
-
-# vector_store = Milvus.from_documents(
-#     docs,
-#     embedding=embeddings,
-#     connection_args={"host": "127.0.0.1", "port": "19530", "alias": "default"}
-# )