feature:url,csv embedding

2025-09-10 05:19:44 +00:00 · 2023-05-11 23:48:56 +08:00
parent ed855df01d
commit d42a9f3bd1
7 changed files with 86 additions and 108 deletions
--- a/examples/knowledge_embedding/csv_embedding_test.py
+++ b/examples/knowledge_embedding/csv_embedding_test.py
@@ -0,0 +1,13 @@
 from pilot.source_embedding.csv_embedding import CSVEmbedding
 # Example script: embed a local CSV file with CSVEmbedding.
 # path = "/Users/chenketing/Downloads/share_ireserve双写数据异常2.xlsx"
 path = "/Users/chenketing/Downloads/vectors.csv"
 model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"
 vector_store_path = "/pilot/source_embedding/"
 # Pass the path variable itself; the previous version passed the literal
 # string "vector_store_path", leaving the variable above unused.
 # NOTE(review): the store name "url" looks copied from the URL example — confirm.
 csv_embedding = CSVEmbedding(
     file_path=path,
     model_name=model_name,
     vector_store_config={
         "vector_store_name": "url",
         "vector_store_path": vector_store_path,
     },
 )
 csv_embedding.source_embedding()
 print("success")
--- a/examples/knowledge_embedding/url_embedding_test.py
+++ b/examples/knowledge_embedding/url_embedding_test.py
@@ -0,0 +1,10 @@
 from pilot.source_embedding.url_embedding import URLEmbedding
 # Example script: embed the content of a web page with URLEmbedding.
 path = "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023"
 model_name = "/Users/chenketing/Desktop/project/all-MiniLM-L6-v2"
 vector_store_path = "/pilot/source_embedding/"
 # Pass the path variable itself; the previous version passed the literal
 # string "vector_store_path", leaving the variable above unused.
 url_embedding = URLEmbedding(
     file_path=path,
     model_name=model_name,
     vector_store_config={
         "vector_store_name": "url",
         "vector_store_path": vector_store_path,
     },
 )
 url_embedding.source_embedding()
 print("success")
--- a/pilot/init.py
+++ b/pilot/init.py
@@ -1,11 +1,7 @@
 from pilot.source_embedding import (SourceEmbedding, register)
 from pilot.source_embedding import TextToVector
 from pilot.source_embedding import Text2Vectors
 __all__ = [
    "SourceEmbedding",
    "TextToVector",
    "Text2Vectors",
    "register"
 ]
--- a/pilot/source_embedding/init.py
+++ b/pilot/source_embedding/init.py
@@ -1,12 +1,8 @@
 from pilot.source_embedding.source_embedding import SourceEmbedding
 from pilot.source_embedding.source_embedding import register
 from pilot.source_embedding.text_to_vector import TextToVector
 from pilot.source_embedding.Text2Vectors import Text2Vectors
 __all__ = [
    "SourceEmbedding",
    "TextToVector",
    "Text2Vectors",
    "register"
 ]
--- a/pilot/source_embedding/csv_embedding.py
+++ b/pilot/source_embedding/csv_embedding.py
@@ -0,0 +1,33 @@
 from typing import List, Optional, Dict
 from pilot.source_embedding import SourceEmbedding, register
 from langchain.document_loaders import CSVLoader
 from langchain.schema import Document
 class CSVEmbedding(SourceEmbedding):
    """Source embedding that reads its documents from a CSV file."""
    def __init__(self, file_path, model_name, vector_store_config, embedding_args: Optional[Dict] = None):
        """Store the CSV location and the embedding configuration.

        Args:
            file_path: Path of the CSV file handed to ``CSVLoader``.
            model_name: Embedding model name or path, stored as given.
            vector_store_config: Vector store settings, stored as given.
            embedding_args: Optional extra arguments, stored as given.
        """
        self.file_path = file_path
        self.model_name = model_name
        self.vector_store_config = vector_store_config
        self.embedding_args = embedding_args
    @register
    def read(self):
        """Load the CSV at ``self.file_path`` and return ``CSVLoader.load()``'s result."""
        loader = CSVLoader(file_path=self.file_path)
        return loader.load()
    @register
    def data_process(self, documents: List[Document]):
        """Strip newline characters from every document's ``page_content``.

        The documents are modified in place and the same ``documents``
        object is returned.
        """
        # Mutate each document directly instead of tracking a parallel index.
        for document in documents:
            document.page_content = document.page_content.replace("\n", "")
        return documents
--- a/pilot/source_embedding/source_embedding.py
+++ b/pilot/source_embedding/source_embedding.py
@@ -6,8 +6,7 @@ from abc import ABC, abstractmethod
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import Chroma
-from typing import List
+from typing import List, Optional, Dict
 registered_methods = []
@@ -23,11 +22,12 @@ class SourceEmbedding(ABC):
    Implementations should implement the  method
    """
-    def __init__(self, yuque_path, model_name, vector_store_config):
+    def __init__(self, yuque_path, model_name, vector_store_config, embedding_args: Optional[Dict] = None):
        """Initialize with YuqueLoader url, model_name, vector_store_config"""
        self.yuque_path = yuque_path
        self.model_name = model_name
        self.vector_store_config = vector_store_config
        self.embedding_args = embedding_args
    @abstractmethod
    @register
--- a/pilot/source_embedding/url_embedding.py
+++ b/pilot/source_embedding/url_embedding.py
@@ -1,108 +1,38 @@
-from random import random
+from typing import List
 from pilot.source_embedding import SourceEmbedding, register
-from langchain.embeddings.openai import OpenAIEmbeddings
+from bs4 import BeautifulSoup
 from langchain.vectorstores import Milvus
 from langchain.document_loaders import WebBaseLoader
-from langchain.text_splitter import CharacterTextSplitter
+from langchain.schema import Document
 from pymilvus import connections, DataType, FieldSchema, CollectionSchema
 from pymilvus import Collection
-from pilot.source_embedding.text_to_vector import TextToVector
+class URLEmbedding(SourceEmbedding):
    """url embedding for read url document."""
    def __init__(self, file_path, model_name, vector_store_config, embedding_args=None):
        """Initialize with url path.

        Args:
            file_path: URL of the page to load (passed to ``WebBaseLoader``).
            model_name: Embedding model name or path, stored as given.
            vector_store_config: Vector store settings, stored as given.
            embedding_args: Optional extra arguments, stored as given. Accepted
                so this constructor matches ``SourceEmbedding`` and
                ``CSVEmbedding``, which both store this attribute.
        """
        self.file_path = file_path
        self.model_name = model_name
        self.vector_store_config = vector_store_config
        self.embedding_args = embedding_args
-loader = WebBaseLoader([
+    @register
-    "https://milvus.io/docs/overview.md",
+    def read(self):
-])
+        """Load from url path."""
        loader = WebBaseLoader(web_path=self.file_path)
        return loader.load()
-docs = loader.load()
+    @register
-
+    def data_process(self, documents: List[Document]):
-# Split the documents into smaller chunks
+        i = 0
-# text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
+        for d in documents:
-# docs = text_splitter.split_documents(docs)
+            content = d.page_content.replace("\n", "")
-
+            soup = BeautifulSoup(content, 'html.parser')
-embeddings = TextToVector.textToVector(docs[0].page_content)
+            for tag in soup(['!doctype', 'meta']):
-
+                tag.extract()
-milvus = connections.connect(
+            documents[i].page_content = soup.get_text()
-  alias="default",
+            i += 1
-  host='localhost',
+        return documents
  port="19530"
 )
 # collection = Collection("test_book")
 # data = [{"doc_id": 11011, "content": 11011, "title": 11011,  "vector": embeddings[0]}]
 # # collection = Collection("document")
 #
 # # collection.insert(data=data)
 # entities = [
 #     {
 #         'doc_id': d['doc_id'],
 #         'vector': d['vector'],
 #         'content': d['content'],
 #         'title': d['title'],
 #         "type": DataType.FLOAT_VECTOR
 #     } for d in data
 # ]
 #
 # milvus.insert(collection_name="document", entities=entities)
 # print("success")
 # Define the collection's fields
 # fields = [
 #     FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR),
 #     FieldSchema(name="age", dtype=DataType.INT32),
 #     FieldSchema(name="gender", dtype=DataType.STRING),
 #     FieldSchema(name="id", dtype=DataType.INT64)  # add the primary-key field
 # ]
 # book_id = FieldSchema(
 #   name="book_id",
 #   dtype=DataType.INT64,
 #   is_primary=True,
 # )
 # book_name = FieldSchema(
 #   name="book_name",
 #   dtype=DataType.BINARY_VECTOR,
 #   max_length=200,
 # )
 # word_count = FieldSchema(
 #   name="word_count",
 #   dtype=DataType.INT64,
 # )
 # book_intro = FieldSchema(
 #   name="book_intro",
 #   dtype=DataType.FLOAT_VECTOR,
 #   dim=2
 # )
 # schema = CollectionSchema(
 #   fields=[book_id, book_name, word_count, book_intro],
 #   description="Test book search"
 # )
 # Build a pymilvus Collection named "test_book" on the 'default' alias,
 # insert three sample rows, and print a confirmation.
 # NOTE(review): no uncommented assignment of `schema` is visible in this
 # view (it appears only in commented-out code) — confirm it is defined
 # before this statement runs.
 collection_name = "test_book"
 collection = Collection(
    name=collection_name,
    schema=schema,
    using='default',
    shards_num=2
    )
 # Insert data
 # entities = [[
 #     {"book_id": 30, "book_intro": [0.1, 0.2], "word_count": 1},
 #     {"book_id": 25, "book_intro": [0.1, 0.2], "word_count": 2},
 #     {"book_id": 40, "book_intro": [0.1, 0.2], "word_count": 3}
 # ]]
 # Presumably column-oriented: one inner list per field, in the order
 # book_id, book_name, word_count, book_intro — TODO confirm against the schema.
 entities = [[30, 25, 40], ["test1", "test2", "test3"], [1, 2, 3], [[0.1, 0.2], [0.1, 0.2], [0.1, 0.2]]]
 collection.insert(entities)
 print("success")
 # vector_store = Milvus.from_documents(
 #     docs,
 #     embedding=embeddings,
 #     connection_args={"host": "127.0.0.1", "port": "19530", "alias": "default"}
 # )