From 97c9cfbd6b5cf6bbe639c6ce8eba2939816ce337 Mon Sep 17 00:00:00 2001 From: aries_ckt <916701291@qq.com> Date: Mon, 3 Jul 2023 11:53:26 +0800 Subject: [PATCH 1/3] fix:delete source_embedding 1.rename source_embedding to embedding_engine --- datacenter/next.config.js | 2 +- pilot/server/webserver.py | 4 +- pilot/source_embedding/EncodeTextLoader.py | 26 ----- pilot/source_embedding/__init__.py | 3 - .../source_embedding/chn_document_splitter.py | 55 ---------- pilot/source_embedding/csv_embedding.py | 36 ------- pilot/source_embedding/external/__init__.py | 0 pilot/source_embedding/knowledge_embedding.py | 60 ----------- pilot/source_embedding/markdown_embedding.py | 51 --------- pilot/source_embedding/pdf_embedding.py | 44 -------- pilot/source_embedding/pdf_loader.py | 55 ---------- pilot/source_embedding/ppt_embedding.py | 41 ------- pilot/source_embedding/search_milvus.py | 61 ----------- pilot/source_embedding/source_embedding.py | 101 ------------------ pilot/source_embedding/string_embedding.py | 29 ----- pilot/source_embedding/url_embedding.py | 49 --------- pilot/source_embedding/word_embedding.py | 39 ------- 17 files changed, 2 insertions(+), 654 deletions(-) delete mode 100644 pilot/source_embedding/EncodeTextLoader.py delete mode 100644 pilot/source_embedding/__init__.py delete mode 100644 pilot/source_embedding/chn_document_splitter.py delete mode 100644 pilot/source_embedding/csv_embedding.py delete mode 100644 pilot/source_embedding/external/__init__.py delete mode 100644 pilot/source_embedding/knowledge_embedding.py delete mode 100644 pilot/source_embedding/markdown_embedding.py delete mode 100644 pilot/source_embedding/pdf_embedding.py delete mode 100644 pilot/source_embedding/pdf_loader.py delete mode 100644 pilot/source_embedding/ppt_embedding.py delete mode 100644 pilot/source_embedding/search_milvus.py delete mode 100644 pilot/source_embedding/source_embedding.py delete mode 100644 pilot/source_embedding/string_embedding.py delete mode 100644 pilot/source_embedding/url_embedding.py delete mode 100644 pilot/source_embedding/word_embedding.py diff --git a/datacenter/next.config.js b/datacenter/next.config.js index 3ce9719fa..a07800a23 100644 --- a/datacenter/next.config.js +++ b/datacenter/next.config.js @@ -8,7 +8,7 @@ const nextConfig = { ignoreBuildErrors: true }, env: { - API_BASE_URL: process.env.API_BASE_URL || 'http://localhost:5000' + API_BASE_URL: process.env.API_BASE_URL || 'https://u158074-879a-d00019a9.westa.seetacloud.com:8443' } } diff --git a/pilot/server/webserver.py b/pilot/server/webserver.py index 2cc1cdc80..0866e8be9 100644 --- a/pilot/server/webserver.py +++ b/pilot/server/webserver.py @@ -37,7 +37,7 @@ from pilot.conversation import ( from pilot.server.gradio_css import code_highlight_css from pilot.server.gradio_patch import Chatbot as grChatbot -from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding +from pilot.embedding_engine.knowledge_embedding import KnowledgeEmbedding from pilot.utils import build_logger from pilot.vector_store.extract_tovec import ( get_vector_storelist, @@ -335,8 +335,6 @@ def http_bot( } elif ChatScene.ChatNewKnowledge == scene: chat_param = { - "temperature": temperature, - "max_new_tokens": max_new_tokens, "chat_session_id": state.conv_id, "user_input": state.last_user_input, "knowledge_name": knowledge_name, diff --git a/pilot/source_embedding/EncodeTextLoader.py b/pilot/source_embedding/EncodeTextLoader.py deleted file mode 100644 index 2b7344f18..000000000 --- a/pilot/source_embedding/EncodeTextLoader.py +++ /dev/null @@ -1,26 +0,0 @@ -from typing import List, Optional -import chardet - -from langchain.docstore.document import Document -from langchain.document_loaders.base import BaseLoader - - -class EncodeTextLoader(BaseLoader): - """Load text files.""" - - def __init__(self, file_path: str, encoding: Optional[str] = None): - """Initialize with file path.""" - self.file_path = file_path - self.encoding = encoding - - def load(self) -> List[Document]: - """Load from file path.""" - with open(self.file_path, "rb") as f: - raw_text = f.read() - result = chardet.detect(raw_text) - if result["encoding"] is None: - text = raw_text.decode("utf-8") - else: - text = raw_text.decode(result["encoding"]) - metadata = {"source": self.file_path} - return [Document(page_content=text, metadata=metadata)] diff --git a/pilot/source_embedding/__init__.py b/pilot/source_embedding/__init__.py deleted file mode 100644 index 464ff11b1..000000000 --- a/pilot/source_embedding/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from pilot.source_embedding.source_embedding import SourceEmbedding, register - -__all__ = ["SourceEmbedding", "register"] diff --git a/pilot/source_embedding/chn_document_splitter.py b/pilot/source_embedding/chn_document_splitter.py deleted file mode 100644 index 5bf06ea8c..000000000 --- a/pilot/source_embedding/chn_document_splitter.py +++ /dev/null @@ -1,55 +0,0 @@ -import re -from typing import List - -from langchain.text_splitter import CharacterTextSplitter - - -class CHNDocumentSplitter(CharacterTextSplitter): - def __init__(self, pdf: bool = False, sentence_size: int = None, **kwargs): - super().__init__(**kwargs) - self.pdf = pdf - self.sentence_size = sentence_size - - def split_text(self, text: str) -> List[str]: - if self.pdf: - text = re.sub(r"\n{3,}", r"\n", text) - text = re.sub("\s", " ", text) - text = re.sub("\n\n", "", text) - - text = re.sub(r"([;;.!?。!?\?])([^”’])", r"\1\n\2", text) - text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text) - text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text) - text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r"\1\n\2", text) - text = text.rstrip() - ls = [i for i in text.split("\n") if i] - for ele in ls: - if len(ele) > self.sentence_size: - ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r"\1\n\2", ele) - ele1_ls = ele1.split("\n") - for ele_ele1 in ele1_ls: - if len(ele_ele1) > self.sentence_size: - ele_ele2 = re.sub( - r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r"\1\n\2", ele_ele1 - ) - ele2_ls = ele_ele2.split("\n") - for ele_ele2 in ele2_ls: - if len(ele_ele2) > self.sentence_size: - ele_ele3 = re.sub( - '( ["’”」』]{0,2})([^ ])', r"\1\n\2", ele_ele2 - ) - ele2_id = ele2_ls.index(ele_ele2) - ele2_ls = ( - ele2_ls[:ele2_id] - + [i for i in ele_ele3.split("\n") if i] - + ele2_ls[ele2_id + 1 :] - ) - ele_id = ele1_ls.index(ele_ele1) - ele1_ls = ( - ele1_ls[:ele_id] - + [i for i in ele2_ls if i] - + ele1_ls[ele_id + 1 :] - ) - - id = ls.index(ele) - ls = ls[:id] + [i for i in ele1_ls if i] + ls[id + 1 :] - return ls diff --git a/pilot/source_embedding/csv_embedding.py b/pilot/source_embedding/csv_embedding.py deleted file mode 100644 index 0e69574b4..000000000 --- a/pilot/source_embedding/csv_embedding.py +++ /dev/null @@ -1,36 +0,0 @@ -from typing import Dict, List, Optional - -from langchain.document_loaders import CSVLoader -from langchain.schema import Document - -from pilot.source_embedding import SourceEmbedding, register - - -class CSVEmbedding(SourceEmbedding): - """csv embedding for read csv document.""" - - def __init__( - self, - file_path, - vector_store_config, - embedding_args: Optional[Dict] = None, - ): - """Initialize with csv path.""" - super().__init__(file_path, vector_store_config) - self.file_path = file_path - self.vector_store_config = vector_store_config - self.embedding_args = embedding_args - - @register - def read(self): - """Load from csv path.""" - loader = CSVLoader(file_path=self.file_path) - return loader.load() - - @register - def data_process(self, documents: List[Document]): - i = 0 - for d in documents: - documents[i].page_content = d.page_content.replace("\n", "") - i += 1 - return documents diff --git a/pilot/source_embedding/external/__init__.py b/pilot/source_embedding/external/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/pilot/source_embedding/knowledge_embedding.py b/pilot/source_embedding/knowledge_embedding.py deleted file mode 100644 index b36880067..000000000 --- a/pilot/source_embedding/knowledge_embedding.py +++ /dev/null @@ -1,60 +0,0 @@ -from typing import Optional - -from chromadb.errors import NotEnoughElementsException -from langchain.embeddings import HuggingFaceEmbeddings - -from pilot.configs.config import Config -from pilot.embedding_engine.knowledge_type import get_knowledge_embedding -from pilot.vector_store.connector import VectorStoreConnector - -CFG = Config() - - -class KnowledgeEmbedding: - def __init__( - self, - model_name, - vector_store_config, - file_type: Optional[str] = "default", - file_path: Optional[str] = None, - ): - """Initialize with Loader url, model_name, vector_store_config""" - self.file_path = file_path - self.model_name = model_name - self.vector_store_config = vector_store_config - self.file_type = file_type - self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name) - self.vector_store_config["embeddings"] = self.embeddings - - def knowledge_embedding(self): - self.knowledge_embedding_client = self.init_knowledge_embedding() - self.knowledge_embedding_client.source_embedding() - - def knowledge_embedding_batch(self, docs): - # docs = self.knowledge_embedding_client.read_batch() - self.knowledge_embedding_client.index_to_store(docs) - - def read(self): - self.knowledge_embedding_client = self.init_knowledge_embedding() - return self.knowledge_embedding_client.read_batch() - - def init_knowledge_embedding(self): - return get_knowledge_embedding( - self.file_type.upper(), self.file_path, self.vector_store_config - ) - - def similar_search(self, text, topk): - vector_client = VectorStoreConnector( - CFG.VECTOR_STORE_TYPE, self.vector_store_config - ) - try: - ans = vector_client.similar_search(text, topk) - except NotEnoughElementsException: - ans = vector_client.similar_search(text, 1) - return ans - - def vector_exist(self): - vector_client = VectorStoreConnector( - CFG.VECTOR_STORE_TYPE, self.vector_store_config - ) - return vector_client.vector_name_exists() diff --git a/pilot/source_embedding/markdown_embedding.py b/pilot/source_embedding/markdown_embedding.py deleted file mode 100644 index d8caee959..000000000 --- a/pilot/source_embedding/markdown_embedding.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import os -from typing import List - -import markdown -from bs4 import BeautifulSoup -from langchain.schema import Document -from langchain.text_splitter import SpacyTextSplitter - -from pilot.configs.config import Config -from pilot.source_embedding import SourceEmbedding, register -from pilot.source_embedding.EncodeTextLoader import EncodeTextLoader -from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter - -CFG = Config() - - -class MarkdownEmbedding(SourceEmbedding): - """markdown embedding for read markdown document.""" - - def __init__(self, file_path, vector_store_config): - """Initialize with markdown path.""" - super().__init__(file_path, vector_store_config) - self.file_path = file_path - self.vector_store_config = vector_store_config - # self.encoding = encoding - - @register - def read(self): - """Load from markdown path.""" - loader = EncodeTextLoader(self.file_path) - textsplitter = SpacyTextSplitter( - pipeline="zh_core_web_sm", - chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, - chunk_overlap=100, - ) - return loader.load_and_split(textsplitter) - - @register - def data_process(self, documents: List[Document]): - i = 0 - for d in documents: - content = markdown.markdown(d.page_content) - soup = BeautifulSoup(content, "html.parser") - for tag in soup(["!doctype", "meta", "i.fa"]): - tag.extract() - documents[i].page_content = soup.get_text() - documents[i].page_content = documents[i].page_content.replace("\n", " ") - i += 1 - return documents diff --git a/pilot/source_embedding/pdf_embedding.py b/pilot/source_embedding/pdf_embedding.py deleted file mode 100644 index dd8c39c03..000000000 --- a/pilot/source_embedding/pdf_embedding.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -from typing import List - -from langchain.document_loaders import PyPDFLoader -from langchain.schema import Document -from langchain.text_splitter import SpacyTextSplitter - -from pilot.configs.config import Config -from pilot.source_embedding import SourceEmbedding, register - -CFG = Config() - - -class PDFEmbedding(SourceEmbedding): - """pdf embedding for read pdf document.""" - - def __init__(self, file_path, vector_store_config): - """Initialize with pdf path.""" - super().__init__(file_path, vector_store_config) - self.file_path = file_path - self.vector_store_config = vector_store_config - - @register - def read(self): - """Load from pdf path.""" - loader = PyPDFLoader(self.file_path) - # textsplitter = CHNDocumentSplitter( - # pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE - # ) - textsplitter = SpacyTextSplitter( - pipeline="zh_core_web_sm", - chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, - chunk_overlap=100, - ) - return loader.load_and_split(textsplitter) - - @register - def data_process(self, documents: List[Document]): - i = 0 - for d in documents: - documents[i].page_content = d.page_content.replace("\n", "") - i += 1 - return documents diff --git a/pilot/source_embedding/pdf_loader.py b/pilot/source_embedding/pdf_loader.py deleted file mode 100644 index bbeead0cd..000000000 --- a/pilot/source_embedding/pdf_loader.py +++ /dev/null @@ -1,55 +0,0 @@ -"""Loader that loads image files.""" -import os -from typing import List - -import fitz -from langchain.document_loaders.unstructured import UnstructuredFileLoader -from paddleocr import PaddleOCR - - -class UnstructuredPaddlePDFLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load image files, such as PNGs and JPGs.""" - - def _get_elements(self) -> List: - def pdf_ocr_txt(filepath, dir_path="tmp_files"): - full_dir_path = os.path.join(os.path.dirname(filepath), dir_path) - if not os.path.exists(full_dir_path): - os.makedirs(full_dir_path) - filename = os.path.split(filepath)[-1] - ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False) - doc = fitz.open(filepath) - txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename)) - img_name = os.path.join(full_dir_path, ".tmp.png") - with open(txt_file_path, "w", encoding="utf-8") as fout: - for i in range(doc.page_count): - page = doc[i] - text = page.get_text("") - fout.write(text) - fout.write("\n") - - img_list = page.get_images() - for img in img_list: - pix = fitz.Pixmap(doc, img[0]) - - pix.save(img_name) - - result = ocr.ocr(img_name) - ocr_result = [i[1][0] for line in result for i in line] - fout.write("\n".join(ocr_result)) - os.remove(img_name) - return txt_file_path - - txt_file_path = pdf_ocr_txt(self.file_path) - from unstructured.partition.text import partition_text - - return partition_text(filename=txt_file_path, **self.unstructured_kwargs) - - -if __name__ == "__main__": - filepath = os.path.join( - os.path.dirname(os.path.dirname(__file__)), "content", "samples", "test_py.pdf" - ) - loader = UnstructuredPaddlePDFLoader(filepath, mode="elements") - docs = loader.load() - for doc in docs: - print(doc) diff --git a/pilot/source_embedding/ppt_embedding.py b/pilot/source_embedding/ppt_embedding.py deleted file mode 100644 index 583b29ed1..000000000 --- a/pilot/source_embedding/ppt_embedding.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -from typing import List - -from langchain.document_loaders import UnstructuredPowerPointLoader -from langchain.schema import Document -from langchain.text_splitter import SpacyTextSplitter - -from pilot.configs.config import Config -from pilot.source_embedding import SourceEmbedding, register - -CFG = Config() - - -class PPTEmbedding(SourceEmbedding): - """ppt embedding for read ppt document.""" - - def __init__(self, file_path, vector_store_config): - """Initialize with pdf path.""" - super().__init__(file_path, vector_store_config) - self.file_path = file_path - self.vector_store_config = vector_store_config - - @register - def read(self): - """Load from ppt path.""" - loader = UnstructuredPowerPointLoader(self.file_path) - textsplitter = SpacyTextSplitter( - pipeline="zh_core_web_sm", - chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, - chunk_overlap=200, - ) - return loader.load_and_split(textsplitter) - - @register - def data_process(self, documents: List[Document]): - i = 0 - for d in documents: - documents[i].page_content = d.page_content.replace("\n", "") - i += 1 - return documents diff --git a/pilot/source_embedding/search_milvus.py b/pilot/source_embedding/search_milvus.py deleted file mode 100644 index aa02c1f61..000000000 --- a/pilot/source_embedding/search_milvus.py +++ /dev/null @@ -1,61 +0,0 @@ -# from langchain.embeddings import HuggingFaceEmbeddings -# from langchain.vectorstores import Milvus -# from pymilvus import Collection,utility -# from pymilvus import connections, DataType, FieldSchema, CollectionSchema -# -# # milvus = connections.connect( -# # alias="default", -# # host='localhost', -# # port="19530" -# # ) -# # collection = Collection("book") -# -# -# # Get an existing collection. -# # collection.load() -# # -# # search_params = {"metric_type": "L2", "params": {}, "offset": 5} -# # -# # results = collection.search( -# # data=[[0.1, 0.2]], -# # anns_field="book_intro", -# # param=search_params, -# # limit=10, -# # expr=None, -# # output_fields=['book_id'], -# # consistency_level="Strong" -# # ) -# # -# # # get the IDs of all returned hits -# # results[0].ids -# # -# # # get the distances to the query vector from all returned hits -# # results[0].distances -# # -# # # get the value of an output field specified in the search request. -# # # vector fields are not supported yet. -# # hit = results[0][0] -# # hit.entity.get('title') -# -# # milvus = connections.connect( -# # alias="default", -# # host='localhost', -# # port="19530" -# # ) -# from pilot.vector_store.milvus_store import MilvusStore -# -# data = ["aaa", "bbb"] -# model_name = "xx/all-MiniLM-L6-v2" -# embeddings = HuggingFaceEmbeddings(model_name=model_name) -# -# # text_embeddings = Text2Vectors() -# mivuls = MilvusStore(cfg={"url": "127.0.0.1", "port": "19530", "alias": "default", "table_name": "test_k"}) -# -# mivuls.insert(["textc","tezt2"]) -# print("success") -# ct -# # mivuls.from_texts(texts=data, embedding=embeddings) -# # docs, -# # embedding=embeddings, -# # connection_args={"host": "127.0.0.1", "port": "19530", "alias": "default"} -# # ) diff --git a/pilot/source_embedding/source_embedding.py b/pilot/source_embedding/source_embedding.py deleted file mode 100644 index 372e35c22..000000000 --- a/pilot/source_embedding/source_embedding.py +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -from abc import ABC, abstractmethod -from typing import Dict, List, Optional - -from chromadb.errors import NotEnoughElementsException -from pilot.configs.config import Config -from pilot.vector_store.connector import VectorStoreConnector - -registered_methods = [] -CFG = Config() - - -def register(method): - registered_methods.append(method.__name__) - return method - - -class SourceEmbedding(ABC): - """base class for read data source embedding pipeline. - include data read, data process, data split, data to vector, data index vector store - Implementations should implement the method - """ - - def __init__( - self, - file_path, - vector_store_config, - embedding_args: Optional[Dict] = None, - ): - """Initialize with Loader url, model_name, vector_store_config""" - self.file_path = file_path - self.vector_store_config = vector_store_config - self.embedding_args = embedding_args - self.embeddings = vector_store_config["embeddings"] - - @abstractmethod - @register - def read(self) -> List[ABC]: - """read datasource into document objects.""" - - @register - def data_process(self, text): - """pre process data.""" - - @register - def text_split(self, text): - """text split chunk""" - pass - - @register - def text_to_vector(self, docs): - """transform vector""" - pass - - @register - def index_to_store(self, docs): - """index to vector store""" - self.vector_client = VectorStoreConnector( - CFG.VECTOR_STORE_TYPE, self.vector_store_config - ) - self.vector_client.load_document(docs) - - @register - def similar_search(self, doc, topk): - """vector store similarity_search""" - self.vector_client = VectorStoreConnector( - CFG.VECTOR_STORE_TYPE, self.vector_store_config - ) - try: - ans = self.vector_client.similar_search(doc, topk) - except NotEnoughElementsException: - ans = self.vector_client.similar_search(doc, 1) - return ans - - def vector_name_exist(self): - self.vector_client = VectorStoreConnector( - CFG.VECTOR_STORE_TYPE, self.vector_store_config - ) - return self.vector_client.vector_name_exists() - - def source_embedding(self): - if "read" in registered_methods: - text = self.read() - if "data_process" in registered_methods: - text = self.data_process(text) - if "text_split" in registered_methods: - self.text_split(text) - if "text_to_vector" in registered_methods: - self.text_to_vector(text) - if "index_to_store" in registered_methods: - self.index_to_store(text) - - def read_batch(self): - if "read" in registered_methods: - text = self.read() - if "data_process" in registered_methods: - text = self.data_process(text) - if "text_split" in registered_methods: - self.text_split(text) - return text diff --git a/pilot/source_embedding/string_embedding.py b/pilot/source_embedding/string_embedding.py deleted file mode 100644 index a1d18ee82..000000000 --- a/pilot/source_embedding/string_embedding.py +++ /dev/null @@ -1,29 +0,0 @@ -from typing import List - -from langchain.schema import Document - -from pilot import SourceEmbedding, register - - -class StringEmbedding(SourceEmbedding): - """string embedding for read string document.""" - - def __init__(self, file_path, vector_store_config): - """Initialize with pdf path.""" - super().__init__(file_path, vector_store_config) - self.file_path = file_path - self.vector_store_config = vector_store_config - - @register - def read(self): - """Load from String path.""" - metadata = {"source": "db_summary"} - return [Document(page_content=self.file_path, metadata=metadata)] - - @register - def data_process(self, documents: List[Document]): - i = 0 - for d in documents: - documents[i].page_content = d.page_content.replace("\n", "") - i += 1 - return documents diff --git a/pilot/source_embedding/url_embedding.py b/pilot/source_embedding/url_embedding.py deleted file mode 100644 index a315e6e45..000000000 --- a/pilot/source_embedding/url_embedding.py +++ /dev/null @@ -1,49 +0,0 @@ -from typing import List - -from bs4 import BeautifulSoup -from langchain.document_loaders import WebBaseLoader -from langchain.schema import Document -from langchain.text_splitter import CharacterTextSplitter - -from pilot.configs.config import Config -from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE -from pilot.source_embedding import SourceEmbedding, register -from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter - -CFG = Config() - - -class URLEmbedding(SourceEmbedding): - """url embedding for read url document.""" - - def __init__(self, file_path, vector_store_config): - """Initialize with url path.""" - super().__init__(file_path, vector_store_config) - self.file_path = file_path - self.vector_store_config = vector_store_config - - @register - def read(self): - """Load from url path.""" - loader = WebBaseLoader(web_path=self.file_path) - if CFG.LANGUAGE == "en": - text_splitter = CharacterTextSplitter( - chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, - chunk_overlap=20, - length_function=len, - ) - else: - text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000) - return loader.load_and_split(text_splitter) - - @register - def data_process(self, documents: List[Document]): - i = 0 - for d in documents: - content = d.page_content.replace("\n", "") - soup = BeautifulSoup(content, "html.parser") - for tag in soup(["!doctype", "meta"]): - tag.extract() - documents[i].page_content = soup.get_text() - i += 1 - return documents diff --git a/pilot/source_embedding/word_embedding.py b/pilot/source_embedding/word_embedding.py deleted file mode 100644 index 1f30f241c..000000000 --- a/pilot/source_embedding/word_embedding.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -from typing import List - -from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader -from langchain.schema import Document - -from pilot.configs.config import Config -from pilot.source_embedding import SourceEmbedding, register -from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter - -CFG = Config() - - -class WordEmbedding(SourceEmbedding): - """word embedding for read word document.""" - - def __init__(self, file_path, vector_store_config): - """Initialize with word path.""" - super().__init__(file_path, vector_store_config) - self.file_path = file_path - self.vector_store_config = vector_store_config - - @register - def read(self): - """Load from word path.""" - loader = UnstructuredWordDocumentLoader(self.file_path) - textsplitter = CHNDocumentSplitter( - pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE - ) - return loader.load_and_split(textsplitter) - - @register - def data_process(self, documents: List[Document]): - i = 0 - for d in documents: - documents[i].page_content = d.page_content.replace("\n", "") - i += 1 - return documents From b831ee586356bc0a038de91f07a05953788e0d00 Mon Sep 17 00:00:00 2001 From: aries_ckt <916701291@qq.com> Date: Mon, 3 Jul 2023 15:03:23 +0800 Subject: [PATCH 2/3] refactor: webserver chat delete temperature, max_new_tokens --- pilot/embedding_engine/source_embedding.py | 14 ++++++++++---- pilot/server/webserver.py | 12 ------------ tools/knowledge_init.py | 1 + 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/pilot/embedding_engine/source_embedding.py b/pilot/embedding_engine/source_embedding.py index b99529cf9..372e35c22 100644 --- a/pilot/embedding_engine/source_embedding.py +++ b/pilot/embedding_engine/source_embedding.py @@ -33,9 +33,6 @@ class SourceEmbedding(ABC): self.vector_store_config = vector_store_config self.embedding_args = embedding_args self.embeddings = vector_store_config["embeddings"] - self.vector_client = VectorStoreConnector( - CFG.VECTOR_STORE_TYPE, vector_store_config - ) @abstractmethod @register @@ -59,11 +56,17 @@ class SourceEmbedding(ABC): @register def index_to_store(self, docs): """index to vector store""" - return self.vector_client.load_document(docs) + self.vector_client = VectorStoreConnector( + CFG.VECTOR_STORE_TYPE, self.vector_store_config + ) + self.vector_client.load_document(docs) @register def similar_search(self, doc, topk): """vector store similarity_search""" + self.vector_client = VectorStoreConnector( + CFG.VECTOR_STORE_TYPE, self.vector_store_config + ) try: ans = self.vector_client.similar_search(doc, topk) except NotEnoughElementsException: @@ -71,6 +74,9 @@ class SourceEmbedding(ABC): return ans def vector_name_exist(self): + self.vector_client = VectorStoreConnector( + CFG.VECTOR_STORE_TYPE, self.vector_store_config + ) return self.vector_client.vector_name_exists() def source_embedding(self): diff --git a/pilot/server/webserver.py b/pilot/server/webserver.py index 0866e8be9..b6c1e2cc3 100644 --- a/pilot/server/webserver.py +++ b/pilot/server/webserver.py @@ -297,39 +297,29 @@ def http_bot( if ChatScene.ChatWithDbExecute == scene: chat_param = { - "temperature": temperature, - "max_new_tokens": max_new_tokens, "chat_session_id": state.conv_id, "db_name": db_selector, "user_input": state.last_user_input, } elif ChatScene.ChatWithDbQA == scene: chat_param = { - "temperature": temperature, - "max_new_tokens": max_new_tokens, "chat_session_id": state.conv_id, "db_name": db_selector, "user_input": state.last_user_input, } elif ChatScene.ChatExecution == scene: chat_param = { - "temperature": temperature, - "max_new_tokens": max_new_tokens, "chat_session_id": state.conv_id, "plugin_selector": plugin_selector, "user_input": state.last_user_input, } elif ChatScene.ChatNormal == scene: chat_param = { - "temperature": temperature, - "max_new_tokens": max_new_tokens, "chat_session_id": state.conv_id, "user_input": state.last_user_input, } elif ChatScene.ChatDefaultKnowledge == scene: chat_param = { - "temperature": temperature, - "max_new_tokens": max_new_tokens, "chat_session_id": state.conv_id, "user_input": state.last_user_input, } @@ -341,8 +331,6 @@ def http_bot( } elif ChatScene.ChatUrlKnowledge == scene: chat_param = { - "temperature": temperature, - "max_new_tokens": max_new_tokens, "chat_session_id": state.conv_id, "user_input": state.last_user_input, "url": url_input, diff --git a/tools/knowledge_init.py b/tools/knowledge_init.py index 752cf18d0..66285a784 100644 --- a/tools/knowledge_init.py +++ b/tools/knowledge_init.py @@ -49,6 +49,7 @@ class LocalKnowledgeInit: space = KnowledgeSpaceRequest space.name = self.vector_store_config["vector_store_name"] space.desc = "knowledge_init.py" + space.vector_type = CFG.VECTOR_STORE_TYPE space.owner = "DB-GPT" knowledge_space_service.create_knowledge_space(space) From 6b17a16bd0a9cfe3d604681f5f62b2fc10036db8 Mon Sep 17 00:00:00 2001 From: aries_ckt <916701291@qq.com> Date: Mon, 3 Jul 2023 16:09:18 +0800 Subject: [PATCH 3/3] fix:knowledge_init path 1.knowledge_init path 2.url embedding chunk --- pilot/embedding_engine/url_embedding.py | 8 ++++++-- tools/knowledge_init.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pilot/embedding_engine/url_embedding.py b/pilot/embedding_engine/url_embedding.py index 113e2985e..ce9e5360d 100644 --- a/pilot/embedding_engine/url_embedding.py +++ b/pilot/embedding_engine/url_embedding.py @@ -3,7 +3,7 @@ from typing import List from bs4 import BeautifulSoup from langchain.document_loaders import WebBaseLoader from langchain.schema import Document -from langchain.text_splitter import CharacterTextSplitter +from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter from pilot.configs.config import Config from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE @@ -33,7 +33,11 @@ class URLEmbedding(SourceEmbedding): length_function=len, ) else: - text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000) + text_splitter = SpacyTextSplitter( + pipeline="zh_core_web_sm", + chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, + chunk_overlap=100, + ) return loader.load_and_split(text_splitter) @register diff --git a/tools/knowledge_init.py b/tools/knowledge_init.py index 66285a784..34460e398 100644 --- a/tools/knowledge_init.py +++ b/tools/knowledge_init.py @@ -3,12 +3,12 @@ import argparse import os import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) from pilot.embedding_engine.knowledge_type import KnowledgeType from pilot.openapi.knowledge.knowledge_service import KnowledgeService from pilot.openapi.knowledge.request.knowledge_request import KnowledgeSpaceRequest -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) from pilot.configs.config import Config from pilot.configs.model_config import (