diff --git a/pilot/configs/model_config.py b/pilot/configs/model_config.py index faa93227f..da68ab332 100644 --- a/pilot/configs/model_config.py +++ b/pilot/configs/model_config.py @@ -21,15 +21,17 @@ LLM_MODEL_CONFIG = { "flan-t5-base": os.path.join(MODEL_PATH, "flan-t5-base"), "vicuna-13b": os.path.join(MODEL_PATH, "vicuna-13b"), "text2vec": os.path.join(MODEL_PATH, "text2vec-large-chinese"), + "text2vec-base": os.path.join(MODEL_PATH, "text2vec-base-chinese"), "sentence-transforms": os.path.join(MODEL_PATH, "all-MiniLM-L6-v2") } -VECTOR_SEARCH_TOP_K = 3 +VECTOR_SEARCH_TOP_K = 20 LLM_MODEL = "vicuna-13b" LIMIT_MODEL_CONCURRENCY = 5 MAX_POSITION_EMBEDDINGS = 4096 -VICUNA_MODEL_SERVER = "http://121.41.227.141:8000" +# VICUNA_MODEL_SERVER = "http://121.41.227.141:8000" +VICUNA_MODEL_SERVER = "http://120.79.27.110:8000" # Load model config ISLOAD_8BIT = True @@ -44,4 +46,5 @@ DB_SETTINGS = { } VS_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vs_store") -KNOWLEDGE_UPLOAD_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data") \ No newline at end of file +KNOWLEDGE_UPLOAD_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data") +KNOWLEDGE_CHUNK_SPLIT_SIZE = 100 diff --git a/pilot/server/webserver.py b/pilot/server/webserver.py index 07d94b773..25940a437 100644 --- a/pilot/server/webserver.py +++ b/pilot/server/webserver.py @@ -499,6 +499,7 @@ def build_single_model_ui(): files = gr.File(label="添加文件", file_types=[".txt", ".md", ".docx", ".pdf"], file_count="multiple", + allow_flagged_uploads=True, show_label=False ) diff --git a/pilot/source_embedding/chn_document_splitter.py b/pilot/source_embedding/chn_document_splitter.py index 090a6af56..10a77aeca 100644 --- a/pilot/source_embedding/chn_document_splitter.py +++ b/pilot/source_embedding/chn_document_splitter.py @@ -9,33 +9,17 @@ class CHNDocumentSplitter(CharacterTextSplitter): self.pdf = pdf self.sentence_size = sentence_size - # def split_text_version2(self, text: str) -> List[str]: - # if self.pdf: - # text = re.sub(r"\n{3,}", "\n", text) - # text = re.sub('\s', ' ', text) - # text = text.replace("\n\n", "") - # sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del :; - # sent_list = [] - # for ele in sent_sep_pattern.split(text): - # if sent_sep_pattern.match(ele) and sent_list: - # sent_list[-1] += ele - # elif ele: - # sent_list.append(ele) - # return sent_list - def split_text(self, text: str) -> List[str]: if self.pdf: text = re.sub(r"\n{3,}", r"\n", text) text = re.sub('\s', " ", text) text = re.sub("\n\n", "", text) - text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text) # 单字符断句符 - text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text) # 英文省略号 - text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text) # 中文省略号 + text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text) + text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text) + text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text) text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r'\1\n\2', text) - # 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\n放到双引号后,注意前面的几句都小心保留了双引号 - text = text.rstrip() # 段尾如果有多余的\n就去掉它 - # 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。 + text = text.rstrip() ls = [i for i in text.split("\n") if i] for ele in ls: if len(ele) > self.sentence_size: diff --git a/pilot/source_embedding/knowledge_embedding.py b/pilot/source_embedding/knowledge_embedding.py index 594723b6e..08d962908 100644 --- a/pilot/source_embedding/knowledge_embedding.py +++ b/pilot/source_embedding/knowledge_embedding.py @@ -4,13 +4,15 @@ from bs4 import BeautifulSoup from langchain.document_loaders import PyPDFLoader, TextLoader, markdown from langchain.embeddings import HuggingFaceEmbeddings from langchain.vectorstores import Chroma -from pilot.configs.model_config import DATASETS_DIR +from pilot.configs.model_config import DATASETS_DIR, KNOWLEDGE_CHUNK_SPLIT_SIZE from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter from pilot.source_embedding.csv_embedding import CSVEmbedding from pilot.source_embedding.markdown_embedding import MarkdownEmbedding from pilot.source_embedding.pdf_embedding import PDFEmbedding import markdown +from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader + class KnowledgeEmbedding: def __init__(self, file_path, model_name, vector_store_config, local_persist=True): @@ -63,7 +65,7 @@ class KnowledgeEmbedding: print("directly return vector store") vector_store = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings) else: - print(vector_name + "is new vector store, knowledge begin load...") + print(vector_name + " is new vector store, knowledge begin load...") documents = self._load_knownlege(self.file_path) vector_store = Chroma.from_documents(documents=documents, embedding=self.embeddings, @@ -88,7 +90,7 @@ class KnowledgeEmbedding: def _load_file(self, filename): if filename.lower().endswith(".md"): loader = TextLoader(filename) - text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=100) + text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE) docs = loader.load_and_split(text_splitter) i = 0 for d in docs: @@ -100,11 +102,15 @@ class KnowledgeEmbedding: docs[i].page_content = docs[i].page_content.replace("\n", " ") i += 1 elif filename.lower().endswith(".pdf"): - loader = PyPDFLoader(filename) - textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=100) + loader = UnstructuredPaddlePDFLoader(filename) + textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE) docs = loader.load_and_split(textsplitter) + i = 0 + for d in docs: + docs[i].page_content = d.page_content.replace("\n", " ").replace("�", "") + i += 1 else: loader = TextLoader(filename) - text_splitor = CHNDocumentSplitter(sentence_size=100) + text_splitor = CHNDocumentSplitter(sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE) docs = loader.load_and_split(text_splitor) return docs \ No newline at end of file diff --git a/pilot/source_embedding/markdown_embedding.py b/pilot/source_embedding/markdown_embedding.py index fee9504b6..834226f75 100644 --- a/pilot/source_embedding/markdown_embedding.py +++ b/pilot/source_embedding/markdown_embedding.py @@ -7,6 +7,7 @@ from bs4 import BeautifulSoup from langchain.document_loaders import TextLoader from langchain.schema import Document import markdown +from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE from pilot.source_embedding import SourceEmbedding, register from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter @@ -26,7 +27,7 @@ class MarkdownEmbedding(SourceEmbedding): def read(self): """Load from markdown path.""" loader = TextLoader(self.file_path) - text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=100) + text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE) return loader.load_and_split(text_splitter) @register diff --git a/pilot/source_embedding/pdf_embedding.py b/pilot/source_embedding/pdf_embedding.py index bd0ae3aba..a8749695b 100644 --- a/pilot/source_embedding/pdf_embedding.py +++ b/pilot/source_embedding/pdf_embedding.py @@ -2,11 +2,12 @@ # -*- coding: utf-8 -*- from typing import List -from langchain.document_loaders import PyPDFLoader from langchain.schema import Document +from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE from pilot.source_embedding import SourceEmbedding, register from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter +from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader class PDFEmbedding(SourceEmbedding): @@ -22,8 +23,8 @@ class PDFEmbedding(SourceEmbedding): @register def read(self): """Load from pdf path.""" - loader = PyPDFLoader(self.file_path) - textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=100) + loader = UnstructuredPaddlePDFLoader(self.file_path) + textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE) return loader.load_and_split(textsplitter) @register diff --git a/pilot/source_embedding/search_milvus.py b/pilot/source_embedding/search_milvus.py index 18f93d1d3..ec0aa6813 100644 --- a/pilot/source_embedding/search_milvus.py +++ b/pilot/source_embedding/search_milvus.py @@ -50,7 +50,7 @@ # # # text_embeddings = Text2Vectors() # mivuls = MilvusStore(cfg={"url": "127.0.0.1", "port": "19530", "alias": "default", "table_name": "test_k"}) -# +# # mivuls.insert(["textc","tezt2"]) # print("success") # ct diff --git a/pilot/vector_store/milvus_store.py b/pilot/vector_store/milvus_store.py index 1f07c969e..eda0b4e38 100644 --- a/pilot/vector_store/milvus_store.py +++ b/pilot/vector_store/milvus_store.py @@ -1,6 +1,7 @@ - +from langchain.embeddings import HuggingFaceEmbeddings from pymilvus import DataType, FieldSchema, CollectionSchema, connections, Collection +from pilot.configs.model_config import LLM_MODEL_CONFIG from pilot.vector_store.vector_store_base import VectorStoreBase @@ -9,7 +10,7 @@ class MilvusStore(VectorStoreBase): """Construct a milvus memory storage connection. Args: - cfg (Config): Auto-GPT global config. + cfg (Config): MilvusStore global config. """ # self.configure(cfg) @@ -71,21 +72,21 @@ class MilvusStore(VectorStoreBase): self.index_params, index_name="vector", ) + info = self.collection.describe() self.collection.load() - # def add(self, data) -> str: - # """Add an embedding of data into milvus. - # - # Args: - # data (str): The raw text to construct embedding index. - # - # Returns: - # str: log. - # """ - # embedding = get_ada_embedding(data) - # result = self.collection.insert([[embedding], [data]]) - # _text = ( - # "Inserting data into memory at primary key: " - # f"{result.primary_keys[0]}:\n data: {data}" - # ) - # return _text \ No newline at end of file + def insert(self, text) -> str: + """Add an embedding of data into milvus. + Args: + text (str): The raw text to construct embedding index. + Returns: + str: log. + """ + # embedding = get_ada_embedding(data) + embeddings = HuggingFaceEmbeddings(model_name=LLM_MODEL_CONFIG["sentence-transforms"]) + result = self.collection.insert([embeddings.embed_documents(text), text]) + _text = ( + "Inserting data into memory at primary key: " + f"{result.primary_keys[0]}:\n data: {text}" + ) + return _text \ No newline at end of file diff --git a/tools/knowlege_init.py b/tools/knowlege_init.py index bc827953d..e9ecad49a 100644 --- a/tools/knowlege_init.py +++ b/tools/knowlege_init.py @@ -41,5 +41,4 @@ if __name__ == "__main__": append_mode = args.append kv = LocalKnowledgeInit() vector_store = kv.knowledge_persist(file_path=DATASETS_DIR, vector_name=vector_name, append_mode=append_mode) - docs = vector_store.similarity_search("小明",1) print("your knowledge embedding success...") \ No newline at end of file