Mirror of https://github.com/csunny/DB-GPT.git

commit 336ba1e042 (parent 03f7ed32e5)
update: knowledge load script
@@ -21,15 +21,17 @@ LLM_MODEL_CONFIG = {
     "flan-t5-base": os.path.join(MODEL_PATH, "flan-t5-base"),
     "vicuna-13b": os.path.join(MODEL_PATH, "vicuna-13b"),
     "text2vec": os.path.join(MODEL_PATH, "text2vec-large-chinese"),
+    "text2vec-base": os.path.join(MODEL_PATH, "text2vec-base-chinese"),
     "sentence-transforms": os.path.join(MODEL_PATH, "all-MiniLM-L6-v2")
 }
 
 
-VECTOR_SEARCH_TOP_K = 3
+VECTOR_SEARCH_TOP_K = 20
 LLM_MODEL = "vicuna-13b"
 LIMIT_MODEL_CONCURRENCY = 5
 MAX_POSITION_EMBEDDINGS = 4096
-VICUNA_MODEL_SERVER = "http://121.41.227.141:8000"
+# VICUNA_MODEL_SERVER = "http://121.41.227.141:8000"
+VICUNA_MODEL_SERVER = "http://120.79.27.110:8000"
 
 # Load model config
 ISLOAD_8BIT = True
@@ -44,4 +46,5 @@ DB_SETTINGS = {
 }
 
 VS_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vs_store")
 KNOWLEDGE_UPLOAD_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
+KNOWLEDGE_CHUNK_SPLIT_SIZE = 100
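The config hunk adds a second Chinese text2vec model path, raises the similarity-search depth from 3 to 20, swaps the Vicuna server address, and introduces KNOWLEDGE_CHUNK_SPLIT_SIZE, which the knowledge loaders below pass to the Chinese document splitter as its sentence size. A minimal sketch of how the two knowledge-related constants are consumed downstream (assuming the pilot package is importable; the sample string is arbitrary):

    from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE, VECTOR_SEARCH_TOP_K
    from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter

    # Chunk a raw document into sentence-sized pieces bounded by the config value.
    splitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
    chunks = splitter.split_text("这是第一句。这是第二句!这是第三句?")

    # A persisted vector store would then be queried with the configured depth,
    # e.g. vector_store.similarity_search(query, VECTOR_SEARCH_TOP_K).
    print("top_k:", VECTOR_SEARCH_TOP_K, "chunks:", chunks)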
@@ -499,6 +499,7 @@ def build_single_model_ui():
         files = gr.File(label="添加文件",
                         file_types=[".txt", ".md", ".docx", ".pdf"],
                         file_count="multiple",
+                        allow_flagged_uploads=True,
                         show_label=False
                         )
 
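The UI hunk only adds allow_flagged_uploads=True to the existing upload widget (label 添加文件, "add files"). For context, a standalone sketch of how a multi-file gr.File component is typically wired up in Gradio Blocks; the handler and button names are placeholders, not the webserver's real callbacks, and the new allow_flagged_uploads flag is left out because its availability depends on the installed Gradio version:

    import gradio as gr


    def handle_upload(files):
        # With file_count="multiple", Gradio passes a list of temp-file wrappers;
        # .name holds the on-disk path of each uploaded file.
        if not files:
            return "no files uploaded"
        return "\n".join(f.name for f in files)


    with gr.Blocks() as demo:
        files = gr.File(label="添加文件",
                        file_types=[".txt", ".md", ".docx", ".pdf"],
                        file_count="multiple",
                        show_label=False)
        status = gr.Textbox(label="status")
        load_btn = gr.Button("Load into knowledge base")
        load_btn.click(handle_upload, inputs=[files], outputs=[status])

    if __name__ == "__main__":
        demo.launch()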
@@ -9,33 +9,17 @@ class CHNDocumentSplitter(CharacterTextSplitter):
         self.pdf = pdf
         self.sentence_size = sentence_size
 
-    # def split_text_version2(self, text: str) -> List[str]:
-    #     if self.pdf:
-    #         text = re.sub(r"\n{3,}", "\n", text)
-    #         text = re.sub('\s', ' ', text)
-    #         text = text.replace("\n\n", "")
-    #     sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))')  # del :;
-    #     sent_list = []
-    #     for ele in sent_sep_pattern.split(text):
-    #         if sent_sep_pattern.match(ele) and sent_list:
-    #             sent_list[-1] += ele
-    #         elif ele:
-    #             sent_list.append(ele)
-    #     return sent_list
-
     def split_text(self, text: str) -> List[str]:
         if self.pdf:
             text = re.sub(r"\n{3,}", r"\n", text)
             text = re.sub('\s', " ", text)
             text = re.sub("\n\n", "", text)
 
-        text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text)  # 单字符断句符
-        text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)  # 英文省略号
-        text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text)  # 中文省略号
+        text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text)
+        text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)
+        text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text)
         text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r'\1\n\2', text)
-        # 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\n放到双引号后,注意前面的几句都小心保留了双引号
-        text = text.rstrip()  # 段尾如果有多余的\n就去掉它
-        # 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。
+        text = text.rstrip()
         ls = [i for i in text.split("\n") if i]
         for ele in ls:
             if len(ele) > self.sentence_size:
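The retained split_text drops the commented-out split_text_version2 and the inline Chinese comments but keeps the regex chain: it inserts a newline after single-character sentence terminators (;;.!?。!?), six-dot English ellipses, and double-character Chinese ellipses whenever they are not followed by a closing quote, then splits on those newlines; elements longer than sentence_size are re-split further (that tail of the method lies outside this hunk). A self-contained, simplified sketch of the same splitting idea, for illustration only:

    import re
    from typing import List


    def split_sentences(text: str) -> List[str]:
        # Break after a sentence terminator that is not followed by a closing quote.
        text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text)
        # Break after an English ellipsis ("......") or a Chinese ellipsis ("……").
        text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)
        text = re.sub(r'(…{2})([^"’”」』])', r"\1\n\2", text)
        text = text.rstrip()
        return [seg for seg in text.split("\n") if seg]


    print(split_sentences("今天天气不错。我们出去走走吧!好的?"))
    # -> ['今天天气不错。', '我们出去走走吧!', '好的?']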
@@ -4,13 +4,15 @@ from bs4 import BeautifulSoup
 from langchain.document_loaders import PyPDFLoader, TextLoader, markdown
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import Chroma
-from pilot.configs.model_config import DATASETS_DIR
+from pilot.configs.model_config import DATASETS_DIR, KNOWLEDGE_CHUNK_SPLIT_SIZE
 from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
 from pilot.source_embedding.csv_embedding import CSVEmbedding
 from pilot.source_embedding.markdown_embedding import MarkdownEmbedding
 from pilot.source_embedding.pdf_embedding import PDFEmbedding
 import markdown
 
+from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
+
 
 class KnowledgeEmbedding:
     def __init__(self, file_path, model_name, vector_store_config, local_persist=True):
@@ -63,7 +65,7 @@ class KnowledgeEmbedding:
             print("directly return vector store")
             vector_store = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
         else:
-            print(vector_name + "is new vector store, knowledge begin load...")
+            print(vector_name + " is new vector store, knowledge begin load...")
             documents = self._load_knownlege(self.file_path)
             vector_store = Chroma.from_documents(documents=documents,
                                                  embedding=self.embeddings,
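Apart from the missing-space fix in the log message, this part of KnowledgeEmbedding either reopens an already persisted Chroma collection or builds one from freshly loaded documents. A standalone sketch of that persist-or-reload pattern using the langchain APIs visible in the hunk; the directory, model id, and sample document are placeholders, and the existence check is simplified:

    import os

    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.schema import Document
    from langchain.vectorstores import Chroma

    persist_dir = "/tmp/vs_store/demo"  # placeholder persist directory
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    if os.path.isdir(persist_dir):
        # Reopen the persisted collection directly.
        vector_store = Chroma(persist_directory=persist_dir, embedding_function=embeddings)
    else:
        # First run: embed the documents and persist them.
        docs = [Document(page_content="DB-GPT loads knowledge files into a vector store.")]
        vector_store = Chroma.from_documents(documents=docs,
                                             embedding=embeddings,
                                             persist_directory=persist_dir)
        vector_store.persist()

    print(vector_store.similarity_search("vector store", 1))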
@@ -88,7 +90,7 @@ class KnowledgeEmbedding:
     def _load_file(self, filename):
         if filename.lower().endswith(".md"):
             loader = TextLoader(filename)
-            text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=100)
+            text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
             docs = loader.load_and_split(text_splitter)
             i = 0
             for d in docs:
@@ -100,11 +102,15 @@ class KnowledgeEmbedding:
                 docs[i].page_content = docs[i].page_content.replace("\n", " ")
                 i += 1
         elif filename.lower().endswith(".pdf"):
-            loader = PyPDFLoader(filename)
-            textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=100)
+            loader = UnstructuredPaddlePDFLoader(filename)
+            textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
             docs = loader.load_and_split(textsplitter)
+            i = 0
+            for d in docs:
+                docs[i].page_content = d.page_content.replace("\n", " ").replace("�", "")
+                i += 1
         else:
             loader = TextLoader(filename)
-            text_splitor = CHNDocumentSplitter(sentence_size=100)
+            text_splitor = CHNDocumentSplitter(sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
             docs = loader.load_and_split(text_splitor)
         return docs
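The PDF branch now goes through the project's OCR-backed UnstructuredPaddlePDFLoader instead of PyPDFLoader and post-processes every chunk by stripping newlines and the U+FFFD replacement character. A usage sketch of that branch in isolation (assumes the pilot package and its Paddle OCR dependencies are installed; the file path is a placeholder):

    from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
    from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
    from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader


    def load_pdf(filename):
        # OCR-backed loader introduced by this commit (replaces PyPDFLoader).
        loader = UnstructuredPaddlePDFLoader(filename)
        splitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
        docs = loader.load_and_split(splitter)
        for doc in docs:
            # Same cleanup as _load_file: drop newlines and the U+FFFD
            # replacement character that bad OCR/encoding can leave behind.
            doc.page_content = doc.page_content.replace("\n", " ").replace("\ufffd", "")
        return docs


    docs = load_pdf("sample.pdf")  # placeholder path
    print(len(docs))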
@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
 from langchain.document_loaders import TextLoader
 from langchain.schema import Document
 import markdown
+from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
 
 from pilot.source_embedding import SourceEmbedding, register
 from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
@@ -26,7 +27,7 @@ class MarkdownEmbedding(SourceEmbedding):
     def read(self):
         """Load from markdown path."""
         loader = TextLoader(self.file_path)
-        text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=100)
+        text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
         return loader.load_and_split(text_splitter)
 
     @register
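MarkdownEmbedding.read() keeps loading .md files as plain text and now takes its sentence size from the shared config constant. The same load-and-split step in isolation (placeholder file path):

    from langchain.document_loaders import TextLoader

    from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
    from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter

    loader = TextLoader("./datasets/example.md")  # placeholder path
    splitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
    docs = loader.load_and_split(splitter)
    print(len(docs), docs[0].page_content[:80] if docs else "")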
@@ -2,11 +2,12 @@
 # -*- coding: utf-8 -*-
 from typing import List
 
-from langchain.document_loaders import PyPDFLoader
 from langchain.schema import Document
+from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
 
 from pilot.source_embedding import SourceEmbedding, register
 from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
+from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
 
 
 class PDFEmbedding(SourceEmbedding):
@@ -22,8 +23,8 @@ class PDFEmbedding(SourceEmbedding):
     @register
     def read(self):
         """Load from pdf path."""
-        loader = PyPDFLoader(self.file_path)
-        textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=100)
+        loader = UnstructuredPaddlePDFLoader(self.file_path)
+        textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
         return loader.load_and_split(textsplitter)
 
     @register
|
@ -50,7 +50,7 @@
|
|||||||
#
|
#
|
||||||
# # text_embeddings = Text2Vectors()
|
# # text_embeddings = Text2Vectors()
|
||||||
# mivuls = MilvusStore(cfg={"url": "127.0.0.1", "port": "19530", "alias": "default", "table_name": "test_k"})
|
# mivuls = MilvusStore(cfg={"url": "127.0.0.1", "port": "19530", "alias": "default", "table_name": "test_k"})
|
||||||
#
|
#
|
||||||
# mivuls.insert(["textc","tezt2"])
|
# mivuls.insert(["textc","tezt2"])
|
||||||
# print("success")
|
# print("success")
|
||||||
# ct
|
# ct
|
||||||
|
@@ -1,6 +1,7 @@
+from langchain.embeddings import HuggingFaceEmbeddings
 from pymilvus import DataType, FieldSchema, CollectionSchema, connections, Collection
 
+from pilot.configs.model_config import LLM_MODEL_CONFIG
 from pilot.vector_store.vector_store_base import VectorStoreBase
 
 
@@ -9,7 +10,7 @@ class MilvusStore(VectorStoreBase):
         """Construct a milvus memory storage connection.
 
         Args:
-            cfg (Config): Auto-GPT global config.
+            cfg (Config): MilvusStore global config.
         """
         # self.configure(cfg)
 
@@ -71,21 +72,21 @@ class MilvusStore(VectorStoreBase):
             self.index_params,
             index_name="vector",
         )
+        info = self.collection.describe()
         self.collection.load()
 
-    # def add(self, data) -> str:
-    #     """Add an embedding of data into milvus.
-    #
-    #     Args:
-    #         data (str): The raw text to construct embedding index.
-    #
-    #     Returns:
-    #         str: log.
-    #     """
-    #     embedding = get_ada_embedding(data)
-    #     result = self.collection.insert([[embedding], [data]])
-    #     _text = (
-    #         "Inserting data into memory at primary key: "
-    #         f"{result.primary_keys[0]}:\n data: {data}"
-    #     )
-    #     return _text
+    def insert(self, text) -> str:
+        """Add an embedding of data into milvus.
+        Args:
+            text (str): The raw text to construct embedding index.
+        Returns:
+            str: log.
+        """
+        # embedding = get_ada_embedding(data)
+        embeddings = HuggingFaceEmbeddings(model_name=LLM_MODEL_CONFIG["sentence-transforms"])
+        result = self.collection.insert([embeddings.embed_documents(text), text])
+        _text = (
+            "Inserting data into memory at primary key: "
+            f"{result.primary_keys[0]}:\n data: {text}"
+        )
+        return _text
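The previously commented-out add() becomes a working insert(): it embeds the incoming texts with the sentence-transformer configured under LLM_MODEL_CONFIG["sentence-transforms"] and writes the embeddings plus raw text into the loaded collection. A usage sketch modeled on the commented-out test earlier in this diff; it assumes a Milvus instance on the default local port and that MilvusStore lives at pilot.vector_store.milvus_store:

    from pilot.vector_store.milvus_store import MilvusStore  # assumed module path

    # Connection settings mirror the commented-out test in this commit.
    store = MilvusStore(cfg={"url": "127.0.0.1", "port": "19530",
                             "alias": "default", "table_name": "test_k"})
    log = store.insert(["DB-GPT can use Milvus as its vector store."])
    print(log)

Note that insert() instantiates HuggingFaceEmbeddings on every call, so repeated inserts reload the embedding model each time.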
@@ -41,5 +41,4 @@ if __name__ == "__main__":
     append_mode = args.append
     kv = LocalKnowledgeInit()
     vector_store = kv.knowledge_persist(file_path=DATASETS_DIR, vector_name=vector_name, append_mode=append_mode)
-    docs = vector_store.similarity_search("小明",1)
     print("your knowledge embedding success...")