feature: knowledge embedding auto-adapts to the file type of a given path

aries-ckt 2023-06-05 16:26:19 +08:00
parent 31d457cfd5
commit be1a792d3c
16 changed files with 140 additions and 82 deletions

View File

@@ -148,6 +148,8 @@ class Config(metaclass=Singleton):
         ### EMBEDDING Configuration
         self.EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text2vec")
+        self.KNOWLEDGE_CHUNK_SIZE = os.getenv("KNOWLEDGE_CHUNK_SIZE", 100)
+        self.KNOWLEDGE_SEARCH_TOP_SIZE = os.getenv("KNOWLEDGE_SEARCH_TOP_SIZE", 10)

         ### SUMMARY_CONFIG Configuration
         self.SUMMARY_CONFIG = os.getenv("SUMMARY_CONFIG", "VECTOR")
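One caveat worth noting: when these variables are actually set in the environment, `os.getenv` returns strings, while the fallbacks above are ints. A minimal sketch of the cast a consumer would need (the cast site is an assumption, not part of this commit):

    import os

    # os.getenv returns the raw string ("100") when the variable is set in the
    # environment, but the int fallback (100) otherwise; casting unifies both.
    chunk_size = int(os.getenv("KNOWLEDGE_CHUNK_SIZE", 100))
    top_size = int(os.getenv("KNOWLEDGE_SEARCH_TOP_SIZE", 10))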

View File

@@ -34,7 +34,6 @@ LLM_MODEL_CONFIG = {
     "chatglm-6b-int4": os.path.join(MODEL_PATH, "chatglm-6b-int4"),
     "chatglm-6b": os.path.join(MODEL_PATH, "chatglm-6b"),
     "text2vec-base": os.path.join(MODEL_PATH, "text2vec-base-chinese"),
-    "guanaco-33b-merged": os.path.join(MODEL_PATH, "guanaco-33b-merged"),
     "sentence-transforms": os.path.join(MODEL_PATH, "all-MiniLM-L6-v2"),
     "guanaco-33b-merged": os.path.join(MODEL_PATH, "guanaco-33b-merged"),
     "proxyllm": "proxyllm",

View File

View File

@@ -46,9 +46,7 @@ class ChatNewKnowledge(BaseChat):
             "vector_store_path": KNOWLEDGE_UPLOAD_ROOT_PATH,
         }
         self.knowledge_embedding_client = KnowledgeEmbedding(
-            file_path="",
             model_name=LLM_MODEL_CONFIG["text2vec"],
-            local_persist=False,
             vector_store_config=vector_store_config,
         )

View File

@@ -14,13 +14,21 @@ CFG = Config()
 PROMPT_SCENE_DEFINE = """You are an AI designed to answer human questions, please follow the prompts and conventions of the system's input for your answers"""
-_DEFAULT_TEMPLATE = """ 基于以下已知的信息, 专业、简要的回答用户的问题,
+_DEFAULT_TEMPLATE_ZH = """ 基于以下已知的信息, 专业、简要的回答用户的问题,
 如果无法从提供的内容中获取答案, 请说: "知识库中提供的内容不足以回答此问题" 禁止胡乱编造
 已知内容:
 {context}
 问题:
 {question}
 """
+_DEFAULT_TEMPLATE_EN = """ Based on the known information below, provide users with professional and concise answers to their questions. If the answer cannot be obtained from the provided content, please say: "The information provided in the knowledge base is not sufficient to answer this question." It is forbidden to make up information randomly.
+known information:
+{context}
+question:
+{question}
+"""
+
+_DEFAULT_TEMPLATE = _DEFAULT_TEMPLATE_EN if CFG.LANGUAGE == 'en' else _DEFAULT_TEMPLATE_ZH

 PROMPT_SEP = SeparatorStyle.SINGLE.value
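A single `_DEFAULT_TEMPLATE` is now chosen at import time from `CFG.LANGUAGE`; downstream code only fills the two placeholders. A hypothetical rendering (the context and question values are made up):

    # {context} carries the retrieved knowledge chunks, {question} the user query.
    prompt = _DEFAULT_TEMPLATE.format(
        context="DB-GPT is an experimental project for running LLMs locally.",
        question="What is DB-GPT?",
    )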

View File

@@ -42,9 +42,7 @@ class ChatDefaultKnowledge(BaseChat):
             "vector_store_path": KNOWLEDGE_UPLOAD_ROOT_PATH,
         }
         self.knowledge_embedding_client = KnowledgeEmbedding(
-            file_path="",
             model_name=LLM_MODEL_CONFIG["text2vec"],
-            local_persist=False,
             vector_store_config=vector_store_config,
         )

View File

@@ -15,13 +15,21 @@ PROMPT_SCENE_DEFINE = """A chat between a curious user and an artificial intelli
 The assistant gives helpful, detailed, professional and polite answers to the user's questions. """
-_DEFAULT_TEMPLATE = """ 基于以下已知的信息, 专业、简要的回答用户的问题,
+_DEFAULT_TEMPLATE_ZH = """ 基于以下已知的信息, 专业、简要的回答用户的问题,
 如果无法从提供的内容中获取答案, 请说: "知识库中提供的内容不足以回答此问题" 禁止胡乱编造
 已知内容:
 {context}
 问题:
 {question}
 """
+_DEFAULT_TEMPLATE_EN = """ Based on the known information below, provide users with professional and concise answers to their questions. If the answer cannot be obtained from the provided content, please say: "The information provided in the knowledge base is not sufficient to answer this question." It is forbidden to make up information randomly.
+known information:
+{context}
+question:
+{question}
+"""
+
+_DEFAULT_TEMPLATE = _DEFAULT_TEMPLATE_EN if CFG.LANGUAGE == 'en' else _DEFAULT_TEMPLATE_ZH

 PROMPT_SEP = SeparatorStyle.SINGLE.value

View File

@@ -40,15 +40,13 @@ class ChatUrlKnowledge(BaseChat):
         self.url = url
         vector_store_config = {
             "vector_store_name": url,
-            "text_field": "content",
             "vector_store_path": KNOWLEDGE_UPLOAD_ROOT_PATH,
         }
         self.knowledge_embedding_client = KnowledgeEmbedding(
-            file_path=url,
-            file_type="url",
             model_name=LLM_MODEL_CONFIG["text2vec"],
-            local_persist=False,
             vector_store_config=vector_store_config,
+            file_type="url",
+            file_path=url,
         )

         # url source in vector

View File

@@ -14,20 +14,21 @@ CFG = Config()
 PROMPT_SCENE_DEFINE = """A chat between a curious human and an artificial intelligence assistant, who very familiar with database related knowledge.
 The assistant gives helpful, detailed, professional and polite answers to the user's questions. """
-# _DEFAULT_TEMPLATE = """ Based on the known information, provide professional and concise answers to the user's questions. If the answer cannot be obtained from the provided content, please say: 'The information provided in the knowledge base is not sufficient to answer this question.' Fabrication is prohibited.
-# known information:
-# {context}
-# question:
-# {question}
-# """
-_DEFAULT_TEMPLATE = """ 基于以下已知的信息, 专业、简要的回答用户的问题,
+_DEFAULT_TEMPLATE_ZH = """ 基于以下已知的信息, 专业、简要的回答用户的问题,
 如果无法从提供的内容中获取答案, 请说: "知识库中提供的内容不足以回答此问题" 禁止胡乱编造
 已知内容:
 {context}
 问题:
 {question}
 """
+_DEFAULT_TEMPLATE_EN = """ Based on the known information below, provide users with professional and concise answers to their questions. If the answer cannot be obtained from the provided content, please say: "The information provided in the knowledge base is not sufficient to answer this question." It is forbidden to make up information randomly.
+known information:
+{context}
+question:
+{question}
+"""
+
+_DEFAULT_TEMPLATE = _DEFAULT_TEMPLATE_EN if CFG.LANGUAGE == 'en' else _DEFAULT_TEMPLATE_ZH

 PROMPT_SEP = SeparatorStyle.SINGLE.value

View File

@@ -0,0 +1,26 @@
+from typing import List, Optional
+
+import chardet
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class EncodeTextLoader(BaseLoader):
+    """Load text files."""
+
+    def __init__(self, file_path: str, encoding: Optional[str] = None):
+        """Initialize with file path."""
+        self.file_path = file_path
+        self.encoding = encoding
+
+    def load(self) -> List[Document]:
+        """Load from file path."""
+        with open(self.file_path, 'rb') as f:
+            raw_text = f.read()
+            result = chardet.detect(raw_text)
+            if result['encoding'] is None:
+                text = raw_text.decode('utf-8')
+            else:
+                text = raw_text.decode(result['encoding'])
+        metadata = {"source": self.file_path}
+        return [Document(page_content=text, metadata=metadata)]
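A minimal usage sketch (the path is hypothetical): `chardet` inspects the raw bytes and the loader falls back to UTF-8 only when detection fails, which is what lets knowledge files in GBK or other non-UTF-8 encodings load cleanly:

    from pilot.source_embedding.EncodeTextLoader import EncodeTextLoader

    # Hypothetical GBK-encoded file; no encoding argument is needed.
    docs = EncodeTextLoader("/data/notes_gbk.md").load()
    print(docs[0].metadata["source"])  # -> /data/notes_gbk.md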

View File

@@ -1,4 +1,5 @@
 import os
+from typing import Optional

 import markdown
 from bs4 import BeautifulSoup
@@ -12,19 +13,28 @@ from pilot.source_embedding.csv_embedding import CSVEmbedding
 from pilot.source_embedding.markdown_embedding import MarkdownEmbedding
 from pilot.source_embedding.pdf_embedding import PDFEmbedding
 from pilot.source_embedding.url_embedding import URLEmbedding
+from pilot.source_embedding.word_embedding import WordEmbedding
 from pilot.vector_store.connector import VectorStoreConnector

 CFG = Config()

+KnowledgeEmbeddingType = {
+    ".txt": (MarkdownEmbedding, {}),
+    ".md": (MarkdownEmbedding, {}),
+    ".pdf": (PDFEmbedding, {}),
+    ".doc": (WordEmbedding, {}),
+    ".docx": (WordEmbedding, {}),
+    ".csv": (CSVEmbedding, {}),
+}
+

 class KnowledgeEmbedding:
     def __init__(
         self,
-        file_path,
         model_name,
         vector_store_config,
-        local_persist=True,
-        file_type="default",
+        file_type: Optional[str] = "default",
+        file_path: Optional[str] = None,
     ):
         """Initialize with Loader url, model_name, vector_store_config"""
         self.file_path = file_path
@@ -33,11 +43,9 @@ class KnowledgeEmbedding:
         self.file_type = file_type
         self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
         self.vector_store_config["embeddings"] = self.embeddings
-        self.local_persist = local_persist
-        if not self.local_persist:
-            self.knowledge_embedding_client = self.init_knowledge_embedding()

     def knowledge_embedding(self):
+        self.knowledge_embedding_client = self.init_knowledge_embedding()
         self.knowledge_embedding_client.source_embedding()

     def knowledge_embedding_batch(self):
@@ -50,40 +58,24 @@ class KnowledgeEmbedding:
                 model_name=self.model_name,
                 vector_store_config=self.vector_store_config,
             )
-        elif self.file_path.endswith(".pdf"):
-            embedding = PDFEmbedding(
-                file_path=self.file_path,
-                model_name=self.model_name,
-                vector_store_config=self.vector_store_config,
-            )
-        elif self.file_path.endswith(".md"):
-            embedding = MarkdownEmbedding(
-                file_path=self.file_path,
-                model_name=self.model_name,
-                vector_store_config=self.vector_store_config,
-            )
-        elif self.file_path.endswith(".csv"):
-            embedding = CSVEmbedding(
-                file_path=self.file_path,
-                model_name=self.model_name,
-                vector_store_config=self.vector_store_config,
-            )
-        elif self.file_type == "default":
-            embedding = MarkdownEmbedding(
-                file_path=self.file_path,
-                model_name=self.model_name,
-                vector_store_config=self.vector_store_config,
-            )
+            return embedding
+        extension = "." + self.file_path.rsplit(".", 1)[-1]
+        if extension in KnowledgeEmbeddingType:
+            knowledge_class, knowledge_args = KnowledgeEmbeddingType[extension]
+            embedding = knowledge_class(
+                self.file_path,
+                model_name=self.model_name,
+                vector_store_config=self.vector_store_config,
+                **knowledge_args,
+            )
+            return embedding
+        raise ValueError(f"Unsupported knowledge file type '{extension}'")

         return embedding

     def similar_search(self, text, topk):
-        return self.knowledge_embedding_client.similar_search(text, topk)
+        vector_client = VectorStoreConnector(
+            CFG.VECTOR_STORE_TYPE, self.vector_store_config
+        )
+        return vector_client.similar_search(text, topk)

     def vector_exist(self):
-        return self.knowledge_embedding_client.vector_name_exist()
+        vector_client = VectorStoreConnector(
+            CFG.VECTOR_STORE_TYPE, self.vector_store_config
+        )
+        return vector_client.vector_name_exists()

     def knowledge_persist_initialization(self, append_mode):
         documents = self._load_knownlege(self.file_path)
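The `elif` chain gives way to table-driven dispatch: the file extension is looked up in `KnowledgeEmbeddingType` and the matching embedder class is instantiated with any per-type kwargs, so supporting a new document type only requires a new table entry. A hedged usage sketch (the module path, model path, and file path are assumptions):

    from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding  # assumed module path

    client = KnowledgeEmbedding(
        model_name="/models/text2vec-large-chinese",        # hypothetical local model dir
        vector_store_config={"vector_store_name": "demo"},
        file_path="/data/handbook.docx",                    # ".docx" dispatches to WordEmbedding
    )
    client.knowledge_embedding()                  # builds the index via init_knowledge_embedding()
    hits = client.similar_search("vacation policy", topk=5)

Deferring `init_knowledge_embedding()` into `knowledge_embedding()` also means read-only callers (the chat scenes and DBSummaryClient above) can construct the client without any `file_path` at all.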

View File

@@ -10,6 +10,7 @@ from langchain.schema import Document
 from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
 from pilot.source_embedding import SourceEmbedding, register
+from pilot.source_embedding.EncodeTextLoader import EncodeTextLoader
 from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
@@ -22,11 +23,12 @@ class MarkdownEmbedding(SourceEmbedding):
         self.file_path = file_path
         self.model_name = model_name
         self.vector_store_config = vector_store_config
+        # self.encoding = encoding

     @register
     def read(self):
         """Load from markdown path."""
-        loader = TextLoader(self.file_path)
+        loader = EncodeTextLoader(self.file_path)
         text_splitter = CHNDocumentSplitter(
             pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
         )

View File

@@ -13,13 +13,13 @@ from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
 class PDFEmbedding(SourceEmbedding):
     """pdf embedding for read pdf document."""

-    def __init__(self, file_path, model_name, vector_store_config):
+    def __init__(self, file_path, model_name, vector_store_config, encoding):
         """Initialize with pdf path."""
         super().__init__(file_path, model_name, vector_store_config)
         self.file_path = file_path
         self.model_name = model_name
         self.vector_store_config = vector_store_config
+        self.encoding = encoding

     @register
     def read(self):
         """Load from pdf path."""

View File

@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+from typing import List
+
+from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
+from langchain.schema import Document
+
+from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
+from pilot.source_embedding import SourceEmbedding, register
+from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
+
+
+class WordEmbedding(SourceEmbedding):
+    """word embedding for read word document."""
+
+    def __init__(self, file_path, model_name, vector_store_config):
+        """Initialize with word path."""
+        super().__init__(file_path, model_name, vector_store_config)
+        self.file_path = file_path
+        self.model_name = model_name
+        self.vector_store_config = vector_store_config
+
+    @register
+    def read(self):
+        """Load from word path."""
+        loader = UnstructuredWordDocumentLoader(self.file_path)
+        textsplitter = CHNDocumentSplitter(
+            pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
+        )
+        return loader.load_and_split(textsplitter)
+
+    @register
+    def data_process(self, documents: List[Document]):
+        i = 0
+        for d in documents:
+            documents[i].page_content = d.page_content.replace("\n", "")
+            i += 1
+        return documents
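Direct use of the new embedder, for illustration; normally it is reached through the `.doc`/`.docx` entries of `KnowledgeEmbeddingType`, and the paths here are made up:

    emb = WordEmbedding(
        file_path="/data/spec.docx",
        model_name="/models/text2vec-large-chinese",
        vector_store_config={"vector_store_name": "spec"},
    )
    # Split with CHNDocumentSplitter, then strip newlines from each chunk.
    chunks = emb.data_process(emb.read())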

View File

@@ -74,17 +74,11 @@ class DBSummaryClient:
     @staticmethod
     def get_similar_tables(dbname, query, topk):
         """get user query related tables info"""
-        embeddings = HuggingFaceEmbeddings(
-            model_name=LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL]
-        )
         vector_store_config = {
             "vector_store_name": dbname + "_profile",
-            "embeddings": embeddings,
         }
         knowledge_embedding_client = KnowledgeEmbedding(
-            file_path="",
             model_name=LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
-            local_persist=False,
             vector_store_config=vector_store_config,
         )
         if CFG.SUMMARY_CONFIG == "FAST":
@@ -105,12 +99,10 @@ class DBSummaryClient:
         for table in related_tables:
             vector_store_config = {
                 "vector_store_name": table + "_ts",
-                "embeddings": embeddings,
             }
             knowledge_embedding_client = KnowledgeEmbedding(
                 file_path="",
                 model_name=LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
-                local_persist=False,
                 vector_store_config=vector_store_config,
             )
             table_summery = knowledge_embedding_client.similar_search(query, 1)
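Since `KnowledgeEmbedding` now builds its own `HuggingFaceEmbeddings` from `model_name`, callers no longer pass an `embeddings` object through `vector_store_config`. A hypothetical call (database name and query are made up):

    # The three table-profile entries most related to a natural-language query.
    tables = DBSummaryClient.get_similar_tables(
        "orders_db", "monthly revenue per customer", topk=3
    )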

View File

@@ -19,36 +19,32 @@ CFG = Config()
 class LocalKnowledgeInit:
     embeddings: object = None
-    model_name = LLM_MODEL_CONFIG["text2vec"]
     top_k: int = VECTOR_SEARCH_TOP_K

     def __init__(self, vector_store_config) -> None:
         self.vector_store_config = vector_store_config
+        self.model_name = LLM_MODEL_CONFIG["text2vec"]

     def knowledge_persist(self, file_path, append_mode):
         """knowledge persist"""
-        kv = KnowledgeEmbedding(
-            file_path=file_path,
-            model_name=LLM_MODEL_CONFIG["text2vec"],
-            vector_store_config=self.vector_store_config,
-        )
-        vector_store = kv.knowledge_persist_initialization(append_mode)
-        return vector_store
-
-    def query(self, q):
-        """Query similar doc from Vector"""
-        vector_store = self.init_vector_store()
-        docs = vector_store.similarity_search_with_score(q, k=self.top_k)
-        for doc in docs:
-            dc, s = doc
-            yield s, dc
+        for root, _, files in os.walk(file_path, topdown=False):
+            for file in files:
+                filename = os.path.join(root, file)
+                # docs = self._load_file(filename)
+                ke = KnowledgeEmbedding(
+                    file_path=filename,
+                    model_name=self.model_name,
+                    vector_store_config=self.vector_store_config,
+                )
+                client = ke.init_knowledge_embedding()
+                client.source_embedding()

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--vector_name", type=str, default="default")
     parser.add_argument("--append", type=bool, default=False)
-    parser.add_argument("--store_type", type=str, default="Chroma")
     args = parser.parse_args()
     vector_name = args.vector_name
     append_mode = args.append
@@ -56,5 +52,5 @@ if __name__ == "__main__":
     vector_store_config = {"vector_store_name": vector_name}
     print(vector_store_config)
     kv = LocalKnowledgeInit(vector_store_config=vector_store_config)
-    vector_store = kv.knowledge_persist(file_path=DATASETS_DIR, append_mode=append_mode)
+    kv.knowledge_persist(file_path=DATASETS_DIR, append_mode=append_mode)
     print("your knowledge embedding success...")