update:knowledge env

This commit is contained in:
aries-ckt 2023-06-05 18:08:55 +08:00
parent f2f28fee42
commit e29fa37cde
18 changed files with 43 additions and 80 deletions

View File

@ -28,8 +28,12 @@ MAX_POSITION_EMBEDDINGS=4096
# FAST_LLM_MODEL=chatglm-6b # FAST_LLM_MODEL=chatglm-6b
### EMBEDDINGS #*******************************************************************#
## EMBEDDING_MODEL - Model to use for creating embeddings #** EMBEDDING SETTINGS **#
#*******************************************************************#
EMBEDDING_MODEL=text2vec
KNOWLEDGE_CHUNK_SIZE=500
KNOWLEDGE_SEARCH_TOP_SIZE=5
## EMBEDDING_TOKENIZER - Tokenizer to use for chunking large inputs ## EMBEDDING_TOKENIZER - Tokenizer to use for chunking large inputs
## EMBEDDING_TOKEN_LIMIT - Chunk size limit for large inputs ## EMBEDDING_TOKEN_LIMIT - Chunk size limit for large inputs
# EMBEDDING_MODEL=all-MiniLM-L6-v2 # EMBEDDING_MODEL=all-MiniLM-L6-v2

View File

@ -148,8 +148,8 @@ class Config(metaclass=Singleton):
### EMBEDDING Configuration ### EMBEDDING Configuration
self.EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text2vec") self.EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text2vec")
self.KNOWLEDGE_CHUNK_SIZE = os.getenv("KNOWLEDGE_CHUNK_SIZE", 100) self.KNOWLEDGE_CHUNK_SIZE = int(os.getenv("KNOWLEDGE_CHUNK_SIZE", 500))
self.KNOWLEDGE_SEARCH_TOP_SIZE = os.getenv("KNOWLEDGE_SEARCH_TOP_SIZE", 10) self.KNOWLEDGE_SEARCH_TOP_SIZE = int(os.getenv("KNOWLEDGE_SEARCH_TOP_SIZE", 10))
### SUMMARY_CONFIG Configuration ### SUMMARY_CONFIG Configuration
self.SUMMARY_CONFIG = os.getenv("SUMMARY_CONFIG", "VECTOR") self.SUMMARY_CONFIG = os.getenv("SUMMARY_CONFIG", "VECTOR")

View File

@ -14,7 +14,6 @@ from pilot.configs.model_config import (
KNOWLEDGE_UPLOAD_ROOT_PATH, KNOWLEDGE_UPLOAD_ROOT_PATH,
LLM_MODEL_CONFIG, LLM_MODEL_CONFIG,
LOGDIR, LOGDIR,
VECTOR_SEARCH_TOP_K,
) )
from pilot.scene.chat_knowledge.custom.prompt import prompt from pilot.scene.chat_knowledge.custom.prompt import prompt
@ -52,7 +51,7 @@ class ChatNewKnowledge(BaseChat):
def generate_input_values(self): def generate_input_values(self):
docs = self.knowledge_embedding_client.similar_search( docs = self.knowledge_embedding_client.similar_search(
self.current_user_input, VECTOR_SEARCH_TOP_K self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE
) )
context = [d.page_content for d in docs] context = [d.page_content for d in docs]
context = context[:2000] context = context[:2000]

View File

@ -14,7 +14,6 @@ from pilot.configs.model_config import (
KNOWLEDGE_UPLOAD_ROOT_PATH, KNOWLEDGE_UPLOAD_ROOT_PATH,
LLM_MODEL_CONFIG, LLM_MODEL_CONFIG,
LOGDIR, LOGDIR,
VECTOR_SEARCH_TOP_K,
) )
from pilot.scene.chat_knowledge.default.prompt import prompt from pilot.scene.chat_knowledge.default.prompt import prompt
@ -48,7 +47,7 @@ class ChatDefaultKnowledge(BaseChat):
def generate_input_values(self): def generate_input_values(self):
docs = self.knowledge_embedding_client.similar_search( docs = self.knowledge_embedding_client.similar_search(
self.current_user_input, VECTOR_SEARCH_TOP_K self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE
) )
context = [d.page_content for d in docs] context = [d.page_content for d in docs]
context = context[:2000] context = context[:2000]

View File

@ -14,7 +14,6 @@ from pilot.configs.model_config import (
KNOWLEDGE_UPLOAD_ROOT_PATH, KNOWLEDGE_UPLOAD_ROOT_PATH,
LLM_MODEL_CONFIG, LLM_MODEL_CONFIG,
LOGDIR, LOGDIR,
VECTOR_SEARCH_TOP_K,
) )
from pilot.scene.chat_knowledge.url.prompt import prompt from pilot.scene.chat_knowledge.url.prompt import prompt
@ -56,7 +55,7 @@ class ChatUrlKnowledge(BaseChat):
def generate_input_values(self): def generate_input_values(self):
docs = self.knowledge_embedding_client.similar_search( docs = self.knowledge_embedding_client.similar_search(
self.current_user_input, VECTOR_SEARCH_TOP_K self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE
) )
context = [d.page_content for d in docs] context = [d.page_content for d in docs]
context = context[:2000] context = context[:2000]

View File

@ -3,12 +3,14 @@
from langchain.prompts import PromptTemplate from langchain.prompts import PromptTemplate
from pilot.configs.model_config import VECTOR_SEARCH_TOP_K from pilot.configs.config import Config
from pilot.conversation import conv_qa_prompt_template, conv_db_summary_templates from pilot.conversation import conv_qa_prompt_template, conv_db_summary_templates
from pilot.logs import logger from pilot.logs import logger
from pilot.model.llm_out.vicuna_llm import VicunaLLM from pilot.model.llm_out.vicuna_llm import VicunaLLM
from pilot.vector_store.file_loader import KnownLedge2Vector from pilot.vector_store.file_loader import KnownLedge2Vector
CFG = Config()
class KnownLedgeBaseQA: class KnownLedgeBaseQA:
def __init__(self) -> None: def __init__(self) -> None:
@ -22,7 +24,7 @@ class KnownLedgeBaseQA:
) )
retriever = self.vector_store.as_retriever( retriever = self.vector_store.as_retriever(
search_kwargs={"k": VECTOR_SEARCH_TOP_K} search_kwargs={"k": CFG.KNOWLEDGE_SEARCH_TOP_SIZE}
) )
docs = retriever.get_relevant_documents(query=query) docs = retriever.get_relevant_documents(query=query)

View File

@ -634,7 +634,6 @@ def knowledge_embedding_store(vs_id, files):
knowledge_embedding_client = KnowledgeEmbedding( knowledge_embedding_client = KnowledgeEmbedding(
file_path=os.path.join(KNOWLEDGE_UPLOAD_ROOT_PATH, vs_id, filename), file_path=os.path.join(KNOWLEDGE_UPLOAD_ROOT_PATH, vs_id, filename),
model_name=LLM_MODEL_CONFIG["text2vec"], model_name=LLM_MODEL_CONFIG["text2vec"],
local_persist=False,
vector_store_config={ vector_store_config={
"vector_store_name": vector_store_name["vs_name"], "vector_store_name": vector_store_name["vs_name"],
"vector_store_path": KNOWLEDGE_UPLOAD_ROOT_PATH, "vector_store_path": KNOWLEDGE_UPLOAD_ROOT_PATH,

View File

@ -12,14 +12,12 @@ class CSVEmbedding(SourceEmbedding):
def __init__( def __init__(
self, self,
file_path, file_path,
model_name,
vector_store_config, vector_store_config,
embedding_args: Optional[Dict] = None, embedding_args: Optional[Dict] = None,
): ):
"""Initialize with csv path.""" """Initialize with csv path."""
super().__init__(file_path, model_name, vector_store_config) super().__init__(file_path, vector_store_config)
self.file_path = file_path self.file_path = file_path
self.model_name = model_name
self.vector_store_config = vector_store_config self.vector_store_config = vector_store_config
self.embedding_args = embedding_args self.embedding_args = embedding_args

View File

@ -64,7 +64,6 @@ class KnowledgeEmbedding:
knowledge_class, knowledge_args = KnowledgeEmbeddingType[extension] knowledge_class, knowledge_args = KnowledgeEmbeddingType[extension]
embedding = knowledge_class( embedding = knowledge_class(
self.file_path, self.file_path,
model_name=self.model_name,
vector_store_config=self.vector_store_config, vector_store_config=self.vector_store_config,
**knowledge_args, **knowledge_args,
) )

View File

@ -8,20 +8,21 @@ from bs4 import BeautifulSoup
from langchain.document_loaders import TextLoader from langchain.document_loaders import TextLoader
from langchain.schema import Document from langchain.schema import Document
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE from pilot.configs.config import Config
from pilot.source_embedding import SourceEmbedding, register from pilot.source_embedding import SourceEmbedding, register
from pilot.source_embedding.EncodeTextLoader import EncodeTextLoader from pilot.source_embedding.EncodeTextLoader import EncodeTextLoader
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
CFG = Config()
class MarkdownEmbedding(SourceEmbedding): class MarkdownEmbedding(SourceEmbedding):
"""markdown embedding for read markdown document.""" """markdown embedding for read markdown document."""
def __init__(self, file_path, model_name, vector_store_config): def __init__(self, file_path, vector_store_config):
"""Initialize with markdown path.""" """Initialize with markdown path."""
super().__init__(file_path, model_name, vector_store_config) super().__init__(file_path, vector_store_config)
self.file_path = file_path self.file_path = file_path
self.model_name = model_name
self.vector_store_config = vector_store_config self.vector_store_config = vector_store_config
# self.encoding = encoding # self.encoding = encoding
@ -30,7 +31,7 @@ class MarkdownEmbedding(SourceEmbedding):
"""Load from markdown path.""" """Load from markdown path."""
loader = EncodeTextLoader(self.file_path) loader = EncodeTextLoader(self.file_path)
text_splitter = CHNDocumentSplitter( text_splitter = CHNDocumentSplitter(
pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
) )
return loader.load_and_split(text_splitter) return loader.load_and_split(text_splitter)

View File

@ -5,19 +5,20 @@ from typing import List
from langchain.document_loaders import PyPDFLoader from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document from langchain.schema import Document
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE from pilot.configs.config import Config
from pilot.source_embedding import SourceEmbedding, register from pilot.source_embedding import SourceEmbedding, register
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
CFG = Config()
class PDFEmbedding(SourceEmbedding): class PDFEmbedding(SourceEmbedding):
"""pdf embedding for read pdf document.""" """pdf embedding for read pdf document."""
def __init__(self, file_path, model_name, vector_store_config, encoding): def __init__(self, file_path, vector_store_config, encoding):
"""Initialize with pdf path.""" """Initialize with pdf path."""
super().__init__(file_path, model_name, vector_store_config) super().__init__(file_path, vector_store_config)
self.file_path = file_path self.file_path = file_path
self.model_name = model_name
self.vector_store_config = vector_store_config self.vector_store_config = vector_store_config
self.encoding = encoding self.encoding = encoding
@ -27,7 +28,7 @@ class PDFEmbedding(SourceEmbedding):
# loader = UnstructuredPaddlePDFLoader(self.file_path) # loader = UnstructuredPaddlePDFLoader(self.file_path)
loader = PyPDFLoader(self.file_path) loader = PyPDFLoader(self.file_path)
textsplitter = CHNDocumentSplitter( textsplitter = CHNDocumentSplitter(
pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
) )
return loader.load_and_split(textsplitter) return loader.load_and_split(textsplitter)

View File

@ -23,13 +23,11 @@ class SourceEmbedding(ABC):
def __init__( def __init__(
self, self,
file_path, file_path,
model_name,
vector_store_config, vector_store_config,
embedding_args: Optional[Dict] = None, embedding_args: Optional[Dict] = None,
): ):
"""Initialize with Loader url, model_name, vector_store_config""" """Initialize with Loader url, model_name, vector_store_config"""
self.file_path = file_path self.file_path = file_path
self.model_name = model_name
self.vector_store_config = vector_store_config self.vector_store_config = vector_store_config
self.embedding_args = embedding_args self.embedding_args = embedding_args
self.embeddings = vector_store_config["embeddings"] self.embeddings = vector_store_config["embeddings"]

View File

@ -8,11 +8,10 @@ from pilot import SourceEmbedding, register
class StringEmbedding(SourceEmbedding): class StringEmbedding(SourceEmbedding):
"""string embedding for read string document.""" """string embedding for read string document."""
def __init__(self, file_path, model_name, vector_store_config): def __init__(self, file_path, vector_store_config):
"""Initialize with pdf path.""" """Initialize with pdf path."""
super().__init__(file_path, model_name, vector_store_config) super().__init__(file_path, vector_store_config)
self.file_path = file_path self.file_path = file_path
self.model_name = model_name
self.vector_store_config = vector_store_config self.vector_store_config = vector_store_config
@register @register

View File

@ -16,11 +16,10 @@ CFG = Config()
class URLEmbedding(SourceEmbedding): class URLEmbedding(SourceEmbedding):
"""url embedding for read url document.""" """url embedding for read url document."""
def __init__(self, file_path, model_name, vector_store_config): def __init__(self, file_path, vector_store_config):
"""Initialize with url path.""" """Initialize with url path."""
super().__init__(file_path, model_name, vector_store_config) super().__init__(file_path, vector_store_config)
self.file_path = file_path self.file_path = file_path
self.model_name = model_name
self.vector_store_config = vector_store_config self.vector_store_config = vector_store_config
@register @register
@ -29,7 +28,7 @@ class URLEmbedding(SourceEmbedding):
loader = WebBaseLoader(web_path=self.file_path) loader = WebBaseLoader(web_path=self.file_path)
if CFG.LANGUAGE == "en": if CFG.LANGUAGE == "en":
text_splitter = CharacterTextSplitter( text_splitter = CharacterTextSplitter(
chunk_size=KNOWLEDGE_CHUNK_SPLIT_SIZE, chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=20, chunk_overlap=20,
length_function=len, length_function=len,
) )

View File

@ -5,19 +5,20 @@ from typing import List
from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
from langchain.schema import Document from langchain.schema import Document
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE from pilot.configs.config import Config
from pilot.source_embedding import SourceEmbedding, register from pilot.source_embedding import SourceEmbedding, register
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
CFG = Config()
class WordEmbedding(SourceEmbedding): class WordEmbedding(SourceEmbedding):
"""word embedding for read word document.""" """word embedding for read word document."""
def __init__(self, file_path, model_name, vector_store_config): def __init__(self, file_path, vector_store_config):
"""Initialize with word path.""" """Initialize with word path."""
super().__init__(file_path, model_name, vector_store_config) super().__init__(file_path, vector_store_config)
self.file_path = file_path self.file_path = file_path
self.model_name = model_name
self.vector_store_config = vector_store_config self.vector_store_config = vector_store_config
@register @register
@ -25,7 +26,7 @@ class WordEmbedding(SourceEmbedding):
"""Load from word path.""" """Load from word path."""
loader = UnstructuredWordDocumentLoader(self.file_path) loader = UnstructuredWordDocumentLoader(self.file_path)
textsplitter = CHNDocumentSplitter( textsplitter = CHNDocumentSplitter(
pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
) )
return loader.load_and_split(textsplitter) return loader.load_and_split(textsplitter)

View File

@ -34,24 +34,21 @@ class DBSummaryClient:
"embeddings": embeddings, "embeddings": embeddings,
} }
embedding = StringEmbedding( embedding = StringEmbedding(
db_summary_client.get_summery(), file_path=db_summary_client.get_summery(),
LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL], vector_store_config=vector_store_config,
vector_store_config,
) )
if not embedding.vector_name_exist(): if not embedding.vector_name_exist():
if CFG.SUMMARY_CONFIG == "FAST": if CFG.SUMMARY_CONFIG == "FAST":
for vector_table_info in db_summary_client.get_summery(): for vector_table_info in db_summary_client.get_summery():
embedding = StringEmbedding( embedding = StringEmbedding(
vector_table_info, vector_table_info,
LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
vector_store_config, vector_store_config,
) )
embedding.source_embedding() embedding.source_embedding()
else: else:
embedding = StringEmbedding( embedding = StringEmbedding(
db_summary_client.get_summery(), file_path=db_summary_client.get_summery(),
LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL], vector_store_config=vector_store_config,
vector_store_config,
) )
embedding.source_embedding() embedding.source_embedding()
for ( for (
@ -64,7 +61,6 @@ class DBSummaryClient:
} }
embedding = StringEmbedding( embedding = StringEmbedding(
table_summary, table_summary,
LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
table_vector_store_config, table_vector_store_config,
) )
embedding.source_embedding() embedding.source_embedding()
@ -124,30 +120,3 @@ def _get_llm_response(query, db_input, dbsummary):
) )
res = chat.nostream_call() res = chat.nostream_call()
return json.loads(res)["table"] return json.loads(res)["table"]
# if __name__ == "__main__":
# # summary = DBSummaryClient.get_similar_tables("db_test", "查询在线用户的购物车", 10)
#
# text= """Based on the input "查询在线聊天的用户好友" and the known database information, the tables involved in the user input are "chat_users" and "friends".
# Response:
#
# {
# "table": ["chat_users"]
# }"""
# text = text.rstrip().replace("\n","")
# start = text.find("{")
# end = text.find("}") + 1
#
# # 从字符串中截取出JSON数据
# json_str = text[start:end]
#
# # 将JSON数据转换为Python中的字典类型
# data = json.loads(json_str)
# # pattern = r'{s*"table"s*:s*[[^]]*]s*}'
# # match = re.search(pattern, text)
# # if match:
# # json_string = match.group(0)
# # # 将JSON字符串转换为Python对象
# # json_obj = json.loads(json_string)
# # print(summary)

View File

@ -17,7 +17,6 @@ from langchain.vectorstores import Chroma
from pilot.configs.model_config import ( from pilot.configs.model_config import (
DATASETS_DIR, DATASETS_DIR,
LLM_MODEL_CONFIG, LLM_MODEL_CONFIG,
VECTOR_SEARCH_TOP_K,
VECTORE_PATH, VECTORE_PATH,
) )
@ -41,7 +40,6 @@ class KnownLedge2Vector:
embeddings: object = None embeddings: object = None
model_name = LLM_MODEL_CONFIG["sentence-transforms"] model_name = LLM_MODEL_CONFIG["sentence-transforms"]
top_k: int = VECTOR_SEARCH_TOP_K
def __init__(self, model_name=None) -> None: def __init__(self, model_name=None) -> None:
if not model_name: if not model_name:

View File

@ -10,7 +10,6 @@ from pilot.configs.config import Config
from pilot.configs.model_config import ( from pilot.configs.model_config import (
DATASETS_DIR, DATASETS_DIR,
LLM_MODEL_CONFIG, LLM_MODEL_CONFIG,
VECTOR_SEARCH_TOP_K,
) )
from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding
@ -19,7 +18,6 @@ CFG = Config()
class LocalKnowledgeInit: class LocalKnowledgeInit:
embeddings: object = None embeddings: object = None
top_k: int = VECTOR_SEARCH_TOP_K
def __init__(self, vector_store_config) -> None: def __init__(self, vector_store_config) -> None:
self.vector_store_config = vector_store_config self.vector_store_config = vector_store_config