From ffeb5040ccd2b3b64e9e1f29acfcc8c855416d77 Mon Sep 17 00:00:00 2001 From: csunny Date: Thu, 4 May 2023 15:00:21 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pilot/configs/model_config.py | 4 +++- pilot/vector_store/extract_tovec.py | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pilot/configs/model_config.py b/pilot/configs/model_config.py index ad8eb83c1..1238d1bcb 100644 --- a/pilot/configs/model_config.py +++ b/pilot/configs/model_config.py @@ -13,9 +13,11 @@ LOGDIR = os.path.join(root_path, "logs") DEVICE = "cuda" if torch.cuda.is_available() else "cpu" llm_model_config = { "flan-t5-base": os.path.join(model_path, "flan-t5-base"), - "vicuna-13b": os.path.join(model_path, "vicuna-13b") + "vicuna-13b": os.path.join(model_path, "vicuna-13b"), + "sentence-transforms": os.path.join(model_path, "all-MiniLM-L6-v2") } + LLM_MODEL = "vicuna-13b" LIMIT_MODEL_CONCURRENCY = 5 MAX_POSITION_EMBEDDINGS = 2048 diff --git a/pilot/vector_store/extract_tovec.py b/pilot/vector_store/extract_tovec.py index 74e06cf92..5b7df3eb2 100644 --- a/pilot/vector_store/extract_tovec.py +++ b/pilot/vector_store/extract_tovec.py @@ -5,6 +5,8 @@ from langchain.text_splitter import CharacterTextSplitter from langchain.vectorstores import Chroma from pilot.model.vicuna_llm import VicunaEmbeddingLLM +# from langchain.embeddings import SentenceTransformerEmbeddings + embeddings = VicunaEmbeddingLLM() @@ -20,4 +22,17 @@ def knownledge_tovec(filename): return docsearch +# def knownledge_tovec_st(filename): +# """ Use sentence transformers to embedding the document. 
+# https://github.com/UKPLab/sentence-transformers
+# """
+# from pilot.configs.model_config import llm_model_config
+# embeddings = SentenceTransformerEmbeddings(model=llm_model_config["sentence-transforms"])
+# with open(filename, "r") as f:
+# knownledge = f.read()
+
+# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+# texts = text_splitter.split_text(knownledge)
+# docsearch = Chroma.from_texts(texts, embeddings, metadatas=[{"source": str(i)} for i in range(len(texts))])
+# return docsearch