backend: rebase llama.cpp submodule on latest upstream (#2694)

* Adds support for GPT-NeoX, Gemma 2, OpenELM, ChatGLM, and Jais architectures (all with Kompute support)
* Also enables Kompute support for StarCoder2, XVERSE, Command R, and OLMo
* Includes a number of Kompute resource management fixes

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel
Date: 2024-07-19 14:52:58 -04:00
Committed by: GitHub
Parent: 398ef34a87
Commit: 290c629442
4 changed files with 266 additions and 211 deletions


@@ -30,9 +30,9 @@
#ifdef GGML_USE_KOMPUTE
# include <ggml-kompute.h>
-#elif GGML_USE_VULKAN
+#elif defined(GGML_USE_VULKAN)
# include <ggml-vulkan.h>
-#elif GGML_USE_CUDA
+#elif defined(GGML_USE_CUDA)
# include <ggml-cuda.h>
#endif
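
Note on the two one-line changes above: a bare "#elif GGML_USE_VULKAN" evaluates the macro as an integer expression, so it fails to compile if the macro happens to be defined with no value and it trips -Wundef when the macro is absent, whereas "#elif defined(GGML_USE_VULKAN)" only asks whether the macro exists at all. A minimal standalone illustration, using placeholder macro names rather than the real GGML flags:

#include <cstdio>

// Placeholder flags for illustration only; the real build uses GGML_USE_*.
#ifdef USE_KOMPUTE
static const char *backend = "kompute";
#elif defined(USE_VULKAN)   // robust even if USE_VULKAN is defined to an empty value
static const char *backend = "vulkan";
#elif defined(USE_CUDA)
static const char *backend = "cuda";
#else
static const char *backend = "cpu";
#endif

int main()
{
    std::printf("compiled-in backend: %s\n", backend);
    return 0;
}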
@@ -51,14 +51,14 @@ static const std::vector<const char *> KNOWN_ARCHES {
// "grok", -- 314B parameters
"gpt2",
// "gptj", -- no inference code
-// "gptneox", -- no inference code
+"gptneox",
"mpt",
"baichuan",
"starcoder",
// "persimmon", -- CUDA generates garbage
"refact",
"bert",
"nomic-bert",
// "jina-bert-v2", -- Assertion `i01 >= 0 && i01 < ne01' failed.
"bloom",
"stablelm",
"qwen",
@@ -72,12 +72,20 @@ static const std::vector<const char *> KNOWN_ARCHES {
"internlm2",
// "minicpm", -- CUDA generates garbage
"gemma",
+"gemma2",
"starcoder2",
// "mamba", -- CUDA missing SSM_CONV
"xverse",
"command-r",
// "dbrx", -- 16x12B parameters
"olmo",
+"openelm",
// "arctic", -- 10B+128x3.66B parameters
// "deepseek2", -- excessive VRAM requirements
+"chatglm",
+// "bitnet", -- tensor not within file bounds?
+// "t5", -- seq2seq model
+"jais",
};
static const std::vector<const char *> EMBEDDING_ARCHES {
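
KNOWN_ARCHES is the allow-list of GGUF architecture names this backend admits, and the newly uncommented and appended entries match the architectures named in the commit message. The backend also exports is_arch_supported (its declaration appears as context in the last hunk of this diff); the sketch below shows one plausible, purely illustrative shape of that lookup. Only the function name and signature come from the diff; the std::any_of body and the abbreviated table are assumptions.

#include <algorithm>
#include <cstring>
#include <vector>

// Abbreviated stand-in for the real KNOWN_ARCHES table above.
static const std::vector<const char *> KNOWN_ARCHES {
    "llama", "gptneox", "gemma2", "openelm", "chatglm", "jais",
};

// Hypothetical implementation sketch of the exported lookup; the actual body
// in the backend source may differ.
extern "C" bool is_arch_supported(const char *arch)
{
    return std::any_of(KNOWN_ARCHES.begin(), KNOWN_ARCHES.end(),
                       [arch](const char *known) { return std::strcmp(known, arch) == 0; });
}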
@@ -103,6 +111,16 @@ static void llama_log_callback(enum ggml_log_level level, const char *text, void
}
}
+#ifdef GGML_USE_CUDA
+static void cuda_log_callback(enum ggml_log_level level, const char *text, void *userdata)
+{
+(void)userdata;
+if (llama_verbose() || level <= GGML_LOG_LEVEL_WARN) {
+fputs(text, stderr);
+}
+}
+#endif
struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt
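
The new cuda_log_callback writes warnings and errors to stderr unconditionally and only passes informational output when llama_verbose() is enabled (assuming the usual most-severe-first ordering of ggml log levels). A self-contained sketch of that filtering pattern follows; the enum values and the environment-variable-driven verbose() helper are placeholders, not the real ggml or gpt4all definitions:

#include <cstdio>
#include <cstdlib>

// Placeholder severity levels, ordered most-severe-first so that
// "level <= LEVEL_WARN" admits both warnings and errors, mirroring the
// comparison in the diff above. The real values come from ggml_log_level.
enum log_level { LEVEL_ERROR = 2, LEVEL_WARN = 3, LEVEL_INFO = 4 };

// Placeholder for llama_verbose(); the environment variable name is invented
// for this example.
static bool verbose()
{
    const char *v = std::getenv("EXAMPLE_VERBOSE");
    return v && *v;
}

static void log_callback(enum log_level level, const char *text, void *userdata)
{
    (void)userdata;
    if (verbose() || level <= LEVEL_WARN) // warnings and errors always pass
        std::fputs(text, stderr);
}

int main()
{
    log_callback(LEVEL_INFO, "info: suppressed unless EXAMPLE_VERBOSE is set\n", nullptr);
    log_callback(LEVEL_WARN, "warn: always printed\n", nullptr);
    return 0;
}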
@@ -515,9 +533,8 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
{
const bool wantBOS = ctx.n_past == 0 && ctx.tokens.empty();
const bool useBOS = wantBOS && shouldAddBOS();
-auto strCat = wantBOS && !special ? " " + str : str; // insert leading space ourselves, llama.cpp fork doesn't anymore
-std::vector<LLModel::Token> fres(strCat.size()+4);
-auto fres_len = llama_tokenize(d_ptr->model, strCat.c_str(), strCat.length(), fres.data(), fres.size(), useBOS, special);
+std::vector<LLModel::Token> fres(str.length() + 4);
+auto fres_len = llama_tokenize(d_ptr->model, str.c_str(), str.length(), fres.data(), fres.size(), useBOS, special);
fres.resize(fres_len);
return fres;
}
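
Two things change in tokenize(): the manually prepended leading space is dropped, presumably because the rebased tokenizer handles space prefixing itself, and the buffer is sized from str directly. llama_tokenize() returns the number of tokens written, or a negative count (the required size) when the buffer is too small; sizing the buffer at str.length() + 4 is what lets the code above skip that second pass. For reference, a hedged sketch of the general two-pass pattern, using the same llama_tokenize() signature as the call above (model loading omitted, llama.h assumed to be on the include path):

#include <string>
#include <vector>

#include <llama.h>

// Defensive two-pass tokenization sketch. The single-pass version in the diff
// above is equivalent as long as the buffer (str.length() + 4 entries) can
// never be too small; this variant also handles a negative return, which
// llama_tokenize() uses to report the token count actually needed.
static std::vector<llama_token> tokenize_two_pass(const llama_model *model, const std::string &text,
                                                  bool add_special, bool parse_special)
{
    std::vector<llama_token> toks(text.length() + 4);
    int n = llama_tokenize(model, text.c_str(), text.length(), toks.data(), toks.size(),
                           add_special, parse_special);
    if (n < 0) {               // buffer too small: -n is the size actually needed
        toks.resize(-n);
        n = llama_tokenize(model, text.c_str(), text.length(), toks.data(), toks.size(),
                           add_special, parse_special);
    }
    toks.resize(n);
    return toks;
}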
@@ -525,10 +542,10 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
std::string LLamaModel::tokenToString(Token id) const
{
std::vector<char> result(8, 0);
-const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
+const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), 0, false);
if (n_tokens < 0) {
result.resize(-n_tokens);
-int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
+int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), 0, false);
GGML_ASSERT(check == -n_tokens);
}
else {
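
The extra 0 passed to llama_token_to_piece() fills the integer parameter the rebased API inserts between the buffer length and the special flag; in upstream llama.h this parameter is named lstrip and gives the number of leading spaces to strip from the piece, so passing 0 preserves the previous behaviour. The negative-return convention mirrors llama_tokenize(); a hedged helper-style sketch of the whole pattern (llama.h assumed available, model loaded elsewhere):

#include <string>
#include <vector>

#include <llama.h>

// Detokenize a single token id, growing the buffer if the first call reports
// (as a negative value) that more space is needed. Passing 0 for lstrip keeps
// any leading spaces of the piece intact.
static std::string token_to_piece(const llama_model *model, llama_token id, bool special = false)
{
    std::vector<char> buf(8, 0);
    int n = llama_token_to_piece(model, id, buf.data(), buf.size(), /*lstrip*/ 0, special);
    if (n < 0) {
        buf.resize(-n);
        n = llama_token_to_piece(model, id, buf.data(), buf.size(), /*lstrip*/ 0, special);
    }
    return std::string(buf.data(), n);
}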
@@ -1170,6 +1187,9 @@ DLL_EXPORT bool is_arch_supported(const char *arch)
DLL_EXPORT LLModel *construct()
{
llama_log_set(llama_log_callback, nullptr);
+#ifdef GGML_USE_CUDA
+ggml_backend_cuda_log_set_callback(cuda_log_callback, nullptr);
+#endif
return new LLamaModel;
}
}
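
construct() now also installs cuda_log_callback via ggml_backend_cuda_log_set_callback(), so output from the CUDA backend obeys the same verbosity gate as llama.cpp's own logging. These DLL_EXPORT entry points are resolved from the backend shared library at runtime; below is a purely hypothetical loader-side sketch of that, with an invented library filename and the assumption that the exports use C linkage:

#include <cstdio>

#include <dlfcn.h>

class LLModel; // opaque here; the real definition lives in the gpt4all backend headers

int main()
{
    // The library name is a placeholder, not the actual build artifact name.
    void *handle = dlopen("./libexample-llama-backend.so", RTLD_NOW | RTLD_LOCAL);
    if (!handle) {
        std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
        return 1;
    }

    auto is_arch_supported = reinterpret_cast<bool (*)(const char *)>(dlsym(handle, "is_arch_supported"));
    auto construct = reinterpret_cast<LLModel *(*)()>(dlsym(handle, "construct"));
    if (!is_arch_supported || !construct) {
        std::fprintf(stderr, "dlsym failed: %s\n", dlerror());
        return 1;
    }

    std::fprintf(stderr, "gemma2 supported: %s\n", is_arch_supported("gemma2") ? "yes" : "no");
    LLModel *model = construct(); // installs the llama.cpp and CUDA log callbacks as a side effect
    (void)model;
    return 0;
}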