From 8d19ef39097940b181da1d848fb9128e81cc7af0 Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Wed, 28 Jun 2023 17:35:07 -0700 Subject: [PATCH] backend: factor out common elements in model code (#1089) * backend: factor out common structs in model code prepping to hack on these by hopefully making there be fewer places to fix the same bug rename * use common buffer wrapper instead of manual malloc * fix replit compile warnings --- gpt4all-backend/CMakeLists.txt | 8 +-- gpt4all-backend/falcon.cpp | 74 ++++++--------------------- gpt4all-backend/gptj.cpp | 40 ++------------- gpt4all-backend/llmodel_shared.h | 36 ++++++++++++++ gpt4all-backend/mpt.cpp | 40 ++------------- gpt4all-backend/replit.cpp | 85 +++++++------------------------- 6 files changed, 81 insertions(+), 202 deletions(-) create mode 100644 gpt4all-backend/llmodel_shared.h diff --git a/gpt4all-backend/CMakeLists.txt b/gpt4all-backend/CMakeLists.txt index 9e602638..80a3f000 100644 --- a/gpt4all-backend/CMakeLists.txt +++ b/gpt4all-backend/CMakeLists.txt @@ -98,7 +98,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS) prepare_target(llamamodel-mainline llama-mainline) add_library(replit-mainline-${BUILD_VARIANT} SHARED - replit.cpp utils.h utils.cpp llmodel_shared.cpp) + replit.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h) prepare_target(replit-mainline llama-mainline) if (NOT LLAMA_METAL) @@ -114,15 +114,15 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS) prepare_target(llamamodel-230511 llama-230511) add_library(gptj-${BUILD_VARIANT} SHARED - gptj.cpp utils.h utils.cpp llmodel_shared.cpp) + gptj.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h) prepare_target(gptj ggml-230511) add_library(falcon-${BUILD_VARIANT} SHARED - falcon.cpp utils.h utils.cpp llmodel_shared.cpp) + falcon.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h) prepare_target(falcon llama-mainline) add_library(mpt-${BUILD_VARIANT} SHARED - mpt.cpp utils.h utils.cpp llmodel_shared.cpp) + mpt.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h) prepare_target(mpt ggml-230511) endif() endforeach() diff --git a/gpt4all-backend/falcon.cpp b/gpt4all-backend/falcon.cpp index aced4fdf..cdb135a1 100644 --- a/gpt4all-backend/falcon.cpp +++ b/gpt4all-backend/falcon.cpp @@ -3,6 +3,7 @@ #include "llama.h" #include "llama-util.h" #include "utils.h" +#include "llmodel_shared.h" #include #include @@ -46,38 +47,6 @@ struct falcon_layer { struct ggml_tensor* ffn_down; }; -struct falcon_buffer { - uint8_t * addr = NULL; - size_t size = 0; - - void resize(size_t size) { - delete[] addr; - addr = new uint8_t[size]; - this->size = size; - } - - ~falcon_buffer() { - delete[] addr; - } -}; - -struct falcon_kv_cache { - struct ggml_tensor * k; - struct ggml_tensor * v; - - struct ggml_context * ctx = NULL; - - falcon_buffer buf; - - int n; // number of tokens currently in the cache - - ~falcon_kv_cache() { - if (ctx) { - ggml_free(ctx); - } - } -}; - struct falcon_model { falcon_hparams hparams; @@ -89,22 +58,19 @@ struct falcon_model { std::vector layers; // key + value memory - falcon_kv_cache kv_self; + llm_kv_cache kv_self; struct ggml_context* ctx; std::map tensors; - void * eval_buf; - size_t eval_buf_size; - void * scr0_buf; - size_t scr0_buf_size; - void * scr1_buf; - size_t scr1_buf_size; + llm_buffer eval_buf; + llm_buffer scr0_buf; + llm_buffer scr1_buf; }; static bool kv_cache_init( const struct falcon_hparams & hparams, - struct falcon_kv_cache & cache, + struct llm_kv_cache & cache, ggml_type wtype, int n_ctx) { const int n_embd = hparams.n_embd; @@ -464,12 +430,9 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca fin.close(); - model.eval_buf_size = 256u * 1024 * 1024; - model.eval_buf = malloc(model.eval_buf_size); - model.scr0_buf_size = 256u * 1024 * 1024; - model.scr0_buf = malloc(model.scr0_buf_size); - model.scr1_buf_size = 256u * 1024 * 1024; - model.scr1_buf = malloc(model.scr1_buf_size); + model.eval_buf.resize(256u * 1024 * 1024); + model.scr0_buf.resize(256u * 1024 * 1024); + model.scr1_buf.resize(256u * 1024 * 1024); return true; } @@ -503,8 +466,8 @@ bool falcon_eval( const size_t head_dim = n_embd / n_head; struct ggml_init_params eval_ctx_params = { - .mem_size = model.eval_buf_size, - .mem_buffer = model.eval_buf, + .mem_size = model.eval_buf.size, + .mem_buffer = model.eval_buf.addr, .no_alloc = false, }; @@ -526,7 +489,7 @@ bool falcon_eval( struct ggml_tensor * cur; struct ggml_tensor * layernorm_output; - ggml_set_scratch(ctx0, {0, model.scr0_buf_size, model.scr0_buf, }); + ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, }); // self-attention { @@ -667,7 +630,7 @@ bool falcon_eval( } } - ggml_set_scratch(ctx0, {0, model.scr1_buf_size, model.scr1_buf, }); + ggml_set_scratch(ctx0, {0, model.scr1_buf.size, model.scr1_buf.addr, }); struct ggml_tensor* inpFF = layernorm_output; struct ggml_tensor* attn_out = ggml_cpy( @@ -685,7 +648,7 @@ bool falcon_eval( inpL = cur; } - ggml_set_scratch(ctx0, {0, model.scr0_buf_size, model.scr0_buf, }); + ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, }); // norm { @@ -864,15 +827,6 @@ Falcon::~Falcon() { ggml_free(d_ptr->model->ctx); d_ptr->model->ctx = nullptr; } - if(d_ptr->model->eval_buf) { - free(d_ptr->model->eval_buf); - } - if(d_ptr->model->scr0_buf) { - free(d_ptr->model->scr0_buf); - } - if(d_ptr->model->scr1_buf) { - free(d_ptr->model->scr1_buf); - } delete d_ptr->model; } diff --git a/gpt4all-backend/gptj.cpp b/gpt4all-backend/gptj.cpp index 62925d9f..e7233a15 100644 --- a/gpt4all-backend/gptj.cpp +++ b/gpt4all-backend/gptj.cpp @@ -2,6 +2,7 @@ #include "gptj_impl.h" #include "utils.h" +#include "llmodel_shared.h" #include #include @@ -63,39 +64,6 @@ struct gptj_layer { struct ggml_tensor * c_mlp_proj_b; }; -struct gptj_buffer { - uint8_t * addr = NULL; - size_t size = 0; - - void resize(size_t size) { - delete[] addr; - addr = new uint8_t[size]; - this->size = size; - } - - ~gptj_buffer() { - fflush(stdout); - delete[] addr; - } -}; - -struct gptj_kv_cache { - struct ggml_tensor * k; - struct ggml_tensor * v; - - struct ggml_context * ctx = NULL; - - gptj_buffer buf; - - int n; // number of tokens currently in the cache - - ~gptj_kv_cache() { - if (ctx) { - ggml_free(ctx); - } - } -}; - struct gptj_model { gptj_hparams hparams; @@ -111,13 +79,13 @@ struct gptj_model { std::vector layers; // key + value memory - struct gptj_kv_cache kv_self; + struct llm_kv_cache kv_self; // struct ggml_context * ctx; std::map tensors; - gptj_buffer buf; + llm_buffer buf; ~gptj_model() { if (ctx) { @@ -128,7 +96,7 @@ struct gptj_model { static bool kv_cache_init( const struct gptj_hparams & hparams, - struct gptj_kv_cache & cache, + struct llm_kv_cache & cache, ggml_type wtype, int n_ctx) { const int n_embd = hparams.n_embd; diff --git a/gpt4all-backend/llmodel_shared.h b/gpt4all-backend/llmodel_shared.h new file mode 100644 index 00000000..6a66a5d1 --- /dev/null +++ b/gpt4all-backend/llmodel_shared.h @@ -0,0 +1,36 @@ +#pragma once +#include +#include +#include + +struct llm_buffer { + uint8_t * addr = NULL; + size_t size = 0; + + void resize(size_t size) { + delete[] addr; + addr = new uint8_t[size]; + this->size = size; + } + + ~llm_buffer() { + delete[] addr; + } +}; + +struct llm_kv_cache { + struct ggml_tensor * k; + struct ggml_tensor * v; + + struct ggml_context * ctx = NULL; + + llm_buffer buf; + + int n; // number of tokens currently in the cache + + ~llm_kv_cache() { + if (ctx) { + ggml_free(ctx); + } + } +}; diff --git a/gpt4all-backend/mpt.cpp b/gpt4all-backend/mpt.cpp index 7912ac3a..ce019f3e 100644 --- a/gpt4all-backend/mpt.cpp +++ b/gpt4all-backend/mpt.cpp @@ -2,6 +2,7 @@ #include "mpt_impl.h" #include "utils.h" +#include "llmodel_shared.h" #include #include @@ -62,39 +63,6 @@ struct mpt_layer { struct ggml_tensor * ffn_down_proj_w; }; -struct mpt_buffer { - uint8_t * addr = NULL; - size_t size = 0; - - void resize(size_t size) { - delete[] addr; - addr = new uint8_t[size]; - this->size = size; - } - - ~mpt_buffer() { - fflush(stdout); - delete[] addr; - } -}; - -struct mpt_kv_cache { - struct ggml_tensor * k; - struct ggml_tensor * v; - - struct ggml_context * ctx = NULL; - - mpt_buffer buf; - - int n; // number of tokens currently in the cache - - ~mpt_kv_cache() { - if (ctx) { - ggml_free(ctx); - } - } -}; - struct mpt_model { mpt_hparams hparams; @@ -107,12 +75,12 @@ struct mpt_model { std::vector layers; - struct mpt_kv_cache kv_self; + struct llm_kv_cache kv_self; struct ggml_context * ctx; std::map tensors; - mpt_buffer buf; + llm_buffer buf; ~mpt_model() { if (ctx) { @@ -123,7 +91,7 @@ struct mpt_model { static bool kv_cache_init( const struct mpt_hparams & hparams, - struct mpt_kv_cache & cache, + struct llm_kv_cache & cache, ggml_type wtype, int n_ctx) { const int n_embd = hparams.n_embd; diff --git a/gpt4all-backend/replit.cpp b/gpt4all-backend/replit.cpp index a1c45abc..821100a5 100644 --- a/gpt4all-backend/replit.cpp +++ b/gpt4all-backend/replit.cpp @@ -2,8 +2,10 @@ #include "replit_impl.h" #include "utils.h" +#include "llmodel_shared.h" #include +#include #include #include #include @@ -181,40 +183,6 @@ struct replit_layer { struct ggml_tensor * c_mlp_mlp_down_weight; }; - -struct replit_buffer { - uint8_t * addr = NULL; - size_t size = 0; - - void resize(size_t size) { - delete[] addr; - addr = new uint8_t[size]; - this->size = size; - } - - ~replit_buffer() { - fflush(stdout); - delete[] addr; - } -}; - -struct replit_kv_cache { - struct ggml_tensor * k; - struct ggml_tensor * v; - - struct ggml_context * ctx = NULL; - - replit_buffer buf; - - int n; // number of tokens currently in the cache - - ~replit_kv_cache() { - if (ctx) { - ggml_free(ctx); - } - } -}; - struct replit_model { mpt_hparams hparams; @@ -224,15 +192,12 @@ struct replit_model { std::vector layers; // key + value memory - struct replit_kv_cache kv_self; + struct llm_kv_cache kv_self; struct ggml_context * ctx; - void * eval_buf; - size_t eval_buf_size; - void * scr0_buf; - size_t scr0_buf_size; - void * scr1_buf; - size_t scr1_buf_size; + llm_buffer eval_buf; + llm_buffer scr0_buf; + llm_buffer scr1_buf; #ifdef GGML_USE_METAL struct ggml_metal_context * ctx_metal; #endif @@ -241,7 +206,7 @@ struct replit_model { static bool kv_cache_init( const struct mpt_hparams & hparams, - struct replit_kv_cache & cache, + struct llm_kv_cache & cache, ggml_type wtype, int n_ctx) { const int n_embd = hparams.n_embd; @@ -438,7 +403,7 @@ bool replit_model_load(const std::string & fname, std::istream &fin, replit_mode const size_t memory_size = ggml_nbytes(model.kv_self.k) + ggml_nbytes(model.kv_self.v); - printf("%s: memory_size = %8.2f MB, n_mem = %ld\n", __func__, memory_size / 1024.0 / 1024.0, n_mem); + printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size / 1024.0 / 1024.0, n_mem); } // load weights @@ -520,12 +485,9 @@ bool replit_model_load(const std::string & fname, std::istream &fin, replit_mode printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size / 1024.0 / 1024.0, n_tensors); } - model.eval_buf_size = 256u * 1024 * 1024; - model.eval_buf = malloc(model.eval_buf_size); - model.scr0_buf_size = 256u * 1024 * 1024; - model.scr0_buf = malloc(model.scr0_buf_size); - model.scr1_buf_size = 256u * 1024 * 1024; - model.scr1_buf = malloc(model.scr1_buf_size); + model.eval_buf.resize(256u * 1024 * 1024); + model.scr0_buf.resize(256u * 1024 * 1024); + model.scr1_buf.resize(256u * 1024 * 1024); #ifdef GGML_USE_METAL model.ctx_metal = ggml_metal_init(); @@ -542,9 +504,9 @@ bool replit_model_load(const std::string & fname, std::istream &fin, replit_mode GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "data", data_ptr, data_size, max_size)); GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "kv", ggml_get_mem_buffer(model.kv_self.ctx), ggml_get_mem_size(model.kv_self.ctx), 0)); - GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "eval", model.eval_buf, model.eval_buf_size, 0)); - GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "scr0", model.scr0_buf, model.scr0_buf_size, 0)); - GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "scr1", model.scr1_buf, model.scr1_buf_size, 0)); + GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "eval", model.eval_buf.addr, model.eval_buf.size, 0)); + GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "scr0", model.scr0_buf.addr, model.scr0_buf.size, 0)); + GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "scr1", model.scr1_buf.addr, model.scr1_buf.size, 0)); #endif return true; @@ -585,8 +547,8 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa const int n_vocab = hparams.n_vocab; struct ggml_init_params eval_ctx_params = { - .mem_size = model.eval_buf_size, - .mem_buffer = model.eval_buf, + .mem_size = model.eval_buf.size, + .mem_buffer = model.eval_buf.addr, .no_alloc = false, }; struct ggml_context * ctx0 = ggml_init(eval_ctx_params); @@ -598,7 +560,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte_weight, embd); for (int il = 0; il < n_layer; ++il) { - ggml_set_scratch(ctx0, {0, model.scr0_buf_size, model.scr0_buf, }); + ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, }); struct ggml_tensor * cur; // a = self.ln_1(x) @@ -689,7 +651,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa // projection { cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_out_proj_weight, cur); } } - ggml_set_scratch(ctx0, {0, model.scr1_buf_size, model.scr1_buf, }); + ggml_set_scratch(ctx0, {0, model.scr1_buf.size, model.scr1_buf.addr, }); inpL = ggml_add(ctx0, inpL, cur); @@ -716,7 +678,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa // x = x + n inpL = ggml_add(ctx0, inpL, cur); } - ggml_set_scratch(ctx0, {0, model.scr0_buf_size, model.scr0_buf, }); + ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, }); // norm { inpL = ggml_norm(ctx0, inpL); @@ -946,15 +908,6 @@ Replit::~Replit() ggml_free(d_ptr->model->ctx); d_ptr->model->ctx = nullptr; } - if(d_ptr->model->eval_buf) { - free(d_ptr->model->eval_buf); - } - if(d_ptr->model->scr0_buf) { - free(d_ptr->model->scr0_buf); - } - if(d_ptr->model->scr1_buf) { - free(d_ptr->model->scr1_buf); - } delete d_ptr->model; }