mirror of https://github.com/nomic-ai/gpt4all.git
backend: factor out common elements in model code (#1089)
* backend: factor out common structs in model code

  prepping to hack on these by hopefully making there be fewer places to fix the same bug

  rename

* use common buffer wrapper instead of manual malloc

* fix replit compile warnings
This commit is contained in:
parent 285aa50b60
commit 8d19ef3909
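The net effect of the commit: each backend previously carried its own copy of a byte-buffer struct and a KV-cache struct, plus raw void*/size_t pairs managed with malloc/free; these are replaced by the shared llm_buffer and llm_kv_cache from the new llmodel_shared.h. A minimal before/after sketch of the buffer handling (illustrative only; the demo_model structs are made-up stand-ins for the per-backend model structs):

    #include <cstdlib>
    #include "llmodel_shared.h"

    // before: raw pointer + size, allocated with malloc, freed by hand
    struct demo_model_old {
        void * eval_buf;
        size_t eval_buf_size;
    };

    // after: the shared RAII wrapper owns the allocation
    struct demo_model_new {
        llm_buffer eval_buf;
    };

    int main() {
        demo_model_old old_m;
        old_m.eval_buf_size = 256u * 1024 * 1024;
        old_m.eval_buf = malloc(old_m.eval_buf_size);
        free(old_m.eval_buf);                      // easy to forget (see the deleted destructor code below)

        demo_model_new new_m;
        new_m.eval_buf.resize(256u * 1024 * 1024); // freed automatically when new_m goes out of scope
        return 0;
    }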
gpt4all-backend/CMakeLists.txt

@@ -98,7 +98,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
     prepare_target(llamamodel-mainline llama-mainline)

     add_library(replit-mainline-${BUILD_VARIANT} SHARED
-        replit.cpp utils.h utils.cpp llmodel_shared.cpp)
+        replit.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
     prepare_target(replit-mainline llama-mainline)

     if (NOT LLAMA_METAL)
@@ -114,15 +114,15 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
     prepare_target(llamamodel-230511 llama-230511)

     add_library(gptj-${BUILD_VARIANT} SHARED
-        gptj.cpp utils.h utils.cpp llmodel_shared.cpp)
+        gptj.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
     prepare_target(gptj ggml-230511)

     add_library(falcon-${BUILD_VARIANT} SHARED
-        falcon.cpp utils.h utils.cpp llmodel_shared.cpp)
+        falcon.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
     prepare_target(falcon llama-mainline)

     add_library(mpt-${BUILD_VARIANT} SHARED
-        mpt.cpp utils.h utils.cpp llmodel_shared.cpp)
+        mpt.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
     prepare_target(mpt ggml-230511)
   endif()
 endforeach()
gpt4all-backend/falcon.cpp

@@ -3,6 +3,7 @@
 #include "llama.h"
 #include "llama-util.h"
 #include "utils.h"
+#include "llmodel_shared.h"

 #include <cassert>
 #include <cinttypes>
@@ -46,38 +47,6 @@ struct falcon_layer {
     struct ggml_tensor* ffn_down;
 };

-struct falcon_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-
-    void resize(size_t size) {
-        delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
-    }
-
-    ~falcon_buffer() {
-        delete[] addr;
-    }
-};
-
-struct falcon_kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
-
-    struct ggml_context * ctx = NULL;
-
-    falcon_buffer buf;
-
-    int n; // number of tokens currently in the cache
-
-    ~falcon_kv_cache() {
-        if (ctx) {
-            ggml_free(ctx);
-        }
-    }
-};
-
 struct falcon_model {
     falcon_hparams hparams;

@@ -89,22 +58,19 @@ struct falcon_model {
     std::vector<falcon_layer> layers;

     // key + value memory
-    falcon_kv_cache kv_self;
+    llm_kv_cache kv_self;

     struct ggml_context* ctx;
     std::map<std::string, struct ggml_tensor*> tensors;

-    void * eval_buf;
-    size_t eval_buf_size;
-    void * scr0_buf;
-    size_t scr0_buf_size;
-    void * scr1_buf;
-    size_t scr1_buf_size;
+    llm_buffer eval_buf;
+    llm_buffer scr0_buf;
+    llm_buffer scr1_buf;
 };

 static bool kv_cache_init(
         const struct falcon_hparams & hparams,
-        struct falcon_kv_cache & cache,
+        struct llm_kv_cache & cache,
         ggml_type wtype,
         int n_ctx) {
     const int n_embd = hparams.n_embd;
@@ -464,12 +430,9 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
     fin.close();

-    model.eval_buf_size = 256u * 1024 * 1024;
-    model.eval_buf = malloc(model.eval_buf_size);
-    model.scr0_buf_size = 256u * 1024 * 1024;
-    model.scr0_buf = malloc(model.scr0_buf_size);
-    model.scr1_buf_size = 256u * 1024 * 1024;
-    model.scr1_buf = malloc(model.scr1_buf_size);
+    model.eval_buf.resize(256u * 1024 * 1024);
+    model.scr0_buf.resize(256u * 1024 * 1024);
+    model.scr1_buf.resize(256u * 1024 * 1024);

     return true;
 }
@@ -503,8 +466,8 @@ bool falcon_eval(
     const size_t head_dim = n_embd / n_head;

     struct ggml_init_params eval_ctx_params = {
-        .mem_size = model.eval_buf_size,
-        .mem_buffer = model.eval_buf,
+        .mem_size = model.eval_buf.size,
+        .mem_buffer = model.eval_buf.addr,
         .no_alloc = false,
     };

@@ -526,7 +489,7 @@ bool falcon_eval(
         struct ggml_tensor * cur;
         struct ggml_tensor * layernorm_output;

-        ggml_set_scratch(ctx0, {0, model.scr0_buf_size, model.scr0_buf, });
+        ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });

         // self-attention
         {
@@ -667,7 +630,7 @@ bool falcon_eval(
             }
         }

-        ggml_set_scratch(ctx0, {0, model.scr1_buf_size, model.scr1_buf, });
+        ggml_set_scratch(ctx0, {0, model.scr1_buf.size, model.scr1_buf.addr, });

         struct ggml_tensor* inpFF = layernorm_output;
         struct ggml_tensor* attn_out = ggml_cpy(
@@ -685,7 +648,7 @@ bool falcon_eval(
         inpL = cur;
     }

-    ggml_set_scratch(ctx0, {0, model.scr0_buf_size, model.scr0_buf, });
+    ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });

     // norm
     {
@@ -864,15 +827,6 @@ Falcon::~Falcon() {
         ggml_free(d_ptr->model->ctx);
         d_ptr->model->ctx = nullptr;
     }
-    if(d_ptr->model->eval_buf) {
-        free(d_ptr->model->eval_buf);
-    }
-    if(d_ptr->model->scr0_buf) {
-        free(d_ptr->model->scr0_buf);
-    }
-    if(d_ptr->model->scr1_buf) {
-        free(d_ptr->model->scr1_buf);
-    }
     delete d_ptr->model;
 }

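For context on the hunks above: eval_buf backs the whole evaluation context and scr0_buf/scr1_buf back scratch regions, through the same two ggml entry points the diff touches. A minimal sketch of that usage, assuming the 2023-era ggml API this commit targets (ggml_init_params with mem_size/mem_buffer, and ggml_set_scratch taking {offs, size, data}); the function name is made up:

    #include <ggml.h>
    #include "llmodel_shared.h"

    void eval_with_buffers(llm_buffer & eval_buf, llm_buffer & scr0_buf) {
        // the eval buffer backs the whole evaluation context
        struct ggml_init_params params = {
            /*.mem_size   =*/ eval_buf.size,
            /*.mem_buffer =*/ eval_buf.addr,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx0 = ggml_init(params);

        // intermediate tensors are allocated from the scratch buffer instead
        ggml_set_scratch(ctx0, {0, scr0_buf.size, scr0_buf.addr});

        // ... build and compute the graph ...

        ggml_free(ctx0);
    }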
gpt4all-backend/gptj.cpp

@@ -2,6 +2,7 @@
 #include "gptj_impl.h"

 #include "utils.h"
+#include "llmodel_shared.h"

 #include <cassert>
 #include <cmath>
@@ -63,39 +64,6 @@ struct gptj_layer {
     struct ggml_tensor * c_mlp_proj_b;
 };

-struct gptj_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-
-    void resize(size_t size) {
-        delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
-    }
-
-    ~gptj_buffer() {
-        fflush(stdout);
-        delete[] addr;
-    }
-};
-
-struct gptj_kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
-
-    struct ggml_context * ctx = NULL;
-
-    gptj_buffer buf;
-
-    int n; // number of tokens currently in the cache
-
-    ~gptj_kv_cache() {
-        if (ctx) {
-            ggml_free(ctx);
-        }
-    }
-};
-
 struct gptj_model {
     gptj_hparams hparams;

@@ -111,13 +79,13 @@ struct gptj_model {
     std::vector<gptj_layer> layers;

     // key + value memory
-    struct gptj_kv_cache kv_self;
+    struct llm_kv_cache kv_self;

     //
     struct ggml_context * ctx;
     std::map<std::string, struct ggml_tensor *> tensors;

-    gptj_buffer buf;
+    llm_buffer buf;

     ~gptj_model() {
         if (ctx) {
@@ -128,7 +96,7 @@ struct gptj_model {

 static bool kv_cache_init(
         const struct gptj_hparams & hparams,
-        struct gptj_kv_cache & cache,
+        struct llm_kv_cache & cache,
         ggml_type wtype,
         int n_ctx) {
     const int n_embd = hparams.n_embd;
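The kv_cache_init signature change repeats in every backend; only the parameter type changes, so the bodies compile unmodified. The body itself isn't shown in this diff; as a rough sketch of what such an initializer typically does with the shared struct (modeled on the ggml example code, not this repository; sizes and names are assumptions):

    #include <ggml.h>
    #include "llmodel_shared.h"

    static bool kv_cache_init_sketch(int n_embd, int n_layer, int n_ctx,
                                     ggml_type wtype, llm_kv_cache & cache) {
        const int64_t n_mem      = (int64_t)n_layer * n_ctx;
        const int64_t n_elements = n_embd * n_mem;

        // one backing allocation for both K and V, plus a little overhead
        cache.buf.resize(2u * n_elements * ggml_type_size(wtype) + 2u * 1024 * 1024);

        struct ggml_init_params params = {
            /*.mem_size   =*/ cache.buf.size,
            /*.mem_buffer =*/ cache.buf.addr,
            /*.no_alloc   =*/ false,
        };
        cache.ctx = ggml_init(params);
        if (!cache.ctx) return false;

        cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
        cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
        return true;
    }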
gpt4all-backend/llmodel_shared.h (new file, 36 lines)

@@ -0,0 +1,36 @@
+#pragma once
+#include <cstdint>
+#include <cstddef>
+#include <ggml.h>
+
+struct llm_buffer {
+    uint8_t * addr = NULL;
+    size_t size = 0;
+
+    void resize(size_t size) {
+        delete[] addr;
+        addr = new uint8_t[size];
+        this->size = size;
+    }
+
+    ~llm_buffer() {
+        delete[] addr;
+    }
+};
+
+struct llm_kv_cache {
+    struct ggml_tensor * k;
+    struct ggml_tensor * v;
+
+    struct ggml_context * ctx = NULL;
+
+    llm_buffer buf;
+
+    int n; // number of tokens currently in the cache
+
+    ~llm_kv_cache() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+    }
+};
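A short usage note on the new header: resize() frees any previous allocation before allocating (contents are not preserved), and the destructor releases the memory, so callers no longer pair malloc with free. A tiny sketch with hypothetical sizes:

    #include "llmodel_shared.h"

    int main() {
        llm_buffer buf;
        buf.resize(16u * 1024 * 1024); // 16 MiB; any prior allocation is deleted first
        buf.resize(32u * 1024 * 1024); // regrowing is safe: old storage freed, new one allocated
        // buf.addr / buf.size are handed to ggml as in the hunks above and below
        return 0;                      // ~llm_buffer() deletes the allocation
    }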
gpt4all-backend/mpt.cpp

@@ -2,6 +2,7 @@
 #include "mpt_impl.h"

 #include "utils.h"
+#include "llmodel_shared.h"

 #include <cassert>
 #include <cmath>
@@ -62,39 +63,6 @@ struct mpt_layer {
     struct ggml_tensor * ffn_down_proj_w;
 };

-struct mpt_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-
-    void resize(size_t size) {
-        delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
-    }
-
-    ~mpt_buffer() {
-        fflush(stdout);
-        delete[] addr;
-    }
-};
-
-struct mpt_kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
-
-    struct ggml_context * ctx = NULL;
-
-    mpt_buffer buf;
-
-    int n; // number of tokens currently in the cache
-
-    ~mpt_kv_cache() {
-        if (ctx) {
-            ggml_free(ctx);
-        }
-    }
-};
-
 struct mpt_model {
     mpt_hparams hparams;

@@ -107,12 +75,12 @@ struct mpt_model {

     std::vector<mpt_layer> layers;

-    struct mpt_kv_cache kv_self;
+    struct llm_kv_cache kv_self;
     struct ggml_context * ctx;
     std::map<std::string, struct ggml_tensor *> tensors;


-    mpt_buffer buf;
+    llm_buffer buf;

     ~mpt_model() {
         if (ctx) {
@@ -123,7 +91,7 @@ struct mpt_model {

 static bool kv_cache_init(
         const struct mpt_hparams & hparams,
-        struct mpt_kv_cache & cache,
+        struct llm_kv_cache & cache,
         ggml_type wtype,
         int n_ctx) {
     const int n_embd = hparams.n_embd;
gpt4all-backend/replit.cpp

@@ -2,8 +2,10 @@
 #include "replit_impl.h"

 #include "utils.h"
+#include "llmodel_shared.h"

 #include <cassert>
+#include <cinttypes>
 #include <cmath>
 #include <cstddef>
 #include <cstdio>
@@ -181,40 +183,6 @@ struct replit_layer {

     struct ggml_tensor * c_mlp_mlp_down_weight;
 };

-struct replit_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-
-    void resize(size_t size) {
-        delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
-    }
-
-    ~replit_buffer() {
-        fflush(stdout);
-        delete[] addr;
-    }
-};
-
-struct replit_kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
-
-    struct ggml_context * ctx = NULL;
-
-    replit_buffer buf;
-
-    int n; // number of tokens currently in the cache
-
-    ~replit_kv_cache() {
-        if (ctx) {
-            ggml_free(ctx);
-        }
-    }
-};
-
 struct replit_model {
     mpt_hparams hparams;
@@ -224,15 +192,12 @@ struct replit_model {
     std::vector<replit_layer> layers;

     // key + value memory
-    struct replit_kv_cache kv_self;
+    struct llm_kv_cache kv_self;

     struct ggml_context * ctx;
-    void * eval_buf;
-    size_t eval_buf_size;
-    void * scr0_buf;
-    size_t scr0_buf_size;
-    void * scr1_buf;
-    size_t scr1_buf_size;
+    llm_buffer eval_buf;
+    llm_buffer scr0_buf;
+    llm_buffer scr1_buf;
 #ifdef GGML_USE_METAL
     struct ggml_metal_context * ctx_metal;
 #endif
@@ -241,7 +206,7 @@ struct replit_model {

 static bool kv_cache_init(
         const struct mpt_hparams & hparams,
-        struct replit_kv_cache & cache,
+        struct llm_kv_cache & cache,
         ggml_type wtype,
         int n_ctx) {
     const int n_embd = hparams.n_embd;
@@ -438,7 +403,7 @@ bool replit_model_load(const std::string & fname, std::istream &fin, replit_mode

     const size_t memory_size = ggml_nbytes(model.kv_self.k) + ggml_nbytes(model.kv_self.v);

-        printf("%s: memory_size = %8.2f MB, n_mem = %ld\n", __func__, memory_size / 1024.0 / 1024.0, n_mem);
+        printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size / 1024.0 / 1024.0, n_mem);
     }

     // load weights
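This printf change is one of the "replit compile warnings" fixes: n_mem is a 64-bit integer, and "%ld" only matches it on platforms where long is 64 bits. A minimal illustration of the portable form (assuming n_mem is int64_t, as the PRId64 fix implies):

    #include <cinttypes>   // added at the top of replit.cpp in this commit
    #include <cstdio>

    int main() {
        int64_t n_mem = 6442450944;              // e.g. an n_layer * n_ctx style count
        // printf("n_mem = %ld\n", n_mem);       // warns (or misprints) where long is 32-bit
        printf("n_mem = %" PRId64 "\n", n_mem);  // PRId64 expands to the right specifier everywhere
        return 0;
    }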
@@ -520,12 +485,9 @@ bool replit_model_load(const std::string & fname, std::istream &fin, replit_mode
         printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size / 1024.0 / 1024.0, n_tensors);
     }

-    model.eval_buf_size = 256u * 1024 * 1024;
-    model.eval_buf = malloc(model.eval_buf_size);
-    model.scr0_buf_size = 256u * 1024 * 1024;
-    model.scr0_buf = malloc(model.scr0_buf_size);
-    model.scr1_buf_size = 256u * 1024 * 1024;
-    model.scr1_buf = malloc(model.scr1_buf_size);
+    model.eval_buf.resize(256u * 1024 * 1024);
+    model.scr0_buf.resize(256u * 1024 * 1024);
+    model.scr1_buf.resize(256u * 1024 * 1024);

 #ifdef GGML_USE_METAL
     model.ctx_metal = ggml_metal_init();
@@ -542,9 +504,9 @@ bool replit_model_load(const std::string & fname, std::istream &fin, replit_mode
     GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "data", data_ptr, data_size, max_size));
     GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "kv", ggml_get_mem_buffer(model.kv_self.ctx),
                                          ggml_get_mem_size(model.kv_self.ctx), 0));
-    GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "eval", model.eval_buf, model.eval_buf_size, 0));
-    GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "scr0", model.scr0_buf, model.scr0_buf_size, 0));
-    GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "scr1", model.scr1_buf, model.scr1_buf_size, 0));
+    GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "eval", model.eval_buf.addr, model.eval_buf.size, 0));
+    GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "scr0", model.scr0_buf.addr, model.scr0_buf.size, 0));
+    GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "scr1", model.scr1_buf.addr, model.scr1_buf.size, 0));
 #endif

     return true;
@@ -585,8 +547,8 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa
     const int n_vocab = hparams.n_vocab;

     struct ggml_init_params eval_ctx_params = {
-        .mem_size = model.eval_buf_size,
-        .mem_buffer = model.eval_buf,
+        .mem_size = model.eval_buf.size,
+        .mem_buffer = model.eval_buf.addr,
         .no_alloc = false,
     };
     struct ggml_context * ctx0 = ggml_init(eval_ctx_params);
@@ -598,7 +560,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte_weight, embd);

     for (int il = 0; il < n_layer; ++il) {
-        ggml_set_scratch(ctx0, {0, model.scr0_buf_size, model.scr0_buf, });
+        ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
         struct ggml_tensor * cur;

         // a = self.ln_1(x)
@@ -689,7 +651,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa
         // projection
         { cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_out_proj_weight, cur); }
     }
-    ggml_set_scratch(ctx0, {0, model.scr1_buf_size, model.scr1_buf, });
+    ggml_set_scratch(ctx0, {0, model.scr1_buf.size, model.scr1_buf.addr, });

     inpL = ggml_add(ctx0, inpL, cur);

@@ -716,7 +678,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa
         // x = x + n
         inpL = ggml_add(ctx0, inpL, cur);
     }
-    ggml_set_scratch(ctx0, {0, model.scr0_buf_size, model.scr0_buf, });
+    ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
     // norm
     {
         inpL = ggml_norm(ctx0, inpL);
@@ -946,15 +908,6 @@ Replit::~Replit()
         ggml_free(d_ptr->model->ctx);
         d_ptr->model->ctx = nullptr;
     }
-    if(d_ptr->model->eval_buf) {
-        free(d_ptr->model->eval_buf);
-    }
-    if(d_ptr->model->scr0_buf) {
-        free(d_ptr->model->scr0_buf);
-    }
-    if(d_ptr->model->scr1_buf) {
-        free(d_ptr->model->scr1_buf);
-    }
     delete d_ptr->model;
 }