From ff2fdecce1e480348086df6962ef320104c84550 Mon Sep 17 00:00:00 2001
From: Adam Treat
Date: Sat, 8 Apr 2023 23:28:39 -0400
Subject: [PATCH] Initial commit.

---
 .gitignore                |   1 +
 .gitmodules               |   3 +
 CMakeLists.txt            |  40 ++
 LICENSE                   |  21 +
 README.md                 |   1 +
 ggml                      |   1 +
 gptj.cpp                  | 781 ++++++++++++++++++++++++++++++++++++++
 gptj.h                    |  24 ++
 icons/regenerate.svg      |   1 +
 icons/send_message.svg    |   1 +
 icons/stop_generating.svg |   1 +
 llm.cpp                   | 132 +++++++
 llm.h                     |  84 ++++
 main.cpp                  |  31 ++
 main.qml                  | 233 ++++++++++++
 15 files changed, 1355 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .gitmodules
 create mode 100644 CMakeLists.txt
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 160000 ggml
 create mode 100644 gptj.cpp
 create mode 100644 gptj.h
 create mode 100644 icons/regenerate.svg
 create mode 100644 icons/send_message.svg
 create mode 100644 icons/stop_generating.svg
 create mode 100644 llm.cpp
 create mode 100644 llm.h
 create mode 100644 main.cpp
 create mode 100644 main.qml

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..01e00f3a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+CMakeLists.txt.user
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..1a30094e
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "ggml"]
+	path = ggml
+	url = https://github.com/manyoso/ggml.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 00000000..5f40839c
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,40 @@
+cmake_minimum_required(VERSION 3.16)
+
+project(gpt4all-chat VERSION 0.1 LANGUAGES CXX)
+
+set(CMAKE_AUTOMOC ON)
+set(CMAKE_AUTORCC ON)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+find_package(Qt6 6.2 COMPONENTS Quick REQUIRED)
+
+set(GGML_BUILD_EXAMPLES ON CACHE BOOL "ggml: build examples" FORCE)
+add_subdirectory(ggml)
+
+qt_add_executable(chat
+    main.cpp
+    gptj.h gptj.cpp
+    llm.h llm.cpp
+)
+
+qt_add_qml_module(chat
+    URI gpt4all-chat
+    VERSION 1.0
+    QML_FILES main.qml
+    RESOURCES icons/send_message.svg icons/stop_generating.svg icons/regenerate.svg
+)
+
+set_target_properties(chat PROPERTIES
+    MACOSX_BUNDLE_GUI_IDENTIFIER my.example.com
+    MACOSX_BUNDLE_BUNDLE_VERSION ${PROJECT_VERSION}
+    MACOSX_BUNDLE_SHORT_VERSION_STRING ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}
+    MACOSX_BUNDLE TRUE
+    WIN32_EXECUTABLE TRUE
+)
+
+target_compile_definitions(chat
+    PRIVATE $<$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>:QT_QML_DEBUG>)
+target_link_libraries(chat
+    PRIVATE Qt6::Quick Qt6::Svg)
+target_link_libraries(chat
+    PRIVATE ggml ggml_utils)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..70d54194
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Adam Treat
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..4a3e6d27
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+# gpt4all-chat
diff --git a/ggml b/ggml
new file mode 160000
index 00000000..c9f702ac
--- /dev/null
+++ b/ggml
@@ -0,0 +1 @@
+Subproject commit c9f702ac573a2be4a1b9926979084941f95d0e33
diff --git a/gptj.cpp b/gptj.cpp
new file mode 100644
index 00000000..0858e944
--- /dev/null
+++ b/gptj.cpp
@@ -0,0 +1,781 @@
+#include "gptj.h"
+#include "ggml/ggml.h"
+
+#include "utils.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <string>
+#include <thread>
+#include <vector>
+
+// default hparams (GPT-J 6B)
+struct gptj_hparams {
+    int32_t n_vocab = 50400;
+    int32_t n_ctx   = 2048;
+    int32_t n_embd  = 4096;
+    int32_t n_head  = 16;
+    int32_t n_layer = 28;
+    int32_t n_rot   = 64;
+    int32_t f16     = 1;
+};
+
+struct gptj_layer {
+    // normalization
+    struct ggml_tensor * ln_1_g;
+    struct ggml_tensor * ln_1_b;
+
+    // attention
+    struct ggml_tensor * c_attn_q_proj_w;
+    struct ggml_tensor * c_attn_k_proj_w;
+    struct ggml_tensor * c_attn_v_proj_w;
+
+    struct ggml_tensor * c_attn_proj_w;
+
+    // ff
+    struct ggml_tensor * c_mlp_fc_w;
+    struct ggml_tensor * c_mlp_fc_b;
+
+    struct ggml_tensor * c_mlp_proj_w;
+    struct ggml_tensor * c_mlp_proj_b;
+};
+
+struct gptj_model {
+    gptj_hparams hparams;
+
+    // normalization
+    struct ggml_tensor * ln_f_g;
+    struct ggml_tensor * ln_f_b;
+
+    struct ggml_tensor * wte; // position embedding
+
+    struct ggml_tensor * lmh_g; // language model head
+    struct ggml_tensor * lmh_b; // language model bias
+
+    std::vector<gptj_layer> layers;
+
+    // key + value memory
+    struct ggml_tensor * memory_k;
+    struct ggml_tensor * memory_v;
+
+    //
+    struct ggml_context * ctx;
+    std::map<std::string, struct ggml_tensor *> tensors;
+};
+
+// load the model's weights from a stream
+bool gptj_model_load(const std::string &fname, std::istream &fin, gptj_model & model, gpt_vocab & vocab) {
+    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+
+    // verify magic
+    {
+        uint32_t magic;
+        fin.read((char *) &magic, sizeof(magic));
+        if (magic != 0x67676d6c) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
+            return false;
+        }
+    }
+
+    // load hparams
+    {
+        auto & hparams = model.hparams;
+
+        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
+        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
+        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
+        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
+        fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
+        fin.read((char *) &hparams.f16,     sizeof(hparams.f16));
+
+        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
+        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
+        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
+        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+        printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
+        printf("%s: f16     = %d\n", __func__, hparams.f16);
+    }
+
+    // load vocab
+    {
+        int32_t n_vocab = 0;
+        fin.read((char *) &n_vocab, sizeof(n_vocab));
+
+        if (n_vocab != model.hparams.n_vocab) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d
!= %d)\n", + __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); + return false; + } + + std::string word; + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + word.resize(len); + fin.read((char *) word.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = GGML_TYPE_COUNT; + switch (model.hparams.f16) { + case 0: wtype = GGML_TYPE_F32; break; + case 1: wtype = GGML_TYPE_F16; break; + case 2: wtype = GGML_TYPE_Q4_0; break; + case 3: wtype = GGML_TYPE_Q4_1; break; + default: + { + fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", + __func__, fname.c_str(), model.hparams.f16); + return false; + } + } + + const ggml_type wtype2 = GGML_TYPE_F32; + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte + + ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g + ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_q_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_k_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_v_proj_w + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (5 + 10*n_layer)*256; // object overhead + + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + .mem_size = ctx_size, + .mem_buffer = NULL, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); + + // map by name + model.tensors["transformer.wte.weight"] = model.wte; + + 
model.tensors["transformer.ln_f.weight"] = model.ln_f_g; + model.tensors["transformer.ln_f.bias"] = model.ln_f_b; + + model.tensors["lm_head.weight"] = model.lmh_g; + model.tensors["lm_head.bias"] = model.lmh_b; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["transformer.h." + std::to_string(i) + ".ln_1.weight"] = layer.ln_1_g; + model.tensors["transformer.h." + std::to_string(i) + ".ln_1.bias"] = layer.ln_1_b; + + model.tensors["transformer.h." + std::to_string(i) + ".attn.q_proj.weight"] = layer.c_attn_q_proj_w; + model.tensors["transformer.h." + std::to_string(i) + ".attn.k_proj.weight"] = layer.c_attn_k_proj_w; + model.tensors["transformer.h." + std::to_string(i) + ".attn.v_proj.weight"] = layer.c_attn_v_proj_w; + + model.tensors["transformer.h." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_proj_w; + + model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.weight"] = layer.c_mlp_fc_w; + model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.bias"] = layer.c_mlp_fc_b; + + model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.weight"] = layer.c_mlp_proj_w; + model.tensors["transformer.h." 
+ std::to_string(i) + ".mlp.fc_out.bias"] = layer.c_mlp_proj_b; + } + } + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + + // load weights + { + int n_tensors = 0; + size_t total_size = 0; + + printf("%s: ", __func__); + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ftype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ftype), sizeof(ftype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name.data()) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); + return false; + } + + auto tensor = model.tensors[name.data()]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); + return false; + } + + if (0) { + static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + size_t bpe = 0; + + switch (ftype) { + case 0: bpe = ggml_type_size(GGML_TYPE_F32); break; + case 1: bpe = ggml_type_size(GGML_TYPE_F16); break; + case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break; + case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break; + default: + { + fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); + return false; + } + }; + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + + //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? 
"float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); + total_size += ggml_nbytes(tensor); + if (++n_tensors % 8 == 0) { + printf("."); + fflush(stdout); + } + } + + printf(" done\n"); + + printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); + } + + return true; +} + +// load the model's weights from a file path +bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab) { + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + bool loaded = gptj_model_load(fname, fin, model, vocab); + fin.close(); + return loaded; +} + +// evaluate the transformer +// +// - model: the model +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +// The GPT-J model requires about 16MB of memory per input token. +// +bool gptj_eval( + const gptj_model & model, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w, + size_t & mem_per_token) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + const int n_vocab = hparams.n_vocab; + const int n_rot = hparams.n_rot; + + const int d_key = n_embd/n_head; + + static size_t buf_size = 256u*1024*1024; + static void * buf = malloc(buf_size); + + if (mem_per_token > 0 && mem_per_token*N > buf_size) { + const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead + //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + + // reallocate + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } + } + + struct ggml_init_params params = { + .mem_size = buf_size, + .mem_buffer = buf, + }; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph gf = { .n_threads = n_threads }; + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + + // wte + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + cur = ggml_norm(ctx0, inpL); + + // cur = ln_1_g*cur + ln_1_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + } + + struct ggml_tensor * inpSA = cur; + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur); + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + 
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_rope(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + n_past, n_rot, 0), + 0, 2, 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_rope(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + n_embd/n_head, n_head, n_past + N), + n_past, n_rot, 1), + 0, 2, 1, 3); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + struct ggml_tensor * V_trans = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + n_embd/n_head, n_head, n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head)); + + // KQV = transpose(V) * KQ_soft_max + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection (no bias) + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + } + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + // this is independent of the self-attention result, so it could be done in parallel to the self-attention + { + // note here we pass inpSA instead of cur + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + inpSA); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), + cur); + + // GELU activation + cur = ggml_gelu(ctx0, cur); + + // projection + // cur = proj_w*cur + proj_b + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), + cur); + } + + // self-attention + FF + cur = ggml_add(ctx0, cur, inpFF); + + // input for next layer + inpL = ggml_add(ctx0, cur, inpL); + } + + // norm + { + inpL = ggml_norm(ctx0, inpL); + + // inpL = ln_f_g*inpL + ln_f_b + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL), + ggml_repeat(ctx0, model.ln_f_b, inpL)); + } + + // lm_head + { + inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); + + inpL = ggml_add(ctx0, + ggml_repeat(ctx0, model.lmh_b, inpL), + inpL); + } + + // logits -> probs + //inpL = ggml_soft_max(ctx0, inpL); + + // run the computation + ggml_build_forward_expand(&gf, inpL); + ggml_graph_compute (ctx0, &gf); + + //if (n_past%100 == 0) { + // ggml_graph_print (&gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), 
sizeof(float)*n_vocab*N); + + // return result for just the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; + } + //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + + ggml_free(ctx0); + + return true; +} + +struct GPTJPrivate { + const std::string modelPath; + bool modelLoaded; + gpt_vocab vocab; + gptj_model model; + int64_t t_main_start_us = 0; + int64_t t_load_us = 0; + int64_t n_threads = 0; + std::mt19937 rng; +}; + +GPTJ::GPTJ() + : d_ptr(new GPTJPrivate) { + + d_ptr->modelLoaded = false; +} + +bool GPTJ::loadModel(const std::string &modelPath, std::istream &fin) { + d_ptr->t_main_start_us = ggml_time_us(); + std::mt19937 rng(time(NULL)); + d_ptr->rng = rng; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gptj_model_load(modelPath, fin, d_ptr->model, d_ptr->vocab)) { + std::cerr << "GPT-J ERROR: failed to load model from" << modelPath; + return false; + } + + d_ptr->t_load_us = ggml_time_us() - t_start_us; + } + + d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + d_ptr->modelLoaded = true; + return true; +} + +GPTJ::~GPTJ() +{ + ggml_free(d_ptr->model.ctx); +} + +bool GPTJ::isModelLoaded() const +{ + return d_ptr->modelLoaded; +} + +void GPTJ::prompt(const std::string &prompt, std::function response, + int32_t n_predict, int32_t top_k, float top_p, float temp, + int32_t n_batch) { + + if (!isModelLoaded()) { + std::cerr << "GPT-J ERROR: prompt won't work with an unloaded model!\n"; + return; + } + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(d_ptr->vocab, prompt); + + n_predict = std::min(n_predict, d_ptr->model.hparams.n_ctx - (int) embd_inp.size()); + + std::vector embd; + + // determine the required inference memory per token: + size_t mem_per_token = 0; + gptj_eval(d_ptr->model, d_ptr->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + + for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!gptj_eval(d_ptr->model, d_ptr->n_threads, n_past, embd, logits, mem_per_token)) { + std::cerr << "GPT-J ERROR: Failed to predict\n"; + return; + } + + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + + const int n_vocab = d_ptr->model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(d_ptr->vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, d_ptr->rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (int k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (embd.size() > n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + if (!response(d_ptr->vocab.id_to_token[id])) + goto stop_generating; + } + + // end of text token + if (embd.back() == 50256) { + break; + } + } + +stop_generating: +#if 1 + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + std::cout << "GPT-J INFO: mem per token = " << mem_per_token << " bytes\n"; + 
std::cout << "GPT-J INFO: load time = " << d_ptr->t_load_us/1000.0f << " ms\n"; + std::cout << "GPT-J INFO: sample time = " << t_sample_us/1000.0f << " ms\n"; + std::cout << "GPT-J INFO: predict time = " << t_predict_us/1000.0f << " ms / " << t_predict_us/1000.0f/n_past << " ms per token\n"; + std::cout << "GPT-J INFO: total time = " << (t_main_end_us - d_ptr->t_main_start_us)/1000.0f << " ms\n"; + fflush(stdout); + fflush(stderr); + } +#endif + + return; +} diff --git a/gptj.h b/gptj.h new file mode 100644 index 00000000..3a698180 --- /dev/null +++ b/gptj.h @@ -0,0 +1,24 @@ +#ifndef GPTJ_H +#define GPTJ_H + +#include +#include +#include + +class GPTJPrivate; +class GPTJ { +public: + GPTJ(); + ~GPTJ(); + + bool loadModel(const std::string &modelPath, std::istream &fin); + bool isModelLoaded() const; + void prompt(const std::string &prompt, std::function response, + int32_t n_predict = 200, int32_t top_k = 40, float top_p = 0.9f, float temp = 0.9f, + int32_t n_batch = 9); + +private: + GPTJPrivate *d_ptr; +}; + +#endif // GPTJ_H \ No newline at end of file diff --git a/icons/regenerate.svg b/icons/regenerate.svg new file mode 100644 index 00000000..016e6a52 --- /dev/null +++ b/icons/regenerate.svg @@ -0,0 +1 @@ + diff --git a/icons/send_message.svg b/icons/send_message.svg new file mode 100644 index 00000000..d8650b66 --- /dev/null +++ b/icons/send_message.svg @@ -0,0 +1 @@ + diff --git a/icons/stop_generating.svg b/icons/stop_generating.svg new file mode 100644 index 00000000..c627ac0e --- /dev/null +++ b/icons/stop_generating.svg @@ -0,0 +1 @@ + diff --git a/llm.cpp b/llm.cpp new file mode 100644 index 00000000..6e2ca906 --- /dev/null +++ b/llm.cpp @@ -0,0 +1,132 @@ +#include "llm.h" + +#include +#include +#include +#include + +class MyLLM: public LLM { }; +Q_GLOBAL_STATIC(MyLLM, llmInstance) +LLM *LLM::globalInstance() +{ + return llmInstance(); +} + +GPTJObject::GPTJObject() + : QObject{nullptr} + , m_gptj(new GPTJ) +{ + moveToThread(&m_llmThread); + connect(&m_llmThread, &QThread::started, this, &GPTJObject::loadModel); + m_llmThread.setObjectName("llm thread"); + m_llmThread.start(); +} + +bool GPTJObject::loadModel() +{ + if (isModelLoaded()) + return true; + + QString modelName("ggml-model-q4_0.bin"); + QFile file(QCoreApplication::applicationDirPath() + QDir::separator() + modelName); + if (file.open(QIODevice::ReadOnly)) { + + QByteArray data = file.readAll(); + std::istringstream iss(data.toStdString()); + + m_gptj->loadModel(modelName.toStdString(), iss); + emit isModelLoadedChanged(); + } + + return m_gptj; +} + +bool GPTJObject::isModelLoaded() const +{ + return m_gptj->isModelLoaded(); +} + +void GPTJObject::resetResponse() +{ + m_response = std::string(); +} + +QString GPTJObject::response() const +{ + return QString::fromStdString(m_response); +} + +bool GPTJObject::handleResponse(const std::string &response) +{ +#if 0 + printf("%s", response.c_str()); + fflush(stdout); +#endif + m_response.append(response); + emit responseChanged(); + return !m_stopGenerating; +} + +bool GPTJObject::prompt(const QString &prompt) +{ + if (!isModelLoaded()) + return false; + + m_stopGenerating = false; + auto func = std::bind(&GPTJObject::handleResponse, this, std::placeholders::_1); + emit responseStarted(); + m_gptj->prompt(prompt.toStdString(), func); + emit responseStopped(); + return true; +} + +LLM::LLM() + : QObject{nullptr} + , m_gptj(new GPTJObject) + , m_responseInProgress(false) +{ + connect(m_gptj, &GPTJObject::isModelLoadedChanged, this, &LLM::isModelLoadedChanged, 
Qt::QueuedConnection); + connect(m_gptj, &GPTJObject::responseChanged, this, &LLM::responseChanged, Qt::QueuedConnection); + connect(m_gptj, &GPTJObject::responseStarted, this, &LLM::responseStarted, Qt::QueuedConnection); + connect(m_gptj, &GPTJObject::responseStopped, this, &LLM::responseStopped, Qt::QueuedConnection); + + connect(this, &LLM::promptRequested, m_gptj, &GPTJObject::prompt, Qt::QueuedConnection); + connect(this, &LLM::resetResponseRequested, m_gptj, &GPTJObject::resetResponse, Qt::BlockingQueuedConnection); +} + +bool LLM::isModelLoaded() const +{ + return m_gptj->isModelLoaded(); +} + +void LLM::prompt(const QString &prompt) +{ + emit promptRequested(prompt); +} + +void LLM::resetResponse() +{ + emit resetResponseRequested(); // blocking queued connection +} + +void LLM::stopGenerating() +{ + m_gptj->stopGenerating(); +} + +QString LLM::response() const +{ + return m_gptj->response(); +} + +void LLM::responseStarted() +{ + m_responseInProgress = true; + emit responseInProgressChanged(); +} + +void LLM::responseStopped() +{ + m_responseInProgress = false; + emit responseInProgressChanged(); +} diff --git a/llm.h b/llm.h new file mode 100644 index 00000000..285aa009 --- /dev/null +++ b/llm.h @@ -0,0 +1,84 @@ +#ifndef LLM_H +#define LLM_H + +#include +#include +#include "gptj.h" + +class GPTJObject : public QObject +{ + Q_OBJECT + Q_PROPERTY(bool isModelLoaded READ isModelLoaded NOTIFY isModelLoadedChanged) + Q_PROPERTY(QString response READ response NOTIFY responseChanged) + +public: + + GPTJObject(); + + bool loadModel(); + bool isModelLoaded() const; + void resetResponse(); + void stopGenerating() { m_stopGenerating = true; } + + QString response() const; + +public Q_SLOTS: + bool prompt(const QString &prompt); + +Q_SIGNALS: + void isModelLoadedChanged(); + void responseChanged(); + void responseStarted(); + void responseStopped(); + +private: + bool handleResponse(const std::string &response); + +private: + GPTJ *m_gptj; + std::stringstream m_debug; + std::string m_response; + QThread m_llmThread; + std::atomic m_stopGenerating; +}; + +class LLM : public QObject +{ + Q_OBJECT + Q_PROPERTY(bool isModelLoaded READ isModelLoaded NOTIFY isModelLoadedChanged) + Q_PROPERTY(QString response READ response NOTIFY responseChanged) + Q_PROPERTY(bool responseInProgress READ responseInProgress NOTIFY responseInProgressChanged) +public: + + static LLM *globalInstance(); + + Q_INVOKABLE bool isModelLoaded() const; + Q_INVOKABLE void prompt(const QString &prompt); + Q_INVOKABLE void resetResponse(); + Q_INVOKABLE void stopGenerating(); + + QString response() const; + bool responseInProgress() const { return m_responseInProgress; } + +Q_SIGNALS: + void isModelLoadedChanged(); + void responseChanged(); + void responseInProgressChanged(); + void promptRequested(const QString &prompt); + void resetResponseRequested(); + +private Q_SLOTS: + void responseStarted(); + void responseStopped(); + +private: + GPTJObject *m_gptj; + bool m_responseInProgress; + +private: + explicit LLM(); + ~LLM() {} + friend class MyLLM; +}; + +#endif // LLM_H diff --git a/main.cpp b/main.cpp new file mode 100644 index 00000000..04b25c53 --- /dev/null +++ b/main.cpp @@ -0,0 +1,31 @@ +#include +#include +#include + +#include + +#include "llm.h" + +int main(int argc, char *argv[]) +{ + QGuiApplication app(argc, argv); + QQmlApplicationEngine engine; + qmlRegisterSingletonInstance("llm", 1, 0, "LLM", LLM::globalInstance()); + const QUrl url(u"qrc:/gpt4all-chat/main.qml"_qs); + + QObject::connect(&engine, 
&QQmlApplicationEngine::objectCreated, + &app, [url](QObject *obj, const QUrl &objUrl) { + if (!obj && url == objUrl) + QCoreApplication::exit(-1); + }, Qt::QueuedConnection); + engine.load(url); + +#if 1 + QDirIterator it("qrc:", QDirIterator::Subdirectories); + while (it.hasNext()) { + qDebug() << it.next(); + } +#endif + + return app.exec(); +} diff --git a/main.qml b/main.qml new file mode 100644 index 00000000..2e9038ca --- /dev/null +++ b/main.qml @@ -0,0 +1,233 @@ +import QtQuick +import QtQuick.Controls +import llm + +Window { + id: window + width: 1280 + height: 720 + visible: true + title: qsTr("GPT4All Chat") + + Rectangle { + id: conversationList + width: 300 + anchors.left: parent.left + anchors.top: parent.top + anchors.bottom: parent.bottom + color: "#202123" + + Button { + id: newChat + text: qsTr("New chat") + anchors.top: parent.top + anchors.left: parent.left + anchors.right: parent.right + anchors.margins: 15 + padding: 15 + background: Rectangle { + opacity: .5 + border.color: "#7d7d8e" + border.width: 1 + radius: 10 + color: "#343541" + } + } + } + + Rectangle { + id: conversation + color: "#343541" + anchors.left: conversationList.right + anchors.right: parent.right + anchors.bottom: parent.bottom + anchors.top: parent.top + + ScrollView { + id: scrollView + anchors.left: parent.left + anchors.right: parent.right + anchors.top: parent.top + anchors.bottom: textInput.top + anchors.bottomMargin: 30 + ScrollBar.vertical.policy: ScrollBar.AlwaysOn + + ListModel { + id: chatModel + } + + Rectangle { + anchors.fill: parent + color: "#444654" + + ListView { + id: listView + anchors.fill: parent + header: TextField { + id: modelName + width: parent.width + color: "#d1d5db" + padding: 20 + font.pixelSize: 24 + text: "Model: GPT-J-6B-4bit" + background: Rectangle { + color: "#444654" + } + focus: false + horizontalAlignment: TextInput.AlignHCenter + } + + model: chatModel + delegate: TextArea { + text: currentResponse ? LLM.response : value + width: parent.width + color: "#d1d5db" + wrapMode: Text.WordWrap + focus: false + padding: 20 + font.pixelSize: 24 + cursorVisible: currentResponse ? LLM.responseInProgress : false + cursorPosition: text.length + background: Rectangle { + color: name === qsTr("Response: ") ? "#444654" : "#343541" + } + + leftPadding: 100 + + Rectangle { + anchors.left: parent.left + anchors.top: parent.top + anchors.leftMargin: 20 + anchors.topMargin: 20 + width: 30 + height: 30 + radius: 5 + color: name === qsTr("Response: ") ? "#10a37f" : "#ec86bf" + + Text { + anchors.centerIn: parent + text: name === qsTr("Response: ") ? "R" : "P" + color: "white" + } + } + } + + property bool shouldAutoScroll: true + property bool isAutoScrolling: false + + Connections { + target: LLM + function onResponseChanged() { + if (listView.shouldAutoScroll) { + listView.isAutoScrolling = true + listView.positionViewAtEnd() + listView.isAutoScrolling = false + } + } + } + + onContentYChanged: { + if (!isAutoScrolling) + shouldAutoScroll = atYEnd + } + + Component.onCompleted: { + shouldAutoScroll = true + positionViewAtEnd() + } + + footer: Item { + id: bottomPadding + width: parent.width + height: 60 + } + } + } + } + + Button { + Image { + anchors.verticalCenter: parent.verticalCenter + anchors.left: parent.left + anchors.leftMargin: 15 + source: LLM.responseInProgress ? "qrc:/gpt4all-chat/icons/stop_generating.svg" : "qrc:/gpt4all-chat/icons/regenerate.svg" + } + text: LLM.responseInProgress ? 
qsTr(" Stop generating") : qsTr(" Regenerate response") + onClicked: { + if (LLM.responseInProgress) + LLM.stopGenerating() + else { + LLM.resetResponse() + if (chatModel.count) { + var listElement = chatModel.get(chatModel.count - 1) + if (listElement.name === qsTr("Response: ")) { + listElement.currentResponse = true + listElement.value = LLM.response + LLM.prompt(listElement.prompt) + } + } + } + } + anchors.bottom: textInput.top + anchors.horizontalCenter: textInput.horizontalCenter + anchors.bottomMargin: 40 + padding: 15 + background: Rectangle { + opacity: .5 + border.color: "#7d7d8e" + border.width: 1 + radius: 10 + color: "#343541" + } + } + + TextField { + id: textInput + anchors.left: parent.left + anchors.right: parent.right + anchors.bottom: parent.bottom + anchors.margins: 30 + color: "#dadadc" + padding: 20 + font.pixelSize: 24 + placeholderText: qsTr("Send a message...") + placeholderTextColor: "#7d7d8e" + background: Rectangle { + color: "#40414f" + radius: 10 + } + onAccepted: { + LLM.stopGenerating() + + if (chatModel.count) { + var listElement = chatModel.get(chatModel.count - 1) + listElement.currentResponse = false + listElement.value = LLM.response + } + chatModel.append({"name": qsTr("Prompt: "), "currentResponse": false, "value": textInput.text}) + chatModel.append({"name": qsTr("Response: "), "currentResponse": true, "value": "", "prompt": textInput.text}) + + LLM.resetResponse() + LLM.prompt(textInput.text) + textInput.text = "" + } + + Button { + anchors.right: textInput.right + anchors.verticalCenter: textInput.verticalCenter + anchors.rightMargin: 15 + width: 30 + height: 30 + + background: Image { + anchors.centerIn: parent + source: "qrc:/gpt4all-chat/icons/send_message.svg" + } + + onClicked: { + textInput.onAccepted() + } + } + } + } +}