diff --git a/gpt4all-backend/bert.cpp b/gpt4all-backend/bert.cpp
index 77bdadce..f9d66839 100644
--- a/gpt4all-backend/bert.cpp
+++ b/gpt4all-backend/bert.cpp
@@ -345,7 +345,7 @@ void bert_eval(
 
     // embd norm
     {
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);
 
         inpL = ggml_add(ctx0,
                 ggml_mul(ctx0,
@@ -406,7 +406,7 @@ void bert_eval(
 
         // attention norm
         {
-            cur = ggml_norm(ctx0, cur);
+            cur = ggml_norm(ctx0, cur, 1e-5f);
 
             cur = ggml_add(ctx0,
                     ggml_mul(ctx0,
@@ -432,7 +432,7 @@ void bert_eval(
 
         // output norm
         {
-            cur = ggml_norm(ctx0, cur);
+            cur = ggml_norm(ctx0, cur, 1e-5f);
 
             cur = ggml_add(ctx0,
                     ggml_mul(ctx0,
@@ -1038,13 +1038,16 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }
 
-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char* fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     if (magic != 0x62657274) {
         return false;
     }
     return true;
+#endif
+    return false;
 }
 
 DLL_EXPORT LLModel *construct() {
diff --git a/gpt4all-backend/falcon.cpp b/gpt4all-backend/falcon.cpp
index fe1ec9f4..de198358 100644
--- a/gpt4all-backend/falcon.cpp
+++ b/gpt4all-backend/falcon.cpp
@@ -2,10 +2,11 @@
 #define FALCON_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #include "falcon_impl.h"
 #include "llama.h"
-#include "llama-util.h"
 #include "utils.h"
 #include "llmodel_shared.h"
 
+#include
+#include
 #include
 #include
 #include
@@ -203,22 +204,22 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
         const int n_vocab = hparams.n_vocab;
         const int head_dim = hparams.n_embd / hparams.n_head;
 
-        ctx_size += ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // tok_embeddings
-        ctx_size += ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm
-        ctx_size += ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm_b
-        ctx_size += ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // lm_head
+        ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // tok_embeddings
+        ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm
+        ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm_b
+        ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // lm_head
 
         // if (hparams.version == 40) { // Falcon-40B
         //     ctx_size += n_layer * ggml_sizeof_tensor_1d(GGML_TYPE_F32, n_embd); // attention_norm
         //     ctx_size += n_layer * ggml_sizeof_tensor_1d(GGML_TYPE_F32, n_embd); // attention_norm_b
         // }
 
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm_b
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * (n_head_kv * 2 + n_head) * head_dim); // query_key_value
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_embd); // wo
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_ff); // ffn_up
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_ff * n_embd); // ffn_down
-
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm_b
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * (n_head_kv * 2 + n_head) * head_dim); // query_key_value
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_embd); // wo
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_ff); // ffn_up
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_ff * n_embd); // ffn_down
+
         printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
@@ -494,7 +495,7 @@ bool falcon_eval(
 
         // self-attention
         {
-            layernorm_output = ggml_norm(ctx0, inpL);
+            layernorm_output = ggml_norm(ctx0, inpL, 1e-5f);
 
             layernorm_output = ggml_add(ctx0,
                     ggml_mul(ctx0,
@@ -653,7 +654,7 @@ bool falcon_eval(
 
     // norm
     {
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);
 
         // inpL = ln_f_g*inpL + ln_f_b
         inpL = ggml_add(ctx0,
@@ -680,7 +681,7 @@ bool falcon_eval(
     // run the computation
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);
-    
+
     //if (n_past%100 == 0) {
     //    ggml_graph_print(&gf);
@@ -954,13 +955,14 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }
 
-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char* fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     uint32_t version = 0;
     f.read(reinterpret_cast<char*>(&version), sizeof(version));
     if (magic != FALCON_MAGIC) {
-         return false;
+        return false;
     }
     falcon_hparams hparams;
     f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));
@@ -977,6 +979,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
         return false;
     }
     return true;
+#endif
+    return false;
 }
 
 DLL_EXPORT LLModel *construct() {
diff --git a/gpt4all-backend/llama.cpp-mainline b/gpt4all-backend/llama.cpp-mainline
index 99c5c9a0..37a0be31 160000
--- a/gpt4all-backend/llama.cpp-mainline
+++ b/gpt4all-backend/llama.cpp-mainline
@@ -1 +1 @@
-Subproject commit 99c5c9a0d834888c33669855f3a1cf425df37dd2
+Subproject commit 37a0be313d21f8b61184a3adcaac123353128238
diff --git a/gpt4all-backend/llama.cpp.cmake b/gpt4all-backend/llama.cpp.cmake
index 78fb0d24..b540ee75 100644
--- a/gpt4all-backend/llama.cpp.cmake
+++ b/gpt4all-backend/llama.cpp.cmake
@@ -185,7 +185,7 @@ if (LLAMA_KOMPUTE)
             string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
             set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
             message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
-            if(CMAKE_GENERATOR MATCHES "Visual Studio")
+            if(CMAKE_GENERATOR MATCHES "Visual Studio")
                 add_custom_command(
                     OUTPUT ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
@@ -346,6 +346,13 @@ endif()
 # TODO: probably these flags need to be tweaked on some architectures
 # feel free to update the Makefile for your architecture and send a pull request or issue
 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+if (MSVC)
+    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
+    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
+else ()
+    set(CMAKE_GENERATOR_PLATFORM_LWR "")
+endif ()
+
 if (NOT MSVC)
     if (LLAMA_STATIC)
         add_link_options(-static)
@@ -361,6 +368,138 @@ if (NOT MSVC)
     endif()
 endif()
 
+if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
+    message(STATUS "ARM detected")
+    if (MSVC)
+        add_compile_definitions(__ARM_NEON)
+        add_compile_definitions(__ARM_FEATURE_FMA)
+        add_compile_definitions(__ARM_FEATURE_DOTPROD)
+        # add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
+        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
+    else()
+        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+            add_compile_options(-mfp16-format=ieee)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
+            # Raspberry Pi 1, Zero
+            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
+            # Raspberry Pi 2
+            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+            # Raspberry Pi 3, 4, Zero 2 (32-bit)
+            add_compile_options(-mno-unaligned-access)
+        endif()
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
+    message(STATUS "x86 detected")
+    if (MSVC)
+        if (LLAMA_AVX512)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
+            # MSVC has no compile-time flags enabling specific
+            # AVX512 extensions, neither it defines the
+            # macros corresponding to the extensions.
+            # Do it manually.
+            if (LLAMA_AVX512_VBMI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+            endif()
+            if (LLAMA_AVX512_VNNI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+            endif()
+        elseif (LLAMA_AVX2)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
+        elseif (LLAMA_AVX)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
+        endif()
+    else()
+        if (LLAMA_F16C)
+            add_compile_options(-mf16c)
+        endif()
+        if (LLAMA_FMA)
+            add_compile_options(-mfma)
+        endif()
+        if (LLAMA_AVX)
+            add_compile_options(-mavx)
+        endif()
+        if (LLAMA_AVX2)
+            add_compile_options(-mavx2)
+        endif()
+        if (LLAMA_AVX512)
+            add_compile_options(-mavx512f)
+            add_compile_options(-mavx512bw)
+        endif()
+        if (LLAMA_AVX512_VBMI)
+            add_compile_options(-mavx512vbmi)
+        endif()
+        if (LLAMA_AVX512_VNNI)
+            add_compile_options(-mavx512vnni)
+        endif()
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+    message(STATUS "PowerPC detected")
+    add_compile_options(-mcpu=native -mtune=native)
+    #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+else()
+    message(STATUS "Unknown architecture")
+endif()
+
+#
+# POSIX conformance
+#
+
+# clock_gettime came in POSIX.1b (1993)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
+# posix_memalign came in POSIX.1-2001 / SUSv3
+# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
+add_compile_definitions(_XOPEN_SOURCE=600)
+
+# Somehow in OpenBSD whenever POSIX conformance is specified
+# some string functions rely on locale_t availability,
+# which was introduced in POSIX.1-2008, forcing us to go higher
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    remove_definitions(-D_XOPEN_SOURCE=600)
+    add_compile_definitions(_XOPEN_SOURCE=700)
+endif()
+
+# Data types, macros and functions related to controlling CPU affinity and
+# some memory allocation are available on Linux through GNU extensions in libc
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions(_GNU_SOURCE)
+endif()
+
+# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
+# and on macOS its availability depends on enabling Darwin extensions
+# similarly on DragonFly, enabling BSD extensions is necessary
+if (
+    CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
+    CMAKE_SYSTEM_NAME MATCHES "iOS" OR
+    CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
+    CMAKE_SYSTEM_NAME MATCHES "DragonFly"
+)
+    add_compile_definitions(_DARWIN_C_SOURCE)
+endif()
+
+# alloca is a non-standard interface that is not visible on BSDs when
+# POSIX conformance is specified, but not all of them provide a clean way
+# to enable it in such cases
+if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+    add_compile_definitions(__BSD_VISIBLE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+    add_compile_definitions(_NETBSD_SOURCE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    add_compile_definitions(_BSD_SOURCE)
+endif()
+
 function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
     message(STATUS "Configuring ggml implementation target llama${SUFFIX} in ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}")
 
@@ -468,15 +607,14 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
 
     if (WITH_LLAMA)
         # Backwards compatibility with old llama.cpp versions
-        set(LLAMA_UTIL_SOURCE_FILE llama-util.h)
+#        set(LLAMA_UTIL_SOURCE_FILE llama-util.h)
         if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
             set(LLAMA_UTIL_SOURCE_FILE llama_util.h)
         endif()
 
         add_library(llama${SUFFIX} STATIC
             ${DIRECTORY}/llama.cpp
-            ${DIRECTORY}/llama.h
-            ${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
+            ${DIRECTORY}/llama.h)
 
         if (LLAMA_METAL AND GGML_METAL_SOURCES)
             target_compile_definitions(llama${SUFFIX} PUBLIC GGML_USE_METAL GGML_METAL_NDEBUG)
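The llamamodel.cpp change that follows replaces the old magic-number check with a GGUF metadata probe. As a reading aid, here is a minimal, self-contained sketch of that probe pattern; it is illustrative and not part of the patch, it uses only the gguf_* calls that appear in the hunk below, it assumes those functions are declared in ggml.h at this llama.cpp revision, and "model.gguf" is a placeholder path.

// Illustrative only -- not part of this patch.
#include <cstdio>
#include "ggml.h"

int main() {
    struct ggml_context *ctx_meta = NULL;
    struct gguf_init_params params = {
        /*.no_alloc = */ true,   // read metadata only, do not allocate tensor data
        /*.ctx      = */ &ctx_meta,
    };
    gguf_context *ctx_gguf = gguf_init_from_file("model.gguf", params);
    if (!ctx_gguf) {
        fprintf(stderr, "not a GGUF file\n");
        return 1;
    }

    // "general.architecture" is the key the new magic_match inspects below.
    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
    if (kid >= 0 && gguf_get_kv_type(ctx_gguf, kid) == GGUF_TYPE_STRING) {
        printf("arch: %s, gguf version: %d\n",
               gguf_get_val_str(ctx_gguf, kid), (int) gguf_get_version(ctx_gguf));
    }

    if (ctx_meta) ggml_free(ctx_meta);  // metadata-only ggml context
    gguf_free(ctx_gguf);
    return 0;
}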
diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index ecae5f0e..3770e284 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -226,9 +226,9 @@ size_t LLamaModel::restoreState(const uint8_t *src)
 
 std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str) const
 {
-    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos());
+    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos(d_ptr->ctx));
     std::vector<LLModel::Token> fres(str.size()+4);
-    auto fres_len = llama_tokenize(d_ptr->ctx, str.c_str(), fres.data(), fres.size(), useBOS);
+    auto fres_len = llama_tokenize(d_ptr->ctx, str.c_str(), str.length(), fres.data(), fres.size(), useBOS);
     fres.resize(fres_len);
     return fres;
 }
@@ -250,10 +250,10 @@ LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
 bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
 {
     // When we recalculate context we could have erased the original BOS token... we need to replace it
-    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos());
+    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos(d_ptr->ctx));
     if (useBOS) {
         std::vector<int32_t> myTokens;
-        myTokens.push_back(llama_token_bos());
+        myTokens.push_back(llama_token_bos(d_ptr->ctx));
         myTokens.insert(myTokens.end(), tokens.begin(), tokens.end());
         ctx.n_past += 1;
         return llama_eval(d_ptr->ctx, myTokens.data(), myTokens.size(), ctx.n_past, d_ptr->n_threads) == 0;
@@ -268,7 +268,7 @@ int32_t LLamaModel::contextLength() const
 
 const std::vector<LLModel::Token> &LLamaModel::endTokens() const
 {
-    static const std::vector<LLModel::Token> fres = {llama_token_eos()};
+    static const std::vector<LLModel::Token> fres = {llama_token_eos(d_ptr->ctx)};
     return fres;
 }
 
@@ -351,6 +351,16 @@ bool LLamaModel::usingGPUDevice()
     return false;
 }
 
+std::string get_arch_name(gguf_context *ctx_gguf) {
+    std::string arch_name;
+    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
+    enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
+    if (ktype != (GGUF_TYPE_STRING)) {
+        throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
+    }
+    return gguf_get_val_str(ctx_gguf, kid);
+}
+
 #if defined(_WIN32)
 #define DLL_EXPORT __declspec(dllexport)
 #else
@@ -370,39 +380,42 @@
 DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }
 
-DLL_EXPORT bool magic_match(std::istream& f) {
-    // Check magic
-    uint32_t magic = 0;
-    f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
-    if (magic != 0x67676a74) return false;
-    // Check version
-    uint32_t version = 0;
-    f.read(reinterpret_cast<char*>(&version), sizeof(version));
-    if (!(version LLAMA_VERSIONS)) {
+DLL_EXPORT bool magic_match(const char * fname) {
+
+    struct ggml_context * ctx_meta = NULL;
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ &ctx_meta,
+    };
+    gguf_context *ctx_gguf = gguf_init_from_file(fname, params);
+    if (!ctx_gguf)
         return false;
-    }
-    llama_file_hparams hparams;
-    f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));
-    if (!(hparams.n_vocab >= 32000 && hparams.n_vocab <= 32100)) {
-        return false; // not a llama.
-    }
+
+    bool isValid = gguf_get_version(ctx_gguf) <= 2;
+    isValid = get_arch_name(ctx_gguf) != "llama" ? false : isValid;
+
 #ifdef GGML_USE_METAL
-    // Check quant supported on metal
-    // skip fields
-    switch(hparams.ftype) {
-        // currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
-        case LLAMA_FTYPE_MOSTLY_F16:
-        case LLAMA_FTYPE_MOSTLY_Q2_K:
-        case LLAMA_FTYPE_MOSTLY_Q4_0:
-        case LLAMA_FTYPE_MOSTLY_Q6_K:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M:
-            return true;
-        default: // unsupported quant-type for Metal
-            return false;
+    const int n_tensors = gguf_get_n_tensors(ctx_gguf);
+    for (int i = 0; i < n_tensors; i++) {
+        const char * name = gguf_get_tensor_name(ctx_gguf, i);
+        struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name);
+        switch(meta->type) {
+            // currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
+            case LLAMA_FTYPE_MOSTLY_F16:
+            case LLAMA_FTYPE_MOSTLY_Q2_K:
+            case LLAMA_FTYPE_MOSTLY_Q4_0:
+            case LLAMA_FTYPE_MOSTLY_Q6_K:
+            case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+            case LLAMA_FTYPE_MOSTLY_Q4_K_M:
+                break;
+            default: // unsupported quant-type for Metal
+                isValid = false;
+        }
     }
 #endif
-    return true;
+
+    gguf_free(ctx_gguf);
+    return isValid;
 }
 
 DLL_EXPORT LLModel *construct() {
diff --git a/gpt4all-backend/llmodel.cpp b/gpt4all-backend/llmodel.cpp
index 40276865..ca89800f 100644
--- a/gpt4all-backend/llmodel.cpp
+++ b/gpt4all-backend/llmodel.cpp
@@ -52,7 +52,7 @@ LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
     auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
    assert(get_build_variant);
     m_buildVariant = get_build_variant();
-    m_magicMatch = m_dlhandle->get<bool(std::ifstream&)>("magic_match");
+    m_magicMatch = m_dlhandle->get<bool(const char*)>("magic_match");
     assert(m_magicMatch);
     m_construct = m_dlhandle->get<LLModel *()>("construct");
     assert(m_construct);
@@ -111,10 +111,9 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
     return *libs;
 }
 
-const LLModel::Implementation* LLModel::Implementation::implementation(std::ifstream& f, const std::string& buildVariant) {
+const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
     for (const auto& i : implementationList()) {
-        f.seekg(0);
-        if (!i.m_magicMatch(f)) continue;
+        if (!i.m_magicMatch(fname)) continue;
         if (buildVariant != i.m_buildVariant) continue;
         return &i;
     }
@@ -126,9 +125,6 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
     if (!has_at_least_minimal_hardware())
         return nullptr;
 
-    // Read magic
-    std::ifstream f(modelPath, std::ios::binary);
-    if (!f) return nullptr;
     // Get correct implementation
     const Implementation* impl = nullptr;
 
@@ -161,10 +157,9 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
             buildVariant = "default";
         }
     }
-        impl = implementation(f, buildVariant);
+        impl = implementation(modelPath.c_str(), buildVariant);
         if (!impl) return nullptr;
     }
-    f.close();
 
     // Construct and return llmodel implementation
     auto fres = impl->m_construct();
diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h
index 70d3b0b1..af3e3ff2 100644
--- a/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/llmodel.h
@@ -27,13 +27,13 @@ public:
 
         static bool isImplementation(const Dlhandle&);
         static const std::vector<Implementation>& implementationList();
-        static const Implementation *implementation(std::ifstream& f, const std::string& buildVariant);
+        static const Implementation *implementation(const char *fname, const std::string& buildVariant);
        static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto");
         static void setImplementationsSearchPath(const std::string& path);
         static const std::string& implementationsSearchPath();
 
     private:
-        bool (*m_magicMatch)(std::ifstream& f);
+        bool (*m_magicMatch)(const char *fname);
         LLModel *(*m_construct)();
 
     private:
diff --git a/gpt4all-backend/replit.cpp b/gpt4all-backend/replit.cpp
index 196545e3..3ab330d2 100644
--- a/gpt4all-backend/replit.cpp
+++ b/gpt4all-backend/replit.cpp
@@ -566,7 +566,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,
 
         // a = self.ln_1(x)
         {
-            cur = ggml_norm(ctx0, inpL);
+            cur = ggml_norm(ctx0, inpL, 1e-5f);
 
             cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_weight, cur), cur);
         }
@@ -658,7 +658,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,
 
         // m = self.ln_2(x)
         {
-            cur = ggml_norm(ctx0, inpL);
+            cur = ggml_norm(ctx0, inpL, 1e-5f);
 
             cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_weight, cur), cur);
         }
@@ -682,7 +682,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,
     ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
     // norm
     {
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);
 
         // inpL = ln_f_g*inpL
         inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.ln_f_weight, inpL), inpL);
     }
@@ -1002,7 +1002,8 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }
 
-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char *fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     if (magic != 0x7265706c) return false;
@@ -1027,6 +1028,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
 #else
     return true;
 #endif
+#endif
+    return false;
 }
 
 DLL_EXPORT LLModel *construct() {
diff --git a/gpt4all-backend/starcoder.cpp b/gpt4all-backend/starcoder.cpp
index 1a0ef935..b1836231 100644
--- a/gpt4all-backend/starcoder.cpp
+++ b/gpt4all-backend/starcoder.cpp
@@ -1,10 +1,11 @@
 #define STARCODER_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #include "starcoder_impl.h"
 #include "llama.h"
-#include "llama-util.h"
 #include "utils.h"
 #include "llmodel_shared.h"
 
+#include
+#include
 #include
 #include
 #include
@@ -501,7 +502,7 @@ bool starcoder_eval(
 
         // norm
         {
             // [ 768, N]
-            cur = ggml_norm(ctx0, inpL);
+            cur = ggml_norm(ctx0, inpL, 1e-5f);
 
             // cur = ln_1_g*cur + ln_1_b
             // [ 768, N]
@@ -650,7 +651,7 @@ bool starcoder_eval(
         {
             // norm
             {
-                cur = ggml_norm(ctx0, inpFF);
+                cur = ggml_norm(ctx0, inpFF, 1e-5f);
 
                 // cur = ln_2_g*cur + ln_2_b
                 // [ 768, N]
@@ -707,7 +708,7 @@ bool starcoder_eval(
     // norm
     {
         // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);
 
         // inpL = ln_f_g*inpL + ln_f_b
         // [ 768, N]
@@ -1003,7 +1004,8 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }
 
-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char *fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     if (magic != STARCODER_MAGIC) {
@@ -1015,6 +1017,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
         return false;
     }
     return true;
+#endif
+    return false;
 }
 
 DLL_EXPORT LLModel *construct() {
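The llmodel.h/llmodel.cpp changes above only touch internal plumbing: magic_match and implementation() now take a file path instead of an open stream, while the public construct() entry point is unchanged. A hedged usage sketch follows; it is illustrative rather than code from this repository, the model path is a placeholder, and it assumes the implementation libraries are discoverable on the default search path.

// Illustrative only -- not part of this patch.
#include "llmodel.h"
#include <iostream>

int main() {
    // construct() walks the implementation list and picks the first backend whose
    // magic_match() accepts the file (nullptr if none does, or hardware is too old).
    LLModel *model = LLModel::Implementation::construct("/path/to/model.gguf", "auto");
    if (!model) {
        std::cerr << "no backend claimed this file\n";
        return 1;
    }
    // ... model->loadModel(...), model->prompt(...), etc.
    delete model;
    return 0;
}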
diff --git a/gpt4all-chat/chatllm.cpp b/gpt4all-chat/chatllm.cpp
index ae6359c0..b17bf500 100644
--- a/gpt4all-chat/chatllm.cpp
+++ b/gpt4all-chat/chatllm.cpp
@@ -356,10 +356,10 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
             emit modelLoadingError(QString("Could not find file for model %1").arg(modelInfo.filename()));
         }
 
-    if (m_llModelInfo.model)
+    if (m_llModelInfo.model) {
         setModelInfo(modelInfo);
-
-    processSystemPrompt();
+        processSystemPrompt();
+    }
     return m_llModelInfo.model;
 }
 
diff --git a/gpt4all-chat/main.qml b/gpt4all-chat/main.qml
index 158ddc9a..304112d1 100644
--- a/gpt4all-chat/main.qml
+++ b/gpt4all-chat/main.qml
@@ -189,7 +189,7 @@ Window {
                     + "causes include a bad file format, an incomplete or corrupted download, the wrong file "
                     + "type, not enough system RAM or an incompatible model type. Here are some suggestions for resolving the problem:"
                     + "<br><ul>"
-                    + "<li>Ensure the model file has a compatible ggml format and type"
+                    + "<li>Ensure the model file has a compatible format and type"
                     + "<li>Check the model file is complete in the download folder"
                     + "<li>You can find the download folder in the settings dialog"
                     + "<li>If you've sideloaded the model ensure the file is not corrupt by checking md5sum"
diff --git a/gpt4all-chat/modellist.cpp b/gpt4all-chat/modellist.cpp
index 6aacc82d..2afd7e58 100644
--- a/gpt4all-chat/modellist.cpp
+++ b/gpt4all-chat/modellist.cpp
@@ -796,7 +796,7 @@ void ModelList::updateModelsFromDirectory()
            QString filename = it.fileName();
 
             // All files that end with .bin and have 'ggml' somewhere in the name
-            if ((filename.endsWith(".bin") && filename.contains("ggml") && !filename.startsWith("incomplete"))
+            if (((filename.endsWith(".bin") || filename.endsWith(".gguf")) && (/*filename.contains("ggml") ||*/ filename.contains("gguf")) && !filename.startsWith("incomplete"))
                 || (filename.endsWith(".txt") && filename.startsWith("chatgpt-"))) {
 
                 QString filePath = it.filePath();
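To make the effect of the modellist.cpp filter change above concrete, here is a standalone restatement of the new condition as a small helper; it is illustrative only (the helper name and example filenames are invented for the sketch, not taken from the patch).

// Illustrative only -- a restatement of the updated filter in updateModelsFromDirectory().
#include <QString>

static bool looksLikeLocalModelFile(const QString &filename) {
    return ((filename.endsWith(".bin") || filename.endsWith(".gguf"))
                && filename.contains("gguf")
                && !filename.startsWith("incomplete"))
           || (filename.endsWith(".txt") && filename.startsWith("chatgpt-"));
}

// e.g. "mistral-7b-instruct.Q4_0.gguf"  -> true   (GGUF models are now picked up)
//      "ggml-gpt4all-j-v1.3-groovy.bin" -> false  (old ggml .bin files are no longer listed)
//      "incomplete-llama-2.gguf"        -> false  (partial downloads stay hidden)
//      "chatgpt-gpt-3.5-turbo.txt"      -> true   (ChatGPT stubs are unchanged)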