Latest rebase on llama.cpp with gguf support.

Adam Treat 2023-09-21 12:41:48 -04:00
parent 5f3d739205
commit d90d003a1d
12 changed files with 245 additions and 85 deletions

View File

@@ -345,7 +345,7 @@ void bert_eval(
     // embd norm
     {
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);
         inpL = ggml_add(ctx0,
             ggml_mul(ctx0,
@@ -406,7 +406,7 @@ void bert_eval(
     // attention norm
     {
-        cur = ggml_norm(ctx0, cur);
+        cur = ggml_norm(ctx0, cur, 1e-5f);
         cur = ggml_add(ctx0,
             ggml_mul(ctx0,
@@ -432,7 +432,7 @@ void bert_eval(
     // output norm
     {
-        cur = ggml_norm(ctx0, cur);
+        cur = ggml_norm(ctx0, cur, 1e-5f);
         cur = ggml_add(ctx0,
             ggml_mul(ctx0,
@@ -1038,13 +1038,16 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char* fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     if (magic != 0x62657274) {
         return false;
     }
     return true;
+#endif
+    return false;
 }

 DLL_EXPORT LLModel *construct() {
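Note: `magic_match` now receives a file path instead of an open stream, and the legacy ggml magic check for BERT is parked behind `#if 0` with a blanket `return false` until this backend is ported to gguf. For reference, a minimal sketch of a path-based equivalent of the disabled check, assuming plain stdio; the helper name is hypothetical and not part of the commit, and the magic constant is taken from the original code:

#include <cstdint>
#include <cstdio>

// Sketch only: path-based variant of the disabled stream check above.
static bool looks_like_legacy_bert(const char *fname) {
    std::FILE *f = std::fopen(fname, "rb");
    if (!f)
        return false;
    uint32_t magic = 0;
    // Legacy ggml BERT files start with this 4-byte magic (0x62657274).
    const size_t n = std::fread(&magic, 1, sizeof(magic), f);
    std::fclose(f);
    return n == sizeof(magic) && magic == 0x62657274;
}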

View File

@@ -2,10 +2,11 @@
 #define FALCON_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #include "falcon_impl.h"
 #include "llama.h"
-#include "llama-util.h"
 #include "utils.h"
 #include "llmodel_shared.h"
+#include <stdio.h>
+#include <string.h>
 #include <cassert>
 #include <cinttypes>
 #include <iostream>
@@ -203,22 +204,22 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
         const int n_vocab = hparams.n_vocab;
         const int head_dim = hparams.n_embd / hparams.n_head;

-        ctx_size += ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // tok_embeddings
-        ctx_size += ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm
-        ctx_size += ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm_b
-        ctx_size += ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // lm_head
+        ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // tok_embeddings
+        ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm
+        ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm_b
+        ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // lm_head

         // if (hparams.version == 40) { // Falcon-40B
         //     ctx_size += n_layer * ggml_sizeof_tensor_1d(GGML_TYPE_F32, n_embd); // attention_norm
         //     ctx_size += n_layer * ggml_sizeof_tensor_1d(GGML_TYPE_F32, n_embd); // attention_norm_b
         // }

-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm_b
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * (n_head_kv * 2 + n_head) * head_dim); // query_key_value
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_embd); // wo
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_ff); // ffn_up
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_ff * n_embd); // ffn_down
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm_b
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * (n_head_kv * 2 + n_head) * head_dim); // query_key_value
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_embd); // wo
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_ff); // ffn_up
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_ff * n_embd); // ffn_down

         printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
@@ -494,7 +495,7 @@ bool falcon_eval(
         // self-attention
         {
-            layernorm_output = ggml_norm(ctx0, inpL);
+            layernorm_output = ggml_norm(ctx0, inpL, 1e-5f);
             layernorm_output = ggml_add(ctx0,
                 ggml_mul(ctx0,
@@ -653,7 +654,7 @@ bool falcon_eval(
     // norm
     {
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);
         // inpL = ln_f_g*inpL + ln_f_b
         inpL = ggml_add(ctx0,
@@ -680,7 +681,7 @@ bool falcon_eval(
     // run the computation
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);

     //if (n_past%100 == 0) {
     //    ggml_graph_print   (&gf);
@@ -954,13 +955,14 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char* fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     uint32_t version = 0;
     f.read(reinterpret_cast<char*>(&version), sizeof(version));
     if (magic != FALCON_MAGIC) {
         return false;
     }
     falcon_hparams hparams;
     f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));
@@ -977,6 +979,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
         return false;
     }
     return true;
+#endif
+    return false;
 }

 DLL_EXPORT LLModel *construct() {

@@ -1 +1 @@
-Subproject commit 99c5c9a0d834888c33669855f3a1cf425df37dd2
+Subproject commit 37a0be313d21f8b61184a3adcaac123353128238

View File

@@ -185,7 +185,7 @@ if (LLAMA_KOMPUTE)
         string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
         set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
         message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
         if(CMAKE_GENERATOR MATCHES "Visual Studio")
             add_custom_command(
                 OUTPUT ${OUTPUT_HEADER_FILE}
                 COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
@@ -346,6 +346,13 @@ endif()
 # TODO: probably these flags need to be tweaked on some architectures
 # feel free to update the Makefile for your architecture and send a pull request or issue
 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+if (MSVC)
+    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
+    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
+else ()
+    set(CMAKE_GENERATOR_PLATFORM_LWR "")
+endif ()
 if (NOT MSVC)
     if (LLAMA_STATIC)
         add_link_options(-static)
@@ -361,6 +368,138 @@ if (NOT MSVC)
     endif()
 endif()
+
+if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
+    message(STATUS "ARM detected")
+    if (MSVC)
+        add_compile_definitions(__ARM_NEON)
+        add_compile_definitions(__ARM_FEATURE_FMA)
+        add_compile_definitions(__ARM_FEATURE_DOTPROD)
+        # add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
+        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
+    else()
+        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+            add_compile_options(-mfp16-format=ieee)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
+            # Raspberry Pi 1, Zero
+            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
+            # Raspberry Pi 2
+            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+            # Raspberry Pi 3, 4, Zero 2 (32-bit)
+            add_compile_options(-mno-unaligned-access)
+        endif()
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
+    message(STATUS "x86 detected")
+    if (MSVC)
+        if (LLAMA_AVX512)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
+            # MSVC has no compile-time flags enabling specific
+            # AVX512 extensions, neither it defines the
+            # macros corresponding to the extensions.
+            # Do it manually.
+            if (LLAMA_AVX512_VBMI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+            endif()
+            if (LLAMA_AVX512_VNNI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+            endif()
+        elseif (LLAMA_AVX2)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
+        elseif (LLAMA_AVX)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
+        endif()
+    else()
+        if (LLAMA_F16C)
+            add_compile_options(-mf16c)
+        endif()
+        if (LLAMA_FMA)
+            add_compile_options(-mfma)
+        endif()
+        if (LLAMA_AVX)
+            add_compile_options(-mavx)
+        endif()
+        if (LLAMA_AVX2)
+            add_compile_options(-mavx2)
+        endif()
+        if (LLAMA_AVX512)
+            add_compile_options(-mavx512f)
+            add_compile_options(-mavx512bw)
+        endif()
+        if (LLAMA_AVX512_VBMI)
+            add_compile_options(-mavx512vbmi)
+        endif()
+        if (LLAMA_AVX512_VNNI)
+            add_compile_options(-mavx512vnni)
+        endif()
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+    message(STATUS "PowerPC detected")
+    add_compile_options(-mcpu=native -mtune=native)
+    #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+else()
+    message(STATUS "Unknown architecture")
+endif()
+
+#
+# POSIX conformance
+#
+
+# clock_gettime came in POSIX.1b (1993)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
+# posix_memalign came in POSIX.1-2001 / SUSv3
+# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
+add_compile_definitions(_XOPEN_SOURCE=600)
+
+# Somehow in OpenBSD whenever POSIX conformance is specified
+# some string functions rely on locale_t availability,
+# which was introduced in POSIX.1-2008, forcing us to go higher
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    remove_definitions(-D_XOPEN_SOURCE=600)
+    add_compile_definitions(_XOPEN_SOURCE=700)
+endif()
+
+# Data types, macros and functions related to controlling CPU affinity and
+# some memory allocation are available on Linux through GNU extensions in libc
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions(_GNU_SOURCE)
+endif()
+
+# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
+# and on macOS its availability depends on enabling Darwin extensions
+# similarly on DragonFly, enabling BSD extensions is necessary
+if (
+    CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
+    CMAKE_SYSTEM_NAME MATCHES "iOS" OR
+    CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
+    CMAKE_SYSTEM_NAME MATCHES "DragonFly"
+)
+    add_compile_definitions(_DARWIN_C_SOURCE)
+endif()
+
+# alloca is a non-standard interface that is not visible on BSDs when
+# POSIX conformance is specified, but not all of them provide a clean way
+# to enable it in such cases
+if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+    add_compile_definitions(__BSD_VISIBLE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+    add_compile_definitions(_NETBSD_SOURCE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    add_compile_definitions(_BSD_SOURCE)
+endif()

 function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
     message(STATUS "Configuring ggml implementation target llama${SUFFIX} in ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}")
@@ -468,15 +607,14 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
     if (WITH_LLAMA)
         # Backwards compatibility with old llama.cpp versions
-        set(LLAMA_UTIL_SOURCE_FILE llama-util.h)
+        # set(LLAMA_UTIL_SOURCE_FILE llama-util.h)
         if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
             set(LLAMA_UTIL_SOURCE_FILE llama_util.h)
         endif()

         add_library(llama${SUFFIX} STATIC
             ${DIRECTORY}/llama.cpp
-            ${DIRECTORY}/llama.h
-            ${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
+            ${DIRECTORY}/llama.h)

         if (LLAMA_METAL AND GGML_METAL_SOURCES)
             target_compile_definitions(llama${SUFFIX} PUBLIC GGML_USE_METAL GGML_METAL_NDEBUG)

View File

@@ -226,9 +226,9 @@ size_t LLamaModel::restoreState(const uint8_t *src)
 std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str) const
 {
-    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos());
+    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos(d_ptr->ctx));
     std::vector<LLModel::Token> fres(str.size()+4);
-    auto fres_len = llama_tokenize(d_ptr->ctx, str.c_str(), fres.data(), fres.size(), useBOS);
+    auto fres_len = llama_tokenize(d_ptr->ctx, str.c_str(), str.length(), fres.data(), fres.size(), useBOS);
     fres.resize(fres_len);
     return fres;
 }
@@ -250,10 +250,10 @@ LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
 bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
 {
     // When we recalculate context we could have erased the original BOS token... we need to replace it
-    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos());
+    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos(d_ptr->ctx));
     if (useBOS) {
         std::vector<int32_t> myTokens;
-        myTokens.push_back(llama_token_bos());
+        myTokens.push_back(llama_token_bos(d_ptr->ctx));
         myTokens.insert(myTokens.end(), tokens.begin(), tokens.end());
         ctx.n_past += 1;
         return llama_eval(d_ptr->ctx, myTokens.data(), myTokens.size(), ctx.n_past, d_ptr->n_threads) == 0;
@@ -268,7 +268,7 @@ int32_t LLamaModel::contextLength() const
 const std::vector<LLModel::Token> &LLamaModel::endTokens() const
 {
-    static const std::vector<LLModel::Token> fres = {llama_token_eos()};
+    static const std::vector<LLModel::Token> fres = {llama_token_eos(d_ptr->ctx)};
     return fres;
 }
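The hunks above track llama.cpp's gguf-era API: `llama_token_bos`/`llama_token_eos` now take the `llama_context`, and `llama_tokenize` takes an explicit text length. A hedged sketch of the tokenize call pattern used here; the wrapper function and the +4 slack are illustrative, not part of the commit:

#include <string>
#include <vector>
#include "llama.h"

// Sketch: tokenizing with the context-aware llama.cpp API used in this diff.
// `ctx` is assumed to be an already-initialized llama_context.
static std::vector<llama_token> tokenize_text(llama_context *ctx, const std::string &text, bool add_bos) {
    std::vector<llama_token> tokens(text.size() + 4);  // small slack, as in LLamaModel::tokenize
    const int n = llama_tokenize(ctx, text.c_str(), (int) text.length(),
                                 tokens.data(), (int) tokens.size(), add_bos);
    tokens.resize(n < 0 ? 0 : n);  // a negative return signals the buffer was too small
    return tokens;
}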
@@ -351,6 +351,16 @@ bool LLamaModel::usingGPUDevice()
     return false;
 }

+std::string get_arch_name(gguf_context *ctx_gguf) {
+    std::string arch_name;
+    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
+    enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
+    if (ktype != (GGUF_TYPE_STRING)) {
+        throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
+    }
+    return gguf_get_val_str(ctx_gguf, kid);
+}
+
 #if defined(_WIN32)
 #define DLL_EXPORT __declspec(dllexport)
 #else
@@ -370,39 +380,42 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
-    // Check magic
-    uint32_t magic = 0;
-    f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
-    if (magic != 0x67676a74) return false;
-    // Check version
-    uint32_t version = 0;
-    f.read(reinterpret_cast<char*>(&version), sizeof(version));
-    if (!(version LLAMA_VERSIONS)) {
-        return false;
-    }
-    llama_file_hparams hparams;
-    f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));
-    if (!(hparams.n_vocab >= 32000 && hparams.n_vocab <= 32100)) {
-        return false; // not a llama.
-    }
-#ifdef GGML_USE_METAL
-    // Check quant supported on metal
-    // skip fields
-    switch(hparams.ftype) {
-        // currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
-        case LLAMA_FTYPE_MOSTLY_F16:
-        case LLAMA_FTYPE_MOSTLY_Q2_K:
-        case LLAMA_FTYPE_MOSTLY_Q4_0:
-        case LLAMA_FTYPE_MOSTLY_Q6_K:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M:
-            return true;
-        default: // unsupported quant-type for Metal
-            return false;
-    }
-#endif
-    return true;
+DLL_EXPORT bool magic_match(const char * fname) {
+    struct ggml_context * ctx_meta = NULL;
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ &ctx_meta,
+    };
+    gguf_context *ctx_gguf = gguf_init_from_file(fname, params);
+    if (!ctx_gguf)
+        return false;
+
+    bool isValid = gguf_get_version(ctx_gguf) <= 2;
+    isValid = get_arch_name(ctx_gguf) != "llama" ? false : isValid;
+
+#ifdef GGML_USE_METAL
+    const int n_tensors = gguf_get_n_tensors(ctx_gguf);
+    for (int i = 0; i < n_tensors; i++) {
+        const char * name = gguf_get_tensor_name(ctx_gguf, i);
+        struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name);
+        switch(meta->type) {
+            // currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
+            case LLAMA_FTYPE_MOSTLY_F16:
+            case LLAMA_FTYPE_MOSTLY_Q2_K:
+            case LLAMA_FTYPE_MOSTLY_Q4_0:
+            case LLAMA_FTYPE_MOSTLY_Q6_K:
+            case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+            case LLAMA_FTYPE_MOSTLY_Q4_K_M:
+                break;
+            default: // unsupported quant-type for Metal
+                isValid = false;
+        }
+    }
+#endif
+
+    gguf_free(ctx_gguf);
+    return isValid;
 }

 DLL_EXPORT LLModel *construct() {
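With gguf, the llama backend validates a model by reading only its metadata: `gguf_init_from_file` with `no_alloc = true` loads key/value pairs and tensor info without tensor data, and the check then inspects `general.architecture` and the gguf version. A standalone sketch of that probe, using the same gguf calls as the code above; the missing-key guard and the `ggml_free` of the metadata context are defensive additions assumed here, not part of the commit:

#include <string>
#include "ggml.h"  // gguf_* metadata API shipped with llama.cpp/ggml

// Sketch: probe a .gguf file's metadata without loading tensor data.
static bool is_supported_llama_gguf(const char *fname) {
    struct ggml_context *ctx_meta = NULL;
    struct gguf_init_params params = {
        /*.no_alloc =*/ true,   // metadata only, no tensor data
        /*.ctx      =*/ &ctx_meta,
    };
    struct gguf_context *ctx_gguf = gguf_init_from_file(fname, params);
    if (!ctx_gguf)
        return false;

    bool ok = gguf_get_version(ctx_gguf) <= 2;
    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
    if (kid < 0 || gguf_get_kv_type(ctx_gguf, kid) != GGUF_TYPE_STRING)
        ok = false;  // defensive: key missing or not a string
    else if (std::string(gguf_get_val_str(ctx_gguf, kid)) != "llama")
        ok = false;

    if (ctx_meta)
        ggml_free(ctx_meta);
    gguf_free(ctx_gguf);
    return ok;
}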

View File

@@ -52,7 +52,7 @@ LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
     auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
     assert(get_build_variant);
     m_buildVariant = get_build_variant();
-    m_magicMatch = m_dlhandle->get<bool(std::ifstream&)>("magic_match");
+    m_magicMatch = m_dlhandle->get<bool(const char*)>("magic_match");
     assert(m_magicMatch);
     m_construct = m_dlhandle->get<LLModel *()>("construct");
     assert(m_construct);
@@ -111,10 +111,9 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
     return *libs;
 }

-const LLModel::Implementation* LLModel::Implementation::implementation(std::ifstream& f, const std::string& buildVariant) {
+const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
     for (const auto& i : implementationList()) {
-        f.seekg(0);
-        if (!i.m_magicMatch(f)) continue;
+        if (!i.m_magicMatch(fname)) continue;
         if (buildVariant != i.m_buildVariant) continue;
         return &i;
     }
@@ -126,9 +125,6 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
     if (!has_at_least_minimal_hardware())
         return nullptr;

-    // Read magic
-    std::ifstream f(modelPath, std::ios::binary);
-    if (!f) return nullptr;

     // Get correct implementation
     const Implementation* impl = nullptr;
@@ -161,10 +157,9 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
                 buildVariant = "default";
             }
         }
-        impl = implementation(f, buildVariant);
+        impl = implementation(modelPath.c_str(), buildVariant);
         if (!impl) return nullptr;
     }
-    f.close();

     // Construct and return llmodel implementation
     auto fres = impl->m_construct();
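Since the stream plumbing is gone, callers hand `construct()` a path and each backend's `magic_match(fname)` decides whether it can handle the file. A minimal usage sketch of the public entry point; the model path is illustrative and the single-argument `loadModel(path)` call is assumed from the API at this point in the tree:

#include <string>
#include "llmodel.h"

int main() {
    const std::string path = "/path/to/model.gguf";  // hypothetical model file

    // "auto" resolves a build variant (e.g. "avxonly" vs "default") before
    // each backend's magic_match(path) is consulted.
    LLModel *model = LLModel::Implementation::construct(path, "auto");
    if (!model)
        return 1;  // no backend recognized the file

    if (!model->loadModel(path)) {
        delete model;
        return 1;
    }

    // ... run prompts against the loaded model ...
    delete model;
    return 0;
}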

View File

@@ -27,13 +27,13 @@ public:
         static bool isImplementation(const Dlhandle&);
         static const std::vector<Implementation>& implementationList();
-        static const Implementation *implementation(std::ifstream& f, const std::string& buildVariant);
+        static const Implementation *implementation(const char *fname, const std::string& buildVariant);
         static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto");
         static void setImplementationsSearchPath(const std::string& path);
         static const std::string& implementationsSearchPath();

     private:
-        bool (*m_magicMatch)(std::ifstream& f);
+        bool (*m_magicMatch)(const char *fname);
         LLModel *(*m_construct)();

     private:

View File

@@ -566,7 +566,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,
         // a = self.ln_1(x)
         {
-            cur = ggml_norm(ctx0, inpL);
+            cur = ggml_norm(ctx0, inpL, 1e-5f);
             cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_weight, cur), cur);
         }
@@ -658,7 +658,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,
         // m = self.ln_2(x)
         {
-            cur = ggml_norm(ctx0, inpL);
+            cur = ggml_norm(ctx0, inpL, 1e-5f);
             cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_weight, cur), cur);
         }
@@ -682,7 +682,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,
     ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
     // norm
     {
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);
         // inpL = ln_f_g*inpL
         inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.ln_f_weight, inpL), inpL);
     }
@@ -1002,7 +1002,8 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char *fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     if (magic != 0x7265706c) return false;
@@ -1027,6 +1028,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
 #else
     return true;
 #endif
+#endif
+    return false;
 }

 DLL_EXPORT LLModel *construct() {

View File

@@ -1,10 +1,11 @@
 #define STARCODER_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #include "starcoder_impl.h"
 #include "llama.h"
-#include "llama-util.h"
 #include "utils.h"
 #include "llmodel_shared.h"
+#include <stdio.h>
+#include <string.h>
 #include <cassert>
 #include <cinttypes>
 #include <iostream>
@@ -501,7 +502,7 @@ bool starcoder_eval(
         // norm
         {
             // [ 768, N]
-            cur = ggml_norm(ctx0, inpL);
+            cur = ggml_norm(ctx0, inpL, 1e-5f);
             // cur = ln_1_g*cur + ln_1_b
             // [ 768, N]
@@ -650,7 +651,7 @@ bool starcoder_eval(
     {
         // norm
         {
-            cur = ggml_norm(ctx0, inpFF);
+            cur = ggml_norm(ctx0, inpFF, 1e-5f);
             // cur = ln_2_g*cur + ln_2_b
             // [ 768, N]
@@ -707,7 +708,7 @@ bool starcoder_eval(
     // norm
     {
         // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);
         // inpL = ln_f_g*inpL + ln_f_b
         // [ 768, N]
@@ -1003,7 +1004,8 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char *fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     if (magic != STARCODER_MAGIC) {
@@ -1015,6 +1017,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
         return false;
     }
     return true;
+#endif
+    return false;
 }

 DLL_EXPORT LLModel *construct() {

View File

@@ -356,10 +356,10 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
             emit modelLoadingError(QString("Could not find file for model %1").arg(modelInfo.filename()));
         }

-        if (m_llModelInfo.model)
+        if (m_llModelInfo.model) {
             setModelInfo(modelInfo);
-        processSystemPrompt();
+            processSystemPrompt();
+        }
         return m_llModelInfo.model;
     }

View File

@@ -189,7 +189,7 @@ Window {
                 + "causes include a bad file format, an incomplete or corrupted download, the wrong file "
                 + "type, not enough system RAM or an incompatible model type. Here are some suggestions for resolving the problem:"
                 + "<br><ul>"
-                + "<li>Ensure the model file has a compatible ggml format and type"
+                + "<li>Ensure the model file has a compatible format and type"
                 + "<li>Check the model file is complete in the download folder"
                 + "<li>You can find the download folder in the settings dialog"
                 + "<li>If you've sideloaded the model ensure the file is not corrupt by checking md5sum"

View File

@@ -796,7 +796,7 @@ void ModelList::updateModelsFromDirectory()
             QString filename = it.fileName();

             // All files that end with .bin and have 'ggml' somewhere in the name
-            if ((filename.endsWith(".bin") && filename.contains("ggml") && !filename.startsWith("incomplete"))
+            if (((filename.endsWith(".bin") || filename.endsWith(".gguf")) && (/*filename.contains("ggml") ||*/ filename.contains("gguf")) && !filename.startsWith("incomplete"))
                 || (filename.endsWith(".txt") && filename.startsWith("chatgpt-"))) {
                 QString filePath = it.filePath();
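The directory scan now also picks up `.gguf` files. A hedged restatement of the updated filter as a standalone predicate; the helper itself is not in the commit and is shown only to make the condition easier to read:

#include <QString>

// Sketch: the acceptance test implied by the updated condition above.
static bool isCandidateModelFile(const QString &filename)
{
    const bool localModel = (filename.endsWith(".bin") || filename.endsWith(".gguf"))
        && filename.contains("gguf")           // the old "ggml" name check is commented out
        && !filename.startsWith("incomplete");
    const bool chatGptStub = filename.endsWith(".txt") && filename.startsWith("chatgpt-");
    return localModel || chatGptStub;
}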