Mirror of https://github.com/nomic-ai/gpt4all.git, synced 2025-10-30 21:30:42 +00:00

Latest rebase on llama.cpp with gguf support.
@@ -345,7 +345,7 @@ void bert_eval(

     // embd norm
     {
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);

         inpL = ggml_add(ctx0,
                         ggml_mul(ctx0,
@@ -406,7 +406,7 @@ void bert_eval(

         // attention norm
         {
-            cur = ggml_norm(ctx0, cur);
+            cur = ggml_norm(ctx0, cur, 1e-5f);

             cur = ggml_add(ctx0,
                            ggml_mul(ctx0,
@@ -432,7 +432,7 @@ void bert_eval(

         // output norm
         {
-            cur = ggml_norm(ctx0, cur);
+            cur = ggml_norm(ctx0, cur, 1e-5f);

             cur = ggml_add(ctx0,
                            ggml_mul(ctx0,
@@ -1038,13 +1038,16 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char* fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     if (magic != 0x62657274) {
          return false;
     }
     return true;
+#endif
+    return false;
 }

 DLL_EXPORT LLModel *construct() {

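A note on the recurring ggml_norm change in these hunks: after this rebase, ggml_norm takes the layer-norm epsilon as an explicit argument instead of relying on a constant compiled into ggml. A minimal sketch of the new call pattern (ctx0, x, ln_w and ln_b are placeholder names, not identifiers from this diff; 1e-5f is simply the value used throughout the commit):

    #include "ggml.h"

    // Sketch: one layer-norm block under the rebased API.
    static struct ggml_tensor * layer_norm(struct ggml_context * ctx0,
                                           struct ggml_tensor * x,
                                           struct ggml_tensor * ln_w,
                                           struct ggml_tensor * ln_b) {
        // Pre-rebase this was ggml_norm(ctx0, x): eps lived inside ggml.
        // Post-rebase the caller passes eps explicitly at every call site.
        struct ggml_tensor * y = ggml_norm(ctx0, x, 1e-5f);
        // ggml_norm only normalizes; the learned scale and shift remain
        // explicit ops, exactly as in the add/mul pairs above.
        return ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, ln_w, y), y), ln_b);
    }

Passing eps per call lets one ggml build serve models trained with different epsilon values instead of requiring a recompile.
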
@@ -2,10 +2,11 @@
 #define FALCON_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #include "falcon_impl.h"
 #include "llama.h"
-#include "llama-util.h"
 #include "utils.h"
+#include "llmodel_shared.h"

 #include <stdio.h>
+#include <string.h>
 #include <cassert>
 #include <cinttypes>
 #include <iostream>
@@ -203,22 +204,22 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
         const int n_vocab = hparams.n_vocab;
         const int head_dim = hparams.n_embd / hparams.n_head;

-        ctx_size += ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab;  // tok_embeddings
-        ctx_size += ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd;  // output_norm
-        ctx_size += ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd;  // output_norm_b
-        ctx_size += ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab;  // lm_head
+        ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab;  // tok_embeddings
+        ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd;  // output_norm
+        ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd;  // output_norm_b
+        ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab;  // lm_head

         // if (hparams.version == 40) { // Falcon-40B
         //     ctx_size += n_layer * ggml_sizeof_tensor_1d(GGML_TYPE_F32, n_embd);  // attention_norm
         //     ctx_size += n_layer * ggml_sizeof_tensor_1d(GGML_TYPE_F32, n_embd);  // attention_norm_b
         // }
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd);  // input_layernorm
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd);  // input_layernorm_b
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * (n_head_kv * 2 + n_head) * head_dim);  // query_key_value
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_embd);  // wo
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_ff);  // ffn_up
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_ff * n_embd);  // ffn_down
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd);  // input_layernorm
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd);  // input_layernorm_b
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * (n_head_kv * 2 + n_head) * head_dim);  // query_key_value
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_embd);  // wo
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_ff);  // ffn_up
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_ff * n_embd);  // ffn_down

         printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
@@ -494,7 +495,7 @@ bool falcon_eval(

         // self-attention
         {
-            layernorm_output = ggml_norm(ctx0, inpL);
+            layernorm_output = ggml_norm(ctx0, inpL, 1e-5f);

             layernorm_output = ggml_add(ctx0,
                     ggml_mul(ctx0,
@@ -653,7 +654,7 @@ bool falcon_eval(

     // norm
     {
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);

         // inpL = ln_f_g*inpL + ln_f_b
         inpL = ggml_add(ctx0,
@@ -680,7 +681,7 @@ bool falcon_eval(
     // run the computation
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);
-   
+

     //if (n_past%100 == 0) {
     //    ggml_graph_print   (&gf);
@@ -954,13 +955,14 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char* fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     uint32_t version = 0;
     f.read(reinterpret_cast<char*>(&version), sizeof(version));
     if (magic != FALCON_MAGIC) {
-         return false; 
+         return false;
     }
     falcon_hparams hparams;
     f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));
@@ -977,6 +979,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
         return false;
     }
     return true;
+#endif
+    return false;
 }

 DLL_EXPORT LLModel *construct() {

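The ctx_size hunk above is a buffer-size estimate, not an allocation: falcon_model_load sums the expected footprint of every weight tensor before creating the ggml context. The rebased ggml aligns each tensor's data up to GGML_MEM_ALIGN inside that buffer, so an estimate that omits the padding can come up short and fail late, mid-load. A sketch of the per-tensor formula the new lines apply (tensor_estimate is a hypothetical helper, not part of the diff):

    #include <stdint.h>
    #include "ggml.h"

    // Worst-case bytes for one tensor in a ggml context: alignment padding,
    // ggml's fixed per-tensor bookkeeping, then the (possibly quantized) data.
    static size_t tensor_estimate(enum ggml_type type, int64_t n_elements) {
        return GGML_MEM_ALIGN + ggml_tensor_overhead()
             + (size_t)(ggml_type_sizef(type) * n_elements);
    }

    // e.g. the token-embedding line above is equivalent to:
    // ctx_size += tensor_estimate(wtype, (int64_t)n_embd * n_vocab);  // tok_embeddings
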
Submodule gpt4all-backend/llama.cpp-mainline updated: 99c5c9a0d8...37a0be313d

@@ -185,7 +185,7 @@ if (LLAMA_KOMPUTE)
         string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
         set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
         message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
-        if(CMAKE_GENERATOR MATCHES "Visual Studio") 
+        if(CMAKE_GENERATOR MATCHES "Visual Studio")
             add_custom_command(
               OUTPUT ${OUTPUT_HEADER_FILE}
               COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
@@ -346,6 +346,13 @@ endif()
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+if (MSVC)
+  string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
+  message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
+else ()
+  set(CMAKE_GENERATOR_PLATFORM_LWR "")
+endif ()
+
 if (NOT MSVC)
     if (LLAMA_STATIC)
         add_link_options(-static)
@@ -361,6 +368,138 @@ if (NOT MSVC)
     endif()
 endif()

+if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
+    message(STATUS "ARM detected")
+    if (MSVC)
+        add_compile_definitions(__ARM_NEON)
+        add_compile_definitions(__ARM_FEATURE_FMA)
+        add_compile_definitions(__ARM_FEATURE_DOTPROD)
+        # add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
+        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
+    else()
+        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+            add_compile_options(-mfp16-format=ieee)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
+            # Raspberry Pi 1, Zero
+            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
+            # Raspberry Pi 2
+            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+            # Raspberry Pi 3, 4, Zero 2 (32-bit)
+            add_compile_options(-mno-unaligned-access)
+        endif()
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
+    message(STATUS "x86 detected")
+    if (MSVC)
+        if (LLAMA_AVX512)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
+            # MSVC has no compile-time flags enabling specific
+            # AVX512 extensions, neither it defines the
+            # macros corresponding to the extensions.
+            # Do it manually.
+            if (LLAMA_AVX512_VBMI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+            endif()
+            if (LLAMA_AVX512_VNNI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+            endif()
+        elseif (LLAMA_AVX2)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
+        elseif (LLAMA_AVX)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
+        endif()
+    else()
+        if (LLAMA_F16C)
+            add_compile_options(-mf16c)
+        endif()
+        if (LLAMA_FMA)
+            add_compile_options(-mfma)
+        endif()
+        if (LLAMA_AVX)
+            add_compile_options(-mavx)
+        endif()
+        if (LLAMA_AVX2)
+            add_compile_options(-mavx2)
+        endif()
+        if (LLAMA_AVX512)
+            add_compile_options(-mavx512f)
+            add_compile_options(-mavx512bw)
+        endif()
+        if (LLAMA_AVX512_VBMI)
+            add_compile_options(-mavx512vbmi)
+        endif()
+        if (LLAMA_AVX512_VNNI)
+            add_compile_options(-mavx512vnni)
+        endif()
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+    message(STATUS "PowerPC detected")
+    add_compile_options(-mcpu=native -mtune=native)
+    #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+else()
+    message(STATUS "Unknown architecture")
+endif()
+
+#
+# POSIX conformance
+#
+
+# clock_gettime came in POSIX.1b (1993)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
+# posix_memalign came in POSIX.1-2001 / SUSv3
+# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
+add_compile_definitions(_XOPEN_SOURCE=600)
+
+# Somehow in OpenBSD whenever POSIX conformance is specified
+# some string functions rely on locale_t availability,
+# which was introduced in POSIX.1-2008, forcing us to go higher
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    remove_definitions(-D_XOPEN_SOURCE=600)
+    add_compile_definitions(_XOPEN_SOURCE=700)
+endif()
+
+# Data types, macros and functions related to controlling CPU affinity and
+# some memory allocation are available on Linux through GNU extensions in libc
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions(_GNU_SOURCE)
+endif()
+
+# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
+# and on macOS its availability depends on enabling Darwin extensions
+# similarly on DragonFly, enabling BSD extensions is necessary
+if (
+    CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
+    CMAKE_SYSTEM_NAME MATCHES "iOS" OR
+    CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
+    CMAKE_SYSTEM_NAME MATCHES "DragonFly"
+)
+    add_compile_definitions(_DARWIN_C_SOURCE)
+endif()
+
+# alloca is a non-standard interface that is not visible on BSDs when
+# POSIX conformance is specified, but not all of them provide a clean way
+# to enable it in such cases
+if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+    add_compile_definitions(__BSD_VISIBLE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+    add_compile_definitions(_NETBSD_SOURCE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    add_compile_definitions(_BSD_SOURCE)
+endif()
+
 function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
     message(STATUS "Configuring ggml implementation target llama${SUFFIX} in ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}")

@@ -468,15 +607,14 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)

     if (WITH_LLAMA)
         # Backwards compatibility with old llama.cpp versions
-        set(LLAMA_UTIL_SOURCE_FILE llama-util.h)
+#        set(LLAMA_UTIL_SOURCE_FILE llama-util.h)
         if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
             set(LLAMA_UTIL_SOURCE_FILE llama_util.h)
         endif()

         add_library(llama${SUFFIX} STATIC
                     ${DIRECTORY}/llama.cpp
-                    ${DIRECTORY}/llama.h
-                    ${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
+                    ${DIRECTORY}/llama.h)

         if (LLAMA_METAL AND GGML_METAL_SOURCES)
             target_compile_definitions(llama${SUFFIX} PUBLIC GGML_USE_METAL GGML_METAL_NDEBUG)

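Two things happen in the CMake hunk. First, CMAKE_GENERATOR_PLATFORM_LWR is computed so that ARM64 cross-builds under Visual Studio (where CMAKE_SYSTEM_PROCESSOR reports the host, e.g. AMD64) are still detected via the `-A arm64` generator platform. Second, as the inline comment notes, MSVC's /arch:AVX512 has no per-extension switches and does not define the per-extension macros, so the build defines __AVX512VBMI__ and __AVX512VNNI__ by hand; that keeps guarded C/C++ code compiler-agnostic. An illustrative consumer (a hypothetical function, not a line from this diff):

    // On MSVC these macros come from add_compile_definitions() above;
    // on GCC/Clang they are implied by -mavx512vnni / -mavx2.
    static const char * best_isa(void) {
    #if defined(__AVX512VNNI__)
        return "avx512vnni";
    #elif defined(__AVX2__)
        return "avx2";
    #else
        return "scalar";
    #endif
    }
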
@@ -226,9 +226,9 @@ size_t LLamaModel::restoreState(const uint8_t *src)

 std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str) const
 {
-    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos());
+    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos(d_ptr->ctx));
     std::vector<LLModel::Token> fres(str.size()+4);
-    auto fres_len = llama_tokenize(d_ptr->ctx, str.c_str(), fres.data(), fres.size(), useBOS);
+    auto fres_len = llama_tokenize(d_ptr->ctx, str.c_str(), str.length(), fres.data(), fres.size(), useBOS);
     fres.resize(fres_len);
     return fres;
 }
@@ -250,10 +250,10 @@ LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
 bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
 {
     // When we recalculate context we could have erased the original BOS token... we need to replace it
-    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos());
+    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos(d_ptr->ctx));
     if (useBOS) {
         std::vector<int32_t> myTokens;
-        myTokens.push_back(llama_token_bos());
+        myTokens.push_back(llama_token_bos(d_ptr->ctx));
         myTokens.insert(myTokens.end(), tokens.begin(), tokens.end());
         ctx.n_past += 1;
         return llama_eval(d_ptr->ctx, myTokens.data(), myTokens.size(), ctx.n_past, d_ptr->n_threads) == 0;
@@ -268,7 +268,7 @@ int32_t LLamaModel::contextLength() const

 const std::vector<LLModel::Token> &LLamaModel::endTokens() const
 {
-    static const std::vector<LLModel::Token> fres = {llama_token_eos()};
+    static const std::vector<LLModel::Token> fres = {llama_token_eos(d_ptr->ctx)};
     return fres;
 }

@@ -351,6 +351,16 @@ bool LLamaModel::usingGPUDevice()
     return false;
 }

+std::string get_arch_name(gguf_context *ctx_gguf) {
+    std::string arch_name;
+    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
+    enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
+    if (ktype != (GGUF_TYPE_STRING)) {
+        throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
+    }
+    return gguf_get_val_str(ctx_gguf, kid);
+}
+
 #if defined(_WIN32)
 #define DLL_EXPORT __declspec(dllexport)
 #else
@@ -370,39 +380,42 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
-    // Check magic
-    uint32_t magic = 0;
-    f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
-    if (magic != 0x67676a74) return false;
-    // Check version
-    uint32_t version = 0;
-    f.read(reinterpret_cast<char*>(&version), sizeof(version));
-    if (!(version LLAMA_VERSIONS)) {
+DLL_EXPORT bool magic_match(const char * fname) {
+
+    struct ggml_context * ctx_meta = NULL;
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ &ctx_meta,
+    };
+    gguf_context *ctx_gguf = gguf_init_from_file(fname, params);
+    if (!ctx_gguf)
         return false;
-    }
-    llama_file_hparams hparams;
-    f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));
-    if (!(hparams.n_vocab >= 32000 && hparams.n_vocab <= 32100)) {
-        return false; // not a llama.
-    }

+    bool isValid = gguf_get_version(ctx_gguf) <= 2;
+    isValid = get_arch_name(ctx_gguf) != "llama" ? false : isValid;
+
 #ifdef GGML_USE_METAL
     // Check quant supported on metal
-    // skip fields
-    switch(hparams.ftype) {
-        // currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
-        case LLAMA_FTYPE_MOSTLY_F16:
-        case LLAMA_FTYPE_MOSTLY_Q2_K:
-        case LLAMA_FTYPE_MOSTLY_Q4_0:
-        case LLAMA_FTYPE_MOSTLY_Q6_K:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M:
-            return true;
-        default: // unsupported quant-type for Metal
-            return false;
+    const int n_tensors = gguf_get_n_tensors(ctx_gguf);
+    for (int i = 0; i < n_tensors; i++) {
+        const char * name = gguf_get_tensor_name(ctx_gguf, i);
+        struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name);
+        switch(meta->type) {
+            // currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
+            case LLAMA_FTYPE_MOSTLY_F16:
+            case LLAMA_FTYPE_MOSTLY_Q2_K:
+            case LLAMA_FTYPE_MOSTLY_Q4_0:
+            case LLAMA_FTYPE_MOSTLY_Q6_K:
+            case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+            case LLAMA_FTYPE_MOSTLY_Q4_K_M:
+                break;
+            default: // unsupported quant-type for Metal
+                isValid = false;
+        }
     }
 #endif
-    return true;
+
+    gguf_free(ctx_gguf);
+    return isValid;
 }

 DLL_EXPORT LLModel *construct() {

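This is the core of the llama backend change: model detection no longer sniffs a magic uint32 from a stream; it asks gguf to parse the file's metadata (with no_alloc set, so tensor data is never read) and then inspects the general.architecture key. A self-contained sketch of the same flow, with the missing-key case handled explicitly (the get_arch_name shown above would index with kid == -1 if the key were absent):

    #include <string.h>
    #include "ggml.h"   // the gguf_* API ships with ggml as of this rebase

    static bool looks_like_llama_gguf(const char * fname) {
        struct ggml_context * ctx_meta = NULL;
        struct gguf_init_params params = {
            /*.no_alloc = */ true,      // parse metadata only
            /*.ctx      = */ &ctx_meta, // receives tensor descriptions, no data
        };
        struct gguf_context * ctx_gguf = gguf_init_from_file(fname, params);
        if (!ctx_gguf)
            return false;               // not a gguf file at all

        bool ok = gguf_get_version(ctx_gguf) <= 2;
        const int kid = gguf_find_key(ctx_gguf, "general.architecture");
        ok = ok && kid >= 0
                && gguf_get_kv_type(ctx_gguf, kid) == GGUF_TYPE_STRING
                && strcmp(gguf_get_val_str(ctx_gguf, kid), "llama") == 0;

        gguf_free(ctx_gguf);
        if (ctx_meta) ggml_free(ctx_meta);
        return ok;
    }

One caveat worth flagging in the Metal branch above: meta->type is a ggml_type, while the case labels are LLAMA_FTYPE_MOSTLY_* whole-file quantization constants; the two enumerations are distinct and only partially aligned, so treat that per-tensor check as transitional.
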
@@ -52,7 +52,7 @@ LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
     auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
     assert(get_build_variant);
     m_buildVariant = get_build_variant();
-    m_magicMatch = m_dlhandle->get<bool(std::ifstream&)>("magic_match");
+    m_magicMatch = m_dlhandle->get<bool(const char*)>("magic_match");
     assert(m_magicMatch);
     m_construct = m_dlhandle->get<LLModel *()>("construct");
     assert(m_construct);
@@ -111,10 +111,9 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
     return *libs;
 }

-const LLModel::Implementation* LLModel::Implementation::implementation(std::ifstream& f, const std::string& buildVariant) {
+const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
     for (const auto& i : implementationList()) {
-        f.seekg(0);
-        if (!i.m_magicMatch(f)) continue;
+        if (!i.m_magicMatch(fname)) continue;
         if (buildVariant != i.m_buildVariant) continue;
         return &i;
     }
@@ -126,9 +125,6 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
     if (!has_at_least_minimal_hardware())
         return nullptr;

-    // Read magic
-    std::ifstream f(modelPath, std::ios::binary);
-    if (!f) return nullptr;
     // Get correct implementation
     const Implementation* impl = nullptr;

@@ -161,10 +157,9 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
                 buildVariant = "default";
             }
         }
-        impl = implementation(f, buildVariant);
+        impl = implementation(modelPath.c_str(), buildVariant);
         if (!impl) return nullptr;
     }
-    f.close();

     // Construct and return llmodel implementation
     auto fres = impl->m_construct();

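Threading a const char* path through magic_match instead of a shared std::ifstream has two visible effects: the core no longer owns any file handle (the open/seekg/close choreography disappears above), and each plugin can open the file with whatever machinery it needs; the gguf loader in particular wants a path for gguf_init_from_file, not a stream. It also keeps the exported plugin symbol C-friendly, since passing C++ standard-library objects across a dlopen boundary only works when both sides share an ABI. A reduced sketch of the probe with plain POSIX dlfcn (gpt4all's Dlhandle wraps the same calls; the symbol name follows the code above):

    #include <dlfcn.h>

    typedef bool (*magic_match_t)(const char *fname);

    // Ask an already-loaded backend library whether it recognizes a model file.
    static bool backend_matches(void * dl_handle, const char * model_path) {
        magic_match_t probe = (magic_match_t) dlsym(dl_handle, "magic_match");
        return probe && probe(model_path);  // the plugin re-opens the file itself
    }
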
@@ -27,13 +27,13 @@ public:

         static bool isImplementation(const Dlhandle&);
         static const std::vector<Implementation>& implementationList();
-        static const Implementation *implementation(std::ifstream& f, const std::string& buildVariant);
+        static const Implementation *implementation(const char *fname, const std::string& buildVariant);
         static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto");
         static void setImplementationsSearchPath(const std::string& path);
         static const std::string& implementationsSearchPath();

     private:
-        bool (*m_magicMatch)(std::ifstream& f);
+        bool (*m_magicMatch)(const char *fname);
         LLModel *(*m_construct)();

     private:

@@ -566,7 +566,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,

         // a = self.ln_1(x)
         {
-            cur = ggml_norm(ctx0, inpL);
+            cur = ggml_norm(ctx0, inpL, 1e-5f);

             cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_weight, cur), cur);
         }
@@ -658,7 +658,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,

         // m = self.ln_2(x)
         {
-            cur = ggml_norm(ctx0, inpL);
+            cur = ggml_norm(ctx0, inpL, 1e-5f);

             cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_weight, cur), cur);
         }
@@ -682,7 +682,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,
     ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
     // norm
     {
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);
         // inpL = ln_f_g*inpL
         inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.ln_f_weight, inpL), inpL);
     }
@@ -1002,7 +1002,8 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char *fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     if (magic != 0x7265706c) return false;
@@ -1027,6 +1028,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
     #else
     return true;
     #endif
+#endif
+    return false;
 }

 DLL_EXPORT LLModel *construct() {

@@ -1,10 +1,11 @@
 #define STARCODER_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #include "starcoder_impl.h"
 #include "llama.h"
-#include "llama-util.h"
 #include "utils.h"
+#include "llmodel_shared.h"

 #include <stdio.h>
+#include <string.h>
 #include <cassert>
 #include <cinttypes>
 #include <iostream>
@@ -501,7 +502,7 @@ bool starcoder_eval(
         // norm
         {
             // [ 768, N]
-            cur = ggml_norm(ctx0, inpL);
+            cur = ggml_norm(ctx0, inpL, 1e-5f);

             // cur = ln_1_g*cur + ln_1_b
             // [ 768, N]
@@ -650,7 +651,7 @@ bool starcoder_eval(
         {
             // norm
             {
-                cur = ggml_norm(ctx0, inpFF);
+                cur = ggml_norm(ctx0, inpFF, 1e-5f);

                 // cur = ln_2_g*cur + ln_2_b
                 // [ 768, N]
@@ -707,7 +708,7 @@ bool starcoder_eval(
     // norm
     {
         // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);

         // inpL = ln_f_g*inpL + ln_f_b
         // [ 768, N]
@@ -1003,7 +1004,8 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char *fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     if (magic != STARCODER_MAGIC) {
@@ -1015,6 +1017,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
         return false;
     }
     return true;
+#endif
+    return false;
 }

 DLL_EXPORT LLModel *construct() {

@@ -356,10 +356,10 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
         emit modelLoadingError(QString("Could not find file for model %1").arg(modelInfo.filename()));
     }

-    if (m_llModelInfo.model)
+    if (m_llModelInfo.model) {
         setModelInfo(modelInfo);
-
-    processSystemPrompt();
+        processSystemPrompt();
+    }
     return m_llModelInfo.model;
 }

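The ChatLLM hunk is a brace fix for a classic unbraced-if hazard: the indentation suggested both calls were guarded, but only the first was, so the second ran even when no model had loaded. Reduced to its shape:

    if (m_llModelInfo.model)
        setModelInfo(modelInfo);   // guarded
    processSystemPrompt();         // was NOT guarded - ran on load failure too

After the change both calls sit inside the braces, so the system prompt is only processed for a successfully loaded model.
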
@@ -189,7 +189,7 @@ Window {
             + "causes include a bad file format, an incomplete or corrupted download, the wrong file "
             + "type, not enough system RAM or an incompatible model type. Here are some suggestions for resolving the problem:"
             + "<br><ul>"
-            + "<li>Ensure the model file has a compatible ggml format and type"
+            + "<li>Ensure the model file has a compatible format and type"
             + "<li>Check the model file is complete in the download folder"
             + "<li>You can find the download folder in the settings dialog"
             + "<li>If you've sideloaded the model ensure the file is not corrupt by checking md5sum"

@@ -796,7 +796,7 @@ void ModelList::updateModelsFromDirectory()
                 QString filename = it.fileName();

                 // All files that end with .bin and have 'ggml' somewhere in the name
-                if ((filename.endsWith(".bin") && filename.contains("ggml") && !filename.startsWith("incomplete"))
+                if (((filename.endsWith(".bin") || filename.endsWith(".gguf")) && (/*filename.contains("ggml") ||*/ filename.contains("gguf")) && !filename.startsWith("incomplete"))
                     || (filename.endsWith(".txt") && filename.startsWith("chatgpt-"))) {

                     QString filePath = it.filePath();

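The directory scan now admits .gguf files and, in the same breath, stops admitting legacy ggml .bin files unless their names also mention gguf (the old contains("ggml") test is left commented out in place). The compound condition is easier to audit pulled into a named predicate — a hypothetical refactor, not part of the commit:

    #include <QString>

    // Same logic as the condition above, as a standalone predicate.
    static bool isSupportedModelFile(const QString &filename) {
        const bool modelSuffix  = filename.endsWith(".bin") || filename.endsWith(".gguf");
        const bool mentionsGguf = filename.contains("gguf"); // legacy "ggml" test disabled
        return (modelSuffix && mentionsGguf && !filename.startsWith("incomplete"))
            || (filename.endsWith(".txt") && filename.startsWith("chatgpt-"));
    }
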