Latest rebase on llama.cpp with gguf support.

Adam Treat 2023-09-21 12:41:48 -04:00
parent 5f3d739205
commit d90d003a1d
12 changed files with 245 additions and 85 deletions

View File

@@ -345,7 +345,7 @@ void bert_eval(
     // embd norm
     {
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);
         inpL = ggml_add(ctx0,
             ggml_mul(ctx0,
@@ -406,7 +406,7 @@ void bert_eval(
     // attention norm
     {
-        cur = ggml_norm(ctx0, cur);
+        cur = ggml_norm(ctx0, cur, 1e-5f);
         cur = ggml_add(ctx0,
             ggml_mul(ctx0,
@@ -432,7 +432,7 @@ void bert_eval(
     // output norm
     {
-        cur = ggml_norm(ctx0, cur);
+        cur = ggml_norm(ctx0, cur, 1e-5f);
         cur = ggml_add(ctx0,
             ggml_mul(ctx0,
@@ -1038,13 +1038,16 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char* fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     if (magic != 0x62657274) {
         return false;
     }
     return true;
+#endif
+    return false;
 }

 DLL_EXPORT LLModel *construct() {
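Note: `magic_match` now receives a file path instead of an open stream, and the legacy ggml magic check for BERT is parked behind `#if 0` with a blanket `return false` until this backend is ported to gguf. For reference, a minimal sketch of a path-based equivalent of the disabled check, assuming plain stdio; the helper name is hypothetical and not part of the commit, and the magic constant is taken from the original code:

#include <cstdint>
#include <cstdio>

// Sketch only: path-based variant of the disabled stream check above.
static bool looks_like_legacy_bert(const char *fname) {
    std::FILE *f = std::fopen(fname, "rb");
    if (!f)
        return false;
    uint32_t magic = 0;
    // Legacy ggml BERT files start with this 4-byte magic (0x62657274).
    const size_t n = std::fread(&magic, 1, sizeof(magic), f);
    std::fclose(f);
    return n == sizeof(magic) && magic == 0x62657274;
}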

View File

@@ -2,10 +2,11 @@
 #define FALCON_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #include "falcon_impl.h"
 #include "llama.h"
-#include "llama-util.h"
 #include "utils.h"
 #include "llmodel_shared.h"
+#include <stdio.h>
+#include <string.h>
 #include <cassert>
 #include <cinttypes>
 #include <iostream>
@@ -203,22 +204,22 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
         const int n_vocab = hparams.n_vocab;
         const int head_dim = hparams.n_embd / hparams.n_head;

-        ctx_size += ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // tok_embeddings
-        ctx_size += ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm
-        ctx_size += ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm_b
-        ctx_size += ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // lm_head
+        ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // tok_embeddings
+        ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm
+        ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm_b
+        ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // lm_head

         // if (hparams.version == 40) { // Falcon-40B
         //     ctx_size += n_layer * ggml_sizeof_tensor_1d(GGML_TYPE_F32, n_embd); // attention_norm
         //     ctx_size += n_layer * ggml_sizeof_tensor_1d(GGML_TYPE_F32, n_embd); // attention_norm_b
         // }

-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm_b
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * (n_head_kv * 2 + n_head) * head_dim); // query_key_value
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_embd); // wo
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_ff); // ffn_up
-        ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_ff * n_embd); // ffn_down
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm_b
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * (n_head_kv * 2 + n_head) * head_dim); // query_key_value
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_embd); // wo
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_ff); // ffn_up
+        ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_ff * n_embd); // ffn_down

         printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
@@ -494,7 +495,7 @@ bool falcon_eval(
         // self-attention
         {
-            layernorm_output = ggml_norm(ctx0, inpL);
+            layernorm_output = ggml_norm(ctx0, inpL, 1e-5f);
             layernorm_output = ggml_add(ctx0,
                 ggml_mul(ctx0,
@@ -653,7 +654,7 @@ bool falcon_eval(
     // norm
     {
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);
         // inpL = ln_f_g*inpL + ln_f_b
         inpL = ggml_add(ctx0,
@@ -680,7 +681,7 @@ bool falcon_eval(
     // run the computation
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);

     //if (n_past%100 == 0) {
     //    ggml_graph_print   (&gf);
@@ -954,13 +955,14 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char* fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     uint32_t version = 0;
     f.read(reinterpret_cast<char*>(&version), sizeof(version));
     if (magic != FALCON_MAGIC) {
         return false;
     }
     falcon_hparams hparams;
     f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));
@@ -977,6 +979,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
         return false;
     }
     return true;
+#endif
+    return false;
 }

 DLL_EXPORT LLModel *construct() {

@@ -1 +1 @@
-Subproject commit 99c5c9a0d834888c33669855f3a1cf425df37dd2
+Subproject commit 37a0be313d21f8b61184a3adcaac123353128238

View File

@@ -185,7 +185,7 @@ if (LLAMA_KOMPUTE)
         string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
         set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
         message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
         if(CMAKE_GENERATOR MATCHES "Visual Studio")
             add_custom_command(
                 OUTPUT ${OUTPUT_HEADER_FILE}
                 COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
@@ -346,6 +346,13 @@ endif()
 # TODO: probably these flags need to be tweaked on some architectures
 # feel free to update the Makefile for your architecture and send a pull request or issue
 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+if (MSVC)
+    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
+    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
+else ()
+    set(CMAKE_GENERATOR_PLATFORM_LWR "")
+endif ()
 if (NOT MSVC)
     if (LLAMA_STATIC)
         add_link_options(-static)
@@ -361,6 +368,138 @@ if (NOT MSVC)
     endif()
 endif()
+
+if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
+    message(STATUS "ARM detected")
+    if (MSVC)
+        add_compile_definitions(__ARM_NEON)
+        add_compile_definitions(__ARM_FEATURE_FMA)
+        add_compile_definitions(__ARM_FEATURE_DOTPROD)
+        # add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
+        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
+    else()
+        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+            add_compile_options(-mfp16-format=ieee)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
+            # Raspberry Pi 1, Zero
+            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
+            # Raspberry Pi 2
+            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+            # Raspberry Pi 3, 4, Zero 2 (32-bit)
+            add_compile_options(-mno-unaligned-access)
+        endif()
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
+    message(STATUS "x86 detected")
+    if (MSVC)
+        if (LLAMA_AVX512)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
+            # MSVC has no compile-time flags enabling specific
+            # AVX512 extensions, neither it defines the
+            # macros corresponding to the extensions.
+            # Do it manually.
+            if (LLAMA_AVX512_VBMI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+            endif()
+            if (LLAMA_AVX512_VNNI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+            endif()
+        elseif (LLAMA_AVX2)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
+        elseif (LLAMA_AVX)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
+        endif()
+    else()
+        if (LLAMA_F16C)
+            add_compile_options(-mf16c)
+        endif()
+        if (LLAMA_FMA)
+            add_compile_options(-mfma)
+        endif()
+        if (LLAMA_AVX)
+            add_compile_options(-mavx)
+        endif()
+        if (LLAMA_AVX2)
+            add_compile_options(-mavx2)
+        endif()
+        if (LLAMA_AVX512)
+            add_compile_options(-mavx512f)
+            add_compile_options(-mavx512bw)
+        endif()
+        if (LLAMA_AVX512_VBMI)
+            add_compile_options(-mavx512vbmi)
+        endif()
+        if (LLAMA_AVX512_VNNI)
+            add_compile_options(-mavx512vnni)
+        endif()
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+    message(STATUS "PowerPC detected")
+    add_compile_options(-mcpu=native -mtune=native)
+    #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+else()
+    message(STATUS "Unknown architecture")
+endif()
+
+#
+# POSIX conformance
+#
+
+# clock_gettime came in POSIX.1b (1993)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
+# posix_memalign came in POSIX.1-2001 / SUSv3
+# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
+add_compile_definitions(_XOPEN_SOURCE=600)
+
+# Somehow in OpenBSD whenever POSIX conformance is specified
+# some string functions rely on locale_t availability,
+# which was introduced in POSIX.1-2008, forcing us to go higher
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    remove_definitions(-D_XOPEN_SOURCE=600)
+    add_compile_definitions(_XOPEN_SOURCE=700)
+endif()
+
+# Data types, macros and functions related to controlling CPU affinity and
+# some memory allocation are available on Linux through GNU extensions in libc
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions(_GNU_SOURCE)
+endif()
+
+# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
+# and on macOS its availability depends on enabling Darwin extensions
+# similarly on DragonFly, enabling BSD extensions is necessary
+if (
+    CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
+    CMAKE_SYSTEM_NAME MATCHES "iOS" OR
+    CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
+    CMAKE_SYSTEM_NAME MATCHES "DragonFly"
+)
+    add_compile_definitions(_DARWIN_C_SOURCE)
+endif()
+
+# alloca is a non-standard interface that is not visible on BSDs when
+# POSIX conformance is specified, but not all of them provide a clean way
+# to enable it in such cases
+if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+    add_compile_definitions(__BSD_VISIBLE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+    add_compile_definitions(_NETBSD_SOURCE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    add_compile_definitions(_BSD_SOURCE)
+endif()

 function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
     message(STATUS "Configuring ggml implementation target llama${SUFFIX} in ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}")
@@ -468,15 +607,14 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
     if (WITH_LLAMA)
         # Backwards compatibility with old llama.cpp versions
-        set(LLAMA_UTIL_SOURCE_FILE llama-util.h)
+        # set(LLAMA_UTIL_SOURCE_FILE llama-util.h)
         if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
             set(LLAMA_UTIL_SOURCE_FILE llama_util.h)
         endif()

         add_library(llama${SUFFIX} STATIC
             ${DIRECTORY}/llama.cpp
-            ${DIRECTORY}/llama.h
-            ${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
+            ${DIRECTORY}/llama.h)

         if (LLAMA_METAL AND GGML_METAL_SOURCES)
             target_compile_definitions(llama${SUFFIX} PUBLIC GGML_USE_METAL GGML_METAL_NDEBUG)

View File

@@ -226,9 +226,9 @@ size_t LLamaModel::restoreState(const uint8_t *src)
 std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str) const
 {
-    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos());
+    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos(d_ptr->ctx));
     std::vector<LLModel::Token> fres(str.size()+4);
-    auto fres_len = llama_tokenize(d_ptr->ctx, str.c_str(), fres.data(), fres.size(), useBOS);
+    auto fres_len = llama_tokenize(d_ptr->ctx, str.c_str(), str.length(), fres.data(), fres.size(), useBOS);
     fres.resize(fres_len);
     return fres;
 }
@@ -250,10 +250,10 @@ LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
 bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
 {
     // When we recalculate context we could have erased the original BOS token... we need to replace it
-    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos());
+    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos(d_ptr->ctx));
     if (useBOS) {
         std::vector<int32_t> myTokens;
-        myTokens.push_back(llama_token_bos());
+        myTokens.push_back(llama_token_bos(d_ptr->ctx));
         myTokens.insert(myTokens.end(), tokens.begin(), tokens.end());
         ctx.n_past += 1;
         return llama_eval(d_ptr->ctx, myTokens.data(), myTokens.size(), ctx.n_past, d_ptr->n_threads) == 0;
@@ -268,7 +268,7 @@ int32_t LLamaModel::contextLength() const
 const std::vector<LLModel::Token> &LLamaModel::endTokens() const
 {
-    static const std::vector<LLModel::Token> fres = {llama_token_eos()};
+    static const std::vector<LLModel::Token> fres = {llama_token_eos(d_ptr->ctx)};
     return fres;
 }
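The hunks above track llama.cpp's gguf-era API: `llama_token_bos`/`llama_token_eos` now take the `llama_context`, and `llama_tokenize` takes an explicit text length. A hedged sketch of the tokenize call pattern used here; the wrapper function and the +4 slack are illustrative, not part of the commit:

#include <string>
#include <vector>
#include "llama.h"

// Sketch: tokenizing with the context-aware llama.cpp API used in this diff.
// `ctx` is assumed to be an already-initialized llama_context.
static std::vector<llama_token> tokenize_text(llama_context *ctx, const std::string &text, bool add_bos) {
    std::vector<llama_token> tokens(text.size() + 4);  // small slack, as in LLamaModel::tokenize
    const int n = llama_tokenize(ctx, text.c_str(), (int) text.length(),
                                 tokens.data(), (int) tokens.size(), add_bos);
    tokens.resize(n < 0 ? 0 : n);  // a negative return signals the buffer was too small
    return tokens;
}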
@@ -351,6 +351,16 @@ bool LLamaModel::usingGPUDevice()
     return false;
 }

+std::string get_arch_name(gguf_context *ctx_gguf) {
+    std::string arch_name;
+    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
+    enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
+    if (ktype != (GGUF_TYPE_STRING)) {
+        throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
+    }
+    return gguf_get_val_str(ctx_gguf, kid);
+}
+
 #if defined(_WIN32)
 #define DLL_EXPORT __declspec(dllexport)
 #else
@@ -370,39 +380,42 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
-    // Check magic
-    uint32_t magic = 0;
-    f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
-    if (magic != 0x67676a74) return false;
-    // Check version
-    uint32_t version = 0;
-    f.read(reinterpret_cast<char*>(&version), sizeof(version));
-    if (!(version LLAMA_VERSIONS)) {
-        return false;
-    }
-    llama_file_hparams hparams;
-    f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));
-    if (!(hparams.n_vocab >= 32000 && hparams.n_vocab <= 32100)) {
-        return false; // not a llama.
-    }
-#ifdef GGML_USE_METAL
-    // Check quant supported on metal
-    // skip fields
-    switch(hparams.ftype) {
-        // currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
-        case LLAMA_FTYPE_MOSTLY_F16:
-        case LLAMA_FTYPE_MOSTLY_Q2_K:
-        case LLAMA_FTYPE_MOSTLY_Q4_0:
-        case LLAMA_FTYPE_MOSTLY_Q6_K:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M:
-            return true;
-        default: // unsupported quant-type for Metal
-            return false;
-    }
-#endif
-    return true;
+DLL_EXPORT bool magic_match(const char * fname) {
+    struct ggml_context * ctx_meta = NULL;
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ &ctx_meta,
+    };
+    gguf_context *ctx_gguf = gguf_init_from_file(fname, params);
+    if (!ctx_gguf)
+        return false;
+
+    bool isValid = gguf_get_version(ctx_gguf) <= 2;
+    isValid = get_arch_name(ctx_gguf) != "llama" ? false : isValid;
+
+#ifdef GGML_USE_METAL
+    const int n_tensors = gguf_get_n_tensors(ctx_gguf);
+    for (int i = 0; i < n_tensors; i++) {
+        const char * name = gguf_get_tensor_name(ctx_gguf, i);
+        struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name);
+        switch(meta->type) {
+            // currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
+            case LLAMA_FTYPE_MOSTLY_F16:
+            case LLAMA_FTYPE_MOSTLY_Q2_K:
+            case LLAMA_FTYPE_MOSTLY_Q4_0:
+            case LLAMA_FTYPE_MOSTLY_Q6_K:
+            case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+            case LLAMA_FTYPE_MOSTLY_Q4_K_M:
+                break;
+            default: // unsupported quant-type for Metal
+                isValid = false;
+        }
+    }
+#endif
+
+    gguf_free(ctx_gguf);
+    return isValid;
 }

 DLL_EXPORT LLModel *construct() {
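With gguf, the llama backend validates a model by reading only its metadata: `gguf_init_from_file` with `no_alloc = true` loads key/value pairs and tensor info without tensor data, and the check then inspects `general.architecture` and the gguf version. A standalone sketch of that probe, using the same gguf calls as the code above; the missing-key guard and the `ggml_free` of the metadata context are defensive additions assumed here, not part of the commit:

#include <string>
#include "ggml.h"  // gguf_* metadata API shipped with llama.cpp/ggml

// Sketch: probe a .gguf file's metadata without loading tensor data.
static bool is_supported_llama_gguf(const char *fname) {
    struct ggml_context *ctx_meta = NULL;
    struct gguf_init_params params = {
        /*.no_alloc =*/ true,   // metadata only, no tensor data
        /*.ctx      =*/ &ctx_meta,
    };
    struct gguf_context *ctx_gguf = gguf_init_from_file(fname, params);
    if (!ctx_gguf)
        return false;

    bool ok = gguf_get_version(ctx_gguf) <= 2;
    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
    if (kid < 0 || gguf_get_kv_type(ctx_gguf, kid) != GGUF_TYPE_STRING)
        ok = false;  // defensive: key missing or not a string
    else if (std::string(gguf_get_val_str(ctx_gguf, kid)) != "llama")
        ok = false;

    if (ctx_meta)
        ggml_free(ctx_meta);
    gguf_free(ctx_gguf);
    return ok;
}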

View File

@@ -52,7 +52,7 @@ LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
     auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
     assert(get_build_variant);
     m_buildVariant = get_build_variant();
-    m_magicMatch = m_dlhandle->get<bool(std::ifstream&)>("magic_match");
+    m_magicMatch = m_dlhandle->get<bool(const char*)>("magic_match");
     assert(m_magicMatch);
     m_construct = m_dlhandle->get<LLModel *()>("construct");
     assert(m_construct);
@@ -111,10 +111,9 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
     return *libs;
 }

-const LLModel::Implementation* LLModel::Implementation::implementation(std::ifstream& f, const std::string& buildVariant) {
+const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
     for (const auto& i : implementationList()) {
-        f.seekg(0);
-        if (!i.m_magicMatch(f)) continue;
+        if (!i.m_magicMatch(fname)) continue;
         if (buildVariant != i.m_buildVariant) continue;
         return &i;
     }
@@ -126,9 +125,6 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
     if (!has_at_least_minimal_hardware())
         return nullptr;

-    // Read magic
-    std::ifstream f(modelPath, std::ios::binary);
-    if (!f) return nullptr;

     // Get correct implementation
     const Implementation* impl = nullptr;
@@ -161,10 +157,9 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
                 buildVariant = "default";
             }
         }
-        impl = implementation(f, buildVariant);
+        impl = implementation(modelPath.c_str(), buildVariant);
         if (!impl) return nullptr;
     }
-    f.close();

     // Construct and return llmodel implementation
     auto fres = impl->m_construct();
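Since the stream plumbing is gone, callers hand `construct()` a path and each backend's `magic_match(fname)` decides whether it can handle the file. A minimal usage sketch of the public entry point; the model path is illustrative and the single-argument `loadModel(path)` call is assumed from the API at this point in the tree:

#include <string>
#include "llmodel.h"

int main() {
    const std::string path = "/path/to/model.gguf";  // hypothetical model file

    // "auto" resolves a build variant (e.g. "avxonly" vs "default") before
    // each backend's magic_match(path) is consulted.
    LLModel *model = LLModel::Implementation::construct(path, "auto");
    if (!model)
        return 1;  // no backend recognized the file

    if (!model->loadModel(path)) {
        delete model;
        return 1;
    }

    // ... run prompts against the loaded model ...
    delete model;
    return 0;
}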

View File

@@ -27,13 +27,13 @@ public:
         static bool isImplementation(const Dlhandle&);
         static const std::vector<Implementation>& implementationList();
-        static const Implementation *implementation(std::ifstream& f, const std::string& buildVariant);
+        static const Implementation *implementation(const char *fname, const std::string& buildVariant);
         static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto");
         static void setImplementationsSearchPath(const std::string& path);
         static const std::string& implementationsSearchPath();

     private:
-        bool (*m_magicMatch)(std::ifstream& f);
+        bool (*m_magicMatch)(const char *fname);
         LLModel *(*m_construct)();

     private:

View File

@@ -566,7 +566,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,
         // a = self.ln_1(x)
         {
-            cur = ggml_norm(ctx0, inpL);
+            cur = ggml_norm(ctx0, inpL, 1e-5f);
             cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_weight, cur), cur);
         }
@@ -658,7 +658,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,
         // m = self.ln_2(x)
         {
-            cur = ggml_norm(ctx0, inpL);
+            cur = ggml_norm(ctx0, inpL, 1e-5f);
             cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_weight, cur), cur);
         }
@@ -682,7 +682,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,
     ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
     // norm
     {
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);
         // inpL = ln_f_g*inpL
         inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.ln_f_weight, inpL), inpL);
     }
@@ -1002,7 +1002,8 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char *fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     if (magic != 0x7265706c) return false;
@@ -1027,6 +1028,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
 #else
     return true;
 #endif
+#endif
+    return false;
 }

 DLL_EXPORT LLModel *construct() {

View File

@@ -1,10 +1,11 @@
 #define STARCODER_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #include "starcoder_impl.h"
 #include "llama.h"
-#include "llama-util.h"
 #include "utils.h"
 #include "llmodel_shared.h"
+#include <stdio.h>
+#include <string.h>
 #include <cassert>
 #include <cinttypes>
 #include <iostream>
@@ -501,7 +502,7 @@ bool starcoder_eval(
         // norm
         {
             // [ 768, N]
-            cur = ggml_norm(ctx0, inpL);
+            cur = ggml_norm(ctx0, inpL, 1e-5f);
             // cur = ln_1_g*cur + ln_1_b
             // [ 768, N]
@@ -650,7 +651,7 @@ bool starcoder_eval(
     {
         // norm
         {
-            cur = ggml_norm(ctx0, inpFF);
+            cur = ggml_norm(ctx0, inpFF, 1e-5f);
             // cur = ln_2_g*cur + ln_2_b
             // [ 768, N]
@@ -707,7 +708,7 @@ bool starcoder_eval(
     // norm
     {
         // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);
         // inpL = ln_f_g*inpL + ln_f_b
         // [ 768, N]
@@ -1003,7 +1004,8 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char *fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     if (magic != STARCODER_MAGIC) {
@@ -1015,6 +1017,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
         return false;
     }
     return true;
+#endif
+    return false;
 }

 DLL_EXPORT LLModel *construct() {

View File

@@ -356,10 +356,10 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
             emit modelLoadingError(QString("Could not find file for model %1").arg(modelInfo.filename()));
         }

-        if (m_llModelInfo.model)
+        if (m_llModelInfo.model) {
             setModelInfo(modelInfo);
-        processSystemPrompt();
+            processSystemPrompt();
+        }
         return m_llModelInfo.model;
     }

View File

@@ -189,7 +189,7 @@ Window {
                 + "causes include a bad file format, an incomplete or corrupted download, the wrong file "
                 + "type, not enough system RAM or an incompatible model type. Here are some suggestions for resolving the problem:"
                 + "<br><ul>"
-                + "<li>Ensure the model file has a compatible ggml format and type"
+                + "<li>Ensure the model file has a compatible format and type"
                 + "<li>Check the model file is complete in the download folder"
                 + "<li>You can find the download folder in the settings dialog"
                 + "<li>If you've sideloaded the model ensure the file is not corrupt by checking md5sum"

View File

@@ -796,7 +796,7 @@ void ModelList::updateModelsFromDirectory()
             QString filename = it.fileName();

             // All files that end with .bin and have 'ggml' somewhere in the name
-            if ((filename.endsWith(".bin") && filename.contains("ggml") && !filename.startsWith("incomplete"))
+            if (((filename.endsWith(".bin") || filename.endsWith(".gguf")) && (/*filename.contains("ggml") ||*/ filename.contains("gguf")) && !filename.startsWith("incomplete"))
                 || (filename.endsWith(".txt") && filename.startsWith("chatgpt-"))) {
                 QString filePath = it.filePath();
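The directory scan now also picks up `.gguf` files. A hedged restatement of the updated filter as a standalone predicate; the helper itself is not in the commit and is shown only to make the condition easier to read:

#include <QString>

// Sketch: the acceptance test implied by the updated condition above.
static bool isCandidateModelFile(const QString &filename)
{
    const bool localModel = (filename.endsWith(".bin") || filename.endsWith(".gguf"))
        && filename.contains("gguf")           // the old "ggml" name check is commented out
        && !filename.startsWith("incomplete");
    const bool chatGptStub = filename.endsWith(".txt") && filename.startsWith("chatgpt-");
    return localModel || chatGptStub;
}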