fix chat-style prompt templates (#1970)

Also use a new version of Mistral OpenOrca.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel
Date: 2024-02-21 15:45:32 -05:00 (committed via GitHub)
Parent: b8f5c74f40
Commit: 4fc4d94be4
22 changed files with 429 additions and 307 deletions
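For context on the template fix itself: Mistral OpenOrca uses a ChatML-style prompt format whose turns are delimited by the special tokens <|im_start|> and <|im_end|>. A hypothetical sketch of the prompt shape involved (the %1 placeholder is illustrative, not taken from this diff):

// Hypothetical ChatML-style prompt for Mistral OpenOrca. The old GGUF shipped
// a placeholder "<dummy32000>" where <|im_end|> should be, which is why it is
// blacklisted below and a new model version is used.
static const char *kChatMLPromptSketch =
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n%1<|im_end|>\n"
    "<|im_start|>assistant\n";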


@@ -6,38 +6,29 @@
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#if defined(_WIN32) && defined(_MSC_VER)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <io.h>
#include <stdio.h>
#else
#include <unistd.h>
#endif
#include <map>
#include <random>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <unordered_set>
#include <vector>
#include <llama.h>
#include <ggml.h>
#ifdef GGML_USE_KOMPUTE
#include "ggml-kompute.h"
#include <ggml-kompute.h>
#endif
using namespace std::string_literals;
// Maximum supported GGUF version
static constexpr int GGUF_VER_MAX = 3;
static const char * const modelType_ = "LLaMA";
static bool llama_verbose() {
const char* var = getenv("GPT4ALL_VERBOSE_LLAMACPP");
@@ -96,6 +87,56 @@ static int llama_sample_top_p_top_k(
return llama_sample_token(ctx, &candidates_p);
}
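Only the tail of llama_sample_top_p_top_k is visible in this hunk; as a hedged restatement, the conventional llama.cpp sampling pipeline it implements orders its steps like this (the exact body here may differ):

// Conventional sampling order (sketch, not the verbatim body above):
//   llama_sample_repetition_penalties(...) // penalize recently seen tokens
//   llama_sample_top_k(...)                // keep only the k most likely
//   llama_sample_top_p(...)                // nucleus (top-p) filtering
//   llama_sample_temp(...)                 // apply temperature scaling
//   llama_sample_token(...)                // draw from what remains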
std::string get_arch_name(gguf_context *ctx_gguf) {
    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
    if (kid == -1 || gguf_get_kv_type(ctx_gguf, kid) != GGUF_TYPE_STRING) {
        throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
    }
    return gguf_get_val_str(ctx_gguf, kid);
}
static gguf_context *load_gguf(const char *fname) {
    struct gguf_init_params params = {
        /*.no_alloc = */ true,
        /*.ctx      = */ nullptr,
    };
    gguf_context *ctx = gguf_init_from_file(fname, params);
    if (!ctx) {
        std::cerr << __func__ << ": gguf_init_from_file failed\n";
        return nullptr;
    }
    int gguf_ver = gguf_get_version(ctx);
    if (gguf_ver > GGUF_VER_MAX) {
        std::cerr << __func__ << ": unsupported gguf version: " << gguf_ver << "\n";
        gguf_free(ctx);
        return nullptr;
    }
    return ctx;
}
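A minimal usage sketch for load_gguf (hypothetical call site; load_gguf returns nullptr on failure, so callers must check the result and free what they get):

// Sketch: nullptr means an unreadable file or a GGUF version newer than
// GGUF_VER_MAX; the caller owns the context and must free it.
if (gguf_context *ctx = load_gguf("/path/to/model.gguf")) {
    std::cerr << "arch: " << get_arch_name(ctx) << "\n";
    gguf_free(ctx);
}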
static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey) {
    int32_t value = -1;
    auto * ctx = load_gguf(modelPath.c_str());
    if (ctx) {
        auto key = get_arch_name(ctx) + "." + archKey;
        int keyidx = gguf_find_key(ctx, key.c_str());
        if (keyidx != -1) {
            value = gguf_get_val_u32(ctx, keyidx);
        } else {
            std::cerr << __func__ << ": " << key << " not found in " << modelPath << "\n";
        }
    }
    gguf_free(ctx);
    return value;
}
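For illustration, the helper composes the lookup key as "<arch>.<archKey>"; a hypothetical call reading a LLaMA-architecture model's trained context length (the GGUF key consulted would be llama.context_length):

// Sketch: returns -1 if the file is unreadable or the key is missing.
int32_t n_ctx_train = get_arch_key_u32("/path/to/model.gguf", "context_length");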
struct LLamaPrivate {
const std::string modelPath;
bool modelLoaded;
@@ -148,6 +189,42 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
return filesize + est_kvcache_size;
}
bool LLamaModel::isModelBlacklisted(const std::string &modelPath) {
    auto * ctx = load_gguf(modelPath.c_str());
    if (!ctx) {
        std::cerr << __func__ << ": failed to load " << modelPath << "\n";
        return false;
    }

    auto get_key = [ctx, &modelPath](const char *name) {
        int keyidx = gguf_find_key(ctx, name);
        if (keyidx == -1) {
            throw std::logic_error(name + " not found in "s + modelPath);
        }
        return keyidx;
    };

    bool res = false;
    try {
        std::string name(gguf_get_val_str(ctx, get_key("general.name")));
        int token_idx = get_key("tokenizer.ggml.tokens");
        int n_vocab = gguf_get_arr_n(ctx, token_idx);

        // check for known bad models
        if (name == "open-orca_mistral-7b-openorca"
            && n_vocab == 32002
            && gguf_get_arr_str(ctx, token_idx, 32000) == "<dummy32000>"s // should be <|im_end|>
        ) {
            res = true;
        }
    } catch (const std::logic_error &e) {
        std::cerr << __func__ << ": " << e.what() << "\n";
    }

    gguf_free(ctx);
    return res;
}
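A hedged sketch of how a caller might act on the blacklist result (the warning text is illustrative; the real handling lives elsewhere in this commit's 22 files):

// Sketch: the check matches the old Mistral OpenOrca GGUF whose token id 32000
// is the placeholder "<dummy32000>" instead of the ChatML terminator
// "<|im_end|>", which broke chat-style prompt templates.
if (model->isModelBlacklisted(path)) {
    std::cerr << "warning: " << path
              << " is known to be broken; please download the updated model\n";
}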
bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
{
d_ptr->modelLoaded = false;
@@ -290,12 +367,13 @@ size_t LLamaModel::restoreState(const uint8_t *src)
return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
}
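restoreState pairs with llama.cpp's state snapshot API; a minimal sketch of the save/restore round trip, assuming an initialized llama_context *ctx (API names as they existed in this era of llama.cpp):

// Sketch: save and restore a context's state in one round trip.
std::vector<uint8_t> buf(llama_get_state_size(ctx));
llama_copy_state_data(ctx, buf.data()); // snapshot KV cache, RNG, and logits
llama_set_state_data(ctx, buf.data()); // restore the snapshot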
std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str, bool special) const
{
    const bool wantBOS = ctx.n_past == 0 && ctx.tokens.empty();
    const bool useBOS = wantBOS && shouldAddBOS();
    auto strCat = wantBOS && !special ? " " + str : str; // insert leading space ourselves, llama.cpp fork doesn't anymore
    std::vector<LLModel::Token> fres(strCat.size() + 4);
    auto fres_len = llama_tokenize(d_ptr->model, strCat.c_str(), strCat.length(), fres.data(), fres.size(), useBOS, special);
    fres.resize(fres_len);
    return fres;
}
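The two behavioral changes here are the heart of the template fix: special tokens are parsed only when special is true, and the leading space is now prepended manually because the llama.cpp fork no longer inserts one. A comment sketch of the effect (token ids hypothetical):

// Sketch of the special flag on a ChatML delimiter:
//   tokenize(ctx, "<|im_end|>", /*special=*/false) -> several plain-text pieces
//   tokenize(ctx, "<|im_end|>", /*special=*/true)  -> one special token id
// The " " + str prepend applies only to ordinary text at the start of a
// conversation, matching the leading space SentencePiece tokenizers expect.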
@@ -349,55 +427,10 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
return d_ptr->end_tokens;
}
// removed: the old copies of get_arch_name, load_gguf (which took a
// std::string &arch out-parameter), and get_arch_key_u32; they now live near
// the top of the file
bool LLamaModel::shouldAddBOS() const
{
    int add_bos = llama_add_bos_token(d_ptr->model);
    return add_bos != -1 ? bool(add_bos) : llama_vocab_type(d_ptr->model) == LLAMA_VOCAB_TYPE_SPM;
}
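llama_add_bos_token surfaces the tokenizer.ggml.add_bos_token GGUF metadata and returns -1 when the key is absent, so the fallback keys off the vocabulary type. A restatement of the decision table:

// Sketch of shouldAddBOS():
//   metadata says add BOS (1) -> add it
//   metadata says don't (0)   -> don't
//   metadata absent (-1)      -> add BOS only for SPM (SentencePiece) vocabs,
//                                since LLaMA-family models expect a leading BOS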
int32_t LLamaModel::maxContextLength(std::string const &modelPath) const
@@ -513,8 +546,8 @@ DLL_EXPORT const char *get_build_variant() {
}
DLL_EXPORT bool magic_match(const char *fname) {
    auto * ctx = load_gguf(fname);
    if (!ctx)
        return false;
    auto arch = get_arch_name(ctx);
    bool valid = true;