chat: major UI redesign for v3.0.0 (#2396)

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
Authored by AT on 2024-06-24 18:49:23 -04:00, committed by GitHub
parent 1272b694ae
commit 9273b49b62
111 changed files with 8540 additions and 7879 deletions
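
The backend hunks captured below all apply the same mechanical style change: the opening brace of a function definition moves from the end of the signature line onto a line of its own. A minimal sketch of the convention, using a hypothetical function name:

// Before: brace on the signature line
int exampleFunction(int value) {
    return value + 1;
}

// After: brace on its own line
int exampleFunction(int value)
{
    return value + 1;
}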

View File

@@ -20,24 +20,28 @@ namespace fs = std::filesystem;
 
 #ifndef _WIN32
-Dlhandle::Dlhandle(const fs::path &fpath) {
+Dlhandle::Dlhandle(const fs::path &fpath)
+{
     chandle = dlopen(fpath.c_str(), RTLD_LAZY | RTLD_LOCAL);
     if (!chandle) {
         throw Exception("dlopen: "s + dlerror());
     }
 }
 
-Dlhandle::~Dlhandle() {
+Dlhandle::~Dlhandle()
+{
     if (chandle) dlclose(chandle);
 }
 
-void *Dlhandle::get_internal(const char *symbol) const {
+void *Dlhandle::get_internal(const char *symbol) const
+{
     return dlsym(chandle, symbol);
 }
 
 #else // defined(_WIN32)
 
-Dlhandle::Dlhandle(const fs::path &fpath) {
+Dlhandle::Dlhandle(const fs::path &fpath)
+{
     fs::path afpath = fs::absolute(fpath);
 
     // Suppress the "Entry Point Not Found" dialog, caused by outdated nvcuda.dll from the GPU driver
@@ -58,11 +62,13 @@ Dlhandle::Dlhandle(const fs::path &fpath) {
     }
 }
 
-Dlhandle::~Dlhandle() {
+Dlhandle::~Dlhandle()
+{
     if (chandle) FreeLibrary(HMODULE(chandle));
 }
 
-void *Dlhandle::get_internal(const char *symbol) const {
+void *Dlhandle::get_internal(const char *symbol) const
+{
     return GetProcAddress(HMODULE(chandle), symbol);
 }

View File

@@ -123,7 +123,8 @@ static bool kv_cache_init(
 }
 
 // load the model's weights from a file path
-bool gptj_model_load(const std::string &fname, gptj_model & model, gpt_vocab & vocab, size_t * mem_req = nullptr) {
+bool gptj_model_load(const std::string &fname, gptj_model & model, gpt_vocab & vocab, size_t * mem_req = nullptr)
+{
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
     if(mem_req != nullptr) {
         *mem_req = 0;
@@ -667,7 +668,8 @@ GPTJ::GPTJ()
     d_ptr->modelLoaded = false;
 }
 
-size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
+size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
+{
     (void)n_ctx;
     (void)ngl;
     gptj_model dummy_model;
@@ -677,7 +679,8 @@ size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
     return mem_req;
 }
 
-bool GPTJ::loadModel(const std::string &modelPath, int n_ctx, int ngl) {
+bool GPTJ::loadModel(const std::string &modelPath, int n_ctx, int ngl)
+{
     (void)n_ctx;
     (void)ngl;
     d_ptr->modelLoaded = false;
@@ -698,7 +701,8 @@ bool GPTJ::loadModel(const std::string &modelPath, int n_ctx, int ngl) {
     return true;
 }
 
-void GPTJ::setThreadCount(int32_t n_threads) {
+void GPTJ::setThreadCount(int32_t n_threads)
+{
     d_ptr->n_threads = n_threads;
 }
@@ -780,7 +784,8 @@ const std::vector<LLModel::Token> &GPTJ::endTokens() const
     return fres;
 }
 
-const char *get_arch_name(gguf_context *ctx_gguf) {
+const char *get_arch_name(gguf_context *ctx_gguf)
+{
     const int kid = gguf_find_key(ctx_gguf, "general.architecture");
     if (kid == -1)
         throw std::runtime_error("key not found in model: general.architecture");
@@ -799,19 +804,23 @@ const char *get_arch_name(gguf_context *ctx_gguf) {
 #endif
 
 extern "C" {
-DLL_EXPORT bool is_g4a_backend_model_implementation() {
+DLL_EXPORT bool is_g4a_backend_model_implementation()
+{
     return true;
 }
 
-DLL_EXPORT const char *get_model_type() {
+DLL_EXPORT const char *get_model_type()
+{
     return modelType_;
 }
 
-DLL_EXPORT const char *get_build_variant() {
+DLL_EXPORT const char *get_build_variant()
+{
     return GGML_BUILD_VARIANT;
 }
 
-DLL_EXPORT char *get_file_arch(const char *fname) {
+DLL_EXPORT char *get_file_arch(const char *fname)
+{
     struct ggml_context * ctx_meta = NULL;
     struct gguf_init_params params = {
         /*.no_alloc = */ true,
@@ -832,11 +841,13 @@ DLL_EXPORT char *get_file_arch(const char *fname) {
     return arch;
 }
 
-DLL_EXPORT bool is_arch_supported(const char *arch) {
+DLL_EXPORT bool is_arch_supported(const char *arch)
+{
     return !strcmp(arch, "gptj");
 }
 
-DLL_EXPORT LLModel *construct() {
+DLL_EXPORT LLModel *construct()
+{
     return new GPTJ;
 }
 }

View File

@@ -84,16 +84,19 @@ static const std::vector<const char *> EMBEDDING_ARCHES {
     "bert", "nomic-bert",
 };
 
-static bool is_embedding_arch(const std::string &arch) {
+static bool is_embedding_arch(const std::string &arch)
+{
     return std::find(EMBEDDING_ARCHES.begin(), EMBEDDING_ARCHES.end(), arch) < EMBEDDING_ARCHES.end();
 }
 
-static bool llama_verbose() {
+static bool llama_verbose()
+{
     const char* var = getenv("GPT4ALL_VERBOSE_LLAMACPP");
     return var && *var;
 }
 
-static void llama_log_callback(enum ggml_log_level level, const char *text, void *userdata) {
+static void llama_log_callback(enum ggml_log_level level, const char *text, void *userdata)
+{
     (void)userdata;
     if (llama_verbose() || level <= GGML_LOG_LEVEL_ERROR) {
         fputs(text, stderr);
@@ -147,7 +150,8 @@ static int llama_sample_top_p_top_k(
     return llama_sample_token(ctx, &candidates_p);
 }
 
-const char *get_arch_name(gguf_context *ctx_gguf) {
+const char *get_arch_name(gguf_context *ctx_gguf)
+{
     const int kid = gguf_find_key(ctx_gguf, "general.architecture");
     if (kid == -1)
         throw std::runtime_error("key not found in model: general.architecture");
@@ -159,7 +163,8 @@ const char *get_arch_name(gguf_context *ctx_gguf) {
     return gguf_get_val_str(ctx_gguf, kid);
 }
 
-static gguf_context *load_gguf(const char *fname) {
+static gguf_context *load_gguf(const char *fname)
+{
     struct gguf_init_params params = {
         /*.no_alloc = */ true,
         /*.ctx = */ nullptr,
@@ -180,7 +185,8 @@ static gguf_context *load_gguf(const char *fname) {
     return ctx;
 }
 
-static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey) {
+static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey)
+{
     int32_t value = -1;
     std::string arch;
@@ -237,7 +243,8 @@ struct llama_file_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 };
 
-size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
+size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
+{
     // TODO(cebtenzzre): update to GGUF
     (void)ngl; // FIXME(cetenzzre): use this value
     auto fin = std::ifstream(modelPath, std::ios::binary);
@@ -261,7 +268,8 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
     return filesize + est_kvcache_size;
 }
 
-bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const {
+bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const
+{
     auto * ctx = load_gguf(modelPath.c_str());
     if (!ctx) {
         std::cerr << __func__ << ": failed to load " << modelPath << "\n";
@@ -297,7 +305,8 @@ bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const {
     return res;
 }
 
-bool LLamaModel::isEmbeddingModel(const std::string &modelPath) const {
+bool LLamaModel::isEmbeddingModel(const std::string &modelPath) const
+{
     bool result = false;
     std::string arch;
@@ -453,12 +462,14 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     return true;
 }
 
-void LLamaModel::setThreadCount(int32_t n_threads) {
+void LLamaModel::setThreadCount(int32_t n_threads)
+{
     d_ptr->n_threads = n_threads;
     llama_set_n_threads(d_ptr->ctx, n_threads, n_threads);
 }
 
-int32_t LLamaModel::threadCount() const {
+int32_t LLamaModel::threadCount() const
+{
     return d_ptr->n_threads;
 }
@@ -581,7 +592,8 @@ int32_t LLamaModel::layerCount(std::string const &modelPath) const
 }
 
 #ifdef GGML_USE_VULKAN
-static const char *getVulkanVendorName(uint32_t vendorID) {
+static const char *getVulkanVendorName(uint32_t vendorID)
+{
     switch (vendorID) {
     case 0x10DE: return "nvidia";
     case 0x1002: return "amd";
@@ -738,11 +750,13 @@ bool LLamaModel::usingGPUDevice() const
     return hasDevice;
 }
 
-const char *LLamaModel::backendName() const {
+const char *LLamaModel::backendName() const
+{
     return d_ptr->backend_name;
 }
 
-const char *LLamaModel::gpuDeviceName() const {
+const char *LLamaModel::gpuDeviceName() const
+{
     if (usingGPUDevice()) {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
         return d_ptr->deviceName.c_str();
@@ -768,13 +782,15 @@ void llama_batch_add(
     batch.n_tokens++;
 }
 
-static void batch_add_seq(llama_batch &batch, const std::vector<LLModel::Token> &tokens, int seq_id) {
+static void batch_add_seq(llama_batch &batch, const std::vector<LLModel::Token> &tokens, int seq_id)
+{
     for (unsigned i = 0; i < tokens.size(); i++) {
         llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
     }
 }
 
-size_t LLamaModel::embeddingSize() const {
+size_t LLamaModel::embeddingSize() const
+{
     return llama_n_embd(d_ptr->model);
 }
@@ -894,12 +910,14 @@ void LLamaModel::embed(
 // MD5 hash of "nomic empty"
 static const char EMPTY_PLACEHOLDER[] = "24df574ea1c998de59d5be15e769658e";
 
-auto product(double a) -> std::function<double(double)> {
+auto product(double a) -> std::function<double(double)>
+{
     return [a](double b) { return a * b; };
 }
 
 template <typename T>
-double getL2NormScale(T *start, T *end) {
+double getL2NormScale(T *start, T *end)
+{
     double magnitude = std::sqrt(std::inner_product(start, end, start, 0.0));
     return 1.0 / std::max(magnitude, 1e-12);
 }
@@ -1107,19 +1125,23 @@ void LLamaModel::embedInternal(
 #endif
 
 extern "C" {
-DLL_EXPORT bool is_g4a_backend_model_implementation() {
+DLL_EXPORT bool is_g4a_backend_model_implementation()
+{
     return true;
 }
 
-DLL_EXPORT const char *get_model_type() {
+DLL_EXPORT const char *get_model_type()
+{
     return modelType_;
 }
 
-DLL_EXPORT const char *get_build_variant() {
+DLL_EXPORT const char *get_build_variant()
+{
     return GGML_BUILD_VARIANT;
 }
 
-DLL_EXPORT char *get_file_arch(const char *fname) {
+DLL_EXPORT char *get_file_arch(const char *fname)
+{
     char *arch = nullptr;
     std::string archStr;
@@ -1144,11 +1166,13 @@ cleanup:
     return arch;
 }
 
-DLL_EXPORT bool is_arch_supported(const char *arch) {
+DLL_EXPORT bool is_arch_supported(const char *arch)
+{
     return std::find(KNOWN_ARCHES.begin(), KNOWN_ARCHES.end(), std::string(arch)) < KNOWN_ARCHES.end();
 }
 
-DLL_EXPORT LLModel *construct() {
+DLL_EXPORT LLModel *construct()
+{
     llama_log_set(llama_log_callback, nullptr);
     return new LLamaModel;
 }

View File

@@ -92,17 +92,20 @@ LLModel::Implementation::Implementation(Implementation &&o)
     o.m_dlhandle = nullptr;
 }
 
-LLModel::Implementation::~Implementation() {
+LLModel::Implementation::~Implementation()
+{
     delete m_dlhandle;
 }
 
-static bool isImplementation(const Dlhandle &dl) {
+static bool isImplementation(const Dlhandle &dl)
+{
     return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
 }
 
 // Add the CUDA Toolkit to the DLL search path on Windows.
 // This is necessary for chat.exe to find CUDA when started from Qt Creator.
-static void addCudaSearchPath() {
+static void addCudaSearchPath()
+{
 #ifdef _WIN32
     if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) {
         auto libDir = std::wstring(cudaPath) + L"\\bin";
@@ -114,7 +117,8 @@ static void addCudaSearchPath() {
 #endif
 }
 
-const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList() {
+const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList()
+{
     if (cpu_supports_avx() == 0) {
         throw std::runtime_error("CPU does not support AVX");
     }
@@ -169,14 +173,16 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
     return *libs;
 }
 
-static std::string applyCPUVariant(const std::string &buildVariant) {
+static std::string applyCPUVariant(const std::string &buildVariant)
+{
     if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
         return buildVariant + "-avxonly";
     }
     return buildVariant;
 }
 
-const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
+const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant)
+{
     bool buildVariantMatched = false;
     std::optional<std::string> archName;
     for (const auto& i : implementationList()) {
@@ -200,7 +206,8 @@ const LLModel::Implementation* LLModel::Implementation::implementation(const cha
     throw BadArchError(std::move(*archName));
 }
 
-LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx) {
+LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx)
+{
     std::vector<std::string> desiredBackends;
     if (backend != "auto") {
         desiredBackends.push_back(backend);
@@ -240,7 +247,8 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, const
     throw MissingImplementationError("Could not find any implementations for backend: " + backend);
 }
 
-LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::string> &backend) {
+LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::string> &backend)
+{
     static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;
 
     const std::vector<Implementation> *impls;
@@ -284,7 +292,8 @@ LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::
     return nullptr;
 }
 
-std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired) {
+std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired)
+{
     std::vector<LLModel::GPUDevice> devices;
 #ifndef __APPLE__
     static const std::string backends[] = {"kompute", "cuda"};
@@ -299,33 +308,40 @@ std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(siz
     return devices;
 }
 
-int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) {
+int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath)
+{
     auto *llama = constructGlobalLlama();
     return llama ? llama->maxContextLength(modelPath) : -1;
 }
 
-int32_t LLModel::Implementation::layerCount(const std::string &modelPath) {
+int32_t LLModel::Implementation::layerCount(const std::string &modelPath)
+{
     auto *llama = constructGlobalLlama();
     return llama ? llama->layerCount(modelPath) : -1;
 }
 
-bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath) {
+bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath)
+{
     auto *llama = constructGlobalLlama();
     return llama && llama->isEmbeddingModel(modelPath);
 }
 
-void LLModel::Implementation::setImplementationsSearchPath(const std::string& path) {
+void LLModel::Implementation::setImplementationsSearchPath(const std::string& path)
+{
     s_implementations_search_path = path;
 }
 
-const std::string& LLModel::Implementation::implementationsSearchPath() {
+const std::string& LLModel::Implementation::implementationsSearchPath()
+{
     return s_implementations_search_path;
 }
 
-bool LLModel::Implementation::hasSupportedCPU() {
+bool LLModel::Implementation::hasSupportedCPU()
+{
     return cpu_supports_avx() != 0;
 }
 
-int LLModel::Implementation::cpuSupportsAVX2() {
+int LLModel::Implementation::cpuSupportsAVX2()
+{
     return cpu_supports_avx2();
 }

View File

@@ -20,7 +20,8 @@ struct LLModelWrapper {
     ~LLModelWrapper() { delete llModel; }
 };
 
-llmodel_model llmodel_model_create(const char *model_path) {
+llmodel_model llmodel_model_create(const char *model_path)
+{
     const char *error;
     auto fres = llmodel_model_create2(model_path, "auto", &error);
     if (!fres) {
@@ -29,7 +30,8 @@ llmodel_model llmodel_model_create(const char *model_path) {
     return fres;
 }
 
-static void llmodel_set_error(const char **errptr, const char *message) {
+static void llmodel_set_error(const char **errptr, const char *message)
+{
     thread_local static std::string last_error_message;
     if (errptr) {
         last_error_message = message;
@@ -37,7 +39,8 @@ static void llmodel_set_error(const char **errptr, const char *message) {
     }
 }
 
-llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error) {
+llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error)
+{
     LLModel *llModel;
     try {
         llModel = LLModel::Implementation::construct(model_path, backend);
@@ -51,7 +54,8 @@ llmodel_model llmodel_model_create2(const char *model_path, const char *backend,
     return wrapper;
 }
 
-void llmodel_model_destroy(llmodel_model model) {
+void llmodel_model_destroy(llmodel_model model)
+{
     delete static_cast<LLModelWrapper *>(model);
 }

View File

@@ -14,7 +14,8 @@
 #include <vector>
 
 // TODO(cebtenzzre): replace this with llama_kv_cache_seq_shift for llamamodel (GPT-J needs this as-is)
-void LLModel::recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate) {
+void LLModel::recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate)
+{
     int n_keep = shouldAddBOS();
     const int32_t n_discard = (promptCtx.n_ctx - n_keep) * promptCtx.contextErase;
@@ -43,7 +44,8 @@ stop_generating:
     recalculate(false);
 }
 
-static bool parsePromptTemplate(const std::string &tmpl, std::vector<std::smatch> &placeholders, std::string &err) {
+static bool parsePromptTemplate(const std::string &tmpl, std::vector<std::smatch> &placeholders, std::string &err)
+{
     static const std::regex placeholderRegex(R"(%[1-2](?![0-9]))");
     auto it = std::sregex_iterator(tmpl.begin(), tmpl.end(), placeholderRegex);

View File

@@ -38,7 +38,8 @@ struct llm_kv_cache {
     }
 };
 
-inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads) {
+inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads)
+{
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
     if (plan.work_size > 0) {
         buf.resize(plan.work_size);

View File

@@ -8,7 +8,8 @@
 #include <regex>
 #include <utility>
 
-void replace(std::string & str, const std::string & needle, const std::string & replacement) {
+void replace(std::string & str, const std::string & needle, const std::string & replacement)
+{
     size_t pos = 0;
     while ((pos = str.find(needle, pos)) != std::string::npos) {
         str.replace(pos, needle.length(), replacement);
@@ -16,7 +17,8 @@ void replace(std::string & str, const std::string & needle, const std::string &
     }
 }
 
-std::map<std::string, int32_t> json_parse(const std::string & fname) {
+std::map<std::string, int32_t> json_parse(const std::string & fname)
+{
     std::map<std::string, int32_t> result;
 
     // read file into string
@@ -107,7 +109,8 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
     return result;
 }
 
-std::vector<gpt_vocab::id> gpt_tokenize_inner(const gpt_vocab & vocab, const std::string & text) {
+std::vector<gpt_vocab::id> gpt_tokenize_inner(const gpt_vocab & vocab, const std::string & text)
+{
     std::vector<std::string> words;
 
     // first split the text into words
@@ -162,12 +165,14 @@ std::vector<gpt_vocab::id> gpt_tokenize_inner(const gpt_vocab & vocab, const std
     return tokens;
 }
 
-std::string regex_escape(const std::string &s) {
+std::string regex_escape(const std::string &s)
+{
     static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
     return std::regex_replace(s, metacharacters, "\\$&");
 }
 
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text)
+{
     // Generate the subpattern from the special_tokens vector if it's not empty
     if (!vocab.special_tokens.empty()) {
         std::vector<gpt_vocab::id> out;
@@ -203,7 +208,8 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 }
 
-bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
+bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab)
+{
     printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
 
     vocab.token_to_id = ::json_parse(fname);

View File

@@ -14,7 +14,8 @@
 //
 // General purpose inline functions
 //
-constexpr inline unsigned long long operator ""_MiB(unsigned long long bytes) {
+constexpr inline unsigned long long operator ""_MiB(unsigned long long bytes)
+{
     return bytes*1024*1024;
 }