Mirror of https://github.com/nomic-ai/gpt4all.git (synced 2025-08-04 17:42:25 +00:00)

commit 595501fcde (parent 9808be5e73)
backend: move more stuff into LlamaCppBackend

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
@@ -138,7 +138,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
 endforeach()
 
 add_library(llmodel
-    llmodel.h llmodel.cpp llmodel_shared.cpp
+    llmodel.h llamacpp_backend.cpp
     llmodel_c.h llmodel_c.cpp
     dlhandle.cpp
 )
@@ -1,20 +1,46 @@
-#include "llmodel.h"
+#include "llamacpp_backend.h"
+
+#include "dlhandle.h"
 
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
 #include <functional>
 #include <iostream>
+#include <iterator>
+#include <memory>
 #include <optional>
 #include <regex>
 #include <sstream>
 #include <stdexcept>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
+#ifdef _WIN32
+# define WIN32_LEAN_AND_MEAN
+# ifndef NOMINMAX
+# define NOMINMAX
+# endif
+# include <windows.h>
+#endif
+
+#ifdef _MSC_VER
+# include <intrin.h>
+#endif
+
+#if defined(__APPLE__) && defined(__aarch64__)
+# include "sysinfo.h" // for getSystemTotalRAMInBytes
+#endif
+
+namespace fs = std::filesystem;
 namespace ranges = std::ranges;
 
 
 static bool parsePromptTemplate(const std::string &tmpl, std::vector<std::smatch> &placeholders, std::string &err)
 {
     static const std::regex placeholderRegex(R"(%[1-2](?![0-9]))");
@@ -38,15 +64,16 @@ static bool parsePromptTemplate(const std::string &tmpl, std::vector<std::smatch
     return true;
 }
 
-void LLModel::prompt(const std::string &prompt,
+void LlamaCppBackend::prompt(
+    const std::string &prompt,
     const std::string &promptTemplate,
     std::function<bool(int32_t)> promptCallback,
     std::function<bool(int32_t, const std::string&)> responseCallback,
     bool allowContextShift,
     PromptContext &promptCtx,
     bool special,
-    std::string *fakeReply)
-{
+    std::string *fakeReply
+) {
     if (!isModelLoaded()) {
         std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
         return;
@@ -153,11 +180,13 @@ void LLModel::prompt(const std::string &prompt,
 }
 
 // returns false on error
-bool LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
+bool LlamaCppBackend::decodePrompt(
+    std::function<bool(int32_t)> promptCallback,
     std::function<bool(int32_t, const std::string&)> responseCallback,
     bool allowContextShift,
     PromptContext &promptCtx,
-    std::vector<Token> embd_inp) {
+    std::vector<Token> embd_inp
+) {
     if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
         responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
         std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
@@ -224,9 +253,11 @@ static std::string::size_type stringsOverlap(const std::string &s, const std::st
     return std::string::npos;
 }
 
-void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
+void LlamaCppBackend::generateResponse(
+    std::function<bool(int32_t, const std::string&)> responseCallback,
     bool allowContextShift,
-    PromptContext &promptCtx) {
+    PromptContext &promptCtx
+) {
     static const char *stopSequences[] {
         "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context",
     };
@@ -371,31 +402,327 @@ void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)>
     promptCtx.n_past -= cachedTokens.size();
 }
 
-void LLModel::embed(
-    const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
-    size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb
-) {
-    (void)texts;
-    (void)embeddings;
-    (void)prefix;
-    (void)dimensionality;
-    (void)tokenCount;
-    (void)doMean;
-    (void)atlas;
-    (void)cancelCb;
-    throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
-}
-
-void LLModel::embed(
-    const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
-    bool doMean, bool atlas
-) {
-    (void)texts;
-    (void)embeddings;
-    (void)isRetrieval;
-    (void)dimensionality;
-    (void)tokenCount;
-    (void)doMean;
-    (void)atlas;
-    throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
-}
+/* *********************************
+ * Backend implementation management
+ * ********************************* */
+
+#ifndef __APPLE__
+static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"};
+#elif defined(__aarch64__)
+static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"};
+#else
+static const std::string DEFAULT_BACKENDS[] = {"cpu"};
+#endif
+
+std::string s_implementations_search_path = ".";
+
+#if !(defined(__x86_64__) || defined(_M_X64))
+// irrelevant on non-x86_64
+#define cpu_supports_avx() -1
+#define cpu_supports_avx2() -1
+#elif defined(_MSC_VER)
+// MSVC
+static int get_cpu_info(int func_id, int reg_id) {
+    int info[4];
+    __cpuid(info, func_id);
+    return info[reg_id];
+}
+
+// AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX
+#define cpu_supports_avx() !!(get_cpu_info(1, 2) & (1 << 28))
+// AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX
+#define cpu_supports_avx2() !!(get_cpu_info(7, 1) & (1 << 5))
+#else
+// gcc/clang
+#define cpu_supports_avx() !!__builtin_cpu_supports("avx")
+#define cpu_supports_avx2() !!__builtin_cpu_supports("avx2")
+#endif
+
+LlamaCppBackend::Implementation::Implementation(Dlhandle &&dlhandle_)
+    : m_dlhandle(new Dlhandle(std::move(dlhandle_))) {
+    auto get_model_type = m_dlhandle->get<const char *()>("get_model_type");
+    assert(get_model_type);
+    m_modelType = get_model_type();
+    auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
+    assert(get_build_variant);
+    m_buildVariant = get_build_variant();
+    m_getFileArch = m_dlhandle->get<char *(const char *)>("get_file_arch");
+    assert(m_getFileArch);
+    m_isArchSupported = m_dlhandle->get<bool(const char *)>("is_arch_supported");
+    assert(m_isArchSupported);
+    m_construct = m_dlhandle->get<LlamaCppBackend *()>("construct");
+    assert(m_construct);
+}
+
+LlamaCppBackend::Implementation::Implementation(Implementation &&o)
+    : m_getFileArch(o.m_getFileArch)
+    , m_isArchSupported(o.m_isArchSupported)
+    , m_construct(o.m_construct)
+    , m_modelType(o.m_modelType)
+    , m_buildVariant(o.m_buildVariant)
+    , m_dlhandle(o.m_dlhandle) {
+    o.m_dlhandle = nullptr;
+}
+
+LlamaCppBackend::Implementation::~Implementation()
+{
+    delete m_dlhandle;
+}
+
+static bool isImplementation(const Dlhandle &dl)
+{
+    return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
+}
+
+// Add the CUDA Toolkit to the DLL search path on Windows.
+// This is necessary for chat.exe to find CUDA when started from Qt Creator.
+static void addCudaSearchPath()
+{
+#ifdef _WIN32
+    if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) {
+        auto libDir = std::wstring(cudaPath) + L"\\bin";
+        if (!AddDllDirectory(libDir.c_str())) {
+            auto err = GetLastError();
+            std::wcerr << L"AddDllDirectory(\"" << libDir << L"\") failed with error 0x" << std::hex << err << L"\n";
+        }
+    }
+#endif
+}
+
+const std::vector<LlamaCppBackend::Implementation> &LlamaCppBackend::Implementation::implementationList()
+{
+    if (cpu_supports_avx() == 0) {
+        throw std::runtime_error("CPU does not support AVX");
+    }
+
+    // NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the
+    // individual models without the cleanup of the static list interfering
+    static auto* libs = new std::vector<Implementation>([] () {
+        std::vector<Implementation> fres;
+
+        addCudaSearchPath();
+
+        std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)";
+        if (cpu_supports_avx2() == 0) {
+            impl_name_re += "-avxonly";
+        }
+        std::regex re(impl_name_re);
+        auto search_in_directory = [&](const std::string& paths) {
+            std::stringstream ss(paths);
+            std::string path;
+            // Split the paths string by the delimiter and process each path.
+            while (std::getline(ss, path, ';')) {
+                std::u8string u8_path(path.begin(), path.end());
+                // Iterate over all libraries
+                for (const auto &f : fs::directory_iterator(u8_path)) {
+                    const fs::path &p = f.path();
+
+                    if (p.extension() != LIB_FILE_EXT) continue;
+                    if (!std::regex_search(p.stem().string(), re)) {
+                        std::cerr << "did not match regex: " << p.stem().string() << "\n";
+                        continue;
+                    }
+
+                    // Add to list if model implementation
+                    Dlhandle dl;
+                    try {
+                        dl = Dlhandle(p);
+                    } catch (const Dlhandle::Exception &e) {
+                        std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n";
+                        continue;
+                    }
+                    if (!isImplementation(dl)) {
+                        std::cerr << "Not an implementation: " << p.filename().string() << "\n";
+                        continue;
+                    }
+                    fres.emplace_back(Implementation(std::move(dl)));
+                }
+            }
+        };
+
+        search_in_directory(s_implementations_search_path);
+
+        return fres;
+    }());
+    // Return static result
+    return *libs;
+}
+
+static std::string applyCPUVariant(const std::string &buildVariant)
+{
+    if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
+        return buildVariant + "-avxonly";
+    }
+    return buildVariant;
+}
+
+const LlamaCppBackend::Implementation* LlamaCppBackend::Implementation::implementation(
+    const char *fname,
+    const std::string& buildVariant
+) {
+    bool buildVariantMatched = false;
+    std::optional<std::string> archName;
+    for (const auto& i : implementationList()) {
+        if (buildVariant != i.m_buildVariant) continue;
+        buildVariantMatched = true;
+
+        char *arch = i.m_getFileArch(fname);
+        if (!arch) continue;
+        archName = arch;
+
+        bool archSupported = i.m_isArchSupported(arch);
+        free(arch);
+        if (archSupported) return &i;
+    }
+
+    if (!buildVariantMatched)
+        return nullptr;
+    if (!archName)
+        throw UnsupportedModelError("Unsupported file format");
+
+    throw BadArchError(std::move(*archName));
+}
+
+LlamaCppBackend *LlamaCppBackend::Implementation::construct(
+    const std::string &modelPath,
+    const std::string &backend,
+    int n_ctx
+) {
+    std::vector<std::string> desiredBackends;
+    if (backend != "auto") {
+        desiredBackends.push_back(backend);
+    } else {
+        desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
+    }
+
+    for (const auto &desiredBackend: desiredBackends) {
+        const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));
+
+        if (impl) {
+            // Construct llmodel implementation
+            auto *fres = impl->m_construct();
+            fres->m_implementation = impl;
+
+#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs
+            /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
+             * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
+             * most (all?) places where this is called, causing underestimation of required
+             * memory. */
+            if (backend == "auto" && desiredBackend == "metal") {
+                // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
+                size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100);
+                if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) {
+                    delete fres;
+                    continue;
+                }
+            }
+#else
+            (void)n_ctx;
+#endif
+
+            return fres;
+        }
+    }
+
+    throw MissingImplementationError("Could not find any implementations for backend: " + backend);
+}
+
+LlamaCppBackend *LlamaCppBackend::Implementation::constructGlobalLlama(const std::optional<std::string> &backend)
+{
+    static std::unordered_map<std::string, std::unique_ptr<LlamaCppBackend>> implCache;
+
+    const std::vector<Implementation> *impls;
+    try {
+        impls = &implementationList();
+    } catch (const std::runtime_error &e) {
+        std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
+        return nullptr;
+    }
+
+    std::vector<std::string> desiredBackends;
+    if (backend) {
+        desiredBackends.push_back(backend.value());
+    } else {
+        desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
+    }
+
+    const Implementation *impl = nullptr;
+
+    for (const auto &desiredBackend: desiredBackends) {
+        auto cacheIt = implCache.find(desiredBackend);
+        if (cacheIt != implCache.end())
+            return cacheIt->second.get(); // cached
+
+        for (const auto &i: *impls) {
+            if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
+                impl = &i;
+                break;
+            }
+        }
+
+        if (impl) {
+            auto *fres = impl->m_construct();
+            fres->m_implementation = impl;
+            implCache[desiredBackend] = std::unique_ptr<LlamaCppBackend>(fres);
+            return fres;
+        }
+    }
+
+    std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default")
+              << "\n";
+    return nullptr;
+}
+
+std::vector<LlamaCppBackend::GPUDevice> LlamaCppBackend::Implementation::availableGPUDevices(size_t memoryRequired)
+{
+    std::vector<LlamaCppBackend::GPUDevice> devices;
+#ifndef __APPLE__
+    static const std::string backends[] = {"kompute", "cuda"};
+    for (const auto &backend: backends) {
+        auto *llama = constructGlobalLlama(backend);
+        if (llama) {
+            auto backendDevs = llama->availableGPUDevices(memoryRequired);
+            devices.insert(devices.end(), backendDevs.begin(), backendDevs.end());
+        }
+    }
+#endif
+    return devices;
+}
+
+int32_t LlamaCppBackend::Implementation::maxContextLength(const std::string &modelPath)
+{
+    auto *llama = constructGlobalLlama();
+    return llama ? llama->maxContextLength(modelPath) : -1;
+}
+
+int32_t LlamaCppBackend::Implementation::layerCount(const std::string &modelPath)
+{
+    auto *llama = constructGlobalLlama();
+    return llama ? llama->layerCount(modelPath) : -1;
+}
+
+bool LlamaCppBackend::Implementation::isEmbeddingModel(const std::string &modelPath)
+{
+    auto *llama = constructGlobalLlama();
+    return llama && llama->isEmbeddingModel(modelPath);
+}
+
+void LlamaCppBackend::Implementation::setImplementationsSearchPath(const std::string& path)
+{
+    s_implementations_search_path = path;
+}
+
+const std::string& LlamaCppBackend::Implementation::implementationsSearchPath()
+{
+    return s_implementations_search_path;
+}
+
+bool LlamaCppBackend::Implementation::hasSupportedCPU()
+{
+    return cpu_supports_avx() != 0;
+}
+
+int LlamaCppBackend::Implementation::cpuSupportsAVX2()
+{
+    return cpu_supports_avx2();
+}
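The AVX gate above decides which build variant of each backend library gets loaded. As a standalone illustration of the same check, here is a minimal sketch assuming an x86-64 gcc/clang toolchain (it mirrors only the non-MSVC branch of the macros and the applyCPUVariant() helper shown above; it is not part of the commit):

    #include <iostream>
    #include <string>

    // Mirrors the gcc/clang branch of cpu_supports_avx()/cpu_supports_avx2() above.
    static bool cpuSupportsAVX()  { return __builtin_cpu_supports("avx"); }
    static bool cpuSupportsAVX2() { return __builtin_cpu_supports("avx2"); }

    // Same decision applyCPUVariant() makes: prefer the plain build, fall back to the
    // "-avxonly" build of a backend when the CPU lacks AVX2 (metal has no such variant).
    static std::string applyCPUVariant(const std::string &buildVariant)
    {
        if (buildVariant != "metal" && !cpuSupportsAVX2())
            return buildVariant + "-avxonly";
        return buildVariant;
    }

    int main()
    {
        if (!cpuSupportsAVX()) {
            std::cerr << "CPU does not support AVX\n";
            return 1;
        }
        std::cout << "kompute variant to load: " << applyCPUVariant("kompute") << "\n";
        return 0;
    }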
gpt4all-backend/llamacpp_backend.h — new file, 187 lines (@@ -0,0 +1,187 @@):

#pragma once

#include "llmodel.h"

class LlamaCppBackend : public EmbLLModel {
public:
    class BadArchError: public std::runtime_error {
    public:
        BadArchError(std::string arch)
            : runtime_error("Unsupported model architecture: " + arch)
            , m_arch(std::move(arch))
            {}

        const std::string &arch() const noexcept { return m_arch; }

    private:
        std::string m_arch;
    };

    class MissingImplementationError: public std::runtime_error {
    public:
        using std::runtime_error::runtime_error;
    };

    class UnsupportedModelError: public std::runtime_error {
    public:
        using std::runtime_error::runtime_error;
    };

    struct GPUDevice {
        const char *backend;
        int index;
        int type;
        size_t heapSize;
        std::string name;
        std::string vendor;

        GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor):
            backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
            vendor(std::move(vendor)) {}

        std::string selectionName() const
        {
            assert(backend == "cuda"s || backend == "kompute"s);
            return backendName() + ": " + name;
        }

        std::string backendName() const { return backendIdToName(backend); }

        static std::string backendIdToName(const std::string &backend) { return s_backendNames.at(backend); }

        static std::string updateSelectionName(const std::string &name) {
            if (name == "Auto" || name == "CPU" || name == "Metal")
                return name;
            auto it = std::find_if(s_backendNames.begin(), s_backendNames.end(), [&name](const auto &entry) {
                return name.starts_with(entry.second + ": ");
            });
            if (it != s_backendNames.end())
                return name;
            return "Vulkan: " + name; // previously, there were only Vulkan devices
        }

    private:
        static inline const std::unordered_map<std::string, std::string> s_backendNames {
            {"cpu", "CPU"}, {"metal", "Metal"}, {"cuda", "CUDA"}, {"kompute", "Vulkan"},
        };
    };

    class Implementation {
    public:
        Implementation(const Implementation &) = delete;
        Implementation(Implementation &&);
        ~Implementation();

        std::string_view modelType() const { return m_modelType; }
        std::string_view buildVariant() const { return m_buildVariant; }

        static LlamaCppBackend *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
        static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
        static int32_t maxContextLength(const std::string &modelPath);
        static int32_t layerCount(const std::string &modelPath);
        static bool isEmbeddingModel(const std::string &modelPath);
        static void setImplementationsSearchPath(const std::string &path);
        static const std::string &implementationsSearchPath();
        static bool hasSupportedCPU();
        // 0 for no, 1 for yes, -1 for non-x86_64
        static int cpuSupportsAVX2();

    private:
        Implementation(Dlhandle &&);

        static const std::vector<Implementation> &implementationList();
        static const Implementation *implementation(const char *fname, const std::string &buildVariant);
        static LlamaCppBackend *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);

        char *(*m_getFileArch)(const char *fname);
        bool (*m_isArchSupported)(const char *arch);
        LlamaCppBackend *(*m_construct)();

        std::string_view m_modelType;
        std::string_view m_buildVariant;
        Dlhandle *m_dlhandle;
    };

    using ProgressCallback = std::function<bool(float progress)>;

    virtual bool isModelBlacklisted(const std::string &modelPath) const = 0;
    virtual bool isEmbeddingModel(const std::string &modelPath) const = 0;
    virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;

    void prompt(const std::string &prompt,
                const std::string &promptTemplate,
                std::function<bool(int32_t)> promptCallback,
                std::function<bool(int32_t, const std::string&)> responseCallback,
                bool allowContextShift,
                PromptContext &ctx,
                bool special = false,
                std::string *fakeReply = nullptr) override;

    virtual void setThreadCount(int32_t n_threads) { (void)n_threads; }
    virtual int32_t threadCount() const { return 1; }

    const Implementation &implementation() const { return *m_implementation; }

    virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const
    {
        (void)memoryRequired;
        return {};
    }

    virtual bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const
    {
        (void)memoryRequired;
        (void)name;
        return false;
    }

    virtual bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const
    {
        (void)device;
        if (unavail_reason) {
            *unavail_reason = "model has no GPU support";
        }
        return false;
    }

    virtual bool usingGPUDevice() const { return false; }
    virtual const char *backendName() const { return "cpu"; }
    virtual const char *gpuDeviceName() const { return nullptr; }

    void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; }

protected:
    virtual std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special = false) = 0;
    virtual bool isSpecialToken(Token id) const = 0;
    virtual std::string tokenToString(Token id) const = 0;
    virtual Token sampleToken(PromptContext &ctx) const = 0;
    virtual bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const = 0;
    virtual void shiftContext(PromptContext &promptCtx) = 0;
    virtual int32_t contextLength() const = 0;
    virtual const std::vector<Token> &endTokens() const = 0;
    virtual bool shouldAddBOS() const = 0;

    virtual int32_t maxContextLength(std::string const &modelPath) const = 0;
    virtual int32_t layerCount(std::string const &modelPath) const = 0;

    static bool staticProgressCallback(float progress, void* ctx)
    {
        LlamaCppBackend *model = static_cast<LlamaCppBackend *>(ctx);
        if (model && model->m_progressCallback)
            return model->m_progressCallback(progress);
        return true;
    }

    bool decodePrompt(std::function<bool(int32_t)> promptCallback,
                      std::function<bool(int32_t, const std::string&)> responseCallback,
                      bool allowContextShift,
                      PromptContext &promptCtx,
                      std::vector<Token> embd_inp);
    void generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
                          bool allowContextShift,
                          PromptContext &promptCtx);

    const Implementation *m_implementation = nullptr;
    ProgressCallback m_progressCallback;
    Token m_tokenize_last_token = -1;
};
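For orientation, here is a hedged sketch of how a caller might drive the refactored API end to end. The model path and prompt template are hypothetical and error handling (including the exceptions construct() can throw) is omitted; the signatures come from the header above:

    #include "llamacpp_backend.h"

    #include <iostream>
    #include <string>

    int main()
    {
        // Hypothetical locations; construct() scans the search path for a matching llamacpp-* library.
        LlamaCppBackend::Implementation::setImplementationsSearchPath(".");
        LlamaCppBackend *model = LlamaCppBackend::Implementation::construct("model.gguf", "auto", /*n_ctx*/ 2048);
        if (!model->loadModel("model.gguf", /*n_ctx*/ 2048, /*ngl*/ 100))
            return 1;

        LlamaCppBackend::PromptContext ctx;
        model->prompt("Why is the sky blue?", "### Human:\n%1\n### Assistant:\n",
                      [](int32_t) { return true; },           // per prompt token: keep going
                      [](int32_t, const std::string &piece) { // per response token: print it
                          std::cout << piece;
                          return true;
                      },
                      /*allowContextShift*/ true, ctx);
        delete model;
        return 0;
    }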
@@ -378,7 +378,7 @@ bool LlamaCppBackendImpl::loadModel(const std::string &modelPath, int n_ctx, int
     d_ptr->model_params.use_mlock = params.use_mlock;
 #endif
 
-    d_ptr->model_params.progress_callback = &LLModel::staticProgressCallback;
+    d_ptr->model_params.progress_callback = &LlamaCppBackend::staticProgressCallback;
     d_ptr->model_params.progress_callback_user_data = this;
 
     d_ptr->backend_name = "cpu"; // default
@@ -659,7 +659,7 @@ static const char *getVulkanVendorName(uint32_t vendorID)
 }
 #endif
 
-std::vector<LLModel::GPUDevice> LlamaCppBackendImpl::availableGPUDevices(size_t memoryRequired) const
+std::vector<LlamaCppBackendImpl::GPUDevice> LlamaCppBackendImpl::availableGPUDevices(size_t memoryRequired) const
 {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
     size_t count = 0;
@@ -675,7 +675,7 @@ std::vector<LLModel::GPUDevice> LlamaCppBackendImpl::availableGPUDevices(size_t
 #endif
 
     if (lcppDevices) {
-        std::vector<LLModel::GPUDevice> devices;
+        std::vector<GPUDevice> devices;
         devices.reserve(count);
 
         for (size_t i = 0; i < count; ++i) {
@@ -909,7 +909,7 @@ void LlamaCppBackendImpl::embed(
 
 void LlamaCppBackendImpl::embed(
     const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
-    size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb
+    size_t *tokenCount, bool doMean, bool atlas, EmbLLModel::EmbedCancelCallback *cancelCb
 ) {
     if (!d_ptr->model)
         throw std::logic_error("no model is loaded");
@@ -967,7 +967,7 @@ double getL2NormScale(T *start, T *end)
 
 void LlamaCppBackendImpl::embedInternal(
     const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
-    size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb, const EmbModelSpec *spec
+    size_t *tokenCount, bool doMean, bool atlas, EmbLLModel::EmbedCancelCallback *cancelCb, const EmbModelSpec *spec
 ) {
     typedef std::vector<LLModel::Token> TokenString;
     static constexpr int32_t atlasMaxLength = 8192;
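The loadModel hunk above wires llama.cpp's C-style progress callback through the static trampoline that now lives in LlamaCppBackend (a function pointer plus `this` passed as user data). The pattern in isolation, as a self-contained sketch with simplified, hypothetical names rather than the real llama.cpp structures:

    #include <functional>
    #include <iostream>

    // Stand-in for a C-style callback slot: a plain function pointer plus a user-data pointer.
    using c_progress_callback = bool (*)(float progress, void *user_data);

    struct Backend {
        std::function<bool(float)> m_progressCallback;

        // Same shape as LlamaCppBackend::staticProgressCallback: recover `this` from user_data
        // and forward to the std::function, defaulting to "keep going" when none is set.
        static bool staticProgressCallback(float progress, void *ctx)
        {
            auto *self = static_cast<Backend *>(ctx);
            if (self && self->m_progressCallback)
                return self->m_progressCallback(progress);
            return true;
        }
    };

    int main()
    {
        Backend b;
        b.m_progressCallback = [](float p) { std::cout << "loaded " << p * 100 << "%\n"; return true; };

        // What the loadModel hunk does with d_ptr->model_params:
        c_progress_callback cb = &Backend::staticProgressCallback;
        void *user_data = &b;

        cb(0.5f, user_data); // the loader would invoke this periodically during model loading
        return 0;
    }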
@@ -1,10 +1,10 @@
+#pragma once
+
 #ifndef LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #error This file is NOT meant to be included outside of llamacpp_backend_impl.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #endif
-#ifndef LLAMACPP_BACKEND_IMPL_H
-#define LLAMACPP_BACKEND_IMPL_H
 
-#include "llmodel.h"
+#include "llamacpp_backend.h"
 
 #include <memory>
 #include <string>
@@ -13,7 +13,7 @@
 struct LlamaPrivate;
 struct EmbModelSpec;
 
-class LlamaCppBackendImpl : public LLModel {
+class LlamaCppBackendImpl : public LlamaCppBackend {
 public:
     LlamaCppBackendImpl();
     ~LlamaCppBackendImpl();
@@ -68,5 +68,3 @@ protected:
         size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb,
         const EmbModelSpec *spec);
 };
-
-#endif // LLAMACPP_BACKEND_IMPL_H
Deleted file (@@ -1,350 +0,0 @@) — the old shared LLModel implementation source removed from the CMake sources above; its contents moved into llamacpp_backend.cpp:

#include "llmodel.h"

#include "dlhandle.h"

#include <cassert>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <iterator>
#include <memory>
#include <optional>
#include <regex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
#endif

#ifdef _MSC_VER
# include <intrin.h>
#endif

#if defined(__APPLE__) && defined(__aarch64__)
# include "sysinfo.h" // for getSystemTotalRAMInBytes
#endif

namespace fs = std::filesystem;

#ifndef __APPLE__
static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"};
#elif defined(__aarch64__)
static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"};
#else
static const std::string DEFAULT_BACKENDS[] = {"cpu"};
#endif

std::string s_implementations_search_path = ".";

#if !(defined(__x86_64__) || defined(_M_X64))
// irrelevant on non-x86_64
#define cpu_supports_avx() -1
#define cpu_supports_avx2() -1
#elif defined(_MSC_VER)
// MSVC
static int get_cpu_info(int func_id, int reg_id) {
    int info[4];
    __cpuid(info, func_id);
    return info[reg_id];
}

// AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX
#define cpu_supports_avx() !!(get_cpu_info(1, 2) & (1 << 28))
// AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX
#define cpu_supports_avx2() !!(get_cpu_info(7, 1) & (1 << 5))
#else
// gcc/clang
#define cpu_supports_avx() !!__builtin_cpu_supports("avx")
#define cpu_supports_avx2() !!__builtin_cpu_supports("avx2")
#endif

LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
    : m_dlhandle(new Dlhandle(std::move(dlhandle_))) {
    auto get_model_type = m_dlhandle->get<const char *()>("get_model_type");
    assert(get_model_type);
    m_modelType = get_model_type();
    auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
    assert(get_build_variant);
    m_buildVariant = get_build_variant();
    m_getFileArch = m_dlhandle->get<char *(const char *)>("get_file_arch");
    assert(m_getFileArch);
    m_isArchSupported = m_dlhandle->get<bool(const char *)>("is_arch_supported");
    assert(m_isArchSupported);
    m_construct = m_dlhandle->get<LLModel *()>("construct");
    assert(m_construct);
}

LLModel::Implementation::Implementation(Implementation &&o)
    : m_getFileArch(o.m_getFileArch)
    , m_isArchSupported(o.m_isArchSupported)
    , m_construct(o.m_construct)
    , m_modelType(o.m_modelType)
    , m_buildVariant(o.m_buildVariant)
    , m_dlhandle(o.m_dlhandle) {
    o.m_dlhandle = nullptr;
}

LLModel::Implementation::~Implementation()
{
    delete m_dlhandle;
}

static bool isImplementation(const Dlhandle &dl)
{
    return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
}

// Add the CUDA Toolkit to the DLL search path on Windows.
// This is necessary for chat.exe to find CUDA when started from Qt Creator.
static void addCudaSearchPath()
{
#ifdef _WIN32
    if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) {
        auto libDir = std::wstring(cudaPath) + L"\\bin";
        if (!AddDllDirectory(libDir.c_str())) {
            auto err = GetLastError();
            std::wcerr << L"AddDllDirectory(\"" << libDir << L"\") failed with error 0x" << std::hex << err << L"\n";
        }
    }
#endif
}

const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList()
{
    if (cpu_supports_avx() == 0) {
        throw std::runtime_error("CPU does not support AVX");
    }

    // NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the
    // individual models without the cleanup of the static list interfering
    static auto* libs = new std::vector<Implementation>([] () {
        std::vector<Implementation> fres;

        addCudaSearchPath();

        std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)";
        if (cpu_supports_avx2() == 0) {
            impl_name_re += "-avxonly";
        }
        std::regex re(impl_name_re);
        auto search_in_directory = [&](const std::string& paths) {
            std::stringstream ss(paths);
            std::string path;
            // Split the paths string by the delimiter and process each path.
            while (std::getline(ss, path, ';')) {
                std::u8string u8_path(path.begin(), path.end());
                // Iterate over all libraries
                for (const auto &f : fs::directory_iterator(u8_path)) {
                    const fs::path &p = f.path();

                    if (p.extension() != LIB_FILE_EXT) continue;
                    if (!std::regex_search(p.stem().string(), re)) {
                        std::cerr << "did not match regex: " << p.stem().string() << "\n";
                        continue;
                    }

                    // Add to list if model implementation
                    Dlhandle dl;
                    try {
                        dl = Dlhandle(p);
                    } catch (const Dlhandle::Exception &e) {
                        std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n";
                        continue;
                    }
                    if (!isImplementation(dl)) {
                        std::cerr << "Not an implementation: " << p.filename().string() << "\n";
                        continue;
                    }
                    fres.emplace_back(Implementation(std::move(dl)));
                }
            }
        };

        search_in_directory(s_implementations_search_path);

        return fres;
    }());
    // Return static result
    return *libs;
}

static std::string applyCPUVariant(const std::string &buildVariant)
{
    if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
        return buildVariant + "-avxonly";
    }
    return buildVariant;
}

const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant)
{
    bool buildVariantMatched = false;
    std::optional<std::string> archName;
    for (const auto& i : implementationList()) {
        if (buildVariant != i.m_buildVariant) continue;
        buildVariantMatched = true;

        char *arch = i.m_getFileArch(fname);
        if (!arch) continue;
        archName = arch;

        bool archSupported = i.m_isArchSupported(arch);
        free(arch);
        if (archSupported) return &i;
    }

    if (!buildVariantMatched)
        return nullptr;
    if (!archName)
        throw UnsupportedModelError("Unsupported file format");

    throw BadArchError(std::move(*archName));
}

LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx)
{
    std::vector<std::string> desiredBackends;
    if (backend != "auto") {
        desiredBackends.push_back(backend);
    } else {
        desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
    }

    for (const auto &desiredBackend: desiredBackends) {
        const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));

        if (impl) {
            // Construct llmodel implementation
            auto *fres = impl->m_construct();
            fres->m_implementation = impl;

#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs
            /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
             * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
             * most (all?) places where this is called, causing underestimation of required
             * memory. */
            if (backend == "auto" && desiredBackend == "metal") {
                // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
                size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100);
                if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) {
                    delete fres;
                    continue;
                }
            }
#else
            (void)n_ctx;
#endif

            return fres;
        }
    }

    throw MissingImplementationError("Could not find any implementations for backend: " + backend);
}

LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::string> &backend)
{
    static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;

    const std::vector<Implementation> *impls;
    try {
        impls = &implementationList();
    } catch (const std::runtime_error &e) {
        std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
        return nullptr;
    }

    std::vector<std::string> desiredBackends;
    if (backend) {
        desiredBackends.push_back(backend.value());
    } else {
        desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
    }

    const Implementation *impl = nullptr;

    for (const auto &desiredBackend: desiredBackends) {
        auto cacheIt = implCache.find(desiredBackend);
        if (cacheIt != implCache.end())
            return cacheIt->second.get(); // cached

        for (const auto &i: *impls) {
            if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
                impl = &i;
                break;
            }
        }

        if (impl) {
            auto *fres = impl->m_construct();
            fres->m_implementation = impl;
            implCache[desiredBackend] = std::unique_ptr<LLModel>(fres);
            return fres;
        }
    }

    std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") << "\n";
    return nullptr;
}

std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired)
{
    std::vector<LLModel::GPUDevice> devices;
#ifndef __APPLE__
    static const std::string backends[] = {"kompute", "cuda"};
    for (const auto &backend: backends) {
        auto *llama = constructGlobalLlama(backend);
        if (llama) {
            auto backendDevs = llama->availableGPUDevices(memoryRequired);
            devices.insert(devices.end(), backendDevs.begin(), backendDevs.end());
        }
    }
#endif
    return devices;
}

int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath)
{
    auto *llama = constructGlobalLlama();
    return llama ? llama->maxContextLength(modelPath) : -1;
}

int32_t LLModel::Implementation::layerCount(const std::string &modelPath)
{
    auto *llama = constructGlobalLlama();
    return llama ? llama->layerCount(modelPath) : -1;
}

bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath)
{
    auto *llama = constructGlobalLlama();
    return llama && llama->isEmbeddingModel(modelPath);
}

void LLModel::Implementation::setImplementationsSearchPath(const std::string& path)
{
    s_implementations_search_path = path;
}

const std::string& LLModel::Implementation::implementationsSearchPath()
{
    return s_implementations_search_path;
}

bool LLModel::Implementation::hasSupportedCPU()
{
    return cpu_supports_avx() != 0;
}

int LLModel::Implementation::cpuSupportsAVX2()
{
    return cpu_supports_avx2();
}
@ -1,5 +1,4 @@
|
|||||||
#ifndef LLMODEL_H
|
#pragma once
|
||||||
#define LLMODEL_H
|
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
@ -24,104 +23,6 @@ class LLModel {
|
|||||||
public:
|
public:
|
||||||
using Token = int32_t;
|
using Token = int32_t;
|
||||||
|
|
||||||
class BadArchError: public std::runtime_error {
|
|
||||||
public:
|
|
||||||
BadArchError(std::string arch)
|
|
||||||
: runtime_error("Unsupported model architecture: " + arch)
|
|
||||||
, m_arch(std::move(arch))
|
|
||||||
{}
|
|
||||||
|
|
||||||
const std::string &arch() const noexcept { return m_arch; }
|
|
||||||
|
|
||||||
private:
|
|
||||||
std::string m_arch;
|
|
||||||
};
|
|
||||||
|
|
||||||
class MissingImplementationError: public std::runtime_error {
|
|
||||||
public:
|
|
||||||
using std::runtime_error::runtime_error;
|
|
||||||
};
|
|
||||||
|
|
||||||
class UnsupportedModelError: public std::runtime_error {
|
|
||||||
public:
|
|
||||||
using std::runtime_error::runtime_error;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct GPUDevice {
|
|
||||||
const char *backend;
|
|
||||||
int index;
|
|
||||||
int type;
|
|
||||||
size_t heapSize;
|
|
||||||
std::string name;
|
|
||||||
std::string vendor;
|
|
||||||
|
|
||||||
GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor):
|
|
||||||
backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
|
|
||||||
vendor(std::move(vendor)) {}
|
|
||||||
|
|
||||||
std::string selectionName() const
|
|
||||||
{
|
|
||||||
assert(backend == "cuda"s || backend == "kompute"s);
|
|
||||||
return backendName() + ": " + name;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string backendName() const { return backendIdToName(backend); }
|
|
||||||
|
|
||||||
static std::string backendIdToName(const std::string &backend) { return s_backendNames.at(backend); }
|
|
||||||
|
|
||||||
static std::string updateSelectionName(const std::string &name) {
|
|
||||||
if (name == "Auto" || name == "CPU" || name == "Metal")
|
|
||||||
return name;
|
|
||||||
auto it = std::find_if(s_backendNames.begin(), s_backendNames.end(), [&name](const auto &entry) {
|
|
||||||
return name.starts_with(entry.second + ": ");
|
|
||||||
});
|
|
||||||
if (it != s_backendNames.end())
|
|
||||||
return name;
|
|
||||||
return "Vulkan: " + name; // previously, there were only Vulkan devices
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
static inline const std::unordered_map<std::string, std::string> s_backendNames {
|
|
||||||
{"cpu", "CPU"}, {"metal", "Metal"}, {"cuda", "CUDA"}, {"kompute", "Vulkan"},
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
class Implementation {
|
|
||||||
public:
|
|
||||||
Implementation(const Implementation &) = delete;
|
|
||||||
Implementation(Implementation &&);
|
|
||||||
~Implementation();
|
|
||||||
|
|
||||||
std::string_view modelType() const { return m_modelType; }
|
|
||||||
std::string_view buildVariant() const { return m_buildVariant; }
|
|
||||||
|
|
||||||
static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
|
|
||||||
static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
|
|
||||||
static int32_t maxContextLength(const std::string &modelPath);
|
|
||||||
static int32_t layerCount(const std::string &modelPath);
|
|
||||||
static bool isEmbeddingModel(const std::string &modelPath);
|
|
||||||
static void setImplementationsSearchPath(const std::string &path);
|
|
||||||
static const std::string &implementationsSearchPath();
|
|
||||||
static bool hasSupportedCPU();
|
|
||||||
// 0 for no, 1 for yes, -1 for non-x86_64
|
|
||||||
static int cpuSupportsAVX2();
|
|
||||||
|
|
||||||
private:
|
|
||||||
Implementation(Dlhandle &&);
|
|
||||||
|
|
||||||
static const std::vector<Implementation> &implementationList();
|
|
||||||
static const Implementation *implementation(const char *fname, const std::string &buildVariant);
|
|
||||||
static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);
|
|
||||||
|
|
||||||
char *(*m_getFileArch)(const char *fname);
|
|
||||||
bool (*m_isArchSupported)(const char *arch);
|
|
||||||
LLModel *(*m_construct)();
|
|
||||||
|
|
||||||
std::string_view m_modelType;
|
|
||||||
std::string_view m_buildVariant;
|
|
||||||
Dlhandle *m_dlhandle;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct PromptContext {
|
struct PromptContext {
|
||||||
std::vector<int32_t> tokens; // current tokens in the context window
|
std::vector<int32_t> tokens; // current tokens in the context window
|
||||||
int32_t n_past = 0; // number of tokens in past conversation
|
int32_t n_past = 0; // number of tokens in past conversation
|
||||||
@ -137,18 +38,11 @@ public:
|
|||||||
float contextErase = 0.5f; // percent of context to erase if we exceed the context window
|
float contextErase = 0.5f; // percent of context to erase if we exceed the context window
|
||||||
};
|
};
|
||||||
|
|
||||||
using ProgressCallback = std::function<bool(float progress)>;
|
|
||||||
|
|
||||||
explicit LLModel() {}
|
|
||||||
virtual ~LLModel() {}
|
virtual ~LLModel() {}
|
||||||
|
|
||||||
virtual bool supportsEmbedding() const = 0;
|
virtual bool supportsCompletion() const { return true; }
|
||||||
virtual bool supportsCompletion() const = 0;
|
|
||||||
virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0;
|
virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0;
|
||||||
virtual bool isModelBlacklisted(const std::string &modelPath) const { (void)modelPath; return false; };
|
|
||||||
virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; }
|
|
||||||
virtual bool isModelLoaded() const = 0;
|
virtual bool isModelLoaded() const = 0;
|
||||||
virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
|
|
||||||
virtual size_t stateSize() const { return 0; }
|
virtual size_t stateSize() const { return 0; }
|
||||||
virtual size_t saveState(uint8_t *dest) const { (void)dest; return 0; }
|
virtual size_t saveState(uint8_t *dest) const { (void)dest; return 0; }
|
||||||
virtual size_t restoreState(const uint8_t *src) { (void)src; return 0; }
|
virtual size_t restoreState(const uint8_t *src) { (void)src; return 0; }
|
||||||
@ -162,101 +56,25 @@ public:
|
|||||||
bool allowContextShift,
|
bool allowContextShift,
|
||||||
PromptContext &ctx,
|
PromptContext &ctx,
|
||||||
bool special = false,
|
bool special = false,
|
||||||
std::string *fakeReply = nullptr);
|
std::string *fakeReply = nullptr) = 0;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
explicit LLModel() {}
|
||||||
|
};
|
||||||
|
|
||||||
|
class EmbLLModel: virtual public LLModel {
public:
    using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend);

    virtual size_t embeddingSize() const {
    virtual bool supportsCompletion() const = 0;
        throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
    virtual bool supportsEmbedding() const = 0;
    }
    virtual size_t embeddingSize() const = 0;

    // user-specified prefix
    virtual void embed(const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix,
                       int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false,
                       EmbedCancelCallback *cancelCb = nullptr);
                       EmbedCancelCallback *cancelCb = nullptr) = 0;

    // automatic prefix
    virtual void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval,
                       int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false);
                       int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) = 0;

    virtual void setThreadCount(int32_t n_threads) { (void)n_threads; }
    virtual int32_t threadCount() const { return 1; }

    const Implementation &implementation() const {
        return *m_implementation;
    }

    virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const {
        (void)memoryRequired;
        return {};
    }

    virtual bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const {
        (void)memoryRequired;
        (void)name;
        return false;
    }

    virtual bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const {
        (void)device;
        if (unavail_reason) {
            *unavail_reason = "model has no GPU support";
        }
        return false;
    }

    virtual bool usingGPUDevice() const { return false; }
    virtual const char *backendName() const { return "cpu"; }
    virtual const char *gpuDeviceName() const { return nullptr; }

    void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; }

protected:
    // These are pure virtual because subclasses need to implement as the default implementation of
    // 'prompt' above calls these functions
    virtual std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special = false) = 0;
    virtual bool isSpecialToken(Token id) const = 0;
    virtual std::string tokenToString(Token id) const = 0;
    virtual Token sampleToken(PromptContext &ctx) const = 0;
    virtual bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const = 0;
    virtual void shiftContext(PromptContext &promptCtx) = 0;
    virtual int32_t contextLength() const = 0;
    virtual const std::vector<Token> &endTokens() const = 0;
    virtual bool shouldAddBOS() const = 0;

    virtual int32_t maxContextLength(std::string const &modelPath) const
    {
        (void)modelPath;
        return -1;
    }

    virtual int32_t layerCount(std::string const &modelPath) const
    {
        (void)modelPath;
        return -1;
    }

    const Implementation *m_implementation = nullptr;

    ProgressCallback m_progressCallback;
    static bool staticProgressCallback(float progress, void* ctx)
    {
        LLModel* model = static_cast<LLModel*>(ctx);
        if (model && model->m_progressCallback)
            return model->m_progressCallback(progress);
        return true;
    }

    bool decodePrompt(std::function<bool(int32_t)> promptCallback,
                      std::function<bool(int32_t, const std::string&)> responseCallback,
                      bool allowContextShift,
                      PromptContext &promptCtx,
                      std::vector<Token> embd_inp);
    void generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
                          bool allowContextShift,
                          PromptContext &promptCtx);

    Token m_tokenize_last_token = -1; // not serialized

    friend class LLMImplementation;
};

#endif // LLMODEL_H
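
Note on the interface above: the embed() overloads and embeddingSize() are now pure virtual, so every embedding-capable backend has to provide them. A minimal usage sketch, assuming some already-constructed concrete EmbLLModel subclass; the helper name and the "search_document" prefix are illustrative, not part of this commit:

#include "llmodel.h"

#include <optional>
#include <string>
#include <vector>

// Hypothetical helper: embed a batch of texts with a user-specified prefix.
std::vector<float> embedBatch(EmbLLModel &model, const std::vector<std::string> &texts)
{
    std::vector<float> out(texts.size() * model.embeddingSize());
    model.embed(texts, out.data(),
                std::optional<std::string>("search_document"), // example user-specified prefix
                /*dimensionality*/ -1, /*tokenCount*/ nullptr,
                /*doMean*/ true, /*atlas*/ false, /*cancelCb*/ nullptr);
    return out;
}
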
@ -1,5 +1,6 @@
#include "llmodel_c.h"

#include "llamacpp_backend.h"
#include "llmodel.h"

#include <algorithm>
@ -15,7 +16,7 @@
#include <vector>

struct LLModelWrapper {
    LLModel *llModel = nullptr;
    LlamaCppBackend *llModel = nullptr;
    LLModel::PromptContext promptContext;
    ~LLModelWrapper() { delete llModel; }
};
@ -41,9 +42,9 @@ static void llmodel_set_error(const char **errptr, const char *message)

llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error)
{
    LLModel *llModel;
    LlamaCppBackend *llModel;
    try {
        llModel = LLModel::Implementation::construct(model_path, backend);
        llModel = LlamaCppBackend::Implementation::construct(model_path, backend);
    } catch (const std::exception& e) {
        llmodel_set_error(error, e.what());
        return nullptr;
@ -214,12 +215,12 @@ int32_t llmodel_threadCount(llmodel_model model)

void llmodel_set_implementation_search_path(const char *path)
{
    LLModel::Implementation::setImplementationsSearchPath(path);
    LlamaCppBackend::Implementation::setImplementationsSearchPath(path);
}

const char *llmodel_get_implementation_search_path()
{
    return LLModel::Implementation::implementationsSearchPath().c_str();
    return LlamaCppBackend::Implementation::implementationsSearchPath().c_str();
}

// RAII wrapper around a C-style struct
@ -244,7 +245,7 @@ struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired,
{
    static thread_local std::unique_ptr<llmodel_gpu_device_cpp[]> c_devices;

    auto devices = LLModel::Implementation::availableGPUDevices(memoryRequired);
    auto devices = LlamaCppBackend::Implementation::availableGPUDevices(memoryRequired);
    *num_devices = devices.size();

    if (devices.empty()) { return nullptr; /* no devices */ }
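
For context, the C entry points touched by this hunk are used roughly as sketched below. This is a sketch against llmodel_c.h, not code from this commit; llmodel_model_destroy is assumed to be the matching cleanup call for llmodel_model_create2.

#include "llmodel_c.h"

#include <cstdio>

// Sketch of the creation path that goes through LLModelWrapper above.
// "auto" lets the implementation be picked automatically; *error receives a
// message if construction fails.
int create_example(const char *model_path)
{
    const char *error = nullptr;
    llmodel_set_implementation_search_path("."); // directory holding the backend libraries
    llmodel_model model = llmodel_model_create2(model_path, "auto", &error);
    if (!model) {
        std::fprintf(stderr, "create failed: %s\n", error ? error : "unknown error");
        return 1;
    }
    llmodel_model_destroy(model); // assumed counterpart to llmodel_model_create2
    return 0;
}
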
@ -32,14 +32,6 @@ ChatAPI::ChatAPI()
{
}

size_t ChatAPI::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
{
    Q_UNUSED(modelPath);
    Q_UNUSED(n_ctx);
    Q_UNUSED(ngl);
    return 0;
}

bool ChatAPI::loadModel(const std::string &modelPath, int n_ctx, int ngl)
{
    Q_UNUSED(modelPath);
@ -48,20 +40,7 @@ bool ChatAPI::loadModel(const std::string &modelPath, int n_ctx, int ngl)
    return true;
}

void ChatAPI::setThreadCount(int32_t n_threads)
ChatAPI::~ChatAPI() {}
{
    Q_UNUSED(n_threads);
    qt_noop();
}

int32_t ChatAPI::threadCount() const
{
    return 1;
}

ChatAPI::~ChatAPI()
{
}

bool ChatAPI::isModelLoaded() const
{
@ -57,11 +57,8 @@ public:
    ChatAPI();
    virtual ~ChatAPI();

    bool supportsEmbedding() const override { return false; }
    bool supportsCompletion() const override { return true; }
    bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override;
    bool isModelLoaded() const override;
    size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
    size_t stateSize() const override;
    size_t saveState(uint8_t *dest) const override;
    size_t restoreState(const uint8_t *src) override;
@ -74,9 +71,6 @@ public:
                bool special,
                std::string *fakeReply) override;

    void setThreadCount(int32_t n_threads) override;
    int32_t threadCount() const override;

    void setModelName(const QString &modelName) { m_modelName = modelName; }
    void setAPIKey(const QString &apiKey) { m_apiKey = apiKey; }
    void setRequestURL(const QString &requestURL) { m_requestURL = requestURL; }
@ -92,65 +86,6 @@ Q_SIGNALS:
                             LLModel::PromptContext *ctx,
                             const QByteArray &array);

protected:
    // We have to implement these as they are pure virtual in base class, but we don't actually use
    // them as they are only called from the default implementation of 'prompt' which we override and
    // completely replace

    std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) override
    {
        (void)ctx;
        (void)str;
        (void)special;
        throw std::logic_error("not implemented");
    }

    bool isSpecialToken(Token id) const override
    {
        (void)id;
        throw std::logic_error("not implemented");
    }

    std::string tokenToString(Token id) const override
    {
        (void)id;
        throw std::logic_error("not implemented");
    }

    Token sampleToken(PromptContext &ctx) const override
    {
        (void)ctx;
        throw std::logic_error("not implemented");
    }

    bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override
    {
        (void)ctx;
        (void)tokens;
        throw std::logic_error("not implemented");
    }

    void shiftContext(PromptContext &promptCtx) override
    {
        (void)promptCtx;
        throw std::logic_error("not implemented");
    }

    int32_t contextLength() const override
    {
        throw std::logic_error("not implemented");
    }

    const std::vector<Token> &endTokens() const override
    {
        throw std::logic_error("not implemented");
    }

    bool shouldAddBOS() const override
    {
        throw std::logic_error("not implemented");
    }

private:
    std::function<bool(int32_t, const std::string&)> m_responseCallback;
    QString m_modelName;
@ -412,19 +412,20 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro

    QString filePath = modelInfo.dirpath + modelInfo.filename();

    auto construct = [this, &filePath, &modelInfo, &modelLoadProps, n_ctx](std::string const &backend) {
    auto construct = [this, &filePath, &modelInfo, &modelLoadProps, n_ctx](std::string const &backend) -> LlamaCppBackend * {
        LlamaCppBackend *lcppmodel;
        QString constructError;
        m_llModelInfo.resetModel(this);
        try {
            auto *model = LLModel::Implementation::construct(filePath.toStdString(), backend, n_ctx);
            lcppmodel = LlamaCppBackend::Implementation::construct(filePath.toStdString(), backend, n_ctx);
            m_llModelInfo.resetModel(this, model);
            m_llModelInfo.resetModel(this, lcppmodel);
        } catch (const LLModel::MissingImplementationError &e) {
        } catch (const LlamaCppBackend::MissingImplementationError &e) {
            modelLoadProps.insert("error", "missing_model_impl");
            constructError = e.what();
        } catch (const LLModel::UnsupportedModelError &e) {
        } catch (const LlamaCppBackend::UnsupportedModelError &e) {
            modelLoadProps.insert("error", "unsupported_model_file");
            constructError = e.what();
        } catch (const LLModel::BadArchError &e) {
        } catch (const LlamaCppBackend::BadArchError &e) {
            constructError = e.what();
            modelLoadProps.insert("error", "unsupported_model_arch");
            modelLoadProps.insert("model_arch", QString::fromStdString(e.arch()));
@ -435,21 +436,22 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro
            LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
            resetModel();
            emit modelLoadingError(u"Error loading %1: %2"_s.arg(modelInfo.filename(), constructError));
            return false;
            return nullptr;
        }

        m_llModelInfo.model->setProgressCallback([this](float progress) -> bool {
        lcppmodel->setProgressCallback([this](float progress) -> bool {
            progress = std::max(progress, std::numeric_limits<float>::min()); // keep progress above zero
            emit modelLoadingPercentageChanged(progress);
            return m_shouldBeLoaded;
        });
        return true;
        return lcppmodel;
    };

    if (!construct(backend))
    auto *lcppmodel = construct(backend);
    if (!lcppmodel)
        return true;

    if (m_llModelInfo.model->isModelBlacklisted(filePath.toStdString())) {
    if (lcppmodel->isModelBlacklisted(filePath.toStdString())) {
        static QSet<QString> warned;
        auto fname = modelInfo.filename();
        if (!warned.contains(fname)) {
@ -460,16 +462,16 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro
        }
    }

    auto approxDeviceMemGB = [](const LLModel::GPUDevice *dev) {
    auto approxDeviceMemGB = [](const LlamaCppBackend::GPUDevice *dev) {
        float memGB = dev->heapSize / float(1024 * 1024 * 1024);
        return std::floor(memGB * 10.f) / 10.f; // truncate to 1 decimal place
    };

    std::vector<LLModel::GPUDevice> availableDevices;
    std::vector<LlamaCppBackend::GPUDevice> availableDevices;
    const LLModel::GPUDevice *defaultDevice = nullptr;
    const LlamaCppBackend::GPUDevice *defaultDevice = nullptr;
    {
        const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString(), n_ctx, ngl);
        const size_t requiredMemory = lcppmodel->requiredMem(filePath.toStdString(), n_ctx, ngl);
        availableDevices = m_llModelInfo.model->availableGPUDevices(requiredMemory);
        availableDevices = lcppmodel->availableGPUDevices(requiredMemory);
        // Pick the best device
        // NB: relies on the fact that Kompute devices are listed first
        if (!availableDevices.empty() && availableDevices.front().type == 2 /*a discrete gpu*/) {
@ -485,14 +487,14 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro
    bool actualDeviceIsCPU = true;

#if defined(Q_OS_MAC) && defined(__aarch64__)
    if (m_llModelInfo.model->implementation().buildVariant() == "metal")
    if (lcppmodel->implementation().buildVariant() == "metal")
        actualDeviceIsCPU = false;
#else
    if (requestedDevice != "CPU") {
        const auto *device = defaultDevice;
        if (requestedDevice != "Auto") {
            // Use the selected device
            for (const LLModel::GPUDevice &d : availableDevices) {
            for (const auto &d : availableDevices) {
                if (QString::fromStdString(d.selectionName()) == requestedDevice) {
                    device = &d;
                    break;
@ -503,7 +505,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro
        std::string unavail_reason;
        if (!device) {
            // GPU not available
        } else if (!m_llModelInfo.model->initializeGPUDevice(device->index, &unavail_reason)) {
        } else if (!lcppmodel->initializeGPUDevice(device->index, &unavail_reason)) {
            m_llModelInfo.fallbackReason = QString::fromStdString(unavail_reason);
        } else {
            actualDeviceIsCPU = false;
@ -512,7 +514,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro
    }
#endif

    bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, ngl);
    bool success = lcppmodel->loadModel(filePath.toStdString(), n_ctx, ngl);

    if (!m_shouldBeLoaded) {
        m_llModelInfo.resetModel(this);
@ -531,10 +533,13 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro
        modelLoadProps.insert("cpu_fallback_reason", "gpu_load_failed");

        // For CUDA, make sure we don't use the GPU at all - ngl=0 still offloads matmuls
        if (backend == "cuda" && !construct("auto"))
        if (backend == "cuda") {
            lcppmodel = construct("auto");
            if (!lcppmodel)
                return true;
        }

        success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, 0);
        success = lcppmodel->loadModel(filePath.toStdString(), n_ctx, 0);

        if (!m_shouldBeLoaded) {
            m_llModelInfo.resetModel(this);
@ -544,7 +549,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro
            emit modelLoadingPercentageChanged(0.0f);
            return false;
        }
    } else if (!m_llModelInfo.model->usingGPUDevice()) {
    } else if (!lcppmodel->usingGPUDevice()) {
        // ggml_vk_init was not called in llama.cpp
        // We might have had to fallback to CPU after load if the model is not possible to accelerate
        // for instance if the quantization method is not supported on Vulkan yet
@ -562,7 +567,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro
        return true;
    }

    switch (m_llModelInfo.model->implementation().modelType()[0]) {
    switch (lcppmodel->implementation().modelType()[0]) {
    case 'L': m_llModelType = LLModelType::LLAMA_; break;
    default:
        {
@ -774,11 +779,15 @@ bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString
    m_ctx.n_batch = n_batch;
    m_ctx.repeat_penalty = repeat_penalty;
    m_ctx.repeat_last_n = repeat_penalty_tokens;
    m_llModelInfo.model->setThreadCount(n_threads);
    if (auto *lcppmodel = dynamic_cast<LlamaCppBackend *>(m_llModelInfo.model.get()))
        lcppmodel->setThreadCount(n_threads);

#if defined(DEBUG)
    printf("%s", qPrintable(prompt));
    fflush(stdout);
#endif

    QElapsedTimer totalTime;
    totalTime.start();
    m_timer->start();
@ -1238,11 +1247,15 @@ void ChatLLM::processSystemPrompt()
    m_ctx.n_batch = n_batch;
    m_ctx.repeat_penalty = repeat_penalty;
    m_ctx.repeat_last_n = repeat_penalty_tokens;
    m_llModelInfo.model->setThreadCount(n_threads);
    if (auto *lcppmodel = dynamic_cast<LlamaCppBackend *>(m_llModelInfo.model.get()))
        lcppmodel->setThreadCount(n_threads);

#if defined(DEBUG)
    printf("%s", qPrintable(QString::fromStdString(systemPrompt)));
    fflush(stdout);
#endif

    auto old_n_predict = std::exchange(m_ctx.n_predict, 0); // decode system prompt without a response
    // use "%1%2" and not "%1" to avoid implicit whitespace
    m_llModelInfo.model->prompt(systemPrompt, "%1%2", promptFunc, nullptr, /*allowContextShift*/ true, m_ctx, true);
@ -1288,7 +1301,9 @@ void ChatLLM::processRestoreStateFromText()
    m_ctx.n_batch = n_batch;
    m_ctx.repeat_penalty = repeat_penalty;
    m_ctx.repeat_last_n = repeat_penalty_tokens;
    m_llModelInfo.model->setThreadCount(n_threads);
    if (auto *lcppmodel = dynamic_cast<LlamaCppBackend *>(m_llModelInfo.model.get()))
        lcppmodel->setThreadCount(n_threads);

    auto it = m_stateFromText.begin();
    while (it < m_stateFromText.end()) {
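
Because m_llModelInfo.model now holds a generic LLModel, the llama.cpp-only setThreadCount() call is gated behind a dynamic_cast in the three hunks above. The same guard, pulled out as a standalone sketch; the free function is illustrative only, and only the cast-and-call pattern comes from the diff:

#include "../gpt4all-backend/llamacpp_backend.h"

#include <cstdint>

// Thread count only applies to local llama.cpp models; remote ChatAPI models
// are not LlamaCppBackend instances, so the cast fails and the call is skipped.
static void applyThreadCount(LLModel *model, int32_t n_threads)
{
    if (auto *lcppmodel = dynamic_cast<LlamaCppBackend *>(model))
        lcppmodel->setThreadCount(n_threads);
}
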
@ -4,6 +4,7 @@
#include "database.h" // IWYU pragma: keep
#include "modellist.h"

#include "../gpt4all-backend/llamacpp_backend.h"
#include "../gpt4all-backend/llmodel.h"

#include <QByteArray>
@ -128,15 +129,17 @@ public:

    QString deviceBackend() const
    {
        if (!isModelLoaded()) return QString();
        auto *lcppmodel = dynamic_cast<LlamaCppBackend *>(m_llModelInfo.model.get());
        std::string name = LLModel::GPUDevice::backendIdToName(m_llModelInfo.model->backendName());
        if (!isModelLoaded() && !lcppmodel) return QString();

        std::string name = LlamaCppBackend::GPUDevice::backendIdToName(lcppmodel->backendName());
        return QString::fromStdString(name);
    }

    QString device() const
    {
        if (!isModelLoaded()) return QString();
        auto *lcppmodel = dynamic_cast<LlamaCppBackend *>(m_llModelInfo.model.get());
        const char *name = m_llModelInfo.model->gpuDeviceName();
        if (!isModelLoaded() || !lcppmodel) return QString();

        const char *name = lcppmodel->gpuDeviceName();
        return name ? QString(name) : u"CPU"_s;
    }
@ -3,7 +3,7 @@
#include "modellist.h"
#include "mysettings.h"

#include "../gpt4all-backend/llmodel.h"
#include "../gpt4all-backend/llamacpp_backend.h"

#include <QCoreApplication>
#include <QDebug>
@ -99,7 +99,7 @@ bool EmbeddingLLMWorker::loadModel()
#endif

    try {
        m_model = LLModel::Implementation::construct(filePath.toStdString(), backend, n_ctx);
        m_model = LlamaCppBackend::Implementation::construct(filePath.toStdString(), backend, n_ctx);
    } catch (const std::exception &e) {
        qWarning() << "embllm WARNING: Could not load embedding model:" << e.what();
        return false;
@ -112,11 +112,11 @@ bool EmbeddingLLMWorker::loadModel()
    actualDeviceIsCPU = false;
#else
    if (requestedDevice != "CPU") {
        const LLModel::GPUDevice *device = nullptr;
        const LlamaCppBackend::GPUDevice *device = nullptr;
        std::vector<LLModel::GPUDevice> availableDevices = m_model->availableGPUDevices(0);
        auto availableDevices = m_model->availableGPUDevices(0);
        if (requestedDevice != "Auto") {
            // Use the selected device
            for (const LLModel::GPUDevice &d : availableDevices) {
            for (const auto &d : availableDevices) {
                if (QString::fromStdString(d.selectionName()) == requestedDevice) {
                    device = &d;
                    break;
@ -145,7 +145,7 @@ bool EmbeddingLLMWorker::loadModel()
    if (backend == "cuda") {
        // For CUDA, make sure we don't use the GPU at all - ngl=0 still offloads matmuls
        try {
            m_model = LLModel::Implementation::construct(filePath.toStdString(), "auto", n_ctx);
            m_model = LlamaCppBackend::Implementation::construct(filePath.toStdString(), "auto", n_ctx);
        } catch (const std::exception &e) {
            qWarning() << "embllm WARNING: Could not load embedding model:" << e.what();
            return false;
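
The CUDA fallback in EmbeddingLLMWorker::loadModel() mirrors the one in ChatLLM::loadNewModel(): rebuild the model with the "auto" backend, then load with ngl=0 so nothing is offloaded to the GPU. A rough sketch of that retry, with the helper name and error reporting as assumptions; only the Implementation::construct(..., "auto", n_ctx) call and the ngl=0 reload come from the diff:

#include "../gpt4all-backend/llamacpp_backend.h"

#include <exception>
#include <iostream>
#include <string>

// Sketch only: construct a CPU-capable fallback after a failed CUDA attempt.
LlamaCppBackend *constructCpuFallback(const std::string &filePath, int n_ctx)
{
    try {
        return LlamaCppBackend::Implementation::construct(filePath, "auto", n_ctx);
    } catch (const std::exception &e) {
        std::cerr << "Could not load embedding model: " << e.what() << '\n';
        return nullptr;
    }
}

// Caller side (hypothetical):
//   if (auto *model = constructCpuFallback(path, n_ctx))
//       model->loadModel(path, n_ctx, /*ngl*/ 0); // ngl=0 keeps every layer on the CPU
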
@ -13,7 +13,7 @@
#include <atomic>
#include <vector>

class LLModel;
class LlamaCppBackend;
class QNetworkAccessManager;

struct EmbeddingChunk {
@ -67,7 +67,7 @@ private:
    QString m_nomicAPIKey;
    QNetworkAccessManager *m_networkManager;
    std::vector<float> m_lastResponse;
    LLModel *m_model = nullptr;
    LlamaCppBackend *m_model = nullptr;
    std::atomic<bool> m_stopGenerating;
    QThread m_workerThread;
    QMutex m_mutex; // guards m_model and m_nomicAPIKey
@ -1,6 +1,6 @@
#include "llm.h"

#include "../gpt4all-backend/llmodel.h"
#include "../gpt4all-backend/llamacpp_backend.h"
#include "../gpt4all-backend/sysinfo.h"

#include <QCoreApplication>
@ -30,7 +30,7 @@ LLM *LLM::globalInstance()

LLM::LLM()
    : QObject{nullptr}
    , m_compatHardware(LLModel::Implementation::hasSupportedCPU())
    , m_compatHardware(LlamaCppBackend::Implementation::hasSupportedCPU())
{
    QNetworkInformation::loadDefaultBackend();
    auto * netinfo = QNetworkInformation::instance();
@ -8,7 +8,7 @@
#include "mysettings.h"
#include "network.h"

#include "../gpt4all-backend/llmodel.h"
#include "../gpt4all-backend/llamacpp_backend.h"

#include <QCoreApplication>
#include <QGuiApplication>
@ -46,7 +46,7 @@ int main(int argc, char *argv[])
    if (LLM::directoryExists(frameworksDir))
        llmodelSearchPaths += ";" + frameworksDir;
#endif
    LLModel::Implementation::setImplementationsSearchPath(llmodelSearchPaths.toStdString());
    LlamaCppBackend::Implementation::setImplementationsSearchPath(llmodelSearchPaths.toStdString());

    // Set the local and language translation before the qml engine has even been started. This will
    // use the default system locale unless the user has explicitly set it to use a different one.
@ -4,7 +4,7 @@
#include "mysettings.h"
#include "network.h"

#include "../gpt4all-backend/llmodel.h"
#include "../gpt4all-backend/llamacpp_backend.h"

#include <QChar>
#include <QCoreApplication>
@ -258,7 +258,7 @@ int ModelInfo::maxContextLength() const
    if (!installed || isOnline) return -1;
    if (m_maxContextLength != -1) return m_maxContextLength;
    auto path = (dirpath + filename()).toStdString();
    int n_ctx = LLModel::Implementation::maxContextLength(path);
    int n_ctx = LlamaCppBackend::Implementation::maxContextLength(path);
    if (n_ctx < 0) {
        n_ctx = 4096; // fallback value
    }
@ -282,7 +282,7 @@ int ModelInfo::maxGpuLayers() const
    if (!installed || isOnline) return -1;
    if (m_maxGpuLayers != -1) return m_maxGpuLayers;
    auto path = (dirpath + filename()).toStdString();
    int layers = LLModel::Implementation::layerCount(path);
    int layers = LlamaCppBackend::Implementation::layerCount(path);
    if (layers < 0) {
        layers = 100; // fallback value
    }
@ -997,7 +997,7 @@ void ModelList::updateData(const QString &id, const QVector<QPair<int, QVariant>
        && (info->isDiscovered() || info->description().isEmpty()))
    {
        // read GGUF and decide based on model architecture
        info->isEmbeddingModel = LLModel::Implementation::isEmbeddingModel(modelPath.toStdString());
        info->isEmbeddingModel = LlamaCppBackend::Implementation::isEmbeddingModel(modelPath.toStdString());
        info->checkedEmbeddingModel = true;
    }
@ -1,6 +1,6 @@
#include "mysettings.h"

#include "../gpt4all-backend/llmodel.h"
#include "../gpt4all-backend/llamacpp_backend.h"

#include <QDebug>
#include <QDir>
@ -95,8 +95,8 @@ static QStringList getDevices(bool skipKompute = false)
#if defined(Q_OS_MAC) && defined(__aarch64__)
    deviceList << "Metal";
#else
    std::vector<LLModel::GPUDevice> devices = LLModel::Implementation::availableGPUDevices();
    auto devices = LlamaCppBackend::Implementation::availableGPUDevices();
    for (LLModel::GPUDevice &d : devices) {
    for (auto &d : devices) {
        if (!skipKompute || strcmp(d.backend, "kompute"))
            deviceList << QString::fromStdString(d.selectionName());
    }
@ -512,7 +512,7 @@ QString MySettings::device()
    auto device = value.toString();
    if (!device.isEmpty()) {
        auto deviceStr = device.toStdString();
        auto newNameStr = LLModel::GPUDevice::updateSelectionName(deviceStr);
        auto newNameStr = LlamaCppBackend::GPUDevice::updateSelectionName(deviceStr);
        if (newNameStr != deviceStr) {
            auto newName = QString::fromStdString(newNameStr);
            qWarning() << "updating device name:" << device << "->" << newName;
@ -9,7 +9,7 @@
#include "modellist.h"
#include "mysettings.h"

#include "../gpt4all-backend/llmodel.h"
#include "../gpt4all-backend/llamacpp_backend.h"

#include <QCoreApplication>
#include <QDateTime>
@ -290,7 +290,7 @@ void Network::sendStartup()
        {"display", u"%1x%2"_s.arg(display->size().width()).arg(display->size().height())},
        {"ram", LLM::globalInstance()->systemTotalRAMInGB()},
        {"cpu", getCPUModel()},
        {"cpu_supports_avx2", LLModel::Implementation::cpuSupportsAVX2()},
        {"cpu_supports_avx2", LlamaCppBackend::Implementation::cpuSupportsAVX2()},
        {"datalake_active", mySettings->networkIsActive()},
    });
    sendIpify();