Mirror of https://github.com/nomic-ai/gpt4all.git
llmodel: dlopen llama.cpp libraries lazily instead of eagerly
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
commit 7a1559e3df
parent 21c06fdebf
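In short: before this change, the backend scanned the implementations search path at startup, regex-matched every llamamodel-mainline-* library, and dlopened each one eagerly just to build the implementation list. After this change it only records one (backend name, library path) pair per known backend and defers the dlopen and symbol lookup until that backend is actually requested. A minimal standalone sketch of the lazy-dlopen pattern follows; it assumes POSIX dlfcn for brevity (the real code goes through the cross-platform Dlhandle wrapper), and LoadedLib, LazyLib, and the library filename are illustrative names, not gpt4all types.

// Sketch only: defer dlopen() until the library is first requested.
// LoadedLib/LazyLib are illustrative names, not gpt4all types.
#include <dlfcn.h>

#include <iostream>
#include <optional>
#include <stdexcept>
#include <string>

struct LoadedLib {
    void *handle = nullptr;

    explicit LoadedLib(const std::string &path)
        : handle(dlopen(path.c_str(), RTLD_LAZY | RTLD_LOCAL))
    {
        if (!handle) {
            const char *err = dlerror();
            throw std::runtime_error(err ? err : "dlopen failed");
        }
    }
    ~LoadedLib() { if (handle) dlclose(handle); }
    LoadedLib(const LoadedLib &) = delete;
    LoadedLib &operator=(const LoadedLib &) = delete;
};

struct LazyLib {
    std::string backend;          // e.g. "cpu", "cuda"
    std::string path;             // full path to the shared library
    std::optional<LoadedLib> lib; // stays empty until first use

    LoadedLib &get()
    {
        if (!lib) lib.emplace(path); // the dlopen happens here, exactly once
        return *lib;
    }
};

int main()
{
    LazyLib cuda{"cuda", "./libllamamodel-mainline-cuda.so"};
    std::cout << "nothing loaded yet\n";
    try {
        cuda.get(); // first call pays the dlopen cost
        std::cout << "loaded " << cuda.path << "\n";
    } catch (const std::exception &e) {
        std::cout << "load failed: " << e.what() << "\n";
    }
    return 0;
}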
@@ -6,6 +6,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <expected>
+#include <filesystem>
 #include <functional>
 #include <optional>
 #include <span>
@@ -19,6 +20,7 @@
 class Dlhandle;
 
 using namespace std::string_literals;
+namespace fs = std::filesystem;
 
 #define LLMODEL_MAX_PROMPT_BATCH 128
 
@@ -94,12 +96,13 @@ public:
 
     class Implementation {
     public:
+        Implementation(std::string buildBackend, Dlhandle &&dlhandle);
         Implementation(const Implementation &) = delete;
         Implementation(Implementation &&);
         ~Implementation();
 
-        std::string_view modelType() const { return m_modelType; }
-        std::string_view buildVariant() const { return m_buildVariant; }
+        std::string_view   modelType   () const { return m_modelType; }
+        const std::string &buildBackend() const { return m_buildBackend; }
 
         static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
         static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
@@ -114,19 +117,17 @@ public:
         static int cpuSupportsAVX2();
 
     private:
-        Implementation(Dlhandle &&);
-
-        static const std::vector<Implementation> &implementationList();
-        static const Implementation *implementation(const char *fname, const std::string &buildVariant);
+        static const Implementation *findImplementation(const char *fname, const std::string &buildBackend);
         static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);
 
-        char *(*m_getFileArch)(const char *fname);
-        bool (*m_isArchSupported)(const char *arch);
-        LLModel *(*m_construct)();
+        std::string m_buildBackend;
+        Dlhandle *m_dlhandle;
+
+        char *(*m_getFileArch)   (const char *fname);
+        bool (*m_isArchSupported)(const char *arch);
+        LLModel *(*m_construct)  ();
 
         std::string_view m_modelType;
-        std::string_view m_buildVariant;
-        Dlhandle *m_dlhandle;
     };
 
     struct PromptContext {
@@ -141,6 +142,16 @@ public:
         float contextErase = 0.5f; // percent of context to erase if we exceed the context window
     };
 
+private:
+    struct LazyImplementation {
+        std::string buildBackend;
+        fs::path path;
+        std::optional<Implementation> impl = {};
+
+        const Implementation &get();
+    };
+
+public:
     explicit LLModel() {}
     virtual ~LLModel() {}
 
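The laziness hinges on the std::optional<Implementation> member that stays empty by default: the Implementation constructor, which opens the library and resolves its symbols, only runs when get() calls emplace() for the first time. A tiny hedged illustration of that deferral (Expensive stands in for Implementation; this is not gpt4all code):

// Sketch only: std::optional defers construction until emplace() runs.
// "Expensive" stands in for Implementation; not gpt4all code.
#include <iostream>
#include <optional>
#include <string>

struct Expensive {
    explicit Expensive(std::string name) { std::cout << "constructed " << name << "\n"; }
};

struct Lazy {
    std::string name;
    std::optional<Expensive> value; // empty by default: nothing constructed yet

    Expensive &get()
    {
        if (!value) value.emplace(name); // construction happens here, exactly once
        return *value;
    }
};

int main()
{
    Lazy a{"cuda"};
    Lazy b{"vulkan"};
    std::cout << "nothing constructed yet\n";
    a.get(); // prints "constructed cuda"
    a.get(); // already constructed; prints nothing
    // b is never used, so "vulkan" is never constructed
    (void) b;
    return 0;
}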
@@ -267,6 +278,9 @@ protected:
                           const PromptContext &promptCtx,
                           int32_t nPast);
 
+private:
+    static std::vector<LazyImplementation> &getImplementations();
+
     friend class LLMImplementation;
 };
 
@@ -1278,21 +1278,12 @@ void LLamaModel::embedInternal(
 #endif
 
 extern "C" {
-DLL_EXPORT bool is_g4a_backend_model_implementation()
-{
-    return true;
-}
 
 DLL_EXPORT const char *get_model_type()
 {
     return modelType_;
 }
 
-DLL_EXPORT const char *get_build_variant()
-{
-    return GGML_BUILD_VARIANT;
-}
-
 DLL_EXPORT char *get_file_arch(const char *fname)
 {
     char *arch = nullptr;
@@ -10,9 +10,9 @@
 #include <iterator>
 #include <memory>
 #include <optional>
-#include <regex>
 #include <sstream>
 #include <string>
 #include <string_view>
 #include <unordered_map>
 #include <vector>
 
@@ -32,6 +32,8 @@
 #   include "sysinfo.h" // for getSystemTotalRAMInBytes
 #endif
 
+using namespace std::string_literals;
+using namespace std::string_view_literals;
 namespace fs = std::filesystem;
 
 #ifndef __APPLE__
@@ -66,29 +68,30 @@ std::string s_implementations_search_path = ".";
 #define cpu_supports_avx2() !!__builtin_cpu_supports("avx2")
 #endif
 
-LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
-    : m_dlhandle(new Dlhandle(std::move(dlhandle_))) {
+LLModel::Implementation::Implementation(std::string buildBackend, Dlhandle &&dlhandle)
+    : m_buildBackend(std::move(buildBackend))
+    , m_dlhandle(new Dlhandle(std::move(dlhandle)))
+{
     auto get_model_type = m_dlhandle->get<const char *()>("get_model_type");
     assert(get_model_type);
-    m_modelType = get_model_type();
-    auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
-    assert(get_build_variant);
-    m_buildVariant = get_build_variant();
     m_getFileArch = m_dlhandle->get<char *(const char *)>("get_file_arch");
     assert(m_getFileArch);
     m_isArchSupported = m_dlhandle->get<bool(const char *)>("is_arch_supported");
    assert(m_isArchSupported);
     m_construct = m_dlhandle->get<LLModel *()>("construct");
     assert(m_construct);
+
+    m_modelType = get_model_type();
 }
 
 LLModel::Implementation::Implementation(Implementation &&o)
-    : m_getFileArch(o.m_getFileArch)
+    : m_buildBackend(o.m_buildBackend)
+    , m_dlhandle(o.m_dlhandle)
+    , m_getFileArch(o.m_getFileArch)
     , m_isArchSupported(o.m_isArchSupported)
     , m_construct(o.m_construct)
     , m_modelType(o.m_modelType)
-    , m_buildVariant(o.m_buildVariant)
-    , m_dlhandle(o.m_dlhandle) {
+{
     o.m_dlhandle = nullptr;
 }
 
@@ -97,11 +100,6 @@ LLModel::Implementation::~Implementation()
     delete m_dlhandle;
 }
 
-static bool isImplementation(const Dlhandle &dl)
-{
-    return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
-}
-
 // Add the CUDA Toolkit to the DLL search path on Windows.
 // This is necessary for chat.exe to find CUDA when started from Qt Creator.
 static void addCudaSearchPath()
@@ -117,55 +115,43 @@ static void addCudaSearchPath()
 #endif
 }
 
-const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList()
+auto LLModel::LazyImplementation::get() -> const Implementation &
+{
+    if (!impl) impl.emplace(buildBackend, Dlhandle(path));
+    return *impl;
+}
+
+auto LLModel::getImplementations() -> std::vector<LazyImplementation> &
 {
+    // in no particular order
+    static const std::array ALL_BUILD_BACKENDS { "cpu"sv, "metal"sv, "kompute"sv, "vulkan"sv, "cuda"sv };
+    static const std::string_view LIB_EXT(LIB_FILE_EXT);
+
     if (cpu_supports_avx() == 0) {
         throw std::runtime_error("CPU does not support AVX");
     }
 
     // NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the
     // individual models without the cleanup of the static list interfering
-    static auto* libs = new std::vector<Implementation>([] () {
-        std::vector<Implementation> fres;
+    static auto* libs = new std::vector<LazyImplementation>([] () {
+        std::vector<LazyImplementation> fres;
 
         addCudaSearchPath();
 
-        std::string impl_name_re = "llamamodel-mainline-(cpu|metal|kompute|vulkan|cuda)";
-        if (cpu_supports_avx2() == 0) {
-            impl_name_re += "-avxonly";
-        }
-        std::regex re(impl_name_re);
-        auto search_in_directory = [&](const std::string& paths) {
-            std::stringstream ss(paths);
-            std::string path;
-            // Split the paths string by the delimiter and process each path.
-            while (std::getline(ss, path, ';')) {
-                std::u8string u8_path(path.begin(), path.end());
-                // Iterate over all libraries
-                for (const auto &f : fs::directory_iterator(u8_path)) {
-                    const fs::path &p = f.path();
-
-                    if (p.extension() != LIB_FILE_EXT) continue;
-                    if (!std::regex_search(p.stem().string(), re)) continue;
-
-                    // Add to list if model implementation
-                    Dlhandle dl;
-                    try {
-                        dl = Dlhandle(p);
-                    } catch (const Dlhandle::Exception &e) {
-                        std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n";
-                        continue;
-                    }
-                    if (!isImplementation(dl)) {
-                        std::cerr << "Not an implementation: " << p.filename().string() << "\n";
-                        continue;
-                    }
-                    fres.emplace_back(Implementation(std::move(dl)));
-                }
-            }
-        };
-
-        search_in_directory(s_implementations_search_path);
+        bool avxonly = cpu_supports_avx2() == 0;
+        std::stringstream ss(s_implementations_search_path);
+        std::string piece;
+        // Split the paths string by the delimiter and process each path.
+        while (std::getline(ss, piece, ';')) {
+            auto basePath = fs::path(std::u8string(piece.begin(), piece.end()));
+            // Iterate over all libraries
+            for (auto &buildBackend : ALL_BUILD_BACKENDS) {
+                auto path = basePath /
+                    "llamamodel-mainline-"s.append(buildBackend).append(avxonly ? "-avxonly" : "").append(LIB_EXT);
+                if (fs::exists(path))
+                    fres.push_back(LazyImplementation { std::string(buildBackend), path });
+            }
+        }
 
         return fres;
     }());
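Note how discovery changed: instead of regex-matching every file in each search directory and dlopening it to probe whether it is a backend, the new getImplementations() composes the one expected filename per backend (llamamodel-mainline-<backend>[-avxonly]<ext>) and merely checks that it exists on disk. A rough standalone approximation of that filename construction; the extension, the single search directory, and all names here are simplified assumptions, not the gpt4all implementation:

// Sketch: build the expected library path for each backend and keep only
// the ones that exist on disk. Simplified; not the gpt4all implementation.
#include <filesystem>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

namespace fs = std::filesystem;
using namespace std::string_literals;

int main()
{
    const std::vector<std::string> backends { "cpu", "metal", "kompute", "vulkan", "cuda" };
    const bool avxonly = false;        // would come from a cpuid check
    const std::string libExt = ".so";  // ".dll" / ".dylib" on other platforms
    const fs::path basePath = ".";     // one entry of the ';'-separated search path

    std::vector<std::pair<std::string, fs::path>> found;
    for (const auto &backend : backends) {
        auto path = basePath /
            ("llamamodel-mainline-"s + backend + (avxonly ? "-avxonly" : "") + libExt);
        if (fs::exists(path))
            found.emplace_back(backend, path); // remember it; any dlopen comes later
    }

    for (const auto &[backend, path] : found)
        std::cout << backend << " -> " << path << "\n";
    return 0;
}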
@@ -173,22 +159,16 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList()
     return *libs;
 }
 
-static std::string applyCPUVariant(const std::string &buildVariant)
-{
-    if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
-        return buildVariant + "-avxonly";
-    }
-    return buildVariant;
-}
-
-const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant)
+auto LLModel::Implementation::findImplementation(const char *fname, const std::string &buildBackend)
+    -> const Implementation *
 {
-    bool buildVariantMatched = false;
+    bool buildBackendMatched = false;
     std::optional<std::string> archName;
-    for (const auto& i : implementationList()) {
-        if (buildVariant != i.m_buildVariant) continue;
-        buildVariantMatched = true;
+    for (auto &li : getImplementations()) {
+        if (li.buildBackend != buildBackend) continue;
+        buildBackendMatched = true;
 
+        auto &i = li.get();
         char *arch = i.m_getFileArch(fname);
         if (!arch) continue;
         archName = arch;
@@ -198,7 +178,7 @@ const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant)
         if (archSupported) return &i;
     }
 
-    if (!buildVariantMatched)
+    if (!buildBackendMatched)
         return nullptr;
     if (!archName)
         throw UnsupportedModelError("Unsupported file format");
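findImplementation is also where the deferred load finally happens: entries whose backend name does not match are skipped without ever touching the library, and only a matching entry is forced with li.get() so the file's architecture can be checked. A hedged sketch of that two-stage filter with stub types (Impl, LazyImpl, and supportsArch are placeholders, not the gpt4all classes, and the "supports" check is faked rather than asked of a loaded library):

// Sketch: filter by backend name first (cheap), only then "load" and query (expensive).
// Stub types; not the gpt4all implementation.
#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct Impl {
    std::string backend;
    bool supportsArch(const std::string &arch) const { return arch == "llama"; } // stand-in check
};

struct LazyImpl {
    std::string backend;
    std::optional<Impl> impl;
    Impl &get() { if (!impl) impl.emplace(Impl{backend}); return *impl; } // the "dlopen" point
};

const Impl *findImpl(std::vector<LazyImpl> &list, const std::string &wantBackend, const std::string &arch)
{
    for (auto &li : list) {
        if (li.backend != wantBackend) continue; // no library load for non-matching backends
        auto &i = li.get();                      // load happens only here
        if (i.supportsArch(arch)) return &i;
    }
    return nullptr;
}

int main()
{
    std::vector<LazyImpl> list { {"cpu"}, {"cuda"} };
    const Impl *impl = findImpl(list, "cuda", "llama");
    std::cout << (impl ? "found " + impl->backend : std::string("not found")) << "\n";
    return 0;
}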
@@ -216,7 +196,7 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, const
     }
 
     for (const auto &desiredBackend: desiredBackends) {
-        const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));
+        const auto *impl = findImplementation(modelPath.c_str(), desiredBackend);
 
         if (impl) {
             // Construct llmodel implementation
@@ -251,11 +231,11 @@ LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::
 {
     static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;
 
-    const std::vector<Implementation> *impls;
+    std::vector<LazyImplementation> *impls;
     try {
-        impls = &implementationList();
+        impls = &getImplementations();
     } catch (const std::runtime_error &e) {
-        std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
+        std::cerr << __func__ << ": getImplementations() failed: " << e.what() << "\n";
         return nullptr;
     }
 
@@ -268,13 +248,15 @@ LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::
 
     const Implementation *impl = nullptr;
 
-    for (const auto &desiredBackend: desiredBackends) {
+    for (const auto &desiredBackend : desiredBackends) {
         auto cacheIt = implCache.find(desiredBackend);
         if (cacheIt != implCache.end())
             return cacheIt->second.get(); // cached
 
-        for (const auto &i: *impls) {
-            if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
+        for (auto &li : *impls) {
+            if (li.buildBackend == desiredBackend) {
+                auto &i = li.get();
+                assert(i.m_modelType == "LLaMA");
                 impl = &i;
                 break;
             }
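constructGlobalLlama keeps one shared model per backend in a static unordered_map, so a cache hit returns immediately and, combined with the lazy list, a backend that is never requested is never dlopened at all. A stripped-down sketch of that get-or-create caching pattern; Model and getGlobalModel are placeholders, not the gpt4all API:

// Sketch: cache one instance per backend key, creating it on first request.
// Placeholder types; not the gpt4all implementation.
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

struct Model {
    std::string backend;
    explicit Model(std::string b) : backend(std::move(b))
    {
        std::cout << "loading backend " << backend << "\n"; // stands in for dlopen + construct()
    }
};

Model *getGlobalModel(const std::string &backend)
{
    static std::unordered_map<std::string, std::unique_ptr<Model>> cache;

    auto it = cache.find(backend);
    if (it != cache.end())
        return it->second.get(); // cached: nothing is loaded again

    it = cache.emplace(backend, std::make_unique<Model>(backend)).first;
    return it->second.get();
}

int main()
{
    getGlobalModel("cuda"); // prints "loading backend cuda"
    getGlobalModel("cuda"); // cached, prints nothing
    getGlobalModel("cpu");  // prints "loading backend cpu"
    return 0;
}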
@@ -528,7 +528,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro
     bool actualDeviceIsCPU = true;
 
 #if defined(Q_OS_MAC) && defined(__aarch64__)
-    if (m_llModelInfo.model->implementation().buildVariant() == "metal")
+    if (m_llModelInfo.model->implementation().buildBackend() == "metal")
         actualDeviceIsCPU = false;
 #else
     if (requestedDevice != "CPU") {
@@ -108,7 +108,7 @@ bool EmbeddingLLMWorker::loadModel()
     bool actualDeviceIsCPU = true;
 
 #if defined(Q_OS_MAC) && defined(__aarch64__)
-    if (m_model->implementation().buildVariant() == "metal")
+    if (m_model->implementation().buildBackend() == "metal")
         actualDeviceIsCPU = false;
 #else
     if (requestedDevice != "CPU") {