Mirror of https://github.com/nomic-ai/gpt4all.git (synced 2025-09-06 02:50:36 +00:00)

chat: major UI redesign for v3.0.0 (#2396)

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
@@ -20,24 +20,28 @@ namespace fs = std::filesystem;
 
 #ifndef _WIN32
 
-Dlhandle::Dlhandle(const fs::path &fpath) {
+Dlhandle::Dlhandle(const fs::path &fpath)
+{
     chandle = dlopen(fpath.c_str(), RTLD_LAZY | RTLD_LOCAL);
     if (!chandle) {
         throw Exception("dlopen: "s + dlerror());
     }
 }
 
-Dlhandle::~Dlhandle() {
+Dlhandle::~Dlhandle()
+{
     if (chandle) dlclose(chandle);
 }
 
-void *Dlhandle::get_internal(const char *symbol) const {
+void *Dlhandle::get_internal(const char *symbol) const
+{
     return dlsym(chandle, symbol);
 }
 
 #else // defined(_WIN32)
 
-Dlhandle::Dlhandle(const fs::path &fpath) {
+Dlhandle::Dlhandle(const fs::path &fpath)
+{
     fs::path afpath = fs::absolute(fpath);
 
     // Suppress the "Entry Point Not Found" dialog, caused by outdated nvcuda.dll from the GPU driver
@@ -58,11 +62,13 @@ Dlhandle::Dlhandle(const fs::path &fpath) {
     }
 }
 
-Dlhandle::~Dlhandle() {
+Dlhandle::~Dlhandle()
+{
     if (chandle) FreeLibrary(HMODULE(chandle));
 }
 
-void *Dlhandle::get_internal(const char *symbol) const {
+void *Dlhandle::get_internal(const char *symbol) const
+{
     return GetProcAddress(HMODULE(chandle), symbol);
 }
 
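Aside (not part of the commit): Dlhandle is a small RAII wrapper over dlopen/dlclose on POSIX and LoadLibrary/FreeLibrary on Windows; the hunks above only change its brace style. A minimal usage sketch, assuming the templated Dlhandle::get<T>() helper (used later in this diff) that casts get_internal()'s result to a function pointer, and a made-up library path:

#include <exception>
#include <iostream>

#include "dlhandle.h"

int main()
{
    try {
        // Path is illustrative; the constructor throws if the library cannot be opened.
        Dlhandle dl("./libllamamodel-mainline-cpu.so");

        // get<T>() is assumed to return a T* obtained from get_internal(symbol).
        auto *getBuildVariant = dl.get<const char *()>("get_build_variant");
        if (getBuildVariant)
            std::cout << "build variant: " << getBuildVariant() << '\n';

        // When dl goes out of scope, its destructor dlclose()s / FreeLibrary()s the handle.
    } catch (const std::exception &e) {
        // Assumes Dlhandle::Exception derives from std::exception (it carries the dlopen error text).
        std::cerr << e.what() << '\n';
    }
}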
@@ -123,7 +123,8 @@ static bool kv_cache_init(
 }
 
 // load the model's weights from a file path
-bool gptj_model_load(const std::string &fname, gptj_model & model, gpt_vocab & vocab, size_t * mem_req = nullptr) {
+bool gptj_model_load(const std::string &fname, gptj_model & model, gpt_vocab & vocab, size_t * mem_req = nullptr)
+{
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
     if(mem_req != nullptr) {
         *mem_req = 0;
@@ -667,7 +668,8 @@ GPTJ::GPTJ()
     d_ptr->modelLoaded = false;
 }
 
-size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
+size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
+{
     (void)n_ctx;
     (void)ngl;
     gptj_model dummy_model;
@@ -677,7 +679,8 @@ size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
     return mem_req;
 }
 
-bool GPTJ::loadModel(const std::string &modelPath, int n_ctx, int ngl) {
+bool GPTJ::loadModel(const std::string &modelPath, int n_ctx, int ngl)
+{
     (void)n_ctx;
     (void)ngl;
     d_ptr->modelLoaded = false;
@@ -698,7 +701,8 @@ bool GPTJ::loadModel(const std::string &modelPath, int n_ctx, int ngl) {
     return true;
 }
 
-void GPTJ::setThreadCount(int32_t n_threads) {
+void GPTJ::setThreadCount(int32_t n_threads)
+{
     d_ptr->n_threads = n_threads;
 }
 
@@ -780,7 +784,8 @@ const std::vector<LLModel::Token> &GPTJ::endTokens() const
     return fres;
 }
 
-const char *get_arch_name(gguf_context *ctx_gguf) {
+const char *get_arch_name(gguf_context *ctx_gguf)
+{
     const int kid = gguf_find_key(ctx_gguf, "general.architecture");
     if (kid == -1)
         throw std::runtime_error("key not found in model: general.architecture");
@@ -799,19 +804,23 @@ const char *get_arch_name(gguf_context *ctx_gguf) {
 #endif
 
 extern "C" {
-DLL_EXPORT bool is_g4a_backend_model_implementation() {
+DLL_EXPORT bool is_g4a_backend_model_implementation()
+{
     return true;
 }
 
-DLL_EXPORT const char *get_model_type() {
+DLL_EXPORT const char *get_model_type()
+{
     return modelType_;
 }
 
-DLL_EXPORT const char *get_build_variant() {
+DLL_EXPORT const char *get_build_variant()
+{
     return GGML_BUILD_VARIANT;
 }
 
-DLL_EXPORT char *get_file_arch(const char *fname) {
+DLL_EXPORT char *get_file_arch(const char *fname)
+{
     struct ggml_context * ctx_meta = NULL;
     struct gguf_init_params params = {
         /*.no_alloc = */ true,
@@ -832,11 +841,13 @@ DLL_EXPORT char *get_file_arch(const char *fname) {
     return arch;
 }
 
-DLL_EXPORT bool is_arch_supported(const char *arch) {
+DLL_EXPORT bool is_arch_supported(const char *arch)
+{
     return !strcmp(arch, "gptj");
 }
 
-DLL_EXPORT LLModel *construct() {
+DLL_EXPORT LLModel *construct()
+{
     return new GPTJ;
 }
 }
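Aside (not part of the commit): every backend library repeats the same extern "C" surface seen above — is_g4a_backend_model_implementation, get_model_type, get_build_variant, get_file_arch, is_arch_supported, construct. A sketch of how a loader could probe one library through those symbols via Dlhandle; the symbol names come from the hunks, but the control flow and the malloc/free assumption for get_file_arch's result are illustrative only:

#include <cstdlib>
#include <string>

#include "dlhandle.h"
#include "llmodel.h"

static LLModel *probeGptjBackend(const std::string &libPath, const std::string &modelFile)
{
    Dlhandle dl(libPath);  // throws if the library cannot be opened

    // Only accept libraries that implement the gpt4all backend interface.
    auto *isImpl = dl.get<bool()>("is_g4a_backend_model_implementation");
    if (!isImpl || !isImpl())
        return nullptr;

    // Ask the library which architecture the model file uses and whether it handles it.
    auto *getFileArch   = dl.get<char *(const char *)>("get_file_arch");
    auto *archSupported = dl.get<bool(const char *)>("is_arch_supported");
    char *arch = getFileArch ? getFileArch(modelFile.c_str()) : nullptr;
    bool ok = arch && archSupported && archSupported(arch);
    std::free(arch);  // assumption: get_file_arch returns a malloc'd copy of the arch string

    if (!ok)
        return nullptr;

    // construct() hands back a heap-allocated LLModel that the caller owns.
    auto *construct = dl.get<LLModel *()>("construct");
    return construct ? construct() : nullptr;
}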
@@ -84,16 +84,19 @@ static const std::vector<const char *> EMBEDDING_ARCHES {
     "bert", "nomic-bert",
 };
 
-static bool is_embedding_arch(const std::string &arch) {
+static bool is_embedding_arch(const std::string &arch)
+{
     return std::find(EMBEDDING_ARCHES.begin(), EMBEDDING_ARCHES.end(), arch) < EMBEDDING_ARCHES.end();
 }
 
-static bool llama_verbose() {
+static bool llama_verbose()
+{
     const char* var = getenv("GPT4ALL_VERBOSE_LLAMACPP");
     return var && *var;
 }
 
-static void llama_log_callback(enum ggml_log_level level, const char *text, void *userdata) {
+static void llama_log_callback(enum ggml_log_level level, const char *text, void *userdata)
+{
     (void)userdata;
     if (llama_verbose() || level <= GGML_LOG_LEVEL_ERROR) {
         fputs(text, stderr);
@@ -147,7 +150,8 @@ static int llama_sample_top_p_top_k(
     return llama_sample_token(ctx, &candidates_p);
 }
 
-const char *get_arch_name(gguf_context *ctx_gguf) {
+const char *get_arch_name(gguf_context *ctx_gguf)
+{
     const int kid = gguf_find_key(ctx_gguf, "general.architecture");
     if (kid == -1)
         throw std::runtime_error("key not found in model: general.architecture");
@@ -159,7 +163,8 @@ const char *get_arch_name(gguf_context *ctx_gguf) {
     return gguf_get_val_str(ctx_gguf, kid);
 }
 
-static gguf_context *load_gguf(const char *fname) {
+static gguf_context *load_gguf(const char *fname)
+{
     struct gguf_init_params params = {
         /*.no_alloc = */ true,
         /*.ctx = */ nullptr,
@@ -180,7 +185,8 @@ static gguf_context *load_gguf(const char *fname) {
     return ctx;
 }
 
-static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey) {
+static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey)
+{
     int32_t value = -1;
     std::string arch;
 
@@ -237,7 +243,8 @@ struct llama_file_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 };
 
-size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
+size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
+{
     // TODO(cebtenzzre): update to GGUF
     (void)ngl; // FIXME(cetenzzre): use this value
     auto fin = std::ifstream(modelPath, std::ios::binary);
@@ -261,7 +268,8 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
     return filesize + est_kvcache_size;
 }
 
-bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const {
+bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const
+{
     auto * ctx = load_gguf(modelPath.c_str());
     if (!ctx) {
         std::cerr << __func__ << ": failed to load " << modelPath << "\n";
@@ -297,7 +305,8 @@ bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const {
     return res;
 }
 
-bool LLamaModel::isEmbeddingModel(const std::string &modelPath) const {
+bool LLamaModel::isEmbeddingModel(const std::string &modelPath) const
+{
     bool result = false;
     std::string arch;
 
@@ -453,12 +462,14 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     return true;
 }
 
-void LLamaModel::setThreadCount(int32_t n_threads) {
+void LLamaModel::setThreadCount(int32_t n_threads)
+{
     d_ptr->n_threads = n_threads;
     llama_set_n_threads(d_ptr->ctx, n_threads, n_threads);
 }
 
-int32_t LLamaModel::threadCount() const {
+int32_t LLamaModel::threadCount() const
+{
     return d_ptr->n_threads;
 }
 
@@ -581,7 +592,8 @@ int32_t LLamaModel::layerCount(std::string const &modelPath) const
 }
 
 #ifdef GGML_USE_VULKAN
-static const char *getVulkanVendorName(uint32_t vendorID) {
+static const char *getVulkanVendorName(uint32_t vendorID)
+{
     switch (vendorID) {
         case 0x10DE: return "nvidia";
         case 0x1002: return "amd";
@@ -738,11 +750,13 @@ bool LLamaModel::usingGPUDevice() const
     return hasDevice;
 }
 
-const char *LLamaModel::backendName() const {
+const char *LLamaModel::backendName() const
+{
     return d_ptr->backend_name;
 }
 
-const char *LLamaModel::gpuDeviceName() const {
+const char *LLamaModel::gpuDeviceName() const
+{
     if (usingGPUDevice()) {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
         return d_ptr->deviceName.c_str();
@@ -768,13 +782,15 @@ void llama_batch_add(
     batch.n_tokens++;
 }
 
-static void batch_add_seq(llama_batch &batch, const std::vector<LLModel::Token> &tokens, int seq_id) {
+static void batch_add_seq(llama_batch &batch, const std::vector<LLModel::Token> &tokens, int seq_id)
+{
     for (unsigned i = 0; i < tokens.size(); i++) {
         llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
     }
 }
 
-size_t LLamaModel::embeddingSize() const {
+size_t LLamaModel::embeddingSize() const
+{
     return llama_n_embd(d_ptr->model);
 }
 
@@ -894,12 +910,14 @@ void LLamaModel::embed(
 // MD5 hash of "nomic empty"
 static const char EMPTY_PLACEHOLDER[] = "24df574ea1c998de59d5be15e769658e";
 
-auto product(double a) -> std::function<double(double)> {
+auto product(double a) -> std::function<double(double)>
+{
     return [a](double b) { return a * b; };
 }
 
 template <typename T>
-double getL2NormScale(T *start, T *end) {
+double getL2NormScale(T *start, T *end)
+{
     double magnitude = std::sqrt(std::inner_product(start, end, start, 0.0));
     return 1.0 / std::max(magnitude, 1e-12);
 }
@@ -1107,19 +1125,23 @@ void LLamaModel::embedInternal(
 #endif
 
 extern "C" {
-DLL_EXPORT bool is_g4a_backend_model_implementation() {
+DLL_EXPORT bool is_g4a_backend_model_implementation()
+{
     return true;
 }
 
-DLL_EXPORT const char *get_model_type() {
+DLL_EXPORT const char *get_model_type()
+{
     return modelType_;
 }
 
-DLL_EXPORT const char *get_build_variant() {
+DLL_EXPORT const char *get_build_variant()
+{
     return GGML_BUILD_VARIANT;
 }
 
-DLL_EXPORT char *get_file_arch(const char *fname) {
+DLL_EXPORT char *get_file_arch(const char *fname)
+{
     char *arch = nullptr;
     std::string archStr;
 
@@ -1144,11 +1166,13 @@ cleanup:
    return arch;
 }
 
-DLL_EXPORT bool is_arch_supported(const char *arch) {
+DLL_EXPORT bool is_arch_supported(const char *arch)
+{
     return std::find(KNOWN_ARCHES.begin(), KNOWN_ARCHES.end(), std::string(arch)) < KNOWN_ARCHES.end();
 }
 
-DLL_EXPORT LLModel *construct() {
+DLL_EXPORT LLModel *construct()
+{
     llama_log_set(llama_log_callback, nullptr);
     return new LLamaModel;
 }
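Aside (not part of the commit): the llama_verbose()/llama_log_callback pair above gates llama.cpp's log output behind the GPT4ALL_VERBOSE_LLAMACPP environment variable while always letting errors through. A self-contained sketch of the same pattern using a generic log level instead of llama.cpp's callback registration, purely for illustration:

#include <cstdio>
#include <cstdlib>

// Smaller value = more severe, mirroring the ordering the hunk relies on.
enum LogLevel { LOG_ERROR = 2, LOG_WARN = 3, LOG_INFO = 4 };

static bool verbose_enabled()
{
    // Same idea as llama_verbose(): any non-empty value turns verbose logging on.
    const char *var = getenv("GPT4ALL_VERBOSE_LLAMACPP");
    return var && *var;
}

static void log_callback(LogLevel level, const char *text)
{
    // Errors always reach stderr; everything else only when the env var is set.
    if (verbose_enabled() || level <= LOG_ERROR)
        fputs(text, stderr);
}

int main()
{
    log_callback(LOG_INFO, "loaded model\n");   // printed only with GPT4ALL_VERBOSE_LLAMACPP=1
    log_callback(LOG_ERROR, "out of memory\n"); // always printed
}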
@@ -92,17 +92,20 @@ LLModel::Implementation::Implementation(Implementation &&o)
     o.m_dlhandle = nullptr;
 }
 
-LLModel::Implementation::~Implementation() {
+LLModel::Implementation::~Implementation()
+{
     delete m_dlhandle;
 }
 
-static bool isImplementation(const Dlhandle &dl) {
+static bool isImplementation(const Dlhandle &dl)
+{
     return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
 }
 
 // Add the CUDA Toolkit to the DLL search path on Windows.
 // This is necessary for chat.exe to find CUDA when started from Qt Creator.
-static void addCudaSearchPath() {
+static void addCudaSearchPath()
+{
 #ifdef _WIN32
     if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) {
         auto libDir = std::wstring(cudaPath) + L"\\bin";
@@ -114,7 +117,8 @@ static void addCudaSearchPath() {
 #endif
 }
 
-const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList() {
+const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList()
+{
     if (cpu_supports_avx() == 0) {
         throw std::runtime_error("CPU does not support AVX");
     }
@@ -169,14 +173,16 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
     return *libs;
 }
 
-static std::string applyCPUVariant(const std::string &buildVariant) {
+static std::string applyCPUVariant(const std::string &buildVariant)
+{
     if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
         return buildVariant + "-avxonly";
     }
     return buildVariant;
 }
 
-const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
+const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant)
+{
     bool buildVariantMatched = false;
     std::optional<std::string> archName;
     for (const auto& i : implementationList()) {
@@ -200,7 +206,8 @@ const LLModel::Implementation* LLModel::Implementation::implementation(const cha
     throw BadArchError(std::move(*archName));
 }
 
-LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx) {
+LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx)
+{
     std::vector<std::string> desiredBackends;
     if (backend != "auto") {
         desiredBackends.push_back(backend);
@@ -240,7 +247,8 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, const
     throw MissingImplementationError("Could not find any implementations for backend: " + backend);
 }
 
-LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::string> &backend) {
+LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::string> &backend)
+{
     static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;
 
     const std::vector<Implementation> *impls;
@@ -284,7 +292,8 @@ LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::
     return nullptr;
 }
 
-std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired) {
+std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired)
+{
     std::vector<LLModel::GPUDevice> devices;
 #ifndef __APPLE__
     static const std::string backends[] = {"kompute", "cuda"};
@@ -299,33 +308,40 @@ std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(siz
     return devices;
 }
 
-int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) {
+int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath)
+{
     auto *llama = constructGlobalLlama();
     return llama ? llama->maxContextLength(modelPath) : -1;
 }
 
-int32_t LLModel::Implementation::layerCount(const std::string &modelPath) {
+int32_t LLModel::Implementation::layerCount(const std::string &modelPath)
+{
     auto *llama = constructGlobalLlama();
     return llama ? llama->layerCount(modelPath) : -1;
 }
 
-bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath) {
+bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath)
+{
     auto *llama = constructGlobalLlama();
     return llama && llama->isEmbeddingModel(modelPath);
 }
 
-void LLModel::Implementation::setImplementationsSearchPath(const std::string& path) {
+void LLModel::Implementation::setImplementationsSearchPath(const std::string& path)
+{
     s_implementations_search_path = path;
 }
 
-const std::string& LLModel::Implementation::implementationsSearchPath() {
+const std::string& LLModel::Implementation::implementationsSearchPath()
+{
     return s_implementations_search_path;
 }
 
-bool LLModel::Implementation::hasSupportedCPU() {
+bool LLModel::Implementation::hasSupportedCPU()
+{
     return cpu_supports_avx() != 0;
 }
 
-int LLModel::Implementation::cpuSupportsAVX2() {
+int LLModel::Implementation::cpuSupportsAVX2()
+{
     return cpu_supports_avx2();
 }
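Aside (not part of the commit): applyCPUVariant above maps a requested build variant to its "-avxonly" flavor when the host CPU lacks AVX2, leaving "metal" alone. A tiny self-contained illustration of the same mapping with a stubbed CPU probe so the behavior is deterministic; everything except the variant strings is a placeholder:

#include <iostream>
#include <string>

// Stand-in for the real cpu_supports_avx2() probe; hard-coded so the example is predictable.
static int fake_cpu_supports_avx2() { return 0; }  // pretend AVX2 is unavailable

static std::string applyCPUVariantSketch(const std::string &buildVariant)
{
    // Metal builds presumably never need the fallback, matching the check in the hunk.
    if (buildVariant != "metal" && fake_cpu_supports_avx2() == 0)
        return buildVariant + "-avxonly";
    return buildVariant;
}

int main()
{
    std::cout << applyCPUVariantSketch("cuda")  << '\n';  // prints: cuda-avxonly
    std::cout << applyCPUVariantSketch("metal") << '\n';  // prints: metal
}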
@@ -20,7 +20,8 @@ struct LLModelWrapper {
     ~LLModelWrapper() { delete llModel; }
 };
 
-llmodel_model llmodel_model_create(const char *model_path) {
+llmodel_model llmodel_model_create(const char *model_path)
+{
     const char *error;
     auto fres = llmodel_model_create2(model_path, "auto", &error);
     if (!fres) {
@@ -29,7 +30,8 @@ llmodel_model llmodel_model_create(const char *model_path) {
     return fres;
 }
 
-static void llmodel_set_error(const char **errptr, const char *message) {
+static void llmodel_set_error(const char **errptr, const char *message)
+{
     thread_local static std::string last_error_message;
     if (errptr) {
         last_error_message = message;
@@ -37,7 +39,8 @@ static void llmodel_set_error(const char **errptr, const char *message) {
     }
 }
 
-llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error) {
+llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error)
+{
     LLModel *llModel;
     try {
         llModel = LLModel::Implementation::construct(model_path, backend);
@@ -51,7 +54,8 @@ llmodel_model llmodel_model_create2(const char *model_path, const char *backend,
     return wrapper;
 }
 
-void llmodel_model_destroy(llmodel_model model) {
+void llmodel_model_destroy(llmodel_model model)
+{
     delete static_cast<LLModelWrapper *>(model);
 }
 
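Aside (not part of the commit): the C wrapper above pairs llmodel_model_create2 with llmodel_model_destroy and reports failures through the error out-parameter. A minimal caller sketch, assuming only the functions shown in the hunks (declared in llmodel_c.h); the model path is a placeholder:

#include <cstdio>

#include "llmodel_c.h"

int main()
{
    const char *error = nullptr;
    // "auto" defers backend selection to LLModel::Implementation::construct (cpu/kompute/cuda/metal).
    llmodel_model model = llmodel_model_create2("/path/to/model.gguf", "auto", &error);
    if (!model) {
        std::fprintf(stderr, "failed to load model: %s\n", error ? error : "unknown error");
        return 1;
    }

    // ... prompt or embed through the rest of the C API ...

    llmodel_model_destroy(model);  // deletes the wrapper and the underlying LLModel
    return 0;
}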
@@ -14,7 +14,8 @@
 #include <vector>
 
 // TODO(cebtenzzre): replace this with llama_kv_cache_seq_shift for llamamodel (GPT-J needs this as-is)
-void LLModel::recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate) {
+void LLModel::recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate)
+{
     int n_keep = shouldAddBOS();
     const int32_t n_discard = (promptCtx.n_ctx - n_keep) * promptCtx.contextErase;
 
@@ -43,7 +44,8 @@ stop_generating:
     recalculate(false);
 }
 
-static bool parsePromptTemplate(const std::string &tmpl, std::vector<std::smatch> &placeholders, std::string &err) {
+static bool parsePromptTemplate(const std::string &tmpl, std::vector<std::smatch> &placeholders, std::string &err)
+{
     static const std::regex placeholderRegex(R"(%[1-2](?![0-9]))");
 
     auto it = std::sregex_iterator(tmpl.begin(), tmpl.end(), placeholderRegex);
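Aside (not part of the commit): the placeholderRegex above matches the %1 and %2 slots of a prompt template while the negative lookahead rejects higher-numbered placeholders such as %10. A small standalone check of that regex, independent of parsePromptTemplate itself; the template string is a made-up example:

#include <iostream>
#include <regex>
#include <string>

int main()
{
    // Same pattern as in the hunk: %1 or %2, not followed by another digit.
    static const std::regex placeholderRegex(R"(%[1-2](?![0-9]))");

    const std::string tmpl = "### Human:\n%1\n### Assistant:\n%2";
    for (auto it = std::sregex_iterator(tmpl.begin(), tmpl.end(), placeholderRegex);
         it != std::sregex_iterator(); ++it) {
        std::cout << "placeholder " << it->str() << " at offset " << it->position() << '\n';
    }
    // "%10" would not match, because the lookahead rejects a trailing digit.
}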
@@ -38,7 +38,8 @@ struct llm_kv_cache {
     }
 };
 
-inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads) {
+inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads)
+{
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
     if (plan.work_size > 0) {
         buf.resize(plan.work_size);
@@ -8,7 +8,8 @@
 #include <regex>
 #include <utility>
 
-void replace(std::string & str, const std::string & needle, const std::string & replacement) {
+void replace(std::string & str, const std::string & needle, const std::string & replacement)
+{
     size_t pos = 0;
     while ((pos = str.find(needle, pos)) != std::string::npos) {
         str.replace(pos, needle.length(), replacement);
@@ -16,7 +17,8 @@ void replace(std::string & str, const std::string & needle, const std::string &
     }
 }
 
-std::map<std::string, int32_t> json_parse(const std::string & fname) {
+std::map<std::string, int32_t> json_parse(const std::string & fname)
+{
     std::map<std::string, int32_t> result;
 
     // read file into string
@@ -107,7 +109,8 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
     return result;
 }
 
-std::vector<gpt_vocab::id> gpt_tokenize_inner(const gpt_vocab & vocab, const std::string & text) {
+std::vector<gpt_vocab::id> gpt_tokenize_inner(const gpt_vocab & vocab, const std::string & text)
+{
     std::vector<std::string> words;
 
     // first split the text into words
@@ -162,12 +165,14 @@ std::vector<gpt_vocab::id> gpt_tokenize_inner(const gpt_vocab & vocab, const std
     return tokens;
 }
 
-std::string regex_escape(const std::string &s) {
+std::string regex_escape(const std::string &s)
+{
     static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
     return std::regex_replace(s, metacharacters, "\\$&");
 }
 
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text)
+{
     // Generate the subpattern from the special_tokens vector if it's not empty
     if (!vocab.special_tokens.empty()) {
         std::vector<gpt_vocab::id> out;
@@ -203,7 +208,8 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 }
 
 
-bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
+bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab)
+{
     printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
 
     vocab.token_to_id = ::json_parse(fname);
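Aside (not part of the commit): replace() and regex_escape() above are small string utilities — the first does in-place substring substitution, the second escapes regex metacharacters so a special token can be embedded literally in a pattern, as gpt_tokenize does. A short sketch combining local copies of both (the special-token string is just an example):

#include <iostream>
#include <regex>
#include <string>

// Local copies of the two helpers from the hunks above, so the sketch is self-contained.
static void replace_all(std::string &str, const std::string &needle, const std::string &replacement)
{
    size_t pos = 0;
    while ((pos = str.find(needle, pos)) != std::string::npos) {
        str.replace(pos, needle.length(), replacement);
        pos += replacement.length();
    }
}

static std::string regex_escape_sketch(const std::string &s)
{
    static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
    return std::regex_replace(s, metacharacters, "\\$&");
}

int main()
{
    std::string prompt = "Hello <|endoftext|> world";
    replace_all(prompt, "<|endoftext|>", "");                  // strip a special token in place
    std::cout << prompt << '\n';                               // "Hello  world"

    // The escaped token can be used safely inside a larger regex alternation.
    std::cout << regex_escape_sketch("<|endoftext|>") << '\n'; // "<\|endoftext\|>"
}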
@@ -14,7 +14,8 @@
 //
 // General purpose inline functions
 //
-constexpr inline unsigned long long operator ""_MiB(unsigned long long bytes) {
+constexpr inline unsigned long long operator ""_MiB(unsigned long long bytes)
+{
     return bytes*1024*1024;
 }
 
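Aside (not part of the commit): the _MiB user-defined literal above simply scales an integer literal by 1024*1024. A trivial usage sketch; the buffer-size constant is made up for illustration:

#include <cstdio>

// Same definition as in the hunk above, repeated so this sketch compiles on its own.
constexpr inline unsigned long long operator ""_MiB(unsigned long long bytes)
{
    return bytes * 1024 * 1024;
}

int main()
{
    constexpr auto kScratchBytes = 256_MiB;        // hypothetical scratch-buffer size
    std::printf("%llu bytes\n", kScratchBytes);    // prints 268435456
}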