implement local Nomic Embed via llama.cpp (#2086)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
@@ -4,6 +4,7 @@
 #include <cerrno>
 #include <cstring>
 #include <iostream>
+#include <optional>
 #include <utility>
 
 struct LLModelWrapper {
@@ -41,22 +42,22 @@ llmodel_model llmodel_model_create2(const char *model_path, const char *build_va
             *error = last_error_message.c_str();
         }
     }
-    return reinterpret_cast<llmodel_model*>(wrapper);
+    return wrapper;
 }
 
 void llmodel_model_destroy(llmodel_model model) {
-    delete reinterpret_cast<LLModelWrapper*>(model);
+    delete static_cast<LLModelWrapper *>(model);
 }
 
 size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx, int ngl)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
     return wrapper->llModel->requiredMem(model_path, n_ctx, ngl);
 }
 
 bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, int ngl)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
 
     std::string modelPath(model_path);
     if (wrapper->llModel->isModelBlacklisted(modelPath)) {
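The cast changes in this hunk (and the ones below) all follow one pattern: llmodel_model is an opaque void * handle that always originates from a LLModelWrapper *, so static_cast suffices in both directions, and the create path needs no cast at all. Note that the old return line even cast to llmodel_model* (one pointer level too many), a mismatch reinterpret_cast compiled without complaint. A minimal stand-alone sketch of the round-trip, with a stand-in LLModelWrapper:

// Sketch of the opaque-handle round-trip the casts above rely on. Assumes
// llmodel_model is a typedef for void *, as the bare `return wrapper;`
// implies; this LLModelWrapper is a stand-in, not the real struct.
typedef void *llmodel_model;

struct LLModelWrapper { int placeholder; };

llmodel_model wrapper_create() {
    auto *wrapper = new LLModelWrapper;
    return wrapper; // implicit LLModelWrapper* -> void*, so no cast is needed
}

void wrapper_destroy(llmodel_model model) {
    // static_cast documents that this void* was created from a
    // LLModelWrapper*. reinterpret_cast would also compile, but it accepts
    // unrelated pointer-to-pointer conversions like the old
    // reinterpret_cast<llmodel_model*> without a diagnostic.
    delete static_cast<LLModelWrapper *>(model);
}

int main() {
    llmodel_model m = wrapper_create();
    wrapper_destroy(m);
    return 0;
}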
@@ -69,44 +70,28 @@ bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, i
 
 bool llmodel_isModelLoaded(llmodel_model model)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
     return wrapper->llModel->isModelLoaded();
 }
 
 uint64_t llmodel_get_state_size(llmodel_model model)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
     return wrapper->llModel->stateSize();
 }
 
 uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
     return wrapper->llModel->saveState(dest);
 }
 
 uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
     return wrapper->llModel->restoreState(src);
 }
 
-// Wrapper functions for the C callbacks
-bool prompt_wrapper(int32_t token_id, void *user_data) {
-    llmodel_prompt_callback callback = reinterpret_cast<llmodel_prompt_callback>(user_data);
-    return callback(token_id);
-}
-
-bool response_wrapper(int32_t token_id, const std::string &response, void *user_data) {
-    llmodel_response_callback callback = reinterpret_cast<llmodel_response_callback>(user_data);
-    return callback(token_id, response.c_str());
-}
-
-bool recalculate_wrapper(bool is_recalculating, void *user_data) {
-    llmodel_recalculate_callback callback = reinterpret_cast<llmodel_recalculate_callback>(user_data);
-    return callback(is_recalculating);
-}
-
 void llmodel_prompt(llmodel_model model, const char *prompt,
                     const char *prompt_template,
                     llmodel_prompt_callback prompt_callback,
@@ -116,15 +101,11 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
                     bool special,
                     const char *fake_reply)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
 
-    // Create std::function wrappers that call the C function pointers
-    std::function<bool(int32_t)> prompt_func =
-        std::bind(&prompt_wrapper, std::placeholders::_1, reinterpret_cast<void*>(prompt_callback));
-    std::function<bool(int32_t, const std::string&)> response_func =
-        std::bind(&response_wrapper, std::placeholders::_1, std::placeholders::_2, reinterpret_cast<void*>(response_callback));
-    std::function<bool(bool)> recalc_func =
-        std::bind(&recalculate_wrapper, std::placeholders::_1, reinterpret_cast<void*>(recalculate_callback));
+    auto response_func = [response_callback](int32_t token_id, const std::string &response) {
+        return response_callback(token_id, response.c_str());
+    };
 
     if (size_t(ctx->n_past) < wrapper->promptContext.tokens.size())
         wrapper->promptContext.tokens.resize(ctx->n_past);
@@ -147,8 +128,8 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
     auto *fake_reply_p = fake_reply ? &fake_reply_str : nullptr;
 
     // Call the C++ prompt method
-    wrapper->llModel->prompt(prompt, prompt_template, prompt_func, response_func, recalc_func, wrapper->promptContext,
-                             special, fake_reply_p);
+    wrapper->llModel->prompt(prompt, prompt_template, prompt_callback, response_func, recalculate_callback,
+                             wrapper->promptContext, special, fake_reply_p);
 
     // Update the C context by giving access to the wrappers raw pointers to std::vector data
     // which involves no copies
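The deleted prompt_wrapper/response_wrapper/recalculate_wrapper functions and their std::bind glue existed only to funnel C function pointers through a void *user_data parameter. They are unnecessary because a plain function pointer converts to std::function directly, which is presumably what the C++ prompt() accepts given the direct pass-through above; only the response callback needs an adapter, since the C++ side hands back a std::string while the C callback takes const char *. A stand-alone sketch, with the callback typedefs re-declared here (mirroring llmodel_c.h) so it compiles on its own:

#include <cstdint>
#include <functional>
#include <iostream>
#include <string>

typedef bool (*llmodel_prompt_callback)(int32_t token_id);
typedef bool (*llmodel_response_callback)(int32_t token_id, const char *response);

static bool my_prompt_cb(int32_t) { return true; }
static bool my_response_cb(int32_t, const char *response) {
    std::cout << response;
    return true;
}

int main() {
    // A function pointer converts to std::function directly, so the C
    // callback can be passed straight through; no wrapper or bind needed.
    std::function<bool(int32_t)> prompt_func = my_prompt_cb;

    // The response callback is the one exception: the C++ side produces a
    // std::string, the C side wants const char *, so a small lambda adapts.
    llmodel_response_callback response_callback = my_response_cb;
    std::function<bool(int32_t, const std::string &)> response_func =
        [response_callback](int32_t token_id, const std::string &response) {
            return response_callback(token_id, response.c_str());
        };

    prompt_func(0);
    response_func(0, "hello\n");
    return 0;
}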
@@ -171,38 +152,60 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
     ctx->context_erase = wrapper->promptContext.contextErase;
 }
 
-float *llmodel_embedding(llmodel_model model, const char *text, size_t *embedding_size)
-{
-    if (model == nullptr || text == nullptr || !strlen(text)) {
-        *embedding_size = 0;
+float *llmodel_embed(
+    llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix, int dimensionality,
+    bool do_mean, bool atlas, const char **error
+) {
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
+
+    if (!texts || !*texts) {
+        if (error)
+            *error = strdup("'texts' is NULL or empty");
         return nullptr;
     }
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
-    std::vector<float> embeddingVector = wrapper->llModel->embedding(text);
-    float *embedding = (float *)malloc(embeddingVector.size() * sizeof(float));
-    if (embedding == nullptr) {
-        *embedding_size = 0;
+
+    std::vector<std::string> textsVec;
+    while (*texts) { textsVec.emplace_back(*texts++); }
+
+    size_t embd_size;
+    float *embedding;
+
+    try {
+        embd_size = wrapper->llModel->embeddingSize();
+        if (dimensionality > 0 && dimensionality < int(embd_size))
+            embd_size = dimensionality;
+
+        embd_size *= textsVec.size();
+
+        std::optional<std::string> prefixStr;
+        if (prefix) { prefixStr = prefix; }
+
+        embedding = new float[embd_size];
+        wrapper->llModel->embed(textsVec, embedding, prefixStr, dimensionality, do_mean, atlas);
+    } catch (std::exception const &e) {
+        if (error)
+            *error = strdup(e.what());
         return nullptr;
     }
-    std::copy(embeddingVector.begin(), embeddingVector.end(), embedding);
-    *embedding_size = embeddingVector.size();
+
+    *embedding_size = embd_size;
     return embedding;
 }
 
 void llmodel_free_embedding(float *ptr)
 {
-    free(ptr);
+    delete[] ptr;
 }
 
 void llmodel_setThreadCount(llmodel_model model, int32_t n_threads)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
     wrapper->llModel->setThreadCount(n_threads);
 }
 
 int32_t llmodel_threadCount(llmodel_model model)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
     return wrapper->llModel->threadCount();
 }
 
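For reference, a hedged usage sketch of the new entry point. Only the llmodel_embed and llmodel_free_embedding signatures (and the NULL-terminated texts convention) come from this diff; the model file name, n_ctx/ngl values, "auto" build variant, and the "search_query" task prefix are illustrative assumptions, not part of the commit.

// Usage sketch for the new llmodel_embed C API; values marked above as
// assumptions are placeholders, not taken from the commit.
#include <cstdio>
#include <cstdlib>
#include "llmodel_c.h"

int main() {
    const char *error = nullptr;
    llmodel_model model = llmodel_model_create2("nomic-embed-text-v1.f16.gguf", "auto", &error);
    if (!model) {
        std::fprintf(stderr, "create failed: %s\n", error ? error : "unknown");
        return 1;
    }
    if (!llmodel_loadModel(model, "nomic-embed-text-v1.f16.gguf", /*n_ctx=*/2048, /*ngl=*/100)) {
        llmodel_model_destroy(model);
        return 1;
    }

    // texts is a NULL-terminated array; llmodel_embed walks it with `while (*texts)`.
    const char *texts[] = { "first document", "second document", nullptr };
    size_t embd_size = 0;
    float *embd = llmodel_embed(model, texts, &embd_size, /*prefix=*/"search_query",
                                /*dimensionality=*/-1, /*do_mean=*/false, /*atlas=*/false, &error);
    if (!embd) {
        std::fprintf(stderr, "embed failed: %s\n", error ? error : "unknown");
        std::free(const_cast<char *>(error)); // embed errors are strdup'd above
    } else {
        // embd_size is embeddingSize() (or the requested dimensionality)
        // times the number of texts: the embeddings are concatenated.
        std::printf("got %zu floats\n", embd_size);
        llmodel_free_embedding(embd); // pairs with the new[] in llmodel_embed
    }
    llmodel_model_destroy(model);
    return 0;
}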
@@ -218,7 +221,7 @@ const char *llmodel_get_implementation_search_path()
 
 struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
     std::vector<LLModel::GPUDevice> devices = wrapper->llModel->availableGPUDevices(memoryRequired);
 
     // Set the num_devices
@@ -242,24 +245,24 @@ struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, si
 
 bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
     return wrapper->llModel->initializeGPUDevice(memoryRequired, std::string(device));
 }
 
 bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
     return wrapper->llModel->initializeGPUDevice(device->index);
 }
 
 bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
     return wrapper->llModel->initializeGPUDevice(device);
 }
 
 bool llmodel_has_gpu_device(llmodel_model model)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
     return wrapper->llModel->hasGPUDevice();
 }