implement local Nomic Embed via llama.cpp (#2086)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
This commit is contained in:
Jared Van Bortel
2024-03-13 18:09:24 -04:00
committed by GitHub
parent 171f4e488e
commit 406e88b59a
23 changed files with 799 additions and 1198 deletions

View File

@@ -4,6 +4,7 @@
#include <cerrno>
#include <cstring>
#include <iostream>
#include <optional>
#include <utility>
struct LLModelWrapper {
@@ -41,22 +42,22 @@ llmodel_model llmodel_model_create2(const char *model_path, const char *build_va
*error = last_error_message.c_str();
}
}
return reinterpret_cast<llmodel_model*>(wrapper);
return wrapper;
}
void llmodel_model_destroy(llmodel_model model) {
delete reinterpret_cast<LLModelWrapper*>(model);
delete static_cast<LLModelWrapper *>(model);
}
size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx, int ngl)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->requiredMem(model_path, n_ctx, ngl);
}
bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, int ngl)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
auto *wrapper = static_cast<LLModelWrapper *>(model);
std::string modelPath(model_path);
if (wrapper->llModel->isModelBlacklisted(modelPath)) {
@@ -69,44 +70,28 @@ bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, i
bool llmodel_isModelLoaded(llmodel_model model)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->isModelLoaded();
}
uint64_t llmodel_get_state_size(llmodel_model model)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->stateSize();
}
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->saveState(dest);
}
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->restoreState(src);
}
// Wrapper functions for the C callbacks
bool prompt_wrapper(int32_t token_id, void *user_data) {
llmodel_prompt_callback callback = reinterpret_cast<llmodel_prompt_callback>(user_data);
return callback(token_id);
}
bool response_wrapper(int32_t token_id, const std::string &response, void *user_data) {
llmodel_response_callback callback = reinterpret_cast<llmodel_response_callback>(user_data);
return callback(token_id, response.c_str());
}
bool recalculate_wrapper(bool is_recalculating, void *user_data) {
llmodel_recalculate_callback callback = reinterpret_cast<llmodel_recalculate_callback>(user_data);
return callback(is_recalculating);
}
void llmodel_prompt(llmodel_model model, const char *prompt,
const char *prompt_template,
llmodel_prompt_callback prompt_callback,
@@ -116,15 +101,11 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
bool special,
const char *fake_reply)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
auto *wrapper = static_cast<LLModelWrapper *>(model);
// Create std::function wrappers that call the C function pointers
std::function<bool(int32_t)> prompt_func =
std::bind(&prompt_wrapper, std::placeholders::_1, reinterpret_cast<void*>(prompt_callback));
std::function<bool(int32_t, const std::string&)> response_func =
std::bind(&response_wrapper, std::placeholders::_1, std::placeholders::_2, reinterpret_cast<void*>(response_callback));
std::function<bool(bool)> recalc_func =
std::bind(&recalculate_wrapper, std::placeholders::_1, reinterpret_cast<void*>(recalculate_callback));
auto response_func = [response_callback](int32_t token_id, const std::string &response) {
return response_callback(token_id, response.c_str());
};
if (size_t(ctx->n_past) < wrapper->promptContext.tokens.size())
wrapper->promptContext.tokens.resize(ctx->n_past);
@@ -147,8 +128,8 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
auto *fake_reply_p = fake_reply ? &fake_reply_str : nullptr;
// Call the C++ prompt method
wrapper->llModel->prompt(prompt, prompt_template, prompt_func, response_func, recalc_func, wrapper->promptContext,
special, fake_reply_p);
wrapper->llModel->prompt(prompt, prompt_template, prompt_callback, response_func, recalculate_callback,
wrapper->promptContext, special, fake_reply_p);
// Update the C context by giving access to the wrappers raw pointers to std::vector data
// which involves no copies
@@ -171,38 +152,60 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
ctx->context_erase = wrapper->promptContext.contextErase;
}
float *llmodel_embedding(llmodel_model model, const char *text, size_t *embedding_size)
{
if (model == nullptr || text == nullptr || !strlen(text)) {
*embedding_size = 0;
float *llmodel_embed(
llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix, int dimensionality,
bool do_mean, bool atlas, const char **error
) {
auto *wrapper = static_cast<LLModelWrapper *>(model);
if (!texts || !*texts) {
if (error)
*error = strdup("'texts' is NULL or empty");
return nullptr;
}
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
std::vector<float> embeddingVector = wrapper->llModel->embedding(text);
float *embedding = (float *)malloc(embeddingVector.size() * sizeof(float));
if (embedding == nullptr) {
*embedding_size = 0;
std::vector<std::string> textsVec;
while (*texts) { textsVec.emplace_back(*texts++); }
size_t embd_size;
float *embedding;
try {
embd_size = wrapper->llModel->embeddingSize();
if (dimensionality > 0 && dimensionality < int(embd_size))
embd_size = dimensionality;
embd_size *= textsVec.size();
std::optional<std::string> prefixStr;
if (prefix) { prefixStr = prefix; }
embedding = new float[embd_size];
wrapper->llModel->embed(textsVec, embedding, prefixStr, dimensionality, do_mean, atlas);
} catch (std::exception const &e) {
if (error)
*error = strdup(e.what());
return nullptr;
}
std::copy(embeddingVector.begin(), embeddingVector.end(), embedding);
*embedding_size = embeddingVector.size();
*embedding_size = embd_size;
return embedding;
}
void llmodel_free_embedding(float *ptr)
{
free(ptr);
delete[] ptr;
}
void llmodel_setThreadCount(llmodel_model model, int32_t n_threads)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
auto *wrapper = static_cast<LLModelWrapper *>(model);
wrapper->llModel->setThreadCount(n_threads);
}
int32_t llmodel_threadCount(llmodel_model model)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->threadCount();
}
@@ -218,7 +221,7 @@ const char *llmodel_get_implementation_search_path()
struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
auto *wrapper = static_cast<LLModelWrapper *>(model);
std::vector<LLModel::GPUDevice> devices = wrapper->llModel->availableGPUDevices(memoryRequired);
// Set the num_devices
@@ -242,24 +245,24 @@ struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, si
bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->initializeGPUDevice(memoryRequired, std::string(device));
}
bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->initializeGPUDevice(device->index);
}
bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->initializeGPUDevice(device);
}
bool llmodel_has_gpu_device(llmodel_model model)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->hasGPUDevice();
}