support the llama.cpp CUDA backend (#2310)

* rebase onto llama.cpp commit ggerganov/llama.cpp@d46dbc76f
* support for CUDA backend (enabled by default)
* partial support for Occam's Vulkan backend (disabled by default)
* partial support for HIP/ROCm backend (disabled by default)
* sync llama.cpp.cmake with upstream llama.cpp CMakeLists.txt
* changes to GPT4All backend, bindings, and chat UI to handle choice of llama.cpp backend (Kompute or CUDA)
* ship CUDA runtime with installed version
* make device selection in the UI on macOS actually take effect
* model whitelist: remove dbrx, mamba, persimmon, plamo; add internlm and starcoder2

Signed-off-by: Jared Van Bortel <jared@nomic.ai>


@@ -48,6 +48,7 @@ struct llmodel_prompt_context {
 };
 
 struct llmodel_gpu_device {
+    const char * backend;
     int index;
     int type; // same as VkPhysicalDeviceType
     size_t heapSize;
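The new `backend` field tells the caller which llama.cpp backend exposes each enumerated device, so bindings can filter enumeration results by backend. A minimal sketch of that filtering follows; the `llmodel_available_gpu_devices` entry point and its signature are assumed for illustration, and only the struct field itself comes from this diff:

#include <stdio.h>
#include <string.h>

#include "llmodel_c.h"

/* List only CUDA devices. The enumeration call below is an assumption made
 * for this sketch; the backend field it inspects is what this commit adds. */
static void list_cuda_devices(void)
{
    int n = 0;
    struct llmodel_gpu_device *devs = llmodel_available_gpu_devices(0, &n);
    for (int i = 0; i < n; i++) {
        if (strcmp(devs[i].backend, "cuda") == 0)
            printf("CUDA device %d: heap %zu bytes\n",
                   devs[i].index, devs[i].heapSize);
    }
}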
@@ -86,7 +87,7 @@ typedef bool (*llmodel_recalculate_callback)(bool is_recalculating);
  * Embedding cancellation callback for use with llmodel_embed.
  * @param batch_sizes The number of tokens in each batch that will be embedded.
  * @param n_batch The number of batches that will be embedded.
- * @param backend The backend that will be used for embedding. One of "cpu", "kompute", or "metal".
+ * @param backend The backend that will be used for embedding. One of "cpu", "kompute", "cuda", or "metal".
  * @return True to cancel llmodel_embed, false to continue.
  */
 typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend);
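Since the callback now reports "cuda" as a possible backend, a binding could use it to avoid slow CPU-side embedding when no GPU backend is available. A sketch matching the typedef above; the cancellation policy and the 4096-token threshold are illustrative, not part of this change:

#include <stdbool.h>
#include <string.h>

/* Cancel large embedding jobs that would fall back to the CPU backend.
 * Signature matches the llmodel_emb_cancel_callback typedef above. */
static bool cancel_cpu_fallback(unsigned *batch_sizes, unsigned n_batch,
                                const char *backend)
{
    unsigned total_tokens = 0;
    for (unsigned i = 0; i < n_batch; i++)
        total_tokens += batch_sizes[i];

    /* Returning true cancels llmodel_embed; false lets it continue. */
    return strcmp(backend, "cpu") == 0 && total_tokens > 4096;
}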
@@ -103,11 +104,11 @@ DEPRECATED llmodel_model llmodel_model_create(const char *model_path);
  * Create a llmodel instance.
  * Recognises correct model type from file at model_path
  * @param model_path A string representing the path to the model file; will only be used to detect model type.
- * @param build_variant A string representing the implementation to use (auto, default, avxonly, ...),
+ * @param backend A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
  * @param error A pointer to a string; will only be set on error.
  * @return A pointer to the llmodel_model instance; NULL on error.
  */
-llmodel_model llmodel_model_create2(const char *model_path, const char *build_variant, const char **error);
+llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error);
 
 /**
  * Destroy a llmodel instance.
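Taken together, the renamed parameter lets a caller pin model creation to a specific llama.cpp backend rather than a build variant. A minimal usage sketch, assuming the destroy function whose doc comment begins just above and using a placeholder model path:

#include <stdio.h>
#include <stdlib.h>

#include "llmodel_c.h"

int main(void)
{
    const char *error = NULL;

    /* "auto" picks the best available implementation; "kompute" or "cuda"
     * force a particular GPU backend per the updated docs above. */
    llmodel_model model =
        llmodel_model_create2("models/example.gguf", "cuda", &error);
    if (model == NULL) {
        fprintf(stderr, "failed to create model: %s\n", error);
        return EXIT_FAILURE;
    }

    /* ... load the model and run prompts here ... */

    llmodel_model_destroy(model);
    return EXIT_SUCCESS;
}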