expose n_gpu_layers parameter of llama.cpp (#1890)

Also dynamically limit the GPU layers and context length fields to the maximum supported by the model.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel
Date: 2024-01-31 14:17:44 -05:00
Committed by: GitHub
Commit: 061d1969f8
Parent: f549d5a70a

31 changed files with 381 additions and 157 deletions

@@ -183,7 +183,7 @@ public class LLModel : ILLModel
     /// <returns>true if the model was loaded successfully, false otherwise.</returns>
     public bool Load(string modelPath)
     {
-        return NativeMethods.llmodel_loadModel(_handle, modelPath, 2048);
+        return NativeMethods.llmodel_loadModel(_handle, modelPath, 2048, 100);
     }

     protected void Destroy()

@@ -71,7 +71,8 @@ internal static unsafe partial class NativeMethods
     public static extern bool llmodel_loadModel(
         [NativeTypeName("llmodel_model")] IntPtr model,
         [NativeTypeName("const char *")][MarshalAs(UnmanagedType.LPUTF8Str)] string model_path,
-        [NativeTypeName("int32_t")] int n_ctx);
+        [NativeTypeName("int32_t")] int n_ctx,
+        [NativeTypeName("int32_t")] int ngl);

     [DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]

@@ -43,7 +43,7 @@ public class Gpt4AllModelFactory : IGpt4AllModelFactory
         }
         _logger.LogDebug("Model created handle=0x{ModelHandle:X8}", handle);
         _logger.LogInformation("Model loading started");
-        var loadedSuccessfully = NativeMethods.llmodel_loadModel(handle, modelPath, 2048);
+        var loadedSuccessfully = NativeMethods.llmodel_loadModel(handle, modelPath, 2048, 100);
         _logger.LogInformation("Model loading completed success={ModelLoadSuccess}", loadedSuccessfully);
         if (!loadedSuccessfully)
         {

@@ -23,7 +23,7 @@ void* load_model(const char *fname, int n_threads) {
        fprintf(stderr, "%s: error '%s'\n", __func__, new_error);
        return nullptr;
    }
-   if (!llmodel_loadModel(model, fname, 2048)) {
+   if (!llmodel_loadModel(model, fname, 2048, 100)) {
        llmodel_model_destroy(model);
        return nullptr;
    }

@@ -195,7 +195,7 @@ public class LLModel implements AutoCloseable {
        if(model == null) {
            throw new IllegalStateException("Could not load, gpt4all backend returned error: " + error.getValue().getString(0));
        }
-       library.llmodel_loadModel(model, modelPathAbs, 2048);
+       library.llmodel_loadModel(model, modelPathAbs, 2048, 100);

        if(!library.llmodel_isModelLoaded(model)){
            throw new IllegalStateException("The model " + modelName + " could not be loaded");

@@ -61,7 +61,7 @@ public interface LLModelLibrary {
    Pointer llmodel_model_create2(String model_path, String build_variant, PointerByReference error);
    void llmodel_model_destroy(Pointer model);
-   boolean llmodel_loadModel(Pointer model, String model_path, int n_ctx);
+   boolean llmodel_loadModel(Pointer model, String model_path, int n_ctx, int ngl);
    boolean llmodel_isModelLoaded(Pointer model);
    @u_int64_t long llmodel_get_state_size(Pointer model);
    @u_int64_t long llmodel_save_state_data(Pointer model, Pointer dest);

@@ -70,6 +70,7 @@ class GPT4All:
        n_threads: Optional[int] = None,
        device: Optional[str] = "cpu",
        n_ctx: int = 2048,
+       ngl: int = 100,
        verbose: bool = False,
    ):
        """
@@ -92,6 +93,7 @@ class GPT4All:
                Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
            n_ctx: Maximum size of context window
+           ngl: Number of GPU layers to use (Vulkan)
            verbose: If True, print debug messages.
        """
        self.model_type = model_type
@@ -99,8 +101,8 @@ class GPT4All:
        # Retrieve model and download if allowed
        self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
        if device is not None and device != "cpu":
-           self.model.init_gpu(model_path=self.config["path"], device=device, n_ctx=n_ctx)
-       self.model.load_model(self.config["path"], n_ctx)
+           self.model.init_gpu(model_path=self.config["path"], device=device, n_ctx=n_ctx, ngl=ngl)
+       self.model.load_model(self.config["path"], n_ctx, ngl)
        # Set n_threads
        if n_threads is not None:
            self.model.set_thread_count(n_threads)
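For reference, a minimal usage sketch of the new parameter as exposed by the Python binding above. The model file name is illustrative, and the "gpu" device string and generate() come from the existing GPT4All API rather than from this diff:

from gpt4all import GPT4All

# Request a Vulkan GPU device and offload up to 80 layers to it.
# n_ctx and ngl mirror the keyword arguments added in this hunk; ngl
# defaults to 100, which is high enough to offload every layer of most models.
model = GPT4All(
    "mistral-7b-instruct-v0.1.Q4_0.gguf",  # illustrative model file
    device="gpu",
    n_ctx=2048,
    ngl=80,
)
print(model.generate("Why is the sky blue?", max_tokens=64))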

@@ -182,16 +182,16 @@ class LLModel:
        if self.model is not None:
            self.llmodel_lib.llmodel_model_destroy(self.model)

-   def memory_needed(self, model_path: str, n_ctx: int) -> int:
+   def memory_needed(self, model_path: str, n_ctx: int, ngl: int) -> int:
        self.model = None
-       return self._memory_needed(model_path, n_ctx)
+       return self._memory_needed(model_path, n_ctx, ngl)

-   def _memory_needed(self, model_path: str, n_ctx: int) -> int:
+   def _memory_needed(self, model_path: str, n_ctx: int, ngl: int) -> int:
        if self.model is None:
            self.model = _create_model(model_path.encode())
-       return llmodel.llmodel_required_mem(self.model, model_path.encode(), n_ctx)
+       return llmodel.llmodel_required_mem(self.model, model_path.encode(), n_ctx, ngl)

-   def list_gpu(self, model_path: str, n_ctx: int) -> list[LLModelGPUDevice]:
+   def list_gpu(self, model_path: str, n_ctx: int, ngl: int) -> list[LLModelGPUDevice]:
        """
        Lists available GPU devices that satisfy the model's memory requirements.
@@ -201,13 +201,15 @@
            Path to the model.
        n_ctx : int
            Maximum size of context window
+       ngl : int
+           Number of GPU layers to use (Vulkan)

        Returns
        -------
        list
            A list of LLModelGPUDevice structures representing available GPU devices.
        """
-       mem_required = self._memory_needed(model_path, n_ctx)
+       mem_required = self._memory_needed(model_path, n_ctx, ngl)
        return self._list_gpu(mem_required)

    def _list_gpu(self, mem_required: int) -> list[LLModelGPUDevice]:
@@ -217,8 +219,8 @@
            raise ValueError("Unable to retrieve available GPU devices")
        return devices_ptr[:num_devices.value]

-   def init_gpu(self, model_path: str, device: str, n_ctx: int):
-       mem_required = self._memory_needed(model_path, n_ctx)
+   def init_gpu(self, model_path: str, device: str, n_ctx: int, ngl: int):
+       mem_required = self._memory_needed(model_path, n_ctx, ngl)
        success = self.llmodel_lib.llmodel_gpu_init_gpu_device_by_string(self.model, mem_required, device.encode())
        if not success:
@@ -241,7 +243,7 @@
            error_msg += "\nUnavailable GPUs due to insufficient memory or features: {}.".format(unavailable_gpus)
            raise ValueError(error_msg)

-   def load_model(self, model_path: str, n_ctx: int) -> bool:
+   def load_model(self, model_path: str, n_ctx: int, ngl: int) -> bool:
        """
        Load model from a file.
@@ -251,6 +253,8 @@
            Model filepath
        n_ctx : int
            Maximum size of context window
+       ngl : int
+           Number of GPU layers to use (Vulkan)

        Returns
        -------
@@ -258,7 +262,7 @@
        """
        self.model = _create_model(model_path.encode())
-       llmodel.llmodel_loadModel(self.model, model_path.encode(), n_ctx)
+       llmodel.llmodel_loadModel(self.model, model_path.encode(), n_ctx, ngl)

        filename = os.path.basename(model_path)
        self.model_name = os.path.splitext(filename)[0]
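Taken together, the updated signatures thread ngl from memory estimation through device selection to the actual load. A hedged sketch of driving the LLModel class directly; the import path and the no-argument constructor are assumptions, since neither appears in this diff:

from gpt4all.pyllmodel import LLModel   # import path is an assumption

llm = LLModel()                          # constructor assumed to take no arguments
path = "/path/to/model.gguf"             # illustrative path

# Estimate memory with 80 layers offloaded, then list GPUs that could hold it.
mem_bytes = llm.memory_needed(path, n_ctx=2048, ngl=80)
devices = llm.list_gpu(path, n_ctx=2048, ngl=80)

# Bind to a device and load with the same context size and layer count.
llm.init_gpu(model_path=path, device="gpu", n_ctx=2048, ngl=80)
llm.load_model(path, 2048, 80)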

@@ -28,7 +28,7 @@ Napi::Function NodeModelWrapper::GetClass(Napi::Env env) {
  Napi::Value NodeModelWrapper::GetRequiredMemory(const Napi::CallbackInfo& info)
  {
    auto env = info.Env();
-   return Napi::Number::New(env, static_cast<uint32_t>( llmodel_required_mem(GetInference(), full_model_path.c_str(), 2048) ));
+   return Napi::Number::New(env, static_cast<uint32_t>( llmodel_required_mem(GetInference(), full_model_path.c_str(), 2048, 100) ));
  }

  Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo& info)
@@ -161,7 +161,7 @@ Napi::Value NodeModelWrapper::GetRequiredMemory(const Napi::CallbackInfo& info)
      }
    }

-   auto success = llmodel_loadModel(GetInference(), full_weight_path.c_str(), 2048);
+   auto success = llmodel_loadModel(GetInference(), full_weight_path.c_str(), 2048, 100);
    if(!success) {
      Napi::Error::New(env, "Failed to load model at given path").ThrowAsJavaScriptException();
      return;