mirror of https://github.com/nomic-ai/gpt4all.git
expose n_gpu_layers parameter of llama.cpp (#1890)
Also dynamically limit the GPU layers and context length fields to the maximum supported by the model.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
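For context, a minimal usage sketch of the newly exposed parameter through the Python bindings; the model file name is a placeholder, and device="gpu" assumes a Vulkan-capable GPU is available:

from gpt4all import GPT4All

# Placeholder model file; any locally available GGUF model would do.
model = GPT4All(
    "mistral-7b-instruct-v0.1.Q4_0.gguf",
    device="gpu",   # any value other than "cpu" triggers init_gpu()
    n_ctx=2048,     # context window size passed through to llama.cpp
    ngl=100,        # number of model layers to offload to the GPU (Vulkan)
)
print(model.generate("Why offload layers to the GPU?", max_tokens=64))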
@@ -70,6 +70,7 @@ class GPT4All:
         n_threads: Optional[int] = None,
         device: Optional[str] = "cpu",
         n_ctx: int = 2048,
+        ngl: int = 100,
         verbose: bool = False,
     ):
         """
@@ -92,6 +93,7 @@ class GPT4All:
 
             Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
         n_ctx: Maximum size of context window
+        ngl: Number of GPU layers to use (Vulkan)
         verbose: If True, print debug messages.
         """
         self.model_type = model_type
@@ -99,8 +101,8 @@ class GPT4All:
         # Retrieve model and download if allowed
         self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
         if device is not None and device != "cpu":
-            self.model.init_gpu(model_path=self.config["path"], device=device, n_ctx=n_ctx)
-        self.model.load_model(self.config["path"], n_ctx)
+            self.model.init_gpu(model_path=self.config["path"], device=device, n_ctx=n_ctx, ngl=ngl)
+        self.model.load_model(self.config["path"], n_ctx, ngl)
         # Set n_threads
         if n_threads is not None:
             self.model.set_thread_count(n_threads)
@@ -182,16 +182,16 @@ class LLModel:
         if self.model is not None:
             self.llmodel_lib.llmodel_model_destroy(self.model)
 
-    def memory_needed(self, model_path: str, n_ctx: int) -> int:
+    def memory_needed(self, model_path: str, n_ctx: int, ngl: int) -> int:
         self.model = None
-        return self._memory_needed(model_path, n_ctx)
+        return self._memory_needed(model_path, n_ctx, ngl)
 
-    def _memory_needed(self, model_path: str, n_ctx: int) -> int:
+    def _memory_needed(self, model_path: str, n_ctx: int, ngl: int) -> int:
         if self.model is None:
             self.model = _create_model(model_path.encode())
-        return llmodel.llmodel_required_mem(self.model, model_path.encode(), n_ctx)
+        return llmodel.llmodel_required_mem(self.model, model_path.encode(), n_ctx, ngl)
 
-    def list_gpu(self, model_path: str, n_ctx: int) -> list[LLModelGPUDevice]:
+    def list_gpu(self, model_path: str, n_ctx: int, ngl: int) -> list[LLModelGPUDevice]:
         """
         Lists available GPU devices that satisfy the model's memory requirements.
 
@@ -201,13 +201,15 @@ class LLModel:
             Path to the model.
         n_ctx : int
             Maximum size of context window
+        ngl : int
+            Number of GPU layers to use (Vulkan)
 
         Returns
         -------
         list
             A list of LLModelGPUDevice structures representing available GPU devices.
         """
-        mem_required = self._memory_needed(model_path, n_ctx)
+        mem_required = self._memory_needed(model_path, n_ctx, ngl)
         return self._list_gpu(mem_required)
 
     def _list_gpu(self, mem_required: int) -> list[LLModelGPUDevice]:
@@ -217,8 +219,8 @@ class LLModel:
             raise ValueError("Unable to retrieve available GPU devices")
         return devices_ptr[:num_devices.value]
 
-    def init_gpu(self, model_path: str, device: str, n_ctx: int):
-        mem_required = self._memory_needed(model_path, n_ctx)
+    def init_gpu(self, model_path: str, device: str, n_ctx: int, ngl: int):
+        mem_required = self._memory_needed(model_path, n_ctx, ngl)
 
         success = self.llmodel_lib.llmodel_gpu_init_gpu_device_by_string(self.model, mem_required, device.encode())
         if not success:
@@ -241,7 +243,7 @@ class LLModel:
             error_msg += "\nUnavailable GPUs due to insufficient memory or features: {}.".format(unavailable_gpus)
             raise ValueError(error_msg)
 
-    def load_model(self, model_path: str, n_ctx: int) -> bool:
+    def load_model(self, model_path: str, n_ctx: int, ngl: int) -> bool:
         """
         Load model from a file.
 
@@ -251,6 +253,8 @@ class LLModel:
             Model filepath
         n_ctx : int
             Maximum size of context window
+        ngl : int
+            Number of GPU layers to use (Vulkan)
 
         Returns
         -------
@@ -258,7 +262,7 @@ class LLModel:
         """
         self.model = _create_model(model_path.encode())
 
-        llmodel.llmodel_loadModel(self.model, model_path.encode(), n_ctx)
+        llmodel.llmodel_loadModel(self.model, model_path.encode(), n_ctx, ngl)
 
         filename = os.path.basename(model_path)
         self.model_name = os.path.splitext(filename)[0]
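For orientation, a hedged sketch of the lower-level call order these signatures imply; the gpt4all.pyllmodel module path, the no-argument LLModel() constructor, and the "gpu" device string are assumptions not shown in this diff, while the method signatures match the hunks above:

from gpt4all.pyllmodel import LLModel  # assumed module path

model_path = "/path/to/model.gguf"  # placeholder path
n_ctx, ngl = 2048, 100

llm = LLModel()                                    # assumed no-argument constructor
print(llm.memory_needed(model_path, n_ctx, ngl))   # bytes required at this n_ctx/ngl
devices = llm.list_gpu(model_path, n_ctx, ngl)     # GPU devices that satisfy that requirement
llm.init_gpu(model_path=model_path, device="gpu", n_ctx=n_ctx, ngl=ngl)
llm.load_model(model_path, n_ctx, ngl)             # ngl is forwarded to llmodel_loadModel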