diff --git a/gpt4all-backend/bert.cpp b/gpt4all-backend/bert.cpp
index 01b348d0..342827e2 100644
--- a/gpt4all-backend/bert.cpp
+++ b/gpt4all-backend/bert.cpp
@@ -713,10 +713,16 @@ bool Bert::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
     (void)n_ctx;
     (void)ngl;
-    d_ptr->ctx = bert_load_from_file(modelPath.c_str());
-    d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    d_ptr->modelLoaded = d_ptr->ctx != nullptr;
+    d_ptr->modelLoaded = false;
+
+    auto * ctx = bert_load_from_file(modelPath.c_str());
     fflush(stdout);
+    if (!ctx)
+        return false;
+
+    d_ptr->ctx = ctx;
+    d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    d_ptr->modelLoaded = true;
     return true;
 }
 
diff --git a/gpt4all-backend/gptj.cpp b/gpt4all-backend/gptj.cpp
index 40db378a..51a032f8 100644
--- a/gpt4all-backend/gptj.cpp
+++ b/gpt4all-backend/gptj.cpp
@@ -685,18 +685,21 @@ size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
 bool GPTJ::loadModel(const std::string &modelPath, int n_ctx, int ngl) {
     (void)n_ctx;
     (void)ngl;
+    d_ptr->modelLoaded = false;
+
     std::mt19937 rng(time(NULL));
     d_ptr->rng = rng;
 
     // load the model
-    if (!gptj_model_load(modelPath, *d_ptr->model, d_ptr->vocab)) {
+    bool ok = gptj_model_load(modelPath, *d_ptr->model, d_ptr->vocab);
+    fflush(stdout);
+    if (!ok) {
         std::cerr << "GPT-J ERROR: failed to load model from " << modelPath;
         return false;
     }
 
     d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
     d_ptr->modelLoaded = true;
-    fflush(stdout);
     return true;
 }
 
diff --git a/gpt4all-backend/llama.cpp-mainline b/gpt4all-backend/llama.cpp-mainline
index cd1b5a10..315102f8 160000
--- a/gpt4all-backend/llama.cpp-mainline
+++ b/gpt4all-backend/llama.cpp-mainline
@@ -1 +1 @@
-Subproject commit cd1b5a104b9d3e211a50b9f6c261aced3bf09834
+Subproject commit 315102f89109f1b67c8f89f12d98ab646685e333
diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index df55756c..065ddf41 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -150,6 +150,8 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
 
 bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
+    d_ptr->modelLoaded = false;
+
     // clean up after previous loadModel()
     if (d_ptr->model) {
         llama_free_model(d_ptr->model);
@@ -195,6 +197,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 
     d_ptr->model = llama_load_model_from_file_gpt4all(modelPath.c_str(), &d_ptr->model_params);
     if (!d_ptr->model) {
+        fflush(stdout);
        d_ptr->device = -1;
         std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
         return false;
@@ -225,6 +228,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 
     d_ptr->ctx = llama_new_context_with_model(d_ptr->model, d_ptr->ctx_params);
     if (!d_ptr->ctx) {
+        fflush(stdout);
         std::cerr << "LLAMA ERROR: failed to init context for model " << modelPath << std::endl;
         llama_free_model(d_ptr->model);
         d_ptr->model = nullptr;
@@ -240,8 +244,8 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     }
 #endif
 
+    fflush(stdout);
     d_ptr->modelLoaded = true;
-    fflush(stderr);
     return true;
 }
 
diff --git a/gpt4all-bindings/python/gpt4all/__init__.py b/gpt4all-bindings/python/gpt4all/__init__.py
index 391fab02..01df38fc 100644
--- a/gpt4all-bindings/python/gpt4all/__init__.py
+++ b/gpt4all-bindings/python/gpt4all/__init__.py
@@ -1,2 +1 @@
 from .gpt4all import Embed4All as Embed4All, GPT4All as GPT4All
-from .pyllmodel import LLModel as LLModel
diff --git a/gpt4all-bindings/python/gpt4all/pyllmodel.py b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
similarity index 75%
rename from gpt4all-bindings/python/gpt4all/pyllmodel.py
rename to gpt4all-bindings/python/gpt4all/_pyllmodel.py
index f313e305..100ec4cb 100644
--- a/gpt4all-bindings/python/gpt4all/pyllmodel.py
+++ b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
@@ -142,15 +142,6 @@ def empty_response_callback(token_id: int, response: str) -> bool:
     return True
 
 
-def _create_model(model_path: bytes) -> ctypes.c_void_p:
-    err = ctypes.c_char_p()
-    model = llmodel.llmodel_model_create2(model_path, b"auto", ctypes.byref(err))
-    if model is None:
-        s = err.value
-        raise ValueError(f"Unable to instantiate model: {'null' if s is None else s.decode()}")
-    return model
-
-
 # Symbol to terminate from generator
 class Sentinel(Enum):
     TERMINATING_SYMBOL = 0
@@ -161,116 +152,77 @@ class LLModel:
     Base class and universal wrapper for GPT4All language models
     built around llmodel C-API.
 
-    Attributes
+    Parameters
     ----------
-    model: llmodel_model
-        Ctype pointer to underlying model
-    model_name: str
-        Model name
+    model_path : str
+        Path to the model.
+    n_ctx : int
+        Maximum size of context window
+    ngl : int
+        Number of GPU layers to use (Vulkan)
     """
 
-    def __init__(self):
-        self.model = None
-        self.model_name = None
-        self.context = None
-        self.llmodel_lib = llmodel
-
+    def __init__(self, model_path: str, n_ctx: int, ngl: int):
+        self.model_path = model_path.encode()
+        self.n_ctx = n_ctx
+        self.ngl = ngl
+        self.context: LLModelPromptContext | None = None
         self.buffer = bytearray()
         self.buff_expecting_cont_bytes: int = 0
 
+        # Construct a model implementation
+        err = ctypes.c_char_p()
+        model = llmodel.llmodel_model_create2(self.model_path, b"auto", ctypes.byref(err))
+        if model is None:
+            s = err.value
+            raise ValueError(f"Unable to instantiate model: {'null' if s is None else s.decode()}")
+        self.model = model
+
     def __del__(self):
-        if self.model is not None:
-            self.llmodel_lib.llmodel_model_destroy(self.model)
-
-    def memory_needed(self, model_path: str, n_ctx: int, ngl: int) -> int:
-        self.model = None
-        return self._memory_needed(model_path, n_ctx, ngl)
-
-    def _memory_needed(self, model_path: str, n_ctx: int, ngl: int) -> int:
-        if self.model is None:
-            self.model = _create_model(model_path.encode())
-        return llmodel.llmodel_required_mem(self.model, model_path.encode(), n_ctx, ngl)
-
-    def list_gpu(self, model_path: str, n_ctx: int, ngl: int) -> list[LLModelGPUDevice]:
-        """
-        Lists available GPU devices that satisfy the model's memory requirements.
-
-        Parameters
-        ----------
-        model_path : str
-            Path to the model.
-        n_ctx : int
-            Maximum size of context window
-        ngl : int
-            Number of GPU layers to use (Vulkan)
-
-        Returns
-        -------
-        list
-            A list of LLModelGPUDevice structures representing available GPU devices.
- """ - mem_required = self._memory_needed(model_path, n_ctx, ngl) - return self._list_gpu(mem_required) + if hasattr(self, 'model'): + llmodel.llmodel_model_destroy(self.model) def _list_gpu(self, mem_required: int) -> list[LLModelGPUDevice]: num_devices = ctypes.c_int32(0) - devices_ptr = self.llmodel_lib.llmodel_available_gpu_devices(self.model, mem_required, ctypes.byref(num_devices)) + devices_ptr = llmodel.llmodel_available_gpu_devices(self.model, mem_required, ctypes.byref(num_devices)) if not devices_ptr: raise ValueError("Unable to retrieve available GPU devices") return devices_ptr[:num_devices.value] - def init_gpu(self, model_path: str, device: str, n_ctx: int, ngl: int): - mem_required = self._memory_needed(model_path, n_ctx, ngl) + def init_gpu(self, device: str): + mem_required = llmodel.llmodel_required_mem(self.model, self.model_path, self.n_ctx, self.ngl) - success = self.llmodel_lib.llmodel_gpu_init_gpu_device_by_string(self.model, mem_required, device.encode()) - if not success: - # Retrieve all GPUs without considering memory requirements. - num_devices = ctypes.c_int32(0) - all_devices_ptr = self.llmodel_lib.llmodel_available_gpu_devices(self.model, 0, ctypes.byref(num_devices)) - if not all_devices_ptr: - raise ValueError("Unable to retrieve list of all GPU devices") - all_gpus = [d.name.decode() for d in all_devices_ptr[:num_devices.value]] + if llmodel.llmodel_gpu_init_gpu_device_by_string(self.model, mem_required, device.encode()): + return - # Retrieve GPUs that meet the memory requirements using list_gpu - available_gpus = [device.name.decode() for device in self._list_gpu(mem_required)] + # Retrieve all GPUs without considering memory requirements. + num_devices = ctypes.c_int32(0) + all_devices_ptr = llmodel.llmodel_available_gpu_devices(self.model, 0, ctypes.byref(num_devices)) + if not all_devices_ptr: + raise ValueError("Unable to retrieve list of all GPU devices") + all_gpus = [d.name.decode() for d in all_devices_ptr[:num_devices.value]] - # Identify GPUs that are unavailable due to insufficient memory or features - unavailable_gpus = set(all_gpus).difference(available_gpus) + # Retrieve GPUs that meet the memory requirements using list_gpu + available_gpus = [device.name.decode() for device in self._list_gpu(mem_required)] - # Formulate the error message - error_msg = "Unable to initialize model on GPU: '{}'.".format(device) - error_msg += "\nAvailable GPUs: {}.".format(available_gpus) - error_msg += "\nUnavailable GPUs due to insufficient memory or features: {}.".format(unavailable_gpus) - raise ValueError(error_msg) + # Identify GPUs that are unavailable due to insufficient memory or features + unavailable_gpus = set(all_gpus).difference(available_gpus) - def load_model(self, model_path: str, n_ctx: int, ngl: int) -> bool: + # Formulate the error message + error_msg = "Unable to initialize model on GPU: '{}'.".format(device) + error_msg += "\nAvailable GPUs: {}.".format(available_gpus) + error_msg += "\nUnavailable GPUs due to insufficient memory or features: {}.".format(unavailable_gpus) + raise ValueError(error_msg) + + def load_model(self) -> bool: """ Load model from a file. 
-        Parameters
-        ----------
-        model_path : str
-            Model filepath
-        n_ctx : int
-            Maximum size of context window
-        ngl : int
-            Number of GPU layers to use (Vulkan)
-
         Returns
         -------
         True if model loaded successfully, False otherwise
         """
-        self.model = _create_model(model_path.encode())
-
-        llmodel.llmodel_loadModel(self.model, model_path.encode(), n_ctx, ngl)
-
-        filename = os.path.basename(model_path)
-        self.model_name = os.path.splitext(filename)[0]
-
-        if llmodel.llmodel_isModelLoaded(self.model):
-            return True
-        else:
-            return False
+        return llmodel.llmodel_loadModel(self.model, self.model_path, self.n_ctx, self.ngl)
 
     def set_thread_count(self, n_threads):
         if not llmodel.llmodel_isModelLoaded(self.model):
@@ -295,7 +247,7 @@
         reset_context: bool = False,
     ):
         if self.context is None:
-            self.context = LLModelPromptContext(
+            context = LLModelPromptContext(
                 logits_size=0,
                 tokens_size=0,
                 n_past=0,
@@ -309,8 +261,11 @@
                 repeat_last_n=repeat_last_n,
                 context_erase=context_erase,
             )
-        elif reset_context:
-            self.context.n_past = 0
+            self.context = context
+        else:
+            context = self.context
+            if reset_context:
+                self.context.n_past = 0
 
         self.context.n_predict = n_predict
         self.context.top_k = top_k
diff --git a/gpt4all-bindings/python/gpt4all/gpt4all.py b/gpt4all-bindings/python/gpt4all/gpt4all.py
index fcd0a91d..02fa1c80 100644
--- a/gpt4all-bindings/python/gpt4all/gpt4all.py
+++ b/gpt4all-bindings/python/gpt4all/gpt4all.py
@@ -15,7 +15,7 @@ from requests.exceptions import ChunkedEncodingError
 from tqdm import tqdm
 from urllib3.exceptions import IncompleteRead, ProtocolError
 
-from . import pyllmodel
+from . import _pyllmodel
 
 # TODO: move to config
 DEFAULT_MODEL_DIRECTORY = os.path.join(str(Path.home()), ".cache", "gpt4all").replace("\\", "\\\\")
@@ -97,12 +97,12 @@ class GPT4All:
             verbose: If True, print debug messages.
         """
         self.model_type = model_type
-        self.model = pyllmodel.LLModel()
         # Retrieve model and download if allowed
         self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
+        self.model = _pyllmodel.LLModel(self.config["path"], n_ctx, ngl)
         if device is not None and device != "cpu":
-            self.model.init_gpu(model_path=self.config["path"], device=device, n_ctx=n_ctx, ngl=ngl)
-        self.model.load_model(self.config["path"], n_ctx, ngl)
+            self.model.init_gpu(device)
+        self.model.load_model()
         # Set n_threads
         if n_threads is not None:
             self.model.set_thread_count(n_threads)
@@ -292,7 +292,7 @@
         n_batch: int = 8,
         n_predict: Optional[int] = None,
         streaming: bool = False,
-        callback: pyllmodel.ResponseCallbackType = pyllmodel.empty_response_callback,
+        callback: _pyllmodel.ResponseCallbackType = _pyllmodel.empty_response_callback,
     ) -> Union[str, Iterable[str]]:
         """
         Generate outputs from any GPT4All model.
@@ -350,9 +350,9 @@ class GPT4All:
             output_collector = self.current_chat_session
 
         def _callback_wrapper(
-            callback: pyllmodel.ResponseCallbackType,
+            callback: _pyllmodel.ResponseCallbackType,
             output_collector: List[MessageType],
-        ) -> pyllmodel.ResponseCallbackType:
+        ) -> _pyllmodel.ResponseCallbackType:
             def _callback(token_id: int, response: str) -> bool:
                 nonlocal callback, output_collector
 
diff --git a/gpt4all-bindings/python/setup.py b/gpt4all-bindings/python/setup.py
index 7ff2a1c6..5b5a24f7 100644
--- a/gpt4all-bindings/python/setup.py
+++ b/gpt4all-bindings/python/setup.py
@@ -61,7 +61,7 @@ copy_prebuilt_C_lib(SRC_CLIB_DIRECTORY,
 
 setup(
     name=package_name,
-    version="2.1.0",
+    version="2.2.0",
     description="Python bindings for GPT4All",
     author="Nomic and the Open Source Community",
     author_email="support@nomic.ai",
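
Reviewer note (not part of the patch): a minimal usage sketch of the Python bindings after this change, assuming the model filename and paths are placeholders. The high-level `GPT4All` wrapper now constructs `_pyllmodel.LLModel` itself from the resolved config path plus `n_ctx`/`ngl`, so existing callers are unaffected; code that used `LLModel` directly must pass those values to the constructor and call the now argument-free `load_model()`:

```python
from gpt4all import GPT4All
from gpt4all._pyllmodel import LLModel  # no longer re-exported from the package root

# High-level API: unchanged by this refactor ("example-model.Q4_0.gguf" is a placeholder name).
model = GPT4All("example-model.Q4_0.gguf", device="cpu")
print(model.generate("Name three colors."))

# Low-level API: the constructor now owns the llmodel handle and the load parameters,
# and load_model() takes no arguments and returns the C API's bool directly.
raw = LLModel("/path/to/example-model.Q4_0.gguf", n_ctx=2048, ngl=100)
if not raw.load_model():
    raise RuntimeError("failed to load model")
raw.set_thread_count(4)
```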