diff --git a/gpt4all-backend/bert.cpp b/gpt4all-backend/bert.cpp
index 01b348d0..342827e2 100644
--- a/gpt4all-backend/bert.cpp
+++ b/gpt4all-backend/bert.cpp
@@ -713,10 +713,16 @@ bool Bert::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
     (void)n_ctx;
     (void)ngl;
-    d_ptr->ctx = bert_load_from_file(modelPath.c_str());
-    d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    d_ptr->modelLoaded = d_ptr->ctx != nullptr;
+    d_ptr->modelLoaded = false;
+
+    auto * ctx = bert_load_from_file(modelPath.c_str());
     fflush(stdout);
+    if (!ctx)
+        return false;
+
+    d_ptr->ctx = ctx;
+    d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    d_ptr->modelLoaded = true;
     return true;
 }
 
diff --git a/gpt4all-backend/gptj.cpp b/gpt4all-backend/gptj.cpp
index 40db378a..51a032f8 100644
--- a/gpt4all-backend/gptj.cpp
+++ b/gpt4all-backend/gptj.cpp
@@ -685,18 +685,21 @@ size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
 bool GPTJ::loadModel(const std::string &modelPath, int n_ctx, int ngl) {
     (void)n_ctx;
     (void)ngl;
+    d_ptr->modelLoaded = false;
+
     std::mt19937 rng(time(NULL));
     d_ptr->rng = rng;
 
     // load the model
-    if (!gptj_model_load(modelPath, *d_ptr->model, d_ptr->vocab)) {
+    bool ok = gptj_model_load(modelPath, *d_ptr->model, d_ptr->vocab);
+    fflush(stdout);
+    if (!ok) {
         std::cerr << "GPT-J ERROR: failed to load model from " << modelPath;
         return false;
     }
 
     d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
     d_ptr->modelLoaded = true;
-    fflush(stdout);
     return true;
 }
 
diff --git a/gpt4all-backend/llama.cpp-mainline b/gpt4all-backend/llama.cpp-mainline
index cd1b5a10..315102f8 160000
--- a/gpt4all-backend/llama.cpp-mainline
+++ b/gpt4all-backend/llama.cpp-mainline
@@ -1 +1 @@
-Subproject commit cd1b5a104b9d3e211a50b9f6c261aced3bf09834
+Subproject commit 315102f89109f1b67c8f89f12d98ab646685e333
diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index df55756c..065ddf41 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -150,6 +150,8 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
 
 bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
+    d_ptr->modelLoaded = false;
+
     // clean up after previous loadModel()
     if (d_ptr->model) {
         llama_free_model(d_ptr->model);
@@ -195,6 +197,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 
     d_ptr->model = llama_load_model_from_file_gpt4all(modelPath.c_str(), &d_ptr->model_params);
     if (!d_ptr->model) {
+        fflush(stdout);
        d_ptr->device = -1;
         std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
         return false;
@@ -225,6 +228,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 
     d_ptr->ctx = llama_new_context_with_model(d_ptr->model, d_ptr->ctx_params);
     if (!d_ptr->ctx) {
+        fflush(stdout);
         std::cerr << "LLAMA ERROR: failed to init context for model " << modelPath << std::endl;
         llama_free_model(d_ptr->model);
         d_ptr->model = nullptr;
@@ -240,8 +244,8 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     }
 #endif
 
+    fflush(stdout);
     d_ptr->modelLoaded = true;
-    fflush(stderr);
     return true;
 }
 
diff --git a/gpt4all-bindings/python/gpt4all/__init__.py b/gpt4all-bindings/python/gpt4all/__init__.py
index 391fab02..01df38fc 100644
--- a/gpt4all-bindings/python/gpt4all/__init__.py
+++ b/gpt4all-bindings/python/gpt4all/__init__.py
@@ -1,2 +1 @@
 from .gpt4all import Embed4All as Embed4All, GPT4All as GPT4All
-from .pyllmodel import LLModel as LLModel
diff --git a/gpt4all-bindings/python/gpt4all/pyllmodel.py b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
similarity index 75%
rename from gpt4all-bindings/python/gpt4all/pyllmodel.py
rename to gpt4all-bindings/python/gpt4all/_pyllmodel.py
index f313e305..100ec4cb 100644
--- a/gpt4all-bindings/python/gpt4all/pyllmodel.py
+++ b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
@@ -142,15 +142,6 @@ def empty_response_callback(token_id: int, response: str) -> bool:
     return True
 
 
-def _create_model(model_path: bytes) -> ctypes.c_void_p:
-    err = ctypes.c_char_p()
-    model = llmodel.llmodel_model_create2(model_path, b"auto", ctypes.byref(err))
-    if model is None:
-        s = err.value
-        raise ValueError(f"Unable to instantiate model: {'null' if s is None else s.decode()}")
-    return model
-
-
 # Symbol to terminate from generator
 class Sentinel(Enum):
     TERMINATING_SYMBOL = 0
@@ -161,116 +152,77 @@ class LLModel:
     Base class and universal wrapper for GPT4All language models
     built around llmodel C-API.
 
-    Attributes
+    Parameters
     ----------
-    model: llmodel_model
-        Ctype pointer to underlying model
-    model_name: str
-        Model name
+    model_path : str
+        Path to the model.
+    n_ctx : int
+        Maximum size of context window
+    ngl : int
+        Number of GPU layers to use (Vulkan)
     """
 
-    def __init__(self):
-        self.model = None
-        self.model_name = None
-        self.context = None
-        self.llmodel_lib = llmodel
-
+    def __init__(self, model_path: str, n_ctx: int, ngl: int):
+        self.model_path = model_path.encode()
+        self.n_ctx = n_ctx
+        self.ngl = ngl
+        self.context: LLModelPromptContext | None = None
         self.buffer = bytearray()
         self.buff_expecting_cont_bytes: int = 0
 
+        # Construct a model implementation
+        err = ctypes.c_char_p()
+        model = llmodel.llmodel_model_create2(self.model_path, b"auto", ctypes.byref(err))
+        if model is None:
+            s = err.value
+            raise ValueError(f"Unable to instantiate model: {'null' if s is None else s.decode()}")
+        self.model = model
+
     def __del__(self):
-        if self.model is not None:
-            self.llmodel_lib.llmodel_model_destroy(self.model)
-
-    def memory_needed(self, model_path: str, n_ctx: int, ngl: int) -> int:
-        self.model = None
-        return self._memory_needed(model_path, n_ctx, ngl)
-
-    def _memory_needed(self, model_path: str, n_ctx: int, ngl: int) -> int:
-        if self.model is None:
-            self.model = _create_model(model_path.encode())
-        return llmodel.llmodel_required_mem(self.model, model_path.encode(), n_ctx, ngl)
-
-    def list_gpu(self, model_path: str, n_ctx: int, ngl: int) -> list[LLModelGPUDevice]:
-        """
-        Lists available GPU devices that satisfy the model's memory requirements.
-
-        Parameters
-        ----------
-        model_path : str
-            Path to the model.
-        n_ctx : int
-            Maximum size of context window
-        ngl : int
-            Number of GPU layers to use (Vulkan)
-
-        Returns
-        -------
-        list
-            A list of LLModelGPUDevice structures representing available GPU devices.
- """ - mem_required = self._memory_needed(model_path, n_ctx, ngl) - return self._list_gpu(mem_required) + if hasattr(self, 'model'): + llmodel.llmodel_model_destroy(self.model) def _list_gpu(self, mem_required: int) -> list[LLModelGPUDevice]: num_devices = ctypes.c_int32(0) - devices_ptr = self.llmodel_lib.llmodel_available_gpu_devices(self.model, mem_required, ctypes.byref(num_devices)) + devices_ptr = llmodel.llmodel_available_gpu_devices(self.model, mem_required, ctypes.byref(num_devices)) if not devices_ptr: raise ValueError("Unable to retrieve available GPU devices") return devices_ptr[:num_devices.value] - def init_gpu(self, model_path: str, device: str, n_ctx: int, ngl: int): - mem_required = self._memory_needed(model_path, n_ctx, ngl) + def init_gpu(self, device: str): + mem_required = llmodel.llmodel_required_mem(self.model, self.model_path, self.n_ctx, self.ngl) - success = self.llmodel_lib.llmodel_gpu_init_gpu_device_by_string(self.model, mem_required, device.encode()) - if not success: - # Retrieve all GPUs without considering memory requirements. - num_devices = ctypes.c_int32(0) - all_devices_ptr = self.llmodel_lib.llmodel_available_gpu_devices(self.model, 0, ctypes.byref(num_devices)) - if not all_devices_ptr: - raise ValueError("Unable to retrieve list of all GPU devices") - all_gpus = [d.name.decode() for d in all_devices_ptr[:num_devices.value]] + if llmodel.llmodel_gpu_init_gpu_device_by_string(self.model, mem_required, device.encode()): + return - # Retrieve GPUs that meet the memory requirements using list_gpu - available_gpus = [device.name.decode() for device in self._list_gpu(mem_required)] + # Retrieve all GPUs without considering memory requirements. + num_devices = ctypes.c_int32(0) + all_devices_ptr = llmodel.llmodel_available_gpu_devices(self.model, 0, ctypes.byref(num_devices)) + if not all_devices_ptr: + raise ValueError("Unable to retrieve list of all GPU devices") + all_gpus = [d.name.decode() for d in all_devices_ptr[:num_devices.value]] - # Identify GPUs that are unavailable due to insufficient memory or features - unavailable_gpus = set(all_gpus).difference(available_gpus) + # Retrieve GPUs that meet the memory requirements using list_gpu + available_gpus = [device.name.decode() for device in self._list_gpu(mem_required)] - # Formulate the error message - error_msg = "Unable to initialize model on GPU: '{}'.".format(device) - error_msg += "\nAvailable GPUs: {}.".format(available_gpus) - error_msg += "\nUnavailable GPUs due to insufficient memory or features: {}.".format(unavailable_gpus) - raise ValueError(error_msg) + # Identify GPUs that are unavailable due to insufficient memory or features + unavailable_gpus = set(all_gpus).difference(available_gpus) - def load_model(self, model_path: str, n_ctx: int, ngl: int) -> bool: + # Formulate the error message + error_msg = "Unable to initialize model on GPU: '{}'.".format(device) + error_msg += "\nAvailable GPUs: {}.".format(available_gpus) + error_msg += "\nUnavailable GPUs due to insufficient memory or features: {}.".format(unavailable_gpus) + raise ValueError(error_msg) + + def load_model(self) -> bool: """ Load model from a file. 
-        Parameters
-        ----------
-        model_path : str
-            Model filepath
-        n_ctx : int
-            Maximum size of context window
-        ngl : int
-            Number of GPU layers to use (Vulkan)
-
         Returns
         -------
         True if model loaded successfully, False otherwise
         """
-        self.model = _create_model(model_path.encode())
-
-        llmodel.llmodel_loadModel(self.model, model_path.encode(), n_ctx, ngl)
-
-        filename = os.path.basename(model_path)
-        self.model_name = os.path.splitext(filename)[0]
-
-        if llmodel.llmodel_isModelLoaded(self.model):
-            return True
-        else:
-            return False
+        return llmodel.llmodel_loadModel(self.model, self.model_path, self.n_ctx, self.ngl)
 
     def set_thread_count(self, n_threads):
         if not llmodel.llmodel_isModelLoaded(self.model):
@@ -295,7 +247,7 @@
         reset_context: bool = False,
     ):
         if self.context is None:
-            self.context = LLModelPromptContext(
+            context = LLModelPromptContext(
                 logits_size=0,
                 tokens_size=0,
                 n_past=0,
@@ -309,8 +261,11 @@
                 repeat_last_n=repeat_last_n,
                 context_erase=context_erase,
             )
-        elif reset_context:
-            self.context.n_past = 0
+            self.context = context
+        else:
+            context = self.context
+            if reset_context:
+                self.context.n_past = 0
 
         self.context.n_predict = n_predict
         self.context.top_k = top_k
diff --git a/gpt4all-bindings/python/gpt4all/gpt4all.py b/gpt4all-bindings/python/gpt4all/gpt4all.py
index fcd0a91d..02fa1c80 100644
--- a/gpt4all-bindings/python/gpt4all/gpt4all.py
+++ b/gpt4all-bindings/python/gpt4all/gpt4all.py
@@ -15,7 +15,7 @@ from requests.exceptions import ChunkedEncodingError
 from tqdm import tqdm
 from urllib3.exceptions import IncompleteRead, ProtocolError
 
-from . import pyllmodel
+from . import _pyllmodel
 
 # TODO: move to config
 DEFAULT_MODEL_DIRECTORY = os.path.join(str(Path.home()), ".cache", "gpt4all").replace("\\", "\\\\")
@@ -97,12 +97,12 @@ class GPT4All:
             verbose: If True, print debug messages.
         """
         self.model_type = model_type
-        self.model = pyllmodel.LLModel()
         # Retrieve model and download if allowed
         self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
+        self.model = _pyllmodel.LLModel(self.config["path"], n_ctx, ngl)
         if device is not None and device != "cpu":
-            self.model.init_gpu(model_path=self.config["path"], device=device, n_ctx=n_ctx, ngl=ngl)
-        self.model.load_model(self.config["path"], n_ctx, ngl)
+            self.model.init_gpu(device)
+        self.model.load_model()
         # Set n_threads
         if n_threads is not None:
             self.model.set_thread_count(n_threads)
@@ -292,7 +292,7 @@
         n_batch: int = 8,
         n_predict: Optional[int] = None,
         streaming: bool = False,
-        callback: pyllmodel.ResponseCallbackType = pyllmodel.empty_response_callback,
+        callback: _pyllmodel.ResponseCallbackType = _pyllmodel.empty_response_callback,
     ) -> Union[str, Iterable[str]]:
         """
         Generate outputs from any GPT4All model.
@@ -350,9 +350,9 @@ class GPT4All:
             output_collector = self.current_chat_session
 
         def _callback_wrapper(
-            callback: pyllmodel.ResponseCallbackType,
+            callback: _pyllmodel.ResponseCallbackType,
             output_collector: List[MessageType],
-        ) -> pyllmodel.ResponseCallbackType:
+        ) -> _pyllmodel.ResponseCallbackType:
             def _callback(token_id: int, response: str) -> bool:
                 nonlocal callback, output_collector
 
diff --git a/gpt4all-bindings/python/setup.py b/gpt4all-bindings/python/setup.py
index 7ff2a1c6..5b5a24f7 100644
--- a/gpt4all-bindings/python/setup.py
+++ b/gpt4all-bindings/python/setup.py
@@ -61,7 +61,7 @@ copy_prebuilt_C_lib(SRC_CLIB_DIRECTORY,
 
 setup(
     name=package_name,
-    version="2.1.0",
+    version="2.2.0",
     description="Python bindings for GPT4All",
     author="Nomic and the Open Source Community",
     author_email="support@nomic.ai",
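
Reviewer note (not part of the patch): a minimal usage sketch of the Python bindings after this change, assuming the model filename and paths are placeholders. The high-level `GPT4All` wrapper now constructs `_pyllmodel.LLModel` itself from the resolved config path plus `n_ctx`/`ngl`, so existing callers are unaffected; code that used `LLModel` directly must pass those values to the constructor and call the now argument-free `load_model()`:

```python
from gpt4all import GPT4All
from gpt4all._pyllmodel import LLModel  # no longer re-exported from the package root

# High-level API: unchanged by this refactor ("example-model.Q4_0.gguf" is a placeholder name).
model = GPT4All("example-model.Q4_0.gguf", device="cpu")
print(model.generate("Name three colors."))

# Low-level API: the constructor now owns the llmodel handle and the load parameters,
# and load_model() takes no arguments and returns the C API's bool directly.
raw = LLModel("/path/to/example-model.Q4_0.gguf", n_ctx=2048, ngl=100)
if not raw.load_model():
    raise RuntimeError("failed to load model")
raw.set_thread_count(4)
```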