From b4adcba87748a57706a809db9d4cda8f632989f4 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Thu, 30 May 2024 16:38:45 -0400
Subject: [PATCH] llamamodel: trigger CUDA OOM early so we can fall back

Signed-off-by: Jared Van Bortel
---
 gpt4all-backend/llamamodel.cpp    | 37 +++++++++++++++++++++++++++++++
 gpt4all-backend/llamamodel_impl.h |  2 ++
 2 files changed, 39 insertions(+)

diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index a9f89b1a..e955f804 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -426,6 +426,23 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
         return false;
     }
 
+#ifdef GGML_USE_CUDA
+    if (d_ptr->model_params.n_gpu_layers > 0) {
+        try {
+            testModel(); // eagerly allocate memory
+        } catch (const std::runtime_error &e) {
+            std::cerr << "LLAMA ERROR: model test failed: " << e.what() << "\n";
+            llama_free(d_ptr->ctx);
+            d_ptr->ctx = nullptr;
+            llama_free_model(d_ptr->model);
+            d_ptr->model = nullptr;
+            d_ptr->device = -1;
+            d_ptr->deviceName.clear();
+            return false;
+        }
+    }
+#endif
+
     d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};
 
     if (usingGPUDevice()) {
@@ -449,6 +466,26 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     return true;
 }
 
+void LLamaModel::testModel() {
+    int n_ctx = llama_n_ctx(d_ptr->ctx);
+    int n_batch = LLMODEL_MAX_PROMPT_BATCH;
+    n_batch = std::min(n_batch, n_ctx);
+
+    // test with maximum batch size
+    PromptContext ctx;
+    ctx.n_batch = n_batch;
+    std::vector<int32_t> tokens(n_batch);
+
+    llama_set_skip_cpu(d_ptr->ctx, true);
+    if (!evalTokens(ctx, tokens))
+        throw std::runtime_error("llama_decode failed");
+    llama_set_skip_cpu(d_ptr->ctx, false);
+    llama_synchronize(d_ptr->ctx); // wait for GPU to finish
+
+    // clean up
+    llama_kv_cache_clear(d_ptr->ctx);
+}
+
 void LLamaModel::setThreadCount(int32_t n_threads) {
     d_ptr->n_threads = n_threads;
     llama_set_n_threads(d_ptr->ctx, n_threads, n_threads);
diff --git a/gpt4all-backend/llamamodel_impl.h b/gpt4all-backend/llamamodel_impl.h
index 2051fd3b..338d30ce 100644
--- a/gpt4all-backend/llamamodel_impl.h
+++ b/gpt4all-backend/llamamodel_impl.h
@@ -48,6 +48,8 @@ public:
                    size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
 
 private:
+    void testModel(); // used for CUDA to eagerly allocate memory
+
     std::unique_ptr<LLamaPrivate> d_ptr;
     bool m_supportsEmbedding = false;
     bool m_supportsCompletion = false;
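
Reviewer note (not part of the patch): the subject line says the point of failing early is so the caller can fall back. A rough sketch of that caller-side fallback follows, assuming only the loadModel() signature shown in the hunk headers above; the wrapper name loadWithCpuFallback is hypothetical and not taken from the gpt4all codebase.

    #include <iostream>
    #include <string>
    #include "llamamodel_impl.h"

    // Hypothetical helper: if loading with GPU offload fails (e.g. because
    // testModel() hit a CUDA out-of-memory error during the eager allocation
    // added by this patch), retry with zero GPU layers so inference stays on
    // the CPU.
    static bool loadWithCpuFallback(LLamaModel &model, const std::string &modelPath, int n_ctx, int ngl)
    {
        if (model.loadModel(modelPath, n_ctx, ngl))
            return true; // requested (GPU) configuration worked

        std::cerr << "GPU load failed, falling back to CPU\n";
        return model.loadModel(modelPath, n_ctx, /*ngl*/ 0); // CPU-only retry
    }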