From b4adcba87748a57706a809db9d4cda8f632989f4 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Thu, 30 May 2024 16:38:45 -0400
Subject: [PATCH] llamamodel: trigger CUDA OOM early so we can fall back

Signed-off-by: Jared Van Bortel
---
 gpt4all-backend/llamamodel.cpp    | 37 +++++++++++++++++++++++++++++++
 gpt4all-backend/llamamodel_impl.h |  2 ++
 2 files changed, 39 insertions(+)

diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index a9f89b1a..e955f804 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -426,6 +426,23 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
         return false;
     }
 
+#ifdef GGML_USE_CUDA
+    if (d_ptr->model_params.n_gpu_layers > 0) {
+        try {
+            testModel(); // eagerly allocate memory
+        } catch (const std::runtime_error &e) {
+            std::cerr << "LLAMA ERROR: model test failed: " << e.what() << "\n";
+            llama_free(d_ptr->ctx);
+            d_ptr->ctx = nullptr;
+            llama_free_model(d_ptr->model);
+            d_ptr->model = nullptr;
+            d_ptr->device = -1;
+            d_ptr->deviceName.clear();
+            return false;
+        }
+    }
+#endif
+
     d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};
 
     if (usingGPUDevice()) {
@@ -449,6 +466,26 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     return true;
 }
 
+void LLamaModel::testModel() {
+    int n_ctx = llama_n_ctx(d_ptr->ctx);
+    int n_batch = LLMODEL_MAX_PROMPT_BATCH;
+    n_batch = std::min(n_batch, n_ctx);
+
+    // test with maximum batch size
+    PromptContext ctx;
+    ctx.n_batch = n_batch;
+    std::vector<int32_t> tokens(n_batch);
+
+    llama_set_skip_cpu(d_ptr->ctx, true);
+    if (!evalTokens(ctx, tokens))
+        throw std::runtime_error("llama_decode failed");
+    llama_set_skip_cpu(d_ptr->ctx, false);
+    llama_synchronize(d_ptr->ctx); // wait for GPU to finish
+
+    // clean up
+    llama_kv_cache_clear(d_ptr->ctx);
+}
+
 void LLamaModel::setThreadCount(int32_t n_threads) {
     d_ptr->n_threads = n_threads;
     llama_set_n_threads(d_ptr->ctx, n_threads, n_threads);
diff --git a/gpt4all-backend/llamamodel_impl.h b/gpt4all-backend/llamamodel_impl.h
index 2051fd3b..338d30ce 100644
--- a/gpt4all-backend/llamamodel_impl.h
+++ b/gpt4all-backend/llamamodel_impl.h
@@ -48,6 +48,8 @@ public:
                    size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
 
 private:
+    void testModel(); // used for CUDA to eagerly allocate memory
+
     std::unique_ptr<LLamaPrivate> d_ptr;
     bool m_supportsEmbedding = false;
     bool m_supportsCompletion = false;
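
Reviewer note (not part of the patch): the subject line says the point of failing early is so the caller can fall back. A rough sketch of that caller-side fallback follows, assuming only the loadModel() signature shown in the hunk headers above; the wrapper name loadWithCpuFallback is hypothetical and not taken from the gpt4all codebase.

    #include <iostream>
    #include <string>
    #include "llamamodel_impl.h"

    // Hypothetical helper: if loading with GPU offload fails (e.g. because
    // testModel() hit a CUDA out-of-memory error during the eager allocation
    // added by this patch), retry with zero GPU layers so inference stays on
    // the CPU.
    static bool loadWithCpuFallback(LLamaModel &model, const std::string &modelPath, int n_ctx, int ngl)
    {
        if (model.loadModel(modelPath, n_ctx, ngl))
            return true; // requested (GPU) configuration worked

        std::cerr << "GPU load failed, falling back to CPU\n";
        return model.loadModel(modelPath, n_ctx, /*ngl*/ 0); // CPU-only retry
    }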