llamamodel: trigger CUDA OOM early so we can fall back
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
parent 19c95060ec
commit b4adcba877
@@ -426,6 +426,23 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
         return false;
     }
 
+#ifdef GGML_USE_CUDA
+    if (d_ptr->model_params.n_gpu_layers > 0) {
+        try {
+            testModel(); // eagerly allocate memory
+        } catch (const std::runtime_error &e) {
+            std::cerr << "LLAMA ERROR: model test failed: " << e.what() << "\n";
+            llama_free(d_ptr->ctx);
+            d_ptr->ctx = nullptr;
+            llama_free_model(d_ptr->model);
+            d_ptr->model = nullptr;
+            d_ptr->device = -1;
+            d_ptr->deviceName.clear();
+            return false;
+        }
+    }
+#endif
+
     d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};
 
     if (usingGPUDevice()) {
@@ -449,6 +466,26 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     return true;
 }
 
+void LLamaModel::testModel() {
+    int n_ctx = llama_n_ctx(d_ptr->ctx);
+    int n_batch = LLMODEL_MAX_PROMPT_BATCH;
+    n_batch = std::min(n_batch, n_ctx);
+
+    // test with maximum batch size
+    PromptContext ctx;
+    ctx.n_batch = n_batch;
+    std::vector<int32_t> tokens(n_batch);
+
+    llama_set_skip_cpu(d_ptr->ctx, true);
+    if (!evalTokens(ctx, tokens))
+        throw std::runtime_error("llama_decode failed");
+    llama_set_skip_cpu(d_ptr->ctx, false);
+    llama_synchronize(d_ptr->ctx); // wait for GPU to finish
+
+    // clean up
+    llama_kv_cache_clear(d_ptr->ctx);
+}
+
 void LLamaModel::setThreadCount(int32_t n_threads) {
     d_ptr->n_threads = n_threads;
     llama_set_n_threads(d_ptr->ctx, n_threads, n_threads);
@@ -48,6 +48,8 @@ public:
         size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
 
 private:
+    void testModel(); // used for CUDA to eagerly allocate memory
+
     std::unique_ptr<LLamaPrivate> d_ptr;
     bool m_supportsEmbedding = false;
     bool m_supportsCompletion = false;
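For reference, a minimal sketch of the caller-side fallback this early failure enables. The wrapper name and retry policy below are illustrative assumptions, not part of this commit; the sketch relies only on what the diff shows, namely that loadModel() now returns false during load when the eager CUDA allocation in testModel() throws, after freeing the partially loaded context and model.

// Hypothetical helper, not from this commit: retry on CPU when the
// GPU load fails its eager allocation test.
static bool loadWithCpuFallback(LLamaModel &model, const std::string &path, int n_ctx, int ngl)
{
    if (ngl > 0 && model.loadModel(path, n_ctx, ngl))
        return true; // GPU load succeeded and the test batch allocated cleanly

    // The failed GPU attempt already freed its llama context/model and reset
    // the device, so retrying with ngl = 0 (CPU only) starts from a clean state.
    return model.loadModel(path, n_ctx, /*ngl*/ 0);
}

Without the eager test batch, a CUDA out-of-memory error would only surface on the first real prompt, which is too late to fall back this cleanly.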