llamamodel: trigger CUDA OOM early so we can fall back
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
parent 19c95060ec
commit b4adcba877
@@ -426,6 +426,23 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
         return false;
     }
 
+#ifdef GGML_USE_CUDA
+    if (d_ptr->model_params.n_gpu_layers > 0) {
+        try {
+            testModel(); // eagerly allocate memory
+        } catch (const std::runtime_error &e) {
+            std::cerr << "LLAMA ERROR: model test failed: " << e.what() << "\n";
+            llama_free(d_ptr->ctx);
+            d_ptr->ctx = nullptr;
+            llama_free_model(d_ptr->model);
+            d_ptr->model = nullptr;
+            d_ptr->device = -1;
+            d_ptr->deviceName.clear();
+            return false;
+        }
+    }
+#endif
+
     d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};
 
     if (usingGPUDevice()) {
@@ -449,6 +466,26 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     return true;
 }
 
+void LLamaModel::testModel() {
+    int n_ctx = llama_n_ctx(d_ptr->ctx);
+    int n_batch = LLMODEL_MAX_PROMPT_BATCH;
+    n_batch = std::min(n_batch, n_ctx);
+
+    // test with maximum batch size
+    PromptContext ctx;
+    ctx.n_batch = n_batch;
+    std::vector<int32_t> tokens(n_batch);
+
+    llama_set_skip_cpu(d_ptr->ctx, true);
+    if (!evalTokens(ctx, tokens))
+        throw std::runtime_error("llama_decode failed");
+    llama_set_skip_cpu(d_ptr->ctx, false);
+    llama_synchronize(d_ptr->ctx); // wait for GPU to finish
+
+    // clean up
+    llama_kv_cache_clear(d_ptr->ctx);
+}
+
 void LLamaModel::setThreadCount(int32_t n_threads) {
     d_ptr->n_threads = n_threads;
     llama_set_n_threads(d_ptr->ctx, n_threads, n_threads);
@@ -48,6 +48,8 @@ public:
         size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
 
 private:
+    void testModel(); // used for CUDA to eagerly allocate memory
+
     std::unique_ptr<LLamaPrivate> d_ptr;
     bool m_supportsEmbedding = false;
     bool m_supportsCompletion = false;
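For reference, a minimal sketch of the caller-side fallback this early failure enables. The wrapper name and retry policy below are illustrative assumptions, not part of this commit; the sketch relies only on what the diff shows, namely that loadModel() now returns false during load when the eager CUDA allocation in testModel() throws, after freeing the partially loaded context and model.

// Hypothetical helper, not from this commit: retry on CPU when the
// GPU load fails its eager allocation test.
static bool loadWithCpuFallback(LLamaModel &model, const std::string &path, int n_ctx, int ngl)
{
    if (ngl > 0 && model.loadModel(path, n_ctx, ngl))
        return true; // GPU load succeeded and the test batch allocated cleanly

    // The failed GPU attempt already freed its llama context/model and reset
    // the device, so retrying with ngl = 0 (CPU only) starts from a clean state.
    return model.loadModel(path, n_ctx, /*ngl*/ 0);
}

Without the eager test batch, a CUDA out-of-memory error would only surface on the first real prompt, which is too late to fall back this cleanly.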