Mirror of https://github.com/nomic-ai/gpt4all.git
llmodel: skip attempting Metal if model+kvcache > 53% of system ram
commit db34a2f670
parent 57fa8644d6
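The change has three parts, shown in the hunks below: the LLamaModel destructor now guards llama_free() against a null context, what appears to be llmodel.cpp pulls in a sysinfo helper for querying total system RAM, and LLModel::construct() probes the Metal implementation's memory requirement before committing to it, falling back to CPU when the model plus KV cache would need 53% or more of system RAM.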
@@ -178,7 +178,9 @@ int32_t LLamaModel::threadCount() const {
 
 LLamaModel::~LLamaModel()
 {
-    llama_free(d_ptr->ctx);
+    if(d_ptr->ctx) {
+        llama_free(d_ptr->ctx);
+    }
 }
 
 bool LLamaModel::isModelLoaded() const
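Why this guard is needed (inferred from the construct() change further below): the Metal probe constructs an LLamaModel only to call requiredMem() and then deletes it without ever loading a model, so d_ptr->ctx can still be null when the destructor runs.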
@@ -1,5 +1,6 @@
 #include "llmodel.h"
 #include "dlhandle.h"
+#include "sysinfo.h"
 
 #include <iostream>
 #include <string>
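A hedged sketch of what the new "sysinfo.h" helper might look like on Apple hardware (an assumption for illustration; the commit does not show its body): total physical RAM on macOS can be read via the hw.memsize sysctl.

    // Sketch only: getSystemTotalRAMInBytes() as it *might* be implemented
    // on macOS; the real gpt4all helper may differ.
    #include <cstddef>
    #include <cstdint>
    #include <sys/types.h>
    #include <sys/sysctl.h>

    static size_t getSystemTotalRAMInBytes()
    {
        uint64_t mem = 0;
        size_t len = sizeof(mem);
        // hw.memsize reports the machine's physical memory in bytes
        if (sysctlbyname("hw.memsize", &mem, &len, nullptr, 0) != 0)
            return 0; // signal failure; callers can treat 0 as "unknown"
        return (size_t)mem;
    }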
@@ -129,7 +130,20 @@ LLModel *LLModel::construct(const std::string &modelPath, std::string buildVaria
 
 #if defined(__APPLE__) && defined(__arm64__) // FIXME: See if metal works for intel macs
     if (buildVariant == "auto") {
+        size_t total_mem = getSystemTotalRAMInBytes();
         impl = implementation(f, "metal");
+        if(impl) {
+            LLModel* metalimpl = impl->construct();
+            size_t req_mem = metalimpl->requiredMem(modelPath);
+            float req_to_total = (float) req_mem / (float) total_mem;
+            // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
+            if (req_to_total >= 0.53) {
+                delete metalimpl;
+                impl = nullptr;
+            } else {
+                return metalimpl;
+            }
+        }
     }
 #endif
 
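To make the 0.53 threshold concrete: on a 16 GiB machine (17,179,869,184 bytes) the cutoff is about 9.1 GB, so a 13B q4_0 at ratio 0.52 (~8.9 GB) stays on Metal while a 13B q4_K_M at 0.55 (~9.4 GB) falls back, matching the comment in the hunk. A minimal standalone sketch of the same check (the helper name and constant name are hypothetical, not part of the commit):

    #include <cstddef>

    // Threshold from the commit: a ratio of 0.52 was observed to work on a
    // 16GB M2 Mac and 0.55 was not, so 0.53 splits the two data points.
    constexpr float kMetalRamFraction = 0.53f;

    // Returns true when model + KV cache fit comfortably enough to try Metal.
    bool fitsInMetalBudget(size_t requiredMemBytes, size_t totalRamBytes)
    {
        if (totalRamBytes == 0)
            return false; // unknown RAM size: be conservative, skip Metal
        return (float)requiredMemBytes / (float)totalRamBytes < kMetalRamFraction;
    }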