From 290c62944273fa8b988e045e1065db2b4a711cf9 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Fri, 19 Jul 2024 14:52:58 -0400
Subject: [PATCH] backend: rebase llama.cpp submodule on latest upstream
 (#2694)

* Adds support for GPT-NeoX, Gemma 2, OpenELM, ChatGLM, and Jais architectures
  (all with Kompute support)
* Also enables Kompute support for StarCoder2, XVERSE, Command R, and OLMo
* Includes a number of Kompute resource management fixes

Signed-off-by: Jared Van Bortel
---
 gpt4all-backend/CMakeLists.txt     |  26 +-
 gpt4all-backend/llama.cpp-mainline |   2 +-
 gpt4all-backend/llama.cpp.cmake    | 411 ++++++++++++++++------------
 gpt4all-backend/llamamodel.cpp     |  38 ++-
 4 files changed, 266 insertions(+), 211 deletions(-)

diff --git a/gpt4all-backend/CMakeLists.txt b/gpt4all-backend/CMakeLists.txt
index 58c1084d..b47c4505 100644
--- a/gpt4all-backend/CMakeLists.txt
+++ b/gpt4all-backend/CMakeLists.txt
@@ -90,25 +90,25 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
     else()
         set(GPT4ALL_ALLOW_NON_AVX ON)
     endif()
-    set(LLAMA_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
-    set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX})
-    set(LLAMA_FMA  ${GPT4ALL_ALLOW_NON_AVX})
+    set(GGML_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
+    set(GGML_F16C ${GPT4ALL_ALLOW_NON_AVX})
+    set(GGML_FMA  ${GPT4ALL_ALLOW_NON_AVX})

-    set(LLAMA_METAL   OFF)
-    set(LLAMA_KOMPUTE OFF)
-    set(LLAMA_VULKAN  OFF)
-    set(LLAMA_CUDA    OFF)
-    set(LLAMA_ROCM    OFF)
+    set(GGML_METAL   OFF)
+    set(GGML_KOMPUTE OFF)
+    set(GGML_VULKAN  OFF)
+    set(GGML_CUDA    OFF)
+    set(GGML_ROCM    OFF)
     if (BUILD_VARIANT MATCHES metal)
-        set(LLAMA_METAL ON)
+        set(GGML_METAL ON)
     elseif (BUILD_VARIANT MATCHES kompute)
-        set(LLAMA_KOMPUTE ON)
+        set(GGML_KOMPUTE ON)
     elseif (BUILD_VARIANT MATCHES vulkan)
-        set(LLAMA_VULKAN ON)
+        set(GGML_VULKAN ON)
     elseif (BUILD_VARIANT MATCHES cuda)
-        set(LLAMA_CUDA ON)
+        set(GGML_CUDA ON)
     elseif (BUILD_VARIANT MATCHES rocm)
-        set(LLAMA_HIPBLAS ON)
+        set(GGML_HIPBLAS ON)
     endif()

     # Include GGML
diff --git a/gpt4all-backend/llama.cpp-mainline b/gpt4all-backend/llama.cpp-mainline
index dc517633..2bae44a0 160000
--- a/gpt4all-backend/llama.cpp-mainline
+++ b/gpt4all-backend/llama.cpp-mainline
@@ -1 +1 @@
-Subproject commit dc51763303bd2dae0a2aecf0f205f3eee3f59620
+Subproject commit 2bae44a07fddf10512005c9475b73c09d38364a2
diff --git a/gpt4all-backend/llama.cpp.cmake b/gpt4all-backend/llama.cpp.cmake
index f12ec259..eac2dcf7 100644
--- a/gpt4all-backend/llama.cpp.cmake
+++ b/gpt4all-backend/llama.cpp.cmake
@@ -7,7 +7,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 #
 # some of the options here are commented out so they can be set "dynamically" before calling include_ggml()

-set(LLAMA_LLAMAFILE_DEFAULT ON)
+set(GGML_LLAMAFILE_DEFAULT ON)

 # general
 option(LLAMA_STATIC "llama: static link libraries" OFF)
@@ -22,15 +22,15 @@ option(LLAMA_GPROF "llama: enable gprof"
 option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF)

 # instruction set specific
-#option(LLAMA_AVX         "llama: enable AVX"         ON)
-#option(LLAMA_AVX2        "llama: enable AVX2"        ON)
-#option(LLAMA_AVX512      "llama: enable AVX512"      OFF)
-#option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
-#option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
-#option(LLAMA_FMA         "llama: enable FMA"         ON)
+#option(GGML_AVX         "ggml: enable AVX"         ON)
+#option(GGML_AVX2        "ggml: enable AVX2"        ON)
+#option(GGML_AVX512      "ggml: enable AVX512"      OFF)
+#option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
+#option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
+#option(GGML_FMA         "ggml: enable FMA"         ON)
 # in MSVC F16C is implied with AVX2/AVX512
 #if (NOT MSVC)
-#    option(LLAMA_F16C "llama: enable F16C" ON)
+#    option(GGML_F16C "ggml: enable F16C" ON)
 #endif()

 if (WIN32)
@@ -38,40 +38,46 @@ if (WIN32)
 endif()

 # 3rd party libs
-option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
-option(LLAMA_BLAS       "llama: use BLAS"                    OFF)
-option(LLAMA_LLAMAFILE  "llama: use llamafile SGEMM"         ${LLAMA_LLAMAFILE_DEFAULT})
-set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-#option(LLAMA_CUDA            "llama: use CUDA"                              OFF)
-option(LLAMA_CUDA_FORCE_DMMV  "llama: use dmmv instead of mmvq CUDA kernels" OFF)
-option(LLAMA_CUDA_FORCE_MMQ   "llama: use mmq kernels instead of cuBLAS"     OFF)
-set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
-set(LLAMA_CUDA_MMV_Y   "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
-option(LLAMA_CUDA_F16         "llama: use 16 bit floats for some calculations" OFF)
-set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
-set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
-    "llama: max. batch size for using peer access")
-option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies"       OFF)
-#option(LLAMA_HIPBLAS          "llama: use hipBLAS"                          OFF)
-option(LLAMA_HIP_UMA           "llama: use HIP unified memory architecture"  OFF)
-#option(LLAMA_CLBLAST          "llama: use CLBlast"                          OFF)
-#option(LLAMA_VULKAN           "llama: use Vulkan"                           OFF)
-option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks"              OFF)
-option(LLAMA_VULKAN_DEBUG         "llama: enable Vulkan debug output"        OFF)
-option(LLAMA_VULKAN_VALIDATE      "llama: enable Vulkan validation"          OFF)
-option(LLAMA_VULKAN_RUN_TESTS     "llama: run Vulkan tests"                  OFF)
-#option(LLAMA_METAL              "llama: use Metal"                          ${LLAMA_METAL_DEFAULT})
-option(LLAMA_METAL_NDEBUG        "llama: disable Metal debugging"            OFF)
-option(LLAMA_METAL_SHADER_DEBUG  "llama: compile Metal with -fno-fast-math"  OFF)
-set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
-    "llama: metal minimum macOS version")
-set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
-#option(LLAMA_KOMPUTE "llama: use Kompute"                                   OFF)
-option(LLAMA_QKK_64  "llama: use super-block size of 64 for k-quants"        OFF)
-set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism")
+option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
+option(GGML_BLAS       "ggml: use BLAS"                    OFF)
+option(GGML_LLAMAFILE  "ggml: use llamafile SGEMM"         ${GGML_LLAMAFILE_DEFAULT})
+set(GGML_BLAS_VENDOR "Generic" CACHE STRING "ggml: BLAS library vendor")
+
+#option(GGML_CUDA             "ggml: use CUDA"                                 OFF)
+option(GGML_CUDA_FORCE_DMMV   "ggml: use dmmv instead of mmvq CUDA kernels"    OFF)
+option(GGML_CUDA_FORCE_MMQ    "ggml: use mmq kernels instead of cuBLAS"        OFF)
+option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
+set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
+set (GGML_CUDA_MMV_Y   "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
+option(GGML_CUDA_F16          "ggml: use 16 bit floats for some calculations"  OFF)
+set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
+    "ggml: iters./thread per block for Q2_K/Q6_K")
+set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
+    "ggml: max. batch size for using peer access")
+option(GGML_CUDA_NO_PEER_COPY  "ggml: do not use peer to peer copies"          OFF)
+option(GGML_CUDA_NO_VMM        "ggml: do not try to use CUDA VMM"              OFF)
+option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention"   OFF)
+option(GGML_CUDA_USE_GRAPHS    "ggml: use CUDA graphs (llama.cpp only)"        OFF)
+
+#option(GGML_HIPBLAS           "ggml: use hipBLAS"                             OFF)
+option(GGML_HIP_UMA            "ggml: use HIP unified memory architecture"     OFF)
+#option(GGML_VULKAN            "ggml: use Vulkan"                              OFF)
+option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks"                  OFF)
+option(GGML_VULKAN_DEBUG         "ggml: enable Vulkan debug output"            OFF)
+option(GGML_VULKAN_VALIDATE      "ggml: enable Vulkan validation"              OFF)
+option(GGML_VULKAN_RUN_TESTS     "ggml: run Vulkan tests"                      OFF)
+#option(GGML_METAL              "ggml: use Metal"                              ${GGML_METAL_DEFAULT})
+option(GGML_METAL_NDEBUG        "ggml: disable Metal debugging"                OFF)
+option(GGML_METAL_SHADER_DEBUG  "ggml: compile Metal with -fno-fast-math"      OFF)
+set(GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
+    "ggml: metal minimum macOS version")
+set(GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
+#option(GGML_KOMPUTE "ggml: use Kompute"                                       OFF)
+option(GGML_QKK_64  "ggml: use super-block size of 64 for k-quants"            OFF)
+set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")

 # add perf arguments
-option(LLAMA_PERF "llama: enable perf" OFF)
+option(LLAMA_PERF "llama: enable perf"                                         OFF)

 #
 # Compile flags
@@ -80,14 +86,14 @@ option(LLAMA_PERF "llama: enable perf"
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)

-list(APPEND GGML_COMPILE_DEFS GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES})
+list(APPEND GGML_COMPILE_DEFS GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})

 # enable libstdc++ assertions for debug builds
 if (CMAKE_SYSTEM_NAME MATCHES "Linux")
     list(APPEND GGML_COMPILE_DEFS $<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
 endif()

-if (APPLE AND LLAMA_ACCELERATE)
+if (APPLE AND GGML_ACCELERATE)
     find_library(ACCELERATE_FRAMEWORK Accelerate)
     if (ACCELERATE_FRAMEWORK)
         message(STATUS "Accelerate framework found")
@@ -101,7 +107,7 @@ if (APPLE AND LLAMA_ACCELERATE)
     endif()
 endif()

-if (LLAMA_BLAS)
+if (GGML_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
     endif()
@@ -109,7 +115,7 @@ if (LLAMA_BLAS)
         set(BLA_SIZEOF_INTEGER 8)
     endif()

-    set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
+    set(BLA_VENDOR ${GGML_BLAS_VENDOR})
     find_package(BLAS)

     if (BLAS_FOUND)
@@ -119,24 +125,24 @@ if (LLAMA_BLAS)
         # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
         # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
         find_package(PkgConfig REQUIRED)
-        if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
+        if (${GGML_BLAS_VENDOR} MATCHES "Generic")
             pkg_check_modules(DepBLAS REQUIRED blas)
-        elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
+        elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS")
             # As of openblas v0.3.22, the 64-bit is named openblas64.pc
             pkg_check_modules(DepBLAS openblas64)
             if (NOT DepBLAS_FOUND)
                 pkg_check_modules(DepBLAS REQUIRED openblas)
             endif()
-        elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
+        elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
             pkg_check_modules(DepBLAS REQUIRED blis)
-        elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
+        elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
             pkg_check_modules(DepBLAS REQUIRED blas-atlas)
-        elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS")
+        elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
             pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
-        elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
+        elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
             # all Intel* libraries share the same include path
             pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
-        elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
+        elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
             # this doesn't provide pkg-config
             # suggest to assign BLAS_INCLUDE_DIRS on your own
             if ("${NVHPC_VERSION}" STREQUAL "")
@@ -170,7 +176,7 @@ if (LLAMA_BLAS)
         list(APPEND GGML_COMPILE_DEFS GGML_USE_OPENBLAS)

-        if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
+        if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
             list(APPEND GGML_COMPILE_DEFS GGML_BLAS_USE_MKL)
         endif()
@@ -179,18 +185,18 @@ if (LLAMA_BLAS)
     else()
         message(WARNING "BLAS not found, please refer to "
                         "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
-                        " to set correct LLAMA_BLAS_VENDOR")
+                        " to set correct GGML_BLAS_VENDOR")
     endif()
 endif()

-if (LLAMA_LLAMAFILE)
+if (GGML_LLAMAFILE)
     list(APPEND GGML_COMPILE_DEFS GGML_USE_LLAMAFILE)

-    set(GGML_HEADERS_LLAMAFILE ${DIRECTORY}/sgemm.h)
-    set(GGML_SOURCES_LLAMAFILE ${DIRECTORY}/sgemm.cpp)
+    set(GGML_HEADERS_LLAMAFILE ${DIRECTORY}/ggml/src/llamafile/sgemm.h)
+    set(GGML_SOURCES_LLAMAFILE ${DIRECTORY}/ggml/src/llamafile/sgemm.cpp)
 endif()

-if (LLAMA_QKK_64)
+if (GGML_QKK_64)
     list(APPEND GGML_COMPILE_DEFS GGML_QKK_64)
 endif()

@@ -361,8 +367,9 @@ function(include_ggml SUFFIX)
     # libraries
     #

-    if (LLAMA_CUDA)
-        cmake_minimum_required(VERSION 3.17)
+    if (GGML_CUDA)
+        cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES
+
         get_property(LANGS GLOBAL PROPERTY ENABLED_LANGUAGES)
         if (NOT CUDA IN_LIST LANGS)
             message(FATAL_ERROR "The CUDA language must be enabled.")
@@ -376,35 +383,71 @@ function(include_ggml SUFFIX)
             # 60 == f16 CUDA intrinsics
             # 61 == integer CUDA intrinsics
             # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
-            if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
-                set(GGML_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
+            if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
+                set(GGML_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics
             else()
-                set(GGML_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+                set(GGML_CUDA_ARCHITECTURES "52;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
                 #set(GGML_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
             endif()
         endif()
message(STATUS "Using CUDA architectures: ${GGML_CUDA_ARCHITECTURES}") - set(GGML_HEADERS_CUDA ${DIRECTORY}/ggml-cuda.h) + set(GGML_HEADERS_CUDA ${DIRECTORY}/ggml/include/ggml-cuda.h) + file(GLOB GGML_HEADERS_CUDA "${DIRECTORY}/ggml/src/ggml-cuda/*.cuh") + list(APPEND GGML_HEADERS_CUDA "${DIRECTORY}/ggml/include/ggml-cuda.h") - file(GLOB GGML_SOURCES_CUDA "${DIRECTORY}/ggml-cuda/*.cu") - list(APPEND GGML_SOURCES_CUDA "${DIRECTORY}/ggml-cuda.cu") + file(GLOB GGML_SOURCES_CUDA "${DIRECTORY}/ggml/src/ggml-cuda/*.cu") + list(APPEND GGML_SOURCES_CUDA "${DIRECTORY}/ggml/src/ggml-cuda.cu") + file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/mmq*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + + if (GGML_CUDA_FA_ALL_QUANTS) + file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) + else() + file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + endif() list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_CUDA) - if (LLAMA_CUDA_FORCE_DMMV) + + list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X}) + list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y}) + list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) + list(APPEND GGML_COMPILE_DEFS GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) + + if (GGML_CUDA_USE_GRAPHS) + list(APPEND GGML_COMPILE_DEFS GGML_CUDA_USE_GRAPHS) + endif() + + if (GGML_CUDA_FORCE_DMMV) list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_DMMV) endif() - if (LLAMA_CUDA_FORCE_MMQ) + + if (GGML_CUDA_FORCE_MMQ) list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_MMQ) endif() - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) - if (LLAMA_CUDA_F16) + + if (GGML_CUDA_FORCE_CUBLAS) + list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_CUBLAS) + endif() + + if (GGML_CUDA_NO_VMM) + list(APPEND GGML_COMPILE_DEFS GGML_CUDA_NO_VMM) + endif() + + if (GGML_CUDA_F16) list(APPEND GGML_COMPILE_DEFS GGML_CUDA_F16) endif() - list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE}) - if (LLAMA_CUDA_NO_PEER_COPY) + + if (GGML_CUDA_NO_PEER_COPY) list(APPEND GGML_COMPILE_DEFS GGML_CUDA_NO_PEER_COPY) endif() @@ -422,45 +465,34 @@ function(include_ggml SUFFIX) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) endif() - if (LLAMA_CLBLAST) - find_package(CLBlast REQUIRED) - - set(GGML_HEADERS_OPENCL ${DIRECTORY}/ggml-opencl.h) - set(GGML_SOURCES_OPENCL ${DIRECTORY}/ggml-opencl.cpp) - - list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_CLBLAST) - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast) - endif() - - if (LLAMA_VULKAN) + if (GGML_VULKAN) find_package(Vulkan REQUIRED) - set(GGML_HEADERS_VULKAN ${DIRECTORY}/ggml-vulkan.h) - set(GGML_SOURCES_VULKAN ${DIRECTORY}/ggml-vulkan.cpp) + set(GGML_HEADERS_VULKAN ${DIRECTORY}/ggml/include/ggml-vulkan.h) + 
set(GGML_SOURCES_VULKAN ${DIRECTORY}/ggml/src/ggml-vulkan.cpp) list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_VULKAN) - if (LLAMA_VULKAN_CHECK_RESULTS) + if (GGML_VULKAN_CHECK_RESULTS) list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_CHECK_RESULTS) endif() - if (LLAMA_VULKAN_DEBUG) + if (GGML_VULKAN_DEBUG) list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_DEBUG) endif() - if (LLAMA_VULKAN_VALIDATE) + if (GGML_VULKAN_VALIDATE) list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_VALIDATE) endif() - if (LLAMA_VULKAN_RUN_TESTS) + if (GGML_VULKAN_RUN_TESTS) list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_RUN_TESTS) endif() set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} Vulkan::Vulkan) endif() - if (LLAMA_HIPBLAS) + if (GGML_HIPBLAS) if ($ENV{ROCM_PATH}) set(ROCM_PATH $ENV{ROCM_PATH}) else() @@ -490,32 +522,32 @@ function(include_ggml SUFFIX) message(STATUS "HIP and hipBLAS found") - set(GGML_HEADERS_ROCM ${DIRECTORY}/ggml-cuda.h) + set(GGML_HEADERS_ROCM ${DIRECTORY}/ggml/include/ggml-cuda.h) - file(GLOB GGML_SOURCES_ROCM "${DIRECTORY}/ggml-rocm/*.cu") - list(APPEND GGML_SOURCES_ROCM "${DIRECTORY}/ggml-rocm.cu") + file(GLOB GGML_SOURCES_ROCM "${DIRECTORY}/ggml/src/ggml-rocm/*.cu") + list(APPEND GGML_SOURCES_ROCM "${DIRECTORY}/ggml/src/ggml-rocm.cu") list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_HIPBLAS GGML_USE_CUDA) - if (LLAMA_HIP_UMA) + if (GGML_HIP_UMA) list(APPEND GGML_COMPILE_DEFS GGML_HIP_UMA) endif() - if (LLAMA_CUDA_FORCE_DMMV) + if (GGML_CUDA_FORCE_DMMV) list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_DMMV) endif() - if (LLAMA_CUDA_FORCE_MMQ) + if (GGML_CUDA_FORCE_MMQ) list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_MMQ) endif() - if (LLAMA_CUDA_NO_PEER_COPY) + if (GGML_CUDA_NO_PEER_COPY) list(APPEND GGML_COMPILE_DEFS GGML_CUDA_NO_PEER_COPY) endif() - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) - list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) + list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X}) + list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y}) + list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) if (CXX_IS_HIPCC) set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) @@ -533,9 +565,9 @@ function(include_ggml SUFFIX) set(LLAMA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}) - if (LLAMA_KOMPUTE AND NOT GGML_KOMPUTE_ONCE) + if (GGML_KOMPUTE AND NOT GGML_KOMPUTE_ONCE) set(GGML_KOMPUTE_ONCE ON PARENT_SCOPE) - if (NOT EXISTS "${LLAMA_DIR}/kompute/CMakeLists.txt") + if (NOT EXISTS "${LLAMA_DIR}/ggml/src/kompute/CMakeLists.txt") message(FATAL_ERROR "Kompute not found") endif() message(STATUS "Kompute found") @@ -559,12 +591,12 @@ function(include_ggml SUFFIX) set(spv_file ${CMAKE_CURRENT_BINARY_DIR}/${OP_FILE}.spv) add_custom_command( OUTPUT ${spv_file} - DEPENDS ${LLAMA_DIR}/${source} - ${LLAMA_DIR}/kompute-shaders/common.comp - ${LLAMA_DIR}/kompute-shaders/op_getrows.comp - ${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp - ${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n.comp - COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/${source} + DEPENDS ${LLAMA_DIR}/ggml/src/kompute-shaders/${source} + ${LLAMA_DIR}/ggml/src/kompute-shaders/common.comp + ${LLAMA_DIR}/ggml/src/kompute-shaders/op_getrows.comp + ${LLAMA_DIR}/ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp + ${LLAMA_DIR}/ggml/src/kompute-shaders/op_mul_mv_q_n.comp + COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} 
${LLAMA_DIR}/ggml/src/kompute-shaders/${source} COMMENT "Compiling ${source} to ${source}.spv" ) @@ -610,39 +642,39 @@ function(include_ggml SUFFIX) set(KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG "v1.3.239" CACHE STRING "Kompute Vulkan headers tag") set(KOMPUTE_OPT_LOG_LEVEL Critical CACHE STRING "Kompute log level") set(FMT_INSTALL OFF) - add_subdirectory(${LLAMA_DIR}/kompute) + add_subdirectory(${LLAMA_DIR}/ggml/src/kompute) # Compile our shaders compile_shader(SOURCES - kompute-shaders/op_scale.comp - kompute-shaders/op_scale_8.comp - kompute-shaders/op_add.comp - kompute-shaders/op_addrow.comp - kompute-shaders/op_mul.comp - kompute-shaders/op_silu.comp - kompute-shaders/op_relu.comp - kompute-shaders/op_gelu.comp - kompute-shaders/op_softmax.comp - kompute-shaders/op_norm.comp - kompute-shaders/op_rmsnorm.comp - kompute-shaders/op_diagmask.comp - kompute-shaders/op_mul_mat_mat_f32.comp - kompute-shaders/op_mul_mat_f16.comp - kompute-shaders/op_mul_mat_q8_0.comp - kompute-shaders/op_mul_mat_q4_0.comp - kompute-shaders/op_mul_mat_q4_1.comp - kompute-shaders/op_mul_mat_q6_k.comp - kompute-shaders/op_getrows_f32.comp - kompute-shaders/op_getrows_f16.comp - kompute-shaders/op_getrows_q4_0.comp - kompute-shaders/op_getrows_q4_1.comp - kompute-shaders/op_getrows_q6_k.comp - kompute-shaders/op_rope_f16.comp - kompute-shaders/op_rope_f32.comp - kompute-shaders/op_cpy_f16_f16.comp - kompute-shaders/op_cpy_f16_f32.comp - kompute-shaders/op_cpy_f32_f16.comp - kompute-shaders/op_cpy_f32_f32.comp + op_scale.comp + op_scale_8.comp + op_add.comp + op_addrow.comp + op_mul.comp + op_silu.comp + op_relu.comp + op_gelu.comp + op_softmax.comp + op_norm.comp + op_rmsnorm.comp + op_diagmask.comp + op_mul_mat_mat_f32.comp + op_mul_mat_f16.comp + op_mul_mat_q8_0.comp + op_mul_mat_q4_0.comp + op_mul_mat_q4_1.comp + op_mul_mat_q6_k.comp + op_getrows_f32.comp + op_getrows_f16.comp + op_getrows_q4_0.comp + op_getrows_q4_1.comp + op_getrows_q6_k.comp + op_rope_f16.comp + op_rope_f32.comp + op_cpy_f16_f16.comp + op_cpy_f16_f32.comp + op_cpy_f32_f16.comp + op_cpy_f32_f32.comp ) # Create a custom target for our generated shaders @@ -687,12 +719,12 @@ function(include_ggml SUFFIX) ) endif() - if (LLAMA_KOMPUTE) + if (GGML_KOMPUTE) list(APPEND GGML_COMPILE_DEFS VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) # Add the stamp to the main sources to ensure dependency tracking - set(GGML_SOURCES_KOMPUTE ${LLAMA_DIR}/ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) - set(GGML_HEADERS_KOMPUTE ${LLAMA_DIR}/ggml-kompute.h) + set(GGML_SOURCES_KOMPUTE ${LLAMA_DIR}/ggml/src/ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) + set(GGML_HEADERS_KOMPUTE ${LLAMA_DIR}/ggml/include/ggml-kompute.h) list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_KOMPUTE) @@ -701,7 +733,7 @@ function(include_ggml SUFFIX) set(CUDA_CXX_FLAGS "") - if (LLAMA_CUDA) + if (GGML_CUDA) set(CUDA_FLAGS -use_fast_math) if (LLAMA_FATAL_WARNINGS) @@ -748,25 +780,25 @@ function(include_ggml SUFFIX) endif() endif() - if (LLAMA_METAL) + if (GGML_METAL) find_library(FOUNDATION_LIBRARY Foundation REQUIRED) find_library(METAL_FRAMEWORK Metal REQUIRED) find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) message(STATUS "Metal framework found") - set(GGML_HEADERS_METAL ${DIRECTORY}/ggml-metal.h) - set(GGML_SOURCES_METAL ${DIRECTORY}/ggml-metal.m) + set(GGML_HEADERS_METAL ${DIRECTORY}/ggml/include/ggml-metal.h) + set(GGML_SOURCES_METAL ${DIRECTORY}/ggml/src/ggml-metal.m) list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_METAL) - if (LLAMA_METAL_NDEBUG) + if 
(GGML_METAL_NDEBUG) list(APPEND GGML_COMPILE_DEFS GGML_METAL_NDEBUG) endif() # copy ggml-common.h and ggml-metal.metal to bin directory - configure_file(${DIRECTORY}/ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) - configure_file(${DIRECTORY}/ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) + configure_file(${DIRECTORY}/ggml/src/ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) + configure_file(${DIRECTORY}/ggml/src/ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) - if (LLAMA_METAL_SHADER_DEBUG) + if (GGML_METAL_SHADER_DEBUG) # custom command to do the following: # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air # xcrun -sdk macosx metallib ggml-metal.air -o default.metallib @@ -782,13 +814,13 @@ function(include_ggml SUFFIX) endif() # Append macOS metal versioning flags - if (LLAMA_METAL_MACOSX_VERSION_MIN) - message(STATUS "Adding -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN} flag to metal compilation") - list(APPEND XC_FLAGS -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN}) + if (GGML_METAL_MACOSX_VERSION_MIN) + message(STATUS "Adding -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN} flag to metal compilation") + list(APPEND XC_FLAGS -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN}) endif() - if (LLAMA_METAL_STD) - message(STATUS "Adding -std=${LLAMA_METAL_STD} flag to metal compilation") - list(APPEND XC_FLAGS -std=${LLAMA_METAL_STD}) + if (GGML_METAL_STD) + message(STATUS "Adding -std=${GGML_METAL_STD} flag to metal compilation") + list(APPEND XC_FLAGS -std=${GGML_METAL_STD}) endif() set(GGML_METALLIB ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib) @@ -799,7 +831,7 @@ function(include_ggml SUFFIX) COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal - DEPENDS ${DIRECTORY}/ggml-metal.metal ${DIRECTORY}/ggml-common.h + DEPENDS ${DIRECTORY}/ggml/src/ggml-metal.metal ${DIRECTORY}/ggml/src/ggml-common.h COMMENT "Compiling Metal kernels" ) set_source_files_properties(${GGML_METALLIB} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES GENERATED ON) @@ -853,49 +885,49 @@ function(include_ggml SUFFIX) CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) message(STATUS "x86 detected") if (MSVC) - if (LLAMA_AVX512) + if (GGML_AVX512) list(APPEND ARCH_FLAGS /arch:AVX512) # MSVC has no compile-time flags enabling specific # AVX512 extensions, neither it defines the # macros corresponding to the extensions. # Do it manually. 
-                if (LLAMA_AVX512_VBMI)
+                if (GGML_AVX512_VBMI)
                     list(APPEND GGML_COMPILE_DEFS $<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
                     list(APPEND GGML_COMPILE_DEFS $<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
                 endif()
-                if (LLAMA_AVX512_VNNI)
+                if (GGML_AVX512_VNNI)
                     list(APPEND GGML_COMPILE_DEFS $<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
                     list(APPEND GGML_COMPILE_DEFS $<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
                 endif()
-            elseif (LLAMA_AVX2)
+            elseif (GGML_AVX2)
                 list(APPEND ARCH_FLAGS /arch:AVX2)
-            elseif (LLAMA_AVX)
+            elseif (GGML_AVX)
                 list(APPEND ARCH_FLAGS /arch:AVX)
             endif()
         else()
-            if (LLAMA_NATIVE)
+            if (GGML_NATIVE)
                 list(APPEND ARCH_FLAGS -march=native)
             endif()
-            if (LLAMA_F16C)
+            if (GGML_F16C)
                 list(APPEND ARCH_FLAGS -mf16c)
             endif()
-            if (LLAMA_FMA)
+            if (GGML_FMA)
                 list(APPEND ARCH_FLAGS -mfma)
             endif()
-            if (LLAMA_AVX)
+            if (GGML_AVX)
                 list(APPEND ARCH_FLAGS -mavx)
             endif()
-            if (LLAMA_AVX2)
+            if (GGML_AVX2)
                 list(APPEND ARCH_FLAGS -mavx2)
             endif()
-            if (LLAMA_AVX512)
+            if (GGML_AVX512)
                 list(APPEND ARCH_FLAGS -mavx512f)
                 list(APPEND ARCH_FLAGS -mavx512bw)
             endif()
-            if (LLAMA_AVX512_VBMI)
+            if (GGML_AVX512_VBMI)
                 list(APPEND ARCH_FLAGS -mavx512vbmi)
             endif()
-            if (LLAMA_AVX512_VNNI)
+            if (GGML_AVX512_VNNI)
                 list(APPEND ARCH_FLAGS -mavx512vnni)
             endif()
         endif()
@@ -914,7 +946,7 @@ function(include_ggml SUFFIX)
     list(APPEND GGML_COMPILE_OPTS "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
     list(APPEND GGML_COMPILE_OPTS "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")

-    if (LLAMA_CUDA)
+    if (GGML_CUDA)
         list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
         list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
         if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
@@ -926,24 +958,26 @@ function(include_ggml SUFFIX)
     # ggml

     add_library(ggml${SUFFIX} OBJECT
-                ${DIRECTORY}/ggml.c
-                ${DIRECTORY}/ggml.h
-                ${DIRECTORY}/ggml-alloc.c
-                ${DIRECTORY}/ggml-alloc.h
-                ${DIRECTORY}/ggml-backend.c
-                ${DIRECTORY}/ggml-backend.h
-                ${DIRECTORY}/ggml-quants.c
-                ${DIRECTORY}/ggml-quants.h
+                ${DIRECTORY}/ggml/include/ggml.h
+                ${DIRECTORY}/ggml/include/ggml-alloc.h
+                ${DIRECTORY}/ggml/include/ggml-backend.h
+                ${DIRECTORY}/ggml/src/ggml.c
+                ${DIRECTORY}/ggml/src/ggml-alloc.c
+                ${DIRECTORY}/ggml/src/ggml-backend.c
+                ${DIRECTORY}/ggml/src/ggml-quants.c
+                ${DIRECTORY}/ggml/src/ggml-quants.h
                 ${GGML_SOURCES_CUDA}      ${GGML_HEADERS_CUDA}
-                ${GGML_SOURCES_OPENCL}    ${GGML_HEADERS_OPENCL}
                 ${GGML_SOURCES_METAL}     ${GGML_HEADERS_METAL}
                 ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
                 ${GGML_SOURCES_VULKAN}    ${GGML_HEADERS_VULKAN}
                 ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
                 ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
+                ${DIRECTORY}/ggml/src/ggml-aarch64.c
+                ${DIRECTORY}/ggml/src/ggml-aarch64.h
                 )

-    target_include_directories(ggml${SUFFIX} PUBLIC ${DIRECTORY} ${LLAMA_EXTRA_INCLUDES})
+    target_include_directories(ggml${SUFFIX} PUBLIC ${DIRECTORY}/ggml/include ${LLAMA_EXTRA_INCLUDES})
+    target_include_directories(ggml${SUFFIX} PRIVATE ${DIRECTORY}/ggml/src)
     target_compile_features(ggml${SUFFIX} PUBLIC c_std_11) # don't bump

     target_link_libraries(ggml${SUFFIX} PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
@@ -955,14 +989,15 @@ function(include_ggml SUFFIX)
     # llama

     add_library(llama${SUFFIX} STATIC
-                ${DIRECTORY}/llama.cpp
-                ${DIRECTORY}/llama.h
-                ${DIRECTORY}/unicode.h
-                ${DIRECTORY}/unicode.cpp
-                ${DIRECTORY}/unicode-data.cpp
+                ${DIRECTORY}/include/llama.h
+                ${DIRECTORY}/src/llama.cpp
+                ${DIRECTORY}/src/unicode.h
+                ${DIRECTORY}/src/unicode.cpp
+                ${DIRECTORY}/src/unicode-data.cpp
                 )

-    target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY})
+    target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY}/include ${DIRECTORY}/ggml/include)
+    target_include_directories(llama${SUFFIX} PRIVATE ${DIRECTORY}/src)
     target_compile_features   (llama${SUFFIX} PUBLIC cxx_std_11) # don't bump

     target_link_libraries(llama${SUFFIX} PRIVATE
diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index ab560e89..d5c408fc 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -30,9 +30,9 @@
 #ifdef GGML_USE_KOMPUTE
 #   include <ggml-kompute.h>
-#elif GGML_USE_VULKAN
+#elif defined(GGML_USE_VULKAN)
 #   include <ggml-vulkan.h>
-#elif GGML_USE_CUDA
+#elif defined(GGML_USE_CUDA)
 #   include <ggml-cuda.h>
 #endif

@@ -51,14 +51,14 @@ static const std::vector<const char *> KNOWN_ARCHES {
     // "grok", -- 314B parameters
     "gpt2",
     // "gptj", -- no inference code
-    // "gptneox", -- no inference code
+    "gptneox",
     "mpt",
     "baichuan",
     "starcoder",
-    // "persimmon", -- CUDA generates garbage
     "refact",
     "bert",
     "nomic-bert",
+    // "jina-bert-v2", -- Assertion `i01 >= 0 && i01 < ne01' failed.
     "bloom",
     "stablelm",
     "qwen",
@@ -72,12 +72,20 @@ static const std::vector<const char *> KNOWN_ARCHES {
     "internlm2",
     // "minicpm", -- CUDA generates garbage
     "gemma",
+    "gemma2",
     "starcoder2",
     // "mamba", -- CUDA missing SSM_CONV
     "xverse",
     "command-r",
     // "dbrx", -- 16x12B parameters
     "olmo",
+    "openelm",
+    // "arctic", -- 10B+128x3.66B parameters
+    // "deepseek2", -- excessive VRAM requirements
+    "chatglm",
+    // "bitnet", -- tensor not within file bounds?
+    // "t5", -- seq2seq model
+    "jais",
 };

 static const std::vector<const char *> EMBEDDING_ARCHES {
@@ -103,6 +111,16 @@ static void llama_log_callback(enum ggml_log_level level, const char *text, void *userdata)
     }
 }

+#ifdef GGML_USE_CUDA
+static void cuda_log_callback(enum ggml_log_level level, const char *text, void *userdata)
+{
+    (void)userdata;
+    if (llama_verbose() || level <= GGML_LOG_LEVEL_WARN) {
+        fputs(text, stderr);
+    }
+}
+#endif
+
 struct gpt_params {
     int32_t seed   = -1; // RNG seed
     int32_t n_keep = 0;  // number of tokens to keep from initial prompt
@@ -515,9 +533,8 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
 {
     const bool wantBOS = ctx.n_past == 0 && ctx.tokens.empty();
     const bool useBOS = wantBOS && shouldAddBOS();
-    auto strCat = wantBOS && !special ? " " + str : str; // insert leading space ourselves, llama.cpp fork doesn't anymore
-    std::vector<LLModel::Token> fres(strCat.size()+4);
-    auto fres_len = llama_tokenize(d_ptr->model, strCat.c_str(), strCat.length(), fres.data(), fres.size(), useBOS, special);
+    std::vector<LLModel::Token> fres(str.length() + 4);
+    auto fres_len = llama_tokenize(d_ptr->model, str.c_str(), str.length(), fres.data(), fres.size(), useBOS, special);
     fres.resize(fres_len);
     return fres;
 }
@@ -525,10 +542,10 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
 std::string LLamaModel::tokenToString(Token id) const
 {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
+    const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), 0, false);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
+        int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), 0, false);
         GGML_ASSERT(check == -n_tokens);
     }
     else {
@@ -1170,6 +1187,9 @@ DLL_EXPORT bool is_arch_supported(const char *arch)
 DLL_EXPORT LLModel *construct()
 {
     llama_log_set(llama_log_callback, nullptr);
+#ifdef GGML_USE_CUDA
+    ggml_backend_cuda_log_set_callback(cuda_log_callback, nullptr);
+#endif
     return new LLamaModel;
 }
 }
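
Editor's note: the sketch below is illustrative and not part of the patch. It mirrors the two caller-visible changes in the llamamodel.cpp hunks above: llama_token_to_piece() now takes an extra integer argument before the final special flag (passed as 0, exactly as in the patch), and a CUDA-specific logger is registered with ggml_backend_cuda_log_set_callback() alongside llama_log_set(). The header paths, the quiet_logger and piece_for_token helper names, and the model-loading step are assumptions made only so the example is self-contained.

    #include <cstdio>
    #include <string>
    #include <vector>

    #include <llama.h>          // assumed include paths; adjust to your build
    #ifdef GGML_USE_CUDA
    #   include <ggml-cuda.h>
    #endif

    // Forward only warnings and errors, mirroring the callbacks added in the patch.
    static void quiet_logger(enum ggml_log_level level, const char *text, void *userdata)
    {
        (void)userdata;
        if (level <= GGML_LOG_LEVEL_WARN)
            fputs(text, stderr);
    }

    // Convert a single token id to its text piece using the new six-argument call.
    static std::string piece_for_token(const llama_model *model, llama_token id)
    {
        std::vector<char> buf(8, 0);
        int n = llama_token_to_piece(model, id, buf.data(), (int)buf.size(), 0, false);
        if (n < 0) {                 // buffer too small: -n is the required size
            buf.resize(-n);
            n = llama_token_to_piece(model, id, buf.data(), (int)buf.size(), 0, false);
        }
        return std::string(buf.data(), n);
    }

    int main()
    {
        llama_log_set(quiet_logger, nullptr);
    #ifdef GGML_USE_CUDA
        ggml_backend_cuda_log_set_callback(quiet_logger, nullptr);  // registration added by this rebase
    #endif
        // Load a model with llama_load_model_from_file() and call piece_for_token()
        // on tokens of interest; both helpers above are hypothetical names.
        return 0;
    }

As in the patch, one callback signature serves both the llama.cpp logger and the CUDA backend logger, so the same filtering policy applies to every log source.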