diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml
index e71693a4..3b1f7e4a 100644
--- a/.circleci/continue_config.yml
+++ b/.circleci/continue_config.yml
@@ -881,7 +881,7 @@ jobs:
             wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
             sudo dpkg -i cuda-keyring_1.1-1_all.deb
             sudo apt-get update
-            sudo apt-get install -y cmake build-essential vulkan-sdk cuda-compiler-11-8 libcublas-dev-11-8 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
+            sudo apt-get install -y cmake build-essential vulkan-sdk cuda-compiler-11-8 libcublas-dev-11-8 libnvidia-compute-550-server
             pip install setuptools wheel cmake
       - run:
           name: Build C library
@@ -889,7 +889,9 @@
             export PATH=$PATH:/usr/local/cuda/bin
             git submodule update --init --recursive
             cd gpt4all-backend
-            cmake -B build -DCMAKE_BUILD_TYPE=Release
+            cmake -B build -DCMAKE_BUILD_TYPE=Release \
+                -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON \
+                -DCMAKE_CUDA_ARCHITECTURES='52-virtual;61-virtual;70-virtual;75-virtual'
             cmake --build build -j$(nproc)
       - run:
           name: Build wheel
@@ -986,7 +988,9 @@
             $Env:PATH += ";C:\VulkanSDK\1.3.261.1\bin"
             $Env:VULKAN_SDK = "C:\VulkanSDK\1.3.261.1"
             cd gpt4all-backend
-            cmake -G Ninja -B build -DCMAKE_BUILD_TYPE=Release -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON
+            cmake -G Ninja -B build -DCMAKE_BUILD_TYPE=Release `
+                -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON `
+                -DCMAKE_CUDA_ARCHITECTURES='52-virtual;61-virtual;70-virtual;75-virtual'
             cmake --build build --parallel
       - run:
           name: Build wheel
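The `-virtual` suffix on each architecture tells nvcc to embed only PTX (forward-portable intermediate code) for that compute capability, rather than precompiled SASS (CUBINs); the driver JIT-compiles the PTX for the local GPU on first use. That is what shrinks the wheel while keeping newer GPUs working. A minimal sketch of how one might confirm a GPU is covered by the lowest embedded PTX (5.2) — illustrative only, not part of this patch, and assuming the CUDA 11 runtime is already on the loader path:

```python
import ctypes

# Illustrative only: query the local GPU's compute capability via the CUDA
# runtime API. 75/76 are cudaDevAttrComputeCapabilityMajor/Minor.
cudart = ctypes.CDLL("libcudart.so.11.0")
major, minor = ctypes.c_int(), ctypes.c_int()
cudart.cudaDeviceGetAttribute(ctypes.byref(major), 75, 0)
cudart.cudaDeviceGetAttribute(ctypes.byref(minor), 76, 0)
print(f"compute capability {major.value}.{minor.value}")  # needs >= 5.2 for this build's PTX
```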
"${DIRECTORY}/ggml/src/ggml-cuda/*.cuh") @@ -1018,9 +1018,6 @@ function(include_ggml SUFFIX) C_STANDARD 11 C_STANDARD_REQUIRED true ) - if (GGML_CUDA_ARCHITECTURES) - set_property(TARGET ggml${SUFFIX} llama${SUFFIX} PROPERTY CUDA_ARCHITECTURES "${GGML_CUDA_ARCHITECTURES}") - endif() target_compile_options(ggml${SUFFIX} PRIVATE "${GGML_COMPILE_OPTS}") target_compile_options(llama${SUFFIX} PRIVATE "${GGML_COMPILE_OPTS}") diff --git a/gpt4all-bindings/python/CHANGELOG.md b/gpt4all-bindings/python/CHANGELOG.md index fce39350..20c8eece 100644 --- a/gpt4all-bindings/python/CHANGELOG.md +++ b/gpt4all-bindings/python/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +## [Unreleased] + +### Changed +- Search for pip-installed CUDA 11 as well as CUDA 12 ([#2802](https://github.com/nomic-ai/gpt4all/pull/2802)) +- Stop shipping CUBINs to reduce wheel size ([#2802](https://github.com/nomic-ai/gpt4all/pull/2802)) + ## [2.8.0] - 2024-08-05 ### Added @@ -16,6 +22,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). - Detect use of a Python interpreter under Rosetta for a clearer error message ([#2793](https://github.com/nomic-ai/gpt4all/pull/2793)) ### Changed +- Build against CUDA 11.8 instead of CUDA 12 for better compatibility with older drivers ([#2639](https://github.com/nomic-ai/gpt4all/pull/2639)) - Update llama.cpp to commit 87e397d00 from July 19th ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694)) ### Removed @@ -33,4 +40,5 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). - Restore leading space removal logic that was incorrectly removed in [#2694](https://github.com/nomic-ai/gpt4all/pull/2694) - CUDA: Cherry-pick llama.cpp DMMV cols requirement fix that caused a crash with long conversations since [#2694](https://github.com/nomic-ai/gpt4all/pull/2694) +[Unreleased]: https://github.com/nomic-ai/gpt4all/compare/python-v2.8.0...HEAD [2.8.0]: https://github.com/nomic-ai/gpt4all/compare/python-v2.7.0...python-v2.8.0 diff --git a/gpt4all-bindings/python/gpt4all/_pyllmodel.py b/gpt4all-bindings/python/gpt4all/_pyllmodel.py index a4952fe3..88be2949 100644 --- a/gpt4all-bindings/python/gpt4all/_pyllmodel.py +++ b/gpt4all-bindings/python/gpt4all/_pyllmodel.py @@ -39,25 +39,34 @@ if platform.system() == "Darwin" and platform.processor() == "i386": Please install GPT4All in an environment that uses a native ARM64 Python interpreter. 
""")) + +def _load_cuda(rtver: str, blasver: str) -> None: + if platform.system() == "Linux": + cudalib = f"lib/libcudart.so.{rtver}" + cublaslib = f"lib/libcublas.so.{blasver}" + else: # Windows + cudalib = fr"bin\cudart64_{rtver.replace(".", "")}.dll" + cublaslib = fr"bin\cublas64_{blasver}.dll" + + # preload the CUDA libs so the backend can find them + ctypes.CDLL(os.path.join(cuda_runtime.__path__[0], cudalib), mode=ctypes.RTLD_GLOBAL) + ctypes.CDLL(os.path.join(cublas.__path__[0], cublaslib), mode=ctypes.RTLD_GLOBAL) + + # Find CUDA libraries from the official packages cuda_found = False -if platform.system() in ('Linux', 'Windows'): +if platform.system() in ("Linux", "Windows"): try: from nvidia import cuda_runtime, cublas except ImportError: pass # CUDA is optional else: - if platform.system() == 'Linux': - cudalib = 'lib/libcudart.so.12' - cublaslib = 'lib/libcublas.so.12' - else: # Windows - cudalib = r'bin\cudart64_12.dll' - cublaslib = r'bin\cublas64_12.dll' - - # preload the CUDA libs so the backend can find them - ctypes.CDLL(os.path.join(cuda_runtime.__path__[0], cudalib), mode=ctypes.RTLD_GLOBAL) - ctypes.CDLL(os.path.join(cublas.__path__[0], cublaslib), mode=ctypes.RTLD_GLOBAL) - cuda_found = True + for rtver, blasver in [("12", "12"), ("11.0", "11")]: + try: + _load_cuda(rtver, blasver) + cuda_found = True + except OSError: # dlopen() does not give specific error codes + pass # try the next one # TODO: provide a config file to make this more robust diff --git a/gpt4all-bindings/python/setup.py b/gpt4all-bindings/python/setup.py index 1fe93873..e92fba61 100644 --- a/gpt4all-bindings/python/setup.py +++ b/gpt4all-bindings/python/setup.py @@ -68,7 +68,7 @@ def get_long_description(): setup( name=package_name, - version="2.8.0", + version="2.8.1.dev0", description="Python bindings for GPT4All", long_description=get_long_description(), long_description_content_type="text/markdown",