runtime: Add GPU annotations for remote hypervisor

Add GPU annotations for the remote hypervisor to help select the right instance based on the number of GPUs and the GPU model.

Signed-off-by: Pradipta Banerjee <pradipta.banerjee@gmail.com>
2025-08-31 16:36:38 +00:00 · 2024-10-23 13:22:59 -04:00
parent 68225b53ca
commit 6f1ba007ed
7 changed files with 58 additions and 1 deletions
--- a/docs/how-to/how-to-set-sandbox-config-kata.md
+++ b/docs/how-to/how-to-set-sandbox-config-kata.md
@@ -94,6 +94,8 @@ There are several kinds of Kata configurations and they are listed below.
 | `io.katacontainers.config.hypervisor.virtio_fs_extra_args` | string | extra options passed to `virtiofs` daemon |
 | `io.katacontainers.config.hypervisor.enable_guest_swap` | `boolean` | enable swap in the guest |
 | `io.katacontainers.config.hypervisor.use_legacy_serial` | `boolean` | uses legacy serial device for guest's console (QEMU) |
+| `io.katacontainers.config.hypervisor.default_gpus` | uint32 | the minimum number of GPUs required for the VM. Only used by remote hypervisor to help with instance selection |
+| `io.katacontainers.config.hypervisor.default_gpu_model` | string | the GPU model required for the VM. Only used by remote hypervisor to help with instance selection |

 ## Container Options
 | Key | Value Type | Comments |
--- a/src/runtime/config/configuration-remote.toml.in
+++ b/src/runtime/config/configuration-remote.toml.in
@@ -38,7 +38,7 @@ remote_hypervisor_timeout = 600
 # Each member of the list is a regular expression, which is the base name
 # of the annotation, e.g. "path" for io.katacontainers.config.hypervisor.path"
 # Note: Remote hypervisor is only handling the following annotations
-enable_annotations = ["machine_type", "default_memory", "default_vcpus", "image"]
+enable_annotations = ["machine_type", "default_memory", "default_vcpus", "image", "default_gpus", "default_gpu_model"]

 # Optional space-separated list of options to pass to the guest kernel.
 # For example, use `kernel_params = "vsyscall=emulate"` if you are having
--- a/src/runtime/pkg/oci/utils.go
+++ b/src/runtime/pkg/oci/utils.go
@@ -560,6 +560,10 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig,
 		config.HypervisorConfig.Initdata = initdata
 	}

+	if err := addHypervisorGPUOverrides(ocispec, config); err != nil {
+		return err
+	}
+
 	return nil
 }

@@ -754,6 +758,26 @@ func addHypervisorCPUOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) e
 	})
 }

+// addHypervisorGPUOverrides applies the GPU count and GPU model annotations to
+// the sandbox's hypervisor config; it is a no-op unless the remote hypervisor is used.
+// NOTE(review): the parsed count is narrowed from uint64 to uint32 without a range check.
+func addHypervisorGPUOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) error {
+	if sbConfig.HypervisorType != vc.RemoteHypervisor {
+		return nil
+	}
+
+	gpuCount := newAnnotationConfiguration(ocispec, vcAnnotations.DefaultGPUs)
+	if err := gpuCount.setUint(func(n uint64) { sbConfig.HypervisorConfig.DefaultGPUs = uint32(n) }); err != nil {
+		return err
+	}
+
+	// A missing or empty model annotation leaves the configured model untouched.
+	if model := ocispec.Annotations[vcAnnotations.DefaultGPUModel]; model != "" {
+		sbConfig.HypervisorConfig.DefaultGPUModel = model
+	}
+	return nil
+}
+
 func addHypervisorBlockOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) error {
 	if value, ok := ocispec.Annotations[vcAnnotations.BlockDeviceDriver]; ok {
 		supportedBlockDrivers := []string{config.VirtioSCSI, config.VirtioBlock, config.VirtioMmio, config.Nvdimm, config.VirtioBlockCCW}
--- a/src/runtime/pkg/oci/utils_test.go
+++ b/src/runtime/pkg/oci/utils_test.go
@@ -775,6 +775,23 @@ func TestAddRemoteHypervisorAnnotations(t *testing.T) {
 	err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
 	assert.NoError(err)
 	assert.Equal(sbConfig.HypervisorConfig.Initdata, "initdata")
+
+	// The GPU count annotation must be a valid unsigned integer: a negative value is rejected, a valid one is applied
+	ocispec.Annotations[vcAnnotations.DefaultGPUs] = "-1"
+	err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
+	assert.Error(err)
+
+	ocispec.Annotations[vcAnnotations.DefaultGPUs] = "1"
+	err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
+	assert.NoError(err)
+	assert.Equal(sbConfig.HypervisorConfig.DefaultGPUs, uint32(1))
+
+	// When the GPU model annotation is specified, it is copied into the hypervisor config
+	ocispec.Annotations[vcAnnotations.DefaultGPUModel] = "tesla"
+	err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
+	assert.NoError(err)
+	assert.Equal(sbConfig.HypervisorConfig.DefaultGPUModel, "tesla")
+
 }

 func TestAddProtectedHypervisorAnnotations(t *testing.T) {
--- a/src/runtime/virtcontainers/hypervisor.go
+++ b/src/runtime/virtcontainers/hypervisor.go
@@ -673,6 +673,12 @@ type HypervisorConfig struct {

 	// Initdata defines the initdata passed into guest when CreateVM
 	Initdata string
+
+	// GPU specific annotations (currently only applicable for Remote Hypervisor)
+	// DefaultGPUs specifies the number of GPUs required for the Kata VM
+	DefaultGPUs uint32
+	// DefaultGPUModel specifies GPU model like tesla, h100, radeon etc.
+	DefaultGPUModel string
 }

 // vcpu mapping from vcpu number to thread number
--- a/src/runtime/virtcontainers/pkg/annotations/annotations.go
+++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go
@@ -132,6 +132,12 @@ const (
 	// UseLegacySerial sets legacy serial device for guest console if available and implemented for architecture
 	UseLegacySerial = kataAnnotHypervisorPrefix + "use_legacy_serial"

+	// GPU specific annotations used by remote hypervisor for instance selection
+	// Number of GPUs required in the Kata VM
+	DefaultGPUs = kataAnnotHypervisorPrefix + "default_gpus"
+	// GPU model - tesla, h100, radeon etc.
+	DefaultGPUModel = kataAnnotHypervisorPrefix + "default_gpu_model"
+
 	//
 	// CPU Annotations
 	//
--- a/src/runtime/virtcontainers/remote.go
+++ b/src/runtime/virtcontainers/remote.go
@@ -81,6 +81,8 @@ func (rh *remoteHypervisor) CreateVM(ctx context.Context, id string, network Net
 	annotations[hypannotations.DefaultVCPUs] = strconv.FormatUint(uint64(hypervisorConfig.NumVCPUs()), 10)
 	annotations[hypannotations.DefaultMemory] = strconv.FormatUint(uint64(hypervisorConfig.MemorySize), 10)
 	annotations[hypannotations.Initdata] = hypervisorConfig.Initdata
+	annotations[hypannotations.DefaultGPUs] = strconv.FormatUint(uint64(hypervisorConfig.DefaultGPUs), 10)
+	annotations[hypannotations.DefaultGPUModel] = hypervisorConfig.DefaultGPUModel

 	req := &pb.CreateVMRequest{
 		Id:                   id,