runtime: Add GPU annotations for remote hypervisor

Add GPU annotations for remote hypervisor to help
with the right instance selection based on number of GPUs
and model

Signed-off-by: Pradipta Banerjee <pradipta.banerjee@gmail.com>
This commit is contained in:
Pradipta Banerjee 2024-10-23 13:22:59 -04:00
parent 68225b53ca
commit 6f1ba007ed
7 changed files with 58 additions and 1 deletions

View File

@ -94,6 +94,8 @@ There are several kinds of Kata configurations and they are listed below.
| `io.katacontainers.config.hypervisor.virtio_fs_extra_args` | string | extra options passed to `virtiofs` daemon |
| `io.katacontainers.config.hypervisor.enable_guest_swap` | `boolean` | enable swap in the guest |
| `io.katacontainers.config.hypervisor.use_legacy_serial` | `boolean` | uses legacy serial device for guest's console (QEMU) |
| `io.katacontainers.config.hypervisor.default_gpus` | uint32 | the minimum number of GPUs required for the VM. Only used by remote hypervisor to help with instance selection |
| `io.katacontainers.config.hypervisor.default_gpu_model` | string | the GPU model required for the VM. Only used by remote hypervisor to help with instance selection |
## Container Options
| Key | Value Type | Comments |

View File

@ -38,7 +38,7 @@ remote_hypervisor_timeout = 600
# Each member of the list is a regular expression, which is the base name
# of the annotation, e.g. "path" for io.katacontainers.config.hypervisor.path"
# Note: Remote hypervisor is only handling the following annotations
enable_annotations = ["machine_type", "default_memory", "default_vcpus", "image"]
enable_annotations = ["machine_type", "default_memory", "default_vcpus", "image", "default_gpus", "default_gpu_model"]
# Optional space-separated list of options to pass to the guest kernel.
# For example, use `kernel_params = "vsyscall=emulate"` if you are having

View File

@ -560,6 +560,10 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig,
config.HypervisorConfig.Initdata = initdata
}
if err := addHypervisorGPUOverrides(ocispec, config); err != nil {
return err
}
return nil
}
@ -754,6 +758,26 @@ func addHypervisorCPUOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) e
})
}
// addHypervisorGPUOverrides applies the GPU-related hypervisor annotations
// (vcAnnotations.DefaultGPUs and vcAnnotations.DefaultGPUModel) found in
// ocispec to sbConfig.HypervisorConfig.
//
// The annotations are honoured only when sbConfig.HypervisorType is
// vc.RemoteHypervisor; for any other hypervisor type the function returns nil
// without touching sbConfig. An error is returned when setUint reports one for
// the GPU count annotation, or when the parsed count does not fit in the
// uint32 DefaultGPUs field.
func addHypervisorGPUOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) error {
	if sbConfig.HypervisorType != vc.RemoteHypervisor {
		return nil
	}

	// DefaultGPUs is a uint32 while setUint hands over a uint64, so reject
	// out-of-range values instead of silently truncating them.
	const maxGPUs = uint64(^uint32(0))

	// The setUint callback cannot return an error, so a range violation is
	// recorded here and reported once setUint itself has succeeded.
	var rangeErr error
	err := newAnnotationConfiguration(ocispec, vcAnnotations.DefaultGPUs).setUint(func(gpus uint64) {
		if gpus > maxGPUs {
			rangeErr = fmt.Errorf("annotation %s: value %d exceeds the maximum of %d", vcAnnotations.DefaultGPUs, gpus, maxGPUs)
			return
		}
		sbConfig.HypervisorConfig.DefaultGPUs = uint32(gpus)
	})
	if err != nil {
		return err
	}
	if rangeErr != nil {
		return rangeErr
	}

	// An absent or empty model annotation leaves the configured model untouched.
	if model := ocispec.Annotations[vcAnnotations.DefaultGPUModel]; model != "" {
		sbConfig.HypervisorConfig.DefaultGPUModel = model
	}

	return nil
}
func addHypervisorBlockOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) error {
if value, ok := ocispec.Annotations[vcAnnotations.BlockDeviceDriver]; ok {
supportedBlockDrivers := []string{config.VirtioSCSI, config.VirtioBlock, config.VirtioMmio, config.Nvdimm, config.VirtioBlockCCW}

View File

@ -775,6 +775,23 @@ func TestAddRemoteHypervisorAnnotations(t *testing.T) {
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.NoError(err)
assert.Equal(sbConfig.HypervisorConfig.Initdata, "initdata")
// When the GPU count annotation is specified, an invalid value is rejected and a valid one is applied to the hypervisor config
ocispec.Annotations[vcAnnotations.DefaultGPUs] = "-1"
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.Error(err)
ocispec.Annotations[vcAnnotations.DefaultGPUs] = "1"
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.NoError(err)
assert.Equal(sbConfig.HypervisorConfig.DefaultGPUs, uint32(1))
// When the GPU model annotation is specified, its value is applied to the hypervisor config
ocispec.Annotations[vcAnnotations.DefaultGPUModel] = "tesla"
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.NoError(err)
assert.Equal(sbConfig.HypervisorConfig.DefaultGPUModel, "tesla")
}
func TestAddProtectedHypervisorAnnotations(t *testing.T) {

View File

@ -673,6 +673,12 @@ type HypervisorConfig struct {
// Initdata defines the initdata passed into guest when CreateVM
Initdata string
// GPU specific annotations (currently only applicable for Remote Hypervisor)
// DefaultGPUs specifies the number of GPUs required for the Kata VM
DefaultGPUs uint32
// DefaultGPUModel specifies the GPU model, e.g. tesla, h100, radeon.
DefaultGPUModel string
}
// vcpu mapping from vcpu number to thread number

View File

@ -132,6 +132,12 @@ const (
// UseLegacySerial sets legacy serial device for guest console if available and implemented for architecture
UseLegacySerial = kataAnnotHypervisorPrefix + "use_legacy_serial"
// GPU specific annotations used by remote hypervisor for instance selection
// Number of GPUs required in the Kata VM
DefaultGPUs = kataAnnotHypervisorPrefix + "default_gpus"
// GPU model, e.g. tesla, h100, radeon.
DefaultGPUModel = kataAnnotHypervisorPrefix + "default_gpu_model"
//
// CPU Annotations
//

View File

@ -81,6 +81,8 @@ func (rh *remoteHypervisor) CreateVM(ctx context.Context, id string, network Net
annotations[hypannotations.DefaultVCPUs] = strconv.FormatUint(uint64(hypervisorConfig.NumVCPUs()), 10)
annotations[hypannotations.DefaultMemory] = strconv.FormatUint(uint64(hypervisorConfig.MemorySize), 10)
annotations[hypannotations.Initdata] = hypervisorConfig.Initdata
annotations[hypannotations.DefaultGPUs] = strconv.FormatUint(uint64(hypervisorConfig.DefaultGPUs), 10)
annotations[hypannotations.DefaultGPUModel] = hypervisorConfig.DefaultGPUModel
req := &pb.CreateVMRequest{
Id: id,