From 57b932c1276ed83bc3a5bb61f005302130742433 Mon Sep 17 00:00:00 2001 From: ChengyuZhu6 Date: Thu, 20 Jul 2023 05:17:16 +0800 Subject: [PATCH] kata-runtime: Add configurable image request timeout Add ImageRequestTimeout field in the config struct, set RequestTimeout by configured image request timeout, add image_request_timeout to default configuration files, add image request timeout to annotations and add image timeout annotation to sandbox config documentation. exp: configure the image request timeout in the configuration: [image] image_request_timeout = 300 configure the image request timeout in the yaml: annotations: "io.katacontainers.config.runtime.image_request_timeout": "300" Fixes: #7389 Signed-off-by: ChengyuZhu6 --- docs/how-to/how-to-set-sandbox-config-kata.md | 1 + src/runtime/Makefile | 4 ++++ src/runtime/config/configuration-clh-tdx.toml.in | 6 ++++++ src/runtime/config/configuration-clh.toml.in | 6 ++++++ src/runtime/config/configuration-qemu-nvidia-gpu.toml.in | 6 ++++++ src/runtime/config/configuration-qemu-se.toml.in | 6 ++++++ src/runtime/config/configuration-qemu-sev.toml.in | 6 ++++++ src/runtime/config/configuration-qemu-snp.toml.in | 6 ++++++ src/runtime/config/configuration-qemu-tdx.toml.in | 6 ++++++ src/runtime/config/configuration-qemu.toml.in | 6 ++++++ src/runtime/config/configuration-remote.toml.in | 6 ++++++ src/runtime/pkg/katautils/config.go | 6 ++++-- src/runtime/pkg/oci/utils.go | 12 +++++++++++- src/runtime/pkg/oci/utils_test.go | 3 +++ src/runtime/virtcontainers/kata_agent.go | 8 ++++++++ .../virtcontainers/pkg/annotations/annotations.go | 3 +++ src/runtime/virtcontainers/sandbox.go | 3 +++ 17 files changed, 91 insertions(+), 3 deletions(-) diff --git a/docs/how-to/how-to-set-sandbox-config-kata.md b/docs/how-to/how-to-set-sandbox-config-kata.md index 1c55f97fc9..30d4504590 100644 --- a/docs/how-to/how-to-set-sandbox-config-kata.md +++ b/docs/how-to/how-to-set-sandbox-config-kata.md @@ -27,6 +27,7 @@ There are several kinds of Kata 
configurations and they are listed below. | `io.katacontainers.config.runtime.internetworking_model` | string| determines how the VM should be connected to the container network interface. Valid values are `macvtap`, `tcfilter` and `none` | | `io.katacontainers.config.runtime.sandbox_cgroup_only`| `boolean` | determines if Kata processes are managed only in sandbox cgroup | | `io.katacontainers.config.runtime.enable_pprof` | `boolean` | enables Golang `pprof` for `containerd-shim-kata-v2` process | +| `io.katacontainers.config.runtime.image_request_timeout` | `uint64` | the timeout for pulling an image within the guest in `seconds`, default is `60` | ## Agent Options | Key | Value Type | Comments | diff --git a/src/runtime/Makefile b/src/runtime/Makefile index 138a5b9c06..6362c98aa7 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -275,6 +275,9 @@ DEFBINDMOUNTS := [] # Image Service Offload DEFSERVICEOFFLOAD ?= false +# Image Request Timeout in seconds +DEFIMAGEREQUESTTIMEOUT ?= 60 + # SEV & SEV-ES Guest Pre-Attestation DEFGUESTPREATTESTATION ?= false DEFGUESTPREATTESTATIONPROXY ?= localhost:44444 @@ -705,6 +708,7 @@ USER_VARS += DEFSTATICRESOURCEMGMT_FC USER_VARS += DEFSTATICRESOURCEMGMT_TEE USER_VARS += DEFBINDMOUNTS USER_VARS += DEFSERVICEOFFLOAD +USER_VARS += DEFIMAGEREQUESTTIMEOUT USER_VARS += DEFVFIOMODE USER_VARS += BUILDFLAGS USER_VARS += DEFSERVICEOFFLOAD diff --git a/src/runtime/config/configuration-clh-tdx.toml.in b/src/runtime/config/configuration-clh-tdx.toml.in index 6d9f045419..c705478345 100644 --- a/src/runtime/config/configuration-clh-tdx.toml.in +++ b/src/runtime/config/configuration-clh-tdx.toml.in @@ -424,6 +424,12 @@ experimental=@DEFAULTEXPFEATURES@ # (default: false) service_offload = @DEFSERVICEOFFLOAD@ +# Image request timeout in seconds. 
+# If specified, indicates the image request timeout in the guest needed for the workload(s) +# If unspecified then it will be set @DEFIMAGEREQUESTTIMEOUT@ second(s) +# to reduce image pull failures caused by network problems, and quickly obtain request failure information at the same time. +image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@ + # Container image decryption keys provisioning. # Applies only if service_offload is true. # Keys can be provisioned locally (e.g. through a special command or diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in index 1cc8b54dc9..826fc29d72 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -443,6 +443,12 @@ experimental=@DEFAULTEXPFEATURES@ # (default: false) service_offload = @DEFSERVICEOFFLOAD@ +# Image request timeout in seconds. +# If specified, indicates the image request timeout in the guest needed for the workload(s) +# If unspecified then it will be set @DEFIMAGEREQUESTTIMEOUT@ second(s) +# to reduce image pull failures caused by network problems, and quickly obtain request failure information at the same time. +image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@ + # Container image decryption keys provisioning. # Applies only if service_offload is true. # Keys can be provisioned locally (e.g. through a special command or diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in index 4861cb1ed6..116576ac36 100644 --- a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in +++ b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in @@ -678,6 +678,12 @@ experimental=@DEFAULTEXPFEATURES@ # (default: false) #service_offload = true +# Image request timeout in seconds. 
+# If specified, indicates the image request timeout in the guest needed for the workload(s) +# If unspecified then it will be set @DEFIMAGEREQUESTTIMEOUT@ second(s) +# to reduce image pull failures caused by network problems, and quickly obtain request failure information at the same time. +image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@ + # Container image decryption keys provisioning. # Applies only if service_offload is true. # Keys can be provisioned locally (e.g. through a special command or diff --git a/src/runtime/config/configuration-qemu-se.toml.in b/src/runtime/config/configuration-qemu-se.toml.in index 6c574cd0bd..b59fc7653d 100644 --- a/src/runtime/config/configuration-qemu-se.toml.in +++ b/src/runtime/config/configuration-qemu-se.toml.in @@ -654,6 +654,12 @@ experimental=@DEFAULTEXPFEATURES@ # (default: false) service_offload = @DEFSERVICEOFFLOAD@ +# Image request timeout in seconds. +# If specified, indicates the image request timeout in the guest needed for the workload(s) +# If unspecified then it will be set @DEFIMAGEREQUESTTIMEOUT@ second(s) +# to reduce image pull failures caused by network problems and quickly obtain request failure information at the same time. +image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@ + # Container image decryption keys provisioning. # Applies only if service_offload is true. # Keys can be provisioned locally (e.g. through a special command or diff --git a/src/runtime/config/configuration-qemu-sev.toml.in b/src/runtime/config/configuration-qemu-sev.toml.in index db373e6cc6..557dfd67e3 100644 --- a/src/runtime/config/configuration-qemu-sev.toml.in +++ b/src/runtime/config/configuration-qemu-sev.toml.in @@ -658,6 +658,12 @@ experimental=@DEFAULTEXPFEATURES@ # (default: false) service_offload = @DEFSERVICEOFFLOAD@ +# Image request timeout in seconds. 
+# If specified, indicates the image request timeout in the guest needed for the workload(s) +# If unspecified then it will be set @DEFIMAGEREQUESTTIMEOUT@ second(s) +# to reduce image pull failures caused by network problems and quickly obtain request failure information at the same time. +image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@ + # Container image decryption keys provisioning. # Applies only if service_offload is true. # Keys can be provisioned locally (e.g. through a special command or diff --git a/src/runtime/config/configuration-qemu-snp.toml.in b/src/runtime/config/configuration-qemu-snp.toml.in index f7aa09678c..7fecf527b7 100644 --- a/src/runtime/config/configuration-qemu-snp.toml.in +++ b/src/runtime/config/configuration-qemu-snp.toml.in @@ -683,6 +683,12 @@ experimental=@DEFAULTEXPFEATURES@ # (default: false) service_offload = @DEFSERVICEOFFLOAD@ +# Image request timeout in seconds. +# If specified, indicates the image request timeout in the guest needed for the workload(s) +# If unspecified then it will be set @DEFIMAGEREQUESTTIMEOUT@ second(s) +# to reduce image pull failures caused by network problems and quickly obtain request failure information at the same time. +image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@ + # Container image decryption keys provisioning. # Applies only if service_offload is true. # Keys can be provisioned locally (e.g. through a special command or diff --git a/src/runtime/config/configuration-qemu-tdx.toml.in b/src/runtime/config/configuration-qemu-tdx.toml.in index e679677c8c..182ff4e6c4 100644 --- a/src/runtime/config/configuration-qemu-tdx.toml.in +++ b/src/runtime/config/configuration-qemu-tdx.toml.in @@ -671,6 +671,12 @@ experimental=@DEFAULTEXPFEATURES@ # (default: false) service_offload = true +# Image request timeout in seconds. 
+# If specified, indicates the image request timeout in the guest needed for the workload(s) +# If unspecified then it will be set @DEFIMAGEREQUESTTIMEOUT@ second(s) +# to reduce image pull failures caused by network problems and quickly obtain request failure information at the same time. +image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@ + # Container image decryption keys provisioning. # Applies only if service_offload is true. # Keys can be provisioned locally (e.g. through a special command or diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index 8d34386a35..e0f92726e6 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -718,6 +718,12 @@ experimental=@DEFAULTEXPFEATURES@ # (default: false) service_offload = @DEFSERVICEOFFLOAD@ +# Image request timeout in seconds. +# If specified, indicates the image request timeout in the guest needed for the workload(s) +# If unspecified then it will be set @DEFIMAGEREQUESTTIMEOUT@ second(s) +# to reduce image pull failures caused by network problems and quickly obtain request failure information at the same time. +image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@ + # Container image decryption keys provisioning. # Applies only if service_offload is true. # Keys can be provisioned locally (e.g. through a special command or diff --git a/src/runtime/config/configuration-remote.toml.in b/src/runtime/config/configuration-remote.toml.in index 4c75af275e..5a42096cc9 100644 --- a/src/runtime/config/configuration-remote.toml.in +++ b/src/runtime/config/configuration-remote.toml.in @@ -299,6 +299,12 @@ experimental=@DEFAULTEXPFEATURES@ # Note: The remote hypervisor offloads the pulling on images on the peer pod VM, so requries this to be true service_offload = true +# Image request timeout in seconds. 
+# If specified, indicates the image request timeout in the guest needed for the workload(s) +# If unspecified then it will be set @DEFIMAGEREQUESTTIMEOUT@ second(s) +# to reduce image pull failures caused by network problems and quickly obtain request failure information at the same time. +image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@ + # Container image decryption keys provisioning. # Applies only if service_offload is true. # Keys can be provisioned locally (e.g. through a special command or diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 1397954aa5..a485cac814 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -66,8 +66,9 @@ type tomlConfig struct { } type image struct { - Provision string `toml:"provision"` - ServiceOffload bool `toml:"service_offload"` + Provision string `toml:"provision"` + ServiceOffload bool `toml:"service_offload"` + ImageRequestTimeout uint64 `toml:"image_request_timeout"` } type factory struct { @@ -1456,6 +1457,7 @@ func LoadConfiguration(configPath string, ignoreLogging bool) (resolvedConfigPat config.JaegerUser = tomlConf.Runtime.JaegerUser config.JaegerPassword = tomlConf.Runtime.JaegerPassword config.ServiceOffload = tomlConf.Image.ServiceOffload + config.ImageRequestTimeout = tomlConf.Image.ImageRequestTimeout for _, f := range tomlConf.Runtime.Experimental { feature := exp.Get(f) if feature == nil { diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index 7881227a1a..7399e017a5 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -157,6 +157,10 @@ type RuntimeConfig struct { // Offload the CRI image management service to the Kata agent. 
ServiceOffload bool + + // Image request timeout which, if provided, indicates the image request timeout + // in the guest needed for the workload(s) + ImageRequestTimeout uint64 } // AddKernelParam allows the addition of new kernel parameters to an existing @@ -915,7 +919,11 @@ func addRuntimeConfigOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, r value, vcAnnotations.VfioMode) } } - + if err := newAnnotationConfiguration(ocispec, vcAnnotations.ImageRequestTimeout).setUint(func(imageRequestTimeout uint64) { + sbConfig.ImageRequestTimeout = imageRequestTimeout + }); err != nil { + return err + } return nil } @@ -1029,6 +1037,8 @@ func SandboxConfig(ocispec specs.Spec, runtime RuntimeConfig, bundlePath, cid st Experimental: runtime.Experimental, ServiceOffload: runtime.ServiceOffload, + + ImageRequestTimeout: runtime.ImageRequestTimeout, } if err := addAnnotations(ocispec, &sandboxConfig, runtime); err != nil { diff --git a/src/runtime/pkg/oci/utils_test.go b/src/runtime/pkg/oci/utils_test.go index b27ba7f8ee..7b53d4dc66 100644 --- a/src/runtime/pkg/oci/utils_test.go +++ b/src/runtime/pkg/oci/utils_test.go @@ -812,12 +812,15 @@ func TestAddRuntimeAnnotations(t *testing.T) { ocispec.Annotations[vcAnnotations.SandboxCgroupOnly] = "true" ocispec.Annotations[vcAnnotations.DisableNewNetNs] = "true" ocispec.Annotations[vcAnnotations.InterNetworkModel] = "macvtap" + ocispec.Annotations[vcAnnotations.ImageRequestTimeout] = "100" addAnnotations(ocispec, &config, runtimeConfig) assert.Equal(config.DisableGuestSeccomp, true) assert.Equal(config.SandboxCgroupOnly, true) assert.Equal(config.NetworkConfig.DisableNewNetwork, true) assert.Equal(config.NetworkConfig.InterworkingModel, vc.NetXConnectMacVtapModel) + assert.Equal(config.ImageRequestTimeout, uint64(100)) + } func TestRegexpContains(t *testing.T) { diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index 3afb2fcebf..b5061806e0 100644 --- 
a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -83,6 +83,7 @@ type customRequestTimeoutKeyType struct{} var ( checkRequestTimeout = 30 * time.Second defaultRequestTimeout = 60 * time.Second + imageRequestTimeout = 60 * time.Second remoteRequestTimeout = 300 * time.Second customRequestTimeoutKey = customRequestTimeoutKeyType(struct{}{}) errorMissingOCISpec = errors.New("Missing OCI specification") @@ -364,6 +365,11 @@ func (k *kataAgent) init(ctx context.Context, sandbox *Sandbox, config KataAgent k.kmodules = config.KernelModules k.dialTimout = config.DialTimeout + imageRequestTimeout = time.Duration(sandbox.config.ImageRequestTimeout) * time.Second + k.Logger().WithFields(logrus.Fields{ + "imageRequestTimeout": fmt.Sprintf("%+v", imageRequestTimeout), + }).Info("The imageRequestTimeout has been set ") + return disableVMShutdown, nil } @@ -2089,6 +2095,8 @@ func (k *kataAgent) getReqContext(ctx context.Context, reqName string) (newCtx c // Wait and GetOOMEvent have no timeout case grpcCheckRequest: newCtx, cancel = context.WithTimeout(ctx, checkRequestTimeout) + case grpcPullImageRequest: + newCtx, cancel = context.WithTimeout(ctx, imageRequestTimeout) default: var requestTimeout = defaultRequestTimeout diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go index b9a8dfc109..9a1811254a 100644 --- a/src/runtime/virtcontainers/pkg/annotations/annotations.go +++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go @@ -292,6 +292,9 @@ const ( // VfioMode is a sandbox annotation to specify how attached VFIO devices should be treated // Overrides the runtime.vfio_mode parameter in the global configuration.toml VfioMode = kataAnnotRuntimePrefix + "vfio_mode" + + // ImageRequestTimeout is a sandbox annotation that sets the image pull timeout in the guest. 
+ ImageRequestTimeout = kataAnnotRuntimePrefix + "image_request_timeout" ) // Agent related annotations diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index a0c0780128..bdd6f296c2 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -161,6 +161,9 @@ type SandboxConfig struct { StaticResourceMgmt bool // Offload the CRI image management service to the Kata agent. ServiceOffload bool + // Image request timeout which, if provided, indicates the image request timeout + // in the guest needed for the workload(s) + ImageRequestTimeout uint64 // SharePidNs sets all containers to share the same sandbox level pid namespace. SharePidNs bool // SystemdCgroup enables systemd cgroup support