From 57b932c1276ed83bc3a5bb61f005302130742433 Mon Sep 17 00:00:00 2001
From: ChengyuZhu6 <chengyu.zhu@intel.com>
Date: Thu, 20 Jul 2023 05:17:16 +0800
Subject: [PATCH] kata-runtime: Add configurable image request timeout

Add ImageRequestTimeout field in the config struct, set RequestTimeout
by configured image request timeout, add image_request_timeout to
default configuration files, add image request timeout to annotations
and add image timeout annotation to sandbox config documentation.

Example:

configure the image request timeout in the configuration:
[image]
image_request_timeout = 300

configure the image request timeout in the yaml:
annotations:
      "io.katacontainers.config.runtime.image_request_timeout": "300"

Fixes: #7389

Signed-off-by: ChengyuZhu6 <chengyu.zhu@intel.com>
---
 docs/how-to/how-to-set-sandbox-config-kata.md        |  1 +
 src/runtime/Makefile                                 |  4 ++++
 src/runtime/config/configuration-clh-tdx.toml.in     |  6 ++++++
 src/runtime/config/configuration-clh.toml.in         |  6 ++++++
 .../config/configuration-qemu-nvidia-gpu.toml.in     |  6 ++++++
 src/runtime/config/configuration-qemu-se.toml.in     |  6 ++++++
 src/runtime/config/configuration-qemu-sev.toml.in    |  6 ++++++
 src/runtime/config/configuration-qemu-snp.toml.in    |  6 ++++++
 src/runtime/config/configuration-qemu-tdx.toml.in    |  6 ++++++
 src/runtime/config/configuration-qemu.toml.in        |  6 ++++++
 src/runtime/config/configuration-remote.toml.in      |  6 ++++++
 src/runtime/pkg/katautils/config.go                  |  6 ++++--
 src/runtime/pkg/oci/utils.go                         | 12 +++++++++++-
 src/runtime/pkg/oci/utils_test.go                    |  3 +++
 src/runtime/virtcontainers/kata_agent.go             |  8 ++++++++
 .../virtcontainers/pkg/annotations/annotations.go    |  3 +++
 src/runtime/virtcontainers/sandbox.go                |  3 +++
 17 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/docs/how-to/how-to-set-sandbox-config-kata.md b/docs/how-to/how-to-set-sandbox-config-kata.md
index 1c55f97fc9..30d4504590 100644
--- a/docs/how-to/how-to-set-sandbox-config-kata.md
+++ b/docs/how-to/how-to-set-sandbox-config-kata.md
@@ -27,6 +27,7 @@ There are several kinds of Kata configurations and they are listed below.
 | `io.katacontainers.config.runtime.internetworking_model` | string| determines how the VM should be connected to the container network interface. Valid values are `macvtap`, `tcfilter` and `none` |
 | `io.katacontainers.config.runtime.sandbox_cgroup_only`| `boolean` | determines if Kata processes are managed only in sandbox cgroup |
 | `io.katacontainers.config.runtime.enable_pprof` | `boolean` | enables Golang `pprof` for `containerd-shim-kata-v2` process |
+| `io.katacontainers.config.runtime.image_request_timeout` | `uint64` | the timeout for pulling an image within the guest in `seconds`, default is `60` |
 
 ## Agent Options
 | Key | Value Type | Comments |
diff --git a/src/runtime/Makefile b/src/runtime/Makefile
index 138a5b9c06..6362c98aa7 100644
--- a/src/runtime/Makefile
+++ b/src/runtime/Makefile
@@ -275,6 +275,9 @@ DEFBINDMOUNTS := []
 # Image Service Offload
 DEFSERVICEOFFLOAD ?= false
 
+# Image Request Timeout in seconds
+DEFIMAGEREQUESTTIMEOUT ?= 60
+
 # SEV & SEV-ES Guest Pre-Attestation
 DEFGUESTPREATTESTATION ?= false
 DEFGUESTPREATTESTATIONPROXY ?= localhost:44444
@@ -705,6 +708,7 @@ USER_VARS += DEFSTATICRESOURCEMGMT_FC
 USER_VARS += DEFSTATICRESOURCEMGMT_TEE
 USER_VARS += DEFBINDMOUNTS
 USER_VARS += DEFSERVICEOFFLOAD
+USER_VARS += DEFIMAGEREQUESTTIMEOUT
 USER_VARS += DEFVFIOMODE
 USER_VARS += BUILDFLAGS
 USER_VARS += DEFSERVICEOFFLOAD
diff --git a/src/runtime/config/configuration-clh-tdx.toml.in b/src/runtime/config/configuration-clh-tdx.toml.in
index 6d9f045419..c705478345 100644
--- a/src/runtime/config/configuration-clh-tdx.toml.in
+++ b/src/runtime/config/configuration-clh-tdx.toml.in
@@ -424,6 +424,12 @@ experimental=@DEFAULTEXPFEATURES@
 # (default: false)
 service_offload = @DEFSERVICEOFFLOAD@
 
+# Image request timeout in seconds.
+# If specified, indicates the image request timeout in the guest needed for the workload(s).
+# If unspecified, it will be set to @DEFIMAGEREQUESTTIMEOUT@ second(s)
+# to reduce image pull failures caused by network problems, while still obtaining request failure information quickly.
+image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@
+
 # Container image decryption keys provisioning.
 # Applies only if service_offload is true.
 # Keys can be provisioned locally (e.g. through a special command or
diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in
index 1cc8b54dc9..826fc29d72 100644
--- a/src/runtime/config/configuration-clh.toml.in
+++ b/src/runtime/config/configuration-clh.toml.in
@@ -443,6 +443,12 @@ experimental=@DEFAULTEXPFEATURES@
 # (default: false)
 service_offload = @DEFSERVICEOFFLOAD@
 
+# Image request timeout in seconds.
+# If specified, indicates the image request timeout in the guest needed for the workload(s).
+# If unspecified, it will be set to @DEFIMAGEREQUESTTIMEOUT@ second(s)
+# to reduce image pull failures caused by network problems, while still obtaining request failure information quickly.
+image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@
+
 # Container image decryption keys provisioning.
 # Applies only if service_offload is true.
 # Keys can be provisioned locally (e.g. through a special command or
diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in
index 4861cb1ed6..116576ac36 100644
--- a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in
+++ b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in
@@ -678,6 +678,12 @@ experimental=@DEFAULTEXPFEATURES@
 # (default: false)
 #service_offload = true
 
+# Image request timeout in seconds.
+# If specified, indicates the image request timeout in the guest needed for the workload(s).
+# If unspecified, it will be set to @DEFIMAGEREQUESTTIMEOUT@ second(s)
+# to reduce image pull failures caused by network problems, while still obtaining request failure information quickly.
+image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@
+
 # Container image decryption keys provisioning.
 # Applies only if service_offload is true.
 # Keys can be provisioned locally (e.g. through a special command or
diff --git a/src/runtime/config/configuration-qemu-se.toml.in b/src/runtime/config/configuration-qemu-se.toml.in
index 6c574cd0bd..b59fc7653d 100644
--- a/src/runtime/config/configuration-qemu-se.toml.in
+++ b/src/runtime/config/configuration-qemu-se.toml.in
@@ -654,6 +654,12 @@ experimental=@DEFAULTEXPFEATURES@
 # (default: false)
 service_offload = @DEFSERVICEOFFLOAD@
 
+# Image request timeout in seconds.
+# If specified, indicates the image request timeout in the guest needed for the workload(s).
+# If unspecified, it will be set to @DEFIMAGEREQUESTTIMEOUT@ second(s)
+# to reduce image pull failures caused by network problems, while still obtaining request failure information quickly.
+image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@
+
 # Container image decryption keys provisioning.
 # Applies only if service_offload is true.
 # Keys can be provisioned locally (e.g. through a special command or
diff --git a/src/runtime/config/configuration-qemu-sev.toml.in b/src/runtime/config/configuration-qemu-sev.toml.in
index db373e6cc6..557dfd67e3 100644
--- a/src/runtime/config/configuration-qemu-sev.toml.in
+++ b/src/runtime/config/configuration-qemu-sev.toml.in
@@ -658,6 +658,12 @@ experimental=@DEFAULTEXPFEATURES@
 # (default: false)
 service_offload = @DEFSERVICEOFFLOAD@
 
+# Image request timeout in seconds.
+# If specified, indicates the image request timeout in the guest needed for the workload(s).
+# If unspecified, it will be set to @DEFIMAGEREQUESTTIMEOUT@ second(s)
+# to reduce image pull failures caused by network problems, while still obtaining request failure information quickly.
+image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@
+
 # Container image decryption keys provisioning.
 # Applies only if service_offload is true.
 # Keys can be provisioned locally (e.g. through a special command or
diff --git a/src/runtime/config/configuration-qemu-snp.toml.in b/src/runtime/config/configuration-qemu-snp.toml.in
index f7aa09678c..7fecf527b7 100644
--- a/src/runtime/config/configuration-qemu-snp.toml.in
+++ b/src/runtime/config/configuration-qemu-snp.toml.in
@@ -683,6 +683,12 @@ experimental=@DEFAULTEXPFEATURES@
 # (default: false)
 service_offload = @DEFSERVICEOFFLOAD@
 
+# Image request timeout in seconds.
+# If specified, indicates the image request timeout in the guest needed for the workload(s).
+# If unspecified, it will be set to @DEFIMAGEREQUESTTIMEOUT@ second(s)
+# to reduce image pull failures caused by network problems, while still obtaining request failure information quickly.
+image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@
+
 # Container image decryption keys provisioning.
 # Applies only if service_offload is true.
 # Keys can be provisioned locally (e.g. through a special command or
diff --git a/src/runtime/config/configuration-qemu-tdx.toml.in b/src/runtime/config/configuration-qemu-tdx.toml.in
index e679677c8c..182ff4e6c4 100644
--- a/src/runtime/config/configuration-qemu-tdx.toml.in
+++ b/src/runtime/config/configuration-qemu-tdx.toml.in
@@ -671,6 +671,12 @@ experimental=@DEFAULTEXPFEATURES@
 # (default: false)
 service_offload = true
 
+# Image request timeout in seconds.
+# If specified, indicates the image request timeout in the guest needed for the workload(s).
+# If unspecified, it will be set to @DEFIMAGEREQUESTTIMEOUT@ second(s)
+# to reduce image pull failures caused by network problems, while still obtaining request failure information quickly.
+image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@
+
 # Container image decryption keys provisioning.
 # Applies only if service_offload is true.
 # Keys can be provisioned locally (e.g. through a special command or
diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in
index 8d34386a35..e0f92726e6 100644
--- a/src/runtime/config/configuration-qemu.toml.in
+++ b/src/runtime/config/configuration-qemu.toml.in
@@ -718,6 +718,12 @@ experimental=@DEFAULTEXPFEATURES@
 # (default: false)
 service_offload = @DEFSERVICEOFFLOAD@
 
+# Image request timeout in seconds.
+# If specified, indicates the image request timeout in the guest needed for the workload(s).
+# If unspecified, it will be set to @DEFIMAGEREQUESTTIMEOUT@ second(s)
+# to reduce image pull failures caused by network problems, while still obtaining request failure information quickly.
+image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@
+
 # Container image decryption keys provisioning.
 # Applies only if service_offload is true.
 # Keys can be provisioned locally (e.g. through a special command or
diff --git a/src/runtime/config/configuration-remote.toml.in b/src/runtime/config/configuration-remote.toml.in
index 4c75af275e..5a42096cc9 100644
--- a/src/runtime/config/configuration-remote.toml.in
+++ b/src/runtime/config/configuration-remote.toml.in
@@ -299,6 +299,12 @@ experimental=@DEFAULTEXPFEATURES@
 # Note: The remote hypervisor offloads the pulling on images on the peer pod VM, so requries this to be true
 service_offload = true
 
+# Image request timeout in seconds.
+# If specified, indicates the image request timeout in the guest needed for the workload(s).
+# If unspecified, it will be set to @DEFIMAGEREQUESTTIMEOUT@ second(s)
+# to reduce image pull failures caused by network problems, while still obtaining request failure information quickly.
+image_request_timeout = @DEFIMAGEREQUESTTIMEOUT@
+
 # Container image decryption keys provisioning.
 # Applies only if service_offload is true.
 # Keys can be provisioned locally (e.g. through a special command or
diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go
index 1397954aa5..a485cac814 100644
--- a/src/runtime/pkg/katautils/config.go
+++ b/src/runtime/pkg/katautils/config.go
@@ -66,8 +66,9 @@ type tomlConfig struct {
 }
 
 type image struct {
-	Provision      string `toml:"provision"`
-	ServiceOffload bool   `toml:"service_offload"`
+	Provision           string `toml:"provision"`
+	ServiceOffload      bool   `toml:"service_offload"`
+	ImageRequestTimeout uint64 `toml:"image_request_timeout"`
 }
 
 type factory struct {
@@ -1456,6 +1457,7 @@ func LoadConfiguration(configPath string, ignoreLogging bool) (resolvedConfigPat
 	config.JaegerUser = tomlConf.Runtime.JaegerUser
 	config.JaegerPassword = tomlConf.Runtime.JaegerPassword
 	config.ServiceOffload = tomlConf.Image.ServiceOffload
+	config.ImageRequestTimeout = tomlConf.Image.ImageRequestTimeout
 	for _, f := range tomlConf.Runtime.Experimental {
 		feature := exp.Get(f)
 		if feature == nil {
diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go
index 7881227a1a..7399e017a5 100644
--- a/src/runtime/pkg/oci/utils.go
+++ b/src/runtime/pkg/oci/utils.go
@@ -157,6 +157,10 @@ type RuntimeConfig struct {
 
 	// Offload the CRI image management service to the Kata agent.
 	ServiceOffload bool
+
+	// ImageRequestTimeout is the timeout, in seconds, applied to image pull
+	// requests issued to the guest for the workload(s).
+	ImageRequestTimeout uint64
 }
 
 // AddKernelParam allows the addition of new kernel parameters to an existing
@@ -915,7 +919,11 @@ func addRuntimeConfigOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, r
 				value, vcAnnotations.VfioMode)
 		}
 	}
-
+	if err := newAnnotationConfiguration(ocispec, vcAnnotations.ImageRequestTimeout).setUint(func(imageRequestTimeout uint64) {
+		sbConfig.ImageRequestTimeout = imageRequestTimeout
+	}); err != nil {
+		return err
+	}
 	return nil
 }
 
@@ -1029,6 +1037,8 @@ func SandboxConfig(ocispec specs.Spec, runtime RuntimeConfig, bundlePath, cid st
 		Experimental: runtime.Experimental,
 
 		ServiceOffload: runtime.ServiceOffload,
+
+		ImageRequestTimeout: runtime.ImageRequestTimeout,
 	}
 
 	if err := addAnnotations(ocispec, &sandboxConfig, runtime); err != nil {
diff --git a/src/runtime/pkg/oci/utils_test.go b/src/runtime/pkg/oci/utils_test.go
index b27ba7f8ee..7b53d4dc66 100644
--- a/src/runtime/pkg/oci/utils_test.go
+++ b/src/runtime/pkg/oci/utils_test.go
@@ -812,12 +812,15 @@ func TestAddRuntimeAnnotations(t *testing.T) {
 	ocispec.Annotations[vcAnnotations.SandboxCgroupOnly] = "true"
 	ocispec.Annotations[vcAnnotations.DisableNewNetNs] = "true"
 	ocispec.Annotations[vcAnnotations.InterNetworkModel] = "macvtap"
+	ocispec.Annotations[vcAnnotations.ImageRequestTimeout] = "100"
 
 	addAnnotations(ocispec, &config, runtimeConfig)
 	assert.Equal(config.DisableGuestSeccomp, true)
 	assert.Equal(config.SandboxCgroupOnly, true)
 	assert.Equal(config.NetworkConfig.DisableNewNetwork, true)
 	assert.Equal(config.NetworkConfig.InterworkingModel, vc.NetXConnectMacVtapModel)
+	assert.Equal(config.ImageRequestTimeout, uint64(100))
+
 }
 
 func TestRegexpContains(t *testing.T) {
diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go
index 3afb2fcebf..b5061806e0 100644
--- a/src/runtime/virtcontainers/kata_agent.go
+++ b/src/runtime/virtcontainers/kata_agent.go
@@ -83,6 +83,7 @@ type customRequestTimeoutKeyType struct{}
 var (
 	checkRequestTimeout           = 30 * time.Second
 	defaultRequestTimeout         = 60 * time.Second
+	imageRequestTimeout           = 60 * time.Second
 	remoteRequestTimeout          = 300 * time.Second
 	customRequestTimeoutKey       = customRequestTimeoutKeyType(struct{}{})
 	errorMissingOCISpec           = errors.New("Missing OCI specification")
@@ -364,6 +365,11 @@ func (k *kataAgent) init(ctx context.Context, sandbox *Sandbox, config KataAgent
 	k.kmodules = config.KernelModules
 	k.dialTimout = config.DialTimeout
 
+	imageRequestTimeout = time.Duration(sandbox.config.ImageRequestTimeout) * time.Second
+	k.Logger().WithFields(logrus.Fields{
+		"imageRequestTimeout": fmt.Sprintf("%+v", imageRequestTimeout),
+	}).Info("The imageRequestTimeout has been set ")
+
 	return disableVMShutdown, nil
 }
 
@@ -2089,6 +2095,8 @@ func (k *kataAgent) getReqContext(ctx context.Context, reqName string) (newCtx c
 		// Wait and GetOOMEvent have no timeout
 	case grpcCheckRequest:
 		newCtx, cancel = context.WithTimeout(ctx, checkRequestTimeout)
+	case grpcPullImageRequest:
+		newCtx, cancel = context.WithTimeout(ctx, imageRequestTimeout)
 	default:
 		var requestTimeout = defaultRequestTimeout
 
diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go
index b9a8dfc109..9a1811254a 100644
--- a/src/runtime/virtcontainers/pkg/annotations/annotations.go
+++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go
@@ -292,6 +292,9 @@ const (
 	// VfioMode is a sandbox annotation to specify how attached VFIO devices should be treated
 	// Overrides the runtime.vfio_mode parameter in the global configuration.toml
 	VfioMode = kataAnnotRuntimePrefix + "vfio_mode"
+
+	// ImageRequestTimeout is a sandbox annotation that sets the image pull timeout in the guest.
+	ImageRequestTimeout = kataAnnotRuntimePrefix + "image_request_timeout"
 )
 
 // Agent related annotations
diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go
index a0c0780128..bdd6f296c2 100644
--- a/src/runtime/virtcontainers/sandbox.go
+++ b/src/runtime/virtcontainers/sandbox.go
@@ -161,6 +161,9 @@ type SandboxConfig struct {
 	StaticResourceMgmt bool
 	// Offload the CRI image management service to the Kata agent.
 	ServiceOffload bool
+	// ImageRequestTimeout is the timeout, in seconds, applied to image pull
+	// requests issued to the guest for the workload(s).
+	ImageRequestTimeout uint64
 	// SharePidNs sets all containers to share the same sandbox level pid namespace.
 	SharePidNs bool
 	// SystemdCgroup enables systemd cgroup support