clh: Increase API and SandboxStop timeouts for TDX

While running tests with `ctr`, I noticed that the API and SandboxStop
timeouts were being hit more frequently than expected.

Until we find the root cause of the issue (which is *not* in Kata
Containers itself), let's increase the timeouts when dealing with a
Confidential Guest.

Fixes: #4978

Signed-off-by: Fabiano Fidêncio <fabiano.fidencio@intel.com>
This commit is contained in:
Fabiano Fidêncio 2022-08-23 15:42:28 +02:00
parent c142fa2541
commit 9f0a57c0eb

View File

@ -66,17 +66,19 @@ const (
const ( const (
// Values are mandatory by http API // Values are mandatory by http API
// Values based on: // Values based on:
clhTimeout = 10 clhTimeout = 10
clhAPITimeout = 1 clhAPITimeout = 1
clhAPITimeoutConfidentialGuest = 10
// Timeout for hot-plug - hotplug devices can take more time, than usual API calls // Timeout for hot-plug - hotplug devices can take more time, than usual API calls
// Use longer time timeout for it. // Use longer time timeout for it.
clhHotPlugAPITimeout = 5 clhHotPlugAPITimeout = 5
clhStopSandboxTimeout = 3 clhStopSandboxTimeout = 3
clhSocket = "clh.sock" clhStopSandboxTimeoutConfidentialGuest = 5
clhAPISocket = "clh-api.sock" clhSocket = "clh.sock"
virtioFsSocket = "virtiofsd.sock" clhAPISocket = "clh-api.sock"
defaultClhPath = "/usr/local/bin/cloud-hypervisor" virtioFsSocket = "virtiofsd.sock"
virtioFsCacheAlways = "always" defaultClhPath = "/usr/local/bin/cloud-hypervisor"
virtioFsCacheAlways = "always"
) )
// Interface that hides the implementation of openAPI client // Interface that hides the implementation of openAPI client
@ -272,6 +274,28 @@ var clhDebugKernelParams = []Param{
// //
//########################################################### //###########################################################
// getClhAPITimeout selects the timeout used for regular Cloud Hypervisor
// API calls.
//
// It yields clhAPITimeoutConfidentialGuest when the hypervisor configuration
// has ConfidentialGuest set, and clhAPITimeout otherwise. The result is the
// bare constant carried in a time.Duration, not a scaled duration: callers
// multiply it by time.Second before handing it to context.WithTimeout.
func (clh *cloudHypervisor) getClhAPITimeout() time.Duration {
	timeout := time.Duration(clhAPITimeout)

	// Confidential Guests get a longer API timeout. The larger value was
	// picked from tests driven through `ctr`; it is meant as a temporary
	// workaround that can hopefully be removed later on.
	if clh.config.ConfidentialGuest {
		timeout = clhAPITimeoutConfidentialGuest
	}

	return timeout
}
// getClhStopSandboxTimeout selects the timeout used while stopping the
// sandbox.
//
// It yields clhStopSandboxTimeoutConfidentialGuest when the hypervisor
// configuration has ConfidentialGuest set, and clhStopSandboxTimeout
// otherwise. The result is the bare constant carried in a time.Duration, not
// a scaled duration: callers either multiply it by time.Second or convert it
// to uint.
func (clh *cloudHypervisor) getClhStopSandboxTimeout() time.Duration {
	timeout := time.Duration(clhStopSandboxTimeout)

	// Confidential Guests get a longer stop timeout. The larger value was
	// picked from tests driven through `ctr`; it is meant as a temporary
	// workaround that can hopefully be removed later on.
	if clh.config.ConfidentialGuest {
		timeout = clhStopSandboxTimeoutConfidentialGuest
	}

	return timeout
}
func (clh *cloudHypervisor) setConfig(config *HypervisorConfig) error { func (clh *cloudHypervisor) setConfig(config *HypervisorConfig) error {
clh.config = *config clh.config = *config
@ -594,7 +618,7 @@ func (clh *cloudHypervisor) StartVM(ctx context.Context, timeout int) error {
span, _ := katatrace.Trace(ctx, clh.Logger(), "StartVM", clhTracingTags, map[string]string{"sandbox_id": clh.id}) span, _ := katatrace.Trace(ctx, clh.Logger(), "StartVM", clhTracingTags, map[string]string{"sandbox_id": clh.id})
defer span.End() defer span.End()
ctx, cancel := context.WithTimeout(context.Background(), clhAPITimeout*time.Second) ctx, cancel := context.WithTimeout(context.Background(), clh.getClhAPITimeout()*time.Second)
defer cancel() defer cancel()
clh.Logger().WithField("function", "StartVM").Info("starting Sandbox") clh.Logger().WithField("function", "StartVM").Info("starting Sandbox")
@ -890,7 +914,7 @@ func (clh *cloudHypervisor) ResizeMemory(ctx context.Context, reqMemMB uint32, m
} }
cl := clh.client() cl := clh.client()
ctx, cancelResize := context.WithTimeout(ctx, clhAPITimeout*time.Second) ctx, cancelResize := context.WithTimeout(ctx, clh.getClhAPITimeout()*time.Second)
defer cancelResize() defer cancelResize()
resize := *chclient.NewVmResize() resize := *chclient.NewVmResize()
@ -935,7 +959,7 @@ func (clh *cloudHypervisor) ResizeVCPUs(ctx context.Context, reqVCPUs uint32) (c
} }
// Resize (hot-plug) vCPUs via HTTP API // Resize (hot-plug) vCPUs via HTTP API
ctx, cancel := context.WithTimeout(ctx, clhAPITimeout*time.Second) ctx, cancel := context.WithTimeout(ctx, clh.getClhAPITimeout()*time.Second)
defer cancel() defer cancel()
resize := *chclient.NewVmResize() resize := *chclient.NewVmResize()
resize.DesiredVcpus = func(i int32) *int32 { return &i }(int32(reqVCPUs)) resize.DesiredVcpus = func(i int32) *int32 { return &i }(int32(reqVCPUs))
@ -1086,9 +1110,9 @@ func (clh *cloudHypervisor) terminate(ctx context.Context, waitOnly bool) (err e
clh.Logger().Debug("Stopping Cloud Hypervisor") clh.Logger().Debug("Stopping Cloud Hypervisor")
if pidRunning && !waitOnly { if pidRunning && !waitOnly {
clhRunning, _ := clh.isClhRunning(clhStopSandboxTimeout) clhRunning, _ := clh.isClhRunning(uint(clh.getClhStopSandboxTimeout()))
if clhRunning { if clhRunning {
ctx, cancel := context.WithTimeout(context.Background(), clhStopSandboxTimeout*time.Second) ctx, cancel := context.WithTimeout(context.Background(), clh.getClhStopSandboxTimeout()*time.Second)
defer cancel() defer cancel()
if _, err = clh.client().ShutdownVMM(ctx); err != nil { if _, err = clh.client().ShutdownVMM(ctx); err != nil {
return err return err
@ -1096,7 +1120,7 @@ func (clh *cloudHypervisor) terminate(ctx context.Context, waitOnly bool) (err e
} }
} }
if err = utils.WaitLocalProcess(pid, clhStopSandboxTimeout, syscall.Signal(0), clh.Logger()); err != nil { if err = utils.WaitLocalProcess(pid, uint(clh.getClhStopSandboxTimeout()), syscall.Signal(0), clh.Logger()); err != nil {
return err return err
} }
@ -1281,7 +1305,7 @@ func (clh *cloudHypervisor) isClhRunning(timeout uint) (bool, error) {
timeStart := time.Now() timeStart := time.Now()
cl := clh.client() cl := clh.client()
for { for {
ctx, cancel := context.WithTimeout(context.Background(), clhAPITimeout*time.Second) ctx, cancel := context.WithTimeout(context.Background(), clh.getClhAPITimeout()*time.Second)
defer cancel() defer cancel()
_, _, err := cl.VmmPingGet(ctx) _, _, err := cl.VmmPingGet(ctx)
if err == nil { if err == nil {
@ -1547,7 +1571,7 @@ func (clh *cloudHypervisor) cleanupVM(force bool) error {
// vmInfo ask to hypervisor for current VM status // vmInfo ask to hypervisor for current VM status
func (clh *cloudHypervisor) vmInfo() (chclient.VmInfo, error) { func (clh *cloudHypervisor) vmInfo() (chclient.VmInfo, error) {
cl := clh.client() cl := clh.client()
ctx, cancelInfo := context.WithTimeout(context.Background(), clhAPITimeout*time.Second) ctx, cancelInfo := context.WithTimeout(context.Background(), clh.getClhAPITimeout()*time.Second)
defer cancelInfo() defer cancelInfo()
info, _, err := cl.VmInfoGet(ctx) info, _, err := cl.VmInfoGet(ctx)