From cb6b7667cdf4299d4ef73b4a5350d0179c27e802 Mon Sep 17 00:00:00 2001
From: Hui Zhu <teawater@antfin.com>
Date: Thu, 8 Jul 2021 18:10:39 +0800
Subject: [PATCH] runtime: Add option "enable_guest_swap" to config
 hypervisor.qemu

This commit adds the option "enable_guest_swap" to the hypervisor.qemu
config section. It enables swap in the guest and defaults to false.
When enable_guest_swap is enabled, a raw file is inserted into the guest
as the swap device if the swappiness of a container (set by annotation
"io.katacontainers.container.resource.swappiness") is greater than 0.
The size of the swap device is
swap_in_bytes (set by annotation
"io.katacontainers.container.resource.swap_in_bytes") - memory_limit_in_bytes.
If swap_in_bytes is not set, the size is memory_limit_in_bytes.
If neither swap_in_bytes nor memory_limit_in_bytes is set, the size is
default_memory.

Fixes: #2201

Signed-off-by: Hui Zhu <teawater@antfin.com>
---
 .../cli/config/configuration-qemu.toml.in     | 11 +++
 .../pkg/katautils/config-settings.go.in       |  1 +
 src/runtime/pkg/katautils/config.go           |  3 +
 src/runtime/virtcontainers/hypervisor.go      |  3 +
 .../pkg/annotations/annotations.go            |  3 +
 src/runtime/virtcontainers/pkg/oci/utils.go   |  7 ++
 src/runtime/virtcontainers/sandbox.go         | 89 +++++++++++++++++--
 src/runtime/virtcontainers/sandbox_test.go    |  6 +-
 8 files changed, 113 insertions(+), 10 deletions(-)

diff --git a/src/runtime/cli/config/configuration-qemu.toml.in b/src/runtime/cli/config/configuration-qemu.toml.in
index 19fa2b3b65..2113a527ee 100644
--- a/src/runtime/cli/config/configuration-qemu.toml.in
+++ b/src/runtime/cli/config/configuration-qemu.toml.in
@@ -356,6 +356,17 @@ valid_entropy_sources = @DEFVALIDENTROPYSOURCES@
 # See: https://www.qemu.org/docs/master/qemu-qmp-ref.html#Dump-guest-memory for details
 #guest_memory_dump_paging=false
 
+# Enable swap in the guest. Default false.
+# When enable_guest_swap is enabled, a raw file is inserted into the guest as the swap device
+# if the swappiness of a container (set by annotation "io.katacontainers.container.resource.swappiness")
+# is greater than 0.
+# The size of the swap device is
+# swap_in_bytes (set by annotation "io.katacontainers.container.resource.swap_in_bytes") - memory_limit_in_bytes.
+# If swap_in_bytes is not set, the size is memory_limit_in_bytes.
+# If neither swap_in_bytes nor memory_limit_in_bytes is set, the size is
+# default_memory.
+#enable_guest_swap = true
+
 [factory]
 # VM templating support. Once enabled, new VMs are created from template
 # using vm cloning. They will share the same initial kernel, initramfs and
diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in
index e92d8741c9..6231ddd393 100644
--- a/src/runtime/pkg/katautils/config-settings.go.in
+++ b/src/runtime/pkg/katautils/config-settings.go.in
@@ -55,6 +55,7 @@ const defaultVhostUserStorePath string = "/var/run/kata-containers/vhost-user/"
 const defaultRxRateLimiterMaxRate = uint64(0)
 const defaultTxRateLimiterMaxRate = uint64(0)
 const defaultConfidentialGuest = false
+const defaultGuestSwap = false
 
 var defaultSGXEPCSize = int64(0)
 
diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go
index fc9e5a013b..41ff6c3bdf 100644
--- a/src/runtime/pkg/katautils/config.go
+++ b/src/runtime/pkg/katautils/config.go
@@ -133,6 +133,7 @@ type hypervisor struct {
 	DisableVhostNet         bool     `toml:"disable_vhost_net"`
 	GuestMemoryDumpPaging   bool     `toml:"guest_memory_dump_paging"`
 	ConfidentialGuest       bool     `toml:"confidential_guest"`
+	GuestSwap               bool     `toml:"enable_guest_swap"`
 }
 
 type runtime struct {
@@ -711,6 +712,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
 		GuestMemoryDumpPath:     h.GuestMemoryDumpPath,
 		GuestMemoryDumpPaging:   h.GuestMemoryDumpPaging,
 		ConfidentialGuest:       h.ConfidentialGuest,
+		GuestSwap:               h.GuestSwap,
 	}, nil
 }
 
@@ -1066,6 +1068,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
 		TxRateLimiterMaxRate:    defaultTxRateLimiterMaxRate,
 		SGXEPCSize:              defaultSGXEPCSize,
 		ConfidentialGuest:       defaultConfidentialGuest,
+		GuestSwap:               defaultGuestSwap,
 	}
 }
 
diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go
index 2c75d7f4a5..525d77ceb9 100644
--- a/src/runtime/virtcontainers/hypervisor.go
+++ b/src/runtime/virtcontainers/hypervisor.go
@@ -458,6 +458,9 @@ type HypervisorConfig struct {
 
 	// MemOffset specifies memory space for nvdimm device
 	MemOffset uint64
+
+	// GuestSwap enables or disables swap in the guest.
+	GuestSwap bool
 }
 
 // vcpu mapping from vcpu number to thread number
diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go
index 1ac2497b6a..838b2994fd 100644
--- a/src/runtime/virtcontainers/pkg/annotations/annotations.go
+++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go
@@ -220,6 +220,9 @@ const (
 
 	// TxRateLimiter is a sandbox annotation that specifies max rate on network I/O outbound bandwidth
 	TxRateLimiterMaxRate = kataAnnotHypervisorPrefix + "tx_rate_limiter_max_rate"
+
+	// EnableGuestSwap is a sandbox annotation to enable swap in the guest.
+	EnableGuestSwap = kataAnnotHypervisorPrefix + "enable_guest_swap"
 )
 
 // Runtime related annotations
diff --git a/src/runtime/virtcontainers/pkg/oci/utils.go b/src/runtime/virtcontainers/pkg/oci/utils.go
index 1240b9bb06..bbdd5dcae1 100644
--- a/src/runtime/virtcontainers/pkg/oci/utils.go
+++ b/src/runtime/virtcontainers/pkg/oci/utils.go
@@ -539,6 +539,7 @@ func addHypervisorPathOverrides(ocispec specs.Spec, config *vc.SandboxConfig, ru
 			}
 		}
 	}
+
 	return nil
 }
 
@@ -616,6 +617,12 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig
 		return err
 	}
 
+	if err := newAnnotationConfiguration(ocispec, vcAnnotations.EnableGuestSwap).setBool(func(enableGuestSwap bool) {
+		sbConfig.HypervisorConfig.GuestSwap = enableGuestSwap
+	}); err != nil {
+		return err
+	}
+
 	return nil
 }
 
diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go
index c5ab8d5647..7784a66b88 100644
--- a/src/runtime/virtcontainers/sandbox.go
+++ b/src/runtime/virtcontainers/sandbox.go
@@ -8,6 +8,7 @@ package virtcontainers
 
 import (
 	"bufio"
+	"bytes"
 	"context"
 	"fmt"
 	"io"
@@ -65,6 +66,8 @@ const (
 
 	// DirMode is the permission bits used for creating a directory
 	DirMode = os.FileMode(0750) | os.ModeDir
+
+	mkswapPath = "/sbin/mkswap"
 )
 
 var (
@@ -200,6 +203,10 @@ type Sandbox struct {
 	ctx context.Context
 
 	cw *consoleWatcher
+
+	swapDeviceNum uint
+	swapSizeBytes int64
+	swapDevices   []*config.BlockDrive
 }
 
 // ID returns the sandbox identifier string.
@@ -519,6 +526,9 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
 		sharePidNs:      sandboxConfig.SharePidNs,
 		networkNS:       NetworkNamespace{NetNsPath: sandboxConfig.NetworkConfig.NetNSPath},
 		ctx:             ctx,
+		swapDeviceNum:   0,
+		swapSizeBytes:   0,
+		swapDevices:     []*config.BlockDrive{},
 	}
 
 	hypervisor.setSandbox(s)
@@ -1028,9 +1038,13 @@ func (s *Sandbox) addSwap(ctx context.Context, swapID string, size int64) (*conf
 		return nil, err
 	}
 
-	err = exec.CommandContext(ctx, "/sbin/mkswap", swapFile).Run()
+	var outbuf, errbuf bytes.Buffer
+	cmd := exec.CommandContext(ctx, mkswapPath, swapFile)
+	cmd.Stdout = &outbuf
+	cmd.Stderr = &errbuf
+	err = cmd.Run()
 	if err != nil {
-		err = fmt.Errorf("mkswap swapfile %s fail %s", swapFile, err.Error())
+		err = fmt.Errorf("mkswap swapfile %s fail %s stdout %s stderr %s", swapFile, err.Error(), outbuf.String(), errbuf.String())
 		s.Logger().WithError(err).Error("addSwap")
 		return nil, err
 	}
@@ -1079,6 +1093,30 @@ func (s *Sandbox) removeSwap(ctx context.Context, blockDevice *config.BlockDrive
 	return err
 }
 
+// setupSwap adds one swap device covering the gap up to sizeBytes when that exceeds s.swapSizeBytes; otherwise it is a no-op.
+func (s *Sandbox) setupSwap(ctx context.Context, sizeBytes int64) error {
+	if sizeBytes <= s.swapSizeBytes {
+		return nil
+	}
+	swapDev, err := s.addSwap(ctx, fmt.Sprintf("swap%d", s.swapDeviceNum), sizeBytes-s.swapSizeBytes)
+	if err != nil {
+		return err
+	}
+	s.swapDeviceNum++
+	s.swapSizeBytes = sizeBytes
+	s.swapDevices = append(s.swapDevices, swapDev)
+	return nil
+}
+
+// cleanSwap calls removeSwap on every device in s.swapDevices; a failure is logged as a warning and the loop continues.
+func (s *Sandbox) cleanSwap(ctx context.Context) {
+	for _, swapDev := range s.swapDevices {
+		if err := s.removeSwap(ctx, swapDev); err != nil {
+			s.Logger().Warnf("remove swap device %+v got error %s", swapDev, err)
+		}
+	}
+}
+
 // startVM starts the VM.
 func (s *Sandbox) startVM(ctx context.Context) (err error) {
 	span, ctx := katatrace.Trace(ctx, s.Logger(), "startVM", s.tracingTags())
@@ -1641,6 +1679,8 @@ func (s *Sandbox) Stop(ctx context.Context, force bool) error {
 		return err
 	}
 
+	s.cleanSwap(ctx)
+
 	return nil
 }
 
@@ -1894,9 +1934,21 @@ func (s *Sandbox) updateResources(ctx context.Context) error {
 	// Add default vcpus for sandbox
 	sandboxVCPUs += s.hypervisor.hypervisorConfig().NumVCPUs
 
-	sandboxMemoryByte := s.calculateSandboxMemory()
+	sandboxMemoryByte, sandboxneedPodSwap, sandboxSwapByte := s.calculateSandboxMemory()
 	// Add default / rsvd memory for sandbox.
-	sandboxMemoryByte += int64(s.hypervisor.hypervisorConfig().MemorySize) << utils.MibToBytesShift
+	hypervisorMemoryByte := int64(s.hypervisor.hypervisorConfig().MemorySize) << utils.MibToBytesShift
+	sandboxMemoryByte += hypervisorMemoryByte
+	if sandboxneedPodSwap {
+		sandboxSwapByte += hypervisorMemoryByte
+	}
+
+	// Setup the SWAP in the guest
+	if sandboxSwapByte > 0 {
+		err = s.setupSwap(ctx, sandboxSwapByte)
+		if err != nil {
+			return err
+		}
+	}
 
 	// Update VCPUs
 	s.Logger().WithField("cpus-sandbox", sandboxVCPUs).Debugf("Request to hypervisor to update vCPUs")
@@ -1941,8 +1993,10 @@ func (s *Sandbox) updateResources(ctx context.Context) error {
 	return nil
 }
 
-func (s *Sandbox) calculateSandboxMemory() int64 {
+func (s *Sandbox) calculateSandboxMemory() (int64, bool, int64) {
 	memorySandbox := int64(0)
+	needPodSwap := false
+	swapSandbox := int64(0)
 	for _, c := range s.config.Containers {
 		// Do not hot add again non-running containers resources
 		if cont, ok := s.containers[c.ID]; ok && cont.state.State == types.StateStopped {
@@ -1950,11 +2004,30 @@ func (s *Sandbox) calculateSandboxMemory() int64 {
 			continue
 		}
 
-		if m := c.Resources.Memory; m != nil && m.Limit != nil {
-			memorySandbox += *m.Limit
+		if m := c.Resources.Memory; m != nil {
+			currentLimit := int64(0)
+			if m.Limit != nil {
+				currentLimit = *m.Limit
+				memorySandbox += currentLimit
+			}
+			if s.config.HypervisorConfig.GuestSwap && m.Swappiness != nil && *m.Swappiness > 0 {
+				currentSwap := int64(0)
+				if m.Swap != nil {
+					currentSwap = *m.Swap
+				}
+				switch {
+				case currentSwap == 0 && currentLimit == 0:
+					needPodSwap = true
+				case currentSwap == 0:
+					swapSandbox += currentLimit
+				case currentSwap > currentLimit:
+					// Sum this container's swap (swap minus memory limit) into the sandbox total, like the case above.
+					swapSandbox += currentSwap - currentLimit
+				}
+			}
 		}
 	}
-	return memorySandbox
+	return memorySandbox, needPodSwap, swapSandbox
 }
 
 func (s *Sandbox) calculateSandboxCPUs() (uint32, error) {
diff --git a/src/runtime/virtcontainers/sandbox_test.go b/src/runtime/virtcontainers/sandbox_test.go
index 27df559d36..77475be96e 100644
--- a/src/runtime/virtcontainers/sandbox_test.go
+++ b/src/runtime/virtcontainers/sandbox_test.go
@@ -168,8 +168,10 @@ func TestCalculateSandboxMem(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			sandbox.config.Containers = tt.containers
-			got := sandbox.calculateSandboxMemory()
-			assert.Equal(t, got, tt.want)
+			mem, needSwap, swap := sandbox.calculateSandboxMemory()
+			assert.Equal(t, mem, tt.want)
+			assert.Equal(t, needSwap, false)
+			assert.Equal(t, swap, int64(0))
 		})
 	}
 }