From f390c122f065cfca3c4be057f6560544d3cf650d Mon Sep 17 00:00:00 2001
From: Eric Ernst
Date: Tue, 9 Aug 2022 15:36:09 -0700
Subject: [PATCH] sandbox: don't hotplug too much memory at once

If we're using ACPI hotplug for memory, there's a limitation on the
amount of memory which can be hotplugged at a single time. During
hotplug, the guest kernel allocates memmap entries for each added page,
at a cost of 64 bytes per 4 KiB page. As an example, hotplugging 12 GiB
of memory requires ~192 MiB of *free* memory, which is about the limit
we should expect for an idle 256 MiB guest (using a conservative
heuristic that 75% of provided memory is free).

From experimentation, at pod creation time we can reliably hotplug 48
times the memory already provided to the guest (a factor of 48 results
in the memmap consuming 75% of provided memory). Using the prior
example of a guest with 256 MiB of RAM: 256 MiB * 48 = 12 GiB, which is
the upper end of what we should expect can be hotplugged successfully
into the guest.

Note: it isn't expected that we'll need to hotplug large amounts of RAM
after workloads have already started -- container additions are
expected to occur first in the pod lifecycle. Based on this, we expect
the provided memory to be freely available for hotplug.

If virtio-mem is being utilized, there isn't such a limitation: we can
hotplug the maximum allowed memory in a single operation.

Fixes: #4847

Signed-off-by: Eric Ernst
---
 src/runtime/virtcontainers/sandbox.go | 64 +++++++++++++++++++++++++--
 1 file changed, 61 insertions(+), 3 deletions(-)

diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go
index e691ea1dee..e4a16983ed 100644
--- a/src/runtime/virtcontainers/sandbox.go
+++ b/src/runtime/virtcontainers/sandbox.go
@@ -77,6 +77,14 @@ const (
 
 	// Restricted permission for shared directory managed by virtiofs
 	sharedDirMode = os.FileMode(0700) | os.ModeDir
+
+	// acpiMemoryHotplugFactor indicates how much memory can be hotplugged relative to the amount of
+	// RAM provided to the guest. This is a conservative heuristic based on needing 64 bytes of free
+	// guest memory per 4 KiB page of hotplugged memory.
+	//
+	// As an example: 12 GiB hotplugged -> 3 Mi pages -> 192 MiB of overhead (3 Mi pages x 64 B).
+	// This is approximately what should be free in a relatively unloaded 256 MiB guest (75% of available memory). So, 256 MiB x 48 => 12 GiB.
+	acpiMemoryHotplugFactor = 48
 )
 
 var (
@@ -2012,9 +2020,60 @@ func (s *Sandbox) updateResources(ctx context.Context) error {
 	}
 	s.Logger().Debugf("Sandbox CPUs: %d", newCPUs)
 
-	// Update Memory
-	s.Logger().WithField("memory-sandbox-size-byte", sandboxMemoryByte).Debugf("Request to hypervisor to update memory")
+	// Update Memory --
+	// If we're using ACPI hotplug for memory, there's a limitation on the amount of memory which can be hotplugged at a single time.
+	// We must have enough free memory in the guest kernel to cover 64 bytes per 4 KiB page of memory added for the memmap.
+	// See https://github.com/kata-containers/kata-containers/issues/4847 for more details.
+	// For a typical pod lifecycle, we expect each container to be added when we start the workloads. Based on this, we assume that the majority
+	// of the guest memory is readily available. From experimentation, we see that we can add approximately 48 times what is already provided to
+	// the guest workload. For example, a 256 MiB guest should be able to accommodate hotplugging 12 GiB of memory.
+	//
+	// If virtio-mem is being used, there isn't such a limitation - we can hotplug the maximum allowed memory at a single time.
+	//
 	newMemoryMB := uint32(sandboxMemoryByte >> utils.MibToBytesShift)
+	finalMemoryMB := newMemoryMB
+
+	hconfig := s.hypervisor.HypervisorConfig()
+
+	for {
+		currentMemoryMB := s.hypervisor.GetTotalMemoryMB(ctx)
+
+		maxHotPluggableMemoryMB := currentMemoryMB * acpiMemoryHotplugFactor
+
+		// In the case of virtio-mem, we don't have a restriction on how much can be hotplugged at
+		// a single time. As a result, the max hotpluggable is only limited by the maximum memory size
+		// of the guest.
+		if hconfig.VirtioMem {
+			maxHotPluggableMemoryMB = uint32(hconfig.DefaultMaxMemorySize) - currentMemoryMB
+		}
+
+		deltaMB := int32(finalMemoryMB - currentMemoryMB)
+
+		if deltaMB > int32(maxHotPluggableMemoryMB) {
+			s.Logger().Warnf("Large hotplug. Adding %d MB of %d MB requested", maxHotPluggableMemoryMB, deltaMB)
+			newMemoryMB = currentMemoryMB + maxHotPluggableMemoryMB
+		} else {
+			newMemoryMB = finalMemoryMB
+		}
+
+		// Add the memory to the guest and online the memory:
+		if err := s.updateMemory(ctx, newMemoryMB); err != nil {
+			return err
+		}
+
+		if newMemoryMB == finalMemoryMB {
+			break
+		}
+
+	}
+
+	return nil
+
+}
+
+func (s *Sandbox) updateMemory(ctx context.Context, newMemoryMB uint32) error {
+	// Add the memory to the guest and online it:
+	s.Logger().WithField("memory-sandbox-size-mb", newMemoryMB).Debugf("Request to hypervisor to update memory")
 	newMemory, updatedMemoryDevice, err := s.hypervisor.ResizeMemory(ctx, newMemoryMB, s.state.GuestMemoryBlockSizeMB, s.state.GuestMemoryHotplugProbe)
 	if err != nil {
 		if err == noGuestMemHotplugErr {
@@ -2034,7 +2093,6 @@ func (s *Sandbox) updateResources(ctx context.Context) error {
 
 	if err := s.agent.onlineCPUMem(ctx, 0, false); err != nil {
 		return err
 	}
-
 	return nil
 }
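For reviewers: below is a standalone sketch, not part of the patch, of the arithmetic behind acpiMemoryHotplugFactor. The constant names and the main() harness are hypothetical illustration only; the runtime code above just multiplies by the factor.

package main

import "fmt"

const (
	pageSizeBytes      = 4096 // 4 KiB guest pages assumed by the heuristic
	memmapBytesPerPage = 64   // memmap (struct page) cost per hotplugged page
	hotplugFactor      = 48   // acpiMemoryHotplugFactor from the patch
)

func main() {
	guestMiB := uint64(256)

	// Maximum ACPI hotplug per the heuristic: 48x current guest memory.
	maxHotplugMiB := guestMiB * hotplugFactor

	// memmap overhead: one 64-byte entry per 4 KiB page, i.e. hotplugged/64.
	pages := maxHotplugMiB * 1024 * 1024 / pageSizeBytes
	overheadMiB := pages * memmapBytesPerPage / (1024 * 1024)

	fmt.Printf("guest=%d MiB, max hotplug=%d MiB, memmap overhead=%d MiB (%.0f%% of guest RAM)\n",
		guestMiB, maxHotplugMiB, overheadMiB,
		100*float64(overheadMiB)/float64(guestMiB))
	// Prints: guest=256 MiB, max hotplug=12288 MiB, memmap overhead=192 MiB (75% of guest RAM)
}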
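And a second sketch, again not part of the patch, showing how the hotplug loop converges on a resize larger than the per-step cap by chunking. nextMemoryMB is a hypothetical stand-in for the per-iteration decision in updateResources (ACPI case), and it assumes growth (final >= current) for simplicity; the real loop calls s.updateMemory() each iteration.

package main

import "fmt"

const acpiMemoryHotplugFactor = 48

// nextMemoryMB mirrors the per-iteration decision for the ACPI case:
// never grow by more than 48x what the guest currently has.
func nextMemoryMB(currentMB, finalMB uint32) uint32 {
	maxHotPluggableMB := currentMB * acpiMemoryHotplugFactor
	if finalMB-currentMB > maxHotPluggableMB {
		return currentMB + maxHotPluggableMB
	}
	return finalMB
}

func main() {
	current, final := uint32(256), uint32(20480) // 256 MiB guest resized to 20 GiB

	for current != final {
		current = nextMemoryMB(current, final)
		fmt.Printf("guest memory now %d MiB\n", current)
	}
	// Prints:
	//   guest memory now 12544 MiB  (first chunk capped at 48 x 256 MiB)
	//   guest memory now 20480 MiB  (remainder fits under the new, larger cap)
}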