From e0142db24f8c8755c3a407a64e25f9c70bb04a35 Mon Sep 17 00:00:00 2001
From: Eric Ernst
Date: Tue, 9 Aug 2022 14:56:10 -0700
Subject: [PATCH 1/3] hypervisor: Add GetTotalMemoryMB to interface

It'll be useful to get the total memory provided to the guest
(hotplugged + coldplugged). We'll use this information when calculating
how much memory we can add at a time when utilizing ACPI hotplug.

Signed-off-by: Eric Ernst
---
 src/runtime/virtcontainers/acrn.go            |  4 ++++
 src/runtime/virtcontainers/clh.go             | 10 ++++++++++
 src/runtime/virtcontainers/fc.go              |  4 ++++
 src/runtime/virtcontainers/hypervisor.go      |  1 +
 src/runtime/virtcontainers/mock_hypervisor.go |  3 +++
 src/runtime/virtcontainers/qemu.go            |  6 +++++-
 6 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/src/runtime/virtcontainers/acrn.go b/src/runtime/virtcontainers/acrn.go
index 1c3ebc147..008b3bd97 100644
--- a/src/runtime/virtcontainers/acrn.go
+++ b/src/runtime/virtcontainers/acrn.go
@@ -667,6 +667,10 @@ func (a *Acrn) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
 	return VcpuThreadIDs{}, nil
 }
 
+func (a *Acrn) GetTotalMemoryMB(ctx context.Context) uint32 {
+	return a.config.MemorySize
+}
+
 func (a *Acrn) ResizeMemory(ctx context.Context, reqMemMB uint32, memoryBlockSizeMB uint32, probe bool) (uint32, MemoryDevice, error) {
 	return 0, MemoryDevice{}, nil
 }
diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go
index 3a02a645a..d93ceed67 100644
--- a/src/runtime/virtcontainers/clh.go
+++ b/src/runtime/virtcontainers/clh.go
@@ -1598,6 +1598,16 @@ func (clh *cloudHypervisor) cleanupVM(force bool) error {
 	return nil
 }
 
+func (clh *cloudHypervisor) GetTotalMemoryMB(ctx context.Context) uint32 {
+	vminfo, err := clh.vmInfo()
+	if err != nil {
+		clh.Logger().WithError(err).Error("failed to get vminfo")
+		return 0
+	}
+
+	return uint32(vminfo.GetMemoryActualSize() >> utils.MibToBytesShift)
+}
+
 // vmInfo ask to hypervisor for current VM status
 func (clh *cloudHypervisor) vmInfo() (chclient.VmInfo, error) {
 	cl := clh.client()
diff --git a/src/runtime/virtcontainers/fc.go b/src/runtime/virtcontainers/fc.go
index 703e6e88b..f81cc319c 100644
--- a/src/runtime/virtcontainers/fc.go
+++ b/src/runtime/virtcontainers/fc.go
@@ -1165,6 +1165,10 @@ func (fc *firecracker) HypervisorConfig() HypervisorConfig {
 	return fc.config
 }
 
+func (fc *firecracker) GetTotalMemoryMB(ctx context.Context) uint32 {
+	return fc.config.MemorySize
+}
+
 func (fc *firecracker) ResizeMemory(ctx context.Context, reqMemMB uint32, memoryBlockSizeMB uint32, probe bool) (uint32, MemoryDevice, error) {
 	return 0, MemoryDevice{}, nil
 }
diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go
index 49b658db3..0e7b4785b 100644
--- a/src/runtime/virtcontainers/hypervisor.go
+++ b/src/runtime/virtcontainers/hypervisor.go
@@ -922,6 +922,7 @@ type Hypervisor interface {
 	HotplugRemoveDevice(ctx context.Context, devInfo interface{}, devType DeviceType) (interface{}, error)
 	ResizeMemory(ctx context.Context, memMB uint32, memoryBlockSizeMB uint32, probe bool) (uint32, MemoryDevice, error)
 	ResizeVCPUs(ctx context.Context, vcpus uint32) (uint32, uint32, error)
+	GetTotalMemoryMB(ctx context.Context) uint32
 	GetVMConsole(ctx context.Context, sandboxID string) (string, string, error)
 	Disconnect(ctx context.Context)
 	Capabilities(ctx context.Context) types.Capabilities
diff --git a/src/runtime/virtcontainers/mock_hypervisor.go b/src/runtime/virtcontainers/mock_hypervisor.go
index f4a0b934e..19b818dff 100644
--- a/src/runtime/virtcontainers/mock_hypervisor.go
+++ b/src/runtime/virtcontainers/mock_hypervisor.go
@@ -98,6 +98,9 @@ func (m *mockHypervisor) ResizeVCPUs(ctx context.Context, cpus uint32) (uint32,
 	return 0, 0, nil
 }
 
+func (m *mockHypervisor) GetTotalMemoryMB(ctx context.Context) uint32 {
+	return 0
+}
 func (m *mockHypervisor) Disconnect(ctx context.Context) {
 }
 
diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go
index 56bd5c389..6ef2310f4 100644
--- a/src/runtime/virtcontainers/qemu.go
+++ b/src/runtime/virtcontainers/qemu.go
@@ -2166,6 +2166,10 @@ func (q *qemu) Disconnect(ctx context.Context) {
 	q.qmpShutdown()
 }
 
+func (q *qemu) GetTotalMemoryMB(ctx context.Context) uint32 {
+	return q.config.MemorySize + uint32(q.state.HotpluggedMemory)
+}
+
 // ResizeMemory gets a request to update the VM memory to reqMemMB
 // Memory update is managed with two approaches
 // Add memory to VM:
@@ -2179,7 +2183,7 @@ func (q *qemu) Disconnect(ctx context.Context) {
 
 // A longer term solution is evaluate solutions like virtio-mem
 func (q *qemu) ResizeMemory(ctx context.Context, reqMemMB uint32, memoryBlockSizeMB uint32, probe bool) (uint32, MemoryDevice, error) {
-	currentMemory := q.config.MemorySize + uint32(q.state.HotpluggedMemory)
+	currentMemory := q.GetTotalMemoryMB(ctx)
 	if err := q.qmpSetup(); err != nil {
 		return 0, MemoryDevice{}, err
 	}

From f390c122f065cfca3c4be057f6560544d3cf650d Mon Sep 17 00:00:00 2001
From: Eric Ernst
Date: Tue, 9 Aug 2022 15:36:09 -0700
Subject: [PATCH 2/3] sandbox: don't hotplug too much memory at once

If we're using ACPI hotplug for memory, there's a limit on how much
memory can be hotplugged at a single time: during hotplug, the guest
allocates memmap entries for the added pages, which costs 64 bytes per
4KiB page. As an example, hotplugging 12 GiB of memory requires
~192 MiB of *free* guest memory, which is about the most we should
expect to be available in an idle 256 MiB guest (a conservative
heuristic of 75% of the provided memory).

From experimentation, at pod creation time we can reliably add 48 times
the memory already provided to the guest (a factor of 48 consumes
roughly 75% of the provided memory for the memmap). Using the prior
example of a guest with 256 MiB of RAM, 256 MiB * 48 = 12 GiB, which is
the upper end of what we should expect can be hotplugged successfully
into the guest.

Note: we don't expect to need to hotplug large amounts of RAM after the
workloads have already started -- container additions are expected to
occur early in the pod lifecycle. Based on this, the provided memory
should still be freely available for hotplug.

If virtio-mem is being utilized, there is no such limitation and we can
hotplug the maximum allowed memory in a single operation.

Fixes: #4847

Signed-off-by: Eric Ernst
---
 src/runtime/virtcontainers/sandbox.go | 64 +++++++++++++++++++++++++--
 1 file changed, 61 insertions(+), 3 deletions(-)

diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go
index e691ea1de..e4a16983e 100644
--- a/src/runtime/virtcontainers/sandbox.go
+++ b/src/runtime/virtcontainers/sandbox.go
@@ -77,6 +77,14 @@ const (
 
 	// Restricted permission for shared directory managed by virtiofs
 	sharedDirMode = os.FileMode(0700) | os.ModeDir
+
+	// hotplug factor indicates how much memory can be hotplugged relative to the amount of
+	// RAM provided to the guest. This is a conservative heuristic based on needing 64 bytes per
+	// 4KiB page of hotplugged memory.
+	//
+	// As an example: 12 GiB hotplugged -> 3 Mi pages -> 192 MiB of overhead (3Mi x 64B).
+	// This is approximately what should be free in a relatively unloaded 256 MiB guest (75% of available memory). So, 256 Mi x 48 => 12 Gi
+	acpiMemoryHotplugFactor = 48
 )
 
 var (
@@ -2012,9 +2020,60 @@ func (s *Sandbox) updateResources(ctx context.Context) error {
 	}
 	s.Logger().Debugf("Sandbox CPUs: %d", newCPUs)
 
-	// Update Memory
-	s.Logger().WithField("memory-sandbox-size-byte", sandboxMemoryByte).Debugf("Request to hypervisor to update memory")
+	// Update Memory --
+	// If we're using ACPI hotplug for memory, there's a limitation on the amount of memory which can be hotplugged at a single time.
+	// We must have enough free memory in the guest kernel to cover 64 bytes per (4KiB) page of memory added for mem_map.
+	// See https://github.com/kata-containers/kata-containers/issues/4847 for more details.
+	// For a typical pod lifecycle, we expect that each container is added when we start the workloads. Based on this, we'll "assume" that the majority
+	// of the guest memory is readily available. From experimentation, we see that we can add approximately 48 times what is already provided to
+	// the guest workload. For example, a 256 MiB guest should be able to accommodate hotplugging 12 GiB of memory.
+	//
+	// If virtio-mem is being used, there isn't such a limitation - we can hotplug the maximum allowed memory at a single time.
+	//
 	newMemoryMB := uint32(sandboxMemoryByte >> utils.MibToBytesShift)
+	finalMemoryMB := newMemoryMB
+
+	hconfig := s.hypervisor.HypervisorConfig()
+
+	for {
+		currentMemoryMB := s.hypervisor.GetTotalMemoryMB(ctx)
+
+		maxhotPluggableMemoryMB := currentMemoryMB * acpiMemoryHotplugFactor
+
+		// In the case of virtio-mem, we don't have a restriction on how much can be hotplugged at
+		// a single time. As a result, the max hotpluggable is only limited by the maximum memory size
+		// of the guest.
+		if hconfig.VirtioMem {
+			maxhotPluggableMemoryMB = uint32(hconfig.DefaultMaxMemorySize) - currentMemoryMB
+		}
+
+		deltaMB := int32(finalMemoryMB - currentMemoryMB)
+
+		if deltaMB > int32(maxhotPluggableMemoryMB) {
+			s.Logger().Warnf("Large hotplug. Adding %d MB of %d total memory", maxhotPluggableMemoryMB, deltaMB)
+			newMemoryMB = currentMemoryMB + maxhotPluggableMemoryMB
+		} else {
+			newMemoryMB = finalMemoryMB
+		}
+
+		// Add the memory to the guest and online the memory:
+		if err := s.updateMemory(ctx, newMemoryMB); err != nil {
+			return err
+		}
+
+		if newMemoryMB == finalMemoryMB {
+			break
+		}
+
+	}
+
+	return nil
+
+}
+
+func (s *Sandbox) updateMemory(ctx context.Context, newMemoryMB uint32) error {
+	// online the memory:
+	s.Logger().WithField("memory-sandbox-size-mb", newMemoryMB).Debugf("Request to hypervisor to update memory")
 	newMemory, updatedMemoryDevice, err := s.hypervisor.ResizeMemory(ctx, newMemoryMB, s.state.GuestMemoryBlockSizeMB, s.state.GuestMemoryHotplugProbe)
 	if err != nil {
 		if err == noGuestMemHotplugErr {
@@ -2034,7 +2093,6 @@ func (s *Sandbox) updateResources(ctx context.Context) error {
 	if err := s.agent.onlineCPUMem(ctx, 0, false); err != nil {
 		return err
 	}
-
 	return nil
 }
 

From 9997ab064a592c412e277dd1cf48bdc9e1119220 Mon Sep 17 00:00:00 2001
From: Eric Ernst
Date: Fri, 22 Jul 2022 19:35:34 -0700
Subject: [PATCH 3/3] sandbox_test: Add test to verify memory hotplug behavior

Augment the mock hypervisor so that we can validate that ACPI memory
hotplug is carried out as expected.

We bump the number of memory slots in the hypervisor config each time
the hypervisor's memory is resized. This lets the unit test verify that
large memory hotplugs are broken up into appropriately sized pieces.

Signed-off-by: Eric Ernst
---
 src/runtime/virtcontainers/mock_hypervisor.go | 13 ++++++---
 .../virtcontainers/mock_hypervisor_test.go    |  2 +-
 src/runtime/virtcontainers/sandbox_test.go    | 27 ++++++++++++------
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/src/runtime/virtcontainers/mock_hypervisor.go b/src/runtime/virtcontainers/mock_hypervisor.go
index 19b818dff..7d6da561f 100644
--- a/src/runtime/virtcontainers/mock_hypervisor.go
+++ b/src/runtime/virtcontainers/mock_hypervisor.go
@@ -17,6 +17,7 @@ import (
 var MockHybridVSockPath = "/tmp/kata-mock-hybrid-vsock.socket"
 
 type mockHypervisor struct {
+	config  HypervisorConfig
 	mockPid int
 }
 
@@ -27,10 +28,11 @@ func (m *mockHypervisor) Capabilities(ctx context.Context) types.Capabilities {
 }
 
 func (m *mockHypervisor) HypervisorConfig() HypervisorConfig {
-	return HypervisorConfig{}
+	return m.config
 }
 
 func (m *mockHypervisor) setConfig(config *HypervisorConfig) error {
+	m.config = *config
 	return nil
 }
 
@@ -38,7 +40,7 @@ func (m *mockHypervisor) CreateVM(ctx context.Context, id string, network Networ
 	if err := m.setConfig(hypervisorConfig); err != nil {
 		return err
 	}
-
+	m.config.MemSlots = 0
 	return nil
 }
 
@@ -92,6 +94,11 @@ func (m *mockHypervisor) GetVMConsole(ctx context.Context, sandboxID string) (st
 }
 
 func (m *mockHypervisor) ResizeMemory(ctx context.Context, memMB uint32, memorySectionSizeMB uint32, probe bool) (uint32, MemoryDevice, error) {
+	if m.config.MemorySize != memMB {
+		// For testing, we'll use MemSlots to track how many times we resized memory
+		m.config.MemSlots += 1
+		m.config.MemorySize = memMB
+	}
 	return 0, MemoryDevice{}, nil
 }
 func (m *mockHypervisor) ResizeVCPUs(ctx context.Context, cpus uint32) (uint32, uint32, error) {
@@ -99,7 +106,7 @@ func (m *mockHypervisor) ResizeVCPUs(ctx context.Context, cpus uint32) (uint32,
 	return 0, 0, nil
 }
 func (m *mockHypervisor) GetTotalMemoryMB(ctx context.Context) uint32 {
-	return 0
+	return m.config.MemorySize
 }
 func (m *mockHypervisor) Disconnect(ctx context.Context) {
 }
diff --git a/src/runtime/virtcontainers/mock_hypervisor_test.go b/src/runtime/virtcontainers/mock_hypervisor_test.go
index 0159a993d..ba4435f13 100644
--- a/src/runtime/virtcontainers/mock_hypervisor_test.go
+++ b/src/runtime/virtcontainers/mock_hypervisor_test.go
@@ -14,7 +14,7 @@ import (
 )
 
 func TestMockHypervisorCreateVM(t *testing.T) {
-	var m *mockHypervisor
+	m := &mockHypervisor{}
 	assert := assert.New(t)
 
 	sandbox := &Sandbox{
diff --git a/src/runtime/virtcontainers/sandbox_test.go b/src/runtime/virtcontainers/sandbox_test.go
index 331094ee3..59ed24c1a 100644
--- a/src/runtime/virtcontainers/sandbox_test.go
+++ b/src/runtime/virtcontainers/sandbox_test.go
@@ -41,6 +41,7 @@ func newHypervisorConfig(kernelParams []Param, hParams []Param) HypervisorConfig
 		HypervisorPath:   filepath.Join(testDir, testHypervisor),
 		KernelParams:     kernelParams,
 		HypervisorParams: hParams,
+		MemorySize:       1,
 	}
 }
 
@@ -1360,7 +1361,6 @@ func TestSandboxUpdateResources(t *testing.T) {
 	contConfig1 := newTestContainerConfigNoop("cont-00001")
 	contConfig2 := newTestContainerConfigNoop("cont-00002")
 	hConfig := newHypervisorConfig(nil, nil)
-
 	defer cleanUp()
 	// create a sandbox
 	s, err := testCreateSandbox(t,
@@ -1370,28 +1370,37 @@ func TestSandboxUpdateResources(t *testing.T) {
 		NetworkConfig{},
 		[]ContainerConfig{contConfig1, contConfig2},
 		nil)
-	assert.NoError(t, err)
+
 	err = s.updateResources(context.Background())
 	assert.NoError(t, err)
 
-	containerMemLimit := int64(1000)
+	// For the mock hypervisor, we expect MemSlots to be 0 since the memory wasn't changed.
+	assert.Equal(t, s.hypervisor.HypervisorConfig().MemSlots, uint32(0))
+
+	containerMemLimit := int64(4 * 1024 * 1024 * 1024)
 	containerCPUPeriod := uint64(1000)
 	containerCPUQouta := int64(5)
-	for _, c := range s.config.Containers {
-		c.Resources.Memory = &specs.LinuxMemory{
+	for idx := range s.config.Containers {
+		s.config.Containers[idx].Resources.Memory = &specs.LinuxMemory{
 			Limit: new(int64),
 		}
-		c.Resources.CPU = &specs.LinuxCPU{
+		s.config.Containers[idx].Resources.CPU = &specs.LinuxCPU{
 			Period: new(uint64),
 			Quota:  new(int64),
 		}
-		c.Resources.Memory.Limit = &containerMemLimit
-		c.Resources.CPU.Period = &containerCPUPeriod
-		c.Resources.CPU.Quota = &containerCPUQouta
+		s.config.Containers[idx].Resources.Memory.Limit = &containerMemLimit
+		s.config.Containers[idx].Resources.CPU.Period = &containerCPUPeriod
+		s.config.Containers[idx].Resources.CPU.Quota = &containerCPUQouta
 	}
 
 	err = s.updateResources(context.Background())
 	assert.NoError(t, err)
+
+	// Since we're starting with a memory of 1 MB, we expect it to take 3 hotplugs to add 4 GiB of memory when using ACPI hotplug:
+	// +48 MB
+	// +2352 MB
+	// +the remaining
+	assert.Equal(t, s.hypervisor.HypervisorConfig().MemSlots, uint32(3))
 }
 
 func TestSandboxExperimentalFeature(t *testing.T) {
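
For reference, here is a small standalone Go sketch of the chunked-hotplug
arithmetic described in PATCH 2/3. It is not part of the patches above; the
hotplugSteps helper is hypothetical and only mirrors the loop added to
updateResources, under the same factor-of-48 assumption used by the unit test.

package main

import "fmt"

const acpiMemoryHotplugFactor = 48

// hotplugSteps returns the intermediate guest memory sizes (in MiB) that
// would be requested when growing from currentMB to targetMB, capping each
// ACPI hotplug at 48x the memory currently provided to the guest.
func hotplugSteps(currentMB, targetMB uint32) []uint32 {
	var steps []uint32
	for currentMB < targetMB {
		maxHotplugMB := currentMB * acpiMemoryHotplugFactor
		next := targetMB
		if targetMB-currentMB > maxHotplugMB {
			// Too large for one hotplug; add the maximum and loop again.
			next = currentMB + maxHotplugMB
		}
		steps = append(steps, next)
		currentMB = next
	}
	return steps
}

func main() {
	// A 1 MiB guest growing to 4 GiB needs three hotplugs (+48, +2352, +1695 MiB),
	// matching the three resizes expected in TestSandboxUpdateResources.
	fmt.Println(hotplugSteps(1, 4096)) // [49 2401 4096]
	// A 256 MiB guest can take 12 GiB (256 * 48) in a single hotplug.
	fmt.Println(hotplugSteps(256, 12288)) // [12288]
}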