diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go
index be7481f433d..388e9e1c232 100644
--- a/pkg/kubelet/cm/cgroup_manager_linux.go
+++ b/pkg/kubelet/cm/cgroup_manager_linux.go
@@ -22,6 +22,7 @@ import (
 	"os"
 	"path"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -557,3 +558,86 @@ func (m *cgroupManagerImpl) MemoryUsage(name CgroupName) (int64, error) {
 	val, err := fscommon.GetCgroupParamUint(path, file)
 	return int64(val), err
 }
+
+// Get the memory limit in bytes applied to the cgroup
+func (m *cgroupManagerImpl) GetCgroupMemoryConfig(name CgroupName) (uint64, error) {
+	cgroupPaths := m.buildCgroupPaths(name)
+	cgroupMemoryPath, found := cgroupPaths["memory"]
+	if !found {
+		return 0, fmt.Errorf("failed to build memory cgroup fs path for cgroup %v", name)
+	}
+	memLimit, err := fscommon.GetCgroupParamUint(cgroupMemoryPath, "memory.limit_in_bytes")
+	if err != nil {
+		return 0, fmt.Errorf("failed to get memory.limit_in_bytes for cgroup %v: %v", name, err)
+	}
+	return memLimit, nil
+}
+
+// Get the cpu quota, cpu period, and cpu shares applied to the cgroup
+func (m *cgroupManagerImpl) GetCgroupCpuConfig(name CgroupName) (int64, uint64, uint64, error) {
+	cgroupPaths := m.buildCgroupPaths(name)
+	cgroupCpuPath, found := cgroupPaths["cpu"]
+	if !found {
+		return 0, 0, 0, fmt.Errorf("failed to build CPU cgroup fs path for cgroup %v", name)
+	}
+	cpuQuotaStr, errQ := fscommon.GetCgroupParamString(cgroupCpuPath, "cpu.cfs_quota_us")
+	if errQ != nil {
+		return 0, 0, 0, fmt.Errorf("failed to read CPU quota for cgroup %v: %v", name, errQ)
+	}
+	cpuQuota, errInt := strconv.ParseInt(cpuQuotaStr, 10, 64)
+	if errInt != nil {
+		return 0, 0, 0, fmt.Errorf("failed to convert CPU quota to integer for cgroup %v: %v", name, errInt)
+	}
+	cpuPeriod, errP := fscommon.GetCgroupParamUint(cgroupCpuPath, "cpu.cfs_period_us")
+	if errP != nil {
+		return 0, 0, 0, fmt.Errorf("failed to read CPU period for cgroup %v: %v", name, errP)
+	}
+	cpuShares, errS := fscommon.GetCgroupParamUint(cgroupCpuPath, "cpu.shares")
+	if errS != nil {
+		return 0, 0, 0, fmt.Errorf("failed to read CPU shares for cgroup %v: %v", name, errS)
+	}
+	return cpuQuota, cpuPeriod, cpuShares, nil
+}
+
+// Set the memory limit in bytes applied to the cgroup
+func (m *cgroupManagerImpl) SetCgroupMemoryConfig(name CgroupName, memoryLimit int64) error {
+	cgroupPaths := m.buildCgroupPaths(name)
+	cgroupMemoryPath, found := cgroupPaths["memory"]
+	if !found {
+		return fmt.Errorf("failed to build memory cgroup fs path for cgroup %v", name)
+	}
+	memLimit := strconv.FormatInt(memoryLimit, 10)
+	if err := os.WriteFile(filepath.Join(cgroupMemoryPath, "memory.limit_in_bytes"), []byte(memLimit), 0700); err != nil {
+		return fmt.Errorf("failed to write %v to %v: %v", memLimit, cgroupMemoryPath, err)
+	}
+	return nil
+}
+
+// Set the cpu quota, cpu period, and cpu shares applied to the cgroup
+func (m *cgroupManagerImpl) SetCgroupCpuConfig(name CgroupName, cpuQuota *int64, cpuPeriod, cpuShares *uint64) error {
+	var cpuQuotaStr, cpuPeriodStr, cpuSharesStr string
+	cgroupPaths := m.buildCgroupPaths(name)
+	cgroupCpuPath, found := cgroupPaths["cpu"]
+	if !found {
+		return fmt.Errorf("failed to build cpu cgroup fs path for cgroup %v", name)
+	}
+	if cpuQuota != nil {
+		cpuQuotaStr = strconv.FormatInt(*cpuQuota, 10)
+		if err := os.WriteFile(filepath.Join(cgroupCpuPath, "cpu.cfs_quota_us"), []byte(cpuQuotaStr), 0700); err != nil {
+			return fmt.Errorf("failed to write %v to %v: %v", cpuQuotaStr, cgroupCpuPath,
err) + } + } + if cpuPeriod != nil { + cpuPeriodStr = strconv.FormatUint(*cpuPeriod, 10) + if err := os.WriteFile(filepath.Join(cgroupCpuPath, "cpu.cfs_period_us"), []byte(cpuPeriodStr), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", cpuPeriodStr, cgroupCpuPath, err) + } + } + if cpuShares != nil { + cpuSharesStr = strconv.FormatUint(*cpuShares, 10) + if err := os.WriteFile(filepath.Join(cgroupCpuPath, "cpu.shares"), []byte(cpuSharesStr), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", cpuSharesStr, cgroupCpuPath, err) + } + } + return nil +} diff --git a/pkg/kubelet/cm/cgroup_manager_unsupported.go b/pkg/kubelet/cm/cgroup_manager_unsupported.go index 684a1f12f29..6220f5c96e5 100644 --- a/pkg/kubelet/cm/cgroup_manager_unsupported.go +++ b/pkg/kubelet/cm/cgroup_manager_unsupported.go @@ -77,6 +77,22 @@ func (m *unsupportedCgroupManager) ReduceCPULimits(cgroupName CgroupName) error return nil } +func (m *unsupportedCgroupManager) GetCgroupMemoryConfig(name CgroupName) (uint64, error) { + return 0, errNotSupported +} + +func (m *unsupportedCgroupManager) GetCgroupCpuConfig(name CgroupName) (int64, uint64, uint64, error) { + return 0, 0, 0, errNotSupported +} + +func (m *unsupportedCgroupManager) SetCgroupMemoryConfig(name CgroupName, memoryLimit int64) error { + return errNotSupported +} + +func (m *unsupportedCgroupManager) SetCgroupCpuConfig(name CgroupName, cpuQuota *int64, cpuPeriod, cpuShares *uint64) error { + return errNotSupported +} + var RootCgroupName = CgroupName([]string{}) func NewCgroupName(base CgroupName, components ...string) CgroupName { diff --git a/pkg/kubelet/cm/container_manager_stub.go b/pkg/kubelet/cm/container_manager_stub.go index 1a79c3b0f96..4555fcbd651 100644 --- a/pkg/kubelet/cm/container_manager_stub.go +++ b/pkg/kubelet/cm/container_manager_stub.go @@ -95,6 +95,22 @@ func (cm *containerManagerStub) GetDevicePluginResourceCapacity() (v1.ResourceLi return cm.extendedPluginResources, cm.extendedPluginResources, []string{} } +func (m *podContainerManagerStub) GetPodCgroupMemoryConfig(_ *v1.Pod) (uint64, error) { + return 0, nil +} + +func (m *podContainerManagerStub) GetPodCgroupCpuConfig(_ *v1.Pod) (int64, uint64, uint64, error) { + return 0, 0, 0, nil +} + +func (m *podContainerManagerStub) SetPodCgroupMemoryConfig(_ *v1.Pod, _ int64) error { + return nil +} + +func (m *podContainerManagerStub) SetPodCgroupCpuConfig(_ *v1.Pod, _ *int64, _, _ *uint64) error { + return nil +} + func (cm *containerManagerStub) NewPodContainerManager() PodContainerManager { return &podContainerManagerStub{} } diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go index 660da92fba4..69a410b0d0f 100644 --- a/pkg/kubelet/cm/cpumanager/policy_static.go +++ b/pkg/kubelet/cm/cpumanager/policy_static.go @@ -20,9 +20,11 @@ import ( "fmt" v1 "k8s.io/api/core/v1" + utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/klog/v2" - + podutil "k8s.io/kubernetes/pkg/api/v1/pod" v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" + "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state" "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology" "k8s.io/kubernetes/pkg/kubelet/cm/cpuset" @@ -380,6 +382,11 @@ func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int return 0 } cpuQuantity := container.Resources.Requests[v1.ResourceCPU] + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + if cs, ok := 
podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok { + cpuQuantity = cs.ResourcesAllocated[v1.ResourceCPU] + } + } if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() { return 0 } diff --git a/pkg/kubelet/cm/fake_pod_container_manager.go b/pkg/kubelet/cm/fake_pod_container_manager.go index cafae75f569..c77898f5231 100644 --- a/pkg/kubelet/cm/fake_pod_container_manager.go +++ b/pkg/kubelet/cm/fake_pod_container_manager.go @@ -104,3 +104,38 @@ func (m *FakePodContainerManager) IsPodCgroup(cgroupfs string) (bool, types.UID) m.CalledFunctions = append(m.CalledFunctions, "IsPodCgroup") return false, types.UID("") } + +func (cm *FakePodContainerManager) GetPodCgroupMemoryUsage(_ *v1.Pod) (uint64, error) { + cm.Lock() + defer cm.Unlock() + cm.CalledFunctions = append(cm.CalledFunctions, "GetPodCgroupMemoryUsage") + return 0, nil +} + +func (cm *FakePodContainerManager) GetPodCgroupMemoryConfig(_ *v1.Pod) (uint64, error) { + cm.Lock() + defer cm.Unlock() + cm.CalledFunctions = append(cm.CalledFunctions, "GetPodCgroupMemoryConfig") + return 0, nil +} + +func (cm *FakePodContainerManager) GetPodCgroupCpuConfig(_ *v1.Pod) (int64, uint64, uint64, error) { + cm.Lock() + defer cm.Unlock() + cm.CalledFunctions = append(cm.CalledFunctions, "GetPodCgroupCpuConfig") + return 0, 0, 0, nil +} + +func (cm *FakePodContainerManager) SetPodCgroupMemoryConfig(_ *v1.Pod, _ int64) error { + cm.Lock() + defer cm.Unlock() + cm.CalledFunctions = append(cm.CalledFunctions, "SetPodCgroupMemoryConfig") + return nil +} + +func (cm *FakePodContainerManager) SetPodCgroupCpuConfig(_ *v1.Pod, _ *int64, _, _ *uint64) error { + cm.Lock() + defer cm.Unlock() + cm.CalledFunctions = append(cm.CalledFunctions, "SetPodCgroupCpuConfig") + return nil +} diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go index 8cf15d1f1f1..59cb37e71c8 100644 --- a/pkg/kubelet/cm/helpers_linux.go +++ b/pkg/kubelet/cm/helpers_linux.go @@ -28,6 +28,7 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" utilfeature "k8s.io/apiserver/pkg/util/feature" + podutil "k8s.io/kubernetes/pkg/api/v1/pod" "k8s.io/kubernetes/pkg/api/v1/resource" v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" @@ -151,6 +152,11 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, memoryLimitsDeclared = false } containerHugePageLimits := HugePageLimits(container.Resources.Requests) + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.InPlacePodVerticalScaling) { + if cs, ok := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok { + containerHugePageLimits = HugePageLimits(cs.ResourcesAllocated) + } + } for k, v := range containerHugePageLimits { if value, exists := hugePageLimits[k]; exists { hugePageLimits[k] = value + v diff --git a/pkg/kubelet/cm/helpers_unsupported.go b/pkg/kubelet/cm/helpers_unsupported.go index c0677c7bfc7..187cae59883 100644 --- a/pkg/kubelet/cm/helpers_unsupported.go +++ b/pkg/kubelet/cm/helpers_unsupported.go @@ -25,10 +25,13 @@ import ( ) const ( - MinShares = 0 + MinShares = 0 + MaxShares = 0 + SharesPerCPU = 0 MilliCPUToCPU = 0 + QuotaPeriod = 0 MinQuotaPeriod = 0 ) diff --git a/pkg/kubelet/cm/memorymanager/policy_static.go b/pkg/kubelet/cm/memorymanager/policy_static.go index d1dcd7ba0e8..c36436e18b6 100644 --- a/pkg/kubelet/cm/memorymanager/policy_static.go +++ b/pkg/kubelet/cm/memorymanager/policy_static.go @@ -25,9 +25,12 @@ import ( v1 "k8s.io/api/core/v1" 
"k8s.io/apimachinery/pkg/api/resource" + utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/klog/v2" + podutil "k8s.io/kubernetes/pkg/api/v1/pod" corehelper "k8s.io/kubernetes/pkg/apis/core/v1/helper" v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" + "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state" "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask" @@ -107,7 +110,7 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai hint := p.affinity.GetAffinity(podUID, container.Name) klog.InfoS("Got topology affinity", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", container.Name, "hint", hint) - requestedResources, err := getRequestedResources(container) + requestedResources, err := getRequestedResources(pod, container) if err != nil { return err } @@ -319,7 +322,7 @@ func getPodRequestedResources(pod *v1.Pod) (map[v1.ResourceName]uint64, error) { reqRsrcsByAppCtrs := make(map[v1.ResourceName]uint64) for _, ctr := range pod.Spec.InitContainers { - reqRsrcs, err := getRequestedResources(&ctr) + reqRsrcs, err := getRequestedResources(pod, &ctr) if err != nil { return nil, err @@ -336,7 +339,7 @@ func getPodRequestedResources(pod *v1.Pod) (map[v1.ResourceName]uint64, error) { } for _, ctr := range pod.Spec.Containers { - reqRsrcs, err := getRequestedResources(&ctr) + reqRsrcs, err := getRequestedResources(pod, &ctr) if err != nil { return nil, err @@ -391,7 +394,7 @@ func (p *staticPolicy) GetTopologyHints(s state.State, pod *v1.Pod, container *v return nil } - requestedResources, err := getRequestedResources(container) + requestedResources, err := getRequestedResources(pod, container) if err != nil { klog.ErrorS(err, "Failed to get container requested resources", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", container.Name) return nil @@ -408,9 +411,15 @@ func (p *staticPolicy) GetTopologyHints(s state.State, pod *v1.Pod, container *v return p.calculateHints(s.GetMachineState(), pod, requestedResources) } -func getRequestedResources(container *v1.Container) (map[v1.ResourceName]uint64, error) { +func getRequestedResources(pod *v1.Pod, container *v1.Container) (map[v1.ResourceName]uint64, error) { requestedResources := map[v1.ResourceName]uint64{} - for resourceName, quantity := range container.Resources.Requests { + resources := container.Resources.Requests + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + if cs, ok := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok { + resources = cs.ResourcesAllocated + } + } + for resourceName, quantity := range resources { if resourceName != v1.ResourceMemory && !corehelper.IsHugePageResourceName(resourceName) { continue } diff --git a/pkg/kubelet/cm/pod_container_manager_linux.go b/pkg/kubelet/cm/pod_container_manager_linux.go index 3b798a4a466..eab6f5d846c 100644 --- a/pkg/kubelet/cm/pod_container_manager_linux.go +++ b/pkg/kubelet/cm/pod_container_manager_linux.go @@ -120,6 +120,35 @@ func (m *podContainerManagerImpl) GetPodContainerName(pod *v1.Pod) (CgroupName, return cgroupName, cgroupfsName } +func (m *podContainerManagerImpl) GetPodCgroupMemoryUsage(pod *v1.Pod) (uint64, error) { + podCgroupName, _ := m.GetPodContainerName(pod) + memUsage, err := m.cgroupManager.MemoryUsage(podCgroupName) + if err != nil { + return 0, err + } + return uint64(memUsage), nil +} + +func (m *podContainerManagerImpl) GetPodCgroupMemoryConfig(pod *v1.Pod) 
(uint64, error) { + podCgroupName, _ := m.GetPodContainerName(pod) + return m.cgroupManager.GetCgroupMemoryConfig(podCgroupName) +} + +func (m *podContainerManagerImpl) GetPodCgroupCpuConfig(pod *v1.Pod) (int64, uint64, uint64, error) { + podCgroupName, _ := m.GetPodContainerName(pod) + return m.cgroupManager.GetCgroupCpuConfig(podCgroupName) +} + +func (m *podContainerManagerImpl) SetPodCgroupMemoryConfig(pod *v1.Pod, memoryLimit int64) error { + podCgroupName, _ := m.GetPodContainerName(pod) + return m.cgroupManager.SetCgroupMemoryConfig(podCgroupName, memoryLimit) +} + +func (m *podContainerManagerImpl) SetPodCgroupCpuConfig(pod *v1.Pod, cpuQuota *int64, cpuPeriod, cpuShares *uint64) error { + podCgroupName, _ := m.GetPodContainerName(pod) + return m.cgroupManager.SetCgroupCpuConfig(podCgroupName, cpuQuota, cpuPeriod, cpuShares) +} + // Kill one process ID func (m *podContainerManagerImpl) killOnePid(pid int) error { // os.FindProcess never returns an error on POSIX @@ -322,3 +351,23 @@ func (m *podContainerManagerNoop) GetAllPodsFromCgroups() (map[types.UID]CgroupN func (m *podContainerManagerNoop) IsPodCgroup(cgroupfs string) (bool, types.UID) { return false, types.UID("") } + +func (m *podContainerManagerNoop) GetPodCgroupMemoryUsage(_ *v1.Pod) (uint64, error) { + return 0, nil +} + +func (m *podContainerManagerNoop) GetPodCgroupMemoryConfig(_ *v1.Pod) (uint64, error) { + return 0, nil +} + +func (m *podContainerManagerNoop) GetPodCgroupCpuConfig(_ *v1.Pod) (int64, uint64, uint64, error) { + return 0, 0, 0, nil +} + +func (m *podContainerManagerNoop) SetPodCgroupMemoryConfig(_ *v1.Pod, _ int64) error { + return nil +} + +func (m *podContainerManagerNoop) SetPodCgroupCpuConfig(_ *v1.Pod, _ *int64, _, _ *uint64) error { + return nil +} diff --git a/pkg/kubelet/cm/pod_container_manager_stub.go b/pkg/kubelet/cm/pod_container_manager_stub.go index 26c56ec7910..36a995c1895 100644 --- a/pkg/kubelet/cm/pod_container_manager_stub.go +++ b/pkg/kubelet/cm/pod_container_manager_stub.go @@ -53,3 +53,23 @@ func (m *podContainerManagerStub) GetAllPodsFromCgroups() (map[types.UID]CgroupN func (m *podContainerManagerStub) IsPodCgroup(cgroupfs string) (bool, types.UID) { return false, types.UID("") } + +func (m *podContainerManagerStub) GetPodCgroupMemoryUsage(_ *v1.Pod) (uint64, error) { + return 0, nil +} + +func (m *podContainerManagerStub) GetPodCgroupMemoryLimit(_ *v1.Pod) (uint64, error) { + return 0, nil +} + +func (m *podContainerManagerStub) GetPodCgroupCpuLimit(_ *v1.Pod) (int64, uint64, uint64, error) { + return 0, 0, 0, nil +} + +func (m *podContainerManagerStub) SetPodCgroupMemoryLimit(_ *v1.Pod, _ int64) error { + return nil +} + +func (m *podContainerManagerStub) SetPodCgroupCpuLimit(_ *v1.Pod, _ *int64, _, _ *uint64) error { + return nil +} diff --git a/pkg/kubelet/cm/types.go b/pkg/kubelet/cm/types.go index f9ddc55ae2c..68ea1bef37b 100644 --- a/pkg/kubelet/cm/types.go +++ b/pkg/kubelet/cm/types.go @@ -84,6 +84,14 @@ type CgroupManager interface { ReduceCPULimits(cgroupName CgroupName) error // MemoryUsage returns current memory usage of the specified cgroup, as read from the cgroupfs. MemoryUsage(name CgroupName) (int64, error) + // GetCgroupMemoryConfig returns the memory limit of the specified cgroup as read from cgroup fs. + GetCgroupMemoryConfig(name CgroupName) (uint64, error) + // GetCgroupCpuConfig returns the cpu quota, cpu period, and cpu shares of the specified cgroup as read from cgroup fs. 
+ GetCgroupCpuConfig(name CgroupName) (int64, uint64, uint64, error) + // SetCgroupMemoryConfig sets the memory limit of the specified cgroup. + SetCgroupMemoryConfig(name CgroupName, memoryLimit int64) error + // SetCgroupCpuConfig sets the cpu quota, cpu period, and cpu shares of the specified cgroup. + SetCgroupCpuConfig(name CgroupName, cpuQuota *int64, cpuPeriod, cpuShares *uint64) error } // QOSContainersInfo stores the names of containers per qos @@ -119,4 +127,19 @@ type PodContainerManager interface { // IsPodCgroup returns true if the literal cgroupfs name corresponds to a pod IsPodCgroup(cgroupfs string) (bool, types.UID) + + // Get value of memory.usage_in_bytes for the pod Cgroup + GetPodCgroupMemoryUsage(pod *v1.Pod) (uint64, error) + + // Get value of memory.limit_in_bytes for the pod Cgroup + GetPodCgroupMemoryConfig(pod *v1.Pod) (uint64, error) + + // Get values of cpu.cfs_quota_us, cpu.cfs_period_us, and cpu.shares for the pod Cgroup + GetPodCgroupCpuConfig(pod *v1.Pod) (int64, uint64, uint64, error) + + // Set value of memory.limit_in_bytes for the pod Cgroup + SetPodCgroupMemoryConfig(pod *v1.Pod, memoryLimit int64) error + + // Set values of cpu.cfs_quota_us, cpu.cfs_period_us, and cpu.shares for the pod Cgroup + SetPodCgroupCpuConfig(pod *v1.Pod, cpuQuota *int64, cpuPeriod, cpuShares *uint64) error } diff --git a/pkg/kubelet/container/helpers.go b/pkg/kubelet/container/helpers.go index 42a67211a74..0388d579e55 100644 --- a/pkg/kubelet/container/helpers.go +++ b/pkg/kubelet/container/helpers.go @@ -117,6 +117,23 @@ func HashContainer(container *v1.Container) uint64 { return uint64(hash.Sum32()) } +// HashContainerWithoutResources returns the hash of the container with Resources field zero'd out. +func HashContainerWithoutResources(container *v1.Container) uint64 { + // InPlacePodVerticalScaling enables mutable Resources field. + // Changes to this field may not require container restart depending on policy. + // Compute hash over fields besides the Resources field + // NOTE: This is needed during alpha and beta so that containers using Resources but + // not subject to In-place resize are not unexpectedly restarted when + // InPlacePodVerticalScaling feature-gate is toggled. + //TODO(vinaykul,InPlacePodVerticalScaling): Remove this in GA+1 and make HashContainerWithoutResources to become Hash. + hashWithoutResources := fnv.New32a() + containerCopy := container.DeepCopy() + containerCopy.Resources = v1.ResourceRequirements{} + containerJSON, _ := json.Marshal(containerCopy) + hashutil.DeepHashObject(hashWithoutResources, containerJSON) + return uint64(hashWithoutResources.Sum32()) +} + // envVarsToMap constructs a map of environment name to value from a slice // of env vars. 
func envVarsToMap(envs []EnvVar) map[string]string { @@ -252,12 +269,13 @@ func ConvertPodStatusToRunningPod(runtimeName string, podStatus *PodStatus) Pod continue } container := &Container{ - ID: containerStatus.ID, - Name: containerStatus.Name, - Image: containerStatus.Image, - ImageID: containerStatus.ImageID, - Hash: containerStatus.Hash, - State: containerStatus.State, + ID: containerStatus.ID, + Name: containerStatus.Name, + Image: containerStatus.Image, + ImageID: containerStatus.ImageID, + Hash: containerStatus.Hash, + HashWithoutResources: containerStatus.HashWithoutResources, + State: containerStatus.State, } runningPod.Containers = append(runningPod.Containers, container) } diff --git a/pkg/kubelet/container/helpers_test.go b/pkg/kubelet/container/helpers_test.go index fcd5b6fb6ea..e02f7479a04 100644 --- a/pkg/kubelet/container/helpers_test.go +++ b/pkg/kubelet/container/helpers_test.go @@ -25,6 +25,7 @@ import ( "github.com/stretchr/testify/assert" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -908,3 +909,83 @@ func TestHasWindowsHostProcessContainer(t *testing.T) { }) } } + +func TestHashContainerWithoutResources(t *testing.T) { + cpu100m := resource.MustParse("100m") + cpu200m := resource.MustParse("200m") + mem100M := resource.MustParse("100Mi") + mem200M := resource.MustParse("200Mi") + cpuPolicyRestartNotRequired := v1.ContainerResizePolicy{ResourceName: v1.ResourceCPU, Policy: v1.RestartNotRequired} + memPolicyRestartNotRequired := v1.ContainerResizePolicy{ResourceName: v1.ResourceMemory, Policy: v1.RestartNotRequired} + cpuPolicyRestartRequired := v1.ContainerResizePolicy{ResourceName: v1.ResourceCPU, Policy: v1.RestartRequired} + memPolicyRestartRequired := v1.ContainerResizePolicy{ResourceName: v1.ResourceMemory, Policy: v1.RestartRequired} + + type testCase struct { + name string + container *v1.Container + expectedHash uint64 + } + + tests := []testCase{ + { + "Burstable pod with CPU policy restart required", + &v1.Container{ + Name: "foo", + Image: "bar", + Resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu200m, v1.ResourceMemory: mem200M}, + Requests: v1.ResourceList{v1.ResourceCPU: cpu100m, v1.ResourceMemory: mem100M}, + }, + ResizePolicy: []v1.ContainerResizePolicy{cpuPolicyRestartRequired, memPolicyRestartNotRequired}, + }, + 0x86a4393c, + }, + { + "Burstable pod with memory policy restart required", + &v1.Container{ + Name: "foo", + Image: "bar", + Resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu200m, v1.ResourceMemory: mem200M}, + Requests: v1.ResourceList{v1.ResourceCPU: cpu100m, v1.ResourceMemory: mem100M}, + }, + ResizePolicy: []v1.ContainerResizePolicy{cpuPolicyRestartNotRequired, memPolicyRestartRequired}, + }, + 0x73a18cce, + }, + { + "Guaranteed pod with CPU policy restart required", + &v1.Container{ + Name: "foo", + Image: "bar", + Resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu100m, v1.ResourceMemory: mem100M}, + Requests: v1.ResourceList{v1.ResourceCPU: cpu100m, v1.ResourceMemory: mem100M}, + }, + ResizePolicy: []v1.ContainerResizePolicy{cpuPolicyRestartRequired, memPolicyRestartNotRequired}, + }, + 0x86a4393c, + }, + { + "Guaranteed pod with memory policy restart required", + &v1.Container{ + Name: "foo", + Image: "bar", + Resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu100m, v1.ResourceMemory: mem100M}, + Requests: v1.ResourceList{v1.ResourceCPU: 
cpu100m, v1.ResourceMemory: mem100M},
+				},
+				ResizePolicy: []v1.ContainerResizePolicy{cpuPolicyRestartNotRequired, memPolicyRestartRequired},
+			},
+			0x73a18cce,
+		},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			containerCopy := tc.container.DeepCopy()
+			hash := HashContainerWithoutResources(tc.container)
+			assert.Equal(t, tc.expectedHash, hash, "[%s]", tc.name)
+			assert.Equal(t, containerCopy, tc.container, "[%s]", tc.name)
+		})
+	}
+}
diff --git a/pkg/kubelet/container/runtime.go b/pkg/kubelet/container/runtime.go
index e53b74bca54..c2076142d06 100644
--- a/pkg/kubelet/container/runtime.go
+++ b/pkg/kubelet/container/runtime.go
@@ -27,6 +27,7 @@ import (
 	"time"
 
 	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/tools/remotecommand"
 	"k8s.io/client-go/util/flowcontrol"
@@ -295,6 +296,11 @@ type Container struct {
 	// Hash of the container, used for comparison. Optional for containers
 	// not managed by kubelet.
 	Hash uint64
+	// Hash of the container over fields with Resources field zero'd out.
+	// NOTE: This is needed during alpha and beta so that containers using Resources are
+	// not unexpectedly restarted when InPlacePodVerticalScaling feature-gate is toggled.
+	//TODO(vinaykul,InPlacePodVerticalScaling): Remove this in GA+1 and make HashWithoutResources to become Hash.
+	HashWithoutResources uint64
 	// State is the state of the container.
 	State State
 }
@@ -319,6 +325,18 @@ type PodStatus struct {
 	TimeStamp time.Time
 }
 
+// ContainerResources represents the Resources allocated to the running container.
+type ContainerResources struct {
+	// CPU capacity reserved for the container (cpu.shares)
+	CPURequest *resource.Quantity
+	// CPU limit enforced on the container (cpu.cfs_quota_us)
+	CPULimit *resource.Quantity
+	// Memory capacity reserved for the container
+	MemoryRequest *resource.Quantity
+	// Memory limit enforced on the container (memory.limit_in_bytes)
+	MemoryLimit *resource.Quantity
+}
+
 // Status represents the status of a container.
 type Status struct {
 	// ID of the container.
@@ -342,6 +360,8 @@ type Status struct {
 	ImageID string
 	// Hash of the container, used for comparison.
 	Hash uint64
+	// Hash of the container over fields with Resources field zero'd out.
+	HashWithoutResources uint64
 	// Number of times that the container has been restarted.
 	RestartCount int
 	// A string explains why container is in such a status.
@@ -349,6 +369,8 @@ type Status struct {
 	// Message written by the container before exiting (stored in
 	// TerminationMessagePath).
 	Message string
+	// CPU and memory resources for this container
+	Resources *ContainerResources
 }
 
 // FindContainerStatusByName returns container status in the pod status with the given name.
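The point of carrying both Hash and HashWithoutResources in Container and Status above is that an in-place resize changes the full container hash but must not, by itself, force a restart, and toggling the InPlacePodVerticalScaling gate must not restart containers either. The following standalone sketch illustrates that property only; it is not the kubelet code. The names hashJSON and hashWithoutResources are invented for the example, and hashing raw JSON with FNV-32a directly yields different numeric values than the kubelet's hashutil-based HashContainer, but the "deep-copy, zero out Resources, hash the rest" technique matches the helper in the diff.

package main

import (
	"encoding/json"
	"fmt"
	"hash/fnv"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// hashJSON hashes the JSON encoding of v with FNV-32a, mirroring the
// "marshal, then hash" approach used by HashContainerWithoutResources.
func hashJSON(v interface{}) uint64 {
	h := fnv.New32a()
	b, _ := json.Marshal(v)
	h.Write(b)
	return uint64(h.Sum32())
}

// hashWithoutResources zeroes the Resources field on a copy before hashing,
// so an in-place resize does not change the resulting hash.
func hashWithoutResources(c *v1.Container) uint64 {
	cp := c.DeepCopy()
	cp.Resources = v1.ResourceRequirements{}
	return hashJSON(cp)
}

func main() {
	c := &v1.Container{
		Name:  "app",
		Image: "img",
		Resources: v1.ResourceRequirements{
			Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("100m")},
		},
	}
	before, beforeNoRes := hashJSON(c), hashWithoutResources(c)

	// Simulate an in-place resize: bump the CPU request.
	c.Resources.Requests[v1.ResourceCPU] = resource.MustParse("200m")
	after, afterNoRes := hashJSON(c), hashWithoutResources(c)

	fmt.Println(before != after)           // true: the full hash changes on resize
	fmt.Println(beforeNoRes == afterNoRes) // true: the Resources-agnostic hash stays stable
}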
diff --git a/pkg/kubelet/eviction/helpers.go b/pkg/kubelet/eviction/helpers.go index be451070602..122d061dbc6 100644 --- a/pkg/kubelet/eviction/helpers.go +++ b/pkg/kubelet/eviction/helpers.go @@ -25,10 +25,13 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" + utilfeature "k8s.io/apiserver/pkg/util/feature" corev1helpers "k8s.io/component-helpers/scheduling/corev1" "k8s.io/klog/v2" statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1" + podutil "k8s.io/kubernetes/pkg/api/v1/pod" v1resource "k8s.io/kubernetes/pkg/api/v1/resource" + "k8s.io/kubernetes/pkg/features" evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" volumeutils "k8s.io/kubernetes/pkg/volume/util" @@ -1018,6 +1021,12 @@ func evictionMessage(resourceToReclaim v1.ResourceName, pod *v1.Pod, stats stats for _, container := range pod.Spec.Containers { if container.Name == containerStats.Name { requests := container.Resources.Requests[resourceToReclaim] + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) && + (resourceToReclaim == v1.ResourceMemory || resourceToReclaim == v1.ResourceCPU) { + if cs, ok := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok { + requests = cs.ResourcesAllocated[resourceToReclaim] + } + } var usage *resource.Quantity switch resourceToReclaim { case v1.ResourceEphemeralStorage: diff --git a/pkg/kubelet/eviction/helpers_test.go b/pkg/kubelet/eviction/helpers_test.go index bf4b0d9434c..2efb9b9d91c 100644 --- a/pkg/kubelet/eviction/helpers_test.go +++ b/pkg/kubelet/eviction/helpers_test.go @@ -21,6 +21,7 @@ import ( "fmt" "reflect" "sort" + "strings" "testing" "time" @@ -2121,3 +2122,51 @@ func (s1 thresholdList) Equal(s2 thresholdList) bool { } return true } + +func TestEvictonMessageWithResourceResize(t *testing.T) { + testpod := newPod("testpod", 1, []v1.Container{ + newContainer("testcontainer", newResourceList("", "200Mi", ""), newResourceList("", "", "")), + }, nil) + testpod.Status = v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + { + Name: "testcontainer", + ResourcesAllocated: newResourceList("", "100Mi", ""), + }, + }, + } + testpodMemory := resource.MustParse("150Mi") + testpodStats := newPodMemoryStats(testpod, testpodMemory) + testpodMemoryBytes := uint64(testpodMemory.Value()) + testpodStats.Containers = []statsapi.ContainerStats{ + { + Name: "testcontainer", + Memory: &statsapi.MemoryStats{ + WorkingSetBytes: &testpodMemoryBytes, + }, + }, + } + stats := map[*v1.Pod]statsapi.PodStats{ + testpod: testpodStats, + } + statsFn := func(pod *v1.Pod) (statsapi.PodStats, bool) { + result, found := stats[pod] + return result, found + } + + for _, enabled := range []bool{true, false} { + t.Run(fmt.Sprintf("InPlacePodVerticalScaling enabled=%v", enabled), func(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.InPlacePodVerticalScaling, enabled)() + msg, _ := evictionMessage(v1.ResourceMemory, testpod, statsFn) + if enabled { + if !strings.Contains(msg, "testcontainer was using 150Mi, which exceeds its request of 100Mi") { + t.Errorf("Expected 'exceeds memory' eviction message was not found.") + } + } else { + if strings.Contains(msg, "which exceeds its request") { + t.Errorf("Found 'exceeds memory' eviction message which was not expected.") + } + } + }) + } +} diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index fd1d8a18c82..31c51c0d3ec 100644 --- a/pkg/kubelet/kubelet.go +++ 
b/pkg/kubelet/kubelet.go @@ -48,6 +48,7 @@ import ( "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/diff" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" @@ -66,6 +67,8 @@ import ( "k8s.io/klog/v2" pluginwatcherapi "k8s.io/kubelet/pkg/apis/pluginregistration/v1" statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1" + podutil "k8s.io/kubernetes/pkg/api/v1/pod" + "k8s.io/kubernetes/pkg/api/v1/resource" "k8s.io/kubernetes/pkg/features" kubeletconfiginternal "k8s.io/kubernetes/pkg/kubelet/apis/config" "k8s.io/kubernetes/pkg/kubelet/apis/podresources" @@ -608,7 +611,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, mirrorPodClient := kubepod.NewBasicMirrorClient(klet.kubeClient, string(nodeName), nodeLister) klet.podManager = kubepod.NewBasicPodManager(mirrorPodClient) - klet.statusManager = status.NewManager(klet.kubeClient, klet.podManager, klet, kubeDeps.PodStartupLatencyTracker) + klet.statusManager = status.NewManager(klet.kubeClient, klet.podManager, klet, kubeDeps.PodStartupLatencyTracker, klet.getRootDir()) klet.resourceAnalyzer = serverstats.NewResourceAnalyzer(klet, kubeCfg.VolumeStatsAggPeriod.Duration, kubeDeps.Recorder) @@ -665,7 +668,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, kubeCfg.CPUCFSQuotaPeriod, kubeDeps.RemoteRuntimeService, kubeDeps.RemoteImageService, - kubeDeps.ContainerManager.InternalContainerLifecycle(), + kubeDeps.ContainerManager, klet.containerLogManager, klet.runtimeClassManager, seccompDefault, @@ -1247,6 +1250,9 @@ type Kubelet struct { // Manage user namespaces usernsManager *usernsManager + + // Mutex to serialize new pod admission and existing pod resizing + podResizeMutex sync.Mutex } // ListPodStats is delegated to StatsProvider, which implements stats.Provider interface @@ -1826,6 +1832,16 @@ func (kl *Kubelet) syncPod(_ context.Context, updateType kubetypes.SyncPodType, // Ensure the pod is being probed kl.probeManager.AddPod(pod) + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + // Handle pod resize here instead of doing it in HandlePodUpdates because + // this conveniently retries any Deferred resize requests + // TODO(vinaykul,InPlacePodVerticalScaling): Investigate doing this in HandlePodUpdates + periodic SyncLoop scan + // See: https://github.com/kubernetes/kubernetes/pull/102884#discussion_r663160060 + if kl.podWorkers.CouldHaveRunningContainers(pod.UID) && !kubetypes.IsStaticPod(pod) { + kl.handlePodResourcesResize(pod) + } + } + // Call the container runtime's SyncPod callback result := kl.containerRuntime.SyncPod(ctx, pod, podStatus, pullSecrets, kl.backOff) kl.reasonCache.Update(pod.UID, result) @@ -1842,6 +1858,15 @@ func (kl *Kubelet) syncPod(_ context.Context, updateType kubetypes.SyncPodType, return false, nil } + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) && isPodResizeInProgress(pod, &apiPodStatus) { + // While resize is in progress, periodically call PLEG to update pod cache + runningPod := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus) + if err := kl.pleg.UpdateCache(&runningPod, pod.UID); err != nil { + klog.ErrorS(err, "Failed to update pod cache", "pod", klog.KObj(pod)) + return false, err + } + } + return false, nil } @@ -2078,6 +2103,23 @@ func (kl *Kubelet) canAdmitPod(pods []*v1.Pod, pod *v1.Pod) (bool, string, strin 
// TODO: move out of disk check into a pod admitter // TODO: out of resource eviction should have a pod admitter call-out attrs := &lifecycle.PodAdmitAttributes{Pod: pod, OtherPods: pods} + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + // Use allocated resources values from checkpoint store (source of truth) to determine fit + otherPods := make([]*v1.Pod, 0, len(pods)) + checkpointState := kl.statusManager.State() + for _, p := range pods { + op := p.DeepCopy() + for _, c := range op.Spec.Containers { + resourcesAllocated, found := checkpointState.GetContainerResourceAllocation(string(p.UID), c.Name) + if c.Resources.Requests != nil && found { + c.Resources.Requests[v1.ResourceCPU] = resourcesAllocated[v1.ResourceCPU] + c.Resources.Requests[v1.ResourceMemory] = resourcesAllocated[v1.ResourceMemory] + } + } + otherPods = append(otherPods, op) + } + attrs.OtherPods = otherPods + } for _, podAdmitHandler := range kl.admitHandlers { if result := podAdmitHandler.Admit(attrs); !result.Admit { return false, result.Reason, result.Message @@ -2332,6 +2374,10 @@ func (kl *Kubelet) handleMirrorPod(mirrorPod *v1.Pod, start time.Time) { func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) { start := kl.clock.Now() sort.Sort(sliceutils.PodsByCreationTime(pods)) + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + kl.podResizeMutex.Lock() + defer kl.podResizeMutex.Unlock() + } for _, pod := range pods { existingPods := kl.podManager.GetPods() // Always add the pod to the pod manager. Kubelet relies on the pod @@ -2356,10 +2402,36 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) { // pods that are alive. activePods := kl.filterOutInactivePods(existingPods) - // Check if we can admit the pod; if not, reject it. - if ok, reason, message := kl.canAdmitPod(activePods, pod); !ok { - kl.rejectPod(pod, reason, message) - continue + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + // To handle kubelet restarts, test pod admissibility using ResourcesAllocated values + // (for cpu & memory) from checkpoint store. If found, that is the source of truth. + checkpointState := kl.statusManager.State() + podCopy := pod.DeepCopy() + for _, c := range podCopy.Spec.Containers { + resourcesAllocated, found := checkpointState.GetContainerResourceAllocation(string(pod.UID), c.Name) + if c.Resources.Requests != nil && found { + c.Resources.Requests[v1.ResourceCPU] = resourcesAllocated[v1.ResourceCPU] + c.Resources.Requests[v1.ResourceMemory] = resourcesAllocated[v1.ResourceMemory] + } + } + + // Check if we can admit the pod; if not, reject it. + if ok, reason, message := kl.canAdmitPod(activePods, podCopy); !ok { + kl.rejectPod(pod, reason, message) + continue + } + + // For new pod, checkpoint the resource values at which the Pod has been admitted + if err := kl.statusManager.SetPodAllocation(podCopy); err != nil { + //TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate + klog.ErrorS(err, "SetPodAllocation failed", "pod", klog.KObj(pod)) + } + } else { + // Check if we can admit the pod; if not, reject it. 
+ if ok, reason, message := kl.canAdmitPod(activePods, pod); !ok { + kl.rejectPod(pod, reason, message) + continue + } } } mirrorPod, _ := kl.podManager.GetMirrorPodByPod(pod) @@ -2434,6 +2506,116 @@ func (kl *Kubelet) HandlePodSyncs(pods []*v1.Pod) { } } +func isPodResizeInProgress(pod *v1.Pod, podStatus *v1.PodStatus) bool { + for _, c := range pod.Spec.Containers { + if cs, ok := podutil.GetContainerStatus(podStatus.ContainerStatuses, c.Name); ok { + if cs.Resources == nil { + continue + } + if diff.ObjectDiff(c.Resources.Limits, cs.Resources.Limits) != "" || + diff.ObjectDiff(cs.ResourcesAllocated, cs.Resources.Requests) != "" { + return true + } + } + } + return false +} + +func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, *v1.Pod, v1.PodResizeStatus) { + var otherActivePods []*v1.Pod + + node, err := kl.getNodeAnyWay() + if err != nil { + klog.ErrorS(err, "getNodeAnyway function failed") + return false, nil, "" + } + cpuAvailable := node.Status.Allocatable.Cpu().MilliValue() + memAvailable := node.Status.Allocatable.Memory().Value() + cpuRequests := resource.GetResourceRequest(pod, v1.ResourceCPU) + memRequests := resource.GetResourceRequest(pod, v1.ResourceMemory) + if cpuRequests > cpuAvailable || memRequests > memAvailable { + klog.V(3).InfoS("Resize is not feasible as request exceeds allocatable node resources", "Pod", pod.Name) + return false, nil, v1.PodResizeStatusInfeasible + } + + // Treat the existing pod needing resize as a new pod with desired resources seeking admit. + // If desired resources don't fit, pod continues to run with currently allocated resources. + activePods := kl.GetActivePods() + for _, p := range activePods { + if p.UID != pod.UID { + otherActivePods = append(otherActivePods, p) + } + } + + if ok, failReason, failMessage := kl.canAdmitPod(otherActivePods, pod); !ok { + // Log reason and return. 
Let the next sync iteration retry the resize + klog.V(3).InfoS("Resize cannot be accommodated", "Pod", pod.Name, "Reason", failReason, "Message", failMessage) + return false, nil, v1.PodResizeStatusDeferred + } + + podCopy := pod.DeepCopy() + for _, container := range podCopy.Spec.Containers { + idx, found := podutil.GetIndexOfContainerStatus(podCopy.Status.ContainerStatuses, container.Name) + if found { + for rName, rQuantity := range container.Resources.Requests { + podCopy.Status.ContainerStatuses[idx].ResourcesAllocated[rName] = rQuantity + } + } + } + return true, podCopy, v1.PodResizeStatusInProgress +} + +func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod) { + if pod.Status.Phase != v1.PodRunning { + return + } + podResized := false + for _, container := range pod.Spec.Containers { + if len(container.Resources.Requests) == 0 { + continue + } + containerStatus, found := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name) + if !found { + klog.V(5).InfoS("ContainerStatus not found", "pod", pod.Name, "container", container.Name) + break + } + if len(containerStatus.ResourcesAllocated) != len(container.Resources.Requests) { + klog.V(5).InfoS("ContainerStatus.ResourcesAllocated length mismatch", "pod", pod.Name, "container", container.Name) + break + } + if len(diff.ObjectDiff(container.Resources.Requests, containerStatus.ResourcesAllocated)) > 0 { + podResized = true + break + } + } + if !podResized { + return + } + + kl.podResizeMutex.Lock() + defer kl.podResizeMutex.Unlock() + fit, updatedPod, resizeStatus := kl.canResizePod(pod) + if fit { + // Update pod resource allocation checkpoint + if err := kl.statusManager.SetPodAllocation(updatedPod); err != nil { + //TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate + klog.ErrorS(err, "SetPodAllocation failed", "pod", klog.KObj(pod)) + } + *pod = *updatedPod + } + if resizeStatus != "" { + // Save resize decision to checkpoint + if err := kl.statusManager.SetPodResizeStatus(pod.UID, resizeStatus); err != nil { + //TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate + klog.ErrorS(err, "SetPodResizeStatus failed", "pod", klog.KObj(pod)) + } + pod.Status.Resize = resizeStatus + } + kl.podManager.UpdatePod(pod) + kl.statusManager.SetPodStatus(pod, pod.Status) + return +} + // LatestLoopEntryTime returns the last time in the sync loop monitor. 
func (kl *Kubelet) LatestLoopEntryTime() time.Time { val := kl.syncLoopMonitor.Load() diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go index 2a4272d2692..5ee3b9f4bec 100644 --- a/pkg/kubelet/kubelet_pods.go +++ b/pkg/kubelet/kubelet_pods.go @@ -34,6 +34,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/diff" "k8s.io/apimachinery/pkg/util/sets" utilvalidation "k8s.io/apimachinery/pkg/util/validation" utilfeature "k8s.io/apiserver/pkg/util/feature" @@ -1454,6 +1455,31 @@ func getPhase(spec *v1.PodSpec, info []v1.ContainerStatus) v1.PodPhase { } } +func (kl *Kubelet) determinePodResizeStatus(pod *v1.Pod, podStatus *v1.PodStatus) v1.PodResizeStatus { + var podResizeStatus v1.PodResizeStatus + specStatusDiffer := false + for _, c := range pod.Spec.Containers { + if cs, ok := podutil.GetContainerStatus(podStatus.ContainerStatuses, c.Name); ok { + if cs.Resources != nil && diff.ObjectDiff(c.Resources, *cs.Resources) != "" { + specStatusDiffer = true + break + } + } + } + if !specStatusDiffer { + // Clear last resize state from checkpoint + if err := kl.statusManager.SetPodResizeStatus(pod.UID, ""); err != nil { + klog.ErrorS(err, "SetPodResizeStatus failed", "pod", pod.Name) + } + } else { + checkpointState := kl.statusManager.State() + if resizeStatus, found := checkpointState.GetPodResizeStatus(string(pod.UID)); found { + podResizeStatus = resizeStatus + } + } + return podResizeStatus +} + // generateAPIPodStatus creates the final API pod status for a pod, given the // internal pod status. This method should only be called from within sync*Pod methods. func (kl *Kubelet) generateAPIPodStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus) v1.PodStatus { @@ -1464,6 +1490,9 @@ func (kl *Kubelet) generateAPIPodStatus(pod *v1.Pod, podStatus *kubecontainer.Po oldPodStatus = pod.Status } s := kl.convertStatusToAPIStatus(pod, podStatus, oldPodStatus) + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + s.Resize = kl.determinePodResizeStatus(pod, s) + } // calculate the next phase and preserve reason allStatus := append(append([]v1.ContainerStatus{}, s.ContainerStatuses...), s.InitContainerStatuses...) 
 	s.Phase = getPhase(&pod.Spec, allStatus)
@@ -1715,6 +1744,84 @@ func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecon
 		return status
 	}
 
+	convertContainerStatusResources := func(cName string, status *v1.ContainerStatus, cStatus *kubecontainer.Status, oldStatuses map[string]v1.ContainerStatus) *v1.ResourceRequirements {
+		var requests, limits v1.ResourceList
+		// oldStatus should always exist if container is running
+		oldStatus, oldStatusFound := oldStatuses[cName]
+		// Initialize limits/requests from container's spec upon transition to Running state
+		// For cpu & memory, values queried from runtime via CRI always supersede spec values
+		// For ephemeral-storage, a running container's status.limit/request equals spec.limit/request
+		determineResource := func(rName v1.ResourceName, v1ContainerResource, oldStatusResource, resource v1.ResourceList) {
+			if oldStatusFound {
+				if oldStatus.State.Running == nil || status.ContainerID != oldStatus.ContainerID {
+					if r, exists := v1ContainerResource[rName]; exists {
+						resource[rName] = r.DeepCopy()
+					}
+				} else {
+					if oldStatusResource != nil {
+						if r, exists := oldStatusResource[rName]; exists {
+							resource[rName] = r.DeepCopy()
+						}
+					}
+				}
+			}
+		}
+		container := kubecontainer.GetContainerSpec(pod, cName)
+		// ResourcesAllocated values come from checkpoint. It is the source-of-truth.
+		found := false
+		checkpointState := kl.statusManager.State()
+		status.ResourcesAllocated, found = checkpointState.GetContainerResourceAllocation(string(pod.UID), cName)
+		if !(container.Resources.Requests == nil && container.Resources.Limits == nil) && !found {
+			// Log error and fallback to ResourcesAllocated in oldStatus if it exists
+			klog.ErrorS(nil, "resource allocation not found in checkpoint store", "pod", pod.Name, "container", cName)
+			if oldStatusFound {
+				status.ResourcesAllocated = oldStatus.ResourcesAllocated
+			}
+		}
+		if oldStatus.Resources == nil {
+			oldStatus.Resources = &v1.ResourceRequirements{}
+		}
+		// Convert Limits
+		if container.Resources.Limits != nil {
+			limits = make(v1.ResourceList)
+			if cStatus.Resources != nil && cStatus.Resources.CPULimit != nil {
+				limits[v1.ResourceCPU] = cStatus.Resources.CPULimit.DeepCopy()
+			} else {
+				determineResource(v1.ResourceCPU, container.Resources.Limits, oldStatus.Resources.Limits, limits)
+			}
+			if cStatus.Resources != nil && cStatus.Resources.MemoryLimit != nil {
+				limits[v1.ResourceMemory] = cStatus.Resources.MemoryLimit.DeepCopy()
+			} else {
+				determineResource(v1.ResourceMemory, container.Resources.Limits, oldStatus.Resources.Limits, limits)
+			}
+			if ephemeralStorage, found := container.Resources.Limits[v1.ResourceEphemeralStorage]; found {
+				limits[v1.ResourceEphemeralStorage] = ephemeralStorage.DeepCopy()
+			}
+		}
+		// Convert Requests
+		if status.ResourcesAllocated != nil {
+			requests = make(v1.ResourceList)
+			if cStatus.Resources != nil && cStatus.Resources.CPURequest != nil {
+				requests[v1.ResourceCPU] = cStatus.Resources.CPURequest.DeepCopy()
+			} else {
+				determineResource(v1.ResourceCPU, status.ResourcesAllocated, oldStatus.Resources.Requests, requests)
+			}
+			if memory, found := status.ResourcesAllocated[v1.ResourceMemory]; found {
+				requests[v1.ResourceMemory] = memory.DeepCopy()
+			}
+			if ephemeralStorage, found := status.ResourcesAllocated[v1.ResourceEphemeralStorage]; found {
+				requests[v1.ResourceEphemeralStorage] = ephemeralStorage.DeepCopy()
+			}
+		}
+		//TODO(vinaykul,derekwaynecarr,InPlacePodVerticalScaling): Update this to include extended resources in
+		// addition
to CPU, memory, ephemeral storage. Add test case for extended resources. + resources := &v1.ResourceRequirements{ + Limits: limits, + Requests: requests, + } + return resources + } + // Fetch old containers statuses from old pod status. oldStatuses := make(map[string]v1.ContainerStatus, len(containers)) for _, status := range previousStatus { @@ -1835,6 +1942,11 @@ func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecon oldStatusPtr = &oldStatus } status := convertContainerStatus(cStatus, oldStatusPtr) + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + if status.State.Running != nil { + status.Resources = convertContainerStatusResources(cName, status, cStatus, oldStatuses) + } + } if containerSeen[cName] == 0 { statuses[cName] = status } else { diff --git a/pkg/kubelet/kubelet_pods_test.go b/pkg/kubelet/kubelet_pods_test.go index edb4f00f8f1..5752737fc94 100644 --- a/pkg/kubelet/kubelet_pods_test.go +++ b/pkg/kubelet/kubelet_pods_test.go @@ -33,6 +33,7 @@ import ( v1 "k8s.io/api/core/v1" apiequality "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" @@ -56,6 +57,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/cri/streaming/portforward" "k8s.io/kubernetes/pkg/kubelet/cri/streaming/remotecommand" "k8s.io/kubernetes/pkg/kubelet/prober/results" + "k8s.io/kubernetes/pkg/kubelet/status" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" ) @@ -3861,3 +3863,219 @@ func TestConvertToAPIContainerStatusesDataRace(t *testing.T) { }() } } + +func TestConvertToAPIContainerStatusesForResources(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.InPlacePodVerticalScaling, true)() + nowTime := time.Now() + testContainerName := "ctr0" + testContainerID := kubecontainer.ContainerID{Type: "test", ID: testContainerName} + testContainer := v1.Container{ + Name: testContainerName, + Image: "img", + } + testContainerStatus := v1.ContainerStatus{ + Name: testContainerName, + } + testPod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "123456", + Name: "foo", + Namespace: "bar", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{testContainer}, + }, + Status: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{testContainerStatus}, + }, + } + testKubeContainerStatus := kubecontainer.Status{ + Name: testContainerName, + ID: testContainerID, + Image: "img", + ImageID: "img1234", + State: kubecontainer.ContainerStateRunning, + StartedAt: nowTime, + } + testPodStatus := &kubecontainer.PodStatus{ + ID: testPod.UID, + Name: testPod.Name, + Namespace: testPod.Namespace, + ContainerStatuses: []*kubecontainer.Status{&testKubeContainerStatus}, + } + CPU1AndMem1G := v1.ResourceList{v1.ResourceCPU: resource.MustParse("1"), v1.ResourceMemory: resource.MustParse("1Gi")} + CPU2AndMem2G := v1.ResourceList{v1.ResourceCPU: resource.MustParse("2"), v1.ResourceMemory: resource.MustParse("2Gi")} + CPU1AndMem1GAndStorage2G := CPU1AndMem1G.DeepCopy() + CPU1AndMem1GAndStorage2G[v1.ResourceEphemeralStorage] = resource.MustParse("2Gi") + CPU2AndMem2GAndStorage2G := CPU2AndMem2G.DeepCopy() + CPU2AndMem2GAndStorage2G[v1.ResourceEphemeralStorage] = resource.MustParse("2Gi") + + testKubelet := newTestKubelet(t, false) + defer testKubelet.Cleanup() + kubelet := testKubelet.kubelet + kubelet.statusManager = status.NewFakeManager() + + idx 
:= 0 + for tdesc, tc := range map[string]struct { + Resources []v1.ResourceRequirements + OldStatus []v1.ContainerStatus + Expected []v1.ContainerStatus + }{ + "GuaranteedQoSPod with CPU and memory CRI status": { + Resources: []v1.ResourceRequirements{{Limits: CPU1AndMem1G, Requests: CPU1AndMem1G}}, + OldStatus: []v1.ContainerStatus{ + { + Name: testContainerName, + Image: "img", + ImageID: "img1234", + State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}, + Resources: &v1.ResourceRequirements{Limits: CPU1AndMem1G, Requests: CPU1AndMem1G}, + }, + }, + Expected: []v1.ContainerStatus{ + { + Name: testContainerName, + ContainerID: testContainerID.String(), + Image: "img", + ImageID: "img1234", + State: v1.ContainerState{Running: &v1.ContainerStateRunning{StartedAt: metav1.NewTime(nowTime)}}, + ResourcesAllocated: CPU1AndMem1G, + Resources: &v1.ResourceRequirements{Limits: CPU1AndMem1G, Requests: CPU1AndMem1G}, + }, + }, + }, + "BurstableQoSPod with CPU and memory CRI status": { + Resources: []v1.ResourceRequirements{{Limits: CPU1AndMem1G, Requests: CPU1AndMem1G}}, + OldStatus: []v1.ContainerStatus{ + { + Name: testContainerName, + Image: "img", + ImageID: "img1234", + State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}, + Resources: &v1.ResourceRequirements{Limits: CPU2AndMem2G, Requests: CPU1AndMem1G}, + }, + }, + Expected: []v1.ContainerStatus{ + { + Name: testContainerName, + ContainerID: testContainerID.String(), + Image: "img", + ImageID: "img1234", + State: v1.ContainerState{Running: &v1.ContainerStateRunning{StartedAt: metav1.NewTime(nowTime)}}, + ResourcesAllocated: CPU1AndMem1G, + Resources: &v1.ResourceRequirements{Limits: CPU1AndMem1G, Requests: CPU1AndMem1G}, + }, + }, + }, + "GuaranteedQoSPod with CPU and memory CRI status, with ephemeral storage": { + Resources: []v1.ResourceRequirements{{Limits: CPU1AndMem1GAndStorage2G, Requests: CPU1AndMem1GAndStorage2G}}, + OldStatus: []v1.ContainerStatus{ + { + Name: testContainerName, + Image: "img", + ImageID: "img1234", + State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}, + Resources: &v1.ResourceRequirements{Limits: CPU1AndMem1G, Requests: CPU1AndMem1G}, + }, + }, + Expected: []v1.ContainerStatus{ + { + Name: testContainerName, + ContainerID: testContainerID.String(), + Image: "img", + ImageID: "img1234", + State: v1.ContainerState{Running: &v1.ContainerStateRunning{StartedAt: metav1.NewTime(nowTime)}}, + ResourcesAllocated: CPU1AndMem1GAndStorage2G, + Resources: &v1.ResourceRequirements{Limits: CPU1AndMem1GAndStorage2G, Requests: CPU1AndMem1GAndStorage2G}, + }, + }, + }, + "BurstableQoSPod with CPU and memory CRI status, with ephemeral storage": { + Resources: []v1.ResourceRequirements{{Limits: CPU1AndMem1GAndStorage2G, Requests: CPU1AndMem1GAndStorage2G}}, + OldStatus: []v1.ContainerStatus{ + { + Name: testContainerName, + Image: "img", + ImageID: "img1234", + State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}, + Resources: &v1.ResourceRequirements{Limits: CPU2AndMem2GAndStorage2G, Requests: CPU2AndMem2GAndStorage2G}, + }, + }, + Expected: []v1.ContainerStatus{ + { + Name: testContainerName, + ContainerID: testContainerID.String(), + Image: "img", + ImageID: "img1234", + State: v1.ContainerState{Running: &v1.ContainerStateRunning{StartedAt: metav1.NewTime(nowTime)}}, + ResourcesAllocated: CPU1AndMem1GAndStorage2G, + Resources: &v1.ResourceRequirements{Limits: CPU1AndMem1GAndStorage2G, Requests: CPU1AndMem1GAndStorage2G}, + }, + }, + }, + "BurstableQoSPod with CPU and memory CRI status, 
with ephemeral storage, nil resources in OldStatus": { + Resources: []v1.ResourceRequirements{{Limits: CPU1AndMem1GAndStorage2G, Requests: CPU1AndMem1GAndStorage2G}}, + OldStatus: []v1.ContainerStatus{ + { + Name: testContainerName, + Image: "img", + ImageID: "img1234", + State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}, + }, + }, + Expected: []v1.ContainerStatus{ + { + Name: testContainerName, + ContainerID: testContainerID.String(), + Image: "img", + ImageID: "img1234", + State: v1.ContainerState{Running: &v1.ContainerStateRunning{StartedAt: metav1.NewTime(nowTime)}}, + ResourcesAllocated: CPU1AndMem1GAndStorage2G, + Resources: &v1.ResourceRequirements{Limits: CPU1AndMem1GAndStorage2G, Requests: CPU1AndMem1GAndStorage2G}, + }, + }, + }, + "BestEffortQoSPod": { + OldStatus: []v1.ContainerStatus{ + { + Name: testContainerName, + Image: "img", + ImageID: "img1234", + State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}, + Resources: &v1.ResourceRequirements{}, + }, + }, + Expected: []v1.ContainerStatus{ + { + Name: testContainerName, + ContainerID: testContainerID.String(), + Image: "img", + ImageID: "img1234", + State: v1.ContainerState{Running: &v1.ContainerStateRunning{StartedAt: metav1.NewTime(nowTime)}}, + Resources: &v1.ResourceRequirements{}, + }, + }, + }, + } { + tPod := testPod.DeepCopy() + tPod.Name = fmt.Sprintf("%s-%d", testPod.Name, idx) + for i := range tPod.Spec.Containers { + if tc.Resources != nil { + tPod.Spec.Containers[i].Resources = tc.Resources[i] + } + kubelet.statusManager.SetPodAllocation(tPod) + if tc.Resources != nil { + tPod.Status.ContainerStatuses[i].ResourcesAllocated = tc.Resources[i].Requests + testPodStatus.ContainerStatuses[i].Resources = &kubecontainer.ContainerResources{ + MemoryLimit: tc.Resources[i].Limits.Memory(), + CPULimit: tc.Resources[i].Limits.Cpu(), + CPURequest: tc.Resources[i].Requests.Cpu(), + } + } + } + + t.Logf("TestCase: %q", tdesc) + cStatuses := kubelet.convertToAPIContainerStatuses(tPod, testPodStatus, tc.OldStatus, tPod.Spec.Containers, false, false) + assert.Equal(t, tc.Expected, cStatuses) + } +} diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index 67549bd1120..5b06f84104c 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -48,12 +48,15 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" + utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/client-go/kubernetes/fake" "k8s.io/client-go/tools/record" "k8s.io/client-go/util/flowcontrol" + featuregatetesting "k8s.io/component-base/featuregate/testing" internalapi "k8s.io/cri-api/pkg/apis" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" "k8s.io/klog/v2/ktesting" + "k8s.io/kubernetes/pkg/features" kubeletconfiginternal "k8s.io/kubernetes/pkg/kubelet/apis/config" cadvisortest "k8s.io/kubernetes/pkg/kubelet/cadvisor/testing" "k8s.io/kubernetes/pkg/kubelet/cm" @@ -260,7 +263,7 @@ func newTestKubeletWithImageList( kubelet.configMapManager = configMapManager kubelet.podManager = kubepod.NewBasicPodManager(fakeMirrorClient) podStartupLatencyTracker := kubeletutil.NewPodStartupLatencyTracker() - kubelet.statusManager = status.NewManager(fakeKubeClient, kubelet.podManager, &statustest.FakePodDeletionSafetyProvider{}, podStartupLatencyTracker) + kubelet.statusManager = status.NewManager(fakeKubeClient, kubelet.podManager, &statustest.FakePodDeletionSafetyProvider{}, podStartupLatencyTracker, kubelet.getRootDir()) kubelet.containerRuntime = fakeRuntime 
kubelet.runtimeCache = containertest.NewFakeRuntimeCache(kubelet.containerRuntime) @@ -2436,6 +2439,162 @@ func TestHandlePodAdditionsInvokesPodAdmitHandlers(t *testing.T) { checkPodStatus(t, kl, podToAdmit, v1.PodPending) } +func TestHandlePodResourcesResize(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.InPlacePodVerticalScaling, true)() + testKubelet := newTestKubelet(t, false) + defer testKubelet.Cleanup() + kubelet := testKubelet.kubelet + kubelet.statusManager = status.NewFakeManager() + + cpu500m := resource.MustParse("500m") + cpu1000m := resource.MustParse("1") + cpu1500m := resource.MustParse("1500m") + cpu2500m := resource.MustParse("2500m") + cpu5000m := resource.MustParse("5000m") + mem500M := resource.MustParse("500Mi") + mem1000M := resource.MustParse("1Gi") + mem1500M := resource.MustParse("1500Mi") + mem2500M := resource.MustParse("2500Mi") + mem4500M := resource.MustParse("4500Mi") + + nodes := []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{Name: testKubeletHostname}, + Status: v1.NodeStatus{ + Capacity: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("8"), + v1.ResourceMemory: resource.MustParse("8Gi"), + }, + Allocatable: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("4"), + v1.ResourceMemory: resource.MustParse("4Gi"), + v1.ResourcePods: *resource.NewQuantity(40, resource.DecimalSI), + }, + }, + }, + } + kubelet.nodeLister = testNodeLister{nodes: nodes} + + testPod1 := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "1111", + Name: "pod1", + Namespace: "ns1", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "c1", + Image: "i1", + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M}, + }, + }, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + { + Name: "c1", + ResourcesAllocated: v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M}, + Resources: &v1.ResourceRequirements{}, + }, + }, + }, + } + testPod2 := testPod1.DeepCopy() + testPod2.UID = "2222" + testPod2.Name = "pod2" + testPod2.Namespace = "ns2" + testPod3 := testPod1.DeepCopy() + testPod3.UID = "3333" + testPod3.Name = "pod3" + testPod3.Namespace = "ns2" + + testKubelet.fakeKubeClient = fake.NewSimpleClientset(testPod1, testPod2, testPod3) + kubelet.kubeClient = testKubelet.fakeKubeClient + defer testKubelet.fakeKubeClient.ClearActions() + kubelet.podManager.AddPod(testPod1) + kubelet.podManager.AddPod(testPod2) + kubelet.podManager.AddPod(testPod3) + kubelet.podWorkers.(*fakePodWorkers).running = map[types.UID]bool{ + testPod1.UID: true, + testPod2.UID: true, + testPod3.UID: true, + } + defer kubelet.podManager.DeletePod(testPod3) + defer kubelet.podManager.DeletePod(testPod2) + defer kubelet.podManager.DeletePod(testPod1) + + tests := []struct { + name string + pod *v1.Pod + newRequests v1.ResourceList + expectedAllocations v1.ResourceList + expectedResize v1.PodResizeStatus + }{ + { + name: "Request CPU and memory decrease - expect InProgress", + pod: testPod2, + newRequests: v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem500M}, + expectedAllocations: v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem500M}, + expectedResize: v1.PodResizeStatusInProgress, + }, + { + name: "Request CPU increase, memory decrease - expect InProgress", + pod: testPod2, + newRequests: v1.ResourceList{v1.ResourceCPU: cpu1500m, v1.ResourceMemory: mem500M}, + 
expectedAllocations: v1.ResourceList{v1.ResourceCPU: cpu1500m, v1.ResourceMemory: mem500M}, + expectedResize: v1.PodResizeStatusInProgress, + }, + { + name: "Request CPU decrease, memory increase - expect InProgress", + pod: testPod2, + newRequests: v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem1500M}, + expectedAllocations: v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem1500M}, + expectedResize: v1.PodResizeStatusInProgress, + }, + { + name: "Request CPU and memory increase beyond current capacity - expect Deferred", + pod: testPod2, + newRequests: v1.ResourceList{v1.ResourceCPU: cpu2500m, v1.ResourceMemory: mem2500M}, + expectedAllocations: v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M}, + expectedResize: v1.PodResizeStatusDeferred, + }, + { + name: "Request CPU decrease and memory increase beyond current capacity - expect Deferred", + pod: testPod2, + newRequests: v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem2500M}, + expectedAllocations: v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M}, + expectedResize: v1.PodResizeStatusDeferred, + }, + { + name: "Request memory increase beyond node capacity - expect Infeasible", + pod: testPod2, + newRequests: v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem4500M}, + expectedAllocations: v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M}, + expectedResize: v1.PodResizeStatusInfeasible, + }, + { + name: "Request CPU increase beyond node capacity - expect Infeasible", + pod: testPod2, + newRequests: v1.ResourceList{v1.ResourceCPU: cpu5000m, v1.ResourceMemory: mem1000M}, + expectedAllocations: v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M}, + expectedResize: v1.PodResizeStatusInfeasible, + }, + } + + for _, tt := range tests { + tt.pod.Spec.Containers[0].Resources.Requests = tt.newRequests + tt.pod.Status.ContainerStatuses[0].ResourcesAllocated = v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M} + kubelet.handlePodResourcesResize(tt.pod) + assert.Equal(t, tt.expectedAllocations, tt.pod.Status.ContainerStatuses[0].ResourcesAllocated, tt.name) + assert.Equal(t, tt.expectedResize, tt.pod.Status.Resize, tt.name) + testKubelet.fakeKubeClient.ClearActions() + } +} + // testPodSyncLoopHandler is a lifecycle.PodSyncLoopHandler that is used for testing. 
type testPodSyncLoopHandler struct { // list of pods to sync diff --git a/pkg/kubelet/kuberuntime/helpers.go b/pkg/kubelet/kuberuntime/helpers.go index c5db7c9a13f..0605ab4d328 100644 --- a/pkg/kubelet/kuberuntime/helpers.go +++ b/pkg/kubelet/kuberuntime/helpers.go @@ -94,12 +94,13 @@ func (m *kubeGenericRuntimeManager) toKubeContainer(c *runtimeapi.Container) (*k annotatedInfo := getContainerInfoFromAnnotations(c.Annotations) return &kubecontainer.Container{ - ID: kubecontainer.ContainerID{Type: m.runtimeName, ID: c.Id}, - Name: c.GetMetadata().GetName(), - ImageID: c.ImageRef, - Image: c.Image.Image, - Hash: annotatedInfo.Hash, - State: toKubeContainerState(c.State), + ID: kubecontainer.ContainerID{Type: m.runtimeName, ID: c.Id}, + Name: c.GetMetadata().GetName(), + ImageID: c.ImageRef, + Image: c.Image.Image, + Hash: annotatedInfo.Hash, + HashWithoutResources: annotatedInfo.HashWithoutResources, + State: toKubeContainerState(c.State), }, nil } diff --git a/pkg/kubelet/kuberuntime/helpers_linux.go b/pkg/kubelet/kuberuntime/helpers_linux.go index a96cbb6dfb5..ef77faec26c 100644 --- a/pkg/kubelet/kuberuntime/helpers_linux.go +++ b/pkg/kubelet/kuberuntime/helpers_linux.go @@ -19,6 +19,11 @@ limitations under the License. package kuberuntime +import ( + "k8s.io/kubernetes/pkg/kubelet/cm" + "math" +) + const ( milliCPUToCPU = 1000 @@ -53,3 +58,22 @@ func milliCPUToQuota(milliCPU int64, period int64) (quota int64) { return } + +// sharesToMilliCPU converts CpuShares (cpu.shares) to milli-CPU value +// TODO(vinaykul,InPlacePodVerticalScaling): Address issue that sets min req/limit to 2m/10m before beta +// See: https://github.com/kubernetes/kubernetes/pull/102884#discussion_r662552642 +func sharesToMilliCPU(shares int64) int64 { + milliCPU := int64(0) + if shares >= int64(cm.MinShares) { + milliCPU = int64(math.Ceil(float64(shares*milliCPUToCPU) / float64(cm.SharesPerCPU))) + } + return milliCPU +} + +// quotaToMilliCPU converts cpu.cfs_quota_us and cpu.cfs_period_us to milli-CPU value +func quotaToMilliCPU(quota int64, period int64) int64 { + if quota == -1 { + return int64(0) + } + return (quota * milliCPUToCPU) / period +} diff --git a/pkg/kubelet/kuberuntime/helpers_linux_test.go b/pkg/kubelet/kuberuntime/helpers_linux_test.go index 2cfc09ca778..dfa3ede278a 100644 --- a/pkg/kubelet/kuberuntime/helpers_linux_test.go +++ b/pkg/kubelet/kuberuntime/helpers_linux_test.go @@ -28,6 +28,7 @@ import ( featuregatetesting "k8s.io/component-base/featuregate/testing" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" "k8s.io/kubernetes/pkg/features" + "k8s.io/kubernetes/pkg/kubelet/cm" utilpointer "k8s.io/utils/pointer" ) @@ -670,3 +671,81 @@ func TestGetSeccompProfileDefaultSeccomp(t *testing.T) { func getLocal(v string) *string { return &v } + +func TestSharesToMilliCPU(t *testing.T) { + knownMilliCPUToShares := map[int64]int64{ + 0: 2, + 1: 2, + 2: 2, + 3: 3, + 4: 4, + 32: 32, + 64: 65, + 100: 102, + 250: 256, + 500: 512, + 1000: 1024, + 1500: 1536, + 2000: 2048, + } + + t.Run("sharesToMilliCPUTest", func(t *testing.T) { + var testMilliCPU int64 + for testMilliCPU = 0; testMilliCPU <= 2000; testMilliCPU++ { + shares := int64(cm.MilliCPUToShares(testMilliCPU)) + if expectedShares, found := knownMilliCPUToShares[testMilliCPU]; found { + if shares != expectedShares { + t.Errorf("Test milliCPUToShares: Input milliCPU %v, expected shares %v, but got %v", testMilliCPU, expectedShares, shares) + } + } + expectedMilliCPU := testMilliCPU + if testMilliCPU < 2 { + expectedMilliCPU = 2 + } + milliCPU := 
sharesToMilliCPU(shares) + if milliCPU != expectedMilliCPU { + t.Errorf("Test sharesToMilliCPU: Input shares %v, expected milliCPU %v, but got %v", shares, expectedMilliCPU, milliCPU) + } + } + }) +} + +func TestQuotaToMilliCPU(t *testing.T) { + for _, tc := range []struct { + name string + quota int64 + period int64 + expected int64 + }{ + { + name: "50m", + quota: int64(5000), + period: int64(100000), + expected: int64(50), + }, + { + name: "750m", + quota: int64(75000), + period: int64(100000), + expected: int64(750), + }, + { + name: "1000m", + quota: int64(100000), + period: int64(100000), + expected: int64(1000), + }, + { + name: "1500m", + quota: int64(150000), + period: int64(100000), + expected: int64(1500), + }} { + t.Run(tc.name, func(t *testing.T) { + milliCPU := quotaToMilliCPU(tc.quota, tc.period) + if milliCPU != tc.expected { + t.Errorf("Test %s: Input quota %v and period %v, expected milliCPU %v, but got %v", tc.name, tc.quota, tc.period, tc.expected, milliCPU) + } + }) + } +} diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container.go b/pkg/kubelet/kuberuntime/kuberuntime_container.go index d7fa0bc15eb..c0c55d60687 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container.go @@ -46,7 +46,9 @@ import ( kubetypes "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/sets" + utilfeature "k8s.io/apiserver/pkg/util/feature" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" + "k8s.io/kubernetes/pkg/features" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" "k8s.io/kubernetes/pkg/kubelet/cri/remote" "k8s.io/kubernetes/pkg/kubelet/events" @@ -359,6 +361,18 @@ func (m *kubeGenericRuntimeManager) generateContainerConfig(ctx context.Context, return config, cleanupAction, nil } +func (m *kubeGenericRuntimeManager) updateContainerResources(pod *v1.Pod, container *v1.Container, containerID kubecontainer.ContainerID) error { + containerResources := m.generateContainerResources(pod, container) + if containerResources == nil { + return fmt.Errorf("container %q updateContainerResources failed: cannot generate resources config", containerID.String()) + } + err := m.runtimeService.UpdateContainerResources(containerID.ID, containerResources) + if err != nil { + klog.ErrorS(err, "UpdateContainerResources failed", "container", containerID.String()) + } + return err +} + // makeDevices generates container devices for kubelet runtime v1. 
func makeDevices(opts *kubecontainer.RunContainerOptions) []*runtimeapi.Device { devices := make([]*runtimeapi.Device, len(opts.Devices)) @@ -557,18 +571,25 @@ func (m *kubeGenericRuntimeManager) getPodContainerStatuses(ctx context.Context, func toKubeContainerStatus(status *runtimeapi.ContainerStatus, runtimeName string) *kubecontainer.Status { annotatedInfo := getContainerInfoFromAnnotations(status.Annotations) labeledInfo := getContainerInfoFromLabels(status.Labels) + var cStatusResources *kubecontainer.ContainerResources + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + // If runtime reports cpu & memory resources info, add it to container status + cStatusResources = toKubeContainerResources(status.Resources) + } cStatus := &kubecontainer.Status{ ID: kubecontainer.ContainerID{ Type: runtimeName, ID: status.Id, }, - Name: labeledInfo.ContainerName, - Image: status.Image.Image, - ImageID: status.ImageRef, - Hash: annotatedInfo.Hash, - RestartCount: annotatedInfo.RestartCount, - State: toKubeContainerState(status.State), - CreatedAt: time.Unix(0, status.CreatedAt), + Name: labeledInfo.ContainerName, + Image: status.Image.Image, + ImageID: status.ImageRef, + Hash: annotatedInfo.Hash, + HashWithoutResources: annotatedInfo.HashWithoutResources, + RestartCount: annotatedInfo.RestartCount, + State: toKubeContainerState(status.State), + CreatedAt: time.Unix(0, status.CreatedAt), + Resources: cStatusResources, } if status.State != runtimeapi.ContainerState_CONTAINER_CREATED { diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go index 7bcc139e7e8..202ff2bce48 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go @@ -60,7 +60,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.C return nil, err } lc := &runtimeapi.LinuxContainerConfig{ - Resources: &runtimeapi.LinuxContainerResources{}, + Resources: m.generateLinuxContainerResources(pod, container, enforceMemoryQoS), SecurityContext: sc, } @@ -69,17 +69,22 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.C lc.SecurityContext.NamespaceOptions.TargetId = nsTarget.ID } + return lc, nil +} + +// generateLinuxContainerResources generates linux container resources config for runtime +func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod, container *v1.Container, enforceMemoryQoS bool) *runtimeapi.LinuxContainerResources { // set linux container resources var cpuRequest *resource.Quantity if _, cpuRequestExists := container.Resources.Requests[v1.ResourceCPU]; cpuRequestExists { cpuRequest = container.Resources.Requests.Cpu() } - lc.Resources = m.calculateLinuxResources(cpuRequest, container.Resources.Limits.Cpu(), container.Resources.Limits.Memory()) + lcr := m.calculateLinuxResources(cpuRequest, container.Resources.Limits.Cpu(), container.Resources.Limits.Memory()) - lc.Resources.OomScoreAdj = int64(qos.GetContainerOOMScoreAdjust(pod, container, + lcr.OomScoreAdj = int64(qos.GetContainerOOMScoreAdjust(pod, container, int64(m.machineInfo.MemoryCapacity))) - lc.Resources.HugepageLimits = GetHugepageLimitsFromResources(container.Resources) + lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources) if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) { // NOTE(ehashman): Behaviour is defined in the opencontainers runtime spec: @@ -87,14 +92,14 @@ func (m 
*kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.C switch m.memorySwapBehavior { case kubelettypes.UnlimitedSwap: // -1 = unlimited swap - lc.Resources.MemorySwapLimitInBytes = -1 + lcr.MemorySwapLimitInBytes = -1 case kubelettypes.LimitedSwap: fallthrough default: // memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit // Some swapping is still possible. // Note that if memory limit is 0, memory swap limit is ignored. - lc.Resources.MemorySwapLimitInBytes = lc.Resources.MemoryLimitInBytes + lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes } } @@ -125,18 +130,31 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.C unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10) } if len(unified) > 0 { - if lc.Resources.Unified == nil { - lc.Resources.Unified = unified + if lcr.Unified == nil { + lcr.Unified = unified } else { for k, v := range unified { - lc.Resources.Unified[k] = v + lcr.Unified[k] = v } } klog.V(4).InfoS("MemoryQoS config for container", "pod", klog.KObj(pod), "containerName", container.Name, "unified", unified) } } - return lc, nil + return lcr +} + +// generateContainerResources generates platform specific (linux) container resources config for runtime +func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, container *v1.Container) *runtimeapi.ContainerResources { + enforceMemoryQoS := false + // Set memory.min and memory.high if MemoryQoS enabled with cgroups v2 + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) && + libcontainercgroups.IsCgroup2UnifiedMode() { + enforceMemoryQoS = true + } + return &runtimeapi.ContainerResources{ + Linux: m.generateLinuxContainerResources(pod, container, enforceMemoryQoS), + } } // calculateLinuxResources will create the linuxContainerResources type based on the provided CPU and memory resource requests, limits @@ -218,3 +236,34 @@ func GetHugepageLimitsFromResources(resources v1.ResourceRequirements) []*runtim return hugepageLimits } + +func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *kubecontainer.ContainerResources { + var cStatusResources *kubecontainer.ContainerResources + runtimeStatusResources := statusResources.GetLinux() + if runtimeStatusResources != nil { + var cpuLimit, memLimit, cpuRequest *resource.Quantity + if runtimeStatusResources.CpuPeriod > 0 { + milliCPU := quotaToMilliCPU(runtimeStatusResources.CpuQuota, runtimeStatusResources.CpuPeriod) + if milliCPU > 0 { + cpuLimit = resource.NewMilliQuantity(milliCPU, resource.DecimalSI) + } + } + if runtimeStatusResources.CpuShares > 0 { + milliCPU := sharesToMilliCPU(runtimeStatusResources.CpuShares) + if milliCPU > 0 { + cpuRequest = resource.NewMilliQuantity(milliCPU, resource.DecimalSI) + } + } + if runtimeStatusResources.MemoryLimitInBytes > 0 { + memLimit = resource.NewQuantity(runtimeStatusResources.MemoryLimitInBytes, resource.BinarySI) + } + if cpuLimit != nil || memLimit != nil || cpuRequest != nil { + cStatusResources = &kubecontainer.ContainerResources{ + CPULimit: cpuLimit, + CPURequest: cpuRequest, + MemoryLimit: memLimit, + } + } + } + return cStatusResources +} diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go index d47a94377e0..3f8c70219ea 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go @@ -31,6 +31,7 @@ import ( v1 
"k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/diff" utilfeature "k8s.io/apiserver/pkg/util/feature" featuregatetesting "k8s.io/component-base/featuregate/testing" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" @@ -712,3 +713,167 @@ func TestGenerateLinuxContainerConfigSwap(t *testing.T) { }) } } + +func TestGenerateLinuxContainerResources(t *testing.T) { + _, _, m, err := createTestRuntimeManager() + assert.NoError(t, err) + m.machineInfo.MemoryCapacity = 17179860387 // 16GB + + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "12345678", + Name: "foo", + Namespace: "bar", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "c1", + Image: "busybox", + }, + }, + }, + Status: v1.PodStatus{}, + } + + for _, tc := range []struct { + name string + scalingFg bool + limits v1.ResourceList + requests v1.ResourceList + cStatus []v1.ContainerStatus + expected *runtimeapi.LinuxContainerResources + }{ + { + "requests & limits, cpu & memory, guaranteed qos - no container status", + true, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + []v1.ContainerStatus{}, + &runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 524288000, OomScoreAdj: -997}, + }, + { + "requests & limits, cpu & memory, burstable qos - no container status", + true, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("500m"), v1.ResourceMemory: resource.MustParse("750Mi")}, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + []v1.ContainerStatus{}, + &runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 786432000, OomScoreAdj: 970}, + }, + { + "best-effort qos - no container status", + true, + nil, + nil, + []v1.ContainerStatus{}, + &runtimeapi.LinuxContainerResources{CpuShares: 2, OomScoreAdj: 1000}, + }, + { + "requests & limits, cpu & memory, guaranteed qos - empty resources container status", + true, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + []v1.ContainerStatus{{Name: "c1"}}, + &runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 524288000, OomScoreAdj: -997}, + }, + { + "requests & limits, cpu & memory, burstable qos - empty resources container status", + true, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("500m"), v1.ResourceMemory: resource.MustParse("750Mi")}, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + []v1.ContainerStatus{{Name: "c1"}}, + &runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 786432000, OomScoreAdj: 999}, + }, + { + "best-effort qos - empty resources container status", + true, + nil, + nil, + []v1.ContainerStatus{{Name: "c1"}}, + &runtimeapi.LinuxContainerResources{CpuShares: 2, OomScoreAdj: 1000}, + }, + { + "requests & limits, cpu & memory, guaranteed qos - container status with resourcesAllocated", + true, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("200m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("200m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + 
[]v1.ContainerStatus{ + { + Name: "c1", + ResourcesAllocated: v1.ResourceList{v1.ResourceCPU: resource.MustParse("200m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + }, + }, + &runtimeapi.LinuxContainerResources{CpuShares: 204, MemoryLimitInBytes: 524288000, OomScoreAdj: -997}, + }, + { + "requests & limits, cpu & memory, burstable qos - container status with resourcesAllocated", + true, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("500m"), v1.ResourceMemory: resource.MustParse("750Mi")}, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + []v1.ContainerStatus{ + { + Name: "c1", + ResourcesAllocated: v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + }, + }, + &runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 786432000, OomScoreAdj: 970}, + }, + { + "requests & limits, cpu & memory, guaranteed qos - no container status", + false, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + []v1.ContainerStatus{}, + &runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 524288000, OomScoreAdj: -997}, + }, + { + "requests & limits, cpu & memory, burstable qos - container status with resourcesAllocated", + false, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("500m"), v1.ResourceMemory: resource.MustParse("750Mi")}, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + []v1.ContainerStatus{ + { + Name: "c1", + ResourcesAllocated: v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + }, + }, + &runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 786432000, OomScoreAdj: 970}, + }, + { + "requests & limits, cpu & memory, guaranteed qos - container status with resourcesAllocated", + false, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("200m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + v1.ResourceList{v1.ResourceCPU: resource.MustParse("200m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + []v1.ContainerStatus{ + { + Name: "c1", + ResourcesAllocated: v1.ResourceList{v1.ResourceCPU: resource.MustParse("200m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + }, + }, + &runtimeapi.LinuxContainerResources{CpuShares: 204, MemoryLimitInBytes: 524288000, OomScoreAdj: -997}, + }, + { + "best-effort qos - no container status", + false, + nil, + nil, + []v1.ContainerStatus{}, + &runtimeapi.LinuxContainerResources{CpuShares: 2, OomScoreAdj: 1000}, + }, + } { + t.Run(tc.name, func(t *testing.T) { + if tc.scalingFg { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.InPlacePodVerticalScaling, true)() + } + tc.expected.HugepageLimits = []*runtimeapi.HugepageLimit{{PageSize: "2MB", Limit: 0}, {PageSize: "1GB", Limit: 0}} + pod.Spec.Containers[0].Resources = v1.ResourceRequirements{Limits: tc.limits, Requests: tc.requests} + if len(tc.cStatus) > 0 { + pod.Status.ContainerStatuses = tc.cStatus + } + resources := m.generateLinuxContainerResources(pod, &pod.Spec.Containers[0], false) + if diff.ObjectDiff(resources, tc.expected) != "" { + t.Errorf("Test %s: expected resources %+v, but got %+v", tc.name, tc.expected, resources) + } + }) + } +} diff --git 
a/pkg/kubelet/kuberuntime/kuberuntime_container_test.go b/pkg/kubelet/kuberuntime/kuberuntime_container_test.go index 7b4d03a378d..d7d26877cee 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_test.go @@ -28,6 +28,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" @@ -230,6 +231,111 @@ func TestToKubeContainerStatus(t *testing.T) { } } +// TestToKubeContainerStatusWithResources tests the converting the CRI container status to +// the internal type (i.e., toKubeContainerStatus()) for containers that returns Resources. +func TestToKubeContainerStatusWithResources(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.InPlacePodVerticalScaling, true)() + cid := &kubecontainer.ContainerID{Type: "testRuntime", ID: "dummyid"} + meta := &runtimeapi.ContainerMetadata{Name: "cname", Attempt: 3} + imageSpec := &runtimeapi.ImageSpec{Image: "fimage"} + var ( + createdAt int64 = 327 + startedAt int64 = 999 + ) + + for desc, test := range map[string]struct { + input *runtimeapi.ContainerStatus + expected *kubecontainer.Status + }{ + "container reporting cpu and memory": { + input: &runtimeapi.ContainerStatus{ + Id: cid.ID, + Metadata: meta, + Image: imageSpec, + State: runtimeapi.ContainerState_CONTAINER_RUNNING, + CreatedAt: createdAt, + StartedAt: startedAt, + Resources: &runtimeapi.ContainerResources{ + Linux: &runtimeapi.LinuxContainerResources{ + CpuQuota: 25000, + CpuPeriod: 100000, + MemoryLimitInBytes: 524288000, + OomScoreAdj: -998, + }, + }, + }, + expected: &kubecontainer.Status{ + ID: *cid, + Image: imageSpec.Image, + State: kubecontainer.ContainerStateRunning, + CreatedAt: time.Unix(0, createdAt), + StartedAt: time.Unix(0, startedAt), + Resources: &kubecontainer.ContainerResources{ + CPULimit: resource.NewMilliQuantity(250, resource.DecimalSI), + MemoryLimit: resource.NewQuantity(524288000, resource.BinarySI), + }, + }, + }, + "container reporting cpu only": { + input: &runtimeapi.ContainerStatus{ + Id: cid.ID, + Metadata: meta, + Image: imageSpec, + State: runtimeapi.ContainerState_CONTAINER_RUNNING, + CreatedAt: createdAt, + StartedAt: startedAt, + Resources: &runtimeapi.ContainerResources{ + Linux: &runtimeapi.LinuxContainerResources{ + CpuQuota: 50000, + CpuPeriod: 100000, + }, + }, + }, + expected: &kubecontainer.Status{ + ID: *cid, + Image: imageSpec.Image, + State: kubecontainer.ContainerStateRunning, + CreatedAt: time.Unix(0, createdAt), + StartedAt: time.Unix(0, startedAt), + Resources: &kubecontainer.ContainerResources{ + CPULimit: resource.NewMilliQuantity(500, resource.DecimalSI), + }, + }, + }, + "container reporting memory only": { + input: &runtimeapi.ContainerStatus{ + Id: cid.ID, + Metadata: meta, + Image: imageSpec, + State: runtimeapi.ContainerState_CONTAINER_RUNNING, + CreatedAt: createdAt, + StartedAt: startedAt, + Resources: &runtimeapi.ContainerResources{ + Linux: &runtimeapi.LinuxContainerResources{ + MemoryLimitInBytes: 524288000, + OomScoreAdj: -998, + }, + }, + }, + expected: &kubecontainer.Status{ + ID: *cid, + Image: imageSpec.Image, + State: kubecontainer.ContainerStateRunning, + CreatedAt: time.Unix(0, createdAt), + StartedAt: time.Unix(0, startedAt), + Resources: &kubecontainer.ContainerResources{ + MemoryLimit: 
resource.NewQuantity(524288000, resource.BinarySI), + }, + }, + }, + } { + t.Run(desc, func(t *testing.T) { + actual := toKubeContainerStatus(test.input, cid.Type) + assert.Equal(t, test.expected, actual, desc) + }) + } +} + func TestLifeCycleHook(t *testing.T) { // Setup @@ -696,3 +802,39 @@ func TestKillContainerGracePeriod(t *testing.T) { }) } } + +// TestUpdateContainerResources tests updating a container in a Pod. +func TestUpdateContainerResources(t *testing.T) { + fakeRuntime, _, m, errCreate := createTestRuntimeManager() + require.NoError(t, errCreate) + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "12345678", + Name: "bar", + Namespace: "new", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "foo", + Image: "busybox", + ImagePullPolicy: v1.PullIfNotPresent, + }, + }, + }, + } + + // Create fake sandbox and container + _, fakeContainers := makeAndSetFakePod(t, m, fakeRuntime, pod) + assert.Equal(t, len(fakeContainers), 1) + + cStatus, err := m.getPodContainerStatuses(pod.UID, pod.Name, pod.Namespace) + assert.NoError(t, err) + containerID := cStatus[0].ID + + err = m.updateContainerResources(pod, &pod.Spec.Containers[0], containerID) + assert.NoError(t, err) + + // Verify container is updated + assert.Contains(t, fakeRuntime.Called, "UpdateContainerResources") +} diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_unsupported.go b/pkg/kubelet/kuberuntime/kuberuntime_container_unsupported.go index 96095cfcce6..b7d460ec791 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_unsupported.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_unsupported.go @@ -29,3 +29,12 @@ import ( func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config *runtimeapi.ContainerConfig, container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) error { return nil } + +// generateContainerResources generates platform specific container resources config for runtime +func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, container *v1.Container) *runtimeapi.ContainerResources { + return nil +} + +func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *kubecontainer.ContainerResources { + return nil +} diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_windows.go b/pkg/kubelet/kuberuntime/kuberuntime_container_windows.go index 4ae6ffa09d7..a0d2999edaa 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_windows.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_windows.go @@ -40,6 +40,12 @@ func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config return nil } +// generateContainerResources generates platform specific (windows) container resources config for runtime +func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, container *v1.Container) *runtimeapi.ContainerResources { + //TODO: Add windows support + return nil +} + // generateWindowsContainerConfig generates windows container config for kubelet runtime v1. // Refer https://git.k8s.io/design-proposals-archive/node/cri-windows.md. 
func (m *kubeGenericRuntimeManager) generateWindowsContainerConfig(container *v1.Container, pod *v1.Pod, uid *int64, username string) (*runtimeapi.WindowsContainerConfig, error) { @@ -126,3 +132,8 @@ func calculateCPUMaximum(cpuLimit *resource.Quantity, cpuCount int64) int64 { } return cpuMaximum } + +func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *kubecontainer.ContainerResources { + //TODO: Add windows support + return nil +} diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go index 56958b3605d..bf5a77816c3 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go @@ -30,8 +30,10 @@ import ( "k8s.io/klog/v2" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kubetypes "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/diff" utilruntime "k8s.io/apimachinery/pkg/util/runtime" utilversion "k8s.io/apimachinery/pkg/util/version" utilfeature "k8s.io/apiserver/pkg/util/feature" @@ -42,6 +44,7 @@ import ( internalapi "k8s.io/cri-api/pkg/apis" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" "k8s.io/kubernetes/pkg/api/legacyscheme" + podutil "k8s.io/kubernetes/pkg/api/v1/pod" "k8s.io/kubernetes/pkg/credentialprovider" "k8s.io/kubernetes/pkg/credentialprovider/plugin" "k8s.io/kubernetes/pkg/features" @@ -133,6 +136,9 @@ type kubeGenericRuntimeManager struct { // The directory path for seccomp profiles. seccompProfileRoot string + // Container management interface for pod container. + containerManager cm.ContainerManager + // Internal lifecycle event handlers for container resource management. internalLifecycle cm.InternalContainerLifecycle @@ -190,7 +196,7 @@ func NewKubeGenericRuntimeManager( cpuCFSQuotaPeriod metav1.Duration, runtimeService internalapi.RuntimeService, imageService internalapi.ImageManagerService, - internalLifecycle cm.InternalContainerLifecycle, + containerManager cm.ContainerManager, logManager logs.ContainerLogManager, runtimeClassManager *runtimeclass.Manager, seccompDefault bool, @@ -215,7 +221,8 @@ func NewKubeGenericRuntimeManager( runtimeHelper: runtimeHelper, runtimeService: runtimeService, imageService: imageService, - internalLifecycle: internalLifecycle, + containerManager: containerManager, + internalLifecycle: containerManager.InternalContainerLifecycle(), logManager: logManager, runtimeClassManager: runtimeClassManager, logReduction: logreduction.NewLogReduction(identicalErrorDelay), @@ -446,6 +453,26 @@ type containerToKillInfo struct { reason containerKillReason } +// containerResources holds the set of resources applicable to the running container +type containerResources struct { + memoryLimit int64 + memoryRequest int64 + cpuLimit int64 + cpuRequest int64 +} + +// containerToUpdateInfo contains necessary information to update a container's resources. +type containerToUpdateInfo struct { + // Index of the container in pod.Spec.Containers that needs resource update + apiContainerIdx int + // ID of the runtime container that needs resource update + kubeContainerID kubecontainer.ContainerID + // Desired resources for the running container + desiredContainerResources containerResources + // Most recently configured resources on the running container + currentContainerResources *containerResources +} + // podActions keeps information what to do for a pod. 
type podActions struct { // Stop all running (regular, init and ephemeral) containers and the sandbox for the pod. @@ -471,6 +498,11 @@ type podActions struct { // EphemeralContainersToStart is a list of indexes for the ephemeral containers to start, // where the index is the index of the specific container in pod.Spec.EphemeralContainers. EphemeralContainersToStart []int + // ContainersToUpdate keeps a list of containers needing resource update. + // Container resource update is applicable only for CPU and memory. + ContainersToUpdate map[v1.ResourceName][]containerToUpdateInfo + // UpdatePodResources is true if container(s) need resource update with restart + UpdatePodResources bool } func containerChanged(container *v1.Container, containerStatus *kubecontainer.Status) (uint64, uint64, bool) { @@ -490,6 +522,263 @@ func containerSucceeded(c *v1.Container, podStatus *kubecontainer.PodStatus) boo return cStatus.ExitCode == 0 } +func (m *kubeGenericRuntimeManager) computePodResizeAction(pod *v1.Pod, containerIdx int, kubeContainerStatus *kubecontainer.Status, changes *podActions) bool { + container := pod.Spec.Containers[containerIdx] + if container.Resources.Limits == nil || len(pod.Status.ContainerStatuses) == 0 { + return true + } + + // Determine if the *running* container needs resource update by comparing v1.Spec.Resources (desired) + // with v1.Status.Resources / runtime.Status.Resources (last known actual). + // Proceed only when kubelet has accepted the resize, i.e. v1.Spec.Resources.Requests == v1.Status.ResourcesAllocated. + // Skip if runtime containerID doesn't match pod.Status containerID (container is restarting) + apiContainerStatus, exists := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name) + if !exists || apiContainerStatus.State.Running == nil || apiContainerStatus.Resources == nil || + kubeContainerStatus.State != kubecontainer.ContainerStateRunning || + kubeContainerStatus.ID.String() != apiContainerStatus.ContainerID || + len(diff.ObjectDiff(container.Resources.Requests, apiContainerStatus.ResourcesAllocated)) != 0 { + return true + } + + desiredMemoryLimit := container.Resources.Limits.Memory().Value() + desiredCPULimit := container.Resources.Limits.Cpu().MilliValue() + desiredCPURequest := container.Resources.Requests.Cpu().MilliValue() + currentMemoryLimit := apiContainerStatus.Resources.Limits.Memory().Value() + currentCPULimit := apiContainerStatus.Resources.Limits.Cpu().MilliValue() + currentCPURequest := apiContainerStatus.Resources.Requests.Cpu().MilliValue() + // Runtime container status resources (from CRI), if set, supersedes v1(api) container status resources. + if kubeContainerStatus.Resources != nil { + if kubeContainerStatus.Resources.MemoryLimit != nil { + currentMemoryLimit = kubeContainerStatus.Resources.MemoryLimit.Value() + } + if kubeContainerStatus.Resources.CPULimit != nil { + currentCPULimit = kubeContainerStatus.Resources.CPULimit.MilliValue() + } + if kubeContainerStatus.Resources.CPURequest != nil { + currentCPURequest = kubeContainerStatus.Resources.CPURequest.MilliValue() + } + } + + // Note: cgroup doesn't support memory request today, so we don't compare that. If canAdmitPod called during + // handlePodResourcesResize finds 'fit', then desiredMemoryRequest == currentMemoryRequest. 
+ if desiredMemoryLimit == currentMemoryLimit && desiredCPULimit == currentCPULimit && desiredCPURequest == currentCPURequest { + return true + } + + desiredResources := containerResources{ + memoryLimit: desiredMemoryLimit, + memoryRequest: apiContainerStatus.ResourcesAllocated.Memory().Value(), + cpuLimit: desiredCPULimit, + cpuRequest: desiredCPURequest, + } + currentResources := containerResources{ + memoryLimit: currentMemoryLimit, + memoryRequest: apiContainerStatus.Resources.Requests.Memory().Value(), + cpuLimit: currentCPULimit, + cpuRequest: currentCPURequest, + } + + resizePolicy := make(map[v1.ResourceName]v1.ResourceResizePolicy) + for _, pol := range container.ResizePolicy { + resizePolicy[pol.ResourceName] = pol.Policy + } + determineContainerResize := func(rName v1.ResourceName, specValue, statusValue int64) (resize, restart bool) { + if specValue == statusValue { + return false, false + } + if resizePolicy[rName] == v1.RestartRequired { + return true, true + } + return true, false + } + markContainerForUpdate := func(rName v1.ResourceName, specValue, statusValue int64) { + cUpdateInfo := containerToUpdateInfo{ + apiContainerIdx: containerIdx, + kubeContainerID: kubeContainerStatus.ID, + desiredContainerResources: desiredResources, + currentContainerResources: &currentResources, + } + // Order the container updates such that resource decreases are applied before increases + switch { + case specValue > statusValue: // append + changes.ContainersToUpdate[rName] = append(changes.ContainersToUpdate[rName], cUpdateInfo) + case specValue < statusValue: // prepend + changes.ContainersToUpdate[rName] = append(changes.ContainersToUpdate[rName], containerToUpdateInfo{}) + copy(changes.ContainersToUpdate[rName][1:], changes.ContainersToUpdate[rName]) + changes.ContainersToUpdate[rName][0] = cUpdateInfo + } + } + resizeMemLim, restartMemLim := determineContainerResize(v1.ResourceMemory, desiredMemoryLimit, currentMemoryLimit) + resizeCPULim, restartCPULim := determineContainerResize(v1.ResourceCPU, desiredCPULimit, currentCPULimit) + resizeCPUReq, restartCPUReq := determineContainerResize(v1.ResourceCPU, desiredCPURequest, currentCPURequest) + if restartCPULim || restartCPUReq || restartMemLim { + // resize policy requires this container to restart + changes.ContainersToKill[kubeContainerStatus.ID] = containerToKillInfo{ + name: kubeContainerStatus.Name, + container: &pod.Spec.Containers[containerIdx], + message: fmt.Sprintf("Container %s resize requires restart", container.Name), + } + changes.ContainersToStart = append(changes.ContainersToStart, containerIdx) + changes.UpdatePodResources = true + return false + } else { + if resizeMemLim { + markContainerForUpdate(v1.ResourceMemory, desiredMemoryLimit, currentMemoryLimit) + } + if resizeCPULim { + markContainerForUpdate(v1.ResourceCPU, desiredCPULimit, currentCPULimit) + } else if resizeCPUReq { + markContainerForUpdate(v1.ResourceCPU, desiredCPURequest, currentCPURequest) + } + } + return true +} + +func (m *kubeGenericRuntimeManager) doPodResizeAction(pod *v1.Pod, podStatus *kubecontainer.PodStatus, podContainerChanges podActions, result kubecontainer.PodSyncResult) { + pcm := m.containerManager.NewPodContainerManager() + //TODO(vinaykul,InPlacePodVerticalScaling): Figure out best way to get enforceMemoryQoS value (parameter #4 below) in platform-agnostic way + podResources := cm.ResourceConfigForPod(pod, m.cpuCFSQuota, uint64((m.cpuCFSQuotaPeriod.Duration)/time.Microsecond), false) + if podResources == nil { + klog.ErrorS(nil, "Unable to 
get resource configuration", "pod", pod.Name) + result.Fail(fmt.Errorf("Unable to get resource configuration processing resize for pod %s", pod.Name)) + return + } + setPodCgroupConfig := func(rName v1.ResourceName, setLimitValue bool) error { + var err error + switch rName { + case v1.ResourceCPU: + if setLimitValue == true { + err = pcm.SetPodCgroupCpuConfig(pod, podResources.CpuQuota, podResources.CpuPeriod, nil) + } else { + err = pcm.SetPodCgroupCpuConfig(pod, nil, podResources.CpuPeriod, podResources.CpuShares) + } + case v1.ResourceMemory: + err = pcm.SetPodCgroupMemoryConfig(pod, *podResources.Memory) + } + if err != nil { + klog.ErrorS(err, "Failed to set cgroup config", "resource", rName, "pod", pod.Name) + } + return err + } + // Memory and CPU are updated separately because memory resizes may be ordered differently than CPU resizes. + // If resize results in net pod resource increase, set pod cgroup config before resizing containers. + // If resize results in net pod resource decrease, set pod cgroup config after resizing containers. + // If an error occurs at any point, abort. Let future syncpod iterations retry the unfinished stuff. + resizeContainers := func(rName v1.ResourceName, currPodCgLimValue, newPodCgLimValue, currPodCgReqValue, newPodCgReqValue int64) error { + var err error + if newPodCgLimValue > currPodCgLimValue { + if err = setPodCgroupConfig(rName, true); err != nil { + return err + } + } + if newPodCgReqValue > currPodCgReqValue { + if err = setPodCgroupConfig(rName, false); err != nil { + return err + } + } + if len(podContainerChanges.ContainersToUpdate[rName]) > 0 { + if err = m.updatePodContainerResources(pod, rName, podContainerChanges.ContainersToUpdate[rName]); err != nil { + klog.ErrorS(err, "updatePodContainerResources failed", "pod", format.Pod(pod), "resource", rName) + return err + } + } + if newPodCgLimValue < currPodCgLimValue { + err = setPodCgroupConfig(rName, true) + } + if newPodCgReqValue < currPodCgReqValue { + if err = setPodCgroupConfig(rName, false); err != nil { + return err + } + } + return err + } + if len(podContainerChanges.ContainersToUpdate[v1.ResourceMemory]) > 0 || podContainerChanges.UpdatePodResources { + currentPodMemoryLimit, err := pcm.GetPodCgroupMemoryConfig(pod) + if err != nil { + klog.ErrorS(err, "GetPodCgroupMemoryConfig failed", "pod", pod.Name) + result.Fail(err) + return + } + currentPodMemoryUsage, err := pcm.GetPodCgroupMemoryUsage(pod) + if err != nil { + klog.ErrorS(err, "GetPodCgroupMemoryUsage failed", "pod", pod.Name) + result.Fail(err) + return + } + if currentPodMemoryUsage >= uint64(*podResources.Memory) { + klog.ErrorS(nil, "Aborting attempt to set pod memory limit less than current memory usage", "pod", pod.Name) + result.Fail(fmt.Errorf("Aborting attempt to set pod memory limit less than current memory usage for pod %s", pod.Name)) + return + } + if errResize := resizeContainers(v1.ResourceMemory, int64(currentPodMemoryLimit), *podResources.Memory, 0, 0); errResize != nil { + result.Fail(errResize) + return + } + } + if len(podContainerChanges.ContainersToUpdate[v1.ResourceCPU]) > 0 || podContainerChanges.UpdatePodResources { + currentPodCpuQuota, _, currentPodCPUShares, err := pcm.GetPodCgroupCpuConfig(pod) + if err != nil { + klog.ErrorS(err, "GetPodCgroupCpuConfig failed", "pod", pod.Name) + result.Fail(err) + return + } + if errResize := resizeContainers(v1.ResourceCPU, currentPodCpuQuota, *podResources.CpuQuota, + int64(currentPodCPUShares), int64(*podResources.CpuShares)); errResize != nil { + 
result.Fail(errResize) + return + } + } +} + +func (m *kubeGenericRuntimeManager) updatePodContainerResources(pod *v1.Pod, resourceName v1.ResourceName, containersToUpdate []containerToUpdateInfo) error { + klog.V(5).InfoS("Updating container resources", "pod", klog.KObj(pod)) + + for _, cInfo := range containersToUpdate { + container := pod.Spec.Containers[cInfo.apiContainerIdx].DeepCopy() + // If updating memory limit, use most recently configured CPU request and limit values. + // If updating CPU request and limit, use most recently configured memory request and limit values. + switch resourceName { + case v1.ResourceMemory: + container.Resources.Limits = v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(cInfo.currentContainerResources.cpuLimit, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(cInfo.desiredContainerResources.memoryLimit, resource.BinarySI), + } + container.Resources.Requests = v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(cInfo.currentContainerResources.cpuRequest, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(cInfo.desiredContainerResources.memoryRequest, resource.BinarySI), + } + case v1.ResourceCPU: + container.Resources.Limits = v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(cInfo.desiredContainerResources.cpuLimit, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(cInfo.currentContainerResources.memoryLimit, resource.BinarySI), + } + container.Resources.Requests = v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(cInfo.desiredContainerResources.cpuRequest, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(cInfo.currentContainerResources.memoryRequest, resource.BinarySI), + } + } + if err := m.updateContainerResources(pod, container, cInfo.kubeContainerID); err != nil { + // Log error and abort as container updates need to succeed in the order determined by computePodResizeAction. + // The recovery path is for SyncPod to keep retrying at later times until it succeeds. + klog.ErrorS(err, "updateContainerResources failed", "container", container.Name, "cID", cInfo.kubeContainerID, + "pod", format.Pod(pod), "resourceName", resourceName) + return err + } + // If UpdateContainerResources is error-free, it means desired values for 'resourceName' was accepted by runtime. + // So we update currentContainerResources for 'resourceName', which is our view of most recently configured resources. + // Note: We can't rely on GetPodStatus as runtime may lag in actuating the resource values it just accepted. + switch resourceName { + case v1.ResourceMemory: + cInfo.currentContainerResources.memoryLimit = cInfo.desiredContainerResources.memoryLimit + cInfo.currentContainerResources.memoryRequest = cInfo.desiredContainerResources.memoryRequest + case v1.ResourceCPU: + cInfo.currentContainerResources.cpuLimit = cInfo.desiredContainerResources.cpuLimit + cInfo.currentContainerResources.cpuRequest = cInfo.desiredContainerResources.cpuRequest + } + } + return nil +} + // computePodActions checks whether the pod spec has changed and returns the changes if true. 
func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *kubecontainer.PodStatus) podActions { klog.V(5).InfoS("Syncing Pod", "pod", klog.KObj(pod)) @@ -582,6 +871,14 @@ func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *ku return changes } + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + changes.ContainersToUpdate = make(map[v1.ResourceName][]containerToUpdateInfo) + latestPodStatus, err := m.GetPodStatus(podStatus.ID, pod.Name, pod.Namespace) + if err == nil { + podStatus = latestPodStatus + } + } + // Number of running containers to keep. keepCount := 0 // check the status of containers. @@ -623,7 +920,10 @@ func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *ku var message string var reason containerKillReason restart := shouldRestartOnFailure(pod) - if _, _, changed := containerChanged(&container, containerStatus); changed { + // Do not restart if only the Resources field has changed with InPlacePodVerticalScaling enabled + if _, _, changed := containerChanged(&container, containerStatus); changed && + (!utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) || + kubecontainer.HashContainerWithoutResources(&container) != containerStatus.HashWithoutResources) { message = fmt.Sprintf("Container %s definition changed", container.Name) // Restart regardless of the restart policy because the container // spec changed. @@ -636,6 +936,10 @@ func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *ku // If the container failed the startup probe, we should kill it. message = fmt.Sprintf("Container %s failed startup probe", container.Name) reason = reasonStartupProbe + } else if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) && + !m.computePodResizeAction(pod, idx, containerStatus, &changes) { + // computePodResizeAction updates 'changes' if resize policy requires restarting this container + continue } else { // Keep the container. keepCount++ @@ -674,7 +978,8 @@ func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *ku // 4. Create sandbox if necessary. // 5. Create ephemeral containers. // 6. Create init containers. -// 7. Create normal containers. +// 7. Resize running containers (if InPlacePodVerticalScaling==true) +// 8. Create normal containers. func (m *kubeGenericRuntimeManager) SyncPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, backOff *flowcontrol.Backoff) (result kubecontainer.PodSyncResult) { // Step 1: Compute sandbox and container changes. podContainerChanges := m.computePodActions(pod, podStatus) @@ -903,7 +1208,14 @@ func (m *kubeGenericRuntimeManager) SyncPod(ctx context.Context, pod *v1.Pod, po klog.V(4).InfoS("Completed init container for pod", "containerName", container.Name, "pod", klog.KObj(pod)) } - // Step 7: start containers in podContainerChanges.ContainersToStart. + // Step 7: For containers in podContainerChanges.ContainersToUpdate[CPU,Memory] list, invoke UpdateContainerResources + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + if len(podContainerChanges.ContainersToUpdate) > 0 || podContainerChanges.UpdatePodResources { + m.doPodResizeAction(pod, podStatus, podContainerChanges, result) + } + } + + // Step 8: start containers in podContainerChanges.ContainersToStart. 
for _, idx := range podContainerChanges.ContainersToStart { start(ctx, "container", metrics.Container, containerStartSpec(&pod.Spec.Containers[idx])) } @@ -1096,7 +1408,6 @@ func (m *kubeGenericRuntimeManager) GetPodStatus(ctx context.Context, uid kubety } m.logReduction.ClearID(podFullName) - return &kubecontainer.PodStatus{ ID: uid, Name: name, diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go b/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go index 3dc339e253b..ad33fe24121 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go @@ -37,11 +37,14 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" + utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/client-go/util/flowcontrol" + featuregatetesting "k8s.io/component-base/featuregate/testing" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" apitest "k8s.io/cri-api/pkg/apis/testing" podutil "k8s.io/kubernetes/pkg/api/v1/pod" "k8s.io/kubernetes/pkg/credentialprovider" + "k8s.io/kubernetes/pkg/features" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" containertest "k8s.io/kubernetes/pkg/kubelet/container/testing" proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results" @@ -861,6 +864,28 @@ func makeBasePodAndStatus() (*v1.Pod, *kubecontainer.PodStatus) { }, }, }, + Status: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + { + ContainerID: "://id1", + Name: "foo1", + Image: "busybox", + State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}, + }, + { + ContainerID: "://id2", + Name: "foo2", + Image: "busybox", + State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}, + }, + { + ContainerID: "://id3", + Name: "foo3", + Image: "busybox", + State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}, + }, + }, + }, } status := &kubecontainer.PodStatus{ ID: pod.UID, @@ -1615,3 +1640,466 @@ func makeBasePodAndStatusWithInitAndEphemeralContainers() (*v1.Pod, *kubecontain }) return pod, status } + +func TestComputePodActionsForPodResize(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.InPlacePodVerticalScaling, true)() + fakeRuntime, _, m, err := createTestRuntimeManager() + m.machineInfo.MemoryCapacity = 17179860387 // 16GB + assert.NoError(t, err) + + cpu100m := resource.MustParse("100m") + cpu200m := resource.MustParse("200m") + mem100M := resource.MustParse("100Mi") + mem200M := resource.MustParse("200Mi") + cpuPolicyRestartNotRequired := v1.ContainerResizePolicy{ResourceName: v1.ResourceCPU, Policy: v1.RestartNotRequired} + memPolicyRestartNotRequired := v1.ContainerResizePolicy{ResourceName: v1.ResourceMemory, Policy: v1.RestartNotRequired} + cpuPolicyRestartRequired := v1.ContainerResizePolicy{ResourceName: v1.ResourceCPU, Policy: v1.RestartRequired} + memPolicyRestartRequired := v1.ContainerResizePolicy{ResourceName: v1.ResourceMemory, Policy: v1.RestartRequired} + + for desc, test := range map[string]struct { + podResizePolicyFn func(*v1.Pod) + mutatePodFn func(*v1.Pod) + getExpectedPodActionsFn func(*v1.Pod, *kubecontainer.PodStatus) *podActions + }{ + "Update container CPU and memory resources": { + mutatePodFn: func(pod *v1.Pod) { + pod.Spec.Containers[1].Resources = v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu100m, v1.ResourceMemory: mem100M}, + } + if idx, found := podutil.GetIndexOfContainerStatus(pod.Status.ContainerStatuses, 
pod.Spec.Containers[1].Name); found { + pod.Status.ContainerStatuses[idx].Resources = &v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu200m, v1.ResourceMemory: mem200M}, + } + } + }, + getExpectedPodActionsFn: func(pod *v1.Pod, podStatus *kubecontainer.PodStatus) *podActions { + kcs := podStatus.FindContainerStatusByName(pod.Spec.Containers[1].Name) + pa := podActions{ + SandboxID: podStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{}, + ContainersToKill: getKillMap(pod, podStatus, []int{}), + ContainersToUpdate: map[v1.ResourceName][]containerToUpdateInfo{ + v1.ResourceMemory: { + { + apiContainerIdx: 1, + kubeContainerID: kcs.ID, + desiredContainerResources: containerResources{ + memoryLimit: mem100M.Value(), + cpuLimit: cpu100m.MilliValue(), + }, + currentContainerResources: &containerResources{ + memoryLimit: mem200M.Value(), + cpuLimit: cpu200m.MilliValue(), + }, + }, + }, + v1.ResourceCPU: { + { + apiContainerIdx: 1, + kubeContainerID: kcs.ID, + desiredContainerResources: containerResources{ + memoryLimit: mem100M.Value(), + cpuLimit: cpu100m.MilliValue(), + }, + currentContainerResources: &containerResources{ + memoryLimit: mem200M.Value(), + cpuLimit: cpu200m.MilliValue(), + }, + }, + }, + }, + } + return &pa + }, + }, + "Update container CPU resources": { + mutatePodFn: func(pod *v1.Pod) { + pod.Spec.Containers[1].Resources = v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu100m, v1.ResourceMemory: mem100M}, + } + if idx, found := podutil.GetIndexOfContainerStatus(pod.Status.ContainerStatuses, pod.Spec.Containers[1].Name); found { + pod.Status.ContainerStatuses[idx].Resources = &v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu200m, v1.ResourceMemory: mem100M}, + } + } + }, + getExpectedPodActionsFn: func(pod *v1.Pod, podStatus *kubecontainer.PodStatus) *podActions { + kcs := podStatus.FindContainerStatusByName(pod.Spec.Containers[1].Name) + pa := podActions{ + SandboxID: podStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{}, + ContainersToKill: getKillMap(pod, podStatus, []int{}), + ContainersToUpdate: map[v1.ResourceName][]containerToUpdateInfo{ + v1.ResourceCPU: { + { + apiContainerIdx: 1, + kubeContainerID: kcs.ID, + desiredContainerResources: containerResources{ + memoryLimit: mem100M.Value(), + cpuLimit: cpu100m.MilliValue(), + }, + currentContainerResources: &containerResources{ + memoryLimit: mem100M.Value(), + cpuLimit: cpu200m.MilliValue(), + }, + }, + }, + }, + } + return &pa + }, + }, + "Update container memory resources": { + mutatePodFn: func(pod *v1.Pod) { + pod.Spec.Containers[2].Resources = v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu200m, v1.ResourceMemory: mem200M}, + } + if idx, found := podutil.GetIndexOfContainerStatus(pod.Status.ContainerStatuses, pod.Spec.Containers[2].Name); found { + pod.Status.ContainerStatuses[idx].Resources = &v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu200m, v1.ResourceMemory: mem100M}, + } + } + }, + getExpectedPodActionsFn: func(pod *v1.Pod, podStatus *kubecontainer.PodStatus) *podActions { + kcs := podStatus.FindContainerStatusByName(pod.Spec.Containers[2].Name) + pa := podActions{ + SandboxID: podStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{}, + ContainersToKill: getKillMap(pod, podStatus, []int{}), + ContainersToUpdate: map[v1.ResourceName][]containerToUpdateInfo{ + v1.ResourceMemory: { + { + apiContainerIdx: 2, + kubeContainerID: kcs.ID, + desiredContainerResources: containerResources{ + 
memoryLimit: mem200M.Value(), + cpuLimit: cpu200m.MilliValue(), + }, + currentContainerResources: &containerResources{ + memoryLimit: mem100M.Value(), + cpuLimit: cpu200m.MilliValue(), + }, + }, + }, + }, + } + return &pa + }, + }, + "Nothing when spec.Resources and status.Resources are equal": { + mutatePodFn: func(pod *v1.Pod) { + pod.Spec.Containers[1].Resources = v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu200m}, + } + pod.Status.ContainerStatuses[1].Resources = &v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu200m}, + } + }, + getExpectedPodActionsFn: func(pod *v1.Pod, podStatus *kubecontainer.PodStatus) *podActions { + pa := podActions{ + SandboxID: podStatus.SandboxStatuses[0].Id, + ContainersToKill: getKillMap(pod, podStatus, []int{}), + ContainersToStart: []int{}, + ContainersToUpdate: map[v1.ResourceName][]containerToUpdateInfo{}, + } + return &pa + }, + }, + "Update container CPU and memory resources with Restart policy for CPU": { + podResizePolicyFn: func(pod *v1.Pod) { + pod.Spec.Containers[0].ResizePolicy = []v1.ContainerResizePolicy{cpuPolicyRestartRequired, memPolicyRestartNotRequired} + }, + mutatePodFn: func(pod *v1.Pod) { + pod.Spec.Containers[0].Resources = v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu200m, v1.ResourceMemory: mem200M}, + } + if idx, found := podutil.GetIndexOfContainerStatus(pod.Status.ContainerStatuses, pod.Spec.Containers[0].Name); found { + pod.Status.ContainerStatuses[idx].Resources = &v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu100m, v1.ResourceMemory: mem100M}, + } + } + }, + getExpectedPodActionsFn: func(pod *v1.Pod, podStatus *kubecontainer.PodStatus) *podActions { + kcs := podStatus.FindContainerStatusByName(pod.Spec.Containers[0].Name) + killMap := make(map[kubecontainer.ContainerID]containerToKillInfo) + killMap[kcs.ID] = containerToKillInfo{ + container: &pod.Spec.Containers[0], + name: pod.Spec.Containers[0].Name, + } + pa := podActions{ + SandboxID: podStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{0}, + ContainersToKill: killMap, + ContainersToUpdate: map[v1.ResourceName][]containerToUpdateInfo{}, + UpdatePodResources: true, + } + return &pa + }, + }, + "Update container CPU and memory resources with Restart policy for memory": { + podResizePolicyFn: func(pod *v1.Pod) { + pod.Spec.Containers[2].ResizePolicy = []v1.ContainerResizePolicy{cpuPolicyRestartNotRequired, memPolicyRestartRequired} + }, + mutatePodFn: func(pod *v1.Pod) { + pod.Spec.Containers[2].Resources = v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu200m, v1.ResourceMemory: mem200M}, + } + if idx, found := podutil.GetIndexOfContainerStatus(pod.Status.ContainerStatuses, pod.Spec.Containers[2].Name); found { + pod.Status.ContainerStatuses[idx].Resources = &v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu100m, v1.ResourceMemory: mem100M}, + } + } + }, + getExpectedPodActionsFn: func(pod *v1.Pod, podStatus *kubecontainer.PodStatus) *podActions { + kcs := podStatus.FindContainerStatusByName(pod.Spec.Containers[2].Name) + killMap := make(map[kubecontainer.ContainerID]containerToKillInfo) + killMap[kcs.ID] = containerToKillInfo{ + container: &pod.Spec.Containers[2], + name: pod.Spec.Containers[2].Name, + } + pa := podActions{ + SandboxID: podStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{2}, + ContainersToKill: killMap, + ContainersToUpdate: map[v1.ResourceName][]containerToUpdateInfo{}, + UpdatePodResources: true, + } + 
return &pa + }, + }, + "Update container memory resources with Restart policy for CPU": { + podResizePolicyFn: func(pod *v1.Pod) { + pod.Spec.Containers[1].ResizePolicy = []v1.ContainerResizePolicy{cpuPolicyRestartRequired, memPolicyRestartNotRequired} + }, + mutatePodFn: func(pod *v1.Pod) { + pod.Spec.Containers[1].Resources = v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu100m, v1.ResourceMemory: mem200M}, + } + if idx, found := podutil.GetIndexOfContainerStatus(pod.Status.ContainerStatuses, pod.Spec.Containers[1].Name); found { + pod.Status.ContainerStatuses[idx].Resources = &v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu100m, v1.ResourceMemory: mem100M}, + } + } + }, + getExpectedPodActionsFn: func(pod *v1.Pod, podStatus *kubecontainer.PodStatus) *podActions { + kcs := podStatus.FindContainerStatusByName(pod.Spec.Containers[1].Name) + pa := podActions{ + SandboxID: podStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{}, + ContainersToKill: getKillMap(pod, podStatus, []int{}), + ContainersToUpdate: map[v1.ResourceName][]containerToUpdateInfo{ + v1.ResourceMemory: { + { + apiContainerIdx: 1, + kubeContainerID: kcs.ID, + desiredContainerResources: containerResources{ + memoryLimit: mem200M.Value(), + cpuLimit: cpu100m.MilliValue(), + }, + currentContainerResources: &containerResources{ + memoryLimit: mem100M.Value(), + cpuLimit: cpu100m.MilliValue(), + }, + }, + }, + }, + } + return &pa + }, + }, + "Update container CPU resources with Restart policy for memory": { + podResizePolicyFn: func(pod *v1.Pod) { + pod.Spec.Containers[2].ResizePolicy = []v1.ContainerResizePolicy{cpuPolicyRestartNotRequired, memPolicyRestartRequired} + }, + mutatePodFn: func(pod *v1.Pod) { + pod.Spec.Containers[2].Resources = v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu200m, v1.ResourceMemory: mem100M}, + } + if idx, found := podutil.GetIndexOfContainerStatus(pod.Status.ContainerStatuses, pod.Spec.Containers[2].Name); found { + pod.Status.ContainerStatuses[idx].Resources = &v1.ResourceRequirements{ + Limits: v1.ResourceList{v1.ResourceCPU: cpu100m, v1.ResourceMemory: mem100M}, + } + } + }, + getExpectedPodActionsFn: func(pod *v1.Pod, podStatus *kubecontainer.PodStatus) *podActions { + kcs := podStatus.FindContainerStatusByName(pod.Spec.Containers[2].Name) + pa := podActions{ + SandboxID: podStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{}, + ContainersToKill: getKillMap(pod, podStatus, []int{}), + ContainersToUpdate: map[v1.ResourceName][]containerToUpdateInfo{ + v1.ResourceCPU: { + { + apiContainerIdx: 2, + kubeContainerID: kcs.ID, + desiredContainerResources: containerResources{ + memoryLimit: mem100M.Value(), + cpuLimit: cpu200m.MilliValue(), + }, + currentContainerResources: &containerResources{ + memoryLimit: mem100M.Value(), + cpuLimit: cpu100m.MilliValue(), + }, + }, + }, + }, + } + return &pa + }, + }, + } { + pod, kps := makeBasePodAndStatus() + for idx := range pod.Spec.Containers { + // default resize policy when pod resize feature is enabled + pod.Spec.Containers[idx].ResizePolicy = []v1.ContainerResizePolicy{cpuPolicyRestartNotRequired, memPolicyRestartNotRequired} + } + if test.podResizePolicyFn != nil { + test.podResizePolicyFn(pod) + } + for idx := range pod.Spec.Containers { + // compute hash + if kcs := kps.FindContainerStatusByName(pod.Spec.Containers[idx].Name); kcs != nil { + kcs.Hash = kubecontainer.HashContainer(&pod.Spec.Containers[idx]) + kcs.HashWithoutResources = 
kubecontainer.HashContainerWithoutResources(&pod.Spec.Containers[idx]) + } + } + makeAndSetFakePod(t, m, fakeRuntime, pod) + status, _ := m.GetPodStatus(kps.ID, pod.Name, pod.Namespace) + for idx := range pod.Spec.Containers { + if rcs := status.FindContainerStatusByName(pod.Spec.Containers[idx].Name); rcs != nil { + if csIdx, found := podutil.GetIndexOfContainerStatus(pod.Status.ContainerStatuses, pod.Spec.Containers[idx].Name); found { + pod.Status.ContainerStatuses[csIdx].ContainerID = rcs.ID.String() + } + } + } + for idx := range pod.Spec.Containers { + if kcs := kps.FindContainerStatusByName(pod.Spec.Containers[idx].Name); kcs != nil { + kcs.Hash = kubecontainer.HashContainer(&pod.Spec.Containers[idx]) + kcs.HashWithoutResources = kubecontainer.HashContainerWithoutResources(&pod.Spec.Containers[idx]) + } + } + if test.mutatePodFn != nil { + test.mutatePodFn(pod) + } + expectedActions := test.getExpectedPodActionsFn(pod, status) + actions := m.computePodActions(pod, status) + verifyActions(t, expectedActions, &actions, desc) + } +} + +func TestUpdatePodContainerResources(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.InPlacePodVerticalScaling, true)() + fakeRuntime, _, m, err := createTestRuntimeManager() + m.machineInfo.MemoryCapacity = 17179860387 // 16GB + assert.NoError(t, err) + + cpu100m := resource.MustParse("100m") + cpu150m := resource.MustParse("150m") + cpu200m := resource.MustParse("200m") + cpu250m := resource.MustParse("250m") + cpu300m := resource.MustParse("300m") + cpu350m := resource.MustParse("350m") + mem100M := resource.MustParse("100Mi") + mem150M := resource.MustParse("150Mi") + mem200M := resource.MustParse("200Mi") + mem250M := resource.MustParse("250Mi") + mem300M := resource.MustParse("300Mi") + mem350M := resource.MustParse("350Mi") + res100m100Mi := v1.ResourceList{v1.ResourceCPU: cpu100m, v1.ResourceMemory: mem100M} + res150m100Mi := v1.ResourceList{v1.ResourceCPU: cpu150m, v1.ResourceMemory: mem100M} + res100m150Mi := v1.ResourceList{v1.ResourceCPU: cpu100m, v1.ResourceMemory: mem150M} + res150m150Mi := v1.ResourceList{v1.ResourceCPU: cpu150m, v1.ResourceMemory: mem150M} + res200m200Mi := v1.ResourceList{v1.ResourceCPU: cpu200m, v1.ResourceMemory: mem200M} + res250m200Mi := v1.ResourceList{v1.ResourceCPU: cpu250m, v1.ResourceMemory: mem200M} + res200m250Mi := v1.ResourceList{v1.ResourceCPU: cpu200m, v1.ResourceMemory: mem250M} + res250m250Mi := v1.ResourceList{v1.ResourceCPU: cpu250m, v1.ResourceMemory: mem250M} + res300m300Mi := v1.ResourceList{v1.ResourceCPU: cpu300m, v1.ResourceMemory: mem300M} + res350m300Mi := v1.ResourceList{v1.ResourceCPU: cpu350m, v1.ResourceMemory: mem300M} + res300m350Mi := v1.ResourceList{v1.ResourceCPU: cpu300m, v1.ResourceMemory: mem350M} + res350m350Mi := v1.ResourceList{v1.ResourceCPU: cpu350m, v1.ResourceMemory: mem350M} + + pod, _ := makeBasePodAndStatus() + makeAndSetFakePod(t, m, fakeRuntime, pod) + + for dsc, tc := range map[string]struct { + resourceName v1.ResourceName + apiSpecResources []v1.ResourceRequirements + apiStatusResources []v1.ResourceRequirements + requiresRestart []bool + invokeUpdateResources bool + expectedCurrentLimits []v1.ResourceList + expectedCurrentRequests []v1.ResourceList + }{ + "Guaranteed QoS Pod - CPU & memory resize requested, update CPU": { + resourceName: v1.ResourceCPU, + apiSpecResources: []v1.ResourceRequirements{ + {Limits: res150m150Mi, Requests: res150m150Mi}, + {Limits: res250m250Mi, Requests: res250m250Mi}, + 
{Limits: res350m350Mi, Requests: res350m350Mi}, + }, + apiStatusResources: []v1.ResourceRequirements{ + {Limits: res100m100Mi, Requests: res100m100Mi}, + {Limits: res200m200Mi, Requests: res200m200Mi}, + {Limits: res300m300Mi, Requests: res300m300Mi}, + }, + requiresRestart: []bool{false, false, false}, + invokeUpdateResources: true, + expectedCurrentLimits: []v1.ResourceList{res150m100Mi, res250m200Mi, res350m300Mi}, + expectedCurrentRequests: []v1.ResourceList{res150m100Mi, res250m200Mi, res350m300Mi}, + }, + "Guaranteed QoS Pod - CPU & memory resize requested, update memory": { + resourceName: v1.ResourceMemory, + apiSpecResources: []v1.ResourceRequirements{ + {Limits: res150m150Mi, Requests: res150m150Mi}, + {Limits: res250m250Mi, Requests: res250m250Mi}, + {Limits: res350m350Mi, Requests: res350m350Mi}, + }, + apiStatusResources: []v1.ResourceRequirements{ + {Limits: res100m100Mi, Requests: res100m100Mi}, + {Limits: res200m200Mi, Requests: res200m200Mi}, + {Limits: res300m300Mi, Requests: res300m300Mi}, + }, + requiresRestart: []bool{false, false, false}, + invokeUpdateResources: true, + expectedCurrentLimits: []v1.ResourceList{res100m150Mi, res200m250Mi, res300m350Mi}, + expectedCurrentRequests: []v1.ResourceList{res100m150Mi, res200m250Mi, res300m350Mi}, + }, + } { + var containersToUpdate []containerToUpdateInfo + for idx := range pod.Spec.Containers { + // default resize policy when pod resize feature is enabled + pod.Spec.Containers[idx].Resources = tc.apiSpecResources[idx] + pod.Status.ContainerStatuses[idx].Resources = &tc.apiStatusResources[idx] + cInfo := containerToUpdateInfo{ + apiContainerIdx: idx, + kubeContainerID: kubecontainer.ContainerID{}, + desiredContainerResources: containerResources{ + memoryLimit: tc.apiSpecResources[idx].Limits.Memory().Value(), + memoryRequest: tc.apiSpecResources[idx].Requests.Memory().Value(), + cpuLimit: tc.apiSpecResources[idx].Limits.Cpu().MilliValue(), + cpuRequest: tc.apiSpecResources[idx].Requests.Cpu().MilliValue(), + }, + currentContainerResources: &containerResources{ + memoryLimit: tc.apiStatusResources[idx].Limits.Memory().Value(), + memoryRequest: tc.apiStatusResources[idx].Requests.Memory().Value(), + cpuLimit: tc.apiStatusResources[idx].Limits.Cpu().MilliValue(), + cpuRequest: tc.apiStatusResources[idx].Requests.Cpu().MilliValue(), + }, + } + containersToUpdate = append(containersToUpdate, cInfo) + } + fakeRuntime.Called = []string{} + err := m.updatePodContainerResources(pod, tc.resourceName, containersToUpdate) + assert.NoError(t, err, dsc) + + if tc.invokeUpdateResources { + assert.Contains(t, fakeRuntime.Called, "UpdateContainerResources", dsc) + } + for idx := range pod.Spec.Containers { + assert.Equal(t, tc.expectedCurrentLimits[idx].Memory().Value(), containersToUpdate[idx].currentContainerResources.memoryLimit, dsc) + assert.Equal(t, tc.expectedCurrentRequests[idx].Memory().Value(), containersToUpdate[idx].currentContainerResources.memoryRequest, dsc) + assert.Equal(t, tc.expectedCurrentLimits[idx].Cpu().MilliValue(), containersToUpdate[idx].currentContainerResources.cpuLimit, dsc) + assert.Equal(t, tc.expectedCurrentRequests[idx].Cpu().MilliValue(), containersToUpdate[idx].currentContainerResources.cpuRequest, dsc) + } + } +} diff --git a/pkg/kubelet/kuberuntime/labels.go b/pkg/kubelet/kuberuntime/labels.go index 146eadbbb2f..ae1ac604862 100644 --- a/pkg/kubelet/kuberuntime/labels.go +++ b/pkg/kubelet/kuberuntime/labels.go @@ -22,7 +22,9 @@ import ( v1 "k8s.io/api/core/v1" kubetypes "k8s.io/apimachinery/pkg/types" + 
utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/klog/v2" + "k8s.io/kubernetes/pkg/features" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" "k8s.io/kubernetes/pkg/kubelet/types" ) @@ -33,6 +35,7 @@ const ( podTerminationGracePeriodLabel = "io.kubernetes.pod.terminationGracePeriod" containerHashLabel = "io.kubernetes.container.hash" + containerHashWithoutResourcesLabel = "io.kubernetes.container.hashWithoutResources" containerRestartCountLabel = "io.kubernetes.container.restartCount" containerTerminationMessagePathLabel = "io.kubernetes.container.terminationMessagePath" containerTerminationMessagePolicyLabel = "io.kubernetes.container.terminationMessagePolicy" @@ -62,6 +65,7 @@ type labeledContainerInfo struct { type annotatedContainerInfo struct { Hash uint64 + HashWithoutResources uint64 RestartCount int PodDeletionGracePeriod *int64 PodTerminationGracePeriod *int64 @@ -113,6 +117,9 @@ func newContainerAnnotations(container *v1.Container, pod *v1.Pod, restartCount } annotations[containerHashLabel] = strconv.FormatUint(kubecontainer.HashContainer(container), 16) + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + annotations[containerHashWithoutResourcesLabel] = strconv.FormatUint(kubecontainer.HashContainerWithoutResources(container), 16) + } annotations[containerRestartCountLabel] = strconv.Itoa(restartCount) annotations[containerTerminationMessagePathLabel] = container.TerminationMessagePath annotations[containerTerminationMessagePolicyLabel] = string(container.TerminationMessagePolicy) @@ -193,6 +200,11 @@ func getContainerInfoFromAnnotations(annotations map[string]string) *annotatedCo if containerInfo.Hash, err = getUint64ValueFromLabel(annotations, containerHashLabel); err != nil { klog.ErrorS(err, "Unable to get label value from annotations", "label", containerHashLabel, "annotations", annotations) } + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + if containerInfo.HashWithoutResources, err = getUint64ValueFromLabel(annotations, containerHashWithoutResourcesLabel); err != nil { + klog.ErrorS(err, "Unable to get label value from annotations", "label", containerHashWithoutResourcesLabel, "annotations", annotations) + } + } if containerInfo.RestartCount, err = getIntValueFromLabel(annotations, containerRestartCountLabel); err != nil { klog.ErrorS(err, "Unable to get label value from annotations", "label", containerRestartCountLabel, "annotations", annotations) } diff --git a/pkg/kubelet/kuberuntime/labels_test.go b/pkg/kubelet/kuberuntime/labels_test.go index 6157bb50d89..8ee21f30f99 100644 --- a/pkg/kubelet/kuberuntime/labels_test.go +++ b/pkg/kubelet/kuberuntime/labels_test.go @@ -23,6 +23,9 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" + utilfeature "k8s.io/apiserver/pkg/util/feature" + featuregatetesting "k8s.io/component-base/featuregate/testing" + "k8s.io/kubernetes/pkg/features" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" ) @@ -152,11 +155,13 @@ func TestContainerAnnotations(t *testing.T) { PodDeletionGracePeriod: pod.DeletionGracePeriodSeconds, PodTerminationGracePeriod: pod.Spec.TerminationGracePeriodSeconds, Hash: kubecontainer.HashContainer(container), + HashWithoutResources: kubecontainer.HashContainerWithoutResources(container), RestartCount: restartCount, TerminationMessagePath: container.TerminationMessagePath, PreStopHandler: container.Lifecycle.PreStop, } + defer 
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.InPlacePodVerticalScaling, true)() // Test whether we can get right information from label annotations := newContainerAnnotations(container, pod, restartCount, opts) containerInfo := getContainerInfoFromAnnotations(annotations) @@ -177,6 +182,7 @@ func TestContainerAnnotations(t *testing.T) { expected.PreStopHandler = nil // Because container is changed, the Hash should be updated expected.Hash = kubecontainer.HashContainer(container) + expected.HashWithoutResources = kubecontainer.HashContainerWithoutResources(container) annotations = newContainerAnnotations(container, pod, restartCount, opts) containerInfo = getContainerInfoFromAnnotations(annotations) if !reflect.DeepEqual(containerInfo, expected) { diff --git a/pkg/kubelet/pleg/generic.go b/pkg/kubelet/pleg/generic.go index 656eafbca39..877f51c72c4 100644 --- a/pkg/kubelet/pleg/generic.go +++ b/pkg/kubelet/pleg/generic.go @@ -76,6 +76,8 @@ type GenericPLEG struct { runningMu sync.Mutex // Indicates relisting related parameters relistDuration *RelistDuration + // Mutex to serialize updateCache called by relist vs UpdateCache interface + podCacheMutex sync.Mutex } // plegContainerState has a one-to-one mapping to the @@ -436,6 +438,8 @@ func (g *GenericPLEG) updateCache(ctx context.Context, pod *kubecontainer.Pod, p return nil, true } + g.podCacheMutex.Lock() + defer g.podCacheMutex.Unlock() timestamp := g.clock.Now() status, err := g.runtime.GetPodStatus(ctx, pod.ID, pod.Name, pod.Namespace) @@ -478,6 +482,16 @@ func (g *GenericPLEG) updateCache(ctx context.Context, pod *kubecontainer.Pod, p return err, g.cache.Set(pod.ID, status, err, timestamp) } +func (g *GenericPLEG) UpdateCache(pod *kubecontainer.Pod, pid types.UID) error { + if !g.cacheEnabled() { + return fmt.Errorf("pod cache disabled") + } + if pod == nil { + return fmt.Errorf("pod cannot be nil") + } + return g.updateCache(pod, pid) +} + func updateEvents(eventsByPodID map[types.UID][]*PodLifecycleEvent, e *PodLifecycleEvent) { if e == nil { return diff --git a/pkg/kubelet/pleg/pleg.go b/pkg/kubelet/pleg/pleg.go index e7e678e6453..b0c60b66c9a 100644 --- a/pkg/kubelet/pleg/pleg.go +++ b/pkg/kubelet/pleg/pleg.go @@ -20,6 +20,7 @@ import ( "time" "k8s.io/apimachinery/pkg/types" + kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" ) // PodLifeCycleEventType define the event type of pod life cycle events. @@ -68,4 +69,5 @@ type PodLifecycleEventGenerator interface { Watch() chan *PodLifecycleEvent Healthy() (bool, error) Relist() + UpdateCache(*kubecontainer.Pod, types.UID) error } diff --git a/pkg/kubelet/prober/common_test.go b/pkg/kubelet/prober/common_test.go index 220bd689e0c..22ce2543f5a 100644 --- a/pkg/kubelet/prober/common_test.go +++ b/pkg/kubelet/prober/common_test.go @@ -17,6 +17,7 @@ limitations under the License. package prober import ( + "io/ioutil" "reflect" "sync" @@ -109,8 +110,14 @@ func newTestManager() *manager { podStartupLatencyTracker := kubeletutil.NewPodStartupLatencyTracker() // Add test pod to pod manager, so that status manager can get the pod from pod manager if needed. 
podManager.AddPod(getTestPod()) + testRootDir := "" + if tempDir, err := ioutil.TempDir("", "kubelet_test."); err != nil { + return nil + } else { + testRootDir = tempDir + } m := NewManager( - status.NewManager(&fake.Clientset{}, podManager, &statustest.FakePodDeletionSafetyProvider{}, podStartupLatencyTracker), + status.NewManager(&fake.Clientset{}, podManager, &statustest.FakePodDeletionSafetyProvider{}, podStartupLatencyTracker, testRootDir), results.NewManager(), results.NewManager(), results.NewManager(), diff --git a/pkg/kubelet/prober/worker_test.go b/pkg/kubelet/prober/worker_test.go index d48dce2609c..277b8f81bb8 100644 --- a/pkg/kubelet/prober/worker_test.go +++ b/pkg/kubelet/prober/worker_test.go @@ -19,6 +19,7 @@ package prober import ( "context" "fmt" + "io/ioutil" "testing" "time" @@ -153,7 +154,13 @@ func TestDoProbe(t *testing.T) { } // Clean up. - m.statusManager = status.NewManager(&fake.Clientset{}, kubepod.NewBasicPodManager(nil), &statustest.FakePodDeletionSafetyProvider{}, kubeletutil.NewPodStartupLatencyTracker()) + testRootDir := "" + if tempDir, err := ioutil.TempDir("", "kubelet_test."); err != nil { + t.Fatalf("can't make a temp rootdir: %v", err) + } else { + testRootDir = tempDir + } + m.statusManager = status.NewManager(&fake.Clientset{}, kubepod.NewBasicPodManager(nil), &statustest.FakePodDeletionSafetyProvider{}, kubeletutil.NewPodStartupLatencyTracker(), testRootDir) resultsManager(m, probeType).Remove(testContainerID) } } diff --git a/pkg/kubelet/qos/policy.go b/pkg/kubelet/qos/policy.go index 93d0934c280..17f2eb9c8ae 100644 --- a/pkg/kubelet/qos/policy.go +++ b/pkg/kubelet/qos/policy.go @@ -18,7 +18,10 @@ package qos import ( v1 "k8s.io/api/core/v1" + utilfeature "k8s.io/apiserver/pkg/util/feature" + podutil "k8s.io/kubernetes/pkg/api/v1/pod" v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" + "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/kubelet/types" ) @@ -60,6 +63,11 @@ func GetContainerOOMScoreAdjust(pod *v1.Pod, container *v1.Container, memoryCapa // targets for OOM kills. // Note that this is a heuristic, it won't work if a container has many small processes. memoryRequest := container.Resources.Requests.Memory().Value() + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + if cs, ok := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok { + memoryRequest = cs.ResourcesAllocated.Memory().Value() + } + } oomScoreAdjust := 1000 - (1000*memoryRequest)/memoryCapacity // A guaranteed pod using 100% of memory can have an OOM score of 10. Ensure // that burstable pods have a higher OOM score adjustment. 
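Note on the qos/policy.go hunk above: with InPlacePodVerticalScaling enabled, the OOM score adjustment is computed from the memory request the kubelet has actually allocated (status ResourcesAllocated) rather than the spec request, so a pending resize does not skew the score. A minimal standalone sketch of the arithmetic, assuming a 16Gi node and made-up request values (oomScoreAdjust is a hypothetical helper that mirrors the formula in GetContainerOOMScoreAdjust, not kubelet code):

// Illustrative sketch, not part of this diff: oomScoreAdjust mirrors the formula
// 1000 - (1000*memoryRequest)/memoryCapacity used in GetContainerOOMScoreAdjust.
// Capacity and request values below are made up.
package main

import "fmt"

func oomScoreAdjust(memoryRequestBytes, memoryCapacityBytes int64) int64 {
	return 1000 - (1000*memoryRequestBytes)/memoryCapacityBytes
}

func main() {
	const gi = int64(1) << 30
	capacity := 16 * gi

	specRequest := 8 * gi      // what the pod spec currently asks for
	allocatedRequest := 4 * gi // what the kubelet has actually allocated so far

	fmt.Println(oomScoreAdjust(specRequest, capacity))      // 500
	fmt.Println(oomScoreAdjust(allocatedRequest, capacity)) // 750
}

A container whose allocated request is still smaller than its spec request gets the higher adjustment (750 vs 500 here), i.e. it remains the more likely OOM-kill target until the resize is actuated.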
diff --git a/pkg/kubelet/runonce_test.go b/pkg/kubelet/runonce_test.go index 96adad4b313..690e1287097 100644 --- a/pkg/kubelet/runonce_test.go +++ b/pkg/kubelet/runonce_test.go @@ -85,7 +85,7 @@ func TestRunOnce(t *testing.T) { recorder: &record.FakeRecorder{}, cadvisor: cadvisor, nodeLister: testNodeLister{}, - statusManager: status.NewManager(nil, podManager, &statustest.FakePodDeletionSafetyProvider{}, podStartupLatencyTracker), + statusManager: status.NewManager(nil, podManager, &statustest.FakePodDeletionSafetyProvider{}, podStartupLatencyTracker, basePath), podManager: podManager, podWorkers: &fakePodWorkers{}, os: &containertest.FakeOS{}, diff --git a/pkg/kubelet/status/fake_status_manager.go b/pkg/kubelet/status/fake_status_manager.go new file mode 100644 index 00000000000..ee4b3f5f36f --- /dev/null +++ b/pkg/kubelet/status/fake_status_manager.go @@ -0,0 +1,93 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package status + +import ( + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/klog/v2" + kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" + "k8s.io/kubernetes/pkg/kubelet/status/state" +) + +type fakeManager struct { + state state.State +} + +func (m *fakeManager) Start() { + klog.InfoS("Start()") + return +} + +func (m *fakeManager) GetPodStatus(uid types.UID) (v1.PodStatus, bool) { + klog.InfoS("GetPodStatus()") + return v1.PodStatus{}, false +} + +func (m *fakeManager) SetPodStatus(pod *v1.Pod, status v1.PodStatus) { + klog.InfoS("SetPodStatus()") + return +} + +func (m *fakeManager) SetContainerReadiness(podUID types.UID, containerID kubecontainer.ContainerID, ready bool) { + klog.InfoS("SetContainerReadiness()") + return +} + +func (m *fakeManager) SetContainerStartup(podUID types.UID, containerID kubecontainer.ContainerID, started bool) { + klog.InfoS("SetContainerStartup()") + return +} + +func (m *fakeManager) TerminatePod(pod *v1.Pod) { + klog.InfoS("TerminatePod()") + return +} + +func (m *fakeManager) RemoveOrphanedStatuses(podUIDs map[types.UID]bool) { + klog.InfoS("RemoveOrphanedStatuses()") + return +} + +func (m *fakeManager) State() state.Reader { + klog.InfoS("State()") + return m.state +} + +func (m *fakeManager) SetPodAllocation(pod *v1.Pod) error { + klog.InfoS("SetPodAllocation()") + for _, container := range pod.Spec.Containers { + var alloc v1.ResourceList + if container.Resources.Requests != nil { + alloc = container.Resources.Requests.DeepCopy() + } + m.state.SetContainerResourceAllocation(string(pod.UID), container.Name, alloc) + } + return nil +} + +func (m *fakeManager) SetPodResizeStatus(podUID types.UID, resizeStatus v1.PodResizeStatus) error { + klog.InfoS("SetPodResizeStatus()") + return nil +} + +// NewFakeManager creates empty/fake memory manager +func NewFakeManager() Manager { + return &fakeManager{ + state: state.NewStateMemory(), + } +} diff --git a/pkg/kubelet/status/state/checkpoint.go b/pkg/kubelet/status/state/checkpoint.go new file mode 100644 index 00000000000..6cad6361e28 --- 
/dev/null +++ b/pkg/kubelet/status/state/checkpoint.go @@ -0,0 +1,65 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package state + +import ( + "encoding/json" + + "k8s.io/api/core/v1" + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum" +) + +var _ checkpointmanager.Checkpoint = &PodResourceAllocationCheckpoint{} + +// PodResourceAllocationCheckpoint is used to store resources allocated to a pod in checkpoint +type PodResourceAllocationCheckpoint struct { + AllocationEntries map[string]map[string]v1.ResourceList `json:"allocationEntries,omitempty"` + ResizeStatusEntries map[string]v1.PodResizeStatus `json:"resizeStatusEntries,omitempty"` + Checksum checksum.Checksum `json:"checksum"` +} + +// NewPodResourceAllocationCheckpoint returns an instance of Checkpoint +func NewPodResourceAllocationCheckpoint() *PodResourceAllocationCheckpoint { + //lint:ignore unexported-type-in-api user-facing error message + return &PodResourceAllocationCheckpoint{ + AllocationEntries: make(map[string]map[string]v1.ResourceList), + ResizeStatusEntries: make(map[string]v1.PodResizeStatus), + } +} + +// MarshalCheckpoint returns marshalled checkpoint +func (prc *PodResourceAllocationCheckpoint) MarshalCheckpoint() ([]byte, error) { + // make sure checksum wasn't set before so it doesn't affect output checksum + prc.Checksum = 0 + prc.Checksum = checksum.New(prc) + return json.Marshal(*prc) +} + +// UnmarshalCheckpoint tries to unmarshal passed bytes to checkpoint +func (prc *PodResourceAllocationCheckpoint) UnmarshalCheckpoint(blob []byte) error { + return json.Unmarshal(blob, prc) +} + +// VerifyChecksum verifies that current checksum of checkpoint is valid +func (prc *PodResourceAllocationCheckpoint) VerifyChecksum() error { + ck := prc.Checksum + prc.Checksum = 0 + err := ck.Verify(prc) + prc.Checksum = ck + return err +} diff --git a/pkg/kubelet/status/state/state.go b/pkg/kubelet/status/state/state.go new file mode 100644 index 00000000000..2fdbe8a4474 --- /dev/null +++ b/pkg/kubelet/status/state/state.go @@ -0,0 +1,62 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package state + +import ( + "k8s.io/api/core/v1" +) + +// PodResourceAllocation type is used in tracking resources allocated to pod's containers +type PodResourceAllocation map[string]map[string]v1.ResourceList + +// PodResizeStatus type is used in tracking the last resize decision for pod +type PodResizeStatus map[string]v1.PodResizeStatus + +// Clone returns a copy of PodResourceAllocation +func (pr PodResourceAllocation) Clone() PodResourceAllocation { + prCopy := make(PodResourceAllocation) + for pod := range pr { + prCopy[pod] = make(map[string]v1.ResourceList) + for container, alloc := range pr[pod] { + prCopy[pod][container] = alloc.DeepCopy() + } + } + return prCopy +} + +// Reader interface used to read current pod resource allocation state +type Reader interface { + GetContainerResourceAllocation(podUID string, containerName string) (v1.ResourceList, bool) + GetPodResourceAllocation() PodResourceAllocation + GetPodResizeStatus(podUID string) (v1.PodResizeStatus, bool) + GetResizeStatus() PodResizeStatus +} + +type writer interface { + SetContainerResourceAllocation(podUID string, containerName string, alloc v1.ResourceList) error + SetPodResourceAllocation(PodResourceAllocation) error + SetPodResizeStatus(podUID string, resizeStatus v1.PodResizeStatus) error + SetResizeStatus(PodResizeStatus) error + Delete(podUID string, containerName string) error + ClearState() error +} + +// State interface provides methods for tracking and setting pod resource allocation +type State interface { + Reader + writer +} diff --git a/pkg/kubelet/status/state/state_checkpoint.go b/pkg/kubelet/status/state/state_checkpoint.go new file mode 100644 index 00000000000..844d0649b4c --- /dev/null +++ b/pkg/kubelet/status/state/state_checkpoint.go @@ -0,0 +1,179 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package state + +import ( + "fmt" + "path" + "sync" + + "k8s.io/api/core/v1" + "k8s.io/klog/v2" + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors" +) + +var _ State = &stateCheckpoint{} + +type stateCheckpoint struct { + mux sync.RWMutex + cache State + checkpointManager checkpointmanager.CheckpointManager + checkpointName string +} + +// NewStateCheckpoint creates new State for keeping track of pod resource allocations with checkpoint backend +func NewStateCheckpoint(stateDir, checkpointName string) (State, error) { + checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir) + if err != nil { + return nil, fmt.Errorf("failed to initialize checkpoint manager for pod allocation tracking: %v", err) + } + stateCheckpoint := &stateCheckpoint{ + cache: NewStateMemory(), + checkpointManager: checkpointManager, + checkpointName: checkpointName, + } + + if err := stateCheckpoint.restoreState(); err != nil { + //lint:ignore ST1005 user-facing error message + return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete pod allocation checkpoint file %q before restarting Kubelet", err, path.Join(stateDir, checkpointName)) + } + return stateCheckpoint, nil +} + +// restores state from a checkpoint and creates it if it doesn't exist +func (sc *stateCheckpoint) restoreState() error { + sc.mux.Lock() + defer sc.mux.Unlock() + var err error + + checkpoint := NewPodResourceAllocationCheckpoint() + + if err = sc.checkpointManager.GetCheckpoint(sc.checkpointName, checkpoint); err != nil { + if err == errors.ErrCheckpointNotFound { + return sc.storeState() + } + return err + } + + sc.cache.SetPodResourceAllocation(checkpoint.AllocationEntries) + sc.cache.SetResizeStatus(checkpoint.ResizeStatusEntries) + klog.V(2).InfoS("State checkpoint: restored pod resource allocation state from checkpoint") + return nil +} + +// saves state to a checkpoint, caller is responsible for locking +func (sc *stateCheckpoint) storeState() error { + checkpoint := NewPodResourceAllocationCheckpoint() + + podAllocation := sc.cache.GetPodResourceAllocation() + for pod := range podAllocation { + checkpoint.AllocationEntries[pod] = make(map[string]v1.ResourceList) + for container, alloc := range podAllocation[pod] { + checkpoint.AllocationEntries[pod][container] = alloc + } + } + + podResizeStatus := sc.cache.GetResizeStatus() + checkpoint.ResizeStatusEntries = make(map[string]v1.PodResizeStatus) + for pUID, rStatus := range podResizeStatus { + checkpoint.ResizeStatusEntries[pUID] = rStatus + } + + err := sc.checkpointManager.CreateCheckpoint(sc.checkpointName, checkpoint) + if err != nil { + klog.ErrorS(err, "Failed to save pod allocation checkpoint") + return err + } + return nil +} + +// GetContainerResourceAllocation returns current resources allocated to a pod's container +func (sc *stateCheckpoint) GetContainerResourceAllocation(podUID string, containerName string) (v1.ResourceList, bool) { + sc.mux.RLock() + defer sc.mux.RUnlock() + return sc.cache.GetContainerResourceAllocation(podUID, containerName) +} + +// GetPodResourceAllocation returns current pod resource allocation +func (sc *stateCheckpoint) GetPodResourceAllocation() PodResourceAllocation { + sc.mux.RLock() + defer sc.mux.RUnlock() + return sc.cache.GetPodResourceAllocation() +} + +// GetPodResizeStatus returns the last resize decision for a pod +func (sc *stateCheckpoint) GetPodResizeStatus(podUID string) (v1.PodResizeStatus, bool) { + 
sc.mux.RLock() + defer sc.mux.RUnlock() + return sc.cache.GetPodResizeStatus(podUID) +} + +// GetResizeStatus returns the set of resize decisions made +func (sc *stateCheckpoint) GetResizeStatus() PodResizeStatus { + sc.mux.RLock() + defer sc.mux.RUnlock() + return sc.cache.GetResizeStatus() +} + +// SetContainerResourceAllocation sets resources allocated to a pod's container +func (sc *stateCheckpoint) SetContainerResourceAllocation(podUID string, containerName string, alloc v1.ResourceList) error { + sc.mux.Lock() + defer sc.mux.Unlock() + sc.cache.SetContainerResourceAllocation(podUID, containerName, alloc) + return sc.storeState() +} + +// SetPodResourceAllocation sets pod resource allocation +func (sc *stateCheckpoint) SetPodResourceAllocation(a PodResourceAllocation) error { + sc.mux.Lock() + defer sc.mux.Unlock() + sc.cache.SetPodResourceAllocation(a) + return sc.storeState() +} + +// SetPodResizeStatus sets the last resize decision for a pod +func (sc *stateCheckpoint) SetPodResizeStatus(podUID string, resizeStatus v1.PodResizeStatus) error { + sc.mux.Lock() + defer sc.mux.Unlock() + sc.cache.SetPodResizeStatus(podUID, resizeStatus) + return sc.storeState() +} + +// SetResizeStatus sets the resize decisions +func (sc *stateCheckpoint) SetResizeStatus(rs PodResizeStatus) error { + sc.mux.Lock() + defer sc.mux.Unlock() + sc.cache.SetResizeStatus(rs) + return sc.storeState() +} + +// Delete deletes allocations for specified pod +func (sc *stateCheckpoint) Delete(podUID string, containerName string) error { + sc.mux.Lock() + defer sc.mux.Unlock() + sc.cache.Delete(podUID, containerName) + return sc.storeState() +} + +// ClearState clears the state and saves it in a checkpoint +func (sc *stateCheckpoint) ClearState() error { + sc.mux.Lock() + defer sc.mux.Unlock() + sc.cache.ClearState() + return sc.storeState() +} diff --git a/pkg/kubelet/status/state/state_mem.go b/pkg/kubelet/status/state/state_mem.go new file mode 100644 index 00000000000..6a4047b1739 --- /dev/null +++ b/pkg/kubelet/status/state/state_mem.go @@ -0,0 +1,152 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package state + +import ( + "sync" + + "k8s.io/api/core/v1" + "k8s.io/klog/v2" +) + +type stateMemory struct { + sync.RWMutex + podAllocation PodResourceAllocation + podResizeStatus PodResizeStatus +} + +var _ State = &stateMemory{} + +// NewStateMemory creates new State to track resources allocated to pods +func NewStateMemory() State { + klog.V(2).InfoS("Initialized new in-memory state store for pod resource allocation tracking") + return &stateMemory{ + podAllocation: PodResourceAllocation{}, + podResizeStatus: PodResizeStatus{}, + } +} + +func (s *stateMemory) GetContainerResourceAllocation(podUID string, containerName string) (v1.ResourceList, bool) { + s.RLock() + defer s.RUnlock() + + alloc, ok := s.podAllocation[podUID][containerName] + return alloc.DeepCopy(), ok +} + +func (s *stateMemory) GetPodResourceAllocation() PodResourceAllocation { + s.RLock() + defer s.RUnlock() + return s.podAllocation.Clone() +} + +func (s *stateMemory) GetPodResizeStatus(podUID string) (v1.PodResizeStatus, bool) { + s.RLock() + defer s.RUnlock() + + resizeStatus, ok := s.podResizeStatus[podUID] + return resizeStatus, ok +} + +func (s *stateMemory) GetResizeStatus() PodResizeStatus { + s.RLock() + defer s.RUnlock() + prs := make(map[string]v1.PodResizeStatus) + for k, v := range s.podResizeStatus { + prs[k] = v + } + return prs +} + +func (s *stateMemory) SetContainerResourceAllocation(podUID string, containerName string, alloc v1.ResourceList) error { + s.Lock() + defer s.Unlock() + + if _, ok := s.podAllocation[podUID]; !ok { + s.podAllocation[podUID] = make(map[string]v1.ResourceList) + } + + s.podAllocation[podUID][containerName] = alloc + klog.V(3).InfoS("Updated container resource allocation", "podUID", podUID, "containerName", containerName, "alloc", alloc) + return nil +} + +func (s *stateMemory) SetPodResourceAllocation(a PodResourceAllocation) error { + s.Lock() + defer s.Unlock() + + s.podAllocation = a.Clone() + klog.V(3).InfoS("Updated pod resource allocation", "allocation", a) + return nil +} + +func (s *stateMemory) SetPodResizeStatus(podUID string, resizeStatus v1.PodResizeStatus) error { + s.Lock() + defer s.Unlock() + + if resizeStatus != "" { + s.podResizeStatus[podUID] = resizeStatus + } else { + delete(s.podResizeStatus, podUID) + } + klog.V(3).InfoS("Updated pod resize state", "podUID", podUID, "resizeStatus", resizeStatus) + return nil +} + +func (s *stateMemory) SetResizeStatus(rs PodResizeStatus) error { + s.Lock() + defer s.Unlock() + prs := make(map[string]v1.PodResizeStatus) + for k, v := range rs { + prs[k] = v + } + s.podResizeStatus = prs + klog.V(3).InfoS("Updated pod resize state", "resizes", rs) + return nil +} + +func (s *stateMemory) deleteContainer(podUID string, containerName string) { + delete(s.podAllocation[podUID], containerName) + if len(s.podAllocation[podUID]) == 0 { + delete(s.podAllocation, podUID) + delete(s.podResizeStatus, podUID) + } + klog.V(3).InfoS("Deleted pod resource allocation", "podUID", podUID, "containerName", containerName) +} + +func (s *stateMemory) Delete(podUID string, containerName string) error { + s.Lock() + defer s.Unlock() + if len(containerName) == 0 { + delete(s.podAllocation, podUID) + delete(s.podResizeStatus, podUID) + klog.V(3).InfoS("Deleted pod resource allocation and resize state", "podUID", podUID) + return nil + } + s.deleteContainer(podUID, containerName) + return nil +} + +func (s *stateMemory) ClearState() error { + s.Lock() + defer s.Unlock() + + s.podAllocation = make(PodResourceAllocation) + s.podResizeStatus 
= make(PodResizeStatus) + klog.V(3).InfoS("Cleared state") + return nil +} diff --git a/pkg/kubelet/status/status_manager.go b/pkg/kubelet/status/status_manager.go index 4022b7e806a..fdf99b04074 100644 --- a/pkg/kubelet/status/status_manager.go +++ b/pkg/kubelet/status/status_manager.go @@ -41,10 +41,14 @@ import ( kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" "k8s.io/kubernetes/pkg/kubelet/metrics" kubepod "k8s.io/kubernetes/pkg/kubelet/pod" + "k8s.io/kubernetes/pkg/kubelet/status/state" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" statusutil "k8s.io/kubernetes/pkg/util/pod" ) +// podStatusManagerStateFile is the file name where status manager stores its state +const podStatusManagerStateFile = "pod_status_manager_state" + // A wrapper around v1.PodStatus that includes a version to enforce that stale pod statuses are // not sent to the API server. type versionedPodStatus struct { @@ -79,6 +83,10 @@ type manager struct { podDeletionSafety PodDeletionSafetyProvider podStartupLatencyHelper PodStartupLatencyStateHelper + // state allows to save/restore pod resource allocation and tolerate kubelet restarts. + state state.State + // stateFileDirectory holds the directory where the state file for checkpoints is held. + stateFileDirectory string } // PodStatusProvider knows how to provide status for a pod. It's intended to be used by other components @@ -128,12 +136,21 @@ type Manager interface { // RemoveOrphanedStatuses scans the status cache and removes any entries for pods not included in // the provided podUIDs. RemoveOrphanedStatuses(podUIDs map[types.UID]bool) + + // State returns a read-only interface to the internal status manager state. + State() state.Reader + + // SetPodAllocation checkpoints the resources allocated to a pod's containers. + SetPodAllocation(pod *v1.Pod) error + + // SetPodResizeStatus checkpoints the last resizing decision for the pod. + SetPodResizeStatus(podUID types.UID, resize v1.PodResizeStatus) error } const syncPeriod = 10 * time.Second // NewManager returns a functional Manager. -func NewManager(kubeClient clientset.Interface, podManager kubepod.Manager, podDeletionSafety PodDeletionSafetyProvider, podStartupLatencyHelper PodStartupLatencyStateHelper) Manager { +func NewManager(kubeClient clientset.Interface, podManager kubepod.Manager, podDeletionSafety PodDeletionSafetyProvider, podStartupLatencyHelper PodStartupLatencyStateHelper, stateFileDirectory string) Manager { return &manager{ kubeClient: kubeClient, podManager: podManager, @@ -142,6 +159,7 @@ func NewManager(kubeClient clientset.Interface, podManager kubepod.Manager, podD apiStatusVersions: make(map[kubetypes.MirrorPodUID]uint64), podDeletionSafety: podDeletionSafety, podStartupLatencyHelper: podStartupLatencyHelper, + stateFileDirectory: stateFileDirectory, } } @@ -173,6 +191,15 @@ func (m *manager) Start() { return } + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + stateImpl, err := state.NewStateCheckpoint(m.stateFileDirectory, podStatusManagerStateFile) + if err != nil { + klog.ErrorS(err, "Could not initialize pod allocation checkpoint manager, please drain node and remove policy state file") + return + } + m.state = stateImpl + } + klog.InfoS("Starting to sync pod status with apiserver") //nolint:staticcheck // SA1015 Ticker can leak since this is only called once and doesn't handle termination. 
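The Start() hunk above wires the status manager to the checkpoint-backed state store added under pkg/kubelet/status/state, so per-container resource allocations and resize decisions survive kubelet restarts. A usage sketch of that API, assuming the kubelet root directory is /var/lib/kubelet (the pod UID and container name are example values; the checkpoint name matches the podStatusManagerStateFile constant above):

// Illustrative sketch, not part of this diff: writes one container's allocation
// through the checkpoint-backed store and reads it back, which is essentially
// what the status manager's SetPodAllocation/State() paths do.
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/kubernetes/pkg/kubelet/status/state"
)

func main() {
	// The kubelet passes its root directory here; /var/lib/kubelet is assumed.
	st, err := state.NewStateCheckpoint("/var/lib/kubelet", "pod_status_manager_state")
	if err != nil {
		panic(err)
	}

	// Record what was admitted for container "app" of an example pod.
	alloc := v1.ResourceList{
		v1.ResourceCPU:    resource.MustParse("500m"),
		v1.ResourceMemory: resource.MustParse("256Mi"),
	}
	if err := st.SetContainerResourceAllocation("pod-uid-1234", "app", alloc); err != nil {
		panic(err)
	}

	// Reads are served from the in-memory cache; after a kubelet restart,
	// restoreState() repopulates that cache from the checkpoint file.
	if got, ok := st.GetContainerResourceAllocation("pod-uid-1234", "app"); ok {
		fmt.Printf("allocated: cpu=%s mem=%s\n", got.Cpu().String(), got.Memory().String())
	}
}

Every setter persists the full state via storeState(), so a successful SetContainerResourceAllocation call means the allocation has already been written to the checkpoint on disk.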
@@ -200,6 +227,34 @@ func (m *manager) Start() { }, 0) } +// State returns the pod resources checkpoint state of the pod status manager +func (m *manager) State() state.Reader { + return m.state +} + +// SetPodAllocation checkpoints the resources allocated to a pod's containers +func (m *manager) SetPodAllocation(pod *v1.Pod) error { + m.podStatusesLock.RLock() + defer m.podStatusesLock.RUnlock() + for _, container := range pod.Spec.Containers { + var alloc v1.ResourceList + if container.Resources.Requests != nil { + alloc = container.Resources.Requests.DeepCopy() + } + if err := m.state.SetContainerResourceAllocation(string(pod.UID), container.Name, alloc); err != nil { + return err + } + } + return nil +} + +// SetPodResizeStatus checkpoints the last resizing decision for the pod. +func (m *manager) SetPodResizeStatus(podUID types.UID, resizeStatus v1.PodResizeStatus) error { + m.podStatusesLock.RLock() + defer m.podStatusesLock.RUnlock() + return m.state.SetPodResizeStatus(string(podUID), resizeStatus) +} + func (m *manager) GetPodStatus(uid types.UID) (v1.PodStatus, bool) { m.podStatusesLock.RLock() defer m.podStatusesLock.RUnlock() @@ -616,6 +671,9 @@ func (m *manager) deletePodStatus(uid types.UID) { defer m.podStatusesLock.Unlock() delete(m.podStatuses, uid) m.podStartupLatencyHelper.DeletePodStartupState(uid) + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + m.state.Delete(string(uid), "") + } } // TODO(filipg): It'd be cleaner if we can do this without signal from user. @@ -626,6 +684,9 @@ func (m *manager) RemoveOrphanedStatuses(podUIDs map[types.UID]bool) { if _, ok := podUIDs[key]; !ok { klog.V(5).InfoS("Removing pod from status map.", "podUID", key) delete(m.podStatuses, key) + if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { + m.state.Delete(string(key), "") + } } } } diff --git a/pkg/kubelet/status/status_manager_test.go b/pkg/kubelet/status/status_manager_test.go index 9f3ac37b8ff..82b8e9376e3 100644 --- a/pkg/kubelet/status/status_manager_test.go +++ b/pkg/kubelet/status/status_manager_test.go @@ -18,6 +18,7 @@ package status import ( "fmt" + "io/ioutil" "math/rand" "reflect" "strconv" @@ -87,7 +88,13 @@ func newTestManager(kubeClient clientset.Interface) *manager { podManager := kubepod.NewBasicPodManager(podtest.NewFakeMirrorClient()) podManager.AddPod(getTestPod()) podStartupLatencyTracker := util.NewPodStartupLatencyTracker() - return NewManager(kubeClient, podManager, &statustest.FakePodDeletionSafetyProvider{}, podStartupLatencyTracker).(*manager) + testRootDir := "" + if tempDir, err := ioutil.TempDir("", "kubelet_test."); err != nil { + return nil + } else { + testRootDir = tempDir + } + return NewManager(kubeClient, podManager, &statustest.FakePodDeletionSafetyProvider{}, podStartupLatencyTracker, testRootDir).(*manager) } func generateRandomMessage() string { @@ -962,7 +969,7 @@ func TestTerminatePod_DefaultUnknownStatus(t *testing.T) { t.Run(tc.name, func(t *testing.T) { podManager := kubepod.NewBasicPodManager(podtest.NewFakeMirrorClient()) podStartupLatencyTracker := util.NewPodStartupLatencyTracker() - syncer := NewManager(&fake.Clientset{}, podManager, &statustest.FakePodDeletionSafetyProvider{}, podStartupLatencyTracker).(*manager) + syncer := NewManager(&fake.Clientset{}, podManager, &statustest.FakePodDeletionSafetyProvider{}, podStartupLatencyTracker, "").(*manager) original := tc.pod.DeepCopy() syncer.SetPodStatus(original, original.Status) diff --git 
a/pkg/kubelet/status/testing/mock_pod_status_provider.go b/pkg/kubelet/status/testing/mock_pod_status_provider.go index 7c2f23693ba..3112a4ab625 100644 --- a/pkg/kubelet/status/testing/mock_pod_status_provider.go +++ b/pkg/kubelet/status/testing/mock_pod_status_provider.go @@ -27,6 +27,7 @@ import ( v1 "k8s.io/api/core/v1" types "k8s.io/apimachinery/pkg/types" container "k8s.io/kubernetes/pkg/kubelet/container" + state "k8s.io/kubernetes/pkg/kubelet/status/state" ) // MockPodStatusProvider is a mock of PodStatusProvider interface. @@ -239,6 +240,34 @@ func (mr *MockManagerMockRecorder) SetContainerStartup(podUID, containerID, star return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetContainerStartup", reflect.TypeOf((*MockManager)(nil).SetContainerStartup), podUID, containerID, started) } +// SetPodAllocation mocks base method. +func (m *MockManager) SetPodAllocation(pod *v1.Pod) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SetPodAllocation", pod) + ret0, _ := ret[0].(error) + return ret0 +} + +// SetPodAllocation indicates an expected call of SetPodAllocation. +func (mr *MockManagerMockRecorder) SetPodAllocation(pod interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetPodAllocation", reflect.TypeOf((*MockManager)(nil).SetPodAllocation), pod) +} + +// SetPodResizeStatus mocks base method. +func (m *MockManager) SetPodResizeStatus(podUID types.UID, resize v1.PodResizeStatus) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SetPodResizeStatus", podUID, resize) + ret0, _ := ret[0].(error) + return ret0 +} + +// SetPodResizeStatus indicates an expected call of SetPodResizeStatus. +func (mr *MockManagerMockRecorder) SetPodResizeStatus(podUID, resize interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetPodResizeStatus", reflect.TypeOf((*MockManager)(nil).SetPodResizeStatus), podUID, resize) +} + // SetPodStatus mocks base method. func (m *MockManager) SetPodStatus(pod *v1.Pod, status v1.PodStatus) { m.ctrl.T.Helper() @@ -263,6 +292,20 @@ func (mr *MockManagerMockRecorder) Start() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Start", reflect.TypeOf((*MockManager)(nil).Start)) } +// State mocks base method. +func (m *MockManager) State() state.Reader { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "State") + ret0, _ := ret[0].(state.Reader) + return ret0 +} + +// State indicates an expected call of State. +func (mr *MockManagerMockRecorder) State() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "State", reflect.TypeOf((*MockManager)(nil).State)) +} + // TerminatePod mocks base method. func (m *MockManager) TerminatePod(pod *v1.Pod) { m.ctrl.T.Helper() diff --git a/test/e2e/node/pod_resize.go b/test/e2e/node/pod_resize.go new file mode 100644 index 00000000000..21c513c0330 --- /dev/null +++ b/test/e2e/node/pod_resize.go @@ -0,0 +1,1447 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package node + +import ( + "context" + "fmt" + "strconv" + "strings" + "time" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/diff" + utilfeature "k8s.io/apiserver/pkg/util/feature" + "k8s.io/component-base/featuregate" + podutil "k8s.io/kubernetes/pkg/api/v1/pod" + "k8s.io/kubernetes/pkg/features" + kubecm "k8s.io/kubernetes/pkg/kubelet/cm" + + "k8s.io/kubernetes/test/e2e/framework" + e2epod "k8s.io/kubernetes/test/e2e/framework/pod" + imageutils "k8s.io/kubernetes/test/utils/image" + + "github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" +) + +const ( + CgroupCPUPeriod string = "/sys/fs/cgroup/cpu/cpu.cfs_period_us" + CgroupCPUShares string = "/sys/fs/cgroup/cpu/cpu.shares" + CgroupCPUQuota string = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us" + CgroupMemLimit string = "/sys/fs/cgroup/memory/memory.limit_in_bytes" + Cgroupv2MemLimit string = "/sys/fs/cgroup/memory.max" + Cgroupv2MemRequest string = "/sys/fs/cgroup/memory.min" + Cgroupv2CPULimit string = "/sys/fs/cgroup/cpu.max" + Cgroupv2CPURequest string = "/sys/fs/cgroup/cpu.weight" + + PollInterval time.Duration = 2 * time.Second + PollTimeout time.Duration = 4 * time.Minute +) + +type ContainerResources struct { + CPUReq, CPULim, MemReq, MemLim, EphStorReq, EphStorLim string +} + +type ContainerAllocations struct { + CPUAlloc, MemAlloc, ephStorAlloc string +} + +type TestContainerInfo struct { + Name string + Resources *ContainerResources + Allocations *ContainerAllocations + CPUPolicy *v1.ResourceResizePolicy + MemPolicy *v1.ResourceResizePolicy + RestartCount int32 +} + +func isFeatureGatePostAlpha() bool { + if fs, found := utilfeature.DefaultFeatureGate.DeepCopy().GetAll()[features.InPlacePodVerticalScaling]; found { + if fs.PreRelease == featuregate.Alpha { + return false + } + } + return true +} + +func getTestResourceInfo(tcInfo TestContainerInfo) (v1.ResourceRequirements, v1.ResourceList, []v1.ContainerResizePolicy) { + var res v1.ResourceRequirements + var alloc v1.ResourceList + var resizePol []v1.ContainerResizePolicy + + if tcInfo.Resources != nil { + var lim, req v1.ResourceList + if tcInfo.Resources.CPULim != "" || tcInfo.Resources.MemLim != "" || tcInfo.Resources.EphStorLim != "" { + lim = make(v1.ResourceList) + } + if tcInfo.Resources.CPUReq != "" || tcInfo.Resources.MemReq != "" || tcInfo.Resources.EphStorReq != "" { + req = make(v1.ResourceList) + } + if tcInfo.Resources.CPULim != "" { + lim[v1.ResourceCPU] = resource.MustParse(tcInfo.Resources.CPULim) + } + if tcInfo.Resources.MemLim != "" { + lim[v1.ResourceMemory] = resource.MustParse(tcInfo.Resources.MemLim) + } + if tcInfo.Resources.EphStorLim != "" { + lim[v1.ResourceEphemeralStorage] = resource.MustParse(tcInfo.Resources.EphStorLim) + } + if tcInfo.Resources.CPUReq != "" { + req[v1.ResourceCPU] = resource.MustParse(tcInfo.Resources.CPUReq) + } + if tcInfo.Resources.MemReq != "" { + req[v1.ResourceMemory] = resource.MustParse(tcInfo.Resources.MemReq) + } + if tcInfo.Resources.EphStorReq != "" { + req[v1.ResourceEphemeralStorage] = resource.MustParse(tcInfo.Resources.EphStorReq) + } + res = v1.ResourceRequirements{Limits: lim, Requests: req} + } + if tcInfo.Allocations != nil { + alloc = make(v1.ResourceList) + if tcInfo.Allocations.CPUAlloc != "" { + alloc[v1.ResourceCPU] = 
resource.MustParse(tcInfo.Allocations.CPUAlloc) + } + if tcInfo.Allocations.MemAlloc != "" { + alloc[v1.ResourceMemory] = resource.MustParse(tcInfo.Allocations.MemAlloc) + } + if tcInfo.Allocations.ephStorAlloc != "" { + alloc[v1.ResourceEphemeralStorage] = resource.MustParse(tcInfo.Allocations.ephStorAlloc) + } + + } + if tcInfo.CPUPolicy != nil { + cpuPol := v1.ContainerResizePolicy{ResourceName: v1.ResourceCPU, Policy: *tcInfo.CPUPolicy} + resizePol = append(resizePol, cpuPol) + } + if tcInfo.MemPolicy != nil { + memPol := v1.ContainerResizePolicy{ResourceName: v1.ResourceMemory, Policy: *tcInfo.MemPolicy} + resizePol = append(resizePol, memPol) + } + return res, alloc, resizePol +} + +func initDefaultResizePolicy(containers []TestContainerInfo) { + noRestart := v1.RestartNotRequired + setDefaultPolicy := func(ci *TestContainerInfo) { + if ci.CPUPolicy == nil { + ci.CPUPolicy = &noRestart + } + if ci.MemPolicy == nil { + ci.MemPolicy = &noRestart + } + } + for i := range containers { + setDefaultPolicy(&containers[i]) + } +} + +func makeTestContainer(tcInfo TestContainerInfo) (v1.Container, v1.ContainerStatus) { + cmd := "trap exit TERM; while true; do sleep 1; done" + res, alloc, resizePol := getTestResourceInfo(tcInfo) + bTrue := true + bFalse := false + userID := int64(1001) + tc := v1.Container{ + Name: tcInfo.Name, + Image: imageutils.GetE2EImage(imageutils.BusyBox), + Command: []string{"/bin/sh"}, + Args: []string{"-c", cmd}, + Resources: res, + ResizePolicy: resizePol, + SecurityContext: &v1.SecurityContext{ + Privileged: &bFalse, + AllowPrivilegeEscalation: &bFalse, + RunAsNonRoot: &bTrue, + RunAsUser: &userID, + Capabilities: &v1.Capabilities{ + Drop: []v1.Capability{"ALL"}, + }, + SeccompProfile: &v1.SeccompProfile{ + Type: v1.SeccompProfileTypeRuntimeDefault, + }, + }, + } + + tcStatus := v1.ContainerStatus{ + Name: tcInfo.Name, + ResourcesAllocated: alloc, + } + return tc, tcStatus +} + +func makeTestPod(ns, name, timeStamp string, tcInfo []TestContainerInfo) *v1.Pod { + var testContainers []v1.Container + for _, ci := range tcInfo { + tc, _ := makeTestContainer(ci) + testContainers = append(testContainers, tc) + } + pod := &v1.Pod{ + TypeMeta: metav1.TypeMeta{ + Kind: "Pod", + APIVersion: "v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: ns, + Labels: map[string]string{ + "name": "fooPod", + "time": timeStamp, + }, + }, + Spec: v1.PodSpec{ + Containers: testContainers, + RestartPolicy: v1.RestartPolicyOnFailure, + }, + } + return pod +} + +func verifyPodResizePolicy(pod *v1.Pod, tcInfo []TestContainerInfo) { + cMap := make(map[string]*v1.Container) + for i, c := range pod.Spec.Containers { + cMap[c.Name] = &pod.Spec.Containers[i] + } + for _, ci := range tcInfo { + c, found := cMap[ci.Name] + gomega.Expect(found).To(gomega.BeTrue()) + tc, _ := makeTestContainer(ci) + framework.ExpectEqual(tc.ResizePolicy, c.ResizePolicy) + } +} + +func verifyPodResources(pod *v1.Pod, tcInfo []TestContainerInfo) { + cMap := make(map[string]*v1.Container) + for i, c := range pod.Spec.Containers { + cMap[c.Name] = &pod.Spec.Containers[i] + } + for _, ci := range tcInfo { + c, found := cMap[ci.Name] + gomega.Expect(found).To(gomega.BeTrue()) + tc, _ := makeTestContainer(ci) + framework.ExpectEqual(tc.Resources, c.Resources) + } +} + +func verifyPodAllocations(pod *v1.Pod, tcInfo []TestContainerInfo, flagError bool) bool { + cStatusMap := make(map[string]*v1.ContainerStatus) + for i, c := range pod.Status.ContainerStatuses { + cStatusMap[c.Name] = &pod.Status.ContainerStatuses[i] + } + + for _, ci 
:= range tcInfo { + cStatus, found := cStatusMap[ci.Name] + gomega.Expect(found).To(gomega.BeTrue()) + if ci.Allocations == nil { + if ci.Resources != nil { + alloc := &ContainerAllocations{CPUAlloc: ci.Resources.CPUReq, MemAlloc: ci.Resources.MemReq} + ci.Allocations = alloc + defer func() { + ci.Allocations = nil + }() + } + } + + _, tcStatus := makeTestContainer(ci) + if flagError { + framework.ExpectEqual(tcStatus.ResourcesAllocated, cStatus.ResourcesAllocated) + } + if diff.ObjectDiff(cStatus.ResourcesAllocated, tcStatus.ResourcesAllocated) != "" { + return false + } + } + return true +} + +func verifyPodStatusResources(pod *v1.Pod, tcInfo []TestContainerInfo) { + csMap := make(map[string]*v1.ContainerStatus) + for i, c := range pod.Status.ContainerStatuses { + csMap[c.Name] = &pod.Status.ContainerStatuses[i] + } + for _, ci := range tcInfo { + cs, found := csMap[ci.Name] + gomega.Expect(found).To(gomega.BeTrue()) + tc, _ := makeTestContainer(ci) + framework.ExpectEqual(tc.Resources, *cs.Resources) + //framework.ExpectEqual(cs.RestartCount, ci.RestartCount) + } +} + +func isPodOnCgroupv2Node(pod *v1.Pod) bool { + // Determine if pod is running on cgroupv2 or cgroupv1 node + cgroupv2File := "/sys/fs/cgroup/cgroup.controllers" + _, err := framework.RunKubectl(pod.Namespace, "exec", pod.Name, "--", "ls", cgroupv2File) + return err == nil +} + +func verifyPodContainersCgroupValues(pod *v1.Pod, tcInfo []TestContainerInfo, flagError bool) bool { + podOnCgroupv2Node := isPodOnCgroupv2Node(pod) + cgroupMemLimit := Cgroupv2MemLimit + cgroupCPULimit := Cgroupv2CPULimit + cgroupCPURequest := Cgroupv2CPURequest + if !podOnCgroupv2Node { + cgroupMemLimit = CgroupMemLimit + cgroupCPULimit = CgroupCPUQuota + cgroupCPURequest = CgroupCPUShares + } + verifyCgroupValue := func(cName, cgPath, expectedCgValue string) bool { + cmd := []string{"head", "-n", "1", cgPath} + cgValue, err := framework.LookForStringInPodExecToContainer(pod.Namespace, pod.Name, cName, cmd, expectedCgValue, PollTimeout) + if flagError { + framework.ExpectNoError(err, "failed to find expected cgroup value in container") + } + cgValue = strings.Trim(cgValue, "\n") + if flagError { + gomega.Expect(cgValue).To(gomega.Equal(expectedCgValue)) + } + return cgValue == expectedCgValue + } + for _, ci := range tcInfo { + if ci.Resources == nil { + continue + } + tc, _ := makeTestContainer(ci) + if tc.Resources.Limits != nil || tc.Resources.Requests != nil { + var cpuShares int64 + var cpuLimitString, memLimitString string + memLimitInBytes := tc.Resources.Limits.Memory().Value() + cpuRequest := tc.Resources.Requests.Cpu() + cpuLimit := tc.Resources.Limits.Cpu() + if cpuRequest.IsZero() && !cpuLimit.IsZero() { + cpuShares = int64(kubecm.MilliCPUToShares(cpuLimit.MilliValue())) + } else { + cpuShares = int64(kubecm.MilliCPUToShares(cpuRequest.MilliValue())) + } + cpuQuota := kubecm.MilliCPUToQuota(cpuLimit.MilliValue(), kubecm.QuotaPeriod) + if cpuLimit.IsZero() { + cpuQuota = -1 + } + cpuLimitString = strconv.FormatInt(cpuQuota, 10) + if podOnCgroupv2Node && cpuLimitString == "-1" { + cpuLimitString = "max" + } + memLimitString = strconv.FormatInt(memLimitInBytes, 10) + if podOnCgroupv2Node && memLimitString == "0" { + memLimitString = "max" + } + if memLimitString != "0" { + if !verifyCgroupValue(ci.Name, cgroupMemLimit, memLimitString) { + return false + } + } + if !verifyCgroupValue(ci.Name, cgroupCPULimit, cpuLimitString) { + return false + } + if !verifyCgroupValue(ci.Name, cgroupCPURequest, 
strconv.FormatInt(cpuShares, 10)) { + return false + } + } + } + return true +} + +func waitForPodResizeActuation(podClient *framework.PodClient, pod, patchedPod *v1.Pod, expectedContainers []TestContainerInfo) *v1.Pod { + + waitForContainerRestart := func() error { + var restartContainersExpected []string + for _, ci := range expectedContainers { + if ci.RestartCount > 0 { + restartContainersExpected = append(restartContainersExpected, ci.Name) + } + } + if len(restartContainersExpected) == 0 { + return nil + } + for start := time.Now(); time.Since(start) < PollTimeout; time.Sleep(PollInterval) { + pod, err := podClient.Get(context.TODO(), pod.Name, metav1.GetOptions{}) + if err != nil { + return err + } + restartedContainersCount := 0 + for _, cName := range restartContainersExpected { + cs, _ := podutil.GetContainerStatus(pod.Status.ContainerStatuses, cName) + if cs.RestartCount < 1 { + break + } + restartedContainersCount++ + } + if restartedContainersCount == len(restartContainersExpected) { + return nil + } + } + return fmt.Errorf("timed out waiting for expected container restart") + } + waitPodAllocationsEqualsExpected := func() (*v1.Pod, error) { + for start := time.Now(); time.Since(start) < PollTimeout; time.Sleep(PollInterval) { + pod, err := podClient.Get(context.TODO(), pod.Name, metav1.GetOptions{}) + if err != nil { + return nil, err + } + if verifyPodAllocations(pod, expectedContainers, false) == false { + continue + } + return pod, nil + } + return nil, fmt.Errorf("timed out waiting for pod resource allocation values to match expected") + } + waitContainerCgroupValuesEqualsExpected := func() error { + for start := time.Now(); time.Since(start) < PollTimeout; time.Sleep(PollInterval) { + if verifyPodContainersCgroupValues(patchedPod, expectedContainers, false) == false { + continue + } + return nil + } + return fmt.Errorf("timed out waiting for container cgroup values to match expected") + } + waitPodStatusResourcesEqualSpecResources := func() (*v1.Pod, error) { + for start := time.Now(); time.Since(start) < PollTimeout; time.Sleep(PollInterval) { + pod, err := podClient.Get(context.TODO(), pod.Name, metav1.GetOptions{}) + if err != nil { + return nil, err + } + differs := false + for idx, c := range pod.Spec.Containers { + if diff.ObjectDiff(c.Resources, *pod.Status.ContainerStatuses[idx].Resources) != "" { + differs = true + break + } + } + if differs { + continue + } + return pod, nil + } + return nil, fmt.Errorf("timed out waiting for pod spec resources to match pod status resources") + } + rsErr := waitForContainerRestart() + framework.ExpectNoError(rsErr, "failed waiting for expected container restart") + // Wait for pod resource allocations to equal expected values after resize + resizedPod, aErr := waitPodAllocationsEqualsExpected() + framework.ExpectNoError(aErr, "failed to verify pod resource allocation values equals expected values") + //TODO(vinaykul,InPlacePodVerticalScaling): Remove this check when cgroupv2 support is added + if !isPodOnCgroupv2Node(pod) { + // Wait for container cgroup values to equal expected cgroup values after resize + cErr := waitContainerCgroupValuesEqualsExpected() + framework.ExpectNoError(cErr, "failed to verify container cgroup values equals expected values") + } + //TODO(vinaykul,InPlacePodVerticalScaling): Remove featureGatePostAlpha upon exiting Alpha. 
+ // containerd needs to add CRI support before Beta (See Node KEP #2273) + if isFeatureGatePostAlpha() { + // Wait for PodSpec container resources to equal PodStatus container resources indicating resize is complete + rPod, rErr := waitPodStatusResourcesEqualSpecResources() + framework.ExpectNoError(rErr, "failed to verify pod spec resources equals pod status resources") + + ginkgo.By("verifying pod status after resize") + verifyPodStatusResources(rPod, expectedContainers) + } + return resizedPod +} + +func doPodResizeTests() { + f := framework.NewDefaultFramework("pod-resize") + + type testCase struct { + name string + containers []TestContainerInfo + patchString string + expected []TestContainerInfo + } + + noRestart := v1.RestartNotRequired + doRestart := v1.RestartRequired + tests := []testCase{ + { + name: "Guaranteed QoS pod, one container - increase CPU & memory", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "100m", MemReq: "200Mi", MemLim: "200Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"200m","memory":"400Mi"},"limits":{"cpu":"200m","memory":"400Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "200m", MemReq: "400Mi", MemLim: "400Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + }, + }, + { + name: "Guaranteed QoS pod, one container - decrease CPU & memory", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "300m", CPULim: "300m", MemReq: "500Mi", MemLim: "500Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"100m","memory":"250Mi"},"limits":{"cpu":"100m","memory":"250Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "100m", MemReq: "250Mi", MemLim: "250Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + }, + }, + { + name: "Guaranteed QoS pod, one container - increase CPU & decrease memory", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "100m", MemReq: "200Mi", MemLim: "200Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"200m","memory":"100Mi"},"limits":{"cpu":"200m","memory":"100Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "200m", MemReq: "100Mi", MemLim: "100Mi"}, + }, + }, + }, + { + name: "Guaranteed QoS pod, one container - decrease CPU & increase memory", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "100m", MemReq: "200Mi", MemLim: "200Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"50m","memory":"300Mi"},"limits":{"cpu":"50m","memory":"300Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "50m", CPULim: "50m", MemReq: "300Mi", MemLim: "300Mi"}, + }, + }, + }, + { + name: "Guaranteed QoS pod, three containers (c1, c2, c3) - increase: CPU (c1,c3), memory (c2) ; decrease: CPU (c2), memory (c1,c3)", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "100m", MemReq: 
"100Mi", MemLim: "100Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + { + Name: "c2", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "200m", MemReq: "200Mi", MemLim: "200Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + { + Name: "c3", + Resources: &ContainerResources{CPUReq: "300m", CPULim: "300m", MemReq: "300Mi", MemLim: "300Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"140m","memory":"50Mi"},"limits":{"cpu":"140m","memory":"50Mi"}}}, + {"name":"c2", "resources":{"requests":{"cpu":"150m","memory":"240Mi"},"limits":{"cpu":"150m","memory":"240Mi"}}}, + {"name":"c3", "resources":{"requests":{"cpu":"340m","memory":"250Mi"},"limits":{"cpu":"340m","memory":"250Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "140m", CPULim: "140m", MemReq: "50Mi", MemLim: "50Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + { + Name: "c2", + Resources: &ContainerResources{CPUReq: "150m", CPULim: "150m", MemReq: "240Mi", MemLim: "240Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + { + Name: "c3", + Resources: &ContainerResources{CPUReq: "340m", CPULim: "340m", MemReq: "250Mi", MemLim: "250Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - decrease memory requests only", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"memory":"200Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: "200Mi", MemLim: "500Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - decrease memory limits only", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"limits":{"memory":"400Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: "250Mi", MemLim: "400Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - increase memory requests only", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"memory":"300Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: "300Mi", MemLim: "500Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - increase memory limits only", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"limits":{"memory":"600Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: 
"250Mi", MemLim: "600Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - decrease CPU requests only", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"100m"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "400m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - decrease CPU limits only", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"limits":{"cpu":"300m"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "300m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - increase CPU requests only", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"150m"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "150m", CPULim: "200m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - increase CPU limits only", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"limits":{"cpu":"500m"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "500m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - decrease CPU requests and limits", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"100m"},"limits":{"cpu":"200m"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - increase CPU requests and limits", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"200m"},"limits":{"cpu":"400m"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - decrease CPU requests and increase CPU limits", + containers: []TestContainerInfo{ + { + Name: 
"c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"100m"},"limits":{"cpu":"500m"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "500m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - increase CPU requests and decrease CPU limits", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "400m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"200m"},"limits":{"cpu":"300m"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "300m", MemReq: "250Mi", MemLim: "500Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - decrease memory requests and limits", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "200Mi", MemLim: "400Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"memory":"100Mi"},"limits":{"memory":"300Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "100Mi", MemLim: "300Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - increase memory requests and limits", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "200Mi", MemLim: "400Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"memory":"300Mi"},"limits":{"memory":"500Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "300Mi", MemLim: "500Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - decrease memory requests and increase memory limits", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "200Mi", MemLim: "400Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"memory":"100Mi"},"limits":{"memory":"500Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "100Mi", MemLim: "500Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - increase memory requests and decrease memory limits", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "200Mi", MemLim: "400Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"memory":"300Mi"},"limits":{"memory":"300Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "300Mi", MemLim: "300Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - decrease CPU requests and increase memory limits", + containers: []TestContainerInfo{ 
+ { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: "200Mi", MemLim: "400Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"100m"},"limits":{"memory":"500Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "400m", MemReq: "200Mi", MemLim: "500Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - increase CPU requests and decrease memory limits", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "400m", MemReq: "200Mi", MemLim: "500Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"200m"},"limits":{"memory":"400Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: "200Mi", MemLim: "400Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - decrease memory requests and increase CPU limits", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "200Mi", MemLim: "400Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"memory":"100Mi"},"limits":{"cpu":"300m"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "300m", MemReq: "100Mi", MemLim: "400Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests + limits - increase memory requests and decrease CPU limits", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "400m", MemReq: "200Mi", MemLim: "400Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"memory":"300Mi"},"limits":{"cpu":"300m"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "300m", MemReq: "300Mi", MemLim: "400Mi"}, + }, + }, + }, + { + name: "Burstable QoS pod, one container with cpu & memory requests - decrease memory request", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", MemReq: "500Mi"}, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"memory":"400Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", MemReq: "400Mi"}, + }, + }, + }, + { + name: "Guaranteed QoS pod, one container - increase CPU (RestartNotRequired) & memory (RestartRequired)", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "100m", MemReq: "200Mi", MemLim: "200Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &doRestart, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"200m","memory":"400Mi"},"limits":{"cpu":"200m","memory":"400Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "200m", MemReq: "400Mi", MemLim: "400Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &doRestart, + RestartCount: 1, + }, + }, + }, + { + name: "Burstable QoS pod, one container - decrease CPU (RestartRequired) & memory (RestartNotRequired)", + containers: 
[]TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "200Mi", MemLim: "400Mi"}, + CPUPolicy: &doRestart, + MemPolicy: &noRestart, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"50m","memory":"100Mi"},"limits":{"cpu":"100m","memory":"200Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "50m", CPULim: "100m", MemReq: "100Mi", MemLim: "200Mi"}, + CPUPolicy: &doRestart, + MemPolicy: &noRestart, + RestartCount: 1, + }, + }, + }, + { + name: "Burstable QoS pod, three containers - increase c1 resources, no change for c2, decrease c3 resources (no net change for pod)", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "100Mi", MemLim: "200Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + { + Name: "c2", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "300m", MemReq: "200Mi", MemLim: "300Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &doRestart, + }, + { + Name: "c3", + Resources: &ContainerResources{CPUReq: "300m", CPULim: "400m", MemReq: "300Mi", MemLim: "400Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"150m","memory":"150Mi"},"limits":{"cpu":"250m","memory":"250Mi"}}}, + {"name":"c3", "resources":{"requests":{"cpu":"250m","memory":"250Mi"},"limits":{"cpu":"350m","memory":"350Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "150m", CPULim: "250m", MemReq: "150Mi", MemLim: "250Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + { + Name: "c2", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "300m", MemReq: "200Mi", MemLim: "300Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &doRestart, + }, + { + Name: "c3", + Resources: &ContainerResources{CPUReq: "250m", CPULim: "350m", MemReq: "250Mi", MemLim: "350Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + }, + }, + { + name: "Burstable QoS pod, three containers - decrease c1 resources, increase c2 resources, no change for c3 (net increase for pod)", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "100Mi", MemLim: "200Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + { + Name: "c2", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "300m", MemReq: "200Mi", MemLim: "300Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &doRestart, + }, + { + Name: "c3", + Resources: &ContainerResources{CPUReq: "300m", CPULim: "400m", MemReq: "300Mi", MemLim: "400Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"50m","memory":"50Mi"},"limits":{"cpu":"150m","memory":"150Mi"}}}, + {"name":"c2", "resources":{"requests":{"cpu":"350m","memory":"350Mi"},"limits":{"cpu":"450m","memory":"450Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "50m", CPULim: "150m", MemReq: "50Mi", MemLim: "150Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + { + Name: "c2", + Resources: &ContainerResources{CPUReq: "350m", CPULim: "450m", MemReq: "350Mi", MemLim: "450Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &doRestart, + RestartCount: 1, + }, + { + Name: "c3", + Resources: 
&ContainerResources{CPUReq: "300m", CPULim: "400m", MemReq: "300Mi", MemLim: "400Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + }, + }, + { + name: "Burstable QoS pod, three containers - no change for c1, increase c2 resources, decrease c3 (net decrease for pod)", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "100Mi", MemLim: "200Mi"}, + CPUPolicy: &doRestart, + MemPolicy: &doRestart, + }, + { + Name: "c2", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "300m", MemReq: "200Mi", MemLim: "300Mi"}, + CPUPolicy: &doRestart, + MemPolicy: &noRestart, + }, + { + Name: "c3", + Resources: &ContainerResources{CPUReq: "300m", CPULim: "400m", MemReq: "300Mi", MemLim: "400Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &doRestart, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c2", "resources":{"requests":{"cpu":"250m","memory":"250Mi"},"limits":{"cpu":"350m","memory":"350Mi"}}}, + {"name":"c3", "resources":{"requests":{"cpu":"100m","memory":"100Mi"},"limits":{"cpu":"200m","memory":"200Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "100Mi", MemLim: "200Mi"}, + CPUPolicy: &doRestart, + MemPolicy: &doRestart, + }, + { + Name: "c2", + Resources: &ContainerResources{CPUReq: "250m", CPULim: "350m", MemReq: "250Mi", MemLim: "350Mi"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + RestartCount: 1, + }, + { + Name: "c3", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "200m", MemReq: "100Mi", MemLim: "200Mi"}, + CPUPolicy: &doRestart, + MemPolicy: &doRestart, + RestartCount: 1, + }, + }, + }, + } + + for idx := range tests { + tc := tests[idx] + ginkgo.It(tc.name, func() { + var testPod, patchedPod *v1.Pod + var pErr error + + tStamp := strconv.Itoa(time.Now().Nanosecond()) + initDefaultResizePolicy(tc.containers) + initDefaultResizePolicy(tc.expected) + testPod = makeTestPod(f.Namespace.Name, "testpod", tStamp, tc.containers) + + ginkgo.By("creating pod") + newPod := f.PodClient().CreateSync(testPod) + + ginkgo.By("verifying the pod is in kubernetes") + selector := labels.SelectorFromSet(labels.Set(map[string]string{"time": tStamp})) + options := metav1.ListOptions{LabelSelector: selector.String()} + podList, err := f.PodClient().List(context.TODO(), options) + framework.ExpectNoError(err, "failed to query for pods") + gomega.Expect(podList.Items).To(gomega.HaveLen(1)) + + ginkgo.By("verifying initial pod resources, allocations, and policy are as expected") + verifyPodResources(newPod, tc.containers) + verifyPodResizePolicy(newPod, tc.containers) + + ginkgo.By("verifying initial pod status resources and cgroup config are as expected") + verifyPodStatusResources(newPod, tc.containers) + verifyPodContainersCgroupValues(newPod, tc.containers, true) + + ginkgo.By("patching pod for resize") + patchedPod, pErr = f.ClientSet.CoreV1().Pods(newPod.Namespace).Patch(context.TODO(), newPod.Name, + types.StrategicMergePatchType, []byte(tc.patchString), metav1.PatchOptions{}) + framework.ExpectNoError(pErr, "failed to patch pod for resize") + + ginkgo.By("verifying pod patched for resize") + verifyPodResources(patchedPod, tc.expected) + verifyPodAllocations(patchedPod, tc.containers, true) + + ginkgo.By("waiting for resize to be actuated") + resizedPod := waitForPodResizeActuation(f.PodClient(), newPod, patchedPod, tc.expected) + + ginkgo.By("verifying pod container's cgroup values after resize") + 
//TODO(vinaykul,InPlacePodVerticalScaling): Remove this check when cgroupv2 support is added + if !isPodOnCgroupv2Node(resizedPod) { + verifyPodContainersCgroupValues(resizedPod, tc.expected, true) + } + + ginkgo.By("verifying pod resources after resize") + verifyPodResources(resizedPod, tc.expected) + + ginkgo.By("verifying pod allocations after resize") + verifyPodAllocations(resizedPod, tc.expected, true) + + ginkgo.By("deleting pod") + err = e2epod.DeletePodWithWait(f.ClientSet, newPod) + framework.ExpectNoError(err, "failed to delete pod") + }) + } +} + +func doPodResizeResourceQuotaTests() { + f := framework.NewDefaultFramework("pod-resize-resource-quota") + + ginkgo.It("pod-resize-resource-quota-test", func() { + resourceQuota := v1.ResourceQuota{ + ObjectMeta: metav1.ObjectMeta{ + Name: "resize-resource-quota", + Namespace: f.Namespace.Name, + }, + Spec: v1.ResourceQuotaSpec{ + Hard: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("800m"), + v1.ResourceMemory: resource.MustParse("800Mi"), + }, + }, + } + containers := []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "300m", CPULim: "300m", MemReq: "300Mi", MemLim: "300Mi"}, + }, + } + patchString := `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"400m","memory":"400Mi"},"limits":{"cpu":"400m","memory":"400Mi"}}} + ]}}` + expected := []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "400m", CPULim: "400m", MemReq: "400Mi", MemLim: "400Mi"}, + }, + } + patchStringExceedCPU := `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"600m","memory":"200Mi"},"limits":{"cpu":"600m","memory":"200Mi"}}} + ]}}` + patchStringExceedMemory := `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"250m","memory":"750Mi"},"limits":{"cpu":"250m","memory":"750Mi"}}} + ]}}` + + ginkgo.By("Creating a ResourceQuota") + _, rqErr := f.ClientSet.CoreV1().ResourceQuotas(f.Namespace.Name).Create(context.TODO(), &resourceQuota, metav1.CreateOptions{}) + framework.ExpectNoError(rqErr, "failed to create resource quota") + + tStamp := strconv.Itoa(time.Now().Nanosecond()) + initDefaultResizePolicy(containers) + initDefaultResizePolicy(expected) + testPod1 := makeTestPod(f.Namespace.Name, "testpod1", tStamp, containers) + testPod2 := makeTestPod(f.Namespace.Name, "testpod2", tStamp, containers) + + ginkgo.By("creating pods") + newPod1 := f.PodClient().CreateSync(testPod1) + newPod2 := f.PodClient().CreateSync(testPod2) + + ginkgo.By("verifying the pod is in kubernetes") + selector := labels.SelectorFromSet(labels.Set(map[string]string{"time": tStamp})) + options := metav1.ListOptions{LabelSelector: selector.String()} + podList, listErr := f.PodClient().List(context.TODO(), options) + framework.ExpectNoError(listErr, "failed to query for pods") + gomega.Expect(podList.Items).To(gomega.HaveLen(2)) + + ginkgo.By("verifying initial pod resources, allocations, and policy are as expected") + verifyPodResources(newPod1, containers) + + ginkgo.By("patching pod for resize within resource quota") + patchedPod, pErr := f.ClientSet.CoreV1().Pods(newPod1.Namespace).Patch(context.TODO(), newPod1.Name, + types.StrategicMergePatchType, []byte(patchString), metav1.PatchOptions{}) + framework.ExpectNoError(pErr, "failed to patch pod for resize") + + ginkgo.By("verifying pod patched for resize within resource quota") + verifyPodResources(patchedPod, expected) + verifyPodAllocations(patchedPod, containers, true) + + ginkgo.By("waiting for resize to be actuated") + 
resizedPod := waitForPodResizeActuation(f.PodClient(), newPod1, patchedPod, expected) + + ginkgo.By("verifying pod container's cgroup values after resize") + //TODO(vinaykul,InPlacePodVerticalScaling): Remove this check when cgroupv2 support is added + if !isPodOnCgroupv2Node(resizedPod) { + verifyPodContainersCgroupValues(resizedPod, expected, true) + } + + ginkgo.By("verifying pod resources after resize") + verifyPodResources(resizedPod, expected) + + ginkgo.By("verifying pod allocations after resize") + verifyPodAllocations(resizedPod, expected, true) + + ginkgo.By(fmt.Sprintf("patching pod %s for resize with CPU exceeding resource quota", resizedPod.Name)) + _, pErrExceedCPU := f.ClientSet.CoreV1().Pods(resizedPod.Namespace).Patch(context.TODO(), + resizedPod.Name, types.StrategicMergePatchType, []byte(patchStringExceedCPU), metav1.PatchOptions{}) + framework.ExpectError(pErrExceedCPU, "exceeded quota: %s, requested: cpu=200m, used: cpu=700m, limited: cpu=800m", + resourceQuota.Name) + + ginkgo.By("verifying pod patched for resize exceeding CPU resource quota remains unchanged") + patchedPodExceedCPU, pErrEx1 := f.PodClient().Get(context.TODO(), resizedPod.Name, metav1.GetOptions{}) + framework.ExpectNoError(pErrEx1, "failed to get pod post exceed CPU resize") + verifyPodResources(patchedPodExceedCPU, expected) + verifyPodAllocations(patchedPodExceedCPU, expected, true) + + ginkgo.By("patching pod for resize with memory exceeding resource quota") + _, pErrExceedMemory := f.ClientSet.CoreV1().Pods(resizedPod.Namespace).Patch(context.TODO(), + resizedPod.Name, types.StrategicMergePatchType, []byte(patchStringExceedMemory), metav1.PatchOptions{}) + framework.ExpectError(pErrExceedMemory, "exceeded quota: %s, requested: memory=350Mi, used: memory=700Mi, limited: memory=800Mi", + resourceQuota.Name) + + ginkgo.By("verifying pod patched for resize exceeding memory resource quota remains unchanged") + patchedPodExceedMemory, pErrEx2 := f.PodClient().Get(context.TODO(), resizedPod.Name, metav1.GetOptions{}) + framework.ExpectNoError(pErrEx2, "failed to get pod post exceed memory resize") + verifyPodResources(patchedPodExceedMemory, expected) + verifyPodAllocations(patchedPodExceedMemory, expected, true) + + ginkgo.By("deleting pods") + delErr1 := e2epod.DeletePodWithWait(f.ClientSet, newPod1) + framework.ExpectNoError(delErr1, "failed to delete pod %s", newPod1.Name) + delErr2 := e2epod.DeletePodWithWait(f.ClientSet, newPod2) + framework.ExpectNoError(delErr2, "failed to delete pod %s", newPod2.Name) + }) +} + +func doPodResizeErrorTests() { + f := framework.NewDefaultFramework("pod-resize-errors") + + type testCase struct { + name string + containers []TestContainerInfo + patchString string + patchError string + expected []TestContainerInfo + } + + tests := []testCase{ + { + name: "BestEffort pod - try requesting memory, expect error", + containers: []TestContainerInfo{ + { + Name: "c1", + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"memory":"400Mi"}}} + ]}}`, + patchError: "Pod QoS is immutable", + expected: []TestContainerInfo{ + { + Name: "c1", + }, + }, + }, + } + + for idx := range tests { + tc := tests[idx] + ginkgo.It(tc.name, func() { + var testPod, patchedPod *v1.Pod + var pErr error + + tStamp := strconv.Itoa(time.Now().Nanosecond()) + initDefaultResizePolicy(tc.containers) + initDefaultResizePolicy(tc.expected) + testPod = makeTestPod(f.Namespace.Name, "testpod", tStamp, tc.containers) + + ginkgo.By("creating pod") + newPod := 
f.PodClient().CreateSync(testPod) + + ginkgo.By("verifying the pod is in kubernetes") + selector := labels.SelectorFromSet(labels.Set(map[string]string{"time": tStamp})) + options := metav1.ListOptions{LabelSelector: selector.String()} + podList, err := f.PodClient().List(context.TODO(), options) + framework.ExpectNoError(err, "failed to query for pods") + gomega.Expect(podList.Items).To(gomega.HaveLen(1)) + + ginkgo.By("verifying initial pod resources, allocations, and policy are as expected") + verifyPodResources(newPod, tc.containers) + verifyPodResizePolicy(newPod, tc.containers) + + ginkgo.By("verifying initial pod status resources and cgroup config are as expected") + verifyPodStatusResources(newPod, tc.containers) + verifyPodContainersCgroupValues(newPod, tc.containers, true) + + ginkgo.By("patching pod for resize") + patchedPod, pErr = f.ClientSet.CoreV1().Pods(newPod.Namespace).Patch(context.TODO(), newPod.Name, + types.StrategicMergePatchType, []byte(tc.patchString), metav1.PatchOptions{}) + if tc.patchError == "" { + framework.ExpectNoError(pErr, "failed to patch pod for resize") + } else { + framework.ExpectError(pErr, tc.patchError) + patchedPod = newPod + } + + ginkgo.By("verifying pod container's cgroup values after patch") + //TODO(vinaykul,InPlacePodVerticalScaling): Remove this check when cgroupv2 support is added + if !isPodOnCgroupv2Node(patchedPod) { + verifyPodContainersCgroupValues(patchedPod, tc.expected, true) + } + + ginkgo.By("verifying pod resources after patch") + verifyPodResources(patchedPod, tc.expected) + + ginkgo.By("verifying pod allocations after patch") + verifyPodAllocations(patchedPod, tc.expected, true) + + ginkgo.By("deleting pod") + err = e2epod.DeletePodWithWait(f.ClientSet, newPod) + framework.ExpectNoError(err, "failed to delete pod") + }) + } +} + +var _ = SIGDescribe("Pod InPlace Resize Container [Feature:InPlacePodVerticalScaling]", func() { + doPodResizeTests() + doPodResizeResourceQuotaTests() + doPodResizeErrorTests() +})
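
Note on the expected cgroup values these tests check: on a cgroup v1 node, cpu.shares is derived from the container's CPU request, cpu.cfs_quota_us from its CPU limit against the default 100ms CFS period, and memory.limit_in_bytes from its memory limit; an absent CPU limit is rendered as -1 (shown as "max" on cgroup v2, where the equivalent interface files are read instead). The standalone sketch below illustrates that arithmetic; the helper names and constants here are illustrative only, while the tests themselves call kubecm.MilliCPUToShares and kubecm.MilliCPUToQuota from pkg/kubelet/cm.

package main

import "fmt"

// Illustrative constants for the cgroup v1 CFS conversions assumed above
// (not the kubecm package itself).
const (
	sharesPerCPU  = 1024   // cpu.shares granted per whole CPU of request
	milliCPUToCPU = 1000   // millicores per CPU
	quotaPeriod   = 100000 // default cpu.cfs_period_us (100ms)
	minShares     = 2      // kernel minimum for cpu.shares
)

// milliCPUToShares maps a CPU request in millicores to an expected cpu.shares value.
func milliCPUToShares(milliCPU int64) int64 {
	if milliCPU == 0 {
		return minShares
	}
	if shares := milliCPU * sharesPerCPU / milliCPUToCPU; shares >= minShares {
		return shares
	}
	return minShares
}

// milliCPUToQuota maps a CPU limit in millicores to an expected cpu.cfs_quota_us
// value; no limit (0) is treated here as unbounded, which the tests compare as
// "-1" on cgroup v1 and "max" on cgroup v2.
func milliCPUToQuota(milliCPU, period int64) int64 {
	if milliCPU == 0 {
		return -1
	}
	return milliCPU * period / milliCPUToCPU
}

func main() {
	// Example: requests cpu=200m, limits cpu=400m, limits memory=500Mi.
	fmt.Println(milliCPUToShares(200))             // 204       -> cpu.shares
	fmt.Println(milliCPUToQuota(400, quotaPeriod)) // 40000     -> cpu.cfs_quota_us
	fmt.Println(int64(500) * 1024 * 1024)          // 524288000 -> memory.limit_in_bytes
}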