Merge pull request #86101 from PatrickLang/fix-cpumaximum

Fix cpu resource limit on Windows
2025-09-17 23:19:26 +00:00 · 2020-02-26 00:20:26 -08:00
parent b6b494b448 886214f48c
commit 16a7650e2b
8 changed files with 182 additions and 79 deletions
--- a/pkg/kubelet/dockershim/BUILD
+++ b/pkg/kubelet/dockershim/BUILD
@@ -78,6 +78,7 @@ go_library(
            "//pkg/kubelet/apis:go_default_library",
            "//pkg/kubelet/winstats:go_default_library",
            "//vendor/github.com/Microsoft/hcsshim:go_default_library",
            "//vendor/github.com/docker/docker/pkg/sysinfo:go_default_library",
            "//vendor/golang.org/x/sys/windows/registry:go_default_library",
        ],
        "//conditions:default": [],
--- a/pkg/kubelet/dockershim/helpers_windows.go
+++ b/pkg/kubelet/dockershim/helpers_windows.go
@@ -25,6 +25,7 @@ import (
 	dockertypes "github.com/docker/docker/api/types"
 	dockercontainer "github.com/docker/docker/api/types/container"
 	dockerfilters "github.com/docker/docker/api/types/filters"
 	"github.com/docker/docker/pkg/sysinfo"
 	"k8s.io/klog"
 	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
@@ -69,11 +70,12 @@ func (ds *dockerService) updateCreateConfig(
 	if wc := config.GetWindows(); wc != nil {
 		rOpts := wc.GetResources()
 		if rOpts != nil {
 			// Precedence and units for these are described at length in kuberuntime_container_windows.go - generateWindowsContainerConfig()
 			createConfig.HostConfig.Resources = dockercontainer.Resources{
 				Memory:    rOpts.MemoryLimitInBytes,
 				CPUShares: rOpts.CpuShares,
 				CPUCount:  rOpts.CpuCount,
-				CPUPercent: rOpts.CpuMaximum,
+				NanoCPUs:  rOpts.CpuMaximum * int64(sysinfo.NumCPU()) * (1e9 / 10000),
 			}
 		}
--- a/pkg/kubelet/kuberuntime/BUILD
+++ b/pkg/kubelet/kuberuntime/BUILD
@@ -14,7 +14,6 @@ go_library(
        "helpers.go",
        "helpers_linux.go",
        "helpers_unsupported.go",
        "helpers_windows.go",
        "instrumented_services.go",
        "kuberuntime_container.go",
        "kuberuntime_container_linux.go",
--- a/pkg/kubelet/kuberuntime/helpers_unsupported.go
+++ b/pkg/kubelet/kuberuntime/helpers_unsupported.go
@@ -1,4 +1,4 @@
-// +build !linux,!windows
+// +build !linux
 /*
 Copyright 2018 The Kubernetes Authors.
--- a/pkg/kubelet/kuberuntime/helpers_windows.go
+++ b/pkg/kubelet/kuberuntime/helpers_windows.go
@@ -1,55 +0,0 @@
 // +build windows
 /*
 Copyright 2018 The Kubernetes Authors.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 package kuberuntime
 import (
 	"github.com/docker/docker/pkg/sysinfo"
 )
 const (
 	// Taken from https://docs.microsoft.com/en-us/virtualization/windowscontainers/manage-containers/resource-controls
 	// minSharesProcess is the lower bound milliCPUToShares applies when its hyperv argument is false.
 	minSharesProcess = 5000
 	// minSharesHyperV is the lower bound milliCPUToShares applies when its hyperv argument is true.
 	minSharesHyperV  = 10
 	// maxShares is the upper bound milliCPUToShares clamps its result to.
 	maxShares        = 10000
 	// milliCPUToCPU is the divisor used to convert a milliCPU quantity into whole CPUs.
 	milliCPUToCPU    = 1000
 )
 // milliCPUToShares converts a milliCPU quantity into CPU shares.
 //
 // The result is clamped to [floor, maxShares], where floor is minSharesHyperV
 // when hyperv is true and minSharesProcess otherwise. A zero milliCPU value
 // short-circuits to floor without consulting the host CPU count.
 func milliCPUToShares(milliCPU int64, hyperv bool) int64 {
 	floor := int64(minSharesProcess)
 	if hyperv {
 		floor = minSharesHyperV
 	}
 	if milliCPU == 0 {
 		// Zero milliCPU maps straight to the minimum, mirroring the kernel default.
 		return floor
 	}
 	// Conceptually (milliCPU / milliCPUToCPU) * sharesPerCPU; the multiplication
 	// is done first so the integer divisions lose less precision.
 	numCPU := int64(sysinfo.NumCPU())
 	scaled := milliCPU * (maxShares - floor) / numCPU / milliCPUToCPU
 	switch {
 	case scaled < floor:
 		return floor
 	case scaled > maxShares:
 		return maxShares
 	default:
 		return scaled
 	}
 }
--- a/pkg/kubelet/kuberuntime/kuberuntime_container_windows.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_container_windows.go
@@ -53,7 +53,6 @@ func (m *kubeGenericRuntimeManager) generateWindowsContainerConfig(container *v1
 		SecurityContext: &runtimeapi.WindowsContainerSecurityContext{},
 	}
 	cpuRequest := container.Resources.Requests.Cpu()
 	cpuLimit := container.Resources.Limits.Cpu()
 	isolatedByHyperv := kubeletapis.ShouldIsolatedByHyperV(pod.Annotations)
 	if !cpuLimit.IsZero() {
@@ -61,7 +60,35 @@ func (m *kubeGenericRuntimeManager) generateWindowsContainerConfig(container *v1
 		// as only 64 processors are available for execution by a given process. This causes
 		// some oddities on systems with more than 64 processors.
 		// Refer https://msdn.microsoft.com/en-us/library/windows/desktop/dd405503(v=vs.85).aspx.
 		// Since Kubernetes doesn't have any notion of weight in the Pod/Container API, only limits/reserves, applying only CpuMaximum
 		// will better follow the intent of the user. At one point CpuWeights were set, but this prevented limits from having any effect.
 		// There are 3 parts to how this works:
 		// Part one - Windows kernel
 		//   cpuMaximum is documented at https://docs.microsoft.com/en-us/virtualization/windowscontainers/manage-containers/resource-controls
 		//   the range and how it relates to number of CPUs is at https://docs.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-jobobject_cpu_rate_control_information
 		//   For process isolation, these are applied to the job object setting JOB_OBJECT_CPU_RATE_CONTROL_ENABLE, which can be set to either
 		//   JOB_OBJECT_CPU_RATE_CONTROL_WEIGHT_BASED or JOB_OBJECT_CPU_RATE_CONTROL_HARD_CAP. This is why the settings are mutually exclusive.
 		// Part two - Docker (doc: https://docs.docker.com/engine/api/v1.30)
 		//   If both CpuWeight and CpuMaximum are passed to Docker, then it sets
 		//   JOB_OBJECT_CPU_RATE_CONTROL_ENABLE = JOB_OBJECT_CPU_RATE_CONTROL_WEIGHT_BASED ignoring CpuMaximum.
 		//   Option a: Set HostConfig.CpuPercent. The units are whole percent of the total CPU capacity of the system, meaning the resolution
 		//      is different based on the number of cores.
 		//   Option b: Set HostConfig.NanoCpus integer <int64> - CPU quota in units of 10^-9 CPUs. Moby scales this to the Windows job object
 		//      resolution of 1-10000, so it's higher resolution than option a.
 		//      src: https://github.com/moby/moby/blob/10866714412aea1bb587d1ad14b2ce1ba4cf4308/daemon/oci_windows.go#L426
 		// Part three - CRI & ContainerD's implementation
 		//   The kubelet sets these directly on CGroups in Linux, but needs to pass them across CRI on Windows.
 		//   There is an existing cpu_maximum field, with a range of percent * 100, so 1-10000. This is different from Docker, but consistent with OCI
 		//   https://github.com/kubernetes/kubernetes/blob/56d1c3b96d0a544130a82caad33dd57629b8a7f8/staging/src/k8s.io/cri-api/pkg/apis/runtime/v1alpha2/api.proto#L681-L682
 		//   https://github.com/opencontainers/runtime-spec/blob/ad53dcdc39f1f7f7472b10aa0a45648fe4865496/config-windows.md#cpu
 		//   If both CpuWeight and CpuMaximum are set - ContainerD catches this invalid case and returns an error instead.
 		cpuMaximum := 10000 * cpuLimit.MilliValue() / int64(sysinfo.NumCPU()) / 1000
 		// TODO: This should be reviewed or removed once Hyper-V support is implemented with CRI-ContainerD
 		//       in a future release. cpuCount may or may not be required if cpuMaximum is set.
 		if isolatedByHyperv {
 			cpuCount := int64(cpuLimit.MilliValue()+999) / 1000
 			wc.Resources.CpuCount = cpuCount
@@ -80,31 +107,15 @@ func (m *kubeGenericRuntimeManager) generateWindowsContainerConfig(container *v1
 		wc.Resources.CpuMaximum = cpuMaximum
 	}
 	cpuShares := milliCPUToShares(cpuLimit.MilliValue(), isolatedByHyperv)
 	if cpuShares == 0 {
 		cpuShares = milliCPUToShares(cpuRequest.MilliValue(), isolatedByHyperv)
 	}
 	wc.Resources.CpuShares = cpuShares
 	if !isolatedByHyperv {
 		// The processor resource controls are mutually exclusive on
 		// Windows Server Containers, the order of precedence is
-		// CPUCount first, then CPUShares, and CPUMaximum last.
+		// CPUCount first, then CPUMaximum.
 		if wc.Resources.CpuCount > 0 {
 			if wc.Resources.CpuShares > 0 {
 				wc.Resources.CpuShares = 0
 				klog.Warningf("Mutually exclusive options: CPUCount priority > CPUShares priority on Windows Server Containers. CPUShares should be ignored")
 			}
 			if wc.Resources.CpuMaximum > 0 {
 				wc.Resources.CpuMaximum = 0
 				klog.Warningf("Mutually exclusive options: CPUCount priority > CPUMaximum priority on Windows Server Containers. CPUMaximum should be ignored")
 			}
 		} else if wc.Resources.CpuShares > 0 {
 			if wc.Resources.CpuMaximum > 0 {
 				wc.Resources.CpuMaximum = 0
 				klog.Warningf("Mutually exclusive options: CPUShares priority > CPUMaximum priority on Windows Server Containers. CPUMaximum should be ignored")
 			}
 		}
 	}
--- a/test/e2e/windows/BUILD
+++ b/test/e2e/windows/BUILD
@@ -5,6 +5,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
 go_library(
    name = "go_default_library",
    srcs = [
        "cpu_limits.go",
        "density.go",
        "dns.go",
        "framework.go",
--- a/test/e2e/windows/cpu_limits.go
+++ b/test/e2e/windows/cpu_limits.go
@@ -0,0 +1,144 @@
 /*
 Copyright 2020 The Kubernetes Authors.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 package windows
 import (
 	"context"
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2ekubelet "k8s.io/kubernetes/test/e2e/framework/kubelet"
 	imageutils "k8s.io/kubernetes/test/utils/image"
 	"time"
 	"github.com/onsi/ginkgo"
 )
 // Cpu Resources verifies that a CPU limit set on a Windows container is enforced:
 // it starts two CPU-burning pods with equivalent limits ("0.5" and "500m"), waits
 // two minutes, and then checks via the node stats summary that each pod is still
 // running, reports non-zero CPU usage, and stays within 5% of the 0.5 CPU limit.
 var _ = SIGDescribe("[Feature:Windows] Cpu Resources", func() {
 	f := framework.NewDefaultFramework("cpu-resources-test-windows")
 	// The Windows 'BusyBox' image is PowerShell plus a collection of scripts and utilities to mimic common busybox commands
 	powershellImage := imageutils.GetConfig(imageutils.BusyBox)
 	ginkgo.Context("Container limits", func() {
 		ginkgo.It("should not be exceeded after waiting 2 minutes", func() {
 			ginkgo.By("Creating one pod with limit set to '0.5'")
 			podsDecimal := newCPUBurnPods(1, powershellImage, "0.5", "1Gi")
 			f.PodClient().CreateBatch(podsDecimal)
 			ginkgo.By("Creating one pod with limit set to '500m'")
 			podsMilli := newCPUBurnPods(1, powershellImage, "500m", "1Gi")
 			f.PodClient().CreateBatch(podsMilli)
 			ginkgo.By("Waiting 2 minutes")
 			time.Sleep(2 * time.Minute)
 			ginkgo.By("Ensuring pods are still running")
 			var allPods [](*v1.Pod)
 			// Both batches get the same check, so walk them with one loop
 			// (decimal batch first, then milli batch, as before).
 			for _, created := range [][]*v1.Pod{podsDecimal, podsMilli} {
 				for _, p := range created {
 					pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(
 						context.TODO(),
 						p.Name,
 						metav1.GetOptions{})
 					framework.ExpectNoError(err, "Error retrieving pod")
 					framework.ExpectEqual(pod.Status.Phase, v1.PodRunning)
 					allPods = append(allPods, pod)
 				}
 			}
 			ginkgo.By("Ensuring cpu doesn't exceed limit by >5%")
 			for _, p := range allPods {
 				ginkgo.By("Gathering node summary stats")
 				nodeStats, err := e2ekubelet.GetStatsSummary(f.ClientSet, p.Spec.NodeName)
 				framework.ExpectNoError(err, "Error grabbing node summary stats")
 				found := false
 				cpuUsage := float64(0)
 				for _, pod := range nodeStats.Pods {
 					if pod.PodRef.Name != p.Name || pod.PodRef.Namespace != p.Namespace {
 						continue
 					}
 					// The usage value is read through pointers; fail with a clear
 					// message instead of a nil-pointer panic when it is missing.
 					hasUsage := pod.CPU != nil && pod.CPU.UsageNanoCores != nil
 					framework.ExpectEqual(hasUsage, true, "Pod stats should include CPU usage")
 					// UsageNanoCores is in nanocores; convert to whole cores.
 					cpuUsage = float64(*pod.CPU.UsageNanoCores) * 1e-9
 					found = true
 					break
 				}
 				framework.ExpectEqual(found, true, "Found pod in stats summary")
 				framework.Logf("Pod %s usage: %v", p.Name, cpuUsage)
 				framework.ExpectEqual(cpuUsage > 0, true, "Pods reported usage should be > 0")
 				// Both pods are limited to 0.5 CPU; allow 5% of slack on top of that.
 				framework.ExpectEqual((.5*1.05) > cpuUsage, true, "Pods reported usage should not exceed limit by >5%")
 			}
 		})
 	})
 })
 // newCPUBurnPods builds numPods pod specifications, each with a single container
 // running a PowerShell workload intended to consume all CPU available up to the
 // container limit. cpuLimit and memoryLimit are parsed as resource quantities and
 // applied as the container's limits; a parse failure fails the running test.
 // The pods are only constructed here, not submitted to the cluster.
 func newCPUBurnPods(numPods int, image imageutils.Config, cpuLimit string, memoryLimit string) []*v1.Pod {
 	memQty, err := resource.ParseQuantity(memoryLimit)
 	framework.ExpectNoError(err)
 	cpuQty, err := resource.ParseQuantity(cpuLimit)
 	framework.ExpectNoError(err)
 	var burners []*v1.Pod
 	for len(burners) < numPods {
 		// Every pod gets a unique name, reused for its label and its container.
 		name := "cpulimittest-" + string(uuid.NewUUID())
 		burnContainer := v1.Container{
 			Image: image.GetE2EImage(),
 			Name:  name,
 			Resources: v1.ResourceRequirements{
 				Limits: v1.ResourceList{
 					v1.ResourceMemory: memQty,
 					v1.ResourceCPU:    cpuQty,
 				},
 			},
 			// Starts 8 background jobs that each spin in nested multiplication loops, then waits on them.
 			Command: []string{
 				"powershell.exe",
 				"-Command",
 				"foreach ($loopnumber in 1..8) { Start-Job -ScriptBlock { $result = 1; foreach($mm in 1..2147483647){$res1=1;foreach($num in 1..2147483647){$res1=$mm*$num*1340371};$res1} } } ; get-job | wait-job",
 			},
 		}
 		burner := &v1.Pod{
 			ObjectMeta: metav1.ObjectMeta{
 				Name: name,
 				Labels: map[string]string{
 					"name":    name,
 					"testapp": "cpuburn",
 				},
 			},
 			// Restart policy is left unset, i.e. the default (Always).
 			Spec: v1.PodSpec{
 				Containers: []v1.Container{burnContainer},
 				NodeSelector: map[string]string{
 					"beta.kubernetes.io/os": "windows",
 				},
 			},
 		}
 		burners = append(burners, burner)
 	}
 	return burners
 }