Merge pull request #86101 from PatrickLang/fix-cpumaximum

Fix cpu resource limit on Windows
This commit is contained in:
Kubernetes Prow Robot 2020-02-26 00:20:26 -08:00 committed by GitHub
commit 16a7650e2b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 182 additions and 79 deletions

View File

@ -78,6 +78,7 @@ go_library(
"//pkg/kubelet/apis:go_default_library",
"//pkg/kubelet/winstats:go_default_library",
"//vendor/github.com/Microsoft/hcsshim:go_default_library",
"//vendor/github.com/docker/docker/pkg/sysinfo:go_default_library",
"//vendor/golang.org/x/sys/windows/registry:go_default_library",
],
"//conditions:default": [],

View File

@ -25,6 +25,7 @@ import (
dockertypes "github.com/docker/docker/api/types"
dockercontainer "github.com/docker/docker/api/types/container"
dockerfilters "github.com/docker/docker/api/types/filters"
"github.com/docker/docker/pkg/sysinfo"
"k8s.io/klog"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
@ -69,11 +70,12 @@ func (ds *dockerService) updateCreateConfig(
if wc := config.GetWindows(); wc != nil {
rOpts := wc.GetResources()
if rOpts != nil {
// Precedence and units for these are described at length in kuberuntime_container_windows.go - generateWindowsContainerConfig()
createConfig.HostConfig.Resources = dockercontainer.Resources{
Memory: rOpts.MemoryLimitInBytes,
CPUShares: rOpts.CpuShares,
CPUCount: rOpts.CpuCount,
CPUPercent: rOpts.CpuMaximum,
Memory: rOpts.MemoryLimitInBytes,
CPUShares: rOpts.CpuShares,
CPUCount: rOpts.CpuCount,
NanoCPUs: rOpts.CpuMaximum * int64(sysinfo.NumCPU()) * (1e9 / 10000),
}
}

View File

@ -14,7 +14,6 @@ go_library(
"helpers.go",
"helpers_linux.go",
"helpers_unsupported.go",
"helpers_windows.go",
"instrumented_services.go",
"kuberuntime_container.go",
"kuberuntime_container_linux.go",

View File

@ -1,4 +1,4 @@
// +build !linux,!windows
// +build !linux
/*
Copyright 2018 The Kubernetes Authors.

View File

@ -1,55 +0,0 @@
// +build windows
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package kuberuntime
import (
"github.com/docker/docker/pkg/sysinfo"
)
const (
// Taken from https://docs.microsoft.com/en-us/virtualization/windowscontainers/manage-containers/resource-controls
minSharesProcess = 5000 // minimum CPU shares for a process-isolated Windows container
minSharesHyperV = 10 // minimum CPU shares for a Hyper-V-isolated Windows container
maxShares = 10000 // upper bound of the Windows CPU-share range
milliCPUToCPU = 1000 // milliCPU units per whole CPU
)
// milliCPUToShares converts a milliCPU quantity into Windows CPU shares.
// The isolation mode matters because Hyper-V and process-isolated
// containers have different minimum share values (see the constants above).
func milliCPUToShares(milliCPU int64, hyperv bool) int64 {
	minShares := int64(minSharesProcess)
	if hyperv {
		minShares = minSharesHyperV
	}
	// A zero request maps straight to the platform minimum, mirroring the
	// kernel default for an unset weight.
	if milliCPU == 0 {
		return minShares
	}
	// Equivalent to (milliCPU / milliCPUToCPU) * sharesPerCPU, but factored
	// this way so integer division loses less precision.
	shares := (milliCPU * (maxShares - minShares)) / int64(sysinfo.NumCPU()) / milliCPUToCPU
	// Clamp the result into the valid Windows share range.
	switch {
	case shares < minShares:
		return minShares
	case shares > maxShares:
		return maxShares
	}
	return shares
}

View File

@ -53,7 +53,6 @@ func (m *kubeGenericRuntimeManager) generateWindowsContainerConfig(container *v1
SecurityContext: &runtimeapi.WindowsContainerSecurityContext{},
}
cpuRequest := container.Resources.Requests.Cpu()
cpuLimit := container.Resources.Limits.Cpu()
isolatedByHyperv := kubeletapis.ShouldIsolatedByHyperV(pod.Annotations)
if !cpuLimit.IsZero() {
@ -61,7 +60,35 @@ func (m *kubeGenericRuntimeManager) generateWindowsContainerConfig(container *v1
// as only 64 processors are available for execution by a given process. This causes
// some oddities on systems with more than 64 processors.
// Refer https://msdn.microsoft.com/en-us/library/windows/desktop/dd405503(v=vs.85).aspx.
// Since Kubernetes doesn't have any notion of weight in the Pod/Container API, only limits/reserves, then applying CpuMaximum only
// will better follow the intent of the user. At one point CpuWeights were set, but this prevented limits from having any effect.
// There are 3 parts to how this works:
// Part one - Windows kernel
// cpuMaximum is documented at https://docs.microsoft.com/en-us/virtualization/windowscontainers/manage-containers/resource-controls
// the range and how it relates to number of CPUs is at https://docs.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-jobobject_cpu_rate_control_information
// For process isolation, these are applied to the job object setting JOB_OBJECT_CPU_RATE_CONTROL_ENABLE, which can be set to either
// JOB_OBJECT_CPU_RATE_CONTROL_WEIGHT_BASED or JOB_OBJECT_CPU_RATE_CONTROL_HARD_CAP. This is why the settings are mutually exclusive.
// Part two - Docker (doc: https://docs.docker.com/engine/api/v1.30)
// If both CpuWeight and CpuMaximum are passed to Docker, then it sets
// JOB_OBJECT_CPU_RATE_CONTROL_ENABLE = JOB_OBJECT_CPU_RATE_CONTROL_WEIGHT_BASED ignoring CpuMaximum.
// Option a: Set HostConfig.CpuPercent. The units are whole percent of the total CPU capacity of the system, meaning the resolution
// is different based on the number of cores.
// Option b: Set HostConfig.NanoCpus integer <int64> - CPU quota in units of 10^-9 (billionths of a) CPU. Moby scales this to the Windows job object
// resolution of 1-10000, so it's higher resolution than option a.
// src: https://github.com/moby/moby/blob/10866714412aea1bb587d1ad14b2ce1ba4cf4308/daemon/oci_windows.go#L426
// Part three - CRI & ContainerD's implementation
// The kubelet sets these directly on CGroups in Linux, but needs to pass them across CRI on Windows.
// There is an existing cpu_maximum field, with a range of percent * 100, so 1-10000. This is different from Docker, but consistent with OCI
// https://github.com/kubernetes/kubernetes/blob/56d1c3b96d0a544130a82caad33dd57629b8a7f8/staging/src/k8s.io/cri-api/pkg/apis/runtime/v1alpha2/api.proto#L681-L682
// https://github.com/opencontainers/runtime-spec/blob/ad53dcdc39f1f7f7472b10aa0a45648fe4865496/config-windows.md#cpu
// If both CpuWeight and CpuMaximum are set - ContainerD catches this invalid case and returns an error instead.
cpuMaximum := 10000 * cpuLimit.MilliValue() / int64(sysinfo.NumCPU()) / 1000
// TODO: This should be reviewed or removed once Hyper-V support is implemented with CRI-ContainerD
// in a future release. cpuCount may or may not be required if cpuMaximum is set.
if isolatedByHyperv {
cpuCount := int64(cpuLimit.MilliValue()+999) / 1000
wc.Resources.CpuCount = cpuCount
@ -80,31 +107,15 @@ func (m *kubeGenericRuntimeManager) generateWindowsContainerConfig(container *v1
wc.Resources.CpuMaximum = cpuMaximum
}
cpuShares := milliCPUToShares(cpuLimit.MilliValue(), isolatedByHyperv)
if cpuShares == 0 {
cpuShares = milliCPUToShares(cpuRequest.MilliValue(), isolatedByHyperv)
}
wc.Resources.CpuShares = cpuShares
if !isolatedByHyperv {
// The processor resource controls are mutually exclusive on
// Windows Server Containers, the order of precedence is
// CPUCount first, then CPUShares, and CPUMaximum last.
// CPUCount first, then CPUMaximum.
if wc.Resources.CpuCount > 0 {
if wc.Resources.CpuShares > 0 {
wc.Resources.CpuShares = 0
klog.Warningf("Mutually exclusive options: CPUCount priority > CPUShares priority on Windows Server Containers. CPUShares should be ignored")
}
if wc.Resources.CpuMaximum > 0 {
wc.Resources.CpuMaximum = 0
klog.Warningf("Mutually exclusive options: CPUCount priority > CPUMaximum priority on Windows Server Containers. CPUMaximum should be ignored")
}
} else if wc.Resources.CpuShares > 0 {
if wc.Resources.CpuMaximum > 0 {
wc.Resources.CpuMaximum = 0
klog.Warningf("Mutually exclusive options: CPUShares priority > CPUMaximum priority on Windows Server Containers. CPUMaximum should be ignored")
}
}
}

View File

@ -5,6 +5,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "go_default_library",
srcs = [
"cpu_limits.go",
"density.go",
"dns.go",
"framework.go",

View File

@ -0,0 +1,144 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package windows
import (
"context"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/kubernetes/test/e2e/framework"
e2ekubelet "k8s.io/kubernetes/test/e2e/framework/kubelet"
imageutils "k8s.io/kubernetes/test/utils/image"
"time"
"github.com/onsi/ginkgo"
)
// Verifies that Windows containers honor their CPU limits: two pods are
// created with equivalent limits expressed in different units ("0.5" vs
// "500m"), left to burn CPU for two minutes, and their reported usage is
// then checked against the limit (with 5% slack).
var _ = SIGDescribe("[Feature:Windows] Cpu Resources", func() {
f := framework.NewDefaultFramework("cpu-resources-test-windows")
// The Windows 'BusyBox' image is PowerShell plus a collection of scripts and utilities to mimic common busybox commands
powershellImage := imageutils.GetConfig(imageutils.BusyBox)
ginkgo.Context("Container limits", func() {
ginkgo.It("should not be exceeded after waiting 2 minutes", func() {
// Two pods with the same effective limit written in different units: a
// decimal CPU quantity ("0.5") and a milli-CPU quantity ("500m").
ginkgo.By("Creating one pod with limit set to '0.5'")
podsDecimal := newCPUBurnPods(1, powershellImage, "0.5", "1Gi")
f.PodClient().CreateBatch(podsDecimal)
ginkgo.By("Creating one pod with limit set to '500m'")
podsMilli := newCPUBurnPods(1, powershellImage, "500m", "1Gi")
f.PodClient().CreateBatch(podsMilli)
// Let the CPU-burn workload run long enough for usage to stabilize and
// show up in the kubelet stats summary.
ginkgo.By("Waiting 2 minutes")
time.Sleep(2 * time.Minute)
ginkgo.By("Ensuring pods are still running")
var allPods [](*v1.Pod)
for _, p := range podsDecimal {
pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(
context.TODO(),
p.Name,
metav1.GetOptions{})
framework.ExpectNoError(err, "Error retrieving pod")
framework.ExpectEqual(pod.Status.Phase, v1.PodRunning)
allPods = append(allPods, pod)
}
for _, p := range podsMilli {
pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(
context.TODO(),
p.Name,
metav1.GetOptions{})
framework.ExpectNoError(err, "Error retrieving pod")
framework.ExpectEqual(pod.Status.Phase, v1.PodRunning)
allPods = append(allPods, pod)
}
ginkgo.By("Ensuring cpu doesn't exceed limit by >5%")
for _, p := range allPods {
ginkgo.By("Gathering node summary stats")
nodeStats, err := e2ekubelet.GetStatsSummary(f.ClientSet, p.Spec.NodeName)
framework.ExpectNoError(err, "Error grabbing node summary stats")
found := false
cpuUsage := float64(0)
// Locate this pod's entry in the node stats summary.
for _, pod := range nodeStats.Pods {
if pod.PodRef.Name != p.Name || pod.PodRef.Namespace != p.Namespace {
continue
}
// UsageNanoCores is in nanocores; convert to whole cores.
cpuUsage = float64(*pod.CPU.UsageNanoCores) * 1e-9
found = true
break
}
framework.ExpectEqual(found, true, "Found pod in stats summary")
framework.Logf("Pod %s usage: %v", p.Name, cpuUsage)
framework.ExpectEqual(cpuUsage > 0, true, "Pods reported usage should be > 0")
// Both pods were limited to 0.5 CPU; allow 5% measurement slack.
framework.ExpectEqual((.5*1.05) > cpuUsage, true, "Pods reported usage should not exceed limit by >5%")
}
})
})
})
// newCPUBurnPods builds numPods pod specifications whose single container
// runs a PowerShell workload that consumes all CPU the container limit
// allows. cpuLimit and memoryLimit are resource.Quantity strings
// (e.g. "500m", "1Gi"); parse failures abort the test via ExpectNoError.
func newCPUBurnPods(numPods int, image imageutils.Config, cpuLimit string, memoryLimit string) []*v1.Pod {
	memQty, err := resource.ParseQuantity(memoryLimit)
	framework.ExpectNoError(err)
	cpuQty, err := resource.ParseQuantity(cpuLimit)
	framework.ExpectNoError(err)

	var pods []*v1.Pod
	for i := 0; i < numPods; i++ {
		name := "cpulimittest-" + string(uuid.NewUUID())
		// The container spawns eight background jobs that multiply integers
		// in a tight loop, saturating whatever CPU the limit grants.
		burner := v1.Container{
			Image: image.GetE2EImage(),
			Name:  name,
			Resources: v1.ResourceRequirements{
				Limits: v1.ResourceList{
					v1.ResourceMemory: memQty,
					v1.ResourceCPU:    cpuQty,
				},
			},
			Command: []string{
				"powershell.exe",
				"-Command",
				"foreach ($loopnumber in 1..8) { Start-Job -ScriptBlock { $result = 1; foreach($mm in 1..2147483647){$res1=1;foreach($num in 1..2147483647){$res1=$mm*$num*1340371};$res1} } } ; get-job | wait-job",
			},
		}
		pods = append(pods, &v1.Pod{
			ObjectMeta: metav1.ObjectMeta{
				Name: name,
				Labels: map[string]string{
					"name":    name,
					"testapp": "cpuburn",
				},
			},
			Spec: v1.PodSpec{
				// Restart policy is the default (Always), keeping the
				// burner running for the duration of the test.
				Containers: []v1.Container{burner},
				NodeSelector: map[string]string{
					"beta.kubernetes.io/os": "windows",
				},
			},
		})
	}
	return pods
}