diff --git a/pkg/kubelet/dockershim/BUILD b/pkg/kubelet/dockershim/BUILD index c3d42a1ff47..6fd54ad24d2 100644 --- a/pkg/kubelet/dockershim/BUILD +++ b/pkg/kubelet/dockershim/BUILD @@ -78,6 +78,7 @@ go_library( "//pkg/kubelet/apis:go_default_library", "//pkg/kubelet/winstats:go_default_library", "//vendor/github.com/Microsoft/hcsshim:go_default_library", + "//vendor/github.com/docker/docker/pkg/sysinfo:go_default_library", "//vendor/golang.org/x/sys/windows/registry:go_default_library", ], "//conditions:default": [], diff --git a/pkg/kubelet/dockershim/helpers_windows.go b/pkg/kubelet/dockershim/helpers_windows.go index 09ff2335f8f..42d32839b5b 100644 --- a/pkg/kubelet/dockershim/helpers_windows.go +++ b/pkg/kubelet/dockershim/helpers_windows.go @@ -25,6 +25,7 @@ import ( dockertypes "github.com/docker/docker/api/types" dockercontainer "github.com/docker/docker/api/types/container" dockerfilters "github.com/docker/docker/api/types/filters" + "github.com/docker/docker/pkg/sysinfo" "k8s.io/klog" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" @@ -69,11 +70,12 @@ func (ds *dockerService) updateCreateConfig( if wc := config.GetWindows(); wc != nil { rOpts := wc.GetResources() if rOpts != nil { + // Precedence and units for these are described at length in kuberuntime_container_windows.go - generateWindowsContainerConfig() createConfig.HostConfig.Resources = dockercontainer.Resources{ - Memory: rOpts.MemoryLimitInBytes, - CPUShares: rOpts.CpuShares, - CPUCount: rOpts.CpuCount, - CPUPercent: rOpts.CpuMaximum, + Memory: rOpts.MemoryLimitInBytes, + CPUShares: rOpts.CpuShares, + CPUCount: rOpts.CpuCount, + NanoCPUs: rOpts.CpuMaximum * int64(sysinfo.NumCPU()) * (1e9 / 10000), } } diff --git a/pkg/kubelet/kuberuntime/helpers_windows.go b/pkg/kubelet/kuberuntime/helpers_windows.go index 2009fad8d0f..dc4b212beac 100644 --- a/pkg/kubelet/kuberuntime/helpers_windows.go +++ b/pkg/kubelet/kuberuntime/helpers_windows.go @@ -30,6 +30,7 @@ const ( milliCPUToCPU = 1000 ) +// 
TODO: remove - may be dead code // milliCPUToShares converts milliCPU to CPU shares func milliCPUToShares(milliCPU int64, hyperv bool) int64 { var minShares int64 = minSharesProcess diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_windows.go b/pkg/kubelet/kuberuntime/kuberuntime_container_windows.go index a782802d30b..7adb387d4f2 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_windows.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_windows.go @@ -53,7 +53,6 @@ func (m *kubeGenericRuntimeManager) generateWindowsContainerConfig(container *v1 SecurityContext: &runtimeapi.WindowsContainerSecurityContext{}, } - cpuRequest := container.Resources.Requests.Cpu() cpuLimit := container.Resources.Limits.Cpu() isolatedByHyperv := kubeletapis.ShouldIsolatedByHyperV(pod.Annotations) if !cpuLimit.IsZero() { @@ -61,7 +60,35 @@ func (m *kubeGenericRuntimeManager) generateWindowsContainerConfig(container *v1 // as only 64 processors are available for execution by a given process. This causes // some oddities on systems with more than 64 processors. // Refer https://msdn.microsoft.com/en-us/library/windows/desktop/dd405503(v=vs.85).aspx. + + // Since Kubernetes doesn't have any notion of weight in the Pod/Container API, only limits/reserves, then applying CpuMaximum only + // will better follow the intent of the user. At one point CpuWeights were set, but this prevented limits from having any effect. 
+ + // There are 3 parts to how this works: + // Part one - Windows kernel + // cpuMaximum is documented at https://docs.microsoft.com/en-us/virtualization/windowscontainers/manage-containers/resource-controls + // the range and how it relates to number of CPUs is at https://docs.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-jobobject_cpu_rate_control_information + // For process isolation, these are applied to the job object setting JOB_OBJECT_CPU_RATE_CONTROL_ENABLE, which can be set to either + // JOB_OBJECT_CPU_RATE_CONTROL_WEIGHT_BASED or JOB_OBJECT_CPU_RATE_CONTROL_HARD_CAP. This is why the settings are mutually exclusive. + // Part two - Docker (doc: https://docs.docker.com/engine/api/v1.30) + // If both CpuWeight and CpuMaximum are passed to Docker, then it sets + // JOB_OBJECT_CPU_RATE_CONTROL_ENABLE = JOB_OBJECT_CPU_RATE_CONTROL_WEIGHT_BASED ignoring CpuMaximum. + // Option a: Set HostConfig.CpuPercent. The units are whole percent of the total CPU capacity of the system, meaning the resolution + // is different based on the number of cores. + // Option b: Set HostConfig.NanoCpus integer - CPU quota in units of 1e-9 CPUs. Moby scales this to the Windows job object + // resolution of 1-10000, so it's higher resolution than option a. + // src: https://github.com/moby/moby/blob/10866714412aea1bb587d1ad14b2ce1ba4cf4308/daemon/oci_windows.go#L426 + // Part three - CRI & ContainerD's implementation + // The kubelet sets these directly on CGroups in Linux, but needs to pass them across CRI on Windows. + // There is an existing cpu_maximum field, with a range of percent * 100, so 1-10000. 
This is different from Docker, but consistent with OCI + // https://github.com/kubernetes/kubernetes/blob/56d1c3b96d0a544130a82caad33dd57629b8a7f8/staging/src/k8s.io/cri-api/pkg/apis/runtime/v1alpha2/api.proto#L681-L682 + // https://github.com/opencontainers/runtime-spec/blob/ad53dcdc39f1f7f7472b10aa0a45648fe4865496/config-windows.md#cpu + // If both CpuWeight and CpuMaximum are set - ContainerD catches this invalid case and returns an error instead. + cpuMaximum := 10000 * cpuLimit.MilliValue() / int64(sysinfo.NumCPU()) / 1000 + + // TODO: This should be reviewed or removed once Hyper-V support is implemented with CRI-ContainerD + // in a future release. cpuCount may or may not be required if cpuMaximum is set. if isolatedByHyperv { cpuCount := int64(cpuLimit.MilliValue()+999) / 1000 wc.Resources.CpuCount = cpuCount @@ -80,31 +107,15 @@ func (m *kubeGenericRuntimeManager) generateWindowsContainerConfig(container *v1 wc.Resources.CpuMaximum = cpuMaximum } - cpuShares := milliCPUToShares(cpuLimit.MilliValue(), isolatedByHyperv) - if cpuShares == 0 { - cpuShares = milliCPUToShares(cpuRequest.MilliValue(), isolatedByHyperv) - } - wc.Resources.CpuShares = cpuShares - if !isolatedByHyperv { // The processor resource controls are mutually exclusive on // Windows Server Containers, the order of precedence is - // CPUCount first, then CPUShares, and CPUMaximum last. + // CPUCount first, then CPUMaximum. if wc.Resources.CpuCount > 0 { - if wc.Resources.CpuShares > 0 { - wc.Resources.CpuShares = 0 - klog.Warningf("Mutually exclusive options: CPUCount priority > CPUShares priority on Windows Server Containers. CPUShares should be ignored") - } if wc.Resources.CpuMaximum > 0 { wc.Resources.CpuMaximum = 0 klog.Warningf("Mutually exclusive options: CPUCount priority > CPUMaximum priority on Windows Server Containers. 
CPUMaximum should be ignored") } - } else if wc.Resources.CpuShares > 0 { - if wc.Resources.CpuMaximum > 0 { - wc.Resources.CpuMaximum = 0 - klog.Warningf("Mutually exclusive options: CPUShares priority > CPUMaximum priority on Windows Server Containers. CPUMaximum should be ignored") - } - } }