diff --git a/docs/design/README.md b/docs/design/README.md
index adcffd0196..a3344d8317 100644
--- a/docs/design/README.md
+++ b/docs/design/README.md
@@ -7,6 +7,7 @@ Kata Containers design documents:
 - [Design requirements for Kata Containers](kata-design-requirements.md)
 - [VSocks](VSocks.md)
 - [VCPU handling](vcpu-handling.md)
+- [VCPU threads pinning](vcpu-threads-pinning.md)
 - [Host cgroups](host-cgroups.md)
 - [`Inotify` support](inotify.md)
 - [Metrics(Kata 2.0)](kata-2-0-metrics.md)
diff --git a/docs/design/arch-images/vcpus-pinning-process.png b/docs/design/arch-images/vcpus-pinning-process.png
new file mode 100644
index 0000000000..266c34acba
Binary files /dev/null and b/docs/design/arch-images/vcpus-pinning-process.png differ
diff --git a/docs/design/vcpu-threads-pinning.md b/docs/design/vcpu-threads-pinning.md
new file mode 100644
index 0000000000..4de6ae9861
--- /dev/null
+++ b/docs/design/vcpu-threads-pinning.md
@@ -0,0 +1,37 @@
+# Design Doc for Kata Containers' vCPU Threads Pinning Feature
+
+## Background
+Currently, the vCPU threads of a Kata Containers sandbox are scheduled onto host CPUs at random. Each pod may request a specific set of CPUs, which we call its CPU set (in the Linux cgroups sense of the term).
+
+If the number of vCPU threads equals the number of CPUs claimed in the CPU set, we can pin each vCPU thread to one dedicated CPU and avoid the cost of random scheduling.
+
+## Detailed Design
+
+### Passing Config Parameters
+Two ways are provided to enable the vCPU thread pinning feature: the `QEMU` configuration file and sandbox annotations. Either way, the pinning parameter is finally passed into `HypervisorConfig`.
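+
+For illustration, a minimal sketch of the configuration-file path (the option lands in the `[hypervisor.qemu]` section of the QEMU runtime configuration file generated from `configuration-qemu.toml.in`; the equivalent annotation key is the string defined by the `EnableVCPUsPinning` constant in `annotations.go`):
+
+```toml
+[hypervisor.qemu]
+# Pin each vCPU thread to a fixed CPU once num(vCPU threads) == num(CPUs in the sandbox's CPUSet).
+enable_vcpus_pinning = true
+```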
+
+### Related Linux Thread Scheduling API
+
+| API Info          | Value                                                     |
+|-------------------|-----------------------------------------------------------|
+| Package           | `golang.org/x/sys/unix`                                   |
+| Method            | `unix.SchedSetaffinity(thread_id, &unixCPUSet)`           |
+| Official Doc Page | https://pkg.go.dev/golang.org/x/sys/unix#SchedSetaffinity |
+
+### When is VCPUs Pinning Checked?
+
+As stated in the Background section, when `num(vCPU threads) == num(CPUs in CPU set)` we pin each vCPU thread to a dedicated CPU; when this condition no longer holds, we revert to the original random scheduling pattern.
+So when may `num(CPUs in CPU set)` change? There are five possible scenarios:
+
+| Possible scenario                 | Related code                            |
+|-----------------------------------|-----------------------------------------|
+| when creating a container         | `sandbox.go`, method `CreateContainer`  |
+| when starting a container         | `sandbox.go`, method `StartContainer`   |
+| when deleting a container         | `sandbox.go`, method `DeleteContainer`  |
+| when updating a container         | `sandbox.go`, method `UpdateContainer`  |
+| when creating multiple containers | `sandbox.go`, method `createContainers` |
+
+### Core Pinning Logic
+
+We can split the whole process into the steps shown in the flowchart below. The related methods are `checkVCPUsPinning` and `resetVCPUsPinning` in `sandbox.go`.
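+
+As a minimal, self-contained sketch of the affinity call these steps rely on (illustrative only, not the runtime code; the thread IDs and CPU numbers below are hypothetical stand-ins for the values fetched from the hypervisor and from the sandbox's CPUSet):
+
+```go
+package main
+
+import (
+    "fmt"
+
+    "golang.org/x/sys/unix"
+)
+
+// pinThreads pins each thread ID to the CPU at the same index, mirroring
+// the 1:1 mapping that checkVCPUsPinning establishes.
+func pinThreads(threadIDs, cpus []int) error {
+    if len(threadIDs) != len(cpus) {
+        return fmt.Errorf("need exactly one CPU per vCPU thread")
+    }
+    for i, tid := range threadIDs {
+        var mask unix.CPUSet
+        mask.Set(cpus[i])
+        if err := unix.SchedSetaffinity(tid, &mask); err != nil {
+            return fmt.Errorf("failed to pin thread %d to CPU %d: %w", tid, cpus[i], err)
+        }
+    }
+    return nil
+}
+
+func main() {
+    // Hypothetical thread IDs and CPU numbers, for illustration only.
+    if err := pinThreads([]int{12345, 12346}, []int{2, 3}); err != nil {
+        fmt.Println(err)
+    }
+}
+```
+
+The full flow, including the fallback to `resetVCPUsPinning`, is shown in the flowchart: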
+![](arch-images/vcpus-pinning-process.png)
\ No newline at end of file
diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in
index 7b9166adc1..7d0487d2bd 100644
--- a/src/runtime/config/configuration-qemu.toml.in
+++ b/src/runtime/config/configuration-qemu.toml.in
@@ -96,6 +96,11 @@ machine_accelerators="@MACHINEACCELERATORS@"
 # For example, `cpu_features = "pmu=off,vmx=off"
 cpu_features="@CPUFEATURES@"
 
+# vCPUs pinning settings
+# if enabled, each vCPU thread will be scheduled to a fixed CPU
+# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet)
+# enable_vcpus_pinning = false
+
 # Default number of vCPUs per SB/VM:
 # unspecified or 0 --> will be set to @DEFVCPUS@
 # < 0 --> will be set to the actual number of physical cores
diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go
index dd996b4032..d334c61713 100644
--- a/src/runtime/pkg/katautils/config.go
+++ b/src/runtime/pkg/katautils/config.go
@@ -155,6 +155,7 @@ type hypervisor struct {
 	DisableSeccomp bool `toml:"disable_seccomp"`
 	DisableSeLinux bool `toml:"disable_selinux"`
 	LegacySerial bool `toml:"use_legacy_serial"`
+	EnableVCPUsPinning bool `toml:"enable_vcpus_pinning"`
 }
 
 type runtime struct {
@@ -833,6 +834,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
 		Rootless: h.Rootless,
 		LegacySerial: h.LegacySerial,
 		DisableSeLinux: h.DisableSeLinux,
+		EnableVCPUsPinning: h.EnableVCPUsPinning,
 	}, nil
 }
diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go
index 95bfe2a33a..57c2ed1a15 100644
--- a/src/runtime/pkg/oci/utils.go
+++ b/src/runtime/pkg/oci/utils.go
@@ -651,6 +651,12 @@ func addHypervisorCPUOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) e
 		return err
 	}
 
+	if err := newAnnotationConfiguration(ocispec, vcAnnotations.EnableVCPUsPinning).setBool(func(enableVCPUsPinning bool) {
+		sbConfig.HypervisorConfig.EnableVCPUsPinning = enableVCPUsPinning
+	}); err != nil {
+		return err
+	}
+
 	return newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMaxVCPUs).setUintWithCheck(func(maxVCPUs uint64) error {
 		max := uint32(maxVCPUs)
 
diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go
index 51cf678b8d..eb5abbb3b3 100644
--- a/src/runtime/virtcontainers/hypervisor.go
+++ b/src/runtime/virtcontainers/hypervisor.go
@@ -559,6 +559,9 @@ type HypervisorConfig struct {
 
 	// Use legacy serial for the guest console
 	LegacySerial bool
+
+	// EnableVCPUsPinning controls whether each vCPU thread should be scheduled to a fixed CPU
+	EnableVCPUsPinning bool
 }
 
 // vcpu mapping from vcpu number to thread number
diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go
index d7919a0c8a..d785580d9b 100644
--- a/src/runtime/virtcontainers/pkg/annotations/annotations.go
+++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go
@@ -143,6 +143,9 @@ const (
 	// DefaultVCPUs is a sandbox annotation that specifies the maximum number of vCPUs allocated for the VM by the hypervisor.
 	DefaultMaxVCPUs = kataAnnotHypervisorPrefix + "default_max_vcpus"
 
+	// EnableVCPUsPinning is a sandbox annotation that controls bundling between vCPU threads and CPUs
+	EnableVCPUsPinning = kataAnnotationsPrefix + "enable_vcpus_pinning"
+
 	//
 	// Memory related annotations
 	//
diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go
index e4a16983ed..32ccc2dbb7 100644
--- a/src/runtime/virtcontainers/sandbox.go
+++ b/src/runtime/virtcontainers/sandbox.go
@@ -44,6 +44,7 @@ import (
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
+	"golang.org/x/sys/unix"
 )
 
 // sandboxTracingTags defines tags for the trace span
@@ -236,6 +237,7 @@ type Sandbox struct {
 	sharePidNs        bool
 	seccompSupported  bool
 	disableVMShutdown bool
+	isVCPUsPinningOn  bool
 }
 
 // ID returns the sandbox identifier string.
@@ -1353,6 +1355,10 @@ func (s *Sandbox) CreateContainer(ctx context.Context, contConfig ContainerConfi
 		return nil, err
 	}
 
+	if err = s.checkVCPUsPinning(ctx); err != nil {
+		return nil, err
+	}
+
 	if err = s.storeSandbox(ctx); err != nil {
 		return nil, err
 	}
@@ -1385,6 +1391,10 @@ func (s *Sandbox) StartContainer(ctx context.Context, containerID string) (VCCon
 		return nil, err
 	}
 
+	if err = s.checkVCPUsPinning(ctx); err != nil {
+		return nil, err
+	}
+
 	return c, nil
 }
 
@@ -1457,6 +1467,10 @@ func (s *Sandbox) DeleteContainer(ctx context.Context, containerID string) (VCCo
 		return nil, err
 	}
 
+	if err = s.checkVCPUsPinning(ctx); err != nil {
+		return nil, err
+	}
+
 	if err = s.storeSandbox(ctx); err != nil {
 		return nil, err
 	}
@@ -1522,6 +1536,10 @@ func (s *Sandbox) UpdateContainer(ctx context.Context, containerID string, resou
 		return err
 	}
 
+	if err = s.checkVCPUsPinning(ctx); err != nil {
+		return err
+	}
+
 	if err = s.storeSandbox(ctx); err != nil {
 		return err
 	}
@@ -1640,6 +1658,11 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
 	if err := s.resourceControllerUpdate(ctx); err != nil {
 		return err
 	}
+
+	if err := s.checkVCPUsPinning(ctx); err != nil {
+		return err
+	}
+
 	if err := s.storeSandbox(ctx); err != nil {
 		return err
 	}
@@ -2459,3 +2482,73 @@ func (s *Sandbox) fetchContainers(ctx context.Context) error {
 
 	return nil
 }
+
+// checkVCPUsPinning is used to support CPUSet mode of kata container.
+// CPUSet mode is on when Sandbox.HypervisorConfig.EnableVCPUsPinning
+// is set to true. Then it fetches sandbox's number of vCPU threads
+// and number of CPUs in CPUSet. If the two are equal, each vCPU thread
+// is then pinned to one fixed CPU in CPUSet.
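+// It is re-checked on CreateContainer, StartContainer, DeleteContainer,
+// UpdateContainer and createContainers, since any of these operations may
+// change the sandbox's CPUSet.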
+func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
+	if s.config == nil {
+		return fmt.Errorf("no hypervisor config found")
+	}
+	if !s.config.HypervisorConfig.EnableVCPUsPinning {
+		return nil
+	}
+
+	// fetch vCPU thread ids and CPUSet
+	vCPUThreadsMap, err := s.hypervisor.GetThreadIDs(ctx)
+	if err != nil {
+		return fmt.Errorf("failed to get vCPU thread ids from hypervisor: %v", err)
+	}
+	cpuSetStr, _, err := s.getSandboxCPUSet()
+	if err != nil {
+		return fmt.Errorf("failed to get CPUSet config: %v", err)
+	}
+	cpuSet, err := cpuset.Parse(cpuSetStr)
+	if err != nil {
+		return fmt.Errorf("failed to parse CPUSet string: %v", err)
+	}
+	cpuSetSlice := cpuSet.ToSlice()
+
+	// check if vCPU thread numbers and CPU numbers are equal
+	numVCPUs, numCPUs := len(vCPUThreadsMap.vcpus), len(cpuSetSlice)
+	// if not equal, we should reset threads scheduling to random pattern
+	if numVCPUs != numCPUs {
+		if s.isVCPUsPinningOn {
+			s.isVCPUsPinningOn = false
+			return s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice)
+		}
+		return nil
+	}
+
+	// if equal, we can now start vCPU threads pinning
+	i := 0
+	for _, tid := range vCPUThreadsMap.vcpus {
+		unixCPUSet := unix.CPUSet{}
+		unixCPUSet.Set(cpuSetSlice[i])
+		if err := unix.SchedSetaffinity(tid, &unixCPUSet); err != nil {
+			if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
+				return err
+			}
+			return fmt.Errorf("failed to set vcpu thread %d affinity to cpu %d: %v", tid, cpuSetSlice[i], err)
+		}
+		i++
+	}
+	s.isVCPUsPinningOn = true
+	return nil
+}
+
+// resetVCPUsPinning cancels current pinning and restores default random vCPU thread
+// scheduling, i.e. every vCPU thread may again run on any CPU in the sandbox's CPUSet
+func (s *Sandbox) resetVCPUsPinning(ctx context.Context, vCPUThreadsMap VcpuThreadIDs, cpuSetSlice []int) error {
+	unixCPUSet := unix.CPUSet{}
+	for _, cpuId := range cpuSetSlice {
+		unixCPUSet.Set(cpuId)
+	}
+	for _, tid := range vCPUThreadsMap.vcpus {
+		if err := unix.SchedSetaffinity(tid, &unixCPUSet); err != nil {
+			return fmt.Errorf("failed to reset vcpu thread %d affinity to default mode: %v", tid, err)
+		}
+	}
+	return nil
+}