runtime: add vCPU threads pinning logic

Add core vCPU threads pinning logic for issue 4476, along with design documentation.

Fixes: #4476
Signed-off-by: LitFlwr0 <861690705@qq.com>
LitFlwr0 2022-06-09 00:25:49 +08:00
parent 288e337a6f
commit 2508d39b7c
9 changed files with 150 additions and 0 deletions

View File

@ -7,6 +7,7 @@ Kata Containers design documents:
- [Design requirements for Kata Containers](kata-design-requirements.md)
- [VSocks](VSocks.md)
- [VCPU handling](vcpu-handling.md)
- [VCPU threads pinning](vcpu-threads-pinning.md)
- [Host cgroups](host-cgroups.md)
- [`Inotify` support](inotify.md)
- [Metrics(Kata 2.0)](kata-2-0-metrics.md)

Binary file not shown (new image added, 193 KiB).

View File

@ -0,0 +1,37 @@
# Design Doc for Kata Containers' VCPUs Pinning Feature
## Background
Currently, the vCPU threads of Kata Containers are scheduled to CPUs at random. Each pod requests a specific set of CPUs, which we call its CPU set (in the same sense as a CPU set in Linux cgroups).
If the number of vCPU threads equals the number of CPUs claimed in the CPU set, we can pin each vCPU thread to one specific CPU and thereby avoid the cost of random scheduling.
## Detailed Design
### Passing Config Parameters
Two ways are provided to enable this vCPU threads pinning feature: through the `QEMU` configuration file or through annotations. In both cases, the pinning parameter is ultimately passed to `HypervisorConfig`.
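For example, enabling pinning through the Kata configuration file only requires setting one flag in the hypervisor section (a minimal sketch; the `[hypervisor.qemu]` section name is assumed to match the generated `configuration.toml`):

```toml
[hypervisor.qemu]
# Pin each vCPU thread to a fixed CPU once
# num(vCPU threads) == num(CPUs in the sandbox's CPUSet)
enable_vcpus_pinning = true
```

Alternatively, the same setting can be supplied per sandbox through the annotation built from the runtime's annotation prefix plus `enable_vcpus_pinning`.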
### Related Linux Thread Scheduling API
| API Info | Value |
|-------------------|-----------------------------------------------------------|
| Package | `golang.org/x/sys/unix` |
| Method | `unix.SchedSetaffinity(thread_id, &unixCPUSet)` |
| Official Doc Page | https://pkg.go.dev/golang.org/x/sys/unix#SchedSetaffinity |
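The following standalone sketch shows how this API is typically used; it is not the runtime's code, and the CPU number is illustrative. Passing `0` as the thread ID makes the call apply to the calling thread:

```go
package main

import (
	"fmt"
	"runtime"

	"golang.org/x/sys/unix"
)

func main() {
	// Keep this goroutine bound to one OS thread so the affinity we set
	// applies to the thread we stay on.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	var cpus unix.CPUSet
	cpus.Zero()
	cpus.Set(0) // allow only CPU 0

	if err := unix.SchedSetaffinity(0, &cpus); err != nil {
		fmt.Printf("failed to set affinity: %v\n", err)
		return
	}
	fmt.Println("calling thread is now pinned to CPU 0")
}
```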
### When is VCPUs Pinning Checked?
As described in the Background section, when `num(vCPU threads) == num(CPUs in CPU set)`, each vCPU thread is pinned to a specified CPU; whenever this condition no longer holds, we restore the original random scheduling pattern.
So when may `num(CPUs in CPU set)` change? There are five possible scenarios, each of which invokes the pinning check as sketched after the table:
| Scenario                          | Related Code                             |
|-----------------------------------|------------------------------------------|
| when creating a container         | `sandbox.go`, method `CreateContainer`   |
| when starting a container         | `sandbox.go`, method `StartContainer`    |
| when deleting a container         | `sandbox.go`, method `DeleteContainer`   |
| when updating a container         | `sandbox.go`, method `UpdateContainer`   |
| when creating multiple containers | `sandbox.go`, method `createContainers`  |
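Each of these call sites invokes the pinning check right after the sandbox's resources have been updated, roughly as follows (an illustrative excerpt mirroring the runtime changes shown further below):

```go
// e.g. near the end of Sandbox.CreateContainer, after the sandbox
// resource controllers have been updated:
if err = s.checkVCPUsPinning(ctx); err != nil {
	return nil, err
}
```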
### Core Pinning Logic
We can split the whole process into the following steps; the related methods are `checkVCPUsPinning` and `resetVCPUsPinning` in `sandbox.go`. A simplified sketch of the flow follows the diagram.
![](arch-images/vcpus-pinning-process.png)
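In outline, the check works as follows (a simplified sketch of `checkVCPUsPinning`, mirroring the implementation shown later in this commit; error handling is omitted for brevity):

```go
// Simplified flow, not the literal implementation.
func (s *Sandbox) checkVCPUsPinningSketch(ctx context.Context) error {
	if !s.config.HypervisorConfig.EnableVCPUsPinning {
		return nil // feature disabled
	}

	// 1. fetch the vCPU thread IDs from the hypervisor and the sandbox CPUSet
	threads, _ := s.hypervisor.GetThreadIDs(ctx)
	cpuSetStr, _, _ := s.getSandboxCPUSet()
	cpus, _ := cpuset.Parse(cpuSetStr)
	cpuSlice := cpus.ToSlice()

	// 2. if the counts differ, fall back to the default scheduling
	if len(threads.vcpus) != len(cpuSlice) {
		if s.isVCPUsPinningOn {
			s.isVCPUsPinningOn = false
			return s.resetVCPUsPinning(ctx, threads, cpuSlice)
		}
		return nil
	}

	// 3. otherwise pin each vCPU thread to one CPU of the CPUSet
	i := 0
	for _, tid := range threads.vcpus {
		var set unix.CPUSet
		set.Set(cpuSlice[i])
		if err := unix.SchedSetaffinity(tid, &set); err != nil {
			return err
		}
		i++
	}
	s.isVCPUsPinningOn = true
	return nil
}
```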

View File

@ -96,6 +96,11 @@ machine_accelerators="@MACHINEACCELERATORS@"
# For example, `cpu_features = "pmu=off,vmx=off"
cpu_features="@CPUFEATURES@"
# vCPUs pinning settings
# If enabled, each vCPU thread is scheduled to a fixed CPU.
# Qualifying condition: num(vCPU threads) == num(CPUs in the sandbox's CPUSet)
# enable_vcpus_pinning = false
# Default number of vCPUs per SB/VM:
# unspecified or 0 --> will be set to @DEFVCPUS@
# < 0 --> will be set to the actual number of physical cores

View File

@ -155,6 +155,7 @@ type hypervisor struct {
DisableSeccomp bool `toml:"disable_seccomp"`
DisableSeLinux bool `toml:"disable_selinux"`
LegacySerial bool `toml:"use_legacy_serial"`
EnableVCPUsPinning bool `toml:"enable_vcpus_pinning"`
}
type runtime struct {
@ -833,6 +834,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
Rootless: h.Rootless,
LegacySerial: h.LegacySerial,
DisableSeLinux: h.DisableSeLinux,
EnableVCPUsPinning: h.EnableVCPUsPinning,
}, nil
}

View File

@ -651,6 +651,12 @@ func addHypervisorCPUOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) e
return err
}
if err := newAnnotationConfiguration(ocispec, vcAnnotations.EnableVCPUsPinning).setBool(func(enableVCPUsPinning bool) {
sbConfig.HypervisorConfig.EnableVCPUsPinning = enableVCPUsPinning
}); err != nil {
return err
}
return newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMaxVCPUs).setUintWithCheck(func(maxVCPUs uint64) error {
max := uint32(maxVCPUs)

View File

@ -559,6 +559,9 @@ type HypervisorConfig struct {
// Use legacy serial for the guest console
LegacySerial bool
// EnableVCPUsPinning controls whether each vCPU thread should be scheduled to a fixed CPU
EnableVCPUsPinning bool
}
// vcpu mapping from vcpu number to thread number

View File

@ -143,6 +143,9 @@ const (
// DefaultMaxVCPUs is a sandbox annotation that specifies the maximum number of vCPUs allocated for the VM by the hypervisor.
DefaultMaxVCPUs = kataAnnotHypervisorPrefix + "default_max_vcpus"
// EnableVCPUsPinning is a sandbox annotation that controls pinning of vCPU threads to fixed CPUs
EnableVCPUsPinning = kataAnnotationsPrefix + "enable_vcpus_pinning"
//
// Memory related annotations
//

View File

@ -44,6 +44,7 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
"golang.org/x/sys/unix"
)
// sandboxTracingTags defines tags for the trace span
@ -236,6 +237,7 @@ type Sandbox struct {
sharePidNs bool
seccompSupported bool
disableVMShutdown bool
isVCPUsPinningOn bool
}
// ID returns the sandbox identifier string.
@ -1353,6 +1355,10 @@ func (s *Sandbox) CreateContainer(ctx context.Context, contConfig ContainerConfi
return nil, err
}
if err = s.checkVCPUsPinning(ctx); err != nil {
return nil, err
}
if err = s.storeSandbox(ctx); err != nil {
return nil, err
}
@ -1385,6 +1391,10 @@ func (s *Sandbox) StartContainer(ctx context.Context, containerID string) (VCCon
return nil, err
}
if err = s.checkVCPUsPinning(ctx); err != nil {
return nil, err
}
return c, nil
}
@ -1457,6 +1467,10 @@ func (s *Sandbox) DeleteContainer(ctx context.Context, containerID string) (VCCo
return nil, err
}
if err = s.checkVCPUsPinning(ctx); err != nil {
return nil, err
}
if err = s.storeSandbox(ctx); err != nil {
return nil, err
}
@ -1522,6 +1536,10 @@ func (s *Sandbox) UpdateContainer(ctx context.Context, containerID string, resou
return err
}
if err = s.checkVCPUsPinning(ctx); err != nil {
return err
}
if err = s.storeSandbox(ctx); err != nil {
return err
}
@ -1640,6 +1658,11 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
if err := s.resourceControllerUpdate(ctx); err != nil {
return err
}
if err := s.checkVCPUsPinning(ctx); err != nil {
return err
}
if err := s.storeSandbox(ctx); err != nil {
return err
}
@ -2459,3 +2482,73 @@ func (s *Sandbox) fetchContainers(ctx context.Context) error {
return nil
}
// checkVCPUsPinning is used to support the CPUSet mode of Kata Containers.
// CPUSet mode is on when HypervisorConfig.EnableVCPUsPinning is set to true.
// The method fetches the sandbox's vCPU thread IDs and the CPUs in its CPUSet;
// if the two counts are equal, each vCPU thread is pinned to one fixed CPU
// in the CPUSet.
func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
if s.config == nil {
return fmt.Errorf("no hypervisor config found")
}
if !s.config.HypervisorConfig.EnableVCPUsPinning {
return nil
}
// fetch vCPU thread ids and CPUSet
vCPUThreadsMap, err := s.hypervisor.GetThreadIDs(ctx)
if err != nil {
return fmt.Errorf("failed to get vCPU thread ids from hypervisor: %v", err)
}
cpuSetStr, _, err := s.getSandboxCPUSet()
if err != nil {
return fmt.Errorf("failed to get CPUSet config: %v", err)
}
cpuSet, err := cpuset.Parse(cpuSetStr)
if err != nil {
return fmt.Errorf("failed to parse CPUSet string: %v", err)
}
cpuSetSlice := cpuSet.ToSlice()
// check if vCPU thread numbers and CPU numbers are equal
numVCPUs, numCPUs := len(vCPUThreadsMap.vcpus), len(cpuSetSlice)
// if not equal, we should reset threads scheduling to random pattern
if numVCPUs != numCPUs {
if s.isVCPUsPinningOn {
s.isVCPUsPinningOn = false
return s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice)
}
return nil
}
// if equal, we can now start vCPU threads pinning
i := 0
for _, tid := range vCPUThreadsMap.vcpus {
unixCPUSet := unix.CPUSet{}
unixCPUSet.Set(cpuSetSlice[i])
if err := unix.SchedSetaffinity(tid, &unixCPUSet); err != nil {
if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
return err
}
return fmt.Errorf("failed to set vcpu thread %d affinity to cpu %d: %v", tid, cpuSetSlice[i], err)
}
i++
}
s.isVCPUsPinningOn = true
return nil
}
// resetVCPUsPinning cancels the current pinning and restores the default
// scheduling pattern, allowing each vCPU thread to run on any CPU in the
// sandbox's CPUSet.
func (s *Sandbox) resetVCPUsPinning(ctx context.Context, vCPUThreadsMap VcpuThreadIDs, cpuSetSlice []int) error {
unixCPUSet := unix.CPUSet{}
// build an affinity mask covering every CPU in the sandbox's CPUSet
for _, cpuId := range cpuSetSlice {
unixCPUSet.Set(cpuId)
}
for _, tid := range vCPUThreadsMap.vcpus {
if err := unix.SchedSetaffinity(tid, &unixCPUSet); err != nil {
return fmt.Errorf("failed to reset vcpu thread %d affinity to default mode: %v", tid, err)
}
}
return nil
}