diff --git a/docs/design/README.md b/docs/design/README.md index ad20cd7204..adcffd0196 100644 --- a/docs/design/README.md +++ b/docs/design/README.md @@ -12,7 +12,7 @@ Kata Containers design documents: - [Metrics(Kata 2.0)](kata-2-0-metrics.md) - [Design for Kata Containers `Lazyload` ability with `nydus`](kata-nydus-design.md) - [Design for direct-assigned volume](direct-blk-device-assignment.md) - +- [Design for core-scheduling](core-scheduling.md) --- - [Design proposals](proposals) diff --git a/docs/design/core-scheduling.md b/docs/design/core-scheduling.md new file mode 100644 index 0000000000..7602e21cfe --- /dev/null +++ b/docs/design/core-scheduling.md @@ -0,0 +1,12 @@ +# Core scheduling + +Core scheduling is a Linux kernel feature that allows only trusted tasks to run concurrently on +CPUs sharing compute resources (for example, hyper-threads on a core). + +Containerd versions >= 1.6.4 leverage this to treat all of the processes associated with a +given pod or container as a single group of trusted tasks. To indicate this should be carried +out, containerd sets the `SCHED_CORE` environment variable for each shim it spawns. When this is +set, the Kata Containers shim implementation uses the `prctl` syscall to create a new core scheduling +domain for the shim process itself as well as future VMM processes it will start. + +For more details on the core scheduling feature, see the [Linux documentation](https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/core-scheduling.html). 
diff --git a/src/runtime/pkg/containerd-shim-v2/service.go b/src/runtime/pkg/containerd-shim-v2/service.go index 72f3f14a04..27ebe19268 100644 --- a/src/runtime/pkg/containerd-shim-v2/service.go +++ b/src/runtime/pkg/containerd-shim-v2/service.go @@ -10,6 +10,7 @@ import ( "io" "os" sysexec "os/exec" + goruntime "runtime" "sync" "syscall" "time" @@ -31,6 +32,7 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" + "github.com/kata-containers/kata-containers/src/runtime/pkg/utils" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/compatoci" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" @@ -234,9 +236,19 @@ func (s *service) StartShim(ctx context.Context, opts cdshim.StartOpts) (_ strin cmd.ExtraFiles = append(cmd.ExtraFiles, f) + goruntime.LockOSThread() + if os.Getenv("SCHED_CORE") != "" { + if err := utils.Create(utils.ProcessGroup); err != nil { + return "", errors.Wrap(err, "enable sched core support") + } + } + if err := cmd.Start(); err != nil { return "", err } + + goruntime.UnlockOSThread() + defer func() { if retErr != nil { cmd.Process.Kill() diff --git a/src/runtime/pkg/utils/schedcore.go b/src/runtime/pkg/utils/schedcore.go new file mode 100644 index 0000000000..c35fecef4a --- /dev/null +++ b/src/runtime/pkg/utils/schedcore.go @@ -0,0 +1,36 @@ +// Copyright (c) 2022 Apple Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +// + +package utils + +import ( + "golang.org/x/sys/unix" +) + +// PidType selects how the pid argument of a PR_SCHED_CORE prctl call is treated; it is passed as the call's fourth argument. +type PidType int + +const ( + pidTypePid = 0 + pidTypeThreadGroupId = 1 + pidTypeProcessGroupId = 2 + + // Pid affects the current pid + Pid PidType = pidTypePid + // ThreadGroup affects all threads in the group + ThreadGroup PidType = pidTypeThreadGroupId + // ProcessGroup affects all processes in the group + ProcessGroup PidType = pidTypeProcessGroupId +) + +// Create creates a new sched core domain by calling prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE) with pid 0 and scope t; the prctl error, if any, is returned. +func Create(t PidType) error { + return unix.Prctl(unix.PR_SCHED_CORE, unix.PR_SCHED_CORE_CREATE, 0, uintptr(t), 0) +} + +// ShareFrom shares the sched core domain from the provided pid by calling prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM) with that pid and scope t; the prctl error, if any, is returned. +func ShareFrom(pid uint64, t PidType) error { + return unix.Prctl(unix.PR_SCHED_CORE, unix.PR_SCHED_CORE_SHARE_FROM, uintptr(pid), uintptr(t), 0) +}