From 22b6a94a848b1e06ba5145bf7777b15d124d259f Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 24 May 2022 11:23:34 -0700 Subject: [PATCH 1/2] shim: add support for core scheduling In linux 5.14 and hopefully some backports, core scheduling allows processes to be co-scheduled within the same domain on SMT enabled systems. Containerd impl sets the core sched domain when launching a shim. This allows a clean way for each shim(container/pod) to be in its own domain and any additional containers, (v2 pods) be launched with the same domain as well as any exec'd process added to the container. kernel docs: https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/core-scheduling.html For Kata specifically, we will look for SCHED_CORE environment variable to be set to indicate we should create a new schedule core domain. This is equivalent to the containerd shim's PR: https://github.com/containerd/containerd/commit/e48bbe83949a43dedd3e2727452259f99dd81635 Fixes: #4309 Signed-off-by: Eric Ernst Signed-off-by: Michael Crosby --- src/runtime/pkg/containerd-shim-v2/service.go | 12 +++++++ src/runtime/pkg/utils/schedcore.go | 36 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 src/runtime/pkg/utils/schedcore.go diff --git a/src/runtime/pkg/containerd-shim-v2/service.go b/src/runtime/pkg/containerd-shim-v2/service.go index 72f3f14a04..27ebe19268 100644 --- a/src/runtime/pkg/containerd-shim-v2/service.go +++ b/src/runtime/pkg/containerd-shim-v2/service.go @@ -10,6 +10,7 @@ import ( "io" "os" sysexec "os/exec" + goruntime "runtime" "sync" "syscall" "time" @@ -31,6 +32,7 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" + "github.com/kata-containers/kata-containers/src/runtime/pkg/utils" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" 
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/compatoci" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" @@ -234,9 +236,19 @@ func (s *service) StartShim(ctx context.Context, opts cdshim.StartOpts) (_ strin cmd.ExtraFiles = append(cmd.ExtraFiles, f) + goruntime.LockOSThread() + if os.Getenv("SCHED_CORE") != "" { + if err := utils.Create(utils.ProcessGroup); err != nil { + return "", errors.Wrap(err, "enable sched core support") + } + } + if err := cmd.Start(); err != nil { return "", err } + + goruntime.UnlockOSThread() + defer func() { if retErr != nil { cmd.Process.Kill() diff --git a/src/runtime/pkg/utils/schedcore.go b/src/runtime/pkg/utils/schedcore.go new file mode 100644 index 0000000000..e5084bfd9a --- /dev/null +++ b/src/runtime/pkg/utils/schedcore.go @@ -0,0 +1,36 @@ +// Copyright (c) 2022 Apple Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// + +package utils + +import ( + "golang.org/x/sys/unix" +) + +// PidType is the type of provided pid value and how it should be treated +type PidType int + +const ( + pidTypePid = 0 + pidTypeThreadGroupId = 1 + pidTypeProcessGroupId = 2 + + // Pid affects the current pid + Pid PidType = pidtypePid + // ThreadGroup affects all threads in the group + ThreadGroup PidType = pidtypeTgid + // ProcessGroup affects all processes in the group + ProcessGroup PidType = pidtypePgid +) + +// Create a new sched core domain +func Create(t PidType) error { + return unix.Prctl(unix.PR_SCHED_CORE, unix.PR_SCHED_CORE_CREATE, 0, uintptr(t), 0) +} + +// ShareFrom shares the sched core domain from the provided pid +func ShareFrom(pid uint64, t PidType) error { + return unix.Prctl(unix.PR_SCHED_CORE, unix.PR_SCHED_CORE_SHARE_FROM, uintptr(pid), uintptr(t), 0) +} From d2df1209a5a94a5d7851ba825db3031db535c203 Mon Sep 17 00:00:00 2001 From: Eric Ernst Date: Tue, 31 May 2022 10:30:22 -0700 Subject: [PATCH 2/2] docs: describe kata handling for core-scheduling Add initial 
documentation for core-scheduling. Signed-off-by: Eric Ernst --- docs/design/README.md | 2 +- docs/design/core-scheduling.md | 12 ++++++++++++ src/runtime/pkg/utils/schedcore.go | 6 +++--- 3 files changed, 16 insertions(+), 4 deletions(-) create mode 100644 docs/design/core-scheduling.md diff --git a/docs/design/README.md b/docs/design/README.md index ad20cd7204..adcffd0196 100644 --- a/docs/design/README.md +++ b/docs/design/README.md @@ -12,7 +12,7 @@ Kata Containers design documents: - [Metrics(Kata 2.0)](kata-2-0-metrics.md) - [Design for Kata Containers `Lazyload` ability with `nydus`](kata-nydus-design.md) - [Design for direct-assigned volume](direct-blk-device-assignment.md) - +- [Design for core-scheduling](core-scheduling.md) --- - [Design proposals](proposals) diff --git a/docs/design/core-scheduling.md b/docs/design/core-scheduling.md new file mode 100644 index 0000000000..7602e21cfe --- /dev/null +++ b/docs/design/core-scheduling.md @@ -0,0 +1,12 @@ +# Core scheduling + +Core scheduling is a Linux kernel feature that allows only trusted tasks to run concurrently on +CPUs sharing compute resources (for example, hyper-threads on a core). + +Containerd versions >= 1.6.4 leverage this to treat all of the processes associated with a +given pod or container as a single group of trusted tasks. To indicate this should be carried +out, containerd sets the `SCHED_CORE` environment variable for each shim it spawns. When this is +set, the Kata Containers shim implementation uses the `prctl` syscall to create a new core scheduling +domain for the shim process itself as well as future VMM processes it will start. + +For more details on the core scheduling feature, see the [Linux documentation](https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/core-scheduling.html). 
diff --git a/src/runtime/pkg/utils/schedcore.go b/src/runtime/pkg/utils/schedcore.go index e5084bfd9a..c35fecef4a 100644 --- a/src/runtime/pkg/utils/schedcore.go +++ b/src/runtime/pkg/utils/schedcore.go @@ -18,11 +18,11 @@ const ( pidTypeProcessGroupId = 2 // Pid affects the current pid - Pid PidType = pidtypePid + Pid PidType = pidTypePid // ThreadGroup affects all threads in the group - ThreadGroup PidType = pidtypeTgid + ThreadGroup PidType = pidTypeThreadGroupId // ProcessGroup affects all processes in the group - ProcessGroup PidType = pidtypePgid + ProcessGroup PidType = pidTypeProcessGroupId ) // Create a new sched core domain