diff --git a/src/runtime-rs/crates/shim/src/core_sched.rs b/src/runtime-rs/crates/shim/src/core_sched.rs
new file mode 100644
index 0000000000..1464b86b39
--- /dev/null
+++ b/src/runtime-rs/crates/shim/src/core_sched.rs
@@ -0,0 +1,105 @@
+// Copyright (c) 2019-2022 Alibaba Cloud
+// Copyright (c) 2019-2022 Ant Group
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+//
+// Core Scheduling landed in Linux 5.14, and this module enables it:
+// ONLY processes that have the same cookie value can share an SMT core, for security
+// reasons, since SMT siblings share their CPU caches and many other things. This can
+// prevent malicious processes from stealing others' private information.
+//
+// This is enabled by containerd, see https://github.com/containerd/containerd/blob/main/docs/man/containerd-config.toml.5.md#format
+//
+// This is done with the prctl() system call, which for core scheduling purposes is defined as
+//     int prctl(PR_SCHED_CORE, int cs_command, pid_t pid, enum pid_type type,
+//               unsigned long *cookie);
+//
+// You may go to https://lwn.net/Articles/861251/, https://lore.kernel.org/lkml/20210422123309.039845339@infradead.org/
+// and kernel.org/doc/html/latest/admin-guide/hw-vuln/core-scheduling.html for more info.
+//
+
+use anyhow::Result;
+use nix::{self, errno::Errno};
+
+#[allow(dead_code)]
+pub const PID_GROUP: usize = 0;
+#[allow(dead_code)]
+pub const THREAD_GROUP: usize = 1;
+pub const PROCESS_GROUP: usize = 2;
+
+#[allow(dead_code)]
+pub const PR_SCHED_CORE: i32 = 62;
+pub const PR_SCHED_CORE_CREATE: usize = 1;
+pub const PR_SCHED_CORE_SHARE_FROM: usize = 3;
+
+// create a new core sched domain for *pidtype*, this will NOT succeed if kernel version < 5.14;
+// the error is read from errno, because prctl() itself only returns -1 when it fails
+pub fn core_sched_create(pidtype: usize) -> Result<(), Errno> {
+    // SAFETY: only integers are passed and the cookie argument is 0, so no pointer is shared
+    let ret = unsafe { nix::libc::prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, pidtype, 0) };
+    if ret != 0 {
+        Err(Errno::last())
+    } else {
+        Ok(())
+    }
+}
+
+// shares the domain with *pid*, applied to the scope selected by *pidtype*;
+// the error is read from errno, because prctl() itself only returns -1 when it fails
+#[allow(dead_code)]
+pub fn core_sched_share_from(pid: usize, pidtype: usize) -> Result<(), Errno> {
+    // SAFETY: only integers are passed and the cookie argument is 0, so no pointer is shared
+    let ret =
+        unsafe { nix::libc::prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, pid, pidtype, 0) };
+    if ret != 0 {
+        Err(Errno::last())
+    } else {
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use nix::errno::Errno::{EINVAL, ENODEV, ENOMEM, EPERM, ESRCH};
+
+    // first kernel release, as (major, minor), that ships core scheduling
+    const MIN_RELEASE: (u32, u32) = (5, 14);
+
+    // parses the leading decimal digits of one release component, e.g. "14-rc1" -> 14
+    fn leading_number(part: &str) -> Option<u32> {
+        let digits = part.split(|c: char| !c.is_ascii_digit()).next()?;
+        digits.parse().ok()
+    }
+
+    // since this feature only lands in linux 5.14, we run the test when version is higher
+    fn core_sched_landed() -> bool {
+        let release = std::fs::read_to_string("/proc/sys/kernel/osrelease").unwrap_or_default();
+        let mut parts = release.split('.').map(leading_number);
+        match (parts.next().flatten(), parts.next().flatten()) {
+            // compare (major, minor) as a pair, so that e.g. 6.0 counts as newer than 5.14
+            (Some(major), Some(minor)) => (major, minor) >= MIN_RELEASE,
+            _ => false,
+        }
+    }
+
+    #[test]
+    fn test_core_sched() {
+        std::env::set_var("SCHED_CORE", "1");
+        assert_eq!(std::env::var("SCHED_CORE").unwrap(), "1");
+        if core_sched_landed() {
+            // it is possible that the machine running this test does not support SMT,
+            // therefore it does not make sense to assert a successful prctl call
+            // but we can still make sure that the return value is a possible value
+            if let Err(errno) = core_sched_create(PROCESS_GROUP) {
+                assert!(
+                    matches!(errno, EINVAL | ENODEV | ENOMEM | EPERM | ESRCH),
+                    "impossible return value {:?}",
+                    errno
+                );
+            }
+        }
+    }
+}
diff --git a/src/runtime-rs/crates/shim/src/lib.rs b/src/runtime-rs/crates/shim/src/lib.rs
index 000c5620a2..2b419061c8 100644
--- a/src/runtime-rs/crates/shim/src/lib.rs
+++ b/src/runtime-rs/crates/shim/src/lib.rs
@@ -17,6 +17,7 @@ mod logger;
 mod panic_hook;
 mod shim;
 pub use shim::ShimExecutor;
+mod core_sched;
 #[rustfmt::skip]
 pub mod config;
 mod shim_delete;
diff --git a/src/runtime-rs/crates/shim/src/shim_run.rs b/src/runtime-rs/crates/shim/src/shim_run.rs
index cde365780e..553b90a2c5 100644
--- a/src/runtime-rs/crates/shim/src/shim_run.rs
+++ b/src/runtime-rs/crates/shim/src/shim_run.rs
@@ -8,9 +8,10 @@ use std::os::unix::io::RawFd;
 
 use anyhow::{Context, Result};
 use kata_sys_util::spec::get_bundle_path;
+use nix::errno::Errno;
 
 use crate::{
-    logger,
+    core_sched, logger,
     shim::{ShimExecutor, ENV_KATA_RUNTIME_BIND_FD},
     Error,
 };
@@ -23,6 +24,12 @@ impl ShimExecutor {
         let path = bundle_path.join("log");
         let _logger_guard =
             logger::set_logger(path.to_str().unwrap(), &sid, self.args.debug).context("set logger");
+        if try_core_sched().is_err() {
+            warn!(
+                sl!(),
+                "Failed to enable core sched since prctl() returns non-zero value."
+            );
+        }
 
         self.do_run()
             .await
@@ -62,3 +69,17 @@ fn get_server_fd() -> Result {
         .map_err(|_| Error::ServerFd(env_fd))?;
     Ok(fd)
 }
+
+// TODO: currently we log a warning on fail (i.e.
+// kernel version < 5.14), maybe just exit
+//
+// Looks up SCHED_CORE in the environment. When the variable cannot be read (unset or not
+// valid unicode) nothing happens and Ok is returned. Otherwise its value is logged, and a
+// non-empty value additionally triggers core_sched::core_sched_create() for PROCESS_GROUP,
+// whose result is handed back to the caller unchanged.
+fn try_core_sched() -> Result<(), Errno> {
+    let requested = match std::env::var("SCHED_CORE") {
+        Ok(value) => value,
+        // nothing was requested, so there is nothing to do
+        Err(_) => return Ok(()),
+    };
+    info!(
+        sl!(),
+        "containerd wants to enable core scheduling, SCHED_CORE={}(expected 1)", requested
+    );
+    if requested.is_empty() {
+        // the variable is present but carries no value: treated as "not requested"
+        return Ok(());
+    }
+    core_sched::core_sched_create(core_sched::PROCESS_GROUP)
+}