runtime-rs: add support for core scheduling

Linux 5.14 supports core scheduling to have better security control
for SMT siblings. This PR supports that.

Fixes: #4429
Signed-off-by: Ji-Xinyou <jerryji0414@outlook.com>
This commit is contained in:
Ji-Xinyou 2022-08-02 17:54:04 +08:00
parent 993ae24080
commit 591dfa4fe6
3 changed files with 128 additions and 1 deletions

View File

@ -0,0 +1,105 @@
// Copyright (c) 2019-2022 Alibaba Cloud
// Copyright (c) 2019-2022 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//
//
// Core Scheduling landed in linux 5.14, this enables that -
// ONLY the processes have the same cookie value can share an SMT core for security
// reasons, since SMT siblings share their cpu caches and many other things. This can
// prevent some malicious processes steal others' private information.
//
// This is enabled by containerd, see https://github.com/containerd/containerd/blob/main/docs/man/containerd-config.toml.5.md#format
//
// This is done by using system call prctl(), for core scheduling purpose, it is defined as
// int prctl(PR_SCHED_CORE, int cs_command, pid_t pid, enum pid_type type,
// unsigned long *cookie);
//
// You may go to https://lwn.net/Articles/861251/, https://lore.kernel.org/lkml/20210422123309.039845339@infradead.org/
// and kernel.org/doc/html/latest/admin-guide/hw-vuln/core-scheduling.html for more info.
//
use anyhow::Result;
use nix::{self, errno::Errno};
#[allow(dead_code)]
pub const PID_GROUP: usize = 0;
#[allow(dead_code)]
pub const THREAD_GROUP: usize = 1;
pub const PROCESS_GROUP: usize = 2;
#[allow(dead_code)]
pub const PR_SCHED_CORE: i32 = 62;
pub const PR_SCHED_CORE_CREATE: usize = 1;
pub const PR_SCHED_CORE_SHARE_FROM: usize = 3;
// create a new core sched domain, this will NOT succeed if kernel version < 5.14
pub fn core_sched_create(pidtype: usize) -> Result<(), Errno> {
let errno = unsafe { nix::libc::prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, pidtype, 0) };
if errno != 0 {
Err(nix::errno::Errno::from_i32(errno))
} else {
Ok(())
}
}
// shares the domain with *pid*
#[allow(dead_code)]
pub fn core_sched_share_from(pid: usize, pidtype: usize) -> Result<(), Errno> {
let errno =
unsafe { nix::libc::prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, pid, pidtype, 0) };
if errno != 0 {
Err(nix::errno::Errno::from_i32(errno))
} else {
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
use nix::errno::Errno::{EINVAL, ENODEV, ENOMEM, EPERM, ESRCH};
const RELEASE_MAJOR_VERSION: u8 = 5;
const RELEASE_MINOR_VERSION: u8 = 14;
// since this feature only lands in linux 5.14, we run the test when version is higher
fn core_sched_landed() -> bool {
let vinfo = std::fs::read_to_string("/proc/sys/kernel/osrelease");
if let Ok(info) = vinfo {
let vnum: Vec<&str> = info.as_str().split('.').collect();
if vnum.len() >= 2 {
let major: u8 = vnum[0].parse().unwrap();
let minor: u8 = vnum[1].parse().unwrap();
return major >= RELEASE_MAJOR_VERSION && minor >= RELEASE_MINOR_VERSION;
}
}
false
}
#[test]
fn test_core_sched() {
std::env::set_var("SCHED_CORE", "1");
assert_eq!(std::env::var("SCHED_CORE").unwrap(), "1");
if core_sched_landed() {
// it is possible that the machine running this test does not support SMT,
// therefore it does not make sense to assert a successful prctl call
// but we can still make sure that the return value is a possible value
let e = core_sched_create(PROCESS_GROUP);
match e {
Err(errno) => {
if errno != EINVAL
&& errno != ENODEV
&& errno != ENOMEM
&& errno != EPERM
&& errno != ESRCH
{
panic!("impossible return value {:?}", errno);
}
}
Ok(()) => {}
}
}
}
}

View File

@ -17,6 +17,7 @@ mod logger;
mod panic_hook;
mod shim;
pub use shim::ShimExecutor;
mod core_sched;
#[rustfmt::skip]
pub mod config;
mod shim_delete;

View File

@ -8,9 +8,10 @@ use std::os::unix::io::RawFd;
use anyhow::{Context, Result};
use kata_sys_util::spec::get_bundle_path;
use nix::errno::Errno;
use crate::{
logger,
core_sched, logger,
shim::{ShimExecutor, ENV_KATA_RUNTIME_BIND_FD},
Error,
};
@ -23,6 +24,12 @@ impl ShimExecutor {
let path = bundle_path.join("log");
let _logger_guard =
logger::set_logger(path.to_str().unwrap(), &sid, self.args.debug).context("set logger");
if try_core_sched().is_err() {
warn!(
sl!(),
"Failed to enable core sched since prctl() returns non-zero value."
);
}
self.do_run()
.await
@ -62,3 +69,17 @@ fn get_server_fd() -> Result<RawFd> {
.map_err(|_| Error::ServerFd(env_fd))?;
Ok(fd)
}
// TODO: currently we log a warning on fail (i.e. kernel version < 5.14), maybe just exit
fn try_core_sched() -> Result<(), Errno> {
if let Ok(v) = std::env::var("SCHED_CORE") {
info!(
sl!(),
"containerd wants to enable core scheduling, SCHED_CORE={}(expected 1)", v
);
if !v.is_empty() {
return core_sched::core_sched_create(core_sched::PROCESS_GROUP);
}
}
Ok(())
}