mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-06-27 15:57:09 +00:00
runtime-rs: add support for core scheduling
Linux 5.14 supports core scheduling to have better security control for SMT siblings. This PR supports that. Fixes: #4429 Signed-off-by: Ji-Xinyou <jerryji0414@outlook.com>
This commit is contained in:
parent
993ae24080
commit
591dfa4fe6
105
src/runtime-rs/crates/shim/src/core_sched.rs
Normal file
105
src/runtime-rs/crates/shim/src/core_sched.rs
Normal file
@ -0,0 +1,105 @@
|
||||
// Copyright (c) 2019-2022 Alibaba Cloud
|
||||
// Copyright (c) 2019-2022 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
//
|
||||
// Core Scheduling landed in linux 5.14, this enables that -
|
||||
// ONLY processes that have the same cookie value can share an SMT core for security
|
||||
// reasons, since SMT siblings share their cpu caches and many other things. This can
|
||||
// prevent malicious processes from stealing others' private information.
|
||||
//
|
||||
// This is enabled by containerd, see https://github.com/containerd/containerd/blob/main/docs/man/containerd-config.toml.5.md#format
|
||||
//
|
||||
// This is done by using system call prctl(), for core scheduling purpose, it is defined as
|
||||
// int prctl(PR_SCHED_CORE, int cs_command, pid_t pid, enum pid_type type,
|
||||
// unsigned long *cookie);
|
||||
//
|
||||
// You may go to https://lwn.net/Articles/861251/, https://lore.kernel.org/lkml/20210422123309.039845339@infradead.org/
|
||||
// and kernel.org/doc/html/latest/admin-guide/hw-vuln/core-scheduling.html for more info.
|
||||
//
|
||||
|
||||
use anyhow::Result;
|
||||
use nix::{self, errno::Errno};
|
||||
|
||||
/// Scope selector for `prctl(PR_SCHED_CORE, ...)`: the operation applies only to the
/// single task identified by `pid` (the kernel's `PIDTYPE_PID`).
// Not referenced yet; kept so every scope accepted by the kernel has a named constant.
#[allow(dead_code)]
pub const PID_GROUP: usize = 0;
/// Scope selector: the operation applies to every thread of the task's thread group
/// (the kernel's `PIDTYPE_TGID`).
// Not referenced yet; kept so every scope accepted by the kernel has a named constant.
#[allow(dead_code)]
pub const THREAD_GROUP: usize = 1;
/// Scope selector: the operation applies to every task of the task's process group
/// (the kernel's `PIDTYPE_PGID`).
pub const PROCESS_GROUP: usize = 2;

/// `prctl()` option number that selects the core scheduling interface.
pub const PR_SCHED_CORE: i32 = 62;
/// Core scheduling command: create a new, unique cookie for the target scope.
pub const PR_SCHED_CORE_CREATE: usize = 1;
/// Core scheduling command: pull the cookie of the task `pid` onto the calling task.
pub const PR_SCHED_CORE_SHARE_FROM: usize = 3;
|
||||
|
||||
// create a new core sched domain, this will NOT succeed if kernel version < 5.14
|
||||
pub fn core_sched_create(pidtype: usize) -> Result<(), Errno> {
|
||||
let errno = unsafe { nix::libc::prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, pidtype, 0) };
|
||||
if errno != 0 {
|
||||
Err(nix::errno::Errno::from_i32(errno))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// shares the domain with *pid*
|
||||
#[allow(dead_code)]
|
||||
pub fn core_sched_share_from(pid: usize, pidtype: usize) -> Result<(), Errno> {
|
||||
let errno =
|
||||
unsafe { nix::libc::prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, pid, pidtype, 0) };
|
||||
if errno != 0 {
|
||||
Err(nix::errno::Errno::from_i32(errno))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use nix::errno::Errno::{EINVAL, ENODEV, ENOMEM, EPERM, ESRCH};

    // First kernel release (major.minor) that ships core scheduling.
    const RELEASE_MAJOR_VERSION: u32 = 5;
    const RELEASE_MINOR_VERSION: u32 = 14;

    // Parses the leading decimal digits of a version field, so that fields carrying a
    // suffix (e.g. "0-rc1" or "0-76-generic") still yield their number.
    fn leading_number(field: &str) -> Option<u32> {
        let digits_end = field
            .find(|c: char| !c.is_ascii_digit())
            .unwrap_or(field.len());
        field[..digits_end].parse().ok()
    }

    // Tells whether a kernel release string such as "5.15.0-76-generic" denotes a
    // version >= 5.14. Unparsable strings are treated as "not supported".
    fn release_supports_core_sched(release: &str) -> bool {
        let mut fields = release.trim().split('.').map(leading_number);
        match (fields.next().flatten(), fields.next().flatten()) {
            // Tuple comparison is lexicographic: a newer major wins regardless of the
            // minor, so 6.0 is correctly considered newer than 5.14.
            (Some(major), Some(minor)) => {
                (major, minor) >= (RELEASE_MAJOR_VERSION, RELEASE_MINOR_VERSION)
            }
            _ => false,
        }
    }

    // since this feature only lands in linux 5.14, we run the test when version is higher
    fn core_sched_landed() -> bool {
        std::fs::read_to_string("/proc/sys/kernel/osrelease")
            .map(|release| release_supports_core_sched(&release))
            .unwrap_or(false)
    }

    #[test]
    fn test_release_supports_core_sched() {
        assert!(release_supports_core_sched("5.14.0\n"));
        assert!(release_supports_core_sched("5.15.0-76-generic\n"));
        assert!(release_supports_core_sched("6.0-rc1"));
        assert!(release_supports_core_sched("6.1.0-13-amd64\n"));
        assert!(!release_supports_core_sched("5.13.19"));
        assert!(!release_supports_core_sched("4.19.0"));
        assert!(!release_supports_core_sched("5"));
        assert!(!release_supports_core_sched("garbage"));
        assert!(!release_supports_core_sched(""));
    }

    #[test]
    fn test_core_sched() {
        std::env::set_var("SCHED_CORE", "1");
        assert_eq!(std::env::var("SCHED_CORE").unwrap(), "1");
        if core_sched_landed() {
            // it is possible that the machine running this test does not support SMT,
            // therefore it does not make sense to assert a successful prctl call
            // but we can still make sure that the return value is a possible value
            if let Err(errno) = core_sched_create(PROCESS_GROUP) {
                assert!(
                    matches!(errno, EINVAL | ENODEV | ENOMEM | EPERM | ESRCH),
                    "impossible return value {:?}",
                    errno
                );
            }
        }
    }
}
|
@ -17,6 +17,7 @@ mod logger;
|
||||
mod panic_hook;
|
||||
mod shim;
|
||||
pub use shim::ShimExecutor;
|
||||
mod core_sched;
|
||||
#[rustfmt::skip]
|
||||
pub mod config;
|
||||
mod shim_delete;
|
||||
|
@ -8,9 +8,10 @@ use std::os::unix::io::RawFd;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kata_sys_util::spec::get_bundle_path;
|
||||
use nix::errno::Errno;
|
||||
|
||||
use crate::{
|
||||
logger,
|
||||
core_sched, logger,
|
||||
shim::{ShimExecutor, ENV_KATA_RUNTIME_BIND_FD},
|
||||
Error,
|
||||
};
|
||||
@ -23,6 +24,12 @@ impl ShimExecutor {
|
||||
let path = bundle_path.join("log");
|
||||
let _logger_guard =
|
||||
logger::set_logger(path.to_str().unwrap(), &sid, self.args.debug).context("set logger");
|
||||
if try_core_sched().is_err() {
|
||||
warn!(
|
||||
sl!(),
|
||||
"Failed to enable core sched since prctl() returns non-zero value."
|
||||
);
|
||||
}
|
||||
|
||||
self.do_run()
|
||||
.await
|
||||
@ -62,3 +69,17 @@ fn get_server_fd() -> Result<RawFd> {
|
||||
.map_err(|_| Error::ServerFd(env_fd))?;
|
||||
Ok(fd)
|
||||
}
|
||||
|
||||
// TODO: currently we log a warning on fail (i.e. kernel version < 5.14), maybe just exit
|
||||
fn try_core_sched() -> Result<(), Errno> {
|
||||
if let Ok(v) = std::env::var("SCHED_CORE") {
|
||||
info!(
|
||||
sl!(),
|
||||
"containerd wants to enable core scheduling, SCHED_CORE={}(expected 1)", v
|
||||
);
|
||||
if !v.is_empty() {
|
||||
return core_sched::core_sched_create(core_sched::PROCESS_GROUP);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user