mirror of
				https://github.com/kata-containers/kata-containers.git
				synced 2025-10-24 21:51:37 +00:00 
			
		
		
		
	runtime-rs: add support for core scheduling
Linux 5.14 supports core scheduling to have better security control for SMT siblings. This PR supports that. Fixes: #4429 Signed-off-by: Ji-Xinyou <jerryji0414@outlook.com>
This commit is contained in:
		
							
								
								
									
										105
									
								
								src/runtime-rs/crates/shim/src/core_sched.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										105
									
								
								src/runtime-rs/crates/shim/src/core_sched.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,105 @@ | ||||
| // Copyright (c) 2019-2022 Alibaba Cloud | ||||
| // Copyright (c) 2019-2022 Ant Group | ||||
| // | ||||
| // SPDX-License-Identifier: Apache-2.0 | ||||
| // | ||||
|  | ||||
| // | ||||
| // Core scheduling landed in Linux 5.14, and this module enables it - | ||||
| // ONLY processes that have the same cookie value can share an SMT core. This is for | ||||
| // security reasons, since SMT siblings share their CPU caches and many other things. It can | ||||
| // prevent malicious processes from stealing others' private information. | ||||
| // | ||||
| // This is enabled by containerd, see https://github.com/containerd/containerd/blob/main/docs/man/containerd-config.toml.5.md#format | ||||
| // | ||||
| // This is done with the prctl() system call; for core scheduling purposes, it is defined as | ||||
| // int prctl(PR_SCHED_CORE, int cs_command, pid_t pid, enum pid_type type, | ||||
| //           unsigned long *cookie); | ||||
| // | ||||
| // You may go to https://lwn.net/Articles/861251/, https://lore.kernel.org/lkml/20210422123309.039845339@infradead.org/ | ||||
| // and kernel.org/doc/html/latest/admin-guide/hw-vuln/core-scheduling.html for more info. | ||||
| // | ||||
|  | ||||
| use anyhow::Result; | ||||
| use nix::{self, errno::Errno}; | ||||
|  | ||||
| // Values for the `pidtype` argument of the core scheduling helpers in this file; the | ||||
| // value is forwarded to prctl() as its fourth argument. | ||||
| // NOTE(review): these appear to mirror the kernel's PR_SCHED_CORE_SCOPE_* values | ||||
| // (thread, thread group, process group) -- confirm against <linux/prctl.h>. | ||||
| #[allow(dead_code)] | ||||
| pub const PID_GROUP: usize = 0; | ||||
| #[allow(dead_code)] | ||||
| pub const THREAD_GROUP: usize = 1; | ||||
| pub const PROCESS_GROUP: usize = 2; | ||||
|  | ||||
| // prctl() option that selects the core scheduling interface (first prctl() argument). | ||||
| #[allow(dead_code)] | ||||
| pub const PR_SCHED_CORE: i32 = 62; | ||||
| // `cs_command` values, passed as the second prctl() argument. | ||||
| pub const PR_SCHED_CORE_CREATE: usize = 1; | ||||
| pub const PR_SCHED_CORE_SHARE_FROM: usize = 3; | ||||
|  | ||||
| // Create a new core sched domain for the tasks selected by `pidtype`, using a pid | ||||
| // argument of 0. This will NOT succeed if kernel version < 5.14. | ||||
| // | ||||
| // Returns Ok(()) when prctl() returns 0; otherwise returns the errno that the failed | ||||
| // call left behind (e.g. EINVAL, ENODEV, ENOMEM, EPERM or ESRCH). | ||||
| pub fn core_sched_create(pidtype: usize) -> Result<(), Errno> { | ||||
|     // prctl() is variadic and reads each optional argument as an unsigned long, so the | ||||
|     // unused pid and cookie slots are passed with that width explicitly. | ||||
|     let zero: nix::libc::c_ulong = 0; | ||||
|     // SAFETY: only integer arguments are passed; no pointer is handed to the kernel. | ||||
|     let ret = | ||||
|         unsafe { nix::libc::prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, zero, pidtype, zero) }; | ||||
|     if ret == 0 { | ||||
|         Ok(()) | ||||
|     } else { | ||||
|         // prctl() reports failure by returning -1; the real error code is stored in the | ||||
|         // calling thread's errno, not in the return value. | ||||
|         Err(Errno::last()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| // Shares the core sched domain with *pid*, for the tasks selected by `pidtype`. | ||||
| // | ||||
| // Returns Ok(()) when prctl() returns 0; otherwise returns the errno that the failed | ||||
| // call left behind. | ||||
| #[allow(dead_code)] | ||||
| pub fn core_sched_share_from(pid: usize, pidtype: usize) -> Result<(), Errno> { | ||||
|     // prctl() is variadic and reads each optional argument as an unsigned long, so the | ||||
|     // unused cookie slot is passed with that width explicitly. | ||||
|     let zero: nix::libc::c_ulong = 0; | ||||
|     // SAFETY: only integer arguments are passed; no pointer is handed to the kernel. | ||||
|     let ret = | ||||
|         unsafe { nix::libc::prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, pid, pidtype, zero) }; | ||||
|     if ret == 0 { | ||||
|         Ok(()) | ||||
|     } else { | ||||
|         // prctl() reports failure by returning -1; the real error code is stored in the | ||||
|         // calling thread's errno, not in the return value. | ||||
|         Err(Errno::last()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use super::*; | ||||
|     use nix::errno::Errno::{EINVAL, ENODEV, ENOMEM, EPERM, ESRCH}; | ||||
|  | ||||
|     // first kernel release that ships core scheduling | ||||
|     const RELEASE_MAJOR_VERSION: u32 = 5; | ||||
|     const RELEASE_MINOR_VERSION: u32 = 14; | ||||
|  | ||||
|     // parse the leading decimal digits of a version component, e.g. "14-rc1" -> 14 | ||||
|     fn leading_number(component: &str) -> Option<u32> { | ||||
|         let end = component | ||||
|             .find(|c: char| !c.is_ascii_digit()) | ||||
|             .unwrap_or(component.len()); | ||||
|         component[..end].parse().ok() | ||||
|     } | ||||
|  | ||||
|     // whether a kernel release string such as "5.15.0-generic" is at least 5.14; | ||||
|     // anything that cannot be parsed is treated as "not supported" | ||||
|     fn release_supports_core_sched(release: &str) -> bool { | ||||
|         let mut parts = release.trim().split('.'); | ||||
|         let major = parts.next().and_then(leading_number); | ||||
|         let minor = parts.next().and_then(leading_number); | ||||
|         match (major, minor) { | ||||
|             // compare (major, minor) as a pair so that e.g. 6.1 counts as newer than 5.14 | ||||
|             (Some(major), Some(minor)) => { | ||||
|                 (major, minor) >= (RELEASE_MAJOR_VERSION, RELEASE_MINOR_VERSION) | ||||
|             } | ||||
|             _ => false, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // since this feature only lands in linux 5.14, we run the test when version is higher | ||||
|     fn core_sched_landed() -> bool { | ||||
|         match std::fs::read_to_string("/proc/sys/kernel/osrelease") { | ||||
|             Ok(release) => release_supports_core_sched(&release), | ||||
|             Err(_) => false, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn test_release_supports_core_sched() { | ||||
|         assert!(!release_supports_core_sched("4.19.0")); | ||||
|         assert!(!release_supports_core_sched("5.13.9")); | ||||
|         assert!(release_supports_core_sched("5.14.0-generic\n")); | ||||
|         assert!(release_supports_core_sched("5.15.0")); | ||||
|         assert!(release_supports_core_sched("6.1.0")); | ||||
|         assert!(!release_supports_core_sched("")); | ||||
|         assert!(!release_supports_core_sched("not-a-version")); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn test_core_sched() { | ||||
|         std::env::set_var("SCHED_CORE", "1"); | ||||
|         assert_eq!(std::env::var("SCHED_CORE").unwrap(), "1"); | ||||
|         if core_sched_landed() { | ||||
|             // it is possible that the machine running this test does not support SMT, | ||||
|             // therefore it does not make sense to assert a successful prctl call | ||||
|             // but we can still make sure that the return value is a possible value | ||||
|             if let Err(errno) = core_sched_create(PROCESS_GROUP) { | ||||
|                 if errno != EINVAL | ||||
|                     && errno != ENODEV | ||||
|                     && errno != ENOMEM | ||||
|                     && errno != EPERM | ||||
|                     && errno != ESRCH | ||||
|                 { | ||||
|                     panic!("impossible return value {:?}", errno); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
| @@ -17,6 +17,7 @@ mod logger; | ||||
| mod panic_hook; | ||||
| mod shim; | ||||
| pub use shim::ShimExecutor; | ||||
| mod core_sched; | ||||
| #[rustfmt::skip] | ||||
| pub mod config; | ||||
| mod shim_delete; | ||||
|   | ||||
| @@ -8,9 +8,10 @@ use std::os::unix::io::RawFd; | ||||
|  | ||||
| use anyhow::{Context, Result}; | ||||
| use kata_sys_util::spec::get_bundle_path; | ||||
| use nix::errno::Errno; | ||||
|  | ||||
| use crate::{ | ||||
|     logger, | ||||
|     core_sched, logger, | ||||
|     shim::{ShimExecutor, ENV_KATA_RUNTIME_BIND_FD}, | ||||
|     Error, | ||||
| }; | ||||
| @@ -23,6 +24,12 @@ impl ShimExecutor { | ||||
|         let path = bundle_path.join("log"); | ||||
|         let _logger_guard = | ||||
|             logger::set_logger(path.to_str().unwrap(), &sid, self.args.debug).context("set logger"); | ||||
|         if try_core_sched().is_err() { | ||||
|             warn!( | ||||
|                 sl!(), | ||||
|                 "Failed to enable core sched since prctl() returns non-zero value." | ||||
|             ); | ||||
|         } | ||||
|  | ||||
|         self.do_run() | ||||
|             .await | ||||
| @@ -62,3 +69,17 @@ fn get_server_fd() -> Result<RawFd> { | ||||
|         .map_err(|_| Error::ServerFd(env_fd))?; | ||||
|     Ok(fd) | ||||
| } | ||||
|  | ||||
| // Enable core scheduling when the SCHED_CORE environment variable asks for it. | ||||
| // Returns Ok(()) when the variable cannot be read or is empty, otherwise the | ||||
| // result of creating a new core sched domain for the process group. | ||||
| // TODO: currently we log a warning on fail (i.e. kernel version < 5.14), maybe just exit | ||||
| fn try_core_sched() -> Result<(), Errno> { | ||||
|     let requested = match std::env::var("SCHED_CORE") { | ||||
|         Ok(value) => value, | ||||
|         // variable could not be read: nothing was requested | ||||
|         Err(_) => return Ok(()), | ||||
|     }; | ||||
|     info!( | ||||
|         sl!(), | ||||
|         "containerd wants to enable core scheduling, SCHED_CORE={}(expected 1)", requested | ||||
|     ); | ||||
|     if requested.is_empty() { | ||||
|         return Ok(()); | ||||
|     } | ||||
|     core_sched::core_sched_create(core_sched::PROCESS_GROUP) | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user