From a9c746f28422cfec493cb9a4ad4135c79221c2ea Mon Sep 17 00:00:00 2001 From: Manabu Sugimoto Date: Sun, 7 Aug 2022 18:53:56 +0900 Subject: [PATCH 1/5] kernel: Add kernel configs for SELinux Add kernel configs related to SELinux in order to add the support for containers running inside the guest. Fixes: #4812 Signed-off-by: Manabu Sugimoto --- .../kernel/configs/fragments/common/lsm.conf | 12 ++++++++++++ tools/packaging/kernel/kata_config_version | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 tools/packaging/kernel/configs/fragments/common/lsm.conf diff --git a/tools/packaging/kernel/configs/fragments/common/lsm.conf b/tools/packaging/kernel/configs/fragments/common/lsm.conf new file mode 100644 index 0000000000..6dc685fca7 --- /dev/null +++ b/tools/packaging/kernel/configs/fragments/common/lsm.conf @@ -0,0 +1,12 @@ +# SELinux support: +CONFIG_AUDIT=y +CONFIG_AUDITSYSCALL=y +CONFIG_LSM_MMAP_MIN_ADDR=65536 +CONFIG_NETWORK_SECMARK=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +CONFIG_SECURITY_SELINUX_DEVELOP=y +CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=0 +CONFIG_SECURITY_SELINUX_SIDTAB_HASH_BITS=9 +CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE=256 diff --git a/tools/packaging/kernel/kata_config_version b/tools/packaging/kernel/kata_config_version index f906e1845d..c17e934b51 100644 --- a/tools/packaging/kernel/kata_config_version +++ b/tools/packaging/kernel/kata_config_version @@ -1 +1 @@ -96 +97 From a75f99d20d6310039d2bf675e275e8a1b9e66177 Mon Sep 17 00:00:00 2001 From: Manabu Sugimoto Date: Mon, 8 Aug 2022 09:42:41 +0900 Subject: [PATCH 2/5] osbuilder: Create guest image for SELinux Create a guest image to support SELinux for containers inside the guest if `SELINUX=yes` is specified. This works only if the guest rootfs is CentOS and the init service is systemd, not the agent init. To enable labeling the guest image on the host, selinuxfs must be mounted on the host. 
The kata-agent will be labeled as `container_runtime_exec_t` type. Fixes: #4812 Signed-off-by: Manabu Sugimoto --- .../osbuilder/image-builder/image_builder.sh | 45 ++++++++++++++++++- .../osbuilder/rootfs-builder/centos/config.sh | 5 +++ tools/osbuilder/rootfs-builder/rootfs.sh | 16 +++++++ tools/osbuilder/scripts/lib.sh | 18 +++++++- 4 files changed, 82 insertions(+), 2 deletions(-) diff --git a/tools/osbuilder/image-builder/image_builder.sh b/tools/osbuilder/image-builder/image_builder.sh index 8b65ab4e8f..75b23b1765 100755 --- a/tools/osbuilder/image-builder/image_builder.sh +++ b/tools/osbuilder/image-builder/image_builder.sh @@ -64,6 +64,8 @@ readonly -a systemd_files=( # Set a default value AGENT_INIT=${AGENT_INIT:-no} +SELINUX=${SELINUX:-no} +SELINUXFS="/sys/fs/selinux" # Align image to 128M readonly mem_boundary_mb=128 @@ -93,6 +95,10 @@ Extra environment variables: DEFAULT: not set USE_PODMAN: If set and USE_DOCKER not set, will build image in a Podman Container (requries podman) DEFAULT: not set + SELINUX: If set to "yes", the rootfs is labeled for SELinux. + Make sure that selinuxfs is mounted to /sys/fs/selinux on the host + and the rootfs is built with SELINUX=yes. 
+ DEFAULT value: "no" Following diagram shows how the resulting image will look like @@ -134,6 +140,7 @@ build_with_container() { local nsdax_bin="$9" local container_image_name="image-builder-osbuilder" local shared_files="" + local selinuxfs="" image_dir=$(readlink -f "$(dirname "${image}")") image_name=$(basename "${image}") @@ -157,6 +164,14 @@ build_with_container() { shared_files+="-v ${mke2fs_conf}:${mke2fs_conf}:ro " fi + if [ "${SELINUX}" == "yes" ]; then + if mountpoint "${SELINUXFS}" > /dev/null; then + selinuxfs="-v ${SELINUXFS}:${SELINUXFS}" + else + die "Make sure that SELinux is enabled on the host" + fi + fi + #Make sure we use a compatible runtime to build rootfs # In case Clear Containers Runtime is installed we dont want to hit issue: #https://github.com/clearcontainers/runtime/issues/828 @@ -170,12 +185,14 @@ build_with_container() { --env BLOCK_SIZE="${block_size}" \ --env ROOT_FREE_SPACE="${root_free_space}" \ --env NSDAX_BIN="${nsdax_bin}" \ + --env SELINUX="${SELINUX}" \ --env DEBUG="${DEBUG}" \ -v /dev:/dev \ -v "${script_dir}":"/osbuilder" \ -v "${script_dir}/../scripts":"/scripts" \ -v "${rootfs}":"/rootfs" \ -v "${image_dir}":"/image" \ + ${selinuxfs} \ ${shared_files} \ ${container_image_name} \ bash "/osbuilder/${script_name}" -o "/image/${image_name}" /rootfs @@ -384,6 +401,7 @@ create_rootfs_image() { local img_size="$3" local fs_type="$4" local block_size="$5" + local agent_bin="$6" create_disk "${image}" "${img_size}" "${fs_type}" "${rootfs_start}" @@ -402,6 +420,31 @@ create_rootfs_image() { info "Copying content from rootfs to root partition" cp -a "${rootfs}"/* "${mount_dir}" + + if [ "${SELINUX}" == "yes" ]; then + if [ "${AGENT_INIT}" == "yes" ]; then + die "Guest SELinux with the agent init is not supported yet" + fi + + info "Labeling rootfs for SELinux" + local selinuxfs_path="${mount_dir}${SELINUXFS}" + mkdir -p "${selinuxfs_path}" + if mountpoint "${SELINUXFS}" > /dev/null && \ + chroot "${mount_dir}" command -v restorecon > /dev/null; 
then + mount -t selinuxfs selinuxfs "${selinuxfs_path}" + chroot "${mount_dir}" restorecon -RF -e "${SELINUXFS}" / + # TODO: This operation will be removed after the updated container-selinux that + # includes the following commit is released. + # https://github.com/containers/container-selinux/commit/39f83cc74d50bd10ab6be4d0bdd98bc04857469f + # We use chcon as an interim solution until then. + chroot "${mount_dir}" chcon -t container_runtime_exec_t "/usr/bin/${agent_bin}" + umount "${selinuxfs_path}" + else + die "Could not label the rootfs. Make sure that SELinux is enabled on the host \ +and the rootfs is built with SELINUX=yes" + fi + fi + sync OK "rootfs copied" @@ -529,7 +572,7 @@ main() { # consider in calculate_img_size rootfs_img_size=$((img_size - dax_header_sz)) create_rootfs_image "${rootfs}" "${image}" "${rootfs_img_size}" \ - "${fs_type}" "${block_size}" + "${fs_type}" "${block_size}" "${agent_bin}" # insert at the beginning of the image the MBR + DAX header set_dax_header "${image}" "${img_size}" "${fs_type}" "${nsdax_bin}" diff --git a/tools/osbuilder/rootfs-builder/centos/config.sh b/tools/osbuilder/rootfs-builder/centos/config.sh index 7226da047c..2123903a06 100644 --- a/tools/osbuilder/rootfs-builder/centos/config.sh +++ b/tools/osbuilder/rootfs-builder/centos/config.sh @@ -8,10 +8,15 @@ OS_VERSION=${OS_VERSION:-stream9} PACKAGES="chrony iptables" [ "$AGENT_INIT" = no ] && PACKAGES+=" systemd" [ "$SECCOMP" = yes ] && PACKAGES+=" libseccomp" +[ "$SELINUX" = yes ] && PACKAGES+=" container-selinux" # Container registry tag is different from metalink repo, e.g. 
"stream9" => "9-stream" os_repo_version="$(sed -E "s/(stream)(.+)/\2-\1/" <<< "$OS_VERSION")" METALINK="https://mirrors.centos.org/metalink?repo=centos-baseos-$os_repo_version&arch=\$basearch" +if [ "$SELINUX" == yes ]; then + # AppStream repository is required for the container-selinux package + METALINK_APPSTREAM="https://mirrors.centos.org/metalink?repo=centos-appstream-$os_repo_version&arch=\$basearch" +fi GPG_KEY_FILE=RPM-GPG-KEY-CentOS-Official GPG_KEY_URL="https://centos.org/keys/$GPG_KEY_FILE" diff --git a/tools/osbuilder/rootfs-builder/rootfs.sh b/tools/osbuilder/rootfs-builder/rootfs.sh index c69f8dfef0..43c79fd7d5 100755 --- a/tools/osbuilder/rootfs-builder/rootfs.sh +++ b/tools/osbuilder/rootfs-builder/rootfs.sh @@ -25,6 +25,7 @@ LIBC=${LIBC:-musl} # The kata agent enables seccomp feature. # However, it is not enforced by default: you need to enable that in the main configuration file. SECCOMP=${SECCOMP:-"yes"} +SELINUX=${SELINUX:-"no"} lib_file="${script_dir}/../scripts/lib.sh" source "$lib_file" @@ -142,6 +143,11 @@ ROOTFS_DIR Path to the directory that is populated with the rootfs. SECCOMP When set to "no", the kata-agent is built without seccomp capability. Default value: "yes" +SELINUX When set to "yes", build the rootfs with the required packages to + enable SELinux in the VM. + Make sure the guest kernel is compiled with SELinux enabled. + Default value: "no" + USE_DOCKER If set, build the rootfs inside a container (requires Docker). 
Default value: @@ -346,6 +352,15 @@ build_rootfs_distro() echo "Required rust version: $RUST_VERSION" + if [ "${SELINUX}" == "yes" ]; then + if [ "${AGENT_INIT}" == "yes" ]; then + die "Guest SELinux with the agent init is not supported yet" + fi + if [ "${distro}" != "centos" ]; then + die "The guest rootfs must be CentOS to enable guest SELinux" + fi + fi + if [ -z "${USE_DOCKER}" ] && [ -z "${USE_PODMAN}" ]; then info "build directly" build_rootfs ${ROOTFS_DIR} @@ -426,6 +441,7 @@ build_rootfs_distro() --env OS_VERSION="${OS_VERSION}" \ --env INSIDE_CONTAINER=1 \ --env SECCOMP="${SECCOMP}" \ + --env SELINUX="${SELINUX}" \ --env DEBUG="${DEBUG}" \ --env HOME="/root" \ -v "${repo_dir}":"/kata-containers" \ diff --git a/tools/osbuilder/scripts/lib.sh b/tools/osbuilder/scripts/lib.sh index 5ed0176771..615ff10a95 100644 --- a/tools/osbuilder/scripts/lib.sh +++ b/tools/osbuilder/scripts/lib.sh @@ -79,7 +79,23 @@ gpgcheck=1 gpgkey=file://${CONFIG_DIR}/${GPG_KEY_FILE} EOF fi - + if [ "$SELINUX" == "yes" ]; then + cat >> "${DNF_CONF}" << EOF +[appstream] +name=${OS_NAME}-${OS_VERSION} upstream +releasever=${OS_VERSION} +EOF + echo "metalink=$METALINK_APPSTREAM" >> "$DNF_CONF" + if [ -n "$GPG_KEY_URL" ]; then + if [ ! -f "${CONFIG_DIR}/${GPG_KEY_FILE}" ]; then + curl -L "${GPG_KEY_URL}" -o "${CONFIG_DIR}/${GPG_KEY_FILE}" + fi + cat >> "${DNF_CONF}" << EOF +gpgcheck=1 +gpgkey=file://${CONFIG_DIR}/${GPG_KEY_FILE} +EOF + fi + fi } build_rootfs() From 93547692863aea6bbb3374752d75839730692463 Mon Sep 17 00:00:00 2001 From: Manabu Sugimoto Date: Sun, 7 Aug 2022 18:14:22 +0900 Subject: [PATCH 3/5] agent: Add SELinux support for containers The kata-agent supports SELinux for containers inside the guest to comply with the OCI runtime specification. 
Fixes: #4812 Signed-off-by: Manabu Sugimoto --- src/agent/Cargo.lock | 10 ++++ src/agent/rustjail/Cargo.toml | 1 + src/agent/rustjail/src/container.rs | 15 ++++++ src/agent/rustjail/src/lib.rs | 1 + src/agent/rustjail/src/mount.rs | 45 ++++++++++++++-- src/agent/rustjail/src/selinux.rs | 80 +++++++++++++++++++++++++++++ src/agent/rustjail/src/validator.rs | 45 ++++++++++++++-- 7 files changed, 190 insertions(+), 7 deletions(-) create mode 100644 src/agent/rustjail/src/selinux.rs diff --git a/src/agent/Cargo.lock b/src/agent/Cargo.lock index d4407e7ee0..b6949e3296 100644 --- a/src/agent/Cargo.lock +++ b/src/agent/Cargo.lock @@ -1764,6 +1764,7 @@ dependencies = [ "tempfile", "test-utils", "tokio", + "xattr", "zbus", ] @@ -2552,6 +2553,15 @@ version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680" +[[package]] +name = "xattr" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc" +dependencies = [ + "libc", +] + [[package]] name = "zbus" version = "2.3.2" diff --git a/src/agent/rustjail/Cargo.toml b/src/agent/rustjail/Cargo.toml index 031175e867..851721ce0a 100644 --- a/src/agent/rustjail/Cargo.toml +++ b/src/agent/rustjail/Cargo.toml @@ -35,6 +35,7 @@ inotify = "0.9.2" libseccomp = { version = "0.3.0", optional = true } zbus = "2.3.0" bit-vec= "0.6.3" +xattr = "0.2.3" [dev-dependencies] serial_test = "0.5.0" diff --git a/src/agent/rustjail/src/container.rs b/src/agent/rustjail/src/container.rs index de92adf4ce..2cc4da9e49 100644 --- a/src/agent/rustjail/src/container.rs +++ b/src/agent/rustjail/src/container.rs @@ -30,6 +30,7 @@ use crate::log_child; use crate::process::Process; #[cfg(feature = "seccomp")] use crate::seccomp; +use crate::selinux; use crate::specconv::CreateOpts; use crate::{mount, validator}; @@ -537,6 +538,8 @@ fn do_init_child(cwfd: 
RawFd) -> Result<()> { } } + let selinux_enabled = selinux::is_enabled()?; + sched::unshare(to_new & !CloneFlags::CLONE_NEWUSER)?; if userns { @@ -638,6 +641,18 @@ fn do_init_child(cwfd: RawFd) -> Result<()> { capctl::prctl::set_no_new_privs().map_err(|_| anyhow!("cannot set no new privileges"))?; } + // Set SELinux label + if !oci_process.selinux_label.is_empty() { + if !selinux_enabled { + return Err(anyhow!( + "SELinux label for the process is provided but SELinux is not enabled on the running kernel" + )); + } + + log_child!(cfd_log, "Set SELinux label to the container process"); + selinux::set_exec_label(&oci_process.selinux_label)?; + } + // Log unknown seccomp system calls in advance before the log file descriptor closes. #[cfg(feature = "seccomp")] if let Some(ref scmp) = linux.seccomp { diff --git a/src/agent/rustjail/src/lib.rs b/src/agent/rustjail/src/lib.rs index fb51d9f395..6f96d18c2d 100644 --- a/src/agent/rustjail/src/lib.rs +++ b/src/agent/rustjail/src/lib.rs @@ -38,6 +38,7 @@ pub mod pipestream; pub mod process; #[cfg(feature = "seccomp")] pub mod seccomp; +pub mod selinux; pub mod specconv; pub mod sync; pub mod sync_with_async; diff --git a/src/agent/rustjail/src/mount.rs b/src/agent/rustjail/src/mount.rs index d3f87a8b49..a6418a3435 100644 --- a/src/agent/rustjail/src/mount.rs +++ b/src/agent/rustjail/src/mount.rs @@ -25,6 +25,7 @@ use std::fs::File; use std::io::{BufRead, BufReader}; use crate::container::DEFAULT_DEVICES; +use crate::selinux; use crate::sync::write_count; use std::string::ToString; @@ -181,6 +182,8 @@ pub fn init_rootfs( None => flags |= MsFlags::MS_SLAVE, } + let label = &linux.mount_label; + let root = spec .root .as_ref() @@ -244,7 +247,7 @@ pub fn init_rootfs( } } - mount_from(cfd_log, m, rootfs, flags, &data, "")?; + mount_from(cfd_log, m, rootfs, flags, &data, label)?; // bind mount won't change mount options, we need remount to make mount options // effective. 
// first check that we have non-default options required before attempting a @@ -524,7 +527,6 @@ pub fn pivot_rootfs(path: &P) -> Result<( fn rootfs_parent_mount_private(path: &str) -> Result<()> { let mount_infos = parse_mount_table(MOUNTINFO_PATH)?; - let mut max_len = 0; let mut mount_point = String::from(""); let mut options = String::from(""); @@ -767,9 +769,9 @@ fn mount_from( rootfs: &str, flags: MsFlags, data: &str, - _label: &str, + label: &str, ) -> Result<()> { - let d = String::from(data); + let mut d = String::from(data); let dest = secure_join(rootfs, &m.destination); let src = if m.r#type.as_str() == "bind" { @@ -822,6 +824,37 @@ fn mount_from( e })?; + // Set the SELinux context for the mounts + let mut use_xattr = false; + if !label.is_empty() { + if selinux::is_enabled()? { + let device = Path::new(&m.source) + .file_name() + .ok_or_else(|| anyhow!("invalid device source path: {}", &m.source))? + .to_str() + .ok_or_else(|| anyhow!("failed to convert device source path: {}", &m.source))?; + + match device { + // SELinux does not support labeling of /proc or /sys + "proc" | "sysfs" => (), + // SELinux does not support mount labeling against /dev/mqueue, + // so we use setxattr instead + "mqueue" => { + use_xattr = true; + } + _ => { + log_child!(cfd_log, "add SELinux mount label to {}", dest.as_str()); + selinux::add_mount_label(&mut d, label); + } + } + } else { + log_child!( + cfd_log, + "SELinux label for the mount is provided but SELinux is not enabled on the running kernel" + ); + } + } + mount( Some(src.as_str()), dest.as_str(), @@ -834,6 +867,10 @@ fn mount_from( e })?; + if !label.is_empty() && selinux::is_enabled()? 
&& use_xattr { + xattr::set(dest.as_str(), "security.selinux", label.as_bytes())?; + } + if flags.contains(MsFlags::MS_BIND) && flags.intersects( !(MsFlags::MS_REC diff --git a/src/agent/rustjail/src/selinux.rs b/src/agent/rustjail/src/selinux.rs new file mode 100644 index 0000000000..5a647e3cc4 --- /dev/null +++ b/src/agent/rustjail/src/selinux.rs @@ -0,0 +1,80 @@ +// Copyright 2022 Sony Group Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{Context, Result}; +use nix::unistd::gettid; +use std::fs::{self, OpenOptions}; +use std::io::prelude::*; +use std::path::Path; + +pub fn is_enabled() -> Result { + let buf = fs::read_to_string("/proc/mounts")?; + let enabled = buf.contains("selinuxfs"); + + Ok(enabled) +} + +pub fn add_mount_label(data: &mut String, label: &str) { + if data.is_empty() { + let context = format!("context=\"{}\"", label); + data.push_str(&context); + } else { + let context = format!(",context=\"{}\"", label); + data.push_str(&context); + } +} + +pub fn set_exec_label(label: &str) -> Result<()> { + let mut attr_path = Path::new("/proc/thread-self/attr/exec").to_path_buf(); + if !attr_path.exists() { + // Fall back to the old convention + attr_path = Path::new("/proc/self/task") + .join(gettid().to_string()) + .join("attr/exec") + } + + let mut file = OpenOptions::new() + .write(true) + .truncate(true) + .open(attr_path)?; + file.write_all(label.as_bytes()) + .with_context(|| "failed to apply SELinux label")?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + const TEST_LABEL: &str = "system_u:system_r:unconfined_t:s0"; + + #[test] + fn test_is_enabled() { + let ret = is_enabled(); + assert!(ret.is_ok(), "Expecting Ok, Got {:?}", ret); + } + + #[test] + fn test_add_mount_label() { + let mut data = String::new(); + add_mount_label(&mut data, TEST_LABEL); + assert_eq!(data, format!("context=\"{}\"", TEST_LABEL)); + + let mut data = String::from("defaults"); + add_mount_label(&mut data, TEST_LABEL); + 
assert_eq!(data, format!("defaults,context=\"{}\"", TEST_LABEL)); + } + + #[test] + fn test_set_exec_label() { + let ret = set_exec_label(TEST_LABEL); + if is_enabled().unwrap() { + assert!(ret.is_ok(), "Expecting Ok, Got {:?}", ret); + } else { + assert!(ret.is_err(), "Expecting error, Got {:?}", ret); + } + } +} diff --git a/src/agent/rustjail/src/validator.rs b/src/agent/rustjail/src/validator.rs index aea0f8f063..4955fbf466 100644 --- a/src/agent/rustjail/src/validator.rs +++ b/src/agent/rustjail/src/validator.rs @@ -6,6 +6,7 @@ use crate::container::Config; use anyhow::{anyhow, Context, Result}; use oci::{Linux, LinuxIdMapping, LinuxNamespace, Spec}; +use regex::Regex; use std::collections::HashMap; use std::path::{Component, PathBuf}; @@ -86,6 +87,23 @@ fn hostname(oci: &Spec) -> Result<()> { fn security(oci: &Spec) -> Result<()> { let linux = get_linux(oci)?; + let label_pattern = r".*_u:.*_r:.*_t:s([0-9]|1[0-5]).*"; + let label_regex = Regex::new(label_pattern)?; + + if let Some(ref process) = oci.process { + if !process.selinux_label.is_empty() && !label_regex.is_match(&process.selinux_label) { + return Err(anyhow!( + "SELinux label for the process is invalid format: {}", + &process.selinux_label + )); + } + } + if !linux.mount_label.is_empty() && !label_regex.is_match(&linux.mount_label) { + return Err(anyhow!( + "SELinux label for the mount is invalid format: {}", + &linux.mount_label + )); + } if linux.masked_paths.is_empty() && linux.readonly_paths.is_empty() { return Ok(()); @@ -95,8 +113,6 @@ fn security(oci: &Spec) -> Result<()> { return Err(anyhow!("Linux namespace does not contain mount")); } - // don't care about selinux at present - Ok(()) } @@ -285,7 +301,7 @@ pub fn validate(conf: &Config) -> Result<()> { #[cfg(test)] mod tests { use super::*; - use oci::Mount; + use oci::{Mount, Process}; #[test] fn test_namespace() { @@ -388,6 +404,29 @@ mod tests { ]; spec.linux = Some(linux); security(&spec).unwrap(); + + // SELinux + let valid_label = 
"system_u:system_r:container_t:s0:c123,c456"; + let mut process = Process::default(); + process.selinux_label = valid_label.to_string(); + spec.process = Some(process); + security(&spec).unwrap(); + + let mut linux = Linux::default(); + linux.mount_label = valid_label.to_string(); + spec.linux = Some(linux); + security(&spec).unwrap(); + + let invalid_label = "system_u:system_r:container_t"; + let mut process = Process::default(); + process.selinux_label = invalid_label.to_string(); + spec.process = Some(process); + security(&spec).unwrap_err(); + + let mut linux = Linux::default(); + linux.mount_label = invalid_label.to_string(); + spec.linux = Some(linux); + security(&spec).unwrap_err(); } #[test] From c617bbe70dcd8b1787eaeebba3fb1e7e33ac48cd Mon Sep 17 00:00:00 2001 From: Manabu Sugimoto Date: Sun, 7 Aug 2022 19:46:07 +0900 Subject: [PATCH 4/5] runtime: Pass SELinux policy for containers to the agent Pass SELinux policy for containers to the agent if `disable_guest_selinux` is set to `false` in the runtime configuration. The `container_t` type is applied to the container process inside the guest by default. Users can also set a custom SELinux policy to the container process using `guest_selinux_label` in the runtime configuration. This will be an alternative configuration of Kubernetes' security context for SELinux because users cannot specify the policy in Kata through Kubernetes's security context. To apply SELinux policy to the container, the guest rootfs must be CentOS that is created and built with `SELINUX=yes`. 
Fixes: #4812 Signed-off-by: Manabu Sugimoto --- src/runtime/Makefile | 7 +++ src/runtime/cmd/kata-runtime/kata-env.go | 2 + src/runtime/config/configuration-clh.toml.in | 15 ++++++ src/runtime/config/configuration-qemu.toml.in | 16 ++++++ .../pkg/katautils/config-settings.go.in | 1 + src/runtime/pkg/katautils/config.go | 13 +++-- src/runtime/pkg/katautils/config_test.go | 1 + src/runtime/pkg/oci/utils.go | 5 ++ src/runtime/virtcontainers/hypervisor.go | 5 ++ .../hypervisor_config_linux_test.go | 26 +++++----- src/runtime/virtcontainers/kata_agent.go | 52 ++++++++++++++++--- src/runtime/virtcontainers/kata_agent_test.go | 2 +- src/runtime/virtcontainers/persist.go | 2 + .../virtcontainers/persist/api/config.go | 33 ++++++------ .../pkg/annotations/annotations.go | 3 ++ src/runtime/virtcontainers/qemu.go | 17 +++++- src/runtime/virtcontainers/qemu_test.go | 21 ++++---- src/runtime/virtcontainers/sandbox.go | 45 ++++++++-------- 18 files changed, 196 insertions(+), 70 deletions(-) diff --git a/src/runtime/Makefile b/src/runtime/Makefile index 0f49badd44..852b4d5795 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -171,6 +171,11 @@ DEFDISABLEGUESTEMPTYDIR := false DEFAULTEXPFEATURES := [] DEFDISABLESELINUX := false + +# Default guest SELinux configuration +DEFDISABLEGUESTSELINUX := true +DEFGUESTSELINUXLABEL := system_u:system_r:container_t + #Default SeccomSandbox param #The same default policy is used by libvirt #More explanation on https://lists.gnu.org/archive/html/qemu-devel/2017-02/msg03348.html @@ -460,6 +465,8 @@ USER_VARS += DEFNETWORKMODEL_QEMU USER_VARS += DEFDISABLEGUESTEMPTYDIR USER_VARS += DEFDISABLEGUESTSECCOMP USER_VARS += DEFDISABLESELINUX +USER_VARS += DEFDISABLEGUESTSELINUX +USER_VARS += DEFGUESTSELINUXLABEL USER_VARS += DEFAULTEXPFEATURES USER_VARS += DEFDISABLEBLOCK USER_VARS += DEFBLOCKSTORAGEDRIVER_ACRN diff --git a/src/runtime/cmd/kata-runtime/kata-env.go b/src/runtime/cmd/kata-runtime/kata-env.go index b1421fa006..c129f8f434 
100644 --- a/src/runtime/cmd/kata-runtime/kata-env.go +++ b/src/runtime/cmd/kata-runtime/kata-env.go @@ -76,6 +76,7 @@ type RuntimeConfigInfo struct { type RuntimeInfo struct { Config RuntimeConfigInfo Path string + GuestSeLinuxLabel string Experimental []exp.Feature Version RuntimeVersionInfo Debug bool @@ -186,6 +187,7 @@ func getRuntimeInfo(configFile string, config oci.RuntimeConfig) RuntimeInfo { SandboxCgroupOnly: config.SandboxCgroupOnly, Experimental: config.Experimental, DisableGuestSeccomp: config.DisableGuestSeccomp, + GuestSeLinuxLabel: config.GuestSeLinuxLabel, } } diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in index e47a1d92a0..cedf2303ad 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -38,6 +38,13 @@ image = "@IMAGEPATH@" # disable applying SELinux on the VMM process (default false) disable_selinux=@DEFDISABLESELINUX@ +# disable applying SELinux on the container process +# If set to false, the type `container_t` is applied to the container process by default. +# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built +# with `SELINUX=yes`. +# (default: true) +disable_guest_selinux=@DEFDISABLEGUESTSELINUX@ + # Path to the firmware. # If you want Cloud Hypervisor to use a specific firmware, set its path below. # This is option is only used when confidential_guest is enabled. @@ -321,6 +328,14 @@ internetworking_model="@DEFNETWORKMODEL_CLH@" # (default: true) disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ +# Apply a custom SELinux security policy to the container process inside the VM. +# This is used when you want to apply a type other than the default `container_t`, +# so general users should not uncomment and apply it. 
+# (format: "user:role:type") +# Note: You cannot specify MCS policy with the label because the sensitivity levels and +# categories are determined automatically by high-level container runtimes such as containerd. +#guest_selinux_label="@DEFGUESTSELINUXLABEL@" + # If enabled, the runtime will create opentracing.io traces and spans. # (See https://www.jaegertracing.io/docs/getting-started). # (default: disabled) diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index 8330042977..f7e70a6d53 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -438,6 +438,14 @@ valid_entropy_sources = @DEFVALIDENTROPYSOURCES@ # disable applying SELinux on the VMM process (default false) disable_selinux=@DEFDISABLESELINUX@ +# disable applying SELinux on the container process +# If set to false, the type `container_t` is applied to the container process by default. +# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built +# with `SELINUX=yes`. +# (default: true) +disable_guest_selinux=@DEFDISABLEGUESTSELINUX@ + + [factory] # VM templating support. Once enabled, new VMs are created from template # using vm cloning. They will share the same initial kernel, initramfs and @@ -555,6 +563,14 @@ internetworking_model="@DEFNETWORKMODEL_QEMU@" # (default: true) disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ +# Apply a custom SELinux security policy to the container process inside the VM. +# This is used when you want to apply a type other than the default `container_t`, +# so general users should not uncomment and apply it. +# (format: "user:role:type") +# Note: You cannot specify MCS policy with the label because the sensitivity levels and +# categories are determined automatically by high-level container runtimes such as containerd. 
+#guest_selinux_label="@DEFGUESTSELINUXLABEL@" + # If enabled, the runtime will create opentracing.io traces and spans. # (See https://www.jaegertracing.io/docs/getting-started). # (default: disabled) diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index 601d95612c..43dd5cc5a4 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -90,6 +90,7 @@ const defaultSevSnpGuest = false const defaultGuestSwap = false const defaultRootlessHypervisor = false const defaultDisableSeccomp = false +const defaultDisableGuestSeLinux = true const defaultVfioMode = "guest-kernel" const defaultLegacySerial = false diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 3fabfe0af1..3ed3177f57 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -59,9 +59,9 @@ const ( type tomlConfig struct { Hypervisor map[string]hypervisor Agent map[string]agent - Runtime runtime Image image Factory factory + Runtime runtime } type image struct { @@ -154,6 +154,7 @@ type hypervisor struct { Rootless bool `toml:"rootless"` DisableSeccomp bool `toml:"disable_seccomp"` DisableSeLinux bool `toml:"disable_selinux"` + DisableGuestSeLinux bool `toml:"disable_guest_selinux"` LegacySerial bool `toml:"use_legacy_serial"` EnableVCPUsPinning bool `toml:"enable_vcpus_pinning"` } @@ -164,12 +165,13 @@ type runtime struct { JaegerUser string `toml:"jaeger_user"` JaegerPassword string `toml:"jaeger_password"` VfioMode string `toml:"vfio_mode"` + GuestSeLinuxLabel string `toml:"guest_selinux_label"` SandboxBindMounts []string `toml:"sandbox_bind_mounts"` Experimental []string `toml:"experimental"` - Debug bool `toml:"enable_debug"` Tracing bool `toml:"enable_tracing"` DisableNewNetNs bool `toml:"disable_new_netns"` DisableGuestSeccomp bool `toml:"disable_guest_seccomp"` + Debug bool `toml:"enable_debug"` 
SandboxCgroupOnly bool `toml:"sandbox_cgroup_only"` StaticSandboxResourceMgmt bool `toml:"static_sandbox_resource_mgmt"` EnablePprof bool `toml:"enable_pprof"` @@ -690,6 +692,7 @@ func newFirecrackerHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { TxRateLimiterMaxRate: txRateLimiterMaxRate, EnableAnnotations: h.EnableAnnotations, DisableSeLinux: h.DisableSeLinux, + DisableGuestSeLinux: true, // Guest SELinux is not supported in Firecracker }, nil } @@ -836,6 +839,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { LegacySerial: h.LegacySerial, DisableSeLinux: h.DisableSeLinux, EnableVCPUsPinning: h.EnableVCPUsPinning, + DisableGuestSeLinux: h.DisableGuestSeLinux, }, nil } @@ -902,6 +906,7 @@ func newAcrnHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { GuestHookPath: h.guestHookPath(), DisableSeLinux: h.DisableSeLinux, EnableAnnotations: h.EnableAnnotations, + DisableGuestSeLinux: true, // Guest SELinux is not supported in ACRN }, nil } @@ -1007,6 +1012,7 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { DisableSeccomp: h.DisableSeccomp, ConfidentialGuest: h.ConfidentialGuest, DisableSeLinux: h.DisableSeLinux, + DisableGuestSeLinux: h.DisableGuestSeLinux, NetRateLimiterBwMaxRate: h.getNetRateLimiterBwMaxRate(), NetRateLimiterBwOneTimeBurst: h.getNetRateLimiterBwOneTimeBurst(), NetRateLimiterOpsMaxRate: h.getNetRateLimiterOpsMaxRate(), @@ -1230,6 +1236,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig { GuestSwap: defaultGuestSwap, Rootless: defaultRootlessHypervisor, DisableSeccomp: defaultDisableSeccomp, + DisableGuestSeLinux: defaultDisableGuestSeLinux, LegacySerial: defaultLegacySerial, } } @@ -1317,7 +1324,7 @@ func LoadConfiguration(configPath string, ignoreLogging bool) (resolvedConfigPat } config.DisableGuestSeccomp = tomlConf.Runtime.DisableGuestSeccomp - + config.GuestSeLinuxLabel = tomlConf.Runtime.GuestSeLinuxLabel config.StaticSandboxResourceMgmt = 
tomlConf.Runtime.StaticSandboxResourceMgmt config.SandboxCgroupOnly = tomlConf.Runtime.SandboxCgroupOnly config.DisableNewNetNs = tomlConf.Runtime.DisableNewNetNs diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index 5e493b40e3..335f077fbb 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -554,6 +554,7 @@ func TestMinimalRuntimeConfig(t *testing.T) { VhostUserStorePath: defaultVhostUserStorePath, VirtioFSCache: defaultVirtioFSCacheMode, BlockDeviceAIO: defaultBlockDeviceAIO, + DisableGuestSeLinux: defaultDisableGuestSeLinux, } expectedAgentConfig := vc.KataAgentConfig{ diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index 57c2ed1a15..2cd7c10f53 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -128,6 +128,9 @@ type RuntimeConfig struct { //Determines if seccomp should be applied inside guest DisableGuestSeccomp bool + //SELinux security context applied to the container process inside guest. 
+ GuestSeLinuxLabel string + // Sandbox sizing information which, if provided, indicates the size of // the sandbox needed for the workload(s) SandboxCPUs uint32 @@ -945,6 +948,8 @@ func SandboxConfig(ocispec specs.Spec, runtime RuntimeConfig, bundlePath, cid st DisableGuestSeccomp: runtime.DisableGuestSeccomp, + GuestSeLinuxLabel: runtime.GuestSeLinuxLabel, + Experimental: runtime.Experimental, } diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 548ce6f77d..955da7d107 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -74,6 +74,8 @@ const ( MinHypervisorMemory = 256 defaultMsize9p = 8192 + + defaultDisableGuestSeLinux = true ) var ( @@ -560,6 +562,9 @@ type HypervisorConfig struct { // Disable selinux from the hypervisor process DisableSeLinux bool + // Disable selinux from the container process + DisableGuestSeLinux bool + // Use legacy serial for the guest console LegacySerial bool diff --git a/src/runtime/virtcontainers/hypervisor_config_linux_test.go b/src/runtime/virtcontainers/hypervisor_config_linux_test.go index 609e52fd73..41cabb1c35 100644 --- a/src/runtime/virtcontainers/hypervisor_config_linux_test.go +++ b/src/runtime/virtcontainers/hypervisor_config_linux_test.go @@ -92,22 +92,24 @@ func TestHypervisorConfigValidTemplateConfig(t *testing.T) { func TestHypervisorConfigDefaults(t *testing.T) { assert := assert.New(t) hypervisorConfig := &HypervisorConfig{ - KernelPath: fmt.Sprintf("%s/%s", testDir, testKernel), - ImagePath: fmt.Sprintf("%s/%s", testDir, testImage), - HypervisorPath: "", + KernelPath: fmt.Sprintf("%s/%s", testDir, testKernel), + ImagePath: fmt.Sprintf("%s/%s", testDir, testImage), + HypervisorPath: "", + DisableGuestSeLinux: defaultDisableGuestSeLinux, } testHypervisorConfigValid(t, hypervisorConfig, true) hypervisorConfigDefaultsExpected := &HypervisorConfig{ - KernelPath: fmt.Sprintf("%s/%s", testDir, testKernel), - ImagePath: 
fmt.Sprintf("%s/%s", testDir, testImage), - HypervisorPath: "", - NumVCPUs: defaultVCPUs, - MemorySize: defaultMemSzMiB, - DefaultBridges: defaultBridges, - BlockDeviceDriver: defaultBlockDriver, - DefaultMaxVCPUs: defaultMaxVCPUs, - Msize9p: defaultMsize9p, + KernelPath: fmt.Sprintf("%s/%s", testDir, testKernel), + ImagePath: fmt.Sprintf("%s/%s", testDir, testImage), + HypervisorPath: "", + NumVCPUs: defaultVCPUs, + MemorySize: defaultMemSzMiB, + DefaultBridges: defaultBridges, + BlockDeviceDriver: defaultBlockDriver, + DefaultMaxVCPUs: defaultMaxVCPUs, + Msize9p: defaultMsize9p, + DisableGuestSeLinux: defaultDisableGuestSeLinux, } assert.Exactly(hypervisorConfig, hypervisorConfigDefaultsExpected) diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index 477be9fde4..5746759542 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -36,6 +36,7 @@ import ( "context" "github.com/gogo/protobuf/proto" "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" "google.golang.org/grpc/codes" @@ -69,6 +70,9 @@ const ( kernelParamDebugConsole = "agent.debug_console" kernelParamDebugConsoleVPort = "agent.debug_console_vport" kernelParamDebugConsoleVPortValue = "1026" + + // Default SELinux type applied to the container process inside guest + defaultSeLinuxContainerType = "container_t" ) var ( @@ -895,7 +899,7 @@ func (k *kataAgent) removeIgnoredOCIMount(spec *specs.Spec, ignoredMounts map[st return nil } -func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, stripVfio bool) { +func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, disableGuestSeLinux bool, guestSeLinuxLabel string, stripVfio bool) error { // Disable Hooks since they have been handled on the host and there is // no reason to send them to the agent. 
It would make no sense to try // to apply them on the guest. @@ -907,11 +911,34 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, str grpcSpec.Linux.Seccomp = nil } - // Disable SELinux inside of the virtual machine, the label will apply - // to the KVM process + // Pass SELinux label for the container process to the agent. if grpcSpec.Process.SelinuxLabel != "" { - k.Logger().Info("SELinux label from config will be applied to the hypervisor process, not the VM workload") - grpcSpec.Process.SelinuxLabel = "" + if !disableGuestSeLinux { + k.Logger().Info("SELinux label will be applied to the container process inside guest") + + var label string + if guestSeLinuxLabel != "" { + label = guestSeLinuxLabel + } else { + label = grpcSpec.Process.SelinuxLabel + } + + processContext, err := selinux.NewContext(label) + if err != nil { + return err + } + + // Change the type from KVM to container because the type passed from the high-level + // runtime is for KVM process. + if guestSeLinuxLabel == "" { + processContext["type"] = defaultSeLinuxContainerType + } + grpcSpec.Process.SelinuxLabel = processContext.Get() + } else { + k.Logger().Info("Empty SELinux label for the process and the mount because guest SELinux is disabled") + grpcSpec.Process.SelinuxLabel = "" + grpcSpec.Linux.MountLabel = "" + } } // By now only CPU constraints are supported @@ -973,6 +1000,8 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, str } grpcSpec.Linux.Devices = linuxDevices } + + return nil } func (k *kataAgent) handleShm(mounts []specs.Mount, sandbox *Sandbox) { @@ -1256,9 +1285,20 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co passSeccomp := !sandbox.config.DisableGuestSeccomp && sandbox.seccompSupported + // Currently, guest SELinux can be enabled only when SELinux is enabled on the host side. 
+ if !sandbox.config.HypervisorConfig.DisableGuestSeLinux && !selinux.GetEnabled() { + return nil, fmt.Errorf("Guest SELinux is enabled, but SELinux is disabled on the host side") + } + if sandbox.config.HypervisorConfig.DisableGuestSeLinux && sandbox.config.GuestSeLinuxLabel != "" { + return nil, fmt.Errorf("Custom SELinux security policy is provided, but guest SELinux is disabled") + } + // We need to constrain the spec to make sure we're not // passing irrelevant information to the agent. - k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.VfioMode == config.VFIOModeGuestKernel) + err = k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.HypervisorConfig.DisableGuestSeLinux, sandbox.config.GuestSeLinuxLabel, sandbox.config.VfioMode == config.VFIOModeGuestKernel) + if err != nil { + return nil, err + } req := &grpc.CreateContainerRequest{ ContainerId: c.id, diff --git a/src/runtime/virtcontainers/kata_agent_test.go b/src/runtime/virtcontainers/kata_agent_test.go index 9711a5cf55..885fd8acc7 100644 --- a/src/runtime/virtcontainers/kata_agent_test.go +++ b/src/runtime/virtcontainers/kata_agent_test.go @@ -619,7 +619,7 @@ func TestConstrainGRPCSpec(t *testing.T) { } k := kataAgent{} - k.constrainGRPCSpec(g, true, true) + k.constrainGRPCSpec(g, true, true, "", true) // Check nil fields assert.Nil(g.Hooks) diff --git a/src/runtime/virtcontainers/persist.go b/src/runtime/virtcontainers/persist.go index 59c6dda15f..906ed10761 100644 --- a/src/runtime/virtcontainers/persist.go +++ b/src/runtime/virtcontainers/persist.go @@ -189,6 +189,7 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) { SystemdCgroup: sconfig.SystemdCgroup, SandboxCgroupOnly: sconfig.SandboxCgroupOnly, DisableGuestSeccomp: sconfig.DisableGuestSeccomp, + GuestSeLinuxLabel: sconfig.GuestSeLinuxLabel, } ss.Config.SandboxBindMounts = append(ss.Config.SandboxBindMounts, sconfig.SandboxBindMounts...) 
@@ -429,6 +430,7 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) { SystemdCgroup: savedConf.SystemdCgroup, SandboxCgroupOnly: savedConf.SandboxCgroupOnly, DisableGuestSeccomp: savedConf.DisableGuestSeccomp, + GuestSeLinuxLabel: savedConf.GuestSeLinuxLabel, } sconfig.SandboxBindMounts = append(sconfig.SandboxBindMounts, savedConf.SandboxBindMounts...) diff --git a/src/runtime/virtcontainers/persist/api/config.go b/src/runtime/virtcontainers/persist/api/config.go index 1c16b7bd91..44ba820643 100644 --- a/src/runtime/virtcontainers/persist/api/config.go +++ b/src/runtime/virtcontainers/persist/api/config.go @@ -243,19 +243,6 @@ type ContainerConfig struct { // SandboxConfig is a sandbox configuration. // Refs: virtcontainers/sandbox.go:SandboxConfig type SandboxConfig struct { - // Information for fields not saved: - // * Annotation: this is kind of casual data, we don't need casual data in persist file, - // if you know this data needs to persist, please gives it - // a specific field - - ContainerConfigs []ContainerConfig - - // SandboxBindMounts - list of paths to mount into guest - SandboxBindMounts []string - - // Experimental enables experimental features - Experimental []string - // Cgroups specifies specific cgroup settings for the various subsystems that the container is // placed into to limit the resources the container has available Cgroups *configs.Cgroup `json:"cgroups"` @@ -265,8 +252,24 @@ type SandboxConfig struct { KataShimConfig *ShimConfig - HypervisorType string - NetworkConfig NetworkConfig + // Custom SELinux security policy to the container process inside the VM + GuestSeLinuxLabel string + + HypervisorType string + + // SandboxBindMounts - list of paths to mount into guest + SandboxBindMounts []string + + // Experimental enables experimental features + Experimental []string + + // Information for fields not saved: + // * Annotation: this is kind of casual data, we don't need casual data in persist file, + // if you know this data 
needs to persist, please gives it a specific field + ContainerConfigs []ContainerConfig + + NetworkConfig NetworkConfig + HypervisorConfig HypervisorConfig ShmSize uint64 diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go index d785580d9b..67c81cb1f8 100644 --- a/src/runtime/virtcontainers/pkg/annotations/annotations.go +++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go @@ -247,6 +247,9 @@ const ( // DisableGuestSeccomp is a sandbox annotation that determines if seccomp should be applied inside guest. DisableGuestSeccomp = kataAnnotRuntimePrefix + "disable_guest_seccomp" + // GuestSeLinuxLabel is a SELinux security policy that is applied to a container process inside guest. + GuestSeLinuxLabel = kataAnnotRuntimePrefix + "guest_selinux_label" + // SandboxCgroupOnly is a sandbox annotation that determines if kata processes are managed only in sandbox cgroup. SandboxCgroupOnly = kataAnnotRuntimePrefix + "sandbox_cgroup_only" diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index d33f02f6ed..75a6731dd1 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -181,6 +181,15 @@ func (q *qemu) kernelParameters() string { // set the maximum number of vCPUs params = append(params, Param{"nr_cpus", fmt.Sprintf("%d", q.config.DefaultMaxVCPUs)}) + // set the SELinux params in accordance with the runtime configuration, disable_guest_selinux. + if q.config.DisableGuestSeLinux { + q.Logger().Info("Set selinux=0 to kernel params because SELinux on the guest is disabled") + params = append(params, Param{"selinux", "0"}) + } else { + q.Logger().Info("Set selinux=1 to kernel params because SELinux on the guest is enabled") + params = append(params, Param{"selinux", "1"}) + } + // add the params specified by the provided config. 
As the kernel // honours the last parameter value set and since the config-provided // params are added here, they will take priority over the defaults. @@ -476,6 +485,13 @@ func (q *qemu) createVirtiofsDaemon(sharedPath string) (VirtiofsDaemon, error) { return nd, nil } + // Set the xattr option for virtiofsd daemon to enable extended attributes + // in virtiofs if SELinux on the guest side is enabled. + if !q.config.DisableGuestSeLinux { + q.Logger().Info("Set the xattr option for virtiofsd") + q.config.VirtioFSExtraArgs = append(q.config.VirtioFSExtraArgs, "-o", "xattr") + } + // default use virtiofsd return &virtiofsd{ path: q.config.VirtioFSDaemon, @@ -846,7 +862,6 @@ func (q *qemu) StartVM(ctx context.Context, timeout int) error { // the SELinux label. If these processes require privileged, we do // notwant to run them under confinement. if !q.config.DisableSeLinux { - if err := label.SetProcessLabel(q.config.SELinuxProcessLabel); err != nil { return err } diff --git a/src/runtime/virtcontainers/qemu_test.go b/src/runtime/virtcontainers/qemu_test.go index f30dd0a696..a8bc6a33db 100644 --- a/src/runtime/virtcontainers/qemu_test.go +++ b/src/runtime/virtcontainers/qemu_test.go @@ -27,15 +27,16 @@ import ( func newQemuConfig() HypervisorConfig { return HypervisorConfig{ - KernelPath: testQemuKernelPath, - InitrdPath: testQemuInitrdPath, - HypervisorPath: testQemuPath, - NumVCPUs: defaultVCPUs, - MemorySize: defaultMemSzMiB, - DefaultBridges: defaultBridges, - BlockDeviceDriver: defaultBlockDriver, - DefaultMaxVCPUs: defaultMaxVCPUs, - Msize9p: defaultMsize9p, + KernelPath: testQemuKernelPath, + InitrdPath: testQemuInitrdPath, + HypervisorPath: testQemuPath, + NumVCPUs: defaultVCPUs, + MemorySize: defaultMemSzMiB, + DefaultBridges: defaultBridges, + BlockDeviceDriver: defaultBlockDriver, + DefaultMaxVCPUs: defaultMaxVCPUs, + Msize9p: defaultMsize9p, + DisableGuestSeLinux: defaultDisableGuestSeLinux, } } @@ -58,7 +59,7 @@ func testQemuKernelParameters(t 
*testing.T, kernelParams []Param, expected strin } func TestQemuKernelParameters(t *testing.T) { - expectedOut := fmt.Sprintf("panic=1 nr_cpus=%d foo=foo bar=bar", govmm.MaxVCPUs()) + expectedOut := fmt.Sprintf("panic=1 nr_cpus=%d selinux=0 foo=foo bar=bar", govmm.MaxVCPUs()) params := []Param{ { Key: "foo", diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 32ccc2dbb7..025537fed9 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -126,14 +126,17 @@ type SandboxResourceSizing struct { // SandboxConfig is a Sandbox configuration. type SandboxConfig struct { - // Volumes is a list of shared volumes between the host and the Sandbox. - Volumes []types.Volume + // Annotations keys must be unique strings and must be name-spaced + Annotations map[string]string - // Containers describe the list of containers within a Sandbox. - // This list can be empty and populated by adding containers - // to the Sandbox a posteriori. - //TODO: this should be a map to avoid duplicated containers - Containers []ContainerConfig + // Custom SELinux security policy to the container process inside the VM + GuestSeLinuxLabel string + + HypervisorType HypervisorType + + ID string + + Hostname string // SandboxBindMounts - list of paths to mount into guest SandboxBindMounts []string @@ -141,31 +144,29 @@ type SandboxConfig struct { // Experimental features enabled Experimental []exp.Feature - // Annotations keys must be unique strings and must be name-spaced - // with e.g. reverse domain notation (org.clearlinux.key). - Annotations map[string]string + // Containers describe the list of containers within a Sandbox. + // This list can be empty and populated by adding containers + // to the Sandbox a posteriori. 
+ // TODO: this should be a map to avoid duplicated containers + Containers []ContainerConfig - ID string - - Hostname string - - HypervisorType HypervisorType - - AgentConfig KataAgentConfig + Volumes []types.Volume NetworkConfig NetworkConfig + AgentConfig KataAgentConfig + HypervisorConfig HypervisorConfig - SandboxResources SandboxResourceSizing - - // StaticResourceMgmt indicates if the shim should rely on statically sizing the sandbox (VM) - StaticResourceMgmt bool - ShmSize uint64 + SandboxResources SandboxResourceSizing + VfioMode config.VFIOModeType + // StaticResourceMgmt indicates if the shim should rely on statically sizing the sandbox (VM) + StaticResourceMgmt bool + // SharePidNs sets all containers to share the same sandbox level pid namespace. SharePidNs bool From 78532154d9d277d1618680dd7792ecf9dfc6877f Mon Sep 17 00:00:00 2001 From: Manabu Sugimoto Date: Sun, 7 Aug 2022 21:50:07 +0900 Subject: [PATCH 5/5] docs: Add description for guest SELinux support Add the description about how to enable SELinux for containers running inside the guest. Fixes: #4812 Signed-off-by: Manabu Sugimoto --- docs/Developer-Guide.md | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/docs/Developer-Guide.md b/docs/Developer-Guide.md index a57bd2d72b..ab1c05a2d7 100644 --- a/docs/Developer-Guide.md +++ b/docs/Developer-Guide.md @@ -86,6 +86,27 @@ $ sudo sed -i '/^disable_guest_seccomp/ s/true/false/' /etc/kata-containers/conf This will pass container seccomp profiles to the kata agent. +## Enable SELinux on the guest + +> **Note:** +> +> - To enable SELinux on the guest, SELinux MUST be also enabled on the host. +> - You MUST create and build a rootfs image for SELinux in advance. +> See [Create a rootfs image](#create-a-rootfs-image) and [Build a rootfs image](#build-a-rootfs-image). +> - SELinux on the guest is supported in only a rootfs image currently, so +> you cannot enable SELinux with the agent init (`AGENT_INIT=yes`) yet. 
+ +Enable guest SELinux in Enforcing mode as follows: + +``` +$ sudo sed -i '/^disable_guest_selinux/ s/true/false/g' /etc/kata-containers/configuration.toml +``` + +The runtime will automatically add `selinux=1` to the kernel parameters and pass the `xattr` option to +`virtiofsd` when `disable_guest_selinux` is set to `false`. + +If you want to enable SELinux in Permissive mode, add `enforcing=0` to the kernel parameters. + ## Enable full debug Enable full debug as follows: @@ -256,6 +277,12 @@ If you want to build the agent without seccomp capability, you need to run the ` $ script -fec 'sudo -E AGENT_INIT=yes USE_DOCKER=true SECCOMP=no ./rootfs.sh "${distro}"' ``` +If you want to enable SELinux on the guest, you MUST choose `centos` and run the `rootfs.sh` script with `SELINUX=yes` as follows. + +``` +$ script -fec 'sudo -E GOPATH=$GOPATH USE_DOCKER=true SELINUX=yes ./rootfs.sh centos' +``` + > **Note:** > > - Check the [compatibility matrix](../tools/osbuilder/README.md#platform-distro-compatibility-matrix) before creating rootfs. @@ -283,6 +310,19 @@ $ script -fec 'sudo -E USE_DOCKER=true ./image_builder.sh "${ROOTFS_DIR}"' $ popd ``` +If you want to enable SELinux on the guest, you MUST run the `image_builder.sh` script with `SELINUX=yes` +to label the guest image as follows. +To label the image on the host, you need to make sure that SELinux is enabled (`selinuxfs` is mounted) on the host +and the rootfs MUST have been created by running `rootfs.sh` with `SELINUX=yes`. + +``` +$ script -fec 'sudo -E USE_DOCKER=true SELINUX=yes ./image_builder.sh ${ROOTFS_DIR}' +``` + +Currently, `image_builder.sh` uses `chcon` as an interim solution in order to apply `container_runtime_exec_t` +to the `kata-agent`. Hence, if you run `restorecon` on the guest image after running `image_builder.sh`, +you need to relabel the `kata-agent` as `container_runtime_exec_t` yourself. + > **Notes:** > > - You must ensure that the *default Docker runtime* is `runc` to make use of