Merge pull request #4813 from ManaSugi/fix/add-selinux-agent

runtime,agent: Add SELinux support for containers inside the guest
This commit is contained in:
Fabiano Fidêncio 2022-12-13 11:24:53 +01:00 committed by GitHub
commit f1381eb361
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
31 changed files with 520 additions and 79 deletions

View File

@ -86,6 +86,27 @@ $ sudo sed -i '/^disable_guest_seccomp/ s/true/false/' /etc/kata-containers/conf
This will pass container seccomp profiles to the kata agent.
## Enable SELinux on the guest
> **Note:**
>
> - To enable SELinux on the guest, SELinux MUST also be enabled on the host.
> - You MUST create and build a rootfs image for SELinux in advance.
> See [Create a rootfs image](#create-a-rootfs-image) and [Build a rootfs image](#build-a-rootfs-image).
> - SELinux on the guest is currently supported only with a rootfs image, so
> you cannot enable SELinux with the agent init (`AGENT_INIT=yes`) yet.
Enable guest SELinux in Enforcing mode as follows:
```
$ sudo sed -i '/^disable_guest_selinux/ s/true/false/g' /etc/kata-containers/configuration.toml
```
The runtime will automatically add `selinux=1` to the kernel parameters and pass the `xattr` option to
`virtiofsd` when `disable_guest_selinux` is set to `false`.
If you want to enable SELinux in Permissive mode, add `enforcing=0` to the kernel parameters.
## Enable full debug
Enable full debug as follows:
@ -256,6 +277,12 @@ If you want to build the agent without seccomp capability, you need to run the `
$ script -fec 'sudo -E AGENT_INIT=yes USE_DOCKER=true SECCOMP=no ./rootfs.sh "${distro}"'
```
If you want to enable SELinux on the guest, you MUST choose `centos` and run the `rootfs.sh` script with `SELINUX=yes` as follows.
```
$ script -fec 'sudo -E GOPATH=$GOPATH USE_DOCKER=true SELINUX=yes ./rootfs.sh centos'
```
> **Note:**
>
> - Check the [compatibility matrix](../tools/osbuilder/README.md#platform-distro-compatibility-matrix) before creating rootfs.
@ -283,6 +310,19 @@ $ script -fec 'sudo -E USE_DOCKER=true ./image_builder.sh "${ROOTFS_DIR}"'
$ popd
```
If you want to enable SELinux on the guest, you MUST run the `image_builder.sh` script with `SELINUX=yes`
to label the guest image as follows.
To label the image on the host, you need to make sure that SELinux is enabled (`selinuxfs` is mounted) on the host
and the rootfs MUST be created by running the `rootfs.sh` with `SELINUX=yes`.
```
$ script -fec 'sudo -E USE_DOCKER=true SELINUX=yes ./image_builder.sh ${ROOTFS_DIR}'
```
Currently, `image_builder.sh` uses `chcon` as an interim solution in order to apply `container_runtime_exec_t`
to the `kata-agent`. Hence, if you run `restorecon` on the guest image after running `image_builder.sh`,
you need to label the `kata-agent` as `container_runtime_exec_t` again yourself.
> **Notes:**
>
> - You must ensure that the *default Docker runtime* is `runc` to make use of

10
src/agent/Cargo.lock generated
View File

@ -1705,6 +1705,7 @@ dependencies = [
"tempfile",
"test-utils",
"tokio",
"xattr",
"zbus",
]
@ -2478,6 +2479,15 @@ version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680"
[[package]]
name = "xattr"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc"
dependencies = [
"libc",
]
[[package]]
name = "zbus"
version = "2.3.2"

View File

@ -35,6 +35,7 @@ inotify = "0.9.2"
libseccomp = { version = "0.3.0", optional = true }
zbus = "2.3.0"
bit-vec= "0.6.3"
xattr = "0.2.3"
[dev-dependencies]
serial_test = "0.5.0"

View File

@ -30,6 +30,7 @@ use crate::log_child;
use crate::process::Process;
#[cfg(feature = "seccomp")]
use crate::seccomp;
use crate::selinux;
use crate::specconv::CreateOpts;
use crate::{mount, validator};
@ -526,6 +527,8 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
}
}
let selinux_enabled = selinux::is_enabled()?;
sched::unshare(to_new & !CloneFlags::CLONE_NEWUSER)?;
if userns {
@ -627,6 +630,18 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
capctl::prctl::set_no_new_privs().map_err(|_| anyhow!("cannot set no new privileges"))?;
}
// Set SELinux label
if !oci_process.selinux_label.is_empty() {
if !selinux_enabled {
return Err(anyhow!(
"SELinux label for the process is provided but SELinux is not enabled on the running kernel"
));
}
log_child!(cfd_log, "Set SELinux label to the container process");
selinux::set_exec_label(&oci_process.selinux_label)?;
}
// Log unknown seccomp system calls in advance before the log file descriptor closes.
#[cfg(feature = "seccomp")]
if let Some(ref scmp) = linux.seccomp {

View File

@ -38,6 +38,7 @@ pub mod pipestream;
pub mod process;
#[cfg(feature = "seccomp")]
pub mod seccomp;
pub mod selinux;
pub mod specconv;
pub mod sync;
pub mod sync_with_async;

View File

@ -25,6 +25,7 @@ use std::fs::File;
use std::io::{BufRead, BufReader};
use crate::container::DEFAULT_DEVICES;
use crate::selinux;
use crate::sync::write_count;
use std::string::ToString;
@ -181,6 +182,8 @@ pub fn init_rootfs(
None => flags |= MsFlags::MS_SLAVE,
}
let label = &linux.mount_label;
let root = spec
.root
.as_ref()
@ -244,7 +247,7 @@ pub fn init_rootfs(
}
}
mount_from(cfd_log, m, rootfs, flags, &data, "")?;
mount_from(cfd_log, m, rootfs, flags, &data, label)?;
// bind mount won't change mount options, we need remount to make mount options
// effective.
// first check that we have non-default options required before attempting a
@ -524,7 +527,6 @@ pub fn pivot_rootfs<P: ?Sized + NixPath + std::fmt::Debug>(path: &P) -> Result<(
fn rootfs_parent_mount_private(path: &str) -> Result<()> {
let mount_infos = parse_mount_table(MOUNTINFO_PATH)?;
let mut max_len = 0;
let mut mount_point = String::from("");
let mut options = String::from("");
@ -767,9 +769,9 @@ fn mount_from(
rootfs: &str,
flags: MsFlags,
data: &str,
_label: &str,
label: &str,
) -> Result<()> {
let d = String::from(data);
let mut d = String::from(data);
let dest = secure_join(rootfs, &m.destination);
let src = if m.r#type.as_str() == "bind" {
@ -822,6 +824,37 @@ fn mount_from(
e
})?;
// Set the SELinux context for the mounts
let mut use_xattr = false;
if !label.is_empty() {
if selinux::is_enabled()? {
let device = Path::new(&m.source)
.file_name()
.ok_or_else(|| anyhow!("invalid device source path: {}", &m.source))?
.to_str()
.ok_or_else(|| anyhow!("failed to convert device source path: {}", &m.source))?;
match device {
// SELinux does not support labeling of /proc or /sys
"proc" | "sysfs" => (),
// SELinux does not support mount labeling against /dev/mqueue,
// so we use setxattr instead
"mqueue" => {
use_xattr = true;
}
_ => {
log_child!(cfd_log, "add SELinux mount label to {}", dest.as_str());
selinux::add_mount_label(&mut d, label);
}
}
} else {
log_child!(
cfd_log,
"SELinux label for the mount is provided but SELinux is not enabled on the running kernel"
);
}
}
mount(
Some(src.as_str()),
dest.as_str(),
@ -834,6 +867,10 @@ fn mount_from(
e
})?;
if !label.is_empty() && selinux::is_enabled()? && use_xattr {
xattr::set(dest.as_str(), "security.selinux", label.as_bytes())?;
}
if flags.contains(MsFlags::MS_BIND)
&& flags.intersects(
!(MsFlags::MS_REC

View File

@ -0,0 +1,80 @@
// Copyright 2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::{Context, Result};
use nix::unistd::gettid;
use std::fs::{self, OpenOptions};
use std::io::prelude::*;
use std::path::Path;
/// Reports whether SELinux is enabled on the running kernel.
///
/// SELinux is considered enabled when a filesystem of type `selinuxfs` is
/// listed in `/proc/mounts`.
///
/// # Errors
///
/// Returns an error if `/proc/mounts` cannot be read.
pub fn is_enabled() -> Result<bool> {
    let mounts = fs::read_to_string("/proc/mounts")?;

    // Each line is "<device> <mount point> <fs type> <options> ...". Compare the
    // fs type field exactly, so that a device name, mount point or option that
    // merely contains the text "selinuxfs" is not mistaken for a mounted selinuxfs.
    let enabled = mounts
        .lines()
        .any(|line| line.split_whitespace().nth(2) == Some("selinuxfs"));

    Ok(enabled)
}
/// Appends a `context="<label>"` option to the mount data string `data`.
///
/// When `data` already holds options, a comma separator is inserted before
/// the new option; otherwise the option is written on its own.
pub fn add_mount_label(data: &mut String, label: &str) {
    if !data.is_empty() {
        data.push(',');
    }
    data.push_str("context=\"");
    data.push_str(label);
    data.push('"');
}
/// Writes `label` into the calling thread's `attr/exec` procfs entry.
///
/// `/proc/thread-self/attr/exec` is used when it exists; otherwise the path is
/// built from `/proc/self/task/<tid>/attr/exec` using the current thread id.
///
/// # Errors
///
/// Returns an error if the attribute file cannot be opened for writing or if
/// writing the label fails.
pub fn set_exec_label(label: &str) -> Result<()> {
    let thread_self_attr = Path::new("/proc/thread-self/attr/exec");
    let attr_path = if thread_self_attr.exists() {
        thread_self_attr.to_path_buf()
    } else {
        // Fall back to the old convention
        Path::new("/proc/self/task")
            .join(gettid().to_string())
            .join("attr/exec")
    };

    let mut exec_attr = OpenOptions::new()
        .write(true)
        .truncate(true)
        .open(attr_path)?;
    exec_attr
        .write_all(label.as_bytes())
        .with_context(|| "failed to apply SELinux label")?;

    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    const TEST_LABEL: &str = "system_u:system_r:unconfined_t:s0";

    // Probing for SELinux must succeed whether or not SELinux is enabled.
    #[test]
    fn test_is_enabled() {
        let result = is_enabled();
        assert!(result.is_ok(), "Expecting Ok, Got {:?}", result);
    }

    // The context option stands alone for empty mount data and follows a
    // comma when other options are already present.
    #[test]
    fn test_add_mount_label() {
        let cases = vec![
            ("", format!("context=\"{}\"", TEST_LABEL)),
            ("defaults", format!("defaults,context=\"{}\"", TEST_LABEL)),
        ];

        for (initial, expected) in cases {
            let mut data = String::from(initial);
            add_mount_label(&mut data, TEST_LABEL);
            assert_eq!(data, expected);
        }
    }

    // Setting the label is expected to succeed only when SELinux is enabled.
    #[test]
    fn test_set_exec_label() {
        let result = set_exec_label(TEST_LABEL);
        let selinux_on = is_enabled().unwrap();

        if !selinux_on {
            assert!(result.is_err(), "Expecting error, Got {:?}", result);
        } else {
            assert!(result.is_ok(), "Expecting Ok, Got {:?}", result);
        }
    }
}

View File

@ -6,6 +6,7 @@
use crate::container::Config;
use anyhow::{anyhow, Context, Result};
use oci::{Linux, LinuxIdMapping, LinuxNamespace, Spec};
use regex::Regex;
use std::collections::HashMap;
use std::path::{Component, PathBuf};
@ -86,6 +87,23 @@ fn hostname(oci: &Spec) -> Result<()> {
fn security(oci: &Spec) -> Result<()> {
let linux = get_linux(oci)?;
// The sensitivity alternatives must be grouped: `|` has the lowest precedence in a
// regex, so without the parentheses the pattern splits into two independent
// alternatives (`.*_u:.*_r:.*_t:s[0-9]` or `1[0-5].*`) and any string that merely
// contains `10`..`15` would be accepted as a valid label.
let label_pattern = r".*_u:.*_r:.*_t:s([0-9]|1[0-5]).*";
let label_regex = Regex::new(label_pattern)?;
if let Some(ref process) = oci.process {
if !process.selinux_label.is_empty() && !label_regex.is_match(&process.selinux_label) {
return Err(anyhow!(
"SELinux label for the process is invalid format: {}",
&process.selinux_label
));
}
}
if !linux.mount_label.is_empty() && !label_regex.is_match(&linux.mount_label) {
return Err(anyhow!(
"SELinux label for the mount is invalid format: {}",
&linux.mount_label
));
}
if linux.masked_paths.is_empty() && linux.readonly_paths.is_empty() {
return Ok(());
@ -95,8 +113,6 @@ fn security(oci: &Spec) -> Result<()> {
return Err(anyhow!("Linux namespace does not contain mount"));
}
// don't care about selinux at present
Ok(())
}
@ -285,7 +301,7 @@ pub fn validate(conf: &Config) -> Result<()> {
#[cfg(test)]
mod tests {
use super::*;
use oci::Mount;
use oci::{Mount, Process};
#[test]
fn test_namespace() {
@ -388,6 +404,29 @@ mod tests {
];
spec.linux = Some(linux);
security(&spec).unwrap();
// SELinux
let valid_label = "system_u:system_r:container_t:s0:c123,c456";
let mut process = Process::default();
process.selinux_label = valid_label.to_string();
spec.process = Some(process);
security(&spec).unwrap();
let mut linux = Linux::default();
linux.mount_label = valid_label.to_string();
spec.linux = Some(linux);
security(&spec).unwrap();
let invalid_label = "system_u:system_r:container_t";
let mut process = Process::default();
process.selinux_label = invalid_label.to_string();
spec.process = Some(process);
security(&spec).unwrap_err();
let mut linux = Linux::default();
linux.mount_label = invalid_label.to_string();
spec.linux = Some(linux);
security(&spec).unwrap_err();
}
#[test]

View File

@ -171,6 +171,11 @@ DEFDISABLEGUESTEMPTYDIR := false
DEFAULTEXPFEATURES := []
DEFDISABLESELINUX := false
# Default guest SELinux configuration
DEFDISABLEGUESTSELINUX := true
DEFGUESTSELINUXLABEL := system_u:system_r:container_t
#Default SeccompSandbox param
#The same default policy is used by libvirt
#More explanation on https://lists.gnu.org/archive/html/qemu-devel/2017-02/msg03348.html
@ -460,6 +465,8 @@ USER_VARS += DEFNETWORKMODEL_QEMU
USER_VARS += DEFDISABLEGUESTEMPTYDIR
USER_VARS += DEFDISABLEGUESTSECCOMP
USER_VARS += DEFDISABLESELINUX
USER_VARS += DEFDISABLEGUESTSELINUX
USER_VARS += DEFGUESTSELINUXLABEL
USER_VARS += DEFAULTEXPFEATURES
USER_VARS += DEFDISABLEBLOCK
USER_VARS += DEFBLOCKSTORAGEDRIVER_ACRN

View File

@ -76,6 +76,7 @@ type RuntimeConfigInfo struct {
type RuntimeInfo struct {
Config RuntimeConfigInfo
Path string
GuestSeLinuxLabel string
Experimental []exp.Feature
Version RuntimeVersionInfo
Debug bool
@ -186,6 +187,7 @@ func getRuntimeInfo(configFile string, config oci.RuntimeConfig) RuntimeInfo {
SandboxCgroupOnly: config.SandboxCgroupOnly,
Experimental: config.Experimental,
DisableGuestSeccomp: config.DisableGuestSeccomp,
GuestSeLinuxLabel: config.GuestSeLinuxLabel,
}
}

View File

@ -38,6 +38,13 @@ image = "@IMAGEPATH@"
# disable applying SELinux on the VMM process (default false)
disable_selinux=@DEFDISABLESELINUX@
# disable applying SELinux on the container process
# If set to false, the type `container_t` is applied to the container process by default.
# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built
# with `SELINUX=yes`.
# (default: true)
disable_guest_selinux=@DEFDISABLEGUESTSELINUX@
# Path to the firmware.
# If you want Cloud Hypervisor to use a specific firmware, set its path below.
# This option is only used when confidential_guest is enabled.
@ -321,6 +328,14 @@ internetworking_model="@DEFNETWORKMODEL_CLH@"
# (default: true)
disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@
# Apply a custom SELinux security policy to the container process inside the VM.
# This is used when you want to apply a type other than the default `container_t`,
# so general users should not uncomment and apply it.
# (format: "user:role:type")
# Note: You cannot specify MCS policy with the label because the sensitivity levels and
# categories are determined automatically by high-level container runtimes such as containerd.
#guest_selinux_label="@DEFGUESTSELINUXLABEL@"
# If enabled, the runtime will create opentracing.io traces and spans.
# (See https://www.jaegertracing.io/docs/getting-started).
# (default: disabled)

View File

@ -438,6 +438,14 @@ valid_entropy_sources = @DEFVALIDENTROPYSOURCES@
# disable applying SELinux on the VMM process (default false)
disable_selinux=@DEFDISABLESELINUX@
# disable applying SELinux on the container process
# If set to false, the type `container_t` is applied to the container process by default.
# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built
# with `SELINUX=yes`.
# (default: true)
disable_guest_selinux=@DEFDISABLEGUESTSELINUX@
[factory]
# VM templating support. Once enabled, new VMs are created from template
# using vm cloning. They will share the same initial kernel, initramfs and
@ -555,6 +563,14 @@ internetworking_model="@DEFNETWORKMODEL_QEMU@"
# (default: true)
disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@
# Apply a custom SELinux security policy to the container process inside the VM.
# This is used when you want to apply a type other than the default `container_t`,
# so general users should not uncomment and apply it.
# (format: "user:role:type")
# Note: You cannot specify MCS policy with the label because the sensitivity levels and
# categories are determined automatically by high-level container runtimes such as containerd.
#guest_selinux_label="@DEFGUESTSELINUXLABEL@"
# If enabled, the runtime will create opentracing.io traces and spans.
# (See https://www.jaegertracing.io/docs/getting-started).
# (default: disabled)

View File

@ -90,6 +90,7 @@ const defaultSevSnpGuest = false
const defaultGuestSwap = false
const defaultRootlessHypervisor = false
const defaultDisableSeccomp = false
const defaultDisableGuestSeLinux = true
const defaultVfioMode = "guest-kernel"
const defaultLegacySerial = false

View File

@ -59,9 +59,9 @@ const (
type tomlConfig struct {
Hypervisor map[string]hypervisor
Agent map[string]agent
Runtime runtime
Image image
Factory factory
Runtime runtime
}
type image struct {
@ -154,6 +154,7 @@ type hypervisor struct {
Rootless bool `toml:"rootless"`
DisableSeccomp bool `toml:"disable_seccomp"`
DisableSeLinux bool `toml:"disable_selinux"`
DisableGuestSeLinux bool `toml:"disable_guest_selinux"`
LegacySerial bool `toml:"use_legacy_serial"`
EnableVCPUsPinning bool `toml:"enable_vcpus_pinning"`
}
@ -164,12 +165,13 @@ type runtime struct {
JaegerUser string `toml:"jaeger_user"`
JaegerPassword string `toml:"jaeger_password"`
VfioMode string `toml:"vfio_mode"`
GuestSeLinuxLabel string `toml:"guest_selinux_label"`
SandboxBindMounts []string `toml:"sandbox_bind_mounts"`
Experimental []string `toml:"experimental"`
Debug bool `toml:"enable_debug"`
Tracing bool `toml:"enable_tracing"`
DisableNewNetNs bool `toml:"disable_new_netns"`
DisableGuestSeccomp bool `toml:"disable_guest_seccomp"`
Debug bool `toml:"enable_debug"`
SandboxCgroupOnly bool `toml:"sandbox_cgroup_only"`
StaticSandboxResourceMgmt bool `toml:"static_sandbox_resource_mgmt"`
EnablePprof bool `toml:"enable_pprof"`
@ -690,6 +692,7 @@ func newFirecrackerHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
TxRateLimiterMaxRate: txRateLimiterMaxRate,
EnableAnnotations: h.EnableAnnotations,
DisableSeLinux: h.DisableSeLinux,
DisableGuestSeLinux: true, // Guest SELinux is not supported in Firecracker
}, nil
}
@ -836,6 +839,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
LegacySerial: h.LegacySerial,
DisableSeLinux: h.DisableSeLinux,
EnableVCPUsPinning: h.EnableVCPUsPinning,
DisableGuestSeLinux: h.DisableGuestSeLinux,
}, nil
}
@ -902,6 +906,7 @@ func newAcrnHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
GuestHookPath: h.guestHookPath(),
DisableSeLinux: h.DisableSeLinux,
EnableAnnotations: h.EnableAnnotations,
DisableGuestSeLinux: true, // Guest SELinux is not supported in ACRN
}, nil
}
@ -1007,6 +1012,7 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
DisableSeccomp: h.DisableSeccomp,
ConfidentialGuest: h.ConfidentialGuest,
DisableSeLinux: h.DisableSeLinux,
DisableGuestSeLinux: h.DisableGuestSeLinux,
NetRateLimiterBwMaxRate: h.getNetRateLimiterBwMaxRate(),
NetRateLimiterBwOneTimeBurst: h.getNetRateLimiterBwOneTimeBurst(),
NetRateLimiterOpsMaxRate: h.getNetRateLimiterOpsMaxRate(),
@ -1230,6 +1236,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
GuestSwap: defaultGuestSwap,
Rootless: defaultRootlessHypervisor,
DisableSeccomp: defaultDisableSeccomp,
DisableGuestSeLinux: defaultDisableGuestSeLinux,
LegacySerial: defaultLegacySerial,
}
}
@ -1317,7 +1324,7 @@ func LoadConfiguration(configPath string, ignoreLogging bool) (resolvedConfigPat
}
config.DisableGuestSeccomp = tomlConf.Runtime.DisableGuestSeccomp
config.GuestSeLinuxLabel = tomlConf.Runtime.GuestSeLinuxLabel
config.StaticSandboxResourceMgmt = tomlConf.Runtime.StaticSandboxResourceMgmt
config.SandboxCgroupOnly = tomlConf.Runtime.SandboxCgroupOnly
config.DisableNewNetNs = tomlConf.Runtime.DisableNewNetNs

View File

@ -554,6 +554,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
VhostUserStorePath: defaultVhostUserStorePath,
VirtioFSCache: defaultVirtioFSCacheMode,
BlockDeviceAIO: defaultBlockDeviceAIO,
DisableGuestSeLinux: defaultDisableGuestSeLinux,
}
expectedAgentConfig := vc.KataAgentConfig{

View File

@ -128,6 +128,9 @@ type RuntimeConfig struct {
//Determines if seccomp should be applied inside guest
DisableGuestSeccomp bool
//SELinux security context applied to the container process inside guest.
GuestSeLinuxLabel string
// Sandbox sizing information which, if provided, indicates the size of
// the sandbox needed for the workload(s)
SandboxCPUs uint32
@ -945,6 +948,8 @@ func SandboxConfig(ocispec specs.Spec, runtime RuntimeConfig, bundlePath, cid st
DisableGuestSeccomp: runtime.DisableGuestSeccomp,
GuestSeLinuxLabel: runtime.GuestSeLinuxLabel,
Experimental: runtime.Experimental,
}

View File

@ -74,6 +74,8 @@ const (
MinHypervisorMemory = 256
defaultMsize9p = 8192
defaultDisableGuestSeLinux = true
)
var (
@ -560,6 +562,9 @@ type HypervisorConfig struct {
// Disable selinux from the hypervisor process
DisableSeLinux bool
// Disable selinux from the container process
DisableGuestSeLinux bool
// Use legacy serial for the guest console
LegacySerial bool

View File

@ -92,22 +92,24 @@ func TestHypervisorConfigValidTemplateConfig(t *testing.T) {
func TestHypervisorConfigDefaults(t *testing.T) {
assert := assert.New(t)
hypervisorConfig := &HypervisorConfig{
KernelPath: fmt.Sprintf("%s/%s", testDir, testKernel),
ImagePath: fmt.Sprintf("%s/%s", testDir, testImage),
HypervisorPath: "",
KernelPath: fmt.Sprintf("%s/%s", testDir, testKernel),
ImagePath: fmt.Sprintf("%s/%s", testDir, testImage),
HypervisorPath: "",
DisableGuestSeLinux: defaultDisableGuestSeLinux,
}
testHypervisorConfigValid(t, hypervisorConfig, true)
hypervisorConfigDefaultsExpected := &HypervisorConfig{
KernelPath: fmt.Sprintf("%s/%s", testDir, testKernel),
ImagePath: fmt.Sprintf("%s/%s", testDir, testImage),
HypervisorPath: "",
NumVCPUs: defaultVCPUs,
MemorySize: defaultMemSzMiB,
DefaultBridges: defaultBridges,
BlockDeviceDriver: defaultBlockDriver,
DefaultMaxVCPUs: defaultMaxVCPUs,
Msize9p: defaultMsize9p,
KernelPath: fmt.Sprintf("%s/%s", testDir, testKernel),
ImagePath: fmt.Sprintf("%s/%s", testDir, testImage),
HypervisorPath: "",
NumVCPUs: defaultVCPUs,
MemorySize: defaultMemSzMiB,
DefaultBridges: defaultBridges,
BlockDeviceDriver: defaultBlockDriver,
DefaultMaxVCPUs: defaultMaxVCPUs,
Msize9p: defaultMsize9p,
DisableGuestSeLinux: defaultDisableGuestSeLinux,
}
assert.Exactly(hypervisorConfig, hypervisorConfigDefaultsExpected)

View File

@ -36,6 +36,7 @@ import (
"context"
"github.com/gogo/protobuf/proto"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"google.golang.org/grpc/codes"
@ -69,6 +70,9 @@ const (
kernelParamDebugConsole = "agent.debug_console"
kernelParamDebugConsoleVPort = "agent.debug_console_vport"
kernelParamDebugConsoleVPortValue = "1026"
// Default SELinux type applied to the container process inside guest
defaultSeLinuxContainerType = "container_t"
)
var (
@ -895,7 +899,7 @@ func (k *kataAgent) removeIgnoredOCIMount(spec *specs.Spec, ignoredMounts map[st
return nil
}
func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, stripVfio bool) {
func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, disableGuestSeLinux bool, guestSeLinuxLabel string, stripVfio bool) error {
// Disable Hooks since they have been handled on the host and there is
// no reason to send them to the agent. It would make no sense to try
// to apply them on the guest.
@ -907,11 +911,34 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, str
grpcSpec.Linux.Seccomp = nil
}
// Disable SELinux inside of the virtual machine, the label will apply
// to the KVM process
// Pass SELinux label for the container process to the agent.
if grpcSpec.Process.SelinuxLabel != "" {
k.Logger().Info("SELinux label from config will be applied to the hypervisor process, not the VM workload")
grpcSpec.Process.SelinuxLabel = ""
if !disableGuestSeLinux {
k.Logger().Info("SELinux label will be applied to the container process inside guest")
var label string
if guestSeLinuxLabel != "" {
label = guestSeLinuxLabel
} else {
label = grpcSpec.Process.SelinuxLabel
}
processContext, err := selinux.NewContext(label)
if err != nil {
return err
}
// Change the type from KVM to container because the type passed from the high-level
// runtime is for KVM process.
if guestSeLinuxLabel == "" {
processContext["type"] = defaultSeLinuxContainerType
}
grpcSpec.Process.SelinuxLabel = processContext.Get()
} else {
k.Logger().Info("Empty SELinux label for the process and the mount because guest SELinux is disabled")
grpcSpec.Process.SelinuxLabel = ""
grpcSpec.Linux.MountLabel = ""
}
}
// By now only CPU constraints are supported
@ -973,6 +1000,8 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, str
}
grpcSpec.Linux.Devices = linuxDevices
}
return nil
}
func (k *kataAgent) handleShm(mounts []specs.Mount, sandbox *Sandbox) {
@ -1256,9 +1285,20 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co
passSeccomp := !sandbox.config.DisableGuestSeccomp && sandbox.seccompSupported
// Currently, guest SELinux can be enabled only when SELinux is enabled on the host side.
if !sandbox.config.HypervisorConfig.DisableGuestSeLinux && !selinux.GetEnabled() {
return nil, fmt.Errorf("Guest SELinux is enabled, but SELinux is disabled on the host side")
}
if sandbox.config.HypervisorConfig.DisableGuestSeLinux && sandbox.config.GuestSeLinuxLabel != "" {
return nil, fmt.Errorf("Custom SELinux security policy is provided, but guest SELinux is disabled")
}
// We need to constrain the spec to make sure we're not
// passing irrelevant information to the agent.
k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.VfioMode == config.VFIOModeGuestKernel)
err = k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.HypervisorConfig.DisableGuestSeLinux, sandbox.config.GuestSeLinuxLabel, sandbox.config.VfioMode == config.VFIOModeGuestKernel)
if err != nil {
return nil, err
}
req := &grpc.CreateContainerRequest{
ContainerId: c.id,

View File

@ -619,7 +619,7 @@ func TestConstrainGRPCSpec(t *testing.T) {
}
k := kataAgent{}
k.constrainGRPCSpec(g, true, true)
k.constrainGRPCSpec(g, true, true, "", true)
// Check nil fields
assert.Nil(g.Hooks)

View File

@ -189,6 +189,7 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) {
SystemdCgroup: sconfig.SystemdCgroup,
SandboxCgroupOnly: sconfig.SandboxCgroupOnly,
DisableGuestSeccomp: sconfig.DisableGuestSeccomp,
GuestSeLinuxLabel: sconfig.GuestSeLinuxLabel,
}
ss.Config.SandboxBindMounts = append(ss.Config.SandboxBindMounts, sconfig.SandboxBindMounts...)
@ -429,6 +430,7 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) {
SystemdCgroup: savedConf.SystemdCgroup,
SandboxCgroupOnly: savedConf.SandboxCgroupOnly,
DisableGuestSeccomp: savedConf.DisableGuestSeccomp,
GuestSeLinuxLabel: savedConf.GuestSeLinuxLabel,
}
sconfig.SandboxBindMounts = append(sconfig.SandboxBindMounts, savedConf.SandboxBindMounts...)

View File

@ -243,19 +243,6 @@ type ContainerConfig struct {
// SandboxConfig is a sandbox configuration.
// Refs: virtcontainers/sandbox.go:SandboxConfig
type SandboxConfig struct {
// Information for fields not saved:
// * Annotation: this is kind of casual data, we don't need casual data in persist file,
// if you know this data needs to persist, please gives it
// a specific field
ContainerConfigs []ContainerConfig
// SandboxBindMounts - list of paths to mount into guest
SandboxBindMounts []string
// Experimental enables experimental features
Experimental []string
// Cgroups specifies specific cgroup settings for the various subsystems that the container is
// placed into to limit the resources the container has available
Cgroups *configs.Cgroup `json:"cgroups"`
@ -265,8 +252,24 @@ type SandboxConfig struct {
KataShimConfig *ShimConfig
HypervisorType string
NetworkConfig NetworkConfig
// Custom SELinux security policy to the container process inside the VM
GuestSeLinuxLabel string
HypervisorType string
// SandboxBindMounts - list of paths to mount into guest
SandboxBindMounts []string
// Experimental enables experimental features
Experimental []string
// Information for fields not saved:
// * Annotation: this is kind of casual data, we don't need casual data in persist file,
// if you know this data needs to persist, please gives it a specific field
ContainerConfigs []ContainerConfig
NetworkConfig NetworkConfig
HypervisorConfig HypervisorConfig
ShmSize uint64

View File

@ -247,6 +247,9 @@ const (
// DisableGuestSeccomp is a sandbox annotation that determines if seccomp should be applied inside guest.
DisableGuestSeccomp = kataAnnotRuntimePrefix + "disable_guest_seccomp"
// GuestSeLinuxLabel is a SELinux security policy that is applied to a container process inside guest.
GuestSeLinuxLabel = kataAnnotRuntimePrefix + "guest_selinux_label"
// SandboxCgroupOnly is a sandbox annotation that determines if kata processes are managed only in sandbox cgroup.
SandboxCgroupOnly = kataAnnotRuntimePrefix + "sandbox_cgroup_only"

View File

@ -181,6 +181,15 @@ func (q *qemu) kernelParameters() string {
// set the maximum number of vCPUs
params = append(params, Param{"nr_cpus", fmt.Sprintf("%d", q.config.DefaultMaxVCPUs)})
// set the SELinux params in accordance with the runtime configuration, disable_guest_selinux.
if q.config.DisableGuestSeLinux {
q.Logger().Info("Set selinux=0 to kernel params because SELinux on the guest is disabled")
params = append(params, Param{"selinux", "0"})
} else {
q.Logger().Info("Set selinux=1 to kernel params because SELinux on the guest is enabled")
params = append(params, Param{"selinux", "1"})
}
// add the params specified by the provided config. As the kernel
// honours the last parameter value set and since the config-provided
// params are added here, they will take priority over the defaults.
@ -476,6 +485,13 @@ func (q *qemu) createVirtiofsDaemon(sharedPath string) (VirtiofsDaemon, error) {
return nd, nil
}
// Set the xattr option for virtiofsd daemon to enable extended attributes
// in virtiofs if SELinux on the guest side is enabled.
if !q.config.DisableGuestSeLinux {
q.Logger().Info("Set the xattr option for virtiofsd")
q.config.VirtioFSExtraArgs = append(q.config.VirtioFSExtraArgs, "-o", "xattr")
}
// default use virtiofsd
return &virtiofsd{
path: q.config.VirtioFSDaemon,
@ -846,7 +862,6 @@ func (q *qemu) StartVM(ctx context.Context, timeout int) error {
// the SELinux label. If these processes require privileges, we do
// not want to run them under confinement.
if !q.config.DisableSeLinux {
if err := label.SetProcessLabel(q.config.SELinuxProcessLabel); err != nil {
return err
}

View File

@ -27,15 +27,16 @@ import (
func newQemuConfig() HypervisorConfig {
return HypervisorConfig{
KernelPath: testQemuKernelPath,
InitrdPath: testQemuInitrdPath,
HypervisorPath: testQemuPath,
NumVCPUs: defaultVCPUs,
MemorySize: defaultMemSzMiB,
DefaultBridges: defaultBridges,
BlockDeviceDriver: defaultBlockDriver,
DefaultMaxVCPUs: defaultMaxVCPUs,
Msize9p: defaultMsize9p,
KernelPath: testQemuKernelPath,
InitrdPath: testQemuInitrdPath,
HypervisorPath: testQemuPath,
NumVCPUs: defaultVCPUs,
MemorySize: defaultMemSzMiB,
DefaultBridges: defaultBridges,
BlockDeviceDriver: defaultBlockDriver,
DefaultMaxVCPUs: defaultMaxVCPUs,
Msize9p: defaultMsize9p,
DisableGuestSeLinux: defaultDisableGuestSeLinux,
}
}
@ -58,7 +59,7 @@ func testQemuKernelParameters(t *testing.T, kernelParams []Param, expected strin
}
func TestQemuKernelParameters(t *testing.T) {
expectedOut := fmt.Sprintf("panic=1 nr_cpus=%d foo=foo bar=bar", govmm.MaxVCPUs())
expectedOut := fmt.Sprintf("panic=1 nr_cpus=%d selinux=0 foo=foo bar=bar", govmm.MaxVCPUs())
params := []Param{
{
Key: "foo",

View File

@ -126,14 +126,17 @@ type SandboxResourceSizing struct {
// SandboxConfig is a Sandbox configuration.
type SandboxConfig struct {
// Volumes is a list of shared volumes between the host and the Sandbox.
Volumes []types.Volume
// Annotations keys must be unique strings and must be name-spaced
Annotations map[string]string
// Containers describe the list of containers within a Sandbox.
// This list can be empty and populated by adding containers
// to the Sandbox a posteriori.
//TODO: this should be a map to avoid duplicated containers
Containers []ContainerConfig
// Custom SELinux security policy to the container process inside the VM
GuestSeLinuxLabel string
HypervisorType HypervisorType
ID string
Hostname string
// SandboxBindMounts - list of paths to mount into guest
SandboxBindMounts []string
@ -141,31 +144,29 @@ type SandboxConfig struct {
// Experimental features enabled
Experimental []exp.Feature
// Annotations keys must be unique strings and must be name-spaced
// with e.g. reverse domain notation (org.clearlinux.key).
Annotations map[string]string
// Containers describe the list of containers within a Sandbox.
// This list can be empty and populated by adding containers
// to the Sandbox a posteriori.
// TODO: this should be a map to avoid duplicated containers
Containers []ContainerConfig
ID string
Hostname string
HypervisorType HypervisorType
AgentConfig KataAgentConfig
Volumes []types.Volume
NetworkConfig NetworkConfig
AgentConfig KataAgentConfig
HypervisorConfig HypervisorConfig
SandboxResources SandboxResourceSizing
// StaticResourceMgmt indicates if the shim should rely on statically sizing the sandbox (VM)
StaticResourceMgmt bool
ShmSize uint64
SandboxResources SandboxResourceSizing
VfioMode config.VFIOModeType
// StaticResourceMgmt indicates if the shim should rely on statically sizing the sandbox (VM)
StaticResourceMgmt bool
// SharePidNs sets all containers to share the same sandbox level pid namespace.
SharePidNs bool

View File

@ -64,6 +64,8 @@ readonly -a systemd_files=(
# Set a default value
AGENT_INIT=${AGENT_INIT:-no}
SELINUX=${SELINUX:-no}
SELINUXFS="/sys/fs/selinux"
# Align image to 128M
readonly mem_boundary_mb=128
@ -93,6 +95,10 @@ Extra environment variables:
DEFAULT: not set
USE_PODMAN: If set and USE_DOCKER not set, will build image in a Podman Container (requires podman)
DEFAULT: not set
SELINUX: If set to "yes", the rootfs is labeled for SELinux.
Make sure that selinuxfs is mounted to /sys/fs/selinux on the host
and the rootfs is built with SELINUX=yes.
DEFAULT value: "no"
Following diagram shows how the resulting image will look like
@ -134,6 +140,7 @@ build_with_container() {
local nsdax_bin="$9"
local container_image_name="image-builder-osbuilder"
local shared_files=""
local selinuxfs=""
image_dir=$(readlink -f "$(dirname "${image}")")
image_name=$(basename "${image}")
@ -157,6 +164,14 @@ build_with_container() {
shared_files+="-v ${mke2fs_conf}:${mke2fs_conf}:ro "
fi
# A SELinux-labelled image can only be produced when selinuxfs is mounted on
# the host. If it is, prepare the bind-mount argument that exposes it to the
# build container at the same path; otherwise abort with a hint.
# The path is quoted so the check never undergoes word splitting or globbing.
if [ "${SELINUX}" == "yes" ]; then
	if mountpoint "${SELINUXFS}" > /dev/null; then
		selinuxfs="-v ${SELINUXFS}:${SELINUXFS}"
	else
		die "Make sure that SELinux is enabled on the host"
	fi
fi
#Make sure we use a compatible runtime to build rootfs
# In case Clear Containers Runtime is installed we dont want to hit issue:
#https://github.com/clearcontainers/runtime/issues/828
@ -170,12 +185,14 @@ build_with_container() {
--env BLOCK_SIZE="${block_size}" \
--env ROOT_FREE_SPACE="${root_free_space}" \
--env NSDAX_BIN="${nsdax_bin}" \
--env SELINUX="${SELINUX}" \
--env DEBUG="${DEBUG}" \
-v /dev:/dev \
-v "${script_dir}":"/osbuilder" \
-v "${script_dir}/../scripts":"/scripts" \
-v "${rootfs}":"/rootfs" \
-v "${image_dir}":"/image" \
${selinuxfs} \
${shared_files} \
${container_image_name} \
bash "/osbuilder/${script_name}" -o "/image/${image_name}" /rootfs
@ -384,6 +401,7 @@ create_rootfs_image() {
local img_size="$3"
local fs_type="$4"
local block_size="$5"
local agent_bin="$6"
create_disk "${image}" "${img_size}" "${fs_type}" "${rootfs_start}"
@ -402,6 +420,31 @@ create_rootfs_image() {
info "Copying content from rootfs to root partition"
cp -a "${rootfs}"/* "${mount_dir}"
# Label the freshly copied rootfs for SELinux when SELINUX=yes.
# Labeling is refused for agent-init images, and requires both selinuxfs on
# the host and a restorecon binary inside the rootfs.
if [ "${SELINUX}" == "yes" ]; then
	if [ "${AGENT_INIT}" == "yes" ]; then
		die "Guest SELinux with the agent init is not supported yet"
	fi

	info "Labeling rootfs for SELinux"
	# Mount point for selinuxfs inside the image being built.
	selinuxfs_path="${mount_dir}${SELINUXFS}"
	# All path expansions below are quoted to avoid word splitting/globbing.
	mkdir -p "${selinuxfs_path}"
	if mountpoint "${SELINUXFS}" > /dev/null && \
		chroot "${mount_dir}" command -v restorecon > /dev/null; then
		# selinuxfs is mounted inside the chroot only for the duration of the
		# labeling and is excluded from the relabel itself (-e).
		mount -t selinuxfs selinuxfs "${selinuxfs_path}"
		chroot "${mount_dir}" restorecon -RF -e "${SELINUXFS}" /
		# TODO: This operation will be removed after the updated container-selinux that
		# includes the following commit is released.
		# https://github.com/containers/container-selinux/commit/39f83cc74d50bd10ab6be4d0bdd98bc04857469f
		# We use chcon as an interim solution until then.
		chroot "${mount_dir}" chcon -t container_runtime_exec_t "/usr/bin/${agent_bin}"
		umount "${selinuxfs_path}"
	else
		die "Could not label the rootfs. Make sure that SELinux is enabled on the host \
and the rootfs is built with SELINUX=yes"
	fi
fi
sync
OK "rootfs copied"
@ -529,7 +572,7 @@ main() {
# consider in calculate_img_size
rootfs_img_size=$((img_size - dax_header_sz))
create_rootfs_image "${rootfs}" "${image}" "${rootfs_img_size}" \
"${fs_type}" "${block_size}"
"${fs_type}" "${block_size}" "${agent_bin}"
# insert at the beginning of the image the MBR + DAX header
set_dax_header "${image}" "${img_size}" "${fs_type}" "${nsdax_bin}"

View File

@ -8,10 +8,15 @@ OS_VERSION=${OS_VERSION:-stream9}
PACKAGES="chrony iptables"
[ "$AGENT_INIT" = no ] && PACKAGES+=" systemd"
[ "$SECCOMP" = yes ] && PACKAGES+=" libseccomp"
[ "$SELINUX" = yes ] && PACKAGES+=" container-selinux"
# Container registry tag is different from metalink repo, e.g. "stream9" => "9-stream"
os_repo_version="$(sed -E "s/(stream)(.+)/\2-\1/" <<< "$OS_VERSION")"
METALINK="https://mirrors.centos.org/metalink?repo=centos-baseos-$os_repo_version&arch=\$basearch"
if [ "$SELINUX" == yes ]; then
# AppStream repository is required for the container-selinux package
METALINK_APPSTREAM="https://mirrors.centos.org/metalink?repo=centos-appstream-$os_repo_version&arch=\$basearch"
fi
GPG_KEY_FILE=RPM-GPG-KEY-CentOS-Official
GPG_KEY_URL="https://centos.org/keys/$GPG_KEY_FILE"

View File

@ -25,6 +25,7 @@ LIBC=${LIBC:-musl}
# The kata agent enables seccomp feature.
# However, it is not enforced by default: you need to enable that in the main configuration file.
SECCOMP=${SECCOMP:-"yes"}
SELINUX=${SELINUX:-"no"}
lib_file="${script_dir}/../scripts/lib.sh"
source "$lib_file"
@ -142,6 +143,11 @@ ROOTFS_DIR Path to the directory that is populated with the rootfs.
SECCOMP When set to "no", the kata-agent is built without seccomp capability.
Default value: "yes"
SELINUX When set to "yes", build the rootfs with the required packages to
enable SELinux in the VM.
Make sure the guest kernel is compiled with SELinux enabled.
Default value: "no"
USE_DOCKER If set, build the rootfs inside a container (requires
Docker).
Default value: <not set>
@ -346,6 +352,15 @@ build_rootfs_distro()
echo "Required rust version: $RUST_VERSION"
# Guest SELinux prerequisites: it cannot be combined with the agent acting as
# init, and the rootfs distro must be CentOS. Bail out early otherwise.
if [ "${SELINUX}" == "yes" ]; then
	[ "${AGENT_INIT}" != "yes" ] || \
		die "Guest SELinux with the agent init is not supported yet"
	[ "${distro}" == "centos" ] || \
		die "The guest rootfs must be CentOS to enable guest SELinux"
fi
if [ -z "${USE_DOCKER}" ] && [ -z "${USE_PODMAN}" ]; then
info "build directly"
build_rootfs ${ROOTFS_DIR}
@ -426,6 +441,7 @@ build_rootfs_distro()
--env OS_VERSION="${OS_VERSION}" \
--env INSIDE_CONTAINER=1 \
--env SECCOMP="${SECCOMP}" \
--env SELINUX="${SELINUX}" \
--env DEBUG="${DEBUG}" \
--env HOME="/root" \
-v "${repo_dir}":"/kata-containers" \

View File

@ -79,7 +79,23 @@ gpgcheck=1
gpgkey=file://${CONFIG_DIR}/${GPG_KEY_FILE}
EOF
fi
# For SELinux-enabled builds, add the AppStream repository, which is needed
# for the container-selinux package.
# The section is APPENDED (>>) to ${DNF_CONF}: truncating the file here would
# discard the configuration already written earlier in this function.
if [ "$SELINUX" == "yes" ]; then
	cat >> "${DNF_CONF}" << EOF
[appstream]
name=${OS_NAME}-${OS_VERSION} upstream
releasever=${OS_VERSION}
EOF
	echo "metalink=$METALINK_APPSTREAM" >> "$DNF_CONF"
	if [ -n "$GPG_KEY_URL" ]; then
		# Fetch the GPG key only if it has not been downloaded already.
		if [ ! -f "${CONFIG_DIR}/${GPG_KEY_FILE}" ]; then
			curl -L "${GPG_KEY_URL}" -o "${CONFIG_DIR}/${GPG_KEY_FILE}"
		fi
		cat >> "${DNF_CONF}" << EOF
gpgcheck=1
gpgkey=file://${CONFIG_DIR}/${GPG_KEY_FILE}
EOF
	fi
fi
}
build_rootfs()

View File

@ -0,0 +1,12 @@
# SELinux support:
# Audit support, used by SELinux to report denials.
CONFIG_AUDIT=y
CONFIG_AUDITSYSCALL=y
# Low virtual memory protected from userspace mappings by the LSM.
# The kernel Kconfig help recommends 65536 for architectures with a large
# address space; a smaller value leaves most of the low pages unprotected.
CONFIG_LSM_MMAP_MIN_ADDR=65536
CONFIG_NETWORK_SECMARK=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
# Allows SELinux to be toggled from the kernel command line (selinux=0/1).
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
# Allows permissive mode (enforcing=0).
CONFIG_SECURITY_SELINUX_DEVELOP=y
CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=0
CONFIG_SECURITY_SELINUX_SIDTAB_HASH_BITS=9
CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE=256