diff --git a/src/runtime/cli/config/configuration-qemu.toml.in b/src/runtime/cli/config/configuration-qemu.toml.in index b195701c89..8975206b43 100644 --- a/src/runtime/cli/config/configuration-qemu.toml.in +++ b/src/runtime/cli/config/configuration-qemu.toml.in @@ -16,6 +16,14 @@ kernel = "@KERNELPATH@" image = "@IMAGEPATH@" machine_type = "@MACHINETYPE@" +# Enable confidential guest support. +# Toggling that setting may trigger different hardware features, ranging +# from memory encryption to both memory and CPU-state encryption and integrity. +# The Kata Containers runtime dynamically detects the available feature set and +# aims at enabling the largest possible one. +# Default false +# confidential_guest = true + # List of valid annotation names for the hypervisor # Each member of the list is a regular expression, which is the base name # of the annotation, e.g. "path" for io.katacontainers.config.hypervisor.path" @@ -532,3 +540,30 @@ experimental=@DEFAULTEXPFEATURES@ # If enabled, user can run pprof tools with shim v2 process through kata-monitor. # (default: false) # enable_pprof = true + +# WARNING: All the options in the following section have not been implemented yet. +# This section was added as a placeholder. DO NOT USE IT! +[image] +# Container image service. +# +# Offload the CRI image management service to the Kata agent. +# (default: false) +#service_offload = true + +# Container image decryption keys provisioning. +# Applies only if service_offload is true. +# Keys can be provisioned locally (e.g. through a special command or +# a local file) or remotely (usually after the guest is remotely attested). +# The provision setting is a complete URL that lets the Kata agent decide +# which method to use in order to fetch the keys. +# +# Keys can be stored in a local file, in a measured and attested initrd: +#provision=data:///local/key/file +# +# Keys could be fetched through a special command or binary from the +# initrd (guest) image, e.g. 
a firmware call: +#provision=file:///path/to/bin/fetcher/in/guest +# +# Keys can be remotely provisioned. The Kata agent fetches them from e.g. +# an HTTPS URL: +#provision=https://my-key-broker.foo/tenant/ diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index 7cd9138baa..470527998f 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -54,6 +54,7 @@ const defaultDisableImageNvdimm = false const defaultVhostUserStorePath string = "/var/run/kata-containers/vhost-user/" const defaultRxRateLimiterMaxRate = uint64(0) const defaultTxRateLimiterMaxRate = uint64(0) +const defaultConfidentialGuest = false var defaultSGXEPCSize = int64(0) diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index e3f3b3bc9c..e1db0da8c0 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2018-2021 Intel Corporation // Copyright (c) 2018 HyperHQ Inc. 
// // SPDX-License-Identifier: Apache-2.0 @@ -61,6 +61,12 @@ type tomlConfig struct { Runtime runtime Factory factory Netmon netmon + Image image +} + +type image struct { + ServiceOffload bool `toml:"service_offload"` + Provision string `toml:"provision"` } type factory struct { @@ -130,6 +136,7 @@ type hypervisor struct { HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` DisableVhostNet bool `toml:"disable_vhost_net"` GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` + ConfidentialGuest bool `toml:"confidential_guest"` } type runtime struct { @@ -702,6 +709,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { EnableAnnotations: h.EnableAnnotations, GuestMemoryDumpPath: h.GuestMemoryDumpPath, GuestMemoryDumpPaging: h.GuestMemoryDumpPaging, + ConfidentialGuest: h.ConfidentialGuest, }, nil } @@ -1055,6 +1063,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig { RxRateLimiterMaxRate: defaultRxRateLimiterMaxRate, TxRateLimiterMaxRate: defaultTxRateLimiterMaxRate, SGXEPCSize: defaultSGXEPCSize, + ConfidentialGuest: defaultConfidentialGuest, } } diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 767215b689..0b8cac7761 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -453,6 +453,11 @@ type HypervisorConfig struct { // GuestMemoryDumpPaging is used to indicate if enable paging // for QEMU dump-guest-memory command GuestMemoryDumpPaging bool + + // Enable confidential guest support. + // Enable or disable different hardware features, ranging + // from memory encryption to both memory and CPU-state encryption and integrity. + ConfidentialGuest bool } // vcpu mapping from vcpu number to thread number @@ -717,21 +722,16 @@ func getHostMemorySizeKb(memInfoPath string) (uint64, error) { return 0, fmt.Errorf("unable get MemTotal from %s", memInfoPath) } -// RunningOnVMM checks if the system is running inside a VM. 
-func RunningOnVMM(cpuInfoPath string) (bool, error) { - if runtime.GOARCH == "arm64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x" { - virtLog.Info("Unable to know if the system is running inside a VM") - return false, nil - } - +func CPUFlags(cpuInfoPath string) (map[string]bool, error) { flagsField := "flags" f, err := os.Open(cpuInfoPath) if err != nil { - return false, err + return map[string]bool{}, err } defer f.Close() + flags := make(map[string]bool) scanner := bufio.NewScanner(f) for scanner.Scan() { // Expected format: ["flags", ":", ...] or ["flags:", ...] @@ -745,23 +745,31 @@ func RunningOnVMM(cpuInfoPath string) (bool, error) { } for _, field := range fields[1:] { - if field == "hypervisor" { - return true, nil - } + flags[field] = true } - // As long as we have been able to analyze the fields from - // "flags", there is no reason to check what comes next from - // /proc/cpuinfo, because we already know we are not running - // on a VMM. - return false, nil + return flags, nil } if err := scanner.Err(); err != nil { - return false, err + return map[string]bool{}, err } - return false, fmt.Errorf("Couldn't find %q from %q output", flagsField, cpuInfoPath) + return map[string]bool{}, fmt.Errorf("Couldn't find %q from %q output", flagsField, cpuInfoPath) +} + +// RunningOnVMM checks if the system is running inside a VM. 
+func RunningOnVMM(cpuInfoPath string) (bool, error) { + if runtime.GOARCH == "amd64" { + flags, err := CPUFlags(cpuInfoPath) + if err != nil { + return false, err + } + return flags["hypervisor"], nil + } + + virtLog.WithField("arch", runtime.GOARCH).Info("Unable to know if the system is running inside a VM") + return false, nil } func getHypervisorPid(h hypervisor) int { diff --git a/src/runtime/virtcontainers/hypervisor_amd64.go b/src/runtime/virtcontainers/hypervisor_amd64.go new file mode 100644 index 0000000000..4b75a08cfd --- /dev/null +++ b/src/runtime/virtcontainers/hypervisor_amd64.go @@ -0,0 +1,25 @@ +// Copyright (c) 2021 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +package virtcontainers + +import "os" + +// Implementation of this function is architecture specific +func availableGuestProtection() (guestProtection, error) { + flags, err := CPUFlags(procCPUInfo) + if err != nil { + return noneProtection, err + } + + // TDX is supported and properly loaded when the firmware directory exists or `tdx` is part of the CPU flags + if d, err := os.Stat(tdxSysFirmwareDir); (err == nil && d.IsDir()) || flags[tdxCPUFlag] { + return tdxProtection, nil + } + + // TODO: Add support for other technologies: SEV + + return noneProtection, nil +} diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index c5b26facfc..cf1c4377ee 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -605,6 +605,11 @@ func (q *qemu) createSandbox(ctx context.Context, id string, networkNS NetworkNa PidFile: filepath.Join(q.store.RunVMStoragePath(), q.id, "pid"), } + qemuConfig.Devices, qemuConfig.Bios, err = q.arch.appendProtectionDevice(qemuConfig.Devices, firmwarePath) + if err != nil { + return err + } + if ioThread != nil { qemuConfig.IOThreads = []govmmQemu.IOThread{*ioThread} } diff --git a/src/runtime/virtcontainers/qemu_amd64.go b/src/runtime/virtcontainers/qemu_amd64.go index 
1a045fae08..37b0748b31 100644 --- a/src/runtime/virtcontainers/qemu_amd64.go +++ b/src/runtime/virtcontainers/qemu_amd64.go @@ -11,6 +11,7 @@ import ( "time" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" + "github.com/sirupsen/logrus" govmmQemu "github.com/kata-containers/govmm/qemu" ) @@ -20,6 +21,8 @@ type qemuAmd64 struct { qemuArchBase vmFactory bool + + devLoadersCount uint32 } const ( @@ -30,6 +33,10 @@ const ( defaultQemuMachineOptions = "accel=kvm,kernel_irqchip" qmpMigrationWaitTimeout = 5 * time.Second + + tdxSysFirmwareDir = "/sys/firmware/tdx_seam/" + + tdxCPUFlag = "tdx" ) var qemuPaths = map[string]string{ @@ -106,17 +113,17 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) { factory = true } - if config.IOMMU { - var q35QemuIOMMUOptions = "accel=kvm,kernel_irqchip=split" + // IOMMU and Guest Protection require a split IRQ controller for handling interrupts + // otherwise QEMU won't be able to create the kernel irqchip + if config.IOMMU || config.ConfidentialGuest { + mp.Options = "accel=kvm,kernel_irqchip=split" + } + if config.IOMMU { kernelParams = append(kernelParams, Param{"intel_iommu", "on"}) kernelParams = append(kernelParams, Param{"iommu", "pt"}) - - if mp.Type == QemuQ35 { - mp.Options = q35QemuIOMMUOptions - } } q := &qemuAmd64{ @@ -129,10 +136,17 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) { kernelParams: kernelParams, disableNvdimm: config.DisableImageNvdimm, dax: true, + protection: noneProtection, }, vmFactory: factory, } + if config.ConfidentialGuest { + if err := q.enableProtection(); err != nil { + return nil, err + } + } + q.handleImagePath(config) return q, nil @@ -191,3 +205,55 @@ func (q *qemuAmd64) appendImage(ctx context.Context, devices []govmmQemu.Device, func (q *qemuAmd64) appendBridges(devices []govmmQemu.Device) []govmmQemu.Device { return genericAppendBridges(devices, q.Bridges, q.qemuMachine.Type) } + +// enable protection +func (q *qemuAmd64) 
enableProtection() error { + var err error + q.protection, err = availableGuestProtection() + if err != nil { + return err + } + + switch q.protection { + case tdxProtection: + if q.qemuMachine.Options != "" { + q.qemuMachine.Options += "," + } + q.qemuMachine.Options += "kvm-type=tdx,confidential-guest-support=tdx" + q.kernelParams = append(q.kernelParams, Param{"tdx_guest", ""}) + virtLog.WithFields(logrus.Fields{ + "subsystem": "qemuAmd64", + "machine": q.qemuMachine, + "kernel-params": q.kernelParams}). + Info("Enabling TDX guest protection") + return nil + + // TODO: Add support for other x86_64 technologies: SEV + + default: + return fmt.Errorf("This system doesn't support Confidential Computing (Guest Protection)") + } +} + +// appendProtectionDevice appends the TDX guest loader object when TDX protection is active and returns the path to use with -bios (empty for TDX, the unchanged firmware path when no protection is active) +func (q *qemuAmd64) appendProtectionDevice(devices []govmmQemu.Device, firmware string) ([]govmmQemu.Device, string, error) { + switch q.protection { + case tdxProtection: + id := q.devLoadersCount + q.devLoadersCount++ + return append(devices, + govmmQemu.Object{ + Driver: govmmQemu.Loader, + Type: govmmQemu.TDXGuest, + ID: "tdx", + DeviceID: fmt.Sprintf("fd%d", id), + Debug: false, + File: firmware, + }), "", nil + case noneProtection: + return devices, firmware, nil + + default: + return devices, "", fmt.Errorf("Unsupported guest protection technology: %v", q.protection) + } +} diff --git a/src/runtime/virtcontainers/qemu_amd64_test.go b/src/runtime/virtcontainers/qemu_amd64_test.go index 1d321e9353..8772361cba 100644 --- a/src/runtime/virtcontainers/qemu_amd64_test.go +++ b/src/runtime/virtcontainers/qemu_amd64_test.go @@ -276,3 +276,53 @@ func TestQemuAmd64Microvm(t *testing.T) { assert.False(amd64.supportGuestMemoryHotplug()) } + +func TestQemuAmd64AppendProtectionDevice(t *testing.T) { + var devices []govmmQemu.Device + assert := assert.New(t) + + amd64 := newTestQemu(assert, QemuPC) + + id := amd64.(*qemuAmd64).devLoadersCount + firmware := "tdvf.fd" + var bios string + var err error + devices, 
bios, err = amd64.appendProtectionDevice(devices, firmware) + assert.NoError(err) + + // non-protection + assert.NotEmpty(bios) + + // pef protection + amd64.(*qemuAmd64).protection = pefProtection + devices, bios, err = amd64.appendProtectionDevice(devices, firmware) + assert.Error(err) + assert.Empty(bios) + + // sev protection + // TODO: update once it's supported + amd64.(*qemuAmd64).protection = sevProtection + devices, bios, err = amd64.appendProtectionDevice(devices, firmware) + assert.Error(err) + assert.Empty(bios) + + // tdxProtection + amd64.(*qemuAmd64).protection = tdxProtection + + devices, bios, err = amd64.appendProtectionDevice(devices, firmware) + assert.NoError(err) + assert.Empty(bios) + + expectedOut := []govmmQemu.Device{ + govmmQemu.Object{ + Driver: govmmQemu.Loader, + Type: govmmQemu.TDXGuest, + ID: "tdx", + DeviceID: fmt.Sprintf("fd%d", id), + Debug: false, + File: firmware, + }, + } + + assert.Equal(expectedOut, devices) +} diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index d2ffac4a1d..5c7b1218c4 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -11,6 +11,7 @@ import ( "errors" "fmt" "os" + "runtime" "strconv" "strings" @@ -142,8 +143,33 @@ type qemuArch interface { // append pvpanic device appendPVPanicDevice(devices []govmmQemu.Device) ([]govmmQemu.Device, error) + + // append protection device. + // This implementation is architecture specific, some archs may need + // a firmware, returns a string containing the path to the firmware that should + // be used with the -bios option, omit -bios option if the path is empty. 
+ appendProtectionDevice(devices []govmmQemu.Device, firmware string) ([]govmmQemu.Device, string, error) } +// guestProtection enumerates the kinds of guest protection technology +type guestProtection uint8 + +const ( + noneProtection guestProtection = iota + + // Intel Trust Domain Extensions + // https://software.intel.com/content/www/us/en/develop/articles/intel-trust-domain-extensions.html + tdxProtection + + // AMD Secure Encrypted Virtualization + // https://developer.amd.com/sev/ + sevProtection + + // IBM POWER 9 Protected Execution Facility + // https://www.kernel.org/doc/html/latest/powerpc/ultravisor.html + pefProtection +) + type qemuArchBase struct { qemuMachine govmmQemu.Machine qemuExePath string @@ -158,6 +184,7 @@ type qemuArchBase struct { kernelParams []Param Bridges []types.Bridge PFlash []string + protection guestProtection } const ( @@ -813,3 +840,9 @@ func (q *qemuArchBase) getPFlash() ([]string, error) { func (q *qemuArchBase) setPFlash(p []string) { q.PFlash = p } + +// appendProtectionDevice is the generic fallback: it logs a warning and returns devices and firmware unchanged +func (q *qemuArchBase) appendProtectionDevice(devices []govmmQemu.Device, firmware string) ([]govmmQemu.Device, string, error) { + virtLog.WithField("arch", runtime.GOARCH).Warnf("Confidential Computing has not been implemented for this architecture") + return devices, firmware, nil +}