Merge pull request #1590 from devimc/2021-02-02/ConfidentialComputing

Support TDx
2025-08-02 08:17:01 +00:00 · 2021-05-10 22:19:40 +02:00 · 2021-05-10 22:19:40 +02:00 · 2c4e4ca1ac
commit 2c4e4ca1ac
parent becd270ccf 4f61f4b490
9 changed files with 257 additions and 25 deletions
--- a/src/runtime/cli/config/configuration-qemu.toml.in
+++ b/src/runtime/cli/config/configuration-qemu.toml.in
@ -16,6 +16,14 @@ kernel = "@KERNELPATH@"
 image = "@IMAGEPATH@"
 machine_type = "@MACHINETYPE@"

+# Enable confidential guest support.
+# Toggling this setting may trigger different hardware features, ranging
+# from memory encryption to both memory and CPU-state encryption and integrity.
+# The Kata Containers runtime dynamically detects the available feature set and
+# aims at enabling the largest possible one.
+# Default false
+# confidential_guest = true
+
 # List of valid annotation names for the hypervisor
 # Each member of the list is a regular expression, which is the base name
 # of the annotation, e.g. "path" for "io.katacontainers.config.hypervisor.path"
@ -532,3 +540,30 @@ experimental=@DEFAULTEXPFEATURES@
 # If enabled, user can run pprof tools with shim v2 process through kata-monitor.
 # (default: false)
 # enable_pprof = true
+
+# WARNING: All the options in the following section have not been implemented yet.
+# This section was added as a placeholder. DO NOT USE IT!
+[image]
+# Container image service.
+#
+# Offload the CRI image management service to the Kata agent.
+# (default: false)
+#service_offload = true
+
+# Container image decryption keys provisioning.
+# Applies only if service_offload is true.
+# Keys can be provisioned locally (e.g. through a special command or
+# a local file) or remotely (usually after the guest is remotely attested).
+# The provision setting is a complete URL that lets the Kata agent decide
+# which method to use in order to fetch the keys.
+#
+# Keys can be stored in a local file, in a measured and attested initrd:
+#provision=data:///local/key/file
+#
+# Keys could be fetched through a special command or binary from the
+# initrd (guest) image, e.g. a firmware call:
+#provision=file:///path/to/bin/fetcher/in/guest
+#
+# Keys can be remotely provisioned. The Kata agent fetches them from e.g.
+# an HTTPS URL:
+#provision=https://my-key-broker.foo/tenant/<tenant-id>
--- a/src/runtime/pkg/katautils/config-settings.go.in
+++ b/src/runtime/pkg/katautils/config-settings.go.in
@ -54,6 +54,7 @@ const defaultDisableImageNvdimm = false
 const defaultVhostUserStorePath string = "/var/run/kata-containers/vhost-user/"
 const defaultRxRateLimiterMaxRate = uint64(0)
 const defaultTxRateLimiterMaxRate = uint64(0)
+const defaultConfidentialGuest = false

 var defaultSGXEPCSize = int64(0)

--- a/src/runtime/pkg/katautils/config.go
+++ b/src/runtime/pkg/katautils/config.go
@ -1,4 +1,4 @@
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2021 Intel Corporation
 // Copyright (c) 2018 HyperHQ Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
@ -61,6 +61,12 @@ type tomlConfig struct {
 	Runtime    runtime
 	Factory    factory
 	Netmon     netmon
+	Image      image
+}
+
+type image struct { // settings of the [image] configuration section; the sample configuration marks them as not implemented yet
+	ServiceOffload bool   `toml:"service_offload"` // offload the CRI image management service to the Kata agent
+	Provision      string `toml:"provision"`       // URL that lets the Kata agent decide how to fetch image decryption keys
 }

 type factory struct {
@ -130,6 +136,7 @@ type hypervisor struct {
 	HotplugVFIOOnRootBus    bool     `toml:"hotplug_vfio_on_root_bus"`
 	DisableVhostNet         bool     `toml:"disable_vhost_net"`
 	GuestMemoryDumpPaging   bool     `toml:"guest_memory_dump_paging"`
+	ConfidentialGuest       bool     `toml:"confidential_guest"`
 }

 type runtime struct {
@ -702,6 +709,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
 		EnableAnnotations:       h.EnableAnnotations,
 		GuestMemoryDumpPath:     h.GuestMemoryDumpPath,
 		GuestMemoryDumpPaging:   h.GuestMemoryDumpPaging,
+		ConfidentialGuest:       h.ConfidentialGuest,
 	}, nil
 }

@ -1055,6 +1063,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
 		RxRateLimiterMaxRate:    defaultRxRateLimiterMaxRate,
 		TxRateLimiterMaxRate:    defaultTxRateLimiterMaxRate,
 		SGXEPCSize:              defaultSGXEPCSize,
+		ConfidentialGuest:       defaultConfidentialGuest,
 	}
 }

--- a/src/runtime/virtcontainers/hypervisor.go
+++ b/src/runtime/virtcontainers/hypervisor.go
@ -453,6 +453,11 @@ type HypervisorConfig struct {
 	// GuestMemoryDumpPaging is used to indicate if enable paging
 	// for QEMU dump-guest-memory command
 	GuestMemoryDumpPaging bool
+
+	// ConfidentialGuest enables confidential guest support.
+	// It enables or disables different hardware features, ranging
+	// from memory encryption to both memory and CPU-state encryption and integrity.
+	ConfidentialGuest bool
 }

 // vcpu mapping from vcpu number to thread number
@ -717,21 +722,16 @@ func getHostMemorySizeKb(memInfoPath string) (uint64, error) {
 	return 0, fmt.Errorf("unable get MemTotal from %s", memInfoPath)
 }

-// RunningOnVMM checks if the system is running inside a VM.
-func RunningOnVMM(cpuInfoPath string) (bool, error) {
-	if runtime.GOARCH == "arm64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x" {
-		virtLog.Info("Unable to know if the system is running inside a VM")
-		return false, nil
-	}
-
+func CPUFlags(cpuInfoPath string) (map[string]bool, error) {
 	flagsField := "flags"

 	f, err := os.Open(cpuInfoPath)
 	if err != nil {
-		return false, err
+		return map[string]bool{}, err
 	}
 	defer f.Close()

+	flags := make(map[string]bool)
 	scanner := bufio.NewScanner(f)
 	for scanner.Scan() {
 		// Expected format: ["flags", ":", ...] or ["flags:", ...]
@ -745,23 +745,31 @@ func RunningOnVMM(cpuInfoPath string) (bool, error) {
 		}

 		for _, field := range fields[1:] {
-			if field == "hypervisor" {
-				return true, nil
-			}
+			flags[field] = true
 		}

-		// As long as we have been able to analyze the fields from
-		// "flags", there is no reason to check what comes next from
-		// /proc/cpuinfo, because we already know we are not running
-		// on a VMM.
-		return false, nil
+		return flags, nil
 	}

 	if err := scanner.Err(); err != nil {
-		return false, err
+		return map[string]bool{}, err
 	}

-	return false, fmt.Errorf("Couldn't find %q from %q output", flagsField, cpuInfoPath)
+	return map[string]bool{}, fmt.Errorf("Couldn't find %q from %q output", flagsField, cpuInfoPath)
+}
+
+// RunningOnVMM reports whether the system is running inside a VM.
+//
+// Detection is only attempted on amd64, where it looks for the "hypervisor"
+// entry among the CPU flags read from cpuInfoPath. On every other
+// architecture it logs that detection is not possible and returns false.
+func RunningOnVMM(cpuInfoPath string) (bool, error) {
+	if runtime.GOARCH != "amd64" {
+		virtLog.WithField("arch", runtime.GOARCH).Info("Unable to know if the system is running inside a VM")
+		return false, nil
+	}
+
+	cpuFlags, err := CPUFlags(cpuInfoPath)
+	if err != nil {
+		return false, err
+	}
+
+	return cpuFlags["hypervisor"], nil
 }

 func getHypervisorPid(h hypervisor) int {
--- a/src/runtime/virtcontainers/hypervisor_amd64.go
+++ b/src/runtime/virtcontainers/hypervisor_amd64.go
@ -0,0 +1,25 @@
+// Copyright (c) 2021 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+package virtcontainers
+
+import "os"
+
+// availableGuestProtection returns the guest protection technology detected
+// on the host, or noneProtection when none is found.
+// The implementation of this function is architecture specific.
+func availableGuestProtection() (guestProtection, error) {
+	cpuFlags, err := CPUFlags(procCPUInfo)
+	if err != nil {
+		return noneProtection, err
+	}
+
+	// TDX is supported and properly loaded when the firmware directory
+	// exists or `tdx` is part of the CPU flags.
+	info, statErr := os.Stat(tdxSysFirmwareDir)
+	hasTDXFirmwareDir := statErr == nil && info.IsDir()
+	if hasTDXFirmwareDir || cpuFlags[tdxCPUFlag] {
+		return tdxProtection, nil
+	}
+
+	// TODO: Add support for other technologies: SEV
+
+	return noneProtection, nil
+}
--- a/src/runtime/virtcontainers/qemu.go
+++ b/src/runtime/virtcontainers/qemu.go
@ -605,6 +605,11 @@ func (q *qemu) createSandbox(ctx context.Context, id string, networkNS NetworkNa
 		PidFile:     filepath.Join(q.store.RunVMStoragePath(), q.id, "pid"),
 	}

+	qemuConfig.Devices, qemuConfig.Bios, err = q.arch.appendProtectionDevice(qemuConfig.Devices, firmwarePath)
+	if err != nil {
+		return err
+	}
+
 	if ioThread != nil {
 		qemuConfig.IOThreads = []govmmQemu.IOThread{*ioThread}
 	}
--- a/src/runtime/virtcontainers/qemu_amd64.go
+++ b/src/runtime/virtcontainers/qemu_amd64.go
@ -11,6 +11,7 @@ import (
 	"time"

 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
+	"github.com/sirupsen/logrus"

 	govmmQemu "github.com/kata-containers/govmm/qemu"
 )
@ -20,6 +21,8 @@ type qemuAmd64 struct {
 	qemuArchBase

 	vmFactory bool
+
+	devLoadersCount uint32
 }

 const (
@ -30,6 +33,10 @@ const (
 	defaultQemuMachineOptions = "accel=kvm,kernel_irqchip"

 	qmpMigrationWaitTimeout = 5 * time.Second
+
+	tdxSysFirmwareDir = "/sys/firmware/tdx_seam/"
+
+	tdxCPUFlag = "tdx"
 )

 var qemuPaths = map[string]string{
@ -106,17 +113,17 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
 		factory = true
 	}

-	if config.IOMMU {
-		var q35QemuIOMMUOptions = "accel=kvm,kernel_irqchip=split"
+	// IOMMU and Guest Protection require a split IRQ controller for handling interrupts
+	// otherwise QEMU won't be able to create the kernel irqchip
+	if config.IOMMU || config.ConfidentialGuest {
+		mp.Options = "accel=kvm,kernel_irqchip=split"
+	}

+	if config.IOMMU {
 		kernelParams = append(kernelParams,
 			Param{"intel_iommu", "on"})
 		kernelParams = append(kernelParams,
 			Param{"iommu", "pt"})
-
-		if mp.Type == QemuQ35 {
-			mp.Options = q35QemuIOMMUOptions
-		}
 	}

 	q := &qemuAmd64{
@ -129,10 +136,17 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
 			kernelParams:         kernelParams,
 			disableNvdimm:        config.DisableImageNvdimm,
 			dax:                  true,
+			protection:           noneProtection,
 		},
 		vmFactory: factory,
 	}

+	if config.ConfidentialGuest {
+		if err := q.enableProtection(); err != nil {
+			return nil, err
+		}
+	}
+
 	q.handleImagePath(config)

 	return q, nil
@ -191,3 +205,55 @@ func (q *qemuAmd64) appendImage(ctx context.Context, devices []govmmQemu.Device,
 func (q *qemuAmd64) appendBridges(devices []govmmQemu.Device) []govmmQemu.Device {
 	return genericAppendBridges(devices, q.Bridges, q.qemuMachine.Type)
 }
+
+// enableProtection detects the guest protection technology available on the
+// host, records it in q.protection and adjusts the QEMU machine options and
+// the guest kernel parameters accordingly.
+//
+// It returns an error when the detection fails or when the host offers no
+// supported protection technology.
+func (q *qemuAmd64) enableProtection() error {
+	var err error
+	q.protection, err = availableGuestProtection()
+	if err != nil {
+		return err
+	}
+
+	switch q.protection {
+	case tdxProtection:
+		// Machine options form a comma-separated list: only add the
+		// separator when there is already something to append to.
+		if q.qemuMachine.Options != "" {
+			q.qemuMachine.Options += ","
+		}
+		q.qemuMachine.Options += "kvm-type=tdx,confidential-guest-support=tdx"
+		q.kernelParams = append(q.kernelParams, Param{"tdx_guest", ""})
+		// Log the kernel parameter list that was just extended.
+		virtLog.WithFields(logrus.Fields{
+			"subsystem":     "qemuAmd64",
+			"machine":       q.qemuMachine,
+			"kernel-params": q.kernelParams}).
+			Info("Enabling TDX guest protection")
+		return nil
+
+	// TODO: Add support for other x86_64 technologies: SEV
+
+	default:
+		return fmt.Errorf("This system doesn't support Confidential Computing (Guest Protection)")
+	}
+}
+
+// appendProtectionDevice appends the device required by the active guest
+// protection, if any, and returns the firmware path to use as the BIOS.
+//
+// Without protection, devices and firmware are returned unchanged. With TDX,
+// the firmware is attached to a loader object and the returned BIOS path is
+// empty. Any other protection is rejected with an error.
+func (q *qemuAmd64) appendProtectionDevice(devices []govmmQemu.Device, firmware string) ([]govmmQemu.Device, string, error) {
+	if q.protection == noneProtection {
+		return devices, firmware, nil
+	}
+
+	if q.protection != tdxProtection {
+		return devices, "", fmt.Errorf("Unsupported guest protection technology: %v", q.protection)
+	}
+
+	// Each loader gets its own device ID, taken from the running counter.
+	loader := govmmQemu.Object{
+		Driver:   govmmQemu.Loader,
+		Type:     govmmQemu.TDXGuest,
+		ID:       "tdx",
+		DeviceID: fmt.Sprintf("fd%d", q.devLoadersCount),
+		Debug:    false,
+		File:     firmware,
+	}
+	q.devLoadersCount++
+
+	return append(devices, loader), "", nil
+}
--- a/src/runtime/virtcontainers/qemu_amd64_test.go
+++ b/src/runtime/virtcontainers/qemu_amd64_test.go
@ -276,3 +276,53 @@ func TestQemuAmd64Microvm(t *testing.T) {

 	assert.False(amd64.supportGuestMemoryHotplug())
 }
+
+// TestQemuAmd64AppendProtectionDevice checks how appendProtectionDevice
+// handles each guest protection kind on amd64.
+func TestQemuAmd64AppendProtectionDevice(t *testing.T) {
+	assert := assert.New(t)
+
+	arch := newTestQemu(assert, QemuPC)
+	q := arch.(*qemuAmd64)
+
+	firmware := "tdvf.fd"
+	loaderID := q.devLoadersCount
+
+	// No protection: the firmware path comes back as the BIOS.
+	var devices []govmmQemu.Device
+	devices, bios, err := arch.appendProtectionDevice(devices, firmware)
+	assert.NoError(err)
+	assert.NotEmpty(bios)
+
+	// PEF and SEV must both be rejected on amd64.
+	// TODO: update the SEV case once it's supported
+	for _, unsupported := range []guestProtection{pefProtection, sevProtection} {
+		q.protection = unsupported
+		devices, bios, err = arch.appendProtectionDevice(devices, firmware)
+		assert.Error(err)
+		assert.Empty(bios)
+	}
+
+	// TDX: a loader object is appended and no BIOS is returned.
+	q.protection = tdxProtection
+	devices, bios, err = arch.appendProtectionDevice(devices, firmware)
+	assert.NoError(err)
+	assert.Empty(bios)
+
+	want := []govmmQemu.Device{
+		govmmQemu.Object{
+			Driver:   govmmQemu.Loader,
+			Type:     govmmQemu.TDXGuest,
+			ID:       "tdx",
+			DeviceID: fmt.Sprintf("fd%d", loaderID),
+			Debug:    false,
+			File:     firmware,
+		},
+	}
+
+	assert.Equal(want, devices)
+}
--- a/src/runtime/virtcontainers/qemu_arch_base.go
+++ b/src/runtime/virtcontainers/qemu_arch_base.go
@ -11,6 +11,7 @@ import (
 	"errors"
 	"fmt"
 	"os"
+	"runtime"
 	"strconv"
 	"strings"

@ -142,8 +143,33 @@ type qemuArch interface {

 	// append pvpanic device
 	appendPVPanicDevice(devices []govmmQemu.Device) ([]govmmQemu.Device, error)
+
+	// append protection device.
+	// This implementation is architecture specific: some archs may need
+	// a firmware. Returns a string containing the path to the firmware that should
+	// be used with the -bios option; omit the -bios option if the path is empty.
+	appendProtectionDevice(devices []govmmQemu.Device, firmware string) ([]govmmQemu.Device, string, error)
 }

+// guestProtection identifies a kind of hardware guest protection.
+type guestProtection uint8
+
+const (
+	noneProtection guestProtection = iota // no guest protection; this is the zero value
+
+	// Intel Trust Domain Extensions
+	// https://software.intel.com/content/www/us/en/develop/articles/intel-trust-domain-extensions.html
+	tdxProtection
+
+	// AMD Secure Encrypted Virtualization
+	// https://developer.amd.com/sev/
+	sevProtection
+
+	// IBM POWER 9 Protected Execution Facility
+	// https://www.kernel.org/doc/html/latest/powerpc/ultravisor.html
+	pefProtection
+)
+
 type qemuArchBase struct {
 	qemuMachine          govmmQemu.Machine
 	qemuExePath          string
@ -158,6 +184,7 @@ type qemuArchBase struct {
 	kernelParams         []Param
 	Bridges              []types.Bridge
 	PFlash               []string
+	protection           guestProtection
 }

 const (
@ -813,3 +840,9 @@ func (q *qemuArchBase) getPFlash() ([]string, error) {
 func (q *qemuArchBase) setPFlash(p []string) {
 	q.PFlash = p
 }
+
+// appendProtectionDevice is the generic implementation: it only warns that
+// confidential computing is not implemented for the running architecture and
+// returns devices and firmware unchanged.
+func (q *qemuArchBase) appendProtectionDevice(devices []govmmQemu.Device, firmware string) ([]govmmQemu.Device, string, error) {
+	archLog := virtLog.WithField("arch", runtime.GOARCH)
+	archLog.Warn("Confidential Computing has not been implemented for this architecture")
+	return devices, firmware, nil
+}