//go:build linux

// Copyright (c) 2016 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//

package virtcontainers

import (
	"bufio"
	"bytes"
	"compress/gzip"
	"context"
	"encoding/binary"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"math"
	"net"
	"os"
	"os/exec"
	"os/user"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"syscall"
	"time"
	"unsafe"

	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless"

	govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"

	"github.com/opencontainers/selinux/go-selinux/label"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"

	"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
	"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
	hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
	"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
	pkgUtils "github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
	"github.com/kata-containers/kata-containers/src/runtime/pkg/uuid"
	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
)

// qemuTracingTags defines tags for the trace span
var qemuTracingTags = map[string]string{
	"source":    "runtime",
	"package":   "virtcontainers",
	"subsystem": "hypervisor",
	"type":      "qemu",
}

// romFile is the file name of the ROM that can be used for virtio-pci devices.
// If this file name is empty, this means we expect the firmware used by Qemu,
// such as SeaBIOS or OVMF for instance, to handle this directly.
const romFile = ""

// disable-modern is an option to QEMU that will fall back to using 0.9 version
// of virtio. Since moving to QEMU 4.0, we can start using virtio 1.0 version.
// Default value is false.
const defaultDisableModern = false

type qmpChannel struct {
	qmp     *govmmQemu.QMP
	ctx     context.Context
	disconn chan struct{}
	path    string
	sync.Mutex
}

// QemuState keeps Qemu's state
type QemuState struct {
	UUID              string
	HotPlugVFIO       config.PCIePort
	Bridges           []types.Bridge
	HotpluggedVCPUs   []hv.CPUDevice
	HotpluggedMemory  int
	VirtiofsDaemonPid int
	HotplugVFIO       config.PCIePort
	ColdPlugVFIO      config.PCIePort
	PCIeRootPort      uint32
	PCIeSwitchPort    uint32
}

// qemu is a Hypervisor interface implementation for the Linux qemu hypervisor.
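// The stopped flag is updated atomically and mu serializes StopVM calls,
// while qmpChannel embeds its own mutex guarding the QMP connection.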
// nolint: govet type qemu struct { arch qemuArch virtiofsDaemon VirtiofsDaemon ctx context.Context // fds is a list of file descriptors inherited by QEMU process // they'll be closed once QEMU process is running fds []*os.File id string state QemuState qmpMonitorCh qmpChannel qemuConfig govmmQemu.Config config HypervisorConfig // if in memory dump progress memoryDumpFlag sync.Mutex nvdimmCount int stopped int32 mu sync.Mutex } const ( consoleSocket = "console.sock" qmpSocket = "qmp.sock" extraMonitorSocket = "extra-monitor.sock" vhostFSSocket = "vhost-fs.sock" nydusdAPISock = "nydusd-api.sock" // memory dump format will be set to elf memoryDumpFormat = "elf" qmpCapErrMsg = "Failed to negotiate QMP Capabilities" qmpExecCatCmd = "exec:cat" scsiControllerID = "scsi0" rngID = "rng0" fallbackFileBackedMemDir = "/dev/shm" qemuStopSandboxTimeoutSecs = 15 qomPathPrefix = "/machine/peripheral/" ) // agnostic list of kernel parameters var defaultKernelParameters = []Param{ {"panic", "1"}, } type qmpLogger struct { logger *logrus.Entry } func newQMPLogger() qmpLogger { return qmpLogger{ logger: hvLogger.WithField("subsystem", "qmp"), } } func (l qmpLogger) V(level int32) bool { return level != 0 } func (l qmpLogger) Infof(format string, v ...interface{}) { l.logger.Infof(format, v...) } func (l qmpLogger) Warningf(format string, v ...interface{}) { l.logger.Warnf(format, v...) } func (l qmpLogger) Errorf(format string, v ...interface{}) { l.logger.Errorf(format, v...) } // Logger returns a logrus logger appropriate for logging qemu messages func (q *qemu) Logger() *logrus.Entry { return hvLogger.WithField("subsystem", "qemu") } func (q *qemu) kernelParameters() string { // get a list of arch kernel parameters params := q.arch.kernelParameters(q.config.Debug) // use default parameters params = append(params, defaultKernelParameters...) // set the maximum number of vCPUs params = append(params, Param{"nr_cpus", fmt.Sprintf("%d", q.config.DefaultMaxVCPUs)}) // set the SELinux params in accordance with the runtime configuration, disable_guest_selinux. if q.config.DisableGuestSeLinux { q.Logger().Info("Set selinux=0 to kernel params because SELinux on the guest is disabled") params = append(params, Param{"selinux", "0"}) } else { q.Logger().Info("Set selinux=1 to kernel params because SELinux on the guest is enabled") params = append(params, Param{"selinux", "1"}) } // add the params specified by the provided config. As the kernel // honours the last parameter value set and since the config-provided // params are added here, they will take priority over the defaults. params = append(params, q.config.KernelParams...) paramsStr := SerializeParams(params, "=") return strings.Join(paramsStr, " ") } // Adds all capabilities supported by qemu implementation of hypervisor interface func (q *qemu) Capabilities(ctx context.Context) types.Capabilities { span, _ := katatrace.Trace(ctx, q.Logger(), "Capabilities", qemuTracingTags, map[string]string{"sandbox_id": q.id}) defer span.End() return q.arch.capabilities(q.config) } func (q *qemu) HypervisorConfig() HypervisorConfig { return q.config } // get the QEMU binary path func (q *qemu) qemuPath() (string, error) { p, err := q.config.HypervisorAssetPath() if err != nil { return "", err } if p == "" { p = q.arch.qemuPath() } if _, err = os.Stat(p); os.IsNotExist(err) { return "", fmt.Errorf("QEMU path (%s) does not exist", p) } return p, nil } // setup sets the Qemu structure up. 
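//
// A rough sketch of the expected call ordering, for illustration only (the
// sandbox code drives this; CreateVM invokes setup internally):
//
//	q := &qemu{}
//	_ = q.CreateVM(ctx, sandboxID, network, &hypervisorConfig)
//	_ = q.StartVM(ctx, timeoutSecs)
//	...
//	_ = q.StopVM(ctx, false)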
func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *HypervisorConfig) error { span, _ := katatrace.Trace(ctx, q.Logger(), "setup", qemuTracingTags, map[string]string{"sandbox_id": q.id}) defer span.End() if err := q.setConfig(hypervisorConfig); err != nil { return err } q.id = id var err error q.arch, err = newQemuArch(q.config) if err != nil { return err } initrdPath, err := q.config.InitrdAssetPath() if err != nil { return err } imagePath, err := q.config.ImageAssetPath() if err != nil { return err } if initrdPath == "" && imagePath != "" && !q.config.DisableImageNvdimm { q.nvdimmCount = 1 } else { q.nvdimmCount = 0 } var create bool if q.state.UUID == "" { create = true } q.arch.setBridges(q.state.Bridges) q.arch.setPFlash(q.config.PFlash) if create { q.Logger().Debug("Creating bridges") q.arch.bridges(q.config.DefaultBridges) q.Logger().Debug("Creating UUID") q.state.UUID = uuid.Generate().String() q.state.HotPlugVFIO = q.config.HotPlugVFIO q.state.ColdPlugVFIO = q.config.ColdPlugVFIO q.state.PCIeRootPort = q.config.PCIeRootPort q.state.PCIeSwitchPort = q.config.PCIeSwitchPort // The path might already exist, but in case of VM templating, // we have to create it since the sandbox has not created it yet. if err = utils.MkdirAllWithInheritedOwner(filepath.Join(q.config.RunStorePath, id), DirMode); err != nil { return err } } nested, err := RunningOnVMM(procCPUInfo) if err != nil { return err } if !q.config.DisableNestingChecks && nested { q.arch.enableNestingChecks() } else { q.Logger().WithField("inside-vm", fmt.Sprintf("%t", nested)).Debug("Disable nesting environment checks") q.arch.disableNestingChecks() } if !q.config.DisableVhostNet { q.arch.enableVhostNet() } else { q.Logger().Debug("Disable vhost_net") q.arch.disableVhostNet() } return nil } func (q *qemu) cpuTopology() govmmQemu.SMP { return q.arch.cpuTopology(q.config.NumVCPUs(), q.config.DefaultMaxVCPUs) } func (q *qemu) memoryTopology() (govmmQemu.Memory, error) { hostMemMb := q.config.DefaultMaxMemorySize memMb := uint64(q.config.MemorySize) return q.arch.memoryTopology(memMb, hostMemMb, uint8(q.config.MemSlots)), nil } func (q *qemu) qmpSocketPath(id string) (string, error) { return utils.BuildSocketPath(q.config.VMStorePath, id, qmpSocket) } func (q *qemu) extraMonitorSocketPath(id string) (string, error) { return utils.BuildSocketPath(q.config.VMStorePath, id, extraMonitorSocket) } func (q *qemu) getQemuMachine() (govmmQemu.Machine, error) { machine := q.arch.machine() accelerators := q.config.MachineAccelerators if accelerators != "" { if !strings.HasPrefix(accelerators, ",") { accelerators = fmt.Sprintf(",%s", accelerators) } machine.Options += accelerators } return machine, nil } func (q *qemu) createQmpSocket() ([]govmmQemu.QMPSocket, error) { monitorSockPath, err := q.qmpSocketPath(q.id) if err != nil { return nil, err } q.qmpMonitorCh = qmpChannel{ ctx: q.ctx, path: monitorSockPath, } var sockets []govmmQemu.QMPSocket sockets = append(sockets, govmmQemu.QMPSocket{ Type: "unix", Protocol: govmmQemu.Qmp, Server: true, NoWait: true, }) // The extra monitor socket allows an external user to take full // control on Qemu and silently break the VM in all possible ways. // It should only ever be used for debugging purposes, hence the // check on Debug. 
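	// For example, with debug on, a developer could attach to this extra
	// monitor out of band with a stock tool (illustrative, path elided):
	//
	//	socat - UNIX-CONNECT:<vm-store>/<sandbox-id>/extra-monitor.sock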
if q.HypervisorConfig().Debug && q.config.ExtraMonitorSocket != "" { extraMonitorSockPath, err := q.extraMonitorSocketPath(q.id) if err != nil { return nil, err } sockets = append(sockets, govmmQemu.QMPSocket{ Type: "unix", Protocol: q.config.ExtraMonitorSocket, Name: extraMonitorSockPath, Server: true, NoWait: true, }) q.Logger().Warn("QEMU configured to start with an untrusted monitor") } return sockets, nil } func (q *qemu) buildInitdataDevice(devices []govmmQemu.Device, InitdataImage string) []govmmQemu.Device { device := govmmQemu.BlockDevice{ Driver: govmmQemu.VirtioBlock, Transport: govmmQemu.TransportPCI, ID: "initdata", File: InitdataImage, SCSI: false, WCE: false, AIO: govmmQemu.Threads, Interface: "none", Format: "raw", } devices = append(devices, device) return devices } func (q *qemu) buildDevices(ctx context.Context, kernelPath string) ([]govmmQemu.Device, *govmmQemu.IOThread, *govmmQemu.Kernel, error) { var devices []govmmQemu.Device kernel := &govmmQemu.Kernel{ Path: kernelPath, } _, console, err := q.GetVMConsole(ctx, q.id) if err != nil { return nil, nil, nil, err } // Add bridges before any other devices. This way we make sure that // bridge gets the first available PCI address i.e bridgePCIStartAddr devices = q.arch.appendBridges(devices) devices, err = q.arch.appendConsole(ctx, devices, console) if err != nil { return nil, nil, nil, err } assetPath, assetType, err := q.config.ImageOrInitrdAssetPath() if err != nil { return nil, nil, nil, err } if assetType == types.ImageAsset { devices, err = q.arch.appendImage(ctx, devices, assetPath) if err != nil { return nil, nil, nil, err } } else if assetType == types.InitrdAsset { // InitrdAsset, need to set kernel initrd path kernel.InitrdPath = assetPath } else if assetType == types.SecureBootAsset { // SecureBootAsset, no need to set image or initrd path q.Logger().Info("For IBM Z Secure Execution, initrd path should not be set") kernel.InitrdPath = "" } if q.config.IOMMU { devices, err = q.arch.appendIOMMU(devices) if err != nil { return nil, nil, nil, err } } if q.config.IfPVPanicEnabled() { // there should have no errors for pvpanic device devices, _ = q.arch.appendPVPanicDevice(devices) } var ioThread *govmmQemu.IOThread if q.config.BlockDeviceDriver == config.VirtioSCSI { devices, ioThread, err = q.arch.appendSCSIController(ctx, devices, q.config.EnableIOThreads) if err != nil { return nil, nil, nil, err } } return devices, ioThread, kernel, nil } func (q *qemu) setupTemplate(knobs *govmmQemu.Knobs, memory *govmmQemu.Memory) govmmQemu.Incoming { incoming := govmmQemu.Incoming{} if q.config.BootToBeTemplate || q.config.BootFromTemplate { knobs.FileBackedMem = true memory.Path = q.config.MemoryPath if q.config.BootToBeTemplate { knobs.MemShared = true } if q.config.BootFromTemplate { incoming.MigrationType = govmmQemu.MigrationDefer } } return incoming } func (q *qemu) setupFileBackedMem(knobs *govmmQemu.Knobs, memory *govmmQemu.Memory) { var target string if q.config.FileBackedMemRootDir != "" { target = q.config.FileBackedMemRootDir } else { target = fallbackFileBackedMemDir } if _, err := os.Stat(target); err != nil { q.Logger().WithError(err).Error("File backed memory location does not exist") return } knobs.FileBackedMem = true knobs.MemShared = true memory.Path = target } func (q *qemu) setConfig(config *HypervisorConfig) error { q.config = *config return nil } func (q *qemu) createVirtiofsDaemon(sharedPath string) (VirtiofsDaemon, error) { virtiofsdSocketPath, err := q.vhostFSSocketPath(q.id) if err != nil { return 
nil, err
	}

	if q.config.SharedFS == config.VirtioFSNydus {
		apiSockPath, err := q.nydusdAPISocketPath(q.id)
		if err != nil {
			return nil, err
		}
		nd := &nydusd{
			path:        q.config.VirtioFSDaemon,
			sockPath:    virtiofsdSocketPath,
			apiSockPath: apiSockPath,
			sourcePath:  sharedPath,
			debug:       q.config.Debug,
			extraArgs:   q.config.VirtioFSExtraArgs,
			startFn:     startInShimNS,
		}
		nd.setupShareDirFn = nd.setupPassthroughFS
		return nd, nil
	}

	// Set the xattr option for virtiofsd daemon to enable extended attributes
	// in virtiofs if SELinux on the guest side is enabled.
	if !q.config.DisableGuestSeLinux {
		q.Logger().Info("Set the xattr option for virtiofsd")
		q.config.VirtioFSExtraArgs = append(q.config.VirtioFSExtraArgs, "--xattr")
	}

	// default use virtiofsd
	return &virtiofsd{
		path:       q.config.VirtioFSDaemon,
		sourcePath: sharedPath,
		socketPath: virtiofsdSocketPath,
		extraArgs:  q.config.VirtioFSExtraArgs,
		cache:      q.config.VirtioFSCache,
	}, nil
}

// prepareInitdataImage will create an image with a very simple layout
//
// There will be multiple sectors. The first 8 bytes are the magic number
// "initdata". Then a "length" field of 8 bytes follows (unsigned int64).
// Finally the gzipped initdata toml. The image will be padded to an
// integer multiple of the sector size for alignment.
//
// offset 0                                8                   16
// 0      'i' 'n' 'i' 't' 'd' 'a' 't' 'a'  | gzip length in le |
// 16     gzip(initdata toml) ...
//        (end of the last sector)           '\0' paddings
func prepareInitdataImage(initdata string, imagePath string) error {
	SectorSize := 512
	var buf bytes.Buffer
	gzipper := gzip.NewWriter(&buf)
	defer gzipper.Close()

	gzipper.Write([]byte(initdata))
	err := gzipper.Close()
	if err != nil {
		return fmt.Errorf("failed to compress initdata: %v", err)
	}

	compressedInitdata := buf.Bytes()

	compressedInitdataLength := len(compressedInitdata)
	lengthBuffer := make([]byte, 8)
	binary.LittleEndian.PutUint64(lengthBuffer, uint64(compressedInitdataLength))

	paddingLength := (compressedInitdataLength+16+SectorSize-1)/SectorSize*SectorSize - (compressedInitdataLength + 16)
	paddingBuffer := make([]byte, paddingLength)

	file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_RDWR, 0640)
	if err != nil {
		return fmt.Errorf("failed to create initdata image: %v", err)
	}
	defer file.Close()

	_, err = file.Write([]byte("initdata"))
	if err != nil {
		return fmt.Errorf("failed to write magic number to initdata image: %v", err)
	}

	_, err = file.Write(lengthBuffer)
	if err != nil {
		return fmt.Errorf("failed to write data length to initdata image: %v", err)
	}

	_, err = file.Write(compressedInitdata)
	if err != nil {
		return fmt.Errorf("failed to write compressed initdata to initdata image: %v", err)
	}

	_, err = file.Write(paddingBuffer)
	if err != nil {
		return fmt.Errorf("failed to write padding to initdata image: %v", err)
	}

	return nil
}

func (q *qemu) prepareInitdataMount(config *HypervisorConfig) error {
	if len(config.Initdata) == 0 {
		q.Logger().Info("No initdata provided. Skipping initdata device preparation")
		return nil
	}

	q.Logger().Info("Start to prepare initdata")
	initdataWorkdir := filepath.Join("/run/kata-containers/shared/initdata", q.id)
	initdataImagePath := filepath.Join(initdataWorkdir, "data.img")

	err := os.MkdirAll(initdataWorkdir, 0755)
	if err != nil {
		q.Logger().WithField("initdata", "create initdata image path").WithError(err).Error("failed to create initdata work dir")
		return err
	}

	err = prepareInitdataImage(config.Initdata, initdataImagePath)
	if err != nil {
		q.Logger().WithField("initdata", "prepare initdata image").WithError(err).Error("failed to prepare initdata image")
		return err
	}

	config.InitdataImage = initdataImagePath

	return nil
}

// CreateVM is the Hypervisor VM creation implementation for govmmQemu.
func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervisorConfig *HypervisorConfig) error {
	// Save the tracing context
	q.ctx = ctx

	span, ctx := katatrace.Trace(ctx, q.Logger(), "CreateVM", qemuTracingTags, map[string]string{"VM_ID": q.id})
	defer span.End()

	if err := q.setup(ctx, id, hypervisorConfig); err != nil {
		return err
	}

	if err := q.prepareInitdataMount(hypervisorConfig); err != nil {
		return err
	}

	machine, err := q.getQemuMachine()
	if err != nil {
		return err
	}

	smp := q.cpuTopology()

	memory, err := q.memoryTopology()
	if err != nil {
		return err
	}

	knobs := govmmQemu.Knobs{
		NoUserConfig:  true,
		NoDefaults:    true,
		NoGraphic:     true,
		NoReboot:      true,
		MemPrealloc:   q.config.MemPrealloc,
		HugePages:     q.config.HugePages,
		IOMMUPlatform: q.config.IOMMUPlatform,
	}

	incoming := q.setupTemplate(&knobs, &memory)

	// With the current implementations, VM templating will not work with file
	// based memory (stand-alone) or virtiofs. This is because VM templating
	// builds the first VM with file-backed memory and shared=on and the
	// subsequent ones with shared=off. virtio-fs always requires shared=on for
	// memory.
	if q.config.SharedFS == config.VirtioFS || q.config.SharedFS == config.VirtioFSNydus ||
		q.config.FileBackedMemRootDir != "" {
		if !(q.config.BootToBeTemplate || q.config.BootFromTemplate) {
			q.setupFileBackedMem(&knobs, &memory)
		} else {
			return errors.New("VM templating has been enabled with either virtio-fs or file backed memory and this configuration will not work")
		}
		if q.config.HugePages {
			knobs.MemPrealloc = true
		}
	}

	// Vhost-user-blk/scsi process which can improve performance, like SPDK,
	// requires shared-on hugepage to work with Qemu.
	if q.config.EnableVhostUserStore {
		if !q.config.HugePages {
			return errors.New("Vhost-user-blk/scsi is enabled without HugePages. 
This configuration will not work") } knobs.MemShared = true } rtc := govmmQemu.RTC{ Base: govmmQemu.UTC, Clock: govmmQemu.Host, DriftFix: govmmQemu.Slew, } if q.state.UUID == "" { return fmt.Errorf("UUID should not be empty") } qmpSockets, err := q.createQmpSocket() if err != nil { return err } kernelPath, err := q.config.KernelAssetPath() if err != nil { return err } devices, ioThread, kernel, err := q.buildDevices(ctx, kernelPath) if err != nil { return err } cpuModel := q.arch.cpuModel() cpuModel += "," + q.config.CPUFeatures firmwarePath, err := q.config.FirmwareAssetPath() if err != nil { return err } firmwareVolumePath, err := q.config.FirmwareVolumeAssetPath() if err != nil { return err } pflash, err := q.arch.getPFlash() if err != nil { return err } qemuPath, err := q.qemuPath() if err != nil { return err } if len(hypervisorConfig.Initdata) > 0 { devices = q.buildInitdataDevice(devices, hypervisorConfig.InitdataImage) } // some devices configuration may also change kernel params, make sure this is called afterwards kernel.Params = q.kernelParameters() q.checkBpfEnabled() qemuConfig := govmmQemu.Config{ Name: fmt.Sprintf("sandbox-%s", q.id), UUID: q.state.UUID, Path: qemuPath, Ctx: q.qmpMonitorCh.ctx, Uid: q.config.Uid, Gid: q.config.Gid, Groups: q.config.Groups, Machine: machine, SMP: smp, Memory: memory, Devices: devices, CPUModel: cpuModel, SeccompSandbox: q.config.SeccompSandbox, Kernel: *kernel, RTC: rtc, QMPSockets: qmpSockets, Knobs: knobs, Incoming: incoming, VGA: "none", GlobalParam: "kvm-pit.lost_tick_policy=discard", Bios: firmwarePath, PFlash: pflash, PidFile: filepath.Join(q.config.VMStorePath, q.id, "pid"), Debug: hypervisorConfig.Debug, } qemuConfig.Devices, qemuConfig.Bios, err = q.arch.appendProtectionDevice(qemuConfig.Devices, firmwarePath, firmwareVolumePath, hypervisorConfig.InitdataDigest) if err != nil { return err } if ioThread != nil { qemuConfig.IOThreads = []govmmQemu.IOThread{*ioThread} } // Add RNG device to hypervisor // Skip for s390x (as CPACF is used) or when Confidential Guest is enabled if machine.Type != QemuCCWVirtio && !q.config.ConfidentialGuest { rngDev := config.RNGDev{ ID: rngID, Filename: q.config.EntropySource, } qemuConfig.Devices, err = q.arch.appendRNGDevice(ctx, qemuConfig.Devices, rngDev) if err != nil { return err } } if machine.Type == QemuQ35 || machine.Type == QemuVirt { if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig, machine.Type, network); err != nil { q.Logger().WithError(err).Errorf("Cannot create PCIe topology") return err } } q.qemuConfig = qemuConfig q.virtiofsDaemon, err = q.createVirtiofsDaemon(hypervisorConfig.SharedPath) return err } func (q *qemu) checkBpfEnabled() { if q.config.SeccompSandbox != "" { out, err := os.ReadFile("/proc/sys/net/core/bpf_jit_enable") if err != nil { q.Logger().WithError(err).Warningf("failed to get bpf_jit_enable status") return } enabled, err := strconv.Atoi(strings.TrimSpace(string(out))) if err != nil { q.Logger().WithError(err).Warningf("failed to convert bpf_jit_enable status to integer") return } if enabled == 0 { q.Logger().Warningf("bpf_jit_enable is disabled. " + "It's recommended to turn on bpf_jit_enable to reduce the performance impact of QEMU seccomp sandbox.") } } } // If a user uses 8 GPUs with 4 devices in each IOMMU Group that means we need // to hotplug 32 devices. We do not have enough PCIe root bus slots to // accomplish this task. Kata will use already some slots for vfio-xxxx-pci // devices. 
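// Worked example: 8 GPUs with 4 devices per IOMMU group is 32 ports to
// hot-plug, which alone would consume every slot of a single root bus, before
// counting the slots Kata itself needs.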
// Max PCI slots per root bus is 32
// Max PCIe root ports is 16
// Max PCIe switch ports is 16
// There is only 64 kB of IO memory per root bus; each root or switch port
// consumes 4 kB, hence only 16 ports are possible.
func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig, machineType string, network Network) error {
	// If no port is set, just return: there is no need to add a PCIe Root Port or PCIe Switch
	if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort && machineType == QemuQ35 {
		return nil
	}

	// Add PCIe Root Port or PCIe Switches to the hypervisor
	// The pcie.0 bus does not support hot-plug, but a PCIe device can be hot-plugged
	// into a PCIe Root Port or PCIe Switch.
	// For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt

	// Deduce the right values for mem-reserve and pref-64-reserve memory regions
	memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory()

	// The default OVMF MMIO aperture is too small for some PCIe devices
	// with huge BARs so we need to increase it.
	// memSize64bit is in bytes, convert to MB, OVMF expects MB as a string
	if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") {
		pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024))
		fwCfg := govmmQemu.FwCfg{
			Name: "opt/ovmf/X-PciMmio64Mb",
			Str:  pciMmio64Mb,
		}
		qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
	}

	// Get the number of hot(cold)-pluggable ports needed from the provided
	// VFIO devices
	var numOfPluggablePorts uint32 = 0

	// For now, PCIe native hotplug is the only way for Arm to hot-add a PCI device.
	if machineType == QemuVirt {
		epNum, err := network.GetEndpointsNum()
		if err != nil {
			q.Logger().Warn("Failed to get the number of network endpoints")
		}
		virtPcieRootPortNum := len(hypervisorConfig.VhostUserBlkDevices) + epNum
		if hypervisorConfig.VirtioMem {
			virtPcieRootPortNum++
		}
		numOfPluggablePorts += uint32(virtPcieRootPortNum)
	}

	for _, dev := range hypervisorConfig.VFIODevices {
		var err error
		dev.HostPath, err = config.GetHostPath(dev, false, "")
		if err != nil {
			return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err)
		}

		var vfioDevices []*config.VFIODev
		// This works for IOMMUFD enabled kernels > 6.x
		// In the case of IOMMUFD the device.HostPath will look like
		// /dev/vfio/devices/vfio0
		// (1) Check if we have the new IOMMUFD or old container based VFIO
		if strings.HasPrefix(dev.HostPath, drivers.IommufdDevPath) {
			q.Logger().Infof("### IOMMUFD Path: %s", dev.HostPath)
			vfioDevices, err = drivers.GetDeviceFromVFIODev(dev)
			if err != nil {
				return fmt.Errorf("Cannot get VFIO device from IOMMUFD with device: %v err: %v", dev, err)
			}
		} else {
			vfioDevices, err = drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
			if err != nil {
				return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err)
			}
		}

		for _, vfioDevice := range vfioDevices {
			if drivers.IsPCIeDevice(vfioDevice.BDF) {
				numOfPluggablePorts = numOfPluggablePorts + 1
			}
		}
	}

	vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort)
	vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort)

	// If the devices are not advertised via CRI or cold-plugged we need to
	// get the number of pluggable root/switch ports from the config
	numPCIeRootPorts := hypervisorConfig.PCIeRootPort
	numPCIeSwitchPorts := hypervisorConfig.PCIeSwitchPort

	// If the number of PCIe root ports is greater than 16, bail out: otherwise
	// we may use up all slots or IO memory on the root bus, and the
	// vfio-XXX-pci devices, which are crucial for Kata, cannot be added.
	// Max slots on the root bus is 32; max slots on the complete PCI(e)
	// topology is 256 in QEMU.
	if vfioOnRootPort {
		if numOfPluggablePorts < numPCIeRootPorts {
			numOfPluggablePorts = numPCIeRootPorts
		}
		if numOfPluggablePorts > maxPCIeRootPort {
			return fmt.Errorf("Number of PCIe Root Ports exceeds the allowed max of %d", maxPCIeRootPort)
		}
		qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
		return nil
	}
	if vfioOnSwitchPort {
		if numOfPluggablePorts < numPCIeSwitchPorts {
			numOfPluggablePorts = numPCIeSwitchPorts
		}
		if numOfPluggablePorts > maxPCIeSwitchPort {
			return fmt.Errorf("Number of PCIe Switch Ports exceeds the allowed max of %d", maxPCIeSwitchPort)
		}
		qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
		return nil
	}
	// If both Root Port and Switch Port are not enabled, check if QemuVirt needs to add a PCIe root port.
	if machineType == QemuVirt {
		qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
	}
	return nil
}

func (q *qemu) vhostFSSocketPath(id string) (string, error) {
	return utils.BuildSocketPath(q.config.VMStorePath, id, vhostFSSocket)
}

func (q *qemu) nydusdAPISocketPath(id string) (string, error) {
	return utils.BuildSocketPath(q.config.VMStorePath, id, nydusdAPISock)
}

func (q *qemu) setupVirtiofsDaemon(ctx context.Context) (err error) {
	pid, err := q.virtiofsDaemon.Start(ctx, func() {
		q.StopVM(ctx, false)
	})
	if err != nil {
		return err
	}
	q.state.VirtiofsDaemonPid = pid
	return nil
}

func (q *qemu) stopVirtiofsDaemon(ctx context.Context) (err error) {
	if q.state.VirtiofsDaemonPid == 0 {
		q.Logger().Warn("virtiofsd has already stopped")
		return nil
	}

	err = q.virtiofsDaemon.Stop(ctx)
	if err != nil {
		return err
	}
	q.state.VirtiofsDaemonPid = 0
	return nil
}

func (q *qemu) getMemArgs() (bool, string, string, error) {
	share := false
	target := ""
	memoryBack := "memory-backend-ram"

	if q.qemuConfig.Knobs.HugePages {
		// we are setting all the bits that govmm sets when hugepages are enabled.
		// https://github.com/intel/govmm/blob/master/qemu/qemu.go#L1677
		target = "/dev/hugepages"
		memoryBack = "memory-backend-file"
		share = true
	} else {
		if q.config.EnableVhostUserStore {
			// Vhost-user-blk/scsi process which can improve performance, like SPDK,
			// requires shared-on hugepage to work with Qemu.
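			// Without hugepage-backed, shareable guest RAM the
			// vhost-user backend cannot map guest memory, hence the
			// hard error below.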
			return share, target, "", fmt.Errorf("Vhost-user-blk/scsi requires hugepage memory")
		}

		if q.config.SharedFS == config.VirtioFS || q.config.SharedFS == config.VirtioFSNydus ||
			q.config.FileBackedMemRootDir != "" {
			target = q.qemuConfig.Memory.Path
			memoryBack = "memory-backend-file"
		}
	}

	if q.qemuConfig.Knobs.MemShared {
		share = true
	}

	return share, target, memoryBack, nil
}

func (q *qemu) setupVirtioMem(ctx context.Context) error {
	// The backend memory size must be a multiple of 4 MiB
	sizeMB := (int(q.config.DefaultMaxMemorySize) - int(q.config.MemorySize)) >> 2 << 2

	share, target, memoryBack, err := q.getMemArgs()
	if err != nil {
		return err
	}

	if err = q.qmpSetup(); err != nil {
		return err
	}

	addr, bridge, err := q.arch.addDeviceToBridge(ctx, "virtiomem-dev", types.PCI)
	if err != nil {
		return err
	}
	defer func() {
		if err != nil {
			q.arch.removeDeviceFromBridge("virtiomem-dev")
		}
	}()

	bridgeID := bridge.ID
	// Hot add virtioMem dev to pcie-root-port for QemuVirt
	machineType := q.HypervisorConfig().HypervisorMachineType
	if machineType == QemuVirt {
		addr = "00"
		bridgeID = fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevicesPerPort[config.RootPort]))
		dev := config.VFIODev{ID: "virtiomem"}
		config.PCIeDevicesPerPort[config.RootPort] = append(config.PCIeDevicesPerPort[config.RootPort], dev)
	}

	err = q.qmpMonitorCh.qmp.ExecMemdevAdd(q.qmpMonitorCh.ctx, memoryBack, "virtiomem", target, sizeMB, share, "virtio-mem-pci", "virtiomem0", addr, bridgeID)
	if err == nil {
		q.Logger().Infof("Setup %dMB virtio-mem-pci success", sizeMB)
	} else {
		help := ""
		if strings.Contains(err.Error(), "Cannot allocate memory") {
			help = ". Please use the command \"echo 1 > /proc/sys/vm/overcommit_memory\" to handle it"
		}
		err = fmt.Errorf("Failed to add %dMB virtio-mem-pci: %s%s", sizeMB, err.Error(), help)
	}

	return err
}

// setupEarlyQmpConnection creates a listener socket to be passed to QEMU
// as a QMP listening endpoint. An initial connection is established, to
// be used as the QMP client socket. This allows detecting an early failure
// of QEMU instead of looping on connect until some timeout expires.
func (q *qemu) setupEarlyQmpConnection() (net.Conn, error) {
	monitorSockPath := q.qmpMonitorCh.path

	qmpListener, err := net.Listen("unix", monitorSockPath)
	if err != nil {
		q.Logger().WithError(err).Errorf("Unable to listen on unix socket address (%s)", monitorSockPath)
		return nil, err
	}

	// A duplicate fd of this socket will be passed to QEMU. We must
	// close the original one when we're done.
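	// The trick, end to end: listen() here, dup the listener fd via File()
	// below, hand the dup to QEMU through QMPSockets[0].FD, and pre-connect
	// a client so an early QEMU exit surfaces as a dropped connection
	// rather than a connect timeout.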
	defer qmpListener.Close()

	if rootless.IsRootless() {
		err = syscall.Chown(monitorSockPath, int(q.config.Uid), int(q.config.Gid))
		if err != nil {
			q.Logger().WithError(err).Errorf("Unable to make unix socket (%s) rootless", monitorSockPath)
			return nil, err
		}
	}

	VMFd, err := qmpListener.(*net.UnixListener).File()
	if err != nil {
		return nil, err
	}
	defer func() {
		if err != nil {
			VMFd.Close()
		}
	}()

	// This socket will be used to establish the initial QMP connection
	dialer := net.Dialer{Cancel: q.qmpMonitorCh.ctx.Done()}
	conn, err := dialer.Dial("unix", monitorSockPath)
	if err != nil {
		q.Logger().WithError(err).Errorf("Unable to connect to unix socket (%s)", monitorSockPath)
		return nil, err
	}

	// We need to keep the socket file around to be able to re-connect
	qmpListener.(*net.UnixListener).SetUnlinkOnClose(false)

	// Pass the duplicated fd of the listener socket to QEMU
	q.qemuConfig.QMPSockets[0].FD = VMFd
	q.fds = append(q.fds, q.qemuConfig.QMPSockets[0].FD)

	return conn, nil
}

func (q *qemu) LogAndWait(qemuCmd *exec.Cmd, reader io.ReadCloser) {
	pid := qemuCmd.Process.Pid
	q.Logger().Infof("Start logging QEMU (qemuPid=%d)", pid)
	scanner := bufio.NewScanner(reader)
	warnRE := regexp.MustCompile("(^[^:]+: )warning: ")
	for scanner.Scan() {
		text := scanner.Text()
		if warnRE.MatchString(text) {
			text = warnRE.ReplaceAllString(text, "$1")
			q.Logger().WithField("qemuPid", pid).Warning(text)
		} else {
			q.Logger().WithField("qemuPid", pid).Error(text)
		}
	}
	q.Logger().Infof("Stop logging QEMU (qemuPid=%d)", pid)
	qemuCmd.Wait()
}

// StartVM will start the Sandbox's VM.
func (q *qemu) StartVM(ctx context.Context, timeout int) error {
	span, ctx := katatrace.Trace(ctx, q.Logger(), "StartVM", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	if q.config.Debug {
		params := q.arch.kernelParameters(q.config.Debug)
		strParams := SerializeParams(params, "=")
		formatted := strings.Join(strParams, " ")

		// The name of this field matches a similar one generated by
		// the runtime and allows users to identify which parameters
		// are set here, which come from the runtime and which are set
		// by the user.
		q.Logger().WithField("default-kernel-parameters", formatted).Debug()
	}

	defer func() {
		for _, fd := range q.fds {
			if err := fd.Close(); err != nil {
				q.Logger().WithError(err).Error("After launching Qemu")
			}
		}
		q.fds = []*os.File{}
	}()

	vmPath := filepath.Join(q.config.VMStorePath, q.id)
	err := utils.MkdirAllWithInheritedOwner(vmPath, DirMode)
	if err != nil {
		return err
	}
	q.Logger().WithField("vm path", vmPath).Info("created vm path")

	defer func() {
		if err != nil {
			if err := os.RemoveAll(vmPath); err != nil {
				q.Logger().WithError(err).Error("Failed to clean up the VM directory")
			}
		}
	}()

	var qmpConn net.Conn
	qmpConn, err = q.setupEarlyQmpConnection()
	if err != nil {
		return err
	}

	// This needs to be done as late as possible, just before launching:
	// processes executed by kata-runtime after this call, such as
	// virtiofsd, run with the SELinux label. If these processes require
	// privilege, we do not want to run them under confinement.
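	// Note: label.SetProcessLabel applies to programs exec'ed after this
	// point; the deferred SetProcessLabel("") below resets it once QEMU
	// has been launched.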
if !q.config.DisableSeLinux { if err := label.SetProcessLabel(q.config.SELinuxProcessLabel); err != nil { return err } defer label.SetProcessLabel("") } if q.config.SharedFS == config.VirtioFS || q.config.SharedFS == config.VirtioFSNydus { err = q.setupVirtiofsDaemon(ctx) if err != nil { return err } defer func() { if err != nil { if shutdownErr := q.stopVirtiofsDaemon(ctx); shutdownErr != nil { q.Logger().WithError(shutdownErr).Warn("failed to stop virtiofsDaemon") } } }() } qemuCmd, reader, err := govmmQemu.LaunchQemu(q.qemuConfig, newQMPLogger()) if err != nil { q.Logger().WithError(err).Error("failed to launch qemu") return fmt.Errorf("failed to launch qemu: %s", err) } // Log QEMU errors and ensure the QEMU process is reaped after // termination. go q.LogAndWait(qemuCmd, reader) err = q.waitVM(ctx, qmpConn, timeout) if err != nil { return err } if q.config.BootFromTemplate { if err = q.bootFromTemplate(); err != nil { return err } } if q.config.VirtioMem { err = q.setupVirtioMem(ctx) } return err } func (q *qemu) bootFromTemplate() error { if err := q.qmpSetup(); err != nil { return err } defer q.qmpShutdown() err := q.arch.setIgnoreSharedMemoryMigrationCaps(q.qmpMonitorCh.ctx, q.qmpMonitorCh.qmp) if err != nil { q.Logger().WithError(err).Error("set migration ignore shared memory") return err } uri := fmt.Sprintf("exec:cat %s", q.config.DevicesStatePath) err = q.qmpMonitorCh.qmp.ExecuteMigrationIncoming(q.qmpMonitorCh.ctx, uri) if err != nil { return err } return q.waitMigration() } // waitVM will wait for the Sandbox's VM to be up and running. func (q *qemu) waitVM(ctx context.Context, qmpConn net.Conn, timeout int) error { span, _ := katatrace.Trace(ctx, q.Logger(), "waitVM", qemuTracingTags, map[string]string{"sandbox_id": q.id}) defer span.End() if timeout < 0 { return fmt.Errorf("Invalid timeout %ds", timeout) } cfg := govmmQemu.QMPConfig{Logger: newQMPLogger()} var qmp *govmmQemu.QMP var disconnectCh chan struct{} var ver *govmmQemu.QMPVersion var err error // clear any possible old state before trying to connect again. q.qmpShutdown() timeStart := time.Now() for { disconnectCh = make(chan struct{}) qmp, ver, err = govmmQemu.QMPStartWithConn(q.qmpMonitorCh.ctx, qmpConn, cfg, disconnectCh) if err == nil { break } if int(time.Since(timeStart).Seconds()) > timeout { return fmt.Errorf("Failed to connect to QEMU instance (timeout %ds): %v", timeout, err) } time.Sleep(time.Duration(50) * time.Millisecond) } q.qmpMonitorCh.qmp = qmp q.qmpMonitorCh.disconn = disconnectCh defer q.qmpShutdown() q.Logger().WithFields(logrus.Fields{ "qmp-major-version": ver.Major, "qmp-minor-version": ver.Minor, "qmp-micro-version": ver.Micro, "qmp-Capabilities": strings.Join(ver.Capabilities, ","), }).Infof("QMP details") if err = q.qmpMonitorCh.qmp.ExecuteQMPCapabilities(q.qmpMonitorCh.ctx); err != nil { q.Logger().WithError(err).Error(qmpCapErrMsg) return err } return nil } // StopVM will stop the Sandbox's VM. 
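// If waitOnly is true it only waits, up to qemuStopSandboxTimeoutSecs, for an
// already-terminating QEMU to exit; otherwise QEMU is killed with SIGKILL.
// In both cases the VM path and, when virtio-fs is in use, the virtiofs
// daemon are cleaned up.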
func (q *qemu) StopVM(ctx context.Context, waitOnly bool) (err error) { q.mu.Lock() defer q.mu.Unlock() span, _ := katatrace.Trace(ctx, q.Logger(), "StopVM", qemuTracingTags, map[string]string{"sandbox_id": q.id}) defer span.End() q.Logger().Info("Stopping Sandbox") if atomic.LoadInt32(&q.stopped) != 0 { q.Logger().Info("Already stopped") return nil } defer func() { q.cleanupVM() if err == nil { atomic.StoreInt32(&q.stopped, 1) } }() if err := q.qmpSetup(); err != nil { return err } pids := q.GetPids() if len(pids) == 0 { return errors.New("cannot determine QEMU PID") } pid := pids[0] if pid > 0 { if waitOnly { err := utils.WaitLocalProcess(pid, qemuStopSandboxTimeoutSecs, syscall.Signal(0), q.Logger()) if err != nil { return err } } else { err = syscall.Kill(pid, syscall.SIGKILL) if err != nil { q.Logger().WithError(err).Error("Fail to send SIGKILL to qemu") return err } } } if q.config.SharedFS == config.VirtioFS || q.config.SharedFS == config.VirtioFSNydus { if err := q.stopVirtiofsDaemon(ctx); err != nil { return err } } return nil } func (q *qemu) cleanupVM() error { // Cleanup vm path dir := filepath.Join(q.config.VMStorePath, q.id) // If it's a symlink, remove both dir and the target. // This can happen when vm template links a sandbox to a vm. link, err := filepath.EvalSymlinks(dir) if err != nil { // Well, it's just Cleanup failure. Let's ignore it. q.Logger().WithError(err).WithField("dir", dir).Warn("failed to resolve vm path") } q.Logger().WithField("link", link).WithField("dir", dir).Infof("Cleanup vm path") if err := os.RemoveAll(dir); err != nil { q.Logger().WithError(err).Warnf("failed to remove vm path %s", dir) } if link != dir && link != "" { if err := os.RemoveAll(link); err != nil { q.Logger().WithError(err).WithField("link", link).Warn("failed to remove resolved vm path") } } if q.config.VMid != "" { dir = filepath.Join(q.config.RunStorePath, q.config.VMid) if err := os.RemoveAll(dir); err != nil { q.Logger().WithError(err).WithField("path", dir).Warnf("failed to remove vm path") } } if rootless.IsRootless() { if _, err := user.Lookup(q.config.User); err != nil { q.Logger().WithError(err).WithFields( logrus.Fields{ "user": q.config.User, "uid": q.config.Uid, }).Warn("failed to find the user, it might have been removed") return nil } if err := pkgUtils.RemoveVmmUser(q.config.User); err != nil { q.Logger().WithError(err).WithFields( logrus.Fields{ "user": q.config.User, "uid": q.config.Uid, }).Warn("failed to delete the user") return nil } q.Logger().WithFields( logrus.Fields{ "user": q.config.User, "uid": q.config.Uid, }).Debug("successfully removed the non root user") } // If we have initdata, we should drop initdata image path hypervisorConfig := q.HypervisorConfig() if len(hypervisorConfig.Initdata) > 0 { initdataWorkdir := filepath.Join(string(filepath.Separator), "/run/kata-containers/shared/initdata", q.id) if err := os.RemoveAll(initdataWorkdir); err != nil { q.Logger().WithError(err).Warnf("failed to remove initdata work dir %s", initdataWorkdir) } } return nil } func (q *qemu) togglePauseSandbox(ctx context.Context, pause bool) error { span, _ := katatrace.Trace(ctx, q.Logger(), "togglePauseSandbox", qemuTracingTags, map[string]string{"sandbox_id": q.id}) defer span.End() if err := q.qmpSetup(); err != nil { return err } if pause { return q.qmpMonitorCh.qmp.ExecuteStop(q.qmpMonitorCh.ctx) } return q.qmpMonitorCh.qmp.ExecuteCont(q.qmpMonitorCh.ctx) } func (q *qemu) qmpSetup() error { q.qmpMonitorCh.Lock() defer q.qmpMonitorCh.Unlock() if q.qmpMonitorCh.qmp != 
nil {
		return nil
	}

	events := make(chan govmmQemu.QMPEvent)
	go q.loopQMPEvent(events)

	cfg := govmmQemu.QMPConfig{
		Logger:  newQMPLogger(),
		EventCh: events,
	}

	// Auto-closed by QMPStart().
	disconnectCh := make(chan struct{})

	qmp, _, err := govmmQemu.QMPStart(q.qmpMonitorCh.ctx, q.qmpMonitorCh.path, cfg, disconnectCh)
	if err != nil {
		q.Logger().WithError(err).Error("Failed to connect to QEMU instance")
		return err
	}

	err = qmp.ExecuteQMPCapabilities(q.qmpMonitorCh.ctx)
	if err != nil {
		qmp.Shutdown()
		q.Logger().WithError(err).Error(qmpCapErrMsg)
		return err
	}
	q.qmpMonitorCh.qmp = qmp
	q.qmpMonitorCh.disconn = disconnectCh

	return nil
}

func (q *qemu) loopQMPEvent(event chan govmmQemu.QMPEvent) {
	for e := range event {
		q.Logger().WithField("event", e).Debug("got QMP event")
		if e.Name == "GUEST_PANICKED" {
			go q.handleGuestPanic()
		}
	}
	q.Logger().Infof("QMP event channel closed")
}

func (q *qemu) handleGuestPanic() {
	if err := q.dumpGuestMemory(q.config.GuestMemoryDumpPath); err != nil {
		q.Logger().WithError(err).Error("failed to dump guest memory")
	}

	// TODO: how to notify the upper level sandbox to handle the error
	// to do a fast fail(shutdown or others).
	// tracked by https://github.com/kata-containers/kata-containers/issues/1026
}

// canDumpGuestMemory checks whether a guest memory dump operation can be done.
// For now it only ensures that there is at least twice the VM memory size in
// free disk space.
func (q *qemu) canDumpGuestMemory(dumpSavePath string) error {
	fs := unix.Statfs_t{}
	if err := unix.Statfs(dumpSavePath, &fs); err != nil {
		q.Logger().WithError(err).WithField("dumpSavePath", dumpSavePath).Error("failed to call Statfs")
		return nil
	}
	availSpaceInBytes := fs.Bavail * uint64(fs.Bsize)
	q.Logger().WithFields(
		logrus.Fields{
			"dumpSavePath":      dumpSavePath,
			"availSpaceInBytes": availSpaceInBytes,
		}).Info("get avail space")

	// get guest memory size
	guestMemorySizeInBytes := (uint64(q.config.MemorySize) + uint64(q.state.HotpluggedMemory)) << utils.MibToBytesShift
	q.Logger().WithField("guestMemorySizeInBytes", guestMemorySizeInBytes).Info("get guest memory size")

	// By default we want to ensure there is at least twice the VM memory size
	// of free space available, which should be enough to complete one dump
	// operation for one sandbox.
	expectedMemorySize := guestMemorySizeInBytes * 2
	if availSpaceInBytes >= expectedMemorySize {
		return nil
	}

	return fmt.Errorf("there is not enough free space to store the memory dump file. Expected %d bytes, but only %d bytes available", expectedMemorySize, availSpaceInBytes)
}

// dumpSandboxMetaInfo saves meta information for debugging purposes, including:
// hypervisor version, sandbox/container state, hypervisor config
func (q *qemu) dumpSandboxMetaInfo(dumpSavePath string) {
	dumpStatePath := filepath.Join(dumpSavePath, "state")

	// copy state from /run/vc/sbs to memory dump directory
	statePath := filepath.Join(q.config.RunStorePath, q.id)
	command := []string{"/bin/cp", "-ar", statePath, dumpStatePath}
	q.Logger().WithField("command", command).Info("try to Save sandbox state")
	if output, err := pkgUtils.RunCommandFull(command, true); err != nil {
		q.Logger().WithError(err).WithField("output", output).Error("failed to Save state")
	}
	// Save hypervisor meta information
	fileName := filepath.Join(dumpSavePath, "hypervisor.conf")
	data, _ := json.MarshalIndent(q.config, "", " ")
	if err := os.WriteFile(fileName, data, defaultFilePerms); err != nil {
		q.Logger().WithError(err).WithField("hypervisor.conf", data).Error("write to hypervisor.conf file failed")
	}

	// Save hypervisor version
	hyperVisorVersion, err := pkgUtils.RunCommand([]string{q.config.HypervisorPath, "--version"})
	if err != nil {
		q.Logger().WithError(err).WithField("HypervisorPath", data).Error("failed to get hypervisor version")
	}

	fileName = filepath.Join(dumpSavePath, "hypervisor.version")
	if err := os.WriteFile(fileName, []byte(hyperVisorVersion), defaultFilePerms); err != nil {
		q.Logger().WithError(err).WithField("hypervisor.version", data).Error("write to hypervisor.version file failed")
	}
}

func (q *qemu) dumpGuestMemory(dumpSavePath string) error {
	if dumpSavePath == "" {
		return nil
	}

	q.memoryDumpFlag.Lock()
	defer q.memoryDumpFlag.Unlock()

	q.Logger().WithField("dumpSavePath", dumpSavePath).Info("try to dump guest memory")

	dumpSavePath = filepath.Join(dumpSavePath, q.id)
	dumpStatePath := filepath.Join(dumpSavePath, "state")
	if err := pkgUtils.EnsureDir(dumpStatePath, DirMode); err != nil {
		return err
	}

	// Save meta information for sandbox
	q.dumpSandboxMetaInfo(dumpSavePath)
	q.Logger().Info("dump sandbox meta information completed")

	// Check device free space and estimated dump size
	if err := q.canDumpGuestMemory(dumpSavePath); err != nil {
		q.Logger().Warnf("can't dump guest memory: %s", err.Error())
		return err
	}

	// dump guest memory
	protocol := fmt.Sprintf("file:%s/vmcore-%s.%s", dumpSavePath,
		time.Now().Format("20060102150405.999"), memoryDumpFormat)
	q.Logger().Infof("try to dump guest memory to %s", protocol)

	if err := q.qmpSetup(); err != nil {
		q.Logger().WithError(err).Error("setup manage QMP failed")
		return err
	}

	if err := q.qmpMonitorCh.qmp.ExecuteDumpGuestMemory(q.qmpMonitorCh.ctx, protocol, q.config.GuestMemoryDumpPaging, memoryDumpFormat); err != nil {
		q.Logger().WithError(err).Error("dump guest memory failed")
		return err
	}

	q.Logger().Info("dump guest memory completed")
	return nil
}

func (q *qemu) qmpShutdown() {
	q.qmpMonitorCh.Lock()
	defer q.qmpMonitorCh.Unlock()

	if q.qmpMonitorCh.qmp != nil {
		q.qmpMonitorCh.qmp.Shutdown()
		// wait on disconnected channel to be sure that the qmp channel has
		// been closed cleanly.
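		// govmm closes this channel when the monitor connection goes
		// away, so the receive below doubles as a join on the QMP
		// reader.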
<-q.qmpMonitorCh.disconn q.qmpMonitorCh.qmp = nil q.qmpMonitorCh.disconn = nil } } func (q *qemu) hotplugAddBlockDevice(ctx context.Context, drive *config.BlockDrive, op Operation, devID string) (err error) { // drive can be a pmem device, in which case it's used as backing file for a nvdimm device if q.config.BlockDeviceDriver == config.Nvdimm || drive.Pmem { var blocksize int64 file, err := os.Open(drive.File) if err != nil { return err } defer file.Close() st, err := file.Stat() if err != nil { return fmt.Errorf("failed to get information from nvdimm device %v: %v", drive.File, err) } // regular files do not support syscall BLKGETSIZE64 if st.Mode().IsRegular() { blocksize = st.Size() } else if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, file.Fd(), unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&blocksize))); err != 0 { return err } if err = q.qmpMonitorCh.qmp.ExecuteNVDIMMDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, drive.File, blocksize, &drive.Pmem); err != nil { q.Logger().WithError(err).Errorf("Failed to add NVDIMM device %s", drive.File) return err } drive.NvdimmID = strconv.Itoa(q.nvdimmCount) q.nvdimmCount++ return nil } qblkDevice := govmmQemu.BlockDevice{ ID: drive.ID, File: drive.File, ReadOnly: drive.ReadOnly, AIO: govmmQemu.BlockDeviceAIO(q.config.BlockDeviceAIO), } if drive.Swap { err = q.qmpMonitorCh.qmp.ExecuteBlockdevAddWithDriverCache(q.qmpMonitorCh.ctx, "file", &qblkDevice, false, false) } else if q.config.BlockDeviceCacheSet { err = q.qmpMonitorCh.qmp.ExecuteBlockdevAddWithCache(q.qmpMonitorCh.ctx, &qblkDevice, q.config.BlockDeviceCacheDirect, q.config.BlockDeviceCacheNoflush) } else { err = q.qmpMonitorCh.qmp.ExecuteBlockdevAdd(q.qmpMonitorCh.ctx, &qblkDevice) } if err != nil { return err } defer func() { if err != nil { q.qmpMonitorCh.qmp.ExecuteBlockdevDel(q.qmpMonitorCh.ctx, drive.ID) } }() switch { case drive.Swap: fallthrough case q.config.BlockDeviceDriver == config.VirtioBlock: driver := "virtio-blk-pci" addr, bridge, err := q.arch.addDeviceToBridge(ctx, drive.ID, types.PCI) if err != nil { return err } defer func() { if err != nil { q.arch.removeDeviceFromBridge(drive.ID) } }() bridgeSlot, err := types.PciSlotFromInt(bridge.Addr) if err != nil { return err } devSlot, err := types.PciSlotFromString(addr) if err != nil { return err } drive.PCIPath, err = types.PciPathFromSlots(bridgeSlot, devSlot) if err != nil { return err } queues := int(q.config.NumVCPUs()) if err = q.qmpMonitorCh.qmp.ExecutePCIDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, addr, bridge.ID, romFile, queues, true, defaultDisableModern); err != nil { return err } case q.config.BlockDeviceDriver == config.VirtioBlockCCW: driver := "virtio-blk-ccw" addr, bridge, err := q.arch.addDeviceToBridge(ctx, drive.ID, types.CCW) if err != nil { return err } var devNoHotplug string devNoHotplug, err = bridge.AddressFormatCCW(addr) if err != nil { return err } drive.DevNo, err = bridge.AddressFormatCCWForVirtServer(addr) if err != nil { return err } if err = q.qmpMonitorCh.qmp.ExecuteDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, devNoHotplug, "", true, false); err != nil { return err } case q.config.BlockDeviceDriver == config.VirtioSCSI: driver := "scsi-hd" // Bus exposed by the SCSI Controller bus := scsiControllerID + ".0" // Get SCSI-id and LUN based on the order of attaching drives. 
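		// Illustration, assuming the conventional 256-LUNs-per-target
		// split inside utils.GetSCSIIdLun: drive.Index 5 maps to
		// scsiID=0/lun=5, and index 260 maps to scsiID=1/lun=4.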
		scsiID, lun, err := utils.GetSCSIIdLun(drive.Index)
		if err != nil {
			return err
		}

		if err = q.qmpMonitorCh.qmp.ExecuteSCSIDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, bus, romFile, scsiID, lun, true, defaultDisableModern); err != nil {
			return err
		}
	default:
		return fmt.Errorf("Block device %s not recognized", q.config.BlockDeviceDriver)
	}

	return nil
}

func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.VhostUserDeviceAttrs, op Operation, devID string) (err error) {
	err = q.qmpMonitorCh.qmp.ExecuteCharDevUnixSocketAdd(q.qmpMonitorCh.ctx, vAttr.DevID, vAttr.SocketPath, false, false, vAttr.ReconnectTime)
	if err != nil {
		return err
	}
	defer func() {
		if err != nil {
			q.qmpMonitorCh.qmp.ExecuteChardevDel(q.qmpMonitorCh.ctx, vAttr.DevID)
		}
	}()

	driver := "vhost-user-blk-pci"
	machineType := q.HypervisorConfig().HypervisorMachineType

	switch machineType {
	case QemuVirt:
		// The addr of a device corresponds to device:function for PCIe in QEMU, starting from 0.
		// Since this device is the first and only one on this bus (root port), it should be 0.
		addr := "00"
		bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevicesPerPort[config.RootPort]))
		dev := config.VFIODev{ID: devID}
		config.PCIeDevicesPerPort[config.RootPort] = append(config.PCIeDevicesPerPort[config.RootPort], dev)

		bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID)
		bridgeSlot, err := q.arch.qomGetSlot(bridgeQomPath, &q.qmpMonitorCh)
		if err != nil {
			return err
		}

		devSlot, err := types.PciSlotFromString(addr)
		if err != nil {
			return err
		}
		vAttr.PCIPath, err = types.PciPathFromSlots(bridgeSlot, devSlot)
		if err != nil {
			return err
		}

		if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeID); err != nil {
			return err
		}
	default:
		addr, bridge, err := q.arch.addDeviceToBridge(ctx, vAttr.DevID, types.PCI)
		if err != nil {
			return err
		}
		defer func() {
			if err != nil {
				q.arch.removeDeviceFromBridge(vAttr.DevID)
			}
		}()

		bridgeSlot, err := types.PciSlotFromInt(bridge.Addr)
		if err != nil {
			return err
		}
		devSlot, err := types.PciSlotFromString(addr)
		if err != nil {
			return err
		}
		vAttr.PCIPath, err = types.PciPathFromSlots(bridgeSlot, devSlot)
		if err != nil {
			return err
		}

		if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridge.ID); err != nil {
			return err
		}
	}

	return nil
}

func (q *qemu) hotplugBlockDevice(ctx context.Context, drive *config.BlockDrive, op Operation) error {
	if err := q.qmpSetup(); err != nil {
		return err
	}

	devID := "virtio-" + drive.ID

	if op == AddDevice {
		return q.hotplugAddBlockDevice(ctx, drive, op, devID)
	}

	if !drive.Swap && q.config.BlockDeviceDriver == config.VirtioBlock {
		if err := q.arch.removeDeviceFromBridge(drive.ID); err != nil {
			return err
		}
	}

	if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil {
		return err
	}

	return q.qmpMonitorCh.qmp.ExecuteBlockdevDel(q.qmpMonitorCh.ctx, drive.ID)
}

func (q *qemu) hotplugVhostUserDevice(ctx context.Context, vAttr *config.VhostUserDeviceAttrs, op Operation) error {
	if err := q.qmpSetup(); err != nil {
		return err
	}

	devID := "virtio-" + vAttr.DevID

	if op == AddDevice {
		switch vAttr.Type {
		case config.VhostUserBlk:
			return q.hotplugAddVhostUserBlkDevice(ctx, vAttr, op, devID)
		default:
			return fmt.Errorf("Incorrect vhost-user device type found")
		}
	} else {
		machineType := q.HypervisorConfig().HypervisorMachineType
		if machineType != QemuVirt {
			if err := q.arch.removeDeviceFromBridge(vAttr.DevID); err != nil {
				return err
			}
		}

		if
err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil { return err } return q.qmpMonitorCh.qmp.ExecuteChardevDel(q.qmpMonitorCh.ctx, vAttr.DevID) } } func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) { return q.executeVFIODeviceAdd(device) } func (q *qemu) hotplugVFIODeviceSwitchPort(ctx context.Context, device *config.VFIODev) (err error) { return q.executeVFIODeviceAdd(device) } func (q *qemu) hotplugVFIODeviceBridgePort(ctx context.Context, device *config.VFIODev) (err error) { addr, bridge, err := q.arch.addDeviceToBridge(ctx, device.ID, types.PCI) if err != nil { return err } defer func() { if err != nil { q.arch.removeDeviceFromBridge(device.ID) } }() return q.executePCIVFIODeviceAdd(device, addr, bridge.ID) } func (q *qemu) executePCIVFIODeviceAdd(device *config.VFIODev, addr string, bridgeID string) error { switch device.Type { case config.VFIOPCIDeviceNormalType: return q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, addr, bridgeID, romFile) case config.VFIOPCIDeviceMediatedType: return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, addr, bridgeID, romFile) case config.VFIOAPDeviceMediatedType: return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev, device.ID) default: return fmt.Errorf("Incorrect VFIO device type found") } } func (q *qemu) executeVFIODeviceAdd(device *config.VFIODev) error { switch device.Type { case config.VFIOPCIDeviceNormalType: return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, device.Bus, romFile) case config.VFIOPCIDeviceMediatedType: return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, "", device.Bus, romFile) case config.VFIOAPDeviceMediatedType: return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev, device.ID) default: return fmt.Errorf("Incorrect VFIO device type found") } } func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op Operation) (err error) { if err = q.qmpSetup(); err != nil { return err } if op == AddDevice { buf, _ := json.Marshal(device) q.Logger().WithFields(logrus.Fields{ "machine-type": q.HypervisorConfig().HypervisorMachineType, "hot-plug-vfio": q.state.HotPlugVFIO, "device-info": string(buf), }).Info("Start hot-plug VFIO device") err = fmt.Errorf("Incorrect hot plug configuration %v for device %v found", q.state.HotPlugVFIO, device) // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus // for pc machine type instead of bridge. This is useful for devices that require // a large PCI BAR which is a currently a limitation with PCI bridges. if q.state.HotPlugVFIO == config.RootPort { err = q.hotplugVFIODeviceRootPort(ctx, device) } else if q.state.HotPlugVFIO == config.SwitchPort { err = q.hotplugVFIODeviceSwitchPort(ctx, device) } else if q.state.HotPlugVFIO == config.BridgePort { err = q.hotplugVFIODeviceBridgePort(ctx, device) } if err != nil { return err } // Depending on whether we're doing root port or // bridge hotplug, and how the bridge is set up in // other parts of the code, we may or may not already // have information about the slot number of the // bridge and or the device. 
For simplicity, just // query both of them back from qemu based on the arch device.GuestPciPath, err = q.arch.qomGetPciPath(device.ID, &q.qmpMonitorCh) return err } else { q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device") if q.state.HotPlugVFIO == config.BridgePort { if err := q.arch.removeDeviceFromBridge(device.ID); err != nil { return err } } return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID) } } func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error { var ( VMFdNames []string VhostFdNames []string ) for i, VMFd := range VMFds { fdName := fmt.Sprintf("fd%d", i) if err := q.qmpMonitorCh.qmp.ExecuteGetFD(q.qmpMonitorCh.ctx, fdName, VMFd); err != nil { return err } VMFdNames = append(VMFdNames, fdName) } for i, VhostFd := range VhostFds { fdName := fmt.Sprintf("vhostfd%d", i) if err := q.qmpMonitorCh.qmp.ExecuteGetFD(q.qmpMonitorCh.ctx, fdName, VhostFd); err != nil { return err } VhostFd.Close() VhostFdNames = append(VhostFdNames, fdName) } return q.qmpMonitorCh.qmp.ExecuteNetdevAddByFds(q.qmpMonitorCh.ctx, "tap", name, VMFdNames, VhostFdNames) } func (q *qemu) hotplugNetDevice(ctx context.Context, endpoint Endpoint, op Operation) (err error) { if err = q.qmpSetup(); err != nil { return err } var tap TapInterface switch endpoint.Type() { case VethEndpointType, IPVlanEndpointType, MacvlanEndpointType, TuntapEndpointType: tap = endpoint.NetworkPair().TapInterface case TapEndpointType: drive := endpoint.(*TapEndpoint) tap = drive.TapInterface default: return fmt.Errorf("this endpoint is not supported") } devID := "virtio-" + tap.ID machineType := q.HypervisorConfig().HypervisorMachineType if op == AddDevice { if err = q.hotAddNetDevice(tap.Name, endpoint.HardwareAddr(), tap.VMFds, tap.VhostFds); err != nil { return err } defer func() { if err != nil { q.qmpMonitorCh.qmp.ExecuteNetdevDel(q.qmpMonitorCh.ctx, tap.Name) } }() // Hotplug net dev to pcie root port for QemuVirt if machineType == QemuVirt { addr := "00" bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevicesPerPort[config.RootPort])) dev := config.VFIODev{ID: devID} config.PCIeDevicesPerPort[config.RootPort] = append(config.PCIeDevicesPerPort[config.RootPort], dev) return q.qmpMonitorCh.qmp.ExecuteNetPCIDeviceAdd(q.qmpMonitorCh.ctx, tap.Name, devID, endpoint.HardwareAddr(), addr, bridgeID, romFile, int(q.config.NumVCPUs()), defaultDisableModern) } addr, bridge, err := q.arch.addDeviceToBridge(ctx, tap.ID, types.PCI) if err != nil { return err } defer func() { if err != nil { q.arch.removeDeviceFromBridge(tap.ID) } }() q.arch.setEndpointDevicePath(endpoint, bridge.Addr, addr) var machine govmmQemu.Machine machine, err = q.getQemuMachine() if err != nil { return err } if machine.Type == QemuCCWVirtio { devNoHotplug := fmt.Sprintf("fe.%x.%v", bridge.Addr, addr) return q.qmpMonitorCh.qmp.ExecuteNetCCWDeviceAdd(q.qmpMonitorCh.ctx, tap.Name, devID, endpoint.HardwareAddr(), devNoHotplug, int(q.config.NumVCPUs())) } return q.qmpMonitorCh.qmp.ExecuteNetPCIDeviceAdd(q.qmpMonitorCh.ctx, tap.Name, devID, endpoint.HardwareAddr(), addr, bridge.ID, romFile, int(q.config.NumVCPUs()), defaultDisableModern) } if err := q.arch.removeDeviceFromBridge(tap.ID); err != nil { return err } if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil { return err } return q.qmpMonitorCh.qmp.ExecuteNetdevDel(q.qmpMonitorCh.ctx, tap.Name) } func (q *qemu) hotplugDevice(ctx context.Context, devInfo interface{}, devType 
func (q *qemu) hotplugDevice(ctx context.Context, devInfo interface{}, devType DeviceType, op Operation) (interface{}, error) {
	switch devType {
	case BlockDev:
		drive := devInfo.(*config.BlockDrive)
		return nil, q.hotplugBlockDevice(ctx, drive, op)
	case CpuDev:
		vcpus := devInfo.(uint32)
		return q.hotplugCPUs(vcpus, op)
	case VfioDev:
		device := devInfo.(*config.VFIODev)
		return nil, q.hotplugVFIODevice(ctx, device, op)
	case MemoryDev:
		memdev := devInfo.(*MemoryDevice)
		return q.hotplugMemory(memdev, op)
	case NetDev:
		device := devInfo.(Endpoint)
		return nil, q.hotplugNetDevice(ctx, device, op)
	case VhostuserDev:
		vAttr := devInfo.(*config.VhostUserDeviceAttrs)
		return nil, q.hotplugVhostUserDevice(ctx, vAttr, op)
	default:
		return nil, fmt.Errorf("cannot hotplug device: unsupported device type '%v'", devType)
	}
}

func (q *qemu) HotplugAddDevice(ctx context.Context, devInfo interface{}, devType DeviceType) (interface{}, error) {
	span, ctx := katatrace.Trace(ctx, q.Logger(), "HotplugAddDevice", qemuTracingTags)
	katatrace.AddTags(span, "sandbox_id", q.id, "device", devInfo)
	defer span.End()

	data, err := q.hotplugDevice(ctx, devInfo, devType, AddDevice)
	if err != nil {
		return data, err
	}
	return data, nil
}

func (q *qemu) HotplugRemoveDevice(ctx context.Context, devInfo interface{}, devType DeviceType) (interface{}, error) {
	span, ctx := katatrace.Trace(ctx, q.Logger(), "HotplugRemoveDevice", qemuTracingTags)
	katatrace.AddTags(span, "sandbox_id", q.id, "device", devInfo)
	defer span.End()

	data, err := q.hotplugDevice(ctx, devInfo, devType, RemoveDevice)
	if err != nil {
		return data, err
	}
	return data, nil
}

func (q *qemu) hotplugCPUs(vcpus uint32, op Operation) (uint32, error) {
	if vcpus == 0 {
		q.Logger().Warnf("cannot hotplug 0 vCPUs")
		return 0, nil
	}

	if err := q.qmpSetup(); err != nil {
		return 0, err
	}

	if op == AddDevice {
		return q.hotplugAddCPUs(vcpus)
	}
	return q.hotplugRemoveCPUs(vcpus)
}

// try to hot add an amount of vCPUs, returns the number of vCPUs added
func (q *qemu) hotplugAddCPUs(amount uint32) (uint32, error) {
	currentVCPUs := q.qemuConfig.SMP.CPUs + uint32(len(q.state.HotpluggedVCPUs))

	// Don't fail if the number of max vCPUs would be exceeded: log a warning
	// and hot add only the vCPUs needed to reach the maximum.
	if currentVCPUs+amount > q.config.DefaultMaxVCPUs {
		q.Logger().Warnf("Cannot hotplug %d CPUs, currently this SB has %d CPUs and the maximum amount of CPUs is %d",
			amount, currentVCPUs, q.config.DefaultMaxVCPUs)
		amount = q.config.DefaultMaxVCPUs - currentVCPUs
	}

	if amount == 0 {
		// Don't fail if no more vCPUs can be added, since cgroups can still be updated
		q.Logger().Warnf("maximum number of vCPUs '%d' has been reached", q.config.DefaultMaxVCPUs)
		return 0, nil
	}

	// get the list of hotpluggable CPUs
	hotpluggableVCPUs, err := q.qmpMonitorCh.qmp.ExecuteQueryHotpluggableCPUs(q.qmpMonitorCh.ctx)
	if err != nil {
		return 0, fmt.Errorf("failed to query hotpluggable CPUs: %v", err)
	}

	machine := q.arch.machine()

	var hotpluggedVCPUs uint32
	for _, hc := range hotpluggableVCPUs {
		// a non-empty qom-path means that this CPU is already in use
		if hc.QOMPath != "" {
			continue
		}

		// CPU type, e.g. host-x86_64-cpu
		driver := hc.Type
		cpuID := fmt.Sprintf("cpu-%d", len(q.state.HotpluggedVCPUs))
		socketID := hc.Properties.Socket
		dieID := hc.Properties.Die
		coreID := hc.Properties.Core
		threadID := hc.Properties.Thread

		// On IBM pSeries, s390x (CCW) and arm virt machine types we do not
		// set the socket, die and thread IDs
		if machine.Type == "pseries" || machine.Type == QemuCCWVirtio || machine.Type == "virt" {
			socketID = -1
			threadID = -1
			dieID = -1
		}

		if err := q.qmpMonitorCh.qmp.ExecuteCPUDeviceAdd(q.qmpMonitorCh.ctx, driver, cpuID, socketID, dieID, coreID, threadID, romFile); err != nil {
			q.Logger().WithField("hotplug", "cpu").Warnf("qmp hotplug cpu, cpuID=%s socketID=%d, error: %v", cpuID, socketID, err)
			// don't fail, let's try with another CPU
			continue
		}

		// a new vCPU was added: update the list of hotplugged vCPUs and
		// check whether all requested vCPUs have been added
		q.state.HotpluggedVCPUs = append(q.state.HotpluggedVCPUs, hv.CPUDevice{ID: cpuID})
		hotpluggedVCPUs++
		if hotpluggedVCPUs == amount {
			// All vCPUs were hotplugged
			return amount, nil
		}
	}

	return hotpluggedVCPUs, fmt.Errorf("failed to hot add vCPUs: only %d vCPUs of %d were added", hotpluggedVCPUs, amount)
}
// try to hot remove an amount of vCPUs, returns the number of vCPUs removed
func (q *qemu) hotplugRemoveCPUs(amount uint32) (uint32, error) {
	hotpluggedVCPUs := uint32(len(q.state.HotpluggedVCPUs))

	// we can only remove hotplugged vCPUs
	if amount > hotpluggedVCPUs {
		return 0, fmt.Errorf("Unable to remove %d CPUs, currently there are only %d hotplugged CPUs", amount, hotpluggedVCPUs)
	}

	for i := uint32(0); i < amount; i++ {
		// get the last vCPU and try to remove it
		cpu := q.state.HotpluggedVCPUs[len(q.state.HotpluggedVCPUs)-1]
		if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, cpu.ID); err != nil {
			return i, fmt.Errorf("failed to hotunplug CPUs, only %d CPUs were hotunplugged: %v", i, err)
		}

		// remove the hot-unplugged vCPU from the list
		q.state.HotpluggedVCPUs = q.state.HotpluggedVCPUs[:len(q.state.HotpluggedVCPUs)-1]
	}

	return amount, nil
}

func (q *qemu) hotplugMemory(memDev *MemoryDevice, op Operation) (int, error) {
	if !q.arch.supportGuestMemoryHotplug() {
		return 0, noGuestMemHotplugErr
	}
	if memDev.SizeMB < 0 {
		return 0, fmt.Errorf("cannot hotplug negative size (%d) memory", memDev.SizeMB)
	}
	memLog := q.Logger().WithField("hotplug", "memory")

	memLog.WithField("hotplug-memory-mb", memDev.SizeMB).Debug("requested memory hotplug")
	if err := q.qmpSetup(); err != nil {
		return 0, err
	}

	if memDev.SizeMB == 0 {
		memLog.Debug("hotplug is not required")
		return 0, nil
	}

	switch op {
	case RemoveDevice:
		memLog.WithField("operation", "remove").Debugf("Requested to remove memory: %d MB", memDev.SizeMB)
		// Don't fail, but warn that this is not supported.
		memLog.Warn("hot-remove VM memory not supported")
		return 0, nil
	case AddDevice:
		memLog.WithField("operation", "add").Debugf("Requested to add memory: %d MB", memDev.SizeMB)
		memoryAdded, err := q.hotplugAddMemory(memDev)
		if err != nil {
			return memoryAdded, err
		}
		return memoryAdded, nil
	default:
		return 0, fmt.Errorf("invalid operation %v", op)
	}
}
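// hotplugAddMemory hotplugs a memory backend via QMP, placing it in the slot
// after the highest one currently in use, and updates the hotplugged-memory
// accounting on success.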
func (q *qemu) hotplugAddMemory(memDev *MemoryDevice) (int, error) {
	memoryDevices, err := q.qmpMonitorCh.qmp.ExecQueryMemoryDevices(q.qmpMonitorCh.ctx)
	if err != nil {
		return 0, fmt.Errorf("failed to query memory devices: %v", err)
	}

	if len(memoryDevices) != 0 {
		maxSlot := -1
		for _, device := range memoryDevices {
			if maxSlot < device.Data.Slot {
				maxSlot = device.Data.Slot
			}
		}
		memDev.Slot = maxSlot + 1
	}

	share, target, memoryBack, err := q.getMemArgs()
	if err != nil {
		return 0, err
	}

	err = q.qmpMonitorCh.qmp.ExecHotplugMemory(q.qmpMonitorCh.ctx, memoryBack, "mem"+strconv.Itoa(memDev.Slot), target, memDev.SizeMB, share)
	if err != nil {
		q.Logger().WithError(err).Error("hotplug memory")
		return 0, err
	}

	// if the guest kernel only supports memory hotplug via the probe
	// interface, we need to get the address of the hot-added memory device
	if memDev.Probe {
		memoryDevices, err := q.qmpMonitorCh.qmp.ExecQueryMemoryDevices(q.qmpMonitorCh.ctx)
		if err != nil {
			return 0, fmt.Errorf("failed to query memory devices: %v", err)
		}
		if len(memoryDevices) != 0 {
			q.Logger().WithField("addr", fmt.Sprintf("0x%x", memoryDevices[len(memoryDevices)-1].Data.Addr)).Debug("recently hot-added memory device")
			memDev.Addr = memoryDevices[len(memoryDevices)-1].Data.Addr
		} else {
			return 0, fmt.Errorf("failed to probe address of recently hot-added memory device, no device exists")
		}
	}
	q.state.HotpluggedMemory += memDev.SizeMB
	return memDev.SizeMB, nil
}

func (q *qemu) PauseVM(ctx context.Context) error {
	span, ctx := katatrace.Trace(ctx, q.Logger(), "PauseVM", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	return q.togglePauseSandbox(ctx, true)
}

func (q *qemu) ResumeVM(ctx context.Context) error {
	span, ctx := katatrace.Trace(ctx, q.Logger(), "ResumeVM", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	return q.togglePauseSandbox(ctx, false)
}
// AddDevice will add extra devices to the QEMU command line.
func (q *qemu) AddDevice(ctx context.Context, devInfo interface{}, devType DeviceType) error {
	var err error
	span, _ := katatrace.Trace(ctx, q.Logger(), "AddDevice", qemuTracingTags)
	katatrace.AddTags(span, "sandbox_id", q.id, "device", devInfo)
	defer span.End()

	switch v := devInfo.(type) {
	case types.Volume:
		if q.config.SharedFS == config.VirtioFS || q.config.SharedFS == config.VirtioFSNydus {
			q.Logger().WithField("volume-type", "virtio-fs").Info("adding volume")

			var randBytes []byte
			randBytes, err = utils.GenerateRandomBytes(8)
			if err != nil {
				return err
			}
			id := hex.EncodeToString(randBytes)

			var sockPath string
			sockPath, err = q.vhostFSSocketPath(q.id)
			if err != nil {
				return err
			}

			vhostDev := config.VhostUserDeviceAttrs{
				Tag:       v.MountTag,
				Type:      config.VhostUserFS,
				CacheSize: q.config.VirtioFSCacheSize,
				Cache:     q.config.VirtioFSCache,
				QueueSize: q.config.VirtioFSQueueSize,
			}
			vhostDev.SocketPath = sockPath
			vhostDev.DevID = id

			q.qemuConfig.Devices, err = q.arch.appendVhostUserDevice(ctx, q.qemuConfig.Devices, vhostDev)
		} else {
			q.Logger().WithField("volume-type", "virtio-9p").Info("adding volume")
			q.qemuConfig.Devices, err = q.arch.append9PVolume(ctx, q.qemuConfig.Devices, v)
		}
	case types.Socket:
		q.qemuConfig.Devices = q.arch.appendSocket(q.qemuConfig.Devices, v)
	case types.VSock:
		q.fds = append(q.fds, v.VhostFd)
		q.qemuConfig.Devices, err = q.arch.appendVSock(ctx, q.qemuConfig.Devices, v)
	case Endpoint:
		q.qemuConfig.Devices, err = q.arch.appendNetwork(ctx, q.qemuConfig.Devices, v)
	case config.BlockDrive:
		q.qemuConfig.Devices, err = q.arch.appendBlockDevice(ctx, q.qemuConfig.Devices, v)
	case config.VhostUserDeviceAttrs:
		q.qemuConfig.Devices, err = q.arch.appendVhostUserDevice(ctx, q.qemuConfig.Devices, v)
	case config.VFIODev:
		q.qemuConfig.Devices = q.arch.appendVFIODevice(q.qemuConfig.Devices, v)
	default:
		q.Logger().WithField("dev-type", v).Warn("Could not append device: unsupported device type")
	}

	return err
}

// GetVMConsole builds the path of the console where we can read logs coming
// from the sandbox.
func (q *qemu) GetVMConsole(ctx context.Context, id string) (string, string, error) {
	span, _ := katatrace.Trace(ctx, q.Logger(), "GetVMConsole", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	consoleURL, err := utils.BuildSocketPath(q.config.VMStorePath, id, consoleSocket)
	if err != nil {
		return consoleProtoUnix, "", err
	}

	return consoleProtoUnix, consoleURL, nil
}

func (q *qemu) SaveVM() error {
	q.Logger().Info("Save sandbox")

	if err := q.qmpSetup(); err != nil {
		return err
	}

	// BootToBeTemplate sets the VM up as a template from which other VMs can
	// clone. We want to bypass shared memory when saving the VM to a local
	// file through migration exec.
	if q.config.BootToBeTemplate {
		err := q.arch.setIgnoreSharedMemoryMigrationCaps(q.qmpMonitorCh.ctx, q.qmpMonitorCh.qmp)
		if err != nil {
			q.Logger().WithError(err).Error("set migration ignore shared memory")
			return err
		}
	}

	err := q.qmpMonitorCh.qmp.ExecSetMigrateArguments(q.qmpMonitorCh.ctx, fmt.Sprintf("%s>%s", qmpExecCatCmd, q.config.DevicesStatePath))
	if err != nil {
		q.Logger().WithError(err).Error("exec migration")
		return err
	}

	return q.waitMigration()
}
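// waitMigration polls the migration status over QMP until it reports
// "completed", giving up after qmpMigrationWaitTimeout.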
func (q *qemu) waitMigration() error {
	t := time.NewTimer(qmpMigrationWaitTimeout)
	defer t.Stop()
	for {
		status, err := q.qmpMonitorCh.qmp.ExecuteQueryMigration(q.qmpMonitorCh.ctx)
		if err != nil {
			q.Logger().WithError(err).Error("failed to query migration status")
			return err
		}
		if status.Status == "completed" {
			break
		}

		select {
		case <-t.C:
			q.Logger().WithField("migration-status", status).Error("timeout waiting for qemu migration")
			return fmt.Errorf("timed out after %v waiting for qemu migration", qmpMigrationWaitTimeout)
		default:
			// migration in progress
			q.Logger().WithField("migration-status", status).Debug("migration in progress")
			time.Sleep(100 * time.Millisecond)
		}
	}

	return nil
}

func (q *qemu) Disconnect(ctx context.Context) {
	span, _ := katatrace.Trace(ctx, q.Logger(), "Disconnect", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	q.qmpShutdown()
}

func (q *qemu) GetTotalMemoryMB(ctx context.Context) uint32 {
	return q.config.MemorySize + uint32(q.state.HotpluggedMemory)
}
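// Example (illustrative only; the sizes below are made up): a caller that
// wants to grow a guest currently at 2048 MiB to 4096 MiB calls
// ResizeMemory(ctx, 4096, memoryBlockSizeMB, false) and reads the updated
// total back from the first return value.
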
// ResizeMemory handles a request to update the VM memory to reqMemMB.
// The update is managed with two approaches:
// Add memory to the VM:
// when memory needs to be added, we hotplug memory.
// Remove memory from the VM / return memory to the host:
// memory unplug can be slow and cannot be guaranteed; additionally, unplug
// granularity is coarse: the memory to remove has to be at least the size of
// one slot. To return memory back we therefore resize the VM memory balloon.
// A longer term solution is to evaluate solutions like virtio-mem.
func (q *qemu) ResizeMemory(ctx context.Context, reqMemMB uint32, memoryBlockSizeMB uint32, probe bool) (uint32, MemoryDevice, error) {
	currentMemory := q.GetTotalMemoryMB(ctx)
	if err := q.qmpSetup(); err != nil {
		return 0, MemoryDevice{}, err
	}
	var addMemDevice MemoryDevice
	if q.config.VirtioMem && currentMemory != reqMemMB {
		q.Logger().WithField("hotplug", "memory").Debugf("resize memory from %dMB to %dMB", currentMemory, reqMemMB)

		sizeByte := uint64(reqMemMB - q.config.MemorySize)
		sizeByte = sizeByte * 1024 * 1024
		err := q.qmpMonitorCh.qmp.ExecQomSet(q.qmpMonitorCh.ctx, "virtiomem0", "requested-size", sizeByte)
		if err != nil {
			return 0, MemoryDevice{}, err
		}
		q.state.HotpluggedMemory = int(sizeByte / 1024 / 1024)
		return reqMemMB, MemoryDevice{}, nil
	}

	switch {
	case currentMemory < reqMemMB:
		// hotplug
		addMemMB := reqMemMB - currentMemory
		if currentMemory+addMemMB > uint32(q.config.DefaultMaxMemorySize) {
			addMemMB = uint32(q.config.DefaultMaxMemorySize) - currentMemory
		}
		memHotplugMB, err := calcHotplugMemMiBSize(addMemMB, memoryBlockSizeMB)
		if err != nil {
			return currentMemory, MemoryDevice{}, err
		}

		addMemDevice.SizeMB = int(memHotplugMB)
		addMemDevice.Probe = probe

		data, err := q.HotplugAddDevice(ctx, &addMemDevice, MemoryDev)
		if err != nil {
			return currentMemory, addMemDevice, err
		}
		memoryAdded, ok := data.(int)
		if !ok {
			return currentMemory, addMemDevice, fmt.Errorf("Could not get the memory added, got %+v", data)
		}
		currentMemory += uint32(memoryAdded)
	case currentMemory > reqMemMB:
		// hotunplug
		addMemMB := currentMemory - reqMemMB
		memHotunplugMB, err := calcHotplugMemMiBSize(addMemMB, memoryBlockSizeMB)
		if err != nil {
			return currentMemory, MemoryDevice{}, err
		}

		addMemDevice.SizeMB = int(memHotunplugMB)
		addMemDevice.Probe = probe

		data, err := q.HotplugRemoveDevice(ctx, &addMemDevice, MemoryDev)
		if err != nil {
			return currentMemory, addMemDevice, err
		}
		memoryRemoved, ok := data.(int)
		if !ok {
			return currentMemory, addMemDevice, fmt.Errorf("Could not get the memory removed, got %+v", data)
		}
		// FIXME: This checks that HotplugRemoveDevice reported 0, as memory
		// hot-remove is not supported. If it is ever implemented, this
		// validation should be removed.
		if memoryRemoved != 0 {
			return currentMemory, addMemDevice, fmt.Errorf("memory hot unplug is not supported, something went wrong")
		}
		currentMemory -= uint32(memoryRemoved)
	}

	// currentMemory is the current (updated) memory of the VM; return it to
	// the caller so the current VM memory state can be verified.
	return currentMemory, addMemDevice, nil
}

// genericAppendBridges appends to devices the given bridges
// nolint: unused, deadcode
func genericAppendBridges(devices []govmmQemu.Device, bridges []types.Bridge, machineType string) []govmmQemu.Device {
	bus := defaultPCBridgeBus
	switch machineType {
	case QemuQ35, QemuVirt:
		bus = defaultBridgeBus
	}

	for idx, b := range bridges {
		t := govmmQemu.PCIBridge
		if b.Type == types.PCIE {
			t = govmmQemu.PCIEBridge
		}
		if b.Type == types.CCW {
			continue
		}

		bridges[idx].Addr = bridgePCIStartAddr + idx

		devices = append(devices,
			govmmQemu.BridgeDevice{
				Type: t,
				Bus:  bus,
				ID:   b.ID,
				// Each bridge is required to be assigned a unique chassis id > 0
				Chassis: idx + 1,
				SHPC:    false,
				Addr:    strconv.FormatInt(int64(bridges[idx].Addr), 10),
				// Certain guest BIOS versions think !SHPC means no hotplug,
				// and won't reserve the IO and memory windows that will be
				// needed for devices added underneath this bridge. This will
				// only break for certain combinations of exact qemu, BIOS and
				// guest kernel versions, but for consistency, just hint the
				// usual default windows for a bridge (as the BIOS would use
				// with SHPC) so that we can do ACPI hotplug.
				IOReserve:     "4k",
				MemReserve:    "1m",
				Pref64Reserve: "1m",
			},
		)
	}

	return devices
}
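// genericBridges creates the requested number of bridges of the bus type
// appropriate for the given machine type: PCI for q35, virt and pseries,
// CCW for s390x.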
func genericBridges(number uint32, machineType string) []types.Bridge {
	var bridges []types.Bridge
	var bt types.Type

	switch machineType {
	case QemuQ35:
		// currently only pci bridges are supported
		// qemu-2.10 will introduce pcie bridges
		bt = types.PCI
	case QemuVirt:
		bt = types.PCI
	case QemuPseries:
		bt = types.PCI
	case QemuCCWVirtio:
		bt = types.CCW
	default:
		return nil
	}

	for i := uint32(0); i < number; i++ {
		bridges = append(bridges, types.NewBridge(bt, fmt.Sprintf("%s-bridge-%d", bt, i), make(map[uint32]string), 0))
	}

	return bridges
}

// nolint: unused, deadcode
func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOffset uint64) govmmQemu.Memory {
	// the image NVDIMM device needs 1024MB of memory space
	// See https://github.com/clearcontainers/runtime/issues/380
	memoryOffset += 1024

	memMax := fmt.Sprintf("%dM", hostMemoryMb+memoryOffset)

	mem := fmt.Sprintf("%dM", memoryMb)

	memory := govmmQemu.Memory{
		Size:   mem,
		Slots:  slots,
		MaxMem: memMax,
	}

	return memory
}

// genericAppendPCIeRootPort appends to devices the given pcie-root-port
func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
	var (
		bus           string
		chassis       string
		multiFunction bool
		addr          string
	)
	switch machineType {
	case QemuQ35, QemuVirt:
		bus = defaultBridgeBus
		chassis = "0"
		multiFunction = false
		addr = "0"
	default:
		return devices
	}

	for i := uint32(0); i < number; i++ {
		devices = append(devices,
			govmmQemu.PCIeRootPortDevice{
				ID:            fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, i),
				Bus:           bus,
				Chassis:       chassis,
				Slot:          strconv.FormatUint(uint64(i), 10),
				Multifunction: multiFunction,
				Addr:          addr,
				MemReserve:    fmt.Sprintf("%dB", memSize32bit),
				Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
			},
		)
	}

	return devices
}
// golangci-lint enforces multi-line comments to be a block comment,
// not multiple single-line comments ...
/*                       pcie.0 bus
// -------------------------------------------------
//                        |
//                  -------------
//                  | Root Port |
//                  -------------
// -------------------------|------------------------
// |                -----------------                |
// |  PCI Express   | Upstream Port |                |
// |  Switch        -----------------                |
// |                 |             |                 |
// |  -------------------   -------------------     |
// |  | Downstream Port |   | Downstream Port |     |
// |  -------------------   -------------------     |
// -------------|-----------------------|------------
//        -------------          --------------
//        | GPU/ACCEL |          | IB/ETH NIC |
//        -------------          --------------
*/

// genericAppendPCIeSwitchPort appends a PCIe switch (a root port, an upstream
// port and the requested number of downstream ports) to devices.
func genericAppendPCIeSwitchPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
	// Q35 and Virt have the correct PCIe support,
	// hence ignore all other machines
	if machineType != QemuQ35 && machineType != QemuVirt {
		return devices
	}

	// Use a dedicated ID for this root port so we do not clash with already
	// existing root ports, prepending the switch prefix.
	pcieRootPort := govmmQemu.PCIeRootPortDevice{
		ID:            fmt.Sprintf("%s%s%d", config.PCIeSwitchPortPrefix, config.PCIeRootPortPrefix, 0),
		Bus:           defaultBridgeBus,
		Chassis:       "1",
		Slot:          strconv.FormatUint(uint64(0), 10),
		Multifunction: false,
		Addr:          "0",
		MemReserve:    fmt.Sprintf("%dB", memSize32bit),
		Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
	}
	devices = append(devices, pcieRootPort)

	pcieSwitchUpstreamPort := govmmQemu.PCIeSwitchUpstreamPortDevice{
		ID:  fmt.Sprintf("%s%d", config.PCIeSwitchUpstreamPortPrefix, 0),
		Bus: pcieRootPort.ID,
	}
	devices = append(devices, pcieSwitchUpstreamPort)

	currentChassis, err := strconv.Atoi(pcieRootPort.Chassis)
	if err != nil {
		return devices
	}
	nextChassis := currentChassis + 1

	for i := uint32(0); i < number; i++ {
		pcieSwitchDownstreamPort := govmmQemu.PCIeSwitchDownstreamPortDevice{
			ID:      fmt.Sprintf("%s%d", config.PCIeSwitchhDownstreamPortPrefix, i),
			Bus:     pcieSwitchUpstreamPort.ID,
			Chassis: fmt.Sprintf("%d", nextChassis),
			Slot:    strconv.FormatUint(uint64(i), 10),
			// TODO: MemReserve:    fmt.Sprintf("%dB", memSize32bit),
			// TODO: Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
		}
		devices = append(devices, pcieSwitchDownstreamPort)
	}

	return devices
}

func (q *qemu) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
	span, _ := katatrace.Trace(ctx, q.Logger(), "GetThreadIDs", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	tid := VcpuThreadIDs{}
	if err := q.qmpSetup(); err != nil {
		return tid, err
	}

	cpuInfos, err := q.qmpMonitorCh.qmp.ExecQueryCpusFast(q.qmpMonitorCh.ctx)
	if err != nil {
		q.Logger().WithError(err).Error("failed to query cpu infos")
		return tid, err
	}

	tid.vcpus = make(map[int]int, len(cpuInfos))
	for _, i := range cpuInfos {
		if i.ThreadID > 0 {
			tid.vcpus[i.CPUIndex] = i.ThreadID
		}
	}
	return tid, nil
}

// calcHotplugMemMiBSize rounds mem up to a multiple of the memory section
// size, e.g. mem=300 with memorySectionSizeMB=128 yields 384.
func calcHotplugMemMiBSize(mem uint32, memorySectionSizeMB uint32) (uint32, error) {
	if memorySectionSizeMB == 0 {
		return mem, nil
	}

	return uint32(math.Ceil(float64(mem)/float64(memorySectionSizeMB))) * memorySectionSizeMB, nil
}

func (q *qemu) ResizeVCPUs(ctx context.Context, reqVCPUs uint32) (currentVCPUs uint32, newVCPUs uint32, err error) {
	currentVCPUs = q.config.NumVCPUs() + uint32(len(q.state.HotpluggedVCPUs))
	newVCPUs = currentVCPUs

	switch {
	case currentVCPUs < reqVCPUs:
		// hotplug
		addCPUs := reqVCPUs - currentVCPUs
		data, err := q.HotplugAddDevice(ctx, addCPUs, CpuDev)
		if err != nil {
			return currentVCPUs, newVCPUs, err
		}
		vCPUsAdded, ok := data.(uint32)
		if !ok {
			return currentVCPUs, newVCPUs, fmt.Errorf("Could not get the vCPUs added, got %+v", data)
		}
		newVCPUs += vCPUsAdded
	case currentVCPUs > reqVCPUs:
		// hotunplug
		removeCPUs := currentVCPUs - reqVCPUs
		data, err := q.HotplugRemoveDevice(ctx, removeCPUs, CpuDev)
		if err != nil {
			return currentVCPUs, newVCPUs, err
		}
		vCPUsRemoved, ok := data.(uint32)
		if !ok {
			return currentVCPUs, newVCPUs, fmt.Errorf("Could not get the vCPUs removed, got %+v", data)
		}
		newVCPUs -= vCPUsRemoved
	}

	return currentVCPUs, newVCPUs, nil
}
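// Cleanup closes any file descriptors still held for the QEMU process and
// clears the list.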
func (q *qemu) Cleanup(ctx context.Context) error {
	span, _ := katatrace.Trace(ctx, q.Logger(), "Cleanup", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	for _, fd := range q.fds {
		if err := fd.Close(); err != nil {
			q.Logger().WithError(err).Warn("failed closing fd")
		}
	}
	q.fds = []*os.File{}

	return nil
}

func (q *qemu) GetPids() []int {
	data, err := os.ReadFile(q.qemuConfig.PidFile)
	if err != nil {
		q.Logger().WithError(err).Error("Could not read qemu pid file")
		return []int{0}
	}

	pid, err := strconv.Atoi(strings.Trim(string(data), "\n\t "))
	if err != nil {
		q.Logger().WithError(err).Error("Could not convert string to int")
		return []int{0}
	}

	pids := []int{pid}
	if q.state.VirtiofsDaemonPid != 0 {
		pids = append(pids, q.state.VirtiofsDaemonPid)
	}

	return pids
}

func (q *qemu) GetVirtioFsPid() *int {
	return &q.state.VirtiofsDaemonPid
}

type qemuGrpc struct {
	ID             string
	QmpChannelpath string
	State          QemuState
	NvdimmCount    int

	// Most members of q.qemuConfig are only used to generate
	// q.qemuConfig.qemuParams, which is consumed by LaunchQemu; the exception
	// is q.qemuConfig.SMP, so just transport q.qemuConfig.SMP from the
	// VM Cache server to the runtime.
	QemuSMP govmmQemu.SMP
}

func (q *qemu) fromGrpc(ctx context.Context, hypervisorConfig *HypervisorConfig, j []byte) error {
	var qp qemuGrpc
	err := json.Unmarshal(j, &qp)
	if err != nil {
		return err
	}

	q.id = qp.ID
	q.config = *hypervisorConfig
	q.qmpMonitorCh.ctx = ctx
	q.qmpMonitorCh.path = qp.QmpChannelpath
	q.qemuConfig.Ctx = ctx
	q.state = qp.State
	q.arch, err = newQemuArch(q.config)
	if err != nil {
		return err
	}
	q.ctx = ctx
	q.nvdimmCount = qp.NvdimmCount

	q.qemuConfig.SMP = qp.QemuSMP

	q.arch.setBridges(q.state.Bridges)
	return nil
}

func (q *qemu) toGrpc(ctx context.Context) ([]byte, error) {
	q.qmpShutdown()

	q.Cleanup(ctx)
	qp := qemuGrpc{
		ID:             q.id,
		QmpChannelpath: q.qmpMonitorCh.path,
		State:          q.state,
		NvdimmCount:    q.nvdimmCount,
		QemuSMP:        q.qemuConfig.SMP,
	}

	return json.Marshal(&qp)
}

func (q *qemu) Save() (s hv.HypervisorState) {
	// If QEMU isn't even running, there isn't any state to save
	if atomic.LoadInt32(&q.stopped) != 0 {
		return
	}

	pids := q.GetPids()
	if len(pids) != 0 {
		s.Pid = pids[0]
	}
	s.VirtiofsDaemonPid = q.state.VirtiofsDaemonPid
	s.Type = string(QemuHypervisor)
	s.UUID = q.state.UUID
	s.HotpluggedMemory = q.state.HotpluggedMemory

	for _, bridge := range q.arch.getBridges() {
		s.Bridges = append(s.Bridges, hv.Bridge{
			DeviceAddr: bridge.Devices,
			Type:       string(bridge.Type),
			ID:         bridge.ID,
			Addr:       bridge.Addr,
		})
	}

	for _, cpu := range q.state.HotpluggedVCPUs {
		s.HotpluggedVCPUs = append(s.HotpluggedVCPUs, hv.CPUDevice{
			ID: cpu.ID,
		})
	}
	return
}

func (q *qemu) Load(s hv.HypervisorState) {
	q.state.UUID = s.UUID
	q.state.HotpluggedMemory = s.HotpluggedMemory
	q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid

	for _, bridge := range s.Bridges {
		q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr))
	}

	for _, cpu := range s.HotpluggedVCPUs {
		q.state.HotpluggedVCPUs = append(q.state.HotpluggedVCPUs, hv.CPUDevice{
			ID: cpu.ID,
		})
	}
}
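// Check probes the VM over QMP and reports an error if QEMU has stopped or
// if the guest is in the "internal-error" or "guest-panicked" state.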
func (q *qemu) Check() error {
	if atomic.LoadInt32(&q.stopped) != 0 {
		return fmt.Errorf("qemu is not running")
	}

	q.memoryDumpFlag.Lock()
	defer q.memoryDumpFlag.Unlock()

	if err := q.qmpSetup(); err != nil {
		return err
	}

	status, err := q.qmpMonitorCh.qmp.ExecuteQueryStatus(q.qmpMonitorCh.ctx)
	if err != nil {
		return err
	}

	if status.Status == "internal-error" || status.Status == "guest-panicked" {
		return errors.Errorf("guest failure: %s", status.Status)
	}

	return nil
}

func (q *qemu) GenerateSocket(id string) (interface{}, error) {
	return generateVMSocket(id, q.config.VMStorePath)
}

func (q *qemu) IsRateLimiterBuiltin() bool {
	return false
}