kata-containers/src/runtime/virtcontainers/qemu.go
Xynnn007 91bb6b7c34 runtime: add support for io.katacontainers.config.runtime.cc_init_data
io.katacontainers.config.runtime.cc_init_data specifies the initdata used by
the pod, in base64(gzip(initdata toml)) format. The initdata will be
encapsulated into an initdata image and mounted into the guest as a raw
block device.

The initdata image will be aligned with 512 bytes, which is chosen as a
usual sector size supported by different hypervisors like qemu, clh and
dragonball.

Note that this patch only adds support for qemu hypervisor.

Signed-off-by: Xynnn007 <xynnn@linux.alibaba.com>
2025-04-15 16:35:59 +08:00

3020 lines
88 KiB
Go

//go:build linux
// Copyright (c) 2016 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
package virtcontainers
import (
"bufio"
"bytes"
"compress/gzip"
"context"
"encoding/binary"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"math"
"net"
"os"
"os/exec"
"os/user"
"path/filepath"
"regexp"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"unsafe"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless"
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
pkgUtils "github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/uuid"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
)
// qemuTracingTags defines the tags attached to every trace span created
// by this hypervisor implementation.
var qemuTracingTags = map[string]string{
	"source":    "runtime",
	"package":   "virtcontainers",
	"subsystem": "hypervisor",
	"type":      "qemu",
}
// romFile is the file name of the ROM that can be used for virtio-pci devices.
// If this file name is empty, this means we expect the firmware used by Qemu,
// such as SeaBIOS or OVMF for instance, to handle this directly.
const romFile = ""

// disable-modern is an option to QEMU that will fall back to using 0.9 version
// of virtio. Since moving to QEMU4.0, we can start using virtio 1.0 version.
// Default value is false.
const defaultDisableModern = false
// qmpChannel bundles a QMP connection to the QEMU process together with
// its context and socket path. The embedded Mutex serializes QMP access.
type qmpChannel struct {
	qmp *govmmQemu.QMP // live QMP connection; nil until established
	ctx context.Context
	// disconn is a disconnection notification channel — presumably closed
	// when the QMP connection drops; confirm against qmpSetup elsewhere
	// in this file.
	disconn chan struct{}
	path    string // unix socket path of the QMP monitor
	sync.Mutex
}
// QemuState keeps Qemu's state. It is persisted/restored across runtime
// invocations, so field names form part of the on-disk contract.
type QemuState struct {
	UUID string
	// NOTE(review): HotPlugVFIO and HotplugVFIO below differ only in
	// capitalization — confirm which one is authoritative before removing
	// either; both may be referenced (or serialized) elsewhere.
	HotPlugVFIO       config.PCIePort
	Bridges           []types.Bridge
	HotpluggedVCPUs   []hv.CPUDevice
	HotpluggedMemory  int // in MiB, presumably — confirm against resize code
	VirtiofsDaemonPid int // 0 means the virtiofs daemon is not running
	HotplugVFIO       config.PCIePort
	ColdPlugVFIO      config.PCIePort
	PCIeRootPort      uint32
	PCIeSwitchPort    uint32
}
// qemu is an Hypervisor interface implementation for the Linux qemu hypervisor.
// nolint: govet
type qemu struct {
	arch           qemuArch       // architecture-specific behavior
	virtiofsDaemon VirtiofsDaemon // shared-fs daemon (virtiofsd or nydusd)
	ctx            context.Context
	// fds is a list of file descriptors inherited by QEMU process
	// they'll be closed once QEMU process is running
	fds          []*os.File
	id           string // sandbox ID
	state        QemuState
	qmpMonitorCh qmpChannel
	qemuConfig   govmmQemu.Config
	config       HypervisorConfig
	// if in memory dump progress
	memoryDumpFlag sync.Mutex
	nvdimmCount    int
	stopped        int32 // accessed atomically, presumably — confirm in StopVM
	mu             sync.Mutex
}
const (
	// Well-known socket file names created under the per-VM store directory.
	consoleSocket      = "console.sock"
	qmpSocket          = "qmp.sock"
	extraMonitorSocket = "extra-monitor.sock"
	vhostFSSocket      = "vhost-fs.sock"
	nydusdAPISock      = "nydusd-api.sock"

	// memory dump format will be set to elf
	memoryDumpFormat = "elf"

	qmpCapErrMsg  = "Failed to negotiate QMP Capabilities"
	qmpExecCatCmd = "exec:cat"

	scsiControllerID         = "scsi0"
	rngID                    = "rng0"
	fallbackFileBackedMemDir = "/dev/shm"

	// Grace period given to QEMU when stopping a sandbox.
	qemuStopSandboxTimeoutSecs = 15

	// QOM path prefix for hot-plugged peripherals.
	qomPathPrefix = "/machine/peripheral/"
)
// agnostic list of kernel parameters, applied to every architecture.
// panic=1 makes the guest kernel reboot one second after a panic instead
// of hanging.
var defaultKernelParameters = []Param{
	{"panic", "1"},
}
// qmpLogger adapts a logrus entry to the leveled-logging interface
// expected by the govmm QMP package.
type qmpLogger struct {
	logger *logrus.Entry
}

// newQMPLogger returns a qmpLogger writing to the hypervisor logger
// under the "qmp" subsystem field.
func newQMPLogger() qmpLogger {
	return qmpLogger{
		logger: hvLogger.WithField("subsystem", "qmp"),
	}
}

// V reports whether verbose logging is enabled at the given level;
// only level 0 is treated as disabled.
func (l qmpLogger) V(level int32) bool {
	return level != 0
}

// Infof logs an informational message.
func (l qmpLogger) Infof(format string, v ...interface{}) {
	l.logger.Infof(format, v...)
}

// Warningf logs a warning message.
func (l qmpLogger) Warningf(format string, v ...interface{}) {
	l.logger.Warnf(format, v...)
}

// Errorf logs an error message.
func (l qmpLogger) Errorf(format string, v ...interface{}) {
	l.logger.Errorf(format, v...)
}
// Logger returns a logrus logger appropriate for logging qemu messages.
func (q *qemu) Logger() *logrus.Entry {
	return hvLogger.WithField("subsystem", "qemu")
}
// kernelParameters assembles the guest kernel command line. Ordering
// matters: architecture parameters come first, then the agnostic
// defaults, the vCPU limit and the SELinux switch, and finally the
// user-configured parameters — the kernel honours the last value set,
// so config-provided parameters take priority over the defaults.
func (q *qemu) kernelParameters() string {
	params := q.arch.kernelParameters(q.config.Debug)
	params = append(params, defaultKernelParameters...)

	// Cap the number of vCPUs the guest kernel will bring up.
	params = append(params, Param{"nr_cpus", fmt.Sprintf("%d", q.config.DefaultMaxVCPUs)})

	// Set the SELinux params in accordance with the runtime configuration,
	// disable_guest_selinux.
	selinuxValue := "1"
	selinuxMsg := "Set selinux=1 to kernel params because SELinux on the guest is enabled"
	if q.config.DisableGuestSeLinux {
		selinuxValue = "0"
		selinuxMsg = "Set selinux=0 to kernel params because SELinux on the guest is disabled"
	}
	q.Logger().Info(selinuxMsg)
	params = append(params, Param{"selinux", selinuxValue})

	params = append(params, q.config.KernelParams...)

	return strings.Join(SerializeParams(params, "="), " ")
}
// Capabilities returns all capabilities supported by the qemu
// implementation of the hypervisor interface, as reported by the
// architecture backend for the current configuration.
func (q *qemu) Capabilities(ctx context.Context) types.Capabilities {
	span, _ := katatrace.Trace(ctx, q.Logger(), "Capabilities", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()
	return q.arch.capabilities(q.config)
}
// HypervisorConfig returns a copy of the current hypervisor configuration.
func (q *qemu) HypervisorConfig() HypervisorConfig {
	return q.config
}
// qemuPath resolves the QEMU binary path: the configured hypervisor
// asset path when set, otherwise the architecture default. An error is
// returned when the resolved path does not exist on disk.
func (q *qemu) qemuPath() (string, error) {
	path, err := q.config.HypervisorAssetPath()
	if err != nil {
		return "", err
	}
	if path == "" {
		path = q.arch.qemuPath()
	}
	// Only a missing file is treated as fatal here; other Stat failures
	// (e.g. permission errors) are deliberately left for QEMU launch to
	// surface.
	if _, statErr := os.Stat(path); os.IsNotExist(statErr) {
		return "", fmt.Errorf("QEMU path (%s) does not exist", path)
	}
	return path, nil
}
// setup sets the Qemu structure up: it stores the configuration,
// instantiates the architecture backend, decides on nvdimm usage,
// initializes fresh sandbox state (UUID, bridges, VFIO ports) when none
// exists yet, and toggles nesting/vhost-net according to the host.
func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *HypervisorConfig) error {
	span, _ := katatrace.Trace(ctx, q.Logger(), "setup", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	if err := q.setConfig(hypervisorConfig); err != nil {
		return err
	}

	q.id = id

	var err error
	q.arch, err = newQemuArch(q.config)
	if err != nil {
		return err
	}

	initrdPath, err := q.config.InitrdAssetPath()
	if err != nil {
		return err
	}
	imagePath, err := q.config.ImageAssetPath()
	if err != nil {
		return err
	}
	// An nvdimm is only used to back a rootfs image (not an initrd), and
	// only when not explicitly disabled.
	if initrdPath == "" && imagePath != "" && !q.config.DisableImageNvdimm {
		q.nvdimmCount = 1
	} else {
		q.nvdimmCount = 0
	}

	// An empty UUID means this sandbox has no persisted state yet and
	// must be created from scratch.
	var create bool
	if q.state.UUID == "" {
		create = true
	}

	q.arch.setBridges(q.state.Bridges)
	q.arch.setPFlash(q.config.PFlash)

	if create {
		q.Logger().Debug("Creating bridges")
		q.arch.bridges(q.config.DefaultBridges)

		q.Logger().Debug("Creating UUID")
		q.state.UUID = uuid.Generate().String()

		q.state.HotPlugVFIO = q.config.HotPlugVFIO
		q.state.ColdPlugVFIO = q.config.ColdPlugVFIO
		q.state.PCIeRootPort = q.config.PCIeRootPort
		q.state.PCIeSwitchPort = q.config.PCIeSwitchPort

		// The path might already exist, but in case of VM templating,
		// we have to create it since the sandbox has not created it yet.
		if err = utils.MkdirAllWithInheritedOwner(filepath.Join(q.config.RunStorePath, id), DirMode); err != nil {
			return err
		}
	}

	nested, err := RunningOnVMM(procCPUInfo)
	if err != nil {
		return err
	}
	if !q.config.DisableNestingChecks && nested {
		q.arch.enableNestingChecks()
	} else {
		q.Logger().WithField("inside-vm", fmt.Sprintf("%t", nested)).Debug("Disable nesting environment checks")
		q.arch.disableNestingChecks()
	}

	if !q.config.DisableVhostNet {
		q.arch.enableVhostNet()
	} else {
		q.Logger().Debug("Disable vhost_net")
		q.arch.disableVhostNet()
	}

	return nil
}
// cpuTopology returns the SMP topology derived from the configured
// boot-time and maximum vCPU counts.
func (q *qemu) cpuTopology() govmmQemu.SMP {
	return q.arch.cpuTopology(q.config.NumVCPUs(), q.config.DefaultMaxVCPUs)
}
// memoryTopology returns the guest memory topology built from the
// configured boot memory, maximum memory and slot count. The error
// return is currently always nil; it is kept, presumably, for interface
// symmetry with other topology helpers — confirm before removing.
func (q *qemu) memoryTopology() (govmmQemu.Memory, error) {
	hostMemMb := q.config.DefaultMaxMemorySize
	memMb := uint64(q.config.MemorySize)
	return q.arch.memoryTopology(memMb, hostMemMb, uint8(q.config.MemSlots)), nil
}
// qmpSocketPath returns the QMP monitor socket path for the given VM id.
func (q *qemu) qmpSocketPath(id string) (string, error) {
	return utils.BuildSocketPath(q.config.VMStorePath, id, qmpSocket)
}

// extraMonitorSocketPath returns the debug-only extra monitor socket
// path for the given VM id.
func (q *qemu) extraMonitorSocketPath(id string) (string, error) {
	return utils.BuildSocketPath(q.config.VMStorePath, id, extraMonitorSocket)
}
// getQemuMachine returns the architecture machine definition with any
// user-configured machine accelerators appended to its option string.
func (q *qemu) getQemuMachine() (govmmQemu.Machine, error) {
	machine := q.arch.machine()

	if accel := q.config.MachineAccelerators; accel != "" {
		// Normalize to exactly one leading comma separator before
		// appending to the existing options.
		machine.Options += "," + strings.TrimPrefix(accel, ",")
	}

	return machine, nil
}
// createQmpSocket initializes q.qmpMonitorCh and returns the QMP socket
// definitions to pass to QEMU: the primary QMP monitor, plus an optional
// extra monitor socket that is only enabled in debug configurations.
func (q *qemu) createQmpSocket() ([]govmmQemu.QMPSocket, error) {
	monitorSockPath, err := q.qmpSocketPath(q.id)
	if err != nil {
		return nil, err
	}

	q.qmpMonitorCh = qmpChannel{
		ctx:  q.ctx,
		path: monitorSockPath,
	}

	// Note: the primary socket carries no Name; its fd is wired up later
	// (see setupEarlyQmpConnection) before QEMU is launched.
	var sockets []govmmQemu.QMPSocket
	sockets = append(sockets, govmmQemu.QMPSocket{
		Type:     "unix",
		Protocol: govmmQemu.Qmp,
		Server:   true,
		NoWait:   true,
	})

	// The extra monitor socket allows an external user to take full
	// control on Qemu and silently break the VM in all possible ways.
	// It should only ever be used for debugging purposes, hence the
	// check on Debug.
	if q.HypervisorConfig().Debug && q.config.ExtraMonitorSocket != "" {
		extraMonitorSockPath, err := q.extraMonitorSocketPath(q.id)
		if err != nil {
			return nil, err
		}
		sockets = append(sockets, govmmQemu.QMPSocket{
			Type:     "unix",
			Protocol: q.config.ExtraMonitorSocket,
			Name:     extraMonitorSockPath,
			Server:   true,
			NoWait:   true,
		})
		q.Logger().Warn("QEMU configured to start with an untrusted monitor")
	}

	return sockets, nil
}
// buildInitdataDevice appends a raw-format virtio-blk device (ID
// "initdata") backed by the given initdata image file, so the guest can
// read the initdata from a plain block device.
//
// The parameter is renamed from the original exported-style
// "InitdataImage" to idiomatic lowerCamelCase; parameter names are not
// part of the call syntax in Go, so callers are unaffected.
func (q *qemu) buildInitdataDevice(devices []govmmQemu.Device, initdataImage string) []govmmQemu.Device {
	device := govmmQemu.BlockDevice{
		Driver:    govmmQemu.VirtioBlock,
		Transport: govmmQemu.TransportPCI,
		ID:        "initdata",
		File:      initdataImage,
		SCSI:      false,
		WCE:       false,
		AIO:       govmmQemu.Threads,
		Interface: "none",
		Format:    "raw",
	}

	return append(devices, device)
}
// buildDevices assembles the base QEMU device list (bridges, console,
// rootfs image or initrd, optional IOMMU/pvpanic/SCSI controller) and
// the kernel definition for the given kernel path. It returns the
// devices, an optional IO thread for the SCSI controller, and the
// kernel configuration.
func (q *qemu) buildDevices(ctx context.Context, kernelPath string) ([]govmmQemu.Device, *govmmQemu.IOThread, *govmmQemu.Kernel, error) {
	var devices []govmmQemu.Device

	kernel := &govmmQemu.Kernel{
		Path: kernelPath,
	}

	_, console, err := q.GetVMConsole(ctx, q.id)
	if err != nil {
		return nil, nil, nil, err
	}

	// Add bridges before any other devices. This way we make sure that
	// bridge gets the first available PCI address i.e bridgePCIStartAddr
	devices = q.arch.appendBridges(devices)

	devices, err = q.arch.appendConsole(ctx, devices, console)
	if err != nil {
		return nil, nil, nil, err
	}

	assetPath, assetType, err := q.config.ImageOrInitrdAssetPath()
	if err != nil {
		return nil, nil, nil, err
	}

	if assetType == types.ImageAsset {
		devices, err = q.arch.appendImage(ctx, devices, assetPath)
		if err != nil {
			return nil, nil, nil, err
		}
	} else if assetType == types.InitrdAsset {
		// InitrdAsset, need to set kernel initrd path
		kernel.InitrdPath = assetPath
	} else if assetType == types.SecureBootAsset {
		// SecureBootAsset, no need to set image or initrd path
		q.Logger().Info("For IBM Z Secure Execution, initrd path should not be set")
		kernel.InitrdPath = ""
	}

	if q.config.IOMMU {
		devices, err = q.arch.appendIOMMU(devices)
		if err != nil {
			return nil, nil, nil, err
		}
	}

	if q.config.IfPVPanicEnabled() {
		// there should have no errors for pvpanic device
		devices, _ = q.arch.appendPVPanicDevice(devices)
	}

	var ioThread *govmmQemu.IOThread
	if q.config.BlockDeviceDriver == config.VirtioSCSI {
		devices, ioThread, err = q.arch.appendSCSIController(ctx, devices, q.config.EnableIOThreads)
		if err != nil {
			return nil, nil, nil, err
		}
	}

	return devices, ioThread, kernel, nil
}
// setupTemplate configures the memory knobs and the migration incoming
// channel for VM templating. When the VM is booting to become a
// template, memory is shared; when booting from a template, incoming
// migration is deferred. Outside templating this is a no-op.
func (q *qemu) setupTemplate(knobs *govmmQemu.Knobs, memory *govmmQemu.Memory) govmmQemu.Incoming {
	var incoming govmmQemu.Incoming

	if !q.config.BootToBeTemplate && !q.config.BootFromTemplate {
		return incoming
	}

	knobs.FileBackedMem = true
	memory.Path = q.config.MemoryPath

	if q.config.BootToBeTemplate {
		knobs.MemShared = true
	}
	if q.config.BootFromTemplate {
		incoming.MigrationType = govmmQemu.MigrationDefer
	}

	return incoming
}
// setupFileBackedMem enables file-backed, shared guest memory rooted at
// the configured directory (falling back to /dev/shm). If the target
// directory does not exist the error is logged and the knobs are left
// untouched, matching the original best-effort behavior.
func (q *qemu) setupFileBackedMem(knobs *govmmQemu.Knobs, memory *govmmQemu.Memory) {
	target := fallbackFileBackedMemDir
	if dir := q.config.FileBackedMemRootDir; dir != "" {
		target = dir
	}

	if _, err := os.Stat(target); err != nil {
		q.Logger().WithError(err).Error("File backed memory location does not exist")
		return
	}

	knobs.FileBackedMem = true
	knobs.MemShared = true
	memory.Path = target
}
// setConfig stores a copy of the given hypervisor configuration on the
// qemu instance. The error return is currently always nil.
func (q *qemu) setConfig(config *HypervisorConfig) error {
	q.config = *config
	return nil
}
// createVirtiofsDaemon builds the shared-filesystem daemon for the
// sandbox: a nydusd instance when SharedFS is virtio-fs-nydus,
// otherwise a plain virtiofsd. The daemon is constructed but not
// started here.
func (q *qemu) createVirtiofsDaemon(sharedPath string) (VirtiofsDaemon, error) {
	virtiofsdSocketPath, err := q.vhostFSSocketPath(q.id)
	if err != nil {
		return nil, err
	}

	if q.config.SharedFS == config.VirtioFSNydus {
		apiSockPath, err := q.nydusdAPISocketPath(q.id)
		if err != nil {
			return nil, err
		}
		nd := &nydusd{
			path:        q.config.VirtioFSDaemon,
			sockPath:    virtiofsdSocketPath,
			apiSockPath: apiSockPath,
			sourcePath:  sharedPath,
			debug:       q.config.Debug,
			extraArgs:   q.config.VirtioFSExtraArgs,
			startFn:     startInShimNS,
		}
		nd.setupShareDirFn = nd.setupPassthroughFS
		return nd, nil
	}

	// Set the xattr option for virtiofsd daemon to enable extended attributes
	// in virtiofs if SELinux on the guest side is enabled.
	if !q.config.DisableGuestSeLinux {
		q.Logger().Info("Set the xattr option for virtiofsd")
		q.config.VirtioFSExtraArgs = append(q.config.VirtioFSExtraArgs, "--xattr")
	}

	// default use virtiofsd
	return &virtiofsd{
		path:       q.config.VirtioFSDaemon,
		sourcePath: sharedPath,
		socketPath: virtiofsdSocketPath,
		extraArgs:  q.config.VirtioFSExtraArgs,
		cache:      q.config.VirtioFSCache,
	}, nil
}
// prepareInitdataImage will create an image with a very simple layout
//
// There will be multiple sectors. The first 8 bytes are Magic number "initdata".
// Then a "length" field of 8 bytes follows (unsigned int64).
// Finally the gzipped initdata toml. The image will be padded to an
// integer multiple of the sector size for alignment.
//
// offset 0 8 16
// 0 'i' 'n' 'i' 't' 'd' 'a' 't' 'a' | gzip length in le |
// 16 gzip(initdata toml) ...
// (end of the last sector) '\0' paddings
func prepareInitdataImage(initdata string, imagePath string) error {
SectorSize := 512
var buf bytes.Buffer
gzipper := gzip.NewWriter(&buf)
defer gzipper.Close()
gzipper.Write([]byte(initdata))
err := gzipper.Close()
if err != nil {
return fmt.Errorf("failed to compress initdata: %v", err)
}
compressedInitdata := buf.Bytes()
compressedInitdataLength := len(compressedInitdata)
lengthBuffer := make([]byte, 8)
binary.LittleEndian.PutUint64(lengthBuffer, uint64(compressedInitdataLength))
paddingLength := (compressedInitdataLength+16+SectorSize-1)/SectorSize*SectorSize - (compressedInitdataLength + 16)
paddingBuffer := make([]byte, paddingLength)
file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_RDWR, 0640)
if err != nil {
return fmt.Errorf("failed to create initdata image: %v", err)
}
defer file.Close()
_, err = file.Write([]byte("initdata"))
if err != nil {
return fmt.Errorf("failed to write magic number to initdata image: %v", err)
}
_, err = file.Write(lengthBuffer)
if err != nil {
return fmt.Errorf("failed to write data length to initdata image: %v", err)
}
_, err = file.Write([]byte(compressedInitdata))
if err != nil {
return fmt.Errorf("failed to write compressed initdata to initdata image: %v", err)
}
_, err = file.Write(paddingBuffer)
if err != nil {
return fmt.Errorf("failed to write compressed initdata to initdata image: %v", err)
}
return nil
}
// prepareInitdataMount materializes config.Initdata as a block-device
// image under /run/kata-containers/shared/initdata/<sandbox-id>/data.img
// and records the path in config.InitdataImage. It is a no-op when no
// initdata is configured.
//
// Bug fix: the original built log entries with WithField(...).WithError(err)
// but never called a level method, so nothing was ever emitted; the
// errors are now actually logged before being propagated.
func (q *qemu) prepareInitdataMount(config *HypervisorConfig) error {
	if len(config.Initdata) == 0 {
		q.Logger().Info("No initdata provided. Skip prepare initdata device")
		return nil
	}

	q.Logger().Info("Start to prepare initdata")
	initdataWorkdir := filepath.Join("/run/kata-containers/shared/initdata", q.id)
	initdataImagePath := filepath.Join(initdataWorkdir, "data.img")

	if err := os.MkdirAll(initdataWorkdir, 0755); err != nil {
		q.Logger().WithField("initdata", "create initdata image path").WithError(err).Error("failed to create initdata workdir")
		return err
	}

	if err := prepareInitdataImage(config.Initdata, initdataImagePath); err != nil {
		q.Logger().WithField("initdata", "prepare initdata image").WithError(err).Error("failed to prepare initdata image")
		return err
	}

	config.InitdataImage = initdataImagePath

	return nil
}
// CreateVM is the Hypervisor VM creation implementation for govmmQemu.
// It runs setup, prepares the optional initdata image, assembles the
// full govmmQemu.Config (machine, topology, knobs, devices, sockets,
// firmware, PCIe layout) and finally creates the virtiofs daemon.
// Note: order matters throughout — several steps mutate knobs/memory or
// depend on earlier state (e.g. kernel params are computed only after
// all devices are built).
func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervisorConfig *HypervisorConfig) error {
	// Save the tracing context
	q.ctx = ctx

	span, ctx := katatrace.Trace(ctx, q.Logger(), "CreateVM", qemuTracingTags, map[string]string{"VM_ID": q.id})
	defer span.End()

	if err := q.setup(ctx, id, hypervisorConfig); err != nil {
		return err
	}

	if err := q.prepareInitdataMount(hypervisorConfig); err != nil {
		return err
	}

	machine, err := q.getQemuMachine()
	if err != nil {
		return err
	}

	smp := q.cpuTopology()

	memory, err := q.memoryTopology()
	if err != nil {
		return err
	}

	knobs := govmmQemu.Knobs{
		NoUserConfig:  true,
		NoDefaults:    true,
		NoGraphic:     true,
		NoReboot:      true,
		MemPrealloc:   q.config.MemPrealloc,
		HugePages:     q.config.HugePages,
		IOMMUPlatform: q.config.IOMMUPlatform,
	}

	incoming := q.setupTemplate(&knobs, &memory)

	// With the current implementations, VM templating will not work with file
	// based memory (stand-alone) or virtiofs. This is because VM templating
	// builds the first VM with file-backed memory and shared=on and the
	// subsequent ones with shared=off. virtio-fs always requires shared=on for
	// memory.
	if q.config.SharedFS == config.VirtioFS || q.config.SharedFS == config.VirtioFSNydus ||
		q.config.FileBackedMemRootDir != "" {
		if !(q.config.BootToBeTemplate || q.config.BootFromTemplate) {
			q.setupFileBackedMem(&knobs, &memory)
		} else {
			return errors.New("VM templating has been enabled with either virtio-fs or file backed memory and this configuration will not work")
		}
		if q.config.HugePages {
			knobs.MemPrealloc = true
		}
	}

	// Vhost-user-blk/scsi process which can improve performance, like SPDK,
	// requires shared-on hugepage to work with Qemu.
	if q.config.EnableVhostUserStore {
		if !q.config.HugePages {
			return errors.New("Vhost-user-blk/scsi is enabled without HugePages. This configuration will not work")
		}
		knobs.MemShared = true
	}

	rtc := govmmQemu.RTC{
		Base:     govmmQemu.UTC,
		Clock:    govmmQemu.Host,
		DriftFix: govmmQemu.Slew,
	}

	// The UUID is created by setup(); an empty one here indicates a
	// broken invariant.
	if q.state.UUID == "" {
		return fmt.Errorf("UUID should not be empty")
	}

	qmpSockets, err := q.createQmpSocket()
	if err != nil {
		return err
	}

	kernelPath, err := q.config.KernelAssetPath()
	if err != nil {
		return err
	}

	devices, ioThread, kernel, err := q.buildDevices(ctx, kernelPath)
	if err != nil {
		return err
	}

	cpuModel := q.arch.cpuModel()
	cpuModel += "," + q.config.CPUFeatures

	firmwarePath, err := q.config.FirmwareAssetPath()
	if err != nil {
		return err
	}

	firmwareVolumePath, err := q.config.FirmwareVolumeAssetPath()
	if err != nil {
		return err
	}

	pflash, err := q.arch.getPFlash()
	if err != nil {
		return err
	}

	qemuPath, err := q.qemuPath()
	if err != nil {
		return err
	}

	// Expose the initdata image to the guest as a raw block device.
	if len(hypervisorConfig.Initdata) > 0 {
		devices = q.buildInitdataDevice(devices, hypervisorConfig.InitdataImage)
	}

	// some devices configuration may also change kernel params, make sure this is called afterwards
	kernel.Params = q.kernelParameters()
	q.checkBpfEnabled()

	qemuConfig := govmmQemu.Config{
		Name:           fmt.Sprintf("sandbox-%s", q.id),
		UUID:           q.state.UUID,
		Path:           qemuPath,
		Ctx:            q.qmpMonitorCh.ctx,
		Uid:            q.config.Uid,
		Gid:            q.config.Gid,
		Groups:         q.config.Groups,
		Machine:        machine,
		SMP:            smp,
		Memory:         memory,
		Devices:        devices,
		CPUModel:       cpuModel,
		SeccompSandbox: q.config.SeccompSandbox,
		Kernel:         *kernel,
		RTC:            rtc,
		QMPSockets:     qmpSockets,
		Knobs:          knobs,
		Incoming:       incoming,
		VGA:            "none",
		GlobalParam:    "kvm-pit.lost_tick_policy=discard",
		Bios:           firmwarePath,
		PFlash:         pflash,
		PidFile:        filepath.Join(q.config.VMStorePath, q.id, "pid"),
		Debug:          hypervisorConfig.Debug,
	}

	// Confidential-computing protection may replace/augment devices and BIOS.
	qemuConfig.Devices, qemuConfig.Bios, err = q.arch.appendProtectionDevice(qemuConfig.Devices, firmwarePath, firmwareVolumePath, hypervisorConfig.InitdataDigest)
	if err != nil {
		return err
	}

	if ioThread != nil {
		qemuConfig.IOThreads = []govmmQemu.IOThread{*ioThread}
	}
	// Add RNG device to hypervisor
	// Skip for s390x as CPACF is used
	if machine.Type != QemuCCWVirtio {
		rngDev := config.RNGDev{
			ID:       rngID,
			Filename: q.config.EntropySource,
		}
		qemuConfig.Devices, err = q.arch.appendRNGDevice(ctx, qemuConfig.Devices, rngDev)
		if err != nil {
			return err
		}
	}

	// The PCIe topology is only relevant on q35 and virt machine types.
	if machine.Type == QemuQ35 || machine.Type == QemuVirt {
		if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig, machine.Type, network); err != nil {
			q.Logger().WithError(err).Errorf("Cannot create PCIe topology")
			return err
		}
	}
	q.qemuConfig = qemuConfig

	q.virtiofsDaemon, err = q.createVirtiofsDaemon(hypervisorConfig.SharedPath)
	return err
}
// checkBpfEnabled warns when a QEMU seccomp sandbox is configured while
// the kernel's BPF JIT is disabled, since an unjitted seccomp filter
// adds noticeable overhead. All failures are logged as warnings only.
func (q *qemu) checkBpfEnabled() {
	if q.config.SeccompSandbox == "" {
		return
	}

	out, err := os.ReadFile("/proc/sys/net/core/bpf_jit_enable")
	if err != nil {
		q.Logger().WithError(err).Warningf("failed to get bpf_jit_enable status")
		return
	}

	enabled, err := strconv.Atoi(strings.TrimSpace(string(out)))
	if err != nil {
		q.Logger().WithError(err).Warningf("failed to convert bpf_jit_enable status to integer")
		return
	}

	if enabled == 0 {
		q.Logger().Warningf("bpf_jit_enable is disabled. " +
			"It's recommended to turn on bpf_jit_enable to reduce the performance impact of QEMU seccomp sandbox.")
	}
}
// If a user uses 8 GPUs with 4 devices in each IOMMU Group that means we need
// to hotplug 32 devices. We do not have enough PCIe root bus slots to
// accomplish this task. Kata will use already some slots for vfio-xxxx-pci
// devices.
// Max PCI slots per root bus is 32
// Max PCIe root ports is 16
// Max PCIe switch ports is 16
// There is only 64kB of IO memory each root,switch port will consume 4k hence
// only 16 ports possible.
//
// createPCIeTopology adds PCIe root ports or switch ports to the
// configuration, sized from the configured port counts and the number
// of hot/cold-pluggable VFIO devices (and, on Arm virt, network and
// vhost-user-blk devices that rely on PCIe native hotplug).
//
// Fix: the two "exceeed" typos in user-facing error messages are
// corrected to "exceed".
func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig, machineType string, network Network) error {
	// If no-port set just return no need to add PCIe Root Port or PCIe Switches
	if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort && machineType == QemuQ35 {
		return nil
	}

	// Add PCIe Root Port or PCIe Switches to the hypervisor
	// The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged
	// into a PCIe Root Port or PCIe Switch.
	// For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt

	// Deduce the right values for mem-reserve and pref-64-reserve memory regions
	memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory()

	// The default OVMF MMIO aperture is too small for some PCIe devices
	// with huge BARs so we need to increase it.
	// memSize64bit is in bytes, convert to MB, OVMF expects MB as a string
	if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") {
		pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024))
		fwCfg := govmmQemu.FwCfg{
			Name: "opt/ovmf/X-PciMmio64Mb",
			Str:  pciMmio64Mb,
		}
		qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
	}

	// Get the number of hot(cold)-pluggable ports needed from the provided
	// VFIO devices
	var numOfPluggablePorts uint32 = 0

	// Fow now, pcie native hotplug is the only way for Arm to hotadd pci device.
	if machineType == QemuVirt {
		// Best effort: on failure epNum stays 0 and we under-provision
		// rather than abort.
		epNum, err := network.GetEndpointsNum()
		if err != nil {
			q.Logger().Warn("Fail to get network endpoints number")
		}
		virtPcieRootPortNum := len(hypervisorConfig.VhostUserBlkDevices) + epNum
		if hypervisorConfig.VirtioMem {
			virtPcieRootPortNum++
		}
		numOfPluggablePorts += uint32(virtPcieRootPortNum)
	}
	for _, dev := range hypervisorConfig.VFIODevices {
		var err error
		dev.HostPath, err = config.GetHostPath(dev, false, "")
		if err != nil {
			return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err)
		}
		var vfioDevices []*config.VFIODev
		// This works for IOMMUFD enabled kernels > 6.x
		// In the case of IOMMUFD the device.HostPath will look like
		// /dev/vfio/devices/vfio0
		// (1) Check if we have the new IOMMUFD or old container based VFIO
		if strings.HasPrefix(dev.HostPath, drivers.IommufdDevPath) {
			q.Logger().Infof("### IOMMUFD Path: %s", dev.HostPath)
			vfioDevices, err = drivers.GetDeviceFromVFIODev(dev)
			if err != nil {
				return fmt.Errorf("Cannot get VFIO device from IOMMUFD with device: %v err: %v", dev, err)
			}
		} else {
			vfioDevices, err = drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
			if err != nil {
				return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err)
			}
		}
		// Each PCIe-capable VFIO device needs its own pluggable port.
		for _, vfioDevice := range vfioDevices {
			if drivers.IsPCIeDevice(vfioDevice.BDF) {
				numOfPluggablePorts++
			}
		}
	}
	vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort)
	vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort)

	// If the devices are not advertised via CRI or cold-plugged we need to
	// get the number of pluggable root/switch ports from the config
	numPCIeRootPorts := hypervisorConfig.PCIeRootPort
	numPCIeSwitchPorts := hypervisorConfig.PCIeSwitchPort

	// If number of PCIe root ports > 16 then bail out otherwise we may
	// use up all slots or IO memory on the root bus and vfio-XXX-pci devices
	// cannot be added which are crucial for Kata max slots on root bus is 32
	// max slots on the complete pci(e) topology is 256 in QEMU
	if vfioOnRootPort {
		if numOfPluggablePorts < numPCIeRootPorts {
			numOfPluggablePorts = numPCIeRootPorts
		}
		if numOfPluggablePorts > maxPCIeRootPort {
			return fmt.Errorf("Number of PCIe Root Ports exceed allowed max of %d", maxPCIeRootPort)
		}
		qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
		return nil
	}
	if vfioOnSwitchPort {
		if numOfPluggablePorts < numPCIeSwitchPorts {
			numOfPluggablePorts = numPCIeSwitchPorts
		}
		if numOfPluggablePorts > maxPCIeSwitchPort {
			return fmt.Errorf("Number of PCIe Switch Ports exceed allowed max of %d", maxPCIeSwitchPort)
		}
		qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
		return nil
	}
	// If both Root Port and Switch Port are not enabled, check if QemuVirt need add pcie root port.
	if machineType == QemuVirt {
		qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
	}
	return nil
}
// vhostFSSocketPath returns the vhost-user-fs socket path for the VM id.
func (q *qemu) vhostFSSocketPath(id string) (string, error) {
	return utils.BuildSocketPath(q.config.VMStorePath, id, vhostFSSocket)
}

// nydusdAPISocketPath returns the nydusd API socket path for the VM id.
func (q *qemu) nydusdAPISocketPath(id string) (string, error) {
	return utils.BuildSocketPath(q.config.VMStorePath, id, nydusdAPISock)
}
// setupVirtiofsDaemon starts the shared-fs daemon and records its PID in
// the sandbox state. If the daemon exits unexpectedly, the supplied
// callback stops the VM (non-forced).
func (q *qemu) setupVirtiofsDaemon(ctx context.Context) (err error) {
	onDaemonExit := func() {
		q.StopVM(ctx, false)
	}

	pid, err := q.virtiofsDaemon.Start(ctx, onDaemonExit)
	if err != nil {
		return err
	}
	q.state.VirtiofsDaemonPid = pid

	return nil
}
// stopVirtiofsDaemon stops the shared-fs daemon and clears its recorded
// PID. A zero PID means the daemon is already gone, which is only worth
// a warning.
func (q *qemu) stopVirtiofsDaemon(ctx context.Context) (err error) {
	if q.state.VirtiofsDaemonPid == 0 {
		q.Logger().Warn("The virtiofsd had stopped")
		return nil
	}

	if stopErr := q.virtiofsDaemon.Stop(ctx); stopErr != nil {
		return stopErr
	}

	q.state.VirtiofsDaemonPid = 0
	return nil
}
// getMemArgs derives the memory backend arguments for hot-added memory:
// whether the backend must be shared, the backing path (empty for RAM
// backends), and the backend object type. It fails when a vhost-user
// store is enabled without hugepages.
func (q *qemu) getMemArgs() (bool, string, string, error) {
	if q.qemuConfig.Knobs.HugePages {
		// we are setting all the bits that govmm sets when hugepages are enabled.
		// https://github.com/intel/govmm/blob/master/qemu/qemu.go#L1677
		return true, "/dev/hugepages", "memory-backend-file", nil
	}

	if q.config.EnableVhostUserStore {
		// Vhost-user-blk/scsi process which can improve performance, like SPDK,
		// requires shared-on hugepage to work with Qemu.
		return false, "", "", fmt.Errorf("Vhost-user-blk/scsi requires hugepage memory")
	}

	target := ""
	memoryBack := "memory-backend-ram"
	if q.config.SharedFS == config.VirtioFS || q.config.SharedFS == config.VirtioFSNydus ||
		q.config.FileBackedMemRootDir != "" {
		target = q.qemuConfig.Memory.Path
		memoryBack = "memory-backend-file"
	}

	return q.qemuConfig.Knobs.MemShared, target, memoryBack, nil
}
// setupVirtioMem hot-adds a virtio-mem-pci device backed by the memory
// backend returned by getMemArgs, sized to the gap between the default
// maximum and the boot memory. On QemuVirt the device is placed on a
// PCIe root port instead of a bridge.
func (q *qemu) setupVirtioMem(ctx context.Context) error {
	// backend memory size must be multiple of 4Mib
	sizeMB := (int(q.config.DefaultMaxMemorySize) - int(q.config.MemorySize)) >> 2 << 2

	share, target, memoryBack, err := q.getMemArgs()
	if err != nil {
		return err
	}

	if err = q.qmpSetup(); err != nil {
		return err
	}

	addr, bridge, err := q.arch.addDeviceToBridge(ctx, "virtiomem-dev", types.PCI)
	if err != nil {
		return err
	}

	// Roll the bridge reservation back if the QMP hot-add below fails.
	defer func() {
		if err != nil {
			q.arch.removeDeviceFromBridge("virtiomem-dev")
		}
	}()

	bridgeID := bridge.ID
	// Hot add virtioMem dev to pcie-root-port for QemuVirt
	machineType := q.HypervisorConfig().HypervisorMachineType
	if machineType == QemuVirt {
		addr = "00"
		bridgeID = fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevicesPerPort[config.RootPort]))
		dev := config.VFIODev{ID: "virtiomem"}
		config.PCIeDevicesPerPort[config.RootPort] = append(config.PCIeDevicesPerPort[config.RootPort], dev)
	}

	err = q.qmpMonitorCh.qmp.ExecMemdevAdd(q.qmpMonitorCh.ctx, memoryBack, "virtiomem", target, sizeMB, share, "virtio-mem-pci", "virtiomem0", addr, bridgeID)
	if err == nil {
		q.Logger().Infof("Setup %dMB virtio-mem-pci success", sizeMB)
	} else {
		help := ""
		if strings.Contains(err.Error(), "Cannot allocate memory") {
			help = ". Please use command \"echo 1 > /proc/sys/vm/overcommit_memory\" handle it."
		}
		err = fmt.Errorf("Add %dMB virtio-mem-pci fail %s%s", sizeMB, err.Error(), help)
	}

	return err
}
// setupEarlyQmpConnection creates a listener socket to be passed to QEMU
// as a QMP listening endpoint. An initial connection is established, to
// be used as the QMP client socket. This allows to detect an early failure
// of QEMU instead of looping on connect until some timeout expires.
//
// The duplicated listener fd is appended to q.fds so it is inherited by
// the QEMU process and closed on our side after launch.
func (q *qemu) setupEarlyQmpConnection() (net.Conn, error) {
	monitorSockPath := q.qmpMonitorCh.path

	qmpListener, err := net.Listen("unix", monitorSockPath)
	if err != nil {
		q.Logger().WithError(err).Errorf("Unable to listen on unix socket address (%s)", monitorSockPath)
		return nil, err
	}

	// A duplicate fd of this socket will be passed to QEMU. We must
	// close the original one when we're done.
	defer qmpListener.Close()

	if rootless.IsRootless() {
		// Give the socket to the non-root user QEMU will run as.
		err = syscall.Chown(monitorSockPath, int(q.config.Uid), int(q.config.Gid))
		if err != nil {
			q.Logger().WithError(err).Errorf("Unable to make unix socket (%s) rootless", monitorSockPath)
			return nil, err
		}
	}

	VMFd, err := qmpListener.(*net.UnixListener).File()
	if err != nil {
		return nil, err
	}
	// Close the duplicated fd on any subsequent failure path.
	defer func() {
		if err != nil {
			VMFd.Close()
		}
	}()

	// This socket will be used to establish the initial QMP connection
	dialer := net.Dialer{Cancel: q.qmpMonitorCh.ctx.Done()}
	conn, err := dialer.Dial("unix", monitorSockPath)
	if err != nil {
		q.Logger().WithError(err).Errorf("Unable to connect to unix socket (%s)", monitorSockPath)
		return nil, err
	}

	// We need to keep the socket file around to be able to re-connect
	qmpListener.(*net.UnixListener).SetUnlinkOnClose(false)

	// Pass the duplicated fd of the listener socket to QEMU
	q.qemuConfig.QMPSockets[0].FD = VMFd
	q.fds = append(q.fds, q.qemuConfig.QMPSockets[0].FD)

	return conn, nil
}
// LogAndWait logs QEMU's output line by line until the stream is closed,
// then reaps the QEMU process.
//
// QEMU prefixes warnings with "<binary>: warning: "; such lines are
// demoted to warning level, every other line is logged as an error.
func (q *qemu) LogAndWait(qemuCmd *exec.Cmd, reader io.ReadCloser) {
	pid := qemuCmd.Process.Pid
	q.Logger().Infof("Start logging QEMU (qemuPid=%d)", pid)

	warnRE := regexp.MustCompile("(^[^:]+: )warning: ")
	scanner := bufio.NewScanner(reader)
	for scanner.Scan() {
		text := scanner.Text()
		if warnRE.MatchString(text) {
			text = warnRE.ReplaceAllString(text, "$1")
			q.Logger().WithField("qemuPid", pid).Warning(text)
		} else {
			q.Logger().WithField("qemuPid", pid).Error(text)
		}
	}
	// A scanner failure (e.g. a line longer than the scanner buffer)
	// silently terminates the loop above; surface it instead of
	// dropping it.
	if err := scanner.Err(); err != nil {
		q.Logger().WithError(err).WithField("qemuPid", pid).Warning("Error reading QEMU output")
	}
	q.Logger().Infof("Stop logging QEMU (qemuPid=%d)", pid)

	// Reap the process so it does not linger as a zombie. A non-nil
	// error here is expected when QEMU was killed (e.g. by StopVM), so
	// only log it at debug level.
	if err := qemuCmd.Wait(); err != nil {
		q.Logger().WithError(err).WithField("qemuPid", pid).Debug("QEMU process exited with error")
	}
}
// StartVM will start the Sandbox's VM.
//
// The launch sequence is order sensitive: the VM directory and the early
// QMP listener must exist before QEMU is spawned, the SELinux process
// label and the virtiofs daemon are set up just before the launch, and
// every failure after a successful step triggers that step's deferred
// cleanup (each deferred block inspects the outer err variable).
func (q *qemu) StartVM(ctx context.Context, timeout int) error {
	span, ctx := katatrace.Trace(ctx, q.Logger(), "StartVM", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	if q.config.Debug {
		params := q.arch.kernelParameters(q.config.Debug)
		strParams := SerializeParams(params, "=")
		formatted := strings.Join(strParams, " ")

		// The name of this field matches a similar one generated by
		// the runtime and allows users to identify which parameters
		// are set here, which come from the runtime and which are set
		// by the user.
		q.Logger().WithField("default-kernel-parameters", formatted).Debug()
	}

	// The fds collected for QEMU (QMP listener, etc.) are only needed
	// for the launch itself; close them unconditionally on return.
	defer func() {
		for _, fd := range q.fds {
			if err := fd.Close(); err != nil {
				q.Logger().WithError(err).Error("After launching Qemu")
			}
		}
		q.fds = []*os.File{}
	}()

	vmPath := filepath.Join(q.config.VMStorePath, q.id)
	err := utils.MkdirAllWithInheritedOwner(vmPath, DirMode)
	if err != nil {
		return err
	}
	q.Logger().WithField("vm path", vmPath).Info("created vm path")
	// Remove the freshly created VM directory again if any later step
	// fails (checks the outer err variable; inner shadowed errs do not
	// trigger this cleanup).
	defer func() {
		if err != nil {
			if err := os.RemoveAll(vmPath); err != nil {
				q.Logger().WithError(err).Error("Fail to clean up vm directory")
			}
		}
	}()

	var qmpConn net.Conn
	qmpConn, err = q.setupEarlyQmpConnection()
	if err != nil {
		return err
	}

	// This needs to be done as late as possible, just before launching:
	// virtiofsd and QEMU are executed by kata-runtime after this call,
	// and so run with the SELinux label. If these processes require
	// privilege, we do not want to run them under confinement.
	if !q.config.DisableSeLinux {

		if err := label.SetProcessLabel(q.config.SELinuxProcessLabel); err != nil {
			return err
		}
		defer label.SetProcessLabel("")
	}

	if q.config.SharedFS == config.VirtioFS || q.config.SharedFS == config.VirtioFSNydus {
		err = q.setupVirtiofsDaemon(ctx)
		if err != nil {
			return err
		}
		// Stop the daemon again if QEMU itself fails to come up.
		defer func() {
			if err != nil {
				if shutdownErr := q.stopVirtiofsDaemon(ctx); shutdownErr != nil {
					q.Logger().WithError(shutdownErr).Warn("failed to stop virtiofsDaemon")
				}
			}
		}()
	}

	qemuCmd, reader, err := govmmQemu.LaunchQemu(q.qemuConfig, newQMPLogger())
	if err != nil {
		q.Logger().WithError(err).Error("failed to launch qemu")
		return fmt.Errorf("failed to launch qemu: %s", err)
	}

	// Log QEMU errors and ensure the QEMU process is reaped after
	// termination.
	go q.LogAndWait(qemuCmd, reader)

	// Perform the initial QMP handshake over the early connection.
	err = q.waitVM(ctx, qmpConn, timeout)
	if err != nil {
		return err
	}

	if q.config.BootFromTemplate {
		if err = q.bootFromTemplate(); err != nil {
			return err
		}
	}

	if q.config.VirtioMem {
		err = q.setupVirtioMem(ctx)
	}

	return err
}
// bootFromTemplate resumes the VM from the saved device state file,
// performing an incoming migration whose source is the template snapshot.
func (q *qemu) bootFromTemplate() error {
	if err := q.qmpSetup(); err != nil {
		return err
	}
	defer q.qmpShutdown()

	if err := q.arch.setIgnoreSharedMemoryMigrationCaps(q.qmpMonitorCh.ctx, q.qmpMonitorCh.qmp); err != nil {
		q.Logger().WithError(err).Error("set migration ignore shared memory")
		return err
	}

	uri := fmt.Sprintf("exec:cat %s", q.config.DevicesStatePath)
	if err := q.qmpMonitorCh.qmp.ExecuteMigrationIncoming(q.qmpMonitorCh.ctx, uri); err != nil {
		return err
	}

	return q.waitMigration()
}
// waitVM will wait for the Sandbox's VM to be up and running.
//
// It performs the initial QMP handshake over qmpConn, retrying until
// it succeeds or timeout (seconds) elapses. The QMP channel opened here
// is only used for the handshake and is shut down again on return;
// later operations reconnect through qmpSetup.
func (q *qemu) waitVM(ctx context.Context, qmpConn net.Conn, timeout int) error {
	span, _ := katatrace.Trace(ctx, q.Logger(), "waitVM", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	if timeout < 0 {
		return fmt.Errorf("Invalid timeout %ds", timeout)
	}

	cfg := govmmQemu.QMPConfig{Logger: newQMPLogger()}

	var qmp *govmmQemu.QMP
	var disconnectCh chan struct{}
	var ver *govmmQemu.QMPVersion
	var err error

	// clear any possible old state before trying to connect again.
	q.qmpShutdown()
	timeStart := time.Now()
	// Poll every 50ms until the QMP greeting succeeds or the timeout
	// budget is exhausted.
	for {
		disconnectCh = make(chan struct{})
		qmp, ver, err = govmmQemu.QMPStartWithConn(q.qmpMonitorCh.ctx, qmpConn, cfg, disconnectCh)
		if err == nil {
			break
		}

		if int(time.Since(timeStart).Seconds()) > timeout {
			return fmt.Errorf("Failed to connect to QEMU instance (timeout %ds): %v", timeout, err)
		}

		time.Sleep(time.Duration(50) * time.Millisecond)
	}
	q.qmpMonitorCh.qmp = qmp
	q.qmpMonitorCh.disconn = disconnectCh
	// This handshake channel is not kept; tear it down when done.
	defer q.qmpShutdown()

	q.Logger().WithFields(logrus.Fields{
		"qmp-major-version": ver.Major,
		"qmp-minor-version": ver.Minor,
		"qmp-micro-version": ver.Micro,
		"qmp-Capabilities":  strings.Join(ver.Capabilities, ","),
	}).Infof("QMP details")

	// qmp_capabilities must be the first command, it leaves QMP
	// negotiation mode.
	if err = q.qmpMonitorCh.qmp.ExecuteQMPCapabilities(q.qmpMonitorCh.ctx); err != nil {
		q.Logger().WithError(err).Error(qmpCapErrMsg)
		return err
	}

	return nil
}
// StopVM will stop the Sandbox's VM.
//
// When waitOnly is true, QEMU is given qemuStopSandboxTimeoutSecs to
// exit on its own (signal 0 is used only to probe for liveness);
// otherwise it is killed with SIGKILL. The call is idempotent: a
// stopped flag guards against double teardown, and on-disk state is
// cleaned up regardless of the outcome.
func (q *qemu) StopVM(ctx context.Context, waitOnly bool) (err error) {
	q.mu.Lock()
	defer q.mu.Unlock()
	span, _ := katatrace.Trace(ctx, q.Logger(), "StopVM", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	q.Logger().Info("Stopping Sandbox")
	if atomic.LoadInt32(&q.stopped) != 0 {
		q.Logger().Info("Already stopped")
		return nil
	}

	// Always clean up the VM directory; only mark the sandbox stopped
	// when the teardown succeeded (reads the named return err).
	defer func() {
		q.cleanupVM()
		if err == nil {
			atomic.StoreInt32(&q.stopped, 1)
		}
	}()

	if err := q.qmpSetup(); err != nil {
		return err
	}

	pids := q.GetPids()
	if len(pids) == 0 {
		return errors.New("cannot determine QEMU PID")
	}

	pid := pids[0]
	if pid > 0 {
		if waitOnly {
			err := utils.WaitLocalProcess(pid, qemuStopSandboxTimeoutSecs, syscall.Signal(0), q.Logger())
			if err != nil {
				return err
			}
		} else {
			err = syscall.Kill(pid, syscall.SIGKILL)
			if err != nil {
				q.Logger().WithError(err).Error("Fail to send SIGKILL to qemu")
				return err
			}
		}
	}

	if q.config.SharedFS == config.VirtioFS || q.config.SharedFS == config.VirtioFSNydus {
		if err := q.stopVirtiofsDaemon(ctx); err != nil {
			return err
		}
	}

	return nil
}
// cleanupVM removes the on-disk state associated with the VM: the VM
// store path (and its symlink target when VM templating linked a sandbox
// to a VM), the per-VMid run directory, the rootless VMM user and any
// initdata working directory. All failures are logged but swallowed:
// this is best-effort cleanup and always returns nil.
func (q *qemu) cleanupVM() error {
	vmDir := filepath.Join(q.config.VMStorePath, q.id)

	// If it's a symlink, remove both the link and its target.
	// This can happen when vm template links a sandbox to a vm.
	resolved, err := filepath.EvalSymlinks(vmDir)
	if err != nil {
		// Well, it's just Cleanup failure. Let's ignore it.
		q.Logger().WithError(err).WithField("dir", vmDir).Warn("failed to resolve vm path")
	}
	q.Logger().WithField("link", resolved).WithField("dir", vmDir).Infof("Cleanup vm path")

	if err := os.RemoveAll(vmDir); err != nil {
		q.Logger().WithError(err).Warnf("failed to remove vm path %s", vmDir)
	}
	if resolved != "" && resolved != vmDir {
		if err := os.RemoveAll(resolved); err != nil {
			q.Logger().WithError(err).WithField("link", resolved).Warn("failed to remove resolved vm path")
		}
	}

	if q.config.VMid != "" {
		runDir := filepath.Join(q.config.RunStorePath, q.config.VMid)
		if err := os.RemoveAll(runDir); err != nil {
			q.Logger().WithError(err).WithField("path", runDir).Warnf("failed to remove vm path")
		}
	}

	if rootless.IsRootless() {
		userFields := logrus.Fields{
			"user": q.config.User,
			"uid":  q.config.Uid,
		}
		if _, err := user.Lookup(q.config.User); err != nil {
			q.Logger().WithError(err).WithFields(userFields).Warn("failed to find the user, it might have been removed")
			return nil
		}
		if err := pkgUtils.RemoveVmmUser(q.config.User); err != nil {
			q.Logger().WithError(err).WithFields(userFields).Warn("failed to delete the user")
			return nil
		}
		q.Logger().WithFields(userFields).Debug("successfully removed the non root user")
	}

	// If we have initdata, we should drop initdata image path
	if len(q.HypervisorConfig().Initdata) > 0 {
		initdataWorkdir := filepath.Join(string(filepath.Separator), "/run/kata-containers/shared/initdata", q.id)
		if err := os.RemoveAll(initdataWorkdir); err != nil {
			q.Logger().WithError(err).Warnf("failed to remove initdata work dir %s", initdataWorkdir)
		}
	}

	return nil
}
// togglePauseSandbox pauses (QMP stop) or resumes (QMP cont) the guest
// VCPUs depending on the pause flag.
func (q *qemu) togglePauseSandbox(ctx context.Context, pause bool) error {
	span, _ := katatrace.Trace(ctx, q.Logger(), "togglePauseSandbox", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	if err := q.qmpSetup(); err != nil {
		return err
	}

	if !pause {
		return q.qmpMonitorCh.qmp.ExecuteCont(q.qmpMonitorCh.ctx)
	}
	return q.qmpMonitorCh.qmp.ExecuteStop(q.qmpMonitorCh.ctx)
}
// qmpSetup lazily establishes the long-lived QMP connection used by all
// lifecycle and hotplug operations. It is a no-op when a connection is
// already in place. The connection is torn down by qmpShutdown.
func (q *qemu) qmpSetup() error {
	q.qmpMonitorCh.Lock()
	defer q.qmpMonitorCh.Unlock()

	// Already connected: nothing to do.
	if q.qmpMonitorCh.qmp != nil {
		return nil
	}

	events := make(chan govmmQemu.QMPEvent)
	// NOTE(review): if QMPStart fails below, the loopQMPEvent goroutine
	// stays blocked on this channel — presumably the channel is closed
	// by the QMP layer only on a successful start; verify the failure
	// path does not leak the goroutine.
	go q.loopQMPEvent(events)

	cfg := govmmQemu.QMPConfig{
		Logger:  newQMPLogger(),
		EventCh: events,
	}

	// Auto-closed by QMPStart().
	disconnectCh := make(chan struct{})

	qmp, _, err := govmmQemu.QMPStart(q.qmpMonitorCh.ctx, q.qmpMonitorCh.path, cfg, disconnectCh)
	if err != nil {
		q.Logger().WithError(err).Error("Failed to connect to QEMU instance")
		return err
	}
	// qmp_capabilities must be sent first to leave negotiation mode.
	err = qmp.ExecuteQMPCapabilities(q.qmpMonitorCh.ctx)
	if err != nil {
		qmp.Shutdown()
		q.Logger().WithError(err).Error(qmpCapErrMsg)
		return err
	}
	q.qmpMonitorCh.qmp = qmp
	q.qmpMonitorCh.disconn = disconnectCh

	return nil
}
// loopQMPEvent consumes QMP events until the channel is closed, spawning
// a guest-panic handler whenever a GUEST_PANICKED event arrives.
func (q *qemu) loopQMPEvent(event chan govmmQemu.QMPEvent) {
	for ev := range event {
		q.Logger().WithField("event", ev).Debug("got QMP event")
		if ev.Name != "GUEST_PANICKED" {
			continue
		}
		go q.handleGuestPanic()
	}
	q.Logger().Infof("QMP event channel closed")
}
// handleGuestPanic reacts to a GUEST_PANICKED QMP event by dumping the
// guest memory to the configured path.
func (q *qemu) handleGuestPanic() {
	err := q.dumpGuestMemory(q.config.GuestMemoryDumpPath)
	if err != nil {
		q.Logger().WithError(err).Error("failed to dump guest memory")
	}

	// TODO: how to notify the upper level sandbox to handle the error
	// to do a fast fail(shutdown or others).
	// tracked by https://github.com/kata-containers/kata-containers/issues/1026
}
// canDumpGuestMemory checks whether a guest memory dump can be taken.
// For now it only ensures there is at least double the VM memory size of
// free disk space under dumpSavePath, which should be enough to complete
// one dump operation for one sandbox.
//
// A Statfs failure is deliberately non-fatal: the dump is attempted
// anyway in that case.
func (q *qemu) canDumpGuestMemory(dumpSavePath string) error {
	fs := unix.Statfs_t{}
	if err := unix.Statfs(dumpSavePath, &fs); err != nil {
		q.Logger().WithError(err).WithField("dumpSavePath", dumpSavePath).Error("failed to call Statfs")
		return nil
	}
	availSpaceInBytes := fs.Bavail * uint64(fs.Bsize)
	q.Logger().WithFields(
		logrus.Fields{
			"dumpSavePath":      dumpSavePath,
			"availSpaceInBytes": availSpaceInBytes,
		}).Info("get avail space")

	// Guest memory size: configured memory plus hotplugged memory,
	// converted from MiB to bytes.
	guestMemorySizeInBytes := (uint64(q.config.MemorySize) + uint64(q.state.HotpluggedMemory)) << utils.MibToBytesShift
	q.Logger().WithField("guestMemorySizeInBytes", guestMemorySizeInBytes).Info("get guest memory size")

	// By default we want to ensure at least double of the VM memory size
	// in free space, which should allow one dump operation to complete.
	expectedMemorySize := guestMemorySizeInBytes * 2
	if availSpaceInBytes >= expectedMemorySize {
		return nil
	}

	// Fixed error message: was "there are not enough free space ...
	// Except %d bytes", which was both ungrammatical and misleading.
	return fmt.Errorf("not enough free space to store memory dump file. Expected %d bytes, but only %d bytes available", expectedMemorySize, availSpaceInBytes)
}
// dumpSandboxMetaInfo saves meta information for debug purposes next to
// a guest memory dump: the sandbox/container state directory, the
// hypervisor config and the hypervisor version.
func (q *qemu) dumpSandboxMetaInfo(dumpSavePath string) {
	dumpStatePath := filepath.Join(dumpSavePath, "state")

	// copy state from /run/vc/sbs to memory dump directory
	statePath := filepath.Join(q.config.RunStorePath, q.id)
	command := []string{"/bin/cp", "-ar", statePath, dumpStatePath}
	q.Logger().WithField("command", command).Info("try to Save sandbox state")
	if output, err := pkgUtils.RunCommandFull(command, true); err != nil {
		q.Logger().WithError(err).WithField("output", output).Error("failed to Save state")
	}
	// Save hypervisor meta information
	fileName := filepath.Join(dumpSavePath, "hypervisor.conf")
	data, _ := json.MarshalIndent(q.config, "", " ")
	if err := os.WriteFile(fileName, data, defaultFilePerms); err != nil {
		q.Logger().WithError(err).WithField("hypervisor.conf", data).Error("write to hypervisor.conf file failed")
	}

	// Save hypervisor version
	hypervisorVersion, err := pkgUtils.RunCommand([]string{q.config.HypervisorPath, "--version"})
	if err != nil {
		// Fix: log the hypervisor path, not the unrelated config JSON
		// that was attached here before.
		q.Logger().WithError(err).WithField("HypervisorPath", q.config.HypervisorPath).Error("failed to get hypervisor version")
	}
	fileName = filepath.Join(dumpSavePath, "hypervisor.version")
	if err := os.WriteFile(fileName, []byte(hypervisorVersion), defaultFilePerms); err != nil {
		// Fix: log the version string being written, not the config JSON.
		q.Logger().WithError(err).WithField("hypervisor.version", hypervisorVersion).Error("write to hypervisor.version file failed")
	}
}
// dumpGuestMemory dumps the guest memory (vmcore) under dumpSavePath,
// together with sandbox meta information for later debugging. It is a
// no-op when dumpSavePath is empty. Concurrent dumps are serialized by
// memoryDumpFlag.
func (q *qemu) dumpGuestMemory(dumpSavePath string) error {
	if dumpSavePath == "" {
		return nil
	}

	q.memoryDumpFlag.Lock()
	defer q.memoryDumpFlag.Unlock()

	q.Logger().WithField("dumpSavePath", dumpSavePath).Info("try to dump guest memory")

	// Dumps are grouped per sandbox: <dumpSavePath>/<sandbox id>/...
	dumpSavePath = filepath.Join(dumpSavePath, q.id)
	dumpStatePath := filepath.Join(dumpSavePath, "state")
	if err := pkgUtils.EnsureDir(dumpStatePath, DirMode); err != nil {
		return err
	}

	// Save meta information for sandbox
	q.dumpSandboxMetaInfo(dumpSavePath)
	q.Logger().Info("dump sandbox meta information completed")

	// Check device free space and estimated dump size
	if err := q.canDumpGuestMemory(dumpSavePath); err != nil {
		q.Logger().Warnf("can't dump guest memory: %s", err.Error())
		return err
	}

	// dump guest memory via QMP dump-guest-memory to a timestamped file
	protocol := fmt.Sprintf("file:%s/vmcore-%s.%s", dumpSavePath, time.Now().Format("20060102150405.999"), memoryDumpFormat)
	q.Logger().Infof("try to dump guest memory to %s", protocol)

	if err := q.qmpSetup(); err != nil {
		q.Logger().WithError(err).Error("setup manage QMP failed")
		return err
	}

	if err := q.qmpMonitorCh.qmp.ExecuteDumpGuestMemory(q.qmpMonitorCh.ctx, protocol, q.config.GuestMemoryDumpPaging, memoryDumpFormat); err != nil {
		q.Logger().WithError(err).Error("dump guest memory failed")
		return err
	}

	q.Logger().Info("dump guest memory completed")
	return nil
}
// qmpShutdown closes the QMP channel established by qmpSetup (or by
// waitVM) and clears the monitor state. Safe to call when no channel
// is open.
func (q *qemu) qmpShutdown() {
	q.qmpMonitorCh.Lock()
	defer q.qmpMonitorCh.Unlock()

	if q.qmpMonitorCh.qmp == nil {
		return
	}

	q.qmpMonitorCh.qmp.Shutdown()
	// wait on disconnected channel to be sure that the qmp channel has
	// been closed cleanly.
	<-q.qmpMonitorCh.disconn
	q.qmpMonitorCh.qmp = nil
	q.qmpMonitorCh.disconn = nil
}
// hotplugAddBlockDevice hot-plugs a block drive into the running VM.
//
// A pmem drive (or when the configured driver is nvdimm) is added as an
// NVDIMM backing device. Otherwise a blockdev backend is created first,
// then the frontend device is attached according to the configured
// driver: virtio-blk-pci (also used for swap drives), virtio-blk-ccw or
// scsi-hd. Each step's failure rolls back the previous ones via the
// deferred checks on the named return err.
func (q *qemu) hotplugAddBlockDevice(ctx context.Context, drive *config.BlockDrive, op Operation, devID string) (err error) {
	// drive can be a pmem device, in which case it's used as backing file for a nvdimm device
	if q.config.BlockDeviceDriver == config.Nvdimm || drive.Pmem {
		var blocksize int64
		file, err := os.Open(drive.File)
		if err != nil {
			return err
		}
		defer file.Close()

		st, err := file.Stat()
		if err != nil {
			return fmt.Errorf("failed to get information from nvdimm device %v: %v", drive.File, err)
		}

		// regular files do not support syscall BLKGETSIZE64
		if st.Mode().IsRegular() {
			blocksize = st.Size()
		} else if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, file.Fd(), unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&blocksize))); err != 0 {
			return err
		}

		if err = q.qmpMonitorCh.qmp.ExecuteNVDIMMDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, drive.File, blocksize, &drive.Pmem); err != nil {
			q.Logger().WithError(err).Errorf("Failed to add NVDIMM device %s", drive.File)
			return err
		}
		// Record which nvdimm slot this drive occupies.
		drive.NvdimmID = strconv.Itoa(q.nvdimmCount)
		q.nvdimmCount++
		return nil
	}

	qblkDevice := govmmQemu.BlockDevice{
		ID:       drive.ID,
		File:     drive.File,
		ReadOnly: drive.ReadOnly,
		AIO:      govmmQemu.BlockDeviceAIO(q.config.BlockDeviceAIO),
	}
	// Create the backend: swap drives use the raw "file" driver without
	// cache flags; otherwise honor the configured cache settings.
	if drive.Swap {
		err = q.qmpMonitorCh.qmp.ExecuteBlockdevAddWithDriverCache(q.qmpMonitorCh.ctx, "file", &qblkDevice, false, false)
	} else if q.config.BlockDeviceCacheSet {
		err = q.qmpMonitorCh.qmp.ExecuteBlockdevAddWithCache(q.qmpMonitorCh.ctx, &qblkDevice, q.config.BlockDeviceCacheDirect, q.config.BlockDeviceCacheNoflush)
	} else {
		err = q.qmpMonitorCh.qmp.ExecuteBlockdevAdd(q.qmpMonitorCh.ctx, &qblkDevice)
	}
	if err != nil {
		return err
	}

	// Remove the backend again if attaching the frontend fails.
	defer func() {
		if err != nil {
			q.qmpMonitorCh.qmp.ExecuteBlockdevDel(q.qmpMonitorCh.ctx, drive.ID)
		}
	}()

	switch {
	case drive.Swap:
		fallthrough
	case q.config.BlockDeviceDriver == config.VirtioBlock:
		driver := "virtio-blk-pci"
		addr, bridge, err := q.arch.addDeviceToBridge(ctx, drive.ID, types.PCI)
		if err != nil {
			return err
		}

		defer func() {
			if err != nil {
				q.arch.removeDeviceFromBridge(drive.ID)
			}
		}()

		// Record the guest PCI path (bridge slot / device slot) so the
		// agent can locate the device.
		bridgeSlot, err := types.PciSlotFromInt(bridge.Addr)
		if err != nil {
			return err
		}
		devSlot, err := types.PciSlotFromString(addr)
		if err != nil {
			return err
		}
		drive.PCIPath, err = types.PciPathFromSlots(bridgeSlot, devSlot)
		if err != nil {
			return err
		}

		queues := int(q.config.NumVCPUs())

		if err = q.qmpMonitorCh.qmp.ExecutePCIDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, addr, bridge.ID, romFile, queues, true, defaultDisableModern); err != nil {
			return err
		}
	case q.config.BlockDeviceDriver == config.VirtioBlockCCW:
		driver := "virtio-blk-ccw"

		addr, bridge, err := q.arch.addDeviceToBridge(ctx, drive.ID, types.CCW)
		if err != nil {
			return err
		}
		var devNoHotplug string
		devNoHotplug, err = bridge.AddressFormatCCW(addr)
		if err != nil {
			return err
		}
		drive.DevNo, err = bridge.AddressFormatCCWForVirtServer(addr)
		if err != nil {
			return err
		}
		if err = q.qmpMonitorCh.qmp.ExecuteDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, devNoHotplug, "", true, false); err != nil {
			return err
		}
	case q.config.BlockDeviceDriver == config.VirtioSCSI:
		driver := "scsi-hd"

		// Bus exposed by the SCSI Controller
		bus := scsiControllerID + ".0"

		// Get SCSI-id and LUN based on the order of attaching drives.
		scsiID, lun, err := utils.GetSCSIIdLun(drive.Index)
		if err != nil {
			return err
		}

		if err = q.qmpMonitorCh.qmp.ExecuteSCSIDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, bus, romFile, scsiID, lun, true, defaultDisableModern); err != nil {
			return err
		}
	default:
		return fmt.Errorf("Block device %s not recognized", q.config.BlockDeviceDriver)
	}

	return nil
}
// hotplugAddVhostUserBlkDevice hot-plugs a vhost-user-blk-pci device.
//
// It first adds the chardev backend for the vhost-user socket, then
// attaches the frontend either on a PCIe root port (QemuVirt machines)
// or on a PCI bridge slot (everything else), recording the resulting
// guest PCI path in vAttr. A failure after the chardev was added rolls
// the chardev back via the deferred check on the named return err.
func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.VhostUserDeviceAttrs, op Operation, devID string) (err error) {

	err = q.qmpMonitorCh.qmp.ExecuteCharDevUnixSocketAdd(q.qmpMonitorCh.ctx, vAttr.DevID, vAttr.SocketPath, false, false, vAttr.ReconnectTime)
	if err != nil {
		return err
	}

	defer func() {
		if err != nil {
			q.qmpMonitorCh.qmp.ExecuteChardevDel(q.qmpMonitorCh.ctx, vAttr.DevID)
		}
	}()

	driver := "vhost-user-blk-pci"

	machineType := q.HypervisorConfig().HypervisorMachineType

	switch machineType {
	case QemuVirt:
		//The addr of a dev is corresponding with device:function for PCIe in qemu which starting from 0
		//Since the dev is the first and only one on this bus(root port), it should be 0.
		addr := "00"

		bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevicesPerPort[config.RootPort]))
		dev := config.VFIODev{ID: devID}
		config.PCIeDevicesPerPort[config.RootPort] = append(config.PCIeDevicesPerPort[config.RootPort], dev)

		bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID)
		bridgeSlot, err := q.arch.qomGetSlot(bridgeQomPath, &q.qmpMonitorCh)
		if err != nil {
			return err
		}

		devSlot, err := types.PciSlotFromString(addr)
		if err != nil {
			return err
		}
		vAttr.PCIPath, err = types.PciPathFromSlots(bridgeSlot, devSlot)
		if err != nil {
			return err
		}

		if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeID); err != nil {
			return err
		}
	default:
		addr, bridge, err := q.arch.addDeviceToBridge(ctx, vAttr.DevID, types.PCI)
		if err != nil {
			return err
		}

		// Release the bridge slot again if any later step fails.
		defer func() {
			if err != nil {
				q.arch.removeDeviceFromBridge(vAttr.DevID)
			}
		}()

		bridgeSlot, err := types.PciSlotFromInt(bridge.Addr)
		if err != nil {
			return err
		}

		devSlot, err := types.PciSlotFromString(addr)
		if err != nil {
			return err
		}
		vAttr.PCIPath, err = types.PciPathFromSlots(bridgeSlot, devSlot)
		// Fix: this error was previously dropped (immediately overwritten
		// by the device_add call below), unlike the QemuVirt branch.
		if err != nil {
			return err
		}

		if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridge.ID); err != nil {
			return err
		}
	}

	return nil
}
// hotplugBlockDevice dispatches block-drive hotplug and hot-unplug.
// Adding delegates to hotplugAddBlockDevice; removal releases the bridge
// slot (virtio-blk, non-swap only), deletes the frontend device and
// finally the blockdev backend.
func (q *qemu) hotplugBlockDevice(ctx context.Context, drive *config.BlockDrive, op Operation) error {
	if err := q.qmpSetup(); err != nil {
		return err
	}

	devID := "virtio-" + drive.ID

	if op == AddDevice {
		return q.hotplugAddBlockDevice(ctx, drive, op, devID)
	}

	if q.config.BlockDeviceDriver == config.VirtioBlock && !drive.Swap {
		if err := q.arch.removeDeviceFromBridge(drive.ID); err != nil {
			return err
		}
	}
	if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil {
		return err
	}
	return q.qmpMonitorCh.qmp.ExecuteBlockdevDel(q.qmpMonitorCh.ctx, drive.ID)
}
// hotplugVhostUserDevice dispatches vhost-user device hotplug/unplug.
// Only vhost-user-blk devices are supported for adding. Removal detaches
// the device from its bridge (non QemuVirt machines), deletes the device
// and then its chardev backend.
func (q *qemu) hotplugVhostUserDevice(ctx context.Context, vAttr *config.VhostUserDeviceAttrs, op Operation) error {
	if err := q.qmpSetup(); err != nil {
		return err
	}

	devID := "virtio-" + vAttr.DevID

	if op == AddDevice {
		switch vAttr.Type {
		case config.VhostUserBlk:
			return q.hotplugAddVhostUserBlkDevice(ctx, vAttr, op, devID)
		default:
			return fmt.Errorf("Incorrect vhost-user device type found")
		}
	}

	if q.HypervisorConfig().HypervisorMachineType != QemuVirt {
		if err := q.arch.removeDeviceFromBridge(vAttr.DevID); err != nil {
			return err
		}
	}
	if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil {
		return err
	}
	return q.qmpMonitorCh.qmp.ExecuteChardevDel(q.qmpMonitorCh.ctx, vAttr.DevID)
}
// hotplugVFIODeviceRootPort hot-plugs a VFIO device directly on a PCIe
// root port; no bridge slot allocation is involved.
func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) {
	return q.executeVFIODeviceAdd(device)
}
// hotplugVFIODeviceSwitchPort hot-plugs a VFIO device on a PCIe switch
// downstream port; like the root-port case, no bridge slot is allocated.
func (q *qemu) hotplugVFIODeviceSwitchPort(ctx context.Context, device *config.VFIODev) (err error) {
	return q.executeVFIODeviceAdd(device)
}
// hotplugVFIODeviceBridgePort reserves a slot on a PCI bridge and then
// hot-plugs the VFIO device into it. If the QMP device_add fails, the
// bridge slot is released again (the deferred check reads the named
// return err).
func (q *qemu) hotplugVFIODeviceBridgePort(ctx context.Context, device *config.VFIODev) (err error) {
	addr, bridge, err := q.arch.addDeviceToBridge(ctx, device.ID, types.PCI)
	if err != nil {
		return err
	}

	defer func() {
		if err != nil {
			q.arch.removeDeviceFromBridge(device.ID)
		}
	}()

	return q.executePCIVFIODeviceAdd(device, addr, bridge.ID)
}
// executePCIVFIODeviceAdd issues the QMP device_add for a VFIO device at
// a given PCI address on a given bridge/port, dispatching on the device
// flavor (plain PCI, mediated PCI, mediated AP).
func (q *qemu) executePCIVFIODeviceAdd(device *config.VFIODev, addr string, bridgeID string) error {
	mon := q.qmpMonitorCh.qmp
	monCtx := q.qmpMonitorCh.ctx

	switch device.Type {
	case config.VFIOPCIDeviceNormalType:
		return mon.ExecutePCIVFIODeviceAdd(monCtx, device.ID, device.BDF, addr, bridgeID, romFile)
	case config.VFIOPCIDeviceMediatedType:
		return mon.ExecutePCIVFIOMediatedDeviceAdd(monCtx, device.ID, device.SysfsDev, addr, bridgeID, romFile)
	case config.VFIOAPDeviceMediatedType:
		return mon.ExecuteAPVFIOMediatedDeviceAdd(monCtx, device.SysfsDev, device.ID)
	default:
		return fmt.Errorf("Incorrect VFIO device type found")
	}
}
// executeVFIODeviceAdd issues the QMP device_add for a VFIO device on a
// pre-assigned bus (root port / switch port), dispatching on the device
// flavor (plain PCI, mediated PCI, mediated AP).
func (q *qemu) executeVFIODeviceAdd(device *config.VFIODev) error {
	mon := q.qmpMonitorCh.qmp
	monCtx := q.qmpMonitorCh.ctx

	switch device.Type {
	case config.VFIOPCIDeviceNormalType:
		return mon.ExecuteVFIODeviceAdd(monCtx, device.ID, device.BDF, device.Bus, romFile)
	case config.VFIOPCIDeviceMediatedType:
		return mon.ExecutePCIVFIOMediatedDeviceAdd(monCtx, device.ID, device.SysfsDev, "", device.Bus, romFile)
	case config.VFIOAPDeviceMediatedType:
		return mon.ExecuteAPVFIOMediatedDeviceAdd(monCtx, device.SysfsDev, device.ID)
	default:
		return fmt.Errorf("Incorrect VFIO device type found")
	}
}
// hotplugVFIODevice hot-plugs or hot-unplugs a VFIO device, routing the
// add through the configured attachment mode (root port, switch port or
// bridge port) and querying the resulting guest PCI path back from QEMU.
func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op Operation) (err error) {
	if err = q.qmpSetup(); err != nil {
		return err
	}

	if op != AddDevice {
		q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device")
		// Only bridge-port attachments hold a bridge slot to release.
		if q.state.HotPlugVFIO == config.BridgePort {
			if err := q.arch.removeDeviceFromBridge(device.ID); err != nil {
				return err
			}
		}
		return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID)
	}

	buf, _ := json.Marshal(device)
	q.Logger().WithFields(logrus.Fields{
		"machine-type":  q.HypervisorConfig().HypervisorMachineType,
		"hot-plug-vfio": q.state.HotPlugVFIO,
		"device-info":   string(buf),
	}).Info("Start hot-plug VFIO device")

	// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
	// for pc machine type instead of bridge. This is useful for devices that require
	// a large PCI BAR which is a currently a limitation with PCI bridges.
	switch q.state.HotPlugVFIO {
	case config.RootPort:
		err = q.hotplugVFIODeviceRootPort(ctx, device)
	case config.SwitchPort:
		err = q.hotplugVFIODeviceSwitchPort(ctx, device)
	case config.BridgePort:
		err = q.hotplugVFIODeviceBridgePort(ctx, device)
	default:
		err = fmt.Errorf("Incorrect hot plug configuration %v for device %v found", q.state.HotPlugVFIO, device)
	}
	if err != nil {
		return err
	}

	// Depending on whether we're doing root port or
	// bridge hotplug, and how the bridge is set up in
	// other parts of the code, we may or may not already
	// have information about the slot number of the
	// bridge and or the device. For simplicity, just
	// query both of them back from qemu based on the arch
	device.GuestPciPath, err = q.arch.qomGetPciPath(device.ID, &q.qmpMonitorCh)
	return err
}
// hotAddNetDevice passes the tap and vhost file descriptors to QEMU via
// QMP getfd and then creates the corresponding "tap" netdev backend
// referencing them by name.
//
// NOTE(review): hardAddr is currently unused in this function; the MAC
// is applied when the frontend device is added by the caller.
func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error {
	var (
		VMFdNames    []string
		VhostFdNames []string
	)
	for i, VMFd := range VMFds {
		fdName := fmt.Sprintf("fd%d", i)
		if err := q.qmpMonitorCh.qmp.ExecuteGetFD(q.qmpMonitorCh.ctx, fdName, VMFd); err != nil {
			return err
		}
		VMFdNames = append(VMFdNames, fdName)
	}
	for i, VhostFd := range VhostFds {
		fdName := fmt.Sprintf("vhostfd%d", i)
		if err := q.qmpMonitorCh.qmp.ExecuteGetFD(q.qmpMonitorCh.ctx, fdName, VhostFd); err != nil {
			return err
		}
		// NOTE(review): vhost fds are closed here once duplicated into
		// QEMU, while the tap fds above are not — confirm the tap fds
		// are closed elsewhere by the caller.
		VhostFd.Close()
		VhostFdNames = append(VhostFdNames, fdName)
	}

	return q.qmpMonitorCh.qmp.ExecuteNetdevAddByFds(q.qmpMonitorCh.ctx, "tap", name, VMFdNames, VhostFdNames)
}
// hotplugNetDevice hot-plugs or hot-unplugs a network endpoint.
//
// Adding creates the netdev backend from the endpoint's tap/vhost fds,
// then attaches the frontend: on a PCIe root port for QemuVirt, as a
// CCW device on s390x machines, or on a PCI bridge slot otherwise.
// Removal undoes the same steps in reverse order.
func (q *qemu) hotplugNetDevice(ctx context.Context, endpoint Endpoint, op Operation) (err error) {
	if err = q.qmpSetup(); err != nil {
		return err
	}
	var tap TapInterface

	switch endpoint.Type() {
	case VethEndpointType, IPVlanEndpointType, MacvlanEndpointType, TuntapEndpointType:
		tap = endpoint.NetworkPair().TapInterface
	case TapEndpointType:
		drive := endpoint.(*TapEndpoint)
		tap = drive.TapInterface
	default:
		return fmt.Errorf("this endpoint is not supported")
	}

	devID := "virtio-" + tap.ID
	machineType := q.HypervisorConfig().HypervisorMachineType
	if op == AddDevice {
		if err = q.hotAddNetDevice(tap.Name, endpoint.HardwareAddr(), tap.VMFds, tap.VhostFds); err != nil {
			return err
		}

		// Tear the netdev backend down again if attaching the frontend
		// fails (reads the named return err).
		defer func() {
			if err != nil {
				q.qmpMonitorCh.qmp.ExecuteNetdevDel(q.qmpMonitorCh.ctx, tap.Name)
			}
		}()

		// Hotplug net dev to pcie root port for QemuVirt
		if machineType == QemuVirt {
			// Device:function starts at 0 and this is the only device
			// on the root port, so the address is fixed.
			addr := "00"

			bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevicesPerPort[config.RootPort]))
			dev := config.VFIODev{ID: devID}
			config.PCIeDevicesPerPort[config.RootPort] = append(config.PCIeDevicesPerPort[config.RootPort], dev)

			return q.qmpMonitorCh.qmp.ExecuteNetPCIDeviceAdd(q.qmpMonitorCh.ctx, tap.Name, devID, endpoint.HardwareAddr(), addr, bridgeID, romFile, int(q.config.NumVCPUs()), defaultDisableModern)
		}

		addr, bridge, err := q.arch.addDeviceToBridge(ctx, tap.ID, types.PCI)
		if err != nil {
			return err
		}

		// Release the bridge slot again on failure.
		defer func() {
			if err != nil {
				q.arch.removeDeviceFromBridge(tap.ID)
			}
		}()

		// Record the guest device path on the endpoint for the agent.
		q.arch.setEndpointDevicePath(endpoint, bridge.Addr, addr)

		var machine govmmQemu.Machine
		machine, err = q.getQemuMachine()
		if err != nil {
			return err
		}
		if machine.Type == QemuCCWVirtio {
			devNoHotplug := fmt.Sprintf("fe.%x.%v", bridge.Addr, addr)
			return q.qmpMonitorCh.qmp.ExecuteNetCCWDeviceAdd(q.qmpMonitorCh.ctx, tap.Name, devID, endpoint.HardwareAddr(), devNoHotplug, int(q.config.NumVCPUs()))
		}
		return q.qmpMonitorCh.qmp.ExecuteNetPCIDeviceAdd(q.qmpMonitorCh.ctx, tap.Name, devID, endpoint.HardwareAddr(), addr, bridge.ID, romFile, int(q.config.NumVCPUs()), defaultDisableModern)
	}

	// Removal: bridge slot first, then frontend device, then backend.
	if err := q.arch.removeDeviceFromBridge(tap.ID); err != nil {
		return err
	}
	if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil {
		return err
	}
	return q.qmpMonitorCh.qmp.ExecuteNetdevDel(q.qmpMonitorCh.ctx, tap.Name)
}
// hotplugDevice routes a hotplug request to the per-device-type handler.
// Only CPU and memory hotplug produce a meaningful result value; all
// other device types return nil data.
func (q *qemu) hotplugDevice(ctx context.Context, devInfo interface{}, devType DeviceType, op Operation) (interface{}, error) {
	switch devType {
	case BlockDev:
		return nil, q.hotplugBlockDevice(ctx, devInfo.(*config.BlockDrive), op)
	case CpuDev:
		return q.hotplugCPUs(devInfo.(uint32), op)
	case VfioDev:
		return nil, q.hotplugVFIODevice(ctx, devInfo.(*config.VFIODev), op)
	case MemoryDev:
		return q.hotplugMemory(devInfo.(*MemoryDevice), op)
	case NetDev:
		return nil, q.hotplugNetDevice(ctx, devInfo.(Endpoint), op)
	case VhostuserDev:
		return nil, q.hotplugVhostUserDevice(ctx, devInfo.(*config.VhostUserDeviceAttrs), op)
	default:
		return nil, fmt.Errorf("cannot hotplug device: unsupported device type '%v'", devType)
	}
}
// HotplugAddDevice hot-plugs devInfo of the given type into the running
// VM, returning any device-type specific result data.
func (q *qemu) HotplugAddDevice(ctx context.Context, devInfo interface{}, devType DeviceType) (interface{}, error) {
	span, ctx := katatrace.Trace(ctx, q.Logger(), "HotplugAddDevice", qemuTracingTags)
	katatrace.AddTags(span, "sandbox_id", q.id, "device", devInfo)
	defer span.End()

	return q.hotplugDevice(ctx, devInfo, devType, AddDevice)
}
// HotplugRemoveDevice hot-unplugs devInfo of the given type from the
// running VM, returning any device-type specific result data.
func (q *qemu) HotplugRemoveDevice(ctx context.Context, devInfo interface{}, devType DeviceType) (interface{}, error) {
	span, ctx := katatrace.Trace(ctx, q.Logger(), "HotplugRemoveDevice", qemuTracingTags)
	katatrace.AddTags(span, "sandbox_id", q.id, "device", devInfo)
	defer span.End()

	return q.hotplugDevice(ctx, devInfo, devType, RemoveDevice)
}
// hotplugCPUs adds or removes vcpus hotplugged vCPUs, returning how many
// were actually added or removed. A request for zero vCPUs is a no-op.
func (q *qemu) hotplugCPUs(vcpus uint32, op Operation) (uint32, error) {
	if vcpus == 0 {
		q.Logger().Warnf("cannot hotplug 0 vCPUs")
		return 0, nil
	}

	if err := q.qmpSetup(); err != nil {
		return 0, err
	}

	switch op {
	case AddDevice:
		return q.hotplugAddCPUs(vcpus)
	default:
		return q.hotplugRemoveCPUs(vcpus)
	}
}
// hotplugAddCPUs tries to hot add an amount of vCPUs and returns the
// number of vCPUs actually added. The request is clamped to
// DefaultMaxVCPUs (a warning is logged, not an error); individual
// device_add failures are skipped as long as other hotpluggable CPU
// slots remain.
func (q *qemu) hotplugAddCPUs(amount uint32) (uint32, error) {
	currentVCPUs := q.qemuConfig.SMP.CPUs + uint32(len(q.state.HotpluggedVCPUs))

	// Don't fail if the number of max vCPUs is exceeded, log a warning and hot add the vCPUs needed
	// to reach out max vCPUs
	if currentVCPUs+amount > q.config.DefaultMaxVCPUs {
		q.Logger().Warnf("Cannot hotplug %d CPUs, currently this SB has %d CPUs and the maximum amount of CPUs is %d",
			amount, currentVCPUs, q.config.DefaultMaxVCPUs)
		amount = q.config.DefaultMaxVCPUs - currentVCPUs
	}

	if amount == 0 {
		// Don't fail if no more vCPUs can be added, since cgroups still can be updated
		q.Logger().Warnf("maximum number of vCPUs '%d' has been reached", q.config.DefaultMaxVCPUs)
		return 0, nil
	}

	// get the list of hotpluggable CPUs
	hotpluggableVCPUs, err := q.qmpMonitorCh.qmp.ExecuteQueryHotpluggableCPUs(q.qmpMonitorCh.ctx)
	if err != nil {
		return 0, fmt.Errorf("failed to query hotpluggable CPUs: %v", err)
	}

	machine := q.arch.machine()

	var hotpluggedVCPUs uint32
	for _, hc := range hotpluggableVCPUs {
		// qom-path is the path to the CPU, non-empty means that this CPU is already in use
		if hc.QOMPath != "" {
			continue
		}

		// CPU type, i.e host-x86_64-cpu
		driver := hc.Type
		cpuID := fmt.Sprintf("cpu-%d", len(q.state.HotpluggedVCPUs))
		socketID := fmt.Sprintf("%d", hc.Properties.Socket)
		dieID := fmt.Sprintf("%d", hc.Properties.Die)
		coreID := fmt.Sprintf("%d", hc.Properties.Core)
		threadID := fmt.Sprintf("%d", hc.Properties.Thread)

		// If CPU type is IBM pSeries, Z or arm virt, we do not set socketID and threadID
		if machine.Type == "pseries" || machine.Type == QemuCCWVirtio || machine.Type == "virt" {
			socketID = ""
			threadID = ""
			dieID = ""
		}

		if err := q.qmpMonitorCh.qmp.ExecuteCPUDeviceAdd(q.qmpMonitorCh.ctx, driver, cpuID, socketID, dieID, coreID, threadID, romFile); err != nil {
			q.Logger().WithField("hotplug", "cpu").Warnf("qmp hotplug cpu, cpuID=%s socketID=%s, error: %v", cpuID, socketID, err)
			// don't fail, let's try with other CPU
			continue
		}

		// a new vCPU was added, update list of hotplugged vCPUs and Check if all vCPUs were added
		q.state.HotpluggedVCPUs = append(q.state.HotpluggedVCPUs, hv.CPUDevice{ID: cpuID})
		hotpluggedVCPUs++
		if hotpluggedVCPUs == amount {
			// All vCPUs were hotplugged
			return amount, nil
		}
	}

	return hotpluggedVCPUs, fmt.Errorf("failed to hot add vCPUs: only %d vCPUs of %d were added", hotpluggedVCPUs, amount)
}
// hotplugRemoveCPUs tries to hot remove an amount of vCPUs; it returns the
// number of vCPUs actually removed.
func (q *qemu) hotplugRemoveCPUs(amount uint32) (uint32, error) {
	pluggedCount := uint32(len(q.state.HotpluggedVCPUs))

	// Only vCPUs that were hotplugged earlier can be removed.
	if amount > pluggedCount {
		return 0, fmt.Errorf("Unable to remove %d CPUs, currently there are only %d hotplugged CPUs", amount, pluggedCount)
	}

	for removed := uint32(0); removed < amount; removed++ {
		// Always unplug from the tail of the hotplugged list.
		last := len(q.state.HotpluggedVCPUs) - 1
		vcpu := q.state.HotpluggedVCPUs[last]
		if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, vcpu.ID); err != nil {
			return removed, fmt.Errorf("failed to hotunplug CPUs, only %d CPUs were hotunplugged: %v", removed, err)
		}

		// Drop the unplugged vCPU from the state bookkeeping.
		q.state.HotpluggedVCPUs = q.state.HotpluggedVCPUs[:last]
	}

	return amount, nil
}
// hotplugMemory adds (or, for RemoveDevice, refuses to remove) guest memory
// at runtime via QMP.
//
// memDev describes the requested size in MiB; op selects add vs. remove.
// Returns the number of MiB actually hotplugged. Hot-remove is not
// supported and deliberately succeeds with 0 so cgroup-only updates can
// still proceed.
func (q *qemu) hotplugMemory(memDev *MemoryDevice, op Operation) (int, error) {
	if !q.arch.supportGuestMemoryHotplug() {
		return 0, noGuestMemHotplugErr
	}
	if memDev.SizeMB < 0 {
		return 0, fmt.Errorf("cannot hotplug negative size (%d) memory", memDev.SizeMB)
	}
	memLog := q.Logger().WithField("hotplug", "memory")

	memLog.WithField("hotplug-memory-mb", memDev.SizeMB).Debug("requested memory hotplug")
	if err := q.qmpSetup(); err != nil {
		return 0, err
	}

	// A zero-size request is a no-op, not an error.
	if memDev.SizeMB == 0 {
		memLog.Debug("hotplug is not required")
		return 0, nil
	}

	switch op {
	case RemoveDevice:
		memLog.WithField("operation", "remove").Debugf("Requested to remove memory: %d MB", memDev.SizeMB)
		// Dont fail but warn that this is not supported.
		memLog.Warn("hot-remove VM memory not supported")
		return 0, nil
	case AddDevice:
		memLog.WithField("operation", "add").Debugf("Requested to add memory: %d MB", memDev.SizeMB)
		// hotplugAddMemory already returns (memoryAdded, err); forwarding
		// its result directly avoids the redundant error re-check the
		// previous code performed.
		return q.hotplugAddMemory(memDev)
	default:
		return 0, fmt.Errorf("invalid operation %v", op)
	}
}
// hotplugAddMemory hotplugs a memory device of memDev.SizeMB MiB into the
// running guest via QMP and updates the hotplugged-memory accounting on
// success. Returns the number of MiB added, or an error.
func (q *qemu) hotplugAddMemory(memDev *MemoryDevice) (int, error) {
	// Query the existing memory devices so the new one gets the next free slot.
	memoryDevices, err := q.qmpMonitorCh.qmp.ExecQueryMemoryDevices(q.qmpMonitorCh.ctx)
	if err != nil {
		return 0, fmt.Errorf("failed to query memory devices: %v", err)
	}
	if len(memoryDevices) != 0 {
		// Pick one past the highest slot currently in use.
		maxSlot := -1
		for _, device := range memoryDevices {
			if maxSlot < device.Data.Slot {
				maxSlot = device.Data.Slot
			}
		}
		memDev.Slot = maxSlot + 1
	}
	// Memory backend parameters (file/memfd backing, share flag, target).
	share, target, memoryBack, err := q.getMemArgs()
	if err != nil {
		return 0, err
	}
	err = q.qmpMonitorCh.qmp.ExecHotplugMemory(q.qmpMonitorCh.ctx, memoryBack, "mem"+strconv.Itoa(memDev.Slot), target, memDev.SizeMB, share)
	if err != nil {
		q.Logger().WithError(err).Error("hotplug memory")
		return 0, err
	}
	// if guest kernel only supports memory hotplug via probe interface, we need to get address of hot-add memory device
	if memDev.Probe {
		memoryDevices, err := q.qmpMonitorCh.qmp.ExecQueryMemoryDevices(q.qmpMonitorCh.ctx)
		if err != nil {
			return 0, fmt.Errorf("failed to query memory devices: %v", err)
		}
		if len(memoryDevices) != 0 {
			// The device just added is expected to be the last one listed —
			// NOTE(review): assumes QMP returns devices in creation order; verify.
			q.Logger().WithField("addr", fmt.Sprintf("0x%x", memoryDevices[len(memoryDevices)-1].Data.Addr)).Debug("recently hot-add memory device")
			memDev.Addr = memoryDevices[len(memoryDevices)-1].Data.Addr
		} else {
			return 0, fmt.Errorf("failed to probe address of recently hot-add memory device, no device exists")
		}
	}
	q.state.HotpluggedMemory += memDev.SizeMB
	return memDev.SizeMB, nil
}
// PauseVM suspends the sandbox VM by toggling its pause state on.
func (q *qemu) PauseVM(ctx context.Context) error {
	span, spanCtx := katatrace.Trace(ctx, q.Logger(), "PauseVM", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	return q.togglePauseSandbox(spanCtx, true)
}
// ResumeVM resumes a previously paused sandbox VM.
func (q *qemu) ResumeVM(ctx context.Context) error {
	span, spanCtx := katatrace.Trace(ctx, q.Logger(), "ResumeVM", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	return q.togglePauseSandbox(spanCtx, false)
}
// AddDevice will add extra devices to Qemu command line.
// It dispatches on the concrete type of devInfo; each arm appends the
// device to q.qemuConfig.Devices through the arch backend. Returns the
// first error produced by the chosen append path, or nil.
func (q *qemu) AddDevice(ctx context.Context, devInfo interface{}, devType DeviceType) error {
	var err error
	span, _ := katatrace.Trace(ctx, q.Logger(), "AddDevice", qemuTracingTags)
	katatrace.AddTags(span, "sandbox_id", q.id, "device", devInfo)
	defer span.End()

	switch v := devInfo.(type) {
	case types.Volume:
		// Shared volumes go through vhost-user-fs (virtio-fs/nydus) when
		// configured, otherwise fall back to virtio-9p.
		if q.config.SharedFS == config.VirtioFS || q.config.SharedFS == config.VirtioFSNydus {
			q.Logger().WithField("volume-type", "virtio-fs").Info("adding volume")

			var randBytes []byte
			randBytes, err = utils.GenerateRandomBytes(8)
			if err != nil {
				return err
			}
			// Random 16-hex-char device ID for the vhost-user-fs device.
			id := hex.EncodeToString(randBytes)

			var sockPath string
			sockPath, err = q.vhostFSSocketPath(q.id)
			if err != nil {
				return err
			}

			vhostDev := config.VhostUserDeviceAttrs{
				Tag:       v.MountTag,
				Type:      config.VhostUserFS,
				CacheSize: q.config.VirtioFSCacheSize,
				Cache:     q.config.VirtioFSCache,
				QueueSize: q.config.VirtioFSQueueSize,
			}
			vhostDev.SocketPath = sockPath
			vhostDev.DevID = id

			q.qemuConfig.Devices, err = q.arch.appendVhostUserDevice(ctx, q.qemuConfig.Devices, vhostDev)
		} else {
			q.Logger().WithField("volume-type", "virtio-9p").Info("adding volume")
			q.qemuConfig.Devices, err = q.arch.append9PVolume(ctx, q.qemuConfig.Devices, v)
		}
	case types.Socket:
		q.qemuConfig.Devices = q.arch.appendSocket(q.qemuConfig.Devices, v)
	case types.VSock:
		// Keep the vhost fd open so it stays valid until QEMU is launched;
		// it is closed later in Cleanup.
		q.fds = append(q.fds, v.VhostFd)
		q.qemuConfig.Devices, err = q.arch.appendVSock(ctx, q.qemuConfig.Devices, v)
	case Endpoint:
		q.qemuConfig.Devices, err = q.arch.appendNetwork(ctx, q.qemuConfig.Devices, v)
	case config.BlockDrive:
		q.qemuConfig.Devices, err = q.arch.appendBlockDevice(ctx, q.qemuConfig.Devices, v)
	case config.VhostUserDeviceAttrs:
		q.qemuConfig.Devices, err = q.arch.appendVhostUserDevice(ctx, q.qemuConfig.Devices, v)
	case config.VFIODev:
		q.qemuConfig.Devices = q.arch.appendVFIODevice(q.qemuConfig.Devices, v)
	default:
		// Unknown device types are logged, not fatal.
		q.Logger().WithField("dev-type", v).Warn("Could not append device: unsupported device type")
	}

	return err
}
// GetVMConsole builds the path of the console where we can read logs coming
// from the sandbox.
func (q *qemu) GetVMConsole(ctx context.Context, id string) (string, string, error) {
	span, _ := katatrace.Trace(ctx, q.Logger(), "GetVMConsole", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	// The console is exposed over a unix socket under the VM store dir.
	url, err := utils.BuildSocketPath(q.config.VMStorePath, id, consoleSocket)
	if err != nil {
		return consoleProtoUnix, "", err
	}

	return consoleProtoUnix, url, nil
}
// SaveVM serializes the running VM's device state to
// q.config.DevicesStatePath using QMP "exec:" migration, then waits for
// the migration to complete.
func (q *qemu) SaveVM() error {
	q.Logger().Info("Save sandbox")

	if err := q.qmpSetup(); err != nil {
		return err
	}

	// BootToBeTemplate sets the VM to be a template that other VMs can clone from. We would want to
	// bypass shared memory when saving the VM to a local file through migration exec.
	if q.config.BootToBeTemplate {
		if err := q.arch.setIgnoreSharedMemoryMigrationCaps(q.qmpMonitorCh.ctx, q.qmpMonitorCh.qmp); err != nil {
			q.Logger().WithError(err).Error("set migration ignore shared memory")
			return err
		}
	}

	// Stream the VM state through "exec:cat > <DevicesStatePath>".
	migrateDest := fmt.Sprintf("%s>%s", qmpExecCatCmd, q.config.DevicesStatePath)
	if err := q.qmpMonitorCh.qmp.ExecSetMigrateArguments(q.qmpMonitorCh.ctx, migrateDest); err != nil {
		q.Logger().WithError(err).Error("exec migration")
		return err
	}

	return q.waitMigration()
}
// waitMigration polls QMP until the outgoing migration reaches the
// "completed" state, or fails after qmpMigrationWaitTimeout.
func (q *qemu) waitMigration() error {
	t := time.NewTimer(qmpMigrationWaitTimeout)
	defer t.Stop()
	for {
		status, err := q.qmpMonitorCh.qmp.ExecuteQueryMigration(q.qmpMonitorCh.ctx)
		if err != nil {
			q.Logger().WithError(err).Error("failed to query migration status")
			return err
		}
		if status.Status == "completed" {
			break
		}

		select {
		case <-t.C:
			q.Logger().WithField("migration-status", status).Error("timeout waiting for qemu migration")
			// qmpMigrationWaitTimeout is a time.Duration, so %v prints it in
			// human-readable units. The previous "%d seconds" formatted the
			// raw nanosecond count while claiming it was seconds.
			return fmt.Errorf("timed out after %v waiting for qemu migration", qmpMigrationWaitTimeout)
		default:
			// migration in progress
			q.Logger().WithField("migration-status", status).Debug("migration in progress")
			time.Sleep(100 * time.Millisecond)
		}
	}

	return nil
}
// Disconnect tears down the QMP connection; it will be re-established
// lazily by the next qmpSetup call.
func (q *qemu) Disconnect(ctx context.Context) {
	span, _ := katatrace.Trace(ctx, q.Logger(), "Disconnect", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	q.qmpShutdown()
}
// GetTotalMemoryMB returns the VM's current memory in MiB: the configured
// base size plus everything hotplugged so far.
func (q *qemu) GetTotalMemoryMB(ctx context.Context) uint32 {
	hotplugged := uint32(q.state.HotpluggedMemory)
	return q.config.MemorySize + hotplugged
}
// ResizeMemory gets a request to update the VM memory to reqMemMB.
// Memory update is managed with two approaches:
//
// Adding memory is done by hotplugging a memory device (or by resizing
// the virtio-mem device when that is enabled).
//
// Removing memory is slow, cannot be guaranteed, and has coarse
// granularity: at least one memory slot/section must be removed at a
// time. Returning memory to the host is therefore handled by resizing
// the VM memory balloon at a higher layer; here hot-unplug is reported
// as unsupported.
// A longer term solution is to evaluate solutions like virtio-mem.
//
// Returns the (updated) current VM memory in MiB and the memory device
// used, so the caller can verify the resulting state.
func (q *qemu) ResizeMemory(ctx context.Context, reqMemMB uint32, memoryBlockSizeMB uint32, probe bool) (uint32, MemoryDevice, error) {

	currentMemory := q.GetTotalMemoryMB(ctx)
	if err := q.qmpSetup(); err != nil {
		return 0, MemoryDevice{}, err
	}
	var addMemDevice MemoryDevice
	if q.config.VirtioMem && currentMemory != reqMemMB {
		q.Logger().WithField("hotplug", "memory").Debugf("resize memory from %dMB to %dMB", currentMemory, reqMemMB)
		// NOTE(review): reqMemMB - q.config.MemorySize is unsigned
		// arithmetic; if a caller ever requests less than the configured
		// base memory this wraps to a huge value. Presumably callers
		// guarantee reqMemMB >= MemorySize — confirm.
		sizeByte := uint64(reqMemMB - q.config.MemorySize)
		sizeByte = sizeByte * 1024 * 1024
		// virtio-mem path: a single QOM property write resizes the device.
		err := q.qmpMonitorCh.qmp.ExecQomSet(q.qmpMonitorCh.ctx, "virtiomem0", "requested-size", sizeByte)
		if err != nil {
			return 0, MemoryDevice{}, err
		}
		q.state.HotpluggedMemory = int(sizeByte / 1024 / 1024)
		return reqMemMB, MemoryDevice{}, nil
	}

	switch {
	case currentMemory < reqMemMB:
		//hotplug
		addMemMB := reqMemMB - currentMemory
		// Clamp the request so we never exceed the configured maximum.
		if currentMemory+addMemMB > uint32(q.config.DefaultMaxMemorySize) {
			addMemMB = uint32(q.config.DefaultMaxMemorySize) - currentMemory
		}

		// Round up to a whole number of guest memory sections.
		memHotplugMB, err := calcHotplugMemMiBSize(addMemMB, memoryBlockSizeMB)
		if err != nil {
			return currentMemory, MemoryDevice{}, err
		}

		addMemDevice.SizeMB = int(memHotplugMB)
		addMemDevice.Probe = probe

		data, err := q.HotplugAddDevice(ctx, &addMemDevice, MemoryDev)
		if err != nil {
			return currentMemory, addMemDevice, err
		}
		memoryAdded, ok := data.(int)
		if !ok {
			return currentMemory, addMemDevice, fmt.Errorf("Could not get the memory added, got %+v", data)
		}
		currentMemory += uint32(memoryAdded)
	case currentMemory > reqMemMB:
		//hotunplug
		addMemMB := currentMemory - reqMemMB
		memHotunplugMB, err := calcHotplugMemMiBSize(addMemMB, memoryBlockSizeMB)
		if err != nil {
			return currentMemory, MemoryDevice{}, err
		}

		addMemDevice.SizeMB = int(memHotunplugMB)
		addMemDevice.Probe = probe

		data, err := q.HotplugRemoveDevice(ctx, &addMemDevice, MemoryDev)
		if err != nil {
			return currentMemory, addMemDevice, err
		}
		memoryRemoved, ok := data.(int)
		if !ok {
			return currentMemory, addMemDevice, fmt.Errorf("Could not get the memory removed, got %+v", data)
		}
		//FIXME: This is to Check memory HotplugRemoveDevice reported 0, as this is not supported.
		// In the future if this is implemented this validation should be removed.
		if memoryRemoved != 0 {
			return currentMemory, addMemDevice, fmt.Errorf("memory hot unplug is not supported, something went wrong")
		}
		currentMemory -= uint32(memoryRemoved)
	}

	// currentMemory is the current memory (updated) of the VM, return to caller to allow verify
	// the current VM memory state.
	return currentMemory, addMemDevice, nil
}
// genericAppendBridges appends to devices the given bridges
// nolint: unused, deadcode
func genericAppendBridges(devices []govmmQemu.Device, bridges []types.Bridge, machineType string) []govmmQemu.Device {
	busName := defaultPCBridgeBus
	if machineType == QemuQ35 || machineType == QemuVirt {
		busName = defaultBridgeBus
	}

	for i := range bridges {
		bridgeType := govmmQemu.PCIBridge
		if bridges[i].Type == types.PCIE {
			bridgeType = govmmQemu.PCIEBridge
		}
		// CCW (s390x) buses have no PCI bridge devices to emit.
		if bridges[i].Type == types.CCW {
			continue
		}

		bridges[i].Addr = bridgePCIStartAddr + i

		devices = append(devices,
			govmmQemu.BridgeDevice{
				Type: bridgeType,
				Bus:  busName,
				ID:   bridges[i].ID,
				// Each bridge is required to be assigned a unique chassis id > 0
				Chassis: i + 1,
				SHPC:    false,
				Addr:    strconv.FormatInt(int64(bridges[i].Addr), 10),
				// Certain guest BIOS versions think !SHPC means no hotplug,
				// and won't reserve the IO and memory windows that will be
				// needed for devices added underneath this bridge. This will
				// only break for certain combinations of exact qemu, BIOS
				// and guest kernel versions, but for consistency, just hint
				// the usual default windows for a bridge (as the BIOS would
				// use with SHPC) so that we can do ACPI hotplug.
				IOReserve:     "4k",
				MemReserve:    "1m",
				Pref64Reserve: "1m",
			},
		)
	}

	return devices
}
// genericBridges creates `number` empty bridges of the type appropriate
// for the given machine type; unknown machine types yield nil.
func genericBridges(number uint32, machineType string) []types.Bridge {
	var bt types.Type

	switch machineType {
	case QemuQ35, QemuVirt, QemuPseries:
		// currently only pci bridges are supported
		// qemu-2.10 will introduce pcie bridges
		bt = types.PCI
	case QemuCCWVirtio:
		bt = types.CCW
	default:
		return nil
	}

	var bridges []types.Bridge
	for i := uint32(0); i < number; i++ {
		name := fmt.Sprintf("%s-bridge-%d", bt, i)
		bridges = append(bridges, types.NewBridge(bt, name, make(map[uint32]string), 0))
	}

	return bridges
}
// nolint: unused, deadcode
// genericMemoryTopology builds the QEMU memory topology: initial size,
// hotplug slot count and the maximum (host memory plus offset).
func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOffset uint64) govmmQemu.Memory {
	// image NVDIMM device needs memory space 1024MB
	// See https://github.com/clearcontainers/runtime/issues/380
	memoryOffset += 1024

	return govmmQemu.Memory{
		Size:   fmt.Sprintf("%dM", memoryMb),
		Slots:  slots,
		MaxMem: fmt.Sprintf("%dM", hostMemoryMb+memoryOffset),
	}
}
// genericAppendPCIeRootPort appends to devices the given pcie-root-port
// count; only machine types with native PCIe support (Q35, Virt) get
// root ports, all others are returned unchanged.
func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
	if machineType != QemuQ35 && machineType != QemuVirt {
		return devices
	}

	for i := uint32(0); i < number; i++ {
		port := govmmQemu.PCIeRootPortDevice{
			ID:            fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, i),
			Bus:           defaultBridgeBus,
			Chassis:       "0",
			Slot:          strconv.FormatUint(uint64(i), 10),
			Multifunction: false,
			Addr:          "0",
			// (Pref)MemReserve hints the firmware how much MMIO window to
			// reserve behind each port.
			MemReserve:    fmt.Sprintf("%dB", memSize32bit),
			Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
		}
		devices = append(devices, port)
	}

	return devices
}
// gollangci-lint enforces multi-line comments to be a block comment
// not multiple single line comments ...
/* pcie.0 bus
// -------------------------------------------------
// |
// -------------
// | Root Port |
// -------------
// -------------------------|------------------------
// | ----------------- |
// | PCI Express | Upstream Port | |
// | Switch ----------------- |
// | | | |
// | ------------------- ------------------- |
// | | Downstream Port | | Downstream Port | |
// | ------------------- ------------------- |
// -------------|-----------------------|------------
// ------------- --------------
// | GPU/ACCEL | | IB/ETH NIC |
// ------------- --------------
*/
// genericAppendPCIeSwitchPort adds a PCIe switch: one dedicated root
// port, one upstream port plugged into it, and `number` downstream ports
// hanging off the upstream port (see diagram above).
func genericAppendPCIeSwitchPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
	// Q35, Virt have the correct PCIe support,
	// hence ignore all other machines
	if machineType != QemuQ35 && machineType != QemuVirt {
		return devices
	}

	// Using an own ID for the root port, so we do not clash with already
	// existing root ports adding "s" for switch prefix
	rootPort := govmmQemu.PCIeRootPortDevice{
		ID:            fmt.Sprintf("%s%s%d", config.PCIeSwitchPortPrefix, config.PCIeRootPortPrefix, 0),
		Bus:           defaultBridgeBus,
		Chassis:       "1",
		Slot:          strconv.FormatUint(uint64(0), 10),
		Multifunction: false,
		Addr:          "0",
		MemReserve:    fmt.Sprintf("%dB", memSize32bit),
		Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
	}
	devices = append(devices, rootPort)

	upstreamPort := govmmQemu.PCIeSwitchUpstreamPortDevice{
		ID:  fmt.Sprintf("%s%d", config.PCIeSwitchUpstreamPortPrefix, 0),
		Bus: rootPort.ID,
	}
	devices = append(devices, upstreamPort)

	// Downstream ports need chassis IDs distinct from the root port's.
	chassisBase, err := strconv.Atoi(rootPort.Chassis)
	if err != nil {
		return devices
	}
	downstreamChassis := chassisBase + 1

	for i := uint32(0); i < number; i++ {
		devices = append(devices, govmmQemu.PCIeSwitchDownstreamPortDevice{
			ID:      fmt.Sprintf("%s%d", config.PCIeSwitchhDownstreamPortPrefix, i),
			Bus:     upstreamPort.ID,
			Chassis: fmt.Sprintf("%d", downstreamChassis),
			Slot:    strconv.FormatUint(uint64(i), 10),
			// TODO: MemReserve: fmt.Sprintf("%dB", memSize32bit),
			// TODO: Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
		})
	}

	return devices
}
// GetThreadIDs returns the host thread ID of each guest vCPU, keyed by
// vCPU index, as reported by QMP query-cpus-fast.
func (q *qemu) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
	span, _ := katatrace.Trace(ctx, q.Logger(), "GetThreadIDs", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	var tid VcpuThreadIDs
	if err := q.qmpSetup(); err != nil {
		return tid, err
	}

	cpuInfos, err := q.qmpMonitorCh.qmp.ExecQueryCpusFast(q.qmpMonitorCh.ctx)
	if err != nil {
		q.Logger().WithError(err).Error("failed to query cpu infos")
		return tid, err
	}

	// Entries without a positive thread ID are skipped.
	tid.vcpus = make(map[int]int, len(cpuInfos))
	for _, info := range cpuInfos {
		if info.ThreadID > 0 {
			tid.vcpus[info.CPUIndex] = info.ThreadID
		}
	}

	return tid, nil
}
// calcHotplugMemMiBSize rounds mem (MiB) up to the next multiple of
// memorySectionSizeMB, since memory can only be hotplugged in whole
// guest memory sections. A section size of 0 means no alignment is
// required and mem is returned unchanged.
func calcHotplugMemMiBSize(mem uint32, memorySectionSizeMB uint32) (uint32, error) {
	if memorySectionSizeMB == 0 {
		return mem, nil
	}

	// Integer round-up: exact for all uint32 inputs, unlike the previous
	// float64 math.Ceil round-trip.
	sections := (mem + memorySectionSizeMB - 1) / memorySectionSizeMB
	return sections * memorySectionSizeMB, nil
}
// ResizeVCPUs adjusts the number of online vCPUs towards reqVCPUs by
// hotplugging or hot-unplugging CPUs via QMP.
//
// Returns the vCPU count before the resize and the count after it; the
// latter may fall short of reqVCPUs if not every CPU could be (un)plugged.
func (q *qemu) ResizeVCPUs(ctx context.Context, reqVCPUs uint32) (currentVCPUs uint32, newVCPUs uint32, err error) {
	// Online vCPUs = cold-plugged (config) + previously hot-plugged.
	currentVCPUs = q.config.NumVCPUs() + uint32(len(q.state.HotpluggedVCPUs))
	newVCPUs = currentVCPUs
	switch {
	case currentVCPUs < reqVCPUs:
		//hotplug
		addCPUs := reqVCPUs - currentVCPUs
		data, err := q.HotplugAddDevice(ctx, addCPUs, CpuDev)
		if err != nil {
			return currentVCPUs, newVCPUs, err
		}
		// For CpuDev the interface{} result carries the number of vCPUs
		// actually added as a uint32.
		vCPUsAdded, ok := data.(uint32)
		if !ok {
			return currentVCPUs, newVCPUs, fmt.Errorf("Could not get the vCPUs added, got %+v", data)
		}
		newVCPUs += vCPUsAdded
	case currentVCPUs > reqVCPUs:
		//hotunplug
		removeCPUs := currentVCPUs - reqVCPUs
		data, err := q.HotplugRemoveDevice(ctx, removeCPUs, CpuDev)
		if err != nil {
			return currentVCPUs, newVCPUs, err
		}
		vCPUsRemoved, ok := data.(uint32)
		if !ok {
			return currentVCPUs, newVCPUs, fmt.Errorf("Could not get the vCPUs removed, got %+v", data)
		}
		newVCPUs -= vCPUsRemoved
	}
	return currentVCPUs, newVCPUs, nil
}
// Cleanup releases host resources held on QEMU's behalf — currently the
// file descriptors (e.g. vhost fds) collected in AddDevice.
func (q *qemu) Cleanup(ctx context.Context) error {
	span, _ := katatrace.Trace(ctx, q.Logger(), "Cleanup", qemuTracingTags, map[string]string{"sandbox_id": q.id})
	defer span.End()

	for _, f := range q.fds {
		if err := f.Close(); err != nil {
			q.Logger().WithError(err).Warn("failed closing fd")
		}
	}
	q.fds = []*os.File{}

	return nil
}
// GetPids returns the QEMU process PID (read from the pid file) followed
// by the virtiofs daemon PID when one is running. On any read/parse
// failure it returns []int{0}.
func (q *qemu) GetPids() []int {
	raw, err := os.ReadFile(q.qemuConfig.PidFile)
	if err != nil {
		q.Logger().WithError(err).Error("Could not read qemu pid file")
		return []int{0}
	}

	qemuPid, err := strconv.Atoi(strings.Trim(string(raw), "\n\t "))
	if err != nil {
		q.Logger().WithError(err).Error("Could not convert string to int")
		return []int{0}
	}

	pids := []int{qemuPid}
	if q.state.VirtiofsDaemonPid != 0 {
		pids = append(pids, q.state.VirtiofsDaemonPid)
	}

	return pids
}
// GetVirtioFsPid returns a pointer to the stored virtiofs daemon PID.
func (q *qemu) GetVirtioFsPid() *int {
	pid := &q.state.VirtiofsDaemonPid
	return pid
}
// qemuGrpc is the wire representation of a qemu instance exchanged
// between the VM Cache server and the runtime (see toGrpc/fromGrpc).
type qemuGrpc struct {
	ID             string    // sandbox ID
	QmpChannelpath string    // path of the QMP monitor socket
	State          QemuState // persisted hypervisor state
	NvdimmCount    int       // number of NVDIMM devices attached

	// Most members of q.qemuConfig are just to generate
	// q.qemuConfig.qemuParams that is used by LaunchQemu except
	// q.qemuConfig.SMP.
	// So just transport q.qemuConfig.SMP from VM Cache server to runtime.
	QemuSMP govmmQemu.SMP
}
// fromGrpc reconstructs this qemu instance from the JSON blob produced
// by toGrpc plus the caller-supplied hypervisor config. Used on the
// runtime side to adopt a VM pre-created by the VM Cache server.
func (q *qemu) fromGrpc(ctx context.Context, hypervisorConfig *HypervisorConfig, j []byte) error {
	var qp qemuGrpc
	err := json.Unmarshal(j, &qp)
	if err != nil {
		return err
	}

	q.id = qp.ID
	q.config = *hypervisorConfig
	q.qmpMonitorCh.ctx = ctx
	q.qmpMonitorCh.path = qp.QmpChannelpath
	q.qemuConfig.Ctx = ctx
	q.state = qp.State
	// The arch helper must exist before bridges are restored below.
	q.arch, err = newQemuArch(q.config)
	if err != nil {
		return err
	}
	q.ctx = ctx
	q.nvdimmCount = qp.NvdimmCount

	q.qemuConfig.SMP = qp.QemuSMP

	q.arch.setBridges(q.state.Bridges)
	return nil
}
// toGrpc serializes this qemu instance for hand-over to another process
// (VM Cache), shutting down local QMP and fds first.
func (q *qemu) toGrpc(ctx context.Context) ([]byte, error) {
	q.qmpShutdown()
	q.Cleanup(ctx)

	payload := qemuGrpc{
		ID:             q.id,
		QmpChannelpath: q.qmpMonitorCh.path,
		State:          q.state,
		NvdimmCount:    q.nvdimmCount,
		QemuSMP:        q.qemuConfig.SMP,
	}

	return json.Marshal(&payload)
}
// Save snapshots the hypervisor's runtime state into a serializable
// hv.HypervisorState so the sandbox can be persisted and later restored
// via Load. Returns the zero value when QEMU is not running.
func (q *qemu) Save() (s hv.HypervisorState) {

	// If QEMU isn't even running, there isn't any state to Save
	if atomic.LoadInt32(&q.stopped) != 0 {
		return
	}

	// GetPids()[0] is the QEMU process itself.
	pids := q.GetPids()
	if len(pids) != 0 {
		s.Pid = pids[0]
	}
	s.VirtiofsDaemonPid = q.state.VirtiofsDaemonPid
	s.Type = string(QemuHypervisor)
	s.UUID = q.state.UUID
	s.HotpluggedMemory = q.state.HotpluggedMemory

	// Persist bridge topology and hotplugged vCPUs so they can be
	// reconstructed by Load.
	for _, bridge := range q.arch.getBridges() {
		s.Bridges = append(s.Bridges, hv.Bridge{
			DeviceAddr: bridge.Devices,
			Type:       string(bridge.Type),
			ID:         bridge.ID,
			Addr:       bridge.Addr,
		})
	}

	for _, cpu := range q.state.HotpluggedVCPUs {
		s.HotpluggedVCPUs = append(s.HotpluggedVCPUs, hv.CPUDevice{
			ID: cpu.ID,
		})
	}
	return
}
// Load restores hypervisor state previously captured by Save.
func (q *qemu) Load(s hv.HypervisorState) {
	q.state.UUID = s.UUID
	q.state.HotpluggedMemory = s.HotpluggedMemory
	q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid

	// Rebuild bridge objects from their persisted form.
	for _, b := range s.Bridges {
		bridge := types.NewBridge(types.Type(b.Type), b.ID, b.DeviceAddr, b.Addr)
		q.state.Bridges = append(q.state.Bridges, bridge)
	}

	for _, vcpu := range s.HotpluggedVCPUs {
		q.state.HotpluggedVCPUs = append(q.state.HotpluggedVCPUs, hv.CPUDevice{ID: vcpu.ID})
	}
}
// Check probes the VM's health: it fails when QEMU has stopped or when
// QMP reports the guest in an unrecoverable state.
func (q *qemu) Check() error {
	if atomic.LoadInt32(&q.stopped) != 0 {
		return fmt.Errorf("qemu is not running")
	}

	// Serialize against an in-flight memory dump.
	q.memoryDumpFlag.Lock()
	defer q.memoryDumpFlag.Unlock()

	if err := q.qmpSetup(); err != nil {
		return err
	}

	status, err := q.qmpMonitorCh.qmp.ExecuteQueryStatus(q.qmpMonitorCh.ctx)
	if err != nil {
		return err
	}

	switch status.Status {
	case "internal-error", "guest-panicked":
		return errors.Errorf("guest failure: %s", status.Status)
	}

	return nil
}
// GenerateSocket builds the VM socket descriptor for the given sandbox id.
func (q *qemu) GenerateSocket(id string) (interface{}, error) {
	sock, err := generateVMSocket(id, q.config.VMStorePath)
	return sock, err
}
// IsRateLimiterBuiltin reports whether this hypervisor driver provides
// its own I/O rate limiter; the QEMU driver does not.
func (q *qemu) IsRateLimiterBuiltin() bool {
	return false
}