diff --git a/cli/config/configuration.toml.in b/cli/config/configuration.toml.in index a8df6c58ea..e9062928ce 100644 --- a/cli/config/configuration.toml.in +++ b/cli/config/configuration.toml.in @@ -82,6 +82,13 @@ default_memory = @DEFMEMSZ@ # This is will determine the times that memory will be hotadded to sandbox/VM. #memory_slots = @DEFMEMSLOTS@ +# The size in MiB that will be added to the max memory of the hypervisor. +# It is the memory address space for the NVDIMM device. +# If the block storage driver (block_device_driver) is set to "nvdimm", +# memory_offset should be set to the size of the block device. +# Default 0 +#memory_offset = 0 + # Disable block device from being used for a container's rootfs. # In case of a storage driver like devicemapper where a container's # root file system is backed by a block device, the block device is passed @@ -91,8 +98,8 @@ default_memory = @DEFMEMSZ@ disable_block_device_use = @DEFDISABLEBLOCK@ # Block storage driver to be used for the hypervisor in case the container -# rootfs is backed by a block device. This is either virtio-scsi or -# virtio-blk. +# rootfs is backed by a block device. This is one of virtio-scsi, virtio-blk +# or nvdimm. block_device_driver = "@DEFBLOCKSTORAGEDRIVER@" # Specifies cache-related options will be set to block devices or not. 
diff --git a/pkg/katautils/config-settings.go b/pkg/katautils/config-settings.go index 8dcf6ac370..b85efcb5f3 100644 --- a/pkg/katautils/config-settings.go +++ b/pkg/katautils/config-settings.go @@ -24,6 +24,7 @@ const defaultVCPUCount uint32 = 1 const defaultMaxVCPUCount uint32 = 0 const defaultMemSize uint32 = 2048 // MiB const defaultMemSlots uint32 = 10 +const defaultMemOffset uint32 = 0 // MiB const defaultBridgesCount uint32 = 1 const defaultInterNetworkingModel = "macvtap" const defaultDisableBlockDeviceUse bool = false diff --git a/pkg/katautils/config.go b/pkg/katautils/config.go index 446048a907..00912bdf09 100644 --- a/pkg/katautils/config.go +++ b/pkg/katautils/config.go @@ -98,6 +98,7 @@ type hypervisor struct { DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"` MemorySize uint32 `toml:"default_memory"` MemSlots uint32 `toml:"memory_slots"` + MemOffset uint32 `toml:"memory_offset"` DefaultBridges uint32 `toml:"default_bridges"` Msize9p uint32 `toml:"msize_9p"` DisableBlockDeviceUse bool `toml:"disable_block_device_use"` @@ -281,6 +282,15 @@ func (h hypervisor) defaultMemSlots() uint32 { return slots } +func (h hypervisor) defaultMemOffset() uint32 { + offset := h.MemOffset + if offset == 0 { + offset = defaultMemOffset + } + + return offset +} + func (h hypervisor) defaultBridges() uint32 { if h.DefaultBridges == 0 { return defaultBridgesCount @@ -294,7 +304,7 @@ func (h hypervisor) defaultBridges() uint32 { } func (h hypervisor) blockDeviceDriver() (string, error) { - supportedBlockDrivers := []string{config.VirtioSCSI, config.VirtioBlock, config.VirtioMmio} + supportedBlockDrivers := []string{config.VirtioSCSI, config.VirtioBlock, config.VirtioMmio, config.Nvdimm} if h.BlockDeviceDriver == "" { return defaultBlockDeviceDriver, nil @@ -514,6 +524,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { DefaultMaxVCPUs: h.defaultMaxVCPUs(), MemorySize: h.defaultMemSz(), MemSlots: h.defaultMemSlots(), + MemOffset: 
h.defaultMemOffset(), EntropySource: h.GetEntropySource(), DefaultBridges: h.defaultBridges(), DisableBlockDeviceUse: h.DisableBlockDeviceUse, @@ -677,6 +688,7 @@ func initConfig() (config oci.RuntimeConfig, err error) { NumVCPUs: defaultVCPUCount, DefaultMaxVCPUs: defaultMaxVCPUCount, MemorySize: defaultMemSize, + MemOffset: defaultMemOffset, DefaultBridges: defaultBridgesCount, MemPrealloc: defaultEnableMemPrealloc, HugePages: defaultEnableHugePages, diff --git a/virtcontainers/device/config/config.go b/virtcontainers/device/config/config.go index 704160920d..13eaeff352 100644 --- a/virtcontainers/device/config/config.go +++ b/virtcontainers/device/config/config.go @@ -47,6 +47,9 @@ const ( // VirtioSCSI means use virtio-scsi for hotplugging drives VirtioSCSI = "virtio-scsi" + + // Nvdimm means use nvdimm for hotplugging drives + Nvdimm = "nvdimm" ) // Defining these as a variable instead of a const, to allow @@ -119,6 +122,9 @@ type BlockDrive struct { // SCSI address is in the format SCSI-Id:LUN SCSIAddr string + // NvdimmID is the nvdimm id inside the VM + NvdimmID string + // VirtPath at which the device appears inside the VM, outside of the container mount namespace VirtPath string } diff --git a/virtcontainers/device/drivers/block.go b/virtcontainers/device/drivers/block.go index ba0592b729..b977e30e8a 100644 --- a/virtcontainers/device/drivers/block.go +++ b/virtcontainers/device/drivers/block.go @@ -78,7 +78,7 @@ func (device *BlockDevice) Attach(devReceiver api.DeviceReceiver) (err error) { } drive.SCSIAddr = scsiAddr - } else { + } else if customOptions["block-driver"] != "nvdimm" { var globalIdx int switch customOptions["block-driver"] { @@ -102,7 +102,7 @@ func (device *BlockDevice) Attach(devReceiver api.DeviceReceiver) (err error) { drive.VirtPath = filepath.Join("/dev", driveName) } - deviceLogger().WithField("device", device.DeviceInfo.HostPath).Info("Attaching block device") + deviceLogger().WithField("device", 
device.DeviceInfo.HostPath).WithField("VirtPath", drive.VirtPath).Infof("Attaching %s device", customOptions["block-driver"]) device.BlockDrive = drive if err = devReceiver.HotplugAddDevice(device, config.DeviceBlock); err != nil { return err diff --git a/virtcontainers/device/manager/manager.go b/virtcontainers/device/manager/manager.go index 066c30581c..9086fa455b 100644 --- a/virtcontainers/device/manager/manager.go +++ b/virtcontainers/device/manager/manager.go @@ -26,6 +26,8 @@ const ( VirtioBlock string = "virtio-blk" // VirtioSCSI indicates block driver is virtio-scsi based VirtioSCSI string = "virtio-scsi" + // Nvdimm indicates block driver is nvdimm based + Nvdimm string = "nvdimm" ) var ( @@ -61,6 +63,8 @@ func NewDeviceManager(blockDriver string, devices []api.Device) api.DeviceManage dm.blockDriver = VirtioMmio } else if blockDriver == VirtioBlock { dm.blockDriver = VirtioBlock + } else if blockDriver == Nvdimm { + dm.blockDriver = Nvdimm } else { dm.blockDriver = VirtioSCSI } diff --git a/virtcontainers/hypervisor.go b/virtcontainers/hypervisor.go index b218b01694..bff95df32f 100644 --- a/virtcontainers/hypervisor.go +++ b/virtcontainers/hypervisor.go @@ -169,6 +169,9 @@ type HypervisorConfig struct { // MemSlots specifies default memory slots the VM. MemSlots uint32 + // MemOffset specifies memory space for nvdimm device + MemOffset uint32 + // KernelParams are additional guest kernel parameters. 
KernelParams []Param diff --git a/virtcontainers/kata_agent.go b/virtcontainers/kata_agent.go index 18a508cc86..95c0a4b453 100644 --- a/virtcontainers/kata_agent.go +++ b/virtcontainers/kata_agent.go @@ -62,6 +62,7 @@ var ( kataMmioBlkDevType = "mmioblk" kataBlkDevType = "blk" kataSCSIDevType = "scsi" + kataNvdimmDevType = "nvdimm" sharedDir9pOptions = []string{"trans=virtio,version=9p2000.L,cache=mmap", "nodev"} shmDir = "shm" kataEphemeralDevType = "ephemeral" @@ -883,6 +884,9 @@ func (k *kataAgent) appendDevices(deviceList []*grpc.Device, c *Container) []*gr case config.VirtioSCSI: kataDevice.Type = kataSCSIDevType kataDevice.Id = d.SCSIAddr + case config.Nvdimm: + kataDevice.Type = kataNvdimmDevType + kataDevice.VmPath = fmt.Sprintf("/dev/pmem%s", d.NvdimmID) } deviceList = append(deviceList, kataDevice) diff --git a/virtcontainers/qemu.go b/virtcontainers/qemu.go index c02ecc5475..9c79c20b69 100644 --- a/virtcontainers/qemu.go +++ b/virtcontainers/qemu.go @@ -8,20 +8,22 @@ package virtcontainers import ( "context" "fmt" + govmmQemu "github.com/intel/govmm/qemu" + "github.com/kata-containers/runtime/virtcontainers/pkg/uuid" + "github.com/opentracing/opentracing-go" + "github.com/sirupsen/logrus" "math" "os" "path/filepath" "strconv" "strings" + "syscall" "time" - - govmmQemu "github.com/intel/govmm/qemu" - "github.com/kata-containers/runtime/virtcontainers/pkg/uuid" - opentracing "github.com/opentracing/opentracing-go" - "github.com/sirupsen/logrus" + "unsafe" "github.com/kata-containers/runtime/virtcontainers/device/config" "github.com/kata-containers/runtime/virtcontainers/utils" + "golang.org/x/sys/unix" ) // romFile is the file name of the ROM that can be used for virtio-pci devices. 
@@ -73,6 +75,8 @@ type qemu struct {
 	fds []*os.File
 
 	ctx context.Context
+
+	nvdimmCount int
 }
 
 const (
@@ -221,6 +225,20 @@ func (q *qemu) init(ctx context.Context, id string, hypervisorConfig *Hypervisor
 	q.config = *hypervisorConfig
 	q.arch = newQemuArch(q.config)
 
+	initrdPath, err := q.config.InitrdAssetPath()
+	if err != nil {
+		return err
+	}
+	imagePath, err := q.config.ImageAssetPath()
+	if err != nil {
+		return err
+	}
+	if initrdPath == "" && imagePath != "" {
+		q.nvdimmCount = 1
+	} else {
+		q.nvdimmCount = 0
+	}
+
 	if err = q.storage.fetchHypervisorState(q.id, &q.state); err != nil {
 		q.Logger().Debug("Creating bridges")
 		q.state.Bridges = q.arch.bridges(q.config.DefaultBridges)
@@ -727,6 +745,81 @@ func (q *qemu) removeDeviceFromBridge(ID string) error {
 	return err
 }
 
+// hotplugAddBlockDevice hot-adds drive to the VM through the QMP monitor.
+//
+// When the configured block driver is nvdimm, the size of drive.File is read
+// with the BLKGETSIZE64 ioctl, the file is added as an NVDIMM device of that
+// size, and drive.NvdimmID is set from q.nvdimmCount, which is then
+// incremented. Otherwise a blockdev is added (honouring the cache settings)
+// and attached as virtio-blk-pci behind a bridge, recording drive.PCIAddr, or,
+// for any other driver, as a scsi-hd device. op is currently unused.
+func (q *qemu) hotplugAddBlockDevice(drive *config.BlockDrive, op operation, devID string) error {
+	var err error
+
+	if q.config.BlockDeviceDriver == config.Nvdimm {
+		var blocksize int64
+		file, err := os.Open(drive.File)
+		if err != nil {
+			return err
+		}
+		// The descriptor is only needed to query the device size; release
+		// it on every return path so repeated hotplugs do not leak fds.
+		defer file.Close()
+
+		if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, file.Fd(), unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&blocksize))); errno != 0 {
+			return errno
+		}
+		if err = q.qmpMonitorCh.qmp.ExecuteNVDIMMDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, drive.File, blocksize); err != nil {
+			q.Logger().WithError(err).Errorf("Failed to add NVDIMM device %s", drive.File)
+			return err
+		}
+		drive.NvdimmID = strconv.Itoa(q.nvdimmCount)
+		q.nvdimmCount++
+		return nil
+	}
+
+	if q.config.BlockDeviceCacheSet {
+		err = q.qmpMonitorCh.qmp.ExecuteBlockdevAddWithCache(q.qmpMonitorCh.ctx, drive.File, drive.ID, q.config.BlockDeviceCacheDirect, q.config.BlockDeviceCacheNoflush)
+	} else {
+		err = q.qmpMonitorCh.qmp.ExecuteBlockdevAdd(q.qmpMonitorCh.ctx, drive.File, drive.ID)
+	}
+	if err != nil {
+		return err
+	}
+
+	if q.config.BlockDeviceDriver == config.VirtioBlock {
+		driver := "virtio-blk-pci"
+		addr, bridge, err := q.addDeviceToBridge(drive.ID)
+		if err != nil {
+			return err
+		}
+
+		// PCI address is in the format bridge-addr/device-addr eg. "03/02"
+		drive.PCIAddr = fmt.Sprintf("%02x", bridge.Addr) + "/" + addr
+
+		if err = q.qmpMonitorCh.qmp.ExecutePCIDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, addr, bridge.ID, romFile, true, q.arch.runNested()); err != nil {
+			return err
+		}
+	} else {
+		driver := "scsi-hd"
+
+		// Bus exposed by the SCSI Controller
+		bus := scsiControllerID + ".0"
+
+		// Get SCSI-id and LUN based on the order of attaching drives.
+		scsiID, lun, err := utils.GetSCSIIdLun(drive.Index)
+		if err != nil {
+			return err
+		}
+
+		if err = q.qmpMonitorCh.qmp.ExecuteSCSIDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, bus, romFile, scsiID, lun, true, q.arch.runNested()); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
 func (q *qemu) hotplugBlockDevice(drive *config.BlockDrive, op operation) error {
 	err := q.qmpSetup()
 	if err != nil {
@@ -736,44 +829,7 @@ func (q *qemu) hotplugBlockDevice(drive *config.BlockDrive, op operation) error
 	devID := "virtio-" + drive.ID
 
 	if op == addDevice {
-		if q.config.BlockDeviceCacheSet {
-			err = q.qmpMonitorCh.qmp.ExecuteBlockdevAddWithCache(q.qmpMonitorCh.ctx, drive.File, drive.ID, q.config.BlockDeviceCacheDirect, q.config.BlockDeviceCacheNoflush)
-		} else {
-			err = q.qmpMonitorCh.qmp.ExecuteBlockdevAdd(q.qmpMonitorCh.ctx, drive.File, drive.ID)
-		}
-		if err != nil {
-			return err
-		}
-
-		if q.config.BlockDeviceDriver == config.VirtioBlock {
-			driver := "virtio-blk-pci"
-			addr, bridge, err := q.addDeviceToBridge(drive.ID)
-			if err != nil {
-				return err
-			}
-
-			// PCI address is in the format bridge-addr/device-addr eg. 
"03/02" - drive.PCIAddr = fmt.Sprintf("%02x", bridge.Addr) + "/" + addr - - if err = q.qmpMonitorCh.qmp.ExecutePCIDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, addr, bridge.ID, romFile, true, q.arch.runNested()); err != nil { - return err - } - } else { - driver := "scsi-hd" - - // Bus exposed by the SCSI Controller - bus := scsiControllerID + ".0" - - // Get SCSI-id and LUN based on the order of attaching drives. - scsiID, lun, err := utils.GetSCSIIdLun(drive.Index) - if err != nil { - return err - } - - if err = q.qmpMonitorCh.qmp.ExecuteSCSIDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, bus, romFile, scsiID, lun, true, q.arch.runNested()); err != nil { - return err - } - } + err = q.hotplugAddBlockDevice(drive, op, devID) } else { if q.config.BlockDeviceDriver == config.VirtioBlock { if err := q.removeDeviceFromBridge(drive.ID); err != nil { @@ -790,7 +834,7 @@ func (q *qemu) hotplugBlockDevice(drive *config.BlockDrive, op operation) error } } - return nil + return err } func (q *qemu) hotplugVFIODevice(device *config.VFIODev, op operation) error { @@ -1391,12 +1435,11 @@ func genericBridges(number uint32, machineType string) []Bridge { return bridges } -func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) govmmQemu.Memory { - // NVDIMM device needs memory space 1024MB +func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOffset uint32) govmmQemu.Memory { + // image NVDIMM device needs memory space 1024MB // See https://github.com/clearcontainers/runtime/issues/380 - memoryOffset := 1024 + memoryOffset += 1024 - // add 1G memory space for nvdimm device (vm guest image) memMax := fmt.Sprintf("%dM", hostMemoryMb+uint64(memoryOffset)) mem := fmt.Sprintf("%dM", memoryMb) diff --git a/virtcontainers/qemu_amd64.go b/virtcontainers/qemu_amd64.go index 8dfddde932..7e3fc9d3de 100644 --- a/virtcontainers/qemu_amd64.go +++ b/virtcontainers/qemu_amd64.go @@ -87,6 +87,7 @@ func newQemuArch(config HypervisorConfig) 
qemuArch { q := &qemuAmd64{ qemuArchBase{ machineType: machineType, + memoryOffset: config.MemOffset, qemuPaths: qemuPaths, supportedQemuMachines: supportedQemuMachines, kernelParamsNonDebug: kernelParamsNonDebug, @@ -96,6 +97,7 @@ func newQemuArch(config HypervisorConfig) qemuArch { } q.handleImagePath(config) + return q } @@ -126,7 +128,7 @@ func (q *qemuAmd64) cpuModel() string { } func (q *qemuAmd64) memoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) govmmQemu.Memory { - return genericMemoryTopology(memoryMb, hostMemoryMb, slots) + return genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset) } func (q *qemuAmd64) appendImage(devices []govmmQemu.Device, path string) ([]govmmQemu.Device, error) { diff --git a/virtcontainers/qemu_arch_base.go b/virtcontainers/qemu_arch_base.go index 53e952410c..7c098f7b53 100644 --- a/virtcontainers/qemu_arch_base.go +++ b/virtcontainers/qemu_arch_base.go @@ -103,6 +103,7 @@ type qemuArch interface { type qemuArchBase struct { machineType string + memoryOffset uint32 nestedRun bool vhost bool networkIndex int diff --git a/virtcontainers/qemu_arm64.go b/virtcontainers/qemu_arm64.go index 64fb285822..bc72b0fa79 100644 --- a/virtcontainers/qemu_arm64.go +++ b/virtcontainers/qemu_arm64.go @@ -136,6 +136,7 @@ func newQemuArch(config HypervisorConfig) qemuArch { q := &qemuArm64{ qemuArchBase{ machineType: machineType, + memoryOffset: config.MemOffset, qemuPaths: qemuPaths, supportedQemuMachines: supportedQemuMachines, kernelParamsNonDebug: kernelParamsNonDebug, diff --git a/virtcontainers/qemu_ppc64le.go b/virtcontainers/qemu_ppc64le.go index 05d37ce755..ca69e2d831 100644 --- a/virtcontainers/qemu_ppc64le.go +++ b/virtcontainers/qemu_ppc64le.go @@ -74,6 +74,7 @@ func newQemuArch(config HypervisorConfig) qemuArch { q := &qemuPPC64le{ qemuArchBase{ machineType: machineType, + memoryOffset: config.MemOffset, qemuPaths: qemuPaths, supportedQemuMachines: supportedQemuMachines, kernelParamsNonDebug: 
kernelParamsNonDebug, @@ -83,6 +84,9 @@ func newQemuArch(config HypervisorConfig) qemuArch { } q.handleImagePath(config) + + q.memoryOffset = config.MemOffset + return q } @@ -121,7 +125,7 @@ func (q *qemuPPC64le) memoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) hostMemoryMb = defaultMemMaxPPC64le } - return genericMemoryTopology(memoryMb, hostMemoryMb, slots) + return genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset) } func (q *qemuPPC64le) appendImage(devices []govmmQemu.Device, path string) ([]govmmQemu.Device, error) { diff --git a/virtcontainers/qemu_s390x.go b/virtcontainers/qemu_s390x.go index 893b2dd62c..cd87d681e6 100644 --- a/virtcontainers/qemu_s390x.go +++ b/virtcontainers/qemu_s390x.go @@ -61,6 +61,7 @@ func newQemuArch(config HypervisorConfig) qemuArch { q := &qemuS390x{ qemuArchBase{ machineType: machineType, + memoryOffset: config.MemOffset, qemuPaths: qemuPaths, supportedQemuMachines: supportedQemuMachines, kernelParamsNonDebug: kernelParamsNonDebug,