diff --git a/src/runtime/Makefile b/src/runtime/Makefile index fa07b87deb..b05106318b 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -253,6 +253,7 @@ ifneq (,$(QEMUCMD)) # qemu-specific options (all should be suffixed by "_QEMU") DEFBLOCKSTORAGEDRIVER_QEMU := virtio-scsi + DEFBLOCKDEVICEAIO_QEMU := io_uring DEFNETWORKMODEL_QEMU := tcfilter KERNELTYPE = uncompressed KERNELNAME = $(call MAKE_KERNEL_NAME,$(KERNELTYPE)) @@ -458,6 +459,7 @@ USER_VARS += DEFDISABLEBLOCK USER_VARS += DEFBLOCKSTORAGEDRIVER_ACRN USER_VARS += DEFBLOCKSTORAGEDRIVER_FC USER_VARS += DEFBLOCKSTORAGEDRIVER_QEMU +USER_VARS += DEFBLOCKDEVICEAIO_QEMU USER_VARS += DEFSHAREDFS_CLH_VIRTIOFS USER_VARS += DEFSHAREDFS_QEMU_VIRTIOFS USER_VARS += DEFVIRTIOFSDAEMON diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index 3ec44c8b6e..d0a711dcfe 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -208,6 +208,20 @@ virtio_fs_cache = "@DEFVIRTIOFSCACHE@" # or nvdimm. block_device_driver = "@DEFBLOCKSTORAGEDRIVER_QEMU@" +# aio is the I/O mechanism used by qemu +# Options: +# +# - threads +# Pthread based disk I/O. +# +# - native +# Native Linux I/O. +# +# - io_uring +# Linux io_uring API. This provides the fastest I/O operations on Linux; requires kernel >= 5.1 and +# qemu >= 5.0. +block_device_aio = "@DEFBLOCKDEVICEAIO_QEMU@" + # Specifies cache-related options will be set to block devices or not. # Default false #block_device_cache_set = true diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index 61f9236b9c..a11c52f75c 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -61,6 +61,17 @@ const ( Nvdimm = "nvdimm" ) +const ( + // AIOThreads is the pthread asynchronous I/O implementation. 
+ AIOThreads = "threads" + + // AIONative is the native Linux AIO implementation + AIONative = "native" + + // AIOIOUring is the Linux io_uring I/O implementation + AIOIOUring = "io_uring" +) + const ( // Virtio9P means use virtio-9p for the shared file system Virtio9P = "virtio-9p" diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index 100316dd9e..4b36df2ea3 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -1140,8 +1140,11 @@ const ( // Threads is the pthread asynchronous I/O implementation. Threads BlockDeviceAIO = "threads" - // Native is the pthread asynchronous I/O implementation. + // Native is the native Linux AIO implementation. Native BlockDeviceAIO = "native" + + // IOUring is the Linux io_uring I/O implementation. + IOUring BlockDeviceAIO = "io_uring" ) const ( diff --git a/src/runtime/pkg/govmm/qemu/qmp.go b/src/runtime/pkg/govmm/qemu/qmp.go index 9bf091af84..24c92b9b20 100644 --- a/src/runtime/pkg/govmm/qemu/qmp.go +++ b/src/runtime/pkg/govmm/qemu/qmp.go @@ -771,30 +771,28 @@ func (q *QMP) ExecuteQuit(ctx context.Context) error { return q.executeCommand(ctx, "quit", nil, nil) } -func (q *QMP) blockdevAddBaseArgs(driver, device, blockdevID string, ro bool) (map[string]interface{}, map[string]interface{}) { - var args map[string]interface{} - +func (q *QMP) blockdevAddBaseArgs(driver string, blockDevice *BlockDevice) map[string]interface{} { blockdevArgs := map[string]interface{}{ "driver": "raw", - "read-only": ro, + "read-only": blockDevice.ReadOnly, "file": map[string]interface{}{ "driver": driver, - "filename": device, + "filename": blockDevice.File, + "aio": string(blockDevice.AIO), }, } - blockdevArgs["node-name"] = blockdevID - args = blockdevArgs + blockdevArgs["node-name"] = blockDevice.ID - return args, blockdevArgs + return blockdevArgs } // ExecuteBlockdevAdd sends a blockdev-add to the QEMU instance. 
device is the // path of the device to add, e.g., /dev/rdb0, and blockdevID is an identifier // used to name the device. As this identifier will be passed directly to QMP, // it must obey QMP's naming rules, e,g., it must start with a letter. -func (q *QMP) ExecuteBlockdevAdd(ctx context.Context, device, blockdevID string, ro bool) error { - args, _ := q.blockdevAddBaseArgs("host_device", device, blockdevID, ro) +func (q *QMP) ExecuteBlockdevAdd(ctx context.Context, blockDevice *BlockDevice) error { + args := q.blockdevAddBaseArgs("host_device", blockDevice) return q.executeCommand(ctx, "blockdev-add", args, nil) } @@ -806,29 +804,29 @@ func (q *QMP) ExecuteBlockdevAdd(ctx context.Context, device, blockdevID string, // direct denotes whether use of O_DIRECT (bypass the host page cache) // is enabled. noFlush denotes whether flush requests for the device are // ignored. -func (q *QMP) ExecuteBlockdevAddWithCache(ctx context.Context, device, blockdevID string, direct, noFlush, ro bool) error { - args, blockdevArgs := q.blockdevAddBaseArgs("host_device", device, blockdevID, ro) +func (q *QMP) ExecuteBlockdevAddWithCache(ctx context.Context, blockDevice *BlockDevice, direct, noFlush bool) error { + blockdevArgs := q.blockdevAddBaseArgs("host_device", blockDevice) blockdevArgs["cache"] = map[string]interface{}{ "direct": direct, "no-flush": noFlush, } - return q.executeCommand(ctx, "blockdev-add", args, nil) + return q.executeCommand(ctx, "blockdev-add", blockdevArgs, nil) } // ExecuteBlockdevAddWithDriverCache has three one parameter driver // than ExecuteBlockdevAddWithCache. // Parameter driver can set the driver of block device. 
-func (q *QMP) ExecuteBlockdevAddWithDriverCache(ctx context.Context, driver, device, blockdevID string, direct, noFlush, ro bool) error { - args, blockdevArgs := q.blockdevAddBaseArgs(driver, device, blockdevID, ro) +func (q *QMP) ExecuteBlockdevAddWithDriverCache(ctx context.Context, driver string, blockDevice *BlockDevice, direct, noFlush bool) error { + blockdevArgs := q.blockdevAddBaseArgs(driver, blockDevice) blockdevArgs["cache"] = map[string]interface{}{ "direct": direct, "no-flush": noFlush, } - return q.executeCommand(ctx, "blockdev-add", args, nil) + return q.executeCommand(ctx, "blockdev-add", blockdevArgs, nil) } // ExecuteDeviceAdd adds the guest portion of a device to a QEMU instance diff --git a/src/runtime/pkg/govmm/qemu/qmp_test.go b/src/runtime/pkg/govmm/qemu/qmp_test.go index 23114a0d72..3f82fa54f9 100644 --- a/src/runtime/pkg/govmm/qemu/qmp_test.go +++ b/src/runtime/pkg/govmm/qemu/qmp_test.go @@ -400,8 +400,13 @@ func TestQMPBlockdevAdd(t *testing.T) { cfg := QMPConfig{Logger: qmpTestLogger{}} q := startQMPLoop(buf, cfg, connectedCh, disconnectedCh) q.version = checkVersion(t, connectedCh) - err := q.ExecuteBlockdevAdd(context.Background(), "/dev/rbd0", - fmt.Sprintf("drive_%s", volumeUUID), false) + dev := BlockDevice{ + ID: fmt.Sprintf("drive_%s", volumeUUID), + File: "/dev/rbd0", + ReadOnly: false, + AIO: Native, + } + err := q.ExecuteBlockdevAdd(context.Background(), &dev) if err != nil { t.Fatalf("Unexpected error %v", err) } @@ -424,8 +429,13 @@ func TestQMPBlockdevAddWithCache(t *testing.T) { cfg := QMPConfig{Logger: qmpTestLogger{}} q := startQMPLoop(buf, cfg, connectedCh, disconnectedCh) q.version = checkVersion(t, connectedCh) - err := q.ExecuteBlockdevAddWithCache(context.Background(), "/dev/rbd0", - fmt.Sprintf("drive_%s", volumeUUID), true, true, false) + dev := BlockDevice{ + ID: fmt.Sprintf("drive_%s", volumeUUID), + File: "/dev/rbd0", + ReadOnly: false, + AIO: Native, + } + err := 
q.ExecuteBlockdevAddWithCache(context.Background(), &dev, true, true) if err != nil { t.Fatalf("Unexpected error %v", err) } diff --git a/src/runtime/pkg/katatestutils/utils.go b/src/runtime/pkg/katatestutils/utils.go index 5676f4451c..2aa0754cac 100644 --- a/src/runtime/pkg/katatestutils/utils.go +++ b/src/runtime/pkg/katatestutils/utils.go @@ -216,6 +216,7 @@ type RuntimeConfigOptions struct { ShimPath string LogPath string BlockDeviceDriver string + BlockDeviceAIO string SharedFS string VirtioFSDaemon string JaegerEndpoint string @@ -305,6 +306,7 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string { path = "` + config.HypervisorPath + `" kernel = "` + config.KernelPath + `" block_device_driver = "` + config.BlockDeviceDriver + `" + block_device_aio = "` + config.BlockDeviceAIO + `" kernel_params = "` + config.KernelParams + `" image = "` + config.ImagePath + `" machine_type = "` + config.MachineType + `" diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index 2aad22bd85..37dbfee45a 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -63,6 +63,7 @@ const defaultBridgesCount uint32 = 1 const defaultInterNetworkingModel = "tcfilter" const defaultDisableBlockDeviceUse bool = false const defaultBlockDeviceDriver = "virtio-scsi" +const defaultBlockDeviceAIO string = "io_uring" const defaultBlockDeviceCacheSet bool = false const defaultBlockDeviceCacheDirect bool = false const defaultBlockDeviceCacheNoflush bool = false diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index fe93a84128..f3bc06bdf5 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -100,6 +100,7 @@ type hypervisor struct { GuestHookPath string `toml:"guest_hook_path"` GuestMemoryDumpPath string `toml:"guest_memory_dump_path"` SeccompSandbox string `toml:"seccompsandbox"` + 
BlockDeviceAIO string `toml:"block_device_aio"` HypervisorPathList []string `toml:"valid_hypervisor_paths"` JailerPathList []string `toml:"valid_jailer_paths"` CtlPathList []string `toml:"valid_ctlpaths"` @@ -469,6 +470,22 @@ func (h hypervisor) blockDeviceDriver() (string, error) { return "", fmt.Errorf("Invalid hypervisor block storage driver %v specified (supported drivers: %v)", h.BlockDeviceDriver, supportedBlockDrivers) } +func (h hypervisor) blockDeviceAIO() (string, error) { + supportedBlockAIO := []string{config.AIOIOUring, config.AIONative, config.AIOThreads} + + if h.BlockDeviceAIO == "" { + return defaultBlockDeviceAIO, nil + } + + for _, b := range supportedBlockAIO { + if b == h.BlockDeviceAIO { + return h.BlockDeviceAIO, nil + } + } + + return "", fmt.Errorf("Invalid hypervisor block storage I/O mechanism %v specified (supported AIO: %v)", h.BlockDeviceAIO, supportedBlockAIO) +} + func (h hypervisor) sharedFS() (string, error) { supportedSharedFS := []string{config.Virtio9P, config.VirtioFS, config.VirtioFSNydus} @@ -728,6 +745,11 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { return vc.HypervisorConfig{}, err } + blockAIO, err := h.blockDeviceAIO() + if err != nil { + return vc.HypervisorConfig{}, err + } + sharedFS, err := h.sharedFS() if err != nil { return vc.HypervisorConfig{}, err @@ -784,6 +806,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { Debug: h.Debug, DisableNestingChecks: h.DisableNestingChecks, BlockDeviceDriver: blockDriver, + BlockDeviceAIO: blockAIO, BlockDeviceCacheSet: h.BlockDeviceCacheSet, BlockDeviceCacheDirect: h.BlockDeviceCacheDirect, BlockDeviceCacheNoflush: h.BlockDeviceCacheNoflush, @@ -1182,6 +1205,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig { Debug: defaultEnableDebug, DisableNestingChecks: defaultDisableNestingChecks, BlockDeviceDriver: defaultBlockDeviceDriver, + BlockDeviceAIO: defaultBlockDeviceAIO, BlockDeviceCacheSet: 
defaultBlockDeviceCacheSet, BlockDeviceCacheDirect: defaultBlockDeviceCacheDirect, BlockDeviceCacheNoflush: defaultBlockDeviceCacheNoflush, diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index 86619ab13d..5e493b40e3 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -79,6 +79,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf machineType := "machineType" disableBlockDevice := true blockDeviceDriver := "virtio-scsi" + blockDeviceAIO := "io_uring" enableIOThreads := true hotplugVFIOOnRootBus := true pcieRootPort := uint32(2) @@ -99,6 +100,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf DefaultGuestHookPath: defaultGuestHookPath, DisableBlock: disableBlockDevice, BlockDeviceDriver: blockDeviceDriver, + BlockDeviceAIO: blockDeviceAIO, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, PCIeRootPort: pcieRootPort, @@ -159,6 +161,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf DefaultMaxMemorySize: maxMemory, DisableBlockDeviceUse: disableBlockDevice, BlockDeviceDriver: defaultBlockDeviceDriver, + BlockDeviceAIO: defaultBlockDeviceAIO, DefaultBridges: defaultBridgesCount, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, @@ -550,6 +553,7 @@ func TestMinimalRuntimeConfig(t *testing.T) { GuestHookPath: defaultGuestHookPath, VhostUserStorePath: defaultVhostUserStorePath, VirtioFSCache: defaultVirtioFSCacheMode, + BlockDeviceAIO: defaultBlockDeviceAIO, } expectedAgentConfig := vc.KataAgentConfig{ @@ -593,6 +597,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) { hotplugVFIOOnRootBus := true pcieRootPort := uint32(2) orgVHostVSockDevicePath := utils.VHostVSockDevicePath + blockDeviceAIO := "io_uring" defer func() { utils.VHostVSockDevicePath = orgVHostVSockDevicePath }() @@ -614,6 +619,7 @@ func 
TestNewQemuHypervisorConfig(t *testing.T) { TxRateLimiterMaxRate: txRateLimiterMaxRate, SharedFS: "virtio-fs", VirtioFSDaemon: filepath.Join(dir, "virtiofsd"), + BlockDeviceAIO: blockDeviceAIO, } files := []string{hypervisorPath, kernelPath, imagePath} @@ -674,6 +680,11 @@ func TestNewQemuHypervisorConfig(t *testing.T) { if config.TxRateLimiterMaxRate != txRateLimiterMaxRate { t.Errorf("Expected value for tx rate limiter %v, got %v", txRateLimiterMaxRate, config.TxRateLimiterMaxRate) } + + if config.BlockDeviceAIO != blockDeviceAIO { + t.Errorf("Expected value for BlockDeviceAIO %v, got %v", blockDeviceAIO, config.BlockDeviceAIO) + } + } func TestNewFirecrackerHypervisorConfig(t *testing.T) { diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index e043f786c0..95bfe2a33a 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -683,6 +683,22 @@ func addHypervisorBlockOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) } } + if value, ok := ocispec.Annotations[vcAnnotations.BlockDeviceAIO]; ok { + supportedAIO := []string{config.AIONative, config.AIOThreads, config.AIOIOUring} + + valid := false + for _, b := range supportedAIO { + if b == value { + sbConfig.HypervisorConfig.BlockDeviceAIO = value + valid = true + } + } + + if !valid { + return fmt.Errorf("Invalid AIO mechanism %v specified in annotation (supported IO mechanism : %v)", value, supportedAIO) + } + } + if err := newAnnotationConfiguration(ocispec, vcAnnotations.DisableBlockDeviceUse).setBool(func(disableBlockDeviceUse bool) { sbConfig.HypervisorConfig.DisableBlockDeviceUse = disableBlockDeviceUse }); err != nil { diff --git a/src/runtime/pkg/oci/utils_test.go b/src/runtime/pkg/oci/utils_test.go index 8b905def87..b5b15ad871 100644 --- a/src/runtime/pkg/oci/utils_test.go +++ b/src/runtime/pkg/oci/utils_test.go @@ -642,6 +642,7 @@ func TestAddHypervisorAnnotations(t *testing.T) { ocispec.Annotations[vcAnnotations.HugePages] = "true" 
ocispec.Annotations[vcAnnotations.IOMMU] = "true" ocispec.Annotations[vcAnnotations.BlockDeviceDriver] = "virtio-scsi" + ocispec.Annotations[vcAnnotations.BlockDeviceAIO] = "io_uring" ocispec.Annotations[vcAnnotations.DisableBlockDeviceUse] = "true" ocispec.Annotations[vcAnnotations.EnableIOThreads] = "true" ocispec.Annotations[vcAnnotations.BlockDeviceCacheSet] = "true" @@ -679,6 +680,7 @@ func TestAddHypervisorAnnotations(t *testing.T) { assert.Equal(config.HypervisorConfig.HugePages, true) assert.Equal(config.HypervisorConfig.IOMMU, true) assert.Equal(config.HypervisorConfig.BlockDeviceDriver, "virtio-scsi") + assert.Equal(config.HypervisorConfig.BlockDeviceAIO, "io_uring") assert.Equal(config.HypervisorConfig.DisableBlockDeviceUse, true) assert.Equal(config.HypervisorConfig.EnableIOThreads, true) assert.Equal(config.HypervisorConfig.BlockDeviceCacheSet, true) diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index b82700a0e1..49b658db31 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -377,6 +377,9 @@ type HypervisorConfig struct { // SeccompSandbox is the qemu function which enables the seccomp feature SeccompSandbox string + // BlockDeviceAIO specifies the I/O API to be used. + BlockDeviceAIO string + // KernelParams are additional guest kernel parameters. 
KernelParams []Param diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go index 6618d0acdf..d7919a0c8a 100644 --- a/src/runtime/virtcontainers/pkg/annotations/annotations.go +++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go @@ -203,6 +203,9 @@ const ( // BlockDeviceDriver specifies the driver to be used for block device either VirtioSCSI or VirtioBlock BlockDeviceDriver = kataAnnotHypervisorPrefix + "block_device_driver" + // BlockDeviceAIO specifies I/O mechanism to be used with VirtioBlock for qemu + BlockDeviceAIO = kataAnnotHypervisorPrefix + "block_device_aio" + // DisableBlockDeviceUse is a sandbox annotation that disallows a block device from being used. DisableBlockDeviceUse = kataAnnotHypervisorPrefix + "disable_block_device_use" diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index fdcf60eda0..56bd5c3899 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -1292,12 +1292,19 @@ func (q *qemu) hotplugAddBlockDevice(ctx context.Context, drive *config.BlockDri return nil } + qblkDevice := govmmQemu.BlockDevice{ + ID: drive.ID, + File: drive.File, + ReadOnly: drive.ReadOnly, + AIO: govmmQemu.BlockDeviceAIO(q.config.BlockDeviceAIO), + } + if drive.Swap { - err = q.qmpMonitorCh.qmp.ExecuteBlockdevAddWithDriverCache(q.qmpMonitorCh.ctx, "file", drive.File, drive.ID, false, false, false) + err = q.qmpMonitorCh.qmp.ExecuteBlockdevAddWithDriverCache(q.qmpMonitorCh.ctx, "file", &qblkDevice, false, false) } else if q.config.BlockDeviceCacheSet { - err = q.qmpMonitorCh.qmp.ExecuteBlockdevAddWithCache(q.qmpMonitorCh.ctx, drive.File, drive.ID, q.config.BlockDeviceCacheDirect, q.config.BlockDeviceCacheNoflush, drive.ReadOnly) + err = q.qmpMonitorCh.qmp.ExecuteBlockdevAddWithCache(q.qmpMonitorCh.ctx, &qblkDevice, q.config.BlockDeviceCacheDirect, q.config.BlockDeviceCacheNoflush) } else { - err = 
q.qmpMonitorCh.qmp.ExecuteBlockdevAdd(q.qmpMonitorCh.ctx, drive.File, drive.ID, drive.ReadOnly) + err = q.qmpMonitorCh.qmp.ExecuteBlockdevAdd(q.qmpMonitorCh.ctx, &qblkDevice) } if err != nil { return err