runtime: support block-plain emptyDirs

Add Go runtime support for the block-plain emptyDir mode.

Disk-backed Kubernetes emptyDir mounts remain bind mounts so the block
emptyDir handling path can intercept them. The runtime creates a sparse
disk.img in the kubelet emptyDir directory and records direct-volume
metadata for the agent-visible block storage path.

Fresh block emptyDirs request filesystem creation through a dedicated
metadata flag. Plain emptyDirs also record discard support on the block
device. Encrypted emptyDirs keep the existing ephemeral encryption
metadata and carry the same filesystem-creation signal.

Signed-off-by: Manuel Huber <manuelh@nvidia.com>
Assisted-by: OpenAI Codex <codex@openai.com>
This commit is contained in:
Manuel Huber
2026-06-03 18:03:37 +00:00
parent b05d705ea0
commit 24c51cfbbf
27 changed files with 186 additions and 21 deletions

View File

@@ -479,6 +479,9 @@ disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
# - block-encrypted
# Plugs a block device to be encrypted in the guest.
#
# - block-plain
# Plugs a block device to be mounted directly in the guest.
#
emptydir_mode = "@DEFEMPTYDIRMODE@"
# Enabled experimental feature list, format: ["a", "b"].

View File

@@ -479,6 +479,9 @@ disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
# - block-encrypted
# Plugs a block device to be encrypted in the guest.
#
# - block-plain
# Plugs a block device to be mounted directly in the guest.
#
emptydir_mode = "@DEFEMPTYDIRMODE@"
# Enabled experimental feature list, format: ["a", "b"].

View File

@@ -364,6 +364,9 @@ disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
# - block-encrypted
# Plugs a block device to be encrypted in the guest.
#
# - block-plain
# Plugs a block device to be mounted directly in the guest.
#
emptydir_mode = "@DEFEMPTYDIRMODE@"
# Enabled experimental feature list, format: ["a", "b"].

View File

@@ -652,6 +652,9 @@ disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
# - block-encrypted
# Plugs a block device to be encrypted in the guest.
#
# - block-plain
# Plugs a block device to be mounted directly in the guest.
#
emptydir_mode = "@DEFEMPTYDIRMODE@"
# Enabled experimental feature list, format: ["a", "b"].

View File

@@ -715,6 +715,9 @@ disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
# - block-encrypted
# Plugs a block device to be encrypted in the guest.
#
# - block-plain
# Plugs a block device to be mounted directly in the guest.
#
emptydir_mode = "@DEFEMPTYDIRMODE_COCO@"
# Enabled experimental feature list, format: ["a", "b"].

View File

@@ -761,6 +761,9 @@ disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
# - block-encrypted
# Plugs a block device to be encrypted in the guest.
#
# - block-plain
# Plugs a block device to be mounted directly in the guest.
#
emptydir_mode = "@DEFEMPTYDIRMODE_COCO@"
# Enabled experimental feature list, format: ["a", "b"].

View File

@@ -738,6 +738,9 @@ disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
# - block-encrypted
# Plugs a block device to be encrypted in the guest.
#
# - block-plain
# Plugs a block device to be mounted directly in the guest.
#
emptydir_mode = "@DEFEMPTYDIRMODE_COCO@"
# Enabled experimental feature list, format: ["a", "b"].

View File

@@ -740,6 +740,9 @@ disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
# - block-encrypted
# Plugs a block device to be encrypted in the guest.
#
# - block-plain
# Plugs a block device to be mounted directly in the guest.
#
emptydir_mode = "@DEFEMPTYDIRMODE@"
# Enabled experimental feature list, format: ["a", "b"].

View File

@@ -693,6 +693,9 @@ disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
# - block-encrypted
# Plugs a block device to be encrypted in the guest.
#
# - block-plain
# Plugs a block device to be mounted directly in the guest.
#
emptydir_mode = "@DEFEMPTYDIRMODE_COCO@"
# Enabled experimental feature list, format: ["a", "b"].

View File

@@ -723,6 +723,9 @@ disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
# - block-encrypted
# Plugs a block device to be encrypted in the guest.
#
# - block-plain
# Plugs a block device to be mounted directly in the guest.
#
emptydir_mode = "@DEFEMPTYDIRMODE_COCO@"
# Enabled experimental feature list, format: ["a", "b"].

View File

@@ -700,6 +700,9 @@ disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
# - block-encrypted
# Plugs a block device to be encrypted in the guest.
#
# - block-plain
# Plugs a block device to be mounted directly in the guest.
#
emptydir_mode = "@DEFEMPTYDIRMODE_COCO@"
# Enabled experimental feature list, format: ["a", "b"].

View File

@@ -709,6 +709,9 @@ disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
# - block-encrypted
# Plugs a block device to be encrypted in the guest.
#
# - block-plain
# Plugs a block device to be mounted directly in the guest.
#
emptydir_mode = "@DEFEMPTYDIRMODE@"
# Enabled experimental feature list, format: ["a", "b"].

View File

@@ -272,6 +272,9 @@ disable_guest_empty_dir = false
# - block-encrypted
# Plugs a block device to be encrypted in the guest.
#
# - block-plain
# Plugs a block device to be mounted directly in the guest.
#
emptydir_mode = "@DEFEMPTYDIRMODE@"
# Enabled experimental feature list, format: ["a", "b"].

View File

@@ -417,6 +417,9 @@ disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
# - block-encrypted
# Plugs a block device to be encrypted in the guest.
#
# - block-plain
# Plugs a block device to be mounted directly in the guest.
#
emptydir_mode = "@DEFEMPTYDIRMODE@"
# Enabled experimental feature list, format: ["a", "b"].

View File

@@ -290,6 +290,9 @@ type DeviceInfo struct {
// If applicable, should this device be considered RO
ReadOnly bool
// DiscardUnmap enables discard/unmap support for this block device.
DiscardUnmap bool
// ColdPlug specifies whether the device must be cold plugged (true)
// or hot plugged (false).
ColdPlug bool
@@ -338,6 +341,9 @@ type BlockDrive struct {
// ReadOnly sets the device file readonly
ReadOnly bool
// DiscardUnmap enables discard/unmap support for this block device.
DiscardUnmap bool
// Pmem enables persistent memory. Use File as backing file
// for a nvdimm device in the guest
Pmem bool

View File

@@ -61,12 +61,13 @@ func (device *BlockDevice) Attach(ctx context.Context, devReceiver api.DeviceRec
}
drive := &config.BlockDrive{
File: device.DeviceInfo.HostPath,
Format: "raw",
ID: utils.MakeNameID("drive", device.DeviceInfo.ID, maxDevIDSize),
Index: index,
Pmem: device.DeviceInfo.Pmem,
ReadOnly: device.DeviceInfo.ReadOnly,
File: device.DeviceInfo.HostPath,
Format: "raw",
ID: utils.MakeNameID("drive", device.DeviceInfo.ID, maxDevIDSize),
Index: index,
Pmem: device.DeviceInfo.Pmem,
ReadOnly: device.DeviceInfo.ReadOnly,
DiscardUnmap: device.DeviceInfo.DiscardUnmap,
}
if fs, ok := device.DeviceInfo.DriverOptions[config.FsTypeOpt]; ok {

View File

@@ -18,8 +18,10 @@ const (
mountInfoFileName = "mountInfo.json"
EncryptionKeyMetadataKey = "encryptionKey"
CreateFilesystemMetadataKey = "createFilesystem"
FSGroupMetadataKey = "fsGroup"
FSGroupChangePolicyMetadataKey = "fsGroupChangePolicy"
BlockVolumeCreateFsDriverKey = "create_filesystem"
)
// FSGroupChangePolicy holds policies that will be used for applying fsGroup to a volume.

View File

@@ -213,11 +213,11 @@ func (r runtime) emptyDirMode() (string, error) {
}
switch r.EmptyDirMode {
case vc.EmptyDirModeSharedFs, vc.EmptyDirModeVirtioBlkEncrypted:
case vc.EmptyDirModeSharedFs, vc.EmptyDirModeVirtioBlkEncrypted, vc.EmptyDirModeVirtioBlkPlain:
return r.EmptyDirMode, nil
default:
return "", fmt.Errorf("invalid emptydir_mode=%q, allowed values: %q, %q",
r.EmptyDirMode, vc.EmptyDirModeSharedFs, vc.EmptyDirModeVirtioBlkEncrypted)
return "", fmt.Errorf("invalid emptydir_mode=%q, allowed values: %q, %q, %q",
r.EmptyDirMode, vc.EmptyDirModeSharedFs, vc.EmptyDirModeVirtioBlkEncrypted, vc.EmptyDirModeVirtioBlkPlain)
}
}

View File

@@ -1660,6 +1660,11 @@ func TestCheckEmptyDirMode(t *testing.T) {
assert.NoError(err)
assert.Equal(vc.EmptyDirModeVirtioBlkEncrypted, mode)
r = runtime{EmptyDirMode: vc.EmptyDirModeVirtioBlkPlain}
mode, err = r.emptyDirMode()
assert.NoError(err)
assert.Equal(vc.EmptyDirModeVirtioBlkPlain, mode)
r = runtime{}
mode, err = r.emptyDirMode()
assert.NoError(err)
@@ -1677,6 +1682,10 @@ func TestCheckEmptyDirMode(t *testing.T) {
r = runtime{EmptyDirMode: "block_encrypted"}
_, err = r.emptyDirMode()
assert.Error(err)
r = runtime{EmptyDirMode: "block_plain"}
_, err = r.emptyDirMode()
assert.Error(err)
}
func TestCheckFactoryConfig(t *testing.T) {

View File

@@ -107,7 +107,9 @@ func SetEphemeralStorageType(ociSpec specs.Spec, disableGuestEmptyDir bool, empt
// disableGuestEmptyDir and emptyDirMode.
if vc.IsHugePageEmptyDir(mnt.Source) {
ociSpec.Mounts[idx].Type = vc.KataLocalDevType
} else if !disableGuestEmptyDir && emptyDirMode != vc.EmptyDirModeVirtioBlkEncrypted {
} else if !disableGuestEmptyDir &&
emptyDirMode != vc.EmptyDirModeVirtioBlkEncrypted &&
emptyDirMode != vc.EmptyDirModeVirtioBlkPlain {
ociSpec.Mounts[idx].Type = vc.KataLocalDevType
}
}

View File

@@ -148,6 +148,35 @@ func TestSetEphemeralStorageType(t *testing.T) {
"Unexpected mount type, got %s expected ephemeral", mountType)
}
// TestSetEphemeralStorageTypeHostEmptyDirModes checks the mount type that
// SetEphemeralStorageType leaves on a bind-mounted directory whose path
// contains the vc.K8sEmptyDir component, for each emptydir_mode value and
// with disableGuestEmptyDir set to false.
func TestSetEphemeralStorageTypeHostEmptyDirModes(t *testing.T) {
	assert := assert.New(t)

	volumePath := filepath.Join(t.TempDir(), vc.K8sEmptyDir, "disk-volume")
	assert.NoError(os.MkdirAll(volumePath, testDirMode))

	cases := []struct {
		mode     string
		wantType string
	}{
		// shared-fs rewrites the mount type.
		{mode: vc.EmptyDirModeSharedFs, wantType: vc.KataLocalDevType},
		// Both block modes must leave the mount as a bind mount.
		{mode: vc.EmptyDirModeVirtioBlkEncrypted, wantType: "bind"},
		{mode: vc.EmptyDirModeVirtioBlkPlain, wantType: "bind"},
	}

	for _, tc := range cases {
		// Build a fresh spec per case so one call cannot influence the next.
		spec := specs.Spec{
			Mounts: []specs.Mount{
				{Source: volumePath, Type: "bind"},
			},
		}

		got := SetEphemeralStorageType(spec, false, tc.mode)
		assert.Equal(tc.wantType, got.Mounts[0].Type)
	}
}
func TestSetKernelParams(t *testing.T) {
assert := assert.New(t)

View File

@@ -166,7 +166,7 @@ type RuntimeConfig struct {
DisableGuestEmptyDir bool
// EmptyDirMode specifies how Kubernetes emptyDir volumes are handled.
// Valid values are "shared-fs" (default) or "block-encrypted".
// Valid values are "shared-fs" (default), "block-encrypted", or "block-plain".
EmptyDirMode string
// CreateContainer timeout which, if provided, indicates the createcontainer request timeout

View File

@@ -622,9 +622,9 @@ func (c *Container) createBlockDevices(ctx context.Context) error {
// iterate all mounts and create block device if it's block based.
for i := range c.mounts {
// If block devices are disabled, we selectively only hotplug if
// the mount is an encrypted block-based emptyDir, to avoid
// the mount is a block-based emptyDir, to avoid
// cases that could regress 20ca4d2.
if !c.checkBlockDeviceSupport(ctx) && (c.sandbox.config.EmptyDirMode != EmptyDirModeVirtioBlkEncrypted || !IsNonTmpFSEmptyDir(c.mounts[i].Source)) {
if !c.checkBlockDeviceSupport(ctx) && (!isBlockEmptyDirMode(c.sandbox.config.EmptyDirMode) || !IsDiskEmptyDir(c.mounts[i].Source)) {
c.Logger().Warn("Block device not supported")
continue
}
@@ -679,6 +679,13 @@ func (c *Container) createBlockDevices(ctx context.Context) error {
switch key {
case volume.EncryptionKeyMetadataKey:
c.mounts[i].EncryptionKey = value
case volume.CreateFilesystemMetadataKey:
createFs, err := strconv.ParseBool(value)
if err != nil {
c.Logger().WithError(err).Errorf("invalid create filesystem value %s provided for key %s", value, volume.CreateFilesystemMetadataKey)
continue
}
c.mounts[i].BlockDeviceCreateFs = createFs
case volume.FSGroupMetadataKey:
gid, err := strconv.Atoi(value)
if err != nil {
@@ -702,6 +709,8 @@ func (c *Container) createBlockDevices(ctx context.Context) error {
// instead of passing this as a shared mount.
di, err := c.createDeviceInfo(c.mounts[i].Source, c.mounts[i].Destination, c.mounts[i].ReadOnly, isBlockFile)
if err == nil && di != nil {
di.DiscardUnmap = c.mounts[i].BlockDeviceCreateFs && slices.Contains(c.mounts[i].Options, blockVolumeDiscardOption)
b, err := c.sandbox.devManager.NewDevice(*di)
if err != nil {
// Do not return an error, try to create
@@ -878,12 +887,12 @@ func getFilesystemCapacity(path string) (uint64, error) {
}
func (c *Container) createEphemeralDisks() error {
if c.sandbox.config.EmptyDirMode != EmptyDirModeVirtioBlkEncrypted {
if !isBlockEmptyDirMode(c.sandbox.config.EmptyDirMode) {
return nil
}
for i := range c.mounts {
if !IsNonTmpFSEmptyDir(c.mounts[i].Source) {
if !IsDiskEmptyDir(c.mounts[i].Source) {
continue
}
@@ -896,7 +905,7 @@ func (c *Container) createEphemeralDisks() error {
continue
}
diskPath, err := c.setupEphemeralDisk(c.mounts[i].Source)
diskPath, err := c.setupEphemeralDisk(c.mounts[i].Source, c.sandbox.config.EmptyDirMode)
if err != nil {
return err
}
@@ -914,7 +923,7 @@ func (c *Container) createEphemeralDisks() error {
// inside the given emptyDir. It returns the path to the created disk
// image. The fd is always closed and the disk image is removed if any
// step after creation fails.
func (c *Container) setupEphemeralDisk(emptyDirPath string) (diskPath string, err error) {
func (c *Container) setupEphemeralDisk(emptyDirPath, emptyDirMode string) (diskPath string, err error) {
// Create the disk file in the same folder as the original
// emptyDir mount so that Kubelet can enforce the sizeLimit.
diskPath = filepath.Join(emptyDirPath, "disk.img")
@@ -950,8 +959,15 @@ func (c *Container) setupEphemeralDisk(emptyDirPath string) (diskPath string, er
return
}
metadata := map[string]string{
volume.EncryptionKeyMetadataKey: "ephemeral",
metadata := map[string]string{}
var options []string
if isBlockEmptyDirMode(emptyDirMode) {
metadata[volume.CreateFilesystemMetadataKey] = strconv.FormatBool(true)
}
if emptyDirMode == EmptyDirModeVirtioBlkEncrypted {
metadata[volume.EncryptionKeyMetadataKey] = "ephemeral"
} else if emptyDirMode == EmptyDirModeVirtioBlkPlain {
options = []string{blockVolumeDiscardOption}
}
if sourceStat.Gid != 0 {
metadata[volume.FSGroupMetadataKey] = strconv.FormatUint(uint64(sourceStat.Gid), 10)
@@ -962,6 +978,7 @@ func (c *Container) setupEphemeralDisk(emptyDirPath string) (diskPath string, er
Device: diskPath,
FsType: "ext4",
Metadata: metadata,
Options: options,
}); err != nil {
c.Logger().WithError(err).Errorf("failed to assign direct volume for mount %s", emptyDirPath)
return

View File

@@ -73,6 +73,12 @@ const (
// EmptyDirModeVirtioBlkEncrypted is the emptydir_mode value for encrypted virtio-blk emptyDir.
EmptyDirModeVirtioBlkEncrypted = "block-encrypted"
// EmptyDirModeVirtioBlkPlain is the emptydir_mode value for plain virtio-blk emptyDir.
EmptyDirModeVirtioBlkPlain = "block-plain"
// blockVolumeDiscardOption requests discard support for block volume mounts.
blockVolumeDiscardOption = "discard"
// encryptionKeyDriverOption is the driver option used to specify
// an encryption key for a Storage struct.
encryptionKeyDriverOption = "encryption_key"
@@ -2058,6 +2064,9 @@ func (k *kataAgent) handleDeviceBlockVolume(c *Container, m Mount, device api.De
option := fmt.Sprintf("%s=%s", encryptionKeyDriverOption, m.EncryptionKey)
vol.DriverOptions = append(vol.DriverOptions, option)
}
if m.BlockDeviceCreateFs {
vol.DriverOptions = append(vol.DriverOptions, volume.BlockVolumeCreateFsDriverKey)
}
vol.Shared = m.Shared

View File

@@ -338,6 +338,46 @@ func TestHandleDeviceBlockVolume(t *testing.T) {
},
},
},
{
BlockDeviceDriver: config.VirtioBlock,
inputMount: Mount{
BlockDeviceCreateFs: true,
Options: []string{blockVolumeDiscardOption},
},
inputDev: &drivers.BlockDevice{
BlockDrive: &config.BlockDrive{
PCIPath: testPCIPath,
VirtPath: testVirtPath,
},
},
resultVol: &pb.Storage{
Driver: kataBlkDevType,
Source: testPCIPath.String(),
DriverOptions: []string{volume.BlockVolumeCreateFsDriverKey},
Options: []string{blockVolumeDiscardOption},
},
},
{
BlockDeviceDriver: config.VirtioBlock,
inputMount: Mount{
EncryptionKey: "ephemeral",
BlockDeviceCreateFs: true,
},
inputDev: &drivers.BlockDevice{
BlockDrive: &config.BlockDrive{
PCIPath: testPCIPath,
VirtPath: testVirtPath,
},
},
resultVol: &pb.Storage{
Driver: kataBlkDevType,
Source: testPCIPath.String(),
DriverOptions: []string{
encryptionKeyDriverOption + "=ephemeral",
volume.BlockVolumeCreateFsDriverKey,
},
},
},
}
for _, test := range tests {

View File

@@ -278,6 +278,10 @@ type Mount struct {
// to instruct the agent to generate a one-time key.
EncryptionKey string
// BlockDeviceCreateFs requests filesystem creation before mounting a
// fresh block volume. The filesystem type comes from the Storage Fstype.
BlockDeviceCreateFs bool
// Shared indicates whether the mount is shared across containers.
Shared bool
}

View File

@@ -184,7 +184,7 @@ type SandboxConfig struct {
DisableGuestSeccomp bool
// EmptyDirMode specifies how Kubernetes emptyDir volumes are handled.
// Valid values are "shared-fs" (default) or "block-encrypted".
// Valid values are "shared-fs" (default), "block-encrypted", or "block-plain".
EmptyDirMode string
// EnableVCPUsPinning controls whether each vCPU thread should be scheduled to a fixed CPU
@@ -1129,7 +1129,7 @@ func (s *Sandbox) Delete(ctx context.Context) error {
// cleanupEphemeralDisks removes ephemeral disk images and their mount info.
func (s *Sandbox) cleanupEphemeralDisks() error {
if s.config.EmptyDirMode != EmptyDirModeVirtioBlkEncrypted {
if !isBlockEmptyDirMode(s.config.EmptyDirMode) {
return nil
}
@@ -1145,6 +1145,10 @@ func (s *Sandbox) cleanupEphemeralDisks() error {
return nil
}
// isBlockEmptyDirMode reports whether mode is one of the block-device
// emptyDir modes, i.e. EmptyDirModeVirtioBlkEncrypted or
// EmptyDirModeVirtioBlkPlain. Any other value, including the empty string,
// yields false.
func isBlockEmptyDirMode(mode string) bool {
	switch mode {
	case EmptyDirModeVirtioBlkEncrypted, EmptyDirModeVirtioBlkPlain:
		return true
	default:
		return false
	}
}
func (s *Sandbox) createNetwork(ctx context.Context) error {
if s.config.NetworkConfig.DisableNewNetwork ||
s.config.NetworkConfig.NetworkID == "" {