From a3e91d9ed2b8f6ba87a299bdaa4d9c347bcc8ad2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bombo?= Date: Tue, 20 Jan 2026 14:58:55 -0600 Subject: [PATCH] runtime-go/rs: Set `disable_guest_empty_dir = true` by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes the runtime share the host Kubelet emptyDir folder with the guest instead of the agent creating an empty folder in the container rootfs. Doing so enables the Kubelet to track emptyDir usage and evict greedy pods. In other words, with virtio-fs the container rootfs uses host storage whether this option is true or false; however, when it is true, Kata uses the k8s emptyDir folder so that the sizeLimit is properly enforced by k8s. Addresses the ephemeral storage part of #12203. History: * Initially, emptyDirs were slow because they were shared from the host with 9p. https://github.com/kata-containers/runtime/issues/1472 * To address the above, emptyDirs were hardcoded to be created by the agent in the pause container's rootfs, potentially leveraging devicemapper and improving perf. https://github.com/kata-containers/runtime/pull/1485 * The previous PR regressed an (interesting?) use case where emptyDirs were used to share data from the host to the guest, so the behavior was made configurable and `disable_guest_empty_dir = false` was introduced, defaulting to the behavior of the previous PR. https://github.com/kata-containers/kata-containers/pull/2056 * Another resource accounting regression remains, which is addressed in this PR. 
Signed-off-by: Aurélien Bombo --- src/runtime-rs/Makefile | 2 +- src/runtime/Makefile | 2 +- src/runtime/virtcontainers/fs_share_linux.go | 44 ++++++++++++++++--- .../virtcontainers/fs_share_linux_test.go | 33 ++++++++++++++ 4 files changed, 74 insertions(+), 7 deletions(-) diff --git a/src/runtime-rs/Makefile b/src/runtime-rs/Makefile index 9febef291c..edcdcf2ee3 100644 --- a/src/runtime-rs/Makefile +++ b/src/runtime-rs/Makefile @@ -179,7 +179,7 @@ DEFNETQUEUES := 1 DEFENABLEANNOTATIONS := [\"enable_iommu\", \"kernel_params\", \"kernel_verity_params\", \"default_vcpus\", \"default_memory\"] DEFENABLEANNOTATIONS_COCO := [\"enable_iommu\", \"kernel_params\", \"kernel_verity_params\", \"default_vcpus\", \"default_memory\", \"cc_init_data\"] DEFDISABLEGUESTSECCOMP := true -DEFDISABLEGUESTEMPTYDIR := false +DEFDISABLEGUESTEMPTYDIR := true DEFEMPTYDIRMODE := shared-fs DEFEMPTYDIRMODE_COCO := block-encrypted ##VAR DEFAULTEXPFEATURES=[features] Default experimental features enabled diff --git a/src/runtime/Makefile b/src/runtime/Makefile index b197a0ff7a..a6ad377bad 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -228,7 +228,7 @@ DEFBRIDGES := 1 DEFENABLEANNOTATIONS := [\"enable_iommu\", \"kernel_params\", \"kernel_verity_params\"] DEFENABLEANNOTATIONS_COCO := [\"enable_iommu\", \"kernel_params\", \"kernel_verity_params\", \"default_vcpus\", \"default_memory\", \"cc_init_data\"] DEFDISABLEGUESTSECCOMP := true -DEFDISABLEGUESTEMPTYDIR := false +DEFDISABLEGUESTEMPTYDIR := true DEFEMPTYDIRMODE := shared-fs DEFEMPTYDIRMODE_COCO := block-encrypted #Default experimental features enabled diff --git a/src/runtime/virtcontainers/fs_share_linux.go b/src/runtime/virtcontainers/fs_share_linux.go index ff9d25dff6..fe7dd7f3fb 100644 --- a/src/runtime/virtcontainers/fs_share_linux.go +++ b/src/runtime/virtcontainers/fs_share_linux.go @@ -84,9 +84,13 @@ type FilesystemShare struct { configVolRegex *regexp.Regexp // Regex to match only the timestamped directory inside the 
k8's volume mount timestampDirRegex *regexp.Regexp - // The same volume mount can be shared by multiple containers in the same sandbox (pod) - srcDstMap map[string][]string - srcDstMapLock sync.Mutex + // srcDstMap tracks file-level source to destination mappings for configmap/secret watching + srcDstMap map[string][]string + srcDstMapLock sync.Mutex + // srcGuestMap caches volume source path to guest path, enabling multiple containers + // in the same pod to share the same volume mount + srcGuestMap map[string]string + srcGuestMapLock sync.Mutex eventLoopStarted bool eventLoopStartedLock sync.Mutex watcherDoneChannel chan bool @@ -114,6 +118,7 @@ func NewFilesystemShare(s *Sandbox) (*FilesystemShare, error) { sandbox: s, watcherDoneChannel: make(chan bool), srcDstMap: make(map[string][]string), + srcGuestMap: make(map[string]string), watcher: watcher, configVolRegex: configVolRegex, timestampDirRegex: timestampDirRegex, @@ -302,19 +307,38 @@ func (f *FilesystemShare) Cleanup(ctx context.Context) error { return nil } +func shareFileName(containerID, source, destination, randHex string, isSandboxScoped bool) string { + if isSandboxScoped { + return fmt.Sprintf("sandbox-%s-%s", randHex, filepath.Base(filepath.Clean(source))) + } + return fmt.Sprintf("%s-%s-%s", containerID, randHex, filepath.Base(destination)) +} + func (f *FilesystemShare) ShareFile(ctx context.Context, c *Container, m *Mount) (*SharedFile, error) { randBytes, err := utils.GenerateRandomBytes(8) if err != nil { return nil, err } - filename := fmt.Sprintf("%s-%s-%s", c.id, hex.EncodeToString(randBytes), filepath.Base(m.Destination)) + randHex := hex.EncodeToString(randBytes) + caps := f.sandbox.hypervisor.Capabilities(ctx) + mustCopyEmptyDir := !caps.IsFsSharingSupported() && Isk8sHostEmptyDir(m.Source) + + filename := shareFileName(c.id, m.Source, m.Destination, randHex, mustCopyEmptyDir) guestPath := filepath.Join(kataGuestSharedDir(), filename) // copy file to container's rootfs if filesystem 
sharing is not supported, otherwise // bind mount it in the shared directory. - caps := f.sandbox.hypervisor.Capabilities(ctx) if !caps.IsFsSharingSupported() { + if mustCopyEmptyDir { + f.srcGuestMapLock.Lock() + if guestPath, ok := f.srcGuestMap[m.Source]; ok { + f.srcGuestMapLock.Unlock() + return &SharedFile{guestPath: guestPath}, nil + } + f.srcGuestMapLock.Unlock() + } + f.Logger().Debug("filesystem sharing is not supported, files will be copied") var ignored bool @@ -393,6 +417,13 @@ func (f *FilesystemShare) ShareFile(ctx context.Context, c *Container, m *Mount) return nil, nil } + if mustCopyEmptyDir { + // Cache the host emptyDir guestPath so other containers in the pod + // share the same copied writable directory. + f.srcGuestMapLock.Lock() + f.srcGuestMap[m.Source] = guestPath + f.srcGuestMapLock.Unlock() + } } else { // These mounts are created in the shared dir mountDest := filepath.Join(getMountPath(f.sandbox.ID()), filename) @@ -449,6 +480,9 @@ func (f *FilesystemShare) UnshareFile(ctx context.Context, c *Container, m *Moun } } + // Not deleting from f.srcGuestMap since this function is not + // called for mounts without HostPath. 
+ return nil } diff --git a/src/runtime/virtcontainers/fs_share_linux_test.go b/src/runtime/virtcontainers/fs_share_linux_test.go index a6ea52d8ee..e9c273d3cf 100644 --- a/src/runtime/virtcontainers/fs_share_linux_test.go +++ b/src/runtime/virtcontainers/fs_share_linux_test.go @@ -182,3 +182,36 @@ func TestShareRootFilesystem(t *testing.T) { }) } } + +func TestShareFileName(t *testing.T) { + testCases := map[string]struct { + containerID string + source string + destination string + randHex string + sandboxScoped bool + expectedResult string + }{ + "container scoped": { + containerID: "container-id-abc", + source: "/var/lib/kubelet/pods/poduid/volumes/kubernetes.io~empty-dir/cache", + destination: "/mnt/cache", + randHex: "0011223344556677", + expectedResult: "container-id-abc-0011223344556677-cache", + }, + "sandbox scoped source basename": { + containerID: "container-id-abc", + source: "/var/lib/kubelet/pods/poduid/volumes/kubernetes.io~empty-dir/cache/", + destination: "/mnt/different-cache-name", + randHex: "0011223344556677", + sandboxScoped: true, + expectedResult: "sandbox-0011223344556677-cache", + }, + } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + assert.Equal(t, tc.expectedResult, shareFileName(tc.containerID, tc.source, tc.destination, tc.randHex, tc.sandboxScoped)) + }) + } +}