From aa7e46b5ed3aecb779bfdc7c32c58a6fab2ecc59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?=
Date: Fri, 3 Oct 2025 15:59:32 +0200
Subject: [PATCH] tests: Check the multi-snapshotter situation on containerd
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

One problem that we've been having for quite some time is containerd
not behaving very well when multiple snapshotters are in use.

Although I'm adding this test with my "CoCo" hat on, the issue can just
as easily happen in any other case that requires a different snapshotter
(for instance, firecracker + devmapper).

With this in mind, let's do some stability tests, checking every hour a
simple case of running a few pre-defined containers with runc, and then
running the same containers with kata.

This should be enough to put us in the situation where containerd gets
confused about which snapshotter owns the image layers and breaks on us
(or does not break, showing us that this has been solved ...).

Signed-off-by: Fabiano Fidêncio
---
 .github/actionlint.yaml                       |   4 +
 ...nerd-multi-snapshotter-stability-test.yaml | 167 ++++++++++++++++++
 2 files changed, 171 insertions(+)
 create mode 100644 .github/workflows/run-containerd-multi-snapshotter-stability-test.yaml

diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml
index bc55441669..ecfdf9aebe 100644
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -9,6 +9,10 @@ self-hosted-runner:
   labels:
     - amd64-nvidia-a100
     - arm64-k8s
+    - containerd-v1.7
+    - containerd-v2.0
+    - containerd-v2.1
+    - containerd-v2.2
     - garm-ubuntu-2004
     - garm-ubuntu-2004-smaller
     - garm-ubuntu-2204
diff --git a/.github/workflows/run-containerd-multi-snapshotter-stability-test.yaml b/.github/workflows/run-containerd-multi-snapshotter-stability-test.yaml
new file mode 100644
index 0000000000..7f53d4b17a
--- /dev/null
+++ b/.github/workflows/run-containerd-multi-snapshotter-stability-test.yaml
@@ -0,0 +1,167 @@
+# Hourly stability check for containerd with more than one snapshotter in
+# use: the same images are run without a runtime class (overlayfs) and with
+# kata-qemu-coco-dev (nydus), before and after Kata is uninstalled/redeployed.
+name: CI | Run containerd multi-snapshotter stability test
+on:
+  schedule:
+    - cron: "0 */1 * * *"  # run every hour
+
+permissions: {}
+
+# This job relies on k8s pre-installed using kubeadm
+jobs:
+  run-containerd-multi-snapshotter-stability-tests:
+    name: run-containerd-multi-snapshotter-stability-tests
+    strategy:
+      fail-fast: false
+      matrix:
+        containerd:
+          - v1.7
+          - v2.0
+          - v2.1
+          - v2.2
+    env:
+      # Space-separated on purpose: the steps below expand this unquoted to loop over each image.
+      IMAGES_LIST: quay.io/mongodb/mongodb-community-server@sha256:8b73733842da21b6bbb6df4d7b2449229bb3135d2ec8c6880314d88205772a11 ghcr.io/edgelesssys/redis@sha256:ecb0a964c259a166a1eb62f0eb19621d42bd1cce0bc9bb0c71c828911d4ba93d
+    runs-on: containerd-${{ matrix.containerd }}
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
+        with:
+          persist-credentials: false
+
+      - name: Rotate the journal
+        run: sudo journalctl --rotate --vacuum-time 1s
+
+      - name: Pull the kata-deploy image to be used
+        run: sudo ctr -n k8s.io image pull quay.io/kata-containers/kata-deploy-ci:kata-containers-latest
+
+      - name: Deploy Kata Containers
+        run: bash tests/integration/kubernetes/gha-run.sh deploy-kata
+        env:
+          KATA_HYPERVISOR: qemu-coco-dev
+          KUBERNETES: vanilla
+          SNAPSHOTTER: nydus
+          USE_EXPERIMENTAL_SETUP_SNAPSHOTTER: "true"
+
+      # This is needed as we may hit the createContainerTimeout
+      - name: Adjust Kata Containers' create_container_timeout
+        run: |
+          sudo sed -i -e 's/^\(create_container_timeout\).*=.*$/\1 = 600/g' /opt/kata/share/defaults/kata-containers/configuration-qemu-coco-dev.toml
+          grep "create_container_timeout.*=" /opt/kata/share/defaults/kata-containers/configuration-qemu-coco-dev.toml
+
+      # This is needed in order to have enough tmpfs space inside the guest to pull the image
+      - name: Adjust Kata Containers' default_memory
+        run: |
+          sudo sed -i -e 's/^\(default_memory\).*=.*$/\1 = 4096/g' /opt/kata/share/defaults/kata-containers/configuration-qemu-coco-dev.toml
+          grep "default_memory.*=" /opt/kata/share/defaults/kata-containers/configuration-qemu-coco-dev.toml
+
+      - name: Run a few containers using overlayfs
+        run: |
+          # IMAGES_LIST is expanded unquoted on purpose, so word splitting yields one image per iteration
+          # shellcheck disable=SC2086
+          for img in ${IMAGES_LIST}; do
+            echo "overlayfs | Using on image: ${img}"
+            pod="$(echo ${img} | tr ':.@/' '-' | awk '{print substr($0,1,56)}')"
+            kubectl run "${pod}" \
+              -it --rm \
+              --restart=Never \
+              --image="${img}" \
+              --image-pull-policy=Always \
+              --pod-running-timeout=10m \
+              -- uname -r
+          done
+
+      - name: Run the same few containers using a different snapshotter
+        run: |
+          # IMAGES_LIST is expanded unquoted on purpose, so word splitting yields one image per iteration
+          # shellcheck disable=SC2086
+          for img in ${IMAGES_LIST}; do
+            echo "nydus | Using on image: ${img}"
+            pod="kata-$(echo ${img} | tr ':.@/' '-' | awk '{print substr($0,1,56)}')"
+            kubectl run "${pod}" \
+              -it --rm \
+              --restart=Never \
+              --image="${img}" \
+              --image-pull-policy=Always \
+              --pod-running-timeout=10m \
+              --overrides='{
+                "spec": {
+                  "runtimeClassName": "kata-qemu-coco-dev"
+                }
+              }' \
+              -- uname -r
+          done
+
+      - name: Uninstall Kata Containers
+        run: bash tests/integration/kubernetes/gha-run.sh cleanup
+        env:
+          KATA_HYPERVISOR: qemu-coco-dev
+          KUBERNETES: vanilla
+          SNAPSHOTTER: nydus
+          USE_EXPERIMENTAL_SETUP_SNAPSHOTTER: "true"
+
+      - name: Run a few containers using overlayfs
+        run: |
+          # IMAGES_LIST is expanded unquoted on purpose, so word splitting yields one image per iteration
+          # shellcheck disable=SC2086
+          for img in ${IMAGES_LIST}; do
+            echo "overlayfs | Using on image: ${img}"
+            pod="$(echo ${img} | tr ':.@/' '-' | awk '{print substr($0,1,56)}')"
+            kubectl run "${pod}" \
+              -it --rm \
+              --restart=Never \
+              --image="${img}" \
+              --image-pull-policy=Always \
+              --pod-running-timeout=10m \
+              -- uname -r
+          done
+
+      - name: Deploy Kata Containers
+        run: bash tests/integration/kubernetes/gha-run.sh deploy-kata
+        env:
+          KATA_HYPERVISOR: qemu-coco-dev
+          KUBERNETES: vanilla
+          SNAPSHOTTER: nydus
+          USE_EXPERIMENTAL_SETUP_SNAPSHOTTER: "true"
+
+      # This is needed as we may hit the createContainerTimeout
+      - name: Adjust Kata Containers' create_container_timeout
+        run: |
+          sudo sed -i -e 's/^\(create_container_timeout\).*=.*$/\1 = 600/g' /opt/kata/share/defaults/kata-containers/configuration-qemu-coco-dev.toml
+          grep "create_container_timeout.*=" /opt/kata/share/defaults/kata-containers/configuration-qemu-coco-dev.toml
+
+      # This is needed in order to have enough tmpfs space inside the guest to pull the image
+      - name: Adjust Kata Containers' default_memory
+        run: |
+          sudo sed -i -e 's/^\(default_memory\).*=.*$/\1 = 4096/g' /opt/kata/share/defaults/kata-containers/configuration-qemu-coco-dev.toml
+          grep "default_memory.*=" /opt/kata/share/defaults/kata-containers/configuration-qemu-coco-dev.toml
+
+      - name: Run the same few containers using a different snapshotter
+        run: |
+          # IMAGES_LIST is expanded unquoted on purpose, so word splitting yields one image per iteration
+          # shellcheck disable=SC2086
+          for img in ${IMAGES_LIST}; do
+            echo "nydus | Using on image: ${img}"
+            pod="kata-$(echo ${img} | tr ':.@/' '-' | awk '{print substr($0,1,56)}')"
+            kubectl run "${pod}" \
+              -it --rm \
+              --restart=Never \
+              --image="${img}" \
+              --image-pull-policy=Always \
+              --pod-running-timeout=10m \
+              --overrides='{
+                "spec": {
+                  "runtimeClassName": "kata-qemu-coco-dev"
+                }
+              }' \
+              -- uname -r
+          done
+
+      - name: Uninstall Kata Containers
+        run: bash tests/integration/kubernetes/gha-run.sh cleanup || true
+        if: always()
+        env:
+          KATA_HYPERVISOR: qemu-coco-dev
+          KUBERNETES: vanilla
+          SNAPSHOTTER: nydus
+          USE_EXPERIMENTAL_SETUP_SNAPSHOTTER: "true"