From efd468df3f8a5a0a000c51e7040f881939aba54d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Sat, 16 May 2026 10:56:57 +0200 Subject: [PATCH] kata-deploy: retry node labeling after CRI restart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On rke2/k3s a CRI restart also restarts the kubelet, which may briefly re-register the node with its cached label set and clobber the kata-runtime label that was just applied via the API. Replace the single label_node call with a retry loop that verifies the label value after setting it. If the label is missing or has the wrong value, it is re-applied (up to 10 attempts with 2 s back-off). This fixes a race condition that became more visible after the switch to individual tarball extraction, which made install take slightly longer and shifted the kubelet re-registration timing window. Signed-off-by: Fabiano Fidêncio --- .../packaging/kata-deploy/binary/src/main.rs | 63 ++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/tools/packaging/kata-deploy/binary/src/main.rs b/tools/packaging/kata-deploy/binary/src/main.rs index 321468a5d3..c4bb1af93b 100644 --- a/tools/packaging/kata-deploy/binary/src/main.rs +++ b/tools/packaging/kata-deploy/binary/src/main.rs @@ -340,12 +340,73 @@ async fn install(config: &config::Config, runtime: &str) -> Result<()> { runtime::lifecycle::restart_runtime(config, runtime).await?; info!("Runtime restart completed successfully"); - k8s::label_node(config, "katacontainers.io/kata-runtime", Some("true"), true).await?; + label_node_with_retry(config, "katacontainers.io/kata-runtime", "true").await?; info!("Kata Containers installation completed successfully"); Ok(()) } +/// Label the node and verify the label sticks, retrying if necessary. +/// +/// On rke2/k3s a CRI restart also restarts the kubelet. 
The kubelet may
+/// briefly re-register the node with its cached label set, clobbering the
+/// label we just applied via the API. We work around this by verifying the
+/// label value after setting it and re-applying if needed.
+///
+/// A failure to apply the label is treated like a label that did not stick:
+/// it is logged and retried rather than aborting the install right away.
+///
+/// # Errors
+///
+/// Returns an error if the label could not be applied and confirmed within
+/// `MAX_ATTEMPTS` attempts.
+async fn label_node_with_retry(
+    config: &config::Config,
+    label_key: &str,
+    label_value: &str,
+) -> Result<()> {
+    const MAX_ATTEMPTS: u32 = 10;
+    // Pause between applying the label and reading it back.
+    const VERIFY_DELAY: std::time::Duration = std::time::Duration::from_secs(1);
+    // Pause between one failed attempt and the next.
+    const RETRY_DELAY: std::time::Duration = std::time::Duration::from_secs(2);
+
+    for attempt in 1..=MAX_ATTEMPTS {
+        // Back off before every attempt except the first, so no time is
+        // wasted sleeping after the final attempt.
+        if attempt > 1 {
+            tokio::time::sleep(RETRY_DELAY).await;
+        }
+
+        // Do not use `?` here: a write that fails during the restart window
+        // should consume one attempt, not end the whole retry loop.
+        if let Err(e) = k8s::label_node(config, label_key, Some(label_value), true).await {
+            log::warn!(
+                "Failed to apply label {}={} (attempt {}/{}): {}",
+                label_key,
+                label_value,
+                attempt,
+                MAX_ATTEMPTS,
+                e
+            );
+            continue;
+        }
+
+        // Wait before reading back so that a clobbering re-registration has
+        // a chance to show up in the value we verify.
+        tokio::time::sleep(VERIFY_DELAY).await;
+
+        match k8s::get_node_label(config, label_key).await {
+            Ok(Some(val)) if val == label_value => {
+                info!(
+                    "Label {}={} confirmed on node (attempt {})",
+                    label_key, label_value, attempt
+                );
+                return Ok(());
+            }
+            // Label missing (`None`) or present with a different value.
+            Ok(actual) => {
+                log::warn!(
+                    "Label {}={} did not stick (got {:?}), retrying ({}/{})",
+                    label_key,
+                    label_value,
+                    actual,
+                    attempt,
+                    MAX_ATTEMPTS
+                );
+            }
+            // The read-back itself failed; treat it as unconfirmed.
+            Err(e) => {
+                log::warn!(
+                    "Failed to verify label {} (attempt {}/{}): {}",
+                    label_key,
+                    attempt,
+                    MAX_ATTEMPTS,
+                    e
+                );
+            }
+        }
+    }
+
+    anyhow::bail!(
+        "Failed to set label {}={} after {} attempts",
+        label_key,
+        label_value,
+        MAX_ATTEMPTS
+    );
+}
+
 async fn cleanup(config: &config::Config, runtime: &str) -> Result<()> {
     info!("Cleaning up Kata Containers");