mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-07-01 22:50:54 +00:00
kata-deploy: retry node labeling after CRI restart
On rke2/k3s a CRI restart also restarts the kubelet, which may briefly re-register the node with its cached label set and clobber the kata-runtime label that was just applied via the API. Replace the single label_node call with a retry loop that verifies the label value after setting it. If the label is missing or has the wrong value, it is re-applied (up to 10 attempts with 2 s back-off). This fixes a race condition that became more visible after the switch to individual tarball extraction, which made install take slightly longer and shifted the kubelet re-registration timing window. Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
This commit is contained in:
@@ -340,12 +340,73 @@ async fn install(config: &config::Config, runtime: &str) -> Result<()> {
|
||||
runtime::lifecycle::restart_runtime(config, runtime).await?;
|
||||
info!("Runtime restart completed successfully");
|
||||
|
||||
k8s::label_node(config, "katacontainers.io/kata-runtime", Some("true"), true).await?;
|
||||
label_node_with_retry(config, "katacontainers.io/kata-runtime", "true").await?;
|
||||
|
||||
info!("Kata Containers installation completed successfully");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Label the node and verify the label sticks, retrying if necessary.
|
||||
///
|
||||
/// On rke2/k3s a CRI restart also restarts the kubelet. The kubelet may
|
||||
/// briefly re-register the node with its cached label set, clobbering the
|
||||
/// label we just applied via the API. We work around this by verifying the
|
||||
/// label value after setting it and re-applying if needed.
|
||||
async fn label_node_with_retry(
|
||||
config: &config::Config,
|
||||
label_key: &str,
|
||||
label_value: &str,
|
||||
) -> Result<()> {
|
||||
const MAX_ATTEMPTS: u32 = 10;
|
||||
const RETRY_DELAY: std::time::Duration = std::time::Duration::from_secs(2);
|
||||
|
||||
for attempt in 1..=MAX_ATTEMPTS {
|
||||
k8s::label_node(config, label_key, Some(label_value), true).await?;
|
||||
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
|
||||
match k8s::get_node_label(config, label_key).await {
|
||||
Ok(Some(val)) if val == label_value => {
|
||||
info!(
|
||||
"Label {}={} confirmed on node (attempt {})",
|
||||
label_key, label_value, attempt
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
Ok(actual) => {
|
||||
log::warn!(
|
||||
"Label {}={} did not stick (got {:?}), retrying ({}/{})",
|
||||
label_key,
|
||||
label_value,
|
||||
actual,
|
||||
attempt,
|
||||
MAX_ATTEMPTS
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
log::warn!(
|
||||
"Failed to verify label {} (attempt {}/{}): {}",
|
||||
label_key,
|
||||
attempt,
|
||||
MAX_ATTEMPTS,
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if attempt < MAX_ATTEMPTS {
|
||||
tokio::time::sleep(RETRY_DELAY).await;
|
||||
}
|
||||
}
|
||||
|
||||
anyhow::bail!(
|
||||
"Failed to set label {}={} after {} attempts",
|
||||
label_key,
|
||||
label_value,
|
||||
MAX_ATTEMPTS
|
||||
);
|
||||
}
|
||||
|
||||
async fn cleanup(config: &config::Config, runtime: &str) -> Result<()> {
|
||||
info!("Cleaning up Kata Containers");
|
||||
|
||||
|
||||
Reference in New Issue
Block a user