mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-07-01 22:50:54 +00:00
kata-deploy: retry node labeling after CRI restart
On rke2/k3s a CRI restart also restarts the kubelet, which may briefly re-register the node with its cached label set and clobber the kata-runtime label that was just applied via the API. Replace the single label_node call with a retry loop that verifies the label value after setting it. If the label is missing or has the wrong value, it is re-applied (up to 10 attempts with 2 s back-off). This fixes a race condition that became more visible after the switch to individual tarball extraction, which made install take slightly longer and shifted the kubelet re-registration timing window. Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
This commit is contained in:
@@ -340,12 +340,73 @@ async fn install(config: &config::Config, runtime: &str) -> Result<()> {
|
|||||||
runtime::lifecycle::restart_runtime(config, runtime).await?;
|
runtime::lifecycle::restart_runtime(config, runtime).await?;
|
||||||
info!("Runtime restart completed successfully");
|
info!("Runtime restart completed successfully");
|
||||||
|
|
||||||
k8s::label_node(config, "katacontainers.io/kata-runtime", Some("true"), true).await?;
|
label_node_with_retry(config, "katacontainers.io/kata-runtime", "true").await?;
|
||||||
|
|
||||||
info!("Kata Containers installation completed successfully");
|
info!("Kata Containers installation completed successfully");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Label the node and verify the label sticks, retrying if necessary.
|
||||||
|
///
|
||||||
|
/// On rke2/k3s a CRI restart also restarts the kubelet. The kubelet may
|
||||||
|
/// briefly re-register the node with its cached label set, clobbering the
|
||||||
|
/// label we just applied via the API. We work around this by verifying the
|
||||||
|
/// label value after setting it and re-applying if needed.
|
||||||
|
async fn label_node_with_retry(
|
||||||
|
config: &config::Config,
|
||||||
|
label_key: &str,
|
||||||
|
label_value: &str,
|
||||||
|
) -> Result<()> {
|
||||||
|
const MAX_ATTEMPTS: u32 = 10;
|
||||||
|
const RETRY_DELAY: std::time::Duration = std::time::Duration::from_secs(2);
|
||||||
|
|
||||||
|
for attempt in 1..=MAX_ATTEMPTS {
|
||||||
|
k8s::label_node(config, label_key, Some(label_value), true).await?;
|
||||||
|
|
||||||
|
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||||
|
|
||||||
|
match k8s::get_node_label(config, label_key).await {
|
||||||
|
Ok(Some(val)) if val == label_value => {
|
||||||
|
info!(
|
||||||
|
"Label {}={} confirmed on node (attempt {})",
|
||||||
|
label_key, label_value, attempt
|
||||||
|
);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
Ok(actual) => {
|
||||||
|
log::warn!(
|
||||||
|
"Label {}={} did not stick (got {:?}), retrying ({}/{})",
|
||||||
|
label_key,
|
||||||
|
label_value,
|
||||||
|
actual,
|
||||||
|
attempt,
|
||||||
|
MAX_ATTEMPTS
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
log::warn!(
|
||||||
|
"Failed to verify label {} (attempt {}/{}): {}",
|
||||||
|
label_key,
|
||||||
|
attempt,
|
||||||
|
MAX_ATTEMPTS,
|
||||||
|
e
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if attempt < MAX_ATTEMPTS {
|
||||||
|
tokio::time::sleep(RETRY_DELAY).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
anyhow::bail!(
|
||||||
|
"Failed to set label {}={} after {} attempts",
|
||||||
|
label_key,
|
||||||
|
label_value,
|
||||||
|
MAX_ATTEMPTS
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
async fn cleanup(config: &config::Config, runtime: &str) -> Result<()> {
|
async fn cleanup(config: &config::Config, runtime: &str) -> Result<()> {
|
||||||
info!("Cleaning up Kata Containers");
|
info!("Cleaning up Kata Containers");
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user