runtime-rs: make sandbox vcpu allocation more accurate

This commit addresses a part of the same problem as PR #7623 did for the golang runtime. So far we've been rounding up individual containers' vCPU requests and then summing them up, which can lead to allocation of excess vCPUs as described in the mentioned PR's cover letter. We address this by reversing the order of operations: we sum the (possibly fractional) container requests and only then round up the total. We also align runtime-rs's behaviour with runtime-go in that we now include the default vcpu request from the config file ('default_vcpu') in the total. We diverge from PR #7623 in that `default_vcpu` is still treated as an integer (this will be a topic of a separate commit), and that this implementation avoids relying on 32-bit floating point arithmetic as there are some potential problems with using f32. For instance, some numbers commonly used in decimal, notably all of the single-decimal-digit fractions 0.1, 0.2, ..., 0.9 except 0.5, are periodic in binary and thus fundamentally not representable exactly. Arithmetic performed on such numbers can lead to surprising results, e.g. adding 0.1 ten times gives 1.0000001, not 1, and taking a ceil() results in 2, clearly a wrong answer in vcpu allocation. So instead, we take advantage of the fact that container requests happen to be expressed as a quota/period fraction, so we can sum up quotas, fundamentally integral numbers (possibly fractional only due to the need to rewrite them with a common denominator), with much less danger of precision loss. Signed-off-by: Pavel Mores <pmores@redhat.com>
2025-08-15 06:34:03 +00:00 · 2024-11-21 17:18:00 +01:00 · 2024-11-21 17:18:00 +01:00 · cdc0eab8e4
commit cdc0eab8e4
parent 163f04a918
1 changed files with 240 additions and 7 deletions
--- a/src/runtime-rs/crates/resource/src/cpu_mem/cpu.rs
+++ b/src/runtime-rs/crates/resource/src/cpu_mem/cpu.rs
@ -117,27 +117,67 @@ impl CpuResource {

    // calculates the total required vcpus by adding each container's requirements within the pod
    async fn calc_cpu_resources(&self) -> Result<u32> {
-        let mut total_vcpu = 0;
-        let mut cpuset_vcpu: HashSet<u32> = HashSet::new();
-
        let resources = self.container_cpu_resources.read().await;
+        if resources.is_empty() {
+            return Ok(self.default_vcpu);
+        }
+
+        // If requests of individual containers are expressed with different
+        // periods we'll need to rewrite them with a common denominator
+        // (period) before we can add the numerators (quotas).  We choose
+        // to use the largest period as the common denominator since it
+        // shifts precision out of the fractional part and into the
+        // integral part in case a rewritten quota ends up non-integral.
+        let max_period = resources
+            .iter()
+            .map(|(_, cpu_resource)| cpu_resource.period())
+            .max()
+            // It's ok to unwrap() here as we have checked that 'resources' is
+            // not empty.
+            .unwrap() as f64;
+
+        let mut cpuset_vcpu: HashSet<u32> = HashSet::new();
+        // Even though summing up quotas is fixed-point conceptually we
+        // represent the sum as floating-point because
+        // - we might be rewriting the quota/period fractions if periods
+        //   vary, and a rewritten quota can end up non-integral.  We want
+        //   to preserve the fractional parts until the final rounding
+        //   so as not to lose precision inadvertently.
+        // - also to avoid some tedious casting doing maths with quotas.
+        // Using a 64-bit float to represent what are conceptually integral
+        // numbers should be safe here - f64 starts losing precision for
+        // integers only past 2^53 and sums of quotas are extremely unlikely
+        // to reach that magnitude.
+        let mut total_quota: f64 = 0.0;
+
        for (_, cpu_resource) in resources.iter() {
-            let vcpu = cpu_resource.get_vcpus().unwrap_or(0) as u32;
            cpuset_vcpu.extend(cpu_resource.cpuset().iter());
-            total_vcpu += vcpu;
+
+            let quota = cpu_resource.quota() as f64;
+            let period = cpu_resource.period() as f64;
+            if quota >= 0.0 && period != 0.0 {
+                info!(sl!(), "total_quota={}, adding {}/{} == {}", total_quota, quota, period, quota * (max_period / period));
+                total_quota += quota * (max_period / period);
+            }
        }

        // contrained only by cpuset
-        if total_vcpu == 0 && !cpuset_vcpu.is_empty() {
+        if total_quota == 0.0 && !cpuset_vcpu.is_empty() {
            info!(sl!(), "(from cpuset)get vcpus # {:?}", cpuset_vcpu);
            return Ok(cpuset_vcpu.len() as u32);
        }

+        let total_vcpu = if total_quota > 0.0 && max_period != 0.0 {
+            self.default_vcpu as f64 + total_quota / max_period
+        } else {
+            self.default_vcpu as f64
+        };
+
        info!(
            sl!(),
            "(from cfs_quota&cfs_period)get vcpus count {}", total_vcpu
        );
-        Ok(total_vcpu)
+        Ok(total_vcpu.ceil() as u32)
    }

    // do hotplug and hot-unplug the vcpu
@ -169,3 +209,196 @@ impl CpuResource {
        Ok(new)
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use kata_types::config::{Hypervisor, TomlConfig};
+    use oci::LinuxCpu;
+
+    fn get_cpu_resource_with_default_vcpus(default_vcpus: i32) -> CpuResource {
+        let mut config = TomlConfig::default();
+        config
+            .hypervisor
+            .insert("qemu".to_owned(), Hypervisor::default());
+        config
+            .hypervisor
+            .entry("qemu".to_owned())
+            .and_modify(|hv_config| hv_config.cpu_info.default_vcpus = default_vcpus);
+        config.runtime.hypervisor_name = "qemu".to_owned();
+        CpuResource::new(Arc::new(config)).unwrap()
+    }
+
+    async fn add_linux_container_cpu_resources(cpu_res: &mut CpuResource, res: Vec<(i64, u64)>) {
+        let mut resources = cpu_res.container_cpu_resources.write().await;
+        for (i, (quota, period)) in res.iter().enumerate() {
+            let mut linux_cpu = LinuxCpu::default();
+            linux_cpu.set_quota(Some(*quota));
+            linux_cpu.set_period(Some(*period));
+            let res = LinuxContainerCpuResources::try_from(&linux_cpu).unwrap();
+            resources.insert(i.to_string(), res);
+        }
+    }
+
+    // A lot of the following tests document why a fixed-point-style
+    // calc_cpu_resources() implementation is better than an f32-based one.
+    #[tokio::test]
+    async fn test_rounding() {
+        let mut cpu_resource = get_cpu_resource_with_default_vcpus(0);
+
+        // An f32-based calc_cpu_resources() implementation would fail this
+        // test (adding 0.1 ten times gives roughly 1.0000001).
+        // An f64-based implementation would pass this one (with the summation
+        // result of 0.99999999999999989) but it still doesn't guarantee the
+        // correct result in general.  For instance, adding 0.1 twenty times
+        // in 64 bits results in 2.0000000000000004.
+        add_linux_container_cpu_resources(
+            &mut cpu_resource,
+            vec![
+                (100_000, 1_000_000),
+                (100_000, 1_000_000),
+                (100_000, 1_000_000),
+                (100_000, 1_000_000),
+                (100_000, 1_000_000),
+                (100_000, 1_000_000),
+                (100_000, 1_000_000),
+                (100_000, 1_000_000),
+                (100_000, 1_000_000),
+                (100_000, 1_000_000),
+            ],
+        )
+        .await;
+
+        assert_eq!(cpu_resource.calc_cpu_resources().await.unwrap(), 1);
+    }
+
+    #[tokio::test]
+    async fn test_big_allocation_1() {
+        let default_vcpus = 10;
+
+        let mut cpu_resource = get_cpu_resource_with_default_vcpus(default_vcpus as i32);
+        add_linux_container_cpu_resources(
+            &mut cpu_resource,
+            vec![
+                (32_000_000, 1_000_000),
+                (32_000_000, 1_000_000),
+                (64_000_000, 1_000_000),
+            ],
+        )
+        .await;
+
+        assert_eq!(
+            cpu_resource.calc_cpu_resources().await.unwrap(),
+            128 + default_vcpus
+        );
+    }
+
+    #[tokio::test]
+    async fn test_big_allocation_2() {
+        let default_vcpus = 10;
+        let mut cpu_resource = get_cpu_resource_with_default_vcpus(default_vcpus as i32);
+        add_linux_container_cpu_resources(
+            &mut cpu_resource,
+            vec![
+                (33_000_000, 1_000_000),
+                (31_000_000, 1_000_000),
+                (77_000_011, 1_000_000),
+            ],
+        )
+        .await;
+
+        assert_eq!(
+            cpu_resource.calc_cpu_resources().await.unwrap(),
+            (33 + 31 + 77 + 1) + default_vcpus
+        );
+    }
+
+    #[tokio::test]
+    async fn test_big_allocation_3() {
+        let default_vcpus = 10;
+        let mut cpu_resource = get_cpu_resource_with_default_vcpus(default_vcpus as i32);
+        add_linux_container_cpu_resources(&mut cpu_resource, vec![(141_000_008, 1_000_000)]).await;
+
+        assert_eq!(
+            cpu_resource.calc_cpu_resources().await.unwrap(),
+            142 + default_vcpus
+        );
+    }
+
+    #[tokio::test]
+    async fn test_big_allocation_4() {
+        let default_vcpus = 10;
+        let mut cpu_resource = get_cpu_resource_with_default_vcpus(default_vcpus as i32);
+        add_linux_container_cpu_resources(
+            &mut cpu_resource,
+            vec![
+                (17_000_001, 1_000_000),
+                (17_000_001, 1_000_000),
+                (17_000_001, 1_000_000),
+                (17_000_001, 1_000_000),
+            ],
+        )
+        .await;
+
+        assert_eq!(
+            cpu_resource.calc_cpu_resources().await.unwrap(),
+            (4 * 17 + 1) + default_vcpus
+        );
+    }
+
+    #[tokio::test]
+    async fn test_divisible_periods() {
+        let default_vcpus = 3;
+        let mut cpu_resource = get_cpu_resource_with_default_vcpus(default_vcpus as i32);
+        add_linux_container_cpu_resources(
+            &mut cpu_resource,
+            vec![(1_000_000, 1_000_000), (1_000_000, 500_000)],
+        )
+        .await;
+
+        assert_eq!(
+            cpu_resource.calc_cpu_resources().await.unwrap(),
+            3 + default_vcpus
+        );
+
+        let mut cpu_resource = get_cpu_resource_with_default_vcpus(default_vcpus as i32);
+        add_linux_container_cpu_resources(
+            &mut cpu_resource,
+            vec![(3_000_000, 1_500_000), (1_000_000, 500_000)],
+        )
+        .await;
+
+        assert_eq!(
+            cpu_resource.calc_cpu_resources().await.unwrap(),
+            4 + default_vcpus
+        );
+    }
+
+    #[tokio::test]
+    async fn test_indivisible_periods() {
+        let default_vcpus = 1;
+        let mut cpu_resource = get_cpu_resource_with_default_vcpus(default_vcpus as i32);
+        add_linux_container_cpu_resources(
+            &mut cpu_resource,
+            vec![(1_000_000, 1_000_000), (900_000, 300_000)],
+        )
+        .await;
+
+        assert_eq!(
+            cpu_resource.calc_cpu_resources().await.unwrap(),
+            4 + default_vcpus
+        );
+
+        let mut cpu_resource = get_cpu_resource_with_default_vcpus(default_vcpus as i32);
+        add_linux_container_cpu_resources(
+            &mut cpu_resource,
+            vec![(1_000_000, 1_000_000), (900_000, 299_999)],
+        )
+        .await;
+
+        assert_eq!(
+            cpu_resource.calc_cpu_resources().await.unwrap(),
+            5 + default_vcpus
+        );
+    }
+}