diff --git a/src/runtime-rs/crates/resource/src/volume/hugepage.rs b/src/runtime-rs/crates/resource/src/volume/hugepage.rs
index 7349a13fe6..10df760c4c 100644
--- a/src/runtime-rs/crates/resource/src/volume/hugepage.rs
+++ b/src/runtime-rs/crates/resource/src/volume/hugepage.rs
@@ -13,7 +13,7 @@ use std::{
 use super::{Volume, BIND};
 use crate::share_fs::ephemeral_path;
 use agent::Storage;
-use anyhow::{anyhow, Context, Ok, Result};
+use anyhow::{anyhow, Context, Result};
 use async_trait::async_trait;
 use byte_unit::{Byte, Unit};
 use hypervisor::{device::device_manager::DeviceManager, HUGETLBFS};
@@ -188,6 +188,46 @@ mod tests {
     };
     use test_utils::skip_if_not_root;
 
+    /// List the huge page sizes the running kernel actually exposes via
+    /// `/sys/kernel/mm/hugepages/hugepages-NkB`, rendered as binary-unit
+    /// strings (e.g. "2Mi", "1Gi") as understood by
+    /// `byte_unit::Byte::parse_str(s, /*allow_binary=*/ true)`. Callers trim
+    /// the trailing `i` before using a size in a `pagesize=...` mount option.
+    ///
+    /// `test_get_huge_page_size` was historically hard-coded to
+    /// `["1Gi", "2Mi"]`, which happens to match what x86_64 Ubuntu kernels
+    /// expose by default, but other architectures use different page sizes
+    /// (s390x typically exposes "1Mi", ppc64le with 64K base pages typically
+    /// exposes "16Mi" and/or "16Gi", etc.). Discovering them at runtime keeps
+    /// the test arch-portable; an unreadable sysfs directory yields no sizes.
+    fn supported_hugetlbfs_page_sizes() -> Vec<String> {
+        let Ok(entries) = fs::read_dir("/sys/kernel/mm/hugepages") else {
+            return Vec::new();
+        };
+        let mut sizes = Vec::new();
+        for entry in entries.flatten() {
+            let Ok(name) = entry.file_name().into_string() else {
+                continue;
+            };
+            let Some(kib) = name
+                .strip_prefix("hugepages-")
+                .and_then(|s| s.strip_suffix("kB"))
+                .and_then(|s| s.parse::<u64>().ok())
+            else {
+                continue;
+            };
+            let s = if kib % (1024 * 1024) == 0 {
+                format!("{}Gi", kib / (1024 * 1024))
+            } else if kib % 1024 == 0 {
+                format!("{}Mi", kib / 1024)
+            } else {
+                format!("{}Ki", kib)
+            };
+            sizes.push(s);
+        }
+        sizes
+    }
+
     #[test]
     fn test_get_huge_page_option() {
         let format_sizes = ["1GB", "2MB"];
@@ -227,17 +267,62 @@ mod tests {
     #[test]
     fn test_get_huge_page_size() {
         skip_if_not_root!();
-        let format_sizes = ["1Gi", "2Mi"];
+        let format_sizes = supported_hugetlbfs_page_sizes();
+        if format_sizes.is_empty() {
+            // No hugetlbfs pools on this kernel (e.g. hugetlbfs is
+            // unconfigured or /sys isn't mounted in the test environment);
+            // nothing meaningful to round-trip.
+            return;
+        }
+        // Probe once before iterating: some CI runners (e.g. the
+        // ubuntu-24.04-s390x GHA runner) report supported huge-page sizes via
+        // /sys but execute the test inside a user/mount namespace where
+        // mount(2) of hugetlbfs is forbidden (EPERM) even when running as
+        // root. There's no portable capability bit we can sniff for that, so
+        // just try once and bail out cleanly if the kernel won't let us mount
+        // hugetlbfs at all -- skipping is more honest than failing on
+        // something this test can't control. A real regression on a host
+        // where mount() *does* work will still surface inside the loop below.
+        // Hugetlbfs's `pagesize=` mount option expects the kernel-native
+        // shorthand ("2M", "1G"), not byte_unit's IEC form ("2Mi", "1Gi"):
+        // it parses the value with `memparse()`, and `/proc/mounts` always
+        // renders it back as `pagesize={K,M,G}` regardless of input. Pass
+        // the trimmed form to mount(2) so the test doesn't rely on the
+        // kernel silently ignoring the trailing `i`, and keep the IEC form
+        // for the `Byte::parse_str(_, /*allow_binary=*/ true)` comparison.
+        let probe_dir = tempfile::tempdir().unwrap();
+        let probe_dst = probe_dir
+            .path()
+            .join(format!("hugepages-probe-{}", format_sizes[0]));
+        fs::create_dir_all(&probe_dst).unwrap();
+        let probe_kernel_size = format_sizes[0].trim_end_matches('i');
+        if let Err(e) = mount(
+            Some(NODEV),
+            &probe_dst,
+            Some(HUGETLBFS),
+            MsFlags::MS_NODEV,
+            Some(format!("pagesize={}", probe_kernel_size).as_str()),
+        ) {
+            eprintln!(
+                "test_get_huge_page_size: skipping, hugetlbfs mount probe failed \
+                 (pagesize={}): {}",
+                probe_kernel_size, e
+            );
+            return;
+        }
+        umount(&probe_dst).unwrap();
+
         for format_size in format_sizes {
             let dir = tempfile::tempdir().unwrap();
             let dst = dir.path().join(format!("hugepages-{}", format_size));
             fs::create_dir_all(&dst).unwrap();
+            let kernel_size = format_size.trim_end_matches('i');
             mount(
                 Some(NODEV),
                 &dst,
                 Some(HUGETLBFS),
                 MsFlags::MS_NODEV,
-                Some(format!("pagesize={}", format_size).as_str()),
+                Some(format!("pagesize={}", kernel_size).as_str()),
             )
             .unwrap();
             let mut mount = oci::Mount::default();
@@ -245,7 +330,7 @@ mod tests {
 
             let option = get_huge_page_option(&mount).unwrap().unwrap();
             let page_size = get_page_size(option).unwrap();
-            assert_eq!(page_size, Byte::parse_str(format_size, true).unwrap());
+            assert_eq!(page_size, Byte::parse_str(&format_size, true).unwrap());
             umount(&dst).unwrap();
             fs::remove_dir(&dst).unwrap();
         }