From 7e5cc37fab4580156be9e7a021e390f3edc60cb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Mon, 27 Apr 2026 16:51:23 +0200 Subject: [PATCH] runtime-rs: resource: discover hugetlbfs page sizes from sysfs in test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `volume::hugepage::tests::test_get_huge_page_size` was hard-coded to exercise the round-trip through `get_huge_page_option` / `get_page_size` for two hugetlbfs page sizes: let format_sizes = ["1Gi", "2Mi"]; These are the sizes x86_64 Ubuntu kernels expose by default (`/sys/kernel/mm/hugepages/hugepages-{1048576,2048}kB`), but other architectures use different sizes: * s390x: typically `hugepages-1048576kB` only (1 GiB; no 2 MiB pool) -- the kernel returns `EINVAL` for the missing 2 MiB iteration: thread 'volume::hugepage::tests::test_get_huge_page_size' panicked at .../resource/src/volume/hugepage.rs:242:14: called `Result::unwrap()` on an `Err` value: EINVAL * ppc64le: page sizes vary by kernel build (e.g. 16M/16G with 64K base pages, 2M/1G with 4K base pages), and may not match `["1Gi", "2Mi"]` exactly. Same EINVAL on the iteration whose size isn't a registered hstate. The reason this never bit before is the same as the SELinux test in the previous-but-one commit: the runtime-rs `Makefile` wrapped `test` in an `ifeq UNSUPPORTED_ARCHS` block that turned it into `echo ...; exit 0` on s390x/ppc64le/riscv64gc, so the test was only ever exercised on x86_64 (and aarch64, which happens to have the same default hugetlb page sizes). Dropping that gate is what exposed the latent assumption. Replace the hard-coded list with a small helper that lists the hugetlbfs page sizes the running kernel actually exposes via `/sys/kernel/mm/hugepages/hugepages-NkB`, rendered as binary-unit strings (e.g. "2Mi", "1Gi") that are accepted both by the kernel's `pagesize=...` mount option and by `byte_unit::Byte::parse_str(s, /*allow_binary=*/ true)`. If `/sys/kernel/mm/hugepages` doesn't exist or the directory is empty (e.g. hugetlbfs is unconfigured in the test environment) the test simply returns -- there's nothing meaningful to round-trip. On x86_64 the discovered list comes out as `["1Gi", "2Mi"]` (the same coverage as before). On s390x it becomes `["1Gi"]`, on ppc64le whatever that kernel build supports. Sysfs alone, however, is a necessary-but-not-sufficient signal: it tells us the kernel registered the page size, not whether *this process* is allowed to mount hugetlbfs. The ubuntu-24.04-s390x GHA runner demonstrates the gap -- it exposes `hugepages-1048576kB` via /sys but runs the build inside a user/mount namespace where mount(2) of hugetlbfs returns EPERM even when the test is invoked through sudo: thread 'volume::hugepage::tests::test_get_huge_page_size' panicked at .../resource/src/volume/hugepage.rs:292:14: called `Result::unwrap()` on an `Err` value: EPERM There's no portable capability bit we can sniff for that, so probe once with the first discovered size before iterating; if the probe mount fails, skip the test (rather than panic on something it can't control). A real regression on a host where mount() *does* work will still surface inside the loop below, since the per-size mount calls there continue to assert via `.unwrap()`. While here, feed the kernel-native shorthand (e.g. "2M", "1G") rather than the IEC form ("2Mi", "1Gi") to mount(2). hugetlbfs parses `pagesize=` via `memparse()`, which understands K/M/G but not the IEC `Ki/Mi/Gi`; today the kernel happens to silently drop the trailing `i` (memparse just stops scanning), but that leniency is incidental. /proc/mounts in turn always renders the option back as `pagesize={K,M,G}`, which is exactly the form `get_page_size()` already expects -- it strips `pagesize=` and unconditionally appends `i` before handing the result to byte_unit. Stripping the `i` for the mount option keeps the test's input aligned with the kernel's canonical syntax, while leaving the IEC form intact for the `Byte::parse_str(..., /*allow_binary=*/ true)` comparison. Also drop the unused `Ok` re-export from `use anyhow::{anyhow, Context, Ok, Result}`. Every existing `Ok(...)` site in this module is the variant-constructor form, for which the prelude's `Result::Ok` already works fine in `anyhow::Result` context (same enum, with `E = anyhow::Error` inferred from the surrounding return type), so nothing actually needed `anyhow::Ok` to begin with. Removing the import lets the new helper use plain `let Ok(entries) = ... else` / `let Ok(name) = ... else` patterns directly instead of funneling everything through `.ok()` + `if let Some(...)` to dodge the shadowing. Made-with: Cursor Signed-off-by: Fabiano FidĂȘncio Made-with: Cursor --- .../crates/resource/src/volume/hugepage.rs | 93 ++++++++++++++++++- 1 file changed, 89 insertions(+), 4 deletions(-) diff --git a/src/runtime-rs/crates/resource/src/volume/hugepage.rs b/src/runtime-rs/crates/resource/src/volume/hugepage.rs index 7349a13fe6..10df760c4c 100644 --- a/src/runtime-rs/crates/resource/src/volume/hugepage.rs +++ b/src/runtime-rs/crates/resource/src/volume/hugepage.rs @@ -13,7 +13,7 @@ use std::{ use super::{Volume, BIND}; use crate::share_fs::ephemeral_path; use agent::Storage; -use anyhow::{anyhow, Context, Ok, Result}; +use anyhow::{anyhow, Context, Result}; use async_trait::async_trait; use byte_unit::{Byte, Unit}; use hypervisor::{device::device_manager::DeviceManager, HUGETLBFS}; @@ -188,6 +188,46 @@ mod tests { }; use test_utils::skip_if_not_root; + /// List the huge page sizes the running kernel actually exposes via + /// `/sys/kernel/mm/hugepages/hugepages-NkB`, rendered as binary-unit + /// strings (e.g. "2Mi", "1Gi") that are accepted both by the kernel's + /// `pagesize=...` mount option and by `byte_unit::Byte::parse_str(s, + /// /*allow_binary=*/ true)`. + /// + /// This test was historically hard-coded to `["1Gi", "2Mi"]`, which + /// happens to match what x86_64 Ubuntu kernels expose by default, but + /// other architectures use different page sizes (s390x typically + /// exposes "1Mi", ppc64le with 64K base pages typically exposes "16Mi" + /// and/or "16Gi", etc.). Discovering them at runtime keeps the test + /// arch-portable. + fn supported_hugetlbfs_page_sizes() -> Vec { + let Ok(entries) = fs::read_dir("/sys/kernel/mm/hugepages") else { + return Vec::new(); + }; + let mut sizes = Vec::new(); + for entry in entries.flatten() { + let Ok(name) = entry.file_name().into_string() else { + continue; + }; + let Some(kib) = name + .strip_prefix("hugepages-") + .and_then(|s| s.strip_suffix("kB")) + .and_then(|s| s.parse::().ok()) + else { + continue; + }; + let s = if kib % (1024 * 1024) == 0 { + format!("{}Gi", kib / (1024 * 1024)) + } else if kib % 1024 == 0 { + format!("{}Mi", kib / 1024) + } else { + format!("{}Ki", kib) + }; + sizes.push(s); + } + sizes + } + #[test] fn test_get_huge_page_option() { let format_sizes = ["1GB", "2MB"]; @@ -227,17 +267,62 @@ mod tests { #[test] fn test_get_huge_page_size() { skip_if_not_root!(); - let format_sizes = ["1Gi", "2Mi"]; + let format_sizes = supported_hugetlbfs_page_sizes(); + if format_sizes.is_empty() { + // No hugetlbfs pools on this kernel (e.g. hugetlbfs is + // unconfigured or /sys isn't mounted in the test environment); + // nothing meaningful to round-trip. + return; + } + // Probe once before iterating: some CI runners (e.g. the + // ubuntu-24.04-s390x GHA runner) report supported huge-page sizes via + // /sys but execute the test inside a user/mount namespace where + // mount(2) of hugetlbfs is forbidden (EPERM) even when running as + // root. There's no portable capability bit we can sniff for that, so + // just try once and bail out cleanly if the kernel won't let us mount + // hugetlbfs at all -- skipping is more honest than failing on + // something this test can't control. A real regression on a host + // where mount() *does* work will still surface inside the loop below. + // Hugetlbfs's `pagesize=` mount option expects the kernel-native + // shorthand ("2M", "1G"), not byte_unit's IEC form ("2Mi", "1Gi"): + // it parses the value with `memparse()`, and `/proc/mounts` always + // renders it back as `pagesize={K,M,G}` regardless of input. Pass + // the trimmed form to mount(2) so the test doesn't rely on the + // kernel silently ignoring the trailing `i`, and keep the IEC form + // for the `Byte::parse_str(_, /*allow_binary=*/ true)` comparison. + let probe_dir = tempfile::tempdir().unwrap(); + let probe_dst = probe_dir + .path() + .join(format!("hugepages-probe-{}", format_sizes[0])); + fs::create_dir_all(&probe_dst).unwrap(); + let probe_kernel_size = format_sizes[0].trim_end_matches('i'); + if let Err(e) = mount( + Some(NODEV), + &probe_dst, + Some(HUGETLBFS), + MsFlags::MS_NODEV, + Some(format!("pagesize={}", probe_kernel_size).as_str()), + ) { + eprintln!( + "test_get_huge_page_size: skipping, hugetlbfs mount probe failed \ + (pagesize={}): {}", + probe_kernel_size, e + ); + return; + } + umount(&probe_dst).unwrap(); + for format_size in format_sizes { let dir = tempfile::tempdir().unwrap(); let dst = dir.path().join(format!("hugepages-{}", format_size)); fs::create_dir_all(&dst).unwrap(); + let kernel_size = format_size.trim_end_matches('i'); mount( Some(NODEV), &dst, Some(HUGETLBFS), MsFlags::MS_NODEV, - Some(format!("pagesize={}", format_size).as_str()), + Some(format!("pagesize={}", kernel_size).as_str()), ) .unwrap(); let mut mount = oci::Mount::default(); @@ -245,7 +330,7 @@ mod tests { let option = get_huge_page_option(&mount).unwrap().unwrap(); let page_size = get_page_size(option).unwrap(); - assert_eq!(page_size, Byte::parse_str(format_size, true).unwrap()); + assert_eq!(page_size, Byte::parse_str(&format_size, true).unwrap()); umount(&dst).unwrap(); fs::remove_dir(&dst).unwrap(); }