MlCoord: Execution queue

This CL enables multiple outstanding periodic executions. To do so, the MlCoordinator now includes an array of started models and a queue of models that are ready to be executed immediately. Additionally, each periodic model has an associated timer. When that timer fires, the model is added to the execution queue. When a model finishes executing, the next model is popped off the queue and executed. If a model becomes ready when there's already an execution for it queued, that execution is dropped and a warning is printed. A cancel command is added to remove periodic or outstanding executions. A state debug command is also added. Currently we can only load a single model due to limitations with the StorageManager, but we can do so multiple times. Tests: Single shot: https://paste.googleplex.com/6704629669691392 Two periodic execs: https://paste.googleplex.com/5288292800004096 Overloaded warning: https://paste.googleplex.com/4549962219126784 Debug State: KATA> state_mlcoord kata_ml_coordinator::Running model: fubar:mobilenet_v1_emitc_static.model kata_ml_coordinator::Loaded model: fubar:mobilenet_v1_emitc_static.model kata_ml_coordinator::Loadable Models: kata_ml_coordinator:: LoadableModel { bundle_id: "fubar", model_id: "mobilenet_v1_emitc_static.model", rate_in_ms: Some(2000) } kata_ml_coordinator:: LoadableModel { bundle_id: "fubar", model_id: "mobilenet_v1_emitc_static.model", rate_in_ms: Some(6000) } kata_ml_coordinator::Execution Queue: kata_ml_coordinator:: fubar:mobilenet_v1_emitc_static.model kata_ml_coordinator::Statistics: Statistics { load_failures: 0, already_queued: 0 } Change-Id: I7637c9c390eb6ffd9ae22088f37b98c056a441c2 GitOrigin-RevId: 18c0d3fe740a37381f7f1eddee8f2224f679fd61
2025-09-21 11:28:01 +00:00 · 2022-05-12 18:18:33 +00:00
parent 20f1d1aa9d
commit 1662e80ef1
15 changed files with 705 additions and 266 deletions
--- a/apps/system/components/MlCoordinator/kata-ml-component/Cargo.toml
+++ b/apps/system/components/MlCoordinator/kata-ml-component/Cargo.toml
@@ -0,0 +1,22 @@
+cargo-features = ["edition2021"]
+
+[package]
+name = "kata-ml-component"
+version = "0.1.0"
+authors = ["Adam Jesionowski <jesionowski@google.com>"]
+edition = "2021"
+
+[dependencies]
+cstr_core = { version = "0.2.3", default-features = false }
+kata-os-common = { path = "../../kata-os-common" }
+kata-memory-interface = { path = "../../MemoryManager/kata-memory-interface" }
+kata-ml-coordinator = { path = "../kata-ml-coordinator" }
+kata-ml-interface = { path = "../kata-ml-interface" }
+kata-timer-interface = { path = "../../TimerService/kata-timer-interface" }
+log = "0.4"
+spin = "0.9"
+
+[lib]
+name = "kata_ml_component"
+path = "src/run.rs"
+crate-type = ["staticlib"]
--- a/apps/system/components/MlCoordinator/kata-ml-component/src/run.rs
+++ b/apps/system/components/MlCoordinator/kata-ml-component/src/run.rs
@@ -0,0 +1,164 @@
+#![no_std]
+#![allow(clippy::missing_safety_doc)]
+
+extern crate alloc;
+
+use alloc::string::String;
+use cstr_core::CStr;
+use kata_ml_coordinator::MLCoordinator;
+use kata_ml_coordinator::ModelIdx;
+use kata_ml_interface::MlCoordError;
+use kata_os_common::allocator;
+use kata_os_common::logger::KataLogger;
+use kata_os_common::sel4_sys;
+use kata_os_common::slot_allocator::KATA_CSPACE_SLOTS;
+use kata_timer_interface::*;
+use log::{error, trace};
+use sel4_sys::seL4_CPtr;
+use spin::Mutex;
+
+// Global coordinator instance. Every entry point in this file locks this mutex
+// before calling into the coordinator.
+static mut ML_COORD: Mutex<MLCoordinator> = Mutex::new(MLCoordinator::new());
+
+// Externally defined slot bounds; pre_init() uses them to initialize
+// KATA_CSPACE_SLOTS (first slot, and LAST - FIRST as the size).
+extern "C" {
+    static SELF_CNODE_FIRST_SLOT: seL4_CPtr;
+    static SELF_CNODE_LAST_SLOT: seL4_CPtr;
+}
+
+/// Pre-initialization hook: installs the logger, hands the allocator a static
+/// heap, and initializes the CSpace slot allocator from the externally
+/// provided slot bounds.
+///
+/// Panics if a logger has already been installed (`set_logger(..).unwrap()`).
+#[no_mangle]
+pub unsafe extern "C" fn pre_init() {
+    // Install the logger before anything else so the trace! calls below have
+    // a sink.
+    static KATA_LOGGER: KataLogger = KataLogger;
+    log::set_logger(&KATA_LOGGER).unwrap();
+    log::set_max_level(log::LevelFilter::Trace);
+
+    // TODO(sleffler): temp until we integrate with seL4
+    const HEAP_SIZE: usize = 4 * 1024;
+    static mut HEAP_MEMORY: [u8; HEAP_SIZE] = [0; HEAP_SIZE];
+    let heap_start = HEAP_MEMORY.as_mut_ptr();
+    let heap_size = HEAP_MEMORY.len();
+    allocator::ALLOCATOR.init(heap_start as usize, heap_size);
+    trace!("setup heap: start_addr {:p} size {}", heap_start, heap_size);
+
+    // The slot allocator takes the first slot and the number of slots.
+    let first_slot = SELF_CNODE_FIRST_SLOT;
+    let slot_count = SELF_CNODE_LAST_SLOT - first_slot;
+    KATA_CSPACE_SLOTS.init(first_slot, slot_count);
+    trace!(
+        "setup cspace slots: first slot {} free {}",
+        KATA_CSPACE_SLOTS.base_slot(),
+        KATA_CSPACE_SLOTS.free_slots()
+    );
+}
+
+/// Locks the global coordinator and calls its `init` method.
+// NOTE(review): the double underscore presumably follows the CAmkES
+// `<interface>__init` naming convention — confirm against the component
+// definition.
+#[no_mangle]
+pub unsafe extern "C" fn mlcoord__init() {
+    ML_COORD.lock().init();
+}
+
+/// Component main loop: blocks on the timer service, then reports every
+/// completed timer to the coordinator. Never returns.
+///
+/// Each set bit `n` in the completed-timers mask is forwarded as
+/// `timer_completed(n)`; failures are logged and the loop keeps going.
+#[no_mangle]
+pub extern "C" fn run() {
+    loop {
+        timer_service_wait();
+        let fired_timers = timer_service_completed_timers();
+
+        // NOTE(review): this range inspects bits 0..=30 only; bit 31 of the
+        // mask is never examined. Confirm that timer ids never reach 31.
+        for timer_bit in 0..31 {
+            let bit_mask: u32 = 1 << timer_bit;
+            if fired_timers & bit_mask == 0 {
+                continue;
+            }
+            unsafe {
+                if let Err(e) = ML_COORD.lock().timer_completed(timer_bit as ModelIdx) {
+                    error!("Error when trying to run periodic model: {:?}", e);
+                }
+            }
+        }
+    }
+}
+
+/// Converts the C string ids received over the FFI boundary into owned
+/// `String`s.
+///
+/// The bundle id is checked first, then the model id.
+///
+/// # Errors
+///
+/// * `MlCoordError::InvalidBundleId` if `c_bundle_id` is null or not valid UTF-8.
+/// * `MlCoordError::InvalidModelId` if `c_model_id` is null or not valid UTF-8.
+///
+/// # Safety
+///
+/// Each non-null pointer must point to a NUL-terminated string that is valid
+/// for reads for the duration of the call.
+unsafe fn validate_ids(
+    c_bundle_id: *const cstr_core::c_char,
+    c_model_id: *const cstr_core::c_char,
+) -> Result<(String, String), MlCoordError> {
+    // CStr::from_ptr on a null pointer is undefined behavior, so reject null
+    // before dereferencing.
+    if c_bundle_id.is_null() {
+        return Err(MlCoordError::InvalidBundleId);
+    }
+    let bundle_id = CStr::from_ptr(c_bundle_id)
+        .to_str()
+        .map_err(|_| MlCoordError::InvalidBundleId)?;
+
+    if c_model_id.is_null() {
+        return Err(MlCoordError::InvalidModelId);
+    }
+    let model_id = CStr::from_ptr(c_model_id)
+        .to_str()
+        .map_err(|_| MlCoordError::InvalidModelId)?;
+
+    Ok((String::from(bundle_id), String::from(model_id)))
+}
+
+/// C entry point: validates the ids, then calls the coordinator's `oneshot`
+/// for the given bundle and model.
+///
+/// Returns the error from id validation or from `oneshot`, otherwise
+/// `MlCoordError::MlCoordOk`.
+#[no_mangle]
+pub unsafe extern "C" fn mlcoord_oneshot(
+    c_bundle_id: *const cstr_core::c_char,
+    c_model_id: *const cstr_core::c_char,
+) -> MlCoordError {
+    match validate_ids(c_bundle_id, c_model_id) {
+        Err(e) => e,
+        Ok((bundle_id, model_id)) => match ML_COORD.lock().oneshot(bundle_id, model_id) {
+            Err(e) => e,
+            Ok(_) => MlCoordError::MlCoordOk,
+        },
+    }
+}
+
+/// C entry point: validates the ids, then calls the coordinator's `periodic`
+/// for the given bundle and model, passing `rate_in_ms` through unchanged.
+///
+/// Returns the error from id validation or from `periodic`, otherwise
+/// `MlCoordError::MlCoordOk`.
+#[no_mangle]
+pub unsafe extern "C" fn mlcoord_periodic(
+    c_bundle_id: *const cstr_core::c_char,
+    c_model_id: *const cstr_core::c_char,
+    rate_in_ms: u32,
+) -> MlCoordError {
+    match validate_ids(c_bundle_id, c_model_id) {
+        Err(e) => e,
+        Ok((bundle_id, model_id)) => {
+            match ML_COORD.lock().periodic(bundle_id, model_id, rate_in_ms) {
+                Err(e) => e,
+                Ok(_) => MlCoordError::MlCoordOk,
+            }
+        }
+    }
+}
+
+/// C entry point: validates the ids, then calls the coordinator's `cancel`
+/// for the given bundle and model.
+///
+/// Returns the error from id validation or from `cancel`, otherwise
+/// `MlCoordError::MlCoordOk`.
+#[no_mangle]
+pub unsafe extern "C" fn mlcoord_cancel(
+    c_bundle_id: *const cstr_core::c_char,
+    c_model_id: *const cstr_core::c_char,
+) -> MlCoordError {
+    match validate_ids(c_bundle_id, c_model_id) {
+        Err(e) => e,
+        Ok((bundle_id, model_id)) => match ML_COORD.lock().cancel(bundle_id, model_id) {
+            Err(e) => e,
+            Ok(_) => MlCoordError::MlCoordOk,
+        },
+    }
+}
+
+/// Locks the global coordinator and calls its `handle_host_req_interrupt`
+/// method.
+// NOTE(review): presumably wired up as the host-request interrupt callback —
+// confirm against the component definition.
+#[no_mangle]
+pub unsafe extern "C" fn host_req_handle() {
+    ML_COORD.lock().handle_host_req_interrupt();
+}
+
+/// Locks the global coordinator and calls its `handle_return_interrupt`
+/// method.
+// NOTE(review): presumably wired up as the "execution finished" interrupt
+// callback — confirm against the component definition.
+#[no_mangle]
+pub unsafe extern "C" fn finish_handle() {
+    ML_COORD.lock().handle_return_interrupt();
+}
+
+/// Locks the global coordinator and calls its
+/// `handle_instruction_fault_interrupt` method.
+// NOTE(review): presumably wired up as the instruction-fault interrupt
+// callback — confirm against the component definition.
+#[no_mangle]
+pub unsafe extern "C" fn instruction_fault_handle() {
+    ML_COORD.lock().handle_instruction_fault_interrupt();
+}
+
+/// Locks the global coordinator and calls its `handle_data_fault_interrupt`
+/// method.
+// NOTE(review): presumably wired up as the data-fault interrupt callback —
+// confirm against the component definition.
+#[no_mangle]
+pub unsafe extern "C" fn data_fault_handle() {
+    ML_COORD.lock().handle_data_fault_interrupt();
+}
+
+/// Locks the global coordinator and calls its `debug_state` method.
+#[no_mangle]
+pub unsafe extern "C" fn mlcoord_debug_state() {
+    ML_COORD.lock().debug_state();
+}