MlCoord: Execution queue

This CL enables multiple outstanding periodic executions. To do so, the
MlCoordinator now includes an array of started models and a queue of
models that are ready to be executed immediately. Additionally, each
periodic model has an associated timer. When that timer fires, the model
is added to the execution queue. When a model finishes executing, the
next model is popped off the queue and executed. If a model becomes
ready when there's already an execution for it queued, that execution
is dropped and a warning is printed. A cancel command is added to remove
periodic or outstanding executions.

A state debug command was also added.

Currently we can only load a single model due to limitations with the
StorageManager, but we can do so multiple times.

Tests:
Single shot: https://paste.googleplex.com/6704629669691392
Two periodic execs: https://paste.googleplex.com/5288292800004096
Overloaded warning: https://paste.googleplex.com/4549962219126784

Debug State:
KATA> state_mlcoord
kata_ml_coordinator::Running model: fubar:mobilenet_v1_emitc_static.model
kata_ml_coordinator::Loaded model: fubar:mobilenet_v1_emitc_static.model
kata_ml_coordinator::Loadable Models:
kata_ml_coordinator::  LoadableModel { bundle_id: "fubar", model_id: "mobilenet_v1_emitc_static.model", rate_in_ms: Some(2000) }
kata_ml_coordinator::  LoadableModel { bundle_id: "fubar", model_id: "mobilenet_v1_emitc_static.model", rate_in_ms: Some(6000) }
kata_ml_coordinator::Execution Queue:
kata_ml_coordinator::  fubar:mobilenet_v1_emitc_static.model
kata_ml_coordinator::Statistics: Statistics { load_failures: 0, already_queued: 0 }

Change-Id: I7637c9c390eb6ffd9ae22088f37b98c056a441c2
GitOrigin-RevId: 18c0d3fe740a37381f7f1eddee8f2224f679fd61
This commit is contained in:
Adam Jesionowski
2022-05-12 18:18:33 +00:00
committed by Sam Leffler
parent 20f1d1aa9d
commit 1662e80ef1
15 changed files with 705 additions and 266 deletions

View File

@@ -0,0 +1,22 @@
cargo-features = ["edition2021"]
[package]
name = "kata-ml-component"
version = "0.1.0"
authors = ["Adam Jesionowski <jesionowski@google.com>"]
edition = "2021"
[dependencies]
cstr_core = { version = "0.2.3", default-features = false }
kata-os-common = { path = "../../kata-os-common" }
kata-memory-interface = { path = "../../MemoryManager/kata-memory-interface" }
kata-ml-coordinator = { path = "../kata-ml-coordinator" }
kata-ml-interface = { path = "../kata-ml-interface" }
kata-timer-interface = { path = "../../TimerService/kata-timer-interface" }
log = "0.4"
spin = "0.9"
[lib]
name = "kata_ml_component"
path = "src/run.rs"
crate-type = ["staticlib"]

View File

@@ -0,0 +1,164 @@
#![no_std]
#![allow(clippy::missing_safety_doc)]
extern crate alloc;
use alloc::string::String;
use cstr_core::CStr;
use kata_ml_coordinator::MLCoordinator;
use kata_ml_coordinator::ModelIdx;
use kata_ml_interface::MlCoordError;
use kata_os_common::allocator;
use kata_os_common::logger::KataLogger;
use kata_os_common::sel4_sys;
use kata_os_common::slot_allocator::KATA_CSPACE_SLOTS;
use kata_timer_interface::*;
use log::{error, trace};
use sel4_sys::seL4_CPtr;
use spin::Mutex;
/// Global ML coordinator instance shared by every entry point in this file.
/// All accesses visible here go through the spin lock via `.lock()`.
static mut ML_COORD: Mutex<MLCoordinator> = Mutex::new(MLCoordinator::new());
// Externally defined bounds of this component's CNode slot range; `pre_init`
// reads them to initialise `KATA_CSPACE_SLOTS`.
// NOTE(review): presumably provided by the generated C glue — confirm.
extern "C" {
static SELF_CNODE_FIRST_SLOT: seL4_CPtr;
static SELF_CNODE_LAST_SLOT: seL4_CPtr;
}
/// One-time component setup: installs the logger, hands a static buffer to
/// the global allocator, and initialises the CSpace slot allocator.
///
/// Panics if `log::set_logger` reports an error.
#[no_mangle]
pub unsafe extern "C" fn pre_init() {
    // Logging first, so the setup steps below can be traced.
    static KATA_LOGGER: KataLogger = KataLogger;
    log::set_logger(&KATA_LOGGER).unwrap();
    log::set_max_level(log::LevelFilter::Trace);

    // TODO(sleffler): temp until we integrate with seL4
    const HEAP_SIZE: usize = 4 * 1024;
    static mut HEAP_MEMORY: [u8; HEAP_SIZE] = [0; HEAP_SIZE];
    let heap_start = HEAP_MEMORY.as_mut_ptr();
    let heap_len = HEAP_MEMORY.len();
    allocator::ALLOCATOR.init(heap_start as usize, heap_len);
    trace!("setup heap: start_addr {:p} size {}", heap_start, heap_len);

    // The slot allocator covers the range between the two extern symbols.
    let first_slot = SELF_CNODE_FIRST_SLOT;
    let slot_count = SELF_CNODE_LAST_SLOT - first_slot;
    KATA_CSPACE_SLOTS.init(first_slot, slot_count);
    trace!(
        "setup cspace slots: first slot {} free {}",
        KATA_CSPACE_SLOTS.base_slot(),
        KATA_CSPACE_SLOTS.free_slots()
    );
}
/// Locks the global coordinator and runs its `init` method.
// NOTE(review): the double underscore looks like an interface-init naming
// convention of the component framework — confirm before renaming.
#[no_mangle]
pub unsafe extern "C" fn mlcoord__init() {
ML_COORD.lock().init();
}
/// Component main loop: waits on the timer service, then reports every
/// completed timer to the ML coordinator.
///
/// The value returned by `timer_service_completed_timers()` is treated as a
/// 32-bit mask. For each set bit `i`, `timer_completed(i)` is called on the
/// coordinator; errors are logged and the loop carries on. Never returns.
#[no_mangle]
pub extern "C" fn run() {
    loop {
        timer_service_wait();
        let completed = timer_service_completed_timers();
        // Scan all 32 bits of the mask. The previous `0..31` range stopped at
        // bit 30, so a completion reported on bit 31 was silently dropped.
        for i in 0..u32::BITS {
            let idx: u32 = 1 << i;
            if completed & idx != 0 {
                // SAFETY: ML_COORD is a `static mut`, but every access in this
                // file goes through its spin lock.
                unsafe {
                    if let Err(e) = ML_COORD.lock().timer_completed(i as ModelIdx) {
                        error!("Error when trying to run periodic model: {:?}", e);
                    }
                }
            }
        }
    }
}
/// Converts the bundle and model identifiers received as C strings into
/// owned `String`s.
///
/// # Errors
/// * `MlCoordError::InvalidBundleId` if `c_bundle_id` is null or not UTF-8.
/// * `MlCoordError::InvalidModelId` if `c_model_id` is null or not UTF-8.
///
/// The bundle id is fully validated before the model id is looked at.
///
/// # Safety
/// Each non-null pointer must reference a NUL-terminated string that is
/// valid for reads for the duration of the call.
unsafe fn validate_ids(
    c_bundle_id: *const cstr_core::c_char,
    c_model_id: *const cstr_core::c_char,
) -> Result<(String, String), MlCoordError> {
    // `CStr::from_ptr` on a null pointer is undefined behaviour, so reject
    // null explicitly and report it as an invalid identifier instead.
    if c_bundle_id.is_null() {
        return Err(MlCoordError::InvalidBundleId);
    }
    let bundle_id = CStr::from_ptr(c_bundle_id)
        .to_str()
        .map_err(|_| MlCoordError::InvalidBundleId)?;

    if c_model_id.is_null() {
        return Err(MlCoordError::InvalidModelId);
    }
    let model_id = CStr::from_ptr(c_model_id)
        .to_str()
        .map_err(|_| MlCoordError::InvalidModelId)?;

    Ok((String::from(bundle_id), String::from(model_id)))
}
/// C entry point that validates the two identifiers and forwards them to the
/// coordinator's `oneshot` method.
///
/// Returns the validation error or the coordinator's error if either step
/// fails, and `MlCoordError::MlCoordOk` otherwise.
#[no_mangle]
pub unsafe extern "C" fn mlcoord_oneshot(
    c_bundle_id: *const cstr_core::c_char,
    c_model_id: *const cstr_core::c_char,
) -> MlCoordError {
    match validate_ids(c_bundle_id, c_model_id) {
        Err(invalid) => invalid,
        Ok((bundle_id, model_id)) => match ML_COORD.lock().oneshot(bundle_id, model_id) {
            Err(failure) => failure,
            Ok(_) => MlCoordError::MlCoordOk,
        },
    }
}
/// C entry point that validates the two identifiers and forwards them, with
/// `rate_in_ms`, to the coordinator's `periodic` method.
///
/// Returns the validation error or the coordinator's error if either step
/// fails, and `MlCoordError::MlCoordOk` otherwise.
#[no_mangle]
pub unsafe extern "C" fn mlcoord_periodic(
    c_bundle_id: *const cstr_core::c_char,
    c_model_id: *const cstr_core::c_char,
    rate_in_ms: u32,
) -> MlCoordError {
    match validate_ids(c_bundle_id, c_model_id) {
        Err(invalid) => invalid,
        Ok((bundle_id, model_id)) => {
            match ML_COORD.lock().periodic(bundle_id, model_id, rate_in_ms) {
                Err(failure) => failure,
                Ok(_) => MlCoordError::MlCoordOk,
            }
        }
    }
}
/// C entry point that validates the two identifiers and forwards them to the
/// coordinator's `cancel` method.
///
/// Returns the validation error or the coordinator's error if either step
/// fails, and `MlCoordError::MlCoordOk` otherwise.
#[no_mangle]
pub unsafe extern "C" fn mlcoord_cancel(
    c_bundle_id: *const cstr_core::c_char,
    c_model_id: *const cstr_core::c_char,
) -> MlCoordError {
    match validate_ids(c_bundle_id, c_model_id) {
        Err(invalid) => invalid,
        Ok((bundle_id, model_id)) => match ML_COORD.lock().cancel(bundle_id, model_id) {
            Err(failure) => failure,
            Ok(_) => MlCoordError::MlCoordOk,
        },
    }
}
/// Locks the global coordinator and calls `handle_host_req_interrupt` on it.
// NOTE(review): presumably wired up as the host-request interrupt handler — confirm.
#[no_mangle]
pub unsafe extern "C" fn host_req_handle() {
ML_COORD.lock().handle_host_req_interrupt();
}
/// Locks the global coordinator and calls `handle_return_interrupt` on it.
// NOTE(review): presumably invoked when an execution finishes — confirm.
#[no_mangle]
pub unsafe extern "C" fn finish_handle() {
ML_COORD.lock().handle_return_interrupt();
}
/// Locks the global coordinator and calls
/// `handle_instruction_fault_interrupt` on it.
// NOTE(review): presumably wired up as the instruction-fault interrupt handler — confirm.
#[no_mangle]
pub unsafe extern "C" fn instruction_fault_handle() {
ML_COORD.lock().handle_instruction_fault_interrupt();
}
/// Locks the global coordinator and calls `handle_data_fault_interrupt` on it.
// NOTE(review): presumably wired up as the data-fault interrupt handler — confirm.
#[no_mangle]
pub unsafe extern "C" fn data_fault_handle() {
ML_COORD.lock().handle_data_fault_interrupt();
}
/// Locks the global coordinator and calls its `debug_state` method.
// NOTE(review): presumably what backs the `state_mlcoord` shell command — confirm.
#[no_mangle]
pub unsafe extern "C" fn mlcoord_debug_state() {
ML_COORD.lock().debug_state();
}