MlCoordinator: Use return information at model_output

Previously we put the return code and fault PC (if any) at the very end of the TCM. This was always intended to be temporary. The return information has instead been placed at the beginning of the model_output section.

Minor changes:
* running_model tracks the image_id instead of an index.
* Re-ordered ImageSizes members to match layout in memory.
* Rename unpacked_size to in_memory_size to match rest of MlCoordinator.
* Fix fake-vec-core to match kata-vec-core.

Change-Id: I33ceb59cb36312b60992a6ada49605ffa3b2fa78
GitOrigin-RevId: 5cf399e2c609e0b4ca83101714881b02eb09b94e
2025-09-16 15:08:27 +00:00 · 2022-08-08 14:42:36 -07:00
parent fd7f31bcb2
commit 492a4e725d
5 changed files with 72 additions and 42 deletions
--- a/apps/system/components/MlCoordinator/fake-vec-core/src/lib.rs
+++ b/apps/system/components/MlCoordinator/fake-vec-core/src/lib.rs
@@ -5,12 +5,12 @@ extern crate alloc;

 use alloc::boxed::Box;
 use kata_io::Read;
-use kata_ml_shared::Permission;
+use kata_ml_shared::{OutputHeader, Permission, WindowId};

 pub fn enable_interrupts(_enable: bool) {}

 pub fn set_wmmu_window(
-    _window_id: usize,
+    _window_id: WindowId,
    _start_address: usize,
    _length: usize,
    _permission: Permission,
@@ -42,6 +42,4 @@ pub fn clear_tcm(_addr: usize, _len: usize) {}

 pub fn wait_for_clear_to_finish() {}

-pub fn get_return_code() -> u32 { 0 }
-
-pub fn get_fault_register() -> u32 { 0 }
+pub fn get_output_header(_addr: usize) -> OutputHeader { OutputHeader::default() }
--- a/apps/system/components/MlCoordinator/kata-ml-coordinator/src/lib.rs
+++ b/apps/system/components/MlCoordinator/kata-ml-coordinator/src/lib.rs
@@ -33,8 +33,8 @@ struct Statistics {
 }

 pub struct MLCoordinator {
-    /// The currently running model index, if any.
-    running_model: Option<ModelIdx>,
+    /// The currently running model, if any.
+    running_model: Option<ImageId>,
    /// A list of all models that have been requested for oneshot or periodic
    /// execution.
    models: [Option<LoadableModel>; MAX_MODELS],
@@ -233,7 +233,7 @@ impl MLCoordinator {

        self.image_manager.set_wmmu(&model.id);

-        self.running_model = Some(next_idx);
+        self.running_model = Some(model.id.clone());
        MlCore::run(); // Start core at default PC.

        Ok(())
@@ -244,17 +244,25 @@ impl MLCoordinator {
            fn finish_acknowledge() -> u32;
        }

-        // TODO(jesionowski): Move the result from TCM to SRAM,
-        // update the input/model.
-        let return_code = MlCore::get_return_code();
-        let fault = MlCore::get_fault_register();
+        if let Some(image_id) = self.running_model.as_ref() {
+            if let Some(output_header) = self.image_manager.output_header(image_id) {
+                // TODO(jesionowski): Move the result from TCM to SRAM,
+                // update the input/model.

-        // TODO(jesionowski): Signal the application that there was a failure.
-        if return_code != 0 {
-            error!(
-                "vctop execution failed with code {}, fault pc: {:#010X}",
-                return_code, fault
-            );
+                if output_header.return_code != 0 {
+                    // TODO(jesionowski): Signal the application that there was a failure.
+                    error!(
+                        "vctop execution failed with code {}, fault pc: {:#010X}",
+                        output_header.return_code, output_header.epc
+                    );
+                }
+            } else {
+                // This can happen during normal execution if mlcancel happens
+                // during an execution.
+                warn!("Executable finished running but image is not loaded.");
+            }
+        } else {
+            error!("Unexpected return interrupt with no running model.")
        }

        self.running_model = None;
@@ -432,10 +440,9 @@ impl MLCoordinator {
    }

    pub fn debug_state(&self) {
-        match self.running_model {
-            Some(idx) => {
-                let (bundle, model) = self.ids_at(idx);
-                info!("Running model: {}:{}", bundle, model);
+        match &self.running_model {
+            Some(id) => {
+                info!("Running model: {}:{}", id.bundle_id, id.model_id);
            }
            None => info!("No running model."),
        }
--- a/apps/system/components/MlCoordinator/kata-ml-shared/src/lib.rs
+++ b/apps/system/components/MlCoordinator/kata-ml-shared/src/lib.rs
@@ -20,10 +20,10 @@ pub struct ImageId {
 /// description of each section. Sizes are in bytes.
 #[derive(Clone, Copy, Debug, Default)]
 pub struct ImageSizes {
-    pub text: usize,
    pub model_input: usize,
-    pub model_output: usize,
+    pub text: usize,
    pub constant_data: usize,
+    pub model_output: usize,
    pub static_data: usize,
    pub temporary_data: usize,
 }
@@ -48,6 +48,19 @@ impl ImageSizes {
            && self.static_data != 0
            && self.temporary_data != 0
    }
+
+    pub fn model_output_offset(&self) -> usize { self.text + self.constant_data }
+}
+
+/// After execution our ML executable populates the top of .model_output with
+/// the return code, the address of the fault if the RC is non-zero, and the
+/// length of the output that follows.
+#[derive(Clone, Copy, Debug, Default)]
+#[repr(C)]
+pub struct OutputHeader {
+    pub return_code: u32,
+    pub epc: u32,
+    pub output_length: u32,
 }

 /// The page size of the WMMU.
--- a/apps/system/components/MlCoordinator/kata-ml-support/src/image_manager.rs
+++ b/apps/system/components/MlCoordinator/kata-ml-support/src/image_manager.rs
@@ -329,6 +329,18 @@ impl ImageManager {
    /// Zeroes out the temporary data section.
    pub fn clear_temp_data(&self) { MlCore::clear_tcm(self.tcm_bottom, self.tcm_bottom_size()); }

+    /// Gets the output header for |id|.
+    pub fn output_header(&self, id: &ImageId) -> Option<OutputHeader> {
+        match self.get_image_index(id) {
+            Some(idx) => {
+                let image = &self.images[idx].as_ref().unwrap();
+                let addr = image.data_top_addr + image.sizes.model_output_offset();
+                Some(MlCore::get_output_header(addr))
+            }
+            None => None,
+        }
+    }
+
    fn ids_at(&self, idx: ImageIdx) -> (&str, &str) {
        match self.images[idx].as_ref() {
            Some(image) => (&image.id.bundle_id, &image.id.model_id),
--- a/apps/system/components/MlCoordinator/kata-vec-core/src/lib.rs
+++ b/apps/system/components/MlCoordinator/kata-vec-core/src/lib.rs
@@ -10,13 +10,17 @@ mod vc_top;
 use core::mem::size_of;
 use core::slice;
 use kata_io::Read;
-use kata_ml_shared::{Permission, WindowId, TCM_PADDR, TCM_SIZE};
+use kata_ml_shared::{OutputHeader, Permission, WindowId, TCM_PADDR, TCM_SIZE};
 use log::{error, trace};

 extern "C" {
    static TCM: *mut u32;
 }

+fn get_tcm_slice() -> &'static mut [u32] {
+    unsafe { slice::from_raw_parts_mut(TCM, TCM_SIZE / size_of::<u32>()) }
+}
+
 pub fn enable_interrupts(enable: bool) {
    let intr_enable = vc_top::IntrEnable::new()
        .with_host_req(enable)
@@ -57,12 +61,12 @@ pub fn run() {

 /// Writes the section of the image from |start_address| to
 /// |start_address + on_flash_size| into the TCM. Zeroes the section from
-/// |on_flash_size| to |unpacked_size|. Returns None if the write failed.
+/// |on_flash_size| to |in_memory_size|. Returns None if the write failed.
 pub fn write_image_part<R: Read>(
    image: &mut R,
    start_address: usize,
    on_flash_size: usize,
-    unpacked_size: usize,
+    in_memory_size: usize,
 ) -> Option<()> {
    let start = start_address - TCM_PADDR;

@@ -70,7 +74,7 @@ pub fn write_image_part<R: Read>(
        "Writing {:x} bytes to 0x{:x}, {:x} unpacked size",
        on_flash_size,
        start_address,
-        unpacked_size
+        in_memory_size
    );

    let tcm_slice = unsafe { slice::from_raw_parts_mut(TCM as *mut u8, TCM_SIZE) };
@@ -81,7 +85,7 @@ pub fn write_image_part<R: Read>(
    };

    // TODO(jesionowski): Use hardware clear when TCM_SIZE fits into INIT_END.
-    tcm_slice[start + on_flash_size..start + unpacked_size].fill(0x00);
+    tcm_slice[start + on_flash_size..start + in_memory_size].fill(0x00);

    Some(())
 }
@@ -160,19 +164,15 @@ pub fn clear_tcm(addr: usize, byte_length: usize) {
 #[allow(dead_code)]
 pub fn wait_for_clear_to_finish() { while !vc_top::get_init_status().init_done() {} }

-// TODO(jesionowski): Remove these when error handling is refactored.
-// The status will be faulty iff the interrupt line is raised, and
-// we won't have the fault registers on Springbok.
-fn get_tcm_slice() -> &'static mut [u32] {
-    unsafe { slice::from_raw_parts_mut(TCM, TCM_SIZE / size_of::<u32>()) }
-}
+/// Transmutes a copy of the bytes at |addr| into an OutputHeader.
+pub fn get_output_header(addr: usize) -> OutputHeader {
+    assert!(addr >= TCM_PADDR);
+    assert!(addr + size_of::<OutputHeader>() <= TCM_PADDR + TCM_SIZE);

-pub fn get_return_code() -> u32 {
-    const RC_OFFSET: usize = 0x3FFFEE;
-    get_tcm_slice()[RC_OFFSET]
-}
+    let offset: isize = (addr - TCM_PADDR).try_into().unwrap();

-pub fn get_fault_register() -> u32 {
-    const FAULT_OFFSET: usize = 0x3FFFEF;
-    get_tcm_slice()[FAULT_OFFSET]
+    unsafe {
+        let ptr = TCM.offset(offset) as *const OutputHeader;
+        *ptr
+    }
 }