dragonball: vcpu metrics change to be recorded per vcpu

This commit changes the vcpu metrics in Dragonball so that they are recorded per vCPU.

Fixes: #7248

Signed-off-by: lisongqian <mail@lisongqian.cn>
This commit is contained in:
lisongqian 2023-09-23 16:08:47 +08:00
parent fa60fbe023
commit dbfe6512fc
6 changed files with 181 additions and 38 deletions

View File

@ -5,11 +5,14 @@
extern crate procfs;
use crate::metric::{IncMetric, METRICS};
use anyhow::{anyhow, Result};
use prometheus::{Encoder, IntCounter, IntGaugeVec, Opts, Registry, TextEncoder};
use std::sync::Mutex;
use anyhow::{anyhow, Result};
use dbs_utils::metric::IncMetric;
use prometheus::{Encoder, IntCounter, IntGaugeVec, Opts, Registry, TextEncoder};
use crate::metric::METRICS;
const NAMESPACE_KATA_HYPERVISOR: &str = "kata_hypervisor";
lazy_static! {
@ -23,7 +26,7 @@ lazy_static! {
IntCounter::new(format!("{}_{}",NAMESPACE_KATA_HYPERVISOR,"scrape_count"), "Hypervisor metrics scrape count.").unwrap();
static ref HYPERVISOR_VCPU: IntGaugeVec =
IntGaugeVec::new(Opts::new(format!("{}_{}",NAMESPACE_KATA_HYPERVISOR,"vcpu"), "Hypervisor metrics specific to VCPUs' mode of functioning."), &["item"]).unwrap();
IntGaugeVec::new(Opts::new(format!("{}_{}",NAMESPACE_KATA_HYPERVISOR,"vcpu"), "Hypervisor metrics specific to VCPUs' mode of functioning."), &["cpu_id", "item"]).unwrap();
static ref HYPERVISOR_SECCOMP: IntGaugeVec =
IntGaugeVec::new(Opts::new(format!("{}_{}",NAMESPACE_KATA_HYPERVISOR,"seccomp"), "Hypervisor metrics for the seccomp filtering."), &["item"]).unwrap();
@ -75,30 +78,33 @@ fn update_hypervisor_metrics() -> Result<()> {
}
fn set_intgauge_vec_vcpu(icv: &prometheus::IntGaugeVec) {
icv.with_label_values(&["exit_io_in"])
.set(METRICS.vcpu.exit_io_in.count() as i64);
icv.with_label_values(&["exit_io_out"])
.set(METRICS.vcpu.exit_io_out.count() as i64);
icv.with_label_values(&["exit_mmio_read"])
.set(METRICS.vcpu.exit_mmio_read.count() as i64);
icv.with_label_values(&["exit_mmio_write"])
.set(METRICS.vcpu.exit_mmio_write.count() as i64);
icv.with_label_values(&["failures"])
.set(METRICS.vcpu.failures.count() as i64);
icv.with_label_values(&["filter_cpuid"])
.set(METRICS.vcpu.filter_cpuid.count() as i64);
let metric_guard = METRICS.read().unwrap();
for (cpu_id, metrics) in metric_guard.vcpu.iter() {
icv.with_label_values(&[cpu_id.to_string().as_str(), "exit_io_in"])
.set(metrics.exit_io_in.count() as i64);
icv.with_label_values(&[cpu_id.to_string().as_str(), "exit_io_out"])
.set(metrics.exit_io_out.count() as i64);
icv.with_label_values(&[cpu_id.to_string().as_str(), "exit_mmio_read"])
.set(metrics.exit_mmio_read.count() as i64);
icv.with_label_values(&[cpu_id.to_string().as_str(), "exit_mmio_write"])
.set(metrics.exit_mmio_write.count() as i64);
icv.with_label_values(&[cpu_id.to_string().as_str(), "failures"])
.set(metrics.failures.count() as i64);
icv.with_label_values(&[cpu_id.to_string().as_str(), "filter_cpuid"])
.set(metrics.filter_cpuid.count() as i64);
}
}
fn set_intgauge_vec_seccomp(icv: &prometheus::IntGaugeVec) {
let metric_gurad = METRICS.read().unwrap();
let metric_guard = METRICS.read().unwrap();
icv.with_label_values(&["num_faults"])
.set(metric_gurad.seccomp.num_faults.count() as i64);
.set(metric_guard.seccomp.num_faults.count() as i64);
}
fn set_intgauge_vec_signals(icv: &prometheus::IntGaugeVec) {
let metric_gurad = METRICS.read().unwrap();
let metric_guard = METRICS.read().unwrap();
icv.with_label_values(&["sigbus"])
.set(metric_gurad.signals.sigbus.count() as i64);
.set(metric_guard.signals.sigbus.count() as i64);
icv.with_label_values(&["sigsegv"])
.set(metric_gurad.signals.sigsegv.count() as i64);
.set(metric_guard.signals.sigsegv.count() as i64);
}

View File

@ -2,14 +2,19 @@
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use dbs_utils::metric::{IncMetric, SharedIncMetric};
use dbs_utils::metric::SharedIncMetric;
use lazy_static::lazy_static;
use serde::Serialize;
lazy_static! {
/// Static instance used for handling metrics.
/// # Static instance used for handling metrics.
///
/// A single big lock guards the whole DragonballMetrics because it holds various device
/// metric types, and write operations only happen when creating or removing devices, so
/// the lock contention overhead is low.
pub static ref METRICS: RwLock<DragonballMetrics> = RwLock::new(DragonballMetrics::default());
}
@ -50,9 +55,121 @@ pub struct SignalMetrics {
#[derive(Default, Serialize)]
pub struct DragonballMetrics {
/// Metrics related to a vcpu's functioning.
pub vcpu: VcpuMetrics,
pub vcpu: HashMap<u32, Arc<VcpuMetrics>>,
/// Metrics related to seccomp filtering.
pub seccomp: SeccompMetrics,
/// Metrics related to signals.
pub signals: SignalMetrics,
}
#[cfg(test)]
mod tests {
    use std::sync::Arc;
    use std::thread;

    use dbs_utils::metric::IncMetric;

    use crate::metric::{VcpuMetrics, METRICS};

    /// An increment made through a handle that was registered in the global map
    /// is visible when the same entry is read back from the map.
    #[test]
    fn test_read_map() {
        let vcpu_id: u32 = u32::MIN;
        let handle = Arc::new(VcpuMetrics::default());
        {
            // Keep the write guard scoped to the registration only.
            let mut registry = METRICS.write().unwrap();
            registry.vcpu.insert(vcpu_id, Arc::clone(&handle));
        }

        handle.failures.inc();

        let registry = METRICS.read().unwrap();
        let observed = registry.vcpu.get(&vcpu_id).unwrap().failures.count();
        assert_eq!(observed, 1);
    }

    /// Two threads incrementing the same counter ten times each yield a total
    /// of twenty when read through the global map.
    #[test]
    fn test_metrics_count() {
        let vcpu_id: u32 = 65535;
        let shared = Arc::new(VcpuMetrics::default());
        {
            let mut registry = METRICS.write().unwrap();
            registry.vcpu.insert(vcpu_id, Arc::clone(&shared));
        }

        // Spawn both workers before joining either of them.
        let workers: Vec<_> = (0..2)
            .map(|_| {
                let local = Arc::clone(&shared);
                thread::spawn(move || {
                    for _ in 0..10 {
                        local.exit_io_in.inc();
                    }
                })
            })
            .collect();
        for worker in workers {
            worker.join().unwrap();
        }

        let registry = METRICS.read().unwrap();
        let observed = registry.vcpu.get(&vcpu_id).unwrap().exit_io_in.count();
        assert_eq!(observed, 20);
    }

    /// A reader thread keeps seeing its own untouched entry while a writer
    /// thread repeatedly replaces a different entry in the global map.
    #[test]
    fn test_rw_lock() {
        let reader_id: u32 = u32::MAX;
        let reader_entry = Arc::new(VcpuMetrics::default());
        {
            let mut registry = METRICS.write().unwrap();
            registry.vcpu.insert(reader_id, Arc::clone(&reader_entry));
        }

        let write_thread = thread::spawn(move || {
            let writer_id: u32 = 128;
            for _ in 0..10 {
                let fresh = Arc::new(VcpuMetrics::default());
                let mut registry = METRICS.write().unwrap();
                registry.vcpu.insert(writer_id, Arc::clone(&fresh));
            }
        });

        let read_thread = thread::spawn(move || {
            for _ in 0..10 {
                assert_eq!(
                    METRICS
                        .read()
                        .unwrap()
                        .vcpu
                        .get(&reader_id)
                        .unwrap()
                        .failures
                        .count(),
                    0
                );
            }
        });

        write_thread.join().unwrap();
        read_thread.join().unwrap();
    }
}

View File

@ -10,7 +10,6 @@ use std::ops::Deref;
use std::sync::mpsc::{channel, Sender};
use std::sync::Arc;
use crate::IoManagerCached;
use dbs_arch::{regs, VpmuFeatureLevel};
use dbs_boot::get_fdt_addr;
use dbs_utils::time::TimestampUs;
@ -19,8 +18,10 @@ use vm_memory::{Address, GuestAddress, GuestAddressSpace};
use vmm_sys_util::eventfd::EventFd;
use crate::address_space_manager::GuestAddressSpaceImpl;
use crate::metric::VcpuMetrics;
use crate::vcpu::vcpu_impl::{Result, Vcpu, VcpuError, VcpuStateEvent};
use crate::vcpu::VcpuConfig;
use crate::IoManagerCached;
#[allow(unused)]
impl Vcpu {
@ -67,6 +68,7 @@ impl Vcpu {
support_immediate_exit,
mpidr: 0,
exit_evt,
metrics: Arc::new(VcpuMetrics::default()),
})
}

View File

@ -15,6 +15,7 @@ use std::sync::mpsc::{Receiver, Sender, TryRecvError};
use std::sync::{Arc, Barrier};
use std::thread;
use dbs_utils::metric::IncMetric;
use dbs_utils::time::TimestampUs;
use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
use kvm_ioctls::{VcpuExit, VcpuFd};
@ -25,7 +26,7 @@ use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, Killable};
use super::sm::StateMachine;
use crate::metric::{IncMetric, METRICS};
use crate::metric::{VcpuMetrics, METRICS};
use crate::signal_handler::sigrtmin;
use crate::IoManagerCached;
@ -303,6 +304,9 @@ pub struct Vcpu {
// Whether kvm used supports immediate_exit flag.
support_immediate_exit: bool,
// metrics for a vCPU.
metrics: Arc<VcpuMetrics>,
// CPUID information for the x86_64 CPU
#[cfg(target_arch = "x86_64")]
cpuid: kvm_bindings::CpuId,
@ -446,7 +450,7 @@ impl Vcpu {
#[cfg(target_arch = "x86_64")]
VcpuExit::IoIn(addr, data) => {
let _ = self.io_mgr.pio_read(addr, data);
METRICS.vcpu.exit_io_in.inc();
self.metrics.exit_io_in.inc();
Ok(VcpuEmulation::Handled)
}
#[cfg(target_arch = "x86_64")]
@ -454,17 +458,17 @@ impl Vcpu {
if !self.check_io_port_info(addr, data)? {
let _ = self.io_mgr.pio_write(addr, data);
}
METRICS.vcpu.exit_io_out.inc();
self.metrics.exit_io_out.inc();
Ok(VcpuEmulation::Handled)
}
VcpuExit::MmioRead(addr, data) => {
let _ = self.io_mgr.mmio_read(addr, data);
METRICS.vcpu.exit_mmio_read.inc();
self.metrics.exit_mmio_read.inc();
Ok(VcpuEmulation::Handled)
}
VcpuExit::MmioWrite(addr, data) => {
let _ = self.io_mgr.mmio_write(addr, data);
METRICS.vcpu.exit_mmio_write.inc();
self.metrics.exit_mmio_write.inc();
Ok(VcpuEmulation::Handled)
}
VcpuExit::Hlt => {
@ -477,12 +481,12 @@ impl Vcpu {
}
// Documentation specifies that below kvm exits are considered errors.
VcpuExit::FailEntry(reason, cpu) => {
METRICS.vcpu.failures.inc();
self.metrics.failures.inc();
error!("Received KVM_EXIT_FAIL_ENTRY signal, reason {reason}, cpu number {cpu}");
Err(VcpuError::VcpuUnhandledKvmExit)
}
VcpuExit::InternalError => {
METRICS.vcpu.failures.inc();
self.metrics.failures.inc();
error!("Received KVM_EXIT_INTERNAL_ERROR signal");
Err(VcpuError::VcpuUnhandledKvmExit)
}
@ -495,7 +499,7 @@ impl Vcpu {
Ok(VcpuEmulation::Stopped)
}
_ => {
METRICS.vcpu.failures.inc();
self.metrics.failures.inc();
error!(
"Received KVM_SYSTEM_EVENT signal type: {}, flag: {}",
event_type, event_flags
@ -504,7 +508,7 @@ impl Vcpu {
}
},
r => {
METRICS.vcpu.failures.inc();
self.metrics.failures.inc();
// TODO: Are we sure we want to finish running a vcpu upon
// receiving a vm exit that is not necessarily an error?
error!("Unexpected exit reason on vcpu run: {:?}", r);
@ -523,7 +527,7 @@ impl Vcpu {
Ok(VcpuEmulation::Interrupted)
}
_ => {
METRICS.vcpu.failures.inc();
self.metrics.failures.inc();
error!("Failure during vcpu run: {}", e);
#[cfg(target_arch = "x86_64")]
{
@ -731,7 +735,7 @@ impl Vcpu {
fn waiting_exit(&mut self) -> StateMachine<Self> {
// trigger vmm to stop machine
if let Err(e) = self.exit_evt.write(1) {
METRICS.vcpu.failures.inc();
self.metrics.failures.inc();
error!("Failed signaling vcpu exit event: {}", e);
}
@ -765,11 +769,17 @@ impl Vcpu {
pub fn vcpu_fd(&self) -> &VcpuFd {
self.fd.as_ref()
}
/// Returns a shared handle to this vCPU's metrics.
///
/// Only the `Arc` reference count is bumped; the returned handle refers to the same
/// `VcpuMetrics` instance that this vCPU updates.
pub fn metrics(&self) -> Arc<VcpuMetrics> {
    Arc::clone(&self.metrics)
}
}
impl Drop for Vcpu {
    /// Cleans up per-vCPU state when the `Vcpu` is dropped.
    ///
    /// Resets the thread-local data (best effort: the result is ignored) and removes the
    /// entry keyed by this vCPU's id from the global `METRICS` map.
    fn drop(&mut self) {
        let _ = self.reset_thread_local_data();
        let id: u32 = self.id as u32;
        // A panic inside `drop` while the thread is already unwinding aborts the process,
        // so recover the guard from a poisoned lock instead of unwrapping it. The entry
        // is still removed in that case.
        METRICS
            .write()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
            .vcpu
            .remove(&id);
    }
}

View File

@ -29,6 +29,7 @@ use vmm_sys_util::eventfd::EventFd;
use crate::address_space_manager::GuestAddressSpaceImpl;
use crate::api::v1::InstanceInfo;
use crate::kvm_context::KvmContext;
use crate::metric::METRICS;
use crate::vcpu::vcpu_impl::{
Vcpu, VcpuError, VcpuEvent, VcpuHandle, VcpuResizeResult, VcpuResponse, VcpuStateEvent,
};
@ -555,6 +556,11 @@ impl VcpuManager {
};
let mut vcpu = self.create_vcpu_arch(cpu_index, kvm_vcpu, request_ts)?;
METRICS
.write()
.unwrap()
.vcpu
.insert(cpu_index as u32, vcpu.metrics());
self.configure_single_vcpu(entry_addr, &mut vcpu)
.map_err(VcpuManagerError::Vcpu)?;
self.vcpu_infos[cpu_index as usize].vcpu = Some(vcpu);

View File

@ -11,6 +11,7 @@ use std::sync::Arc;
use dbs_arch::cpuid::{process_cpuid, VmSpec};
use dbs_arch::gdt::gdt_entry;
use dbs_utils::metric::IncMetric;
use dbs_utils::time::TimestampUs;
use kvm_bindings::CpuId;
use kvm_ioctls::{VcpuFd, VmFd};
@ -19,7 +20,7 @@ use vm_memory::{Address, GuestAddress, GuestAddressSpace};
use vmm_sys_util::eventfd::EventFd;
use crate::address_space_manager::GuestAddressSpaceImpl;
use crate::metric::{IncMetric, METRICS};
use crate::metric::VcpuMetrics;
use crate::vcpu::vcpu_impl::{Result, Vcpu, VcpuError, VcpuStateEvent};
use crate::vcpu::VcpuConfig;
use crate::IoManagerCached;
@ -69,6 +70,7 @@ impl Vcpu {
vcpu_state_sender,
exit_evt,
support_immediate_exit,
metrics: Arc::new(VcpuMetrics::default()),
cpuid,
})
}
@ -137,7 +139,7 @@ impl Vcpu {
)
.map_err(VcpuError::CpuId)?;
process_cpuid(&mut self.cpuid, &cpuid_vm_spec).map_err(|e| {
METRICS.vcpu.filter_cpuid.inc();
self.metrics.filter_cpuid.inc();
error!("Failure in configuring CPUID for vcpu {}: {:?}", self.id, e);
VcpuError::CpuId(e)
})?;