mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-06-01 11:56:29 +00:00
kata-ctl: add monitor subcommand for runtime-rs
The previous kata-monitor in golang could not communicate with runtime-rs to gather metrics due to different sandbox addresses. This PR adds the subcommand monitor in kata-ctl to gather metrics from runtime-rs and monitor itself. Fixes: #5017 Signed-off-by: Yuan-Zhuo <yuanzhuo0118@outlook.com>
This commit is contained in:
parent
d74639d8c6
commit
731e7c763f
877
src/tools/kata-ctl/Cargo.lock
generated
877
src/tools/kata-ctl/Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -44,7 +44,12 @@ logging = { path = "../../libs/logging" }
|
|||||||
slog = "2.7.0"
|
slog = "2.7.0"
|
||||||
slog-scope = "4.4.0"
|
slog-scope = "4.4.0"
|
||||||
hyper = "0.14.20"
|
hyper = "0.14.20"
|
||||||
tokio = "1.28.1"
|
tokio = { version = "1.28.1", features = ["signal"] }
|
||||||
|
ttrpc = "0.6.0"
|
||||||
|
|
||||||
|
prometheus = { version = "0.13.0", features = ["process"] }
|
||||||
|
procfs = "0.12.0"
|
||||||
|
lazy_static = "1.2"
|
||||||
|
|
||||||
[target.'cfg(target_arch = "s390x")'.dependencies]
|
[target.'cfg(target_arch = "s390x")'.dependencies]
|
||||||
reqwest = { version = "0.11", default-features = false, features = ["json", "blocking", "native-tls"] }
|
reqwest = { version = "0.11", default-features = false, features = ["json", "blocking", "native-tls"] }
|
||||||
|
@ -56,6 +56,9 @@ pub enum Commands {
|
|||||||
/// Gather metrics associated with infrastructure used to run a sandbox
|
/// Gather metrics associated with infrastructure used to run a sandbox
|
||||||
Metrics(MetricsCommand),
|
Metrics(MetricsCommand),
|
||||||
|
|
||||||
|
/// Start a monitor to get metrics of Kata Containers
|
||||||
|
Monitor(MonitorArgument),
|
||||||
|
|
||||||
/// Display version details
|
/// Display version details
|
||||||
Version,
|
Version,
|
||||||
}
|
}
|
||||||
@ -122,6 +125,12 @@ pub enum IpTablesArguments {
|
|||||||
Metrics,
|
Metrics,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Args)]
|
||||||
|
pub struct MonitorArgument {
|
||||||
|
/// The address to listen on for HTTP requests. (default "127.0.0.1:8090")
|
||||||
|
pub address: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Args)]
|
#[derive(Debug, Args)]
|
||||||
pub struct DirectVolumeCommand {
|
pub struct DirectVolumeCommand {
|
||||||
#[clap(subcommand)]
|
#[clap(subcommand)]
|
||||||
|
@ -3,9 +3,16 @@
|
|||||||
// SPDX-License-Identifier: Apache-2.0
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
//
|
//
|
||||||
|
|
||||||
|
#[macro_use]
|
||||||
|
extern crate lazy_static;
|
||||||
|
|
||||||
|
#[macro_use]
|
||||||
|
extern crate slog;
|
||||||
|
|
||||||
mod arch;
|
mod arch;
|
||||||
mod args;
|
mod args;
|
||||||
mod check;
|
mod check;
|
||||||
|
mod monitor;
|
||||||
mod ops;
|
mod ops;
|
||||||
mod types;
|
mod types;
|
||||||
mod utils;
|
mod utils;
|
||||||
@ -18,7 +25,7 @@ use std::process::exit;
|
|||||||
use args::{Commands, KataCtlCli};
|
use args::{Commands, KataCtlCli};
|
||||||
|
|
||||||
use ops::check_ops::{
|
use ops::check_ops::{
|
||||||
handle_check, handle_factory, handle_iptables, handle_metrics, handle_version,
|
handle_check, handle_factory, handle_iptables, handle_metrics, handle_monitor, handle_version,
|
||||||
};
|
};
|
||||||
use ops::env_ops::handle_env;
|
use ops::env_ops::handle_env;
|
||||||
use ops::exec_ops::handle_exec;
|
use ops::exec_ops::handle_exec;
|
||||||
@ -52,6 +59,7 @@ fn real_main() -> Result<()> {
|
|||||||
Commands::Factory => handle_factory(),
|
Commands::Factory => handle_factory(),
|
||||||
Commands::Iptables(args) => handle_iptables(args),
|
Commands::Iptables(args) => handle_iptables(args),
|
||||||
Commands::Metrics(args) => handle_metrics(args),
|
Commands::Metrics(args) => handle_metrics(args),
|
||||||
|
Commands::Monitor(args) => handle_monitor(args),
|
||||||
Commands::Version => handle_version(),
|
Commands::Version => handle_version(),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
181
src/tools/kata-ctl/src/monitor/http_server.rs
Normal file
181
src/tools/kata-ctl/src/monitor/http_server.rs
Normal file
@ -0,0 +1,181 @@
|
|||||||
|
// Copyright 2022-2023 Ant Group
|
||||||
|
//
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
//
|
||||||
|
|
||||||
|
use crate::monitor::metrics::get_monitor_metrics;
|
||||||
|
use crate::sl;
|
||||||
|
use crate::utils::TIMEOUT;
|
||||||
|
|
||||||
|
use anyhow::{anyhow, Context, Result};
|
||||||
|
use hyper::body;
|
||||||
|
use hyper::service::{make_service_fn, service_fn};
|
||||||
|
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||||
|
use shim_interface::shim_mgmt::client::MgmtClient;
|
||||||
|
use slog::{self, info};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::net::SocketAddr;
|
||||||
|
|
||||||
|
const ROOT_URI: &str = "/";
|
||||||
|
const METRICS_URI: &str = "/metrics";
|
||||||
|
|
||||||
|
async fn handler_mux(req: Request<Body>) -> Result<Response<Body>> {
|
||||||
|
info!(
|
||||||
|
sl!(),
|
||||||
|
"mgmt-svr(mux): recv req, method: {}, uri: {}",
|
||||||
|
req.method(),
|
||||||
|
req.uri().path()
|
||||||
|
);
|
||||||
|
|
||||||
|
match (req.method(), req.uri().path()) {
|
||||||
|
(&Method::GET, ROOT_URI) => root_uri_handler(req).await,
|
||||||
|
(&Method::GET, METRICS_URI) => metrics_uri_handler(req).await,
|
||||||
|
_ => not_found_uri_handler(req).await,
|
||||||
|
}
|
||||||
|
.map_or_else(
|
||||||
|
|e| {
|
||||||
|
Response::builder()
|
||||||
|
.status(StatusCode::INTERNAL_SERVER_ERROR)
|
||||||
|
.body(Body::from(format!("{:?}\n", e)))
|
||||||
|
.map_err(|e| anyhow!("Failed to Build Response {:?}", e))
|
||||||
|
},
|
||||||
|
Ok,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn http_server_setup(socket_addr: &str) -> Result<()> {
|
||||||
|
let addr: SocketAddr = socket_addr
|
||||||
|
.parse()
|
||||||
|
.context("failed to parse http socket address")?;
|
||||||
|
|
||||||
|
let make_svc =
|
||||||
|
make_service_fn(|_conn| async { Ok::<_, anyhow::Error>(service_fn(handler_mux)) });
|
||||||
|
|
||||||
|
Server::bind(&addr).serve(make_svc).await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn root_uri_handler(_req: Request<Body>) -> Result<Response<Body>> {
|
||||||
|
Response::builder()
|
||||||
|
.status(StatusCode::OK)
|
||||||
|
.body(Body::from(
|
||||||
|
r#"Available HTTP endpoints:
|
||||||
|
/metrics : Get metrics from sandboxes.
|
||||||
|
"#,
|
||||||
|
))
|
||||||
|
.map_err(|e| anyhow!("Failed to Build Response {:?}", e))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn metrics_uri_handler(req: Request<Body>) -> Result<Response<Body>> {
|
||||||
|
let mut response_body = String::new();
|
||||||
|
|
||||||
|
response_body += &get_monitor_metrics().context("Failed to Get Monitor Metrics")?;
|
||||||
|
|
||||||
|
if let Some(uri_query) = req.uri().query() {
|
||||||
|
if let Ok(sandbox_id) = parse_sandbox_id(uri_query) {
|
||||||
|
response_body += &get_runtime_metrics(sandbox_id)
|
||||||
|
.await
|
||||||
|
.context(format!("{}\nFailed to Get Runtime Metrics", response_body))?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Response::builder()
|
||||||
|
.status(StatusCode::OK)
|
||||||
|
.body(Body::from(response_body))
|
||||||
|
.map_err(|e| anyhow!("Failed to Build Response {:?}", e))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_runtime_metrics(sandbox_id: &str) -> Result<String> {
|
||||||
|
// build shim client
|
||||||
|
let shim_client =
|
||||||
|
MgmtClient::new(sandbox_id, Some(TIMEOUT)).context("failed to build shim mgmt client")?;
|
||||||
|
|
||||||
|
// get METRICS_URI
|
||||||
|
let shim_response = shim_client
|
||||||
|
.get(METRICS_URI)
|
||||||
|
.await
|
||||||
|
.context("failed to get METRICS_URI")?;
|
||||||
|
|
||||||
|
// get runtime_metrics
|
||||||
|
let runtime_metrics = String::from_utf8(body::to_bytes(shim_response).await?.to_vec())
|
||||||
|
.context("failed to get runtime_metrics")?;
|
||||||
|
|
||||||
|
Ok(runtime_metrics)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn not_found_uri_handler(_req: Request<Body>) -> Result<Response<Body>> {
|
||||||
|
Response::builder()
|
||||||
|
.status(StatusCode::NOT_FOUND)
|
||||||
|
.body(Body::from("NOT FOUND"))
|
||||||
|
.map_err(|e| anyhow!("Failed to Build Response {:?}", e))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_sandbox_id(uri: &str) -> Result<&str> {
|
||||||
|
let uri_pairs: HashMap<_, _> = uri
|
||||||
|
.split_whitespace()
|
||||||
|
.map(|s| s.split_at(s.find('=').unwrap_or(0)))
|
||||||
|
.map(|(key, val)| (key, &val[1..]))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
match uri_pairs.get("sandbox") {
|
||||||
|
Some(sid) => Ok(sid.to_owned()),
|
||||||
|
None => Err(anyhow!("params sandbox not found")),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_parse_sandbox_id() {
|
||||||
|
assert!(parse_sandbox_id("sandbox=demo_sandbox").unwrap() == "demo_sandbox");
|
||||||
|
assert!(parse_sandbox_id("foo=bar").is_err());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_root_uri_handler() {
|
||||||
|
let root_resp = handler_mux(
|
||||||
|
Request::builder()
|
||||||
|
.method("GET")
|
||||||
|
.uri("/")
|
||||||
|
.body(hyper::Body::from(""))
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
assert!(root_resp.status() == StatusCode::OK);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_metrics_uri_handler() {
|
||||||
|
let metrics_resp = handler_mux(
|
||||||
|
Request::builder()
|
||||||
|
.method("GET")
|
||||||
|
.uri("/metrics?sandbox=demo_sandbox")
|
||||||
|
.body(hyper::Body::from(""))
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
assert!(metrics_resp.status() == StatusCode::INTERNAL_SERVER_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_not_found_uri_handler() {
|
||||||
|
let not_found_resp = handler_mux(
|
||||||
|
Request::builder()
|
||||||
|
.method("POST")
|
||||||
|
.uri("/metrics?sandbox=demo_sandbox")
|
||||||
|
.body(hyper::Body::from(""))
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
assert!(not_found_resp.status() == StatusCode::NOT_FOUND);
|
||||||
|
}
|
||||||
|
}
|
91
src/tools/kata-ctl/src/monitor/metrics.rs
Normal file
91
src/tools/kata-ctl/src/monitor/metrics.rs
Normal file
@ -0,0 +1,91 @@
|
|||||||
|
// Copyright 2022-2023 Ant Group
|
||||||
|
//
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
//
|
||||||
|
|
||||||
|
extern crate procfs;
|
||||||
|
|
||||||
|
use anyhow::{anyhow, Context, Result};
|
||||||
|
|
||||||
|
use prometheus::{Encoder, Gauge, IntCounter, Registry, TextEncoder};
|
||||||
|
use std::sync::Mutex;
|
||||||
|
|
||||||
|
const NAMESPACE_KATA_MONITOR: &str = "kata_ctl_monitor";
|
||||||
|
|
||||||
|
lazy_static! {
|
||||||
|
|
||||||
|
static ref REGISTERED: Mutex<bool> = Mutex::new(false);
|
||||||
|
|
||||||
|
// custom registry
|
||||||
|
static ref REGISTRY: Registry = Registry::new();
|
||||||
|
|
||||||
|
// monitor metrics
|
||||||
|
static ref MONITOR_SCRAPE_COUNT: IntCounter =
|
||||||
|
IntCounter::new(format!("{}_{}", NAMESPACE_KATA_MONITOR, "scrape_count"), "Monitor scrape count").unwrap();
|
||||||
|
|
||||||
|
static ref MONITOR_MAX_FDS: Gauge = Gauge::new(format!("{}_{}", NAMESPACE_KATA_MONITOR, "process_max_fds"), "Open FDs for monitor").unwrap();
|
||||||
|
|
||||||
|
static ref MONITOR_OPEN_FDS: Gauge = Gauge::new(format!("{}_{}", NAMESPACE_KATA_MONITOR, "process_open_fds"), "Open FDs for monitor").unwrap();
|
||||||
|
|
||||||
|
static ref MONITOR_RESIDENT_MEMORY: Gauge = Gauge::new(format!("{}_{}", NAMESPACE_KATA_MONITOR, "process_resident_memory_bytes"), "Resident memory size in bytes for monitor").unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// get monitor metrics
|
||||||
|
pub fn get_monitor_metrics() -> Result<String> {
|
||||||
|
let mut registered = REGISTERED
|
||||||
|
.lock()
|
||||||
|
.map_err(|e| anyhow!("failed to check monitor metrics register status {:?}", e))?;
|
||||||
|
|
||||||
|
if !(*registered) {
|
||||||
|
register_monitor_metrics().context("failed to register monitor metrics")?;
|
||||||
|
*registered = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
update_monitor_metrics().context("failed to update monitor metrics")?;
|
||||||
|
|
||||||
|
// gather all metrics and return as a String
|
||||||
|
let metric_families = REGISTRY.gather();
|
||||||
|
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
TextEncoder::new()
|
||||||
|
.encode(&metric_families, &mut buffer)
|
||||||
|
.context("failed to encode gathered metrics")?;
|
||||||
|
|
||||||
|
Ok(String::from_utf8(buffer)?)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn register_monitor_metrics() -> Result<()> {
|
||||||
|
REGISTRY.register(Box::new(MONITOR_SCRAPE_COUNT.clone()))?;
|
||||||
|
REGISTRY.register(Box::new(MONITOR_MAX_FDS.clone()))?;
|
||||||
|
REGISTRY.register(Box::new(MONITOR_OPEN_FDS.clone()))?;
|
||||||
|
REGISTRY.register(Box::new(MONITOR_RESIDENT_MEMORY.clone()))?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn update_monitor_metrics() -> Result<()> {
|
||||||
|
MONITOR_SCRAPE_COUNT.inc();
|
||||||
|
|
||||||
|
let me = match procfs::process::Process::myself() {
|
||||||
|
Ok(p) => p,
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("failed to create process instance: {:?}", e);
|
||||||
|
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Ok(fds) = procfs::sys::fs::file_max() {
|
||||||
|
MONITOR_MAX_FDS.set(fds as f64);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Ok(fds) = me.fd_count() {
|
||||||
|
MONITOR_OPEN_FDS.set(fds as f64);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Ok(statm) = me.statm() {
|
||||||
|
MONITOR_RESIDENT_MEMORY.set(statm.resident as f64);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
8
src/tools/kata-ctl/src/monitor/mod.rs
Normal file
8
src/tools/kata-ctl/src/monitor/mod.rs
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
// Copyright 2022-2023 Ant Group
|
||||||
|
//
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
//
|
||||||
|
|
||||||
|
mod metrics;
|
||||||
|
|
||||||
|
pub mod http_server;
|
@ -5,15 +5,21 @@
|
|||||||
|
|
||||||
use crate::arch::arch_specific::get_checks;
|
use crate::arch::arch_specific::get_checks;
|
||||||
|
|
||||||
use crate::args::{CheckArgument, CheckSubCommand, IptablesCommand, MetricsCommand};
|
use crate::args::{
|
||||||
|
CheckArgument, CheckSubCommand, IptablesCommand, MetricsCommand, MonitorArgument,
|
||||||
|
};
|
||||||
|
|
||||||
use crate::check;
|
use crate::check;
|
||||||
|
|
||||||
|
use crate::monitor::http_server;
|
||||||
|
|
||||||
use crate::ops::version;
|
use crate::ops::version;
|
||||||
|
|
||||||
use crate::types::*;
|
use crate::types::*;
|
||||||
|
|
||||||
use anyhow::{anyhow, Result};
|
use anyhow::{anyhow, Context, Result};
|
||||||
|
|
||||||
|
const MONITOR_DEFAULT_SOCK_ADDR: &str = "127.0.0.1:8090";
|
||||||
|
|
||||||
use slog::{info, o, warn};
|
use slog::{info, o, warn};
|
||||||
|
|
||||||
@ -128,6 +134,17 @@ pub fn handle_metrics(_args: MetricsCommand) -> Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn handle_monitor(monitor_args: MonitorArgument) -> Result<()> {
|
||||||
|
tokio::runtime::Runtime::new()
|
||||||
|
.context("failed to new runtime for aync http server")?
|
||||||
|
.block_on(http_server::http_server_setup(
|
||||||
|
monitor_args
|
||||||
|
.address
|
||||||
|
.as_deref()
|
||||||
|
.unwrap_or(MONITOR_DEFAULT_SOCK_ADDR),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
pub fn handle_version() -> Result<()> {
|
pub fn handle_version() -> Result<()> {
|
||||||
let version = version::get().unwrap();
|
let version = version::get().unwrap();
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user