mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-09-17 14:58:16 +00:00
runtime-rs: Increase QMP read timeout to mitigate failures
It frequently causes "Resource Temporarily Unavailable (OS Error 11)" with the original 250ms read timeout When passing through devices via VFIO in QEMU. The root cause lies in synchronization timeout windows failing to accommodate inherent delays during critical hardware init phases in kernel space. This commit would increase the timeout to 5000ms which was determined through some tests. While not guaranteeing complete resolution for all hardware combinations, this change significantly reduces timeout failures. Fixes # 10361 Signed-off-by: alex.lyn <alex.lyn@antgroup.com>
This commit is contained in:
@@ -19,6 +19,9 @@ use qapi::qmp;
|
||||
use qapi_qmp::{self, PciDeviceInfo};
|
||||
use qapi_spec::Dictionary;
|
||||
|
||||
/// default qmp connection read timeout
|
||||
const DEFAULT_QMP_READ_TIMEOUT: u64 = 250;
|
||||
|
||||
pub struct Qmp {
|
||||
qmp: qapi::Qmp<qapi::Stream<BufReader<UnixStream>, UnixStream>>,
|
||||
|
||||
@@ -55,7 +58,7 @@ impl Qmp {
|
||||
// (containerd's task creation timeout is 2 s by default). OTOH
|
||||
// setting it too short would risk interfering with a normal launch,
|
||||
// perhaps just seeing some delay due to a heavily loaded host.
|
||||
stream.set_read_timeout(Some(Duration::from_millis(250)))?;
|
||||
stream.set_read_timeout(Some(Duration::from_millis(DEFAULT_QMP_READ_TIMEOUT)))?;
|
||||
|
||||
let mut qmp = Qmp {
|
||||
qmp: qapi::Qmp::new(qapi::Stream::new(
|
||||
@@ -622,9 +625,33 @@ impl Qmp {
|
||||
};
|
||||
info!(sl!(), "vfio_device_add: {:?}", vfio_device_add.clone());
|
||||
|
||||
self.qmp
|
||||
.execute(&vfio_device_add)
|
||||
.map_err(|e| anyhow!("device_add vfio device failed {:?}", e))?;
|
||||
// We've chosen to set a 5-second read timeout on Unix sockets for QMP operations. We consider set_read_timeout()
|
||||
// a lightweight operation that shouldn't significantly impact performance, even with multiple VFIO devices.
|
||||
// However, we also need to ensure its debuggability.
|
||||
// As it could obscure the root cause of connection failures as set an excessively long QMP timeout.
|
||||
// For example, if QEMU fails to launch, a 5-second QMP timeout will immediately provide a "QMP connection failed" log message,
|
||||
// clearly pinpointing the issue. Conversely, a prolonged timeout might only result in vague error messages, making debugging
|
||||
// difficult as it won't explicitly indicate where the problem lies.
|
||||
|
||||
// Given our current inability to comprehensively test across a wide range of hardware and configurations, we've made a pragmatic
|
||||
// decision: we'll maintain the 5-second timeout for now. A configurable timeout option will be introduced if future use cases
|
||||
// clearly demonstrate a justified need.
|
||||
{
|
||||
// set read timeout with 5000
|
||||
self.qmp
|
||||
.inner_mut()
|
||||
.get_mut_write()
|
||||
.set_read_timeout(Some(Duration::from_millis(5000)))?;
|
||||
// send the VFIO hotplug request
|
||||
self.qmp
|
||||
.execute(&vfio_device_add)
|
||||
.map_err(|e| anyhow!("device_add vfio device failed {:?}", e))?;
|
||||
// reset read timeout with 250
|
||||
self.qmp
|
||||
.inner_mut()
|
||||
.get_mut_write()
|
||||
.set_read_timeout(Some(Duration::from_millis(DEFAULT_QMP_READ_TIMEOUT)))?;
|
||||
}
|
||||
|
||||
let pci_path = self
|
||||
.get_device_by_qdev_id(hostdev_id)
|
||||
|
Reference in New Issue
Block a user