runtime: add geust memory dump

When guest panic, dump guest kernel memory to host filesystem.
And also includes:
- hypervisor config
- hypervisor version
- and state of sandbox

Fixes: #1012

Signed-off-by: bin liu <bin@hyper.sh>
This commit is contained in:
bin liu
2020-10-21 14:09:14 +08:00
parent 5b065eb599
commit 40418f6d88
16 changed files with 345 additions and 46 deletions

View File

@@ -440,6 +440,13 @@ type HypervisorConfig struct {
// Enable annotations by name
EnableAnnotations []string
// GuestCoredumpPath is the path in host for saving guest memory dump
GuestMemoryDumpPath string
// GuestMemoryDumpPaging is used to indicate if enable paging
// for QEMU dump-guest-memory command
GuestMemoryDumpPaging bool
}
// vcpu mapping from vcpu number to thread number
@@ -607,6 +614,10 @@ func (conf *HypervisorConfig) HypervisorAssetPath() (string, error) {
return conf.assetPath(types.HypervisorAsset)
}
func (conf *HypervisorConfig) IfPVPanicEnabled() bool {
return conf.GuestMemoryDumpPath != ""
}
// HypervisorCtlAssetPath returns the VM hypervisor ctl path
func (conf *HypervisorConfig) HypervisorCtlAssetPath() (string, error) {
return conf.assetPath(types.HypervisorCtlAsset)

View File

@@ -31,6 +31,7 @@ import (
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
pkgUtils "github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/device/config"
persistapi "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/persist/api"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/uuid"
@@ -99,6 +100,9 @@ type qemu struct {
stopped bool
store persistapi.PersistDriver
// if in memory dump progress
memoryDumpFlag sync.Mutex
}
const (
@@ -106,6 +110,9 @@ const (
qmpSocket = "qmp.sock"
vhostFSSocket = "vhost-fs.sock"
// memory dump format will be set to elf
memoryDumpFormat = "elf"
qmpCapErrMsg = "Failed to negoatiate QMP capabilities"
qmpExecCatCmd = "exec:cat"
@@ -408,13 +415,17 @@ func (q *qemu) buildDevices(initrdPath string) ([]govmmQemu.Device, *govmmQemu.I
}
}
if q.config.IfPVPanicEnabled() {
// there should have no errors for pvpanic device
devices, _ = q.arch.appendPVPanicDevice(devices)
}
var ioThread *govmmQemu.IOThread
if q.config.BlockDeviceDriver == config.VirtioSCSI {
return q.arch.appendSCSIController(devices, q.config.EnableIOThreads)
}
return devices, ioThread, nil
}
func (q *qemu) setupTemplate(knobs *govmmQemu.Knobs, memory *govmmQemu.Memory) govmmQemu.Incoming {
@@ -1027,7 +1038,13 @@ func (q *qemu) qmpSetup() error {
return nil
}
cfg := govmmQemu.QMPConfig{Logger: newQMPLogger()}
events := make(chan govmmQemu.QMPEvent)
go q.loopQMPEvent(events)
cfg := govmmQemu.QMPConfig{
Logger: newQMPLogger(),
EventCh: events,
}
// Auto-closed by QMPStart().
disconnectCh := make(chan struct{})
@@ -1050,6 +1067,136 @@ func (q *qemu) qmpSetup() error {
return nil
}
func (q *qemu) loopQMPEvent(event chan govmmQemu.QMPEvent) {
for {
select {
case e, open := <-event:
if !open {
q.Logger().Infof("QMP event channel closed")
return
}
q.Logger().WithField("event", e).Debug("got QMP event")
if e.Name == "GUEST_PANICKED" {
go q.handleGuestPanic()
}
}
}
}
func (q *qemu) handleGuestPanic() {
if err := q.dumpGuestMemory(q.config.GuestMemoryDumpPath); err != nil {
q.Logger().WithError(err).Error("failed to dump guest memory")
}
// TODO: how to notify the upper level sandbox to handle the error
// to do a fast fail(shutdown or others).
// tracked by https://github.com/kata-containers/kata-containers/issues/1026
}
// canDumpGuestMemory check if can do a guest memory dump operation.
// for now it only ensure there must be double of VM size for free disk spaces
func (q *qemu) canDumpGuestMemory(dumpSavePath string) error {
fs := unix.Statfs_t{}
if err := unix.Statfs(dumpSavePath, &fs); err != nil {
q.Logger().WithError(err).WithField("dumpSavePath", dumpSavePath).Error("failed to call Statfs")
return nil
}
availSpaceInBytes := fs.Bavail * uint64(fs.Bsize)
q.Logger().WithFields(
logrus.Fields{
"dumpSavePath": dumpSavePath,
"availSpaceInBytes": availSpaceInBytes,
}).Info("get avail space")
// get guest memory size
guestMemorySizeInBytes := (uint64(q.config.MemorySize) + uint64(q.state.HotpluggedMemory)) << utils.MibToBytesShift
q.Logger().WithField("guestMemorySizeInBytes", guestMemorySizeInBytes).Info("get guest memory size")
// default we want ensure there are at least double of VM memory size free spaces available,
// this may complete one dump operation for one sandbox
exceptMemorySize := guestMemorySizeInBytes * 2
if availSpaceInBytes >= exceptMemorySize {
return nil
} else {
return fmt.Errorf("there are not enough free space to store memory dump file. Except %d bytes, but only %d bytes available", exceptMemorySize, availSpaceInBytes)
}
}
// dumpSandboxMetaInfo save meta information for debug purpose, includes:
// hypervisor verison, sandbox/container state, hypervisor config
func (q *qemu) dumpSandboxMetaInfo(dumpSavePath string) {
dumpStatePath := filepath.Join(dumpSavePath, "state")
// copy state from /run/vc/sbs to memory dump directory
statePath := filepath.Join(q.store.RunStoragePath(), q.id)
command := []string{"/bin/cp", "-ar", statePath, dumpStatePath}
q.Logger().WithField("command", command).Info("try to save sandbox state")
if output, err := pkgUtils.RunCommandFull(command, true); err != nil {
q.Logger().WithError(err).WithField("output", output).Error("failed to save state")
}
// save hypervisor meta information
fileName := filepath.Join(dumpSavePath, "hypervisor.conf")
data, _ := json.MarshalIndent(q.config, "", " ")
if err := ioutil.WriteFile(fileName, data, defaultFilePerms); err != nil {
q.Logger().WithError(err).WithField("hypervisor.conf", data).Error("write to hypervisor.conf file failed")
}
// save hypervisor version
hyperVisorVersion, err := pkgUtils.RunCommand([]string{q.config.HypervisorPath, "--version"})
if err != nil {
q.Logger().WithError(err).WithField("HypervisorPath", data).Error("failed to get hypervisor version")
}
fileName = filepath.Join(dumpSavePath, "hypervisor.version")
if err := ioutil.WriteFile(fileName, []byte(hyperVisorVersion), defaultFilePerms); err != nil {
q.Logger().WithError(err).WithField("hypervisor.version", data).Error("write to hypervisor.version file failed")
}
}
func (q *qemu) dumpGuestMemory(dumpSavePath string) error {
if dumpSavePath == "" {
return nil
}
q.memoryDumpFlag.Lock()
defer q.memoryDumpFlag.Unlock()
q.Logger().WithField("dumpSavePath", dumpSavePath).Info("try to dump guest memory")
dumpSavePath = filepath.Join(dumpSavePath, q.id)
dumpStatePath := filepath.Join(dumpSavePath, "state")
if err := pkgUtils.EnsureDir(dumpStatePath, DirMode); err != nil {
return err
}
// save meta information for sandbox
q.dumpSandboxMetaInfo(dumpSavePath)
q.Logger().Info("dump sandbox meta information completed")
// check device free space and estimated dump size
if err := q.canDumpGuestMemory(dumpSavePath); err != nil {
q.Logger().Warnf("can't dump guest memory: %s", err.Error())
return err
}
// dump guest memory
protocol := fmt.Sprintf("file:%s/vmcore-%s.%s", dumpSavePath, time.Now().Format("20060102150405.999"), memoryDumpFormat)
q.Logger().Infof("try to dump guest memory to %s", protocol)
if err := q.qmpSetup(); err != nil {
q.Logger().WithError(err).Error("setup manage QMP failed")
return err
}
if err := q.qmpMonitorCh.qmp.ExecuteDumpGuestMemory(q.qmpMonitorCh.ctx, protocol, q.config.GuestMemoryDumpPaging, memoryDumpFormat); err != nil {
q.Logger().WithError(err).Error("dump guest memory failed")
return err
}
q.Logger().Info("dump guest memory completed")
return nil
}
func (q *qemu) qmpShutdown() {
q.qmpMonitorCh.Lock()
defer q.qmpMonitorCh.Unlock()
@@ -2250,6 +2397,9 @@ func (q *qemu) load(s persistapi.HypervisorState) {
}
func (q *qemu) check() error {
q.memoryDumpFlag.Lock()
defer q.memoryDumpFlag.Unlock()
err := q.qmpSetup()
if err != nil {
return err

View File

@@ -133,6 +133,9 @@ type qemuArch interface {
// append vIOMMU device
appendIOMMU(devices []govmmQemu.Device) ([]govmmQemu.Device, error)
// append pvpanic device
appendPVPanicDevice(devices []govmmQemu.Device) ([]govmmQemu.Device, error)
}
type qemuArchBase struct {
@@ -789,3 +792,9 @@ func (q *qemuArchBase) appendIOMMU(devices []govmmQemu.Device) ([]govmmQemu.Devi
return devices, fmt.Errorf("Machine Type %s does not support vIOMMU", q.qemuMachine.Type)
}
}
// appendPVPanicDevice appends a pvpanic device
func (q *qemuArchBase) appendPVPanicDevice(devices []govmmQemu.Device) ([]govmmQemu.Device, error) {
devices = append(devices, govmmQemu.PVPanicDevice{NoShutdown: true})
return devices, nil
}