qemu-vanilla: add patches required in qemu 4.0 to run kata containers

Add kata specific patches for qemu 4.0:
* 9p optimization
* vm-templating patch

fixes #471

Signed-off-by: Julio Montes <julio.montes@intel.com>
This commit is contained in:
Julio Montes
2019-04-29 11:13:07 -05:00
parent 539e5d0fc8
commit 5c8d55c63f
2 changed files with 355 additions and 0 deletions

View File

@@ -0,0 +1,98 @@
From 894a8ae5524fff4ad9d3551c515788c5650e1fc7 Mon Sep 17 00:00:00 2001
From: Yang Zhong <yang.zhong@intel.com>
Date: Wed, 28 Mar 2018 20:14:53 +0800
Subject: [PATCH 1/2] 9p: removing coroutines of 9p to increase the I/O
performance
This is a quick workaround; it needs to be fixed properly.
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
hw/9pfs/9p.c | 12 +++++-------
hw/9pfs/9p.h | 6 +++---
hw/9pfs/coth.h | 3 +++
3 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 55821343e5..c5f089860d 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -690,10 +690,7 @@ static void coroutine_fn pdu_complete(V9fsPDU *pdu, ssize_t len)
out_notify:
pdu->s->transport->push_and_notify(pdu);
- /* Now wakeup anybody waiting in flush for this request */
- if (!qemu_co_queue_next(&pdu->complete)) {
- pdu_free(pdu);
- }
+ pdu_free(pdu);
}
static mode_t v9mode_to_mode(uint32_t mode, V9fsString *extension)
@@ -3525,7 +3522,7 @@ static inline bool is_read_only_op(V9fsPDU *pdu)
void pdu_submit(V9fsPDU *pdu, P9MsgHeader *hdr)
{
- Coroutine *co;
+// Coroutine *co;
CoroutineEntry *handler;
V9fsState *s = pdu->s;
@@ -3543,8 +3540,9 @@ void pdu_submit(V9fsPDU *pdu, P9MsgHeader *hdr)
}
qemu_co_queue_init(&pdu->complete);
- co = qemu_coroutine_create(handler, pdu);
- qemu_coroutine_enter(co);
+ handler(pdu);
+ //co = qemu_coroutine_create(handler, pdu);
+ //qemu_coroutine_enter(co);
}
/* Returns 0 on success, 1 on failure. */
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 8883761b2c..24aeba03f7 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -320,21 +320,21 @@ extern int total_open_fd;
static inline void v9fs_path_write_lock(V9fsState *s)
{
if (s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT) {
- qemu_co_rwlock_wrlock(&s->rename_lock);
+ // qemu_co_rwlock_wrlock(&s->rename_lock);
}
}
static inline void v9fs_path_read_lock(V9fsState *s)
{
if (s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT) {
- qemu_co_rwlock_rdlock(&s->rename_lock);
+ // qemu_co_rwlock_rdlock(&s->rename_lock);
}
}
static inline void v9fs_path_unlock(V9fsState *s)
{
if (s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT) {
- qemu_co_rwlock_unlock(&s->rename_lock);
+ // qemu_co_rwlock_unlock(&s->rename_lock);
}
}
diff --git a/hw/9pfs/coth.h b/hw/9pfs/coth.h
index 19e4d9287e..728a25865d 100644
--- a/hw/9pfs/coth.h
+++ b/hw/9pfs/coth.h
@@ -47,6 +47,9 @@
qemu_coroutine_yield(); \
} while (0)
+#undef v9fs_co_run_in_worker
+#define v9fs_co_run_in_worker(code_block) do {code_block} while(0);
+
void co_run_in_worker_bh(void *);
int coroutine_fn v9fs_co_readlink(V9fsPDU *, V9fsPath *, V9fsString *);
int coroutine_fn v9fs_co_readdir(V9fsPDU *, V9fsFidState *, struct dirent **);
--
2.17.2

View File

@@ -0,0 +1,257 @@
From ea692fc6ff15a231acd2d7396166bef8e49dab05 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <jiangshanlai@gmail.com>
Date: Fri, 30 Mar 2018 18:09:54 +0800
Subject: [PATCH 2/2] migration: add capability to bypass the shared memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
1) What's this
When the migration capability 'bypass-shared-memory'
is set, shared memory will be bypassed during migration.
It is the key feature to enable several excellent features for
the qemu, such as qemu-local-migration, qemu-live-update,
extremely-fast-save-restore, vm-template, vm-fast-live-clone,
yet-another-post-copy-migration, etc..
The philosophy behind this key feature, including the resulting
advanced key features, is that a part of the memory management
is separated out from qemu, letting other toolkits
such as libvirt, kata-containers (https://github.com/kata-containers),
runv (https://github.com/hyperhq/runv/) or multiple cooperative
qemu commands directly access it, manage it, and provide features on it.
2) Status in real world
The hyperhq (http://hyper.sh http://hypercontainer.io/)
has provided the feature vm-template (vm-fast-live-clone)
to the hyper container for several years; it works perfectly.
(see https://github.com/hyperhq/runv/pull/297).
The vm-template feature allows the containers (VMs) to
be started in 130ms and saves 80M of memory for every
container (VM), so the hyper containers are as fast
and high-density as normal containers.
kata-containers project (https://github.com/kata-containers)
which was launched by hyper, intel and friends and which descended
from runv (and clear-container) should have this feature enabled.
Unfortunately, due to a code conflict between runv & cc,
this feature was temporarily disabled and is being brought
back by the hyper and intel teams.
3) How to use and bring up advanced features.
In current qemu command line, shared memory has
to be configured via memory-object.
a) feature: qemu-local-migration, qemu-live-update
Set the mem-path on the tmpfs and set share=on for it when
starting the VM. Example:
-object \
memory-backend-file,id=mem,size=128M,mem-path=/dev/shm/memory,share=on \
-numa node,nodeid=0,cpus=0-7,memdev=mem
when you want to migrate the vm locally (after fixing a security bug
of the qemu-binary, or other reason), you can start a new qemu with
the same command line and -incoming, then you can migrate the
vm from the old qemu to the new qemu with the migration capability
'bypass-shared-memory' set. The migration will migrate the device-state
*ONLY*, the memory is the origin memory backed by tmpfs file.
b) feature: extremely-fast-save-restore
the same above, but the mem-path is on the persistent file system.
c) feature: vm-template, vm-fast-live-clone
the template vm is started as 1), and paused when the guest reaches
the template point(example: the guest app is ready), then the template
vm is saved. (the qemu process of the template can be killed now, because
we need only the memory and the device state files (in tmpfs)).
Then we can launch one or multiple VMs based on the template vm states,
the new VMs are started without the “share=on”, all the new VMs share
the initial memory from the memory file, they save a lot of memory.
all the new VMs start from the template point, the guest app can go to
work quickly.
The new VM booted from the template vm can't become a template again,
if you need this unusual chained-template feature, you can write
a cloneable-tmpfs kernel module for it.
The libvirt toolkit can't manage vm-template currently; in
hyperhq/runv, we use a qemu wrapper script to do it. I hope someone adds
a “libvirt managed template” feature to libvirt.
d) feature: yet-another-post-copy-migration
It is a possible feature, no toolkit can do it well now.
Using an nbd server/client on the memory file is workable but
inconvenient. A special feature for tmpfs might be needed to
fully complete this feature.
No one needs yet another post-copy migration method,
but it is possible should someone ever need it.
Cc: Samuel Ortiz <sameo@linux.intel.com>
Cc: Sebastien Boeuf <sebastien.boeuf@intel.com>
Cc: James O. D. Hunt <james.o.hunt@intel.com>
Cc: Xu Wang <gnawux@gmail.com>
Cc: Peng Tao <bergwolf@gmail.com>
Cc: Xiao Guangrong <xiaoguangrong@tencent.com>
Cc: Xiao Guangrong <xiaoguangrong.eric@gmail.com>
Signed-off-by: Lai Jiangshan <jiangshanlai@gmail.com>
---
migration/migration.c | 14 ++++++++++++++
migration/migration.h | 1 +
migration/ram.c | 27 ++++++++++++++++++---------
qapi/migration.json | 9 +++++++--
4 files changed, 40 insertions(+), 11 deletions(-)
diff --git a/migration/migration.c b/migration/migration.c
index 609e0df5d0..02c96aadb1 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2010,6 +2010,20 @@ bool migrate_release_ram(void)
return s->enabled_capabilities[MIGRATION_CAPABILITY_RELEASE_RAM];
}
+bool migrate_bypass_shared_memory(void)
+{
+ MigrationState *s;
+
+ /* it is not workable with postcopy yet. */
+ if (migrate_postcopy_ram()) {
+ return false;
+ }
+
+ s = migrate_get_current();
+
+ return s->enabled_capabilities[MIGRATION_CAPABILITY_BYPASS_SHARED_MEMORY];
+}
+
bool migrate_postcopy_ram(void)
{
MigrationState *s;
diff --git a/migration/migration.h b/migration/migration.h
index 438f17edad..e8eae82910 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -261,6 +261,7 @@ MigrationState *migrate_get_current(void);
bool migrate_postcopy(void);
+bool migrate_bypass_shared_memory(void);
bool migrate_release_ram(void);
bool migrate_postcopy_ram(void);
bool migrate_zero_blocks(void);
diff --git a/migration/ram.c b/migration/ram.c
index 1ca9ba77b6..1b35b4a30c 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1646,6 +1646,11 @@ unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
unsigned long *bitmap = rb->bmap;
unsigned long next;
+ /* when this ramblock is requested bypassing */
+ if (!bitmap) {
+ return size;
+ }
+
if (ramblock_is_ignored(rb)) {
return size;
}
@@ -1773,7 +1778,9 @@ static void migration_bitmap_sync(RAMState *rs)
qemu_mutex_lock(&rs->bitmap_mutex);
rcu_read_lock();
RAMBLOCK_FOREACH_NOT_IGNORED(block) {
- migration_bitmap_sync_range(rs, block, 0, block->used_length);
+ if (!migrate_bypass_shared_memory() || !qemu_ram_is_shared(block)) {
+ migration_bitmap_sync_range(rs, block, 0, block->used_length);
+ }
}
ram_counters.remaining = ram_bytes_remaining();
rcu_read_unlock();
@@ -3183,18 +3190,12 @@ static int ram_state_init(RAMState **rsp)
qemu_mutex_init(&(*rsp)->src_page_req_mutex);
QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
- /*
- * Count the total number of pages used by ram blocks not including any
- * gaps due to alignment or unplugs.
- */
- (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
-
ram_state_reset(*rsp);
return 0;
}
-static void ram_list_init_bitmaps(void)
+static void ram_list_init_bitmaps(RAMState *rs)
{
RAMBlock *block;
unsigned long pages;
@@ -3202,9 +3203,17 @@ static void ram_list_init_bitmaps(void)
/* Skip setting bitmap if there is no RAM */
if (ram_bytes_total()) {
RAMBLOCK_FOREACH_NOT_IGNORED(block) {
+ if (migrate_bypass_shared_memory() && qemu_ram_is_shared(block)) {
+ continue;
+ }
pages = block->max_length >> TARGET_PAGE_BITS;
block->bmap = bitmap_new(pages);
bitmap_set(block->bmap, 0, pages);
+ /*
+ * Count the total number of pages used by ram blocks not
+ * including any gaps due to alignment or unplugs.
+ */
+ rs->migration_dirty_pages += pages;
if (migrate_postcopy_ram()) {
block->unsentmap = bitmap_new(pages);
bitmap_set(block->unsentmap, 0, pages);
@@ -3220,7 +3229,7 @@ static void ram_init_bitmaps(RAMState *rs)
qemu_mutex_lock_ramlist();
rcu_read_lock();
- ram_list_init_bitmaps();
+ ram_list_init_bitmaps(rs);
memory_global_dirty_log_start();
migration_bitmap_sync_precopy(rs);
diff --git a/qapi/migration.json b/qapi/migration.json
index 9cfbaf8c6c..4194fdd5a4 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -415,6 +415,10 @@
#
# @x-ignore-shared: If enabled, QEMU will not migrate shared memory (since 4.0)
#
+# @bypass-shared-memory: the shared memory region will be bypassed on migration.
+# This feature allows the memory region to be reused by new qemu(s)
+# or be migrated separately. (since 2.12)
+#
# Since: 1.2
##
{ 'enum': 'MigrationCapability',
@@ -422,7 +426,7 @@
'compress', 'events', 'postcopy-ram', 'x-colo', 'release-ram',
'block', 'return-path', 'pause-before-switchover', 'multifd',
'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate',
- 'x-ignore-shared' ] }
+ 'bypass-shared-memory', 'x-ignore-shared' ] }
##
# @MigrationCapabilityStatus:
@@ -476,7 +480,8 @@
# {"state": false, "capability": "compress"},
# {"state": true, "capability": "events"},
# {"state": false, "capability": "postcopy-ram"},
-# {"state": false, "capability": "x-colo"}
+# {"state": false, "capability": "x-colo"},
+# {"state": false, "capability": "bypass-shared-memory"}
# ]}
#
##
--
2.17.2