From 38cd5b481d52a3a47560983d38a31e0d9004e7dc Mon Sep 17 00:00:00 2001
From: Shuo A Liu
Date: Mon, 1 Feb 2021 18:15:20 +0800
Subject: [PATCH] hv: keylocker: host keylocker iwkey context switch

Different vCPUs may have different IWKeys. The hypervisor needs to do
the IWKey context switch.

This patch introduces a load_iwkey() function to do that. It switches
the host IWKey when the switched-in vCPU satisfies:
  1) the KeyLocker feature is enabled;
  2) it differs from the one currently loaded.

There are two opportunities to do the load_iwkey():
  1) the guest enables the CR4.KL bit;
  2) vCPU thread context switch.

load_iwkey() costs ~600 cycles when it actually loads the IWKey.

Tracked-On: #5695
Signed-off-by: Shuo A Liu
Acked-by: Eddie Dong
---
 hypervisor/arch/x86/guest/vcpu.c         | 31 ++++++++++++++++++++++++
 hypervisor/arch/x86/guest/virtual_cr.c   |  1 +
 hypervisor/arch/x86/guest/vmexit.c       |  3 ++-
 hypervisor/include/arch/x86/cpu.h        |  2 +-
 hypervisor/include/arch/x86/guest/vcpu.h |  1 +
 hypervisor/include/arch/x86/per_cpu.h    |  1 +
 6 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/hypervisor/arch/x86/guest/vcpu.c b/hypervisor/arch/x86/guest/vcpu.c
index 8f75eda80..56412ea77 100644
--- a/hypervisor/arch/x86/guest/vcpu.c
+++ b/hypervisor/arch/x86/guest/vcpu.c
@@ -205,6 +205,35 @@ static void init_iwkey(struct acrn_vcpu *vcpu)
 		vcpu->arch.IWKey.encryption_key[1] = get_random_value();
 		vcpu->arch.IWKey.encryption_key[2] = get_random_value();
 		vcpu->arch.IWKey.encryption_key[3] = get_random_value();
+		/* It's always safe to clear whose_iwkey */
+		per_cpu(whose_iwkey, pcpuid_from_vcpu(vcpu)) = NULL;
 	}
 }
 
+void load_iwkey(struct acrn_vcpu *vcpu)
+{
+	uint64_t xmm_save[6];
+
+	/* Only load IWKey with vCPU CR4 keylocker bit enabled */
+	if (pcpu_has_cap(X86_FEATURE_KEYLOCKER) && vcpu->arch.cr4_kl_enabled &&
+	    (get_cpu_var(whose_iwkey) != vcpu)) {
+		/* Save/restore xmm0/xmm1/xmm2 during the process */
+		asm volatile (	"movdqu %%xmm0, %0\n"
+				"movdqu %%xmm1, %1\n"
+				"movdqu %%xmm2, %2\n"
+				"movdqu %3, %%xmm0\n"
+				"movdqu %4, %%xmm1\n"
+				"movdqu %5, %%xmm2\n"
+				: "=m"(xmm_save[0]), "=m"(xmm_save[2]), "=m"(xmm_save[4])
+				: "m"(vcpu->arch.IWKey.integrity_key[0]),
+				  "m"(vcpu->arch.IWKey.encryption_key[0]),
+				  "m"(vcpu->arch.IWKey.encryption_key[2]));
+		asm_loadiwkey(0);
+		asm volatile (	"movdqu %2, %%xmm2\n"
+				"movdqu %1, %%xmm1\n"
+				"movdqu %0, %%xmm0\n"
+				: : "m"(xmm_save[0]), "m"(xmm_save[2]), "m"(xmm_save[4]));
+		get_cpu_var(whose_iwkey) = vcpu;
+	}
+}
+
@@ -826,6 +855,8 @@ static void context_switch_in(struct thread_object *next)
 	msr_write(MSR_IA32_FMASK, ectx->ia32_fmask);
 	msr_write(MSR_IA32_KERNEL_GS_BASE, ectx->ia32_kernel_gs_base);
 
+	load_iwkey(vcpu);
+
 	rstore_xsave_area(vcpu, ectx);
 }
 
diff --git a/hypervisor/arch/x86/guest/virtual_cr.c b/hypervisor/arch/x86/guest/virtual_cr.c
index 40d734203..d5e9e8dc3 100644
--- a/hypervisor/arch/x86/guest/virtual_cr.c
+++ b/hypervisor/arch/x86/guest/virtual_cr.c
@@ -391,6 +391,7 @@ static void vmx_write_cr4(struct acrn_vcpu *vcpu, uint64_t cr4)
 	if (!err_found && ((cr4_changed_bits & CR4_KL) != 0UL)) {
 		if ((cr4 & CR4_KL) != 0UL) {
 			vcpu->arch.cr4_kl_enabled = true;
+			load_iwkey(vcpu);
 		} else {
 			vcpu->arch.cr4_kl_enabled = false;
 		}
diff --git a/hypervisor/arch/x86/guest/vmexit.c b/hypervisor/arch/x86/guest/vmexit.c
index 0344b950f..0091a9d43 100644
--- a/hypervisor/arch/x86/guest/vmexit.c
+++ b/hypervisor/arch/x86/guest/vmexit.c
@@ -453,7 +453,8 @@ static int32_t loadiwkey_vmexit_handler(struct acrn_vcpu *vcpu)
 		vcpu->arch.IWKey.integrity_key[0] = xmm[0];
 		vcpu->arch.IWKey.integrity_key[1] = xmm[1];
 
-		loadiwkey(0);
+		asm_loadiwkey(0);
+		get_cpu_var(whose_iwkey) = vcpu;
 	}
 
 	return 0;
diff --git a/hypervisor/include/arch/x86/cpu.h b/hypervisor/include/arch/x86/cpu.h
index 19244196b..cf8de386f 100644
--- a/hypervisor/include/arch/x86/cpu.h
+++ b/hypervisor/include/arch/x86/cpu.h
@@ -659,7 +659,7 @@ static inline void xrstors(const struct xsave_area *region_addr, uint64_t mask)
 			 "memory");
 }
 
-static inline void loadiwkey(uint32_t eax)
+static inline void asm_loadiwkey(uint32_t eax)
 {
 	asm volatile(".byte 0xf3, 0x0f, 0x38, 0xdc, 0xd1;": : "a" (eax));
 }
diff --git a/hypervisor/include/arch/x86/guest/vcpu.h b/hypervisor/include/arch/x86/guest/vcpu.h
index 45c4337a9..40108b8ff 100644
--- a/hypervisor/include/arch/x86/guest/vcpu.h
+++ b/hypervisor/include/arch/x86/guest/vcpu.h
@@ -602,6 +602,7 @@ struct acrn_vcpu *get_ever_run_vcpu(uint16_t pcpu_id);
 
 void save_xsave_area(struct acrn_vcpu *vcpu, struct ext_context *ectx);
 void rstore_xsave_area(const struct acrn_vcpu *vcpu, const struct ext_context *ectx);
+void load_iwkey(struct acrn_vcpu *vcpu);
 
 /**
  * @brief create a vcpu for the target vm
diff --git a/hypervisor/include/arch/x86/per_cpu.h b/hypervisor/include/arch/x86/per_cpu.h
index e584a37fe..3abf117ab 100644
--- a/hypervisor/include/arch/x86/per_cpu.h
+++ b/hypervisor/include/arch/x86/per_cpu.h
@@ -59,6 +59,7 @@ struct per_cpu_region {
 #endif
 	uint64_t shutdown_vm_bitmap;
 	uint64_t tsc_suspend;
+	struct acrn_vcpu *whose_iwkey;
 	/*
 	 * We maintain a per-pCPU array of vCPUs. vCPUs of a VM won't
 	 * share same pCPU. So the maximum possible # of vCPUs that can
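
Note: the per-cpu whose_iwkey pointer is what keeps the context switch
cheap: the ~600-cycle save/LOADIWKEY/restore sequence only runs when a
different vCPU's key is programmed on the pCPU. Below is a minimal
user-space sketch of just that caching decision, not part of the patch:
struct vcpu is reduced to the one flag that matters here, the
pcpu_has_cap() check is dropped for brevity, and program_iwkey() /
iwkey_loads are hypothetical stand-ins for asm_loadiwkey() and the
per_cpu() machinery.

#include <stdio.h>

struct vcpu {
	int kl_enabled;		/* mirrors vcpu->arch.cr4_kl_enabled */
};

/* stand-in for per_cpu(whose_iwkey, ...): owner of the key on this pCPU */
static struct vcpu *whose_iwkey;
static int iwkey_loads;

/* stand-in for the xmm save / asm_loadiwkey(0) / xmm restore sequence */
static void program_iwkey(struct vcpu *v)
{
	(void)v;
	iwkey_loads++;
}

/* same decision logic as load_iwkey(): skip if this vCPU's key is current */
static void load_iwkey(struct vcpu *v)
{
	if ((v->kl_enabled != 0) && (whose_iwkey != v)) {
		program_iwkey(v);
		whose_iwkey = v;
	}
}

int main(void)
{
	struct vcpu a = { 1 }, b = { 1 };

	load_iwkey(&a);		/* a's key not loaded yet -> programs it */
	load_iwkey(&a);		/* a is already the owner -> skipped     */
	load_iwkey(&b);		/* b differs from owner   -> programs it */
	load_iwkey(&a);		/* a differs again        -> programs it */
	printf("IWKey loads: %d\n", iwkey_loads);	/* prints 3 */
	return 0;
}

Clearing whose_iwkey in init_iwkey() fits the same model: it invalidates
the per-cpu cache so the next load_iwkey() on that pCPU reprograms the
key unconditionally, which is why the comment calls the clear "always
safe".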