From: Mingwei Zhang mizhang@google.com
commit 683412ccf61294d727ead4a73d97397396e69a6b upstream.
Flush the CPU caches when memory is reclaimed from an SEV guest (where reclaim also includes it being unmapped from KVM's memslots). Due to lack of coherency for SEV encrypted memory, failure to flush results in silent data corruption if userspace is malicious/broken and doesn't ensure SEV guest memory is properly pinned and unpinned.
Cache coherency is not enforced across the VM boundary in SEV (AMD APM vol.2 Section 15.34.7). Confidential cachelines, generated by confidential VM guests have to be explicitly flushed on the host side. If a memory page containing dirty confidential cachelines was released by VM and reallocated to another user, the cachelines may corrupt the new user at a later time.
KVM takes a shortcut by assuming all confidential memory remain pinned until the end of VM lifetime. Therefore, KVM does not flush cache at mmu_notifier invalidation events. Because of this incorrect assumption and the lack of cache flushing, malicous userspace can crash the host kernel: creating a malicious VM and continuously allocates/releases unpinned confidential memory pages when the VM is running.
Add cache flush operations to mmu_notifier operations to ensure that any physical memory leaving the guest VM get flushed. In particular, hook mmu_notifier_invalidate_range_start and mmu_notifier_release events and flush cache accordingly. The hook after releasing the mmu lock to avoid contention with other vCPUs.
Cc: stable@vger.kernel.org Suggested-by: Sean Christpherson seanjc@google.com Reported-by: Mingwei Zhang mizhang@google.com Signed-off-by: Mingwei Zhang mizhang@google.com Message-Id: 20220421031407.2516575-4-mizhang@google.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com [OP: applied kvm_arch_guest_memory_reclaimed() calls in __kvm_set_memory_region() and kvm_mmu_notifier_invalidate_range_start(); OP: adjusted kvm_arch_guest_memory_reclaimed() to not use static_call_cond()] Signed-off-by: Ovidiu Panait ovidiu.panait@windriver.com --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 9 +++++++++ arch/x86/kvm/x86.c | 6 ++++++ include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 16 ++++++++++++++-- 5 files changed, 32 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4bc476d7fa6c..7167f94ed250 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1204,6 +1204,7 @@ struct kvm_x86_ops { int (*mem_enc_op)(struct kvm *kvm, void __user *argp); int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp); + void (*guest_memory_reclaimed)(struct kvm *kvm);
int (*get_msr_feature)(struct kvm_msr_entry *entry);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1efcc7d4bc88..95f1293babae 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5072,6 +5072,14 @@ static void reload_tss(struct kvm_vcpu *vcpu) load_TR_desc(); }
+static void sev_guest_memory_reclaimed(struct kvm *kvm) +{ + if (!sev_guest(kvm)) + return; + + wbinvd_on_all_cpus(); +} + static void pre_sev_run(struct vcpu_svm *svm, int cpu) { struct svm_cpu_data *sd = per_cpu(svm_data, cpu); @@ -7385,6 +7393,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .mem_enc_op = svm_mem_enc_op, .mem_enc_reg_region = svm_register_enc_region, .mem_enc_unreg_region = svm_unregister_enc_region, + .guest_memory_reclaimed = sev_guest_memory_reclaimed,
.nested_enable_evmcs = NULL, .nested_get_evmcs_version = NULL, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d0b297583df8..bb391ff7a901 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8046,6 +8046,12 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); }
+void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) +{ + if (kvm_x86_ops->guest_memory_reclaimed) + kvm_x86_ops->guest_memory_reclaimed(kvm); +} + void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { struct page *page = NULL; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index dd4cdad76b18..9a35585271d8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1408,6 +1408,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, unsigned long start, unsigned long end);
+void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); + #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu); #else diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0008fc49528a..b1cb2ef209ca 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -164,6 +164,10 @@ __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, { }
+__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) +{ +} + bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) { /* @@ -324,6 +328,12 @@ void kvm_reload_remote_mmus(struct kvm *kvm) kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); }
+static void kvm_flush_shadow_all(struct kvm *kvm) +{ + kvm_arch_flush_shadow_all(kvm); + kvm_arch_guest_memory_reclaimed(kvm); +} + int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) { struct page *page; @@ -435,6 +445,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock); + kvm_arch_guest_memory_reclaimed(kvm); srcu_read_unlock(&kvm->srcu, idx);
return 0; @@ -538,7 +549,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, int idx;
idx = srcu_read_lock(&kvm->srcu); - kvm_arch_flush_shadow_all(kvm); + kvm_flush_shadow_all(kvm); srcu_read_unlock(&kvm->srcu, idx); }
@@ -844,7 +855,7 @@ static void kvm_destroy_vm(struct kvm *kvm) #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); #else - kvm_arch_flush_shadow_all(kvm); + kvm_flush_shadow_all(kvm); #endif kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); @@ -1143,6 +1154,7 @@ int __kvm_set_memory_region(struct kvm *kvm, * - kvm_is_visible_gfn (mmu_check_roots) */ kvm_arch_flush_shadow_memslot(kvm, slot); + kvm_arch_guest_memory_reclaimed(kvm);
/* * We can re-use the old_memslots from above, the only difference