Hello,
This is RFC v2 for the TDX intra-host migration patch series. It addresses comments in RFC v1 [1] and is rebased onto the latest kvm/next (v6.16-rc1).
This patchset is built on top of the latest TDX selftests [2] and gmem linking [3] RFC patch series.
Here is the series stitched together for your convenience: https://github.com/googleprodkernel/linux-cc/tree/tdx-copyless-rfc-v2
Changes from RFC v1:
+ Added patch to prevent deadlock warnings by adjusting the locking order.
+ Added patch to allow vCPUs to be created for uninitialized VMs.
+ Minor optimizations to TDX intra-host migration core logic.
+ Moved LAPIC state transfer into TDX intra-host migration core logic.
+ Added logic to handle posted interrupts that are injected during migration.
+ Added selftests.
+ Addressed comments from RFC v1.
+ Various small changes to make the patchset compatible with the latest version of kvm/next.
[1] https://lore.kernel.org/lkml/20230407201921.2703758-2-sagis@google.com
[2] https://lore.kernel.org/lkml/20250414214801.2693294-2-sagis@google.com
[3] https://lore.kernel.org/all/cover.1747368092.git.afranji@google.com
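For reference, intra-host migration is driven entirely from userspace via
KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM on the destination VM fd. A minimal sketch of
that flow, based on the selftest helper later in this series (the function name
is illustrative and error handling is elided):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Move the encryption context of src_vm_fd into dst_vm_fd. This mirrors
 * the __tdx_migrate_from() helper in the selftests below.
 */
static int move_enc_context_from(int dst_vm_fd, int src_vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM,
		.args = { src_vm_fd },
	};

	return ioctl(dst_vm_fd, KVM_ENABLE_CAP, &cap);
}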
Ackerley Tng (2):
  KVM: selftests: Add TDX support for ucalls
  KVM: selftests: Add irqfd/interrupts test for TDX with migration

Ryan Afranji (3):
  KVM: x86: Adjust locking order in move_enc_context_from
  KVM: TDX: Allow vCPUs to be created for migration
  KVM: selftests: Refactor userspace_mem_region creation out of vm_mem_add

Sagi Shahar (5):
  KVM: Split tdp_mmu_pages to mirror and direct counters
  KVM: TDX: Add base implementation for tdx_vm_move_enc_context_from
  KVM: TDX: Implement moving mirror pages between 2 TDs
  KVM: TDX: Add core logic for TDX intra-host migration
  KVM: selftests: TDX: Add tests for TDX in-place migration
 arch/x86/include/asm/kvm_host.h               |   7 +-
 arch/x86/kvm/mmu.h                            |   2 +
 arch/x86/kvm/mmu/mmu.c                        |  66 ++++
 arch/x86/kvm/mmu/tdp_mmu.c                    |  72 +++-
 arch/x86/kvm/mmu/tdp_mmu.h                    |   6 +
 arch/x86/kvm/svm/sev.c                        |  13 +-
 arch/x86/kvm/vmx/main.c                       |  12 +-
 arch/x86/kvm/vmx/tdx.c                        | 236 +++++++++++-
 arch/x86/kvm/vmx/x86_ops.h                    |   1 +
 arch/x86/kvm/x86.c                            |  14 +-
 tools/testing/selftests/kvm/Makefile.kvm      |   2 +
 .../testing/selftests/kvm/include/kvm_util.h  |  25 ++
 .../selftests/kvm/include/x86/tdx/tdx_util.h  |   3 +
 .../selftests/kvm/include/x86/tdx/test_util.h |   5 +
 .../testing/selftests/kvm/include/x86/ucall.h |   4 +-
 tools/testing/selftests/kvm/lib/kvm_util.c    | 222 ++++++++----
 .../testing/selftests/kvm/lib/ucall_common.c  |   2 +-
 .../selftests/kvm/lib/x86/tdx/tdx_util.c      |  63 +++-
 .../selftests/kvm/lib/x86/tdx/test_util.c     |  17 +
 tools/testing/selftests/kvm/lib/x86/ucall.c   | 108 ++++--
 .../kvm/x86/tdx_irqfd_migrate_test.c          | 264 ++++++++++++++
 .../selftests/kvm/x86/tdx_migrate_tests.c     | 337 ++++++++++++++++++
 22 files changed, 1349 insertions(+), 132 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/x86/tdx_irqfd_migrate_test.c
 create mode 100644 tools/testing/selftests/kvm/x86/tdx_migrate_tests.c
From: Sagi Shahar <sagis@google.com>
tdp_mmu_pages counts all the active pages used by the TDP MMU. When state is transferred during intra-host migration, the mirror pages need to be transferred, but not the direct ones. The direct pages are re-faulted as needed on the destination, but that approach doesn't work for mirror pages, which store information in the secure EPT.
Keeping them in separate counters makes this transfer more efficient.
Signed-off-by: Sagi Shahar <sagis@google.com>
Signed-off-by: Ryan Afranji <afranji@google.com>
---
 arch/x86/include/asm/kvm_host.h |  7 +++++--
 arch/x86/kvm/mmu/tdp_mmu.c      | 11 +++++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 999872c13722..b9966394acda 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1484,10 +1484,13 @@ struct kvm_arch {
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_KVM_PROVE_MMU
 	/*
-	 * The number of TDP MMU pages across all roots. Used only to sanity
-	 * check that KVM isn't leaking TDP MMU pages.
+	 * The number of non-mirrored TDP MMU pages across all roots.
+	 * Used only to sanity check that KVM isn't leaking TDP MMU pages.
 	 */
 	atomic64_t tdp_mmu_pages;
+
+	/* Same as tdp_mmu_pages but only for mirror pages. */
+	atomic64_t tdp_mirror_mmu_pages;
 #endif

 	/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7f3d7229b2c1..115af5e4c5ed 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -42,6 +42,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)

 #ifdef CONFIG_KVM_PROVE_MMU
 	KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
+	KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mirror_mmu_pages));
 #endif
 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

@@ -328,7 +329,10 @@ static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	kvm_account_pgtable_pages((void *)sp->spt, +1);
 #ifdef CONFIG_KVM_PROVE_MMU
-	atomic64_inc(&kvm->arch.tdp_mmu_pages);
+	if (sp->role.is_mirror)
+		atomic64_inc(&kvm->arch.tdp_mirror_mmu_pages);
+	else
+		atomic64_inc(&kvm->arch.tdp_mmu_pages);
 #endif
 }

@@ -336,7 +340,10 @@ static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	kvm_account_pgtable_pages((void *)sp->spt, -1);
 #ifdef CONFIG_KVM_PROVE_MMU
-	atomic64_dec(&kvm->arch.tdp_mmu_pages);
+	if (sp->role.is_mirror)
+		atomic64_dec(&kvm->arch.tdp_mirror_mmu_pages);
+	else
+		atomic64_dec(&kvm->arch.tdp_mmu_pages);
 #endif
 }
Previously, move_enc_context_from() acquired the locks it needs in the order: 1) memslot lock, 2) vCPU lock. This can trigger a deadlock warning because a vCPU ioctl that modifies memslots acquires the same locks in the reverse order: 1) vCPU lock, 2) memslot lock.

Adjust move_enc_context_from() to match the vCPU ioctl's locking order to prevent deadlock warnings.
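For illustration, this is the classic ABBA pattern lockdep flags, shown here
as a hedged userspace sketch (the pthread mutexes merely stand in for KVM's
memslot and vCPU locks, which are not pthread mutexes):

#include <pthread.h>

static pthread_mutex_t vcpu_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t memslot_lock = PTHREAD_MUTEX_INITIALIZER;

/* Old migration path: memslot lock first, then vCPU lock (A -> B). */
static void migration_path(void)
{
	pthread_mutex_lock(&memslot_lock);
	pthread_mutex_lock(&vcpu_lock);
	pthread_mutex_unlock(&vcpu_lock);
	pthread_mutex_unlock(&memslot_lock);
}

/* vCPU ioctl modifying memslots: vCPU lock first (B -> A: potential deadlock). */
static void vcpu_ioctl_path(void)
{
	pthread_mutex_lock(&vcpu_lock);
	pthread_mutex_lock(&memslot_lock);
	pthread_mutex_unlock(&memslot_lock);
	pthread_mutex_unlock(&vcpu_lock);
}

The fix below makes the migration path take the vCPU locks first, so both
paths agree on a single ordering.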
Signed-off-by: Ryan Afranji <afranji@google.com>
---
 arch/x86/kvm/svm/sev.c | 13 +------------
 arch/x86/kvm/x86.c     | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 402543994b0b..380d5951f8dd 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1961,26 +1961,15 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, struct kvm *source_kvm)
 		charged = true;
 	}

-	ret = kvm_lock_all_vcpus(kvm);
-	if (ret)
-		goto out_dst_cgroup;
-	ret = kvm_lock_all_vcpus(source_kvm);
-	if (ret)
-		goto out_dst_vcpu;
-
 	ret = sev_check_source_vcpus(kvm, source_kvm);
 	if (ret)
-		goto out_source_vcpu;
+		goto out_dst_cgroup;

 	sev_migrate_from(kvm, source_kvm);
 	kvm_vm_dead(source_kvm);
 	cg_cleanup_sev = src_sev;
 	ret = 0;

-out_source_vcpu:
-	kvm_unlock_all_vcpus(source_kvm);
-out_dst_vcpu:
-	kvm_unlock_all_vcpus(kvm);
 out_dst_cgroup:
 	/* Operates on the source on success, on the destination on failure. */
 	if (charged)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b1672379a16b..c28fa28a8e42 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6743,10 +6743,18 @@ static int kvm_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
 	if (r)
 		goto out_mark_migration_done;

-	r = kvm_lock_vm_memslots(kvm, source_kvm);
+	r = kvm_lock_all_vcpus(kvm);
 	if (r)
 		goto out_unlock;

+	r = kvm_lock_all_vcpus(source_kvm);
+	if (r)
+		goto out_unlock_vcpus;
+
+	r = kvm_lock_vm_memslots(kvm, source_kvm);
+	if (r)
+		goto out_unlock_source_vcpus;
+
 	r = kvm_move_memory_ctxt_from(kvm, source_kvm);
 	if (r)
 		goto out_unlock_memslots;
@@ -6762,6 +6770,10 @@ static int kvm_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)

 out_unlock_memslots:
 	kvm_unlock_vm_memslots(kvm, source_kvm);
+out_unlock_source_vcpus:
+	kvm_unlock_all_vcpus(source_kvm);
+out_unlock_vcpus:
+	kvm_unlock_all_vcpus(kvm);
 out_unlock:
 	kvm_unlock_two_vms(kvm, source_kvm);
 out_mark_migration_done:
From: Sagi Shahar <sagis@google.com>
This should mostly match the logic in sev_vm_move_enc_context_from.
Signed-off-by: Sagi Shahar <sagis@google.com>
Signed-off-by: Ryan Afranji <afranji@google.com>
---
 arch/x86/kvm/vmx/main.c    | 12 +++++++++++-
 arch/x86/kvm/vmx/tdx.c     | 24 ++++++++++++++++++++++++
 arch/x86/kvm/vmx/x86_ops.h |  1 +
 3 files changed, 36 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index d1e02e567b57..125af25fd09a 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -879,6 +879,14 @@ static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
 	return 0;
 }

+static int vt_move_enc_context_from(struct kvm *kvm, struct kvm *source_kvm)
+{
+	if (!is_td(kvm))
+		return -ENOTTY;
+
+	return tdx_vm_move_enc_context_from(kvm, source_kvm);
+}
+
 #define vt_op(name) vt_##name
 #define vt_op_tdx_only(name) vt_##name
 #else /* CONFIG_KVM_INTEL_TDX */
@@ -1044,7 +1052,9 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
 	.mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl),
 	.vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl),

-	.private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level)
+	.private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level),
+
+	.vm_move_enc_context_from = vt_move_enc_context_from
 };

 struct kvm_x86_init_ops vt_init_ops __initdata = {
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index b952bc673271..07583a11d6e3 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -626,6 +626,7 @@ int tdx_vm_init(struct kvm *kvm)
 	kvm->arch.has_protected_state = true;
 	kvm->arch.has_private_mem = true;
 	kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
+	kvm->arch.use_vm_enc_ctxt_op = true;

 	/*
 	 * Because guest TD is protected, VMM can't parse the instruction in TD.
@@ -3524,3 +3525,26 @@ int __init tdx_bringup(void)
 	enable_tdx = 0;
 	return 0;
 }
+
+static __always_inline bool tdx_finalized(struct kvm *kvm)
+{
+	struct kvm_tdx *tdx_kvm = to_kvm_tdx(kvm);
+
+	return tdx_kvm->state == TD_STATE_RUNNABLE;
+}
+
+static int tdx_migrate_from(struct kvm *dst, struct kvm *src)
+{
+	return -EINVAL;
+}
+
+int tdx_vm_move_enc_context_from(struct kvm *kvm, struct kvm *src_kvm)
+{
+	if (!is_td(kvm) || !is_td(src_kvm))
+		return -EINVAL;
+
+	if (tdx_finalized(kvm) || !tdx_finalized(src_kvm))
+		return -EINVAL;
+
+	return tdx_migrate_from(kvm, src_kvm);
+}
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index b4596f651232..001f1540a560 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -164,6 +164,7 @@ void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
 void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
 int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
+int tdx_vm_move_enc_context_from(struct kvm *kvm, struct kvm *source_kvm);
 #endif
#endif /* __KVM_X86_VMX_X86_OPS_H */
From: Sagi Shahar <sagis@google.com>
Add functionality for moving the mirror EPT table from one TD to a new one.

This moves the root of the mirror EPT table from the source and overwrites the root of the destination.
Signed-off-by: Sagi Shahar <sagis@google.com>
Signed-off-by: Ryan Afranji <afranji@google.com>
---
 arch/x86/kvm/mmu.h         |  2 ++
 arch/x86/kvm/mmu/mmu.c     | 66 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/mmu/tdp_mmu.c | 61 ++++++++++++++++++++++++++++++++---
 arch/x86/kvm/mmu/tdp_mmu.h |  6 ++++
 4 files changed, 130 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index b4b6860ab971..b43d770daa05 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -102,6 +102,8 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu);
 void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
 			 int bytes);
+int kvm_mmu_move_mirror_pages_from(struct kvm_vcpu *vcpu,
+				   struct kvm_vcpu *src_vcpu);

 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index cbc84c6abc2e..09c1892e0ac1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3943,6 +3943,72 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
 	return r;
 }

+int kvm_mmu_move_mirror_pages_from(struct kvm_vcpu *vcpu,
+				   struct kvm_vcpu *src_vcpu)
+{
+	struct kvm_mmu *mmu = vcpu->arch.mmu;
+	struct kvm_mmu *src_mmu = src_vcpu->arch.mmu;
+	gfn_t gfn_shared = kvm_gfn_direct_bits(vcpu->kvm);
+	hpa_t mirror_root_hpa;
+	int r = -EINVAL;
+
+	if (!gfn_shared)
+		return r;
+
+	r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
+	if (r)
+		return r;
+
+	/* Hold locks for both src and dst. Always take the src lock first. */
+	read_lock(&src_vcpu->kvm->mmu_lock);
+	write_lock_nested(&vcpu->kvm->mmu_lock, SINGLE_DEPTH_NESTING);
+
+	WARN_ON_ONCE(!is_tdp_mmu_active(vcpu));
+	WARN_ON_ONCE(!is_tdp_mmu_active(src_vcpu));
+
+	/*
+	 * The mirror root is moved from the src to the dst and is marked as
+	 * invalid in the src.
+	 */
+	mirror_root_hpa = kvm_tdp_mmu_move_mirror_pages_from(vcpu, src_vcpu);
+	if (mirror_root_hpa == INVALID_PAGE) {
+		struct kvm_mmu_page *mirror_root;
+		union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
+
+		/*
+		 * This likely means that the mirror root was already moved by
+		 * another vCPU.
+		 */
+		role.is_mirror = true;
+		mirror_root = kvm_tdp_mmu_get_vcpu_root(vcpu, role);
+		if (!mirror_root) {
+			r = -EINVAL;
+			goto out_unlock;
+		}
+		mirror_root_hpa = __pa(mirror_root->spt);
+	}
+
+	mmu->mirror_root_hpa = mirror_root_hpa;
+	mmu_free_root_page(src_vcpu->kvm, &src_mmu->mirror_root_hpa, NULL);
+	write_unlock(&vcpu->kvm->mmu_lock);
+	read_unlock(&src_vcpu->kvm->mmu_lock);
+
+	/* The direct root is allocated normally and is not moved from src. */
+	kvm_tdp_mmu_alloc_root(vcpu, false);
+
+	kvm_mmu_load_pgd(vcpu);
+	kvm_x86_call(flush_tlb_current)(vcpu);
+
+	return r;
+
+out_unlock:
+	write_unlock(&vcpu->kvm->mmu_lock);
+	read_unlock(&src_vcpu->kvm->mmu_lock);
+
+	return r;
+}
+EXPORT_SYMBOL(kvm_mmu_move_mirror_pages_from);
+
 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 115af5e4c5ed..212716ab7e8b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -251,6 +251,22 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
 	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
 }

+struct kvm_mmu_page *
+kvm_tdp_mmu_get_vcpu_root(struct kvm_vcpu *vcpu,
+			  union kvm_mmu_page_role role)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_mmu_page *root;
+
+	lockdep_assert_held(&kvm->mmu_lock);
+	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+		if (root->role.word == role.word &&
+		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
+			return root;
+	}
+	return NULL;
+}
+
 void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -285,11 +301,9 @@ void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
 	 * fails, as the last reference to a root can only be put *after* the
 	 * root has been invalidated, which requires holding mmu_lock for write.
 	 */
-	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
-		if (root->role.word == role.word &&
-		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
-			goto out_spin_unlock;
-	}
+	root = kvm_tdp_mmu_get_vcpu_root(vcpu, role);
+	if (root)
+		goto out_spin_unlock;

 	root = tdp_mmu_alloc_sp(vcpu);
 	tdp_mmu_init_sp(root, NULL, 0, role);
@@ -321,6 +335,43 @@ void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
 	}
 }

+hpa_t kvm_tdp_mmu_move_mirror_pages_from(struct kvm_vcpu *vcpu,
+					 struct kvm_vcpu *src_vcpu)
+{
+	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm *src_kvm = src_vcpu->kvm;
+	struct kvm_mmu_page *mirror_root = NULL;
+	s64 num_mirror_pages, old;
+
+	lockdep_assert_held_read(&src_vcpu->kvm->mmu_lock);
+	lockdep_assert_held_write(&vcpu->kvm->mmu_lock);
+
+	/* Find the mirror root of the source. */
+	role.is_mirror = true;
+	mirror_root = kvm_tdp_mmu_get_vcpu_root(src_vcpu, role);
+	if (!mirror_root)
+		return INVALID_PAGE;
+
+	/* Remove the mirror root from the src kvm and add it to dst kvm. */
+	spin_lock(&src_vcpu->kvm->arch.tdp_mmu_pages_lock);
+	list_del_rcu(&mirror_root->link);
+	spin_unlock(&src_vcpu->kvm->arch.tdp_mmu_pages_lock);
+
+	/* The destination holds a write lock so no spin_lock required. */
+	list_add_rcu(&mirror_root->link, &kvm->arch.tdp_mmu_roots);
+
+#ifdef CONFIG_KVM_PROVE_MMU
+	num_mirror_pages = atomic64_read(&src_kvm->arch.tdp_mirror_mmu_pages);
+	old = atomic64_cmpxchg(&kvm->arch.tdp_mirror_mmu_pages, 0,
+			       num_mirror_pages);
+	/* The destination VM should have no mirror pages at this point. */
+	WARN_ON(old);
+	atomic64_set(&src_kvm->arch.tdp_mirror_mmu_pages, 0);
+#endif
+	return __pa(mirror_root->spt);
+}
+
 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 				u64 old_spte, u64 new_spte, int level,
 				bool shared);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 52acf99d40a0..abb1a84d8b1c 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -63,6 +63,12 @@ static inline struct kvm_mmu_page *tdp_mmu_get_root(struct kvm_vcpu *vcpu,
 	return root_to_sp(vcpu->arch.mmu->root.hpa);
 }

+struct kvm_mmu_page *
+kvm_tdp_mmu_get_vcpu_root(struct kvm_vcpu *vcpu,
+			  union kvm_mmu_page_role role);
+hpa_t kvm_tdp_mmu_move_mirror_pages_from(struct kvm_vcpu *vcpu,
+					 struct kvm_vcpu *src_vcpu);
+
 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
During migration, vCPUs need to be created for an uninitialized VM.
Move the TDX vCPU setup that requires an initialized VM out of tdx_vcpu_create() and into tdx_td_vcpu_init().
Signed-off-by: Ryan Afranji <afranji@google.com>
---
 arch/x86/kvm/vmx/tdx.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 07583a11d6e3..4582f94175b7 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -664,9 +664,6 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
 	struct vcpu_tdx *tdx = to_tdx(vcpu);

-	if (kvm_tdx->state != TD_STATE_INITIALIZED)
-		return -EIO;
-
 	/*
 	 * TDX module mandates APICv, which requires an in-kernel local APIC.
 	 * Disallow an in-kernel I/O APIC, because level-triggered interrupts
@@ -692,12 +689,6 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
 	vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
 	vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;

-	vcpu->arch.guest_state_protected =
-		!(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);
-
-	if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
-		vcpu->arch.xfd_no_write_intercept = true;
-
 	tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
 	__pi_set_sn(&tdx->vt.pi_desc);

@@ -3003,8 +2994,9 @@ static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)

 static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
 {
-	u64 apic_base;
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
 	struct vcpu_tdx *tdx = to_tdx(vcpu);
+	u64 apic_base;
 	int ret;

 	if (cmd->flags)
@@ -3013,6 +3005,15 @@ static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
 	if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
 		return -EINVAL;

+	if (kvm_tdx->state != TD_STATE_INITIALIZED)
+		return -EIO;
+
+	vcpu->arch.guest_state_protected = !(kvm_tdx->attributes &
+					     TDX_TD_ATTR_DEBUG);
+
+	if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
+		vcpu->arch.xfd_no_write_intercept = true;
+
 	/*
 	 * TDX requires X2APIC, userspace is responsible for configuring guest
 	 * CPUID accordingly.
From: Sagi Shahar <sagis@google.com>
Add the core logic for transferring state between source and destination TDs during intra-host migration.
Signed-off-by: Sagi Shahar <sagis@google.com>
Co-developed-by: Ryan Afranji <afranji@google.com>
Signed-off-by: Ryan Afranji <afranji@google.com>
---
 arch/x86/kvm/vmx/tdx.c | 193 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 192 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 4582f94175b7..268aca28d878 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3534,9 +3534,200 @@ static __always_inline bool tdx_finalized(struct kvm *kvm)
 	return tdx_kvm->state == TD_STATE_RUNNABLE;
 }

+#define MAX_APIC_VECTOR 256
+
+static int tdx_migrate_vcpus(struct kvm *dst, struct kvm *src)
+{
+	struct kvm_vcpu *src_vcpu;
+	struct kvm_tdx *dst_tdx;
+	unsigned long i;
+
+	dst_tdx = to_kvm_tdx(dst);
+
+	kvm_for_each_vcpu(i, src_vcpu, src)
+		tdx_flush_vp_on_cpu(src_vcpu);
+
+	/* Copy per-vCPU state. */
+	kvm_for_each_vcpu(i, src_vcpu, src) {
+		struct vcpu_tdx *dst_tdx_vcpu, *src_tdx_vcpu;
+		struct kvm_lapic_state src_lapic_state;
+		struct kvm_vcpu *dst_vcpu;
+		u64 apic_base;
+		u32 vector;
+		int ret;
+
+		src_tdx_vcpu = to_tdx(src_vcpu);
+		dst_vcpu = kvm_get_vcpu(dst, i);
+		dst_tdx_vcpu = to_tdx(dst_vcpu);
+
+		dst_vcpu->cpu = -1;
+
+		/* Destination vCPU initialization was skipped, so do it here. */
+		apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
+			    (kvm_vcpu_is_reset_bsp(dst_vcpu) ?
+			     MSR_IA32_APICBASE_BSP : 0);
+		if (kvm_apic_set_base(dst_vcpu, apic_base, true))
+			return -EINVAL;
+
+		/* Copy LAPIC state. */
+		ret = kvm_apic_get_state(src_vcpu, &src_lapic_state);
+		if (ret)
+			return -EINVAL;
+
+		ret = kvm_apic_set_state(dst_vcpu, &src_lapic_state);
+		if (ret)
+			return -EINVAL;
+
+		/*
+		 * pi_desc stores the state of posted interrupts for VMs, which
+		 * is processed by the pCPU during VM entry/runtime. For
+		 * non-confidential VMs, this storage is synchronized to vCPU
+		 * state using set_lapic_state(sync_pir_to_virr).
+		 *
+		 * For TDX VMs, KVM doesn't have access to the virtual LAPIC
+		 * page, so in order to preserve the interrupt state, copy the
+		 * pi_desc contents to the destination VM during copyless
+		 * migration.
+		 */
+		dst_tdx_vcpu->vt = src_tdx_vcpu->vt;
+		for (vector = 0; vector < MAX_APIC_VECTOR; vector++) {
+			if (pi_test_pir(vector, &src_tdx_vcpu->vt.pi_desc)) {
+				__vmx_deliver_posted_interrupt(
+						dst_vcpu,
+						&dst_tdx_vcpu->vt.pi_desc,
+						vector);
+			}
+		}
+
+		/* Copy non-TDX vCPU state. */
+		memcpy(dst_vcpu->arch.regs, src_vcpu->arch.regs,
+		       NR_VCPU_REGS * sizeof(src_vcpu->arch.regs[0]));
+
+		dst_vcpu->arch.regs_avail = src_vcpu->arch.regs_avail;
+		dst_vcpu->arch.regs_dirty = src_vcpu->arch.regs_dirty;
+		dst_vcpu->arch.tsc_offset = dst_tdx->tsc_offset;
+		dst_vcpu->arch.guest_state_protected =
+			src_vcpu->arch.guest_state_protected;
+		dst_vcpu->arch.xfd_no_write_intercept =
+			src_vcpu->arch.xfd_no_write_intercept;
+
+		/* Copy TD structures. */
+		dst_tdx_vcpu->vp.tdvpr_page = src_tdx_vcpu->vp.tdvpr_page;
+		dst_tdx_vcpu->vp.tdcx_pages = src_tdx_vcpu->vp.tdcx_pages;
+
+		td_vmcs_write64(dst_tdx_vcpu, POSTED_INTR_DESC_ADDR,
+				__pa(&dst_tdx_vcpu->vt.pi_desc));
+
+		/* Copy current vCPU status. */
+		dst_tdx_vcpu->ext_exit_qualification =
+			src_tdx_vcpu->ext_exit_qualification;
+		dst_tdx_vcpu->exit_gpa = src_tdx_vcpu->exit_gpa;
+		dst_tdx_vcpu->vp_enter_args = src_tdx_vcpu->vp_enter_args;
+		dst_tdx_vcpu->vp_enter_ret = src_tdx_vcpu->vp_enter_ret;
+		dst_tdx_vcpu->guest_entered = src_tdx_vcpu->guest_entered;
+		dst_tdx_vcpu->map_gpa_next = src_tdx_vcpu->map_gpa_next;
+		dst_tdx_vcpu->map_gpa_end = src_tdx_vcpu->map_gpa_end;
+
+		/* Copy mirror EPT tables. */
+		vcpu_load(dst_vcpu);
+		if (kvm_mmu_move_mirror_pages_from(dst_vcpu, src_vcpu)) {
+			vcpu_put(dst_vcpu);
+			return -EINVAL;
+		}
+		vcpu_put(dst_vcpu);
+
+		dst_vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+		dst_tdx_vcpu->state = VCPU_TD_STATE_INITIALIZED;
+
+		/*
+		 * Set the source vCPU's migrated structures to NULL to avoid
+		 * freeing them during source VM shutdown.
+		 */
+		src_tdx_vcpu->vp.tdvpr_page = NULL;
+		src_tdx_vcpu->vp.tdcx_pages = NULL;
+	}
+
+	return 0;
+}
+
 static int tdx_migrate_from(struct kvm *dst, struct kvm *src)
 {
-	return -EINVAL;
+	struct kvm_tdx *src_tdx, *dst_tdx;
+	bool charged = false;
+	int ret;
+
+	src_tdx = to_kvm_tdx(src);
+	dst_tdx = to_kvm_tdx(dst);
+
+	ret = -EINVAL;
+
+	if (src_tdx->state != TD_STATE_RUNNABLE) {
+		pr_warn("Cannot migrate from a non-finalized VM\n");
+		goto abort;
+	}
+
+	/* Transfer miscellaneous cgroup. */
+	dst_tdx->misc_cg = get_current_misc_cg();
+	if (dst_tdx->misc_cg != src_tdx->misc_cg) {
+		ret = misc_cg_try_charge(MISC_CG_RES_TDX, dst_tdx->misc_cg, 1);
+		if (ret)
+			goto abort_dst_cgroup;
+		charged = true;
+	}
+
+	dst_tdx->hkid = src_tdx->hkid;
+
+	/* Copy VM data. */
+	dst_tdx->attributes = src_tdx->attributes;
+	dst_tdx->xfam = src_tdx->xfam;
+	dst_tdx->tsc_offset = src_tdx->tsc_offset;
+	dst_tdx->tsc_multiplier = src_tdx->tsc_multiplier;
+	dst_tdx->nr_premapped = src_tdx->nr_premapped;
+	dst_tdx->wait_for_sept_zap = src_tdx->wait_for_sept_zap;
+	dst_tdx->kvm.arch.gfn_direct_bits = src_tdx->kvm.arch.gfn_direct_bits;
+
+	/* Copy TD structures. */
+	dst_tdx->td.tdcs_nr_pages = src_tdx->td.tdcs_nr_pages;
+	dst_tdx->td.tdcx_nr_pages = src_tdx->td.tdcx_nr_pages;
+	dst_tdx->td.tdr_page = src_tdx->td.tdr_page;
+	dst_tdx->td.tdcs_pages = src_tdx->td.tdcs_pages;
+
+	/* Copy per-vCPU state. */
+	ret = tdx_migrate_vcpus(dst, src);
+	if (ret)
+		goto late_abort;
+
+	dst->mem_attr_array.xa_head = src->mem_attr_array.xa_head;
+	src->mem_attr_array.xa_head = NULL;
+
+	dst_tdx->state = TD_STATE_RUNNABLE;
+
+	/*
+	 * Set the source's migrated structures to NULL to avoid freeing them
+	 * during source VM shutdown.
+	 */
+	src_tdx->hkid = -1;
+	src_tdx->td.tdr_page = NULL;
+	src_tdx->td.tdcs_pages = NULL;
+
+	return 0;
+
+late_abort:
+	/*
+	 * If we aborted after the state transfer already started, the src VM
+	 * is no longer valid.
+	 */
+	kvm_vm_dead(src);
+
+abort_dst_cgroup:
+	if (charged)
+		misc_cg_uncharge(MISC_CG_RES_TDX, dst_tdx->misc_cg, 1);
+	put_misc_cg(dst_tdx->misc_cg);
+	dst_tdx->misc_cg = NULL;
+abort:
+	dst_tdx->hkid = -1;
+	dst_tdx->td.tdr_page = NULL;
+	return ret;
 }

 int tdx_vm_move_enc_context_from(struct kvm *kvm, struct kvm *src_kvm)
Refactor the creation and committing of userspace_mem_region into their own functions so that they can be reused by future TDX migration functions.
Signed-off-by: Ryan Afranji <afranji@google.com>
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 147 +++++++++++++--------
 1 file changed, 89 insertions(+), 58 deletions(-)
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 2b442639ee2d..3c131718b81a 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -974,50 +974,47 @@ void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags
 		errno, strerror(errno));
 }

-
-/* FIXME: This thing needs to be ripped apart and rewritten. */
-void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
-		uint64_t guest_paddr, uint32_t slot, uint64_t npages,
-		uint32_t flags, int guest_memfd, uint64_t guest_memfd_offset)
+static struct userspace_mem_region *vm_mem_region_alloc(struct kvm_vm *vm,
+							uint64_t guest_paddr,
+							uint32_t slot,
+							size_t npages,
+							uint32_t flags)
 {
-	int ret;
 	struct userspace_mem_region *region;
-	size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
-	size_t mem_size = npages * vm->page_size;
-	size_t alignment;

 	TEST_REQUIRE_SET_USER_MEMORY_REGION2();

 	TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
-		"Number of guest pages is not compatible with the host. "
-		"Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages));
+		    "Number of guest pages is not compatible with the host. "
+		    "Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages));

 	TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical "
-		"address not on a page boundary.\n"
-		"  guest_paddr: 0x%lx vm->page_size: 0x%x",
-		guest_paddr, vm->page_size);
+		    "address not on a page boundary.\n"
+		    "  guest_paddr: 0x%lx vm->page_size: 0x%x",
+		    guest_paddr, vm->page_size);
 	TEST_ASSERT((((guest_paddr >> vm->page_shift) + npages) - 1)
-		<= vm->max_gfn, "Physical range beyond maximum "
-		"supported physical address,\n"
-		"  guest_paddr: 0x%lx npages: 0x%lx\n"
-		"  vm->max_gfn: 0x%lx vm->page_size: 0x%x",
-		guest_paddr, npages, vm->max_gfn, vm->page_size);
+		    <= vm->max_gfn, "Physical range beyond maximum "
+		    "supported physical address,\n"
+		    "  guest_paddr: 0x%lx npages: 0x%lx\n"
+		    "  vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+		    guest_paddr, npages, vm->max_gfn, vm->page_size);

 	/*
 	 * Confirm a mem region with an overlapping address doesn't
 	 * already exist.
 	 */
 	region = (struct userspace_mem_region *) userspace_mem_region_find(
-		vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1);
+			vm, guest_paddr,
+			(guest_paddr + npages * vm->page_size) - 1);
 	if (region != NULL)
 		TEST_FAIL("overlapping userspace_mem_region already "
-			"exists\n"
-			"  requested guest_paddr: 0x%lx npages: 0x%lx "
-			"page_size: 0x%x\n"
-			"  existing guest_paddr: 0x%lx size: 0x%lx",
-			guest_paddr, npages, vm->page_size,
-			(uint64_t) region->region.guest_phys_addr,
-			(uint64_t) region->region.memory_size);
+			  "exists\n"
+			  "  requested guest_paddr: 0x%lx npages: 0x%lx "
+			  "page_size: 0x%x\n"
+			  "  existing guest_paddr: 0x%lx size: 0x%lx",
+			  guest_paddr, npages, vm->page_size,
+			  (uint64_t) region->region.guest_phys_addr,
+			  (uint64_t) region->region.memory_size);

 	/* Confirm no region with the requested slot already exists. */
 	hash_for_each_possible(vm->regions.slot_hash, region, slot_node,
@@ -1026,19 +1023,73 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
 			continue;

 		TEST_FAIL("A mem region with the requested slot "
-			"already exists.\n"
-			"  requested slot: %u paddr: 0x%lx npages: 0x%lx\n"
-			"  existing slot: %u paddr: 0x%lx size: 0x%lx",
-			slot, guest_paddr, npages,
-			region->region.slot,
-			(uint64_t) region->region.guest_phys_addr,
-			(uint64_t) region->region.memory_size);
+			  "already exists.\n"
+			  "  requested slot: %u paddr: 0x%lx npages: 0x%lx\n"
+			  "  existing slot: %u paddr: 0x%lx size: 0x%lx",
+			  slot, guest_paddr, npages,
+			  region->region.slot,
+			  (uint64_t) region->region.guest_phys_addr,
+			  (uint64_t) region->region.memory_size);
 	}

 	/* Allocate and initialize new mem region structure. */
 	region = calloc(1, sizeof(*region));
 	TEST_ASSERT(region != NULL, "Insufficient Memory");
-	region->mmap_size = mem_size;
+
+	region->unused_phy_pages = sparsebit_alloc();
+	if (vm_arch_has_protected_memory(vm))
+		region->protected_phy_pages = sparsebit_alloc();
+	sparsebit_set_num(region->unused_phy_pages,
+			  guest_paddr >> vm->page_shift, npages);
+	region->region.slot = slot;
+	region->region.flags = flags;
+	region->region.guest_phys_addr = guest_paddr;
+	region->region.memory_size = npages * vm->page_size;
+
+	region->mmap_start = NULL;
+	region->mmap_size = 0;
+	region->host_mem = NULL;
+	region->fd = -1;
+
+	return region;
+}
+
+static void userspace_mem_region_commit(struct kvm_vm *vm,
+					struct userspace_mem_region *region)
+{
+	int ret;
+
+	region->region.userspace_addr = (uintptr_t) region->host_mem;
+	ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
+	TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n"
+		    "  rc: %i errno: %i\n"
+		    "  slot: %u flags: 0x%x\n"
+		    "  guest_phys_addr: 0x%lx size: 0x%lx guest_memfd: %d",
+		    ret, errno, region->region.slot, region->region.flags,
+		    (uint64_t) region->region.guest_phys_addr,
+		    (uint64_t) region->region.memory_size,
+		    region->region.guest_memfd);
+
+	/* Add to quick lookup data structures */
+	vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region);
+	vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region);
+	hash_add(vm->regions.slot_hash, &region->slot_node,
+		 region->region.slot);
+}
+
+/* FIXME: This thing needs to be ripped apart and rewritten. */
+void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
+		uint64_t guest_paddr, uint32_t slot, uint64_t npages,
+		uint32_t flags, int guest_memfd, uint64_t guest_memfd_offset)
+{
+	int ret;
+	struct userspace_mem_region *region;
+	size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
+	size_t mem_size = npages * vm->page_size;
+	size_t alignment;
+
+	region = vm_mem_region_alloc(vm, guest_paddr, slot, npages, flags);

 #ifdef __s390x__
 	/* On s390x, the host address must be aligned to 1M (due to PGSTEs) */
@@ -1058,6 +1109,8 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,

 	TEST_ASSERT_EQ(guest_paddr, align_up(guest_paddr, backing_src_pagesz));

+	region->mmap_size = mem_size;
+
 	/* Add enough memory to align up if necessary */
 	if (alignment > 1)
 		region->mmap_size += alignment;
@@ -1117,29 +1170,7 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
 		region->region.guest_memfd = -1;
 	}

-	region->unused_phy_pages = sparsebit_alloc();
-	if (vm_arch_has_protected_memory(vm))
-		region->protected_phy_pages = sparsebit_alloc();
-	sparsebit_set_num(region->unused_phy_pages,
-			  guest_paddr >> vm->page_shift, npages);
-	region->region.slot = slot;
-	region->region.flags = flags;
-	region->region.guest_phys_addr = guest_paddr;
-	region->region.memory_size = npages * vm->page_size;
-	region->region.userspace_addr = (uintptr_t) region->host_mem;
-	ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
-	TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n"
-		"  rc: %i errno: %i\n"
-		"  slot: %u flags: 0x%x\n"
-		"  guest_phys_addr: 0x%lx size: 0x%lx guest_memfd: %d",
-		ret, errno, slot, flags,
-		guest_paddr, (uint64_t) region->region.memory_size,
-		region->region.guest_memfd);
-
-	/* Add to quick lookup data structures */
-	vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region);
-	vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region);
-	hash_add(vm->regions.slot_hash, &region->slot_node, slot);
+	userspace_mem_region_commit(vm, region);

 	/* If shared memory, create an alias. */
 	if (region->fd >= 0) {
From: Sagi Shahar <sagis@google.com>
Add selftests for TDX in-place migration.
Signed-off-by: Ryan Afranji <afranji@google.com>
Signed-off-by: Sagi Shahar <sagis@google.com>
---
 tools/testing/selftests/kvm/Makefile.kvm      |   1 +
 .../testing/selftests/kvm/include/kvm_util.h  |  20 +
 .../selftests/kvm/include/x86/tdx/tdx_util.h  |   1 +
 tools/testing/selftests/kvm/lib/kvm_util.c    |  50 ++-
 .../selftests/kvm/lib/x86/tdx/tdx_util.c      |   3 +-
 .../selftests/kvm/x86/tdx_migrate_tests.c     | 358 ++++++++++++++++++
 6 files changed, 429 insertions(+), 4 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/x86/tdx_migrate_tests.c
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 1c7ea61e9031..d4c8cfb5910f 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -155,6 +155,7 @@ TEST_GEN_PROGS_x86 += pre_fault_memory_test
 TEST_GEN_PROGS_x86 += x86/tdx_vm_test
 TEST_GEN_PROGS_x86 += x86/tdx_shared_mem_test
 TEST_GEN_PROGS_x86 += x86/tdx_upm_test
+TEST_GEN_PROGS_x86 += x86/tdx_migrate_tests

 # Compiled outputs used by test targets
 TEST_GEN_PROGS_EXTENDED_x86 += x86/nx_huge_pages_test
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 267f78f3f16f..1b6489081e74 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -110,6 +110,9 @@ struct kvm_vm {

 	struct kvm_binary_stats stats;

+	/* VM was migrated using KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM */
+	bool enc_migrated;
+
 	/*
 	 * KVM region slots. These are the default memslots used by page
 	 * allocators, e.g., lib/elf uses the memslots[MEM_REGION_CODE]
@@ -673,6 +676,7 @@ static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm)

 void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
 void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
+void vm_migrate_mem_regions(struct kvm_vm *dst_vm, struct kvm_vm *src_vm);
 void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot);
 struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id);
 void vm_populate_vaddr_bitmap(struct kvm_vm *vm);
@@ -1132,6 +1136,22 @@ static inline struct kvm_vcpu *vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
 	return vcpu;
 }

+/*
+ * Adds a vCPU with no defaults. This vCPU will be used for migration.
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpu_id - The id of the VCPU to add to the VM.
+ */
+struct kvm_vcpu *vm_arch_vcpu_add_for_migration(struct kvm_vm *vm,
+						uint32_t vcpu_id);
+
+static inline struct kvm_vcpu *vm_vcpu_add_for_migration(struct kvm_vm *vm,
+							 uint32_t vcpu_id)
+{
+	return vm_arch_vcpu_add_for_migration(vm, vcpu_id);
+}
+
 /* Re-create a vCPU after restarting a VM, e.g. for state save/restore tests. */
 struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id);

diff --git a/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h b/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h
index ae39b78aa4af..9b495e621225 100644
--- a/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h
+++ b/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h
@@ -9,6 +9,7 @@ extern uint64_t tdx_s_bit;
 void tdx_filter_cpuid(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid_data);
 void __tdx_mask_cpuid_features(struct kvm_cpuid_entry2 *entry);
+void tdx_enable_capabilities(struct kvm_vm *vm);

 struct kvm_vcpu *td_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, void *guest_code);

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 3c131718b81a..9dc3c7bf0443 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -805,8 +805,10 @@ static void __vm_mem_region_delete(struct kvm_vm *vm,

 	sparsebit_free(&region->unused_phy_pages);
 	sparsebit_free(&region->protected_phy_pages);
-	ret = munmap(region->mmap_start, region->mmap_size);
-	TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
+	if (!vm->enc_migrated) {
+		ret = munmap(region->mmap_start, region->mmap_size);
+		TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
+	}
 	if (region->fd >= 0) {
 		/* There's an extra map when using shared memory. */
 		ret = munmap(region->mmap_alias, region->mmap_size);
@@ -1287,6 +1289,50 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa)
 		ret, errno, slot, new_gpa);
 }

+static void vm_migrate_mem_region(struct kvm_vm *dst_vm, struct kvm_vm *src_vm,
+				  struct userspace_mem_region *src_region)
+{
+	struct userspace_mem_region *dst_region;
+	int dst_guest_memfd;
+
+	dst_guest_memfd =
+		vm_link_guest_memfd(dst_vm, src_region->region.guest_memfd, 0);
+
+	dst_region = vm_mem_region_alloc(
+			dst_vm, src_region->region.guest_phys_addr,
+			src_region->region.slot,
+			src_region->region.memory_size / src_vm->page_size,
+			src_region->region.flags);
+
+	dst_region->mmap_size = src_region->mmap_size;
+	dst_region->mmap_start = src_region->mmap_start;
+	dst_region->host_mem = src_region->host_mem;
+
+	src_region->mmap_start = 0;
+	src_region->host_mem = 0;
+
+	dst_region->region.guest_memfd = dst_guest_memfd;
+	dst_region->region.guest_memfd_offset =
+		src_region->region.guest_memfd_offset;
+
+	userspace_mem_region_commit(dst_vm, dst_region);
+}
+
+void vm_migrate_mem_regions(struct kvm_vm *dst_vm, struct kvm_vm *src_vm)
+{
+	int bkt;
+	struct hlist_node *node;
+	struct userspace_mem_region *region;
+
+	hash_for_each_safe(src_vm->regions.slot_hash, bkt, node, region,
+			   slot_node) {
+		TEST_ASSERT(region->region.guest_memfd >= 0,
+			    "Migrating mem regions is only supported for GUEST_MEMFD");
+
+		vm_migrate_mem_region(dst_vm, src_vm, region);
+	}
+}
+
 /*
  * VM Memory Region Delete
  *
diff --git a/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c b/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
index c5bee67099c5..ef03d42f58d0 100644
--- a/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
+++ b/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
@@ -344,7 +344,7 @@ static void register_encrypted_memory_region(struct kvm_vm *vm,
  * TD creation/setup/finalization
  */

-static void tdx_enable_capabilities(struct kvm_vm *vm)
+void tdx_enable_capabilities(struct kvm_vm *vm)
 {
 	int rc;

@@ -574,7 +574,6 @@ void td_initialize(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
 	uint64_t nr_pages_required;

 	tdx_enable_capabilities(vm);
-
 	tdx_td_init(vm, attributes);
 	nr_pages_required = vm_nr_pages_required(VM_MODE_DEFAULT, 1, 0);
diff --git a/tools/testing/selftests/kvm/x86/tdx_migrate_tests.c b/tools/testing/selftests/kvm/x86/tdx_migrate_tests.c
new file mode 100644
index 000000000000..e15da2aa0437
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/tdx_migrate_tests.c
@@ -0,0 +1,358 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "tdx/tdcall.h"
+#include "tdx/tdx.h"
+#include "tdx/tdx_util.h"
+#include "tdx/test_util.h"
+#include <processor.h>
+#include <sys/wait.h>
+
+#define NR_MIGRATE_TEST_VMS 10
+#define TDX_IOEXIT_TEST_PORT 0x50
+
+static int __tdx_migrate_from(int dst_fd, int src_fd)
+{
+	struct kvm_enable_cap cap = {
+		.cap = KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM,
+		.args = { src_fd }
+	};
+
+	return ioctl(dst_fd, KVM_ENABLE_CAP, &cap);
+}
+
+static void tdx_migrate_from(struct kvm_vm *dst_vm, struct kvm_vm *src_vm)
+{
+	int ret;
+
+	vm_migrate_mem_regions(dst_vm, src_vm);
+	ret = __tdx_migrate_from(dst_vm->fd, src_vm->fd);
+	TEST_ASSERT(!ret, "Migration failed, ret: %d, errno: %d\n", ret, errno);
+	src_vm->enc_migrated = true;
+}
+
+void guest_code(void)
+{
+	int ret;
+	uint64_t data;
+
+	data = 1;
+	ret = tdg_vp_vmcall_instruction_io(TDX_IOEXIT_TEST_PORT, 1,
+					   PORT_WRITE, &data);
+	if (ret)
+		tdx_test_fatal_with_data(ret, __LINE__);
+
+	data++;
+	ret = tdg_vp_vmcall_instruction_io(TDX_IOEXIT_TEST_PORT, 1,
+					   PORT_WRITE, &data);
+	if (ret)
+		tdx_test_fatal_with_data(ret, __LINE__);
+
+	tdx_test_success();
+}
+
+static void test_tdx_migrate_vm_with_private_memory(void)
+{
+	struct kvm_vm *src_vm;
+	struct kvm_vm *dst_vm;
+	struct kvm_vcpu *dst_vcpu;
+	uint32_t data;
+
+	printf("Verifying migration of VM with private memory:\n");
+
+	src_vm = td_create();
+	td_initialize(src_vm, VM_MEM_SRC_ANONYMOUS, 0);
+	td_vcpu_add(src_vm, 0, guest_code);
+	td_finalize(src_vm);
+
+	dst_vm = td_create();
+	tdx_enable_capabilities(dst_vm);
+	dst_vcpu = vm_vcpu_recreate(dst_vm, 0);
+
+	tdx_migrate_from(dst_vm, src_vm);
+
+	kvm_vm_free(src_vm);
+
+	tdx_run(dst_vcpu);
+	tdx_test_assert_io(dst_vcpu, TDX_IOEXIT_TEST_PORT, 1, PORT_WRITE);
+	data = *(uint8_t *)((void *)dst_vcpu->run +
+			    dst_vcpu->run->io.data_offset);
+	TEST_ASSERT_EQ(data, 1);
+
+	tdx_run(dst_vcpu);
+	tdx_test_assert_io(dst_vcpu, TDX_IOEXIT_TEST_PORT, 1, PORT_WRITE);
+	data = *(uint8_t *)((void *)dst_vcpu->run +
+			    dst_vcpu->run->io.data_offset);
+	TEST_ASSERT_EQ(data, 2);
+
+	tdx_run(dst_vcpu);
+	tdx_test_assert_success(dst_vcpu);
+
+	kvm_vm_free(dst_vm);
+
+	printf("\t ... PASSED\n");
+}
+
+static void test_tdx_migrate_running_vm(void)
+{
+	struct kvm_vm *src_vm;
+	struct kvm_vm *dst_vm;
+	struct kvm_vcpu *src_vcpu;
+	struct kvm_vcpu *dst_vcpu;
+	uint32_t data;
+
+	printf("Verifying migration of a running VM:\n");
+
+	src_vm = td_create();
+	td_initialize(src_vm, VM_MEM_SRC_ANONYMOUS, 0);
+	src_vcpu = td_vcpu_add(src_vm, 0, guest_code);
+	td_finalize(src_vm);
+
+	dst_vm = td_create();
+	tdx_enable_capabilities(dst_vm);
+	dst_vcpu = vm_vcpu_recreate(dst_vm, 0);
+
+	tdx_run(src_vcpu);
+	tdx_test_assert_io(src_vcpu, TDX_IOEXIT_TEST_PORT, 1, PORT_WRITE);
+	data = *(uint8_t *)((void *)src_vcpu->run +
+			    src_vcpu->run->io.data_offset);
+	TEST_ASSERT_EQ(data, 1);
+
+	tdx_migrate_from(dst_vm, src_vm);
+
+	kvm_vm_free(src_vm);
+
+	tdx_run(dst_vcpu);
+	tdx_test_assert_io(dst_vcpu, TDX_IOEXIT_TEST_PORT, 1, PORT_WRITE);
+	data = *(uint8_t *)((void *)dst_vcpu->run +
+			    dst_vcpu->run->io.data_offset);
+	TEST_ASSERT_EQ(data, 2);
+
+	tdx_run(dst_vcpu);
+	tdx_test_assert_success(dst_vcpu);
+
+	kvm_vm_free(dst_vm);
+
+	printf("\t ... PASSED\n");
+}
+
+#define TDX_SHARED_MEM_TEST_PRIVATE_GVA (0x80000000)
+#define TDX_SHARED_MEM_TEST_VADDR_SHARED_MASK BIT_ULL(30)
+#define TDX_SHARED_MEM_TEST_SHARED_GVA     \
+	(TDX_SHARED_MEM_TEST_PRIVATE_GVA | \
+	 TDX_SHARED_MEM_TEST_VADDR_SHARED_MASK)
+
+#define TDX_SHARED_MEM_TEST_PRIVATE_VALUE (100)
+#define TDX_SHARED_MEM_TEST_SHARED_VALUE (200)
+#define TDX_SHARED_MEM_TEST_DIFF_VALUE (1)
+
+static uint64_t test_mem_private_gpa;
+static uint64_t test_mem_shared_gpa;
+
+void guest_with_shared_mem(void)
+{
+	uint64_t *test_mem_shared_gva =
+		(uint64_t *)TDX_SHARED_MEM_TEST_SHARED_GVA;
+
+	uint64_t *private_data, *shared_data;
+	uint64_t placeholder;
+	uint64_t failed_gpa;
+	uint64_t data;
+	int ret;
+
+	/* Map gpa as shared */
+	tdg_vp_vmcall_map_gpa(test_mem_shared_gpa, PAGE_SIZE, &failed_gpa);
+
+	shared_data = test_mem_shared_gva;
+	private_data = &data;
+
+	*private_data = TDX_SHARED_MEM_TEST_PRIVATE_VALUE;
+	*shared_data = TDX_SHARED_MEM_TEST_SHARED_VALUE;
+
+	ret = tdg_vp_vmcall_instruction_io(TDX_IOEXIT_TEST_PORT, 4,
+					   PORT_WRITE, private_data);
+	if (ret)
+		tdx_test_fatal_with_data(ret, __LINE__);
+
+	/* Exit so host can read shared value */
+	ret = tdg_vp_vmcall_instruction_io(TDX_IOEXIT_TEST_PORT, 4,
+					   PORT_WRITE, &placeholder);
+	if (ret)
+		tdx_test_fatal_with_data(ret, __LINE__);
+
+	*private_data += TDX_SHARED_MEM_TEST_DIFF_VALUE;
+	*shared_data += TDX_SHARED_MEM_TEST_DIFF_VALUE;
+
+	ret = tdg_vp_vmcall_instruction_io(TDX_IOEXIT_TEST_PORT, 4,
+					   PORT_WRITE, private_data);
+	if (ret)
+		tdx_test_fatal_with_data(ret, __LINE__);
+
+	/* Exit so host can read shared value */
+	ret = tdg_vp_vmcall_instruction_io(TDX_IOEXIT_TEST_PORT, 4,
+					   PORT_WRITE, &placeholder);
+	if (ret)
+		tdx_test_fatal_with_data(ret, __LINE__);
+
+	tdx_test_success();
+}
+
+static void test_tdx_migrate_vm_with_shared_mem(void)
+{
+	uint32_t private_data;
+	vm_vaddr_t test_mem_private_gva;
+	uint32_t *test_mem_hva;
+	struct kvm_vm *src_vm;
+	struct kvm_vm *dst_vm;
+	struct kvm_vcpu *src_vcpu;
+	struct kvm_vcpu *dst_vcpu;
+
+	printf("Verifying migration of a VM with shared memory:\n");
+
+	src_vm = td_create();
+	td_initialize(src_vm, VM_MEM_SRC_ANONYMOUS, 0);
+	src_vcpu = td_vcpu_add(src_vm, 0, guest_with_shared_mem);
+
+	/*
+	 * Set up shared memory page for testing by first allocating as private
+	 * and then mapping the same GPA again as shared. This way, the TD does
+	 * not have to remap its page tables at runtime.
+	 */
+	test_mem_private_gva = vm_vaddr_alloc(src_vm, src_vm->page_size,
+					      TDX_SHARED_MEM_TEST_PRIVATE_GVA);
+	TEST_ASSERT_EQ(test_mem_private_gva, TDX_SHARED_MEM_TEST_PRIVATE_GVA);
+
+	test_mem_hva = addr_gva2hva(src_vm, test_mem_private_gva);
+	TEST_ASSERT(test_mem_hva != NULL,
+		    "Guest address not found in guest memory regions\n");
+
+	test_mem_private_gpa = addr_gva2gpa(src_vm, test_mem_private_gva);
+	virt_map_shared(src_vm, TDX_SHARED_MEM_TEST_SHARED_GVA,
+			test_mem_private_gpa, 1);
+
+	test_mem_shared_gpa = test_mem_private_gpa | src_vm->arch.s_bit;
+	sync_global_to_guest(src_vm, test_mem_shared_gpa);
+
+	td_finalize(src_vm);
+
+	dst_vm = td_create();
+	tdx_enable_capabilities(dst_vm);
+	dst_vcpu = vm_vcpu_recreate(dst_vm, 0);
+
+	vm_enable_cap(src_vm, KVM_CAP_EXIT_HYPERCALL,
+		      BIT_ULL(KVM_HC_MAP_GPA_RANGE));
+
+	printf("Verifying shared memory accesses for TDX\n");
+
+	/* Begin guest execution; guest writes to shared memory. */
+	printf("\t ... Starting guest execution\n");
+
+	/* Handle map gpa as shared */
+	tdx_run(src_vcpu);
+
+	tdx_run(src_vcpu);
+	tdx_test_assert_io(src_vcpu, TDX_IOEXIT_TEST_PORT, 4, PORT_WRITE);
+	TEST_ASSERT_EQ(*(uint32_t *)((void *)src_vcpu->run +
+				     src_vcpu->run->io.data_offset),
+		       TDX_SHARED_MEM_TEST_PRIVATE_VALUE);
+
+	tdx_run(src_vcpu);
+	tdx_test_assert_io(src_vcpu, TDX_IOEXIT_TEST_PORT, 4, PORT_WRITE);
+	TEST_ASSERT_EQ(*test_mem_hva, TDX_SHARED_MEM_TEST_SHARED_VALUE);
+
+	tdx_migrate_from(dst_vm, src_vm);
+
+	kvm_vm_free(src_vm);
+
+	tdx_run(dst_vcpu);
+	tdx_test_assert_io(dst_vcpu, TDX_IOEXIT_TEST_PORT, 4, PORT_WRITE);
+	private_data = *(uint32_t *)((void *)dst_vcpu->run +
+				     dst_vcpu->run->io.data_offset);
+	TEST_ASSERT_EQ(private_data, TDX_SHARED_MEM_TEST_PRIVATE_VALUE +
+				     TDX_SHARED_MEM_TEST_DIFF_VALUE);
+
+	tdx_run(dst_vcpu);
+	tdx_test_assert_io(dst_vcpu, TDX_IOEXIT_TEST_PORT, 4, PORT_WRITE);
+	TEST_ASSERT_EQ(*test_mem_hva, TDX_SHARED_MEM_TEST_SHARED_VALUE +
+				      TDX_SHARED_MEM_TEST_DIFF_VALUE);
+
+	tdx_run(dst_vcpu);
+	tdx_test_assert_success(dst_vcpu);
+
+	kvm_vm_free(dst_vm);
+
+	printf("\t ... PASSED\n");
+}
+
+void guest_code_empty(void)
+{
+	tdx_test_success();
+}
+
+static void test_tdx_migrate_multiple_vms(void)
+{
+	struct kvm_vm *src_vm;
+	struct kvm_vm *dst_vms[NR_MIGRATE_TEST_VMS];
+	int i, ret;
+
+	printf("Verifying migration between multiple VMs:\n");
+
+	src_vm = td_create();
+	td_initialize(src_vm, VM_MEM_SRC_ANONYMOUS, 0);
+	td_vcpu_add(src_vm, 0, guest_code_empty);
+	td_finalize(src_vm);
+
+	for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i) {
+		dst_vms[i] = td_create();
+		tdx_enable_capabilities(dst_vms[i]);
+		vm_vcpu_recreate(dst_vms[i], 0);
+	}
+
+	/* Initial migration from the src to the first dst. */
+	tdx_migrate_from(dst_vms[0], src_vm);
+
+	for (i = 1; i < NR_MIGRATE_TEST_VMS; i++)
+		tdx_migrate_from(dst_vms[i], dst_vms[i - 1]);
+
+	/* Migrating back to the original VM must fail: it is already dead. */
+	ret = __tdx_migrate_from(src_vm->fd,
+				 dst_vms[NR_MIGRATE_TEST_VMS - 1]->fd);
+	TEST_ASSERT(ret == -1 && errno == EIO,
+		    "VM that was migrated from should be dead. ret %d, errno: %d\n",
+		    ret, errno);
+
+	kvm_vm_free(src_vm);
+	for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i)
+		kvm_vm_free(dst_vms[i]);
+
+	printf("\t ... PASSED\n");
+}
+
+int main(int argc, char *argv[])
+{
+	if (!is_tdx_enabled()) {
+		print_skip("TDX is not supported by KVM");
+		exit(KSFT_SKIP);
+	}
+
+	run_in_new_process(&test_tdx_migrate_vm_with_private_memory);
+	run_in_new_process(&test_tdx_migrate_running_vm);
+	run_in_new_process(&test_tdx_migrate_vm_with_shared_mem);
+	run_in_new_process(&test_tdx_migrate_multiple_vms);
+
+	return 0;
+}
From: Ackerley Tng <ackerleytng@google.com>
ucalls for non-CoCo VMs work by having the guest write to the RDI register and then execute an IO instruction to exit to the host. The host then reads RDI using kvm_get_regs().

CPU registers can't be read with kvm_get_regs() for TDX, so TDX guests use MMIO to pass the struct ucall's HVA to the host. MMIO was chosen because it is one of the simplest (hence unlikely to fail) mechanisms that supports passing 8 bytes from guest to host.

A new kvm_mem_region_type, MEM_REGION_UCALL, is added so TDX VMs can place the ucall_pool in a separate memslot that is set up as shared memory.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ryan Afranji <afranji@google.com>
---
 .../testing/selftests/kvm/include/kvm_util.h  |   1 +
 .../testing/selftests/kvm/include/x86/ucall.h |   4 +-
 .../testing/selftests/kvm/lib/ucall_common.c  |   2 +-
 .../selftests/kvm/lib/x86/tdx/tdx_util.c      |  40 +++++++
 tools/testing/selftests/kvm/lib/x86/ucall.c   | 108 ++++++++++++------
 5 files changed, 118 insertions(+), 37 deletions(-)
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 1b6489081e74..8b252a668c78 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -80,6 +80,7 @@ enum kvm_mem_region_type {
 	MEM_REGION_PT,
 	MEM_REGION_TEST_DATA,
 	MEM_REGION_TDX_BOOT_PARAMS,
+	MEM_REGION_UCALL,
 	NR_MEM_REGIONS,
 };

diff --git a/tools/testing/selftests/kvm/include/x86/ucall.h b/tools/testing/selftests/kvm/include/x86/ucall.h
index d3825dcc3cd9..0494a4a21557 100644
--- a/tools/testing/selftests/kvm/include/x86/ucall.h
+++ b/tools/testing/selftests/kvm/include/x86/ucall.h
@@ -6,8 +6,6 @@

 #define UCALL_EXIT_REASON	KVM_EXIT_IO

-static inline void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa)
-{
-}
+void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa);

 #endif
diff --git a/tools/testing/selftests/kvm/lib/ucall_common.c b/tools/testing/selftests/kvm/lib/ucall_common.c
index 42151e571953..5f195d4d15dc 100644
--- a/tools/testing/selftests/kvm/lib/ucall_common.c
+++ b/tools/testing/selftests/kvm/lib/ucall_common.c
@@ -33,7 +33,7 @@ void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa)
 	int i;

 	vaddr = vm_vaddr_alloc_shared(vm, sizeof(*hdr), KVM_UTIL_MIN_VADDR,
-				      MEM_REGION_DATA);
+				      MEM_REGION_UCALL);
 	hdr = (struct ucall_header *)addr_gva2hva(vm, vaddr);
 	memset(hdr, 0, sizeof(*hdr));

diff --git a/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c b/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
index ef03d42f58d0..a3612bf187a0 100644
--- a/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
+++ b/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
@@ -11,6 +11,7 @@
 #include "tdx/td_boot.h"
 #include "tdx/tdx.h"
 #include "test_util.h"
+#include "ucall_common.h"

 uint64_t tdx_s_bit;

@@ -568,6 +569,43 @@ static void td_setup_boot_parameters(struct kvm_vm *vm, enum vm_mem_backing_src_
 	TEST_ASSERT_EQ(addr, TD_BOOT_PARAMETERS_GPA);
 }

+/*
+ * GPA where ucall headers/pool will be set up
+ *
+ * TD_UCALL_POOL_GPA is arbitrarily chosen to
+ *
+ * + Be within the 4GB address space
+ * + Not clash with the other memslots for boot parameters, boot code and test
+ *   code
+ */
+#define TD_UCALL_POOL_GPA 0x30000000
+/*
+ * GPA to use for ucall MMIO writes
+ *
+ * TD_UCALL_MMIO_GPA is arbitrarily chosen to
+ *
+ * + Be within the 4GB address space
+ * + Not clash with the other memslots for boot parameters, boot code and test
+ *   code
+ * + Not be configured in any memslot (unconfigured GPAs are treated as
+ *   MMIOs). For now, TDX VMs can't be used with KVM_MEM_READONLY, so using
+ *   readonly memslots won't work for TDX VMs.
+ */
+#define TD_UCALL_MMIO_GPA 0x40000000
+#define TD_UCALL_MEMSLOT 4
+
+static void td_setup_ucall(struct kvm_vm *vm)
+{
+	int npages;
+
+	npages = ucall_nr_pages_required(PAGE_SIZE);
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, TD_UCALL_POOL_GPA,
+				    TD_UCALL_MEMSLOT, npages, 0);
+	vm->memslots[MEM_REGION_UCALL] = TD_UCALL_MEMSLOT;
+
+	ucall_init(vm, TD_UCALL_MMIO_GPA);
+}
+
 void td_initialize(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
 		   uint64_t attributes)
 {
@@ -593,6 +631,8 @@ void td_initialize(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,

 	td_setup_boot_code(vm, src_type);
 	td_setup_boot_parameters(vm, src_type);
+
+	td_setup_ucall(vm);
 }
void td_finalize(struct kvm_vm *vm) diff --git a/tools/testing/selftests/kvm/lib/x86/ucall.c b/tools/testing/selftests/kvm/lib/x86/ucall.c index 1265cecc7dd1..5cf915dbb588 100644 --- a/tools/testing/selftests/kvm/lib/x86/ucall.c +++ b/tools/testing/selftests/kvm/lib/x86/ucall.c @@ -5,52 +5,94 @@ * Copyright (C) 2018, Red Hat, Inc. */ #include "kvm_util.h" +#include "kvm_util_types.h" +#include "tdx/tdx.h"
#define UCALL_PIO_PORT ((uint16_t)0x1000)
+static uint8_t vm_type; +static vm_paddr_t host_ucall_mmio_gpa; +static vm_paddr_t ucall_mmio_gpa; + +void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) +{ + vm_type = vm->type; + sync_global_to_guest(vm, vm_type); + + host_ucall_mmio_gpa = ucall_mmio_gpa = mmio_gpa; + +#ifdef __x86_64__ + if (vm_type == KVM_X86_TDX_VM) + ucall_mmio_gpa |= vm->arch.s_bit; +#endif + + sync_global_to_guest(vm, ucall_mmio_gpa); +} + void ucall_arch_do_ucall(vm_vaddr_t uc) { - /* - * FIXME: Revert this hack (the entire commit that added it) once nVMX - * preserves L2 GPRs across a nested VM-Exit. If a ucall from L2, e.g. - * to do a GUEST_SYNC(), lands the vCPU in L1, any and all GPRs can be - * clobbered by L1. Save and restore non-volatile GPRs (clobbering RBP - * in particular is problematic) along with RDX and RDI (which are - * inputs), and clobber volatile GPRs. *sigh* - */ -#define HORRIFIC_L2_UCALL_CLOBBER_HACK \ + switch (vm_type) { + case KVM_X86_TDX_VM: + tdg_vp_vmcall_ve_request_mmio_write(ucall_mmio_gpa, 8, uc); + return; + default: + /* + * FIXME: Revert this hack (the entire commit that added it) + * once nVMX preserves L2 GPRs across a nested VM-Exit. If a + * ucall from L2, e.g. to do a GUEST_SYNC(), lands the vCPU in + * L1, any and all GPRs can be clobbered by L1. Save and + * restore non-volatile GPRs (clobbering RBP in particular is + * problematic) along with RDX and RDI (which are inputs), and + * clobber volatile GPRs. *sigh* + */ +#define HORRIFIC_L2_UCALL_CLOBBER_HACK \ "rcx", "rsi", "r8", "r9", "r10", "r11"
- asm volatile("push %%rbp\n\t" - "push %%r15\n\t" - "push %%r14\n\t" - "push %%r13\n\t" - "push %%r12\n\t" - "push %%rbx\n\t" - "push %%rdx\n\t" - "push %%rdi\n\t" - "in %[port], %%al\n\t" - "pop %%rdi\n\t" - "pop %%rdx\n\t" - "pop %%rbx\n\t" - "pop %%r12\n\t" - "pop %%r13\n\t" - "pop %%r14\n\t" - "pop %%r15\n\t" - "pop %%rbp\n\t" - : : [port] "d" (UCALL_PIO_PORT), "D" (uc) : "rax", "memory", - HORRIFIC_L2_UCALL_CLOBBER_HACK); + asm volatile("push %%rbp\n\t" + "push %%r15\n\t" + "push %%r14\n\t" + "push %%r13\n\t" + "push %%r12\n\t" + "push %%rbx\n\t" + "push %%rdx\n\t" + "push %%rdi\n\t" + "in %[port], %%al\n\t" + "pop %%rdi\n\t" + "pop %%rdx\n\t" + "pop %%rbx\n\t" + "pop %%r12\n\t" + "pop %%r13\n\t" + "pop %%r14\n\t" + "pop %%r15\n\t" + "pop %%rbp\n\t" + : + : [port] "d"(UCALL_PIO_PORT), "D"(uc) + : "rax", "memory", HORRIFIC_L2_UCALL_CLOBBER_HACK); + } }
void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run;
-	if (run->exit_reason == KVM_EXIT_IO && run->io.port == UCALL_PIO_PORT) {
-		struct kvm_regs regs;
+	switch (vm_type) {
+	case KVM_X86_TDX_VM:
+		if (run->exit_reason == KVM_EXIT_MMIO &&
+		    run->mmio.phys_addr == host_ucall_mmio_gpa &&
+		    run->mmio.len == 8 && run->mmio.is_write) {
+			uint64_t data = *(uint64_t *)run->mmio.data;
+
+			return (void *)data;
+		}
+		return NULL;
+	default:
+		if (run->exit_reason == KVM_EXIT_IO &&
+		    run->io.port == UCALL_PIO_PORT) {
+			struct kvm_regs regs;
-		vcpu_regs_get(vcpu, &regs);
-		return (void *)regs.rdi;
+			vcpu_regs_get(vcpu, &regs);
+			return (void *)regs.rdi;
+		}
+		return NULL;
 	}
-	return NULL;
 }
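For context, the host-side run loop that consumes these MMIO-based ucalls is unchanged from the PIO case, since get_ucall() hides the transport. Below is an illustrative sketch only: run_td_vcpu_to_done() is a made-up name, while tdx_run() and get_ucall() are the helpers used by the tests later in this series.

/*
 * Illustrative sketch: drive a TD vCPU until UCALL_DONE. For TDX guests,
 * ucalls now arrive as 8-byte MMIO writes to the ucall GPA instead of
 * PIO exits, but get_ucall() decodes both transports the same way.
 */
static void run_td_vcpu_to_done(struct kvm_vcpu *vcpu)
{
	struct ucall uc;

	for (;;) {
		tdx_run(vcpu);

		switch (get_ucall(vcpu, &uc)) {
		case UCALL_SYNC:
			continue;
		case UCALL_DONE:
			return;
		case UCALL_ABORT:
			REPORT_GUEST_ASSERT(uc);
		default:
			TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
		}
	}
}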
From: Ackerley Tng ackerleytng@google.com
Add a selftest to verify that interrupts sent to a TDX VM before migration are successfully handled by the migrated VM.
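In outline, each migration iteration of the test does the following. This is a simplified sketch, not code from the patch: migrate_once() is a made-up wrapper, while eventfds, NUM_INTERRUPTS, map_gsis_to_vectors() and the TDX helpers are the ones defined in tdx_irqfd_migrate_test.c below.

/* Simplified sketch of one migration iteration of the test. */
static struct kvm_vm *migrate_once(struct kvm_vm *vm, int *eventfds)
{
	struct kvm_vm *next_vm = td_create();
	struct kvm_vcpu *next_vcpu;
	int vector;

	tdx_enable_capabilities(next_vm);
	next_vcpu = vm_vcpu_recreate(next_vm, 0);

	/* Fire one interrupt per vector on the source via its irqfds. */
	for (vector = 32; vector < NUM_INTERRUPTS; ++vector)
		TEST_ASSERT_EQ(eventfd_write(eventfds[vector], 1), 0);

	/* Re-point the GSI routes and irqfds at the destination. */
	map_gsis_to_vectors(next_vm, next_vcpu, eventfds);

	tdx_migrate_from(next_vm, vm);
	kvm_vm_free(vm);

	/* The destination must then observe every pending interrupt once. */
	return next_vm;
}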
Co-developed-by: Ryan Afranji afranji@google.com Signed-off-by: Ryan Afranji afranji@google.com Signed-off-by: Ackerley Tng ackerleytng@google.com --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../testing/selftests/kvm/include/kvm_util.h | 4 + .../selftests/kvm/include/x86/tdx/tdx_util.h | 2 + .../selftests/kvm/include/x86/tdx/test_util.h | 5 + tools/testing/selftests/kvm/lib/kvm_util.c | 35 ++- .../selftests/kvm/lib/x86/tdx/tdx_util.c | 20 ++ .../selftests/kvm/lib/x86/tdx/test_util.c | 17 ++ .../kvm/x86/tdx_irqfd_migrate_test.c | 264 ++++++++++++++++++ .../selftests/kvm/x86/tdx_migrate_tests.c | 21 -- 9 files changed, 343 insertions(+), 26 deletions(-) create mode 100644 tools/testing/selftests/kvm/x86/tdx_irqfd_migrate_test.c
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index d4c8cfb5910f..4ae0d105c2a7 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -156,6 +156,7 @@ TEST_GEN_PROGS_x86 += x86/tdx_vm_test TEST_GEN_PROGS_x86 += x86/tdx_shared_mem_test TEST_GEN_PROGS_x86 += x86/tdx_upm_test TEST_GEN_PROGS_x86 += x86/tdx_migrate_tests +TEST_GEN_PROGS_x86 += x86/tdx_irqfd_migrate_test
# Compiled outputs used by test targets TEST_GEN_PROGS_EXTENDED_x86 += x86/nx_huge_pages_test diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 8b252a668c78..f93ac2b9b0ff 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -80,6 +80,7 @@ enum kvm_mem_region_type { MEM_REGION_PT, MEM_REGION_TEST_DATA, MEM_REGION_TDX_BOOT_PARAMS, + MEM_REGION_TDX_SHARED_DATA, MEM_REGION_UCALL, NR_MEM_REGIONS, }; @@ -958,6 +959,9 @@ int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); struct kvm_irq_routing *kvm_gsi_routing_create(void); void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing, uint32_t gsi, uint32_t pin); +void kvm_gsi_routing_msi_add(struct kvm_irq_routing *routing, uint32_t gsi, + uint32_t address_lo, uint32_t address_hi, + uint32_t data); int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing); void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing);
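As a usage sketch for the new kvm_gsi_routing_msi_add() helper (the wrapper name and the GSI/vector/destination values are made up for illustration; the encoding follows the Intel SDM "Message Address/Data Register Format"):

/*
 * Illustrative only: route GSI 24 as an edge-triggered, fixed-delivery
 * MSI with vector 0x30 to the APIC with physical ID 0.
 */
static void example_msi_route(struct kvm_vm *vm)
{
	struct kvm_irq_routing *routing = kvm_gsi_routing_create();

	kvm_gsi_routing_msi_add(routing, /*gsi=*/24,
				/*address_lo=*/0xfee00000, /* base 0xfee, dest ID 0 */
				/*address_hi=*/0,
				/*data=*/0x30);		   /* vector 0x30, fixed, edge */

	/* Writing consumes and frees the routing table. */
	kvm_gsi_routing_write(vm, routing);
}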
diff --git a/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h b/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h index 9b495e621225..4393c8649718 100644 --- a/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h +++ b/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h @@ -10,6 +10,8 @@ extern uint64_t tdx_s_bit; void tdx_filter_cpuid(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid_data); void __tdx_mask_cpuid_features(struct kvm_cpuid_entry2 *entry); void tdx_enable_capabilities(struct kvm_vm *vm); +int __tdx_migrate_from(int dst_fd, int src_fd); +void tdx_migrate_from(struct kvm_vm *dst_vm, struct kvm_vm *src_vm);
struct kvm_vcpu *td_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, void *guest_code);
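For reference, a single intra-host migration with these helpers looks roughly like the sketch below. clone_td() is a hypothetical wrapper; the call order (enable capabilities, recreate vCPUs, then move the context) mirrors the tests later in this series.

/* Illustrative only: move a running TD into a fresh destination VM. */
static struct kvm_vm *clone_td(struct kvm_vm *src_vm)
{
	struct kvm_vm *dst_vm = td_create();

	tdx_enable_capabilities(dst_vm);
	vm_vcpu_recreate(dst_vm, 0);

	/*
	 * tdx_migrate_from() migrates the memslots (linking guest_memfds)
	 * and then issues KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM on dst_vm.
	 */
	tdx_migrate_from(dst_vm, src_vm);
	kvm_vm_free(src_vm);

	return dst_vm;
}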
diff --git a/tools/testing/selftests/kvm/include/x86/tdx/test_util.h b/tools/testing/selftests/kvm/include/x86/tdx/test_util.h index 3330d5a54698..0dd859974cb3 100644 --- a/tools/testing/selftests/kvm/include/x86/tdx/test_util.h +++ b/tools/testing/selftests/kvm/include/x86/tdx/test_util.h @@ -130,4 +130,9 @@ uint64_t tdx_test_read_64bit(struct kvm_vcpu *vcpu, uint64_t port); */ uint64_t tdx_test_read_64bit_report_from_guest(struct kvm_vcpu *vcpu);
+/*
+ * Soft-enables the APIC of a TDX guest. TDs always run with x2APIC
+ * enabled, so only the software-enable bit needs to be set.
+ */
+void tdx_guest_x2apic_enable(void);
+
 #endif // SELFTEST_TDX_TEST_UTIL_H
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 9dc3c7bf0443..bbb489635064 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1293,10 +1293,12 @@ static void vm_migrate_mem_region(struct kvm_vm *dst_vm, struct kvm_vm *src_vm,
 				  struct userspace_mem_region *src_region)
 {
 	struct userspace_mem_region *dst_region;
-	int dst_guest_memfd;
+	int dst_guest_memfd = -1;
- dst_guest_memfd = - vm_link_guest_memfd(dst_vm, src_region->region.guest_memfd, 0); + if (src_region->region.guest_memfd != -1) + dst_guest_memfd = vm_link_guest_memfd(dst_vm, + src_region->region.guest_memfd, + 0);
dst_region = vm_mem_region_alloc( dst_vm, src_region->region.guest_phys_addr, @@ -1312,8 +1314,12 @@ static void vm_migrate_mem_region(struct kvm_vm *dst_vm, struct kvm_vm *src_vm, src_region->host_mem = 0;
dst_region->region.guest_memfd = dst_guest_memfd; - dst_region->region.guest_memfd_offset = - src_region->region.guest_memfd_offset; + if (src_region->region.guest_memfd == -1) { + dst_region->fd = src_region->fd; + } else { + dst_region->region.guest_memfd_offset = + src_region->region.guest_memfd_offset; + }
userspace_mem_region_commit(dst_vm, dst_region); } @@ -2057,6 +2063,25 @@ void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing, routing->nr++; }
+void kvm_gsi_routing_msi_add(struct kvm_irq_routing *routing, uint32_t gsi, + uint32_t address_lo, uint32_t address_hi, + uint32_t data) +{ + int i; + + assert(routing); + assert(routing->nr < KVM_MAX_IRQ_ROUTES); + + i = routing->nr; + routing->entries[i].gsi = gsi; + routing->entries[i].type = KVM_IRQ_ROUTING_MSI; + routing->entries[i].flags = 0; + routing->entries[i].u.msi.address_lo = address_lo; + routing->entries[i].u.msi.address_hi = address_hi; + routing->entries[i].u.msi.data = data; + routing->nr++; +} + int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing) { int ret; diff --git a/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c b/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c index a3612bf187a0..8216a778474a 100644 --- a/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c +++ b/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c @@ -372,6 +372,26 @@ static void tdx_apply_cr4_restrictions(struct kvm_sregs *sregs) sregs->cr4 &= ~(X86_CR4_VMXE | X86_CR4_SMXE); }
+int __tdx_migrate_from(int dst_fd, int src_fd)
+{
+	struct kvm_enable_cap cap = {
+		.cap = KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM,
+		.args = { src_fd }
+	};
+
+	return ioctl(dst_fd, KVM_ENABLE_CAP, &cap);
+}
+
+void tdx_migrate_from(struct kvm_vm *dst_vm, struct kvm_vm *src_vm)
+{
+	int ret;
+
+	vm_migrate_mem_regions(dst_vm, src_vm);
+	ret = __tdx_migrate_from(dst_vm->fd, src_vm->fd);
+	TEST_ASSERT(!ret, "Migration failed, ret: %d, errno: %d", ret, errno);
+	src_vm->enc_migrated = true;
+}
+
 static void load_td_boot_code(struct kvm_vm *vm)
 {
 	void *boot_code_hva = addr_gpa2hva(vm, FOUR_GIGABYTES_GPA - TD_BOOT_CODE_SIZE);
diff --git a/tools/testing/selftests/kvm/lib/x86/tdx/test_util.c b/tools/testing/selftests/kvm/lib/x86/tdx/test_util.c
index f92ddda2d1ac..7b622ccb2433 100644
--- a/tools/testing/selftests/kvm/lib/x86/tdx/test_util.c
+++ b/tools/testing/selftests/kvm/lib/x86/tdx/test_util.c
@@ -6,6 +6,7 @@
 #include <sys/wait.h>
 #include <unistd.h>
+#include "apic.h"
 #include "kvm_util.h"
 #include "tdx/tdcall.h"
 #include "tdx/tdx.h"
@@ -185,3 +186,19 @@ uint64_t tdx_test_read_64bit_report_from_guest(struct kvm_vcpu *vcpu)
 {
 	return tdx_test_read_64bit(vcpu, TDX_TEST_REPORT_PORT);
 }
+
+void tdx_guest_x2apic_enable(void)
+{
+	uint64_t x2apic_spiv = APIC_BASE_MSR + (APIC_SPIV >> 4);
+	uint64_t value, ret;
+
+	/*
+	 * x2APIC does not have to be enabled for TDs: TDs already have
+	 * x2APIC enabled and must use x2APIC. Hence, just soft-enable the
+	 * APIC via the spurious-interrupt vector register.
+	 */
+	ret = tdg_vp_vmcall_instruction_rdmsr(x2apic_spiv, &value);
+	GUEST_ASSERT_EQ(ret, 0);
+	ret = tdg_vp_vmcall_instruction_wrmsr(x2apic_spiv,
+					      value | APIC_SPIV_APIC_ENABLED);
+	GUEST_ASSERT_EQ(ret, 0);
+}
diff --git a/tools/testing/selftests/kvm/x86/tdx_irqfd_migrate_test.c b/tools/testing/selftests/kvm/x86/tdx_irqfd_migrate_test.c
new file mode 100644
index 000000000000..d80cc204bd67
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/tdx_irqfd_migrate_test.c
@@ -0,0 +1,264 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <stdint.h>
+#include <stdio.h>
+#include <linux/kvm.h>
+#include <string.h>
+#include <sys/eventfd.h>
+
+#include "apic.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "tdx/tdcall.h"
+#include "tdx/tdx.h"
+#include "tdx/tdx_util.h"
+#include "tdx/test_util.h"
+#include "test_util.h"
+#include "ucall_common.h"
+
+#define TEST_IRQ_PIN 24
+
+#define NUM_INTERRUPTS 256
+#define INTERRUPT_COUNT_GPA 0x100000000ULL
+#define INTERRUPT_COUNT_MEMSLOT 5
+
+#define MIGRATION_LOOPS 10
+
+static uint32_t (*interrupt_count_per_vector)[NUM_INTERRUPTS];
+
+static void interrupt_handler_increment_count(struct ex_regs *regs)
+{
+	(*interrupt_count_per_vector)[regs->vector]++;
+	x2apic_write_reg(APIC_EOI, 0);
+}
+
+static void guest_code(void)
+{
+	uint32_t sync_count = 0;
+
+	tdx_guest_x2apic_enable();
+
+	/* Enable interrupts, which are disabled by default. */
+	asm volatile("sti");
+
+	/* Keep the guest runnable by continuously looping. */
+	while (true)
+		GUEST_SYNC(++sync_count);
+}
+
+/**
+ * gsi_route_add() - Add a GSI-to-MSI route to a routing table.
+ *
+ * @table: Routing table to extend
+ * @gsi: GSI for the new route
+ * @use_x2apic_format: Use the x2APIC (32-bit destination ID) format
+ * @dest_id: Destination APIC ID
+ * @vector: Interrupt vector to deliver
+ * @msi_redir_hint: Look up "Message Address Register Format" in the Intel SDM
+ * @dest_mode: Look up "Message Address Register Format" in the Intel SDM.
+ *             Use false for DM=0 and true for DM=1
+ * @trig_mode: Look up "Message Data Register Format" in the Intel SDM.
+ *             Use false for edge sensitive and true for level sensitive
+ * @delivery_mode: A 3-bit code; look up "Message Data Register Format"
+ *
+ * Add a route by building up the routing information in address_hi, address_lo
+ * and data according to how it is used in struct kvm_lapic_irq. For full
+ * details, look up how the fields in struct kvm_lapic_irq are used.
+ *
+ * Return: None
+ */
+static void gsi_route_add(struct kvm_irq_routing *table, uint32_t gsi,
+			   bool use_x2apic_format, uint32_t dest_id,
+			   uint8_t vector, bool msi_redir_hint, bool dest_mode,
+			   bool trig_mode, uint8_t delivery_mode)
+{
+	union {
+		struct {
+			u32 vector : 8, delivery_mode : 3,
+				dest_mode_logical : 1, reserved : 2,
+				active_low : 1, is_level : 1;
+		};
+		uint32_t as_uint32;
+	} data = { 0 };
+	union {
+		struct {
+			u32 reserved_0 : 2, dest_mode_logical : 1,
+				redirect_hint : 1, reserved_1 : 1,
+				virt_destid_8_14 : 7, destid_0_7 : 8,
+				base_address : 12;
+		};
+		uint32_t as_uint32;
+	} address_lo = { 0 };
+	union {
+		struct {
+			u32 reserved : 8, destid_8_31 : 24;
+		};
+		uint32_t as_uint32;
+	} address_hi = { 0 };
+
+	/* Fixed 0xfee (see Intel SDM "Message Address Register Format") */
+	address_lo.base_address = 0xfee;
+
+	address_lo.destid_0_7 = dest_id & 0xff;
+	if (use_x2apic_format)
+		address_hi.destid_8_31 = (dest_id & 0xffffff00) >> 8;
+
+	data.vector = vector;
+	address_lo.dest_mode_logical = dest_mode;
+	data.is_level = trig_mode;
+	data.delivery_mode = delivery_mode & 0b111;
+	address_lo.redirect_hint = msi_redir_hint;
+
+	kvm_gsi_routing_msi_add(table, gsi, address_lo.as_uint32,
+				address_hi.as_uint32, data.as_uint32);
+}
+
+/**
+ * set_irqfd() - Assign or deassign a KVM irqfd in @vm.
+ *
+ * @fd: eventfd backing the irqfd
+ * @gsi: irqchip pin toggled by this eventfd
+ * @assign: true to assign the irqfd, false to deassign it
+ */
+static void set_irqfd(struct kvm_vm *vm, int fd, uint32_t gsi, bool assign)
+{
+	struct kvm_irqfd ifd = {
+		.fd = fd,
+		.gsi = gsi,
+		.flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
+		.resamplefd = 0,
+	};
+
+	vm_ioctl(vm, KVM_IRQFD, &ifd);
+}
+
+static void setup_interrupt_count_per_vector(struct kvm_vm *vm)
+{
+	vm_vaddr_t gva;
+	int npages;
+
+	npages = round_up(sizeof(*interrupt_count_per_vector), PAGE_SIZE);
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+				    INTERRUPT_COUNT_GPA,
+				    INTERRUPT_COUNT_MEMSLOT, npages, 0);
+	vm->memslots[MEM_REGION_TDX_SHARED_DATA] = INTERRUPT_COUNT_MEMSLOT;
+
+	gva = vm_vaddr_alloc_shared(vm, sizeof(*interrupt_count_per_vector),
+				    KVM_UTIL_MIN_VADDR,
+				    MEM_REGION_TDX_SHARED_DATA);
+
+	interrupt_count_per_vector = addr_gva2hva(vm, gva);
+	memset(interrupt_count_per_vector, 0,
+	       sizeof(*interrupt_count_per_vector));
+
+	write_guest_global(vm, interrupt_count_per_vector,
+			   (uint32_t(*)[NUM_INTERRUPTS])gva);
+}
+
+static void handle_vcpu_exit(struct kvm_vcpu *vcpu)
+{
+	struct ucall uc;
+
+	switch (get_ucall(vcpu, &uc)) {
+	case UCALL_SYNC:
+		break;
+	case UCALL_ABORT:
+		REPORT_GUEST_ASSERT(uc);
+	default:
+		TEST_FAIL("Unexpected exit: %s",
+			  exit_reason_str(vcpu->run->exit_reason));
+	}
+}
+
+static void map_gsis_to_vectors(struct kvm_vm *vm, struct kvm_vcpu *vcpu,
+				int *eventfds)
+{
+	struct kvm_irq_routing *table;
+	uint32_t vector_and_gsi;
+	int efd;
+
+	/* Flush any existing routing table first. */
+	table = kvm_gsi_routing_create();
+	kvm_gsi_routing_write(vm, table);
+
+	/* Writing frees table, so we have to create another one. */
+	table = kvm_gsi_routing_create();
+
+	/* Map vectors to GSIs 1:1. */
+	for (vector_and_gsi = 32; vector_and_gsi < NUM_INTERRUPTS;
+	     ++vector_and_gsi) {
+		gsi_route_add(table, vector_and_gsi,
+			      /*use_x2apic_format=*/true,
+			      /*dest_id=*/vcpu->id,
+			      /*vector=*/vector_and_gsi,
+			      /*msi_redir_hint=*/false,
+			      /*dest_mode=*/false,
+			      /*trig_mode=*/false,
+			      /*delivery_mode=*/0b000);
+
+		efd = eventfd(0, EFD_NONBLOCK);
+		set_irqfd(vm, efd, vector_and_gsi, true);
+
+		eventfds[vector_and_gsi] = efd;
+	}
+
+	/* Configure KVM. Writing frees table. */
+	kvm_gsi_routing_write(vm, table);
+}
+
+int main(int argc, char *argv[])
+{
+	int eventfds[NUM_INTERRUPTS] = { 0 };
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	int vector, migration;
+
+	TEST_REQUIRE(kvm_check_cap(KVM_CAP_SPLIT_IRQCHIP));
+
+	setbuf(stdout, NULL);
+
+	vm = td_create();
+	td_initialize(vm, VM_MEM_SRC_ANONYMOUS, 0);
+
+	vcpu = td_vcpu_add(vm, 0, guest_code);
+
+	for (vector = 0; vector < NUM_INTERRUPTS; ++vector) {
+		vm_install_exception_handler(vm, vector,
+					     interrupt_handler_increment_count);
+	}
+
+	setup_interrupt_count_per_vector(vm);
+
+	td_finalize(vm);
+
+	map_gsis_to_vectors(vm, vcpu, eventfds);
+
+	tdx_run(vcpu);
+	handle_vcpu_exit(vcpu);
+
+	for (migration = 0; migration < MIGRATION_LOOPS; ++migration) {
+		struct kvm_vcpu *next_vcpu;
+		struct kvm_vm *next_vm;
+
+		next_vm = td_create();
+		tdx_enable_capabilities(next_vm);
+		next_vcpu = vm_vcpu_recreate(next_vm, 0);
+
+		/* Inject on source VM. */
+		for (vector = 32; vector < NUM_INTERRUPTS; ++vector)
+			TEST_ASSERT_EQ(eventfd_write(eventfds[vector], 1), 0);
+
+		map_gsis_to_vectors(next_vm, next_vcpu, eventfds);
+
+		vcpu = next_vcpu;
+
+		tdx_migrate_from(next_vm, vm);
+		kvm_vm_free(vm);
+		vm = next_vm;
+
+		tdx_run(vcpu);
+		handle_vcpu_exit(vcpu);
+
+		for (vector = 32; vector < NUM_INTERRUPTS; ++vector)
+			TEST_ASSERT_EQ((*interrupt_count_per_vector)[vector],
+				       migration + 1);
+	}
+
+	kvm_vm_free(vm);
+	for (vector = 32; vector < NUM_INTERRUPTS; ++vector)
+		close(eventfds[vector]);
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86/tdx_migrate_tests.c b/tools/testing/selftests/kvm/x86/tdx_migrate_tests.c
index e15da2aa0437..498e42f37697 100644
--- a/tools/testing/selftests/kvm/x86/tdx_migrate_tests.c
+++ b/tools/testing/selftests/kvm/x86/tdx_migrate_tests.c
@@ -10,27 +10,6 @@
 #define NR_MIGRATE_TEST_VMS 10
 #define TDX_IOEXIT_TEST_PORT 0x50
-static int __tdx_migrate_from(int dst_fd, int src_fd) -{ - struct kvm_enable_cap cap = { - .cap = KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM, - .args = { src_fd } - }; - - return ioctl(dst_fd, KVM_ENABLE_CAP, &cap); -} - - -static void tdx_migrate_from(struct kvm_vm *dst_vm, struct kvm_vm *src_vm) -{ - int ret; - - vm_migrate_mem_regions(dst_vm, src_vm); - ret = __tdx_migrate_from(dst_vm->fd, src_vm->fd); - TEST_ASSERT(!ret, "Migration failed, ret: %d, errno: %d\n", ret, errno); - src_vm->enc_migrated = true; -} - void guest_code(void) { int ret;