From 27592ae8dbe41033261b6fdf27d78998aabd2665 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 16 Nov 2021 16:03:57 +0000 Subject: KVM: Move wiping of the kvm->vcpus array to common code All architectures have similar loops iterating over the vcpus, freeing one vcpu at a time, and eventually wiping the reference off the vcpus array. They are also inconsistently taking the kvm->lock mutex when wiping the references from the array. Make this code common, which will simplify further changes. The locking is dropped altogether, as this should only be called when there are no further references on the kvm structure. Reviewed-by: Claudio Imbrenda Signed-off-by: Marc Zyngier Message-Id: <20211116160403.4074052-2-maz@kernel.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e0aa4dd53c7f..0e6d11a726cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11423,15 +11423,8 @@ static void kvm_free_vcpus(struct kvm *kvm) kvm_clear_async_pf_completion_queue(vcpu); kvm_unload_vcpu_mmu(vcpu); } - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vcpu_destroy(vcpu); - - mutex_lock(&kvm->lock); - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) - kvm->vcpus[i] = NULL; - atomic_set(&kvm->online_vcpus, 0); - mutex_unlock(&kvm->lock); + kvm_destroy_vcpus(kvm); } void kvm_arch_sync_events(struct kvm *kvm) -- cgit v1.2.1 From 46808a4cb89708c2e5b264eb9d1035762581921b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 16 Nov 2021 16:04:02 +0000 Subject: KVM: Use 'unsigned long' as kvm_for_each_vcpu()'s index Everywhere we use kvm_for_each_vcpu(), we use an int as the vcpu index. Unfortunately, we're about to rework the iterator, which requires this to be upgraded to an unsigned long. Let's bite the bullet and repaint all of it in one go.
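For illustration only (not part of the patch itself): after this change, every iterator site takes the shape shown in the hunks below, with the index declared as an unsigned long rather than an int:

	unsigned long i;
	struct kvm_vcpu *vcpu;

	/* the index type must match what the reworked iterator expects */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_vcpu_kick(vcpu);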
Signed-off-by: Marc Zyngier Message-Id: <20211116160403.4074052-7-maz@kernel.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 7 ++++--- arch/x86/kvm/i8254.c | 2 +- arch/x86/kvm/i8259.c | 5 +++-- arch/x86/kvm/ioapic.c | 4 ++-- arch/x86/kvm/irq_comm.c | 7 ++++--- arch/x86/kvm/kvm_onhyperv.c | 3 ++- arch/x86/kvm/lapic.c | 6 +++--- arch/x86/kvm/svm/avic.c | 2 +- arch/x86/kvm/svm/sev.c | 9 +++++---- arch/x86/kvm/x86.c | 23 ++++++++++++----------- 10 files changed, 37 insertions(+), 31 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 5e19e6e4c2ce..7179fa645eda 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -164,7 +164,7 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu = NULL; - int i; + unsigned long i; if (vpidx >= KVM_MAX_VCPUS) return NULL; @@ -1716,7 +1716,8 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask( { struct kvm_hv *hv = to_kvm_hv(kvm); struct kvm_vcpu *vcpu; - int i, bank, sbank = 0; + int bank, sbank = 0; + unsigned long i; memset(vp_bitmap, 0, KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap)); @@ -1863,7 +1864,7 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, .vector = vector }; struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 5a69cce4d72d..0b65a764ed3a 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -242,7 +242,7 @@ static void pit_do_work(struct kthread_work *work) struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); struct kvm *kvm = pit->kvm; struct kvm_vcpu *vcpu; - int i; + unsigned long i; struct kvm_kpit_state *ps = &pit->pit_state; if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0)) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 0b80263d46d8..814064d06016 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -50,7 +50,7 @@ static void pic_unlock(struct kvm_pic *s) { bool wakeup = s->wakeup_needed; struct kvm_vcpu *vcpu; - int i; + unsigned long i; s->wakeup_needed = false; @@ -270,7 +270,8 @@ int kvm_pic_read_irq(struct kvm *kvm) static void kvm_pic_reset(struct kvm_kpic_state *s) { - int irq, i; + int irq; + unsigned long i; struct kvm_vcpu *vcpu; u8 edge_irr = s->irr & ~s->elcr; bool found = false; diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 816a82515dcd..decfa36b7891 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -149,7 +149,7 @@ void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; if (RTC_GSI >= IOAPIC_NUM_PINS) return; @@ -184,7 +184,7 @@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic) static void ioapic_lazy_update_eoi(struct kvm_ioapic *ioapic, int irq) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq]; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index d5b72a08e566..39ad02d6dc63 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -45,9 +45,9 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map 
*dest_map) { - int i, r = -1; + int r = -1; struct kvm_vcpu *vcpu, *lowest = NULL; - unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; unsigned int dest_vcpus = 0; if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) @@ -320,7 +320,8 @@ int kvm_set_routing_entry(struct kvm *kvm, bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu) { - int i, r = 0; + int r = 0; + unsigned long i; struct kvm_vcpu *vcpu; if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c index c7db2df50a7a..b469f45e3fe4 100644 --- a/arch/x86/kvm/kvm_onhyperv.c +++ b/arch/x86/kvm/kvm_onhyperv.c @@ -33,7 +33,8 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm, { struct kvm_arch *kvm_arch = &kvm->arch; struct kvm_vcpu *vcpu; - int ret = 0, i, nr_unique_valid_roots; + int ret = 0, nr_unique_valid_roots; + unsigned long i; hpa_t root; spin_lock(&kvm_arch->hv_root_tdp_lock); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index f206fc35deff..451e80306b51 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -185,7 +185,7 @@ void kvm_recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; - int i; + unsigned long i; u32 max_id = 255; /* enough space for any xAPIC ID */ /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */ @@ -1172,8 +1172,8 @@ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_lapic *src = NULL; struct kvm_apic_map *map; struct kvm_vcpu *vcpu; - unsigned long bitmap; - int i, vcpu_idx; + unsigned long bitmap, i; + int vcpu_idx; bool ret; rcu_read_lock(); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 8f9af7b7dbbe..b7200595cbd4 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -293,7 +293,7 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, u32 icrl, u32 icrh) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { bool m = kvm_apic_match_dest(vcpu, source, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7656a2c5662a..322553322202 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -636,7 +636,8 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_vcpu *vcpu; - int i, ret; + unsigned long i; + int ret; if (!sev_es_guest(kvm)) return -ENOTTY; @@ -1593,7 +1594,7 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) static int sev_lock_vcpus_for_migration(struct kvm *kvm) { struct kvm_vcpu *vcpu; - int i, j; + unsigned long i, j; kvm_for_each_vcpu(i, vcpu, kvm) { if (mutex_lock_killable(&vcpu->mutex)) @@ -1615,7 +1616,7 @@ out_unlock: static void sev_unlock_vcpus_for_migration(struct kvm *kvm) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { mutex_unlock(&vcpu->mutex); @@ -1642,7 +1643,7 @@ static void sev_migrate_from(struct kvm_sev_info *dst, static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) { - int i; + unsigned long i; struct kvm_vcpu *dst_vcpu, *src_vcpu; struct vcpu_svm *dst_svm, *src_svm; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0e6d11a726cd..96bcf2035bdc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2816,7 +2816,7 @@ static void 
kvm_end_pvclock_update(struct kvm *kvm) { struct kvm_arch *ka = &kvm->arch; struct kvm_vcpu *vcpu; - int i; + unsigned long i; write_seqcount_end(&ka->pvclock_sc); raw_spin_unlock_irq(&ka->tsc_write_lock); @@ -3065,7 +3065,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) static void kvmclock_update_fn(struct work_struct *work) { - int i; + unsigned long i; struct delayed_work *dwork = to_delayed_work(work); struct kvm_arch *ka = container_of(dwork, struct kvm_arch, kvmclock_update_work); @@ -5692,7 +5692,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) * VM-Exit. */ struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) kvm_vcpu_kick(vcpu); @@ -5961,7 +5961,8 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) static int kvm_arch_suspend_notifier(struct kvm *kvm) { struct kvm_vcpu *vcpu; - int i, ret = 0; + unsigned long i; + int ret = 0; mutex_lock(&kvm->lock); kvm_for_each_vcpu(i, vcpu, kvm) { @@ -8388,7 +8389,8 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu) { struct kvm *kvm; struct kvm_vcpu *vcpu; - int i, send_ipi = 0; + int send_ipi = 0; + unsigned long i; /* * We allow guests to temporarily run on slowing clocks, @@ -8561,9 +8563,8 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { static void pvclock_gtod_update_fn(struct work_struct *work) { struct kvm *kvm; - struct kvm_vcpu *vcpu; - int i; + unsigned long i; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) @@ -10672,7 +10673,7 @@ static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm) { bool inhibit = false; struct kvm_vcpu *vcpu; - int i; + unsigned long i; down_write(&kvm->arch.apicv_update_lock); @@ -11160,7 +11161,7 @@ int kvm_arch_hardware_enable(void) { struct kvm *kvm; struct kvm_vcpu *vcpu; - int i; + unsigned long i; int ret; u64 local_tsc; u64 max_tsc = 0; @@ -11413,7 +11414,7 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) static void kvm_free_vcpus(struct kvm *kvm) { - unsigned int i; + unsigned long i; struct kvm_vcpu *vcpu; /* @@ -11659,7 +11660,7 @@ out_free: void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; /* * memslots->generation has been incremented. -- cgit v1.2.1 From 537a17b3149300987456e8949ccb991e604047d6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 6 Dec 2021 20:54:11 +0100 Subject: KVM: Let/force architectures to deal with arch specific memslot data Pass the "old" slot to kvm_arch_prepare_memory_region() and force arch code to handle propagating arch specific data from "new" to "old" when necessary. This is a baby step towards dynamically allocating "new" from the get go, and is a (very) minor performance boost on x86 due to not unnecessarily copying arch data. For PPC HV, copy the rmap in the !CREATE and !DELETE paths, i.e. for MOVE and FLAGS_ONLY. This is functionally a nop as the previous behavior would overwrite the pointer for CREATE, and eventually discard/ignore it for DELETE. For x86, copy the arch data only for FLAGS_ONLY changes. Unlike PPC HV, x86 needs to reallocate arch data in the MOVE case as the size of x86's allocations depend on the alignment of the memslot's gfn. Opportunistically tweak kvm_arch_prepare_memory_region()'s param order to match the "commit" prototype. Signed-off-by: Sean Christopherson Reviewed-by: Maciej S. Szmigiero [mss: add missing RISCV kvm_arch_prepare_memory_region() change] Signed-off-by: Maciej S. 
Szmigiero Message-Id: <67dea5f11bbcfd71e3da5986f11e87f5dd4013f9.1638817639.git.maciej.szmigiero@oracle.com> --- arch/x86/kvm/x86.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 96bcf2035bdc..287ff4e43a13 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11674,13 +11674,20 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) } int kvm_arch_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - const struct kvm_userspace_memory_region *mem, - enum kvm_mr_change change) + const struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + struct kvm_memory_slot *new, + enum kvm_mr_change change) { if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) - return kvm_alloc_memslot_metadata(kvm, memslot, + return kvm_alloc_memslot_metadata(kvm, new, mem->memory_size >> PAGE_SHIFT); + + if (change == KVM_MR_FLAGS_ONLY) + memcpy(&new->arch, &old->arch, sizeof(old->arch)); + else if (WARN_ON_ONCE(change != KVM_MR_DELETE)) + return -EIO; + return 0; } -- cgit v1.2.1 From 9d7d18ee3f48903f7b9bbf6305d690078c67271b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 6 Dec 2021 20:54:16 +0100 Subject: KVM: x86: Use "new" memslot instead of userspace memory region Get the number of pages directly from the new memslot instead of computing the same from the userspace memory region when allocating memslot metadata. This will allow a future patch to drop @mem. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maciej S. Szmigiero Signed-off-by: Maciej S. Szmigiero Message-Id: --- arch/x86/kvm/x86.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 287ff4e43a13..2a7567adb799 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11593,9 +11593,9 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages) } static int kvm_alloc_memslot_metadata(struct kvm *kvm, - struct kvm_memory_slot *slot, - unsigned long npages) + struct kvm_memory_slot *slot) { + unsigned long npages = slot->npages; int i, r; /* @@ -11680,8 +11680,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, enum kvm_mr_change change) { if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) - return kvm_alloc_memslot_metadata(kvm, new, - mem->memory_size >> PAGE_SHIFT); + return kvm_alloc_memslot_metadata(kvm, new); if (change == KVM_MR_FLAGS_ONLY) memcpy(&new->arch, &old->arch, sizeof(old->arch)); -- cgit v1.2.1 From 6a99c6e3f52a6f0d4c6ebcfa7359c718a19ffbe6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 6 Dec 2021 20:54:18 +0100 Subject: KVM: Stop passing kvm_userspace_memory_region to arch memslot hooks Drop the @mem param from kvm_arch_{prepare,commit}_memory_region() now that its use has been removed in all architectures. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maciej S. Szmigiero Signed-off-by: Maciej S. 
Szmigiero Message-Id: --- arch/x86/kvm/x86.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a7567adb799..f862c514c2c0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11674,7 +11674,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) } int kvm_arch_prepare_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change) @@ -11778,7 +11777,6 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, } void kvm_arch_commit_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem, struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) -- cgit v1.2.1 From 77aedf26fe5d2795cd6aa1a75a8dd62dbac503e6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 6 Dec 2021 20:54:20 +0100 Subject: KVM: x86: Don't assume old/new memslots are non-NULL at memslot commit Play nice with a NULL @old or @new when handling memslot updates so that common KVM can pass NULL for one or the other in CREATE and DELETE cases instead of having to synthesize a dummy memslot. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maciej S. Szmigiero Signed-off-by: Maciej S. Szmigiero Message-Id: <2eb7788adbdc2bc9a9c5f86844dd8ee5c8428732.1638817640.git.maciej.szmigiero@oracle.com> --- arch/x86/kvm/x86.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f862c514c2c0..aaf89c001c5c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11709,13 +11709,15 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { - bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES; + u32 old_flags = old ? old->flags : 0; + u32 new_flags = new ? new->flags : 0; + bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES; /* * Update CPU dirty logging if dirty logging is being toggled. This * applies to all operations. */ - if ((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES) + if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages); /* @@ -11733,7 +11735,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * MOVE/DELETE: The old mappings will already have been cleaned up by * kvm_arch_flush_shadow_memslot(). */ - if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY)) + if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY)) return; /* @@ -11741,7 +11743,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty * logging isn't being toggled on or off. */ - if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES))) + if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES))) return; if (!log_dirty_pages) { -- cgit v1.2.1 From e0c2b6338ac8ca30d438157dc45396c3c1148563 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Mon, 6 Dec 2021 20:54:23 +0100 Subject: KVM: x86: Don't call kvm_mmu_change_mmu_pages() if the count hasn't changed There is no point in calling kvm_mmu_change_mmu_pages() for memslot operations that don't change the total page count, so do it just for KVM_MR_CREATE and KVM_MR_DELETE. Signed-off-by: Maciej S. 
Szmigiero Reviewed-by: Sean Christopherson Message-Id: <9e56b7616a11f5654e4ab486b3237366b7ba9f2a.1638817640.git.maciej.szmigiero@oracle.com> --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aaf89c001c5c..29a030a3c797 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11783,7 +11783,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { - if (!kvm->arch.n_requested_mmu_pages) + if (!kvm->arch.n_requested_mmu_pages && + (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) kvm_mmu_change_mmu_pages(kvm, kvm_mmu_calculate_default_mmu_pages(kvm)); -- cgit v1.2.1 From f5756029eef501bcd39ecd844968e4fb3055c1bd Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Mon, 6 Dec 2021 20:54:24 +0100 Subject: KVM: x86: Use nr_memslot_pages to avoid traversing the memslots array There is no point in recalculating from scratch the total number of pages in all memslots each time a memslot is created or deleted. Use KVM's cached nr_memslot_pages to compute the default max number of MMU pages. Note that even with nr_memslot_pages capped at ULONG_MAX we can't safely multiply it by KVM_PERMILLE_MMU_PAGES (20) since this operation can possibly overflow an unsigned long variable. Write this "* 20 / 1000" operation as "/ 50" instead to avoid such overflow. Signed-off-by: Maciej S. Szmigiero [sean: use common KVM field and rework changelog accordingly] Reviewed-by: Sean Christopherson Message-Id: --- arch/x86/kvm/mmu/mmu.c | 24 ------------------------ arch/x86/kvm/x86.c | 10 +++++++--- 2 files changed, 7 insertions(+), 27 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e2e1d012df22..e41cf095f2d1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6150,30 +6150,6 @@ out: return ret; } -/* - * Calculate mmu pages needed for kvm. - */ -unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) -{ - unsigned long nr_mmu_pages; - unsigned long nr_pages = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int i; - - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - - kvm_for_each_memslot(memslot, slots) - nr_pages += memslot->npages; - } - - nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; - nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES); - - return nr_mmu_pages; -} - void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 29a030a3c797..73e0e40c94b6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11784,9 +11784,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, enum kvm_mr_change change) { if (!kvm->arch.n_requested_mmu_pages && - (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) - kvm_mmu_change_mmu_pages(kvm, - kvm_mmu_calculate_default_mmu_pages(kvm)); + (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) { + unsigned long nr_mmu_pages; + + nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO; + nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES); + kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); + } kvm_mmu_slot_apply_flags(kvm, old, new, change); -- cgit v1.2.1 From ed922739c9199bf515a3e7fec3e319ce1edeef2a Mon Sep 17 00:00:00 2001 From: "Maciej S. 
Szmigiero" Date: Mon, 6 Dec 2021 20:54:28 +0100 Subject: KVM: Use interval tree to do fast hva lookup in memslots The current memslots implementation only allows quick binary search by gfn, quick lookup by hva is not possible - the implementation has to do a linear scan of the whole memslots array, even though the operation being performed might apply just to a single memslot. This significantly hurts performance of per-hva operations with higher memslot counts. Since hva ranges can overlap between memslots an interval tree is needed for tracking them. [sean: handle interval tree updates in kvm_replace_memslot()] Signed-off-by: Maciej S. Szmigiero Message-Id: --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 619186138176..7618bef0a4a9 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -43,6 +43,7 @@ config KVM select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO select SRCU + select INTERVAL_TREE select HAVE_KVM_PM_NOTIFIER if PM help Support hosting fully virtualized guest machines using hardware -- cgit v1.2.1 From a54d806688fe1e482350ce759a8a0fc9ebf814b0 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Mon, 6 Dec 2021 20:54:30 +0100 Subject: KVM: Keep memslots in tree-based structures instead of array-based ones The current memslot code uses a (reverse gfn-ordered) memslot array for keeping track of them. Because the memslot array that is currently in use cannot be modified every memslot management operation (create, delete, move, change flags) has to make a copy of the whole array so it has a scratch copy to work on. Strictly speaking, however, it is only necessary to make copy of the memslot that is being modified, copying all the memslots currently present is just a limitation of the array-based memslot implementation. Two memslot sets, however, are still needed so the VM continues to run on the currently active set while the requested operation is being performed on the second, currently inactive one. In order to have two memslot sets, but only one copy of actual memslots it is necessary to split out the memslot data from the memslot sets. The memslots themselves should be also kept independent of each other so they can be individually added or deleted. These two memslot sets should normally point to the same set of memslots. They can, however, be desynchronized when performing a memslot management operation by replacing the memslot to be modified by its copy. After the operation is complete, both memslot sets once again point to the same, common set of memslot data. This commit implements the aforementioned idea. For tracking of gfns an ordinary rbtree is used since memslots cannot overlap in the guest address space and so this data structure is sufficient for ensuring that lookups are done quickly. The "last used slot" mini-caches (both per-slot set one and per-vCPU one), that keep track of the last found-by-gfn memslot, are still present in the new code. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Maciej S. 
Szmigiero Message-Id: <17c0cf3663b760a0d3753d4ac08c0753e941b811.1638817641.git.maciej.szmigiero@oracle.com> --- arch/x86/kvm/debugfs.c | 6 +++--- arch/x86/kvm/mmu/mmu.c | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 54a83a744538..543a8c04025c 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -107,9 +107,10 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v) write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + int bkt; + slots = __kvm_memslots(kvm, i); - for (j = 0; j < slots->used_slots; j++) { - slot = &slots->memslots[j]; + kvm_for_each_memslot(slot, bkt, slots) for (k = 0; k < KVM_NR_PAGE_SIZES; k++) { rmap = slot->arch.rmap[k]; lpage_size = kvm_mmu_slot_lpages(slot, k + 1); @@ -121,7 +122,6 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v) cur[index]++; } } - } } write_unlock(&kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e41cf095f2d1..c61430994d19 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3409,7 +3409,7 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; - int r = 0, i; + int r = 0, i, bkt; /* * Check if this is the first shadow root being allocated before @@ -3434,7 +3434,7 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm) for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(slot, slots) { + kvm_for_each_memslot(slot, bkt, slots) { /* * Both of these functions are no-ops if the target is * already allocated, so unconditionally calling both @@ -5730,14 +5730,14 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) struct kvm_memslots *slots; bool flush = false; gfn_t start, end; - int i; + int i, bkt; if (!kvm_memslots_have_rmaps(kvm)) return flush; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) { + kvm_for_each_memslot(memslot, bkt, slots) { start = max(gfn_start, memslot->base_gfn); end = min(gfn_end, memslot->base_gfn + memslot->npages); if (start >= end) -- cgit v1.2.1 From f4209439b522432d140d33393d4a3f12e695527b Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Mon, 6 Dec 2021 20:54:32 +0100 Subject: KVM: Optimize gfn lookup in kvm_zap_gfn_range() Introduce a memslots gfn upper bound operation and use it to optimize kvm_zap_gfn_range(). This way this handler can do a quick lookup for intersecting gfns and won't have to do a linear scan of the whole memslot set. Signed-off-by: Maciej S. 
Szmigiero Message-Id: --- arch/x86/kvm/mmu/mmu.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c61430994d19..b83ae4804176 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5728,19 +5728,22 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { const struct kvm_memory_slot *memslot; struct kvm_memslots *slots; + struct kvm_memslot_iter iter; bool flush = false; gfn_t start, end; - int i, bkt; + int i; if (!kvm_memslots_have_rmaps(kvm)) return flush; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, bkt, slots) { + + kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) { + memslot = iter.slot; start = max(gfn_start, memslot->base_gfn); end = min(gfn_end, memslot->base_gfn + memslot->npages); - if (start >= end) + if (WARN_ON_ONCE(start >= end)) continue; flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, @@ -5761,6 +5764,9 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) bool flush; int i; + if (WARN_ON_ONCE(gfn_end <= gfn_start)) + return; + write_lock(&kvm->mmu_lock); kvm_inc_notifier_count(kvm, gfn_start, gfn_end); -- cgit v1.2.1 From 907afa48e9d0f24713a34135428d981e4239a3be Mon Sep 17 00:00:00 2001 From: Emanuele Giuseppe Esposito Date: Wed, 3 Nov 2021 10:05:21 -0400 Subject: KVM: nSVM: move nested_vmcb_check_cr3_cr4 logic in nested_vmcb_valid_sregs Inline nested_vmcb_check_cr3_cr4 as it is not called by anyone else. Doing so simplifies next patches. Signed-off-by: Emanuele Giuseppe Esposito Reviewed-by: Maxim Levitsky Message-Id: <20211103140527.752797-2-eesposit@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index f8b7bc04b3e7..946c06a25d37 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -275,27 +275,6 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, return true; } -static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu, - struct vmcb_save_area *save) -{ - /* - * These checks are also performed by KVM_SET_SREGS, - * except that EFER.LMA is not checked by SVM against - * CR0.PG && EFER.LME. - */ - if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) { - if (CC(!(save->cr4 & X86_CR4_PAE)) || - CC(!(save->cr0 & X86_CR0_PE)) || - CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3))) - return false; - } - - if (CC(!kvm_is_valid_cr4(vcpu, save->cr4))) - return false; - - return true; -} - /* Common checks that apply to both L1 and L2 state. */ static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, struct vmcb_save_area *save) @@ -317,7 +296,19 @@ static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7))) return false; - if (!nested_vmcb_check_cr3_cr4(vcpu, save)) + /* + * These checks are also performed by KVM_SET_SREGS, + * except that EFER.LMA is not checked by SVM against + * CR0.PG && EFER.LME. 
+ */ + if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) { + if (CC(!(save->cr4 & X86_CR4_PAE)) || + CC(!(save->cr0 & X86_CR0_PE)) || + CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3))) + return false; + } + + if (CC(!kvm_is_valid_cr4(vcpu, save->cr4))) return false; if (CC(!kvm_valid_efer(vcpu, save->efer))) -- cgit v1.2.1 From f2740a8d851a57068c9f3624c6dc8edcf91754b2 Mon Sep 17 00:00:00 2001 From: Emanuele Giuseppe Esposito Date: Wed, 3 Nov 2021 10:05:22 -0400 Subject: KVM: nSVM: introduce svm->nested.save to cache save area before checks This is useful in the next patch, to keep a saved copy of vmcb12 registers and pass it around more easily. Instead of blindly copying everything, we just copy EFER, CR0, CR3, CR4, DR6 and DR7 which are needed by the VMRUN checks. If more fields will need to be checked, it will be quite obvious to see that they must be added in struct vmcb_save_area_cached and in nested_copy_vmcb_save_to_cache(). __nested_copy_vmcb_save_to_cache() takes a vmcb_save_area_cached parameter, which is useful in order to save the state to a local variable. Signed-off-by: Emanuele Giuseppe Esposito Message-Id: <20211103140527.752797-3-eesposit@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 23 +++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 1 + arch/x86/kvm/svm/svm.h | 17 +++++++++++++++++ 3 files changed, 41 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 946c06a25d37..ceafe40ec0f9 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -328,6 +328,28 @@ void nested_load_control_from_vmcb12(struct vcpu_svm *svm, svm->nested.ctl.iopm_base_pa &= ~0x0fffULL; } +static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to, + struct vmcb_save_area *from) +{ + /* + * Copy only fields that are validated, as we need them + * to avoid TOC/TOU races. + */ + to->efer = from->efer; + to->cr0 = from->cr0; + to->cr3 = from->cr3; + to->cr4 = from->cr4; + + to->dr6 = from->dr6; + to->dr7 = from->dr7; +} + +void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, + struct vmcb_save_area *save) +{ + __nested_copy_vmcb_save_to_cache(&svm->nested.save, save); +} + /* * Synchronize fields that are written by the processor, so that * they can be copied back into the vmcb12. 
@@ -670,6 +692,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) return -EINVAL; nested_load_control_from_vmcb12(svm, &vmcb12->control); + nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) || !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d0f68d11ec70..20d19162a7f8 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4435,6 +4435,7 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) vmcb12 = map.hva; nested_load_control_from_vmcb12(svm, &vmcb12->control); + nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); unmap_save: diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 1c7306c370fa..4c7365582a5e 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -105,6 +105,15 @@ struct kvm_vmcb_info { uint64_t asid_generation; }; +struct vmcb_save_area_cached { + u64 efer; + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; +}; + struct svm_nested_state { struct kvm_vmcb_info vmcb02; u64 hsave_msr; @@ -122,6 +131,12 @@ struct svm_nested_state { /* cache for control fields of the guest */ struct vmcb_control_area ctl; + /* + * Note: this struct is not kept up-to-date while L2 runs; it is only + * valid within nested_svm_vmrun. + */ + struct vmcb_save_area_cached save; + bool initialized; }; @@ -496,6 +511,8 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu); void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier); void nested_load_control_from_vmcb12(struct vcpu_svm *svm, struct vmcb_control_area *control); +void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, + struct vmcb_save_area *save); void nested_sync_control_from_vmcb02(struct vcpu_svm *svm); void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm); void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb); -- cgit v1.2.1 From 7907160dbf1a1063b19201c79566069ec0da054c Mon Sep 17 00:00:00 2001 From: Emanuele Giuseppe Esposito Date: Wed, 3 Nov 2021 10:05:23 -0400 Subject: KVM: nSVM: rename nested_load_control_from_vmcb12 in nested_copy_vmcb_control_to_cache Following the same naming convention of the previous patch, rename nested_load_control_from_vmcb12. In addition, inline copy_vmcb_control_area as it is only called by this function. __nested_copy_vmcb_control_to_cache() works with vmcb_control_area parameters and it will be useful in next patches, when we use local variables instead of svm cached state. 
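A hypothetical caller sketch (the local variable name is invented for illustration) of the usage the message anticipates; at this point in the series both parameters are still struct vmcb_control_area:

	struct vmcb_control_area local_ctl;

	/* cache into a stack variable instead of svm->nested.ctl */
	__nested_copy_vmcb_control_to_cache(&local_ctl, &vmcb12->control);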
Signed-off-by: Emanuele Giuseppe Esposito Message-Id: <20211103140527.752797-4-eesposit@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 80 +++++++++++++++++++++++------------------------ arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/svm/svm.h | 4 +-- 3 files changed, 43 insertions(+), 43 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index ceafe40ec0f9..d6c7030e4ac0 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -163,37 +163,6 @@ void recalc_intercepts(struct vcpu_svm *svm) vmcb_set_intercept(c, INTERCEPT_VMSAVE); } -static void copy_vmcb_control_area(struct vmcb_control_area *dst, - struct vmcb_control_area *from) -{ - unsigned int i; - - for (i = 0; i < MAX_INTERCEPT; i++) - dst->intercepts[i] = from->intercepts[i]; - - dst->iopm_base_pa = from->iopm_base_pa; - dst->msrpm_base_pa = from->msrpm_base_pa; - dst->tsc_offset = from->tsc_offset; - /* asid not copied, it is handled manually for svm->vmcb. */ - dst->tlb_ctl = from->tlb_ctl; - dst->int_ctl = from->int_ctl; - dst->int_vector = from->int_vector; - dst->int_state = from->int_state; - dst->exit_code = from->exit_code; - dst->exit_code_hi = from->exit_code_hi; - dst->exit_info_1 = from->exit_info_1; - dst->exit_info_2 = from->exit_info_2; - dst->exit_int_info = from->exit_int_info; - dst->exit_int_info_err = from->exit_int_info_err; - dst->nested_ctl = from->nested_ctl; - dst->event_inj = from->event_inj; - dst->event_inj_err = from->event_inj_err; - dst->nested_cr3 = from->nested_cr3; - dst->virt_ext = from->virt_ext; - dst->pause_filter_count = from->pause_filter_count; - dst->pause_filter_thresh = from->pause_filter_thresh; -} - static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { /* @@ -317,15 +286,46 @@ static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, return true; } -void nested_load_control_from_vmcb12(struct vcpu_svm *svm, - struct vmcb_control_area *control) +static +void __nested_copy_vmcb_control_to_cache(struct vmcb_control_area *to, + struct vmcb_control_area *from) { - copy_vmcb_control_area(&svm->nested.ctl, control); + unsigned int i; + + for (i = 0; i < MAX_INTERCEPT; i++) + to->intercepts[i] = from->intercepts[i]; + + to->iopm_base_pa = from->iopm_base_pa; + to->msrpm_base_pa = from->msrpm_base_pa; + to->tsc_offset = from->tsc_offset; + to->tlb_ctl = from->tlb_ctl; + to->int_ctl = from->int_ctl; + to->int_vector = from->int_vector; + to->int_state = from->int_state; + to->exit_code = from->exit_code; + to->exit_code_hi = from->exit_code_hi; + to->exit_info_1 = from->exit_info_1; + to->exit_info_2 = from->exit_info_2; + to->exit_int_info = from->exit_int_info; + to->exit_int_info_err = from->exit_int_info_err; + to->nested_ctl = from->nested_ctl; + to->event_inj = from->event_inj; + to->event_inj_err = from->event_inj_err; + to->nested_cr3 = from->nested_cr3; + to->virt_ext = from->virt_ext; + to->pause_filter_count = from->pause_filter_count; + to->pause_filter_thresh = from->pause_filter_thresh; + + /* Copy asid here because nested_vmcb_check_controls will check it. */ + to->asid = from->asid; + to->msrpm_base_pa &= ~0x0fffULL; + to->iopm_base_pa &= ~0x0fffULL; +} - /* Copy it here because nested_svm_check_controls will check it. 
*/ - svm->nested.ctl.asid = control->asid; - svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL; - svm->nested.ctl.iopm_base_pa &= ~0x0fffULL; +void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, + struct vmcb_control_area *control) +{ + __nested_copy_vmcb_control_to_cache(&svm->nested.ctl, control); } static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to, @@ -691,7 +691,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(!svm->nested.initialized)) return -EINVAL; - nested_load_control_from_vmcb12(svm, &vmcb12->control); + nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) || @@ -1436,7 +1436,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save); - nested_load_control_from_vmcb12(svm, ctl); + nested_copy_vmcb_control_to_cache(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); nested_vmcb02_prepare_control(svm); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 20d19162a7f8..745351a2293c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4434,7 +4434,7 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) */ vmcb12 = map.hva; - nested_load_control_from_vmcb12(svm, &vmcb12->control); + nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 4c7365582a5e..3f44ec79597b 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -509,8 +509,8 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, int nested_svm_exit_special(struct vcpu_svm *svm); void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu); void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier); -void nested_load_control_from_vmcb12(struct vcpu_svm *svm, - struct vmcb_control_area *control); +void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, + struct vmcb_control_area *control); void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, struct vmcb_save_area *save); void nested_sync_control_from_vmcb02(struct vcpu_svm *svm); -- cgit v1.2.1 From b7a3d8b6f433d293e9033b9d63651b6d17bf5654 Mon Sep 17 00:00:00 2001 From: Emanuele Giuseppe Esposito Date: Wed, 3 Nov 2021 10:05:24 -0400 Subject: KVM: nSVM: use vmcb_save_area_cached in nested_vmcb_valid_sregs() Now that struct vmcb_save_area_cached contains the required vmcb fields values (done in nested_load_save_from_vmcb12()), check them to see if they are correct in nested_vmcb_valid_sregs(). While at it, rename nested_vmcb_valid_sregs in nested_vmcb_check_save. __nested_vmcb_check_save takes the additional @save parameter, so it is helpful when we want to check a non-svm save state, like in svm_set_nested_state. The reason for that is that save is the L1 state, not L2, so we check it without moving it to svm->nested.save. 
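Condensed from the diff below, the two call sites this split enables:

	/* VMRUN path: check the fields already cached from vmcb12 */
	if (!nested_vmcb_check_save(vcpu) ||
	    !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) { ... }

	/* KVM_SET_NESTED_STATE path: check the L1 state via a stack copy,
	 * without moving it into svm->nested.save */
	__nested_copy_vmcb_save_to_cache(&save_cached, save);
	if (!__nested_vmcb_check_save(vcpu, &save_cached))
		goto out_free;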
Signed-off-by: Emanuele Giuseppe Esposito Message-Id: <20211103140527.752797-5-eesposit@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index d6c7030e4ac0..545d0ad19de4 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -245,8 +245,8 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, } /* Common checks that apply to both L1 and L2 state. */ -static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, - struct vmcb_save_area *save) +static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, + struct vmcb_save_area_cached *save) { /* * FIXME: these should be done after copying the fields, @@ -286,6 +286,14 @@ static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, return true; } +static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_save_area_cached *save = &svm->nested.save; + + return __nested_vmcb_check_save(vcpu, save); +} + static void __nested_copy_vmcb_control_to_cache(struct vmcb_control_area *to, struct vmcb_control_area *from) @@ -694,7 +702,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); - if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) || + if (!nested_vmcb_check_save(vcpu) || !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) { vmcb12->control.exit_code = SVM_EXIT_ERR; vmcb12->control.exit_code_hi = 0; @@ -1330,6 +1338,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, &user_kvm_nested_state->data.svm[0]; struct vmcb_control_area *ctl; struct vmcb_save_area *save; + struct vmcb_save_area_cached save_cached; unsigned long cr0; int ret; @@ -1397,10 +1406,11 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, * Validate host state saved from before VMRUN (see * nested_svm_check_permissions). */ + __nested_copy_vmcb_save_to_cache(&save_cached, save); if (!(save->cr0 & X86_CR0_PG) || !(save->cr0 & X86_CR0_PE) || (save->rflags & X86_EFLAGS_VM) || - !nested_vmcb_valid_sregs(vcpu, save) + !__nested_vmcb_check_save(vcpu, &save_cached)) goto out_free; /* -- cgit v1.2.1 From 355d0473b1a11d7cf526fbd43c3908224e08a909 Mon Sep 17 00:00:00 2001 From: Emanuele Giuseppe Esposito Date: Wed, 3 Nov 2021 10:05:25 -0400 Subject: KVM: nSVM: use svm->nested.save to load vmcb12 registers and avoid TOC/TOU races Use the already checked svm->nested.save cached fields (EFER, CR0, CR4, ...) instead of vmcb12's in nested_vmcb02_prepare_save(). This prevents TOC/TOU races, since the guest could modify the vmcb12 fields between the checks and their use. This also avoids the need to force-set EFER_SVME in nested_vmcb02_prepare_save.
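The essence of the change, condensed from the diff below:

	/* before: re-reads vmcb12, which the guest may have modified
	 * after the checks ran (the TOC/TOU window) */
	svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME);

	/* after: consumes the snapshot that was actually validated */
	svm_set_efer(&svm->vcpu, svm->nested.save.efer);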
Signed-off-by: Emanuele Giuseppe Esposito Reviewed-by: Maxim Levitsky Message-Id: <20211103140527.752797-6-eesposit@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 545d0ad19de4..aad09d560ec0 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -248,13 +248,6 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, struct vmcb_save_area_cached *save) { - /* - * FIXME: these should be done after copying the fields, - * to avoid TOC/TOU races. For these save area checks - * the possible damage is limited since kvm_set_cr0 and - * kvm_set_cr4 handle failure; EFER_SVME is an exception - * so it is force-set later in nested_prepare_vmcb_save. - */ if (CC(!(save->efer & EFER_SVME))) return false; @@ -511,15 +504,10 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); - /* - * Force-set EFER_SVME even though it is checked earlier on the - * VMCB12, because the guest can flip the bit between the check - * and now. Clearing EFER_SVME would call svm_free_nested. - */ - svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME); + svm_set_efer(&svm->vcpu, svm->nested.save.efer); - svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); - svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); + svm_set_cr0(&svm->vcpu, svm->nested.save.cr0); + svm_set_cr4(&svm->vcpu, svm->nested.save.cr4); svm->vcpu.arch.cr2 = vmcb12->save.cr2; @@ -534,8 +522,8 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 /* These bits will be set properly on the first execution when new_vmc12 is true */ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) { - svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1; - svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW; + svm->vmcb->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1; + svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW; vmcb_mark_dirty(svm->vmcb, VMCB_DR); } } @@ -649,7 +637,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_vmcb02_prepare_control(svm); nested_vmcb02_prepare_save(svm, vmcb12); - ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, + ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3, nested_npt_enabled(svm), from_vmrun); if (ret) return ret; -- cgit v1.2.1 From bd95926c2b2b9b66013a36b6558aa426147ed11f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Nov 2021 09:14:08 -0500 Subject: KVM: nSVM: split out __nested_vmcb_check_controls Remove the struct vmcb_control_area parameter from nested_vmcb_check_controls, for consistency with the functions that operate on the save area. This way, VMRUN uses the version without underscores for both areas, while KVM_SET_NESTED_STATE uses the version with underscores. 
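The resulting convention, condensed from the diff below:

	/* VMRUN: no underscores, implicitly checks the svm->nested caches */
	if (!nested_vmcb_check_save(vcpu) || !nested_vmcb_check_controls(vcpu))

	/* KVM_SET_NESTED_STATE: underscores, the area to check is passed in */
	if (!__nested_vmcb_check_controls(vcpu, ctl))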
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index aad09d560ec0..565d9d401f43 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -219,8 +219,8 @@ static bool nested_svm_check_tlb_ctl(struct kvm_vcpu *vcpu, u8 tlb_ctl) } } -static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, - struct vmcb_control_area *control) +static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu, + struct vmcb_control_area *control) { if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN))) return false; @@ -287,6 +287,14 @@ static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu) return __nested_vmcb_check_save(vcpu, save); } +static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *ctl = &svm->nested.ctl; + + return __nested_vmcb_check_controls(vcpu, ctl); +} + static void __nested_copy_vmcb_control_to_cache(struct vmcb_control_area *to, struct vmcb_control_area *from) @@ -691,7 +699,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); if (!nested_vmcb_check_save(vcpu) || - !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) { + !nested_vmcb_check_controls(vcpu)) { vmcb12->control.exit_code = SVM_EXIT_ERR; vmcb12->control.exit_code_hi = 0; vmcb12->control.exit_info_1 = 0; @@ -1379,7 +1387,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, goto out_free; ret = -EINVAL; - if (!nested_vmcb_check_controls(vcpu, ctl)) + if (!__nested_vmcb_check_controls(vcpu, ctl)) goto out_free; /* -- cgit v1.2.1 From 8fc78909c05d1691c0d087cb1b9a4858762c747d Mon Sep 17 00:00:00 2001 From: Emanuele Giuseppe Esposito Date: Wed, 3 Nov 2021 10:05:26 -0400 Subject: KVM: nSVM: introduce struct vmcb_ctrl_area_cached This structure will replace vmcb_control_area in svm_nested_state, providing only the fields that are actually used by the nested state. This avoids having and copying around uninitialized fields. The cost of this, however, is that all functions (in this case vmcb_is_intercept) expect the old structure, so they need to be duplicated. In addition, in svm_get_nested_state() user space expects a vmcb_control_area struct, so we need to copy back all fields in a temporary structure before copying it to userspace. 
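The userspace round-trip described above, condensed from the diff below:

	ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
	if (!ctl)
		return -ENOMEM;

	/* expand the cached form back into the ABI struct for userspace */
	nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl);
	r = copy_to_user(&user_vmcb->control, ctl, sizeof(user_vmcb->control));
	kfree(ctl);
	if (r)
		return -EFAULT;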
Signed-off-by: Emanuele Giuseppe Esposito Reviewed-by: Maxim Levitsky Message-Id: <20211103140527.752797-7-eesposit@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 83 +++++++++++++++++++++++++++++++++++++---------- arch/x86/kvm/svm/svm.c | 4 +-- arch/x86/kvm/svm/svm.h | 39 +++++++++++++++++++--- 3 files changed, 103 insertions(+), 23 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 565d9d401f43..598843cfe6c4 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -58,8 +58,9 @@ static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_excep struct vcpu_svm *svm = to_svm(vcpu); WARN_ON(!is_guest_mode(vcpu)); - if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) && - !svm->nested.nested_run_pending) { + if (vmcb12_is_intercept(&svm->nested.ctl, + INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) && + !svm->nested.nested_run_pending) { svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR; svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = fault->error_code; @@ -121,7 +122,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) void recalc_intercepts(struct vcpu_svm *svm) { - struct vmcb_control_area *c, *h, *g; + struct vmcb_control_area *c, *h; + struct vmcb_ctrl_area_cached *g; unsigned int i; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); @@ -172,7 +174,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) */ int i; - if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return true; for (i = 0; i < MSRPM_OFFSETS; i++) { @@ -220,9 +222,9 @@ static bool nested_svm_check_tlb_ctl(struct kvm_vcpu *vcpu, u8 tlb_ctl) } static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu, - struct vmcb_control_area *control) + struct vmcb_ctrl_area_cached *control) { - if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN))) + if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN))) return false; if (CC(control->asid == 0)) @@ -290,13 +292,13 @@ static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu) static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb_control_area *ctl = &svm->nested.ctl; + struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl; return __nested_vmcb_check_controls(vcpu, ctl); } static -void __nested_copy_vmcb_control_to_cache(struct vmcb_control_area *to, +void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to, struct vmcb_control_area *from) { unsigned int i; @@ -1006,7 +1008,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) u32 offset, msr, value; int write, mask; - if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return NESTED_EXIT_HOST; msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; @@ -1033,7 +1035,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) u8 start_bit; u64 gpa; - if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT))) + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT))) return NESTED_EXIT_HOST; port = svm->vmcb->control.exit_info_1 >> 16; @@ -1064,12 +1066,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm) vmexit = nested_svm_intercept_ioio(svm); break; case SVM_EXIT_READ_CR0 ... 
SVM_EXIT_WRITE_CR8: { - if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) + if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { - if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) + if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } @@ -1087,7 +1089,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) break; } default: { - if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) + if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; } } @@ -1165,7 +1167,7 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) static inline bool nested_exit_on_init(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); } static int svm_check_nested_events(struct kvm_vcpu *vcpu) @@ -1269,11 +1271,47 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu) svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); } +/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */ +static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, + struct vmcb_ctrl_area_cached *from) +{ + unsigned int i; + + memset(dst, 0, sizeof(struct vmcb_control_area)); + + for (i = 0; i < MAX_INTERCEPT; i++) + dst->intercepts[i] = from->intercepts[i]; + + dst->iopm_base_pa = from->iopm_base_pa; + dst->msrpm_base_pa = from->msrpm_base_pa; + dst->tsc_offset = from->tsc_offset; + dst->asid = from->asid; + dst->tlb_ctl = from->tlb_ctl; + dst->int_ctl = from->int_ctl; + dst->int_vector = from->int_vector; + dst->int_state = from->int_state; + dst->exit_code = from->exit_code; + dst->exit_code_hi = from->exit_code_hi; + dst->exit_info_1 = from->exit_info_1; + dst->exit_info_2 = from->exit_info_2; + dst->exit_int_info = from->exit_int_info; + dst->exit_int_info_err = from->exit_int_info_err; + dst->nested_ctl = from->nested_ctl; + dst->event_inj = from->event_inj; + dst->event_inj_err = from->event_inj_err; + dst->nested_cr3 = from->nested_cr3; + dst->virt_ext = from->virt_ext; + dst->pause_filter_count = from->pause_filter_count; + dst->pause_filter_thresh = from->pause_filter_thresh; +} + static int svm_get_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, u32 user_data_size) { struct vcpu_svm *svm; + struct vmcb_control_area *ctl; + unsigned long r; struct kvm_nested_state kvm_state = { .flags = 0, .format = KVM_STATE_NESTED_FORMAT_SVM, @@ -1315,9 +1353,18 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, */ if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE)) return -EFAULT; - if (copy_to_user(&user_vmcb->control, &svm->nested.ctl, - sizeof(user_vmcb->control))) + + ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); + if (!ctl) + return -ENOMEM; + + nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl); + r = copy_to_user(&user_vmcb->control, ctl, + sizeof(user_vmcb->control)); + kfree(ctl); + if (r) return -EFAULT; + if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save, sizeof(user_vmcb->save))) return -EFAULT; @@ -1335,6 +1382,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, struct vmcb_control_area *ctl; struct vmcb_save_area *save; struct vmcb_save_area_cached save_cached; + struct vmcb_ctrl_area_cached ctl_cached; unsigned long cr0; int ret; @@ -1387,7 +1435,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, goto out_free; ret = -EINVAL; - if 
(!__nested_vmcb_check_controls(vcpu, ctl)) + __nested_copy_vmcb_control_to_cache(&ctl_cached, ctl); + if (!__nested_vmcb_check_controls(vcpu, &ctl_cached)) goto out_free; /* diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 745351a2293c..907ba85609a2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2508,7 +2508,7 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, bool ret = false; if (!is_guest_mode(vcpu) || - (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) + (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) return false; cr0 &= ~SVM_CR0_SELECTIVE_MASK; @@ -4215,7 +4215,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, info->intercept == x86_intercept_clts) break; - if (!(vmcb_is_intercept(&svm->nested.ctl, + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))) break; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 3f44ec79597b..929bd60d754d 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -114,6 +114,31 @@ struct vmcb_save_area_cached { u64 dr6; }; +struct vmcb_ctrl_area_cached { + u32 intercepts[MAX_INTERCEPT]; + u16 pause_filter_thresh; + u16 pause_filter_count; + u64 iopm_base_pa; + u64 msrpm_base_pa; + u64 tsc_offset; + u32 asid; + u8 tlb_ctl; + u32 int_ctl; + u32 int_vector; + u32 int_state; + u32 exit_code; + u32 exit_code_hi; + u64 exit_info_1; + u64 exit_info_2; + u32 exit_int_info; + u32 exit_int_info_err; + u64 nested_ctl; + u32 event_inj; + u32 event_inj_err; + u64 nested_cr3; + u64 virt_ext; +}; + struct svm_nested_state { struct kvm_vmcb_info vmcb02; u64 hsave_msr; @@ -129,7 +154,7 @@ struct svm_nested_state { bool nested_run_pending; /* cache for control fields of the guest */ - struct vmcb_control_area ctl; + struct vmcb_ctrl_area_cached ctl; /* * Note: this struct is not kept up-to-date while L2 runs; it is only @@ -318,6 +343,12 @@ static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit) return test_bit(bit, (unsigned long *)&control->intercepts); } +static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u32 bit) +{ + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + return test_bit(bit, (unsigned long *)&control->intercepts); +} + static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; @@ -470,17 +501,17 @@ static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu) static inline bool nested_exit_on_smi(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SMI); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SMI); } static inline bool nested_exit_on_intr(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INTR); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INTR); } static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); } int enter_svm_guest_mode(struct kvm_vcpu *vcpu, -- cgit v1.2.1 From ce92ef7604ffe74da84f559f6eba8c6053250451 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 17 Nov 2021 18:08:42 -0800 Subject: KVM: x86/mmu: Use shadow page role to detect PML-unfriendly pages for L2 Rework make_spte() to query the shadow page's role, specifically whether or not it's a guest_mode page, a.k.a. a page for L2, when determining if the SPTE is compatible with PML. 
This eliminates a dependency on @vcpu, with a future goal of being able to create SPTEs without a specific vCPU. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu_internal.h | 7 +++---- arch/x86/kvm/mmu/spte.c | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 52c6527b1a06..5897ce4cdf10 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -104,7 +104,7 @@ static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) return kvm_mmu_role_as_id(sp->role); } -static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) +static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) { /* * When using the EPT page-modification log, the GPAs in the CPU dirty @@ -112,10 +112,9 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) * on write protection to record dirty pages, which bypasses PML, since * writes now result in a vmexit. Note, the check on CPU dirty logging * being enabled is mandatory as the bits used to denote WP-only SPTEs - * are reserved for NPT w/ PAE (32-bit KVM). + * are reserved for PAE paging (32-bit KVM). */ - return vcpu->arch.mmu == &vcpu->arch.guest_mmu && - kvm_x86_ops.cpu_dirty_log_size; + return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode; } int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 0c76c45fdb68..84e64dbdd89e 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -101,7 +101,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED_MASK; - else if (kvm_vcpu_ad_need_write_protect(vcpu)) + else if (kvm_mmu_page_ad_need_write_protect(sp)) spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; /* -- cgit v1.2.1 From 9d395a0a7aca75caa72a8ab11a6efc9909c5a918 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 15 Nov 2021 15:45:53 -0800 Subject: KVM: x86/mmu: Remove need for a vcpu from kvm_slot_page_track_is_active kvm_slot_page_track_is_active only uses its vCPU argument to get a pointer to the associated struct kvm, so just pass in the struct kvm to remove the need for a vCPU pointer. No functional change intended. Signed-off-by: Ben Gardon Message-Id: <20211115234603.2908381-6-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- arch/x86/kvm/mmu/page_track.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b83ae4804176..2ea6e5e1fde9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2591,7 +2591,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, * track machinery is used to write-protect upper-level shadow pages, * i.e. this guards the role.level == 4K assertion below! */ - if (kvm_slot_page_track_is_active(vcpu, slot, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, KVM_PAGE_TRACK_WRITE)) return -EPERM; /* @@ -3888,7 +3888,7 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, * guest is writing the page which is write tracked which can * not be fixed by page fault handler.
*/ - if (kvm_slot_page_track_is_active(vcpu, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE)) return true; return false; diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index cc4eb5b7fb76..35c221d5f6ce 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -173,7 +173,7 @@ EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); /* * check if the corresponding access on the specified guest page is tracked. */ -bool kvm_slot_page_track_is_active(struct kvm_vcpu *vcpu, +bool kvm_slot_page_track_is_active(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, enum kvm_page_track_mode mode) { @@ -186,7 +186,7 @@ bool kvm_slot_page_track_is_active(struct kvm_vcpu *vcpu, return false; if (mode == KVM_PAGE_TRACK_WRITE && - !kvm_page_track_write_tracking_enabled(vcpu->kvm)) + !kvm_page_track_write_tracking_enabled(kvm)) return false; index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); -- cgit v1.2.1 From 4d78d0b39ad03e7357452a669938653a379cfebd Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 15 Nov 2021 15:45:54 -0800 Subject: KVM: x86/mmu: Remove need for a vcpu from mmu_try_to_unsync_pages The vCPU argument to mmu_try_to_unsync_pages is now only used to get a pointer to the associated struct kvm, so pass in the kvm pointer from the beginning to remove the need for a vCPU when calling the function. Signed-off-by: Ben Gardon Message-Id: <20211115234603.2908381-7-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 16 ++++++++-------- arch/x86/kvm/mmu/mmu_internal.h | 2 +- arch/x86/kvm/mmu/spte.c | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2ea6e5e1fde9..29bcf26b0cb3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2565,10 +2565,10 @@ static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) return r; } -static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); - ++vcpu->kvm->stat.mmu_unsync; + ++kvm->stat.mmu_unsync; sp->unsync = 1; kvm_mmu_mark_parents_unsync(sp); @@ -2580,7 +2580,7 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must * be write-protected. */ -int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, +int mmu_try_to_unsync_pages(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, bool can_unsync, bool prefetch) { struct kvm_mmu_page *sp; @@ -2591,7 +2591,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, * track machinery is used to write-protect upper-level shadow pages, * i.e. this guards the role.level == 4K assertion below! */ - if (kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE)) return -EPERM; /* @@ -2600,7 +2600,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, * that case, KVM must complete emulation of the guest TLB flush before * allowing shadow pages to become unsync (writable by the guest). 
*/ - for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { + for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { if (!can_unsync) return -EPERM; @@ -2619,7 +2619,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, */ if (!locked) { locked = true; - spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock); + spin_lock(&kvm->arch.mmu_unsync_pages_lock); /* * Recheck after taking the spinlock, a different vCPU @@ -2634,10 +2634,10 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } WARN_ON(sp->role.level != PG_LEVEL_4K); - kvm_unsync_page(vcpu, sp); + kvm_unsync_page(kvm, sp); } if (locked) - spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock); + spin_unlock(&kvm->arch.mmu_unsync_pages_lock); /* * We need to ensure that the marking of unsync pages is visible diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 5897ce4cdf10..787b8c553b9e 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -117,7 +117,7 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode; } -int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, +int mmu_try_to_unsync_pages(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, bool can_unsync, bool prefetch); void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 84e64dbdd89e..8d3fe4311bc1 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -161,7 +161,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * e.g. it's write-tracked (upper-level SPs) or has one or more * shadow pages and unsync'ing pages is not allowed. */ - if (mmu_try_to_unsync_pages(vcpu, slot, gfn, can_unsync, prefetch)) { + if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) { pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); wrprot = true; -- cgit v1.2.1 From 8283e36abfff507c64fe8289ac30ea7ab59648aa Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 15 Nov 2021 15:45:58 -0800 Subject: KVM: x86/mmu: Propagate memslot const qualifier In preparation for implementing in-place hugepage promotion, various functions will need to be called from zap_collapsible_spte_range, which has the const qualifier on its memslot argument. Propagate the const qualifier to the various functions which will be needed. This just serves to simplify the following patch. No functional change intended. Signed-off-by: Ben Gardon Message-Id: <20211115234603.2908381-11-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/mmu_internal.h | 2 +- arch/x86/kvm/mmu/page_track.c | 4 ++-- arch/x86/kvm/mmu/spte.c | 2 +- arch/x86/kvm/mmu/spte.h | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 29bcf26b0cb3..c28cf7eeb79d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2580,7 +2580,7 @@ static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must * be write-protected. 
*/ -int mmu_try_to_unsync_pages(struct kvm *kvm, struct kvm_memory_slot *slot, +int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, bool can_unsync, bool prefetch) { struct kvm_mmu_page *sp; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 787b8c553b9e..da6166b5c377 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -117,7 +117,7 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode; } -int mmu_try_to_unsync_pages(struct kvm *kvm, struct kvm_memory_slot *slot, +int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, bool can_unsync, bool prefetch); void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 35c221d5f6ce..68eb1fb548b6 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -174,8 +174,8 @@ EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); * check if the corresponding access on the specified guest page is tracked. */ bool kvm_slot_page_track_is_active(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode) + const struct kvm_memory_slot *slot, + gfn_t gfn, enum kvm_page_track_mode mode) { int index; diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 8d3fe4311bc1..8a7b03207762 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -90,7 +90,7 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) } bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct kvm_memory_slot *slot, + const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index cc432f9a966b..a4af2a42695c 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -330,7 +330,7 @@ static inline u64 get_mmio_spte_generation(u64 spte) } bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct kvm_memory_slot *slot, + const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte); -- cgit v1.2.1 From fb43496c8362b8b379b4348b581f8f88f47cd1f8 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 15 Nov 2021 15:45:59 -0800 Subject: KVM: x86/MMU: Simplify flow of vmx_get_mt_mask Remove the gotos from vmx_get_mt_mask. It's easier to build the whole memory type at once than it is to combine separate cacheability and ipat fields. No functional change intended. Signed-off-by: Ben Gardon Message-Id: <20211115234603.2908381-12-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9453743ce0c4..01eab6d6ec22 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6947,7 +6947,6 @@ static int __init vmx_check_processor_compat(void) static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; - u64 ipat = 0; /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in * memory aliases with conflicting memory types and sometimes MCEs.
@@ -6967,30 +6966,22 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) * EPT memory type is used to emulate guest CD/MTRR. */ - if (is_mmio) { - cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } + if (is_mmio) + return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { - ipat = VMX_EPT_IPAT_BIT; - cache = MTRR_TYPE_WRBACK; - goto exit; - } + if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) + return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; if (kvm_read_cr0(vcpu) & X86_CR0_CD) { - ipat = VMX_EPT_IPAT_BIT; if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) cache = MTRR_TYPE_WRBACK; else cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } - cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); + return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; + } -exit: - return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; + return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; } static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) -- cgit v1.2.1 From 98a26b69d8c3b7b2bd51350b1a1218c518d32123 Mon Sep 17 00:00:00 2001 From: Vihas Mak Date: Sun, 14 Nov 2021 22:13:12 +0530 Subject: KVM: x86: change TLB flush indicator to bool Change 0 to false and 1 to true to fix the following cocci warnings: arch/x86/kvm/mmu/mmu.c:1485:9-10: WARNING: return of 0/1 in function 'kvm_set_pte_rmapp' with return type bool arch/x86/kvm/mmu/mmu.c:1636:10-11: WARNING: return of 0/1 in function 'kvm_test_age_rmapp' with return type bool Signed-off-by: Vihas Mak Cc: Sean Christopherson Cc: Vitaly Kuznetsov Cc: Wanpeng Li Cc: Jim Mattson Cc: Joerg Roedel Message-Id: <20211114164312.GA28736@makvihas> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c28cf7eeb79d..ede63912b1ac 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1454,7 +1454,7 @@ static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, { u64 *sptep; struct rmap_iterator iter; - int need_flush = 0; + bool need_flush = false; u64 new_spte; kvm_pfn_t new_pfn; @@ -1466,7 +1466,7 @@ restart: rmap_printk("spte %p %llx gfn %llx (%d)\n", sptep, *sptep, gfn, level); - need_flush = 1; + need_flush = true; if (pte_write(pte)) { pte_list_remove(kvm, rmap_head, sptep); @@ -1482,7 +1482,7 @@ restart: if (need_flush && kvm_available_flush_tlb_with_range()) { kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); - return 0; + return false; } return need_flush; @@ -1623,8 +1623,8 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, for_each_rmap_spte(rmap_head, &iter, sptep) if (is_accessed_spte(*sptep)) - return 1; - return 0; + return true; + return false; } #define RMAP_RECYCLE_THRESHOLD 1000 -- cgit v1.2.1 From 1831fa44df743a7cdffdf1c12c799bf6f3c12b8c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 16 Nov 2021 09:32:47 -0500 Subject: KVM: VMX: Don't unblock vCPU w/ Posted IRQ if IRQs are disabled in guest Don't configure the wakeup handler when a vCPU is blocking with IRQs disabled, in which case any IRQ, posted or otherwise, should not be recognized and thus should not wake the vCPU.
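To make the shape of the change concrete before the diff: the fix folds the new condition into the existing early-out in pi_pre_block(). The sketch below is a simplified rendering of the resulting check, not the complete function body:

	int pi_pre_block(struct kvm_vcpu *vcpu)
	{
		/*
		 * Bail if VT-d posted interrupts are unusable for this VM, or
		 * if the guest is blocking with RFLAGS.IF=0: an IRQ that the
		 * guest cannot recognize must not wake the vCPU, so arming the
		 * wakeup vector would only invite spurious wakeups.
		 */
		if (!vmx_can_use_vtd_pi(vcpu->kvm) ||
		    vmx_interrupt_blocked(vcpu))
			return 0;

		/* ... arm POSTED_INTR_WAKEUP_VECTOR as before ... */
	}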
Fixes: bf9f6ac8d749 ("KVM: Update Posted-Interrupts Descriptor when vCPU is blocked") Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 1c94783b5a54..41f946e2123e 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -147,7 +147,8 @@ int pi_pre_block(struct kvm_vcpu *vcpu) struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!vmx_can_use_vtd_pi(vcpu->kvm)) + if (!vmx_can_use_vtd_pi(vcpu->kvm) || + vmx_interrupt_blocked(vcpu)) return 0; WARN_ON(irqs_disabled()); -- cgit v1.2.1 From 91b01895071770ed0c256869d0f94d69a2fb8ecf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:11:55 -0700 Subject: KVM: SVM: Ensure target pCPU is read once when signalling AVIC doorbell Ensure vcpu->cpu is read once when signalling the AVIC doorbell. If the compiler rereads the field and the vCPU is migrated between the check and writing the doorbell, KVM would signal the wrong physical CPU. Functionally, signalling the wrong CPU in this case is not an issue as task migration means the vCPU has exited and will pick up any pending interrupts on the next VMRUN. Add the READ_ONCE() purely to clean up the code. Opportunistically add a comment explaining the task migration behavior, and rename cpuid=>cpu to avoid conflating the CPU number with KVM's more common usage of CPUID. Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index b7200595cbd4..0e5b49294086 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -675,10 +675,18 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) smp_mb__after_atomic(); if (avic_vcpu_is_running(vcpu)) { - int cpuid = vcpu->cpu; + int cpu = READ_ONCE(vcpu->cpu); - if (cpuid != get_cpu()) - wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid)); + /* + * Note, the vCPU could get migrated to a different pCPU at any + * point, which could result in signalling the wrong/previous + * pCPU. But if that happens the vCPU is guaranteed to do a + * VMRUN (after being migrated) and thus will process pending + * interrupts, i.e. a doorbell is not needed (and the spurious + * one is harmless). + */ + if (cpu != get_cpu()) + wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); put_cpu(); } else kvm_vcpu_wake_up(vcpu); -- cgit v1.2.1 From 1460179dcd76a4ae4121e2da29b586f41715dd1d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:12:05 -0700 Subject: KVM: x86: Tweak halt emulation helper names to free up kvm_vcpu_halt() Rename a variety of HLT-related helpers to free up the function name "kvm_vcpu_halt" for future use in generic KVM code, e.g. to differentiate between "block" and "halt". No functional change intended. 
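Before the hunks below, the post-rename relationship between the helpers, reduced to its essentials (a sketch mirroring the diff rather than adding new behavior):

	/* was kvm_vcpu_halt(): emulate HLT without advancing RIP */
	int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
	{
		return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
	}

	/* full HLT emulation: skip the instruction, then halt */
	int kvm_emulate_halt(struct kvm_vcpu *vcpu)
	{
		int ret = kvm_skip_emulated_instruction(vcpu);

		return kvm_emulate_halt_noskip(vcpu) && ret;
	}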
Reviewed-by: David Matlack Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-13-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 13 +++++++------ 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 9c941535f78c..e885f557fcbe 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3603,7 +3603,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && (vmcs12->guest_rflags & X86_EFLAGS_IF))) { vmx->nested.nested_run_pending = 0; - return kvm_vcpu_halt(vcpu); + return kvm_emulate_halt_noskip(vcpu); } break; case GUEST_ACTIVITY_WAIT_SIPI: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 01eab6d6ec22..b4defe000db8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4697,7 +4697,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, if (kvm_emulate_instruction(vcpu, 0)) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - return kvm_vcpu_halt(vcpu); + return kvm_emulate_halt_noskip(vcpu); } return 1; } @@ -5368,7 +5368,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - return kvm_vcpu_halt(vcpu); + return kvm_emulate_halt_noskip(vcpu); } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 73e0e40c94b6..43cabc747318 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8725,7 +8725,7 @@ void kvm_arch_exit(void) #endif } -static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason) +static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) { ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { @@ -8737,11 +8737,11 @@ static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason) } } -int kvm_vcpu_halt(struct kvm_vcpu *vcpu) +int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) { - return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); } -EXPORT_SYMBOL_GPL(kvm_vcpu_halt); +EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); int kvm_emulate_halt(struct kvm_vcpu *vcpu) { @@ -8750,7 +8750,7 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered * KVM_EXIT_DEBUG here. */ - return kvm_vcpu_halt(vcpu) && ret; + return kvm_emulate_halt_noskip(vcpu) && ret; } EXPORT_SYMBOL_GPL(kvm_emulate_halt); @@ -8758,7 +8758,8 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) { int ret = kvm_skip_emulated_instruction(vcpu); - return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret; + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, + KVM_EXIT_AP_RESET_HOLD) && ret; } EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); -- cgit v1.2.1 From 91b99ea7065786d0bff1c9281b002455dbaeb08b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:12:06 -0700 Subject: KVM: Rename kvm_vcpu_block() => kvm_vcpu_halt() Rename kvm_vcpu_block() to kvm_vcpu_halt() in preparation for splitting the actual "block" sequences into a separate helper (to be named kvm_vcpu_block()). x86 will use the standalone block-only path to handle non-halt cases where the vCPU is not runnable. Rename block_ns to halt_ns to match the new function name. No functional change intended. 
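Where the split is headed can be sketched as follows; this is an illustration of the intended division of labor only, and try_halt_polling() is a hypothetical placeholder rather than a real helper:

	void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
	{
		/* Spin briefly in the hope a wake event arrives ... */
		if (try_halt_polling(vcpu))	/* hypothetical helper */
			return;			/* woken without scheduling out */

		/* ... then fall back to actually waiting. */
		kvm_vcpu_block(vcpu);
	}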
Reviewed-by: David Matlack Reviewed-by: Christian Borntraeger Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-14-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 43cabc747318..e3dd76f251e9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8727,6 +8727,13 @@ void kvm_arch_exit(void) static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) { + /* + * The vCPU has halted, e.g. executed HLT. Update the run state if the + * local APIC is in-kernel, the run loop will detect the non-runnable + * state and halt the vCPU. Exit to userspace if the local APIC is + * managed by userspace, in which case userspace is responsible for + * handling wake events. + */ ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { vcpu->arch.mp_state = state; @@ -9999,7 +10006,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) if (!kvm_arch_vcpu_runnable(vcpu) && (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - kvm_vcpu_block(vcpu); + kvm_vcpu_halt(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (kvm_x86_ops.post_block) @@ -10196,7 +10203,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) r = -EINTR; goto out; } - kvm_vcpu_block(vcpu); + kvm_vcpu_halt(vcpu); if (kvm_apic_accept_events(vcpu) < 0) { r = 0; goto out; -- cgit v1.2.1 From c91d44971459073537874fcdd2f445e94cfb4f07 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:12:10 -0700 Subject: KVM: x86: Directly block (instead of "halting") UNINITIALIZED vCPUs Go directly to kvm_vcpu_block() when handling the case where userspace attempts to run an UNINITIALIZED vCPU. The vCPU is not halted, nor is it likely that halt-polling will be successful in this case. Reviewed-by: David Matlack Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-18-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e3dd76f251e9..9d628ec38414 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10203,7 +10203,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) r = -EINTR; goto out; } - kvm_vcpu_halt(vcpu); + kvm_vcpu_block(vcpu); if (kvm_apic_accept_events(vcpu) < 0) { r = 0; goto out; -- cgit v1.2.1 From cdafece4b964a27b2d3d76bf5725b49415bbaaea Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:12:11 -0700 Subject: KVM: x86: Invoke kvm_vcpu_block() directly for non-HALTED wait states Call kvm_vcpu_block() directly for all wait states except HALTED so that kvm_vcpu_halt() is no longer a misnomer on x86. Functionally, this means KVM will never attempt halt-polling or adjust vcpu->halt_poll_ns for INIT_RECEIVED (a.k.a. Wait-For-SIPI (WFS)) or AP_RESET_HOLD; UNINITIALIZED is handled in kvm_arch_vcpu_ioctl_run(), and x86 doesn't use any other "wait" states. As mentioned above, the motivation of this is purely so that "halt" isn't overloaded on x86, e.g. in KVM's stats. Skipping halt-polling for WFS (and RESET_HOLD) has no meaningful effect on guest performance as there are typically single-digit numbers of INIT-SIPI sequences per AP vCPU, per boot, versus thousands of HLTs just to boot to console. 
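Taken together with the previous patch, the net effect on the x86 wait paths can be condensed as follows (a sketch of the result, not a literal hunk):

	/* In vcpu_block(): only a truly halted vCPU gets halt-polling. */
	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
		kvm_vcpu_halt(vcpu);	/* HLT: halt-polling and halt stats apply */
	else
		kvm_vcpu_block(vcpu);	/* WFS, AP_RESET_HOLD: plain wait */

	/* In kvm_arch_vcpu_ioctl_run(): an UNINITIALIZED vCPU simply blocks. */
	kvm_vcpu_block(vcpu);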
Reviewed-by: David Matlack Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-19-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9d628ec38414..50450ebe709f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10006,7 +10006,10 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) if (!kvm_arch_vcpu_runnable(vcpu) && (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - kvm_vcpu_halt(vcpu); + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + kvm_vcpu_halt(vcpu); + else + kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (kvm_x86_ops.post_block) -- cgit v1.2.1 From d92a5d1c6c757f659ffb9c2c2e65fcf3d571c14e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:12:12 -0700 Subject: KVM: Add helpers to wake/query blocking vCPU Add helpers to wake and query a blocking vCPU. In addition to providing nice names, the helpers reduce the probability of KVM neglecting to use kvm_arch_vcpu_get_wait(). No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-20-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 451e80306b51..bbac8477b3ec 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1931,7 +1931,7 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) /* If the preempt notifier has already run, it also called apic_timer_expired */ if (!apic->lapic_timer.hv_timer_in_use) goto out; - WARN_ON(rcuwait_active(&vcpu->wait)); + WARN_ON(kvm_vcpu_is_blocking(vcpu)); apic_timer_expired(apic, false); cancel_hv_timer(apic); -- cgit v1.2.1 From 057aa61bc992f2d27218b6558b0115d5623f1a7b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:12:13 -0700 Subject: KVM: VMX: Skip Posted Interrupt updates if APICv is hard disabled Explicitly skip posted interrupt updates if APICv is disabled in all of KVM, or if the guest doesn't have an in-kernel APIC. The PI descriptor is kept up-to-date if APICv is inhibited, e.g. so that re-enabling APICv doesn't require a bunch of updates, but neither the module param nor the APIC type can be changed on-the-fly. Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-21-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 41f946e2123e..ee4ddc6e268b 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -29,11 +29,14 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) unsigned int dest; /* - * In case of hot-plug or hot-unplug, we may have to undo - * vmx_vcpu_pi_put even if there is no assigned device. And we - * always keep PI.NDST up to date for simplicity: it makes the - * code easier, and CPU migration is not a fast path. + * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and + * PI.SN up-to-date even if there is no assigned device or if APICv is + * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SyncIC. 
*/ + if (!enable_apicv || !lapic_in_kernel(vcpu)) + return; + + /* Nothing to do if PI.SN and PI.NDST both have the desired value. */ if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) return; -- cgit v1.2.1 From c95717218add161f3e93bd454743506ed5bdd8e1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:12:15 -0700 Subject: KVM: VMX: Drop unnecessary PI logic to handle impossible conditions Drop sanity checks on the validity of the previous pCPU when handling vCPU block/unblock for posted interrupts. The intention behind the sanity checks is to avoid memory corruption in case of a race or incorrect locking, but the code has been stable for a few years now and the checks get in the way of eliminating kvm_vcpu.pre_pcpu. Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-23-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index ee4ddc6e268b..f15d4a7450a5 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -123,12 +123,10 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) } while (cmpxchg64(&pi_desc->control, old.control, new.control) != old.control); - if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - vcpu->pre_pcpu = -1; - } + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + vcpu->pre_pcpu = -1; } /* @@ -156,14 +154,12 @@ int pi_pre_block(struct kvm_vcpu *vcpu) WARN_ON(irqs_disabled()); local_irq_disable(); - if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { - vcpu->pre_pcpu = vcpu->cpu; - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - } + + vcpu->pre_pcpu = vcpu->cpu; + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, vcpu->pre_pcpu)); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); do { old.control = new.control = pi_desc->control; -- cgit v1.2.1 From 74ba5bc872d3fb173b94fe9a1b8f6eaa807fc4ad Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:12:16 -0700 Subject: KVM: VMX: Use boolean returns for Posted Interrupt "test" helpers Return bools instead of ints for the posted interrupt "test" helpers. The bit position of the flag being tested does not matter to the callers, and is in fact lost by virtue of test_bit() itself returning a bool. Returning ints is potentially dangerous, e.g. "pi_test_on(pi_desc) == 1" is safe-ish because ON is bit 0 and thus any sane implementation of pi_test_on() will work, but for SN (bit 1), checking "== 1" would rely on pi_test_sn() to return 0 or 1, a.k.a. bools, as opposed to 0 or 2 (the positive bit position).
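The hazard described above is easy to demonstrate outside the kernel. The self-contained program below uses illustrative names (test_flag_int/test_flag_bool are stand-ins, not KVM's helpers) to show how a raw bit-test compared against 1 misfires for any bit other than bit 0:

	#include <stdbool.h>
	#include <stdio.h>

	/* Illustrative flag positions, mirroring ON (bit 0) and SN (bit 1). */
	#define PI_ON	0
	#define PI_SN	1

	/* int-returning test: yields 0 or the bit's positional value. */
	static int test_flag_int(unsigned long ctl, int bit)
	{
		return ctl & (1UL << bit);
	}

	/* bool-returning test: yields only 0 or 1, regardless of position. */
	static bool test_flag_bool(unsigned long ctl, int bit)
	{
		return ctl & (1UL << bit);
	}

	int main(void)
	{
		unsigned long ctl = 1UL << PI_SN;	/* SN set, ON clear */

		/* Prints "int:  value=2, ==1 -> 0": the comparison misfires. */
		printf("int:  value=%d, ==1 -> %d\n",
		       test_flag_int(ctl, PI_SN), test_flag_int(ctl, PI_SN) == 1);
		/* Prints "bool: value=1, ==1 -> 1": immune to the bit position. */
		printf("bool: value=%d, ==1 -> %d\n",
		       test_flag_bool(ctl, PI_SN), test_flag_bool(ctl, PI_SN) == 1);
		return 0;
	}

With an int return, "== 1" silently evaluates to false even though the flag is set; the bool return makes the comparison independent of the bit's position.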
Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-24-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 4 ++-- arch/x86/kvm/vmx/posted_intr.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index f15d4a7450a5..8abd5d891bde 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -189,7 +189,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu) new.control) != old.control); /* We should not block the vCPU if an interrupt is posted for it. */ - if (pi_test_on(pi_desc) == 1) + if (pi_test_on(pi_desc)) __pi_post_block(vcpu); local_irq_enable(); @@ -220,7 +220,7 @@ void pi_wakeup_handler(void) blocked_vcpu_list) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (pi_test_on(pi_desc) == 1) + if (pi_test_on(pi_desc)) kvm_vcpu_kick(vcpu); } spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 7f7b2326caf5..36ae035f14aa 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -40,7 +40,7 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } -static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) { return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } @@ -74,13 +74,13 @@ static inline void pi_clear_sn(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } -static inline int pi_test_on(struct pi_desc *pi_desc) +static inline bool pi_test_on(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); } -static inline int pi_test_sn(struct pi_desc *pi_desc) +static inline bool pi_test_sn(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); -- cgit v1.2.1 From 29802380b6793eabcac648e1c097c7bd6333f3d4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:12:17 -0700 Subject: KVM: VMX: Drop pointless PI.NDST update when blocking Don't update Posted Interrupt's NDST, a.k.a. the target pCPU, in the pre-block path, as NDST is guaranteed to be up-to-date. The comment about the vCPU being preempted during the update is simply wrong, as the update path runs with IRQs disabled (from before snapshotting vcpu->cpu, until after the update completes). Since commit 8b306e2f3c41 ("KVM: VMX: avoid double list add with VT-d posted interrupts", 2017-09-27), the vCPU can get preempted _before_ the update starts, but not during. And if the vCPU is preempted before, vmx_vcpu_pi_load() is responsible for updating NDST when the vCPU is scheduled back in. In that case, the check against the wakeup vector in vmx_vcpu_pi_load() cannot be true as that would require the notification vector to have been set to the wakeup vector _before_ blocking. Opportunistically switch to using vcpu->cpu for the list/lock lookups, which do not need pre_pcpu since the same commit.
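All of the descriptor updates discussed in these patches share the same lock-free retry pattern. A minimal userspace model of it is shown below, using C11 atomics in place of the kernel's cmpxchg64() and an illustrative field layout (the low byte standing in for NV, the notification vector):

	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	static _Atomic uint64_t pi_control;	/* stand-in for pi_desc->control */

	#define WAKEUP_VECTOR	0xf1ULL		/* illustrative value */

	static void set_wakeup_vector(void)
	{
		uint64_t old, new;

		old = atomic_load(&pi_control);
		do {
			/* Recompute 'new' from the freshly observed 'old'... */
			new = (old & ~0xffULL) | WAKEUP_VECTOR;
			/*
			 * ...and retry if another CPU changed control in the
			 * meantime; on failure, compare_exchange reloads 'old'.
			 */
		} while (!atomic_compare_exchange_weak(&pi_control, &old, new));
	}

	int main(void)
	{
		atomic_store(&pi_control, 0x2fULL);
		set_wakeup_vector();
		printf("control = 0x%llx\n",
		       (unsigned long long)atomic_load(&pi_control));
		return 0;
	}

The loop guarantees that every field of 'new' is derived from a single consistent snapshot of the control word, which is exactly the property the READ_ONCE() patch later in this series restores for the kernel version.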
Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-25-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 8abd5d891bde..bb93e9637494 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -135,7 +135,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) * - Store the vCPU to the wakeup list, so when interrupts happen * we can find the right vCPU to wake up. * - Change the Posted-interrupt descriptor as below: - * 'NDST' <-- vcpu->pre_pcpu * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR * - If 'ON' is set during this process, which means at least one * interrupt is posted for this vCPU, we cannot block it, in @@ -144,7 +143,6 @@ */ int pi_pre_block(struct kvm_vcpu *vcpu) { - unsigned int dest; struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -156,10 +154,10 @@ int pi_pre_block(struct kvm_vcpu *vcpu) local_irq_disable(); vcpu->pre_pcpu = vcpu->cpu; - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, vcpu->pre_pcpu)); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu)); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); do { old.control = new.control = pi_desc->control; @@ -168,21 +166,6 @@ "Warning: SN field of posted-interrupts " "is set before blocking\n"); - /* - * Since vCPU can be preempted during this process, - * vcpu->cpu could be different with pre_pcpu, we - * need to set pre_pcpu as the destination of wakeup - * notification event, then we can find the right vCPU - * to wakeup in wakeup handler if interrupts happen - * when the vCPU is in blocked state. - */ - dest = cpu_physical_id(vcpu->pre_pcpu); - - if (x2apic_mode) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - /* set 'NV' to 'wakeup vector' */ new.nv = POSTED_INTR_WAKEUP_VECTOR; } while (cmpxchg64(&pi_desc->control, old.control, -- cgit v1.2.1 From 89ef0f21cf96200dfa46cec92228ef435681589f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:12:18 -0700 Subject: KVM: VMX: Save/restore IRQs (instead of CLI/STI) during PI pre/post block Save/restore IRQs when disabling IRQs in posted interrupt pre/post block in preparation for moving the code into vcpu_put/load(), where it would be called with IRQs already disabled. No functional change intended.
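The value of save/restore is that it nests. A sketch of the hazard being avoided, using the kernel's own primitives with hypothetical caller/callee names (the "worker" functions stand in for the future pi_pre_block()-from-vcpu_put() path):

	/* Broken variant: the callee unconditionally re-enables IRQs. */
	static void worker_unconditional(void)
	{
		local_irq_disable();
		/* ... touch the per-CPU wakeup list ... */
		local_irq_enable();	/* BUG if the caller had IRQs off */
	}

	/* Correct variant: the callee preserves the caller's IRQ state. */
	static void worker_save_restore(void)
	{
		unsigned long flags;

		local_irq_save(flags);	/* records "IRQs were already off" */
		/* ... touch the per-CPU wakeup list ... */
		local_irq_restore(flags); /* IRQs stay off for the caller */
	}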
Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-26-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index bb93e9637494..b72dbe80f87a 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -145,13 +145,13 @@ int pi_pre_block(struct kvm_vcpu *vcpu) { struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + unsigned long flags; if (!vmx_can_use_vtd_pi(vcpu->kvm) || vmx_interrupt_blocked(vcpu)) return 0; - WARN_ON(irqs_disabled()); - local_irq_disable(); + local_irq_save(flags); vcpu->pre_pcpu = vcpu->cpu; spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); @@ -175,19 +175,20 @@ int pi_pre_block(struct kvm_vcpu *vcpu) if (pi_test_on(pi_desc)) __pi_post_block(vcpu); - local_irq_enable(); + local_irq_restore(flags); return (vcpu->pre_pcpu == -1); } void pi_post_block(struct kvm_vcpu *vcpu) { + unsigned long flags; + if (vcpu->pre_pcpu == -1) return; - WARN_ON(irqs_disabled()); - local_irq_disable(); + local_irq_save(flags); __pi_post_block(vcpu); - local_irq_enable(); + local_irq_restore(flags); } /* -- cgit v1.2.1 From cfb0e1306a3790eb055ebf7cdb7b0ee8a23e9b6e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:12:19 -0700 Subject: KVM: VMX: Read Posted Interrupt "control" exactly once per loop iteration Use READ_ONCE() when loading the posted interrupt descriptor control field to ensure "old" and "new" have the same base value. If the compiler emits separate loads, and loads into "new" before "old", KVM could theoretically drop the ON bit if it were set between the loads. Fixes: 28b835d60fcc ("KVM: Update Posted-Interrupts Descriptor when vCPU is preempted") Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-27-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index b72dbe80f87a..a1ce598e9544 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -54,7 +54,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) /* The full case. 
*/ do { - old.control = new.control = pi_desc->control; + old.control = new.control = READ_ONCE(pi_desc->control); dest = cpu_physical_id(cpu); @@ -107,7 +107,7 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) unsigned int dest; do { - old.control = new.control = pi_desc->control; + old.control = new.control = READ_ONCE(pi_desc->control); WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, "Wakeup handler not enabled while the VCPU is blocked\n"); @@ -160,7 +160,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu) spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); do { - old.control = new.control = pi_desc->control; + old.control = new.control = READ_ONCE(pi_desc->control); WARN((pi_desc->sn == 1), "Warning: SN field of posted-interrupts " "is set before blocking\n"); -- cgit v1.2.1 From 724b3962ef808388005b860450bfbef6bade26b3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:12:20 -0700 Subject: KVM: VMX: Move Posted Interrupt ndst computation out of write loop Hoist the CPU => APIC ID conversion for the Posted Interrupt descriptor out of the loop to write the descriptor. Preemption is disabled so the CPU won't change, and if the APIC ID changes KVM has bigger problems. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-28-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index a1ce598e9544..e29a40042978 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -52,17 +52,15 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) goto after_clear_sn; } - /* The full case. */ + /* The full case. Set the new destination and clear SN. */ + dest = cpu_physical_id(cpu); + if (!x2apic_mode) + dest = (dest << 8) & 0xFF00; + do { old.control = new.control = READ_ONCE(pi_desc->control); - dest = cpu_physical_id(cpu); - - if (x2apic_mode) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - + new.ndst = dest; new.sn = 0; } while (cmpxchg64(&pi_desc->control, old.control, new.control) != old.control); @@ -106,17 +104,16 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) struct pi_desc old, new; unsigned int dest; + dest = cpu_physical_id(vcpu->cpu); + if (!x2apic_mode) + dest = (dest << 8) & 0xFF00; + do { old.control = new.control = READ_ONCE(pi_desc->control); WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, "Wakeup handler not enabled while the VCPU is blocked\n"); - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_mode) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; + new.ndst = dest; /* set 'NV' to 'notification vector' */ new.nv = POSTED_INTR_VECTOR; -- cgit v1.2.1 From baed82c8e4893a3258267dad198e04691d2f7c09 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:12:21 -0700 Subject: KVM: VMX: Remove vCPU from PI wakeup list before updating PID.NV Remove the vCPU from the wakeup list before updating the notification vector in the posted interrupt post-block helper. There is no need to wake the current vCPU as it is by definition not blocking. Practically speaking this is a nop as it only shaves a few meager cycles in the unlikely case that the vCPU was migrated and the previous pCPU gets a wakeup IRQ right before PID.NV is updated.
The real motivation is to allow for more readable code in the future, when post-block is merged with vmx_vcpu_pi_load(), at which point removal from the list will be conditional on the old notification vector. Opportunistically add comments to document why KVM has a per-CPU spinlock that, at first glance, appears to be taken only on the owning CPU. Explicitly call out that the spinlock must be taken with IRQs disabled, a detail that was "lost" when KVM switched from spin_lock_irqsave() to spin_lock(), with IRQs disabled for the entirety of the relevant path. Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-29-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index e29a40042978..4db2b14ee7c6 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -11,10 +11,22 @@ #include "vmx.h" /* - * We maintain a per-CPU linked-list of vCPU, so in wakeup_handler() we - * can find which vCPU should be waken up. + * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler() + * when a WAKEUP_VECTOR interrupt is posted. vCPUs are added to the list when + * the vCPU is scheduled out and is blocking (e.g. in HLT) with IRQs enabled. + * The vCPU's posted interrupt descriptor is updated at the same time to set its + * notification vector to WAKEUP_VECTOR, so that posted interrupts from devices + * wake the target vCPUs. vCPUs are removed from the list and the notification + * vector is reset when the vCPU is scheduled in. */ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); /* + * Protect the per-CPU list with a per-CPU spinlock to handle task migration. + * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the + * ->sched_in() path will need to take the vCPU off the list of the _previous_ + * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will + * occur if a wakeup IRQ arrives and attempts to acquire the lock. + */ static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) @@ -104,6 +116,14 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) struct pi_desc old, new; unsigned int dest; + /* + * Remove the vCPU from the wakeup list of the _previous_ pCPU, which + * will not be the same as the current pCPU if the task was migrated. + */ + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + dest = cpu_physical_id(vcpu->cpu); if (!x2apic_mode) dest = (dest << 8) & 0xFF00; @@ -120,9 +140,6 @@ ... } while (cmpxchg64(&pi_desc->control, old.control, new.control) != old.control); - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); vcpu->pre_pcpu = -1; } -- cgit v1.2.1 From b1d66dad65dcc8a6e5942db27027a086aa4f5c16 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 17 Nov 2021 16:03:04 +0800 Subject: KVM: x86/svm: Add module param to control PMU virtualization For Intel, the guest PMU can be disabled via clearing the PMU CPUID.
For AMD, all hw implementations support the base set of four performance counters, with current mainstream hardware indicating the presence of two additional counters via X86_FEATURE_PERFCTR_CORE. In the virtualized world, the AMD guest driver may detect the presence of at least one counter MSR. Most hypervisor vendors would introduce a module param (like lbrv for svm) to disable PMU for all guests. Another control proposal per-VM is to pass PMU disable information via MSR_IA32_PERF_CAPABILITIES or one bit in CPUID Fn4000_00[FF:00]. Both of those methods require some guest-side changes, so a module parameter, while not as granular, is practical enough. Signed-off-by: Like Xu Message-Id: <20211117080304.38989-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/svm/pmu.c | 4 ++++ arch/x86/kvm/svm/svm.c | 11 +++++++++++ arch/x86/kvm/svm/svm.h | 1 + 4 files changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 07e9215e911d..0b920e12bb6d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -523,7 +523,7 @@ void kvm_set_cpu_caps(void) F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | - F(TOPOEXT) | F(PERFCTR_CORE) + F(TOPOEXT) | 0 /* PERFCTR_CORE */ ); kvm_cpu_cap_mask(CPUID_8000_0001_EDX, diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index b4095dfeeee6..0cf05e4caa4c 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -16,6 +16,7 @@ #include "cpuid.h" #include "lapic.h" #include "pmu.h" +#include "svm.h" enum pmu_type { PMU_TYPE_COUNTER = 0, @@ -100,6 +101,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + if (!pmu) + return NULL; + switch (msr) { case MSR_F15H_PERF_CTL0: case MSR_F15H_PERF_CTL1: diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 907ba85609a2..64a10cdb2356 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -192,6 +192,10 @@ module_param(vgif, int, 0444); static int lbrv = true; module_param(lbrv, int, 0444); +/* enable/disable PMU virtualization */ +bool pmu = true; +module_param(pmu, bool, 0444); + static int tsc_scaling = true; module_param(tsc_scaling, int, 0444); @@ -954,6 +958,10 @@ static __init void svm_set_cpu_caps(void) boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); + /* AMD PMU PERFCTR_CORE CPUID */ + if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); + /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); } @@ -1087,6 +1095,9 @@ static __init int svm_hardware_setup(void) pr_info("LBR virtualization supported\n"); } + if (!pmu) + pr_info("PMU virtualization is disabled\n"); + svm_set_cpu_caps(); /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 929bd60d754d..a57390473013 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -32,6 +32,7 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern bool intercept_smi; +extern bool pmu; /* * Clean bits in VMCB.
-- cgit v1.2.1 From 2c5653caecc4807b8abfe9c41880ac38417be7bf Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 8 Nov 2021 20:43:53 +0800 Subject: KVM: X86: Ensure that dirty PDPTRs are loaded For VMX with EPT, dirty PDPTRs need to be loaded before the next vmentry via vmx_load_mmu_pgd(). But not all paths that call load_pdptrs() will cause vmx_load_mmu_pgd() to be invoked. Normally, kvm_mmu_reset_context() is used to cause KVM_REQ_LOAD_MMU_PGD, but sometimes it is skipped: * commit d81135a57aa6 ("KVM: x86: do not reset mmu if CR0.CD and CR0.NW are changed") skips kvm_mmu_reset_context() after load_pdptrs() when changing CR0.CD and CR0.NW. * commit 21823fbda552 ("KVM: x86: Invalidate all PGDs for the current PCID on MOV CR3 w/ flush") skips KVM_REQ_LOAD_MMU_PGD after load_pdptrs() when rewriting the CR3 with the same value. * commit a91a7c709600 ("KVM: X86: Don't reset mmu context when toggling X86_CR4_PGE") skips kvm_mmu_reset_context() after load_pdptrs() when changing CR4.PGE. Fixes: d81135a57aa6 ("KVM: x86: do not reset mmu if CR0.CD and CR0.NW are changed") Fixes: 21823fbda552 ("KVM: x86: Invalidate all PGDs for the current PCID on MOV CR3 w/ flush") Fixes: a91a7c709600 ("KVM: X86: Don't reset mmu context when toggling X86_CR4_PGE") Signed-off-by: Lai Jiangshan Message-Id: <20211108124407.12187-2-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 50450ebe709f..ba2704fce181 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -830,6 +830,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); vcpu->arch.pdptrs_from_userspace = false; return 1; -- cgit v1.2.1 From c0d6956e43054e397f4f661d0a62d490278ddef9 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 8 Nov 2021 20:43:54 +0800 Subject: KVM: VMX: Mark VCPU_EXREG_PDPTR available in ept_save_pdptrs() mmu->pdptrs[] and vmcs.GUEST_PDPTR[0-3] are synced, so mmu->pdptrs is available and GUEST_PDPTR[0-3] is not dirty. Signed-off-by: Lai Jiangshan Message-Id: <20211108124407.12187-3-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b4defe000db8..cc87cd268eb4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2989,7 +2989,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); - kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); } #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ -- cgit v1.2.1 From 40e49c4f5fb0699b4b5b5b1ee0a1bc88b4fec00d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 8 Nov 2021 20:43:55 +0800 Subject: KVM: SVM: Track dirtiness of PDPTRs even if NPT is disabled Use the same logic to handle the availability of VCPU_EXREG_PDPTR as VMX, also removing a branch in svm_vcpu_run().
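The "available" versus "dirty" distinction that these PDPTR patches rely on can be modeled in a few lines. The program below is an illustrative userspace model, not KVM's actual register cache:

	#include <stdint.h>
	#include <stdio.h>

	enum { REG_PDPTR, NR_CACHED_REGS };

	struct reg_cache {
		uint32_t avail;	/* bit set: cached value is in sync with hardware */
		uint32_t dirty;	/* bit set: cached value must be written back */
		uint64_t val[NR_CACHED_REGS];
	};

	/* "available": value was read from hardware; the cache may be consumed. */
	static void mark_available(struct reg_cache *c, int reg)
	{
		c->avail |= 1U << reg;
	}

	/* "dirty": software changed the value; dirty implies available. */
	static void mark_dirty(struct reg_cache *c, int reg)
	{
		c->avail |= 1U << reg;
		c->dirty |= 1U << reg;
	}

	int main(void)
	{
		struct reg_cache c = { 0 };

		/* ept_save_pdptrs() case: values copied *from* the VMCS, so the
		 * cache is in sync but nothing needs writing back. */
		mark_available(&c, REG_PDPTR);
		printf("after save: avail=%u dirty=%u\n", c.avail, c.dirty);

		/* load_pdptrs() case: new values read from guest memory must
		 * reach hardware before the next VM entry. */
		mark_dirty(&c, REG_PDPTR);
		printf("after load: avail=%u dirty=%u\n", c.avail, c.dirty);
		return 0;
	}

Marking the PDPTRs dirty after merely reading them out of the VMCS, as the old ept_save_pdptrs() did, would schedule a pointless write-back; marking them available is the correct post-read state, which is exactly what the one-line VMX fix above changes.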
Signed-off-by: Lai Jiangshan Message-Id: <20211108124407.12187-4-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 64a10cdb2356..9b6386282198 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1598,10 +1598,16 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + kvm_register_mark_available(vcpu, reg); + switch (reg) { case VCPU_EXREG_PDPTR: - BUG_ON(!npt_enabled); - load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); + /* + * When !npt_enabled, mmu->pdptrs[] is already available since + * it is always updated per SDM when moving to CRs. + */ + if (npt_enabled) + load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); break; default: KVM_BUG_ON(1, vcpu->kvm); @@ -3974,8 +3980,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); - if (npt_enabled) - kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR); + kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR); /* * We need to handle MC intercepts here before the vcpu has a chance to -- cgit v1.2.1 From 5ec60aad547f716530ad308266eeab378a4e287c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 8 Nov 2021 20:43:56 +0800 Subject: KVM: VMX: Add and use X86_CR4_TLBFLUSH_BITS when !enable_ept In set_cr4_guest_host_mask(), X86_CR4_PGE is set to be intercepted when !enable_ept just because X86_CR4_PGE is the only bit that is responsible for flushing TLB but listed in KVM_POSSIBLE_CR4_GUEST_BITS. It is clearer and self-documenting to use X86_CR4_TLBFLUSH_BITS instead. No functional change intended. Signed-off-by: Lai Jiangshan Message-Id: <20211108124407.12187-5-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/kvm_cache_regs.h | 2 ++ arch/x86/kvm/vmx/vmx.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 90e1ffdc05b7..828f55ce816b 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -9,6 +9,8 @@ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) +#define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP) + #define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ { \ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cc87cd268eb4..35d4df40af2a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4045,7 +4045,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & ~vcpu->arch.cr4_guest_rsvd_bits; if (!enable_ept) - vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; + vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; if (is_guest_mode(&vmx->vcpu)) vcpu->arch.cr4_guest_owned_bits &= ~get_vmcs12(vcpu)->cr4_guest_host_mask; -- cgit v1.2.1 From a37ebdce168f57732ff2917a685980fc21133417 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 8 Nov 2021 20:43:57 +0800 Subject: KVM: VMX: Add and use X86_CR4_PDPTR_BITS when !enable_ept In set_cr4_guest_host_mask(), all cr4 pdptr bits are already set to be intercepted in an unclear way.
Add X86_CR4_PDPTR_BITS to make it clear and self-documenting. No functional change intended. Signed-off-by: Lai Jiangshan Message-Id: <20211108124407.12187-6-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/kvm_cache_regs.h | 1 + arch/x86/kvm/vmx/vmx.c | 4 +++- arch/x86/kvm/x86.c | 4 +--- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 828f55ce816b..7c9f6455fc04 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -10,6 +10,7 @@ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) #define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP) +#define X86_CR4_PDPTR_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP) #define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 35d4df40af2a..77d79a3aa35a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4044,8 +4044,10 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & ~vcpu->arch.cr4_guest_rsvd_bits; - if (!enable_ept) + if (!enable_ept) { vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; + vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; + } if (is_guest_mode(&vmx->vcpu)) vcpu->arch.cr4_guest_owned_bits &= ~get_vmcs12(vcpu)->cr4_guest_host_mask; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ba2704fce181..865938f1e94c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1051,8 +1051,6 @@ EXPORT_SYMBOL_GPL(kvm_post_set_cr4); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP; if (!kvm_is_valid_cr4(vcpu, cr4)) return 1; @@ -1063,7 +1061,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if ((cr4 ^ old_cr4) & X86_CR4_LA57) return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) - && ((cr4 ^ old_cr4) & pdptr_bits) + && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu))) return 1; -- cgit v1.2.1 From e63f315d74eeeb6ddf9096223d898730494da6f4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 8 Nov 2021 20:43:58 +0800 Subject: KVM: X86: Move CR0 pdptr_bits into header file as X86_CR0_PDPTR_BITS No functional change intended.
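To see how such a mask is consumed, here is a small runnable sketch of the change-detection test; the CR0 bit positions are architectural, and the rest is a paraphrase of the kvm_set_cr0() check, not the kernel code itself:

#include <stdio.h>

/* Architectural CR0 bit positions (Intel SDM / AMD APM). */
#define CR0_NW (1ul << 29)
#define CR0_CD (1ul << 30)
#define CR0_PG (1ul << 31)
#define CR0_PDPTR_BITS (CR0_CD | CR0_NW | CR0_PG)

int main(void)
{
        unsigned long old_cr0 = CR0_PG;          /* paging already on */
        unsigned long new_cr0 = CR0_PG | CR0_CD; /* guest toggles CD */

        /* XOR isolates the bits that changed; the AND asks whether any
         * of them can affect the PDPTRs and so force a reload. */
        if ((new_cr0 ^ old_cr0) & CR0_PDPTR_BITS)
                printf("PDPTRs must be reloaded\n");
        return 0;
}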
Signed-off-by: Lai Jiangshan Message-Id: <20211108124407.12187-7-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/kvm_cache_regs.h | 3 +++ arch/x86/kvm/x86.c | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 7c9f6455fc04..6e6d0d01f18d 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -9,9 +9,12 @@ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) +#define X86_CR0_PDPTR_BITS (X86_CR0_CD | X86_CR0_NW | X86_CR0_PG) #define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP) #define X86_CR4_PDPTR_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP) +static_assert(!(KVM_POSSIBLE_CR0_GUEST_BITS & X86_CR0_PDPTR_BITS)); + #define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ { \ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 865938f1e94c..9e0e74a2eaeb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -857,7 +857,6 @@ EXPORT_SYMBOL_GPL(kvm_post_set_cr0); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); - unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG; cr0 |= X86_CR0_ET; @@ -887,7 +886,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) && - is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) && + is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu))) return 1; -- cgit v1.2.1 From 8f29bf12a37807aa0a544485ab3a853481786203 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 8 Nov 2021 20:43:59 +0800 Subject: KVM: SVM: Remove outdated comment in svm_load_mmu_pgd() The comment had been added in the commit 689f3bf21628 ("KVM: x86: unify callbacks to load paging root") and its related code was removed later, and it has nothing to do with the next line of code. So the comment should be removed too. Signed-off-by: Lai Jiangshan Message-Id: <20211108124407.12187-8-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9b6386282198..b4874529f425 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4010,7 +4010,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, hv_track_root_tdp(vcpu, root_hpa); - /* Loading L2's CR3 is handled by enter_svm_guest_mode. */ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) return; cr3 = vcpu->arch.cr3; -- cgit v1.2.1 From aec9c2402f74b898ad637a97360bfc001fb711e6 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 8 Nov 2021 20:44:00 +0800 Subject: KVM: SVM: Remove references to VCPU_EXREG_CR3 VCPU_EXREG_CR3 is never cleared from vcpu->arch.regs_avail or vcpu->arch.regs_dirty in SVM; therefore, marking CR3 as available is merely a NOP, and testing it will likewise always succeed. 
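A tiny sketch of why the removed test was dead code, with illustrative constants rather than the kernel's definitions:

#include <stdio.h>

#define REG_CR3 3                       /* stand-in for VCPU_EXREG_CR3 */

int main(void)
{
        /* SVM sets this bit when CR3 is cached and, unlike VMX, never
         * clears it again, so the test below is constant-true and the
         * early-return branch it guarded could never fire. */
        unsigned long regs_avail = 1ul << REG_CR3;

        if (!(regs_avail & (1ul << REG_CR3)))
                printf("unreachable\n");
        else
                printf("CR3 always reads as available\n");
        return 0;
}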
Signed-off-by: Lai Jiangshan Message-Id: <20211108124407.12187-9-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 1 - arch/x86/kvm/svm/svm.c | 2 -- 2 files changed, 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 598843cfe6c4..2d0a7dd7e2a6 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -468,7 +468,6 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, kvm_mmu_new_pgd(vcpu, cr3); vcpu->arch.cr3 = cr3; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ kvm_init_mmu(vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b4874529f425..7f493ffc1f8d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4010,8 +4010,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, hv_track_root_tdp(vcpu, root_hpa); - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - return; cr3 = vcpu->arch.cr3; } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); -- cgit v1.2.1 From 3883bc9d28ed348d419d2e405d11f0924783f721 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 8 Nov 2021 20:44:02 +0800 Subject: KVM: X86: Mark CR3 dirty when vcpu->arch.cr3 is changed When vcpu->arch.cr3 is changed, it should be marked dirty unless it is being updated to the value of the architecture guest CR3 (i.e. VMX.GUEST_CR3 or vmcb->save.cr3 when tdp is enabled). This patch introduces no functional change: kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3) is a superset of kvm_register_mark_available(vcpu, VCPU_EXREG_CR3), adding only a change to vcpu->arch.regs_dirty, and no code uses regs_dirty for VCPU_EXREG_CR3. (vmx_load_mmu_pgd() uses vcpu->arch.regs_avail instead to test whether VCPU_EXREG_CR3 is dirty, which means the current code (ab)uses regs_avail to carry VCPU_EXREG_CR3 dirty information.) Signed-off-by: Lai Jiangshan Message-Id: <20211108124407.12187-11-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/x86.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e885f557fcbe..b240776151c6 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1104,7 +1104,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, kvm_mmu_new_pgd(vcpu, cr3); vcpu->arch.cr3 = cr3; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes.
*/ kvm_init_mmu(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9e0e74a2eaeb..be3b4a073e72 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1158,7 +1158,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) kvm_mmu_new_pgd(vcpu, cr3); vcpu->arch.cr3 = cr3; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); handle_tlb_flush: /* @@ -10567,7 +10567,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, vcpu->arch.cr2 = sregs->cr2; *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); kvm_set_cr8(vcpu, sregs->cr8); -- cgit v1.2.1 From c62c7bd4f95b8f2a28098c4139a369670998aef2 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 8 Nov 2021 20:44:03 +0800 Subject: KVM: VMX: Update vmcs.GUEST_CR3 only when the guest CR3 is dirty When vcpu->arch.cr3 is changed, it is marked dirty, so vmcs.GUEST_CR3 can be updated only when kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3). Signed-off-by: Lai Jiangshan Message-Id: <20211108124407.12187-12-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 77d79a3aa35a..ffe45435b77e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3114,9 +3114,9 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; - else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) guest_cr3 = vcpu->arch.cr3; - else /* vmcs01.GUEST_CR3 is already up-to-date. */ + else /* vmcs.GUEST_CR3 is already up-to-date. */ update_guest_cr3 = false; vmx_ept_load_pdptrs(vcpu); } else { -- cgit v1.2.1 From 41e68b6964ebf20082af55ad1394523cf86c4c6a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 26 Nov 2021 07:00:15 -0500 Subject: KVM: vmx, svm: clean up mass updates to regs_avail/regs_dirty bits Document the meaning of the three combinations of regs_avail and regs_dirty. Update regs_dirty just after writeback instead of doing it later after vmexit. After vmexit, instead, we clear the regs_avail bits corresponding to lazily-loaded registers. 
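The three legal avail/dirty combinations documented by this patch can be exercised in a short userspace model; every name below is a stand-in for the kernel's regs_avail/regs_dirty machinery, assuming a single lazily-loaded register:

#include <stdint.h>
#include <stdio.h>

#define REG_RIP 0
#define LAZY_LOAD_SET (1u << REG_RIP)   /* cf. VMX/SVM_REGS_LAZY_LOAD_SET */

static uint32_t regs_avail, regs_dirty;

static void guest_write_rip(void)       /* cached and must be stored back */
{
        regs_avail |= 1u << REG_RIP;
        regs_dirty |= 1u << REG_RIP;
}

static void vcpu_run(void)
{
        /* Writeback: dirty values are pushed to the VMCS/VMCB, after
         * which nothing is dirty anymore. */
        regs_dirty = 0;
        /* ... guest runs ... */
        /* After vmexit, lazily-loaded values are stale in the cache,
         * so only their avail bits are cleared. */
        regs_avail &= ~LAZY_LOAD_SET;
}

int main(void)
{
        guest_write_rip();              /* avail=1 dirty=1 */
        vcpu_run();
        printf("avail=%#x dirty=%#x\n", (unsigned)regs_avail,
               (unsigned)regs_dirty);
        return 0;
}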
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/kvm_cache_regs.h | 7 +++++++ arch/x86/kvm/svm/svm.c | 3 ++- arch/x86/kvm/svm/svm.h | 10 ++++++++++ arch/x86/kvm/vmx/nested.c | 8 +++++++- arch/x86/kvm/vmx/vmx.c | 3 ++- arch/x86/kvm/vmx/vmx.h | 28 +++++++++++++++------------- 6 files changed, 43 insertions(+), 16 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 6e6d0d01f18d..ac3d3bd662f4 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -43,6 +43,13 @@ BUILD_KVM_GPR_ACCESSORS(r14, R14) BUILD_KVM_GPR_ACCESSORS(r15, R15) #endif +/* + * avail dirty + * 0 0 register in VMCS/VMCB + * 0 1 *INVALID* + * 1 0 register in vcpu->arch + * 1 1 register in vcpu->arch, needs to be stored back + */ static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu, enum kvm_reg reg) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7f493ffc1f8d..de872098071d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3946,6 +3946,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; } + vcpu->arch.regs_dirty = 0; if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(vcpu); @@ -3980,7 +3981,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); - kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR); + vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; /* * We need to handle MC intercepts here before the vcpu has a chance to diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index a57390473013..9f153c59f2c8 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -326,6 +326,16 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_svm, vcpu); } +/* + * Only the PDPTRs are loaded on demand into the shadow MMU. All other + * fields are synchronized in handle_exit, because accessing the VMCB is cheap. + * + * CR3 might be out of date in the VMCB but it is not marked dirty; instead, + * KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3 + * is changed. svm_load_mmu_pgd() then syncs the new CR3 value into the VMCB. + */ +#define SVM_REGS_LAZY_LOAD_SET (1 << VCPU_EXREG_PDPTR) + static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit) { WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b240776151c6..dc5041ad860f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -269,7 +269,13 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); - vmx_register_cache_reset(vcpu); + vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET; + + /* + * All lazily updated registers will be reloaded from VMCS12 on both + * vmentry and vmexit. 
+ */ + vcpu->arch.regs_dirty = 0; } /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ffe45435b77e..c65ff62e11f5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6649,6 +6649,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + vcpu->arch.regs_dirty = 0; cr3 = __get_current_cr3_fast(); if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { @@ -6743,7 +6744,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) loadsegment(es, __USER_DS); #endif - vmx_register_cache_reset(vcpu); + vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; pt_guest_exit(vmx); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 4df2ac24ffc1..f978699480e3 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -473,19 +473,21 @@ BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL) -static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu) -{ - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) - | (1 << VCPU_EXREG_RFLAGS) - | (1 << VCPU_EXREG_PDPTR) - | (1 << VCPU_EXREG_SEGMENTS) - | (1 << VCPU_EXREG_CR0) - | (1 << VCPU_EXREG_CR3) - | (1 << VCPU_EXREG_CR4) - | (1 << VCPU_EXREG_EXIT_INFO_1) - | (1 << VCPU_EXREG_EXIT_INFO_2)); - vcpu->arch.regs_dirty = 0; -} +/* + * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the + * cache on demand. Other registers not listed here are synced to + * the cache immediately after VM-Exit. + */ +#define VMX_REGS_LAZY_LOAD_SET ((1 << VCPU_REGS_RIP) | \ + (1 << VCPU_REGS_RSP) | \ + (1 << VCPU_EXREG_RFLAGS) | \ + (1 << VCPU_EXREG_PDPTR) | \ + (1 << VCPU_EXREG_SEGMENTS) | \ + (1 << VCPU_EXREG_CR0) | \ + (1 << VCPU_EXREG_CR3) | \ + (1 << VCPU_EXREG_CR4) | \ + (1 << VCPU_EXREG_EXIT_INFO_1) | \ + (1 << VCPU_EXREG_EXIT_INFO_2)) static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) { -- cgit v1.2.1 From 2e9ebd55096f70b76c2a5edf93903c8c2f778a9f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 8 Nov 2021 20:44:06 +0800 Subject: KVM: X86: Remove kvm_register_clear_available() It has no user. Signed-off-by: Lai Jiangshan Message-Id: <20211108124407.12187-15-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/kvm_cache_regs.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index ac3d3bd662f4..3febc342360c 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -68,13 +68,6 @@ static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu, __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); } -static inline void kvm_register_clear_available(struct kvm_vcpu *vcpu, - enum kvm_reg reg) -{ - __clear_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); - __clear_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); -} - static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, enum kvm_reg reg) { -- cgit v1.2.1 From 24cd19a28cb7174df502162641d6e1e12e7ffbd9 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 11 Nov 2021 22:45:26 +0800 Subject: KVM: X86: Update mmu->pdptrs only when it is changed It is unchanged in most cases. 
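The guarded-update pattern is easy to see in isolation; here is a minimal userspace sketch with made-up names, mirroring the load_pdptrs() hunk in this patch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t pdptrs[4];              /* stand-in for mmu->pdptrs */
static bool dirty;                      /* stand-in for the dirty mark */

static void load_pdptrs(const uint64_t new_pdptrs[4])
{
        /* Only copy and mark dirty when something actually changed;
         * in the kernel this also skips raising KVM_REQ_LOAD_MMU_PGD. */
        if (memcmp(pdptrs, new_pdptrs, sizeof(pdptrs))) {
                memcpy(pdptrs, new_pdptrs, sizeof(pdptrs));
                dirty = true;
        }
}

int main(void)
{
        uint64_t same[4] = {0};
        load_pdptrs(same);              /* identical: stays clean */
        printf("dirty=%d\n", dirty);
        uint64_t changed[4] = {0x1000};
        load_pdptrs(changed);           /* differs: marked dirty */
        printf("dirty=%d\n", dirty);
        return 0;
}

With the guard in place, reloading identical PDPTEs no longer dirties the register or requests a PGD reload.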
Signed-off-by: Lai Jiangshan Message-Id: <20211111144527.88852-1-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index be3b4a073e72..d4f1dff258cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -828,9 +828,12 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) } } - memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); - kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); - kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); + kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); + if (memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs))) { + memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); + } vcpu->arch.pdptrs_from_userspace = false; return 1; -- cgit v1.2.1 From 6ab8a4053f7114d130fe1f3485d71efec20f5806 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 18 Nov 2021 19:08:01 +0800 Subject: KVM: VMX: Avoid to rdmsrl(MSR_IA32_SYSENTER_ESP) The value of host MSR_IA32_SYSENTER_ESP is known to be constant for each CPU: (cpu_entry_stack(cpu) + 1) when 32-bit syscall is enabled, or NULL if 32-bit syscall is not enabled. So rdmsrl() can be avoided for the first case, and both rdmsrl() and vmcs_writel() can be avoided for the second case. Signed-off-by: Lai Jiangshan Message-Id: <20211118110814.2568-3-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c65ff62e11f5..56c46a50d85a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1271,7 +1271,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, if (!already_loaded) { void *gdt = get_current_gdt_ro(); - unsigned long sysenter_esp; /* * Flush all EPTP/VPID contexts, the new pCPU may have stale @@ -1287,8 +1286,11 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ - rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); - vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { + /* 22.2.3 */ + vmcs_writel(HOST_IA32_SYSENTER_ESP, + (unsigned long)(cpu_entry_stack(cpu) + 1)); + } vmx->loaded_vmcs->cpu = cpu; } @@ -4026,6 +4028,12 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); vmcs_write32(HOST_IA32_SYSENTER_CS, low32); + + /* + * If 32-bit syscall is enabled, vmx_vcpu_load_vcms rewrites + * HOST_IA32_SYSENTER_ESP. + */ + vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ -- cgit v1.2.1 From 3ab4ac877cfabd209d12cbd6af0aa02077bb778d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 18 Nov 2021 19:08:02 +0800 Subject: KVM: VMX: Update msr value after kvm_set_user_return_msr() succeeds Avoid the earlier modification.
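The one-line idea, sketched outside the kernel; set_hw_msr is a made-up stand-in for kvm_set_user_return_msr(), and the shape mirrors the vmx_set_guest_uret_msr() hunk in this patch:

#include <stdint.h>
#include <stdio.h>

/* Pretend hardware write that can fail (nonzero return on failure). */
static int set_hw_msr(uint64_t data)
{
        return data == 0xbad ? -1 : 0;
}

/* Commit the cached value only after the hardware write succeeds,
 * rather than writing first and rolling back on failure. */
static int set_guest_msr(uint64_t *cached, uint64_t data)
{
        int ret = set_hw_msr(data);
        if (!ret)
                *cached = data;
        return ret;
}

int main(void)
{
        uint64_t msr = 1;
        set_guest_msr(&msr, 0xbad);     /* fails: cache keeps old value */
        printf("msr=%#llx\n", (unsigned long long)msr);
        return 0;
}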
Signed-off-by: Lai Jiangshan Message-Id: <20211118110814.2568-4-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 56c46a50d85a..73cb1322d7f1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -602,15 +602,13 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, unsigned int slot = msr - vmx->guest_uret_msrs; int ret = 0; - u64 old_msr_data = msr->data; - msr->data = data; if (msr->load_into_hardware) { preempt_disable(); - ret = kvm_set_user_return_msr(slot, msr->data, msr->mask); + ret = kvm_set_user_return_msr(slot, data, msr->mask); preempt_enable(); - if (ret) - msr->data = old_msr_data; } + if (!ret) + msr->data = data; return ret; } -- cgit v1.2.1 From 15ad9762d69fd8e40a4a51828c1d6b0c1b8fbea0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 18 Nov 2021 19:08:03 +0800 Subject: KVM: VMX: Save HOST_CR3 in vmx_prepare_switch_to_guest() The host CR3 in the vcpu thread can only be changed when scheduling. Moving the code in vmx_prepare_switch_to_guest() makes the code simpler. Signed-off-by: Lai Jiangshan Message-Id: <20211118110814.2568-5-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 +------- arch/x86/kvm/vmx/vmx.c | 17 ++++++++++------- 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index dc5041ad860f..b03df82fc57e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3033,7 +3033,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr3, cr4; + unsigned long cr4; bool vm_fail; if (!nested_early_check) @@ -3056,12 +3056,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) */ vmcs_writel(GUEST_RFLAGS, 0); - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { - vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->host_state.cr3 = cr3; - } - cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 73cb1322d7f1..1358770ca715 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1103,6 +1103,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); #endif + unsigned long cr3; unsigned long fs_base, gs_base; u16 fs_sel, gs_sel; int i; @@ -1167,6 +1168,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #endif vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); + + /* Host CR3 including its PCID is stable when guest state is loaded. */ + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != host_state->cr3)) { + vmcs_writel(HOST_CR3, cr3); + host_state->cr3 = cr3; + } + vmx->guest_state_loaded = true; } @@ -6612,7 +6621,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr3, cr4; + unsigned long cr4; /* Record the guest's net vcpu time for enforced NMI injections. 
*/ if (unlikely(!enable_vnmi && @@ -6657,12 +6666,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); vcpu->arch.regs_dirty = 0; - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { - vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->host_state.cr3 = cr3; - } - cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); -- cgit v1.2.1 From ed07ef5a66e486215bf3f51037c44d10fc9a5a1c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 18 Nov 2021 19:08:05 +0800 Subject: KVM: VMX: Use kvm_set_msr_common() for MSR_IA32_TSC_ADJUST in the default way MSR_IA32_TSC_ADJUST can be left to the default way which also uses kvm_set_msr_common(). Signed-off-by: Lai Jiangshan Message-Id: <20211118110814.2568-7-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1358770ca715..00f8203da726 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2104,9 +2104,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } ret = kvm_set_msr_common(vcpu, msr_info); break; - case MSR_IA32_TSC_ADJUST: - ret = kvm_set_msr_common(vcpu, msr_info); - break; case MSR_IA32_MCG_EXT_CTL: if ((!msr_info->host_initiated && !(to_vmx(vcpu)->msr_ia32_feature_control & -- cgit v1.2.1 From fe26f91d30fb129a56f68a880a03ad49d127c07a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 18 Nov 2021 19:08:06 +0800 Subject: KVM: VMX: Change comments about vmx_get_msr() The variable names were changed in the code; update the comment to match. Signed-off-by: Lai Jiangshan Message-Id: <20211118110814.2568-8-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 00f8203da726..041ab537e160 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1757,7 +1757,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) } /* - * Reads an msr value (of 'msr_index') into 'pdata'. + * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -- cgit v1.2.1 From 1af4a1199a41f80b4a792ae76d4c79a01d0b5d41 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 18 Nov 2021 19:08:07 +0800 Subject: KVM: SVM: Rename get_max_npt_level() to get_npt_level() It returns the only proper NPT level, so the "max" in the name is not appropriate. Signed-off-by: Lai Jiangshan Message-Id: <20211118110814.2568-9-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index de872098071d..7252dc389812 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -269,7 +269,7 @@ u32 svm_msrpm_offset(u32 msr) #define MAX_INST_SIZE 15 -static int get_max_npt_level(void) +static int get_npt_level(void) { #ifdef CONFIG_X86_64 return pgtable_l5_enabled() ?
PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; @@ -1037,9 +1037,9 @@ static __init int svm_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; - /* Force VM NPT level equal to the host's max NPT level */ - kvm_configure_mmu(npt_enabled, get_max_npt_level(), - get_max_npt_level(), PG_LEVEL_1G); + /* Force VM NPT level equal to the host's paging level */ + kvm_configure_mmu(npt_enabled, get_npt_level(), + get_npt_level(), PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); /* Note, SEV setup consumes npt_enabled. */ -- cgit v1.2.1 From 58356767107a6b02c9277810809aca8a36c473d7 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 18 Nov 2021 19:08:08 +0800 Subject: KVM: SVM: Allocate sd->save_area with __GFP_ZERO And remove clear_page() on it. Signed-off-by: Lai Jiangshan Message-Id: <20211118110814.2568-10-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7252dc389812..581b9dfe63f2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -589,12 +589,10 @@ static int svm_cpu_init(int cpu) if (!sd) return ret; sd->cpu = cpu; - sd->save_area = alloc_page(GFP_KERNEL); + sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!sd->save_area) goto free_cpu_data; - clear_page(page_address(sd->save_area)); - ret = sev_cpu_init(sd); if (ret) goto free_save_area; -- cgit v1.2.1 From 27f4fca29f9cfd740dcb7b2bc577bcfd02ae367d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 18 Nov 2021 19:08:09 +0800 Subject: KVM: X86: Skip allocating pae_root for vcpu->arch.guest_mmu when !tdp_enabled It is never used. Signed-off-by: Lai Jiangshan Message-Id: <20211118110814.2568-11-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ede63912b1ac..644a5cb4cad9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5514,6 +5514,10 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */ + if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu) + return 0; + /* * When using PAE paging, the four PDPTEs are treated as 'root' pages, * while the PDP table is a per-vCPU construct that's allocated at MMU -- cgit v1.2.1 From 84432316cd9aec6923bb3368e86d8f6166b60067 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 18 Nov 2021 19:08:10 +0800 Subject: KVM: X86: Fix comment in __kvm_mmu_create() The allocation of special roots is moved to mmu_alloc_special_roots(). Signed-off-by: Lai Jiangshan Message-Id: <20211118110814.2568-12-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 644a5cb4cad9..740e9f52d324 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5527,7 +5527,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) * generally doesn't use PAE paging and can skip allocating the PDP * table. The main exception, handled here, is SVM's 32-bit NPT. 
The * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit - * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots(). + * KVM; that horror is handled on-demand by mmu_alloc_special_roots(). */ if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) return 0; -- cgit v1.2.1 From 41e35604eaff2266ba8523787ebe99c5ca4c4045 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 18 Nov 2021 19:08:12 +0800 Subject: KVM: X86: Remove useless code to set role.gpte_is_8_bytes when role.direct role.gpte_is_8_bytes is unused when role.direct; there is no point in changing a bit in the role, the value that was set when the MMU is initialized is just fine. Signed-off-by: Lai Jiangshan Message-Id: <20211118110814.2568-14-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 740e9f52d324..c1b6ade6b4a7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2086,8 +2086,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, role = vcpu->arch.mmu->mmu_role.base; role.level = level; role.direct = direct; - if (role.direct) - role.gpte_is_8_bytes = true; role.access = access; if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; -- cgit v1.2.1 From b46a13cb7ea1137b2e01dfaafcacd5cd79db8390 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 18 Nov 2021 19:08:13 +0800 Subject: KVM: X86: Calculate quadrant when !role.gpte_is_8_bytes role.quadrant is only valid when the gpte size is 4 bytes, and it only needs to be calculated when the gpte size is 4 bytes. Although "vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL" also means the gpte size is 4 bytes, using "!role.gpte_is_8_bytes" is clearer. Signed-off-by: Lai Jiangshan Message-Id: <20211118110814.2568-15-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c1b6ade6b4a7..c22ed56ece25 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2087,7 +2087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, role.level = level; role.direct = direct; role.access = access; - if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { + if (!direct_mmu && !role.gpte_is_8_bytes) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; -- cgit v1.2.1 From 1f5a21ee8400ccc82c67dc8c153301f694a04099 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 24 Nov 2021 20:20:44 +0800 Subject: KVM: X86: Add parameter struct kvm_mmu *mmu into mmu->gva_to_gpa() mmu->gva_to_gpa() has no "struct kvm_mmu *mmu" parameter, so an extra FNAME(gva_to_gpa_nested) is needed. Adding the parameter simplifies the code, and the new parameter makes it explicit that the walk is upon vcpu->arch.walk_mmu for a gva and upon vcpu->arch.mmu for an L2 gpa in translate_nested_gpa().
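The refactoring pattern, reduced to a toy example; struct mmu and the one-line translation are illustrative only, not the real page-table walker:

#include <stdint.h>
#include <stdio.h>

struct mmu { uint64_t base; };          /* toy stand-in for struct kvm_mmu */

/* One translator that takes its MMU explicitly can serve both the
 * ordinary and the nested walk, replacing two near-duplicate
 * functions that each hard-coded a different implicit MMU. */
static uint64_t gva_to_gpa(const struct mmu *mmu, uint64_t gva)
{
        return mmu->base + (gva & 0xfff);       /* toy translation */
}

int main(void)
{
        struct mmu walk_mmu = { 0x10000 };      /* cf. vcpu->arch.walk_mmu */
        struct mmu nested_mmu = { 0x20000 };    /* cf. vcpu->arch.nested_mmu */
        printf("%#llx %#llx\n",
               (unsigned long long)gva_to_gpa(&walk_mmu, 0x123),
               (unsigned long long)gva_to_gpa(&nested_mmu, 0x123));
        return 0;
}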
Signed-off-by: Lai Jiangshan Message-Id: <20211124122055.64424-3-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 24 ++++++++---------------- arch/x86/kvm/mmu/paging_tmpl.h | 41 +++++------------------------------------ arch/x86/kvm/x86.c | 39 +++++++++++++++++++++++++-------------- 3 files changed, 38 insertions(+), 66 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c22ed56ece25..62e74296857f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3732,21 +3732,13 @@ void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu) kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); } -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr, - u32 access, struct x86_exception *exception) +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gpa_t vaddr, u32 access, + struct x86_exception *exception) { if (exception) exception->error_code = 0; - return vaddr; -} - -static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr, - u32 access, - struct x86_exception *exception) -{ - if (exception) - exception->error_code = 0; - return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); + return mmu->translate_gpa(vcpu, vaddr, access, exception); } static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) @@ -5001,13 +4993,13 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) * the gva_to_gpa functions between mmu and nested_mmu are swapped. */ if (!is_paging(vcpu)) - g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; + g_context->gva_to_gpa = nonpaging_gva_to_gpa; else if (is_long_mode(vcpu)) - g_context->gva_to_gpa = paging64_gva_to_gpa_nested; + g_context->gva_to_gpa = paging64_gva_to_gpa; else if (is_pae(vcpu)) - g_context->gva_to_gpa = paging64_gva_to_gpa_nested; + g_context->gva_to_gpa = paging64_gva_to_gpa; else - g_context->gva_to_gpa = paging32_gva_to_gpa_nested; + g_context->gva_to_gpa = paging32_gva_to_gpa; reset_guest_paging_metadata(vcpu, g_context); } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 708a5d297fe1..d16fb5eda02c 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -547,16 +547,6 @@ static int FNAME(walk_addr)(struct guest_walker *walker, access); } -#if PTTYPE != PTTYPE_EPT -static int FNAME(walk_addr_nested)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, - u32 access) -{ - return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, - addr, access); -} -#endif - static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, pt_element_t gpte, bool no_dirty_log) @@ -1000,50 +990,29 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) } /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access, +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gpa_t addr, u32 access, struct x86_exception *exception) { struct guest_walker walker; gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, addr, access); - - if (r) { - gpa = gfn_to_gpa(walker.gfn); - gpa |= addr & ~PAGE_MASK; - } else if (exception) - *exception = walker.fault; - - return gpa; -} - -#if PTTYPE != PTTYPE_EPT -/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. 
*/ -static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, - u32 access, - struct x86_exception *exception) -{ - struct guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; - int r; - #ifndef CONFIG_X86_64 /* A 64-bit GVA should be impossible on 32-bit KVM. */ - WARN_ON_ONCE(vaddr >> 32); + WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu); #endif - r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); + r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access); if (r) { gpa = gfn_to_gpa(walker.gfn); - gpa |= vaddr & ~PAGE_MASK; + gpa |= addr & ~PAGE_MASK; } else if (exception) *exception = walker.fault; return gpa; } -#endif /* * Using the cached information from sp->gfns is safe because: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d4f1dff258cc..7a7bad7ec3a7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6506,13 +6506,14 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.mmu; gpa_t t_gpa; BUG_ON(!mmu_is_nested(vcpu)); /* NPT walks are always user-walks */ access |= PFERR_USER_MASK; - t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception); + t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception); return t_gpa; } @@ -6520,25 +6521,31 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? 
PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write); @@ -6546,19 +6553,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write); gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception); + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception); } static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, struct kvm_vcpu *vcpu, u32 access, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, - exception); + gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -6586,13 +6595,14 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; unsigned offset; int ret; /* Inline kvm_read_guest_virt_helper for speed. */ - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK, - exception); + gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK, + exception); if (unlikely(gpa == UNMAPPED_GVA)) return X86EMUL_PROPAGATE_FAULT; @@ -6651,13 +6661,12 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes struct kvm_vcpu *vcpu, u32 access, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, - access, - exception); + gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -6744,6 +6753,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); @@ -6761,7 +6771,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, return 1; } - *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); if (*gpa == UNMAPPED_GVA) return -1; @@ -12312,12 +12322,13 @@ EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value); void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; struct x86_exception fault; u32 access = error_code & (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK); if (!(error_code & PFERR_PRESENT_MASK) || - vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) { + mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != UNMAPPED_GVA) { /* * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page * tables probably do not match the TLB. 
Just proceed -- cgit v1.2.1 From c59a0f57fa32cfa77643daa17a8e55377cc9fe0b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 24 Nov 2021 20:20:45 +0800 Subject: KVM: X86: Remove mmu->translate_gpa Reduce an indirect function call (retpoline) and some initialization code. Signed-off-by: Lai Jiangshan Message-Id: <20211124122055.64424-4-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 13 +++++++++++++ arch/x86/kvm/mmu/mmu.c | 11 +---------- arch/x86/kvm/mmu/paging_tmpl.h | 7 +++---- arch/x86/kvm/x86.c | 4 ++-- 4 files changed, 19 insertions(+), 16 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 9ae6168d381e..97e13c2988b3 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -351,4 +351,17 @@ static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count) { atomic64_add(count, &kvm->stat.pages[level - 1]); } + +gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception); + +static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu, + gpa_t gpa, u32 access, + struct x86_exception *exception) +{ + if (mmu != &vcpu->arch.nested_mmu) + return gpa; + return translate_nested_gpa(vcpu, gpa, access, exception); +} #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 62e74296857f..327da013ab64 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -335,12 +335,6 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) return likely(kvm_gen == spte_gen); } -static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, - struct x86_exception *exception) -{ - return gpa; -} - static int is_cpuid_PSE36(void) { return 1; @@ -3738,7 +3732,7 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, { if (exception) exception->error_code = 0; - return mmu->translate_gpa(vcpu, vaddr, access, exception); + return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception); } static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) @@ -5500,7 +5494,6 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) mmu->root_hpa = INVALID_PAGE; mmu->root_pgd = 0; - mmu->translate_gpa = translate_gpa; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; @@ -5562,8 +5555,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; - vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; - ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); if (ret) return ret; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index d16fb5eda02c..5b5bdac97c7b 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -403,9 +403,8 @@ retry_walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), - nested_access, - &walker->fault); + real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn), + nested_access, &walker->fault); /* * FIXME: This can happen if emulation (for of an INS/OUTS @@ -467,7 +466,7 @@ retry_walk: if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault); + real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); if (real_gpa
== UNMAPPED_GVA) return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7a7bad7ec3a7..432c6e3cb746 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -810,8 +810,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated * to an L1 GPA. */ - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(pdpt_gfn), - PFERR_USER_MASK | PFERR_WRITE_MASK, NULL); + real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn), + PFERR_USER_MASK | PFERR_WRITE_MASK, NULL); if (real_gpa == UNMAPPED_GVA) return 0; -- cgit v1.2.1 From 84ea5c09a66d19eff2eaebffafa667e6bf9a7905 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 24 Nov 2021 20:20:48 +0800 Subject: KVM: X86: Add huge_page_level to __reset_rsvds_bits_mask_ept() Bit 7 on pte depends on the level of supported large page. Signed-off-by: Lai Jiangshan Message-Id: <20211124122055.64424-7-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 327da013ab64..ad7e3c5903e7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4358,22 +4358,28 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, - u64 pa_bits_rsvd, bool execonly) + u64 pa_bits_rsvd, bool execonly, int huge_page_level) { u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); + u64 large_1g_rsvd = 0, large_2m_rsvd = 0; u64 bad_mt_xwr; + if (huge_page_level < PG_LEVEL_1G) + large_1g_rsvd = rsvd_bits(7, 7); + if (huge_page_level < PG_LEVEL_2M) + large_2m_rsvd = rsvd_bits(7, 7); + rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); - rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6); - rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6); + rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd; + rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd; rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* large page */ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; - rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29); - rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20); + rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd; + rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd; rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ @@ -4389,10 +4395,11 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, } static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, bool execonly) + struct kvm_mmu *context, bool execonly, int huge_page_level) { __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, - vcpu->arch.reserved_gpa_bits, execonly); + vcpu->arch.reserved_gpa_bits, execonly, + huge_page_level); } static inline u64 reserved_hpa_bits(void) @@ -4468,7 +4475,8 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, false, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, - 
reserved_hpa_bits(), false); + reserved_hpa_bits(), false, + max_huge_page_level); if (!shadow_me_mask) return; @@ -4488,7 +4496,8 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, - reserved_hpa_bits(), execonly); + reserved_hpa_bits(), execonly, + max_huge_page_level); } #define BYTE_MASK(access) \ @@ -4923,7 +4932,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, update_permission_bitmask(context, true); context->pkru_mask = 0; - reset_rsvds_bits_mask_ept(vcpu, context, execonly); + reset_rsvds_bits_mask_ept(vcpu, context, execonly, max_huge_page_level); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); -- cgit v1.2.1 From cc022ae144c1ce318643f821461295337280a1c0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 24 Nov 2021 20:20:49 +0800 Subject: KVM: X86: Add parameter huge_page_level to kvm_init_shadow_ept_mmu() The level of supported large page on nEPT affects the rsvds_bits_mask. Signed-off-by: Lai Jiangshan Message-Id: <20211124122055.64424-8-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 3 ++- arch/x86/kvm/mmu/mmu.c | 5 +++-- arch/x86/kvm/vmx/capabilities.h | 9 +++++++++ arch/x86/kvm/vmx/nested.c | 8 +++++--- 4 files changed, 19 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 97e13c2988b3..e9fbb2c8bbe2 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -71,7 +71,8 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, unsigned long cr4, u64 efer, gpa_t nested_cr3); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, - bool accessed_dirty, gpa_t new_eptp); + int huge_page_level, bool accessed_dirty, + gpa_t new_eptp); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ad7e3c5903e7..41613963a455 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4905,7 +4905,8 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, } void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, - bool accessed_dirty, gpa_t new_eptp) + int huge_page_level, bool accessed_dirty, + gpa_t new_eptp) { struct kvm_mmu *context = &vcpu->arch.guest_mmu; u8 level = vmx_eptp_page_walk_level(new_eptp); @@ -4932,7 +4933,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, update_permission_bitmask(context, true); context->pkru_mask = 0; - reset_rsvds_bits_mask_ept(vcpu, context, execonly, max_huge_page_level); + reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 4705ad55abb5..c8029b7845b6 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -312,6 +312,15 @@ static inline bool cpu_has_vmx_ept_1g_page(void) return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; } +static inline int ept_caps_to_lpage_level(u32 ept_caps) +{ + if (ept_caps & VMX_EPT_1GB_PAGE_BIT) + return PG_LEVEL_1G; + if (ept_caps & VMX_EPT_2MB_PAGE_BIT) + return PG_LEVEL_2M; + return PG_LEVEL_4K; +} + static 
inline bool cpu_has_vmx_ept_ad_bits(void) { return vmx_capability.ept & VMX_EPT_AD_BIT; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b03df82fc57e..e6230cd55b44 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -397,9 +397,11 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) { - kvm_init_shadow_ept_mmu(vcpu, - to_vmx(vcpu)->nested.msrs.ept_caps & - VMX_EPT_EXECUTE_ONLY_BIT, + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; + int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps); + + kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, nested_ept_ad_enabled(vcpu), nested_ept_get_eptp(vcpu)); } -- cgit v1.2.1 From f8cd457f061d3ca79518f9061c4205590348a0a1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 24 Nov 2021 20:20:50 +0800 Subject: KVM: VMX: Use ept_caps_to_lpage_level() in hardware_setup() Using ept_caps_to_lpage_level() is simpler. Signed-off-by: Lai Jiangshan Message-Id: <20211124122055.64424-9-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 041ab537e160..cee69d895370 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7700,7 +7700,7 @@ static __init int hardware_setup(void) { unsigned long host_bndcfgs; struct desc_ptr dt; - int r, ept_lpage_level; + int r; store_idt(&dt); host_idt_base = dt.address; @@ -7797,16 +7797,8 @@ static __init int hardware_setup(void) kvm_mmu_set_ept_masks(enable_ept_ad_bits, cpu_has_vmx_ept_execute_only()); - if (!enable_ept) - ept_lpage_level = 0; - else if (cpu_has_vmx_ept_1g_page()) - ept_lpage_level = PG_LEVEL_1G; - else if (cpu_has_vmx_ept_2m_page()) - ept_lpage_level = PG_LEVEL_2M; - else - ept_lpage_level = PG_LEVEL_4K; kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), - ept_lpage_level); + ept_caps_to_lpage_level(vmx_capability.ept)); /* * Only enable PML when hardware supports PML feature, and both EPT -- cgit v1.2.1 From bb3b394d35e80d7a58ce015191e4960a13f54ba5 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 24 Nov 2021 20:20:51 +0800 Subject: KVM: X86: Rename gpte_is_8_bytes to has_4_byte_gpte and invert the direction This bit is very close to meaning "role.quadrant is not in use", except that it is false also when the MMU is mapping guest physical addresses directly. In that case, role.quadrant is indeed not in use, but there are no guest PTEs at all. Changing the name and direction of the bit removes the special case, since a guest with paging disabled, or one whose paging structures are never consulted (as is the case for two-dimensional paging), does not have to deal with 4-byte guest PTEs.
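As an illustrative sketch (not part of the patch; use_quadrant() is a hypothetical stand-in for the quadrant computation shown in the mmu.c hunk below), the rename turns a two-condition special case into a single self-describing test:

    /* before: 4-byte GPTEs only when shadowing guest page tables */
    if (!direct_mmu && !role.gpte_is_8_bytes)
            use_quadrant();                 /* hypothetical helper */

    /* after: the bit itself encodes "the guest uses 4-byte PTEs"
     * and is false for direct MMUs by construction */
    if (role.has_4_byte_gpte)
            use_quadrant();

    /* the bit is computed as: paging enabled and PAE disabled */
    role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs);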
Suggested-by: Paolo Bonzini Signed-off-by: Lai Jiangshan Message-Id: <20211124122055.64424-10-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 12 ++++++------ arch/x86/kvm/mmu/mmutrace.h | 2 +- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 41613963a455..1ccee4d17481 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2081,7 +2081,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, role.level = level; role.direct = direct; role.access = access; - if (!direct_mmu && !role.gpte_is_8_bytes) { + if (role.has_4_byte_gpte) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; @@ -4746,7 +4746,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, role.base.ad_disabled = (shadow_accessed_mask == 0); role.base.level = kvm_mmu_get_tdp_level(vcpu); role.base.direct = true; - role.base.gpte_is_8_bytes = true; + role.base.has_4_byte_gpte = false; return role; } @@ -4791,7 +4791,7 @@ kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs); role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs); - role.base.gpte_is_8_bytes = ____is_cr0_pg(regs) && ____is_cr4_pae(regs); + role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs); return role; } @@ -4890,7 +4890,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm; role.base.level = level; - role.base.gpte_is_8_bytes = true; + role.base.has_4_byte_gpte = false; role.base.direct = false; role.base.ad_disabled = !accessed_dirty; role.base.guest_mode = true; @@ -5168,7 +5168,7 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, gpa, bytes, sp->role.word); offset = offset_in_page(gpa); - pte_size = sp->role.gpte_is_8_bytes ? 8 : 4; + pte_size = sp->role.has_4_byte_gpte ? 4 : 8; /* * Sometimes, the OS only writes the last one bytes to update status @@ -5192,7 +5192,7 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) page_offset = offset_in_page(gpa); level = sp->role.level; *nspte = 1; - if (!sp->role.gpte_is_8_bytes) { + if (sp->role.has_4_byte_gpte) { page_offset <<= 1; /* 32->64 */ /* * A 32-bit pde maps 4MB while the shadow pdes map diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index b8151bbca36a..de5e8e4e1aa7 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -35,7 +35,7 @@ " %snxe %sad root %u %s%c", \ __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ - role.gpte_is_8_bytes ? 8 : 4, \ + role.has_4_byte_gpte ? 4 : 8, \ role.quadrant, \ role.direct ? 
" direct" : "", \ access_str[role.access], \ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1db8496259ad..b69e47e68307 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -165,7 +165,7 @@ static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, role = vcpu->arch.mmu->mmu_role.base; role.level = level; role.direct = true; - role.gpte_is_8_bytes = true; + role.has_4_byte_gpte = false; role.access = ACC_ALL; role.ad_disabled = !shadow_accessed_mask; -- cgit v1.2.1 From 2df4a5eb6c5a7aab471dc0b279efe0e49194ce77 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 24 Nov 2021 20:20:52 +0800 Subject: KVM: X86: Remove mmu parameter from load_pdptrs() It uses vcpu->arch.walk_mmu always; nested EPT does not have PDPTRs, and nested NPT treats them like all other non-leaf page table levels instead of caching them. Signed-off-by: Lai Jiangshan Message-Id: <20211124122055.64424-11-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 4 ++-- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/nested.c | 4 ++-- arch/x86/kvm/x86.c | 12 ++++++------ 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 2d0a7dd7e2a6..cf206855ebf0 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -461,7 +461,7 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return -EINVAL; if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) && - CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) + CC(!load_pdptrs(vcpu, cr3))) return -EINVAL; if (!nested_npt) @@ -1517,7 +1517,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) * the guest CR3 might be restored prior to setting the nested * state which can lead to a load of wrong PDPTRs. */ - if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))) + if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) return false; if (!nested_svm_vmrun_msrpm(svm)) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 581b9dfe63f2..208566f63bce 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1605,7 +1605,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) * it is always updated per SDM when moving to CRs. */ if (npt_enabled) - load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); + load_pdptrs(vcpu, kvm_read_cr3(vcpu)); break; default: KVM_BUG_ON(1, vcpu->kvm); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e6230cd55b44..08e785871985 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1103,7 +1103,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, * must not be dereferenced. */ if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && - CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { + CC(!load_pdptrs(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_PDPTE; return -EINVAL; } @@ -3147,7 +3147,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) * the guest CR3 might be restored prior to setting the nested * state which can lead to a load of wrong PDPTRs. 
*/ - if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))) + if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) return false; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 432c6e3cb746..2f232eb0c989 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -798,8 +798,9 @@ static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) /* * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. */ -int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; gpa_t real_gpa; int i; @@ -890,7 +891,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) #endif if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) && is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) && - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu))) + !load_pdptrs(vcpu, kvm_read_cr3(vcpu))) return 1; if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) @@ -1064,8 +1065,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS) - && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, - kvm_read_cr3(vcpu))) + && !load_pdptrs(vcpu, kvm_read_cr3(vcpu))) return 1; if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { @@ -1154,7 +1154,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) return 1; - if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3)) return 1; if (cr3 != kvm_read_cr3(vcpu)) @@ -10597,7 +10597,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, if (update_pdptrs) { idx = srcu_read_lock(&vcpu->kvm->srcu); if (is_pae_paging(vcpu)) { - load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); + load_pdptrs(vcpu, kvm_read_cr3(vcpu)); *mmu_reset_needed = 1; } srcu_read_unlock(&vcpu->kvm->srcu, idx); -- cgit v1.2.1 From ce5977b181c1613072eafbc7546bcb6c463ea68c Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 4 Nov 2021 19:56:13 +0800 Subject: KVM: x86: don't print when fail to read/write pv eoi memory If guest gives MSR_KVM_PV_EOI_EN a wrong value, this printk() will be trigged, and kernel log is spammed with the useless message Fixes: 0d88800d5472 ("kvm: x86: ioapic and apic debug macros cleanup") Reported-by: Vitaly Kuznetsov Reviewed-by: Vitaly Kuznetsov Signed-off-by: Li RongQing Cc: stable@kernel.org Message-Id: <1636026974-50555-1-git-send-email-lirongqing@baidu.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bbac8477b3ec..8f4d872f3ffa 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -676,31 +676,25 @@ static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) { u8 val; - if (pv_eoi_get_user(vcpu, &val) < 0) { - printk(KERN_WARNING "Can't read EOI MSR value: 0x%llx\n", - (unsigned long long)vcpu->arch.pv_eoi.msr_val); + if (pv_eoi_get_user(vcpu, &val) < 0) return false; - } + return val & KVM_PV_EOI_ENABLED; } static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) { - if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { - printk(KERN_WARNING "Can't set EOI MSR value: 0x%llx\n", - (unsigned long 
long)vcpu->arch.pv_eoi.msr_val); + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) return; - } + __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) { - if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { - printk(KERN_WARNING "Can't clear EOI MSR value: 0x%llx\n", - (unsigned long long)vcpu->arch.pv_eoi.msr_val); + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) return; - } + __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -- cgit v1.2.1 From 51b1209c6125273c345aee6767ffaccb765e5e36 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 4 Nov 2021 19:56:14 +0800 Subject: KVM: Clear pv eoi pending bit only when it is set Merge pv_eoi_get_pending() and pv_eoi_clr_pending() into a single function, pv_eoi_test_and_clr_pending(), which returns and clears the value of the pending bit. This makes it possible to clear the pending bit only if the guest had set it, and otherwise skip the call to pv_eoi_put_user(). This can save up to 300 nsec on AMD EPYC processors. Suggested-by: Vitaly Kuznetsov Suggested-by: Paolo Bonzini Signed-off-by: Li RongQing Message-Id: <1636026974-50555-2-git-send-email-lirongqing@baidu.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8f4d872f3ffa..40270d7bc597 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -673,15 +673,6 @@ static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; } -static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) -{ - u8 val; - if (pv_eoi_get_user(vcpu, &val) < 0) - return false; - - return val & KVM_PV_EOI_ENABLED; -} - static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) { if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) @@ -690,12 +681,26 @@ static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) +static bool pv_eoi_test_and_clr_pending(struct kvm_vcpu *vcpu) { - if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) - return; + u8 val; + + if (pv_eoi_get_user(vcpu, &val) < 0) + return false; + + val &= KVM_PV_EOI_ENABLED; + + if (val && pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) + return false; + /* + * Clear pending bit in any case: it will be set again on vmentry. + * While this might not be ideal from performance point of view, + * this makes sure pv eoi is only enabled when we know it's safe. + */ __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); + + return val; } static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) @@ -2671,7 +2676,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, struct kvm_lapic *apic) { - bool pending; int vector; /* * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host @@ -2685,14 +2689,8 @@ static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, * -> host enabled PV EOI, guest executed EOI. */ BUG_ON(!pv_eoi_enabled(vcpu)); - pending = pv_eoi_get_pending(vcpu); - /* - * Clear pending bit in any case: it will be set again on vmentry. - * While this might not be ideal from performance point of view, - * this makes sure pv eoi is only enabled when we know it's safe.
- */ - pv_eoi_clr_pending(vcpu); - if (pending) + + if (pv_eoi_test_and_clr_pending(vcpu)) return; vector = apic_set_eoi(apic); trace_kvm_pv_eoi(apic, vector); -- cgit v1.2.1 From 5e854864ee4384736f27a986633bae21731a4e4e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 2 Nov 2021 17:15:29 +0800 Subject: KVM: x86: Handle 32-bit wrap of EIP for EMULTYPE_SKIP with flat code seg Truncate the new EIP to a 32-bit value when handling EMULTYPE_SKIP as the decode phase does not truncate _eip. Wrapping the 32-bit boundary is legal if and only if CS is a flat code segment, but that check is implicitly handled in the form of limit checks in the decode phase. Opportunistically prepare for a future fix by storing the result of any truncation in "eip" instead of "_eip". Fixes: 1957aa63be53 ("KVM: VMX: Handle single-step #DB for EMULTYPE_SKIP on EPT misconfig") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Message-Id: <093eabb1eab2965201c9b018373baf26ff256d85.1635842679.git.houwenlong93@linux.alibaba.com> --- arch/x86/kvm/x86.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2f232eb0c989..d0cc4051ee26 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8139,7 +8139,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * updating interruptibility state and injecting single-step #DBs. */ if (emulation_type & EMULTYPE_SKIP) { - kvm_rip_write(vcpu, ctxt->_eip); + if (ctxt->mode != X86EMUL_MODE_PROT64) + ctxt->eip = (u32)ctxt->_eip; + else + ctxt->eip = ctxt->_eip; + + kvm_rip_write(vcpu, ctxt->eip); if (ctxt->eflags & X86_EFLAGS_RF) kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); return 1; -- cgit v1.2.1 From 906fa90416fdb703467926ca4f6f55438cd7ea82 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Tue, 2 Nov 2021 17:15:30 +0800 Subject: KVM: x86: Add an emulation type to handle completion of user exits The next patch will use kvm_emulate_instruction() with EMULTYPE_SKIP in the complete_userspace_io callback to fix a problem in msr access emulation. However, EMULTYPE_SKIP only updates RIP; other steps, such as updating interruptibility state and injecting single-step #DBs, would still need to be done in the callback. Since the emulator also does those things after x86_emulate_insn(), add a new emulation type to pair with EMULTYPE_SKIP to do those things for completion of user exits within the emulator. Suggested-by: Sean Christopherson Signed-off-by: Hou Wenlong Signed-off-by: Paolo Bonzini Message-Id: <8f8c8e268b65f31d55c2881a4b30670946ecfa0d.1635842679.git.houwenlong93@linux.alibaba.com> --- arch/x86/kvm/x86.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d0cc4051ee26..4464aa7931cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8134,9 +8134,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } /* - * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks - * for kvm_skip_emulated_instruction(). The caller is responsible for - * updating interruptibility state and injecting single-step #DBs. + * EMULTYPE_SKIP without EMULTYPE_COMPLETE_USER_EXIT is intended for + * use *only* by vendor callbacks for kvm_skip_emulated_instruction(). + * The caller is responsible for updating interruptibility state and + * injecting single-step #DBs.
*/ if (emulation_type & EMULTYPE_SKIP) { if (ctxt->mode != X86EMUL_MODE_PROT64) @@ -8144,6 +8145,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, else ctxt->eip = ctxt->_eip; + if (emulation_type & EMULTYPE_COMPLETE_USER_EXIT) { + r = 1; + goto writeback; + } + kvm_rip_write(vcpu, ctxt->eip); if (ctxt->eflags & X86_EFLAGS_RF) kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); @@ -8213,6 +8219,7 @@ restart: else r = 1; +writeback: if (writeback) { unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); -- cgit v1.2.1 From d2f7d49826ae62b8b5c9829292e84861d2bda2b6 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Tue, 2 Nov 2021 17:15:31 +0800 Subject: KVM: x86: Use different callback if msr access comes from the emulator If msr access triggers an exit to userspace, the complete_userspace_io callback would skip instruction by vendor callback for kvm_skip_emulated_instruction(). However, when msr access comes from the emulator, e.g. if kvm.force_emulation_prefix is enabled and the guest uses rdmsr/wrmsr with kvm prefix, VM_EXIT_INSTRUCTION_LEN in vmcs is invalid and kvm_emulate_instruction() should be used to skip instruction instead. As Sean noted, unlike the previous case, there's no #UD if unrestricted guest is disabled and the guest accesses an MSR in Big RM. So the correct way to fix this is to attach a different callback when the msr access comes from the emulator. Suggested-by: Sean Christopherson Signed-off-by: Hou Wenlong Signed-off-by: Paolo Bonzini Message-Id: <34208da8f51580a06e45afefac95afea0e3f96e3.1635842679.git.houwenlong93@linux.alibaba.com> --- arch/x86/kvm/x86.c | 85 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 36 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4464aa7931cd..16f7d20ed19c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -118,6 +118,7 @@ static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); +static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu); static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); @@ -710,6 +711,17 @@ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) } EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); +static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err) +{ + if (err) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP | + EMULTYPE_COMPLETE_USER_EXIT); +} + void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { ++vcpu->stat.pf_guest; @@ -1815,22 +1827,36 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_msr); -static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) +static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu) { - int err = vcpu->run->msr.error; - if (!err) { + if (!vcpu->run->msr.error) { kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); } +} - return static_call(kvm_x86_complete_emulated_msr)(vcpu, err); +static int complete_emulated_msr_access(struct kvm_vcpu *vcpu) +{ + return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error); } -static int complete_emulated_wrmsr(struct kvm_vcpu 
*vcpu) +static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) +{ + complete_userspace_rdmsr(vcpu); + return complete_emulated_msr_access(vcpu); +} + +static int complete_fast_msr_access(struct kvm_vcpu *vcpu) { return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error); } +static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) +{ + complete_userspace_rdmsr(vcpu); + return complete_fast_msr_access(vcpu); +} + static u64 kvm_msr_reason(int r) { switch (r) { @@ -1865,18 +1891,6 @@ static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, return 1; } -static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r) -{ - return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0, - complete_emulated_rdmsr, r); -} - -static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r) -{ - return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data, - complete_emulated_wrmsr, r); -} - int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); @@ -1885,18 +1899,16 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) r = kvm_get_msr(vcpu, ecx, &data); - /* MSR read failed? See if we should ask user space */ - if (r && kvm_get_msr_user_space(vcpu, ecx, r)) { - /* Bounce to user space */ - return 0; - } - if (!r) { trace_kvm_msr_read(ecx, data); kvm_rax_write(vcpu, data & -1u); kvm_rdx_write(vcpu, (data >> 32) & -1u); } else { + /* MSR read failed? See if we should ask user space */ + if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0, + complete_fast_rdmsr, r)) + return 0; trace_kvm_msr_read_ex(ecx); } @@ -1912,19 +1924,18 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) r = kvm_set_msr(vcpu, ecx, data); - /* MSR write failed? See if we should ask user space */ - if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) - /* Bounce to user space */ - return 0; - - /* Signal all other negative errors to userspace */ - if (r < 0) - return r; - - if (!r) + if (!r) { trace_kvm_msr_write(ecx, data); - else + } else { + /* MSR write failed? See if we should ask user space */ + if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data, + complete_fast_msr_access, r)) + return 0; + /* Signal all other negative errors to userspace */ + if (r < 0) + return r; trace_kvm_msr_write_ex(ecx, data); + } return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); } @@ -7400,7 +7411,8 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, r = kvm_get_msr(vcpu, msr_index, pdata); - if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) { + if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0, + complete_emulated_rdmsr, r)) { /* Bounce to user space */ return X86EMUL_IO_NEEDED; } @@ -7416,7 +7428,8 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, r = kvm_set_msr(vcpu, msr_index, data); - if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) { + if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data, + complete_emulated_msr_access, r)) { /* Bounce to user space */ return X86EMUL_IO_NEEDED; } -- cgit v1.2.1 From adbfb12d4c4517a8adde23a7fc46538953d56eea Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Tue, 2 Nov 2021 17:15:32 +0800 Subject: KVM: x86: Exit to userspace if emulation prepared a completion callback em_rdmsr() and em_wrmsr() return X86EMUL_IO_NEEDED if MSR accesses required an exit to userspace. 
However, x86_emulate_insn() doesn't return X86EMUL_*, so x86_emulate_instruction() doesn't directly act on X86EMUL_IO_NEEDED; instead, it looks for other signals to differentiate between PIO, MMIO, etc. causing RDMSR/WRMSR emulation not to exit to userspace now. Nevertheless, if the userspace_msr_exit_test testcase in selftests is changed to test RDMSR/WRMSR with a forced emulation prefix, the test passes. What happens is that first userspace exit information is filled but the userspace exit does not happen. Because x86_emulate_instruction() returns 1, the guest retries the instruction---but this time RIP has already been adjusted past the forced emulation prefix, so the guest executes RDMSR/WRMSR and the userspace exit finally happens. Since the X86EMUL_IO_NEEDED path has provided a complete_userspace_io callback, x86_emulate_instruction() can just return 0 if the callback is not NULL. Then RDMSR/WRMSR instruction emulation will exit to userspace directly, without the RDMSR/WRMSR vmexit. Fixes: 1ae099540e8c7 ("KVM: x86: Allow deflecting unknown MSR accesses to user space") Signed-off-by: Hou Wenlong Signed-off-by: Paolo Bonzini Message-Id: <56f9df2ee5c05a81155e2be366c9dc1f7adc8817.1635842679.git.houwenlong93@linux.alibaba.com> --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 16f7d20ed19c..1aaf37e1bd0f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8227,6 +8227,9 @@ restart: writeback = false; r = 0; vcpu->arch.complete_userspace_io = complete_emulated_mmio; + } else if (vcpu->arch.complete_userspace_io) { + writeback = false; + r = 0; } else if (r == EMULATION_RESTART) goto restart; else -- cgit v1.2.1 From b84155c38076b36d625043a06a2f1c90bde62903 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 29 Nov 2021 10:47:02 +0100 Subject: KVM: VMX: Introduce vmx_msr_bitmap_l01_changed() helper In preparation to enabling 'Enlightened MSR Bitmap' feature for Hyper-V guests move MSR bitmap update tracking to a dedicated helper. Note: vmx_msr_bitmap_l01_changed() is called when MSR bitmap might be updated. KVM doesn't check if the bit we're trying to set is already set (or the bit it's trying to clear is already cleared). Such situations should not be common and a few false positives should not be a problem. No functional change intended. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Message-Id: <20211129094704.326635-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9d7f26eaf05f..7ee50671191b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3688,6 +3688,17 @@ void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } +static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) +{ + /* + * When KVM is a nested hypervisor on top of Hyper-V and uses + * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR + * bitmap has changed. 
+ */ + if (static_branch_unlikely(&enable_evmcs)) + evmcs_touch_msr_bitmap(); +} + void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3696,8 +3707,7 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) if (!cpu_has_vmx_msr_bitmap()) return; - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); + vmx_msr_bitmap_l01_changed(vmx); /* * Mark the desired intercept state in shadow bitmap, this is needed @@ -3741,8 +3751,7 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) if (!cpu_has_vmx_msr_bitmap()) return; - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); + vmx_msr_bitmap_l01_changed(vmx); /* * Mark the desired intercept state in shadow bitmap, this is needed -- cgit v1.2.1 From ed2a4800ae9d491e4bf1b8b60b15001ce3b88fcd Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 29 Nov 2021 10:47:03 +0100 Subject: KVM: nVMX: Track whether changes in L0 require MSR bitmap for L2 to be rebuilt Introduce a flag to keep track of whether MSR bitmap for L2 needs to be rebuilt due to changes in MSR bitmap for L1 or switching to a different L2. This information will be used for Enlightened MSR Bitmap feature for Hyper-V guests. Note, setting msr_bitmap_changed to 'true' from set_current_vmptr() is not really needed for Enlightened MSR Bitmap as the feature can only be used in conjunction with Enlightened VMCS but let's keep tracking information complete, it's cheap and in the future similar PV feature can easily be implemented for KVM on KVM too. No functional change intended. Signed-off-by: Vitaly Kuznetsov Message-Id: <20211129094704.326635-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 9 ++++++++- arch/x86/kvm/vmx/vmx.c | 2 ++ arch/x86/kvm/vmx/vmx.h | 9 +++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 08e785871985..e7aa9576441c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -672,6 +672,8 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); + vmx->nested.force_msr_bitmap_recalc = false; + return true; } @@ -2029,10 +2031,13 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( * Clean fields data can't be used on VMLAUNCH and when we switch * between different L2 guests as KVM keeps a single VMCS12 per L1. 
- if (from_launch || evmcs_gpa_changed) + if (from_launch || evmcs_gpa_changed) { vmx->nested.hv_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + vmx->nested.force_msr_bitmap_recalc = true; + } + return EVMPTRLD_SUCCEEDED; } @@ -5260,6 +5265,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) vmx->nested.need_vmcs12_to_shadow_sync = true; } vmx->nested.dirty_vmcs12 = true; + vmx->nested.force_msr_bitmap_recalc = true; } /* Emulate the VMPTRLD instruction */ @@ -6395,6 +6401,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, goto error_guest_mode; vmx->nested.dirty_vmcs12 = true; + vmx->nested.force_msr_bitmap_recalc = true; ret = nested_vmx_enter_non_root_mode(vcpu, false); if (ret) goto error_guest_mode; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7ee50671191b..9c729d6d4e7d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3697,6 +3697,8 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) */ if (static_branch_unlikely(&enable_evmcs)) evmcs_touch_msr_bitmap(); + + vmx->nested.force_msr_bitmap_recalc = true; } void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f978699480e3..6c2c1aff1c3d 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -158,6 +158,15 @@ struct nested_vmx { bool need_vmcs12_to_shadow_sync; bool dirty_vmcs12; + /* + * Indicates whether MSR bitmap for L2 needs to be rebuilt due to + * changes in MSR bitmap for L1 or switching to a different L2. Note, + * this flag can only be used reliably in conjunction with a paravirt L1 + * which informs L0 whether any changes to MSR bitmap for L2 were done + * on its side. + */ + bool force_msr_bitmap_recalc; + /* * Indicates lazily loaded guest state has not yet been decached from * vmcs02. -- cgit v1.2.1 From 502d2bf5f2fd7c05adc2d4f057910bd5d4c4c63e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 29 Nov 2021 10:47:04 +0100 Subject: KVM: nVMX: Implement Enlightened MSR Bitmap feature Updating the MSR bitmap for L2 is not cheap and is rarely needed. The TLFS for Hyper-V offers an 'Enlightened MSR Bitmap' feature which allows the L1 hypervisor to inform L0 when it changes the MSR bitmap; this eliminates the need to examine L1's MSR bitmap for L2 every time the 'real' MSR bitmap for L2 is constructed. Use the 'vmx->nested.force_msr_bitmap_recalc' flag to implement the feature. Note, KVM already uses the 'Enlightened MSR bitmap' feature when it runs as a nested hypervisor on top of Hyper-V. The newly introduced feature is going to be used by Hyper-V guests on KVM. When the feature is enabled for Win10+WSL2, it shaves off around 700 CPU cycles from a nested vmexit cost (tight cpuid loop test).
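Condensed into a sketch (it mirrors the nested.c hunk below; evmcs is NULL when eVMCS is not in use), the new skip logic in nested_vmx_prepare_msr_bitmap() reads:

    /*
     * Sketch: the vmcs02 MSR bitmap rebuild can be skipped when L0 made
     * no changes of its own (force_msr_bitmap_recalc is clear), L1 uses
     * the enlightenment, and L1 reports the MSR bitmap as clean.
     */
    if (!vmx->nested.force_msr_bitmap_recalc && evmcs &&
        evmcs->hv_enlightenments_control.msr_bitmap &&
        evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
            return true;    /* reuse the existing vmcs02 bitmap */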
Signed-off-by: Vitaly Kuznetsov Message-Id: <20211129094704.326635-5-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 2 ++ arch/x86/kvm/vmx/nested.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 7179fa645eda..a91424ed436d 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2517,6 +2517,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, case HYPERV_CPUID_NESTED_FEATURES: ent->eax = evmcs_ver; + if (evmcs_ver) + ent->eax |= HV_X64_NESTED_MSR_BITMAP; break; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e7aa9576441c..2f6f465e575f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -599,6 +599,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, int msr; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; + struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; /* Nothing to do if the MSR bitmap is not in use. */ @@ -606,6 +607,19 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return false; + /* + * MSR bitmap update can be skipped when: + * - MSR bitmap for L1 hasn't changed. + * - Nested hypervisor (L1) is attempting to launch the same L2 as + * before. + * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature + * and tells KVM (L0) there were no changes in MSR bitmap for L2. + */ + if (!vmx->nested.force_msr_bitmap_recalc && evmcs && + evmcs->hv_enlightenments_control.msr_bitmap && + evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) + return true; + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) return false; -- cgit v1.2.1 From 8e819d75cbcf541a833219521379114a76a645a6 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 9 Dec 2021 13:54:36 +0200 Subject: KVM: x86: add a tracepoint for APICv/AVIC interrupt delivery This makes it possible to see how many interrupts were delivered via APICv/AVIC from the host.
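As a sketch of the resulting control flow in __apic_accept_irq() (the delivery condition is paraphrased; apicv_delivered_directly() is a hypothetical stand-in for the vendor posted-interrupt hook):

    if (!apicv_delivered_directly(vcpu, vector)) {  /* hypothetical */
            /* legacy path: queue the vector and let KVM inject it */
            kvm_lapic_set_irr(vector, apic);
            kvm_make_request(KVM_REQ_EVENT, vcpu);
            kvm_vcpu_kick(vcpu);
    } else {
            /* APICv/AVIC posted the interrupt itself; the new
             * tracepoint is the only host-visible record of it */
            trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
                                       trig_mode, vector);
    }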
Signed-off-by: Maxim Levitsky Message-Id: <20211209115440.394441-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 +++ arch/x86/kvm/trace.h | 24 ++++++++++++++++++++++++ arch/x86/kvm/x86.c | 1 + 3 files changed, 28 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 40270d7bc597..c5028e6b0f96 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1100,6 +1100,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, kvm_lapic_set_irr(vector, apic); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); + } else { + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector); } break; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 953b0fcb21ee..92e6f6702f00 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1356,6 +1356,30 @@ TRACE_EVENT(kvm_apicv_update_request, __entry->bit) ); +TRACE_EVENT(kvm_apicv_accept_irq, + TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec), + TP_ARGS(apicid, dm, tm, vec), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( __u16, dm ) + __field( __u16, tm ) + __field( __u8, vec ) + ), + + TP_fast_assign( + __entry->apicid = apicid; + __entry->dm = dm; + __entry->tm = tm; + __entry->vec = vec; + ), + + TP_printk("apicid %x vec %u (%s|%s)", + __entry->apicid, __entry->vec, + __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode), + __entry->tm ? "level" : "edge") +); + /* * Tracepoint for AMD AVIC */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1aaf37e1bd0f..26cb3a4cd0e9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12693,6 +12693,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); -- cgit v1.2.1 From 83c98007d9fb5c827cd954fc48e9cba034ef6fdc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:12 +0000 Subject: KVM: nVMX: Ensure vCPU honors event request if posting nested IRQ fails Add a memory barrier between writing vcpu->requests and reading vcpu->guest_mode to ensure the read is ordered after the write when (potentially) delivering an IRQ to L2 via nested posted interrupt. If the request were to be completed after reading vcpu->mode, it would be possible for the target vCPU to enter the guest without posting the interrupt and without handling the event request. Note, the barrier is only for documentation since atomic operations are serializing on x86. 
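The race the barrier closes can be sketched as a classic store-buffer litmus test (pseudo-C, heavily simplified; the real code is in the vmx.c hunk below):

    /*
     * sender (posting the nested IRQ)      |  target (vcpu_enter_guest)
     *                                      |
     *   kvm_make_request(KVM_REQ_EVENT);   |   vcpu->mode = IN_GUEST_MODE;
     *   smp_mb__after_atomic();            |   smp_mb();
     *   read vcpu->mode, kick if needed    |   check pending requests
     *
     * Without the sender-side barrier, the read of vcpu->mode could be
     * reordered before the store to vcpu->requests, so the sender could
     * skip the kick while the target misses the request.
     */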
Suggested-by: Paolo Bonzini Fixes: 6b6977117f50 ("KVM: nVMX: Fix races when sending nested PI while dest enters/leaves L2") Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing") Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9c729d6d4e7d..63615d242bdf 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3943,6 +3943,19 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, */ vmx->nested.pi_pending = true; kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* + * This pairs with the smp_mb_*() after setting vcpu->mode in + * vcpu_enter_guest() to guarantee the vCPU sees the event + * request if triggering a posted interrupt "fails" because + * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as + * the smp_wmb() in kvm_make_request() only ensures everything + * done before making the request is visible when the request + * is visible, it doesn't ensure ordering between the store to + * vcpu->requests and the load from vcpu->mode. + */ + smp_mb__after_atomic(); + /* the PIR and ON have been set by L1. */ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) kvm_vcpu_kick(vcpu); @@ -3976,6 +3989,12 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_on(&vmx->pi_desc)) return 0; + /* + * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() + * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is + * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a + * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. + */ if (vcpu != kvm_get_running_vcpu() && !kvm_vcpu_trigger_posted_interrupt(vcpu, false)) kvm_vcpu_kick(vcpu); -- cgit v1.2.1 From 45af1bb99b72e36c16714390a8a3c9445e432938 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:13 +0000 Subject: KVM: VMX: Clean up PI pre/post-block WARNs Move the WARN sanity checks out of the PI descriptor update loop so as not to spam the kernel log if the condition is violated and the update takes multiple attempts due to another writer. This also eliminates a few extra uops from the retry path. Technically not checking every attempt could mean KVM will now fail to WARN in a scenario that would have failed before, but any such failure would be inherently racy as some other agent (CPU or device) would have to concurrently modify the PI descriptor. Add a helper to handle the actual write and more importantly to document why the write may need to be retried. Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 35 +++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 4db2b14ee7c6..88c53c521094 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -34,6 +34,20 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } +static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) +{ + /* + * PID.ON can be set at any time by a different vCPU or by hardware, + * e.g. a device.
PID.control must be written atomically, and the + * update must be retried with a fresh snapshot an ON change causes + * the cmpxchg to fail. + */ + if (cmpxchg64(&pi_desc->control, old, new) != old) + return -EBUSY; + + return 0; +} + void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -74,8 +88,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.ndst = dest; new.sn = 0; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); + } while (pi_try_set_control(pi_desc, old.control, new.control)); after_clear_sn: @@ -128,17 +141,17 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) if (!x2apic_mode) dest = (dest << 8) & 0xFF00; + WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR, + "Wakeup handler not enabled while the vCPU was blocking"); + do { old.control = new.control = READ_ONCE(pi_desc->control); - WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, - "Wakeup handler not enabled while the VCPU is blocked\n"); new.ndst = dest; /* set 'NV' to 'notification vector' */ new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); + } while (pi_try_set_control(pi_desc, old.control, new.control)); vcpu->pre_pcpu = -1; } @@ -173,17 +186,15 @@ int pi_pre_block(struct kvm_vcpu *vcpu) &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu)); spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); + WARN(pi_desc->sn == 1, + "Posted Interrupt Suppress Notification set before blocking"); + do { old.control = new.control = READ_ONCE(pi_desc->control); - WARN((pi_desc->sn == 1), - "Warning: SN field of posted-interrupts " - "is set before blocking\n"); - /* set 'NV' to 'wakeup vector' */ new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); + } while (pi_try_set_control(pi_desc, old.control, new.control)); /* We should not block the vCPU if an interrupt is posted for it. */ if (pi_test_on(pi_desc)) -- cgit v1.2.1 From dc70ec217cec504e6f8fee8fd91bf5c118af05f2 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 21 Nov 2021 12:54:40 +0000 Subject: KVM: Introduce CONFIG_HAVE_KVM_DIRTY_RING I'd like to make the build include dirty_ring.c based on whether the arch wants it or not. That's a whole lot simpler if there's a config symbol instead of doing it implicitly on KVM_DIRTY_LOG_PAGE_OFFSET being set to something non-zero. Signed-off-by: David Woodhouse Message-Id: <20211121125451.9489-2-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 7618bef0a4a9..03b2ce34e7f4 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -27,6 +27,7 @@ config KVM select MMU_NOTIFIER select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD + select HAVE_KVM_DIRTY_RING select IRQ_BYPASS_MANAGER select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING -- cgit v1.2.1 From 6f2cdbdba43e4afad8df1ab06797c83e3af4a3dc Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 21 Nov 2021 12:54:41 +0000 Subject: KVM: Add Makefile.kvm for common files, use it for x86 Splitting kvm_main.c out into smaller and better-organized files is slightly non-trivial when it involves editing a bunch of per-arch KVM makefiles. Provide virt/kvm/Makefile.kvm for them to include. 
Signed-off-by: David Woodhouse Acked-by: Marc Zyngier Message-Id: <20211121125451.9489-3-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Makefile | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 75dfd27b6e8a..30f244b64523 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,12 +7,7 @@ ifeq ($(CONFIG_FRAME_POINTER),y) OBJECT_FILES_NON_STANDARD_vmenter.o := y endif -KVM := ../../../virt/kvm - -kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ - $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \ - $(KVM)/dirty_ring.o $(KVM)/binary_stats.o -kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o +include $(srctree)/virt/kvm/Makefile.kvm kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ -- cgit v1.2.1 From 46cbc0400f85987954f6e2c110409f8f60725232 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 10 Dec 2021 18:13:37 -0500 Subject: Revert "KVM: X86: Update mmu->pdptrs only when it is changed" This reverts commit 24cd19a28cb7174df502162641d6e1e12e7ffbd9. Sean Christopherson reports: "Commit 24cd19a28cb7 ('KVM: X86: Update mmu->pdptrs only when it is changed') breaks nested VMs with EPT in L0 and PAE shadow paging in L2. Reproducing is trivial, just disable EPT in L1 and run a VM. I haven't investigated how it breaks things." Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8912f44c640d..074a0578979f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -841,12 +841,9 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) } } - kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); - if (memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs))) { - memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); - kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); - kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); - } + memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); vcpu->arch.pdptrs_from_userspace = false; return 1; -- cgit v1.2.1 From a9f2705ec84449e3b8d70c804766f8e97e23080d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 16 Dec 2021 10:19:36 +0800 Subject: KVM: VMX: Save HOST_CR3 in vmx_set_host_fs_gs() The host CR3 in the vcpu thread can only be changed when scheduling, so commit 15ad9762d69f ("KVM: VMX: Save HOST_CR3 in vmx_prepare_switch_to_guest()") changed vmx.c to only save it in vmx_prepare_switch_to_guest(). However, it also has to be synced in vmx_sync_vmcs_host_state() when switching VMCS. vmx_set_host_fs_gs() is called in both places, so rename it to vmx_set_vmcs_host_state() and make it update HOST_CR3.
Fixes: 15ad9762d69f ("KVM: VMX: Save HOST_CR3 in vmx_prepare_switch_to_guest()") Signed-off-by: Lai Jiangshan Message-Id: <20211216021938.11752-2-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 ++- arch/x86/kvm/vmx/vmx.c | 20 +++++++++----------- arch/x86/kvm/vmx/vmx.h | 5 +++-- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2f6f465e575f..d07a7fa75783 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -245,7 +245,8 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, src = &prev->host_state; dest = &vmx->loaded_vmcs->host_state; - vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); + vmx_set_vmcs_host_state(dest, src->cr3, src->fs_sel, src->gs_sel, + src->fs_base, src->gs_base); dest->ldt_sel = src->ldt_sel; #ifdef CONFIG_X86_64 dest->ds_sel = src->ds_sel; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9bf65e5e4840..ebf511f83903 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1069,9 +1069,14 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } -void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, - unsigned long fs_base, unsigned long gs_base) +void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3, + u16 fs_sel, u16 gs_sel, + unsigned long fs_base, unsigned long gs_base) { + if (unlikely(cr3 != host->cr3)) { + vmcs_writel(HOST_CR3, cr3); + host->cr3 = cr3; + } if (unlikely(fs_sel != host->fs_sel)) { if (!(fs_sel & 7)) vmcs_write16(HOST_FS_SELECTOR, fs_sel); @@ -1103,7 +1108,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); #endif - unsigned long cr3; unsigned long fs_base, gs_base; u16 fs_sel, gs_sel; int i; @@ -1167,14 +1171,8 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = segment_base(gs_sel); #endif - vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); - - /* Host CR3 including its PCID is stable when guest state is loaded. 
*/ - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != host_state->cr3)) { - vmcs_writel(HOST_CR3, cr3); - host_state->cr3 = cr3; - } + vmx_set_vmcs_host_state(host_state, __get_current_cr3_fast(), + fs_sel, gs_sel, fs_base, gs_base); vmx->guest_state_loaded = true; } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 6c2c1aff1c3d..18111368cf85 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -371,8 +371,9 @@ int allocate_vpid(void); void free_vpid(int vpid); void vmx_set_constant_host_state(struct vcpu_vmx *vmx); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); -void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, - unsigned long fs_base, unsigned long gs_base); +void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3, + u16 fs_sel, u16 gs_sel, + unsigned long fs_base, unsigned long gs_base); int vmx_get_cpl(struct kvm_vcpu *vcpu); bool vmx_emulation_required(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); -- cgit v1.2.1 From 6b123c3a89a90ac6418e4d64b1e23f09d458a77d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 16 Dec 2021 10:19:37 +0800 Subject: KVM: x86/mmu: Reconstruct shadow page root if the guest PDPTEs are changed For shadow paging, the page table needs to be reconstructed before the next VMENTER if the guest PDPTEs are changed. But not all paths that call load_pdptrs() will cause the page tables to be reconstructed. Normally, kvm_mmu_reset_context() and kvm_mmu_free_roots() are used to launch later reconstruction. The commit d81135a57aa6 ("KVM: x86: do not reset mmu if CR0.CD and CR0.NW are changed") skips kvm_mmu_reset_context() after load_pdptrs() when changing CR0.CD and CR0.NW. The commit 21823fbda552 ("KVM: x86: Invalidate all PGDs for the current PCID on MOV CR3 w/ flush") skips kvm_mmu_free_roots() after load_pdptrs() when rewriting the CR3 with the same value. The commit a91a7c709600 ("KVM: X86: Don't reset mmu context when toggling X86_CR4_PGE") skips kvm_mmu_reset_context() after load_pdptrs() when changing CR4.PGE. Guests like Linux keep the PDPTEs unchanged for every instance of a page table, so this missing reconstruction causes no problems for Linux guests. Fixes: d81135a57aa6 ("KVM: x86: do not reset mmu if CR0.CD and CR0.NW are changed") Fixes: 21823fbda552 ("KVM: x86: Invalidate all PGDs for the current PCID on MOV CR3 w/ flush") Fixes: a91a7c709600 ("KVM: X86: Don't reset mmu context when toggling X86_CR4_PGE") Suggested-by: Sean Christopherson Signed-off-by: Lai Jiangshan Message-Id: <20211216021938.11752-3-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 074a0578979f..9ddad9493cb8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -841,6 +841,13 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) } } + /* + * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled. + * Shadow page roots need to be reconstructed instead.
+ */ + if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs))) + kvm_mmu_free_roots(vcpu, mmu, KVM_MMU_ROOT_CURRENT); + memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); -- cgit v1.2.1 From 5b61178cd2fd67890a70ae9febbd4df20bbd8c40 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 16 Dec 2021 10:19:38 +0800 Subject: KVM: VMX: Mark VCPU_EXREG_CR3 dirty when !CR0_PG -> CR0_PG if EPT + !URG When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. So VCPU_EXREG_CR3 is considered to be dirty and GUEST_CR3 needs to be updated in this case. Reported-by: Maxim Levitsky Suggested-by: Sean Christopherson Signed-off-by: Lai Jiangshan Message-Id: <20211216021938.11752-4-jiangshanlai@gmail.com> Fixes: c62c7bd4f95b ("KVM: VMX: Update vmcs.GUEST_CR3 only when the guest CR3 is dirty") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ebf511f83903..1d53b8144f83 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3071,6 +3071,13 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ if ((old_cr0_pg ^ cr0) & X86_CR0_PG) vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); + + /* + * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but + * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. + */ + if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG)) + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); } /* depends on vcpu->arch.cr0 to be set to a new value */ -- cgit v1.2.1 From 006a0f0607e1504950dd8fa3b6ca8e438ec6c9d2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 Dec 2021 14:10:04 -0500 Subject: KVM: x86: avoid out of bounds indices for fixed performance counters Because IceLake has 4 fixed performance counters but KVM only supports 3, it is possible for reprogram_fixed_counters to pass to reprogram_fixed_counter an index that is out of bounds for the fixed_pmc_events array. Ultimately intel_find_fixed_event, which is the only place that uses fixed_pmc_events, handles this correctly because it checks against the size of fixed_pmc_events anyway. Every other place operates on the fixed_counters[] array which is sized according to INTEL_PMC_MAX_FIXED. However, it is cleaner if the unsupported performance counters are culled early on in reprogram_fixed_counters. 
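A minimal sketch of the hazard and the clamp (counter counts as described above; names match the pmu_intel.c hunk below):

    /*
     * Sketch: fixed_pmc_events[] has 3 entries, but Ice Lake enumerates
     * 4 fixed counters. Clamp at refresh time so no later path indexes
     * fixed_pmc_events[] out of bounds.
     */
    pmu->nr_arch_fixed_counters =
            min3(ARRAY_SIZE(fixed_pmc_events),          /* 3 */
                 (size_t)edx.split.num_counters_fixed,  /* 4 on Ice Lake */
                 (size_t)x86_pmu.num_counters_fixed);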
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 1b7456b2177b..d33e9799276e 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -91,7 +91,7 @@ static unsigned intel_find_fixed_event(int idx) u32 event; size_t size = ARRAY_SIZE(fixed_pmc_events); - if (idx >= size) + if (WARN_ON_ONCE(idx >= size)) return PERF_COUNT_HW_MAX; event = fixed_pmc_events[array_index_nospec(idx, size)]; @@ -500,8 +500,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = 0; } else { pmu->nr_arch_fixed_counters = - min_t(int, edx.split.num_counters_fixed, - x86_pmu.num_counters_fixed); + min3(ARRAY_SIZE(fixed_pmc_events), + (size_t) edx.split.num_counters_fixed, + (size_t) x86_pmu.num_counters_fixed); edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, x86_pmu.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = -- cgit v1.2.1 From 761875634a5e2c3fed36c439fc4acac6f85a96eb Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 30 Nov 2021 15:42:16 +0800 Subject: KVM: x86/pmu: Setup pmc->eventsel for fixed PMCs The current pmc->eventsel for fixed counters is underutilised. The pmc->eventsel can be set up for all known available fixed counters since we have a mapping between the fixed pmc index and the intel_arch_events array. Whether for a gp or a fixed counter, this will simplify the later consistency checks between eventsel and perf_hw_id. Signed-off-by: Like Xu Message-Id: <20211130074221.93635-2-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index d33e9799276e..965fefbad56d 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -459,6 +459,21 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } +static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu) +{ + size_t size = ARRAY_SIZE(fixed_pmc_events); + struct kvm_pmc *pmc; + u32 event; + int i; + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmc = &pmu->fixed_counters[i]; + event = fixed_pmc_events[array_index_nospec(i, size)]; + pmc->eventsel = (intel_arch_events[event].unit_mask << 8) | + intel_arch_events[event].eventsel; + } +} + static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -507,6 +522,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) edx.split.bit_width_fixed, x86_pmu.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; + setup_fixed_pmc_eventsel(pmu); } pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | -- cgit v1.2.1 From 7c174f305cbee6bdba5018aae02b84369e7ab995 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 30 Nov 2021 15:42:17 +0800 Subject: KVM: x86/pmu: Refactoring find_arch_event() to pmc_perf_hw_id() find_arch_event() returns an "unsigned int" value, which is used by pmc_reprogram_counter() to program a PERF_TYPE_HARDWARE type perf_event. The returned value is actually the kernel-defined generic perf_hw_id; let's rename it to pmc_perf_hw_id() with simpler incoming parameters for better self-explanation.
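The renamed helper boils down to a table scan keyed on fields that can now be derived from pmc->eventsel itself. A simplified standalone sketch (the table entries and the eventsel value are illustrative, not the real intel_arch_events[]):

/*
 * Minimal sketch of what pmc_perf_hw_id() does: derive the select and
 * unit-mask fields from a raw eventsel value and look them up in an
 * architecture table.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define EVENTSEL_EVENT 0xffULL
#define EVENTSEL_UMASK 0xff00ULL
#define PERF_COUNT_HW_MAX 10u

struct hw_map { uint8_t eventsel, unit_mask; unsigned int event_type; };

static const struct hw_map events[] = {
	{ 0x3c, 0x00, 0 },	/* cpu cycles (illustrative)  */
	{ 0xc0, 0x00, 1 },	/* instructions (illustrative) */
};

static unsigned int pmc_perf_hw_id(uint64_t eventsel)
{
	uint8_t event_select = eventsel & EVENTSEL_EVENT;
	uint8_t unit_mask = (eventsel & EVENTSEL_UMASK) >> 8;

	for (size_t i = 0; i < sizeof(events) / sizeof(events[0]); i++)
		if (events[i].eventsel == event_select &&
		    events[i].unit_mask == unit_mask)
			return events[i].event_type;
	return PERF_COUNT_HW_MAX;	/* no generic equivalent */
}

int main(void)
{
	printf("eventsel 0xc0 -> perf_hw_id %u\n", pmc_perf_hw_id(0xc0));
	return 0;
}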
Signed-off-by: Like Xu Message-Id: <20211130074221.93635-3-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 8 +------- arch/x86/kvm/pmu.h | 3 +-- arch/x86/kvm/svm/pmu.c | 8 ++++---- arch/x86/kvm/vmx/pmu_intel.c | 9 +++++---- 4 files changed, 11 insertions(+), 17 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 09873f6488f7..3b3ccf5b1106 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -174,7 +174,6 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { unsigned config, type = PERF_TYPE_RAW; - u8 event_select, unit_mask; struct kvm *kvm = pmc->vcpu->kvm; struct kvm_pmu_event_filter *filter; int i; @@ -206,17 +205,12 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (!allow_event) return; - event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; - unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | ARCH_PERFMON_EVENTSEL_INV | ARCH_PERFMON_EVENTSEL_CMASK | HSW_IN_TX | HSW_IN_TX_CHECKPOINTED))) { - config = kvm_x86_ops.pmu_ops->find_arch_event(pmc_to_pmu(pmc), - event_select, - unit_mask); + config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc); if (config != PERF_COUNT_HW_MAX) type = PERF_TYPE_HARDWARE; } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 59d6b76203d5..dd7dbb1c5048 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -24,8 +24,7 @@ struct kvm_event_hw_type_mapping { }; struct kvm_pmu_ops { - unsigned (*find_arch_event)(struct kvm_pmu *pmu, u8 event_select, - u8 unit_mask); + unsigned int (*pmc_perf_hw_id)(struct kvm_pmc *pmc); unsigned (*find_fixed_event)(int idx); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 0cf05e4caa4c..fb0ce8cda8a7 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -138,10 +138,10 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, return &pmu->gp_counters[msr_to_index(msr)]; } -static unsigned amd_find_arch_event(struct kvm_pmu *pmu, - u8 event_select, - u8 unit_mask) +static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) { + u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; + u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; int i; for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) @@ -323,7 +323,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) } struct kvm_pmu_ops amd_pmu_ops = { - .find_arch_event = amd_find_arch_event, + .pmc_perf_hw_id = amd_pmc_perf_hw_id, .find_fixed_event = amd_find_fixed_event, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 965fefbad56d..7b530de7ae23 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -68,10 +68,11 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) reprogram_counter(pmu, bit); } -static unsigned intel_find_arch_event(struct kvm_pmu *pmu, - u8 event_select, - u8 unit_mask) +static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; + u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; int i; for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) @@ -720,7 +721,7 @@ static void 
intel_pmu_cleanup(struct kvm_vcpu *vcpu) } struct kvm_pmu_ops intel_pmu_ops = { - .find_arch_event = intel_find_arch_event, + .pmc_perf_hw_id = intel_pmc_perf_hw_id, .find_fixed_event = intel_find_fixed_event, .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, -- cgit v1.2.1 From 6ed1298eb0bf6641b0a66c2c38369f5767a2575c Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 30 Nov 2021 15:42:18 +0800 Subject: KVM: x86/pmu: Reuse pmc_perf_hw_id() and drop find_fixed_event() Since we set the same semantic event value for the fixed counter in pmc->eventsel, returning the perf_hw_id for the fixed counter via find_fixed_event() can be painlessly replaced by pmc_perf_hw_id() with the help of pmc_is_fixed() check. Signed-off-by: Like Xu Message-Id: <20211130074221.93635-4-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 2 +- arch/x86/kvm/pmu.h | 1 - arch/x86/kvm/svm/pmu.c | 11 ++++------- arch/x86/kvm/vmx/pmu_intel.c | 19 +++---------------- 4 files changed, 8 insertions(+), 25 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3b3ccf5b1106..b7a1ae28ab87 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -262,7 +262,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) pmc->current_config = (u64)ctrl; pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - kvm_x86_ops.pmu_ops->find_fixed_event(idx), + kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc), !(en_field & 0x2), /* exclude user */ !(en_field & 0x1), /* exclude kernel */ pmi, false, false); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index dd7dbb1c5048..c91d9725aafd 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -25,7 +25,6 @@ struct kvm_event_hw_type_mapping { struct kvm_pmu_ops { unsigned int (*pmc_perf_hw_id)(struct kvm_pmc *pmc); - unsigned (*find_fixed_event)(int idx); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index fb0ce8cda8a7..12d8b301065a 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -144,6 +144,10 @@ static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; int i; + /* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ + if (WARN_ON(pmc_is_fixed(pmc))) + return PERF_COUNT_HW_MAX; + for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) if (amd_event_mapping[i].eventsel == event_select && amd_event_mapping[i].unit_mask == unit_mask) @@ -155,12 +159,6 @@ static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) return amd_event_mapping[i].event_type; } -/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ -static unsigned amd_find_fixed_event(int idx) -{ - return PERF_COUNT_HW_MAX; -} - /* check if a PMC is enabled by comparing it against global_ctrl bits. Because * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE). 
*/ @@ -324,7 +322,6 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmu_ops amd_pmu_ops = { .pmc_perf_hw_id = amd_pmc_perf_hw_id, - .find_fixed_event = amd_find_fixed_event, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 7b530de7ae23..5e0ac57d6d1b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -76,9 +76,9 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) int i; for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) - if (intel_arch_events[i].eventsel == event_select - && intel_arch_events[i].unit_mask == unit_mask - && (pmu->available_event_types & (1 << i))) + if (intel_arch_events[i].eventsel == event_select && + intel_arch_events[i].unit_mask == unit_mask && + (pmc_is_fixed(pmc) || pmu->available_event_types & (1 << i))) break; if (i == ARRAY_SIZE(intel_arch_events)) @@ -87,18 +87,6 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) return intel_arch_events[i].event_type; } -static unsigned intel_find_fixed_event(int idx) -{ - u32 event; - size_t size = ARRAY_SIZE(fixed_pmc_events); - - if (WARN_ON_ONCE(idx >= size)) - return PERF_COUNT_HW_MAX; - - event = fixed_pmc_events[array_index_nospec(idx, size)]; - return intel_arch_events[event].event_type; -} - /* check if a PMC is enabled by comparing it with globl_ctrl bits. */ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) { @@ -722,7 +710,6 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) struct kvm_pmu_ops intel_pmu_ops = { .pmc_perf_hw_id = intel_pmc_perf_hw_id, - .find_fixed_event = intel_find_fixed_event, .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, -- cgit v1.2.1 From 40ccb96d5483c7ef773f50db15f82f0ab587cf8a Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 30 Nov 2021 15:42:19 +0800 Subject: KVM: x86/pmu: Add pmc->intr to refactor kvm_perf_overflow{_intr}() Depending on whether intr should be triggered or not, KVM registers two different event overflow callbacks in the perf_event context. The code skeleton of these two functions is very similar, so the pmc->intr can be stored into pmc from pmc_reprogram_counter() which provides smaller instructions footprint against the u-architecture branch predictor. The __kvm_perf_overflow() can be called in non-nmi contexts and a flag is needed to distinguish the caller context and thus avoid a check on kvm_is_in_guest(), otherwise we might get warnings from suspicious RCU or check_preemption_disabled(). 
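The resulting structure can be sketched in isolation as follows; one common helper keyed off a cached intr flag and an explicit in_pmi argument replaces the two near-identical callbacks (the types and the "wake via irq work" stand-ins are invented for illustration):

/* Standalone sketch of the refactored overflow path described above. */
#include <stdbool.h>
#include <stdio.h>

struct pmc { int idx; bool intr; bool reprogrammed; };

static bool in_guest;	/* stand-in for kvm_is_in_guest() */

static void overflow(struct pmc *pmc, bool in_pmi)
{
	if (pmc->reprogrammed)	/* already pending a reprogram: ignore */
		return;
	pmc->reprogrammed = true;
	printf("pmc %d: set global_status, request KVM_REQ_PMU\n", pmc->idx);

	if (!pmc->intr)		/* cached at pmc_reprogram_counter() time */
		return;
	/* NMI context can't wake a halted vCPU directly: defer to irq work. */
	if (in_pmi && !in_guest)
		printf("pmc %d: queue irq work for PMI\n", pmc->idx);
	else
		printf("pmc %d: request KVM_REQ_PMI\n", pmc->idx);
}

int main(void)
{
	struct pmc p = { .idx = 0, .intr = true };

	overflow(&p, true);	/* perf NMI path */
	p.reprogrammed = false;
	overflow(&p, false);	/* emulation path (non-NMI) */
	return 0;
}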
Suggested-by: Paolo Bonzini Signed-off-by: Like Xu Message-Id: <20211130074221.93635-5-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 58 ++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 30 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b7a1ae28ab87..a20207ee4014 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -55,43 +55,41 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work) kvm_pmu_deliver_pmi(vcpu); } -static void kvm_perf_overflow(struct perf_event *perf_event, - struct perf_sample_data *data, - struct pt_regs *regs) +static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { - struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = pmc_to_pmu(pmc); - if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) { - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); - } + /* Ignore counters that have been reprogrammed already. */ + if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) + return; + + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); + + if (!pmc->intr) + return; + + /* + * Inject PMI. If vcpu was in a guest mode during NMI PMI + * can be ejected on a guest mode re-entry. Otherwise we can't + * be sure that vcpu wasn't executing hlt instruction at the + * time of vmexit and is not going to re-enter guest mode until + * woken up. So we should wake it, but this is impossible from + * NMI context. Do it from irq work instead. + */ + if (in_pmi && !kvm_is_in_guest()) + irq_work_queue(&pmc_to_pmu(pmc)->irq_work); + else + kvm_make_request(KVM_REQ_PMI, pmc->vcpu); } -static void kvm_perf_overflow_intr(struct perf_event *perf_event, - struct perf_sample_data *data, - struct pt_regs *regs) +static void kvm_perf_overflow(struct perf_event *perf_event, + struct perf_sample_data *data, + struct pt_regs *regs) { struct kvm_pmc *pmc = perf_event->overflow_handler_context; - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) { - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); - /* - * Inject PMI. If vcpu was in a guest mode during NMI PMI - * can be ejected on a guest mode re-entry. Otherwise we can't - * be sure that vcpu wasn't executing hlt instruction at the - * time of vmexit and is not going to re-enter guest mode until - * woken up. So we should wake it, but this is impossible from - * NMI context. Do it from irq work instead. - */ - if (!kvm_is_in_guest()) - irq_work_queue(&pmc_to_pmu(pmc)->irq_work); - else - kvm_make_request(KVM_REQ_PMI, pmc->vcpu); - } + __kvm_perf_overflow(pmc, true); } static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, @@ -126,7 +124,6 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, } event = perf_event_create_kernel_counter(&attr, -1, current, - intr ? 
kvm_perf_overflow_intr : kvm_perf_overflow, pmc); if (IS_ERR(event)) { pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n", @@ -138,6 +135,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, pmc_to_pmu(pmc)->event_count++; clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); pmc->is_paused = false; + pmc->intr = intr; } static void pmc_pause_counter(struct kvm_pmc *pmc) -- cgit v1.2.1 From 9cd803d496e72cd1dd3287c9a6cb4afa636ee16a Mon Sep 17 00:00:00 2001 From: Eric Hankland Date: Tue, 30 Nov 2021 15:42:20 +0800 Subject: KVM: x86: Update vPMCs when retiring instructions When KVM retires a guest instruction through emulation, increment any vPMCs that are configured to monitor "instructions retired," and update the sample period of those counters so that they will overflow at the right time. Signed-off-by: Eric Hankland [jmattson: - Split the code to increment "branch instructions retired" into a separate commit. - Added 'static' to kvm_pmu_incr_counter() definition. - Modified kvm_pmu_incr_counter() to check pmc->perf_event->state == PERF_EVENT_STATE_ACTIVE. ] Fixes: f5132b01386b ("KVM: Expose a version 2 architectural PMU to a guests") Signed-off-by: Jim Mattson [likexu: - Drop checks for pmc->perf_event or event state or event type - Increase a counter once its umask bits and the first 8 select bits are matched - Rewrite kvm_pmu_incr_counter() with a less invasive approach to the host perf; - Rename kvm_pmu_record_event to kvm_pmu_trigger_event; - Add counter enable and CPL check for kvm_pmu_trigger_event(); ] Cc: Peter Zijlstra Signed-off-by: Like Xu Message-Id: <20211130074221.93635-6-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/pmu.h | 1 + arch/x86/kvm/x86.c | 3 +++ 3 files changed, 64 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index a20207ee4014..8abdadb7e22a 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -482,6 +482,66 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) kvm_pmu_reset(vcpu); } +static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + u64 prev_count; + + prev_count = pmc->counter; + pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); + + reprogram_counter(pmu, pmc->idx); + if (pmc->counter < prev_count) + __kvm_perf_overflow(pmc, false); +} + +static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, + unsigned int perf_hw_id) +{ + u64 old_eventsel = pmc->eventsel; + unsigned int config; + + pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK); + config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc); + pmc->eventsel = old_eventsel; + return config == perf_hw_id; +} + +static inline bool cpl_is_matched(struct kvm_pmc *pmc) +{ + bool select_os, select_user; + u64 config = pmc->current_config; + + if (pmc_is_gp(pmc)) { + select_os = config & ARCH_PERFMON_EVENTSEL_OS; + select_user = config & ARCH_PERFMON_EVENTSEL_USR; + } else { + select_os = config & 0x1; + select_user = config & 0x2; + } + + return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? 
select_os : select_user; +} + +void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + int i; + + for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { + pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i); + + if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc)) + continue; + + /* Ignore checks for edge detect, pin control, invert and CMASK bits */ + if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc)) + kvm_pmu_incr_counter(pmc); + } +} +EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); + int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) { struct kvm_pmu_event_filter tmp, *filter; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index c91d9725aafd..7a7b8d5b775e 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -157,6 +157,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); +void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id); bool is_vmware_backdoor_pmc(u32 pmc_idx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9ddad9493cb8..4fbde3a9b9d0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7990,6 +7990,8 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) if (unlikely(!r)) return 0; + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS); + /* * rflags is the old, "raw" value of the flags. The new value has * not been saved yet. @@ -8252,6 +8254,7 @@ writeback: vcpu->arch.emulate_regs_need_sync_to_vcpu = false; if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS); kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); -- cgit v1.2.1 From 018d70ffcfec8a01f77b0d840527203d337dd7f9 Mon Sep 17 00:00:00 2001 From: Eric Hankland Date: Tue, 30 Nov 2021 15:42:21 +0800 Subject: KVM: x86: Update vPMCs when retiring branch instructions When KVM retires a guest branch instruction through emulation, increment any vPMCs that are configured to monitor "branch instructions retired," and update the sample period of those counters so that they will overflow at the right time. Signed-off-by: Eric Hankland [jmattson: - Split the code to increment "branch instructions retired" into a separate commit. - Moved/consolidated the calls to kvm_pmu_trigger_event() in the emulation of VMLAUNCH/VMRESUME to accommodate the evolution of that code. 
] Fixes: f5132b01386b ("KVM: Expose a version 2 architectural PMU to a guests") Signed-off-by: Jim Mattson Message-Id: <20211130074221.93635-7-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 55 ++++++++++++++++++++++++++-------------------- arch/x86/kvm/kvm_emulate.h | 1 + arch/x86/kvm/vmx/nested.c | 7 ++++-- arch/x86/kvm/x86.c | 2 ++ 4 files changed, 39 insertions(+), 26 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 28b1a4e57827..166a145fc1e6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -175,6 +175,7 @@ #define No16 ((u64)1 << 53) /* No 16 bit operand */ #define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ #define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */ +#define IsBranch ((u64)1 << 56) /* Instruction is considered a branch. */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -191,8 +192,9 @@ #define FASTOP_SIZE 8 struct opcode { - u64 flags : 56; - u64 intercept : 8; + u64 flags; + u8 intercept; + u8 pad[7]; union { int (*execute)(struct x86_emulate_ctxt *ctxt); const struct opcode *group; @@ -4364,10 +4366,10 @@ static const struct opcode group4[] = { static const struct opcode group5[] = { F(DstMem | SrcNone | Lock, em_inc), F(DstMem | SrcNone | Lock, em_dec), - I(SrcMem | NearBranch, em_call_near_abs), - I(SrcMemFAddr | ImplicitOps, em_call_far), - I(SrcMem | NearBranch, em_jmp_abs), - I(SrcMemFAddr | ImplicitOps, em_jmp_far), + I(SrcMem | NearBranch | IsBranch, em_call_near_abs), + I(SrcMemFAddr | ImplicitOps | IsBranch, em_call_far), + I(SrcMem | NearBranch | IsBranch, em_jmp_abs), + I(SrcMemFAddr | ImplicitOps | IsBranch, em_jmp_far), I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined), }; @@ -4577,7 +4579,7 @@ static const struct opcode opcode_table[256] = { I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */ I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ - X16(D(SrcImmByte | NearBranch)), + X16(D(SrcImmByte | NearBranch | IsBranch)), /* 0x80 - 0x87 */ G(ByteOp | DstMem | SrcImm, group1), G(DstMem | SrcImm, group1), @@ -4596,7 +4598,7 @@ static const struct opcode opcode_table[256] = { DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), - I(SrcImmFAddr | No64, em_call_far), N, + I(SrcImmFAddr | No64 | IsBranch, em_call_far), N, II(ImplicitOps | Stack, em_pushf, pushf), II(ImplicitOps | Stack, em_popf, popf), I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf), @@ -4616,17 +4618,19 @@ static const struct opcode opcode_table[256] = { X8(I(DstReg | SrcImm64 | Mov, em_mov)), /* 0xC0 - 0xC7 */ G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2), - I(ImplicitOps | NearBranch | SrcImmU16, em_ret_near_imm), - I(ImplicitOps | NearBranch, em_ret), + I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch, em_ret_near_imm), + I(ImplicitOps | NearBranch | IsBranch, em_ret), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ - I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), - I(ImplicitOps | SrcImmU16, em_ret_far_imm), - I(ImplicitOps, em_ret_far), - D(ImplicitOps), DI(SrcImmByte, intn), - D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), + I(Stack | SrcImmU16 | Src2ImmByte | IsBranch, em_enter), + I(Stack | IsBranch, em_leave), 
+ I(ImplicitOps | SrcImmU16 | IsBranch, em_ret_far_imm), + I(ImplicitOps | IsBranch, em_ret_far), + D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch, intn), + D(ImplicitOps | No64 | IsBranch), + II(ImplicitOps | IsBranch, em_iret, iret), /* 0xD0 - 0xD7 */ G(Src2One | ByteOp, group2), G(Src2One, group2), G(Src2CL | ByteOp, group2), G(Src2CL, group2), @@ -4637,14 +4641,15 @@ static const struct opcode opcode_table[256] = { /* 0xD8 - 0xDF */ N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N, /* 0xE0 - 0xE7 */ - X3(I(SrcImmByte | NearBranch, em_loop)), - I(SrcImmByte | NearBranch, em_jcxz), + X3(I(SrcImmByte | NearBranch | IsBranch, em_loop)), + I(SrcImmByte | NearBranch | IsBranch, em_jcxz), I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in), I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out), /* 0xE8 - 0xEF */ - I(SrcImm | NearBranch, em_call), D(SrcImm | ImplicitOps | NearBranch), - I(SrcImmFAddr | No64, em_jmp_far), - D(SrcImmByte | ImplicitOps | NearBranch), + I(SrcImm | NearBranch | IsBranch, em_call), + D(SrcImm | ImplicitOps | NearBranch | IsBranch), + I(SrcImmFAddr | No64 | IsBranch, em_jmp_far), + D(SrcImmByte | ImplicitOps | NearBranch | IsBranch), I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in), I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out), /* 0xF0 - 0xF7 */ @@ -4660,7 +4665,7 @@ static const struct opcode opcode_table[256] = { static const struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ G(0, group6), GD(0, &group7), N, N, - N, I(ImplicitOps | EmulateOnUD, em_syscall), + N, I(ImplicitOps | EmulateOnUD | IsBranch, em_syscall), II(ImplicitOps | Priv, em_clts, clts), N, DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, @@ -4691,8 +4696,8 @@ static const struct opcode twobyte_table[256] = { IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), II(ImplicitOps | Priv, em_rdmsr, rdmsr), IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc), - I(ImplicitOps | EmulateOnUD, em_sysenter), - I(ImplicitOps | Priv | EmulateOnUD, em_sysexit), + I(ImplicitOps | EmulateOnUD | IsBranch, em_sysenter), + I(ImplicitOps | Priv | EmulateOnUD | IsBranch, em_sysexit), N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ @@ -4710,7 +4715,7 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f), /* 0x80 - 0x8F */ - X16(D(SrcImm | NearBranch)), + X16(D(SrcImm | NearBranch | IsBranch)), /* 0x90 - 0x9F */ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ @@ -5224,6 +5229,8 @@ done_prefixes: ctxt->d |= opcode.flags; } + ctxt->is_branch = opcode.flags & IsBranch; + /* Unrecognised? 
*/ if (ctxt->d == 0) return EMULATION_FAILED; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 68b420289d7e..39eded2426ff 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -369,6 +369,7 @@ struct x86_emulate_ctxt { struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; + bool is_branch; }; /* Repeat String Operation Prefix */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d07a7fa75783..f235f77cbc03 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3526,10 +3526,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (evmptrld_status == EVMPTRLD_ERROR) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; - } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) { - return nested_vmx_failInvalid(vcpu); } + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); + + if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) + return nested_vmx_failInvalid(vcpu); + if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && vmx->nested.current_vmptr == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4fbde3a9b9d0..42bde45a1bc2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8255,6 +8255,8 @@ writeback: if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS); + if (ctxt->is_branch) + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); -- cgit v1.2.1 From 982ed0de4753ed6e71dbd40f82a5a066baf133ed Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 10 Dec 2021 16:36:21 +0000 Subject: KVM: Reinstate gfn_to_pfn_cache with invalidation support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This can be used in two modes. There is an atomic mode where the cached mapping is accessed while holding the rwlock, and a mode where the physical address is used by a vCPU in guest mode. For the latter case, an invalidation will wake the vCPU with the new KVM_REQ_GPC_INVALIDATE, and the architecture will need to refresh any caches it still needs to access before entering guest mode again. Only one vCPU can be targeted by the wake requests; it's simple enough to make it wake all vCPUs or even a mask but I don't see a use case for that additional complexity right now. Invalidation happens from the invalidate_range_start MMU notifier, which needs to be able to sleep in order to wake the vCPU and wait for it. This means that revalidation potentially needs to "wait" for the MMU operation to complete and the invalidate_range_end notifier to be invoked. Like the vCPU when it takes a page fault in that period, we just spin — fixing that in a future patch by implementing an actual *wait* may be another part of shaving this particularly hirsute yak. As noted in the comments in the function itself, the only case where the invalidate_range_start notifier is expected to be called *without* being able to sleep is when the OOM reaper is killing the process. In that case, we expect the vCPU threads already to have exited, and thus there will be nothing to wake, and no reason to wait. So we clear the KVM_REQUEST_WAIT bit and send the request anyway, then complain loudly if there actually *was* anything to wake up. 
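The atomic-mode usage pattern this enables can be sketched with userspace locking primitives; readers check validity under the rwlock and drop out to a sleepable refresh when an invalidation raced with them (all names below are illustrative stand-ins, not the KVM API):

/* Minimal sketch of the check-under-rwlock / refresh-on-failure loop. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct pfn_cache {
	pthread_rwlock_t lock;
	bool valid;		/* cleared by invalidate_range_start */
	void *khva;		/* kernel mapping of the cached page */
};

static void refresh(struct pfn_cache *c)	/* may sleep */
{
	pthread_rwlock_wrlock(&c->lock);
	c->valid = true;	/* re-map gpa -> pfn -> khva here */
	pthread_rwlock_unlock(&c->lock);
}

static void access_page(struct pfn_cache *c)
{
	for (;;) {
		pthread_rwlock_rdlock(&c->lock);
		if (c->valid)
			break;		/* use c->khva under the lock */
		pthread_rwlock_unlock(&c->lock);
		refresh(c);		/* invalidated: redo the mapping */
	}
	printf("atomic access through %p\n", c->khva);
	pthread_rwlock_unlock(&c->lock);
}

int main(void)
{
	struct pfn_cache c = { .valid = false };

	pthread_rwlock_init(&c.lock, NULL);
	c.khva = &c;		/* pretend mapping, for the sketch only */
	access_page(&c);
	return 0;
}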
Signed-off-by: David Woodhouse Message-Id: <20211210163625.2886-3-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 03b2ce34e7f4..ebc8ce9ec917 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -26,6 +26,7 @@ config KVM select PREEMPT_NOTIFIERS select MMU_NOTIFIER select HAVE_KVM_IRQCHIP + select HAVE_KVM_PFNCACHE select HAVE_KVM_IRQFD select HAVE_KVM_DIRTY_RING select IRQ_BYPASS_MANAGER -- cgit v1.2.1 From 1cfc9c4b9d4606a1e90e7dbc50058b9f0c1d43a6 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 10 Dec 2021 16:36:22 +0000 Subject: KVM: x86/xen: Maintain valid mapping of Xen shared_info page Use the newly reinstated gfn_to_pfn_cache to maintain a kernel mapping of the Xen shared_info page so that it can be accessed in atomic context. Note that we do not participate in dirty tracking for the shared info page and we do not explicitly mark it dirty every single time we deliver an event channel interrupt. We wouldn't want to do that even if we *did* have a valid vCPU context with which to do so. Signed-off-by: David Woodhouse Message-Id: <20211210163625.2886-4-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/xen.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index dff2bdf9507a..da4bf2c6407f 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -23,16 +23,21 @@ DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ); static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) { + struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; gpa_t gpa = gfn_to_gpa(gfn); int wc_ofs, sec_hi_ofs; int ret = 0; int idx = srcu_read_lock(&kvm->srcu); - if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) { - ret = -EFAULT; + if (gfn == GPA_INVALID) { + kvm_gfn_to_pfn_cache_destroy(kvm, gpc); goto out; } - kvm->arch.xen.shinfo_gfn = gfn; + + ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, false, true, gpa, + PAGE_SIZE, false); + if (ret) + goto out; /* Paranoia checks on the 32-bit struct layout */ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); @@ -260,15 +265,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - if (data->u.shared_info.gfn == GPA_INVALID) { - kvm->arch.xen.shinfo_gfn = GPA_INVALID; - r = 0; - break; - } r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); break; - case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: if (data->u.vector && data->u.vector < 0x10) r = -EINVAL; @@ -299,7 +298,10 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - data->u.shared_info.gfn = kvm->arch.xen.shinfo_gfn; + if (kvm->arch.xen.shinfo_cache.active) + data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); + else + data->u.shared_info.gfn = GPA_INVALID; r = 0; break; @@ -661,11 +663,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) void kvm_xen_init_vm(struct kvm *kvm) { - kvm->arch.xen.shinfo_gfn = GPA_INVALID; } void kvm_xen_destroy_vm(struct kvm *kvm) { + kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache); + if (kvm->arch.xen_hvm_config.msr) static_branch_slow_dec_deferred(&kvm_xen_enabled); } -- cgit v1.2.1 From 14243b387137a4afbe1df5d9dc15182d6657bb79 Mon Sep 17 00:00:00 2001 From: David Woodhouse
Date: Fri, 10 Dec 2021 16:36:23 +0000 Subject: KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery This adds basic support for delivering 2 level event channels to a guest. Initially, it only supports delivery via the IRQ routing table, triggered by an eventfd. In order to do so, it has a kvm_xen_set_evtchn_fast() function which will use the pre-mapped shared_info page if it already exists and is still valid, while the slow path through the irqfd_inject workqueue will remap the shared_info page if necessary. It sets the bits in the shared_info page but not the vcpu_info; that is deferred to __kvm_xen_has_interrupt() which raises the vector to the appropriate vCPU. Add a 'verbose' mode to xen_shinfo_test while adding test cases for this. Signed-off-by: David Woodhouse Message-Id: <20211210163625.2886-5-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 12 +++ arch/x86/kvm/x86.c | 3 +- arch/x86/kvm/xen.c | 262 +++++++++++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/xen.h | 9 ++ 4 files changed, 280 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 39ad02d6dc63..6e0dab04320e 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -24,6 +24,7 @@ #include "hyperv.h" #include "x86.h" +#include "xen.h" static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, @@ -175,6 +176,13 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, return r; break; +#ifdef CONFIG_KVM_XEN + case KVM_IRQ_ROUTING_XEN_EVTCHN: + if (!level) + return -1; + + return kvm_xen_set_evtchn_fast(e, kvm); +#endif default: break; } @@ -310,6 +318,10 @@ int kvm_set_routing_entry(struct kvm *kvm, e->hv_sint.vcpu = ue->u.hv_sint.vcpu; e->hv_sint.sint = ue->u.hv_sint.sint; break; +#ifdef CONFIG_KVM_XEN + case KVM_IRQ_ROUTING_XEN_EVTCHN: + return kvm_xen_setup_evtchn(kvm, e, ue); +#endif default: return -EINVAL; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 42bde45a1bc2..3050601d5d73 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4188,7 +4188,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_XEN_HVM: r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | - KVM_XEN_HVM_CONFIG_SHARED_INFO; + KVM_XEN_HVM_CONFIG_SHARED_INFO | + KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL; if (sched_info_on()) r |= KVM_XEN_HVM_CONFIG_RUNSTATE; break; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index da4bf2c6407f..ceddabd1f5c6 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "trace.h" @@ -195,6 +196,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) int __kvm_xen_has_interrupt(struct kvm_vcpu *v) { + unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel); + bool atomic = in_atomic() || !task_is_running(current); int err; u8 rc = 0; @@ -204,6 +207,9 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) */ struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache; struct kvm_memslots *slots = kvm_memslots(v->kvm); + bool ghc_valid = slots->generation == ghc->generation && + !kvm_is_error_hva(ghc->hva) && ghc->memslot; + unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending); /* No need for compat handling here */ @@ -219,8 +225,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) * cache in kvm_read_guest_offset_cached(), but just uses * 
__get_user() instead. And falls back to the slow path. */ - if (likely(slots->generation == ghc->generation && - !kvm_is_error_hva(ghc->hva) && ghc->memslot)) { + if (!evtchn_pending_sel && ghc_valid) { /* Fast path */ pagefault_disable(); err = __get_user(rc, (u8 __user *)ghc->hva + offset); @@ -239,11 +244,82 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) * and we'll end up getting called again from a context where we *can* * fault in the page and wait for it. */ - if (in_atomic() || !task_is_running(current)) + if (atomic) return 1; - kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset, - sizeof(rc)); + if (!ghc_valid) { + err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len); + if (err || !ghc->memslot) { + /* + * If this failed, userspace has screwed up the + * vcpu_info mapping. No interrupts for you. + */ + return 0; + } + } + + /* + * Now we have a valid (protected by srcu) userspace HVA in + * ghc->hva which points to the struct vcpu_info. If there + * are any bits in the in-kernel evtchn_pending_sel then + * we need to write those to the guest vcpu_info and set + * its evtchn_upcall_pending flag. If there aren't any bits + * to add, we only want to *check* evtchn_upcall_pending. + */ + if (evtchn_pending_sel) { + bool long_mode = v->kvm->arch.xen.long_mode; + + if (!user_access_begin((void __user *)ghc->hva, sizeof(struct vcpu_info))) + return 0; + + if (IS_ENABLED(CONFIG_64BIT) && long_mode) { + struct vcpu_info __user *vi = (void __user *)ghc->hva; + + /* Attempt to set the evtchn_pending_sel bits in the + * guest, and if that succeeds then clear the same + * bits in the in-kernel version. */ + asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n" + "\tnotq %0\n" + "\t" LOCK_PREFIX "andq %0, %2\n" + "2:\n" + "\t.section .fixup,\"ax\"\n" + "3:\tjmp\t2b\n" + "\t.previous\n" + _ASM_EXTABLE_UA(1b, 3b) + : "=r" (evtchn_pending_sel), + "+m" (vi->evtchn_pending_sel), + "+m" (v->arch.xen.evtchn_pending_sel) + : "0" (evtchn_pending_sel)); + } else { + struct compat_vcpu_info __user *vi = (void __user *)ghc->hva; + u32 evtchn_pending_sel32 = evtchn_pending_sel; + + /* Attempt to set the evtchn_pending_sel bits in the + * guest, and if that succeeds then clear the same + * bits in the in-kernel version. 
*/ + asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n" + "\tnotl %0\n" + "\t" LOCK_PREFIX "andl %0, %2\n" + "2:\n" + "\t.section .fixup,\"ax\"\n" + "3:\tjmp\t2b\n" + "\t.previous\n" + _ASM_EXTABLE_UA(1b, 3b) + : "=r" (evtchn_pending_sel32), + "+m" (vi->evtchn_pending_sel), + "+m" (v->arch.xen.evtchn_pending_sel) + : "0" (evtchn_pending_sel32)); + } + rc = 1; + unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err); + + err: + user_access_end(); + + mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + } else { + __get_user(rc, (u8 __user *)ghc->hva + offset); + } return rc; } @@ -740,3 +816,179 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) return 0; } + +static inline int max_evtchn_port(struct kvm *kvm) +{ + if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) + return EVTCHN_2L_NR_CHANNELS; + else + return COMPAT_EVTCHN_2L_NR_CHANNELS; +} + +/* + * This follows the kvm_set_irq() API, so it returns: + * < 0 Interrupt was ignored (masked or not delivered for other reasons) + * = 0 Interrupt was coalesced (previous irq is still pending) + * > 0 Number of CPUs interrupt was delivered to + */ +int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm) +{ + struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; + struct kvm_vcpu *vcpu; + unsigned long *pending_bits, *mask_bits; + unsigned long flags; + int port_word_bit; + bool kick_vcpu = false; + int idx; + int rc; + + vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu); + if (!vcpu) + return -1; + + if (!vcpu->arch.xen.vcpu_info_set) + return -1; + + if (e->xen_evtchn.port >= max_evtchn_port(kvm)) + return -1; + + rc = -EWOULDBLOCK; + read_lock_irqsave(&gpc->lock, flags); + + idx = srcu_read_lock(&kvm->srcu); + if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) + goto out_rcu; + + if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { + struct shared_info *shinfo = gpc->khva; + pending_bits = (unsigned long *)&shinfo->evtchn_pending; + mask_bits = (unsigned long *)&shinfo->evtchn_mask; + port_word_bit = e->xen_evtchn.port / 64; + } else { + struct compat_shared_info *shinfo = gpc->khva; + pending_bits = (unsigned long *)&shinfo->evtchn_pending; + mask_bits = (unsigned long *)&shinfo->evtchn_mask; + port_word_bit = e->xen_evtchn.port / 32; + } + + /* + * If this port wasn't already set, and if it isn't masked, then + * we try to set the corresponding bit in the in-kernel shadow of + * evtchn_pending_sel for the target vCPU. And if *that* wasn't + * already set, then we kick the vCPU in question to write to the + * *real* evtchn_pending_sel in its own guest vcpu_info struct. + */ + if (test_and_set_bit(e->xen_evtchn.port, pending_bits)) { + rc = 0; /* It was already raised */ + } else if (test_bit(e->xen_evtchn.port, mask_bits)) { + rc = -1; /* Masked */ + } else { + rc = 1; /* Delivered. But was the vCPU waking already? 
*/ + if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) + kick_vcpu = true; + } + + out_rcu: + srcu_read_unlock(&kvm->srcu, idx); + read_unlock_irqrestore(&gpc->lock, flags); + + if (kick_vcpu) { + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } + + return rc; +} + +/* This is the version called from kvm_set_irq() as the .set function */ +static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status) +{ + bool mm_borrowed = false; + int rc; + + if (!level) + return -1; + + rc = kvm_xen_set_evtchn_fast(e, kvm); + if (rc != -EWOULDBLOCK) + return rc; + + if (current->mm != kvm->mm) { + /* + * If not on a thread which already belongs to this KVM, + * we'd better be in the irqfd workqueue. + */ + if (WARN_ON_ONCE(current->mm)) + return -EINVAL; + + kthread_use_mm(kvm->mm); + mm_borrowed = true; + } + + /* + * For the irqfd workqueue, using the main kvm->lock mutex is + * fine since this function is invoked from kvm_set_irq() with + * no other lock held, no srcu. In future if it will be called + * directly from a vCPU thread (e.g. on hypercall for an IPI) + * then it may need to switch to using a leaf-node mutex for + * serializing the shared_info mapping. + */ + mutex_lock(&kvm->lock); + + /* + * It is theoretically possible for the page to be unmapped + * and the MMU notifier to invalidate the shared_info before + * we even get to use it. In that case, this looks like an + * infinite loop. It was tempting to do it via the userspace + * HVA instead... but that just *hides* the fact that it's + * an infinite loop, because if a fault occurs and it waits + * for the page to come back, it can *still* immediately + * fault and have to wait again, repeatedly. + * + * Conversely, the page could also have been reinstated by + * another thread before we even obtain the mutex above, so + * check again *first* before remapping it. 
+ */ + do { + struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; + int idx; + + rc = kvm_xen_set_evtchn_fast(e, kvm); + if (rc != -EWOULDBLOCK) + break; + + idx = srcu_read_lock(&kvm->srcu); + rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, + PAGE_SIZE, false); + srcu_read_unlock(&kvm->srcu, idx); + } while(!rc); + + mutex_unlock(&kvm->lock); + + if (mm_borrowed) + kthread_unuse_mm(kvm->mm); + + return rc; +} + +int kvm_xen_setup_evtchn(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) + +{ + if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm)) + return -EINVAL; + + /* We only support 2 level event channels for now */ + if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) + return -EINVAL; + + e->xen_evtchn.port = ue->u.xen_evtchn.port; + e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu; + e->xen_evtchn.priority = ue->u.xen_evtchn.priority; + e->set = evtchn_set_fn; + + return 0; +} diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index cc0cf5f37450..adbcc9ed59db 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -24,6 +24,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc); void kvm_xen_init_vm(struct kvm *kvm); void kvm_xen_destroy_vm(struct kvm *kvm); +int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm); +int kvm_xen_setup_evtchn(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue); + static inline bool kvm_xen_msr_enabled(struct kvm *kvm) { return static_branch_unlikely(&kvm_xen_enabled.key) && @@ -134,6 +140,9 @@ struct compat_shared_info { struct compat_arch_shared_info arch; }; +#define COMPAT_EVTCHN_2L_NR_CHANNELS (8 * \ + sizeof_field(struct compat_shared_info, \ + evtchn_pending)) struct compat_vcpu_runstate_info { int state; uint64_t state_entry_time; -- cgit v1.2.1 From 55749769fe608fa3f4a075e42e89d237c8e37637 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 10 Dec 2021 16:36:24 +0000 Subject: KVM: x86: Fix wall clock writes in Xen shared_info not to mark page dirty When dirty ring logging is enabled, any dirty logging without an active vCPU context will cause a kernel oops. But we've already declared that the shared_info page doesn't get dirty tracking anyway, since it would be kind of insane to mark it dirty every time we deliver an event channel interrupt. Userspace is supposed to just assume it's always dirty any time a vCPU can run or event channels are routed. So stop using the generic kvm_write_wall_clock() and just write directly through the gfn_to_pfn_cache that we already have set up. We can make kvm_write_wall_clock() static in x86.c again now, but let's not remove the 'sec_hi_ofs' argument even though it's not used yet. At some point we *will* want to use that for KVM guests too. 
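The write path added below follows the pvclock wall-clock version protocol: bump the version to an odd value, write the fields, then bump it back to an even value, with write barriers between the steps so the guest can detect a torn read. A minimal single-threaded sketch, with an invented wc layout standing in for the real struct in the shared_info page:

/* Sketch of the version/seqcount discipline used for the wall clock. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct wall_clock { uint32_t version, sec, nsec; };

static void write_wall_clock(struct wall_clock *wc, uint64_t wall_nsec)
{
	/* Make the version odd: readers treat odd as "update in progress". */
	wc->version = (wc->version + 1) | 1;
	atomic_thread_fence(memory_order_release);	/* smp_wmb() */

	wc->nsec = wall_nsec % 1000000000u;		/* do_div() in-kernel */
	wc->sec  = wall_nsec / 1000000000u;
	atomic_thread_fence(memory_order_release);	/* smp_wmb() */

	wc->version++;		/* back to even: update complete */
}

int main(void)
{
	struct wall_clock wc = { 0 };

	write_wall_clock(&wc, 1639152984123456789ull);
	printf("v=%u sec=%u nsec=%u\n", wc.version, wc.sec, wc.nsec);
	return 0;
}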
Fixes: 629b5348841a ("KVM: x86/xen: update wallclock region") Reported-by: butt3rflyh4ck Signed-off-by: David Woodhouse Message-Id: <20211210163625.2886-6-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- arch/x86/kvm/x86.h | 1 - arch/x86/kvm/xen.c | 62 ++++++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 49 insertions(+), 16 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3050601d5d73..6492329f2e9a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2135,7 +2135,7 @@ static s64 get_kvmclock_base_ns(void) } #endif -void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) +static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) { int version; int r; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 4abcd8d9836d..da7031e80f23 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -301,7 +301,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) return is_smm(vcpu) || static_call(kvm_x86_apic_init_signal_blocked)(vcpu); } -void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); u64 get_kvmclock_ns(struct kvm *kvm); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index ceddabd1f5c6..0e3f7d6e9fd7 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -25,8 +25,11 @@ DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ); static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) { struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; + struct pvclock_wall_clock *wc; gpa_t gpa = gfn_to_gpa(gfn); - int wc_ofs, sec_hi_ofs; + u32 *wc_sec_hi; + u32 wc_version; + u64 wall_nsec; int ret = 0; int idx = srcu_read_lock(&kvm->srcu); @@ -35,32 +38,63 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) goto out; } - ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, false, true, gpa, - PAGE_SIZE, false); - if (ret) - goto out; + do { + ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, false, true, + gpa, PAGE_SIZE, false); + if (ret) + goto out; + + /* + * This code mirrors kvm_write_wall_clock() except that it writes + * directly through the pfn cache and doesn't mark the page dirty. 
+ */ + wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm); + + /* It could be invalid again already, so we need to check */ + read_lock_irq(&gpc->lock); + + if (gpc->valid) + break; + + read_unlock_irq(&gpc->lock); + } while (1); /* Paranoia checks on the 32-bit struct layout */ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924); BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); - /* 32-bit location by default */ - wc_ofs = offsetof(struct compat_shared_info, wc); - sec_hi_ofs = offsetof(struct compat_shared_info, arch.wc_sec_hi); - #ifdef CONFIG_X86_64 /* Paranoia checks on the 64-bit struct layout */ BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00); BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c); - if (kvm->arch.xen.long_mode) { - wc_ofs = offsetof(struct shared_info, wc); - sec_hi_ofs = offsetof(struct shared_info, wc_sec_hi); - } + if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { + struct shared_info *shinfo = gpc->khva; + + wc_sec_hi = &shinfo->wc_sec_hi; + wc = &shinfo->wc; + } else #endif + { + struct compat_shared_info *shinfo = gpc->khva; + + wc_sec_hi = &shinfo->arch.wc_sec_hi; + wc = &shinfo->wc; + } + + /* Increment and ensure an odd value */ + wc_version = wc->version = (wc->version + 1) | 1; + smp_wmb(); + + wc->nsec = do_div(wall_nsec, 1000000000); + wc->sec = (u32)wall_nsec; + *wc_sec_hi = wall_nsec >> 32; + smp_wmb(); + + wc->version = wc_version + 1; + read_unlock_irq(&gpc->lock); - kvm_write_wall_clock(kvm, gpa + wc_ofs, sec_hi_ofs - wc_ofs); kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE); out: -- cgit v1.2.1 From 907d139318b5109e5b676b32b0f4a2c666a8d9ac Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 13 Dec 2021 11:07:40 +0100 Subject: KVM: VMX: Provide vmread version using asm-goto-with-outputs Use asm-goto-output for smaller fast path code. 
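For readers unfamiliar with the construct, here is a trivial standalone example of an asm block that both produces an output and can branch to a C error label, which is what removes the error flag from the fast path. It needs a compiler with asm-goto-output support (e.g. GCC 11+ or recent clang), and the instruction is a stand-in, not VMREAD:

/* Toy asm-goto-with-output: fallthrough is the fast path with a valid
 * output; the label is the failure path, like do_fail above. */
#include <stdio.h>

static long checked_negate(long x)
{
	long value;

	asm goto("neg %0\n\t"		/* sets OF when x == LONG_MIN */
		 "jo %l[overflow]"
		 : "=r" (value)
		 : "0" (x)
		 : "cc"
		 : overflow);
	return value;	/* output is only valid on the fallthrough path */
overflow:
	return 0;	/* error path, analogous to do_fail */
}

int main(void)
{
	printf("%ld\n", checked_negate(42));	/* prints -42 */
	return 0;
}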
Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx_ops.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 9e9ef47e988c..67f745250e50 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -71,6 +71,31 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) { unsigned long value; +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT + + asm_volatile_goto("1: vmread %[field], %[output]\n\t" "jna %l[do_fail]\n\t" + + _ASM_EXTABLE(1b, %l[do_exception]) + + : [output] "=r" (value) : [field] "r" (field) : "cc" : do_fail, do_exception); + + return value; + +do_fail: + WARN_ONCE(1, "kvm: vmread failed: field=%lx\n", field); + pr_warn_ratelimited("kvm: vmread failed: field=%lx\n", field); + return 0; + +do_exception: + kvm_spurious_fault(); + return 0; + +#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ + asm volatile("1: vmread %2, %1\n\t" ".byte 0x3e\n\t" /* branch taken hint */ "ja 3f\n\t" @@ -101,6 +126,8 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) _ASM_EXTABLE(1b, 4b) : ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc"); return value; + +#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ } static __always_inline u16 vmcs_read16(unsigned long field) -- cgit v1.2.1 From 405329fc9aeef1e3e2eccaadf32b539ad6c7120f Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 16 Dec 2021 11:13:54 -0600 Subject: KVM: SVM: include CR3 in initial VMSA state for SEV-ES guests Normally guests will set up CR3 themselves, but some guests, such as kselftests, and potentially CONFIG_PVH guests, rely on being booted with paging enabled and CR3 initialized to a pre-allocated page table. Currently CR3 updates via KVM_SET_SREGS* are not loaded into the guest VMCB until just prior to entering the guest. For SEV-ES/SEV-SNP, this is too late, since it will have switched over to using the VMSA page prior to that point, with the VMSA CR3 copied from the VMCB initial CR3 value: 0. Address this by sync'ing the CR3 value into the VMCB save area immediately when KVM_SET_SREGS* is issued so it will find its way into the initial VMSA. Suggested-by: Tom Lendacky Signed-off-by: Michael Roth Message-Id: <20211216171358.61140-10-michael.roth@amd.com> [Remove vmx_post_set_cr3; add a remark about kvm_set_cr3 not calling the new hook. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 19 +++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 1 + arch/x86/kvm/x86.c | 2 ++ 3 files changed, 22 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 5557867dcb69..c3d9006478a4 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1801,6 +1801,24 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) vmcb_mark_dirty(svm->vmcb, VMCB_DT); } +static void svm_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * For guests that don't set guest_state_protected, the cr3 update is + * handled via kvm_mmu_load() while entering the guest. For guests + * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to + * VMCB save area now, since the save area will become the initial + * contents of the VMSA, and future VMCB save area updates won't be + * seen.
+ */ + if (sev_es_guest(vcpu->kvm)) { + svm->vmcb->save.cr3 = cr3; + vmcb_mark_dirty(svm->vmcb, VMCB_CR); + } +} + void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4624,6 +4642,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, .set_cr0 = svm_set_cr0, + .post_set_cr3 = svm_post_set_cr3, .is_valid_cr4 = svm_is_valid_cr4, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1d53b8144f83..7b5abe25e1e5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3134,6 +3134,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, vmcs_writel(GUEST_CR3, guest_cr3); } + static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6492329f2e9a..a9f1044dd6b2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1179,6 +1179,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vcpu->arch.cr3 = cr3; kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + /* Do not call post_set_cr3, we do not get here for confidential guests. */ handle_tlb_flush: /* @@ -10618,6 +10619,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3); kvm_set_cr8(vcpu, sregs->cr8); -- cgit v1.2.1 From cc04b6a21d431359eceeec0d812b492088b04af5 Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Wed, 5 Jan 2022 04:35:14 -0800 Subject: kvm: x86: Fix xstate_required_size() to follow XSTATE alignment rule CPUID.0xD.1.EBX enumerates the size of the XSAVE area (in compacted format) required by XSAVES. If CPUID.0xD.i.ECX[1] is set for a state component (i), this state component should be located on the next 64-bytes boundary following the preceding state component in the compacted layout. Fix xstate_required_size() to follow the alignment rule. AMX is the first state component with 64-bytes alignment to catch this bug. Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-4-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0b920e12bb6d..f3e6fda6b858 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -42,7 +42,11 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) if (xstate_bv & 0x1) { u32 eax, ebx, ecx, edx, offset; cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx); - offset = compacted ? ret : ebx; + /* ECX[1]: 64B alignment in compacted form */ + if (compacted) + offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret; + else + offset = ebx; ret = max(ret, offset + eax); } -- cgit v1.2.1 From 445ecdf79be0c71ca248f7611aeefceaea3ec59f Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Wed, 5 Jan 2022 04:35:15 -0800 Subject: kvm: x86: Exclude unpermitted xfeatures at KVM_GET_SUPPORTED_CPUID KVM_GET_SUPPORTED_CPUID should not include any dynamic xstates in CPUID[0xD] if they have not been requested with prctl. Otherwise a process which directly passes KVM_GET_SUPPORTED_CPUID to KVM_SET_CPUID2 would now fail even if it doesn't intend to use a dynamically enabled feature. 
Userspace must know that prctl is required and allocate a >4K xstate buffer before setting any dynamic bit. Suggested-by: Paolo Bonzini Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-5-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index f3e6fda6b858..eb52dde5deec 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -815,11 +815,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) goto out; } break; - case 0xd: - entry->eax &= supported_xcr0; + case 0xd: { + u64 guest_perm = xstate_get_guest_group_perm(); + + entry->eax &= supported_xcr0 & guest_perm; entry->ebx = xstate_required_size(supported_xcr0, false); entry->ecx = entry->ebx; - entry->edx &= supported_xcr0 >> 32; + entry->edx &= (supported_xcr0 & guest_perm) >> 32; if (!supported_xcr0) break; @@ -866,6 +868,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = 0; } break; + } case 0x12: /* Intel SGX */ if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) { -- cgit v1.2.1 From 5ab2f45bba4894a0db4af8567da3efd6228dd010 Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Wed, 5 Jan 2022 04:35:19 -0800 Subject: kvm: x86: Enable dynamic xfeatures at KVM_SET_CPUID2 KVM can request fpstate expansion in two ways: 1) When intercepting guest updates to XCR0 and XFD MSR; 2) Before vcpu runs (e.g. at KVM_SET_CPUID2); The first option doesn't waste memory for a legacy guest that doesn't support XFD. However doing so introduces more complexity and also imposes an order requirement in the restoring path, i.e. XCR0/XFD must be restored before XSTATE. Given that, the agreement is to take the static approach. This is considered a better tradeoff though it does waste 8K of memory for a legacy guest if its CPUID includes dynamically-enabled xfeatures. Successful fpstate expansion requires the userspace VMM to acquire guest xstate permissions before calling KVM_SET_CPUID2. Also take the chance to adjust the indent in kvm_set_cpuid(). Signed-off-by: Jing Liu Signed-off-by: Sean Christopherson Signed-off-by: Kevin Tian Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-9-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index eb52dde5deec..a0fedf1514ab 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -84,9 +84,12 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( return NULL; } -static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent) +static int kvm_check_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid_entry2 *entries, + int nent) { struct kvm_cpuid_entry2 *best; + u64 xfeatures; /* * The existing code assumes virtual address is 48-bit or 57-bit in the @@ -100,7 +103,20 @@ static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent) return -EINVAL; } - return 0; + /* + * Exposing dynamic xfeatures to the guest requires additional + * enabling in the FPU, e.g. to expand the guest XSAVE state size. 
+ */ + best = cpuid_entry2_find(entries, nent, 0xd, 0); + if (!best) + return 0; + + xfeatures = best->eax | ((u64)best->edx << 32); + xfeatures &= XFEATURE_MASK_USER_DYNAMIC; + if (!xfeatures) + return 0; + + return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); } static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) @@ -280,21 +296,21 @@ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu) static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, int nent) { - int r; + int r; - r = kvm_check_cpuid(e2, nent); - if (r) - return r; + r = kvm_check_cpuid(vcpu, e2, nent); + if (r) - return r; + return r; - kvfree(vcpu->arch.cpuid_entries); - vcpu->arch.cpuid_entries = e2; - vcpu->arch.cpuid_nent = nent; + kvfree(vcpu->arch.cpuid_entries); + vcpu->arch.cpuid_entries = e2; + vcpu->arch.cpuid_nent = nent; - kvm_update_kvm_cpuid_base(vcpu); - kvm_update_cpuid_runtime(vcpu); - kvm_vcpu_after_set_cpuid(vcpu); + kvm_update_kvm_cpuid_base(vcpu); + kvm_update_cpuid_runtime(vcpu); + kvm_vcpu_after_set_cpuid(vcpu); - return 0; + return 0; } /* when an old userspace process fills a new kernel module */ -- cgit v1.2.1 From 820a6ee944e74e57255ac2e90916ecdaade57b95 Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Wed, 5 Jan 2022 04:35:21 -0800 Subject: kvm: x86: Add emulation for IA32_XFD Intel's eXtended Feature Disable (XFD) feature allows software to dynamically adjust the fpstate buffer size for XSAVE features which have large state. Because guest fpstate has been expanded for all possible dynamic xstates at KVM_SET_CPUID2, emulation of the IA32_XFD MSR is straightforward. For writes, just call fpu_update_guest_xfd() to update the guest fpu container once all sanity checks have passed. For reads, simply return the cached value in the container. 
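[Editor's note] The expansion mentioned above happens only if userspace acquired guest xstate permissions before KVM_SET_CPUID2 (see the two preceding patches). A minimal userspace sketch of that ordering follows; it assumes the ARCH_REQ_XCOMP_GUEST_PERM arch_prctl introduced by this series, and enable_amx_for_guest() is a hypothetical helper, not part of any patch:

#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/kvm.h>

#ifndef ARCH_REQ_XCOMP_GUEST_PERM
#define ARCH_REQ_XCOMP_GUEST_PERM	0x1025
#endif
#define XFEATURE_XTILEDATA		18

static int enable_amx_for_guest(int vcpu_fd, struct kvm_cpuid2 *cpuid)
{
	/*
	 * Request AMX tile data permission for guests *before*
	 * KVM_SET_CPUID2, so kvm_check_cpuid() can expand the guest
	 * fpstate via fpu_enable_guest_xfd_features().  arch_prctl()
	 * has no glibc wrapper, hence the raw syscall.
	 */
	if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM,
		    XFEATURE_XTILEDATA))
		return -1;	/* kernel too old, or AMX unavailable */

	/* With permission granted, tile bits in CPUID.0xD are accepted. */
	return ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid);
}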
Signed-off-by: Jing Liu Signed-off-by: Zeng Guang Signed-off-by: Wei Wang Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-11-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a9f1044dd6b2..b18d2838606f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1377,6 +1377,7 @@ static const u32 msrs_to_save_all[] = { MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, + MSR_IA32_XFD, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; @@ -3686,6 +3687,19 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.msr_misc_features_enables = data; break; +#ifdef CONFIG_X86_64 + case MSR_IA32_XFD: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + if (data & ~(XFEATURE_MASK_USER_DYNAMIC & + vcpu->arch.guest_supported_xcr0)) + return 1; + + fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data); + break; +#endif default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); @@ -4006,6 +4020,15 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_HWCR: msr_info->data = vcpu->arch.msr_hwcr; break; +#ifdef CONFIG_X86_64 + case MSR_IA32_XFD: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd; + break; +#endif default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); @@ -6441,6 +6464,10 @@ static void kvm_init_msr_list(void) min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) continue; break; + case MSR_IA32_XFD: + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) + continue; + break; default: break; } -- cgit v1.2.1 From ec5be88ab29fd9145c7ced20b58fb96f7c6b6890 Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Wed, 5 Jan 2022 04:35:23 -0800 Subject: kvm: x86: Intercept #NM for saving IA32_XFD_ERR Guest IA32_XFD_ERR is generally modified in two places: - Set by CPU when #NM is triggered; - Cleared by guest in its #NM handler; Intercept #NM for the first case when a nonzero value is written to IA32_XFD. Nonzero indicates that the guest is willing to do dynamic fpstate expansion for certain xfeatures, thus KVM needs to manage and virtualize guest XFD_ERR properly. The vcpu exception bitmap is updated in XFD write emulation according to guest_fpu::xfd. Save the current XFD_ERR value to the guest_fpu container in the #NM VM-exit handler. This must be done with interrupts disabled, otherwise the unsaved MSR value may be clobbered by host activity. The saving operation is conducted conditionally, only when guest_fpu::xfd contains a non-zero value. Doing so also avoids a misread on a platform which doesn't support XFD but where #NM is triggered due to L1 interception. Queueing #NM to the guest is postponed to handle_exception_nmi(). This goes through the nested_vmx check so a virtual vmexit is queued instead when #NM is triggered in L2 but L1 wants to intercept it. Restore the host value (always ZERO outside of the host #NM handler) before enabling interrupts. Restore the guest value from the guest_fpu container right before entering the guest (with interrupts disabled). 
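[Editor's note] Taken together, the hunks below implement the following round trip. This is an illustrative condensation only (it borrows kernel-internal names from the diffs), not a function that exists in the patch:

/*
 * Illustrative only: a single-function view of the XFD_ERR handling
 * that the hunks below spread across vcpu_enter_guest() and the
 * irqoff #NM handler.
 */
static void xfd_err_round_trip(struct kvm_vcpu *vcpu)
{
	/* Before VM-entry, IRQs off: expose the guest's XFD_ERR value. */
	if (vcpu->arch.guest_fpu.xfd_err)
		wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);

	/* ... VM-entry; the guest runs; a #NM VM-exit may occur ... */

	/*
	 * In the irqoff #NM handler: snapshot XFD_ERR before host code
	 * can clobber it.  Read only when guest xfd is non-zero, so the
	 * MSR is never touched on hardware without XFD support.
	 */
	if (vcpu->arch.guest_fpu.fpstate->xfd)
		rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);

	/* Before re-enabling IRQs: restore the host value, always zero. */
	if (vcpu->arch.guest_fpu.xfd_err)
		wrmsrl(MSR_IA32_XFD_ERR, 0);
}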
Suggested-by: Thomas Gleixner Signed-off-by: Jing Liu Signed-off-by: Kevin Tian Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-13-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmcs.h | 5 +++++ arch/x86/kvm/vmx/vmx.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 6 ++++++ 3 files changed, 59 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 6e5de2e2b0da..e325c290a816 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -129,6 +129,11 @@ static inline bool is_machine_check(u32 intr_info) return is_exception_n(intr_info, MC_VECTOR); } +static inline bool is_nm_fault(u32 intr_info) +{ + return is_exception_n(intr_info, NM_VECTOR); +} + /* Undocumented: icebp/int1 */ static inline bool is_icebp(u32 intr_info) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7b5abe25e1e5..84f6904cdb6e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -761,6 +762,13 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); } + /* + * Trap #NM if guest xfd contains a non-zero value so guest XFD_ERR + * can be saved timely. + */ + if (vcpu->arch.guest_fpu.fpstate->xfd) + eb |= (1u << NM_VECTOR); + vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -1967,6 +1975,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KERNEL_GS_BASE: vmx_write_guest_kernel_gs_base(vmx, data); break; + case MSR_IA32_XFD: + ret = kvm_set_msr_common(vcpu, msr_info); + /* Update #NM interception according to guest xfd */ + if (!ret) + vmx_update_exception_bitmap(vcpu); + break; #endif case MSR_IA32_SYSENTER_CS: if (is_guest_mode(vcpu)) @@ -4798,6 +4812,17 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) if (is_machine_check(intr_info) || is_nmi(intr_info)) return 1; /* handled by handle_exception_nmi_irqoff() */ + /* + * Queue the exception here instead of in handle_nm_fault_irqoff(). + * This ensures the nested_vmx check is not skipped so vmexit can + * be reflected to L1 (when it intercepts #NM) before reaching this + * point. + */ + if (is_nm_fault(intr_info)) { + kvm_queue_exception(vcpu, NM_VECTOR); + return 1; + } + if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); @@ -6399,6 +6424,26 @@ static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, kvm_after_interrupt(vcpu); } +static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) +{ + /* + * Save xfd_err to guest_fpu before interrupt is enabled, so the + * MSR value is not clobbered by the host activity before the guest + * has chance to consume it. + * + * Do not blindly read xfd_err here, since this exception might + * be caused by L1 interception on a platform which doesn't + * support xfd at all. + * + * Do it conditionally upon guest_fpu::xfd. xfd_err matters + * only when xfd contains a non-zero value. + * + * Queuing exception is done in vmx_handle_exit. See comment there. 
+ */ + if (vcpu->arch.guest_fpu.fpstate->xfd) + rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); +} + static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist; @@ -6407,6 +6452,9 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) /* if exit due to PF check for async PF */ if (is_page_fault(intr_info)) vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); + /* if exit due to NM, handle before interrupts are enabled */ + else if (is_nm_fault(intr_info)) + handle_nm_fault_irqoff(&vmx->vcpu); /* Handle machine checks before interrupts are enabled */ else if (is_machine_check(intr_info)) kvm_machine_check(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b18d2838606f..bb9534590a3a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9952,6 +9952,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_return(); + if (vcpu->arch.guest_fpu.xfd_err) + wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); @@ -10015,6 +10018,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) static_call(kvm_x86_handle_exit_irqoff)(vcpu); + if (vcpu->arch.guest_fpu.xfd_err) + wrmsrl(MSR_IA32_XFD_ERR, 0); + /* * Consume any pending interrupts, including the possible source of * VM-Exit on SVM and any ticks that occur between VM-Exit and now. -- cgit v1.2.1 From 548e83650a51dce0d188b9e41b1e2ca5d63597cf Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Wed, 5 Jan 2022 04:35:24 -0800 Subject: kvm: x86: Emulate IA32_XFD_ERR for guest Emulate read/write to IA32_XFD_ERR MSR. Only the saved value in the guest_fpu container is touched in the emulation handler. 
The actual MSR update is handled right before entering the guest (with preemption disabled). Signed-off-by: Jing Liu Signed-off-by: Zeng Guang Signed-off-by: Wei Wang Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-14-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bb9534590a3a..2475b64cb762 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1377,7 +1377,7 @@ static const u32 msrs_to_save_all[] = { MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, - MSR_IA32_XFD, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; @@ -3699,6 +3699,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data); break; + case MSR_IA32_XFD_ERR: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + if (data & ~(XFEATURE_MASK_USER_DYNAMIC & + vcpu->arch.guest_supported_xcr0)) + return 1; + + vcpu->arch.guest_fpu.xfd_err = data; + break; #endif default: if (kvm_pmu_is_valid_msr(vcpu, msr)) @@ -4028,6 +4039,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd; break; + case MSR_IA32_XFD_ERR: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + msr_info->data = vcpu->arch.guest_fpu.xfd_err; + break; #endif default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) @@ -6465,6 +6483,7 @@ static void kvm_init_msr_list(void) continue; break; case MSR_IA32_XFD: + case MSR_IA32_XFD_ERR: if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) continue; break; -- cgit v1.2.1 From 61f208134a871047f1d642ed3b813f4f71e30b0e Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Wed, 5 Jan 2022 04:35:25 -0800 Subject: kvm: x86: Disable RDMSR interception of IA32_XFD_ERR This saves one unnecessary VM-exit in the guest #NM handler, given that the MSR is already restored with the guest value before the guest is resumed. 
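[Editor's note] For intuition, this is roughly what a guest kernel's #NM path does; a sketch loosely modeled on Linux's handle_xfd_event(), with illustrative names. The rdmsrl() is the access that no longer VM-exits once read interception is disabled; the wrmsrl() is still intercepted:

static bool guest_handle_xfd_nm(void)
{
	u64 xfd_err;

	/* Pass-through read: served from the MSR bitmap, no VM-exit. */
	rdmsrl(MSR_IA32_XFD_ERR, xfd_err);
	if (!xfd_err)
		return false;	/* genuine math fault, not XFD */

	/* Clear XFD_ERR; the host expects it to read as zero. */
	wrmsrl(MSR_IA32_XFD_ERR, 0);

	/*
	 * ... expand the faulting task's fpstate, then clear the bit in
	 * IA32_XFD so the trapping instruction can be restarted ...
	 */
	return true;
}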
Suggested-by: Paolo Bonzini Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-15-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 6 ++++++ arch/x86/kvm/vmx/vmx.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 84f6904cdb6e..b8b7f5c7b3df 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -162,6 +162,7 @@ static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE, + MSR_IA32_XFD_ERR, #endif MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, @@ -7288,6 +7289,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) } } + if (kvm_cpu_cap_has(X86_FEATURE_XFD)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); + + set_cr4_guest_host_mask(vmx); vmx_write_encls_bitmap(vcpu, NULL); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 18111368cf85..69dd2f85abdc 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -349,7 +349,7 @@ struct vcpu_vmx { struct lbr_desc lbr_desc; /* Save desired MSR intercept (read: pass-through) state */ -#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13 +#define MAX_POSSIBLE_PASSTHROUGH_MSRS 14 struct { DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS); DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS); -- cgit v1.2.1 From 86aff7a4799286635efd94dab17b513544703cad Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Wed, 5 Jan 2022 04:35:26 -0800 Subject: kvm: x86: Add XCR0 support for Intel AMX Two XCR0 bits are defined for AMX to support the XSAVE mechanism. Bit 17 is for tilecfg and bit 18 is for tiledata. The value of XCR0[18:17] is always either 00b or 11b. Also, the SDM recommends that only 64-bit operating systems enable Intel AMX by setting XCR0[18:17]. A 32-bit host kernel never sets the tile bits in vcpu->arch.guest_supported_xcr0. Signed-off-by: Jing Liu Signed-off-by: Kevin Tian Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-16-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2475b64cb762..993eee6451ea 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -211,7 +211,7 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs; #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ - | XFEATURE_MASK_PKRU) + | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) u64 __read_mostly host_efer; EXPORT_SYMBOL_GPL(host_efer); @@ -1010,6 +1010,11 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) return 1; } + + if ((xcr0 & XFEATURE_MASK_XTILE) && + ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE)) + return 1; + vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) -- cgit v1.2.1 From 690a757d610e50c2c3acd2e4bc3992cfc63feff2 Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Wed, 5 Jan 2022 04:35:27 -0800 Subject: kvm: x86: Add CPUID support for Intel AMX Extend CPUID emulation to support XFD, AMX_TILE, AMX_INT8 and AMX_BF16. Adding those bits into kvm_cpu_caps finally activates all the previous logic in this series. Hide XFD on 32-bit host kernels. 
Otherwise it leads to a weird situation where KVM tells userspace to migrate MSR_IA32_XFD and then rejects attempts to read/write the MSR. Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-17-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a0fedf1514ab..ba4c3d5d2386 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -442,9 +442,11 @@ void kvm_set_cpu_caps(void) #ifdef CONFIG_X86_64 unsigned int f_gbpages = F(GBPAGES); unsigned int f_lm = F(LM); + unsigned int f_xfd = F(XFD); #else unsigned int f_gbpages = 0; unsigned int f_lm = 0; + unsigned int f_xfd = 0; #endif memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); @@ -512,7 +514,8 @@ void kvm_set_cpu_caps(void) F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | - F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) + F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) | + F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) ); /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ @@ -531,7 +534,7 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_mask(CPUID_D_1_EAX, - F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) + F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd ); kvm_cpu_cap_init_scattered(CPUID_12_EAX, @@ -657,6 +660,8 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, case 0x14: case 0x17: case 0x18: + case 0x1d: + case 0x1e: case 0x1f: case 0x8000001d: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; @@ -929,6 +934,24 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) goto out; } break; + /* Intel AMX TILE */ + case 0x1d: + if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + + for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) { + if (!do_host_cpuid(array, function, i)) + goto out; + } + break; + case 0x1e: /* TMUL information */ + if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + break; case KVM_CPUID_SIGNATURE: { const u32 *sigptr = (const u32 *)KVM_SIGNATURE; entry->eax = KVM_CPUID_FEATURES; -- cgit v1.2.1 From be50b2065dfa3d88428fdfdc340d154d96bf6848 Mon Sep 17 00:00:00 2001 From: Guang Zeng Date: Wed, 5 Jan 2022 04:35:29 -0800 Subject: kvm: x86: Add support for getting/setting expanded xstate buffer With KVM_CAP_XSAVE, userspace uses a hardcoded 4KB buffer to get/set xstate data from/to KVM. This doesn't work when dynamic xfeatures (e.g. AMX) are exposed to the guest as they require a larger buffer size. Introduce a new capability (KVM_CAP_XSAVE2). The userspace VMM gets the required xstate buffer size via KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2). KVM_SET_XSAVE is extended to work with both legacy and new capabilities by doing a properly-sized memdup_user() based on the guest fpu container. KVM_GET_XSAVE is kept for backward compatibility. Instead, KVM_GET_XSAVE2 is introduced under KVM_CAP_XSAVE2 as the preferred interface for getting the xstate buffer (4KB or larger) from KVM (Link: https://lkml.org/lkml/2021/12/15/510) Also, update the API doc with the new KVM_GET_XSAVE2 ioctl. 
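[Editor's note] A minimal userspace sketch of the intended flow, using only the UAPI added here (error handling trimmed; get_vcpu_xsave() is a hypothetical helper):

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Allocate and fetch the (possibly >4KB) xstate buffer.
 * KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) returns the required size;
 * anything <= 4KB means the legacy layout suffices.
 */
static struct kvm_xsave *get_vcpu_xsave(int vm_fd, int vcpu_fd, size_t *size)
{
	struct kvm_xsave *xsave;
	long sz = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XSAVE2);

	if (sz < (long)sizeof(struct kvm_xsave))
		sz = sizeof(struct kvm_xsave);	/* legacy 4KB layout */

	xsave = calloc(1, sz);
	if (!xsave)
		return NULL;

	/* Prefer KVM_GET_XSAVE2; fall back to the legacy 4KB ioctl. */
	if (ioctl(vcpu_fd, sz > (long)sizeof(struct kvm_xsave) ?
			   KVM_GET_XSAVE2 : KVM_GET_XSAVE, xsave)) {
		free(xsave);
		return NULL;
	}

	*size = sz;
	return xsave;
}

The same buffer can then be handed back with KVM_SET_XSAVE, which sizes its memdup_user() from the guest fpu container as described above.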
Signed-off-by: Guang Zeng Signed-off-by: Wei Wang Signed-off-by: Jing Liu Signed-off-by: Kevin Tian Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-19-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 2 ++ arch/x86/kvm/x86.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 47 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ba4c3d5d2386..c55e57b30e81 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -32,7 +32,7 @@ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_GPL(kvm_cpu_caps); -static u32 xstate_required_size(u64 xstate_bv, bool compacted) +u32 xstate_required_size(u64 xstate_bv, bool compacted) { int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index c99edfff7f82..8a770b481d9d 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -30,6 +30,8 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); +u32 xstate_required_size(u64 xstate_bv, bool compacted); + int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 993eee6451ea..bde18ca657db 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4314,6 +4314,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) else r = 0; break; + case KVM_CAP_XSAVE2: { + u64 guest_perm = xstate_get_guest_group_perm(); + + r = xstate_required_size(supported_xcr0 & guest_perm, false); + if (r < sizeof(struct kvm_xsave)) + r = sizeof(struct kvm_xsave); + break; + } default: break; } @@ -4917,6 +4925,16 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, vcpu->arch.pkru); } +static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, + u8 *state, unsigned int size) +{ + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return; + + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, + state, size, vcpu->arch.pkru); +} + static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { @@ -5370,6 +5388,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { + r = -EINVAL; + if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave)) + break; + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xsave) @@ -5384,7 +5406,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XSAVE: { - u.xsave = memdup_user(argp, sizeof(*u.xsave)); + int size = vcpu->arch.guest_fpu.uabi_size; + + u.xsave = memdup_user(argp, size); if (IS_ERR(u.xsave)) { r = PTR_ERR(u.xsave); goto out_nofree; @@ -5393,6 +5417,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; } + + case KVM_GET_XSAVE2: { + int size = vcpu->arch.guest_fpu.uabi_size; + + u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT); + r = -ENOMEM; + if (!u.xsave) + break; + + kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size); + + r = -EFAULT; + if (copy_to_user(argp, u.xsave, size)) + break; + + r = 0; + break; + } + case KVM_GET_XCRS: { u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); r = -ENOMEM; -- cgit v1.2.1 From b5274b1b7ba89fe8ed38cc470041cd6ba0dfb79b Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 5 Jan 2022 04:35:32 -0800 Subject: 
kvm: x86: Disable interception for IA32_XFD on demand Always intercepting IA32_XFD causes non-negligible overhead when this register is updated frequently in the guest. Disable r/w emulation after intercepting the first WRMSR(IA32_XFD) with a non-zero value. Disabling WRMSR emulation implies that IA32_XFD becomes out-of-sync with the software states in fpstate and the per-cpu xfd cache. This leads to two additional changes accordingly: - Call fpu_sync_guest_vmexit_xfd_state() after vm-exit to bring software states back in-sync with the MSR, before handle_exit_irqoff() is called. - Always trap #NM once write interception is disabled for IA32_XFD. The #NM exception is rare if the guest doesn't use dynamic features. Otherwise, there is at most one exception per dynamic feature per guest task. p.s. We have confirmed that the SDM is being revised to say that when setting IA32_XFD[18] the AMX register state is not guaranteed to be preserved. This clarification avoids adding a mess for a creative guest which sets IA32_XFD[18]=1 before saving active AMX state to its own storage. Signed-off-by: Kevin Tian Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-22-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 24 +++++++++++++++++++----- arch/x86/kvm/vmx/vmx.h | 2 +- arch/x86/kvm/x86.c | 8 ++++++++ 3 files changed, 28 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b8b7f5c7b3df..15e30602782b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -162,6 +162,7 @@ static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, #endif MSR_IA32_SYSENTER_CS, @@ -764,10 +765,11 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) } /* - * Trap #NM if guest xfd contains a non-zero value so guest XFD_ERR - * can be saved timely. + * Disabling xfd interception indicates that dynamic xfeatures + * might be used in the guest. Always trap #NM in this case + * to save guest xfd_err timely. */ - if (vcpu->arch.guest_fpu.fpstate->xfd) + if (vcpu->arch.xfd_no_write_intercept) eb |= (1u << NM_VECTOR); vmcs_write32(EXCEPTION_BITMAP, eb); @@ -1978,9 +1980,21 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_XFD: ret = kvm_set_msr_common(vcpu, msr_info); - /* Update #NM interception according to guest xfd */ - if (!ret) + /* + * Always intercepting WRMSR could incur non-negligible + * overhead given xfd might be changed frequently in + * guest context switch. Disable write interception + * upon the first write with a non-zero value (indicating + * potential usage on dynamic xfeatures). Also update + * exception bitmap to trap #NM for proper virtualization + * of guest xfd_err. 
+ */ + if (!ret && data) { + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, + MSR_TYPE_RW); + vcpu->arch.xfd_no_write_intercept = true; vmx_update_exception_bitmap(vcpu); + } break; #endif case MSR_IA32_SYSENTER_CS: diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 69dd2f85abdc..f8fc7441baea 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -349,7 +349,7 @@ struct vcpu_vmx { struct lbr_desc lbr_desc; /* Save desired MSR intercept (read: pass-through) state */ -#define MAX_POSSIBLE_PASSTHROUGH_MSRS 14 +#define MAX_POSSIBLE_PASSTHROUGH_MSRS 15 struct { DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS); DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bde18ca657db..60da2331ec32 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10083,6 +10083,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); + /* + * Sync xfd before calling handle_exit_irqoff() which may + * rely on the fact that guest_fpu::xfd is up-to-date (e.g. + * in #NM irqoff handler). + */ + if (vcpu->arch.xfd_no_write_intercept) + fpu_sync_guest_vmexit_xfd_state(); + static_call(kvm_x86_handle_exit_irqoff)(vcpu); if (vcpu->arch.guest_fpu.xfd_err) -- cgit v1.2.1
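[Editor's note] For a sense of why this last optimization matters, below is roughly the guest-side write path it targets; a sketch modeled on the Linux guest's xfd_update_state(), with the surrounding checks trimmed. A guest kernel re-writes IA32_XFD at context switch whenever the incoming task's xfd differs from the per-cpu cached value, so once AMX and non-AMX tasks run side by side these writes become frequent, and pass-through avoids a VM-exit on each one:

static DEFINE_PER_CPU(u64, xfd_state);

/* Sketch of the guest's per-task XFD switch; illustrative only. */
static inline void guest_xfd_update_state(u64 next_task_xfd)
{
	/*
	 * The per-cpu cache avoids redundant WRMSRs, but switching
	 * between tasks with different xfd values still writes the MSR
	 * on every context switch.
	 */
	if (__this_cpu_read(xfd_state) != next_task_xfd) {
		wrmsrl(MSR_IA32_XFD, next_task_xfd);
		__this_cpu_write(xfd_state, next_task_xfd);
	}
}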