summaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/cpuid.c32
-rw-r--r--arch/x86/kvm/hyperv.c63
-rw-r--r--arch/x86/kvm/irq_comm.c5
-rw-r--r--arch/x86/kvm/lapic.h4
-rw-r--r--arch/x86/kvm/mmu/spte.h2
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c25
-rw-r--r--arch/x86/kvm/pmu.c3
-rw-r--r--arch/x86/kvm/pmu.h3
-rw-r--r--arch/x86/kvm/svm/nested.c12
-rw-r--r--arch/x86/kvm/vmx/nested.c20
-rw-r--r--arch/x86/kvm/vmx/vmx.c28
-rw-r--r--arch/x86/kvm/x86.c3
-rw-r--r--arch/x86/kvm/xen.c230
13 files changed, 242 insertions, 188 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b14653b61470..596061c1610e 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -770,16 +770,22 @@ struct kvm_cpuid_array {
int nent;
};
+static struct kvm_cpuid_entry2 *get_next_cpuid(struct kvm_cpuid_array *array)
+{
+ if (array->nent >= array->maxnent)
+ return NULL;
+
+ return &array->entries[array->nent++];
+}
+
static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
u32 function, u32 index)
{
- struct kvm_cpuid_entry2 *entry;
+ struct kvm_cpuid_entry2 *entry = get_next_cpuid(array);
- if (array->nent >= array->maxnent)
+ if (!entry)
return NULL;
- entry = &array->entries[array->nent++];
-
memset(entry, 0, sizeof(*entry));
entry->function = function;
entry->index = index;
@@ -956,22 +962,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->edx = edx.full;
break;
}
- /*
- * Per Intel's SDM, the 0x1f is a superset of 0xb,
- * thus they can be handled by common code.
- */
case 0x1f:
case 0xb:
/*
- * Populate entries until the level type (ECX[15:8]) of the
- * previous entry is zero. Note, CPUID EAX.{0x1f,0xb}.0 is
- * the starting entry, filled by the primary do_host_cpuid().
+ * No topology; a valid topology is indicated by the presence
+ * of subleaf 1.
*/
- for (i = 1; entry->ecx & 0xff00; ++i) {
- entry = do_host_cpuid(array, function, i);
- if (!entry)
- goto out;
- }
+ entry->eax = entry->ebx = entry->ecx = 0;
break;
case 0xd: {
u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm();
@@ -1202,6 +1199,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->ebx = entry->ecx = entry->edx = 0;
break;
case 0x8000001e:
+ /* Do not return host topology information. */
+ entry->eax = entry->ebx = entry->ecx = 0;
+ entry->edx = 0; /* reserved */
break;
case 0x8000001F:
if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) {
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 2c7f2a26421e..e8296942a868 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1769,6 +1769,7 @@ static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_ba
}
struct kvm_hv_hcall {
+ /* Hypercall input data */
u64 param;
u64 ingpa;
u64 outgpa;
@@ -1779,12 +1780,21 @@ struct kvm_hv_hcall {
bool fast;
bool rep;
sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS];
+
+ /*
+ * Current read offset when KVM reads hypercall input data gradually,
+ * either offset in bytes from 'ingpa' for regular hypercalls or the
+ * number of already consumed 'XMM halves' for 'fast' hypercalls.
+ */
+ union {
+ gpa_t data_offset;
+ int consumed_xmm_halves;
+ };
};
static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc,
- u16 orig_cnt, u16 cnt_cap, u64 *data,
- int consumed_xmm_halves, gpa_t offset)
+ u16 orig_cnt, u16 cnt_cap, u64 *data)
{
/*
* Preserve the original count when ignoring entries via a "cap", KVM
@@ -1799,11 +1809,11 @@ static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc,
* Each XMM holds two sparse banks, but do not count halves that
* have already been consumed for hypercall parameters.
*/
- if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves)
+ if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - hc->consumed_xmm_halves)
return HV_STATUS_INVALID_HYPERCALL_INPUT;
for (i = 0; i < cnt; i++) {
- j = i + consumed_xmm_halves;
+ j = i + hc->consumed_xmm_halves;
if (j % 2)
data[i] = sse128_hi(hc->xmm[j / 2]);
else
@@ -1812,27 +1822,24 @@ static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc,
return 0;
}
- return kvm_read_guest(kvm, hc->ingpa + offset, data,
+ return kvm_read_guest(kvm, hc->ingpa + hc->data_offset, data,
cnt * sizeof(*data));
}
static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
- u64 *sparse_banks, int consumed_xmm_halves,
- gpa_t offset)
+ u64 *sparse_banks)
{
if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS)
return -EINVAL;
/* Cap var_cnt to ignore banks that cannot contain a legal VP index. */
return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS,
- sparse_banks, consumed_xmm_halves, offset);
+ sparse_banks);
}
-static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[],
- int consumed_xmm_halves, gpa_t offset)
+static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[])
{
- return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt,
- entries, consumed_xmm_halves, offset);
+ return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt, entries);
}
static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu,
@@ -1926,8 +1933,6 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
struct kvm_vcpu *v;
unsigned long i;
bool all_cpus;
- int consumed_xmm_halves = 0;
- gpa_t data_offset;
/*
* The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS
@@ -1955,12 +1960,12 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
flush.address_space = hc->ingpa;
flush.flags = hc->outgpa;
flush.processor_mask = sse128_lo(hc->xmm[0]);
- consumed_xmm_halves = 1;
+ hc->consumed_xmm_halves = 1;
} else {
if (unlikely(kvm_read_guest(kvm, hc->ingpa,
&flush, sizeof(flush))))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- data_offset = sizeof(flush);
+ hc->data_offset = sizeof(flush);
}
trace_kvm_hv_flush_tlb(flush.processor_mask,
@@ -1985,12 +1990,12 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
flush_ex.flags = hc->outgpa;
memcpy(&flush_ex.hv_vp_set,
&hc->xmm[0], sizeof(hc->xmm[0]));
- consumed_xmm_halves = 2;
+ hc->consumed_xmm_halves = 2;
} else {
if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex,
sizeof(flush_ex))))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- data_offset = sizeof(flush_ex);
+ hc->data_offset = sizeof(flush_ex);
}
trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
@@ -2009,8 +2014,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
if (!hc->var_cnt)
goto ret_success;
- if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks,
- consumed_xmm_halves, data_offset))
+ if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
@@ -2021,8 +2025,10 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
* consumed_xmm_halves to make sure TLB flush entries are read
* from the correct offset.
*/
- data_offset += hc->var_cnt * sizeof(sparse_banks[0]);
- consumed_xmm_halves += hc->var_cnt;
+ if (hc->fast)
+ hc->consumed_xmm_halves += hc->var_cnt;
+ else
+ hc->data_offset += hc->var_cnt * sizeof(sparse_banks[0]);
}
if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
@@ -2030,8 +2036,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) {
tlb_flush_entries = NULL;
} else {
- if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries,
- consumed_xmm_halves, data_offset))
+ if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
tlb_flush_entries = __tlb_flush_entries;
}
@@ -2180,9 +2185,13 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
if (!hc->var_cnt)
goto ret_success;
- if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks, 1,
- offsetof(struct hv_send_ipi_ex,
- vp_set.bank_contents)))
+ if (!hc->fast)
+ hc->data_offset = offsetof(struct hv_send_ipi_ex,
+ vp_set.bank_contents);
+ else
+ hc->consumed_xmm_halves = 1;
+
+ if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 0687162c4f22..3742d9adacfc 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -426,8 +426,9 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
kvm_set_msi_irq(vcpu->kvm, entry, &irq);
if (irq.trig_mode &&
- kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
- irq.dest_id, irq.dest_mode))
+ (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
+ irq.dest_id, irq.dest_mode) ||
+ kvm_apic_pending_eoi(vcpu, irq.vector)))
__set_bit(irq.vector, ioapic_handled_vectors);
}
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 28e3769066e2..58c3242fcc7a 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -188,11 +188,11 @@ static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
extern struct static_key_false_deferred apic_hw_disabled;
-static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
+static inline bool kvm_apic_hw_enabled(struct kvm_lapic *apic)
{
if (static_branch_unlikely(&apic_hw_disabled.key))
return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
- return MSR_IA32_APICBASE_ENABLE;
+ return true;
}
extern struct static_key_false_deferred apic_sw_disabled;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 1f03701b943a..6f54dc9409c9 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -363,7 +363,7 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
* A shadow-present leaf SPTE may be non-writable for 4 possible reasons:
*
* 1. To intercept writes for dirty logging. KVM write-protects huge pages
- * so that they can be split be split down into the dirty logging
+ * so that they can be split down into the dirty logging
* granularity (4KiB) whenever the guest writes to them. KVM also
* write-protects 4KiB pages so that writes can be recorded in the dirty log
* (e.g. if not using PML). SPTEs are write-protected for dirty logging
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 771210ce5181..d6df38d371a0 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1074,7 +1074,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
int ret = RET_PF_FIXED;
bool wrprot = false;
- WARN_ON(sp->role.level != fault->goal_level);
+ if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
+ return RET_PF_RETRY;
+
if (unlikely(!fault->slot))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
else
@@ -1173,9 +1175,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (fault->nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
- if (iter.level == fault->goal_level)
- break;
-
/*
* If SPTE has been frozen by another thread, just give up and
* retry, avoiding unnecessary page table allocation and free.
@@ -1183,6 +1182,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (is_removed_spte(iter.old_spte))
goto retry;
+ if (iter.level == fault->goal_level)
+ goto map_target_level;
+
/* Step down into the lower level page table if it exists. */
if (is_shadow_present_pte(iter.old_spte) &&
!is_large_pte(iter.old_spte))
@@ -1203,8 +1205,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
r = tdp_mmu_link_sp(kvm, &iter, sp, true);
/*
- * Also force the guest to retry the access if the upper level SPTEs
- * aren't in place.
+ * Force the guest to retry if installing an upper level SPTE
+ * failed, e.g. because a different task modified the SPTE.
*/
if (r) {
tdp_mmu_free_sp(sp);
@@ -1214,11 +1216,20 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (fault->huge_page_disallowed &&
fault->req_level >= iter.level) {
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
- track_possible_nx_huge_page(kvm, sp);
+ if (sp->nx_huge_page_disallowed)
+ track_possible_nx_huge_page(kvm, sp);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
}
+ /*
+ * The walk aborted before reaching the target level, e.g. because the
+ * iterator detected an upper level SPTE was frozen during traversal.
+ */
+ WARN_ON_ONCE(iter.level == fault->goal_level);
+ goto retry;
+
+map_target_level:
ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
retry:
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 684393c22105..eb594620dd75 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -238,7 +238,8 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
return false;
/* recalibrate sample period and check if it's accepted by perf core */
- if (perf_event_period(pmc->perf_event,
+ if (is_sampling_event(pmc->perf_event) &&
+ perf_event_period(pmc->perf_event,
get_sample_period(pmc, pmc->counter)))
return false;
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 85ff3c0588ba..cdb91009701d 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -140,7 +140,8 @@ static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
static inline void pmc_update_sample_period(struct kvm_pmc *pmc)
{
- if (!pmc->perf_event || pmc->is_paused)
+ if (!pmc->perf_event || pmc->is_paused ||
+ !is_sampling_event(pmc->perf_event))
return;
perf_event_period(pmc->perf_event,
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index bc9cd7086fa9..add65dd59756 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -138,15 +138,13 @@ void recalc_intercepts(struct vcpu_svm *svm)
c->intercepts[i] = h->intercepts[i];
if (g->int_ctl & V_INTR_MASKING_MASK) {
- /* We only want the cr8 intercept bits of L1 */
- vmcb_clr_intercept(c, INTERCEPT_CR8_READ);
- vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
-
/*
- * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
- * affect any interrupt we may want to inject; therefore,
- * interrupt window vmexits are irrelevant to L0.
+ * Once running L2 with HF_VINTR_MASK, EFLAGS.IF and CR8
+ * does not affect any interrupt we may want to inject;
+ * therefore, writes to CR8 are irrelevant to L0, as are
+ * interrupt window vmexits.
*/
+ vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
vmcb_clr_intercept(c, INTERCEPT_VINTR);
}
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index b6f4411b613e..d93c715cda6a 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5296,10 +5296,19 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vcpu);
- kvm_vcpu_write_guest(vcpu,
- vmptr + offsetof(struct vmcs12,
- launch_state),
- &zero, sizeof(zero));
+ /*
+ * Silently ignore memory errors on VMCLEAR, Intel's pseudocode
+ * for VMCLEAR includes a "ensure that data for VMCS referenced
+ * by the operand is in memory" clause that guards writes to
+ * memory, i.e. doing nothing for I/O is architecturally valid.
+ *
+ * FIXME: Suppress failures if and only if no memslot is found,
+ * i.e. exit to userspace if __copy_to_user() fails.
+ */
+ (void)kvm_vcpu_write_guest(vcpu,
+ vmptr + offsetof(struct vmcs12,
+ launch_state),
+ &zero, sizeof(zero));
} else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
nested_release_evmcs(vcpu);
}
@@ -6873,7 +6882,8 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_RDSEED_EXITING |
SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_TSC_SCALING;
+ SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
/*
* We can emulate "VMCS shadowing," even if the hardware
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index fe5615fd8295..7eec0226d56a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3440,18 +3440,15 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
- if (var->unusable || !var->present)
- ar = 1 << 16;
- else {
- ar = var->type & 15;
- ar |= (var->s & 1) << 4;
- ar |= (var->dpl & 3) << 5;
- ar |= (var->present & 1) << 7;
- ar |= (var->avl & 1) << 12;
- ar |= (var->l & 1) << 13;
- ar |= (var->db & 1) << 14;
- ar |= (var->g & 1) << 15;
- }
+ ar = var->type & 15;
+ ar |= (var->s & 1) << 4;
+ ar |= (var->dpl & 3) << 5;
+ ar |= (var->present & 1) << 7;
+ ar |= (var->avl & 1) << 12;
+ ar |= (var->l & 1) << 13;
+ ar |= (var->db & 1) << 14;
+ ar |= (var->g & 1) << 15;
+ ar |= (var->unusable || !var->present) << 16;
return ar;
}
@@ -4459,6 +4456,13 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
* controls for features that are/aren't exposed to the guest.
*/
if (nested) {
+ /*
+ * All features that can be added or removed to VMX MSRs must
+ * be supported in the first place for nested virtualization.
+ */
+ if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
+ enabled = false;
+
if (enabled)
vmx->nested.msrs.secondary_ctls_high |= control;
else
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 312aea1854ae..da4bbd043a7b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13132,6 +13132,9 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
struct x86_exception *e)
{
if (r == X86EMUL_PROPAGATE_FAULT) {
+ if (KVM_BUG_ON(!e, vcpu->kvm))
+ return -EIO;
+
kvm_inject_emulated_page_fault(vcpu, e);
return 1;
}
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index d7af40240248..8fd41f5deae3 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -41,7 +41,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
int ret = 0;
int idx = srcu_read_lock(&kvm->srcu);
- if (gfn == GPA_INVALID) {
+ if (gfn == KVM_XEN_INVALID_GFN) {
kvm_gpc_deactivate(gpc);
goto out;
}
@@ -271,7 +271,15 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
* Attempt to obtain the GPC lock on *both* (if there are two)
* gfn_to_pfn caches that cover the region.
*/
- read_lock_irqsave(&gpc1->lock, flags);
+ if (atomic) {
+ local_irq_save(flags);
+ if (!read_trylock(&gpc1->lock)) {
+ local_irq_restore(flags);
+ return;
+ }
+ } else {
+ read_lock_irqsave(&gpc1->lock, flags);
+ }
while (!kvm_gpc_check(gpc1, user_len1)) {
read_unlock_irqrestore(&gpc1->lock, flags);
@@ -304,9 +312,18 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
* The guest's runstate_info is split across two pages and we
* need to hold and validate both GPCs simultaneously. We can
* declare a lock ordering GPC1 > GPC2 because nothing else
- * takes them more than one at a time.
+ * takes them more than one at a time. Set a subclass on the
+ * gpc1 lock to make lockdep shut up about it.
*/
- read_lock(&gpc2->lock);
+ lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
+ if (atomic) {
+ if (!read_trylock(&gpc2->lock)) {
+ read_unlock_irqrestore(&gpc1->lock, flags);
+ return;
+ }
+ } else {
+ read_lock(&gpc2->lock);
+ }
if (!kvm_gpc_check(gpc2, user_len2)) {
read_unlock(&gpc2->lock);
@@ -590,26 +607,26 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
r = -EINVAL;
} else {
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.long_mode = !!data->u.long_mode;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
}
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
break;
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
if (data->u.vector && data->u.vector < 0x10)
r = -EINVAL;
else {
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.upcall_vector = data->u.vector;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
}
break;
@@ -619,9 +636,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
case KVM_XEN_ATTR_TYPE_XEN_VERSION:
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.xen_version = data->u.xen_version;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
break;
@@ -630,9 +647,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
r = -EOPNOTSUPP;
break;
}
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
break;
@@ -647,7 +664,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
int r = -ENOENT;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
switch (data->type) {
case KVM_XEN_ATTR_TYPE_LONG_MODE:
@@ -659,7 +676,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
if (kvm->arch.xen.shinfo_cache.active)
data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
else
- data->u.shared_info.gfn = GPA_INVALID;
+ data->u.shared_info.gfn = KVM_XEN_INVALID_GFN;
r = 0;
break;
@@ -686,7 +703,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
}
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
return r;
}
@@ -694,7 +711,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
{
int idx, r = -ENOENT;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
idx = srcu_read_lock(&vcpu->kvm->srcu);
switch (data->type) {
@@ -705,7 +722,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
offsetof(struct compat_vcpu_info, time));
- if (data->u.gpa == GPA_INVALID) {
+ if (data->u.gpa == KVM_XEN_INVALID_GPA) {
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
r = 0;
break;
@@ -719,7 +736,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
- if (data->u.gpa == GPA_INVALID) {
+ if (data->u.gpa == KVM_XEN_INVALID_GPA) {
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
r = 0;
break;
@@ -739,7 +756,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
r = -EOPNOTSUPP;
break;
}
- if (data->u.gpa == GPA_INVALID) {
+ if (data->u.gpa == KVM_XEN_INVALID_GPA) {
r = 0;
deactivate_out:
kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
@@ -922,7 +939,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
}
srcu_read_unlock(&vcpu->kvm->srcu, idx);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
return r;
}
@@ -930,14 +947,14 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
{
int r = -ENOENT;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
switch (data->type) {
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
if (vcpu->arch.xen.vcpu_info_cache.active)
data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
else
- data->u.gpa = GPA_INVALID;
+ data->u.gpa = KVM_XEN_INVALID_GPA;
r = 0;
break;
@@ -945,7 +962,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
if (vcpu->arch.xen.vcpu_time_info_cache.active)
data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
else
- data->u.gpa = GPA_INVALID;
+ data->u.gpa = KVM_XEN_INVALID_GPA;
r = 0;
break;
@@ -1013,7 +1030,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
return r;
}
@@ -1069,6 +1086,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
: kvm->arch.xen_hvm_config.blob_size_32;
u8 *page;
+ int ret;
if (page_num >= blob_size)
return 1;
@@ -1079,10 +1097,10 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
if (IS_ERR(page))
return PTR_ERR(page);
- if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
- kfree(page);
+ ret = kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE);
+ kfree(page);
+ if (ret)
return 1;
- }
}
return 0;
}
@@ -1105,7 +1123,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
xhc->blob_size_32 || xhc->blob_size_64))
return -EINVAL;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
static_branch_inc(&kvm_xen_enabled.key);
@@ -1114,7 +1132,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
return 0;
}
@@ -1183,30 +1201,22 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
u64 param, u64 *r)
{
- int idx, i;
struct sched_poll sched_poll;
evtchn_port_t port, *ports;
- gpa_t gpa;
+ struct x86_exception e;
+ int i;
if (!lapic_in_kernel(vcpu) ||
!(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
return false;
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
- if (!gpa) {
- *r = -EFAULT;
- return true;
- }
-
if (IS_ENABLED(CONFIG_64BIT) && !longmode) {
struct compat_sched_poll sp32;
/* Sanity check that the compat struct definition is correct */
BUILD_BUG_ON(sizeof(sp32) != 16);
- if (kvm_vcpu_read_guest(vcpu, gpa, &sp32, sizeof(sp32))) {
+ if (kvm_read_guest_virt(vcpu, param, &sp32, sizeof(sp32), &e)) {
*r = -EFAULT;
return true;
}
@@ -1220,8 +1230,8 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
sched_poll.nr_ports = sp32.nr_ports;
sched_poll.timeout = sp32.timeout;
} else {
- if (kvm_vcpu_read_guest(vcpu, gpa, &sched_poll,
- sizeof(sched_poll))) {
+ if (kvm_read_guest_virt(vcpu, param, &sched_poll,
+ sizeof(sched_poll), &e)) {
*r = -EFAULT;
return true;
}
@@ -1243,18 +1253,13 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
} else
ports = &port;
+ if (kvm_read_guest_virt(vcpu, (gva_t)sched_poll.ports, ports,
+ sched_poll.nr_ports * sizeof(*ports), &e)) {
+ *r = -EFAULT;
+ return true;
+ }
+
for (i = 0; i < sched_poll.nr_ports; i++) {
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- gpa = kvm_mmu_gva_to_gpa_system(vcpu,
- (gva_t)(sched_poll.ports + i),
- NULL);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
- if (!gpa || kvm_vcpu_read_guest(vcpu, gpa,
- &ports[i], sizeof(port))) {
- *r = -EFAULT;
- goto out;
- }
if (ports[i] >= max_evtchn_port(vcpu->kvm)) {
*r = -EINVAL;
goto out;
@@ -1330,9 +1335,8 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
int vcpu_id, u64 param, u64 *r)
{
struct vcpu_set_singleshot_timer oneshot;
+ struct x86_exception e;
s64 delta;
- gpa_t gpa;
- int idx;
if (!kvm_xen_timer_enabled(vcpu))
return false;
@@ -1343,9 +1347,6 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
*r = -EINVAL;
return true;
}
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
/*
* The only difference for 32-bit compat is the 4 bytes of
@@ -1363,9 +1364,8 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) !=
sizeof_field(struct vcpu_set_singleshot_timer, flags));
- if (!gpa ||
- kvm_vcpu_read_guest(vcpu, gpa, &oneshot, longmode ? sizeof(oneshot) :
- sizeof(struct compat_vcpu_set_singleshot_timer))) {
+ if (kvm_read_guest_virt(vcpu, param, &oneshot, longmode ? sizeof(oneshot) :
+ sizeof(struct compat_vcpu_set_singleshot_timer), &e)) {
*r = -EFAULT;
return true;
}
@@ -1675,15 +1675,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
mm_borrowed = true;
}
- /*
- * For the irqfd workqueue, using the main kvm->lock mutex is
- * fine since this function is invoked from kvm_set_irq() with
- * no other lock held, no srcu. In future if it will be called
- * directly from a vCPU thread (e.g. on hypercall for an IPI)
- * then it may need to switch to using a leaf-node mutex for
- * serializing the shared_info mapping.
- */
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
/*
* It is theoretically possible for the page to be unmapped
@@ -1712,7 +1704,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, idx);
} while(!rc);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
if (mm_borrowed)
kthread_unuse_mm(kvm->mm);
@@ -1825,20 +1817,20 @@ static int kvm_xen_eventfd_update(struct kvm *kvm,
{
u32 port = data->u.evtchn.send_port;
struct evtchnfd *evtchnfd;
+ int ret;
- if (!port || port >= max_evtchn_port(kvm))
- return -EINVAL;
-
- mutex_lock(&kvm->lock);
+ /* Protect writes to evtchnfd as well as the idr lookup. */
+ mutex_lock(&kvm->arch.xen.xen_lock);
evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);
- mutex_unlock(&kvm->lock);
+ ret = -ENOENT;
if (!evtchnfd)
- return -ENOENT;
+ goto out_unlock;
/* For an UPDATE, nothing may change except the priority/vcpu */
+ ret = -EINVAL;
if (evtchnfd->type != data->u.evtchn.type)
- return -EINVAL;
+ goto out_unlock;
/*
* Port cannot change, and if it's zero that was an eventfd
@@ -1846,20 +1838,21 @@ static int kvm_xen_eventfd_update(struct kvm *kvm,
*/
if (!evtchnfd->deliver.port.port ||
evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port)
- return -EINVAL;
+ goto out_unlock;
/* We only support 2 level event channels for now */
if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
- return -EINVAL;
+ goto out_unlock;
- mutex_lock(&kvm->lock);
evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) {
evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
evtchnfd->deliver.port.vcpu_idx = -1;
}
- mutex_unlock(&kvm->lock);
- return 0;
+ ret = 0;
+out_unlock:
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ return ret;
}
/*
@@ -1871,12 +1864,9 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm,
{
u32 port = data->u.evtchn.send_port;
struct eventfd_ctx *eventfd = NULL;
- struct evtchnfd *evtchnfd = NULL;
+ struct evtchnfd *evtchnfd;
int ret = -EINVAL;
- if (!port || port >= max_evtchn_port(kvm))
- return -EINVAL;
-
evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL);
if (!evtchnfd)
return -ENOMEM;
@@ -1924,10 +1914,10 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm,
evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
}
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
GFP_KERNEL);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
if (ret >= 0)
return 0;
@@ -1945,15 +1935,14 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
{
struct evtchnfd *evtchnfd;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
if (!evtchnfd)
return -ENOENT;
- if (kvm)
- synchronize_srcu(&kvm->srcu);
+ synchronize_srcu(&kvm->srcu);
if (!evtchnfd->deliver.port.port)
eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
kfree(evtchnfd);
@@ -1962,18 +1951,42 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
static int kvm_xen_eventfd_reset(struct kvm *kvm)
{
- struct evtchnfd *evtchnfd;
+ struct evtchnfd *evtchnfd, **all_evtchnfds;
int i;
+ int n = 0;
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
- mutex_lock(&kvm->lock);
+ /*
+ * Because synchronize_srcu() cannot be called inside the
+ * critical section, first collect all the evtchnfd objects
+ * in an array as they are removed from evtchn_ports.
+ */
+ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i)
+ n++;
+
+ all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL);
+ if (!all_evtchnfds) {
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ return -ENOMEM;
+ }
+
+ n = 0;
idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+ all_evtchnfds[n++] = evtchnfd;
idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
- synchronize_srcu(&kvm->srcu);
+ }
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+
+ synchronize_srcu(&kvm->srcu);
+
+ while (n--) {
+ evtchnfd = all_evtchnfds[n];
if (!evtchnfd->deliver.port.port)
eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
kfree(evtchnfd);
}
- mutex_unlock(&kvm->lock);
+ kfree(all_evtchnfds);
return 0;
}
@@ -2002,20 +2015,22 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
{
struct evtchnfd *evtchnfd;
struct evtchn_send send;
- gpa_t gpa;
- int idx;
+ struct x86_exception e;
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
- if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &send, sizeof(send))) {
+ /* Sanity check: this structure is the same for 32-bit and 64-bit */
+ BUILD_BUG_ON(sizeof(send) != 4);
+ if (kvm_read_guest_virt(vcpu, param, &send, sizeof(send), &e)) {
*r = -EFAULT;
return true;
}
- /* The evtchn_ports idr is protected by vcpu->kvm->srcu */
+ /*
+ * evtchnfd is protected by kvm->srcu; the idr lookup instead
+ * is protected by RCU.
+ */
+ rcu_read_lock();
evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port);
+ rcu_read_unlock();
if (!evtchnfd)
return false;
@@ -2063,6 +2078,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
void kvm_xen_init_vm(struct kvm *kvm)
{
+ mutex_init(&kvm->arch.xen.xen_lock);
idr_init(&kvm->arch.xen.evtchn_ports);
kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN);
}