diff options
Diffstat (limited to 'arch/x86')
35 files changed, 450 insertions, 232 deletions
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index a2433ae8a65e..4efd39aacb9f 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -128,7 +128,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) regs->ax = -EFAULT; instrumentation_end(); - syscall_exit_to_user_mode(regs); + local_irq_disable(); + irqentry_exit_to_user_mode(regs); return false; } diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 541fdaf64045..0051cf5c792d 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -210,6 +210,8 @@ SYM_CODE_START(entry_SYSCALL_compat) /* Switch to the kernel stack */ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp +SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) + /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ pushq %r8 /* pt_regs->sp */ diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6ddeed3cd2ac..18df17129695 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -81,7 +81,11 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); -DEFINE_STATIC_CALL_NULL(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); +/* + * This one is magic, it will get called even when PMU init fails (because + * there is no PMU), in which case it should simply return NULL. + */ +DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] @@ -1944,13 +1948,6 @@ static void _x86_pmu_read(struct perf_event *event) x86_perf_event_update(event); } -static inline struct perf_guest_switch_msr * -perf_guest_get_msrs_nop(int *nr) -{ - *nr = 0; - return NULL; -} - static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -2025,7 +2022,7 @@ static int __init init_hw_perf_events(void) x86_pmu.read = _x86_pmu_read; if (!x86_pmu.guest_get_msrs) - x86_pmu.guest_get_msrs = perf_guest_get_msrs_nop; + x86_pmu.guest_get_msrs = (void *)&__static_call_return0; x86_pmu_static_call_update(); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 5bac48d5c18e..37ce38403cb8 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3659,11 +3659,16 @@ static int intel_pmu_hw_config(struct perf_event *event) return ret; if (event->attr.precise_ip) { + if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT) + return -EINVAL; + if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & - ~intel_pmu_large_pebs_flags(event))) + ~intel_pmu_large_pebs_flags(event))) { event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS; + event->attach_state |= PERF_ATTACH_SCHED_CB; + } } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); @@ -3676,6 +3681,7 @@ static int intel_pmu_hw_config(struct perf_event *event) ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; + event->attach_state |= PERF_ATTACH_SCHED_CB; /* * BTS is set up earlier in this path, so don't account twice diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 7ebae1826403..d32b302719fe 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2010,7 +2010,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d */ if (!pebs_status && cpuc->pebs_enabled && !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1))) - pebs_status = cpuc->pebs_enabled; + pebs_status = p->status = cpuc->pebs_enabled; bit = find_first_bit((unsigned long *)&pebs_status, x86_pmu.max_pebs_events); diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index a0f839aa144d..98b4dae5e8bc 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -23,6 +23,8 @@ unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx); int insn_get_code_seg_params(struct pt_regs *regs); int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]); +int insn_fetch_from_user_inatomic(struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE]); bool insn_decode(struct insn *insn, struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE], int buf_size); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 877a4025d8da..3768819693e5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -884,12 +884,29 @@ struct kvm_hv_syndbg { u64 options; }; +/* Current state of Hyper-V TSC page clocksource */ +enum hv_tsc_page_status { + /* TSC page was not set up or disabled */ + HV_TSC_PAGE_UNSET = 0, + /* TSC page MSR was written by the guest, update pending */ + HV_TSC_PAGE_GUEST_CHANGED, + /* TSC page MSR was written by KVM userspace, update pending */ + HV_TSC_PAGE_HOST_CHANGED, + /* TSC page was properly set up and is currently active */ + HV_TSC_PAGE_SET, + /* TSC page is currently being updated and therefore is inactive */ + HV_TSC_PAGE_UPDATING, + /* TSC page was set up with an inaccessible GPA */ + HV_TSC_PAGE_BROKEN, +}; + /* Hyper-V emulation context */ struct kvm_hv { struct mutex hv_lock; u64 hv_guest_os_id; u64 hv_hypercall; u64 hv_tsc_page; + enum hv_tsc_page_status hv_tsc_page_status; /* Hyper-v based guest crash (NT kernel bugcheck) parameters */ u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; @@ -931,6 +948,12 @@ enum kvm_irqchip_mode { KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; +struct kvm_x86_msr_filter { + u8 count; + bool default_allow:1; + struct msr_bitmap_range ranges[16]; +}; + #define APICV_INHIBIT_REASON_DISABLE 0 #define APICV_INHIBIT_REASON_HYPERV 1 #define APICV_INHIBIT_REASON_NESTED 2 @@ -963,7 +986,7 @@ struct kvm_arch { struct kvm_pit *vpit; atomic_t vapics_in_nmi_mode; struct mutex apic_map_lock; - struct kvm_apic_map *apic_map; + struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; bool apic_access_page_done; @@ -1025,18 +1048,13 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + bool bus_lock_detection_enabled; + /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ u32 user_space_msr_mask; + struct kvm_x86_msr_filter __rcu *msr_filter; - struct { - u8 count; - bool default_allow:1; - struct msr_bitmap_range ranges[16]; - } msr_filter; - - bool bus_lock_detection_enabled; - - struct kvm_pmu_event_filter *pmu_event_filter; + struct kvm_pmu_event_filter __rcu *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index dc6d149bf851..f1b9ed5efaa9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -551,15 +551,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, *size = fpu_kernel_xstate_size; } -/* - * Thread-synchronous status. - * - * This is different from the flags in that nobody else - * ever touches our thread-synchronous status, so we don't - * have to worry about atomic accesses. - */ -#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ - static inline void native_load_sp0(unsigned long sp0) { diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 2c35f1c01a2d..b6a9d51d1d79 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -25,6 +25,7 @@ void __end_SYSENTER_singlestep_region(void); void entry_SYSENTER_compat(void); void __end_entry_SYSENTER_compat(void); void entry_SYSCALL_compat(void); +void entry_SYSCALL_compat_safe_stack(void); void entry_INT80_compat(void); #ifdef CONFIG_XEN_PV void xen_entry_INT80_compat(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index d8324a236696..409f661481e1 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -94,6 +94,8 @@ struct pt_regs { #include <asm/paravirt_types.h> #endif +#include <asm/proto.h> + struct cpuinfo_x86; struct task_struct; @@ -175,6 +177,19 @@ static inline bool any_64bit_mode(struct pt_regs *regs) #ifdef CONFIG_X86_64 #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp + +static inline bool ip_within_syscall_gap(struct pt_regs *regs) +{ + bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && + regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); + +#ifdef CONFIG_IA32_EMULATION + ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat && + regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack); +#endif + + return ret; +} #endif static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 8b58d6975d5d..0bc9b0895f33 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -58,9 +58,8 @@ static __always_inline unsigned long smap_save(void) unsigned long flags; asm volatile ("# smap_save\n\t" - ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) - "pushf; pop %0; " __ASM_CLAC "\n\t" - "1:" + ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC "\n\t", + X86_FEATURE_SMAP) : "=rm" (flags) : : "memory", "cc"); return flags; @@ -69,9 +68,8 @@ static __always_inline unsigned long smap_save(void) static __always_inline void smap_restore(unsigned long flags) { asm volatile ("# smap_restore\n\t" - ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) - "push %0; popf\n\t" - "1:" + ALTERNATIVE("", "push %0; popf\n\t", + X86_FEATURE_SMAP) : : "g" (flags) : "memory", "cc"); } diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 0d751d5da702..06b740bae431 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -205,10 +205,23 @@ static inline int arch_within_stack_frames(const void * const stack, #endif +/* + * Thread-synchronous status. + * + * This is different from the flags in that nobody else + * ever touches our thread-synchronous status, so we don't + * have to worry about atomic accesses. + */ +#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ + +#ifndef __ASSEMBLY__ #ifdef CONFIG_COMPAT #define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */ + +#define arch_set_restart_data(restart) \ + do { restart->arch_data = current_thread_info()->status; } while (0) + #endif -#ifndef __ASSEMBLY__ #ifdef CONFIG_X86_32 #define in_ia32_syscall() true diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index bda4f2a36868..4f26700f314d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2342,6 +2342,11 @@ static int cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = -1, }; +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) +{ + return phys_id == cpuid_to_apicid[cpu]; +} + #ifdef CONFIG_SMP /** * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c3b60c37c728..73ff4dd426a8 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1032,6 +1032,16 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; legacy = mp_is_legacy_irq(irq); + /* + * IRQ2 is unusable for historical reasons on systems which + * have a legacy PIC. See the comment vs. IRQ2 further down. + * + * If this gets removed at some point then the related code + * in lapic_assign_system_vectors() needs to be adjusted as + * well. + */ + if (legacy && irq == PIC_CASCADE_IR) + return -EINVAL; } mutex_lock(&ioapic_mutex); diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 373e5fa3ce1f..51c7f5271aee 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -12,7 +12,7 @@ #include "common.h" -/* Ftrace callback handler for kprobes -- called under preepmt disabed */ +/* Ftrace callback handler for kprobes -- called under preepmt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5e78e01ca3b4..78bb0fae3982 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -836,28 +836,25 @@ static void kvm_kick_cpu(int cpu) static void kvm_wait(u8 *ptr, u8 val) { - unsigned long flags; - if (in_nmi()) return; - local_irq_save(flags); - - if (READ_ONCE(*ptr) != val) - goto out; - /* * halt until it's our turn and kicked. Note that we do safe halt * for irq enabled case to avoid hang when lock info is overwritten * in irq spinlock slowpath and no spurious interrupt occur to save us. */ - if (arch_irqs_disabled_flags(flags)) - halt(); - else - safe_halt(); + if (irqs_disabled()) { + if (READ_ONCE(*ptr) == val) + halt(); + } else { + local_irq_disable(); -out: - local_irq_restore(flags); + if (READ_ONCE(*ptr) == val) + safe_halt(); + + local_irq_enable(); + } } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index aa593743acf6..1fc0962c89c0 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -268,21 +268,20 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { -#ifdef CONFIG_X86_64 - u8 flags; + kvmclock_init_mem(); - if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) - return 0; +#ifdef CONFIG_X86_64 + if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { + u8 flags; - flags = pvclock_read_flags(&hv_clock_boot[0].pvti); - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) - return 0; + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; - kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + } #endif - kvmclock_init_mem(); - return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index 84c1821819af..04a780abb512 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -121,8 +121,18 @@ static void __init setup_vc_stacks(int cpu) cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); } -static __always_inline bool on_vc_stack(unsigned long sp) +static __always_inline bool on_vc_stack(struct pt_regs *regs) { + unsigned long sp = regs->sp; + + /* User-mode RSP is not trusted */ + if (user_mode(regs)) + return false; + + /* SYSCALL gap still has user-mode RSP */ + if (ip_within_syscall_gap(regs)) + return false; + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); } @@ -144,7 +154,7 @@ void noinstr __sev_es_ist_enter(struct pt_regs *regs) old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); /* Make room on the IST stack */ - if (on_vc_stack(regs->sp)) + if (on_vc_stack(regs)) new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist); else new_ist = old_ist - sizeof(old_ist); @@ -248,7 +258,7 @@ static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) int res; if (user_mode(ctxt->regs)) { - res = insn_fetch_from_user(ctxt->regs, buffer); + res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); if (!res) { ctxt->fi.vector = X86_TRAP_PF; ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; @@ -1248,13 +1258,12 @@ static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) { struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + irqentry_state_t irq_state; struct ghcb_state state; struct es_em_ctxt ctxt; enum es_result result; struct ghcb *ghcb; - lockdep_assert_irqs_disabled(); - /* * Handle #DB before calling into !noinstr code to avoid recursive #DB. */ @@ -1263,6 +1272,8 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) return; } + irq_state = irqentry_nmi_enter(regs); + lockdep_assert_irqs_disabled(); instrumentation_begin(); /* @@ -1325,6 +1336,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) out: instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); return; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index ea794a083c44..f306e85a08a6 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -766,30 +766,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { - /* - * This function is fundamentally broken as currently - * implemented. - * - * The idea is that we want to trigger a call to the - * restart_block() syscall and that we want in_ia32_syscall(), - * in_x32_syscall(), etc. to match whatever they were in the - * syscall being restarted. We assume that the syscall - * instruction at (regs->ip - 2) matches whatever syscall - * instruction we used to enter in the first place. - * - * The problem is that we can get here when ptrace pokes - * syscall-like values into regs even if we're not in a syscall - * at all. - * - * For now, we maintain historical behavior and guess based on - * stored state. We could do better by saving the actual - * syscall arch in restart_block or (with caveats on x32) by - * checking if regs->ip points to 'int $0x80'. The current - * behavior is incorrect if a tracer has a different bitness - * than the tracee. - */ #ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (current->restart_block.arch_data & TS_COMPAT) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7f5aec758f0e..ac1874a2a70e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -694,8 +694,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r * In the SYSCALL entry path the RSP value comes from user-space - don't * trust it and switch to the current kernel stack */ - if (regs->ip >= (unsigned long)entry_SYSCALL_64 && - regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack) { + if (ip_within_syscall_gap(regs)) { sp = this_cpu_read(cpu_current_top_of_stack); goto sync; } diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 2a1d47f47eee..a1202536fc57 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -13,7 +13,7 @@ #define orc_warn_current(args...) \ ({ \ - if (state->task == current) \ + if (state->task == current && !state->error) \ orc_warn(args); \ }) @@ -367,8 +367,8 @@ static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) return false; - *ip = regs->ip; - *sp = regs->sp; + *ip = READ_ONCE_NOCHECK(regs->ip); + *sp = READ_ONCE_NOCHECK(regs->sp); return true; } @@ -380,8 +380,8 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) return false; - *ip = regs->ip; - *sp = regs->sp; + *ip = READ_ONCE_NOCHECK(regs->ip); + *sp = READ_ONCE_NOCHECK(regs->sp); return true; } @@ -402,12 +402,12 @@ static bool get_reg(struct unwind_state *state, unsigned int reg_off, return false; if (state->full_regs) { - *val = ((unsigned long *)state->regs)[reg]; + *val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]); return true; } if (state->prev_regs) { - *val = ((unsigned long *)state->prev_regs)[reg]; + *val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]); return true; } diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 58fa8c029867..f98370a39936 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -520,10 +520,10 @@ static u64 get_time_ref_counter(struct kvm *kvm) u64 tsc; /* - * The guest has not set up the TSC page or the clock isn't - * stable, fall back to get_kvmclock_ns. + * Fall back to get_kvmclock_ns() when TSC page hasn't been set up, + * is broken, disabled or being updated. */ - if (!hv->tsc_ref.tsc_sequence) + if (hv->hv_tsc_page_status != HV_TSC_PAGE_SET) return div_u64(get_kvmclock_ns(kvm), 100); vcpu = kvm_get_vcpu(kvm, 0); @@ -1077,6 +1077,21 @@ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, return true; } +/* + * Don't touch TSC page values if the guest has opted for TSC emulation after + * migration. KVM doesn't fully support reenlightenment notifications and TSC + * access emulation and Hyper-V is known to expect the values in TSC page to + * stay constant before TSC access emulation is disabled from guest side + * (HV_X64_MSR_TSC_EMULATION_STATUS). KVM userspace is expected to preserve TSC + * frequency and guest visible TSC value across migration (and prevent it when + * TSC scaling is unsupported). + */ +static inline bool tsc_page_update_unsafe(struct kvm_hv *hv) +{ + return (hv->hv_tsc_page_status != HV_TSC_PAGE_GUEST_CHANGED) && + hv->hv_tsc_emulation_control; +} + void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock) { @@ -1087,7 +1102,8 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); - if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || + hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET) return; mutex_lock(&hv->hv_lock); @@ -1101,7 +1117,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, */ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), &tsc_seq, sizeof(tsc_seq)))) + goto out_err; + + if (tsc_seq && tsc_page_update_unsafe(hv)) { + if (kvm_read_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) + goto out_err; + + hv->hv_tsc_page_status = HV_TSC_PAGE_SET; goto out_unlock; + } /* * While we're computing and writing the parameters, force the @@ -1110,15 +1134,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, hv->tsc_ref.tsc_sequence = 0; if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) - goto out_unlock; + goto out_err; if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) - goto out_unlock; + goto out_err; /* Ensure sequence is zero before writing the rest of the struct. */ smp_wmb(); if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) - goto out_unlock; + goto out_err; /* * Now switch to the TSC page mechanism by writing the sequence. @@ -1131,8 +1155,45 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, smp_wmb(); hv->tsc_ref.tsc_sequence = tsc_seq; - kvm_write_guest(kvm, gfn_to_gpa(gfn), - &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)); + if (kvm_write_guest(kvm, gfn_to_gpa(gfn), + &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) + goto out_err; + + hv->hv_tsc_page_status = HV_TSC_PAGE_SET; + goto out_unlock; + +out_err: + hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; +out_unlock: + mutex_unlock(&hv->hv_lock); +} + +void kvm_hv_invalidate_tsc_page(struct kvm *kvm) +{ + struct kvm_hv *hv = to_kvm_hv(kvm); + u64 gfn; + + if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || + hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET || + tsc_page_update_unsafe(hv)) + return; + + mutex_lock(&hv->hv_lock); + + if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + goto out_unlock; + + /* Preserve HV_TSC_PAGE_GUEST_CHANGED/HV_TSC_PAGE_HOST_CHANGED states */ + if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET) + hv->hv_tsc_page_status = HV_TSC_PAGE_UPDATING; + + gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; + + hv->tsc_ref.tsc_sequence = 0; + if (kvm_write_guest(kvm, gfn_to_gpa(gfn), + &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) + hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; + out_unlock: mutex_unlock(&hv->hv_lock); } @@ -1193,8 +1254,15 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, } case HV_X64_MSR_REFERENCE_TSC: hv->hv_tsc_page = data; - if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) + if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) { + if (!host) + hv->hv_tsc_page_status = HV_TSC_PAGE_GUEST_CHANGED; + else + hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED; kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + } else { + hv->hv_tsc_page_status = HV_TSC_PAGE_UNSET; + } break; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: return kvm_hv_msr_set_crash_data(kvm, @@ -1229,6 +1297,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, hv->hv_tsc_emulation_control = data; break; case HV_X64_MSR_TSC_EMULATION_STATUS: + if (data && !host) + return 1; + hv->hv_tsc_emulation_status = data; break; case HV_X64_MSR_TIME_REF_COUNT: diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index e951af1fcb2c..60547d5cb6d7 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -133,6 +133,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock); +void kvm_hv_invalidate_tsc_page(struct kvm *kvm); void kvm_hv_init_vm(struct kvm *kvm); void kvm_hv_destroy_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 45d40bfacb7c..cc369b9ad8f1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1642,7 +1642,16 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { - kvm_wait_lapic_expire(vcpu); + /* + * Ensure the guest's timer has truly expired before posting an + * interrupt. Open code the relevant checks to avoid querying + * lapic_timer_int_injected(), which will be false since the + * interrupt isn't yet injected. Waiting until after injecting + * is not an option since that won't help a posted interrupt. + */ + if (vcpu->arch.apic->lapic_timer.expired_tscdeadline && + vcpu->arch.apic->lapic_timer.timer_advance_ns) + __kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } @@ -2595,6 +2604,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); + apic->lapic_timer.expired_tscdeadline = 0; apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index ec4fc28b325a..1f6f98c76bdf 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -78,6 +78,11 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) return to_shadow_page(__pa(sptep)); } +static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) +{ + return sp->role.smm ? 1 : 0; +} + static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) { /* diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index e5f148106e20..b3ed302c1a35 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -21,6 +21,21 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level) } /* + * Return the TDP iterator to the root PT and allow it to continue its + * traversal over the paging structure from there. + */ +void tdp_iter_restart(struct tdp_iter *iter) +{ + iter->yielded_gfn = iter->next_last_level_gfn; + iter->level = iter->root_level; + + iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + iter->valid = true; +} + +/* * Sets a TDP iterator to walk a pre-order traversal of the paging structure * rooted at root_pt, starting with the walk to translate next_last_level_gfn. */ @@ -31,16 +46,12 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); iter->next_last_level_gfn = next_last_level_gfn; - iter->yielded_gfn = iter->next_last_level_gfn; iter->root_level = root_level; iter->min_level = min_level; - iter->level = root_level; - iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt; - - iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); - tdp_iter_refresh_sptep(iter); + iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt; + iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt)); - iter->valid = true; + tdp_iter_restart(iter); } /* @@ -159,8 +170,3 @@ void tdp_iter_next(struct tdp_iter *iter) iter->valid = false; } -tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter) -{ - return iter->pt_path[iter->root_level - 1]; -} - diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index 4cc177d75c4a..b1748b988d3a 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -36,6 +36,8 @@ struct tdp_iter { int min_level; /* The iterator's current level within the paging structure */ int level; + /* The address space ID, i.e. SMM vs. regular. */ + int as_id; /* A snapshot of the value at sptep */ u64 old_spte; /* @@ -62,6 +64,6 @@ tdp_ptep_t spte_to_child_pt(u64 pte, int level); void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); -tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter); +void tdp_iter_restart(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c926c6b899a1..462b1f71c77f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -203,11 +203,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level, bool shared); -static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) -{ - return sp->role.smm ? 1 : 0; -} - static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) { bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); @@ -301,11 +296,16 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, * * Given a page table that has been removed from the TDP paging structure, * iterates through the page table to clear SPTEs and free child page tables. + * + * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU + * protection. Since this thread removed it from the paging structure, + * this thread will be responsible for ensuring the page is freed. Hence the + * early rcu_dereferences in the function. */ -static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, +static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, bool shared) { - struct kvm_mmu_page *sp = sptep_to_sp(pt); + struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); int level = sp->role.level; gfn_t base_gfn = sp->gfn; u64 old_child_spte; @@ -318,7 +318,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, tdp_mmu_unlink_page(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - sptep = pt + i; + sptep = rcu_dereference(pt) + i; gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)); if (shared) { @@ -337,7 +337,18 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, cpu_relax(); } } else { + /* + * If the SPTE is not MMU-present, there is no backing + * page associated with the SPTE and so no side effects + * that need to be recorded, and exclusive ownership of + * mmu_lock ensures the SPTE can't be made present. + * Note, zapping MMIO SPTEs is also unnecessary as they + * are guarded by the memslots generation, not by being + * unreachable. + */ old_child_spte = READ_ONCE(*sptep); + if (!is_shadow_present_pte(old_child_spte)) + continue; /* * Marking the SPTE as a removed SPTE is not @@ -481,10 +492,6 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { - u64 *root_pt = tdp_iter_root_pt(iter); - struct kvm_mmu_page *root = sptep_to_sp(root_pt); - int as_id = kvm_mmu_page_as_id(root); - lockdep_assert_held_read(&kvm->mmu_lock); /* @@ -498,8 +505,8 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, new_spte) != iter->old_spte) return false; - handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, - iter->level, true); + handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, true); return true; } @@ -527,7 +534,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, * here since the SPTE is going from non-present * to non-present. */ - WRITE_ONCE(*iter->sptep, 0); + WRITE_ONCE(*rcu_dereference(iter->sptep), 0); return true; } @@ -553,10 +560,6 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte, bool record_acc_track, bool record_dirty_log) { - tdp_ptep_t root_pt = tdp_iter_root_pt(iter); - struct kvm_mmu_page *root = sptep_to_sp(root_pt); - int as_id = kvm_mmu_page_as_id(root); - lockdep_assert_held_write(&kvm->mmu_lock); /* @@ -570,13 +573,13 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); - __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, - iter->level, false); + __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, false); if (record_acc_track) handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); if (record_dirty_log) - handle_changed_spte_dirty_log(kvm, as_id, iter->gfn, + handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level); } @@ -648,9 +651,7 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, WARN_ON(iter->gfn > iter->next_last_level_gfn); - tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], - iter->root_level, iter->min_level, - iter->next_last_level_gfn); + tdp_iter_restart(iter); return true; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index baee91c1e936..58a45bb139f8 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -115,13 +115,6 @@ static const struct svm_direct_access_msrs { { .index = MSR_INVALID, .always = false }, }; -/* enable NPT for AMD64 and X86 with PAE */ -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) -bool npt_enabled = true; -#else -bool npt_enabled; -#endif - /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * pause_filter_count: On processors that support Pause filtering(indicated @@ -170,9 +163,12 @@ module_param(pause_filter_count_shrink, ushort, 0444); static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX; module_param(pause_filter_count_max, ushort, 0444); -/* allow nested paging (virtualized MMU) for all guests */ -static int npt = true; -module_param(npt, int, S_IRUGO); +/* + * Use nested page tables by default. Note, NPT may get forced off by + * svm_hardware_setup() if it's unsupported by hardware or the host kernel. + */ +bool npt_enabled = true; +module_param_named(npt, npt_enabled, bool, 0444); /* allow nested virtualization in KVM/SVM */ static int nested = true; @@ -988,10 +984,15 @@ static __init int svm_hardware_setup(void) goto err; } - if (!boot_cpu_has(X86_FEATURE_NPT)) + /* + * KVM's MMU doesn't support using 2-level paging for itself, and thus + * NPT isn't supported if the host is using 2-level paging since host + * CR4 is unchanged on VMRUN. + */ + if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) npt_enabled = false; - if (npt_enabled && !npt) + if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 50810d471462..32cf8287d4a7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6580,8 +6580,8 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) int i, nr_msrs; struct perf_guest_switch_msr *msrs; + /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ msrs = perf_guest_get_msrs(&nr_msrs); - if (!msrs) return; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a20ce60152e..fe806e894212 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1526,35 +1526,44 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) { + struct kvm_x86_msr_filter *msr_filter; + struct msr_bitmap_range *ranges; struct kvm *kvm = vcpu->kvm; - struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; - u32 count = kvm->arch.msr_filter.count; - u32 i; - bool r = kvm->arch.msr_filter.default_allow; + bool allowed; int idx; + u32 i; - /* MSR filtering not set up or x2APIC enabled, allow everything */ - if (!count || (index >= 0x800 && index <= 0x8ff)) + /* x2APIC MSRs do not support filtering. */ + if (index >= 0x800 && index <= 0x8ff) return true; - /* Prevent collision with set_msr_filter */ idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < count; i++) { + msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu); + if (!msr_filter) { + allowed = true; + goto out; + } + + allowed = msr_filter->default_allow; + ranges = msr_filter->ranges; + + for (i = 0; i < msr_filter->count; i++) { u32 start = ranges[i].base; u32 end = start + ranges[i].nmsrs; u32 flags = ranges[i].flags; unsigned long *bitmap = ranges[i].bitmap; if ((index >= start) && (index < end) && (flags & type)) { - r = !!test_bit(index - start, bitmap); + allowed = !!test_bit(index - start, bitmap); break; } } +out: srcu_read_unlock(&kvm->srcu, idx); - return r; + return allowed; } EXPORT_SYMBOL_GPL(kvm_msr_allowed); @@ -2551,6 +2560,8 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) struct kvm_vcpu *vcpu; struct kvm_arch *ka = &kvm->arch; + kvm_hv_invalidate_tsc_page(kvm); + spin_lock(&ka->pvclock_gtod_sync_lock); kvm_make_mclock_inprogress_request(kvm); /* no guest entries from this point */ @@ -5352,25 +5363,34 @@ split_irqchip_unlock: return r; } -static void kvm_clear_msr_filter(struct kvm *kvm) +static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow) +{ + struct kvm_x86_msr_filter *msr_filter; + + msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT); + if (!msr_filter) + return NULL; + + msr_filter->default_allow = default_allow; + return msr_filter; +} + +static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) { u32 i; - u32 count = kvm->arch.msr_filter.count; - struct msr_bitmap_range ranges[16]; - mutex_lock(&kvm->lock); - kvm->arch.msr_filter.count = 0; - memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0])); - mutex_unlock(&kvm->lock); - synchronize_srcu(&kvm->srcu); + if (!msr_filter) + return; + + for (i = 0; i < msr_filter->count; i++) + kfree(msr_filter->ranges[i].bitmap); - for (i = 0; i < count; i++) - kfree(ranges[i].bitmap); + kfree(msr_filter); } -static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range) +static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, + struct kvm_msr_filter_range *user_range) { - struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; struct msr_bitmap_range range; unsigned long *bitmap = NULL; size_t bitmap_size; @@ -5404,11 +5424,9 @@ static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user goto err; } - /* Everything ok, add this range identifier to our global pool */ - ranges[kvm->arch.msr_filter.count] = range; - /* Make sure we filled the array before we tell anyone to walk it */ - smp_wmb(); - kvm->arch.msr_filter.count++; + /* Everything ok, add this range identifier. */ + msr_filter->ranges[msr_filter->count] = range; + msr_filter->count++; return 0; err: @@ -5419,10 +5437,11 @@ err: static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) { struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_x86_msr_filter *new_filter, *old_filter; struct kvm_msr_filter filter; bool default_allow; - int r = 0; bool empty = true; + int r = 0; u32 i; if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) @@ -5435,25 +5454,32 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) if (empty && !default_allow) return -EINVAL; - kvm_clear_msr_filter(kvm); - - kvm->arch.msr_filter.default_allow = default_allow; + new_filter = kvm_alloc_msr_filter(default_allow); + if (!new_filter) + return -ENOMEM; - /* - * Protect from concurrent calls to this function that could trigger - * a TOCTOU violation on kvm->arch.msr_filter.count. - */ - mutex_lock(&kvm->lock); for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { - r = kvm_add_msr_filter(kvm, &filter.ranges[i]); - if (r) - break; + r = kvm_add_msr_filter(new_filter, &filter.ranges[i]); + if (r) { + kvm_free_msr_filter(new_filter); + return r; + } } + mutex_lock(&kvm->lock); + + /* The per-VM filter is protected by kvm->lock... */ + old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1); + + rcu_assign_pointer(kvm->arch.msr_filter, new_filter); + synchronize_srcu(&kvm->srcu); + + kvm_free_msr_filter(old_filter); + kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); mutex_unlock(&kvm->lock); - return r; + return 0; } long kvm_arch_vm_ioctl(struct file *filp, @@ -6603,7 +6629,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, + on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask, wbinvd_ipi, NULL, 1); put_cpu(); cpumask_clear(vcpu->arch.wbinvd_dirty_mask); @@ -10601,7 +10627,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, return (void __user *)hva; } else { if (!slot || !slot->npages) - return 0; + return NULL; old_npages = slot->npages; hva = slot->userspace_addr; @@ -10634,8 +10660,6 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { - u32 i; - if (current->mm == kvm->mm) { /* * Free memory regions allocated on behalf of userspace, @@ -10651,8 +10675,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); } static_call_cond(kvm_x86_vm_destroy)(kvm); - for (i = 0; i < kvm->arch.msr_filter.count; i++) - kfree(kvm->arch.msr_filter.ranges[i].bitmap); + kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 4229950a5d78..bb0b3fe1e0a0 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -1415,6 +1415,25 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) } } +static unsigned long insn_get_effective_ip(struct pt_regs *regs) +{ + unsigned long seg_base = 0; + + /* + * If not in user-space long mode, a custom code segment could be in + * use. This is true in protected mode (if the process defined a local + * descriptor table), or virtual-8086 mode. In most of the cases + * seg_base will be zero as in USER_CS. + */ + if (!user_64bit_mode(regs)) { + seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); + if (seg_base == -1L) + return 0; + } + + return seg_base + regs->ip; +} + /** * insn_fetch_from_user() - Copy instruction bytes from user-space memory * @regs: Structure with register values as seen when entering kernel mode @@ -1431,24 +1450,43 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) */ int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) { - unsigned long seg_base = 0; + unsigned long ip; int not_copied; - /* - * If not in user-space long mode, a custom code segment could be in - * use. This is true in protected mode (if the process defined a local - * descriptor table), or virtual-8086 mode. In most of the cases - * seg_base will be zero as in USER_CS. - */ - if (!user_64bit_mode(regs)) { - seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); - if (seg_base == -1L) - return 0; - } + ip = insn_get_effective_ip(regs); + if (!ip) + return 0; + + not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE); + return MAX_INSN_SIZE - not_copied; +} + +/** + * insn_fetch_from_user_inatomic() - Copy instruction bytes from user-space memory + * while in atomic code + * @regs: Structure with register values as seen when entering kernel mode + * @buf: Array to store the fetched instruction + * + * Gets the linear address of the instruction and copies the instruction bytes + * to the buf. This function must be used in atomic context. + * + * Returns: + * + * Number of instruction bytes copied. + * + * 0 if nothing was copied. + */ +int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) +{ + unsigned long ip; + int not_copied; + + ip = insn_get_effective_ip(regs); + if (!ip) + return 0; - not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), - MAX_INSN_SIZE); + not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE); return MAX_INSN_SIZE - not_copied; } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 6926d0ca6c71..b35fc8023884 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1936,7 +1936,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, * add rsp, 8 // skip eth_type_trans's frame * ret // return to its caller */ -int arch_prepare_bpf_trampoline(void *image, void *image_end, +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call) @@ -1975,6 +1975,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, save_regs(m, &prog, nr_args, stack_size); + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_enter, prog)) { + ret = -EINVAL; + goto cleanup; + } + } + if (fentry->nr_progs) if (invoke_bpf(m, &prog, fentry, stack_size)) return -EINVAL; @@ -1993,8 +2002,7 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } if (flags & BPF_TRAMP_F_CALL_ORIG) { - if (fentry->nr_progs || fmod_ret->nr_progs) - restore_regs(m, &prog, nr_args, stack_size); + restore_regs(m, &prog, nr_args, stack_size); /* call original function */ if (emit_call(&prog, orig_call, prog)) { @@ -2003,6 +2011,9 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + im->ip_after_call = prog; + memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; } if (fmod_ret->nr_progs) { @@ -2033,9 +2044,17 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, * the return value is only updated on the stack and still needs to be * restored to R0. */ - if (flags & BPF_TRAMP_F_CALL_ORIG) + if (flags & BPF_TRAMP_F_CALL_ORIG) { + im->ip_epilogue = prog; + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_exit, prog)) { + ret = -EINVAL; + goto cleanup; + } /* restore original return value back into RAX */ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); + } EMIT1(0x5B); /* pop rbx */ EMIT1(0xC9); /* leave */ @@ -2225,7 +2244,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) padding = true; goto skip_init_addrs; } - addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); + addrs = kvmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); if (!addrs) { prog = orig_prog; goto out_addrs; @@ -2317,7 +2336,7 @@ out_image: if (image) bpf_prog_fill_jited_linfo(prog, addrs + 1); out_addrs: - kfree(addrs); + kvfree(addrs); kfree(jit_data); prog->aux->jit_data = NULL; } diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c index 1ac8578258af..b42bfdab01a9 100644 --- a/arch/x86/platform/iris/iris.c +++ b/arch/x86/platform/iris/iris.c @@ -27,7 +27,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>"); MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille"); -MODULE_SUPPORTED_DEVICE("Eurobraille/Iris"); static bool force; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index a3cc33091f46..17d80f751fcb 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -741,7 +741,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, map_ops[i].status = GNTST_general_error; unmap[0].host_addr = map_ops[i].host_addr, unmap[0].handle = map_ops[i].handle; - map_ops[i].handle = ~0; + map_ops[i].handle = INVALID_GRANT_HANDLE; if (map_ops[i].flags & GNTMAP_device_map) unmap[0].dev_bus_addr = map_ops[i].dev_bus_addr; else @@ -751,7 +751,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, kmap_ops[i].status = GNTST_general_error; unmap[1].host_addr = kmap_ops[i].host_addr, unmap[1].handle = kmap_ops[i].handle; - kmap_ops[i].handle = ~0; + kmap_ops[i].handle = INVALID_GRANT_HANDLE; if (kmap_ops[i].flags & GNTMAP_device_map) unmap[1].dev_bus_addr = kmap_ops[i].dev_bus_addr; else @@ -776,7 +776,6 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, out: return ret; } -EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_unmap_grant_ref *kunmap_ops, @@ -802,7 +801,6 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, return ret; } -EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); #ifdef CONFIG_XEN_DEBUG_FS #include <linux/debugfs.h> |