Diffstat (limited to 'arch/x86/entry')
-rw-r--r--   arch/x86/entry/calling.h                   2
-rw-r--r--   arch/x86/entry/entry_32.S                 21
-rw-r--r--   arch/x86/entry/entry_64.S                138
-rw-r--r--   arch/x86/entry/vdso/Makefile              16
-rw-r--r--   arch/x86/entry/vdso/vclock_gettime.c     220
-rw-r--r--   arch/x86/entry/vdso/vgetcpu.c              8
-rw-r--r--   arch/x86/entry/vdso/vma.c                 38
-rw-r--r--   arch/x86/entry/vsyscall/vsyscall_gtod.c   51
8 files changed, 186 insertions, 308 deletions
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 352e70cd33e8..708b46a54578 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -338,7 +338,7 @@ For 32-bit we have the following conventions - kernel is built with .macro CALL_enter_from_user_mode #ifdef CONFIG_CONTEXT_TRACKING #ifdef HAVE_JUMP_LABEL - STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0 + STATIC_BRANCH_JMP l_yes=.Lafter_call_\@, key=context_tracking_enabled, branch=1 #endif call enter_from_user_mode .Lafter_call_\@: diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 2767c625a52c..687e47f8a796 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -389,6 +389,13 @@ * that register for the time this macro runs */ + /* + * The high bits of the CS dword (__csh) are used for + * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case + * hardware didn't do this for us. + */ + andl $(0x0000ffff), PT_CS(%esp) + /* Are we on the entry stack? Bail out if not! */ movl PER_CPU_VAR(cpu_entry_area), %ecx addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx @@ -407,12 +414,6 @@ /* Load top of task-stack into %edi */ movl TSS_entry2task_stack(%edi), %edi - /* - * Clear unused upper bits of the dword containing the word-sized CS - * slot in pt_regs in case hardware didn't clear it for us. - */ - andl $(0x0000ffff), PT_CS(%esp) - /* Special case - entry from kernel mode via entry stack */ #ifdef CONFIG_VM86 movl PT_EFLAGS(%esp), %ecx # mix EFLAGS and CS @@ -782,7 +783,7 @@ GLOBAL(__begin_SYSENTER_singlestep_region) * will ignore all of the single-step traps generated in this range. */ -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PV /* * Xen doesn't set %esp to be precisely what the normal SYSENTER * entry point expects, so fix it up before using the normal path. @@ -1240,7 +1241,7 @@ ENTRY(spurious_interrupt_bug) jmp common_exception END(spurious_interrupt_bug) -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PV ENTRY(xen_hypervisor_callback) pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL @@ -1321,11 +1322,13 @@ ENTRY(xen_failsafe_callback) _ASM_EXTABLE(3b, 8b) _ASM_EXTABLE(4b, 9b) ENDPROC(xen_failsafe_callback) +#endif /* CONFIG_XEN_PV */ +#ifdef CONFIG_XEN_PVHVM BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, xen_evtchn_do_upcall) +#endif -#endif /* CONFIG_XEN */ #if IS_ENABLED(CONFIG_HYPERV) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 957dfb693ecc..4d7a2d9d44cf 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -142,67 +142,6 @@ END(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ - .pushsection .entry_trampoline, "ax" - -/* - * The code in here gets remapped into cpu_entry_area's trampoline. This means - * that the assembler and linker have the wrong idea as to where this code - * lives (and, in fact, it's mapped more than once, so it's not even at a - * fixed address). So we can't reference any symbols outside the entry - * trampoline and expect it to work. - * - * Instead, we carefully abuse %rip-relative addressing. - * _entry_trampoline(%rip) refers to the start of the remapped) entry - * trampoline. We can thus find cpu_entry_area with this macro: - */ - -#define CPU_ENTRY_AREA \ - _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) - -/* The top word of the SYSENTER stack is hot and is usable as scratch space. 
*/ -#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \ - SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA - -ENTRY(entry_SYSCALL_64_trampoline) - UNWIND_HINT_EMPTY - swapgs - - /* Stash the user RSP. */ - movq %rsp, RSP_SCRATCH - - /* Note: using %rsp as a scratch reg. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp - - /* Load the top of the task stack into RSP */ - movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp - - /* Start building the simulated IRET frame. */ - pushq $__USER_DS /* pt_regs->ss */ - pushq RSP_SCRATCH /* pt_regs->sp */ - pushq %r11 /* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ - - /* - * x86 lacks a near absolute jump, and we can't jump to the real - * entry text with a relative jump. We could push the target - * address and then use retq, but this destroys the pipeline on - * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, - * spill RDI and restore it in a second-stage trampoline. - */ - pushq %rdi - movq $entry_SYSCALL_64_stage2, %rdi - JMP_NOSPEC %rdi -END(entry_SYSCALL_64_trampoline) - - .popsection - -ENTRY(entry_SYSCALL_64_stage2) - UNWIND_HINT_EMPTY - popq %rdi - jmp entry_SYSCALL_64_after_hwframe -END(entry_SYSCALL_64_stage2) - ENTRY(entry_SYSCALL_64) UNWIND_HINT_EMPTY /* @@ -212,21 +151,19 @@ ENTRY(entry_SYSCALL_64) */ swapgs - /* - * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it - * is not required to switch CR3. - */ - movq %rsp, PER_CPU_VAR(rsp_scratch) + /* tss.sp2 is scratch space. */ + movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ - pushq $__USER_DS /* pt_regs->ss */ - pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ - pushq %r11 /* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ GLOBAL(entry_SYSCALL_64_after_hwframe) - pushq %rax /* pt_regs->orig_ax */ + pushq %rax /* pt_regs->orig_ax */ PUSH_AND_CLEAR_REGS rax=$-ENOSYS @@ -900,6 +837,42 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) +/** + * idtentry - Generate an IDT entry stub + * @sym: Name of the generated entry point + * @do_sym: C function to be called + * @has_error_code: True if this IDT vector has an error code on the stack + * @paranoid: non-zero means that this vector may be invoked from + * kernel mode with user GSBASE and/or user CR3. + * 2 is special -- see below. + * @shift_ist: Set to an IST index if entries from kernel mode should + * decrement the IST stack so that nested entries get a + * fresh stack. (This is for #DB, which has a nasty habit + * of recursing.) + * + * idtentry generates an IDT stub that sets up a usable kernel context, + * creates struct pt_regs, and calls @do_sym. The stub has the following + * special behaviors: + * + * On an entry from user mode, the stub switches from the trampoline or + * IST stack to the normal thread stack. On an exit to user mode, the + * normal exit-to-usermode path is invoked. + * + * On an exit to kernel mode, if @paranoid == 0, we check for preemption, + * whereas we omit the preemption check if @paranoid != 0. This is purely + * because the implementation is simpler this way. 
The kernel only needs + * to check for asynchronous kernel preemption when IRQ handlers return. + * + * If @paranoid == 0, then the stub will handle IRET faults by pretending + * that the fault came from user mode. It will handle gs_change faults by + * pretending that the fault happened with kernel GSBASE. Since this handling + * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have + * @paranoid == 0. This special handling will do the wrong thing for + * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0. + * + * @paranoid == 2 is special: the stub will never switch stacks. This is for + * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. + */ .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 @@ -1050,7 +1023,7 @@ ENTRY(do_softirq_own_stack) ret ENDPROC(do_softirq_own_stack) -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PV idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 /* @@ -1130,11 +1103,13 @@ ENTRY(xen_failsafe_callback) ENCODE_FRAME_POINTER jmp error_exit END(xen_failsafe_callback) +#endif /* CONFIG_XEN_PV */ +#ifdef CONFIG_XEN_PVHVM apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ xen_hvm_callback_vector xen_evtchn_do_upcall +#endif -#endif /* CONFIG_XEN */ #if IS_ENABLED(CONFIG_HYPERV) apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ @@ -1151,7 +1126,7 @@ idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry int3 do_int3 has_error_code=0 idtentry stack_segment do_stack_segment has_error_code=1 -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PV idtentry xennmi do_nmi has_error_code=0 idtentry xendebug do_debug has_error_code=0 idtentry xenint3 do_int3 has_error_code=0 @@ -1187,6 +1162,16 @@ ENTRY(paranoid_entry) xorl %ebx, %ebx 1: + /* + * Always stash CR3 in %r14. This value will be restored, + * verbatim, at exit. Needed if paranoid_entry interrupted + * another entry that already switched to the user CR3 value + * but has not yet returned to userspace. + * + * This is also why CS (stashed in the "iret frame" by the + * hardware at entry) can not be used: this may be a return + * to kernel code, but with a user CR3 value. + */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 ret @@ -1211,11 +1196,13 @@ ENTRY(paranoid_exit) testl %ebx, %ebx /* swapgs needed? */ jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ + /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 SWAPGS_UNSAFE_STACK jmp .Lparanoid_exit_restore .Lparanoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG + /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 .Lparanoid_exit_restore: jmp restore_regs_and_return_to_kernel @@ -1626,6 +1613,7 @@ end_repeat_nmi: movq $-1, %rsi call do_nmi + /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 testl %ebx, %ebx /* swapgs needed? 
*/ diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index fa3f439f0a92..141d415a8c80 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -68,7 +68,13 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ -fno-omit-frame-pointer -foptimize-sibling-calls \ - -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO $(RETPOLINE_VDSO_CFLAGS) + -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO + +ifdef CONFIG_RETPOLINE +ifneq ($(RETPOLINE_VDSO_CFLAGS),) + CFL += $(RETPOLINE_VDSO_CFLAGS) +endif +endif $(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) @@ -138,7 +144,13 @@ KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING -KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) + +ifdef CONFIG_RETPOLINE +ifneq ($(RETPOLINE_VDSO_CFLAGS),) + KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) +endif +endif + $(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) $(obj)/vdso32.so.dbg: FORCE \ diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index f19856d95c60..007b3fe9d727 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -43,50 +43,26 @@ extern u8 hvclock_page notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory"); + asm ("syscall" : "=a" (ret), "=m" (*ts) : + "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : + "rcx", "r11"); return ret; } -notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) -{ - long ret; - - asm("syscall" : "=a" (ret) : - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); - return ret; -} - - #else notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; - asm( + asm ( "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" + "mov %[clock], %%ebx \n" "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" - : "=a" (ret) - : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) - : "memory", "edx"); - return ret; -} - -notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) -{ - long ret; - - asm( - "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret) - : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) - : "memory", "edx"); + : "=a" (ret), "=m" (*ts) + : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts) + : "edx"); return ret; } @@ -98,12 +74,11 @@ static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void) return (const struct pvclock_vsyscall_time_info *)&pvclock_page; } -static notrace u64 vread_pvclock(int *mode) +static notrace u64 vread_pvclock(void) { const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti; - u64 ret; - u64 last; u32 version; + u64 ret; /* * Note: The kernel and hypervisor must guarantee that cpu ID @@ -130,175 +105,112 @@ static notrace u64 vread_pvclock(int *mode) do { version = pvclock_read_begin(pvti); - if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { - *mode = VCLOCK_NONE; - return 0; - } + if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) + return U64_MAX; ret = 
__pvclock_read_cycles(pvti, rdtsc_ordered()); } while (pvclock_read_retry(pvti, version)); - /* refer to vread_tsc() comment for rationale */ - last = gtod->cycle_last; - - if (likely(ret >= last)) - return ret; - - return last; + return ret; } #endif #ifdef CONFIG_HYPERV_TSCPAGE -static notrace u64 vread_hvclock(int *mode) +static notrace u64 vread_hvclock(void) { const struct ms_hyperv_tsc_page *tsc_pg = (const struct ms_hyperv_tsc_page *)&hvclock_page; - u64 current_tick = hv_read_tsc_page(tsc_pg); - - if (current_tick != U64_MAX) - return current_tick; - *mode = VCLOCK_NONE; - return 0; + return hv_read_tsc_page(tsc_pg); } #endif -notrace static u64 vread_tsc(void) -{ - u64 ret = (u64)rdtsc_ordered(); - u64 last = gtod->cycle_last; - - if (likely(ret >= last)) - return ret; - - /* - * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a function of time and the likely is - * very likely) and there's a data dependence, so force GCC - * to generate a branch instead. I don't barrier() because - * we don't actually need a barrier, and if this function - * ever gets inlined it will generate worse code. - */ - asm volatile (""); - return last; -} - -notrace static inline u64 vgetsns(int *mode) +notrace static inline u64 vgetcyc(int mode) { - u64 v; - cycles_t cycles; - - if (gtod->vclock_mode == VCLOCK_TSC) - cycles = vread_tsc(); + if (mode == VCLOCK_TSC) + return (u64)rdtsc_ordered(); #ifdef CONFIG_PARAVIRT_CLOCK - else if (gtod->vclock_mode == VCLOCK_PVCLOCK) - cycles = vread_pvclock(mode); + else if (mode == VCLOCK_PVCLOCK) + return vread_pvclock(); #endif #ifdef CONFIG_HYPERV_TSCPAGE - else if (gtod->vclock_mode == VCLOCK_HVCLOCK) - cycles = vread_hvclock(mode); + else if (mode == VCLOCK_HVCLOCK) + return vread_hvclock(); #endif - else - return 0; - v = (cycles - gtod->cycle_last) & gtod->mask; - return v * gtod->mult; + return U64_MAX; } -/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */ -notrace static int __always_inline do_realtime(struct timespec *ts) +notrace static int do_hres(clockid_t clk, struct timespec *ts) { - unsigned long seq; - u64 ns; - int mode; + struct vgtod_ts *base = >od->basetime[clk]; + u64 cycles, last, sec, ns; + unsigned int seq; do { seq = gtod_read_begin(gtod); - mode = gtod->vclock_mode; - ts->tv_sec = gtod->wall_time_sec; - ns = gtod->wall_time_snsec; - ns += vgetsns(&mode); + cycles = vgetcyc(gtod->vclock_mode); + ns = base->nsec; + last = gtod->cycle_last; + if (unlikely((s64)cycles < 0)) + return vdso_fallback_gettime(clk, ts); + if (cycles > last) + ns += (cycles - last) * gtod->mult; ns >>= gtod->shift; + sec = base->sec; } while (unlikely(gtod_read_retry(gtod, seq))); - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + /* + * Do this outside the loop: a race inside the loop could result + * in __iter_div_u64_rem() being extremely slow. 
+ */ + ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); ts->tv_nsec = ns; - return mode; + return 0; } -notrace static int __always_inline do_monotonic(struct timespec *ts) +notrace static void do_coarse(clockid_t clk, struct timespec *ts) { - unsigned long seq; - u64 ns; - int mode; + struct vgtod_ts *base = >od->basetime[clk]; + unsigned int seq; do { seq = gtod_read_begin(gtod); - mode = gtod->vclock_mode; - ts->tv_sec = gtod->monotonic_time_sec; - ns = gtod->monotonic_time_snsec; - ns += vgetsns(&mode); - ns >>= gtod->shift; + ts->tv_sec = base->sec; + ts->tv_nsec = base->nsec; } while (unlikely(gtod_read_retry(gtod, seq))); - - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); - ts->tv_nsec = ns; - - return mode; } -notrace static void do_realtime_coarse(struct timespec *ts) +notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { - unsigned long seq; - do { - seq = gtod_read_begin(gtod); - ts->tv_sec = gtod->wall_time_coarse_sec; - ts->tv_nsec = gtod->wall_time_coarse_nsec; - } while (unlikely(gtod_read_retry(gtod, seq))); -} + unsigned int msk; -notrace static void do_monotonic_coarse(struct timespec *ts) -{ - unsigned long seq; - do { - seq = gtod_read_begin(gtod); - ts->tv_sec = gtod->monotonic_time_coarse_sec; - ts->tv_nsec = gtod->monotonic_time_coarse_nsec; - } while (unlikely(gtod_read_retry(gtod, seq))); -} + /* Sort out negative (CPU/FD) and invalid clocks */ + if (unlikely((unsigned int) clock >= MAX_CLOCKS)) + return vdso_fallback_gettime(clock, ts); -notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) -{ - switch (clock) { - case CLOCK_REALTIME: - if (do_realtime(ts) == VCLOCK_NONE) - goto fallback; - break; - case CLOCK_MONOTONIC: - if (do_monotonic(ts) == VCLOCK_NONE) - goto fallback; - break; - case CLOCK_REALTIME_COARSE: - do_realtime_coarse(ts); - break; - case CLOCK_MONOTONIC_COARSE: - do_monotonic_coarse(ts); - break; - default: - goto fallback; + /* + * Convert the clockid to a bitmask and use it to check which + * clocks are handled in the VDSO directly. + */ + msk = 1U << clock; + if (likely(msk & VGTOD_HRES)) { + return do_hres(clock, ts); + } else if (msk & VGTOD_COARSE) { + do_coarse(clock, ts); + return 0; } - - return 0; -fallback: return vdso_fallback_gettime(clock, ts); } + int clock_gettime(clockid_t, struct timespec *) __attribute__((weak, alias("__vdso_clock_gettime"))); notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { if (likely(tv != NULL)) { - if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE)) - return vdso_fallback_gtod(tv, tz); + struct timespec *ts = (struct timespec *) tv; + + do_hres(CLOCK_REALTIME, ts); tv->tv_usec /= 1000; } if (unlikely(tz != NULL)) { @@ -318,7 +230,7 @@ int gettimeofday(struct timeval *, struct timezone *) notrace time_t __vdso_time(time_t *t) { /* This is atomic on x86 so we don't need any locks. 
*/ - time_t result = READ_ONCE(gtod->wall_time_sec); + time_t result = READ_ONCE(gtod->basetime[CLOCK_REALTIME].sec); if (t) *t = result; diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c index 8ec3d1f4ce9a..f86ab0ae1777 100644 --- a/arch/x86/entry/vdso/vgetcpu.c +++ b/arch/x86/entry/vdso/vgetcpu.c @@ -13,14 +13,8 @@ notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { - unsigned int p; + vdso_read_cpunode(cpu, node); - p = __getcpu(); - - if (cpu) - *cpu = p & VGETCPU_CPU_MASK; - if (node) - *node = p >> 12; return 0; } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 5b8b556dbb12..3f9d43f26f63 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -332,40 +332,6 @@ static __init int vdso_setup(char *s) return 0; } __setup("vdso=", vdso_setup); -#endif - -#ifdef CONFIG_X86_64 -static void vgetcpu_cpu_init(void *arg) -{ - int cpu = smp_processor_id(); - struct desc_struct d = { }; - unsigned long node = 0; -#ifdef CONFIG_NUMA - node = cpu_to_node(cpu); -#endif - if (static_cpu_has(X86_FEATURE_RDTSCP)) - write_rdtscp_aux((node << 12) | cpu); - - /* - * Store cpu number in limit so that it can be loaded - * quickly in user space in vgetcpu. (12 bits for the CPU - * and 8 bits for the node) - */ - d.limit0 = cpu | ((node & 0xf) << 12); - d.limit1 = node >> 4; - d.type = 5; /* RO data, expand down, accessed */ - d.dpl = 3; /* Visible to user code */ - d.s = 1; /* Not a system segment */ - d.p = 1; /* Present */ - d.d = 1; /* 32-bit */ - - write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); -} - -static int vgetcpu_online(unsigned int cpu) -{ - return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1); -} static int __init init_vdso(void) { @@ -375,9 +341,7 @@ static int __init init_vdso(void) init_vdso_image(&vdso_image_x32); #endif - /* notifier priority > KVM */ - return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE, - "x86/vdso/vma:online", vgetcpu_online, NULL); + return 0; } subsys_initcall(init_vdso); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c index e1216dd95c04..cfcdba082feb 100644 --- a/arch/x86/entry/vsyscall/vsyscall_gtod.c +++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c @@ -31,6 +31,8 @@ void update_vsyscall(struct timekeeper *tk) { int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + struct vgtod_ts *base; + u64 nsec; /* Mark the new vclock used. 
*/ BUILD_BUG_ON(VCLOCK_MAX >= 32); @@ -45,34 +47,37 @@ void update_vsyscall(struct timekeeper *tk) vdata->mult = tk->tkr_mono.mult; vdata->shift = tk->tkr_mono.shift; - vdata->wall_time_sec = tk->xtime_sec; - vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; + base = &vdata->basetime[CLOCK_REALTIME]; + base->sec = tk->xtime_sec; + base->nsec = tk->tkr_mono.xtime_nsec; - vdata->monotonic_time_sec = tk->xtime_sec - + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec - + ((u64)tk->wall_to_monotonic.tv_nsec - << tk->tkr_mono.shift); - while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { - vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; - vdata->monotonic_time_sec++; - } + base = &vdata->basetime[CLOCK_TAI]; + base->sec = tk->xtime_sec + (s64)tk->tai_offset; + base->nsec = tk->tkr_mono.xtime_nsec; - vdata->wall_time_coarse_sec = tk->xtime_sec; - vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> - tk->tkr_mono.shift); + base = &vdata->basetime[CLOCK_MONOTONIC]; + base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsec = tk->tkr_mono.xtime_nsec; + nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + nsec -= ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; + base->sec++; + } + base->nsec = nsec; - vdata->monotonic_time_coarse_sec = - vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_coarse_nsec = - vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; + base = &vdata->basetime[CLOCK_REALTIME_COARSE]; + base->sec = tk->xtime_sec; + base->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; - while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { - vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; - vdata->monotonic_time_coarse_sec++; + base = &vdata->basetime[CLOCK_MONOTONIC_COARSE]; + base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec += tk->wall_to_monotonic.tv_nsec; + while (nsec >= NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; + base->sec++; } + base->nsec = nsec; gtod_write_end(vdata); } |
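The x86-64 vdso_fallback_gettime() change above replaces the blanket "memory" clobber with precise constraints: the SYSCALL instruction itself clobbers RCX and R11, and the kernel writes *ts, so *ts becomes an explicit "=m" output. A minimal userspace sketch of the same constraint pattern follows; the clock_gettime_fallback() wrapper and the main() harness are hypothetical, not kernel code.

#include <stdio.h>
#include <time.h>
#include <sys/syscall.h>

/*
 * Sketch of the constraint style used by vdso_fallback_gettime() above:
 * SYSCALL clobbers RCX (saved RIP) and R11 (saved RFLAGS), and the kernel
 * writes *ts, so *ts is listed as an "=m" output instead of relying on a
 * blanket "memory" clobber.
 */
static long clock_gettime_fallback(long clock, struct timespec *ts)
{
    long ret;

    asm ("syscall"
         : "=a" (ret), "=m" (*ts)
         : "0" (__NR_clock_gettime), "D" (clock), "S" (ts)
         : "rcx", "r11");
    return ret;
}

int main(void)
{
    struct timespec ts;

    if (clock_gettime_fallback(CLOCK_MONOTONIC, &ts) == 0)
        printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
    return 0;
}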
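do_hres() above folds the old per-clock readers into one seqcount-protected loop and signals an unusable clocksource with U64_MAX from vgetcyc() instead of a *mode out-parameter. Below is a self-contained sketch of that structure; the data layout, helper names, and the simplified sequence check are assumptions of this sketch (the kernel uses gtod_read_begin()/gtod_read_retry(), which also issue the required read barriers).

#include <stdint.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000ull

/*
 * Illustrative vsyscall data block; field names mirror the fields used by
 * do_hres() above but the layout is made up for this sketch.
 */
struct sketch_vdata {
    volatile unsigned int seq;   /* even = stable, odd = update in progress */
    int vclock_mode;
    uint64_t cycle_last;
    uint32_t mult, shift;
    uint64_t base_sec;
    uint64_t base_nsec;          /* nanoseconds << shift */
};

extern uint64_t sketch_read_cycles(int mode);   /* returns U64_MAX if unusable */
extern long sketch_fallback(clockid_t clk, struct timespec *ts);

long sketch_do_hres(const struct sketch_vdata *vd, clockid_t clk, struct timespec *ts)
{
    uint64_t cycles, ns, sec;
    unsigned int seq;

    do {
        do {
            seq = vd->seq;       /* wait for an even (stable) sequence */
        } while (seq & 1);

        cycles = sketch_read_cycles(vd->vclock_mode);
        if ((int64_t)cycles < 0)             /* U64_MAX sentinel */
            return sketch_fallback(clk, ts);

        ns = vd->base_nsec;
        if (cycles > vd->cycle_last)
            ns += (cycles - vd->cycle_last) * vd->mult;
        ns >>= vd->shift;
        sec = vd->base_sec;
    } while (vd->seq != seq);    /* retry if the writer moved underneath us */

    /* Split seconds/nanoseconds outside the loop, as the comment above notes. */
    ts->tv_sec = sec + ns / NSEC_PER_SEC;
    ts->tv_nsec = ns % NSEC_PER_SEC;
    return 0;
}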
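The __vdso_getcpu() change above replaces the open-coded decode of __getcpu() with the common vdso_read_cpunode() helper. For reference, here is a standalone sketch of the packing convention the removed code relied on: CPU number in the low 12 bits, NUMA node above it, matching the (node << 12) | cpu value written by the removed vgetcpu_cpu_init(). The helper names are hypothetical.

#include <stdint.h>

#define SKETCH_CPU_BITS 12
#define SKETCH_CPU_MASK ((1U << SKETCH_CPU_BITS) - 1)

/* Pack a CPU/node pair the way the removed GDT-limit scheme did. */
static inline uint32_t sketch_pack_cpunode(uint32_t cpu, uint32_t node)
{
    return (node << SKETCH_CPU_BITS) | cpu;
}

/* Decode it the way the removed __vdso_getcpu() body did. */
static inline void sketch_unpack_cpunode(uint32_t p, unsigned *cpu, unsigned *node)
{
    if (cpu)
        *cpu = p & SKETCH_CPU_MASK;
    if (node)
        *node = p >> SKETCH_CPU_BITS;
}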
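update_vsyscall() above keeps the MONOTONIC base nanoseconds in "shifted" form (nanoseconds scaled by 2^shift, as in tk->tkr_mono.xtime_nsec) and folds whole seconds back into the seconds field. A small sketch of that normalization step with a worked value; the numbers are made up for illustration.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ull

/*
 * Carry step from update_vsyscall() above: snsec holds nanoseconds << shift,
 * so one "second" in that unit is NSEC_PER_SEC << shift.
 */
static void sketch_normalize(uint64_t *sec, uint64_t *snsec, uint32_t shift)
{
    while (*snsec >= (NSEC_PER_SEC << shift)) {
        *snsec -= NSEC_PER_SEC << shift;
        (*sec)++;
    }
}

int main(void)
{
    /* Illustrative values: 2.25 seconds worth of shifted nanoseconds. */
    uint32_t shift = 10;
    uint64_t sec = 100;
    uint64_t snsec = (2 * NSEC_PER_SEC + NSEC_PER_SEC / 4) << shift;

    sketch_normalize(&sec, &snsec, shift);
    printf("sec=%llu, remaining ns=%llu\n",
           (unsigned long long)sec, (unsigned long long)(snsec >> shift));
    /* Prints sec=102, remaining ns=250000000. */
    return 0;
}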