summaryrefslogtreecommitdiff
path: root/arch/x86/entry
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/entry')
-rw-r--r--arch/x86/entry/calling.h2
-rw-r--r--arch/x86/entry/entry_32.S21
-rw-r--r--arch/x86/entry/entry_64.S138
-rw-r--r--arch/x86/entry/vdso/Makefile16
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c220
-rw-r--r--arch/x86/entry/vdso/vgetcpu.c8
-rw-r--r--arch/x86/entry/vdso/vma.c38
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_gtod.c51
8 files changed, 186 insertions, 308 deletions
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 352e70cd33e8..708b46a54578 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -338,7 +338,7 @@ For 32-bit we have the following conventions - kernel is built with
.macro CALL_enter_from_user_mode
#ifdef CONFIG_CONTEXT_TRACKING
#ifdef HAVE_JUMP_LABEL
- STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0
+ STATIC_BRANCH_JMP l_yes=.Lafter_call_\@, key=context_tracking_enabled, branch=1
#endif
call enter_from_user_mode
.Lafter_call_\@:
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 2767c625a52c..687e47f8a796 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -389,6 +389,13 @@
* that register for the time this macro runs
*/
+ /*
+ * The high bits of the CS dword (__csh) are used for
+ * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case
+ * hardware didn't do this for us.
+ */
+ andl $(0x0000ffff), PT_CS(%esp)
+
/* Are we on the entry stack? Bail out if not! */
movl PER_CPU_VAR(cpu_entry_area), %ecx
addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
@@ -407,12 +414,6 @@
/* Load top of task-stack into %edi */
movl TSS_entry2task_stack(%edi), %edi
- /*
- * Clear unused upper bits of the dword containing the word-sized CS
- * slot in pt_regs in case hardware didn't clear it for us.
- */
- andl $(0x0000ffff), PT_CS(%esp)
-
/* Special case - entry from kernel mode via entry stack */
#ifdef CONFIG_VM86
movl PT_EFLAGS(%esp), %ecx # mix EFLAGS and CS
@@ -782,7 +783,7 @@ GLOBAL(__begin_SYSENTER_singlestep_region)
* will ignore all of the single-step traps generated in this range.
*/
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
/*
* Xen doesn't set %esp to be precisely what the normal SYSENTER
* entry point expects, so fix it up before using the normal path.
@@ -1240,7 +1241,7 @@ ENTRY(spurious_interrupt_bug)
jmp common_exception
END(spurious_interrupt_bug)
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
ENTRY(xen_hypervisor_callback)
pushl $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
@@ -1321,11 +1322,13 @@ ENTRY(xen_failsafe_callback)
_ASM_EXTABLE(3b, 8b)
_ASM_EXTABLE(4b, 9b)
ENDPROC(xen_failsafe_callback)
+#endif /* CONFIG_XEN_PV */
+#ifdef CONFIG_XEN_PVHVM
BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
xen_evtchn_do_upcall)
+#endif
-#endif /* CONFIG_XEN */
#if IS_ENABLED(CONFIG_HYPERV)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 957dfb693ecc..4d7a2d9d44cf 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -142,67 +142,6 @@ END(native_usergs_sysret64)
* with them due to bugs in both AMD and Intel CPUs.
*/
- .pushsection .entry_trampoline, "ax"
-
-/*
- * The code in here gets remapped into cpu_entry_area's trampoline. This means
- * that the assembler and linker have the wrong idea as to where this code
- * lives (and, in fact, it's mapped more than once, so it's not even at a
- * fixed address). So we can't reference any symbols outside the entry
- * trampoline and expect it to work.
- *
- * Instead, we carefully abuse %rip-relative addressing.
- * _entry_trampoline(%rip) refers to the start of the remapped) entry
- * trampoline. We can thus find cpu_entry_area with this macro:
- */
-
-#define CPU_ENTRY_AREA \
- _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
-
-/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
-#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \
- SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
-
-ENTRY(entry_SYSCALL_64_trampoline)
- UNWIND_HINT_EMPTY
- swapgs
-
- /* Stash the user RSP. */
- movq %rsp, RSP_SCRATCH
-
- /* Note: using %rsp as a scratch reg. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
-
- /* Load the top of the task stack into RSP */
- movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
-
- /* Start building the simulated IRET frame. */
- pushq $__USER_DS /* pt_regs->ss */
- pushq RSP_SCRATCH /* pt_regs->sp */
- pushq %r11 /* pt_regs->flags */
- pushq $__USER_CS /* pt_regs->cs */
- pushq %rcx /* pt_regs->ip */
-
- /*
- * x86 lacks a near absolute jump, and we can't jump to the real
- * entry text with a relative jump. We could push the target
- * address and then use retq, but this destroys the pipeline on
- * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
- * spill RDI and restore it in a second-stage trampoline.
- */
- pushq %rdi
- movq $entry_SYSCALL_64_stage2, %rdi
- JMP_NOSPEC %rdi
-END(entry_SYSCALL_64_trampoline)
-
- .popsection
-
-ENTRY(entry_SYSCALL_64_stage2)
- UNWIND_HINT_EMPTY
- popq %rdi
- jmp entry_SYSCALL_64_after_hwframe
-END(entry_SYSCALL_64_stage2)
-
ENTRY(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
/*
@@ -212,21 +151,19 @@ ENTRY(entry_SYSCALL_64)
*/
swapgs
- /*
- * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
- * is not required to switch CR3.
- */
- movq %rsp, PER_CPU_VAR(rsp_scratch)
+ /* tss.sp2 is scratch space. */
+ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* Construct struct pt_regs on stack */
- pushq $__USER_DS /* pt_regs->ss */
- pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
- pushq %r11 /* pt_regs->flags */
- pushq $__USER_CS /* pt_regs->cs */
- pushq %rcx /* pt_regs->ip */
+ pushq $__USER_DS /* pt_regs->ss */
+ pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
GLOBAL(entry_SYSCALL_64_after_hwframe)
- pushq %rax /* pt_regs->orig_ax */
+ pushq %rax /* pt_regs->orig_ax */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
@@ -900,6 +837,42 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
*/
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
+/**
+ * idtentry - Generate an IDT entry stub
+ * @sym: Name of the generated entry point
+ * @do_sym: C function to be called
+ * @has_error_code: True if this IDT vector has an error code on the stack
+ * @paranoid: non-zero means that this vector may be invoked from
+ * kernel mode with user GSBASE and/or user CR3.
+ * 2 is special -- see below.
+ * @shift_ist: Set to an IST index if entries from kernel mode should
+ * decrement the IST stack so that nested entries get a
+ * fresh stack. (This is for #DB, which has a nasty habit
+ * of recursing.)
+ *
+ * idtentry generates an IDT stub that sets up a usable kernel context,
+ * creates struct pt_regs, and calls @do_sym. The stub has the following
+ * special behaviors:
+ *
+ * On an entry from user mode, the stub switches from the trampoline or
+ * IST stack to the normal thread stack. On an exit to user mode, the
+ * normal exit-to-usermode path is invoked.
+ *
+ * On an exit to kernel mode, if @paranoid == 0, we check for preemption,
+ * whereas we omit the preemption check if @paranoid != 0. This is purely
+ * because the implementation is simpler this way. The kernel only needs
+ * to check for asynchronous kernel preemption when IRQ handlers return.
+ *
+ * If @paranoid == 0, then the stub will handle IRET faults by pretending
+ * that the fault came from user mode. It will handle gs_change faults by
+ * pretending that the fault happened with kernel GSBASE. Since this handling
+ * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have
+ * @paranoid == 0. This special handling will do the wrong thing for
+ * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0.
+ *
+ * @paranoid == 2 is special: the stub will never switch stacks. This is for
+ * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
+ */
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
UNWIND_HINT_IRET_REGS offset=\has_error_code*8
@@ -1050,7 +1023,7 @@ ENTRY(do_softirq_own_stack)
ret
ENDPROC(do_softirq_own_stack)
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
/*
@@ -1130,11 +1103,13 @@ ENTRY(xen_failsafe_callback)
ENCODE_FRAME_POINTER
jmp error_exit
END(xen_failsafe_callback)
+#endif /* CONFIG_XEN_PV */
+#ifdef CONFIG_XEN_PVHVM
apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
xen_hvm_callback_vector xen_evtchn_do_upcall
+#endif
-#endif /* CONFIG_XEN */
#if IS_ENABLED(CONFIG_HYPERV)
apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
@@ -1151,7 +1126,7 @@ idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
idtentry int3 do_int3 has_error_code=0
idtentry stack_segment do_stack_segment has_error_code=1
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
idtentry xennmi do_nmi has_error_code=0
idtentry xendebug do_debug has_error_code=0
idtentry xenint3 do_int3 has_error_code=0
@@ -1187,6 +1162,16 @@ ENTRY(paranoid_entry)
xorl %ebx, %ebx
1:
+ /*
+ * Always stash CR3 in %r14. This value will be restored,
+ * verbatim, at exit. Needed if paranoid_entry interrupted
+ * another entry that already switched to the user CR3 value
+ * but has not yet returned to userspace.
+ *
+ * This is also why CS (stashed in the "iret frame" by the
+ * hardware at entry) can not be used: this may be a return
+ * to kernel code, but with a user CR3 value.
+ */
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
ret
@@ -1211,11 +1196,13 @@ ENTRY(paranoid_exit)
testl %ebx, %ebx /* swapgs needed? */
jnz .Lparanoid_exit_no_swapgs
TRACE_IRQS_IRETQ
+ /* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
SWAPGS_UNSAFE_STACK
jmp .Lparanoid_exit_restore
.Lparanoid_exit_no_swapgs:
TRACE_IRQS_IRETQ_DEBUG
+ /* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
.Lparanoid_exit_restore:
jmp restore_regs_and_return_to_kernel
@@ -1626,6 +1613,7 @@ end_repeat_nmi:
movq $-1, %rsi
call do_nmi
+ /* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
testl %ebx, %ebx /* swapgs needed? */
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index fa3f439f0a92..141d415a8c80 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -68,7 +68,13 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
$(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
-fno-omit-frame-pointer -foptimize-sibling-calls \
- -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO $(RETPOLINE_VDSO_CFLAGS)
+ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
+
+ifdef CONFIG_RETPOLINE
+ifneq ($(RETPOLINE_VDSO_CFLAGS),)
+ CFL += $(RETPOLINE_VDSO_CFLAGS)
+endif
+endif
$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
@@ -138,7 +144,13 @@ KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
-KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS)
+
+ifdef CONFIG_RETPOLINE
+ifneq ($(RETPOLINE_VDSO_CFLAGS),)
+ KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS)
+endif
+endif
+
$(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
$(obj)/vdso32.so.dbg: FORCE \
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index f19856d95c60..007b3fe9d727 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -43,50 +43,26 @@ extern u8 hvclock_page
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
long ret;
- asm("syscall" : "=a" (ret) :
- "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory");
+ asm ("syscall" : "=a" (ret), "=m" (*ts) :
+ "0" (__NR_clock_gettime), "D" (clock), "S" (ts) :
+ "rcx", "r11");
return ret;
}
-notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
-{
- long ret;
-
- asm("syscall" : "=a" (ret) :
- "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
- return ret;
-}
-
-
#else
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
long ret;
- asm(
+ asm (
"mov %%ebx, %%edx \n"
- "mov %2, %%ebx \n"
+ "mov %[clock], %%ebx \n"
"call __kernel_vsyscall \n"
"mov %%edx, %%ebx \n"
- : "=a" (ret)
- : "0" (__NR_clock_gettime), "g" (clock), "c" (ts)
- : "memory", "edx");
- return ret;
-}
-
-notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
-{
- long ret;
-
- asm(
- "mov %%ebx, %%edx \n"
- "mov %2, %%ebx \n"
- "call __kernel_vsyscall \n"
- "mov %%edx, %%ebx \n"
- : "=a" (ret)
- : "0" (__NR_gettimeofday), "g" (tv), "c" (tz)
- : "memory", "edx");
+ : "=a" (ret), "=m" (*ts)
+ : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts)
+ : "edx");
return ret;
}
@@ -98,12 +74,11 @@ static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
}
-static notrace u64 vread_pvclock(int *mode)
+static notrace u64 vread_pvclock(void)
{
const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
- u64 ret;
- u64 last;
u32 version;
+ u64 ret;
/*
* Note: The kernel and hypervisor must guarantee that cpu ID
@@ -130,175 +105,112 @@ static notrace u64 vread_pvclock(int *mode)
do {
version = pvclock_read_begin(pvti);
- if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
- *mode = VCLOCK_NONE;
- return 0;
- }
+ if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT)))
+ return U64_MAX;
ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
} while (pvclock_read_retry(pvti, version));
- /* refer to vread_tsc() comment for rationale */
- last = gtod->cycle_last;
-
- if (likely(ret >= last))
- return ret;
-
- return last;
+ return ret;
}
#endif
#ifdef CONFIG_HYPERV_TSCPAGE
-static notrace u64 vread_hvclock(int *mode)
+static notrace u64 vread_hvclock(void)
{
const struct ms_hyperv_tsc_page *tsc_pg =
(const struct ms_hyperv_tsc_page *)&hvclock_page;
- u64 current_tick = hv_read_tsc_page(tsc_pg);
-
- if (current_tick != U64_MAX)
- return current_tick;
- *mode = VCLOCK_NONE;
- return 0;
+ return hv_read_tsc_page(tsc_pg);
}
#endif
-notrace static u64 vread_tsc(void)
-{
- u64 ret = (u64)rdtsc_ordered();
- u64 last = gtod->cycle_last;
-
- if (likely(ret >= last))
- return ret;
-
- /*
- * GCC likes to generate cmov here, but this branch is extremely
- * predictable (it's just a function of time and the likely is
- * very likely) and there's a data dependence, so force GCC
- * to generate a branch instead. I don't barrier() because
- * we don't actually need a barrier, and if this function
- * ever gets inlined it will generate worse code.
- */
- asm volatile ("");
- return last;
-}
-
-notrace static inline u64 vgetsns(int *mode)
+notrace static inline u64 vgetcyc(int mode)
{
- u64 v;
- cycles_t cycles;
-
- if (gtod->vclock_mode == VCLOCK_TSC)
- cycles = vread_tsc();
+ if (mode == VCLOCK_TSC)
+ return (u64)rdtsc_ordered();
#ifdef CONFIG_PARAVIRT_CLOCK
- else if (gtod->vclock_mode == VCLOCK_PVCLOCK)
- cycles = vread_pvclock(mode);
+ else if (mode == VCLOCK_PVCLOCK)
+ return vread_pvclock();
#endif
#ifdef CONFIG_HYPERV_TSCPAGE
- else if (gtod->vclock_mode == VCLOCK_HVCLOCK)
- cycles = vread_hvclock(mode);
+ else if (mode == VCLOCK_HVCLOCK)
+ return vread_hvclock();
#endif
- else
- return 0;
- v = (cycles - gtod->cycle_last) & gtod->mask;
- return v * gtod->mult;
+ return U64_MAX;
}
-/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
-notrace static int __always_inline do_realtime(struct timespec *ts)
+notrace static int do_hres(clockid_t clk, struct timespec *ts)
{
- unsigned long seq;
- u64 ns;
- int mode;
+ struct vgtod_ts *base = &gtod->basetime[clk];
+ u64 cycles, last, sec, ns;
+ unsigned int seq;
do {
seq = gtod_read_begin(gtod);
- mode = gtod->vclock_mode;
- ts->tv_sec = gtod->wall_time_sec;
- ns = gtod->wall_time_snsec;
- ns += vgetsns(&mode);
+ cycles = vgetcyc(gtod->vclock_mode);
+ ns = base->nsec;
+ last = gtod->cycle_last;
+ if (unlikely((s64)cycles < 0))
+ return vdso_fallback_gettime(clk, ts);
+ if (cycles > last)
+ ns += (cycles - last) * gtod->mult;
ns >>= gtod->shift;
+ sec = base->sec;
} while (unlikely(gtod_read_retry(gtod, seq)));
- ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+ /*
+ * Do this outside the loop: a race inside the loop could result
+ * in __iter_div_u64_rem() being extremely slow.
+ */
+ ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
ts->tv_nsec = ns;
- return mode;
+ return 0;
}
-notrace static int __always_inline do_monotonic(struct timespec *ts)
+notrace static void do_coarse(clockid_t clk, struct timespec *ts)
{
- unsigned long seq;
- u64 ns;
- int mode;
+ struct vgtod_ts *base = &gtod->basetime[clk];
+ unsigned int seq;
do {
seq = gtod_read_begin(gtod);
- mode = gtod->vclock_mode;
- ts->tv_sec = gtod->monotonic_time_sec;
- ns = gtod->monotonic_time_snsec;
- ns += vgetsns(&mode);
- ns >>= gtod->shift;
+ ts->tv_sec = base->sec;
+ ts->tv_nsec = base->nsec;
} while (unlikely(gtod_read_retry(gtod, seq)));
-
- ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
- ts->tv_nsec = ns;
-
- return mode;
}
-notrace static void do_realtime_coarse(struct timespec *ts)
+notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
- unsigned long seq;
- do {
- seq = gtod_read_begin(gtod);
- ts->tv_sec = gtod->wall_time_coarse_sec;
- ts->tv_nsec = gtod->wall_time_coarse_nsec;
- } while (unlikely(gtod_read_retry(gtod, seq)));
-}
+ unsigned int msk;
-notrace static void do_monotonic_coarse(struct timespec *ts)
-{
- unsigned long seq;
- do {
- seq = gtod_read_begin(gtod);
- ts->tv_sec = gtod->monotonic_time_coarse_sec;
- ts->tv_nsec = gtod->monotonic_time_coarse_nsec;
- } while (unlikely(gtod_read_retry(gtod, seq)));
-}
+ /* Sort out negative (CPU/FD) and invalid clocks */
+ if (unlikely((unsigned int) clock >= MAX_CLOCKS))
+ return vdso_fallback_gettime(clock, ts);
-notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
-{
- switch (clock) {
- case CLOCK_REALTIME:
- if (do_realtime(ts) == VCLOCK_NONE)
- goto fallback;
- break;
- case CLOCK_MONOTONIC:
- if (do_monotonic(ts) == VCLOCK_NONE)
- goto fallback;
- break;
- case CLOCK_REALTIME_COARSE:
- do_realtime_coarse(ts);
- break;
- case CLOCK_MONOTONIC_COARSE:
- do_monotonic_coarse(ts);
- break;
- default:
- goto fallback;
+ /*
+ * Convert the clockid to a bitmask and use it to check which
+ * clocks are handled in the VDSO directly.
+ */
+ msk = 1U << clock;
+ if (likely(msk & VGTOD_HRES)) {
+ return do_hres(clock, ts);
+ } else if (msk & VGTOD_COARSE) {
+ do_coarse(clock, ts);
+ return 0;
}
-
- return 0;
-fallback:
return vdso_fallback_gettime(clock, ts);
}
+
int clock_gettime(clockid_t, struct timespec *)
__attribute__((weak, alias("__vdso_clock_gettime")));
notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
{
if (likely(tv != NULL)) {
- if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE))
- return vdso_fallback_gtod(tv, tz);
+ struct timespec *ts = (struct timespec *) tv;
+
+ do_hres(CLOCK_REALTIME, ts);
tv->tv_usec /= 1000;
}
if (unlikely(tz != NULL)) {
@@ -318,7 +230,7 @@ int gettimeofday(struct timeval *, struct timezone *)
notrace time_t __vdso_time(time_t *t)
{
/* This is atomic on x86 so we don't need any locks. */
- time_t result = READ_ONCE(gtod->wall_time_sec);
+ time_t result = READ_ONCE(gtod->basetime[CLOCK_REALTIME].sec);
if (t)
*t = result;
diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c
index 8ec3d1f4ce9a..f86ab0ae1777 100644
--- a/arch/x86/entry/vdso/vgetcpu.c
+++ b/arch/x86/entry/vdso/vgetcpu.c
@@ -13,14 +13,8 @@
notrace long
__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
{
- unsigned int p;
+ vdso_read_cpunode(cpu, node);
- p = __getcpu();
-
- if (cpu)
- *cpu = p & VGETCPU_CPU_MASK;
- if (node)
- *node = p >> 12;
return 0;
}
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 5b8b556dbb12..3f9d43f26f63 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -332,40 +332,6 @@ static __init int vdso_setup(char *s)
return 0;
}
__setup("vdso=", vdso_setup);
-#endif
-
-#ifdef CONFIG_X86_64
-static void vgetcpu_cpu_init(void *arg)
-{
- int cpu = smp_processor_id();
- struct desc_struct d = { };
- unsigned long node = 0;
-#ifdef CONFIG_NUMA
- node = cpu_to_node(cpu);
-#endif
- if (static_cpu_has(X86_FEATURE_RDTSCP))
- write_rdtscp_aux((node << 12) | cpu);
-
- /*
- * Store cpu number in limit so that it can be loaded
- * quickly in user space in vgetcpu. (12 bits for the CPU
- * and 8 bits for the node)
- */
- d.limit0 = cpu | ((node & 0xf) << 12);
- d.limit1 = node >> 4;
- d.type = 5; /* RO data, expand down, accessed */
- d.dpl = 3; /* Visible to user code */
- d.s = 1; /* Not a system segment */
- d.p = 1; /* Present */
- d.d = 1; /* 32-bit */
-
- write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
-}
-
-static int vgetcpu_online(unsigned int cpu)
-{
- return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);
-}
static int __init init_vdso(void)
{
@@ -375,9 +341,7 @@ static int __init init_vdso(void)
init_vdso_image(&vdso_image_x32);
#endif
- /* notifier priority > KVM */
- return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE,
- "x86/vdso/vma:online", vgetcpu_online, NULL);
+ return 0;
}
subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c
index e1216dd95c04..cfcdba082feb 100644
--- a/arch/x86/entry/vsyscall/vsyscall_gtod.c
+++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c
@@ -31,6 +31,8 @@ void update_vsyscall(struct timekeeper *tk)
{
int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
+ struct vgtod_ts *base;
+ u64 nsec;
/* Mark the new vclock used. */
BUILD_BUG_ON(VCLOCK_MAX >= 32);
@@ -45,34 +47,37 @@ void update_vsyscall(struct timekeeper *tk)
vdata->mult = tk->tkr_mono.mult;
vdata->shift = tk->tkr_mono.shift;
- vdata->wall_time_sec = tk->xtime_sec;
- vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec;
+ base = &vdata->basetime[CLOCK_REALTIME];
+ base->sec = tk->xtime_sec;
+ base->nsec = tk->tkr_mono.xtime_nsec;
- vdata->monotonic_time_sec = tk->xtime_sec
- + tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec
- + ((u64)tk->wall_to_monotonic.tv_nsec
- << tk->tkr_mono.shift);
- while (vdata->monotonic_time_snsec >=
- (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
- vdata->monotonic_time_snsec -=
- ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
- vdata->monotonic_time_sec++;
- }
+ base = &vdata->basetime[CLOCK_TAI];
+ base->sec = tk->xtime_sec + (s64)tk->tai_offset;
+ base->nsec = tk->tkr_mono.xtime_nsec;
- vdata->wall_time_coarse_sec = tk->xtime_sec;
- vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >>
- tk->tkr_mono.shift);
+ base = &vdata->basetime[CLOCK_MONOTONIC];
+ base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
+ nsec = tk->tkr_mono.xtime_nsec;
+ nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
+ while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
+ nsec -= ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
+ base->sec++;
+ }
+ base->nsec = nsec;
- vdata->monotonic_time_coarse_sec =
- vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_coarse_nsec =
- vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec;
+ base = &vdata->basetime[CLOCK_REALTIME_COARSE];
+ base->sec = tk->xtime_sec;
+ base->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
- while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) {
- vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC;
- vdata->monotonic_time_coarse_sec++;
+ base = &vdata->basetime[CLOCK_MONOTONIC_COARSE];
+ base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
+ nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsec += tk->wall_to_monotonic.tv_nsec;
+ while (nsec >= NSEC_PER_SEC) {
+ nsec -= NSEC_PER_SEC;
+ base->sec++;
}
+ base->nsec = nsec;
gtod_write_end(vdata);
}