From 8c49d9a74bac5ea3f18480307057241b808fcc0c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 23 May 2011 09:31:24 -0400 Subject: x86-64: Clean up vdso/kernel shared variables Variables that are shared between the vdso and the kernel are currently a bit of a mess. They are each defined with their own magic, they are accessed differently in the kernel, the vsyscall page, and the vdso, and one of them (vsyscall_clock) doesn't even really exist. This changes them all to use a common mechanism. All of them are delcared in vvar.h with a fixed address (validated by the linker script). In the kernel (as before), they look like ordinary read-write variables. In the vsyscall page and the vdso, they are accessed through a new macro VVAR, which gives read-only access. The vdso is now loaded verbatim into memory without any fixups. As a side bonus, access from the vdso is faster because a level of indirection is removed. While we're at it, pack jiffies and vgetcpu_mode into the same cacheline. Signed-off-by: Andy Lutomirski Cc: Andi Kleen Cc: Linus Torvalds Cc: "David S. Miller" Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Borislav Petkov Link: http://lkml.kernel.org/r/%3C7357882fbb51fa30491636a7b6528747301b7ee9.1306156808.git.luto%40mit.edu%3E Signed-off-by: Thomas Gleixner --- arch/x86/kernel/time.c | 2 +- arch/x86/kernel/tsc.c | 4 ++-- arch/x86/kernel/vmlinux.lds.S | 34 +++++++++++--------------------- arch/x86/kernel/vsyscall_64.c | 46 +++++++++++++++++++------------------------ 4 files changed, 34 insertions(+), 52 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 25a28a245937..00cbb272627f 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -23,7 +23,7 @@ #include #ifdef CONFIG_X86_64 -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES; #endif unsigned long profile_pc(struct pt_regs *regs) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 9335bf7dd2e7..db697b81b8b1 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -777,8 +777,8 @@ static cycle_t __vsyscall_fn vread_tsc(void) ret = (cycle_t)vget_cycles(); rdtsc_barrier(); - return ret >= __vsyscall_gtod_data.clock.cycle_last ? - ret : __vsyscall_gtod_data.clock.cycle_last; + return ret >= VVAR(vsyscall_gtod_data).clock.cycle_last ? + ret : VVAR(vsyscall_gtod_data).clock.cycle_last; } #endif diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 49927a863cc1..74b5ad450c7e 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -161,6 +161,12 @@ SECTIONS #define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) +#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \ + ADDR(.vsyscall_0) + offset \ + : AT(VLOAD(.vsyscall_var_ ## x)) { \ + *(.vsyscall_var_ ## x) \ + } \ + x = VVIRT(.vsyscall_var_ ## x); . = ALIGN(4096); __vsyscall_0 = .; @@ -175,18 +181,6 @@ SECTIONS *(.vsyscall_fn) } - . = ALIGN(L1_CACHE_BYTES); - .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { - *(.vsyscall_gtod_data) - } - - vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); - .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { - *(.vsyscall_clock) - } - vsyscall_clock = VVIRT(.vsyscall_clock); - - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } @@ -194,21 +188,14 @@ SECTIONS *(.vsyscall_2) } - .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { - *(.vgetcpu_mode) - } - vgetcpu_mode = VVIRT(.vgetcpu_mode); - - . = ALIGN(L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { - *(.jiffies) - } - jiffies = VVIRT(.jiffies); - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } +#define __VVAR_KERNEL_LDS +#include +#undef __VVAR_KERNEL_LDS + . = __vsyscall_0 + PAGE_SIZE; #undef VSYSCALL_ADDR @@ -216,6 +203,7 @@ SECTIONS #undef VLOAD #undef VVIRT_OFFSET #undef VVIRT +#undef EMIT_VVAR #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index dcbb28c4b694..5f6ad032575a 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -49,15 +49,8 @@ __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace #define __syscall_clobber "r11","cx","memory" -/* - * vsyscall_gtod_data contains data that is : - * - readonly from vsyscalls - * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64) - * Try to keep this structure as small as possible to avoid cache line ping pongs - */ -int __vgetcpu_mode __section_vgetcpu_mode; - -struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = +DEFINE_VVAR(int, vgetcpu_mode); +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = { .lock = SEQLOCK_UNLOCKED, .sysctl_enabled = 1, @@ -97,7 +90,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, */ static __always_inline void do_get_tz(struct timezone * tz) { - *tz = __vsyscall_gtod_data.sys_tz; + *tz = VVAR(vsyscall_gtod_data).sys_tz; } static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) @@ -126,23 +119,24 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) unsigned long mult, shift, nsec; cycle_t (*vread)(void); do { - seq = read_seqbegin(&__vsyscall_gtod_data.lock); + seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); - vread = __vsyscall_gtod_data.clock.vread; - if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { + vread = VVAR(vsyscall_gtod_data).clock.vread; + if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled || + !vread)) { gettimeofday(tv,NULL); return; } now = vread(); - base = __vsyscall_gtod_data.clock.cycle_last; - mask = __vsyscall_gtod_data.clock.mask; - mult = __vsyscall_gtod_data.clock.mult; - shift = __vsyscall_gtod_data.clock.shift; + base = VVAR(vsyscall_gtod_data).clock.cycle_last; + mask = VVAR(vsyscall_gtod_data).clock.mask; + mult = VVAR(vsyscall_gtod_data).clock.mult; + shift = VVAR(vsyscall_gtod_data).clock.shift; - tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; - nsec = __vsyscall_gtod_data.wall_time_nsec; - } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec; + nsec = VVAR(vsyscall_gtod_data).wall_time_nsec; + } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); /* calculate interval: */ cycle_delta = (now - base) & mask; @@ -171,15 +165,15 @@ time_t __vsyscall(1) vtime(time_t *t) { unsigned seq; time_t result; - if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) + if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled)) return time_syscall(t); do { - seq = read_seqbegin(&__vsyscall_gtod_data.lock); + seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); - result = __vsyscall_gtod_data.wall_time_sec; + result = VVAR(vsyscall_gtod_data).wall_time_sec; - } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); if (t) *t = result; @@ -208,9 +202,9 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) We do this here because otherwise user space would do it on its own in a likely inferior way (no access to jiffies). If you don't like it pass NULL. */ - if (tcache && tcache->blob[0] == (j = __jiffies)) { + if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) { p = tcache->blob[1]; - } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { + } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { /* Load per CPU data from RDTSCP */ native_read_tscp(&p); } else { -- cgit v1.2.1 From 057e6a8c660e95c3f4e7162e00e2fee1fc90c50d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 23 May 2011 09:31:25 -0400 Subject: x86-64: Remove unnecessary barrier in vread_tsc RDTSC is completely unordered on modern Intel and AMD CPUs. The Intel manual says that lfence;rdtsc causes all previous instructions to complete before the tsc is read, and the AMD manual says to use mfence;rdtsc to do the same thing. From a decent amount of testing [1] this is enough to make rdtsc be ordered with respect to subsequent loads across a wide variety of CPUs. On Sandy Bridge (i7-2600), this improves a loop of clock_gettime(CLOCK_MONOTONIC) by more than 5 ns/iter. [1] https://lkml.org/lkml/2011/4/18/350 Signed-off-by: Andy Lutomirski Cc: Andi Kleen Cc: Linus Torvalds Cc: "David S. Miller" Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Borislav Petkov Link: http://lkml.kernel.org/r/%3C1c158b9d74338aa5361f96dd473d0e6a58235302.1306156808.git.luto%40mit.edu%3E Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index db697b81b8b1..1e6244202612 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -769,13 +769,14 @@ static cycle_t __vsyscall_fn vread_tsc(void) cycle_t ret; /* - * Surround the RDTSC by barriers, to make sure it's not - * speculated to outside the seqlock critical section and - * does not cause time warps: + * Empirically, a fence (of type that depends on the CPU) + * before rdtsc is enough to ensure that rdtsc is ordered + * with respect to loads. The various CPU manuals are unclear + * as to whether rdtsc can be reordered with later loads, + * but no one has ever seen it happen. */ rdtsc_barrier(); ret = (cycle_t)vget_cycles(); - rdtsc_barrier(); return ret >= VVAR(vsyscall_gtod_data).clock.cycle_last ? ret : VVAR(vsyscall_gtod_data).clock.cycle_last; -- cgit v1.2.1 From 3729db5ca2b2000c660e5a5d0eb68b1053212cab Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 23 May 2011 09:31:26 -0400 Subject: x86-64: Don't generate cmov in vread_tsc vread_tsc checks whether rdtsc returns something less than cycle_last, which is an extremely predictable branch. GCC likes to generate a cmov anyway, which is several cycles slower than a predicted branch. This saves a couple of nanoseconds. Signed-off-by: Andy Lutomirski Cc: Andi Kleen Cc: Linus Torvalds Cc: "David S. Miller" Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Borislav Petkov Link: http://lkml.kernel.org/r/%3C561280649519de41352fcb620684dfb22bad6bac.1306156808.git.luto%40mit.edu%3E Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 1e6244202612..24249a5360b6 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -767,6 +767,7 @@ static cycle_t read_tsc(struct clocksource *cs) static cycle_t __vsyscall_fn vread_tsc(void) { cycle_t ret; + u64 last; /* * Empirically, a fence (of type that depends on the CPU) @@ -778,8 +779,21 @@ static cycle_t __vsyscall_fn vread_tsc(void) rdtsc_barrier(); ret = (cycle_t)vget_cycles(); - return ret >= VVAR(vsyscall_gtod_data).clock.cycle_last ? - ret : VVAR(vsyscall_gtod_data).clock.cycle_last; + last = VVAR(vsyscall_gtod_data).clock.cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a funciton of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. + */ + asm volatile (""); + return last; } #endif -- cgit v1.2.1 From 44259b1abfaa8bb819d25d41d71e8e33e25dd36a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 23 May 2011 09:31:28 -0400 Subject: x86-64: Move vread_tsc into a new file with sensible options vread_tsc is short and hot, and it's userspace code so the usual reasons to enable -pg and turn off sibling calls don't apply. (OK, turning off sibling calls has no effect. But it might someday...) As an added benefit, tsc.c is profilable now. Signed-off-by: Andy Lutomirski Cc: Andi Kleen Cc: Linus Torvalds Cc: "David S. Miller" Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Borislav Petkov Link: http://lkml.kernel.org/r/%3C99c6d7f5efa3ccb65b4ac6eb443e1ab7bad47d7b.1306156808.git.luto%40mit.edu%3E Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 8 +++++--- arch/x86/kernel/tsc.c | 34 ---------------------------------- arch/x86/kernel/vread_tsc_64.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 37 deletions(-) create mode 100644 arch/x86/kernel/vread_tsc_64.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 250806472a7e..f5abe3a245b8 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -8,7 +8,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities -CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_pvclock.o = -pg @@ -24,13 +23,16 @@ endif nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) CFLAGS_hpet.o := $(nostackp) -CFLAGS_tsc.o := $(nostackp) +CFLAGS_vread_tsc_64.o := $(nostackp) CFLAGS_paravirt.o := $(nostackp) GCOV_PROFILE_vsyscall_64.o := n GCOV_PROFILE_hpet.o := n GCOV_PROFILE_tsc.o := n GCOV_PROFILE_paravirt.o := n +# vread_tsc_64 is hot and should be fully optimized: +CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls + obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o @@ -39,7 +41,7 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o +obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 24249a5360b6..6cc6922262af 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -763,40 +763,6 @@ static cycle_t read_tsc(struct clocksource *cs) ret : clocksource_tsc.cycle_last; } -#ifdef CONFIG_X86_64 -static cycle_t __vsyscall_fn vread_tsc(void) -{ - cycle_t ret; - u64 last; - - /* - * Empirically, a fence (of type that depends on the CPU) - * before rdtsc is enough to ensure that rdtsc is ordered - * with respect to loads. The various CPU manuals are unclear - * as to whether rdtsc can be reordered with later loads, - * but no one has ever seen it happen. - */ - rdtsc_barrier(); - ret = (cycle_t)vget_cycles(); - - last = VVAR(vsyscall_gtod_data).clock.cycle_last; - - if (likely(ret >= last)) - return ret; - - /* - * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a funciton of time and the likely is - * very likely) and there's a data dependence, so force GCC - * to generate a branch instead. I don't barrier() because - * we don't actually need a barrier, and if this function - * ever gets inlined it will generate worse code. - */ - asm volatile (""); - return last; -} -#endif - static void resume_tsc(struct clocksource *cs) { clocksource_tsc.cycle_last = 0; diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c new file mode 100644 index 000000000000..a81aa9e9894c --- /dev/null +++ b/arch/x86/kernel/vread_tsc_64.c @@ -0,0 +1,36 @@ +/* This code runs in userspace. */ + +#define DISABLE_BRANCH_PROFILING +#include + +notrace cycle_t __vsyscall_fn vread_tsc(void) +{ + cycle_t ret; + u64 last; + + /* + * Empirically, a fence (of type that depends on the CPU) + * before rdtsc is enough to ensure that rdtsc is ordered + * with respect to loads. The various CPU manuals are unclear + * as to whether rdtsc can be reordered with later loads, + * but no one has ever seen it happen. + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + + last = VVAR(vsyscall_gtod_data).clock.cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a funciton of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. + */ + asm volatile (""); + return last; +} -- cgit v1.2.1