From 79c53a83d7a31a5b5c7bafce4f0723bebf26836a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 22 May 2019 14:22:47 +0100 Subject: locking/atomic, x86: Use s64 for atomic64 As a step towards making the atomic64 API use consistent types treewide, let's have the x86 atomic64 implementation use s64 as the underlying type for atomic64_t, rather than long or long long, matching the generated headers. Note that the x86 arch_atomic64 implementation is already wrapped by the generic instrumented atomic64 implementation, which uses s64 consistently. Otherwise, there should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Cc: aou@eecs.berkeley.edu Cc: arnd@arndb.de Cc: catalin.marinas@arm.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: ink@jurassic.park.msu.ru Cc: jhogan@kernel.org Cc: mattst88@gmail.com Cc: mpe@ellerman.id.au Cc: palmer@sifive.com Cc: paul.burton@mips.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: rth@twiddle.net Cc: tony.luck@intel.com Cc: vgupta@synopsys.com Link: https://lkml.kernel.org/r/20190522132250.26499-16-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic64_32.h | 66 ++++++++++++++++++-------------------- arch/x86/include/asm/atomic64_64.h | 38 +++++++++++----------- 2 files changed, 51 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 6a5b0ec460da..52cfaecb13f9 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -9,7 +9,7 @@ /* An 64bit atomic type */ typedef struct { - u64 __aligned(8) counter; + s64 __aligned(8) counter; } atomic64_t; #define ATOMIC64_INIT(val) { (val) } @@ -71,8 +71,7 @@ ATOMIC64_DECL(add_unless); * the old value. */ -static inline long long arch_atomic64_cmpxchg(atomic64_t *v, long long o, - long long n) +static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n) { return arch_cmpxchg64(&v->counter, o, n); } @@ -85,9 +84,9 @@ static inline long long arch_atomic64_cmpxchg(atomic64_t *v, long long o, * Atomically xchgs the value of @v to @n and returns * the old value. */ -static inline long long arch_atomic64_xchg(atomic64_t *v, long long n) +static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) { - long long o; + s64 o; unsigned high = (unsigned)(n >> 32); unsigned low = (unsigned)n; alternative_atomic64(xchg, "=&A" (o), @@ -103,7 +102,7 @@ static inline long long arch_atomic64_xchg(atomic64_t *v, long long n) * * Atomically sets the value of @v to @n. */ -static inline void arch_atomic64_set(atomic64_t *v, long long i) +static inline void arch_atomic64_set(atomic64_t *v, s64 i) { unsigned high = (unsigned)(i >> 32); unsigned low = (unsigned)i; @@ -118,9 +117,9 @@ static inline void arch_atomic64_set(atomic64_t *v, long long i) * * Atomically reads the value of @v and returns it. 
*/ -static inline long long arch_atomic64_read(const atomic64_t *v) +static inline s64 arch_atomic64_read(const atomic64_t *v) { - long long r; + s64 r; alternative_atomic64(read, "=&A" (r), "c" (v) : "memory"); return r; } @@ -132,7 +131,7 @@ static inline long long arch_atomic64_read(const atomic64_t *v) * * Atomically adds @i to @v and returns @i + *@v */ -static inline long long arch_atomic64_add_return(long long i, atomic64_t *v) +static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { alternative_atomic64(add_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -143,7 +142,7 @@ static inline long long arch_atomic64_add_return(long long i, atomic64_t *v) /* * Other variants with different arithmetic operators: */ -static inline long long arch_atomic64_sub_return(long long i, atomic64_t *v) +static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { alternative_atomic64(sub_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -151,18 +150,18 @@ static inline long long arch_atomic64_sub_return(long long i, atomic64_t *v) return i; } -static inline long long arch_atomic64_inc_return(atomic64_t *v) +static inline s64 arch_atomic64_inc_return(atomic64_t *v) { - long long a; + s64 a; alternative_atomic64(inc_return, "=&A" (a), "S" (v) : "memory", "ecx"); return a; } #define arch_atomic64_inc_return arch_atomic64_inc_return -static inline long long arch_atomic64_dec_return(atomic64_t *v) +static inline s64 arch_atomic64_dec_return(atomic64_t *v) { - long long a; + s64 a; alternative_atomic64(dec_return, "=&A" (a), "S" (v) : "memory", "ecx"); return a; @@ -176,7 +175,7 @@ static inline long long arch_atomic64_dec_return(atomic64_t *v) * * Atomically adds @i to @v. */ -static inline long long arch_atomic64_add(long long i, atomic64_t *v) +static inline s64 arch_atomic64_add(s64 i, atomic64_t *v) { __alternative_atomic64(add, add_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -191,7 +190,7 @@ static inline long long arch_atomic64_add(long long i, atomic64_t *v) * * Atomically subtracts @i from @v. */ -static inline long long arch_atomic64_sub(long long i, atomic64_t *v) +static inline s64 arch_atomic64_sub(s64 i, atomic64_t *v) { __alternative_atomic64(sub, sub_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -234,8 +233,7 @@ static inline void arch_atomic64_dec(atomic64_t *v) * Atomically adds @a to @v, so long as it was not @u. * Returns non-zero if the add was done, zero otherwise. 
*/ -static inline int arch_atomic64_add_unless(atomic64_t *v, long long a, - long long u) +static inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { unsigned low = (unsigned)u; unsigned high = (unsigned)(u >> 32); @@ -254,9 +252,9 @@ static inline int arch_atomic64_inc_not_zero(atomic64_t *v) } #define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero -static inline long long arch_atomic64_dec_if_positive(atomic64_t *v) +static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) { - long long r; + s64 r; alternative_atomic64(dec_if_positive, "=&A" (r), "S" (v) : "ecx", "memory"); return r; @@ -266,17 +264,17 @@ static inline long long arch_atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 -static inline void arch_atomic64_and(long long i, atomic64_t *v) +static inline void arch_atomic64_and(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) c = old; } -static inline long long arch_atomic64_fetch_and(long long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) c = old; @@ -284,17 +282,17 @@ static inline long long arch_atomic64_fetch_and(long long i, atomic64_t *v) return old; } -static inline void arch_atomic64_or(long long i, atomic64_t *v) +static inline void arch_atomic64_or(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) c = old; } -static inline long long arch_atomic64_fetch_or(long long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) c = old; @@ -302,17 +300,17 @@ static inline long long arch_atomic64_fetch_or(long long i, atomic64_t *v) return old; } -static inline void arch_atomic64_xor(long long i, atomic64_t *v) +static inline void arch_atomic64_xor(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) c = old; } -static inline long long arch_atomic64_fetch_xor(long long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) c = old; @@ -320,9 +318,9 @@ static inline long long arch_atomic64_fetch_xor(long long i, atomic64_t *v) return old; } -static inline long long arch_atomic64_fetch_add(long long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c) c = old; diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index dadc20adba21..703b7dfd45e0 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -17,7 +17,7 @@ * Atomically reads the value of @v. * Doesn't imply a read memory barrier. */ -static inline long arch_atomic64_read(const atomic64_t *v) +static inline s64 arch_atomic64_read(const atomic64_t *v) { return READ_ONCE((v)->counter); } @@ -29,7 +29,7 @@ static inline long arch_atomic64_read(const atomic64_t *v) * * Atomically sets the value of @v to @i. 
*/ -static inline void arch_atomic64_set(atomic64_t *v, long i) +static inline void arch_atomic64_set(atomic64_t *v, s64 i) { WRITE_ONCE(v->counter, i); } @@ -41,7 +41,7 @@ static inline void arch_atomic64_set(atomic64_t *v, long i) * * Atomically adds @i to @v. */ -static __always_inline void arch_atomic64_add(long i, atomic64_t *v) +static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "addq %1,%0" : "=m" (v->counter) @@ -55,7 +55,7 @@ static __always_inline void arch_atomic64_add(long i, atomic64_t *v) * * Atomically subtracts @i from @v. */ -static inline void arch_atomic64_sub(long i, atomic64_t *v) +static inline void arch_atomic64_sub(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "subq %1,%0" : "=m" (v->counter) @@ -71,7 +71,7 @@ static inline void arch_atomic64_sub(long i, atomic64_t *v) * true if the result is zero, or false for all * other cases. */ -static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v) +static inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i); } @@ -142,7 +142,7 @@ static inline bool arch_atomic64_inc_and_test(atomic64_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline bool arch_atomic64_add_negative(long i, atomic64_t *v) +static inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i); } @@ -155,43 +155,43 @@ static inline bool arch_atomic64_add_negative(long i, atomic64_t *v) * * Atomically adds @i to @v and returns @i + @v */ -static __always_inline long arch_atomic64_add_return(long i, atomic64_t *v) +static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { return i + xadd(&v->counter, i); } -static inline long arch_atomic64_sub_return(long i, atomic64_t *v) +static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { return arch_atomic64_add_return(-i, v); } -static inline long arch_atomic64_fetch_add(long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { return xadd(&v->counter, i); } -static inline long arch_atomic64_fetch_sub(long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v) { return xadd(&v->counter, -i); } -static inline long arch_atomic64_cmpxchg(atomic64_t *v, long old, long new) +static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { return arch_cmpxchg(&v->counter, old, new); } #define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg -static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new) +static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { return try_cmpxchg(&v->counter, old, new); } -static inline long arch_atomic64_xchg(atomic64_t *v, long new) +static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new) { return arch_xchg(&v->counter, new); } -static inline void arch_atomic64_and(long i, atomic64_t *v) +static inline void arch_atomic64_and(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "andq %1,%0" : "+m" (v->counter) @@ -199,7 +199,7 @@ static inline void arch_atomic64_and(long i, atomic64_t *v) : "memory"); } -static inline long arch_atomic64_fetch_and(long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); @@ -208,7 +208,7 @@ static inline long arch_atomic64_fetch_and(long i, atomic64_t *v) return val; } -static 
inline void arch_atomic64_or(long i, atomic64_t *v) +static inline void arch_atomic64_or(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "orq %1,%0" : "+m" (v->counter) @@ -216,7 +216,7 @@ static inline void arch_atomic64_or(long i, atomic64_t *v) : "memory"); } -static inline long arch_atomic64_fetch_or(long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); @@ -225,7 +225,7 @@ static inline long arch_atomic64_fetch_or(long i, atomic64_t *v) return val; } -static inline void arch_atomic64_xor(long i, atomic64_t *v) +static inline void arch_atomic64_xor(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "xorq %1,%0" : "+m" (v->counter) @@ -233,7 +233,7 @@ static inline void arch_atomic64_xor(long i, atomic64_t *v) : "memory"); } -static inline long arch_atomic64_fetch_xor(long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); -- cgit v1.2.1 From 4cc6620b5e4c8953c725bcfab86a57df01e83a7c Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 12 Jun 2019 11:57:27 +0200 Subject: x86/jump_label: Add a __jump_label_set_jump_code() helper Move the definition of the code to be written from __jump_label_transform() to a specialized function. No functional change. Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Chris von Recklinghausen Cc: Clark Williams Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Jason Baron Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Marcelo Tosatti Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Scott Wood Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/d2f52a0010ecd399cf9b02a65bcf5836571b9e52.1560325897.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/jump_label.c | 52 +++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index e631c358f7f4..f33408f1c3f6 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -35,41 +35,43 @@ static void bug_at(unsigned char *ip, int line) BUG(); } -static void __ref __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - int init) +static void __jump_label_set_jump_code(struct jump_entry *entry, + enum jump_label_type type, + union jump_code_union *code, + int init) { - union jump_code_union jmp; const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; - const void *expect, *code; + const void *expect; int line; - jmp.jump = 0xe9; - jmp.offset = jump_entry_target(entry) - - (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); - - if (type == JUMP_LABEL_JMP) { - if (init) { - expect = default_nop; line = __LINE__; - } else { - expect = ideal_nop; line = __LINE__; - } + code->jump = 0xe9; + code->offset = jump_entry_target(entry) - + (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); - code = &jmp.code; + if (init) { + expect = default_nop; line = __LINE__; + } else if (type == JUMP_LABEL_JMP) { + expect = ideal_nop; line = __LINE__; } else { - if (init) { - expect = default_nop; line = __LINE__; - } else { - expect = &jmp.code; line = __LINE__; - } - - code = ideal_nop; + expect = code->code; line = __LINE__; } if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE)) bug_at((void *)jump_entry_code(entry), line); + if (type == 
JUMP_LABEL_NOP) + memcpy(code, ideal_nop, JUMP_LABEL_NOP_SIZE); +} + +static void __ref __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) +{ + union jump_code_union code; + + __jump_label_set_jump_code(entry, type, &code, init); + /* * As long as only a single processor is running and the code is still * not marked as RO, text_poke_early() can be used; Checking that @@ -82,12 +84,12 @@ static void __ref __jump_label_transform(struct jump_entry *entry, * always nop being the 'currently valid' instruction */ if (init || system_state == SYSTEM_BOOTING) { - text_poke_early((void *)jump_entry_code(entry), code, + text_poke_early((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE); return; } - text_poke_bp((void *)jump_entry_code(entry), code, JUMP_LABEL_NOP_SIZE, + text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, (void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); } -- cgit v1.2.1 From c0213b0ac03cf69f90fe5c6a8fe2c986630940fa Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 12 Jun 2019 11:57:29 +0200 Subject: x86/alternative: Batch of patch operations Currently, the patch of an address is done in three steps: -- Pseudo-code #1 - Current implementation --- 1) add an int3 trap to the address that will be patched sync cores (send IPI to all other CPUs) 2) update all but the first byte of the patched range sync cores (send IPI to all other CPUs) 3) replace the first byte (int3) by the first byte of replacing opcode sync cores (send IPI to all other CPUs) -- Pseudo-code #1 --- When a static key has more than one entry, these steps are called once for each entry. The number of IPIs then is linear with regard to the number 'n' of entries of a key: O(n*3), which is O(n). This algorithm works fine for the update of a single key. But we think it is possible to optimize the case in which a static key has more than one entry. For instance, the sched_schedstats jump label has 56 entries in my (updated) fedora kernel, resulting in 168 IPIs for each CPU in which the thread that is enabling the key is _not_ running. With this patch, rather than receiving a single patch to be processed, a vector of patches is passed, enabling the rewrite of the pseudo-code #1 in this way: -- Pseudo-code #2 - This patch --- 1) for each patch in the vector: add an int3 trap to the address that will be patched sync cores (send IPI to all other CPUs) 2) for each patch in the vector: update all but the first byte of the patched range sync cores (send IPI to all other CPUs) 3) for each patch in the vector: replace the first byte (int3) by the first byte of replacing opcode sync cores (send IPI to all other CPUs) -- Pseudo-code #2 - This patch --- Doing the update in this way, the number of IPI becomes O(3) with regard to the number of keys, which is O(1). The batch mode is done with the function text_poke_bp_batch(), that receives two arguments: a vector of "struct text_to_poke", and the number of entries in the vector. The vector must be sorted by the addr field of the text_to_poke structure, enabling the binary search of a handler in the poke_int3_handler function (a fast path). Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Masami Hiramatsu Cc: Borislav Petkov Cc: Chris von Recklinghausen Cc: Clark Williams Cc: Greg Kroah-Hartman Cc: H. 
Peter Anvin Cc: Jason Baron Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Marcelo Tosatti Cc: Peter Zijlstra Cc: Scott Wood Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/ca506ed52584c80f64de23f6f55ca288e5d079de.1560325897.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/text-patching.h | 15 ++++ arch/x86/kernel/alternative.c | 154 +++++++++++++++++++++++++++-------- 2 files changed, 135 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 880b5515b1d6..d83e9f771d86 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -18,6 +18,20 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, #define __parainstructions_end NULL #endif +/* + * Currently, the max observed size in the kernel code is + * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. + * Raise it if needed. + */ +#define POKE_MAX_OPCODE_SIZE 5 + +struct text_poke_loc { + void *detour; + void *addr; + size_t len; + const char opcode[POKE_MAX_OPCODE_SIZE]; +}; + extern void text_poke_early(void *addr, const void *opcode, size_t len); /* @@ -38,6 +52,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +extern void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries); extern int after_bootmem; extern __ro_after_init struct mm_struct *poking_mm; extern __ro_after_init unsigned long poking_addr; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 390596b761e3..bd542f9b0953 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -848,81 +849,133 @@ static void do_sync_core(void *info) sync_core(); } -static bool bp_patching_in_progress; -static void *bp_int3_handler, *bp_int3_addr; +static struct bp_patching_desc { + struct text_poke_loc *vec; + int nr_entries; +} bp_patching; + +static int patch_cmp(const void *key, const void *elt) +{ + struct text_poke_loc *tp = (struct text_poke_loc *) elt; + + if (key < tp->addr) + return -1; + if (key > tp->addr) + return 1; + return 0; +} +NOKPROBE_SYMBOL(patch_cmp); int poke_int3_handler(struct pt_regs *regs) { + struct text_poke_loc *tp; + unsigned char int3 = 0xcc; + void *ip; + /* * Having observed our INT3 instruction, we now must observe - * bp_patching_in_progress. + * bp_patching.nr_entries. * - * in_progress = TRUE INT3 + * nr_entries != 0 INT3 * WMB RMB - * write INT3 if (in_progress) + * write INT3 if (nr_entries) * - * Idem for bp_int3_handler. + * Idem for other elements in bp_patching. */ smp_rmb(); - if (likely(!bp_patching_in_progress)) + if (likely(!bp_patching.nr_entries)) return 0; - if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr) + if (user_mode(regs)) return 0; - /* set up the specified breakpoint handler */ - regs->ip = (unsigned long) bp_int3_handler; + /* + * Discount the sizeof(int3). See text_poke_bp_batch(). + */ + ip = (void *) regs->ip - sizeof(int3); + + /* + * Skip the binary search if there is a single member in the vector. 
+ */ + if (unlikely(bp_patching.nr_entries > 1)) { + tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries, + sizeof(struct text_poke_loc), + patch_cmp); + if (!tp) + return 0; + } else { + tp = bp_patching.vec; + if (tp->addr != ip) + return 0; + } + + /* set up the specified breakpoint detour */ + regs->ip = (unsigned long) tp->detour; return 1; } NOKPROBE_SYMBOL(poke_int3_handler); /** - * text_poke_bp() -- update instructions on live kernel on SMP - * @addr: address to patch - * @opcode: opcode of new instruction - * @len: length to copy - * @handler: address to jump to when the temporary breakpoint is hit + * text_poke_bp_batch() -- update instructions on live kernel on SMP + * @tp: vector of instructions to patch + * @nr_entries: number of entries in the vector * * Modify multi-byte instruction by using int3 breakpoint on SMP. * We completely avoid stop_machine() here, and achieve the * synchronization using int3 breakpoint. * * The way it is done: - * - add a int3 trap to the address that will be patched + * - For each entry in the vector: + * - add a int3 trap to the address that will be patched * - sync cores - * - update all but the first byte of the patched range + * - For each entry in the vector: + * - update all but the first byte of the patched range * - sync cores - * - replace the first byte (int3) by the first byte of - * replacing opcode + * - For each entry in the vector: + * - replace the first byte (int3) by the first byte of + * replacing opcode * - sync cores */ -void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { + int patched_all_but_first = 0; unsigned char int3 = 0xcc; - - bp_int3_handler = handler; - bp_int3_addr = (u8 *)addr + sizeof(int3); - bp_patching_in_progress = true; + unsigned int i; lockdep_assert_held(&text_mutex); + bp_patching.vec = tp; + bp_patching.nr_entries = nr_entries; + /* * Corresponding read barrier in int3 notifier for making sure the - * in_progress and handler are correctly ordered wrt. patching. + * nr_entries and handler are correctly ordered wrt. patching. */ smp_wmb(); - text_poke(addr, &int3, sizeof(int3)); + /* + * First step: add a int3 trap to the address that will be patched. + */ + for (i = 0; i < nr_entries; i++) + text_poke(tp[i].addr, &int3, sizeof(int3)); on_each_cpu(do_sync_core, NULL, 1); - if (len - sizeof(int3) > 0) { - /* patch all but the first byte */ - text_poke((char *)addr + sizeof(int3), - (const char *) opcode + sizeof(int3), - len - sizeof(int3)); + /* + * Second step: update all but the first byte of the patched range. + */ + for (i = 0; i < nr_entries; i++) { + if (tp[i].len - sizeof(int3) > 0) { + text_poke((char *)tp[i].addr + sizeof(int3), + (const char *)tp[i].opcode + sizeof(int3), + tp[i].len - sizeof(int3)); + patched_all_but_first++; + } + } + + if (patched_all_but_first) { /* * According to Intel, this core syncing is very likely * not necessary and we'd be safe even without it. But @@ -931,14 +984,47 @@ void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) on_each_cpu(do_sync_core, NULL, 1); } - /* patch the first byte */ - text_poke(addr, opcode, sizeof(int3)); + /* + * Third step: replace the first byte (int3) by the first byte of + * replacing opcode. 
+ */ + for (i = 0; i < nr_entries; i++) + text_poke(tp[i].addr, tp[i].opcode, sizeof(int3)); on_each_cpu(do_sync_core, NULL, 1); /* * sync_core() implies an smp_mb() and orders this store against * the writing of the new instruction. */ - bp_patching_in_progress = false; + bp_patching.vec = NULL; + bp_patching.nr_entries = 0; } +/** + * text_poke_bp() -- update instructions on live kernel on SMP + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @handler: address to jump to when the temporary breakpoint is hit + * + * Update a single instruction with the vector in the stack, avoiding + * dynamically allocated memory. This function should be used when it is + * not possible to allocate memory. + */ +void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +{ + struct text_poke_loc tp = { + .detour = handler, + .addr = addr, + .len = len, + }; + + if (len > POKE_MAX_OPCODE_SIZE) { + WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE); + return; + } + + memcpy((void *)tp.opcode, opcode, len); + + text_poke_bp_batch(&tp, 1); +} -- cgit v1.2.1 From ba54f0c3f7c400a392c413d8ca21d3ada35f2584 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 12 Jun 2019 11:57:31 +0200 Subject: x86/jump_label: Batch jump label updates Currently, the jump label of a static key is transformed via the arch specific function: void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) The new approach (batch mode) uses two arch functions, the first has the same arguments of the arch_jump_label_transform(), and is the function: bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) Rather than transforming the code, it adds the jump_entry in a queue of entries to be updated. This functions returns true in the case of a successful enqueue of an entry. If it returns false, the caller must to apply the queue and then try to queue again, for instance, because the queue is full. This function expects the caller to sort the entries by the address before enqueueuing then. This is already done by the arch independent code, though. After queuing all jump_entries, the function: void arch_jump_label_transform_apply(void) Applies the changes in the queue. Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Chris von Recklinghausen Cc: Clark Williams Cc: Greg Kroah-Hartman Cc: H. 
Peter Anvin Cc: Jason Baron Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Marcelo Tosatti Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Scott Wood Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/57b4caa654bad7e3b066301c9a9ae233dea065b5.1560325897.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/jump_label.h | 2 ++ arch/x86/kernel/jump_label.c | 69 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 65191ce8e1cf..06c3cc22a058 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_JUMP_LABEL_H #define _ASM_X86_JUMP_LABEL_H +#define HAVE_JUMP_LABEL_BATCH + #define JUMP_LABEL_NOP_SIZE 5 #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index f33408f1c3f6..ea13808bf6da 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -101,6 +101,75 @@ void arch_jump_label_transform(struct jump_entry *entry, mutex_unlock(&text_mutex); } +#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) +static struct text_poke_loc tp_vec[TP_VEC_MAX]; +int tp_vec_nr = 0; + +bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type) +{ + struct text_poke_loc *tp; + void *entry_code; + + if (system_state == SYSTEM_BOOTING) { + /* + * Fallback to the non-batching mode. + */ + arch_jump_label_transform(entry, type); + return true; + } + + /* + * No more space in the vector, tell upper layer to apply + * the queue before continuing. + */ + if (tp_vec_nr == TP_VEC_MAX) + return false; + + tp = &tp_vec[tp_vec_nr]; + + entry_code = (void *)jump_entry_code(entry); + + /* + * The INT3 handler will do a bsearch in the queue, so we need entries + * to be sorted. We can survive an unsorted list by rejecting the entry, + * forcing the generic jump_label code to apply the queue. Warning once, + * to raise the attention to the case of an unsorted entry that is + * better not happen, because, in the worst case we will perform in the + * same way as we do without batching - with some more overhead. + */ + if (tp_vec_nr > 0) { + int prev = tp_vec_nr - 1; + struct text_poke_loc *prev_tp = &tp_vec[prev]; + + if (WARN_ON_ONCE(prev_tp->addr > entry_code)) + return false; + } + + __jump_label_set_jump_code(entry, type, + (union jump_code_union *) &tp->opcode, 0); + + tp->addr = entry_code; + tp->detour = entry_code + JUMP_LABEL_NOP_SIZE; + tp->len = JUMP_LABEL_NOP_SIZE; + + tp_vec_nr++; + + return true; +} + +void arch_jump_label_transform_apply(void) +{ + if (!tp_vec_nr) + return; + + mutex_lock(&text_mutex); + text_poke_bp_batch(tp_vec, tp_vec_nr); + mutex_unlock(&text_mutex); + + tp_vec_nr = 0; +} + static enum { JL_STATE_START, JL_STATE_NO_UPDATE, -- cgit v1.2.1 From 9ffbe8ac05dbb4ab4a4836a55a47fc6be945a38f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 31 May 2019 13:06:51 +0300 Subject: locking/lockdep: Rename lockdep_assert_held_exclusive() -> lockdep_assert_held_write() All callers of lockdep_assert_held_exclusive() use it to verify the correct locking state of either a semaphore (ldisc_sem in tty, mmap_sem for perf events, i_rwsem of inode for dax) or rwlock by apparmor. Thus it makes sense to rename _exclusive to _write since that's the semantics callers care. 
Additionally there is already lockdep_assert_held_read(), which this new naming is more consistent with. No functional changes. Signed-off-by: Nikolay Borisov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190531100651.3969-1-nborisov@suse.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index f315425d8468..cf91d80b8452 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2179,7 +2179,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) * For now, this can't happen because all callers hold mmap_sem * for write. If this changes, we'll need a different solution. */ - lockdep_assert_held_exclusive(&mm->mmap_sem); + lockdep_assert_held_write(&mm->mmap_sem); if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); -- cgit v1.2.1 From 69d927bba39517d0980462efc051875b7f4db185 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Apr 2019 13:38:23 +0200 Subject: x86/atomic: Fix smp_mb__{before,after}_atomic() Recent probing at the Linux Kernel Memory Model uncovered a 'surprise'. Strongly ordered architectures where the atomic RmW primitive implies full memory ordering and smp_mb__{before,after}_atomic() are a simple barrier() (such as x86) fail for: *x = 1; atomic_inc(u); smp_mb__after_atomic(); r0 = *y; Because, while the atomic_inc() implies memory order, it (surprisingly) does not provide a compiler barrier. This then allows the compiler to re-order like so: atomic_inc(u); *x = 1; smp_mb__after_atomic(); r0 = *y; Which the CPU is then allowed to re-order (under TSO rules) like: atomic_inc(u); r0 = *y; *x = 1; And this very much was not intended. Therefore strengthen the atomic RmW ops to include a compiler barrier. NOTE: atomic_{or,and,xor} and the bitops already had the compiler barrier. 
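As a hedged illustration (an assumed, self-contained sketch rather than code from this patch, with hypothetical helper names), the fix boils down to whether the inline asm carries a "memory" clobber; the LOCK-prefixed RmW below stands in for arch_atomic_inc():

	/*
	 * Sketch only: without the "memory" clobber the compiler may move the
	 * plain store to *x past the asm, producing exactly the reordering
	 * described above; the clobber forbids that, while the LOCK prefix
	 * already provides the CPU-level ordering.
	 */
	static inline void inc_no_compiler_barrier(int *u)
	{
		asm volatile("lock incl %0" : "+m" (*u));
	}

	static inline void inc_with_compiler_barrier(int *u)
	{
		asm volatile("lock incl %0" : "+m" (*u) : : "memory");
	}
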
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic.h | 8 ++++---- arch/x86/include/asm/atomic64_64.h | 8 ++++---- arch/x86/include/asm/barrier.h | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index ea3d95275b43..115127c7ad28 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -54,7 +54,7 @@ static __always_inline void arch_atomic_add(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "addl %1,%0" : "+m" (v->counter) - : "ir" (i)); + : "ir" (i) : "memory"); } /** @@ -68,7 +68,7 @@ static __always_inline void arch_atomic_sub(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "subl %1,%0" : "+m" (v->counter) - : "ir" (i)); + : "ir" (i) : "memory"); } /** @@ -95,7 +95,7 @@ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) static __always_inline void arch_atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" - : "+m" (v->counter)); + : "+m" (v->counter) :: "memory"); } #define arch_atomic_inc arch_atomic_inc @@ -108,7 +108,7 @@ static __always_inline void arch_atomic_inc(atomic_t *v) static __always_inline void arch_atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" - : "+m" (v->counter)); + : "+m" (v->counter) :: "memory"); } #define arch_atomic_dec arch_atomic_dec diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 703b7dfd45e0..95c6ceac66b9 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -45,7 +45,7 @@ static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "addq %1,%0" : "=m" (v->counter) - : "er" (i), "m" (v->counter)); + : "er" (i), "m" (v->counter) : "memory"); } /** @@ -59,7 +59,7 @@ static inline void arch_atomic64_sub(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "subq %1,%0" : "=m" (v->counter) - : "er" (i), "m" (v->counter)); + : "er" (i), "m" (v->counter) : "memory"); } /** @@ -87,7 +87,7 @@ static __always_inline void arch_atomic64_inc(atomic64_t *v) { asm volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) - : "m" (v->counter)); + : "m" (v->counter) : "memory"); } #define arch_atomic64_inc arch_atomic64_inc @@ -101,7 +101,7 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v) { asm volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) - : "m" (v->counter)); + : "m" (v->counter) : "memory"); } #define arch_atomic64_dec arch_atomic64_dec diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 14de0432d288..84f848c2541a 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -80,8 +80,8 @@ do { \ }) /* Atomic operations are already serializing on x86 */ -#define __smp_mb__before_atomic() barrier() -#define __smp_mb__after_atomic() barrier() +#define __smp_mb__before_atomic() do { } while (0) +#define __smp_mb__after_atomic() do { } while (0) #include -- cgit v1.2.1 From 0b9ccc0a9b146b49e83bf1e32f70d2396a694bfb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Dec 2018 12:24:33 +0100 Subject: x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}() Nadav Amit reported that commit: b59167ac7baf ("x86/percpu: Fix this_cpu_read()") added a bunch of constraints to all sorts of code; and while some of that was correct and desired, some of that seems superfluous. 
The thing is, the this_cpu_*() operations are defined IRQ-safe, this means the values are subject to change from IRQs, and thus must be reloaded. Also, the generic form: local_irq_save() __this_cpu_read() local_irq_restore() would not allow the re-use of previous values; if by nothing else, then the barrier()s implied by local_irq_*(). Which raises the point that percpu_from_op() and the others also need that volatile. OTOH __this_cpu_*() operations are not IRQ-safe and assume external preempt/IRQ disabling and could thus be allowed more room for optimization. This makes the this_cpu_*() vs __this_cpu_*() behaviour more consistent with other architectures. $ ./compare.sh defconfig-build defconfig-build1 vmlinux.o x86_pmu_cancel_txn 80 71 -9,+0 __text_poke 919 964 +45,+0 do_user_addr_fault 1082 1058 -24,+0 __do_page_fault 1194 1178 -16,+0 do_exit 2995 3027 -43,+75 process_one_work 1008 989 -67,+48 finish_task_switch 524 505 -19,+0 __schedule_bug 103 98 -59,+54 __schedule_bug 103 98 -59,+54 __sched_setscheduler 2015 2030 +15,+0 freeze_processes 203 230 +31,-4 rcu_gp_kthread_wake 106 99 -7,+0 rcu_core 1841 1834 -7,+0 call_timer_fn 298 286 -12,+0 can_stop_idle_tick 146 139 -31,+24 perf_pending_event 253 239 -14,+0 shmem_alloc_page 209 213 +4,+0 __alloc_pages_slowpath 3284 3269 -15,+0 umount_tree 671 694 +23,+0 advance_transaction 803 798 -5,+0 con_put_char 71 51 -20,+0 xhci_urb_enqueue 1302 1295 -7,+0 xhci_urb_enqueue 1302 1295 -7,+0 tcp_sacktag_write_queue 2130 2075 -55,+0 tcp_try_undo_loss 229 208 -21,+0 tcp_v4_inbound_md5_hash 438 411 -31,+4 tcp_v4_inbound_md5_hash 438 411 -31,+4 tcp_v6_inbound_md5_hash 469 411 -33,-25 tcp_v6_inbound_md5_hash 469 411 -33,-25 restricted_pointer 434 420 -14,+0 irq_exit 162 154 -8,+0 get_perf_callchain 638 624 -14,+0 rt_mutex_trylock 169 156 -13,+0 avc_has_extended_perms 1092 1089 -3,+0 avc_has_perm_noaudit 309 306 -3,+0 __perf_sw_event 138 122 -16,+0 perf_swevent_get_recursion_context 116 102 -14,+0 __local_bh_enable_ip 93 72 -21,+0 xfrm_input 4175 4161 -14,+0 avc_has_perm 446 443 -3,+0 vm_events_fold_cpu 57 56 -1,+0 vfree 68 61 -7,+0 freeze_processes 203 230 +31,-4 _local_bh_enable 44 30 -14,+0 ip_do_fragment 1982 1944 -38,+0 do_exit 2995 3027 -43,+75 __do_softirq 742 724 -18,+0 cpu_init 1510 1489 -21,+0 account_system_time 80 79 -1,+0 total 12985281 12984819 -742,+280 Reported-by: Nadav Amit Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20181206112433.GB13675@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/percpu.h | 224 +++++++++++++++++++++--------------------- 1 file changed, 112 insertions(+), 112 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 1a19d11cfbbd..f75ccccd71aa 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -87,7 +87,7 @@ * don't give an lvalue though). 
*/ extern void __bad_percpu_size(void); -#define percpu_to_op(op, var, val) \ +#define percpu_to_op(qual, op, var, val) \ do { \ typedef typeof(var) pto_T__; \ if (0) { \ @@ -97,22 +97,22 @@ do { \ } \ switch (sizeof(var)) { \ case 1: \ - asm(op "b %1,"__percpu_arg(0) \ + asm qual (op "b %1,"__percpu_arg(0) \ : "+m" (var) \ : "qi" ((pto_T__)(val))); \ break; \ case 2: \ - asm(op "w %1,"__percpu_arg(0) \ + asm qual (op "w %1,"__percpu_arg(0) \ : "+m" (var) \ : "ri" ((pto_T__)(val))); \ break; \ case 4: \ - asm(op "l %1,"__percpu_arg(0) \ + asm qual (op "l %1,"__percpu_arg(0) \ : "+m" (var) \ : "ri" ((pto_T__)(val))); \ break; \ case 8: \ - asm(op "q %1,"__percpu_arg(0) \ + asm qual (op "q %1,"__percpu_arg(0) \ : "+m" (var) \ : "re" ((pto_T__)(val))); \ break; \ @@ -124,7 +124,7 @@ do { \ * Generate a percpu add to memory instruction and optimize code * if one is added or subtracted. */ -#define percpu_add_op(var, val) \ +#define percpu_add_op(qual, var, val) \ do { \ typedef typeof(var) pao_T__; \ const int pao_ID__ = (__builtin_constant_p(val) && \ @@ -138,41 +138,41 @@ do { \ switch (sizeof(var)) { \ case 1: \ if (pao_ID__ == 1) \ - asm("incb "__percpu_arg(0) : "+m" (var)); \ + asm qual ("incb "__percpu_arg(0) : "+m" (var)); \ else if (pao_ID__ == -1) \ - asm("decb "__percpu_arg(0) : "+m" (var)); \ + asm qual ("decb "__percpu_arg(0) : "+m" (var)); \ else \ - asm("addb %1, "__percpu_arg(0) \ + asm qual ("addb %1, "__percpu_arg(0) \ : "+m" (var) \ : "qi" ((pao_T__)(val))); \ break; \ case 2: \ if (pao_ID__ == 1) \ - asm("incw "__percpu_arg(0) : "+m" (var)); \ + asm qual ("incw "__percpu_arg(0) : "+m" (var)); \ else if (pao_ID__ == -1) \ - asm("decw "__percpu_arg(0) : "+m" (var)); \ + asm qual ("decw "__percpu_arg(0) : "+m" (var)); \ else \ - asm("addw %1, "__percpu_arg(0) \ + asm qual ("addw %1, "__percpu_arg(0) \ : "+m" (var) \ : "ri" ((pao_T__)(val))); \ break; \ case 4: \ if (pao_ID__ == 1) \ - asm("incl "__percpu_arg(0) : "+m" (var)); \ + asm qual ("incl "__percpu_arg(0) : "+m" (var)); \ else if (pao_ID__ == -1) \ - asm("decl "__percpu_arg(0) : "+m" (var)); \ + asm qual ("decl "__percpu_arg(0) : "+m" (var)); \ else \ - asm("addl %1, "__percpu_arg(0) \ + asm qual ("addl %1, "__percpu_arg(0) \ : "+m" (var) \ : "ri" ((pao_T__)(val))); \ break; \ case 8: \ if (pao_ID__ == 1) \ - asm("incq "__percpu_arg(0) : "+m" (var)); \ + asm qual ("incq "__percpu_arg(0) : "+m" (var)); \ else if (pao_ID__ == -1) \ - asm("decq "__percpu_arg(0) : "+m" (var)); \ + asm qual ("decq "__percpu_arg(0) : "+m" (var)); \ else \ - asm("addq %1, "__percpu_arg(0) \ + asm qual ("addq %1, "__percpu_arg(0) \ : "+m" (var) \ : "re" ((pao_T__)(val))); \ break; \ @@ -180,27 +180,27 @@ do { \ } \ } while (0) -#define percpu_from_op(op, var) \ +#define percpu_from_op(qual, op, var) \ ({ \ typeof(var) pfo_ret__; \ switch (sizeof(var)) { \ case 1: \ - asm volatile(op "b "__percpu_arg(1)",%0"\ + asm qual (op "b "__percpu_arg(1)",%0" \ : "=q" (pfo_ret__) \ : "m" (var)); \ break; \ case 2: \ - asm volatile(op "w "__percpu_arg(1)",%0"\ + asm qual (op "w "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ case 4: \ - asm volatile(op "l "__percpu_arg(1)",%0"\ + asm qual (op "l "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ case 8: \ - asm volatile(op "q "__percpu_arg(1)",%0"\ + asm qual (op "q "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ @@ -238,23 +238,23 @@ do { \ pfo_ret__; \ }) -#define percpu_unary_op(op, var) \ +#define percpu_unary_op(qual, op, var) \ ({ \ switch 
(sizeof(var)) { \ case 1: \ - asm(op "b "__percpu_arg(0) \ + asm qual (op "b "__percpu_arg(0) \ : "+m" (var)); \ break; \ case 2: \ - asm(op "w "__percpu_arg(0) \ + asm qual (op "w "__percpu_arg(0) \ : "+m" (var)); \ break; \ case 4: \ - asm(op "l "__percpu_arg(0) \ + asm qual (op "l "__percpu_arg(0) \ : "+m" (var)); \ break; \ case 8: \ - asm(op "q "__percpu_arg(0) \ + asm qual (op "q "__percpu_arg(0) \ : "+m" (var)); \ break; \ default: __bad_percpu_size(); \ @@ -264,27 +264,27 @@ do { \ /* * Add return operation */ -#define percpu_add_return_op(var, val) \ +#define percpu_add_return_op(qual, var, val) \ ({ \ typeof(var) paro_ret__ = val; \ switch (sizeof(var)) { \ case 1: \ - asm("xaddb %0, "__percpu_arg(1) \ + asm qual ("xaddb %0, "__percpu_arg(1) \ : "+q" (paro_ret__), "+m" (var) \ : : "memory"); \ break; \ case 2: \ - asm("xaddw %0, "__percpu_arg(1) \ + asm qual ("xaddw %0, "__percpu_arg(1) \ : "+r" (paro_ret__), "+m" (var) \ : : "memory"); \ break; \ case 4: \ - asm("xaddl %0, "__percpu_arg(1) \ + asm qual ("xaddl %0, "__percpu_arg(1) \ : "+r" (paro_ret__), "+m" (var) \ : : "memory"); \ break; \ case 8: \ - asm("xaddq %0, "__percpu_arg(1) \ + asm qual ("xaddq %0, "__percpu_arg(1) \ : "+re" (paro_ret__), "+m" (var) \ : : "memory"); \ break; \ @@ -299,13 +299,13 @@ do { \ * expensive due to the implied lock prefix. The processor cannot prefetch * cachelines if xchg is used. */ -#define percpu_xchg_op(var, nval) \ +#define percpu_xchg_op(qual, var, nval) \ ({ \ typeof(var) pxo_ret__; \ typeof(var) pxo_new__ = (nval); \ switch (sizeof(var)) { \ case 1: \ - asm("\n\tmov "__percpu_arg(1)",%%al" \ + asm qual ("\n\tmov "__percpu_arg(1)",%%al" \ "\n1:\tcmpxchgb %2, "__percpu_arg(1) \ "\n\tjnz 1b" \ : "=&a" (pxo_ret__), "+m" (var) \ @@ -313,7 +313,7 @@ do { \ : "memory"); \ break; \ case 2: \ - asm("\n\tmov "__percpu_arg(1)",%%ax" \ + asm qual ("\n\tmov "__percpu_arg(1)",%%ax" \ "\n1:\tcmpxchgw %2, "__percpu_arg(1) \ "\n\tjnz 1b" \ : "=&a" (pxo_ret__), "+m" (var) \ @@ -321,7 +321,7 @@ do { \ : "memory"); \ break; \ case 4: \ - asm("\n\tmov "__percpu_arg(1)",%%eax" \ + asm qual ("\n\tmov "__percpu_arg(1)",%%eax" \ "\n1:\tcmpxchgl %2, "__percpu_arg(1) \ "\n\tjnz 1b" \ : "=&a" (pxo_ret__), "+m" (var) \ @@ -329,7 +329,7 @@ do { \ : "memory"); \ break; \ case 8: \ - asm("\n\tmov "__percpu_arg(1)",%%rax" \ + asm qual ("\n\tmov "__percpu_arg(1)",%%rax" \ "\n1:\tcmpxchgq %2, "__percpu_arg(1) \ "\n\tjnz 1b" \ : "=&a" (pxo_ret__), "+m" (var) \ @@ -345,32 +345,32 @@ do { \ * cmpxchg has no such implied lock semantics as a result it is much * more efficient for cpu local operations. 
*/ -#define percpu_cmpxchg_op(var, oval, nval) \ +#define percpu_cmpxchg_op(qual, var, oval, nval) \ ({ \ typeof(var) pco_ret__; \ typeof(var) pco_old__ = (oval); \ typeof(var) pco_new__ = (nval); \ switch (sizeof(var)) { \ case 1: \ - asm("cmpxchgb %2, "__percpu_arg(1) \ + asm qual ("cmpxchgb %2, "__percpu_arg(1) \ : "=a" (pco_ret__), "+m" (var) \ : "q" (pco_new__), "0" (pco_old__) \ : "memory"); \ break; \ case 2: \ - asm("cmpxchgw %2, "__percpu_arg(1) \ + asm qual ("cmpxchgw %2, "__percpu_arg(1) \ : "=a" (pco_ret__), "+m" (var) \ : "r" (pco_new__), "0" (pco_old__) \ : "memory"); \ break; \ case 4: \ - asm("cmpxchgl %2, "__percpu_arg(1) \ + asm qual ("cmpxchgl %2, "__percpu_arg(1) \ : "=a" (pco_ret__), "+m" (var) \ : "r" (pco_new__), "0" (pco_old__) \ : "memory"); \ break; \ case 8: \ - asm("cmpxchgq %2, "__percpu_arg(1) \ + asm qual ("cmpxchgq %2, "__percpu_arg(1) \ : "=a" (pco_ret__), "+m" (var) \ : "r" (pco_new__), "0" (pco_old__) \ : "memory"); \ @@ -391,58 +391,58 @@ do { \ */ #define this_cpu_read_stable(var) percpu_stable_op("mov", var) -#define raw_cpu_read_1(pcp) percpu_from_op("mov", pcp) -#define raw_cpu_read_2(pcp) percpu_from_op("mov", pcp) -#define raw_cpu_read_4(pcp) percpu_from_op("mov", pcp) - -#define raw_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) -#define raw_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) -#define raw_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) -#define raw_cpu_add_1(pcp, val) percpu_add_op((pcp), val) -#define raw_cpu_add_2(pcp, val) percpu_add_op((pcp), val) -#define raw_cpu_add_4(pcp, val) percpu_add_op((pcp), val) -#define raw_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) -#define raw_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) -#define raw_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) -#define raw_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) -#define raw_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) -#define raw_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) -#define raw_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val) -#define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) -#define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) - -#define this_cpu_read_1(pcp) percpu_from_op("mov", pcp) -#define this_cpu_read_2(pcp) percpu_from_op("mov", pcp) -#define this_cpu_read_4(pcp) percpu_from_op("mov", pcp) -#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_add_1(pcp, val) percpu_add_op((pcp), val) -#define this_cpu_add_2(pcp, val) percpu_add_op((pcp), val) -#define this_cpu_add_4(pcp, val) percpu_add_op((pcp), val) -#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) -#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) -#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) -#define this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) -#define this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) -#define this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) -#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval) -#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) -#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) - -#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) -#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) -#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) -#define raw_cpu_cmpxchg_1(pcp, 
oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) - -#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_read_1(pcp) percpu_from_op(, "mov", pcp) +#define raw_cpu_read_2(pcp) percpu_from_op(, "mov", pcp) +#define raw_cpu_read_4(pcp) percpu_from_op(, "mov", pcp) + +#define raw_cpu_write_1(pcp, val) percpu_to_op(, "mov", (pcp), val) +#define raw_cpu_write_2(pcp, val) percpu_to_op(, "mov", (pcp), val) +#define raw_cpu_write_4(pcp, val) percpu_to_op(, "mov", (pcp), val) +#define raw_cpu_add_1(pcp, val) percpu_add_op(, (pcp), val) +#define raw_cpu_add_2(pcp, val) percpu_add_op(, (pcp), val) +#define raw_cpu_add_4(pcp, val) percpu_add_op(, (pcp), val) +#define raw_cpu_and_1(pcp, val) percpu_to_op(, "and", (pcp), val) +#define raw_cpu_and_2(pcp, val) percpu_to_op(, "and", (pcp), val) +#define raw_cpu_and_4(pcp, val) percpu_to_op(, "and", (pcp), val) +#define raw_cpu_or_1(pcp, val) percpu_to_op(, "or", (pcp), val) +#define raw_cpu_or_2(pcp, val) percpu_to_op(, "or", (pcp), val) +#define raw_cpu_or_4(pcp, val) percpu_to_op(, "or", (pcp), val) +#define raw_cpu_xchg_1(pcp, val) percpu_xchg_op(, pcp, val) +#define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(, pcp, val) +#define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(, pcp, val) + +#define this_cpu_read_1(pcp) percpu_from_op(volatile, "mov", pcp) +#define this_cpu_read_2(pcp) percpu_from_op(volatile, "mov", pcp) +#define this_cpu_read_4(pcp) percpu_from_op(volatile, "mov", pcp) +#define this_cpu_write_1(pcp, val) percpu_to_op(volatile, "mov", (pcp), val) +#define this_cpu_write_2(pcp, val) percpu_to_op(volatile, "mov", (pcp), val) +#define this_cpu_write_4(pcp, val) percpu_to_op(volatile, "mov", (pcp), val) +#define this_cpu_add_1(pcp, val) percpu_add_op(volatile, (pcp), val) +#define this_cpu_add_2(pcp, val) percpu_add_op(volatile, (pcp), val) +#define this_cpu_add_4(pcp, val) percpu_add_op(volatile, (pcp), val) +#define this_cpu_and_1(pcp, val) percpu_to_op(volatile, "and", (pcp), val) +#define this_cpu_and_2(pcp, val) percpu_to_op(volatile, "and", (pcp), val) +#define this_cpu_and_4(pcp, val) percpu_to_op(volatile, "and", (pcp), val) +#define this_cpu_or_1(pcp, val) percpu_to_op(volatile, "or", (pcp), val) +#define this_cpu_or_2(pcp, val) percpu_to_op(volatile, "or", (pcp), val) +#define this_cpu_or_4(pcp, val) percpu_to_op(volatile, "or", (pcp), val) +#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(volatile, pcp, nval) +#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(volatile, pcp, nval) +#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(volatile, pcp, nval) + +#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(, pcp, val) +#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(, pcp, val) +#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(, pcp, val) +#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval) +#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval) +#define raw_cpu_cmpxchg_4(pcp, oval, nval) 
percpu_cmpxchg_op(, pcp, oval, nval) + +#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(volatile, pcp, val) +#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(volatile, pcp, val) +#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(volatile, pcp, val) +#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval) +#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval) +#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval) #ifdef CONFIG_X86_CMPXCHG64 #define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2) \ @@ -466,23 +466,23 @@ do { \ * 32 bit must fall back to generic operations. */ #ifdef CONFIG_X86_64 -#define raw_cpu_read_8(pcp) percpu_from_op("mov", pcp) -#define raw_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define raw_cpu_add_8(pcp, val) percpu_add_op((pcp), val) -#define raw_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) -#define raw_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) -#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) -#define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) -#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) - -#define this_cpu_read_8(pcp) percpu_from_op("mov", pcp) -#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) -#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) -#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) -#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) -#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_read_8(pcp) percpu_from_op(, "mov", pcp) +#define raw_cpu_write_8(pcp, val) percpu_to_op(, "mov", (pcp), val) +#define raw_cpu_add_8(pcp, val) percpu_add_op(, (pcp), val) +#define raw_cpu_and_8(pcp, val) percpu_to_op(, "and", (pcp), val) +#define raw_cpu_or_8(pcp, val) percpu_to_op(, "or", (pcp), val) +#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(, pcp, val) +#define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(, pcp, nval) +#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval) + +#define this_cpu_read_8(pcp) percpu_from_op(volatile, "mov", pcp) +#define this_cpu_write_8(pcp, val) percpu_to_op(volatile, "mov", (pcp), val) +#define this_cpu_add_8(pcp, val) percpu_add_op(volatile, (pcp), val) +#define this_cpu_and_8(pcp, val) percpu_to_op(volatile, "and", (pcp), val) +#define this_cpu_or_8(pcp, val) percpu_to_op(volatile, "or", (pcp), val) +#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(volatile, pcp, val) +#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(volatile, pcp, nval) +#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval) /* * Pretty complex macro to generate cmpxchg16 instruction. The instruction -- cgit v1.2.1 From 9ed7d75b2f09d836e71d597cd5879abb1a44e7a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Feb 2019 09:48:51 +0100 Subject: x86/percpu: Relax smp_processor_id() Nadav reported that since this_cpu_read() became asm-volatile, many smp_processor_id() users generated worse code due to the extra constraints. However since smp_processor_id() is reading a stable value, we can use __this_cpu_read(). 
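As a hedged aside (an illustrative sketch assuming the usual percpu cpu_number variable, not code from this patch), the codegen difference is simply that the asm-volatile form must be re-issued at every use, while the relaxed form may be reused by the compiler:

	/* Hypothetical helpers for illustration only. */
	static inline int cpu_twice_volatile(void)
	{
		/* two separate percpu loads: this_cpu_read() is asm volatile */
		return this_cpu_read(cpu_number) + this_cpu_read(cpu_number);
	}

	static inline int cpu_twice_relaxed(void)
	{
		/* the compiler may emit one load and reuse it for both operands */
		return __this_cpu_read(cpu_number) + __this_cpu_read(cpu_number);
	}
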
While this does reduce text size somewhat, this mostly results in code movement to .text.unlikely as a result of more/larger .cold. subfunctions. Less text on the hotpath is good for I$. $ ./compare.sh defconfig-build1 defconfig-build2 vmlinux.o setup_APIC_ibs 90 98 -12,+20 force_ibs_eilvt_setup 400 413 -57,+70 pci_serr_error 109 104 -54,+49 pci_serr_error 109 104 -54,+49 unknown_nmi_error 125 120 -76,+71 unknown_nmi_error 125 120 -76,+71 io_check_error 125 132 -97,+104 intel_thermal_interrupt 730 822 +92,+0 intel_init_thermal 951 945 -6,+0 generic_get_mtrr 301 294 -7,+0 generic_get_mtrr 301 294 -7,+0 generic_set_all 749 754 -44,+49 get_fixed_ranges 352 360 -41,+49 x86_acpi_suspend_lowlevel 369 363 -6,+0 check_tsc_sync_source 412 412 -71,+71 irq_migrate_all_off_this_cpu 662 674 -14,+26 clocksource_watchdog 748 748 -113,+113 __perf_event_account_interrupt 204 197 -7,+0 attempt_merge 1748 1741 -7,+0 intel_guc_send_ct 1424 1409 -15,+0 __fini_doorbell 235 231 -4,+0 bdw_set_cdclk 928 923 -5,+0 gen11_dsi_disable 1571 1556 -15,+0 gmbus_wait 493 488 -5,+0 md_make_request 376 369 -7,+0 __split_and_process_bio 543 536 -7,+0 delay_tsc 96 89 -7,+0 hsw_disable_pc8 696 691 -5,+0 tsc_verify_tsc_adjust 215 228 -22,+35 cpuidle_driver_unref 56 49 -7,+0 blk_account_io_completion 159 148 -11,+0 mtrr_wrmsr 95 99 -29,+33 __intel_wait_for_register_fw 401 419 +18,+0 cpuidle_driver_ref 43 36 -7,+0 cpuidle_get_driver 15 8 -7,+0 blk_account_io_done 535 528 -7,+0 irq_migrate_all_off_this_cpu 662 674 -14,+26 check_tsc_sync_source 412 412 -71,+71 irq_wait_for_poll 170 163 -7,+0 generic_end_io_acct 329 322 -7,+0 x86_acpi_suspend_lowlevel 369 363 -6,+0 nohz_balance_enter_idle 198 191 -7,+0 generic_start_io_acct 254 247 -7,+0 blk_account_io_start 341 334 -7,+0 perf_event_task_tick 682 675 -7,+0 intel_init_thermal 951 945 -6,+0 amd_e400_c1e_apic_setup 47 51 -28,+32 setup_APIC_eilvt 350 328 -22,+0 hsw_enable_pc8 1611 1605 -6,+0 total 12985947 12985892 -994,+939 Reported-by: Nadav Amit Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/smp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index da545df207b2..0d3fe060a44f 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -162,7 +162,8 @@ __visible void smp_call_function_single_interrupt(struct pt_regs *r); * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. */ -#define raw_smp_processor_id() (this_cpu_read(cpu_number)) +#define raw_smp_processor_id() this_cpu_read(cpu_number) +#define __smp_processor_id() __this_cpu_read(cpu_number) #ifdef CONFIG_X86_32 extern int safe_smp_processor_id(void); -- cgit v1.2.1 From 602447f95461469e20c81254c1c08be23a46fe53 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Feb 2019 09:53:46 +0100 Subject: x86/percpu, x86/irq: Relax {set,get}_irq_regs() Nadav reported that since the this_cpu_*() ops got asm-volatile constraints on, code generation suffered for do_IRQ(), but since this is all with IRQs disabled we can use __this_cpu_*(). 
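
For reference, the nesting pattern being relaxed looks roughly like the userspace sketch below, with a thread-local variable standing in for the per-CPU slot (the struct and names are made up for the example). The relaxed accessors are safe here only because the whole save/install/restore sequence runs with IRQs disabled, so nothing can migrate or interleave between the read and the write.

/* Userspace analogue of the get_irq_regs()/set_irq_regs() pattern. */
struct fake_regs { unsigned long ip; };

static __thread struct fake_regs *cur_regs;	/* ~ DECLARE_PER_CPU(irq_regs) */

static struct fake_regs *set_regs(struct fake_regs *new_regs)
{
	struct fake_regs *old_regs = cur_regs;	/* ~ __this_cpu_read()  */
	cur_regs = new_regs;			/* ~ __this_cpu_write() */
	return old_regs;
}

void fake_handler(struct fake_regs *regs)
{
	struct fake_regs *old_regs = set_regs(regs);
	/* ... handle the interrupt ... */
	set_regs(old_regs);			/* restore on the way out */
}
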
smp_x86_platform_ipi 234 222 -12,+0 smp_kvm_posted_intr_ipi 74 66 -8,+0 smp_kvm_posted_intr_wakeup_ipi 86 78 -8,+0 smp_apic_timer_interrupt 292 284 -8,+0 smp_kvm_posted_intr_nested_ipi 74 66 -8,+0 do_IRQ 195 187 -8,+0 Reported-by: Nadav Amit Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_regs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h index 8f3bee821e6c..187ce59aea28 100644 --- a/arch/x86/include/asm/irq_regs.h +++ b/arch/x86/include/asm/irq_regs.h @@ -16,7 +16,7 @@ DECLARE_PER_CPU(struct pt_regs *, irq_regs); static inline struct pt_regs *get_irq_regs(void) { - return this_cpu_read(irq_regs); + return __this_cpu_read(irq_regs); } static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) @@ -24,7 +24,7 @@ static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) struct pt_regs *old_regs; old_regs = get_irq_regs(); - this_cpu_write(irq_regs, new_regs); + __this_cpu_write(irq_regs, new_regs); return old_regs; } -- cgit v1.2.1 From 2234a6d3a28a971182cc91d5679e444516421de0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Feb 2019 11:09:56 +0100 Subject: x86/percpu: Optimize raw_cpu_xchg() Since raw_cpu_xchg() doesn't need to be IRQ-safe, like this_cpu_xchg(), we can use a simple load-store instead of the cmpxchg loop. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/percpu.h | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index f75ccccd71aa..2278797c769d 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -407,9 +407,21 @@ do { \ #define raw_cpu_or_1(pcp, val) percpu_to_op(, "or", (pcp), val) #define raw_cpu_or_2(pcp, val) percpu_to_op(, "or", (pcp), val) #define raw_cpu_or_4(pcp, val) percpu_to_op(, "or", (pcp), val) -#define raw_cpu_xchg_1(pcp, val) percpu_xchg_op(, pcp, val) -#define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(, pcp, val) -#define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(, pcp, val) + +/* + * raw_cpu_xchg() can use a load-store since it is not required to be + * IRQ-safe. 
+ */ +#define raw_percpu_xchg_op(var, nval) \ +({ \ + typeof(var) pxo_ret__ = raw_cpu_read(var); \ + raw_cpu_write(var, (nval)); \ + pxo_ret__; \ +}) + +#define raw_cpu_xchg_1(pcp, val) raw_percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val) #define this_cpu_read_1(pcp) percpu_from_op(volatile, "mov", pcp) #define this_cpu_read_2(pcp) percpu_from_op(volatile, "mov", pcp) @@ -472,7 +484,7 @@ do { \ #define raw_cpu_and_8(pcp, val) percpu_to_op(, "and", (pcp), val) #define raw_cpu_or_8(pcp, val) percpu_to_op(, "or", (pcp), val) #define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(, pcp, val) -#define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(, pcp, nval) +#define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval) #define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval) #define this_cpu_read_8(pcp) percpu_from_op(volatile, "mov", pcp) -- cgit v1.2.1 From bf10c97adbd0dc8fa65c35d5b0c0dc281a68ac8e Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 25 Jun 2019 11:45:48 +0800 Subject: x86/jump_label: Make tp_vec_nr static Fix sparse warning: arch/x86/kernel/jump_label.c:106:5: warning: symbol 'tp_vec_nr' was not declared. Should it be static? It's only used in jump_label.c, so make it static. Fixes: ba54f0c3f7c4 ("x86/jump_label: Batch jump label updates") Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Thomas Gleixner Cc: Cc: Cc: Cc: Cc: Link: https://lkml.kernel.org/r/20190625034548.26392-1-yuehaibing@huawei.com --- arch/x86/kernel/jump_label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index ea13808bf6da..044053235302 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -103,7 +103,7 @@ void arch_jump_label_transform(struct jump_entry *entry, #define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) static struct text_poke_loc tp_vec[TP_VEC_MAX]; -int tp_vec_nr = 0; +static int tp_vec_nr; bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) -- cgit v1.2.1
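
The class of warning fixed above, in miniature (a hypothetical file, with names made up rather than the kernel's): sparse flags any file-scope definition that has external linkage but no declaration in an included header, and giving the symbol internal linkage both silences the warning and keeps it out of the global namespace.

/* before: external linkage, declared nowhere else -> sparse warning */
int tp_count;

/* after: internal linkage, private to this translation unit */
static int tp_count_fixed;
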