author    | Stephen Rothwell <sfr@canb.auug.org.au> | 2019-12-04 10:54:47 +1100
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2019-12-04 10:54:47 +1100
commit    | 4fc52faf253242cb5bab57fcb062e37e89770ad8 (patch)
tree      | 265255968de646fad754c1bc0b1edc4d966ed9fa /kernel
parent    | 4b91b79c32a4d6623506e46f495cc1af809e5de9 (diff)
parent    | e445033e58108a9891abfbc0dea90b066a75e4a9 (diff)
download  | linux-next-4fc52faf253242cb5bab57fcb062e37e89770ad8.tar.gz
Merge remote-tracking branch 'tip/auto-latest'
# Conflicts:
# arch/x86/entry/entry_32.S
# arch/x86/include/asm/pgtable_32_types.h
# arch/x86/kernel/head_32.S
# arch/x86/mm/pat_interval.c
# include/trace/events/rpcrdma.h
# kernel/time/time.c
# kernel/trace/trace_export.c
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile                  |   6
-rw-r--r-- | kernel/bpf/stackmap.c            |   2
-rw-r--r-- | kernel/fork.c                    |   2
-rw-r--r-- | kernel/irq/chip.c                |  44
-rw-r--r-- | kernel/irq/irqdesc.c             |   2
-rw-r--r-- | kernel/irq_work.c                |  34
-rw-r--r-- | kernel/kcsan/Makefile            |  11
-rw-r--r-- | kernel/kcsan/atomic.h            |  27
-rw-r--r-- | kernel/kcsan/core.c              | 621
-rw-r--r-- | kernel/kcsan/debugfs.c           | 271
-rw-r--r-- | kernel/kcsan/encoding.h          |  95
-rw-r--r-- | kernel/kcsan/kcsan.h             | 109
-rw-r--r-- | kernel/kcsan/report.c            | 318
-rw-r--r-- | kernel/kcsan/test.c              | 121
-rw-r--r-- | kernel/module.c                  |  43
-rw-r--r-- | kernel/printk/printk.c           |   2
-rw-r--r-- | kernel/sched/Makefile            |   6
-rw-r--r-- | kernel/time/hrtimer.c            |  11
-rw-r--r-- | kernel/time/time.c               |  25
-rw-r--r-- | kernel/trace/bpf_trace.c         |   2
-rw-r--r-- | kernel/trace/trace.h             |  31
-rw-r--r-- | kernel/trace/trace_entries.h     |  66
-rw-r--r-- | kernel/trace/trace_events.c      |  20
-rw-r--r-- | kernel/trace/trace_events_hist.c |   8
-rw-r--r-- | kernel/trace/trace_export.c      | 106
-rw-r--r-- | kernel/trace/trace_kprobe.c      |  16
-rw-r--r-- | kernel/trace/trace_syscalls.c    |  50
-rw-r--r-- | kernel/trace/trace_uprobe.c      |   9
28 files changed, 1806 insertions, 252 deletions
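Several hunks below (kernel/irq_work.c, kernel/bpf/stackmap.c, kernel/trace/bpf_trace.c, kernel/printk/printk.c) come from the conversion of irq_work->flags to an atomic_t, which replaces the old cmpxchg() retry loop in irq_work_claim() with a single atomic_fetch_or(). The following stand-alone sketch only illustrates that claim logic: it uses C11 atomics rather than the kernel's atomic_t API, the struct and function names are invented for the example, and the flag values are assumed to match include/linux/irq_work.h of that era.

```c
#include <stdatomic.h>
#include <stdbool.h>

/*
 * Flag values assumed to mirror include/linux/irq_work.h at the time of this
 * merge; the claim logic mirrors the new irq_work_claim() in the hunk below,
 * but with C11 atomics instead of the kernel's atomic_t helpers.
 */
#define IRQ_WORK_PENDING	(1 << 0)
#define IRQ_WORK_BUSY		(1 << 1)
#define IRQ_WORK_CLAIMED	(IRQ_WORK_PENDING | IRQ_WORK_BUSY)

struct irq_work_sketch {
	atomic_int flags;	/* stands in for the kernel's atomic_t flags */
};

/*
 * Returns true if this caller claimed the work and must queue/raise it,
 * false if it was already pending and the existing claimant will run it.
 * A single fetch-or suffices: exactly one caller can observe PENDING clear
 * in the returned old value, which is why the cmpxchg() retry loop goes away.
 */
static bool irq_work_claim_sketch(struct irq_work_sketch *work)
{
	int oflags = atomic_fetch_or(&work->flags, IRQ_WORK_CLAIMED);

	return !(oflags & IRQ_WORK_PENDING);
}
```

The users visible in this diff follow the same pattern: static initialisation switches to ATOMIC_INIT(IRQ_WORK_LAZY) in printk, and busy checks become atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY.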
diff --git a/kernel/Makefile b/kernel/Makefile index f2cc0d118a0b..e5ffd8c00254 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,6 +23,9 @@ endif # Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip() # in coverage traces. KCOV_INSTRUMENT_softirq.o := n +# Avoid KCSAN instrumentation in softirq ("No shared variables, all the data +# are CPU local" => assume no data races), to reduce overhead in interrupts. +KCSAN_SANITIZE_softirq.o = n # These are called from save_stack_trace() on slub debug path, # and produce insane amounts of uninteresting coverage. KCOV_INSTRUMENT_module.o := n @@ -30,6 +33,7 @@ KCOV_INSTRUMENT_extable.o := n # Don't self-instrument. KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n +KCSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) # cond_syscall is currently not LTO compatible @@ -102,6 +106,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ +obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_PERF_EVENTS) += events/ @@ -119,6 +124,7 @@ obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o KASAN_SANITIZE_stackleak.o := n +KCSAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n $(obj)/configs.o: $(obj)/config_data.gz diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index caca752ee5e6..3f958b90d914 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -289,7 +289,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, if (irqs_disabled()) { work = this_cpu_ptr(&up_read_work); - if (work->irq_work.flags & IRQ_WORK_BUSY) + if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) /* cannot queue more up_read, fallback */ irq_work_busy = true; } diff --git a/kernel/fork.c b/kernel/fork.c index 21c6c1e29b98..2508a4f238a3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2185,7 +2185,7 @@ static __latent_entropy struct task_struct *copy_process( */ p->start_time = ktime_get_ns(); - p->real_start_time = ktime_get_boottime_ns(); + p->start_boottime = ktime_get_boottime_ns(); /* * Make it visible to the rest of the system, but dont wake it up yet. diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b76703b2c0af..b3fa2d87d2f3 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1298,6 +1298,50 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq); #endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */ /** + * irq_chip_set_parent_state - set the state of a parent interrupt. + * + * @data: Pointer to interrupt specific data + * @which: State to be restored (one of IRQCHIP_STATE_*) + * @val: Value corresponding to @which + * + * Conditional success, if the underlying irqchip does not implement it. + */ +int irq_chip_set_parent_state(struct irq_data *data, + enum irqchip_irq_state which, + bool val) +{ + data = data->parent_data; + + if (!data || !data->chip->irq_set_irqchip_state) + return 0; + + return data->chip->irq_set_irqchip_state(data, which, val); +} +EXPORT_SYMBOL_GPL(irq_chip_set_parent_state); + +/** + * irq_chip_get_parent_state - get the state of a parent interrupt. + * + * @data: Pointer to interrupt specific data + * @which: one of IRQCHIP_STATE_* the caller wants to know + * @state: a pointer to a boolean where the state is to be stored + * + * Conditional success, if the underlying irqchip does not implement it. 
+ */ +int irq_chip_get_parent_state(struct irq_data *data, + enum irqchip_irq_state which, + bool *state) +{ + data = data->parent_data; + + if (!data || !data->chip->irq_get_irqchip_state) + return 0; + + return data->chip->irq_get_irqchip_state(data, which, state); +} +EXPORT_SYMBOL_GPL(irq_chip_get_parent_state); + +/** * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if * NULL) * @data: Pointer to interrupt specific data diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9be995fc3c5a..5b8fdd659e54 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -750,7 +750,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt) EXPORT_SYMBOL_GPL(irq_free_descs); /** - * irq_alloc_descs - allocate and initialize a range of irq descriptors + * __irq_alloc_descs - allocate and initialize a range of irq descriptors * @irq: Allocate for specific irq number if irq >= 0 * @from: Start the search from this irq number * @cnt: Number of consecutive irqs to allocate. diff --git a/kernel/irq_work.c b/kernel/irq_work.c index d42acaf81886..828cc30774bc 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -29,24 +29,16 @@ static DEFINE_PER_CPU(struct llist_head, lazy_list); */ static bool irq_work_claim(struct irq_work *work) { - unsigned long flags, oflags, nflags; + int oflags; + oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags); /* - * Start with our best wish as a premise but only trust any - * flag value after cmpxchg() result. + * If the work is already pending, no need to raise the IPI. + * The pairing atomic_fetch_andnot() in irq_work_run() makes sure + * everything we did before is visible. */ - flags = work->flags & ~IRQ_WORK_PENDING; - for (;;) { - nflags = flags | IRQ_WORK_CLAIMED; - oflags = cmpxchg(&work->flags, flags, nflags); - if (oflags == flags) - break; - if (oflags & IRQ_WORK_PENDING) - return false; - flags = oflags; - cpu_relax(); - } - + if (oflags & IRQ_WORK_PENDING) + return false; return true; } @@ -61,7 +53,7 @@ void __weak arch_irq_work_raise(void) static void __irq_work_queue_local(struct irq_work *work) { /* If the work is "lazy", handle it from next tick if any */ - if (work->flags & IRQ_WORK_LAZY) { + if (atomic_read(&work->flags) & IRQ_WORK_LAZY) { if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && tick_nohz_tick_stopped()) arch_irq_work_raise(); @@ -143,7 +135,6 @@ static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; struct llist_node *llnode; - unsigned long flags; BUG_ON(!irqs_disabled()); @@ -152,6 +143,7 @@ static void irq_work_run_list(struct llist_head *list) llnode = llist_del_all(list); llist_for_each_entry_safe(work, tmp, llnode, llnode) { + int flags; /* * Clear the PENDING bit, after this point the @work * can be re-used. @@ -159,15 +151,15 @@ static void irq_work_run_list(struct llist_head *list) * to claim that work don't rely on us to handle their data * while we are in the middle of the func. */ - flags = work->flags & ~IRQ_WORK_PENDING; - xchg(&work->flags, flags); + flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); work->func(work); /* * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. 
*/ - (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); + flags &= ~IRQ_WORK_PENDING; + (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); } } @@ -199,7 +191,7 @@ void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); - while (work->flags & IRQ_WORK_BUSY) + while (atomic_read(&work->flags) & IRQ_WORK_BUSY) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile new file mode 100644 index 000000000000..dd15b62ec0b5 --- /dev/null +++ b/kernel/kcsan/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +KCSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) + +CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \ + $(call cc-option,-fno-stack-protector,) + +obj-y := core.o debugfs.o report.o +obj-$(CONFIG_KCSAN_SELFTEST) += test.o diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h new file mode 100644 index 000000000000..576e03ddd6a3 --- /dev/null +++ b/kernel/kcsan/atomic.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_KCSAN_ATOMIC_H +#define _KERNEL_KCSAN_ATOMIC_H + +#include <linux/jiffies.h> + +/* + * Helper that returns true if access to @ptr should be considered an atomic + * access, even though it is not explicitly atomic. + * + * List all volatile globals that have been observed in races, to suppress + * data race reports between accesses to these variables. + * + * For now, we assume that volatile accesses of globals are as strong as atomic + * accesses (READ_ONCE, WRITE_ONCE cast to volatile). The situation is still not + * entirely clear, as on some architectures (Alpha) READ_ONCE/WRITE_ONCE do more + * than cast to volatile. Eventually, we hope to be able to remove this + * function. + */ +static inline bool kcsan_is_atomic(const volatile void *ptr) +{ + /* only jiffies for now */ + return ptr == &jiffies; +} + +#endif /* _KERNEL_KCSAN_ATOMIC_H */ diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c new file mode 100644 index 000000000000..3314fc29e236 --- /dev/null +++ b/kernel/kcsan/core.c @@ -0,0 +1,621 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/atomic.h> +#include <linux/bug.h> +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/random.h> +#include <linux/sched.h> +#include <linux/uaccess.h> + +#include "atomic.h" +#include "encoding.h" +#include "kcsan.h" + +bool kcsan_enabled; + +/* Per-CPU kcsan_ctx for interrupts */ +static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { + .disable_count = 0, + .atomic_next = 0, + .atomic_nest_count = 0, + .in_flat_atomic = false, +}; + +/* + * Helper macros to index into adjacent slots slots, starting from address slot + * itself, followed by the right and left slots. + * + * The purpose is 2-fold: + * + * 1. if during insertion the address slot is already occupied, check if + * any adjacent slots are free; + * 2. accesses that straddle a slot boundary due to size that exceeds a + * slot's range may check adjacent slots if any watchpoint matches. + * + * Note that accesses with very large size may still miss a watchpoint; however, + * given this should be rare, this is a reasonable trade-off to make, since this + * will avoid: + * + * 1. excessive contention between watchpoint checks and setup; + * 2. larger number of simultaneous watchpoints without sacrificing + * performance. 
+ * + * Example: SLOT_IDX values for KCSAN_CHECK_ADJACENT=1, where i is [0, 1, 2]: + * + * slot=0: [ 1, 2, 0] + * slot=9: [10, 11, 9] + * slot=63: [64, 65, 63] + */ +#define NUM_SLOTS (1 + 2*KCSAN_CHECK_ADJACENT) +#define SLOT_IDX(slot, i) (slot + ((i + KCSAN_CHECK_ADJACENT) % NUM_SLOTS)) + +/* + * SLOT_IDX_FAST is used in the fast-path. Not first checking the address's primary + * slot (middle) is fine if we assume that data races occur rarely. The set of + * indices {SLOT_IDX(slot, i) | i in [0, NUM_SLOTS)} is equivalent to + * {SLOT_IDX_FAST(slot, i) | i in [0, NUM_SLOTS)}. + */ +#define SLOT_IDX_FAST(slot, i) (slot + i) + +/* + * Watchpoints, with each entry encoded as defined in encoding.h: in order to be + * able to safely update and access a watchpoint without introducing locking + * overhead, we encode each watchpoint as a single atomic long. The initial + * zero-initialized state matches INVALID_WATCHPOINT. + * + * Add NUM_SLOTS-1 entries to account for overflow; this helps avoid having to + * use more complicated SLOT_IDX_FAST calculation with modulo in the fast-path. + */ +static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; + +/* + * Instructions to skip watching counter, used in should_watch(). We use a + * per-CPU counter to avoid excessive contention. + */ +static DEFINE_PER_CPU(long, kcsan_skip); + +static inline atomic_long_t *find_watchpoint(unsigned long addr, + size_t size, + bool expect_write, + long *encoded_watchpoint) +{ + const int slot = watchpoint_slot(addr); + const unsigned long addr_masked = addr & WATCHPOINT_ADDR_MASK; + atomic_long_t *watchpoint; + unsigned long wp_addr_masked; + size_t wp_size; + bool is_write; + int i; + + BUILD_BUG_ON(CONFIG_KCSAN_NUM_WATCHPOINTS < NUM_SLOTS); + + for (i = 0; i < NUM_SLOTS; ++i) { + watchpoint = &watchpoints[SLOT_IDX_FAST(slot, i)]; + *encoded_watchpoint = atomic_long_read(watchpoint); + if (!decode_watchpoint(*encoded_watchpoint, &wp_addr_masked, + &wp_size, &is_write)) + continue; + + if (expect_write && !is_write) + continue; + + /* Check if the watchpoint matches the access. */ + if (matching_access(wp_addr_masked, wp_size, addr_masked, size)) + return watchpoint; + } + + return NULL; +} + +static inline atomic_long_t * +insert_watchpoint(unsigned long addr, size_t size, bool is_write) +{ + const int slot = watchpoint_slot(addr); + const long encoded_watchpoint = encode_watchpoint(addr, size, is_write); + atomic_long_t *watchpoint; + int i; + + /* Check slot index logic, ensuring we stay within array bounds. */ + BUILD_BUG_ON(SLOT_IDX(0, 0) != KCSAN_CHECK_ADJACENT); + BUILD_BUG_ON(SLOT_IDX(0, KCSAN_CHECK_ADJACENT+1) != 0); + BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS-1, KCSAN_CHECK_ADJACENT) != ARRAY_SIZE(watchpoints)-1); + BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS-1, KCSAN_CHECK_ADJACENT+1) != ARRAY_SIZE(watchpoints) - NUM_SLOTS); + + for (i = 0; i < NUM_SLOTS; ++i) { + long expect_val = INVALID_WATCHPOINT; + + /* Try to acquire this slot. */ + watchpoint = &watchpoints[SLOT_IDX(slot, i)]; + if (atomic_long_try_cmpxchg_relaxed(watchpoint, &expect_val, encoded_watchpoint)) + return watchpoint; + } + + return NULL; +} + +/* + * Return true if watchpoint was successfully consumed, false otherwise. + * + * This may return false if: + * + * 1. another thread already consumed the watchpoint; + * 2. the thread that set up the watchpoint already removed it; + * 3. the watchpoint was removed and then re-used. 
+ */ +static inline bool +try_consume_watchpoint(atomic_long_t *watchpoint, long encoded_watchpoint) +{ + return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, CONSUMED_WATCHPOINT); +} + +/* + * Return true if watchpoint was not touched, false if consumed. + */ +static inline bool remove_watchpoint(atomic_long_t *watchpoint) +{ + return atomic_long_xchg_relaxed(watchpoint, INVALID_WATCHPOINT) != CONSUMED_WATCHPOINT; +} + +static inline struct kcsan_ctx *get_ctx(void) +{ + /* + * In interrupts, use raw_cpu_ptr to avoid unnecessary checks, that would + * also result in calls that generate warnings in uaccess regions. + */ + return in_task() ? ¤t->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); +} + +static inline bool is_atomic(const volatile void *ptr) +{ + struct kcsan_ctx *ctx = get_ctx(); + + if (unlikely(ctx->atomic_next > 0)) { + /* + * Because we do not have separate contexts for nested + * interrupts, in case atomic_next is set, we simply assume that + * the outer interrupt set atomic_next. In the worst case, we + * will conservatively consider operations as atomic. This is a + * reasonable trade-off to make, since this case should be + * extremely rare; however, even if extremely rare, it could + * lead to false positives otherwise. + */ + if ((hardirq_count() >> HARDIRQ_SHIFT) < 2) + --ctx->atomic_next; /* in task, or outer interrupt */ + return true; + } + if (unlikely(ctx->atomic_nest_count > 0 || ctx->in_flat_atomic)) + return true; + + return kcsan_is_atomic(ptr); +} + +static inline bool should_watch(const volatile void *ptr, int type) +{ + /* + * Never set up watchpoints when memory operations are atomic. + * + * Need to check this first, before kcsan_skip check below: (1) atomics + * should not count towards skipped instructions, and (2) to actually + * decrement kcsan_atomic_next for consecutive instruction stream. + */ + if ((type & KCSAN_ACCESS_ATOMIC) != 0 || is_atomic(ptr)) + return false; + + if (this_cpu_dec_return(kcsan_skip) >= 0) + return false; + + /* + * NOTE: If we get here, kcsan_skip must always be reset in slow path + * via reset_kcsan_skip() to avoid underflow. + */ + + /* this operation should be watched */ + return true; +} + +static inline void reset_kcsan_skip(void) +{ + long skip_count = CONFIG_KCSAN_SKIP_WATCH - + (IS_ENABLED(CONFIG_KCSAN_SKIP_WATCH_RANDOMIZE) ? + prandom_u32_max(CONFIG_KCSAN_SKIP_WATCH) : + 0); + this_cpu_write(kcsan_skip, skip_count); +} + +static inline bool kcsan_is_enabled(void) +{ + return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0; +} + +static inline unsigned int get_delay(void) +{ + unsigned int delay = in_task() ? CONFIG_KCSAN_UDELAY_TASK : + CONFIG_KCSAN_UDELAY_INTERRUPT; + return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ? + prandom_u32_max(delay) : + 0); +} + +/* + * Pull everything together: check_access() below contains the performance + * critical operations; the fast-path (including check_access) functions should + * all be inlinable by the instrumentation functions. + * + * The slow-path (kcsan_found_watchpoint, kcsan_setup_watchpoint) are + * non-inlinable -- note that, we prefix these with "kcsan_" to ensure they can + * be filtered from the stacktrace, as well as give them unique names for the + * UACCESS whitelist of objtool. Each function uses user_access_save/restore(), + * since they do not access any user memory, but instrumentation is still + * emitted in UACCESS regions. 
+ */ + +static noinline void kcsan_found_watchpoint(const volatile void *ptr, + size_t size, + bool is_write, + atomic_long_t *watchpoint, + long encoded_watchpoint) +{ + unsigned long flags; + bool consumed; + + if (!kcsan_is_enabled()) + return; + /* + * Consume the watchpoint as soon as possible, to minimize the chances + * of !consumed. Consuming the watchpoint must always be guarded by + * kcsan_is_enabled() check, as otherwise we might erroneously + * triggering reports when disabled. + */ + consumed = try_consume_watchpoint(watchpoint, encoded_watchpoint); + + /* keep this after try_consume_watchpoint */ + flags = user_access_save(); + + if (consumed) { + kcsan_report(ptr, size, is_write, true, raw_smp_processor_id(), + KCSAN_REPORT_CONSUMED_WATCHPOINT); + } else { + /* + * The other thread may not print any diagnostics, as it has + * already removed the watchpoint, or another thread consumed + * the watchpoint before this thread. + */ + kcsan_counter_inc(KCSAN_COUNTER_REPORT_RACES); + } + kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES); + + user_access_restore(flags); +} + +static noinline void +kcsan_setup_watchpoint(const volatile void *ptr, size_t size, bool is_write) +{ + atomic_long_t *watchpoint; + union { + u8 _1; + u16 _2; + u32 _4; + u64 _8; + } expect_value; + bool value_change = false; + unsigned long ua_flags = user_access_save(); + unsigned long irq_flags; + + /* + * Always reset kcsan_skip counter in slow-path to avoid underflow; see + * should_watch(). + */ + reset_kcsan_skip(); + + if (!kcsan_is_enabled()) + goto out; + + if (!check_encodable((unsigned long)ptr, size)) { + kcsan_counter_inc(KCSAN_COUNTER_UNENCODABLE_ACCESSES); + goto out; + } + + /* + * Disable interrupts & preemptions to avoid another thread on the same + * CPU accessing memory locations for the set up watchpoint; this is to + * avoid reporting races to e.g. CPU-local data. + * + * An alternative would be adding the source CPU to the watchpoint + * encoding, and checking that watchpoint-CPU != this-CPU. There are + * several problems with this: + * 1. we should avoid stealing more bits from the watchpoint encoding + * as it would affect accuracy, as well as increase performance + * overhead in the fast-path; + * 2. if we are preempted, but there *is* a genuine data race, we + * would *not* report it -- since this is the common case (vs. + * CPU-local data accesses), it makes more sense (from a data race + * detection point of view) to simply disable preemptions to ensure + * as many tasks as possible run on other CPUs. + */ + local_irq_save(irq_flags); + + watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); + if (watchpoint == NULL) { + /* + * Out of capacity: the size of 'watchpoints', and the frequency + * with which should_watch() returns true should be tweaked so + * that this case happens very rarely. + */ + kcsan_counter_inc(KCSAN_COUNTER_NO_CAPACITY); + goto out_unlock; + } + + kcsan_counter_inc(KCSAN_COUNTER_SETUP_WATCHPOINTS); + kcsan_counter_inc(KCSAN_COUNTER_USED_WATCHPOINTS); + + /* + * Read the current value, to later check and infer a race if the data + * was modified via a non-instrumented access, e.g. from a device. 
+ */ + switch (size) { + case 1: + expect_value._1 = READ_ONCE(*(const u8 *)ptr); + break; + case 2: + expect_value._2 = READ_ONCE(*(const u16 *)ptr); + break; + case 4: + expect_value._4 = READ_ONCE(*(const u32 *)ptr); + break; + case 8: + expect_value._8 = READ_ONCE(*(const u64 *)ptr); + break; + default: + break; /* ignore; we do not diff the values */ + } + + if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) { + kcsan_disable_current(); + pr_err("KCSAN: watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n", + is_write ? "write" : "read", size, ptr, + watchpoint_slot((unsigned long)ptr), + encode_watchpoint((unsigned long)ptr, size, is_write)); + kcsan_enable_current(); + } + + /* + * Delay this thread, to increase probability of observing a racy + * conflicting access. + */ + udelay(get_delay()); + + /* + * Re-read value, and check if it is as expected; if not, we infer a + * racy access. + */ + switch (size) { + case 1: + value_change = expect_value._1 != READ_ONCE(*(const u8 *)ptr); + break; + case 2: + value_change = expect_value._2 != READ_ONCE(*(const u16 *)ptr); + break; + case 4: + value_change = expect_value._4 != READ_ONCE(*(const u32 *)ptr); + break; + case 8: + value_change = expect_value._8 != READ_ONCE(*(const u64 *)ptr); + break; + default: + break; /* ignore; we do not diff the values */ + } + + /* Check if this access raced with another. */ + if (!remove_watchpoint(watchpoint)) { + /* + * No need to increment 'data_races' counter, as the racing + * thread already did. + */ + kcsan_report(ptr, size, is_write, size > 8 || value_change, + smp_processor_id(), KCSAN_REPORT_RACE_SIGNAL); + } else if (value_change) { + /* Inferring a race, since the value should not have changed. */ + kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN); + if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN)) + kcsan_report(ptr, size, is_write, true, + smp_processor_id(), + KCSAN_REPORT_RACE_UNKNOWN_ORIGIN); + } + + kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); +out_unlock: + local_irq_restore(irq_flags); +out: + user_access_restore(ua_flags); +} + +static __always_inline void check_access(const volatile void *ptr, size_t size, + int type) +{ + const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0; + atomic_long_t *watchpoint; + long encoded_watchpoint; + + /* + * Avoid user_access_save in fast-path: find_watchpoint is safe without + * user_access_save, as the address that ptr points to is only used to + * check if a watchpoint exists; ptr is never dereferenced. + */ + watchpoint = find_watchpoint((unsigned long)ptr, size, !is_write, + &encoded_watchpoint); + /* + * It is safe to check kcsan_is_enabled() after find_watchpoint in the + * slow-path, as long as no state changes that cause a data race to be + * detected and reported have occurred until kcsan_is_enabled() is + * checked. + */ + + if (unlikely(watchpoint != NULL)) + kcsan_found_watchpoint(ptr, size, is_write, watchpoint, + encoded_watchpoint); + else if (unlikely(should_watch(ptr, type))) + kcsan_setup_watchpoint(ptr, size, is_write); +} + +/* === Public interface ===================================================== */ + +void __init kcsan_init(void) +{ + BUG_ON(!in_task()); + + kcsan_debugfs_init(); + + /* + * We are in the init task, and no other tasks should be running; + * WRITE_ONCE without memory barrier is sufficient. 
+ */ + if (IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE)) + WRITE_ONCE(kcsan_enabled, true); +} + +/* === Exported interface =================================================== */ + +void kcsan_disable_current(void) +{ + ++get_ctx()->disable_count; +} +EXPORT_SYMBOL(kcsan_disable_current); + +void kcsan_enable_current(void) +{ + if (get_ctx()->disable_count-- == 0) { + /* + * Warn if kcsan_enable_current() calls are unbalanced with + * kcsan_disable_current() calls, which causes disable_count to + * become negative and should not happen. + */ + kcsan_disable_current(); /* restore to 0, KCSAN still enabled */ + kcsan_disable_current(); /* disable to generate warning */ + WARN(1, "Unbalanced %s()", __func__); + kcsan_enable_current(); + } +} +EXPORT_SYMBOL(kcsan_enable_current); + +void kcsan_nestable_atomic_begin(void) +{ + /* + * Do *not* check and warn if we are in a flat atomic region: nestable + * and flat atomic regions are independent from each other. + * See include/linux/kcsan.h: struct kcsan_ctx comments for more + * comments. + */ + + ++get_ctx()->atomic_nest_count; +} +EXPORT_SYMBOL(kcsan_nestable_atomic_begin); + +void kcsan_nestable_atomic_end(void) +{ + if (get_ctx()->atomic_nest_count-- == 0) { + /* + * Warn if kcsan_nestable_atomic_end() calls are unbalanced with + * kcsan_nestable_atomic_begin() calls, which causes + * atomic_nest_count to become negative and should not happen. + */ + kcsan_nestable_atomic_begin(); /* restore to 0 */ + kcsan_disable_current(); /* disable to generate warning */ + WARN(1, "Unbalanced %s()", __func__); + kcsan_enable_current(); + } +} +EXPORT_SYMBOL(kcsan_nestable_atomic_end); + +void kcsan_flat_atomic_begin(void) +{ + get_ctx()->in_flat_atomic = true; +} +EXPORT_SYMBOL(kcsan_flat_atomic_begin); + +void kcsan_flat_atomic_end(void) +{ + get_ctx()->in_flat_atomic = false; +} +EXPORT_SYMBOL(kcsan_flat_atomic_end); + +void kcsan_atomic_next(int n) +{ + get_ctx()->atomic_next = n; +} +EXPORT_SYMBOL(kcsan_atomic_next); + +void __kcsan_check_access(const volatile void *ptr, size_t size, int type) +{ + check_access(ptr, size, type); +} +EXPORT_SYMBOL(__kcsan_check_access); + +/* + * KCSAN uses the same instrumentation that is emitted by supported compilers + * for ThreadSanitizer (TSAN). + * + * When enabled, the compiler emits instrumentation calls (the functions + * prefixed with "__tsan" below) for all loads and stores that it generated; + * inline asm is not instrumented. + * + * Note that, not all supported compiler versions distinguish aligned/unaligned + * accesses, but e.g. recent versions of Clang do. We simply alias the unaligned + * version to the generic version, which can handle both. 
+ */ + +#define DEFINE_TSAN_READ_WRITE(size) \ + void __tsan_read##size(void *ptr) \ + { \ + check_access(ptr, size, 0); \ + } \ + EXPORT_SYMBOL(__tsan_read##size); \ + void __tsan_unaligned_read##size(void *ptr) \ + __alias(__tsan_read##size); \ + EXPORT_SYMBOL(__tsan_unaligned_read##size); \ + void __tsan_write##size(void *ptr) \ + { \ + check_access(ptr, size, KCSAN_ACCESS_WRITE); \ + } \ + EXPORT_SYMBOL(__tsan_write##size); \ + void __tsan_unaligned_write##size(void *ptr) \ + __alias(__tsan_write##size); \ + EXPORT_SYMBOL(__tsan_unaligned_write##size) + +DEFINE_TSAN_READ_WRITE(1); +DEFINE_TSAN_READ_WRITE(2); +DEFINE_TSAN_READ_WRITE(4); +DEFINE_TSAN_READ_WRITE(8); +DEFINE_TSAN_READ_WRITE(16); + +void __tsan_read_range(void *ptr, size_t size) +{ + check_access(ptr, size, 0); +} +EXPORT_SYMBOL(__tsan_read_range); + +void __tsan_write_range(void *ptr, size_t size) +{ + check_access(ptr, size, KCSAN_ACCESS_WRITE); +} +EXPORT_SYMBOL(__tsan_write_range); + +/* + * The below are not required by KCSAN, but can still be emitted by the + * compiler. + */ +void __tsan_func_entry(void *call_pc) +{ +} +EXPORT_SYMBOL(__tsan_func_entry); +void __tsan_func_exit(void) +{ +} +EXPORT_SYMBOL(__tsan_func_exit); +void __tsan_init(void) +{ +} +EXPORT_SYMBOL(__tsan_init); diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c new file mode 100644 index 000000000000..bec42dab32ee --- /dev/null +++ b/kernel/kcsan/debugfs.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/atomic.h> +#include <linux/bsearch.h> +#include <linux/bug.h> +#include <linux/debugfs.h> +#include <linux/init.h> +#include <linux/kallsyms.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/sort.h> +#include <linux/string.h> +#include <linux/uaccess.h> + +#include "kcsan.h" + +/* + * Statistics counters. + */ +static atomic_long_t counters[KCSAN_COUNTER_COUNT]; + +/* + * Addresses for filtering functions from reporting. This list can be used as a + * whitelist or blacklist. + */ +static struct { + unsigned long *addrs; /* array of addresses */ + size_t size; /* current size */ + int used; /* number of elements used */ + bool sorted; /* if elements are sorted */ + bool whitelist; /* if list is a blacklist or whitelist */ +} report_filterlist = { + .addrs = NULL, + .size = 8, /* small initial size */ + .used = 0, + .sorted = false, + .whitelist = false, /* default is blacklist */ +}; +static DEFINE_SPINLOCK(report_filterlist_lock); + +static const char *counter_to_name(enum kcsan_counter_id id) +{ + switch (id) { + case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints"; + case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints"; + case KCSAN_COUNTER_DATA_RACES: return "data_races"; + case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity"; + case KCSAN_COUNTER_REPORT_RACES: return "report_races"; + case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin"; + case KCSAN_COUNTER_UNENCODABLE_ACCESSES: return "unencodable_accesses"; + case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: return "encoding_false_positives"; + case KCSAN_COUNTER_COUNT: + BUG(); + } + return NULL; +} + +void kcsan_counter_inc(enum kcsan_counter_id id) +{ + atomic_long_inc(&counters[id]); +} + +void kcsan_counter_dec(enum kcsan_counter_id id) +{ + atomic_long_dec(&counters[id]); +} + +/* + * The microbenchmark allows benchmarking KCSAN core runtime only. To run + * multiple threads, pipe 'microbench=<iters>' from multiple tasks into the + * debugfs file. 
+ */ +static void microbenchmark(unsigned long iters) +{ + cycles_t cycles; + + pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); + + cycles = get_cycles(); + while (iters--) { + /* + * We can run this benchmark from multiple tasks; this address + * calculation increases likelyhood of some accesses overlapping + * (they still won't conflict because all are reads). + */ + unsigned long addr = + iters % (CONFIG_KCSAN_NUM_WATCHPOINTS * PAGE_SIZE); + __kcsan_check_read((void *)addr, sizeof(long)); + } + cycles = get_cycles() - cycles; + + pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); +} + +static int cmp_filterlist_addrs(const void *rhs, const void *lhs) +{ + const unsigned long a = *(const unsigned long *)rhs; + const unsigned long b = *(const unsigned long *)lhs; + + return a < b ? -1 : a == b ? 0 : 1; +} + +bool kcsan_skip_report_debugfs(unsigned long func_addr) +{ + unsigned long symbolsize, offset; + unsigned long flags; + bool ret = false; + + if (!kallsyms_lookup_size_offset(func_addr, &symbolsize, &offset)) + return false; + func_addr -= offset; /* Get function start */ + + spin_lock_irqsave(&report_filterlist_lock, flags); + if (report_filterlist.used == 0) + goto out; + + /* Sort array if it is unsorted, and then do a binary search. */ + if (!report_filterlist.sorted) { + sort(report_filterlist.addrs, report_filterlist.used, + sizeof(unsigned long), cmp_filterlist_addrs, NULL); + report_filterlist.sorted = true; + } + ret = !!bsearch(&func_addr, report_filterlist.addrs, + report_filterlist.used, sizeof(unsigned long), + cmp_filterlist_addrs); + if (report_filterlist.whitelist) + ret = !ret; + +out: + spin_unlock_irqrestore(&report_filterlist_lock, flags); + return ret; +} + +static void set_report_filterlist_whitelist(bool whitelist) +{ + unsigned long flags; + + spin_lock_irqsave(&report_filterlist_lock, flags); + report_filterlist.whitelist = whitelist; + spin_unlock_irqrestore(&report_filterlist_lock, flags); +} + +/* Returns 0 on success, error-code otherwise. */ +static ssize_t insert_report_filterlist(const char *func) +{ + unsigned long flags; + unsigned long addr = kallsyms_lookup_name(func); + ssize_t ret = 0; + + if (!addr) { + pr_err("KCSAN: could not find function: '%s'\n", func); + return -ENOENT; + } + + spin_lock_irqsave(&report_filterlist_lock, flags); + + if (report_filterlist.addrs == NULL) { + /* initial allocation */ + report_filterlist.addrs = + kmalloc_array(report_filterlist.size, + sizeof(unsigned long), GFP_KERNEL); + if (report_filterlist.addrs == NULL) { + ret = -ENOMEM; + goto out; + } + } else if (report_filterlist.used == report_filterlist.size) { + /* resize filterlist */ + size_t new_size = report_filterlist.size * 2; + unsigned long *new_addrs = + krealloc(report_filterlist.addrs, + new_size * sizeof(unsigned long), GFP_KERNEL); + + if (new_addrs == NULL) { + /* leave filterlist itself untouched */ + ret = -ENOMEM; + goto out; + } + + report_filterlist.size = new_size; + report_filterlist.addrs = new_addrs; + } + + /* Note: deduplicating should be done in userspace. 
*/ + report_filterlist.addrs[report_filterlist.used++] = + kallsyms_lookup_name(func); + report_filterlist.sorted = false; + +out: + spin_unlock_irqrestore(&report_filterlist_lock, flags); + + return ret; +} + +static int show_info(struct seq_file *file, void *v) +{ + int i; + unsigned long flags; + + /* show stats */ + seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled)); + for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) + seq_printf(file, "%s: %ld\n", counter_to_name(i), + atomic_long_read(&counters[i])); + + /* show filter functions, and filter type */ + spin_lock_irqsave(&report_filterlist_lock, flags); + seq_printf(file, "\n%s functions: %s\n", + report_filterlist.whitelist ? "whitelisted" : "blacklisted", + report_filterlist.used == 0 ? "none" : ""); + for (i = 0; i < report_filterlist.used; ++i) + seq_printf(file, " %ps\n", (void *)report_filterlist.addrs[i]); + spin_unlock_irqrestore(&report_filterlist_lock, flags); + + return 0; +} + +static int debugfs_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_info, NULL); +} + +static ssize_t +debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *off) +{ + char kbuf[KSYM_NAME_LEN]; + char *arg; + int read_len = count < (sizeof(kbuf) - 1) ? count : (sizeof(kbuf) - 1); + + if (copy_from_user(kbuf, buf, read_len)) + return -EFAULT; + kbuf[read_len] = '\0'; + arg = strstrip(kbuf); + + if (!strcmp(arg, "on")) { + WRITE_ONCE(kcsan_enabled, true); + } else if (!strcmp(arg, "off")) { + WRITE_ONCE(kcsan_enabled, false); + } else if (!strncmp(arg, "microbench=", sizeof("microbench=") - 1)) { + unsigned long iters; + + if (kstrtoul(&arg[sizeof("microbench=") - 1], 0, &iters)) + return -EINVAL; + microbenchmark(iters); + } else if (!strcmp(arg, "whitelist")) { + set_report_filterlist_whitelist(true); + } else if (!strcmp(arg, "blacklist")) { + set_report_filterlist_whitelist(false); + } else if (arg[0] == '!') { + ssize_t ret = insert_report_filterlist(&arg[1]); + + if (ret < 0) + return ret; + } else { + return -EINVAL; + } + + return count; +} + +static const struct file_operations debugfs_ops = +{ + .read = seq_read, + .open = debugfs_open, + .write = debugfs_write, + .release = single_release +}; + +void __init kcsan_debugfs_init(void) +{ + debugfs_create_file("kcsan", 0644, NULL, NULL, &debugfs_ops); +} diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h new file mode 100644 index 000000000000..b63890e86449 --- /dev/null +++ b/kernel/kcsan/encoding.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_KCSAN_ENCODING_H +#define _KERNEL_KCSAN_ENCODING_H + +#include <linux/bits.h> +#include <linux/log2.h> +#include <linux/mm.h> + +#include "kcsan.h" + +#define SLOT_RANGE PAGE_SIZE + +#define INVALID_WATCHPOINT 0 +#define CONSUMED_WATCHPOINT 1 + +/* + * The maximum useful size of accesses for which we set up watchpoints is the + * max range of slots we check on an access. + */ +#define MAX_ENCODABLE_SIZE (SLOT_RANGE * (1 + KCSAN_CHECK_ADJACENT)) + +/* + * Number of bits we use to store size info. + */ +#define WATCHPOINT_SIZE_BITS bits_per(MAX_ENCODABLE_SIZE) +/* + * This encoding for addresses discards the upper (1 for is-write + SIZE_BITS); + * however, most 64-bit architectures do not use the full 64-bit address space. + * Also, in order for a false positive to be observable 2 things need to happen: + * + * 1. different addresses but with the same encoded address race; + * 2. 
and both map onto the same watchpoint slots; + * + * Both these are assumed to be very unlikely. However, in case it still happens + * happens, the report logic will filter out the false positive (see report.c). + */ +#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS) + +/* + * Masks to set/retrieve the encoded data. + */ +#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG-1) +#define WATCHPOINT_SIZE_MASK \ + GENMASK(BITS_PER_LONG-2, BITS_PER_LONG-2 - WATCHPOINT_SIZE_BITS) +#define WATCHPOINT_ADDR_MASK \ + GENMASK(BITS_PER_LONG-3 - WATCHPOINT_SIZE_BITS, 0) + +static inline bool check_encodable(unsigned long addr, size_t size) +{ + return size <= MAX_ENCODABLE_SIZE; +} + +static inline long +encode_watchpoint(unsigned long addr, size_t size, bool is_write) +{ + return (long)((is_write ? WATCHPOINT_WRITE_MASK : 0) | + (size << WATCHPOINT_ADDR_BITS) | + (addr & WATCHPOINT_ADDR_MASK)); +} + +static inline bool decode_watchpoint(long watchpoint, + unsigned long *addr_masked, + size_t *size, + bool *is_write) +{ + if (watchpoint == INVALID_WATCHPOINT || + watchpoint == CONSUMED_WATCHPOINT) + return false; + + *addr_masked = (unsigned long)watchpoint & WATCHPOINT_ADDR_MASK; + *size = ((unsigned long)watchpoint & WATCHPOINT_SIZE_MASK) >> WATCHPOINT_ADDR_BITS; + *is_write = !!((unsigned long)watchpoint & WATCHPOINT_WRITE_MASK); + + return true; +} + +/* + * Return watchpoint slot for an address. + */ +static inline int watchpoint_slot(unsigned long addr) +{ + return (addr / PAGE_SIZE) % CONFIG_KCSAN_NUM_WATCHPOINTS; +} + +static inline bool matching_access(unsigned long addr1, size_t size1, + unsigned long addr2, size_t size2) +{ + unsigned long end_range1 = addr1 + size1 - 1; + unsigned long end_range2 = addr2 + size2 - 1; + + return addr1 <= end_range2 && addr2 <= end_range1; +} + +#endif /* _KERNEL_KCSAN_ENCODING_H */ diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h new file mode 100644 index 000000000000..d3b9a96ac8a4 --- /dev/null +++ b/kernel/kcsan/kcsan.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * The Kernel Concurrency Sanitizer (KCSAN) infrastructure. For more info please + * see Documentation/dev-tools/kcsan.rst. + */ + +#ifndef _KERNEL_KCSAN_KCSAN_H +#define _KERNEL_KCSAN_KCSAN_H + +#include <linux/kcsan.h> + +/* The number of adjacent watchpoints to check. */ +#define KCSAN_CHECK_ADJACENT 1 + +/* + * Globally enable and disable KCSAN. + */ +extern bool kcsan_enabled; + +/* + * Initialize debugfs file. + */ +void kcsan_debugfs_init(void); + +enum kcsan_counter_id { + /* + * Number of watchpoints currently in use. + */ + KCSAN_COUNTER_USED_WATCHPOINTS, + + /* + * Total number of watchpoints set up. + */ + KCSAN_COUNTER_SETUP_WATCHPOINTS, + + /* + * Total number of data races. + */ + KCSAN_COUNTER_DATA_RACES, + + /* + * Number of times no watchpoints were available. + */ + KCSAN_COUNTER_NO_CAPACITY, + + /* + * A thread checking a watchpoint raced with another checking thread; + * only one will be reported. + */ + KCSAN_COUNTER_REPORT_RACES, + + /* + * Observed data value change, but writer thread unknown. + */ + KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN, + + /* + * The access cannot be encoded to a valid watchpoint. + */ + KCSAN_COUNTER_UNENCODABLE_ACCESSES, + + /* + * Watchpoint encoding caused a watchpoint to fire on mismatching + * accesses. + */ + KCSAN_COUNTER_ENCODING_FALSE_POSITIVES, + + KCSAN_COUNTER_COUNT, /* number of counters */ +}; + +/* + * Increment/decrement counter with given id; avoid calling these in fast-path. 
+ */ +extern void kcsan_counter_inc(enum kcsan_counter_id id); +extern void kcsan_counter_dec(enum kcsan_counter_id id); + +/* + * Returns true if data races in the function symbol that maps to func_addr + * (offsets are ignored) should *not* be reported. + */ +extern bool kcsan_skip_report_debugfs(unsigned long func_addr); + +enum kcsan_report_type { + /* + * The thread that set up the watchpoint and briefly stalled was + * signalled that another thread triggered the watchpoint. + */ + KCSAN_REPORT_RACE_SIGNAL, + + /* + * A thread found and consumed a matching watchpoint. + */ + KCSAN_REPORT_CONSUMED_WATCHPOINT, + + /* + * No other thread was observed to race with the access, but the data + * value before and after the stall differs. + */ + KCSAN_REPORT_RACE_UNKNOWN_ORIGIN, +}; + +/* + * Print a race report from thread that encountered the race. + */ +extern void kcsan_report(const volatile void *ptr, size_t size, bool is_write, + bool value_change, int cpu_id, enum kcsan_report_type type); + +#endif /* _KERNEL_KCSAN_KCSAN_H */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c new file mode 100644 index 000000000000..0eea05a3135b --- /dev/null +++ b/kernel/kcsan/report.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/kernel.h> +#include <linux/preempt.h> +#include <linux/printk.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/stacktrace.h> + +#include "kcsan.h" +#include "encoding.h" + +/* + * Max. number of stack entries to show in the report. + */ +#define NUM_STACK_ENTRIES 64 + +/* + * Other thread info: communicated from other racing thread to thread that set + * up the watchpoint, which then prints the complete report atomically. Only + * need one struct, as all threads should to be serialized regardless to print + * the reports, with reporting being in the slow-path. + */ +static struct { + const volatile void *ptr; + size_t size; + bool is_write; + int task_pid; + int cpu_id; + unsigned long stack_entries[NUM_STACK_ENTRIES]; + int num_stack_entries; +} other_info = { .ptr = NULL }; + +/* + * This spinlock protects reporting and other_info, since other_info is usually + * required when reporting. + */ +static DEFINE_SPINLOCK(report_lock); + +/* + * Special rules to skip reporting. + */ +static bool +skip_report(bool is_write, bool value_change, unsigned long top_frame) +{ + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && is_write && + !value_change) { + /* + * The access is a write, but the data value did not change. + * + * We opt-out of this filter for certain functions at request of + * maintainers. + */ + char buf[64]; + + snprintf(buf, sizeof(buf), "%ps", (void *)top_frame); + if (!strnstr(buf, "rcu_", sizeof(buf)) && + !strnstr(buf, "_rcu", sizeof(buf)) && + !strnstr(buf, "_srcu", sizeof(buf))) + return true; + } + + return kcsan_skip_report_debugfs(top_frame); +} + +static inline const char *get_access_type(bool is_write) +{ + return is_write ? "write" : "read"; +} + +/* Return thread description: in task or interrupt. */ +static const char *get_thread_desc(int task_id) +{ + if (task_id != -1) { + static char buf[32]; /* safe: protected by report_lock */ + + snprintf(buf, sizeof(buf), "task %i", task_id); + return buf; + } + return "interrupt"; +} + +/* Helper to skip KCSAN-related functions in stack-trace. 
*/ +static int get_stack_skipnr(unsigned long stack_entries[], int num_entries) +{ + char buf[64]; + int skip = 0; + + for (; skip < num_entries; ++skip) { + snprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skip]); + if (!strnstr(buf, "csan_", sizeof(buf)) && + !strnstr(buf, "tsan_", sizeof(buf)) && + !strnstr(buf, "_once_size", sizeof(buf))) { + break; + } + } + return skip; +} + +/* Compares symbolized strings of addr1 and addr2. */ +static int sym_strcmp(void *addr1, void *addr2) +{ + char buf1[64]; + char buf2[64]; + + snprintf(buf1, sizeof(buf1), "%pS", addr1); + snprintf(buf2, sizeof(buf2), "%pS", addr2); + + return strncmp(buf1, buf2, sizeof(buf1)); +} + +/* + * Returns true if a report was generated, false otherwise. + */ +static bool print_report(const volatile void *ptr, size_t size, bool is_write, + bool value_change, int cpu_id, + enum kcsan_report_type type) +{ + unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 }; + int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1); + int skipnr = get_stack_skipnr(stack_entries, num_stack_entries); + int other_skipnr; + + /* + * Must check report filter rules before starting to print. + */ + if (skip_report(is_write, true, stack_entries[skipnr])) + return false; + + if (type == KCSAN_REPORT_RACE_SIGNAL) { + other_skipnr = get_stack_skipnr(other_info.stack_entries, + other_info.num_stack_entries); + + /* @value_change is only known for the other thread */ + if (skip_report(other_info.is_write, value_change, + other_info.stack_entries[other_skipnr])) + return false; + } + + /* Print report header. */ + pr_err("==================================================================\n"); + switch (type) { + case KCSAN_REPORT_RACE_SIGNAL: { + void *this_fn = (void *)stack_entries[skipnr]; + void *other_fn = (void *)other_info.stack_entries[other_skipnr]; + int cmp; + + /* + * Order functions lexographically for consistent bug titles. + * Do not print offset of functions to keep title short. + */ + cmp = sym_strcmp(other_fn, this_fn); + pr_err("BUG: KCSAN: data-race in %ps / %ps\n", + cmp < 0 ? other_fn : this_fn, + cmp < 0 ? this_fn : other_fn); + } break; + + case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: + pr_err("BUG: KCSAN: data-race in %pS\n", + (void *)stack_entries[skipnr]); + break; + + default: + BUG(); + } + + pr_err("\n"); + + /* Print information about the racing accesses. */ + switch (type) { + case KCSAN_REPORT_RACE_SIGNAL: + pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", + get_access_type(other_info.is_write), other_info.ptr, + other_info.size, get_thread_desc(other_info.task_pid), + other_info.cpu_id); + + /* Print the other thread's stack trace. */ + stack_trace_print(other_info.stack_entries + other_skipnr, + other_info.num_stack_entries - other_skipnr, + 0); + + pr_err("\n"); + pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", + get_access_type(is_write), ptr, size, + get_thread_desc(in_task() ? task_pid_nr(current) : -1), + cpu_id); + break; + + case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: + pr_err("race at unknown origin, with %s to 0x%px of %zu bytes by %s on cpu %i:\n", + get_access_type(is_write), ptr, size, + get_thread_desc(in_task() ? task_pid_nr(current) : -1), + cpu_id); + break; + + default: + BUG(); + } + /* Print stack trace of this thread. */ + stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, + 0); + + /* Print report footer. 
*/ + pr_err("\n"); + pr_err("Reported by Kernel Concurrency Sanitizer on:\n"); + dump_stack_print_info(KERN_DEFAULT); + pr_err("==================================================================\n"); + + return true; +} + +static void release_report(unsigned long *flags, enum kcsan_report_type type) +{ + if (type == KCSAN_REPORT_RACE_SIGNAL) + other_info.ptr = NULL; /* mark for reuse */ + + spin_unlock_irqrestore(&report_lock, *flags); +} + +/* + * Depending on the report type either sets other_info and returns false, or + * acquires the matching other_info and returns true. If other_info is not + * required for the report type, simply acquires report_lock and returns true. + */ +static bool prepare_report(unsigned long *flags, const volatile void *ptr, + size_t size, bool is_write, int cpu_id, + enum kcsan_report_type type) +{ + if (type != KCSAN_REPORT_CONSUMED_WATCHPOINT && + type != KCSAN_REPORT_RACE_SIGNAL) { + /* other_info not required; just acquire report_lock */ + spin_lock_irqsave(&report_lock, *flags); + return true; + } + +retry: + spin_lock_irqsave(&report_lock, *flags); + + switch (type) { + case KCSAN_REPORT_CONSUMED_WATCHPOINT: + if (other_info.ptr != NULL) + break; /* still in use, retry */ + + other_info.ptr = ptr; + other_info.size = size; + other_info.is_write = is_write; + other_info.task_pid = in_task() ? task_pid_nr(current) : -1; + other_info.cpu_id = cpu_id; + other_info.num_stack_entries = stack_trace_save(other_info.stack_entries, NUM_STACK_ENTRIES, 1); + + spin_unlock_irqrestore(&report_lock, *flags); + + /* + * The other thread will print the summary; other_info may now + * be consumed. + */ + return false; + + case KCSAN_REPORT_RACE_SIGNAL: + if (other_info.ptr == NULL) + break; /* no data available yet, retry */ + + /* + * First check if this is the other_info we are expecting, i.e. + * matches based on how watchpoint was encoded. + */ + if (!matching_access((unsigned long)other_info.ptr & + WATCHPOINT_ADDR_MASK, + other_info.size, + (unsigned long)ptr & WATCHPOINT_ADDR_MASK, + size)) + break; /* mismatching watchpoint, retry */ + + if (!matching_access((unsigned long)other_info.ptr, + other_info.size, (unsigned long)ptr, + size)) { + /* + * If the actual accesses to not match, this was a false + * positive due to watchpoint encoding. + */ + kcsan_counter_inc( + KCSAN_COUNTER_ENCODING_FALSE_POSITIVES); + + /* discard this other_info */ + release_report(flags, KCSAN_REPORT_RACE_SIGNAL); + return false; + } + + /* + * Matching & usable access in other_info: keep other_info_lock + * locked, as this thread consumes it to print the full report; + * unlocked in release_report. 
+ */ + return true; + + default: + BUG(); + } + + spin_unlock_irqrestore(&report_lock, *flags); + + goto retry; +} + +void kcsan_report(const volatile void *ptr, size_t size, bool is_write, + bool value_change, int cpu_id, enum kcsan_report_type type) +{ + unsigned long flags = 0; + + kcsan_disable_current(); + if (prepare_report(&flags, ptr, size, is_write, cpu_id, type)) { + if (print_report(ptr, size, is_write, value_change, cpu_id, type) && panic_on_warn) + panic("panic_on_warn set ...\n"); + + release_report(&flags, type); + } + kcsan_enable_current(); +} diff --git a/kernel/kcsan/test.c b/kernel/kcsan/test.c new file mode 100644 index 000000000000..cc6000239dc0 --- /dev/null +++ b/kernel/kcsan/test.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/printk.h> +#include <linux/random.h> +#include <linux/types.h> + +#include "encoding.h" + +#define ITERS_PER_TEST 2000 + +/* Test requirements. */ +static bool test_requires(void) +{ + /* random should be initialized for the below tests */ + return prandom_u32() + prandom_u32() != 0; +} + +/* + * Test watchpoint encode and decode: check that encoding some access's info, + * and then subsequent decode preserves the access's info. + */ +static bool test_encode_decode(void) +{ + int i; + + for (i = 0; i < ITERS_PER_TEST; ++i) { + size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1; + bool is_write = !!prandom_u32_max(2); + unsigned long addr; + + prandom_bytes(&addr, sizeof(addr)); + if (WARN_ON(!check_encodable(addr, size))) + return false; + + /* Encode and decode */ + { + const long encoded_watchpoint = + encode_watchpoint(addr, size, is_write); + unsigned long verif_masked_addr; + size_t verif_size; + bool verif_is_write; + + /* Check special watchpoints */ + if (WARN_ON(decode_watchpoint( + INVALID_WATCHPOINT, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + if (WARN_ON(decode_watchpoint( + CONSUMED_WATCHPOINT, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + + /* Check decoding watchpoint returns same data */ + if (WARN_ON(!decode_watchpoint( + encoded_watchpoint, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + if (WARN_ON(verif_masked_addr != + (addr & WATCHPOINT_ADDR_MASK))) + goto fail; + if (WARN_ON(verif_size != size)) + goto fail; + if (WARN_ON(is_write != verif_is_write)) + goto fail; + + continue; +fail: + pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n", + __func__, is_write ? "write" : "read", size, + addr, encoded_watchpoint, + verif_is_write ? "write" : "read", verif_size, + verif_masked_addr); + return false; + } + } + + return true; +} + +/* Test access matching function. 
*/ +static bool test_matching_access(void) +{ + if (WARN_ON(!matching_access(10, 1, 10, 1))) + return false; + if (WARN_ON(!matching_access(10, 2, 11, 1))) + return false; + if (WARN_ON(!matching_access(10, 1, 9, 2))) + return false; + if (WARN_ON(matching_access(10, 1, 11, 1))) + return false; + if (WARN_ON(matching_access(9, 1, 10, 1))) + return false; + return true; +} + +static int __init kcsan_selftest(void) +{ + int passed = 0; + int total = 0; + +#define RUN_TEST(do_test) \ + do { \ + ++total; \ + if (do_test()) \ + ++passed; \ + else \ + pr_err("KCSAN selftest: " #do_test " failed"); \ + } while (0) + + RUN_TEST(test_requires); + RUN_TEST(test_encode_decode); + RUN_TEST(test_matching_access); + + pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total); + if (passed != total) + panic("KCSAN selftests failed"); + return 0; +} +postcore_initcall(kcsan_selftest); diff --git a/kernel/module.c b/kernel/module.c index 3a486f826224..cb6250be6ee9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2031,49 +2031,6 @@ static void module_enable_nx(const struct module *mod) frob_writable_data(&mod->init_layout, set_memory_nx); } -/* Iterate through all modules and set each module's text as RW */ -void set_all_modules_text_rw(void) -{ - struct module *mod; - - if (!rodata_enabled) - return; - - mutex_lock(&module_mutex); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - - frob_text(&mod->core_layout, set_memory_rw); - frob_text(&mod->init_layout, set_memory_rw); - } - mutex_unlock(&module_mutex); -} - -/* Iterate through all modules and set each module's text as RO */ -void set_all_modules_text_ro(void) -{ - struct module *mod; - - if (!rodata_enabled) - return; - - mutex_lock(&module_mutex); - list_for_each_entry_rcu(mod, &modules, list) { - /* - * Ignore going modules since it's possible that ro - * protection has already been disabled, otherwise we'll - * run into protection faults at module deallocation. - */ - if (mod->state == MODULE_STATE_UNFORMED || - mod->state == MODULE_STATE_GOING) - continue; - - frob_text(&mod->core_layout, set_memory_ro); - frob_text(&mod->init_layout, set_memory_ro); - } - mutex_unlock(&module_mutex); -} #else /* !CONFIG_STRICT_MODULE_RWX */ static void module_enable_nx(const struct module *mod) { } #endif /* CONFIG_STRICT_MODULE_RWX */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c8be5a0f5259..1ef6f75d92f1 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2961,7 +2961,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { .func = wake_up_klogd_work_func, - .flags = IRQ_WORK_LAZY, + .flags = ATOMIC_INIT(IRQ_WORK_LAZY), }; void wake_up_klogd(void) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 21fb5a5662b5..5fc9c9b70862 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -7,6 +7,12 @@ endif # that is not a function of syscall inputs. E.g. involuntary context switches. KCOV_INSTRUMENT := n +# There are numerous data races here, however, most of them are due to plain accesses. +# This would make it even harder for syzbot to find reproducers, because these +# bugs trigger without specific input. Disable by default, but should re-enable +# eventually. +KCSAN_SANITIZE := n + ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is # needed for x86 only. 
Why this used to be enabled for all architectures is beyond diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9e20873148c6..8de90ea31280 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -966,7 +966,8 @@ static int enqueue_hrtimer(struct hrtimer *timer, base->cpu_base->active_bases |= 1 << base->index; - timer->state = HRTIMER_STATE_ENQUEUED; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } @@ -988,7 +989,8 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_cpu_base *cpu_base = base->cpu_base; u8 state = timer->state; - timer->state = newstate; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, newstate); if (!(state & HRTIMER_STATE_ENQUEUED)) return; @@ -1013,8 +1015,9 @@ static void __remove_hrtimer(struct hrtimer *timer, static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { - if (hrtimer_is_queued(timer)) { - u8 state = timer->state; + u8 state = timer->state; + + if (state & HRTIMER_STATE_ENQUEUED) { int reprogram; /* diff --git a/kernel/time/time.c b/kernel/time/time.c index 58e312e7380f..082568f610f3 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -179,7 +179,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz return error; if (tz) { - /* Verify we're witin the +-15 hrs range */ + /* Verify we're within the +-15 hrs range */ if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) return -EINVAL; @@ -548,18 +548,21 @@ EXPORT_SYMBOL(set_normalized_timespec64); */ struct timespec64 ns_to_timespec64(const s64 nsec) { - struct timespec64 ts; + struct timespec64 ts = { 0, 0 }; s32 rem; - if (!nsec) - return (struct timespec64) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; + if (likely(nsec > 0)) { + ts.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + ts.tv_nsec = rem; + } else if (nsec < 0) { + /* + * With negative times, tv_sec points to the earlier + * second, and tv_nsec counts the nanoseconds since + * then, so tv_nsec is always a positive number. 
+ */ + ts.tv_sec = -div_u64_rem(-nsec - 1, NSEC_PER_SEC, &rem) - 1; + ts.tv_nsec = NSEC_PER_SEC - rem - 1; } - ts.tv_nsec = rem; return ts; } @@ -879,7 +882,7 @@ int get_timespec64(struct timespec64 *ts, ts->tv_sec = kts.tv_sec; /* Zero out the padding for 32 bit systems or in compat mode */ - if (in_compat_syscall()) + if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) kts.tv_nsec &= 0xFFFFFFFFUL; ts->tv_nsec = kts.tv_nsec; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ffc91d4935ac..e5ef4ae9edb5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -739,7 +739,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig) return -EINVAL; work = this_cpu_ptr(&send_signal_work); - if (work->irq_work.flags & IRQ_WORK_BUSY) + if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) return -EBUSY; /* Add the current task, which is the target of sending signal, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ca7fccafbcbb..7372e87f7f75 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -52,6 +52,9 @@ enum trace_type { #undef __field #define __field(type, item) type item; +#undef __field_fn +#define __field_fn(type, item) type item; + #undef __field_struct #define __field_struct(type, item) __field(type, item) @@ -71,26 +74,22 @@ enum trace_type { #define F_STRUCT(args...) args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ struct struct_name { \ struct trace_entry ent; \ tstruct \ } #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ - filter, regfn) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) #undef FTRACE_ENTRY_PACKED -#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \ - filter) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) __packed +#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) __packed #include "trace_entries.h" @@ -1916,17 +1915,15 @@ extern void tracing_log_err(struct trace_array *tr, #define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ extern struct trace_event_call \ __aligned(4) event_##call; #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ - FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #undef FTRACE_ENTRY_PACKED -#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \ - FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) +#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 
fc8e97328e54..3e9d81608284 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -61,15 +61,13 @@ FTRACE_ENTRY_REG(function, ftrace_entry, TRACE_FN, F_STRUCT( - __field( unsigned long, ip ) - __field( unsigned long, parent_ip ) + __field_fn( unsigned long, ip ) + __field_fn( unsigned long, parent_ip ) ), F_printk(" %ps <-- %ps", (void *)__entry->ip, (void *)__entry->parent_ip), - FILTER_TRACE_FN, - perf_ftrace_event_register ); @@ -84,9 +82,7 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, __field_desc( int, graph_ent, depth ) ), - F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth), - - FILTER_OTHER + F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth) ); /* Function return entry */ @@ -97,18 +93,16 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) __field_desc( unsigned long, ret, func ) + __field_desc( unsigned long, ret, overrun ) __field_desc( unsigned long long, ret, calltime) __field_desc( unsigned long long, ret, rettime ) - __field_desc( unsigned long, ret, overrun ) __field_desc( int, ret, depth ) ), F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, - __entry->depth), - - FILTER_OTHER + __entry->depth) ); /* @@ -137,9 +131,7 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry, F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu), - - FILTER_OTHER + __entry->next_cpu) ); /* @@ -157,9 +149,7 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu), - - FILTER_OTHER + __entry->next_cpu) ); /* @@ -183,9 +173,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry, (void *)__entry->caller[0], (void *)__entry->caller[1], (void *)__entry->caller[2], (void *)__entry->caller[3], (void *)__entry->caller[4], (void *)__entry->caller[5], - (void *)__entry->caller[6], (void *)__entry->caller[7]), - - FILTER_OTHER + (void *)__entry->caller[6], (void *)__entry->caller[7]) ); FTRACE_ENTRY(user_stack, userstack_entry, @@ -203,9 +191,7 @@ FTRACE_ENTRY(user_stack, userstack_entry, (void *)__entry->caller[0], (void *)__entry->caller[1], (void *)__entry->caller[2], (void *)__entry->caller[3], (void *)__entry->caller[4], (void *)__entry->caller[5], - (void *)__entry->caller[6], (void *)__entry->caller[7]), - - FILTER_OTHER + (void *)__entry->caller[6], (void *)__entry->caller[7]) ); /* @@ -222,9 +208,7 @@ FTRACE_ENTRY(bprint, bprint_entry, ), F_printk("%ps: %s", - (void *)__entry->ip, __entry->fmt), - - FILTER_OTHER + (void *)__entry->ip, __entry->fmt) ); FTRACE_ENTRY_REG(print, print_entry, @@ -239,8 +223,6 @@ FTRACE_ENTRY_REG(print, print_entry, F_printk("%ps: %s", (void *)__entry->ip, __entry->buf), - FILTER_OTHER, - ftrace_event_register ); @@ -254,9 +236,7 @@ FTRACE_ENTRY(raw_data, raw_data_entry, ), F_printk("id:%04x %08x", - __entry->id, (int)__entry->buf[0]), - - FILTER_OTHER + __entry->id, (int)__entry->buf[0]) ); FTRACE_ENTRY(bputs, bputs_entry, @@ -269,9 +249,7 @@ FTRACE_ENTRY(bputs, bputs_entry, ), F_printk("%ps: %s", - (void *)__entry->ip, __entry->str), - - FILTER_OTHER + (void *)__entry->ip, __entry->str) ); FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, @@ -283,16 +261,14 @@ 
FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, __field_desc( resource_size_t, rw, phys ) __field_desc( unsigned long, rw, value ) __field_desc( unsigned long, rw, pc ) - __field_desc( int, rw, map_id ) + __field_desc( int, rw, map_id ) __field_desc( unsigned char, rw, opcode ) __field_desc( unsigned char, rw, width ) ), F_printk("%lx %lx %lx %d %x %x", (unsigned long)__entry->phys, __entry->value, __entry->pc, - __entry->map_id, __entry->opcode, __entry->width), - - FILTER_OTHER + __entry->map_id, __entry->opcode, __entry->width) ); FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, @@ -304,15 +280,13 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, __field_desc( resource_size_t, map, phys ) __field_desc( unsigned long, map, virt ) __field_desc( unsigned long, map, len ) - __field_desc( int, map, map_id ) + __field_desc( int, map, map_id ) __field_desc( unsigned char, map, opcode ) ), F_printk("%lx %lx %lx %d %x", (unsigned long)__entry->phys, __entry->virt, __entry->len, - __entry->map_id, __entry->opcode), - - FILTER_OTHER + __entry->map_id, __entry->opcode) ); @@ -334,9 +308,7 @@ FTRACE_ENTRY(branch, trace_branch, F_printk("%u:%s:%s (%u)%s", __entry->line, __entry->func, __entry->file, __entry->correct, - __entry->constant ? " CONSTANT" : ""), - - FILTER_OTHER + __entry->constant ? " CONSTANT" : "") ); @@ -362,7 +334,5 @@ FTRACE_ENTRY(hwlat, hwlat_entry, __entry->duration, __entry->outer_duration, __entry->nmi_total_ts, - __entry->nmi_count), - - FILTER_OTHER + __entry->nmi_count) ); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6b3a69e9aa6a..ae0f0bc30828 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -24,6 +24,7 @@ #include <linux/delay.h> #include <trace/events/sched.h> +#include <trace/syscall.h> #include <asm/setup.h> @@ -2017,7 +2018,24 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) */ head = trace_get_fields(call); if (list_empty(head)) { - ret = call->class->define_fields(call); + struct trace_event_fields *field = call->class->fields_array; + unsigned int offset = sizeof(struct trace_entry); + + for (; field->type; field++) { + if (field->type == TRACE_FUNCTION_TYPE) { + ret = field->define_fields(call); + break; + } + + offset = ALIGN(offset, field->align); + ret = trace_define_field(call, field->type, field->name, + offset, field->size, + field->is_signed, field->filter_type); + if (ret) + break; + + offset += field->size; + } if (ret < 0) { pr_warn("Could not initialize trace point events/%s\n", name); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f49d1a36d3ae..23b0195ac977 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1135,6 +1135,12 @@ static struct synth_event *find_synth_event(const char *name) return NULL; } +static struct trace_event_fields synth_event_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = synth_event_define_fields }, + {} +}; + static int register_synth_event(struct synth_event *event) { struct trace_event_call *call = &event->call; @@ -1156,7 +1162,7 @@ static int register_synth_event(struct synth_event *event) INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &synth_event_funcs; - call->class->define_fields = synth_event_define_fields; + call->class->fields_array = synth_event_fields_array; ret = register_trace_event(&call->event); if (!ret) { diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 2e6d2e9741cc..77ce5a3b6773 100644 --- 
a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -29,10 +29,8 @@ static int ftrace_event_register(struct trace_event_call *call, * function and thus become accesible via perf. */ #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ - filter, regfn) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) /* not needed for this file */ #undef __field_struct @@ -41,6 +39,9 @@ static int ftrace_event_register(struct trace_event_call *call, #undef __field #define __field(type, item) type item; +#undef __field_fn +#define __field_fn(type, item) type item; + #undef __field_desc #define __field_desc(type, container, item) type item; @@ -60,7 +61,7 @@ static int ftrace_event_register(struct trace_event_call *call, #define F_printk(fmt, args...) fmt, args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ struct ____ftrace_##name { \ tstruct \ }; \ @@ -73,76 +74,46 @@ static void __always_unused ____ftrace_check_##name(void) \ } #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" +#undef __field_ext +#define __field_ext(_type, _item, _filter_type) { \ + .type = #_type, .name = #_item, \ + .size = sizeof(_type), .align = __alignof__(_type), \ + is_signed_type(_type), .filter_type = _filter_type }, + #undef __field -#define __field(type, item) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), filter_type); \ - if (ret) \ - return ret; +#define __field(_type, _item) __field_ext(_type, _item, FILTER_OTHER) + +#undef __field_fn +#define __field_fn(_type, _item) __field_ext(_type, _item, FILTER_TRACE_FN) #undef __field_desc -#define __field_desc(type, container, item) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), \ - container.item), \ - sizeof(field.container.item), \ - is_signed_type(type), filter_type); \ - if (ret) \ - return ret; +#define __field_desc(_type, _container, _item) __field_ext(_type, _item, FILTER_OTHER) #undef __array -#define __array(type, item, len) \ - do { \ - char *type_str = #type"["__stringify(len)"]"; \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = trace_define_field(event_call, type_str, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), filter_type); \ - if (ret) \ - return ret; \ - } while (0); +#define __array(_type, _item, _len) { \ + .type = #_type"["__stringify(_len)"]", .name = #_item, \ + .size = sizeof(_type[_len]), .align = __alignof__(_type), \ + is_signed_type(_type), .filter_type = FILTER_OTHER }, #undef __array_desc -#define __array_desc(type, container, item, len) \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ - offsetof(typeof(field), \ - container.item), \ - sizeof(field.container.item), \ - is_signed_type(type), filter_type); \ - if (ret) \ - return ret; +#define __array_desc(_type, _container, _item, _len) 
__array(_type, _item, _len) #undef __dynamic_array -#define __dynamic_array(type, item) \ - ret = trace_define_field(event_call, #type "[]", #item, \ - offsetof(typeof(field), item), \ - 0, is_signed_type(type), filter_type);\ - if (ret) \ - return ret; +#define __dynamic_array(_type, _item) { \ + .type = #_type "[]", .name = #_item, \ + .size = 0, .align = __alignof__(_type), \ + is_signed_type(_type), .filter_type = FILTER_OTHER }, #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ -static int __init \ -ftrace_define_fields_##name(struct trace_event_call *event_call) \ -{ \ - struct struct_name field; \ - int ret; \ - int filter_type = filter; \ - \ - tstruct; \ - \ - return ret; \ -} +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +static struct trace_event_fields ftrace_event_fields_##name[] = { \ + tstruct \ + {} }; #include "trace_entries.h" @@ -152,6 +123,9 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \ #undef __field #define __field(type, item) +#undef __field_fn +#define __field_fn(type, item) + #undef __field_desc #define __field_desc(type, container, item) @@ -168,12 +142,10 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \ #define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args) #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ - regfn) \ - \ +#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, regfn) \ static struct trace_event_class __refdata event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ - .define_fields = ftrace_define_fields_##call, \ + .fields_array = ftrace_event_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ .reg = regfn, \ }; \ @@ -191,9 +163,9 @@ static struct trace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \ +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ FTRACE_ENTRY_REG(call, struct_name, etype, \ - PARAMS(tstruct), PARAMS(print), filter, NULL) + PARAMS(tstruct), PARAMS(print), NULL) bool ftrace_event_is_function(struct trace_event_call *call) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7f890262c8a3..cbdc4f4e64c7 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1555,16 +1555,28 @@ static struct trace_event_functions kprobe_funcs = { .trace = print_kprobe_event }; +static struct trace_event_fields kretprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = kretprobe_event_define_fields }, + {} +}; + +static struct trace_event_fields kprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = kprobe_event_define_fields }, + {} +}; + static inline void init_trace_event_call(struct trace_kprobe *tk) { struct trace_event_call *call = trace_probe_event_call(&tk->tp); if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; - call->class->define_fields = kretprobe_event_define_fields; + call->class->fields_array = kretprobe_fields_array; } else { call->event.funcs = &kprobe_funcs; - call->class->define_fields = kprobe_event_define_fields; + call->class->fields_array = kprobe_fields_array; } call->flags = TRACE_EVENT_FL_KPROBE; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 16fa218556fa..73140d80dd46 100644 --- 
a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -203,11 +203,10 @@ print_syscall_exit(struct trace_iterator *iter, int flags, extern char *__bad_type_size(void); -#define SYSCALL_FIELD(type, field, name) \ - sizeof(type) != sizeof(trace.field) ? \ - __bad_type_size() : \ - #type, #name, offsetof(typeof(trace), field), \ - sizeof(trace.field), is_signed_type(type) +#define SYSCALL_FIELD(_type, _name) { \ + .type = #_type, .name = #_name, \ + .size = sizeof(_type), .align = __alignof__(_type), \ + .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER } static int __init __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) @@ -274,42 +273,22 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; - int ret; - int i; int offset = offsetof(typeof(trace), args); - - ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr), - FILTER_OTHER); - if (ret) - return ret; + int ret, i; for (i = 0; i < meta->nb_args; i++) { ret = trace_define_field(call, meta->types[i], meta->args[i], offset, sizeof(unsigned long), 0, FILTER_OTHER); + if (ret) + break; offset += sizeof(unsigned long); } return ret; } -static int __init syscall_exit_define_fields(struct trace_event_call *call) -{ - struct syscall_trace_exit trace; - int ret; - - ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr), - FILTER_OTHER); - if (ret) - return ret; - - ret = trace_define_field(call, SYSCALL_FIELD(long, ret, ret), - FILTER_OTHER); - - return ret; -} - static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { struct trace_array *tr = data; @@ -507,6 +486,13 @@ static int __init init_syscall_trace(struct trace_event_call *call) return id; } +static struct trace_event_fields __refdata syscall_enter_fields_array[] = { + SYSCALL_FIELD(int, __syscall_nr), + { .type = TRACE_FUNCTION_TYPE, + .define_fields = syscall_enter_define_fields }, + {} +}; + struct trace_event_functions enter_syscall_print_funcs = { .trace = print_syscall_enter, }; @@ -518,7 +504,7 @@ struct trace_event_functions exit_syscall_print_funcs = { struct trace_event_class __refdata event_class_syscall_enter = { .system = "syscalls", .reg = syscall_enter_register, - .define_fields = syscall_enter_define_fields, + .fields_array = syscall_enter_fields_array, .get_fields = syscall_get_enter_fields, .raw_init = init_syscall_trace, }; @@ -526,7 +512,11 @@ struct trace_event_class __refdata event_class_syscall_enter = { struct trace_event_class __refdata event_class_syscall_exit = { .system = "syscalls", .reg = syscall_exit_register, - .define_fields = syscall_exit_define_fields, + .fields_array = (struct trace_event_fields[]){ + SYSCALL_FIELD(int, __syscall_nr), + SYSCALL_FIELD(long, ret), + {} + }, .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), .raw_init = init_syscall_trace, }; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 352073d36585..476a382f1f1b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1507,12 +1507,17 @@ static struct trace_event_functions uprobe_funcs = { .trace = print_uprobe_event }; +static struct trace_event_fields uprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = uprobe_event_define_fields }, + {} +}; + static inline void init_trace_event_call(struct trace_uprobe *tu) { struct trace_event_call *call = trace_probe_event_call(&tu->tp); - call->event.funcs = 
&uprobe_funcs; - call->class->define_fields = uprobe_event_define_fields; + call->class->fields_array = uprobe_fields_array; call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY; call->class->reg = trace_uprobe_register;
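
The trace_* hunks above all apply one conversion: trace_event_class loses its define_fields() callback and instead points .fields_array at a NULL-terminated array of struct trace_event_fields, which event_create_dir() walks, calling trace_define_field() for each entry; a TRACE_FUNCTION_TYPE entry keeps a callback for events whose layout is only known at runtime (syscall arguments, kprobes, uprobes, synthetic events). Below is a minimal sketch of the new style, assuming the post-conversion API introduced by these hunks; the sample_* names and the example record are hypothetical and not part of the patch.

/* Hypothetical illustration of the fields_array style used above. */
#include <linux/trace_events.h>

struct sample_entry {				/* assumed example record */
	struct trace_entry	ent;
	unsigned long		ip;
	int			value;
};

/* Static field layout; the empty entry terminates the array. */
static struct trace_event_fields sample_fields_array[] = {
	{ .type = "unsigned long", .name = "ip",
	  .size = sizeof(unsigned long), .align = __alignof__(unsigned long),
	  .is_signed = is_signed_type(unsigned long), .filter_type = FILTER_OTHER },
	{ .type = "int", .name = "value",
	  .size = sizeof(int), .align = __alignof__(int),
	  .is_signed = is_signed_type(int), .filter_type = FILTER_OTHER },
	{}
};

static struct trace_event_class sample_event_class = {
	.system		= "sample",
	.fields_array	= sample_fields_array,
	.fields		= LIST_HEAD_INIT(sample_event_class.fields),
};

Events that cannot describe their layout statically keep the old behaviour by placing a single { .type = TRACE_FUNCTION_TYPE, .define_fields = ... } entry in the array, exactly as kprobe_fields_array, uprobe_fields_array and syscall_enter_fields_array do in the hunks above.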