diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/arraymap.c | 2 | ||||
-rw-r--r-- | kernel/bpf/devmap.c | 10 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 4 | ||||
-rw-r--r-- | kernel/bpf/sockmap.c | 28 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 65 | ||||
-rw-r--r-- | kernel/exit.c | 4 | ||||
-rw-r--r-- | kernel/irq/generic-chip.c | 15 | ||||
-rw-r--r-- | kernel/rcu/srcutree.c | 2 | ||||
-rw-r--r-- | kernel/rcu/sync.c | 9 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 18 | ||||
-rw-r--r-- | kernel/sched/membarrier.c | 34 |
11 files changed, 138 insertions, 53 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 98c0f00c3f5e..e2636737b69b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -98,7 +98,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); if (array_size >= U32_MAX - PAGE_SIZE || - elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) { + bpf_array_alloc_percpu(array)) { bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index e093d9a2c4dd..e745d6a88224 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -69,7 +69,7 @@ static LIST_HEAD(dev_map_list); static u64 dev_map_bitmap_size(const union bpf_attr *attr) { - return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); + return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); } static struct bpf_map *dev_map_alloc(union bpf_attr *attr) @@ -78,6 +78,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) int err = -EINVAL; u64 cost; + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) @@ -111,8 +114,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) err = -ENOMEM; /* A per cpu bitfield with a bit per possible net device */ - dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr), - __alignof__(unsigned long)); + dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), + __alignof__(unsigned long), + GFP_KERNEL | __GFP_NOWARN); if (!dtab->flush_needed) goto free_dtab; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 431126f31ea3..6533f08d1238 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -317,10 +317,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ goto free_htab; - if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE) - /* make sure the size for pcpu_alloc() is reasonable */ - goto free_htab; - htab->elem_size = sizeof(struct htab_elem) + round_up(htab->map.key_size, 8); if (percpu) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 6424ce0e4969..2b6eb35ae5d3 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -39,6 +39,7 @@ #include <linux/workqueue.h> #include <linux/list.h> #include <net/strparser.h> +#include <net/tcp.h> struct bpf_stab { struct bpf_map map; @@ -101,9 +102,16 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) return SK_DROP; skb_orphan(skb); + /* We need to ensure that BPF metadata for maps is also cleared + * when we orphan the skb so that we don't have the possibility + * to reference a stale map. + */ + TCP_SKB_CB(skb)->bpf.map = NULL; skb->sk = psock->sock; bpf_compute_data_end(skb); + preempt_disable(); rc = (*prog->bpf_func)(skb, prog->insnsi); + preempt_enable(); skb->sk = NULL; return rc; @@ -114,17 +122,10 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) struct sock *sk; int rc; - /* Because we use per cpu values to feed input from sock redirect - * in BPF program to do_sk_redirect_map() call we need to ensure we - * are not preempted. RCU read lock is not sufficient in this case - * with CONFIG_PREEMPT_RCU enabled so we must be explicit here. - */ - preempt_disable(); rc = smap_verdict_func(psock, skb); switch (rc) { case SK_REDIRECT: - sk = do_sk_redirect_map(); - preempt_enable(); + sk = do_sk_redirect_map(skb); if (likely(sk)) { struct smap_psock *peer = smap_psock_sk(sk); @@ -141,8 +142,6 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) /* Fall through and free skb otherwise */ case SK_DROP: default: - if (rc != SK_REDIRECT) - preempt_enable(); kfree_skb(skb); } } @@ -487,6 +486,9 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) int err = -EINVAL; u64 cost; + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) @@ -840,6 +842,12 @@ static int sock_map_update_elem(struct bpf_map *map, return -EINVAL; } + if (skops.sk->sk_type != SOCK_STREAM || + skops.sk->sk_protocol != IPPROTO_TCP) { + fput(socket->file); + return -EOPNOTSUPP; + } + err = sock_map_ctx_update_elem(&skops, map, key, flags); fput(socket->file); return err; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8b8d6ba39e23..c48ca2a34b5e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1116,7 +1116,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn /* ctx accesses must be at a fixed offset, so that we can * determine what type of data were returned. */ - if (!tnum_is_const(reg->var_off)) { + if (reg->off) { + verbose("dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n", + regno, reg->off, off - reg->off); + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -1124,7 +1129,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn tn_buf, off, size); return -EACCES; } - off += reg->var_off.value; err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a @@ -2426,12 +2430,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } static void find_good_pkt_pointers(struct bpf_verifier_state *state, - struct bpf_reg_state *dst_reg) + struct bpf_reg_state *dst_reg, + bool range_right_open) { struct bpf_reg_state *regs = state->regs, *reg; + u16 new_range; int i; - if (dst_reg->off < 0) + if (dst_reg->off < 0 || + (dst_reg->off == 0 && range_right_open)) /* This doesn't give us any range */ return; @@ -2442,9 +2449,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, */ return; - /* LLVM can generate four kind of checks: + new_range = dst_reg->off; + if (range_right_open) + new_range--; + + /* Examples for register markings: * - * Type 1/2: + * pkt_data in dst register: * * r2 = r3; * r2 += 8; @@ -2461,7 +2472,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, * r2=pkt(id=n,off=8,r=0) * r3=pkt(id=n,off=0,r=0) * - * Type 3/4: + * pkt_data in src register: * * r2 = r3; * r2 += 8; @@ -2479,7 +2490,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, * r3=pkt(id=n,off=0,r=0) * * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) - * so that range of bytes [r3, r3 + 8) is safe to access. + * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8) + * and [r3, r3 + 8-1) respectively is safe to access depending on + * the check. */ /* If our ids match, then we must have the same max_value. And we @@ -2490,14 +2503,14 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, for (i = 0; i < MAX_BPF_REG; i++) if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) /* keep the maximum range already checked */ - regs[i].range = max_t(u16, regs[i].range, dst_reg->off); + regs[i].range = max(regs[i].range, new_range); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) - reg->range = max_t(u16, reg->range, dst_reg->off); + reg->range = max(reg->range, new_range); } } @@ -2861,19 +2874,43 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(this_branch, dst_reg); + /* pkt_data' > pkt_end */ + find_good_pkt_pointers(this_branch, dst_reg, false); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET_END && + regs[insn->src_reg].type == PTR_TO_PACKET) { + /* pkt_end > pkt_data' */ + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], true); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(other_branch, dst_reg); + /* pkt_data' < pkt_end */ + find_good_pkt_pointers(other_branch, dst_reg, true); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && + dst_reg->type == PTR_TO_PACKET_END && + regs[insn->src_reg].type == PTR_TO_PACKET) { + /* pkt_end < pkt_data' */ + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], false); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && + dst_reg->type == PTR_TO_PACKET && + regs[insn->src_reg].type == PTR_TO_PACKET_END) { + /* pkt_data' >= pkt_end */ + find_good_pkt_pointers(this_branch, dst_reg, true); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(other_branch, ®s[insn->src_reg]); + /* pkt_end >= pkt_data' */ + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], false); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && + dst_reg->type == PTR_TO_PACKET && + regs[insn->src_reg].type == PTR_TO_PACKET_END) { + /* pkt_data' <= pkt_end */ + find_good_pkt_pointers(other_branch, dst_reg, false); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(this_branch, ®s[insn->src_reg]); + /* pkt_end <= pkt_data' */ + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], true); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; diff --git a/kernel/exit.c b/kernel/exit.c index cf28528842bc..f6cad39f35df 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1611,7 +1611,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, return err; if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop))) - goto Efault; + return -EFAULT; user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1739,7 +1739,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, return err; if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop))) - goto Efault; + return -EFAULT; user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 5270a54b9fa4..c26c5bb6b491 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -135,17 +135,26 @@ void irq_gc_ack_clr_bit(struct irq_data *d) } /** - * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt + * irq_gc_mask_disable_and_ack_set - Mask and ack pending interrupt * @d: irq_data + * + * This generic implementation of the irq_mask_ack method is for chips + * with separate enable/disable registers instead of a single mask + * register and where a pending interrupt is acknowledged by setting a + * bit. + * + * Note: This is the only permutation currently used. Similar generic + * functions should be added here if other permutations are required. */ -void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) +void irq_gc_mask_disable_and_ack_set(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(gc, mask, ct->regs.mask); + irq_reg_writel(gc, mask, ct->regs.disable); + *ct->mask_cache &= ~mask; irq_reg_writel(gc, mask, ct->regs.ack); irq_gc_unlock(gc); } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 729a8706751d..6d5880089ff6 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, /** * call_srcu() - Queue a callback for invocation after an SRCU grace period * @sp: srcu_struct in queue the callback - * @head: structure to be used for queueing the SRCU callback. + * @rhp: structure to be used for queueing the SRCU callback. * @func: function to be invoked after the SRCU grace period * * The callback function will be invoked some time after a full SRCU diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 50d1861f7759..3f943efcf61c 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -85,6 +85,9 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) } /** + * rcu_sync_enter_start - Force readers onto slow path for multiple updates + * @rsp: Pointer to rcu_sync structure to use for synchronization + * * Must be called after rcu_sync_init() and before first use. * * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() @@ -142,7 +145,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) /** * rcu_sync_func() - Callback function managing reader access to fastpath - * @rsp: Pointer to rcu_sync structure to use for synchronization + * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization * * This function is passed to one of the call_rcu() functions by * rcu_sync_exit(), so that it is invoked after a grace period following the @@ -158,9 +161,9 @@ void rcu_sync_enter(struct rcu_sync *rsp) * rcu_sync_exit(). Otherwise, set all state back to idle so that readers * can again use their fastpaths. */ -static void rcu_sync_func(struct rcu_head *rcu) +static void rcu_sync_func(struct rcu_head *rhp) { - struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head); + struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); unsigned long flags; BUG_ON(rsp->gp_state != GP_PASSED); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b0ad62b0e7b8..3e3650e94ae6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3097,9 +3097,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, * read-side critical sections have completed. call_rcu_sched() assumes * that the read-side critical sections end on enabling of preemption * or on voluntary preemption. - * RCU read-side critical sections are delimited by : - * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR - * - anything that disables preemption. + * RCU read-side critical sections are delimited by: + * + * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR + * - anything that disables preemption. * * These may be nested. * @@ -3124,11 +3125,12 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); * handler. This means that read-side critical sections in process * context must not be interrupted by softirqs. This interface is to be * used when most of the read-side critical sections are in softirq context. - * RCU read-side critical sections are delimited by : - * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context. - * OR - * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. - * These may be nested. + * RCU read-side critical sections are delimited by: + * + * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR + * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. + * + * These may be nested. * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index a92fddc22747..dd7908743dab 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -18,6 +18,7 @@ #include <linux/membarrier.h> #include <linux/tick.h> #include <linux/cpumask.h> +#include <linux/atomic.h> #include "sched.h" /* for cpu_rq(). */ @@ -26,21 +27,26 @@ * except MEMBARRIER_CMD_QUERY. */ #define MEMBARRIER_CMD_BITMASK \ - (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED) + (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) static void ipi_mb(void *info) { smp_mb(); /* IPIs should be serializing but paranoid. */ } -static void membarrier_private_expedited(void) +static int membarrier_private_expedited(void) { int cpu; bool fallback = false; cpumask_var_t tmpmask; + if (!(atomic_read(¤t->mm->membarrier_state) + & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) + return -EPERM; + if (num_online_cpus() == 1) - return; + return 0; /* * Matches memory barriers around rq->curr modification in @@ -94,6 +100,24 @@ static void membarrier_private_expedited(void) * rq->curr modification in scheduler. */ smp_mb(); /* exit from system call is not a mb */ + return 0; +} + +static void membarrier_register_private_expedited(void) +{ + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + + /* + * We need to consider threads belonging to different thread + * groups, which use the same mm. (CLONE_VM but not + * CLONE_THREAD). + */ + if (atomic_read(&mm->membarrier_state) + & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY) + return; + atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, + &mm->membarrier_state); } /** @@ -144,7 +168,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) synchronize_sched(); return 0; case MEMBARRIER_CMD_PRIVATE_EXPEDITED: - membarrier_private_expedited(); + return membarrier_private_expedited(); + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: + membarrier_register_private_expedited(); return 0; default: return -EINVAL; |