diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cpu.c | 2 | ||||
-rw-r--r-- | kernel/exit.c | 2 | ||||
-rw-r--r-- | kernel/fork.c | 51 | ||||
-rw-r--r-- | kernel/hung_task.c | 44 | ||||
-rw-r--r-- | kernel/kthread.c | 21 | ||||
-rw-r--r-- | kernel/pid_namespace.c | 3 | ||||
-rw-r--r-- | kernel/sched/core.c | 35 | ||||
-rw-r--r-- | kernel/sched/sched.h | 4 | ||||
-rw-r--r-- | kernel/sys.c | 7 | ||||
-rw-r--r-- | kernel/sysctl.c | 8 |
10 files changed, 144 insertions, 33 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c index 804b847912dc..79882ce1f2b5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -603,7 +603,7 @@ static int finish_cpu(unsigned int cpu) */ if (mm != &init_mm) idle->active_mm = &init_mm; - mmdrop(mm); + mmdrop_lazy_tlb(mm); return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index 9a89e7f36acb..3e9ec041a4e5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -475,7 +475,7 @@ static void exit_mm(void) __set_current_state(TASK_RUNNING); mmap_read_lock(mm); } - mmgrab(mm); + mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); diff --git a/kernel/fork.c b/kernel/fork.c index e8b41e212110..af408d06e1fb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -674,6 +674,53 @@ static void check_mm(struct mm_struct *mm) #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) +static void do_shoot_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + + if (current->active_mm == mm) { + WARN_ON_ONCE(current->mm); + current->active_mm = &init_mm; + switch_mm(mm, &init_mm, current); + } +} + +static void do_check_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + + WARN_ON_ONCE(current->active_mm == mm); +} + +static void shoot_lazy_tlbs(struct mm_struct *mm) +{ + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) { + /* + * IPI overheads have not found to be expensive, but they could + * be reduced in a number of possible ways, for example (in + * roughly increasing order of complexity): + * - A batch of mms requiring IPIs could be gathered and freed + * at once. + * - CPUs could store their active mm somewhere that can be + * remotely checked without a lock, to filter out + * false-positives in the cpumask. + * - After mm_users or mm_count reaches zero, switching away + * from the mm could clear mm_cpumask to reduce some IPIs + * (some batching or delaying would help). + * - A delayed freeing and RCU-like quiescing sequence based on + * mm switching to avoid IPIs completely. + */ + on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1); + if (IS_ENABLED(CONFIG_DEBUG_VM)) + on_each_cpu(do_check_lazy_tlb, (void *)mm, 1); + } else { + /* + * In this case, lazy tlb mms are refounted and would not reach + * __mmdrop until all CPUs have switched away and mmdrop()ed. + */ + } +} + /* * Called when the last reference to the mm * is dropped: either by a lazy thread or by @@ -683,6 +730,10 @@ void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); + + /* Ensure no CPUs are using this as their lazy tlb mm */ + shoot_lazy_tlbs(mm); + WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9888e2bc8c76..8cc07e7f29aa 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -145,6 +145,47 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) touch_nmi_watchdog(); } +static void check_killed_task(struct task_struct *t, unsigned long timeout) +{ + unsigned long stamp = t->killed_time; + + /* + * Ensure the task is not frozen. + * Also, skip vfork and any other user process that freezer should skip. + */ + if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) + return; + /* + * Skip threads which are already inside do_exit(), for exit_mm() etc. + * might take many seconds. + */ + if (t->flags & PF_EXITING) + return; + if (!stamp) { + stamp = jiffies; + if (!stamp) + stamp++; + t->killed_time = stamp; + return; + } + if (time_is_after_jiffies(stamp + timeout * HZ)) + return; + trace_sched_process_hang(t); + if (sysctl_hung_task_panic) { + console_verbose(); + hung_task_call_panic = true; + } + /* + * This thread failed to terminate for more than + * sysctl_hung_task_timeout_secs seconds, complain: + */ + pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n", + t->comm, t->pid, (jiffies - stamp) / HZ); + sched_show_task(t); + hung_task_show_lock = true; + touch_nmi_watchdog(); +} + /* * To avoid extending the RCU grace period for an unbounded amount of time, * periodically exit the critical section and enter a new one. @@ -196,6 +237,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; last_break = jiffies; } + /* Check threads which are about to terminate. */ + if (unlikely(fatal_signal_pending(t))) + check_killed_task(t, timeout); /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE) check_hung_task(t, timeout); diff --git a/kernel/kthread.c b/kernel/kthread.c index 5b37a8567168..83ed75d531b4 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1350,14 +1350,19 @@ void kthread_use_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); + /* + * It's possible that tsk->active_mm == mm here, but we must + * still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), because lazy + * mm may not have its own refcount (see mmgrab/drop_lazy_tlb()). + */ + mmgrab(mm); + task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; - if (active_mm != mm) { - mmgrab(mm); + if (active_mm != mm) tsk->active_mm = mm; - } tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); @@ -1374,12 +1379,9 @@ void kthread_use_mm(struct mm_struct *mm) * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by - * mmdrop(), or explicitly with smp_mb(). + * mmdrop_lazy_tlb(). */ - if (active_mm != mm) - mmdrop(active_mm); - else - smp_mb(); + mmdrop_lazy_tlb(active_mm); to_kthread(tsk)->oldfs = force_uaccess_begin(); } @@ -1411,10 +1413,13 @@ void kthread_unuse_mm(struct mm_struct *mm) local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); + mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); + + mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ca43239a255a..cb5a25a8a0cc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -51,7 +51,8 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) mutex_lock(&pid_caches_mutex); /* Name collision forces to do allocation under mutex. */ if (!*pkc) - *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0); + *pkc = kmem_cache_create(name, len, 0, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0); mutex_unlock(&pid_caches_mutex); /* current can fail, but someone else can succeed. */ return READ_ONCE(*pkc); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 67fd8faa7a7f..014ce8b385c7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4517,7 +4517,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) __releases(rq->lock) { struct rq *rq = this_rq(); - struct mm_struct *mm = rq->prev_mm; + struct mm_struct *mm = NULL; long prev_state; /* @@ -4536,7 +4536,10 @@ static struct rq *finish_task_switch(struct task_struct *prev) current->comm, current->pid, preempt_count())) preempt_count_set(FORK_PREEMPT_COUNT); - rq->prev_mm = NULL; +#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT + mm = rq->prev_lazy_mm; + rq->prev_lazy_mm = NULL; +#endif /* * A task struct has one reference for the use as "current". @@ -4576,13 +4579,14 @@ static struct rq *finish_task_switch(struct task_struct *prev) * rq->curr, before returning to userspace, so provide them here: * * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly - * provided by mmdrop(), + * provided by mmdrop_lazy_tlb(), * - a sync_core for SYNC_CORE. */ if (mm) { membarrier_mm_sync_core_before_usermode(mm); - mmdrop(mm); + mmdrop_lazy_tlb(mm); } + if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); @@ -4645,9 +4649,9 @@ context_switch(struct rq *rq, struct task_struct *prev, /* * kernel -> kernel lazy + transfer active - * user -> kernel lazy + mmgrab() active + * user -> kernel lazy + mmgrab_lazy_tlb() active * - * kernel -> user switch + mmdrop() active + * kernel -> user switch + mmdrop_lazy_tlb() active * user -> user switch */ if (!next->mm) { // to kernel @@ -4655,7 +4659,7 @@ context_switch(struct rq *rq, struct task_struct *prev, next->active_mm = prev->active_mm; if (prev->mm) // from user - mmgrab(prev->active_mm); + mmgrab_lazy_tlb(prev->active_mm); else prev->active_mm = NULL; } else { // to user @@ -4671,9 +4675,20 @@ context_switch(struct rq *rq, struct task_struct *prev, switch_mm_irqs_off(prev->active_mm, next->mm, next); if (!prev->mm) { // from kernel - /* will mmdrop() in finish_task_switch(). */ - rq->prev_mm = prev->active_mm; +#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT + /* Will mmdrop_lazy_tlb() in finish_task_switch(). */ + rq->prev_lazy_mm = prev->active_mm; prev->active_mm = NULL; +#else + /* + * Without MMU_LAZY_TLB_REFCOUNT there is no lazy + * tracking (because no rq->prev_lazy_mm) in + * finish_task_switch, so no mmdrop_lazy_tlb(), so no + * memory barrier for membarrier (see the membarrier + * comment in finish_task_switch()). Do it here. + */ + smp_mb(); +#endif } } @@ -9070,7 +9085,7 @@ void __init sched_init(void) /* * The boot idle thread does lazy MMU switching as well: */ - mmgrab(&init_mm); + mmgrab_lazy_tlb(&init_mm); enter_lazy_tlb(&init_mm, current); /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a2c6f6ae6392..0fa583db5c4a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -967,7 +967,9 @@ struct rq { struct task_struct *idle; struct task_struct *stop; unsigned long next_balance; - struct mm_struct *prev_mm; +#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT + struct mm_struct *prev_lazy_mm; +#endif unsigned int clock_update_flags; u64 clock; diff --git a/kernel/sys.c b/kernel/sys.c index ef1a78f5d71c..6ec50924b517 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1960,13 +1960,6 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) error = -EINVAL; /* - * @brk should be after @end_data in traditional maps. - */ - if (prctl_map->start_brk <= prctl_map->end_data || - prctl_map->brk <= prctl_map->end_data) - goto out; - - /* * Neither we should allow to override limits if they set. */ if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 272f4a272f8c..82d6ff6d85cd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3275,7 +3275,7 @@ static struct ctl_table fs_table[] = { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -3284,7 +3284,7 @@ static struct ctl_table fs_table[] = { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -3293,7 +3293,7 @@ static struct ctl_table fs_table[] = { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &two, @@ -3302,7 +3302,7 @@ static struct ctl_table fs_table[] = { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &two, |