From f0d279654dea22b7a6ad34b9334aee80cda62cde Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 15 Aug 2014 16:06:06 -0400 Subject: percpu: fix pcpu_alloc_pages() failure path When pcpu_alloc_pages() fails midway, pcpu_free_pages() is invoked to free what has already been allocated. The invocation is across the whole requested range and pcpu_free_pages() will try to free all non-NULL pages; unfortunately, this is incorrect as pcpu_get_pages_and_bitmap(), unlike what its comment suggests, doesn't clear the pages array and thus the array may have entries from the previous invocations making the partial failure path free incorrect pages. Fix it by open-coding the partial freeing of the already allocated pages. Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- mm/percpu-vm.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 3707c71ae4cd..8d9bb2c00c68 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -108,7 +108,7 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, int page_start, int page_end) { const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; - unsigned int cpu; + unsigned int cpu, tcpu; int i; for_each_possible_cpu(cpu) { @@ -116,14 +116,23 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); - if (!*pagep) { - pcpu_free_pages(chunk, pages, populated, - page_start, page_end); - return -ENOMEM; - } + if (!*pagep) + goto err; } } return 0; + +err: + while (--i >= page_start) + __free_page(pages[pcpu_page_idx(cpu, i)]); + + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + for (i = page_start; i < page_end; i++) + __free_page(pages[pcpu_page_idx(tcpu, i)]); + } + return -ENOMEM; } /** -- cgit v1.2.1 From 849f5169097e1ba35b90ac9df76b5bb6f9c0aabd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 15 Aug 2014 16:06:10 -0400 Subject: percpu: perform tlb flush after pcpu_map_pages() failure If pcpu_map_pages() fails midway, it unmaps the already mapped pages. Currently, it doesn't flush tlb after the partial unmapping. This may be okay in most cases as the established mapping hasn't been used at that point but it can go wrong and when it goes wrong it'd be extremely difficult to track down. Flush tlb after the partial unmapping. Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- mm/percpu-vm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 8d9bb2c00c68..51108165f829 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -272,6 +272,7 @@ err: __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), page_end - page_start); } + pcpu_post_unmap_tlb_flush(chunk, page_start, page_end); return err; } -- cgit v1.2.1 From 3189eddbcafcc4d827f7f19facbeddec4424eba8 Mon Sep 17 00:00:00 2001 From: Honggang Li Date: Tue, 12 Aug 2014 21:36:15 +0800 Subject: percpu: free percpu allocation info for uniprocessor system Currently, only SMP system free the percpu allocation info. Uniprocessor system should free it too. For example, one x86 UML virtual machine with 256MB memory, UML kernel wastes one page memory. Signed-off-by: Honggang Li Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- mm/percpu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 2139e30a4b44..da997f9800bd 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1932,6 +1932,8 @@ void __init setup_per_cpu_areas(void) if (pcpu_setup_first_chunk(ai, fc) < 0) panic("Failed to initialize percpu areas."); + + pcpu_free_alloc_info(ai); } #endif /* CONFIG_SMP */ -- cgit v1.2.1 From ce00a967377baadf2481521e131771adc7652856 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 5 Sep 2014 08:43:57 -0400 Subject: mm: memcontrol: revert use of root_mem_cgroup res_counter Dave Hansen reports a massive scalability regression in an uncontained page fault benchmark with more than 30 concurrent threads, which he bisected down to 05b843012335 ("mm: memcontrol: use root_mem_cgroup res_counter") and pin-pointed on res_counter spinlock contention. That change relied on the per-cpu charge caches to mostly swallow the res_counter costs, but it's apparent that the caches don't scale yet. Revert memcg back to bypassing res_counters on the root level in order to restore performance for uncontained workloads. Reported-by: Dave Hansen Signed-off-by: Johannes Weiner Tested-by: Dave Hansen Acked-by: Michal Hocko Acked-by: Vladimir Davydov Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 103 ++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ec4dcf1b9562..085dc6d2f876 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2534,6 +2534,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long long size; int ret = 0; + if (mem_cgroup_is_root(memcg)) + goto done; retry: if (consume_stock(memcg, nr_pages)) goto done; @@ -2611,9 +2613,7 @@ nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; bypass: - memcg = root_mem_cgroup; - ret = -EINTR; - goto retry; + return -EINTR; done_restock: if (batch > nr_pages) @@ -2626,6 +2626,9 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { unsigned long bytes = nr_pages * PAGE_SIZE; + if (mem_cgroup_is_root(memcg)) + return; + res_counter_uncharge(&memcg->res, bytes); if (do_swap_account) res_counter_uncharge(&memcg->memsw, bytes); @@ -2640,6 +2643,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, { unsigned long bytes = nr_pages * PAGE_SIZE; + if (mem_cgroup_is_root(memcg)) + return; + res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); if (do_swap_account) res_counter_uncharge_until(&memcg->memsw, @@ -4093,6 +4099,46 @@ out: return retval; } +static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx) +{ + struct mem_cgroup *iter; + long val = 0; + + /* Per-cpu values can be negative, use a signed accumulator */ + for_each_mem_cgroup_tree(iter, memcg) + val += mem_cgroup_read_stat(iter, idx); + + if (val < 0) /* race ? */ + val = 0; + return val; +} + +static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) +{ + u64 val; + + if (!mem_cgroup_is_root(memcg)) { + if (!swap) + return res_counter_read_u64(&memcg->res, RES_USAGE); + else + return res_counter_read_u64(&memcg->memsw, RES_USAGE); + } + + /* + * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS + * as well as in MEM_CGROUP_STAT_RSS_HUGE. + */ + val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); + val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); + + if (swap) + val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); + + return val << PAGE_SHIFT; +} + + static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -4102,8 +4148,12 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, switch (type) { case _MEM: + if (name == RES_USAGE) + return mem_cgroup_usage(memcg, false); return res_counter_read_u64(&memcg->res, name); case _MEMSWAP: + if (name == RES_USAGE) + return mem_cgroup_usage(memcg, true); return res_counter_read_u64(&memcg->memsw, name); case _KMEM: return res_counter_read_u64(&memcg->kmem, name); @@ -4572,10 +4622,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) if (!t) goto unlock; - if (!swap) - usage = res_counter_read_u64(&memcg->res, RES_USAGE); - else - usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + usage = mem_cgroup_usage(memcg, swap); /* * current_threshold points to threshold just below or equal to usage. @@ -4673,10 +4720,10 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, if (type == _MEM) { thresholds = &memcg->thresholds; - usage = res_counter_read_u64(&memcg->res, RES_USAGE); + usage = mem_cgroup_usage(memcg, false); } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + usage = mem_cgroup_usage(memcg, true); } else BUG(); @@ -4762,10 +4809,10 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, if (type == _MEM) { thresholds = &memcg->thresholds; - usage = res_counter_read_u64(&memcg->res, RES_USAGE); + usage = mem_cgroup_usage(memcg, false); } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + usage = mem_cgroup_usage(memcg, true); } else BUG(); @@ -5525,9 +5572,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) * core guarantees its existence. */ } else { - res_counter_init(&memcg->res, &root_mem_cgroup->res); - res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); - res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); + res_counter_init(&memcg->res, NULL); + res_counter_init(&memcg->memsw, NULL); + res_counter_init(&memcg->kmem, NULL); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -5969,8 +6016,9 @@ static void __mem_cgroup_clear_mc(void) /* we must fixup refcnts and charges */ if (mc.moved_swap) { /* uncharge swap account from the old cgroup */ - res_counter_uncharge(&mc.from->memsw, - PAGE_SIZE * mc.moved_swap); + if (!mem_cgroup_is_root(mc.from)) + res_counter_uncharge(&mc.from->memsw, + PAGE_SIZE * mc.moved_swap); for (i = 0; i < mc.moved_swap; i++) css_put(&mc.from->css); @@ -5979,8 +6027,9 @@ static void __mem_cgroup_clear_mc(void) * we charged both to->res and to->memsw, so we should * uncharge to->res. */ - res_counter_uncharge(&mc.to->res, - PAGE_SIZE * mc.moved_swap); + if (!mem_cgroup_is_root(mc.to)) + res_counter_uncharge(&mc.to->res, + PAGE_SIZE * mc.moved_swap); /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } @@ -6345,7 +6394,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) rcu_read_lock(); memcg = mem_cgroup_lookup(id); if (memcg) { - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(memcg)) + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_swap_statistics(memcg, false); css_put(&memcg->css); } @@ -6509,12 +6559,15 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, { unsigned long flags; - if (nr_mem) - res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE); - if (nr_memsw) - res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE); - - memcg_oom_recover(memcg); + if (!mem_cgroup_is_root(memcg)) { + if (nr_mem) + res_counter_uncharge(&memcg->res, + nr_mem * PAGE_SIZE); + if (nr_memsw) + res_counter_uncharge(&memcg->memsw, + nr_memsw * PAGE_SIZE); + memcg_oom_recover(memcg); + } local_irq_save(flags); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); -- cgit v1.2.1 From 0a313a998adbae19c1309f80a3ad79107fff7c4e Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Tue, 9 Sep 2014 14:50:46 -0700 Subject: mem-hotplug: let memblock skip the hotpluggable memory regions in __next_mem_range() Let memblock skip the hotpluggable memory regions in __next_mem_range(), it is used to to prevent memblock from allocating hotpluggable memory for the kernel at early time. The code is the same as __next_mem_range_rev(). Clear hotpluggable flag before releasing free pages to the buddy allocator. If we don't clear hotpluggable flag in free_low_memory_core_early(), the memory which marked hotpluggable flag will not free to buddy allocator. Because __next_mem_range() will skip them. free_low_memory_core_early for_each_free_mem_range for_each_mem_range __next_mem_range [akpm@linux-foundation.org: fix warning] Signed-off-by: Xishi Qiu Cc: Tejun Heo Cc: Tang Chen Cc: Zhang Yanfei Cc: Wen Congyang Cc: "Rafael J. Wysocki" Cc: "H. Peter Anvin" Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 4 ++++ mm/nobootmem.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 70fad0c0dafb..6ecb0d937fb5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -816,6 +816,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, if (nid != NUMA_NO_NODE && nid != m_nid) continue; + /* skip hotpluggable memory regions if needed */ + if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 7ed58602e71b..7c7ab32ee503 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -119,6 +119,8 @@ static unsigned long __init free_low_memory_core_early(void) phys_addr_t start, end; u64 i; + memblock_clear_hotplug(0, -1); + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) count += __free_memory_core(start, end); -- cgit v1.2.1 From 8542bdfc6632b55aa1cf4fa255283c878b662499 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 9 Sep 2014 14:50:59 -0700 Subject: mm/mmap.c: use pr_emerg when printing BUG related information Make sure we actually see the output of validate_mm() and browse_rb() before triggering a BUG(). pr_info isn't shown by default so the reason for the BUG() isn't obvious. Signed-off-by: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index c1f2ea4a0b99..c0a3637cdb64 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -369,20 +369,20 @@ static int browse_rb(struct rb_root *root) struct vm_area_struct *vma; vma = rb_entry(nd, struct vm_area_struct, vm_rb); if (vma->vm_start < prev) { - pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev); + pr_emerg("vm_start %lx prev %lx\n", vma->vm_start, prev); bug = 1; } if (vma->vm_start < pend) { - pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend); + pr_emerg("vm_start %lx pend %lx\n", vma->vm_start, pend); bug = 1; } if (vma->vm_start > vma->vm_end) { - pr_info("vm_end %lx < vm_start %lx\n", + pr_emerg("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); bug = 1; } if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { - pr_info("free gap %lx, correct %lx\n", + pr_emerg("free gap %lx, correct %lx\n", vma->rb_subtree_gap, vma_compute_subtree_gap(vma)); bug = 1; @@ -396,7 +396,7 @@ static int browse_rb(struct rb_root *root) for (nd = pn; nd; nd = rb_prev(nd)) j++; if (i != j) { - pr_info("backwards %d, forwards %d\n", j, i); + pr_emerg("backwards %d, forwards %d\n", j, i); bug = 1; } return bug ? -1 : i; @@ -431,17 +431,17 @@ static void validate_mm(struct mm_struct *mm) i++; } if (i != mm->map_count) { - pr_info("map_count %d vm_next %d\n", mm->map_count, i); + pr_emerg("map_count %d vm_next %d\n", mm->map_count, i); bug = 1; } if (highest_address != mm->highest_vm_end) { - pr_info("mm->highest_vm_end %lx, found %lx\n", + pr_emerg("mm->highest_vm_end %lx, found %lx\n", mm->highest_vm_end, highest_address); bug = 1; } i = browse_rb(&mm->mm_rb); if (i != mm->map_count) { - pr_info("map_count %d rb %d\n", mm->map_count, i); + pr_emerg("map_count %d rb %d\n", mm->map_count, i); bug = 1; } BUG_ON(bug); -- cgit v1.2.1