diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2018-05-31 18:44:39 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2018-05-31 18:44:39 +1000 |
commit | 5cccae69f3e71096c73c4e545d17cf9e524027ca (patch) | |
tree | 47268c8493d82f11207c34692e3b24123df0f7bf /mm | |
parent | 6ce9696fcb9e612a9a4dc430a3ae4b8612f889aa (diff) | |
parent | 06a5af82ba137c5b451e0cc017a4cbe9a2c6f72b (diff) | |
download | linux-next-5cccae69f3e71096c73c4e545d17cf9e524027ca.tar.gz |
Merge branch 'akpm-current/current'
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 3 | ||||
-rw-r--r-- | mm/Makefile | 1 | ||||
-rw-r--r-- | mm/backing-dev.c | 39 | ||||
-rw-r--r-- | mm/debug.c | 4 | ||||
-rw-r--r-- | mm/filemap.c | 8 | ||||
-rw-r--r-- | mm/gup.c | 42 | ||||
-rw-r--r-- | mm/hmm.c | 297 | ||||
-rw-r--r-- | mm/huge_memory.c | 7 | ||||
-rw-r--r-- | mm/hugetlb.c | 62 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 6 | ||||
-rw-r--r-- | mm/init-mm.c | 1 | ||||
-rw-r--r-- | mm/ksm.c | 25 | ||||
-rw-r--r-- | mm/list_lru.c | 7 | ||||
-rw-r--r-- | mm/memblock.c | 27 | ||||
-rw-r--r-- | mm/memcontrol.c | 783 | ||||
-rw-r--r-- | mm/memfd.c | 345 | ||||
-rw-r--r-- | mm/memory.c | 40 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 23 | ||||
-rw-r--r-- | mm/mincore.c | 12 | ||||
-rw-r--r-- | mm/mmap.c | 4 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/oom_kill.c | 234 | ||||
-rw-r--r-- | mm/page_alloc.c | 74 | ||||
-rw-r--r-- | mm/page_counter.c | 100 | ||||
-rw-r--r-- | mm/page_owner.c | 4 | ||||
-rw-r--r-- | mm/shmem.c | 379 | ||||
-rw-r--r-- | mm/slab.c | 8 | ||||
-rw-r--r-- | mm/slab.h | 6 | ||||
-rw-r--r-- | mm/slab_common.c | 122 | ||||
-rw-r--r-- | mm/slob.c | 4 | ||||
-rw-r--r-- | mm/slub.c | 126 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 1 | ||||
-rw-r--r-- | mm/sparse.c | 35 | ||||
-rw-r--r-- | mm/swap_slots.c | 10 | ||||
-rw-r--r-- | mm/swap_state.c | 19 | ||||
-rw-r--r-- | mm/swapfile.c | 156 | ||||
-rw-r--r-- | mm/userfaultfd.c | 22 | ||||
-rw-r--r-- | mm/vmalloc.c | 41 | ||||
-rw-r--r-- | mm/vmpressure.c | 35 | ||||
-rw-r--r-- | mm/vmscan.c | 38 |
40 files changed, 1958 insertions, 1194 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 9dce9b77a91c..ce95491abd6a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -759,3 +759,6 @@ config GUP_BENCHMARK performance of get_user_pages_fast(). See tools/testing/selftests/vm/gup_benchmark.c + +config ARCH_HAS_PTE_SPECIAL + bool diff --git a/mm/Makefile b/mm/Makefile index b4e54a9ae9c5..8716bdabe1e6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -105,3 +105,4 @@ obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_HMM) += hmm.o +obj-$(CONFIG_MEMFD_CREATE) += memfd.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8fe3ebd6ac00..7215c0d5cb10 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -221,11 +221,46 @@ static ssize_t stable_pages_required_show(struct device *dev, } static DEVICE_ATTR_RO(stable_pages_required); +static ssize_t strictlimit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int val; + ssize_t ret; + + ret = kstrtouint(buf, 10, &val); + if (ret < 0) + return ret; + + switch (val) { + case 0: + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + break; + case 1: + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + break; + default: + return -EINVAL; + } + + return count; +} +static ssize_t strictlimit_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", + !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); +} +static DEVICE_ATTR_RW(strictlimit); + static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_max_ratio.attr, &dev_attr_stable_pages_required.attr, + &dev_attr_strictlimit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); @@ -557,7 +592,7 @@ static int cgwb_create(struct backing_dev_info *bdi, memcg = mem_cgroup_from_css(memcg_css); blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); blkcg = css_to_blkcg(blkcg_css); - memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + memcg_cgwb_list = &memcg->cgwb_list; blkcg_cgwb_list = &blkcg->cgwb_list; /* look up again under lock and discard on blkcg mismatch */ @@ -736,7 +771,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi) */ void wb_memcg_offline(struct mem_cgroup *memcg) { - struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + struct list_head *memcg_cgwb_list = &memcg->cgwb_list; struct bdi_writeback *wb, *next; spin_lock_irq(&cgwb_lock); diff --git a/mm/debug.c b/mm/debug.c index 56e2d9125ea5..3aed6102b8d0 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -116,7 +116,7 @@ void dump_mm(const struct mm_struct *mm) "ioctx_table %px\n" #endif #ifdef CONFIG_MEMCG - "owner %px " + "memcg %px " #endif "exe_file %px\n" #ifdef CONFIG_MMU_NOTIFIER @@ -147,7 +147,7 @@ void dump_mm(const struct mm_struct *mm) mm->ioctx_table, #endif #ifdef CONFIG_MEMCG - mm->owner, + mm->memcg, #endif mm->exe_file, #ifdef CONFIG_MMU_NOTIFIER diff --git a/mm/filemap.c b/mm/filemap.c index 0604cb02e6f3..52517f28e6f4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2489,7 +2489,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. */ -int filemap_fault(struct vm_fault *vmf) +vm_fault_t filemap_fault(struct vm_fault *vmf) { int error; struct file *file = vmf->vma->vm_file; @@ -2499,7 +2499,7 @@ int filemap_fault(struct vm_fault *vmf) pgoff_t offset = vmf->pgoff; pgoff_t max_off; struct page *page; - int ret = 0; + vm_fault_t ret = 0; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(offset >= max_off)) @@ -2693,11 +2693,11 @@ next: } EXPORT_SYMBOL(filemap_map_pages); -int filemap_page_mkwrite(struct vm_fault *vmf) +vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); - int ret = VM_FAULT_LOCKED; + vm_fault_t ret = VM_FAULT_LOCKED; sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); @@ -212,53 +212,69 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, unsigned int flags, unsigned int *page_mask) { - pmd_t *pmd; + pmd_t *pmd, pmdval; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; pmd = pmd_offset(pudp, address); - if (pmd_none(*pmd)) + /* + * The READ_ONCE() will stabilize the pmdval in a register or + * on the stack so that it will stop changing under the code. + */ + pmdval = READ_ONCE(*pmd); + if (pmd_none(pmdval)) return no_page_table(vma, flags); - if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { + if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) { page = follow_huge_pmd(mm, address, pmd, flags); if (page) return page; return no_page_table(vma, flags); } - if (is_hugepd(__hugepd(pmd_val(*pmd)))) { + if (is_hugepd(__hugepd(pmd_val(pmdval)))) { page = follow_huge_pd(vma, address, - __hugepd(pmd_val(*pmd)), flags, + __hugepd(pmd_val(pmdval)), flags, PMD_SHIFT); if (page) return page; return no_page_table(vma, flags); } retry: - if (!pmd_present(*pmd)) { + if (!pmd_present(pmdval)) { if (likely(!(flags & FOLL_MIGRATION))) return no_page_table(vma, flags); VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(*pmd)); - if (is_pmd_migration_entry(*pmd)) + !is_pmd_migration_entry(pmdval)); + if (is_pmd_migration_entry(pmdval)) pmd_migration_entry_wait(mm, pmd); + pmdval = READ_ONCE(*pmd); + /* + * MADV_DONTNEED may convert the pmd to null because + * mmap_sem is held in read mode + */ + if (pmd_none(pmdval)) + return no_page_table(vma, flags); goto retry; } - if (pmd_devmap(*pmd)) { + if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); spin_unlock(ptl); if (page) return page; } - if (likely(!pmd_trans_huge(*pmd))) + if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags); - if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) + if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) return no_page_table(vma, flags); retry_locked: ptl = pmd_lock(mm, pmd); + if (unlikely(pmd_none(*pmd))) { + spin_unlock(ptl); + return no_page_table(vma, flags); + } if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); if (likely(!(flags & FOLL_MIGRATION))) @@ -1354,7 +1370,7 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } } -#ifdef __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -1430,7 +1446,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, { return 0; } -#endif /* __HAVE_ARCH_PTE_SPECIAL */ +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, @@ -966,173 +966,6 @@ static void hmm_devmem_free(struct page *page, void *data) devmem->ops->free(devmem, page); } -static DEFINE_MUTEX(hmm_devmem_lock); -static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); - -static void hmm_devmem_radix_release(struct resource *resource) -{ - resource_size_t key, align_start, align_size; - - align_start = resource->start & ~(PA_SECTION_SIZE - 1); - align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); - - mutex_lock(&hmm_devmem_lock); - for (key = resource->start; - key <= resource->end; - key += PA_SECTION_SIZE) - radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT); - mutex_unlock(&hmm_devmem_lock); -} - -static void hmm_devmem_release(struct device *dev, void *data) -{ - struct hmm_devmem *devmem = data; - struct resource *resource = devmem->resource; - unsigned long start_pfn, npages; - struct zone *zone; - struct page *page; - - if (percpu_ref_tryget_live(&devmem->ref)) { - dev_WARN(dev, "%s: page mapping is still live!\n", __func__); - percpu_ref_put(&devmem->ref); - } - - /* pages are dead and unused, undo the arch mapping */ - start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT; - npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT; - - page = pfn_to_page(start_pfn); - zone = page_zone(page); - - mem_hotplug_begin(); - if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) - __remove_pages(zone, start_pfn, npages, NULL); - else - arch_remove_memory(start_pfn << PAGE_SHIFT, - npages << PAGE_SHIFT, NULL); - mem_hotplug_done(); - - hmm_devmem_radix_release(resource); -} - -static int hmm_devmem_pages_create(struct hmm_devmem *devmem) -{ - resource_size_t key, align_start, align_size, align_end; - struct device *device = devmem->device; - int ret, nid, is_ram; - unsigned long pfn; - - align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); - align_size = ALIGN(devmem->resource->start + - resource_size(devmem->resource), - PA_SECTION_SIZE) - align_start; - - is_ram = region_intersects(align_start, align_size, - IORESOURCE_SYSTEM_RAM, - IORES_DESC_NONE); - if (is_ram == REGION_MIXED) { - WARN_ONCE(1, "%s attempted on mixed region %pr\n", - __func__, devmem->resource); - return -ENXIO; - } - if (is_ram == REGION_INTERSECTS) - return -ENXIO; - - if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY) - devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; - else - devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; - - devmem->pagemap.res = *devmem->resource; - devmem->pagemap.page_fault = hmm_devmem_fault; - devmem->pagemap.page_free = hmm_devmem_free; - devmem->pagemap.dev = devmem->device; - devmem->pagemap.ref = &devmem->ref; - devmem->pagemap.data = devmem; - - mutex_lock(&hmm_devmem_lock); - align_end = align_start + align_size - 1; - for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) { - struct hmm_devmem *dup; - - dup = radix_tree_lookup(&hmm_devmem_radix, - key >> PA_SECTION_SHIFT); - if (dup) { - dev_err(device, "%s: collides with mapping for %s\n", - __func__, dev_name(dup->device)); - mutex_unlock(&hmm_devmem_lock); - ret = -EBUSY; - goto error; - } - ret = radix_tree_insert(&hmm_devmem_radix, - key >> PA_SECTION_SHIFT, - devmem); - if (ret) { - dev_err(device, "%s: failed: %d\n", __func__, ret); - mutex_unlock(&hmm_devmem_lock); - goto error_radix; - } - } - mutex_unlock(&hmm_devmem_lock); - - nid = dev_to_node(device); - if (nid < 0) - nid = numa_mem_id(); - - mem_hotplug_begin(); - /* - * For device private memory we call add_pages() as we only need to - * allocate and initialize struct page for the device memory. More- - * over the device memory is un-accessible thus we do not want to - * create a linear mapping for the memory like arch_add_memory() - * would do. - * - * For device public memory, which is accesible by the CPU, we do - * want the linear mapping and thus use arch_add_memory(). - */ - if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC) - ret = arch_add_memory(nid, align_start, align_size, NULL, - false); - else - ret = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, NULL, false); - if (ret) { - mem_hotplug_done(); - goto error_add_memory; - } - move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], - align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, NULL); - mem_hotplug_done(); - - for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { - struct page *page = pfn_to_page(pfn); - - page->pgmap = &devmem->pagemap; - } - return 0; - -error_add_memory: - untrack_pfn(NULL, PHYS_PFN(align_start), align_size); -error_radix: - hmm_devmem_radix_release(devmem->resource); -error: - return ret; -} - -static int hmm_devmem_match(struct device *dev, void *data, void *match_data) -{ - struct hmm_devmem *devmem = data; - - return devmem->resource == match_data; -} - -static void hmm_devmem_pages_remove(struct hmm_devmem *devmem) -{ - devres_release(devmem->device, &hmm_devmem_release, - &hmm_devmem_match, devmem->resource); -} - /* * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory * @@ -1156,12 +989,12 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, { struct hmm_devmem *devmem; resource_size_t addr; + void *result; int ret; dev_pagemap_get_ops(); - devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), - GFP_KERNEL, dev_to_node(device)); + devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); if (!devmem) return ERR_PTR(-ENOMEM); @@ -1175,11 +1008,11 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, 0, GFP_KERNEL); if (ret) - goto error_percpu_ref; + return ERR_PTR(ret); - ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); + ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref); if (ret) - goto error_devm_add_action; + return ERR_PTR(ret); size = ALIGN(size, PA_SECTION_SIZE); addr = min((unsigned long)iomem_resource.end, @@ -1199,54 +1032,45 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, devmem->resource = devm_request_mem_region(device, addr, size, dev_name(device)); - if (!devmem->resource) { - ret = -ENOMEM; - goto error_no_resource; - } + if (!devmem->resource) + return ERR_PTR(-ENOMEM); break; } - if (!devmem->resource) { - ret = -ERANGE; - goto error_no_resource; - } + if (!devmem->resource) + return ERR_PTR(-ERANGE); devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; devmem->pfn_last = devmem->pfn_first + (resource_size(devmem->resource) >> PAGE_SHIFT); - ret = hmm_devmem_pages_create(devmem); - if (ret) - goto error_pages; + devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + devmem->pagemap.res = *devmem->resource; + devmem->pagemap.page_fault = hmm_devmem_fault; + devmem->pagemap.page_free = hmm_devmem_free; + devmem->pagemap.altmap_valid = false; + devmem->pagemap.registered = false; + devmem->pagemap.ref = &devmem->ref; + devmem->pagemap.data = devmem; - devres_add(device, devmem); + result = devm_memremap_pages(devmem->device, &devmem->pagemap); + if (IS_ERR(result)) + return result; - ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); - if (ret) { - hmm_devmem_remove(devmem); + ret = devm_add_action_or_reset(device, hmm_devmem_ref_kill, &devmem->ref); + if (ret) return ERR_PTR(ret); - } return devmem; - -error_pages: - devm_release_mem_region(device, devmem->resource->start, - resource_size(devmem->resource)); -error_no_resource: -error_devm_add_action: - hmm_devmem_ref_kill(&devmem->ref); - hmm_devmem_ref_exit(&devmem->ref); -error_percpu_ref: - devres_free(devmem); - return ERR_PTR(ret); } -EXPORT_SYMBOL(hmm_devmem_add); +EXPORT_SYMBOL_GPL(hmm_devmem_add); struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, struct device *device, struct resource *res) { struct hmm_devmem *devmem; + void *result; int ret; if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) @@ -1254,8 +1078,7 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, dev_pagemap_get_ops(); - devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), - GFP_KERNEL, dev_to_node(device)); + devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); if (!devmem) return ERR_PTR(-ENOMEM); @@ -1269,71 +1092,37 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, 0, GFP_KERNEL); if (ret) - goto error_percpu_ref; + return ERR_PTR(ret); - ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); + ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, + &devmem->ref); if (ret) - goto error_devm_add_action; - + return ERR_PTR(ret); devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; devmem->pfn_last = devmem->pfn_first + (resource_size(devmem->resource) >> PAGE_SHIFT); - ret = hmm_devmem_pages_create(devmem); - if (ret) - goto error_devm_add_action; + devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; + devmem->pagemap.res = *devmem->resource; + devmem->pagemap.page_fault = hmm_devmem_fault; + devmem->pagemap.page_free = hmm_devmem_free; + devmem->pagemap.altmap_valid = false; + devmem->pagemap.registered = false; + devmem->pagemap.ref = &devmem->ref; + devmem->pagemap.data = devmem; - devres_add(device, devmem); + result = devm_memremap_pages(devmem->device, &devmem->pagemap); + if (IS_ERR(result)) + return result; - ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); - if (ret) { - hmm_devmem_remove(devmem); + ret = devm_add_action_or_reset(device, hmm_devmem_ref_kill, &devmem->ref); + if (ret) return ERR_PTR(ret); - } return devmem; - -error_devm_add_action: - hmm_devmem_ref_kill(&devmem->ref); - hmm_devmem_ref_exit(&devmem->ref); -error_percpu_ref: - devres_free(devmem); - return ERR_PTR(ret); -} -EXPORT_SYMBOL(hmm_devmem_add_resource); - -/* - * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE) - * - * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory - * - * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf - * of the device driver. It will free struct page and remove the resource that - * reserved the physical address range for this device memory. - */ -void hmm_devmem_remove(struct hmm_devmem *devmem) -{ - resource_size_t start, size; - struct device *device; - bool cdm = false; - - if (!devmem) - return; - - device = devmem->device; - start = devmem->resource->start; - size = resource_size(devmem->resource); - - cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY; - hmm_devmem_ref_kill(&devmem->ref); - hmm_devmem_ref_exit(&devmem->ref); - hmm_devmem_pages_remove(devmem); - - if (!cdm) - devm_release_mem_region(device, start, size); } -EXPORT_SYMBOL(hmm_devmem_remove); +EXPORT_SYMBOL_GPL(hmm_devmem_add_resource); /* * A device driver that wants to handle multiple devices memory through a diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 323acdd14e6e..e9177363fe2e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -483,11 +483,8 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) static inline struct list_head *page_deferred_list(struct page *page) { - /* - * ->lru in the tail pages is occupied by compound_head. - * Let's use ->mapping + ->index in the second tail page as list_head. - */ - return (struct list_head *)&page[2].mapping; + /* ->lru in the tail pages is occupied by compound_head. */ + return &page[2].deferred_list; } void prep_transhuge_page(struct page *page) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 129088710510..d4337e6d45a8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2006,8 +2006,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, * code of zero indicates a reservation exists (no change). */ map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); - if (map_chg < 0) - return ERR_PTR(-ENOMEM); + if (map_chg < 0) { + ret = -ENOMEM; + goto out; + } /* * Processes that did not create the mapping will have no @@ -2019,8 +2021,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, if (map_chg || avoid_reserve) { gbl_chg = hugepage_subpool_get_pages(spool, 1); if (gbl_chg < 0) { - vma_end_reservation(h, vma, addr); - return ERR_PTR(-ENOSPC); + ret = -ENOSPC; + goto out_reservation; } /* @@ -2049,8 +2051,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, if (!page) { spin_unlock(&hugetlb_lock); page = alloc_buddy_huge_page_with_mpol(h, vma, addr); - if (!page) + if (!page) { + ret = -ENOSPC; goto out_uncharge_cgroup; + } if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { SetPagePrivate(page); h->resv_huge_pages--; @@ -2087,8 +2091,10 @@ out_uncharge_cgroup: out_subpool_put: if (map_chg || avoid_reserve) hugepage_subpool_put_pages(spool, 1); +out_reservation: vma_end_reservation(h, vma, addr); - return ERR_PTR(-ENOSPC); +out: + return ERR_PTR(ret); } int alloc_bootmem_huge_page(struct hstate *h) @@ -3159,7 +3165,7 @@ static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. */ -static int hugetlb_vm_op_fault(struct vm_fault *vmf) +static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; @@ -3686,6 +3692,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; pte_t new_pte; spinlock_t *ptl; + unsigned long haddr = address & huge_page_mask(h); /* * Currently, we are forced to kill the process in the event the @@ -3716,7 +3723,7 @@ retry: u32 hash; struct vm_fault vmf = { .vma = vma, - .address = address, + .address = haddr, .flags = flags, /* * Hard to debug if it ends up being @@ -3733,14 +3740,14 @@ retry: * fault to make calling code simpler. */ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, - idx, address); + idx, haddr); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, VM_UFFD_MISSING); mutex_lock(&hugetlb_fault_mutex_table[hash]); goto out; } - page = alloc_huge_page(vma, address, 0); + page = alloc_huge_page(vma, haddr, 0); if (IS_ERR(page)) { ret = PTR_ERR(page); if (ret == -ENOMEM) @@ -3789,12 +3796,12 @@ retry: * the spinlock. */ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - if (vma_needs_reservation(h, vma, address) < 0) { + if (vma_needs_reservation(h, vma, haddr) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, address); + vma_end_reservation(h, vma, haddr); } ptl = huge_pte_lock(h, mm, ptep); @@ -3808,17 +3815,17 @@ retry: if (anon_rmap) { ClearPagePrivate(page); - hugepage_add_new_anon_rmap(page, vma, address); + hugepage_add_new_anon_rmap(page, vma, haddr); } else page_dup_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); - set_huge_pte_at(mm, address, ptep, new_pte); + set_huge_pte_at(mm, haddr, ptep, new_pte); hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); + ret = hugetlb_cow(mm, vma, haddr, ptep, page, ptl); } spin_unlock(ptl); @@ -3830,7 +3837,7 @@ backout: spin_unlock(ptl); backout_unlocked: unlock_page(page); - restore_reserve_on_error(h, vma, address, page); + restore_reserve_on_error(h, vma, haddr, page); put_page(page); goto out; } @@ -3883,10 +3890,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; + unsigned long haddr = address & huge_page_mask(h); - address &= huge_page_mask(h); - - ptep = huge_pte_offset(mm, address, huge_page_size(h)); + ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { @@ -3896,20 +3902,20 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); } else { - ptep = huge_pte_alloc(mm, address, huge_page_size(h)); + ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); if (!ptep) return VM_FAULT_OOM; } mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); + idx = vma_hugecache_offset(h, vma, haddr); /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address); + hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); @@ -3939,16 +3945,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * consumed. */ if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { - if (vma_needs_reservation(h, vma, address) < 0) { + if (vma_needs_reservation(h, vma, haddr) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, address); + vma_end_reservation(h, vma, haddr); if (!(vma->vm_flags & VM_MAYSHARE)) pagecache_page = hugetlbfs_pagecache_page(h, - vma, address); + vma, haddr); } ptl = huge_pte_lock(h, mm, ptep); @@ -3973,16 +3979,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { - ret = hugetlb_cow(mm, vma, address, ptep, + ret = hugetlb_cow(mm, vma, haddr, ptep, pagecache_page, ptl); goto out_put_page; } entry = huge_pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, address, ptep, entry, + if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, flags & FAULT_FLAG_WRITE)) - update_mmu_cache(vma, address, ptep); + update_mmu_cache(vma, haddr, ptep); out_put_page: if (page != pagecache_page) unlock_page(page); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index eec1150125b9..68c2f2f3c05b 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -84,7 +84,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, limit = round_down(PAGE_COUNTER_MAX, 1 << huge_page_order(&hstates[idx])); - ret = page_counter_limit(counter, limit); + ret = page_counter_set_max(counter, limit); VM_BUG_ON(ret); } } @@ -273,7 +273,7 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, case RES_USAGE: return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: - return (u64)counter->limit * PAGE_SIZE; + return (u64)counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_FAILCNT: @@ -306,7 +306,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_LIMIT: mutex_lock(&hugetlb_limit_mutex); - ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages); + ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages); mutex_unlock(&hugetlb_limit_mutex); break; default: diff --git a/mm/init-mm.c b/mm/init-mm.c index f94d5d15ebc0..f0179c9c04c2 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -22,6 +22,7 @@ struct mm_struct init_mm = { .mm_count = ATOMIC_INIT(1), .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, INIT_MM_CONTEXT(init_mm) @@ -216,6 +216,8 @@ struct rmap_item { #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ +#define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG) + /* to mask all the flags */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; @@ -840,6 +842,17 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, return err; } +static inline struct stable_node *page_stable_node(struct page *page) +{ + return PageKsm(page) ? page_rmapping(page) : NULL; +} + +static inline void set_page_stable_node(struct page *page, + struct stable_node *stable_node) +{ + page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); +} + #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: @@ -2587,10 +2600,15 @@ again: anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { + unsigned long addr; + cond_resched(); vma = vmac->vma; - if (rmap_item->address < vma->vm_start || - rmap_item->address >= vma->vm_end) + + /* Ignore the stable/unstable/sqnr flags */ + addr = rmap_item->address & ~KSM_FLAG_MASK; + + if (addr < vma->vm_start || addr >= vma->vm_end) continue; /* * Initially we examine only the vma which covers this @@ -2604,8 +2622,7 @@ again: if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (!rwc->rmap_one(page, vma, - rmap_item->address, rwc->arg)) { + if (!rwc->rmap_one(page, vma, addr, rwc->arg)) { anon_vma_unlock_read(anon_vma); return; } diff --git a/mm/list_lru.c b/mm/list_lru.c index fcfb6c89ed47..d9c84c5bda1d 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/list_lru.h> +#include <linux/prefetch.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/memcontrol.h> @@ -133,6 +134,12 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; + /* + * Prefetch the neighboring list entries to reduce lock hold time. + */ + prefetchw(item->prev); + prefetchw(item->next); + spin_lock(&nlru->lock); if (!list_empty(item)) { l = list_lru_from_kmem(nlru, item); diff --git a/mm/memblock.c b/mm/memblock.c index 5108356ad8aa..93ad42bc8a73 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -68,7 +68,7 @@ ulong __init_memblock choose_memblock_flags(void) /* adjust *@size so that (@base + *@size) doesn't overflow, return new size */ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) { - return *size = min(*size, (phys_addr_t)ULLONG_MAX - base); + return *size = min(*size, PHYS_ADDR_MAX - base); } /* @@ -697,6 +697,11 @@ static int __init_memblock memblock_remove_range(struct memblock_type *type, int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) { + phys_addr_t end = base + size - 1; + + memblock_dbg("memblock_remove: [%pa-%pa] %pS\n", + &base, &end, (void *)_RET_IP_); + return memblock_remove_range(&memblock.memory, base, size); } @@ -925,7 +930,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? - r->base : (phys_addr_t)ULLONG_MAX; + r->base : PHYS_ADDR_MAX; /* * if idx_b advanced past idx_a, @@ -1041,7 +1046,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? - r->base : (phys_addr_t)ULLONG_MAX; + r->base : PHYS_ADDR_MAX; /* * if idx_b advanced past idx_a, * break out to advance idx_a @@ -1516,13 +1521,13 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) { - phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + phys_addr_t max_addr = PHYS_ADDR_MAX; struct memblock_region *r; /* * translate the memory @limit size into the max address within one of * the memory memblock regions, if the @limit exceeds the total size - * of those regions, max_addr will keep original value ULLONG_MAX + * of those regions, max_addr will keep original value PHYS_ADDR_MAX */ for_each_memblock(memory, r) { if (limit <= r->size) { @@ -1537,7 +1542,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) void __init memblock_enforce_memory_limit(phys_addr_t limit) { - phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + phys_addr_t max_addr = PHYS_ADDR_MAX; if (!limit) return; @@ -1545,14 +1550,14 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) max_addr = __find_max_addr(limit); /* @limit exceeds the total size of the memory, do nothing */ - if (max_addr == (phys_addr_t)ULLONG_MAX) + if (max_addr == PHYS_ADDR_MAX) return; /* truncate both memory and reserved regions */ memblock_remove_range(&memblock.memory, max_addr, - (phys_addr_t)ULLONG_MAX); + PHYS_ADDR_MAX); memblock_remove_range(&memblock.reserved, max_addr, - (phys_addr_t)ULLONG_MAX); + PHYS_ADDR_MAX); } void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) @@ -1580,7 +1585,7 @@ void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) /* truncate the reserved regions */ memblock_remove_range(&memblock.reserved, 0, base); memblock_remove_range(&memblock.reserved, - base + size, (phys_addr_t)ULLONG_MAX); + base + size, PHYS_ADDR_MAX); } void __init memblock_mem_limit_remove_map(phys_addr_t limit) @@ -1593,7 +1598,7 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit) max_addr = __find_max_addr(limit); /* @limit exceeds the total size of the memory, do nothing */ - if (max_addr == (phys_addr_t)ULLONG_MAX) + if (max_addr == PHYS_ADDR_MAX) return; memblock_cap_memory_range(0, max_addr); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1695f38630f1..537ab73caf72 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -664,21 +664,12 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) } } -struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) +static inline struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { - /* - * mm_update_next_owner() may clear mm->owner to NULL - * if it races with swapoff, page migration, etc. - * So this can be called with p == NULL. - */ - if (unlikely(!p)) - return NULL; - return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); } -EXPORT_SYMBOL(mem_cgroup_from_task); -static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) +struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *memcg = NULL; @@ -692,7 +683,7 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) if (unlikely(!mm)) memcg = root_mem_cgroup; else { - memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + memcg = rcu_dereference(mm->memcg); if (unlikely(!memcg)) memcg = root_mem_cgroup; } @@ -701,6 +692,20 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } +static __always_inline struct mem_cgroup *get_mem_cgroup( + struct mem_cgroup *memcg, struct mm_struct *mm) +{ + if (unlikely(memcg)) { + rcu_read_lock(); + if (css_tryget_online(&memcg->css)) { + rcu_read_unlock(); + return memcg; + } + rcu_read_unlock(); + } + return get_mem_cgroup_from_mm(mm); +} + /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -888,7 +893,8 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) * value, the function breaks the iteration loop and returns the value. * Otherwise, it will iterate over all tasks and return 0. * - * This function must not be called for the root memory cgroup. + * If memcg is the root memory cgroup, this function will iterate only + * over tasks belonging directly to the root memory cgroup. */ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*fn)(struct task_struct *, void *), void *arg) @@ -896,8 +902,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct mem_cgroup *iter; int ret = 0; - BUG_ON(memcg == root_mem_cgroup); - for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; struct task_struct *task; @@ -906,7 +910,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, while (!ret && (task = css_task_iter_next(&it))) ret = fn(task, arg); css_task_iter_end(&it); - if (ret) { + if (ret || memcg == root_mem_cgroup) { mem_cgroup_iter_break(memcg, iter); break; } @@ -1034,13 +1038,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) unsigned long limit; count = page_counter_read(&memcg->memory); - limit = READ_ONCE(memcg->memory.limit); + limit = READ_ONCE(memcg->memory.max); if (count < limit) margin = limit - count; if (do_memsw_account()) { count = page_counter_read(&memcg->memsw); - limit = READ_ONCE(memcg->memsw.limit); + limit = READ_ONCE(memcg->memsw.max); if (count <= limit) margin = min(margin, limit - count); else @@ -1118,43 +1122,49 @@ static const char *const memcg1_stat_names[] = { }; #define K(x) ((x) << (PAGE_SHIFT-10)) + /** - * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. - * @memcg: The memory cgroup that went over limit - * @p: Task that is going to be killed + * mem_cgroup_print_oom_context: Print OOM context information including allocation + * constraint, nodemask, orgin memcg that has reached its limit, kill memcg that + * contains the killed process, killed process's command, pid and pid. * - * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is - * enabled + * @oc: pointer to struct oom_control + * @p: Task that is going to be killed */ -void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) +void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { - struct mem_cgroup *iter; - unsigned int i; - rcu_read_lock(); - + pr_cont("origin_memcg="); + if (memcg) + pr_cont_cgroup_path(memcg->css.cgroup); if (p) { - pr_info("Task in "); + pr_cont(" kill_memcg="); pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); - pr_cont(" killed as a result of limit of "); - } else { - pr_info("Memory limit reached of cgroup "); + pr_cont(" task=%s pid=%5d uid=%5d\n", p->comm, p->pid, + from_kuid(&init_user_ns, task_uid(p))); } - - pr_cont_cgroup_path(memcg->css.cgroup); - pr_cont("\n"); - rcu_read_unlock(); +} + +/** + * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to + * memory controller. + * @memcg: The memory cgroup that went over limit + */ +void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) +{ + struct mem_cgroup *iter; + unsigned int i; pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), - K((u64)memcg->memory.limit), memcg->memory.failcnt); + K((u64)memcg->memory.max), memcg->memory.failcnt); pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memsw)), - K((u64)memcg->memsw.limit), memcg->memsw.failcnt); + K((u64)memcg->memsw.max), memcg->memsw.failcnt); pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->kmem)), - K((u64)memcg->kmem.limit), memcg->kmem.failcnt); + K((u64)memcg->kmem.max), memcg->kmem.failcnt); for_each_mem_cgroup_tree(iter, memcg) { pr_info("Memory cgroup stats for "); @@ -1179,21 +1189,21 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) /* * Return the memory (and swap, if configured) limit for a memcg. */ -unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { - unsigned long limit; + unsigned long max; - limit = memcg->memory.limit; + max = memcg->memory.max; if (mem_cgroup_swappiness(memcg)) { - unsigned long memsw_limit; - unsigned long swap_limit; + unsigned long memsw_max; + unsigned long swap_max; - memsw_limit = memcg->memsw.limit; - swap_limit = memcg->swap.limit; - swap_limit = min(swap_limit, (unsigned long)total_swap_pages); - limit = min(limit + swap_limit, memsw_limit); + memsw_max = memcg->memsw.max; + swap_max = memcg->swap.max; + swap_max = min(swap_max, (unsigned long)total_swap_pages); + max = min(max + swap_max, memsw_max); } - return limit; + return max; } static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, @@ -1889,6 +1899,18 @@ void mem_cgroup_handle_over_high(void) current->memcg_nr_pages_over_high = 0; } +/* + * Based on try_charge() force charge conditions. + */ +static inline bool should_force_charge(gfp_t gfp_mask) +{ + return (unlikely(tsk_is_oom_victim(current) || + fatal_signal_pending(current) || + current->flags & PF_EXITING || + current->flags & PF_MEMALLOC || + gfp_mask & __GFP_NOFAIL)); +} + static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages) { @@ -2004,6 +2026,8 @@ force: * The allocation either can't fail or will lead to more memory * being freed very soon. Allow memory usage go over the limit * temporarily by force charging it. + * + * NOTE: Please keep the should_force_charge() conditions in sync. */ page_counter_charge(&memcg->memory, nr_pages); if (do_memsw_account()) @@ -2181,6 +2205,7 @@ static void memcg_kmem_cache_create_func(struct work_struct *w) memcg_create_kmem_cache(memcg, cachep); css_put(&memcg->css); + kmem_cache_put(cachep); kfree(cw); } @@ -2196,6 +2221,12 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, if (!cw) return; + /* Make sure root kmem cache does not get destroyed in the middle */ + if (!kmem_cache_tryget(cachep)) { + kfree(cw); + return; + } + css_get(&memcg->css); cw->memcg = memcg; @@ -2261,7 +2292,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) if (current->memcg_kmem_skip_account) return cachep; - memcg = get_mem_cgroup_from_mm(current->mm); + memcg = get_mem_cgroup(current->target_memcg, current->mm); kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) goto out; @@ -2320,8 +2351,11 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { - cancel_charge(memcg, nr_pages); - return -ENOMEM; + if (!should_force_charge(gfp)) { + cancel_charge(memcg, nr_pages); + return -ENOMEM; + } + page_counter_charge(&memcg->kmem, nr_pages); } page->mem_cgroup = memcg; @@ -2345,7 +2379,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) if (memcg_kmem_bypass()) return 0; - memcg = get_mem_cgroup_from_mm(current->mm); + memcg = get_mem_cgroup(current->target_memcg, current->mm); if (!mem_cgroup_is_root(memcg)) { ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); if (!ret) @@ -2444,12 +2478,13 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, } #endif -static DEFINE_MUTEX(memcg_limit_mutex); +static DEFINE_MUTEX(memcg_max_mutex); -static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, - unsigned long limit, bool memsw) +static int mem_cgroup_resize_max(struct mem_cgroup *memcg, + unsigned long max, bool memsw) { bool enlarge = false; + bool drained = false; int ret; bool limits_invariant; struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; @@ -2460,26 +2495,32 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; } - mutex_lock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); /* * Make sure that the new limit (memsw or memory limit) doesn't - * break our basic invariant rule memory.limit <= memsw.limit. + * break our basic invariant rule memory.max <= memsw.max. */ - limits_invariant = memsw ? limit >= memcg->memory.limit : - limit <= memcg->memsw.limit; + limits_invariant = memsw ? max >= memcg->memory.max : + max <= memcg->memsw.max; if (!limits_invariant) { - mutex_unlock(&memcg_limit_mutex); + mutex_unlock(&memcg_max_mutex); ret = -EINVAL; break; } - if (limit > counter->limit) + if (max > counter->max) enlarge = true; - ret = page_counter_limit(counter, limit); - mutex_unlock(&memcg_limit_mutex); + ret = page_counter_set_max(counter, max); + mutex_unlock(&memcg_max_mutex); if (!ret) break; + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw)) { ret = -EBUSY; @@ -2592,6 +2633,224 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) return ret; } +static long memcg_oom_badness(struct mem_cgroup *memcg, + const nodemask_t *nodemask, + unsigned long totalpages) +{ + long points = 0; + int nid; + pg_data_t *pgdat; + + for_each_node_state(nid, N_MEMORY) { + if (nodemask && !node_isset(nid, *nodemask)) + continue; + + points += mem_cgroup_node_nr_lru_pages(memcg, nid, + LRU_ALL_ANON | BIT(LRU_UNEVICTABLE)); + + pgdat = NODE_DATA(nid); + points += lruvec_page_state(mem_cgroup_lruvec(pgdat, memcg), + NR_SLAB_UNRECLAIMABLE); + } + + points += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) / + (PAGE_SIZE / 1024); + points += memcg_page_state(memcg, MEMCG_SOCK); + points += memcg_page_state(memcg, MEMCG_SWAP); + + return points; +} + +/* + * Checks if the given memcg is a valid OOM victim and returns a number, + * which means the folowing: + * -1: there are inflight OOM victim tasks, belonging to the memcg + * 0: memcg is not eligible, e.g. all belonging tasks are protected + * by oom_score_adj set to OOM_SCORE_ADJ_MIN + * >0: memcg is eligible, and the returned value is an estimation + * of the memory footprint + */ +static long oom_evaluate_memcg(struct mem_cgroup *memcg, + const nodemask_t *nodemask, + unsigned long totalpages) +{ + struct css_task_iter it; + struct task_struct *task; + int eligible = 0; + + /* + * Root memory cgroup is a special case: + * we don't have necessary stats to evaluate it exactly as + * leaf memory cgroups, so we approximate it's oom_score + * by summing oom_score of all belonging tasks, which are + * owners of their mm structs. + * + * If there are inflight OOM victim tasks inside + * the root memcg, we return -1. + */ + if (memcg == root_mem_cgroup) { + struct css_task_iter it; + struct task_struct *task; + long score = 0; + + css_task_iter_start(&memcg->css, 0, &it); + while ((task = css_task_iter_next(&it))) { + if (tsk_is_oom_victim(task) && + !test_bit(MMF_OOM_SKIP, + &task->signal->oom_mm->flags)) { + score = -1; + break; + } + + task_lock(task); + if (!task->mm) { + task_unlock(task); + continue; + } + task_unlock(task); + + score += oom_badness(task, memcg, nodemask, + totalpages); + } + css_task_iter_end(&it); + + return score; + } + + /* + * Memcg is OOM eligible if there are OOM killable tasks inside. + * + * We treat tasks with oom_score_adj set to OOM_SCORE_ADJ_MIN + * as unkillable. + * + * If there are inflight OOM victim tasks inside the memcg, + * we return -1. + */ + css_task_iter_start(&memcg->css, 0, &it); + while ((task = css_task_iter_next(&it))) { + if (!eligible && + task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) + eligible = 1; + + if (tsk_is_oom_victim(task) && + !test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) { + eligible = -1; + break; + } + } + css_task_iter_end(&it); + + if (eligible <= 0) + return eligible; + + return memcg_oom_badness(memcg, nodemask, totalpages); +} + +static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc) +{ + struct mem_cgroup *iter, *group = NULL; + long group_score = 0; + + oc->chosen_memcg = NULL; + oc->chosen_points = 0; + + /* + * If OOM is memcg-wide, and the memcg has the oom_group flag set, + * all tasks belonging to the memcg should be killed. + * So, we mark the memcg as a victim. + */ + if (oc->memcg && mem_cgroup_oom_group(oc->memcg)) { + oc->chosen_memcg = oc->memcg; + css_get(&oc->chosen_memcg->css); + return; + } + + /* + * The oom_score is calculated for leaf memory cgroups (including + * the root memcg). + * Non-leaf oom_group cgroups accumulating score of descendant + * leaf memory cgroups. + */ + rcu_read_lock(); + for_each_mem_cgroup_tree(iter, root) { + long score; + + /* + * We don't consider non-leaf non-oom_group memory cgroups + * as OOM victims. + */ + if (memcg_has_children(iter) && iter != root_mem_cgroup && + !mem_cgroup_oom_group(iter)) + continue; + + /* + * If group is not set or we've ran out of the group's sub-tree, + * we should set group and reset group_score. + */ + if (!group || group == root_mem_cgroup || + !mem_cgroup_is_descendant(iter, group)) { + group = iter; + group_score = 0; + } + + if (memcg_has_children(iter) && iter != root_mem_cgroup) + continue; + + score = oom_evaluate_memcg(iter, oc->nodemask, oc->totalpages); + + /* + * Ignore empty and non-eligible memory cgroups. + */ + if (score == 0) + continue; + + /* + * If there are inflight OOM victims, we don't need + * to look further for new victims. + */ + if (score == -1) { + oc->chosen_memcg = INFLIGHT_VICTIM; + mem_cgroup_iter_break(root, iter); + break; + } + + group_score += score; + + if (group_score > oc->chosen_points) { + oc->chosen_points = group_score; + oc->chosen_memcg = group; + } + } + + if (oc->chosen_memcg && oc->chosen_memcg != INFLIGHT_VICTIM) + css_get(&oc->chosen_memcg->css); + + rcu_read_unlock(); +} + +bool mem_cgroup_select_oom_victim(struct oom_control *oc) +{ + struct mem_cgroup *root; + + if (mem_cgroup_disabled()) + return false; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return false; + + if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM)) + return false; + + if (oc->memcg) + root = oc->memcg; + else + root = root_mem_cgroup; + + select_victim_memcg(root, oc); + + return oc->chosen_memcg; +} + /* * Reclaims as many pages from the given memcg as possible. * @@ -2603,6 +2862,9 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) /* we call try-to-free pages for make this cgroup empty */ lru_add_drain_all(); + + drain_all_stock(memcg); + /* try to free all pages in this cgroup */ while (nr_retries && page_counter_read(&memcg->memory)) { int progress; @@ -2757,7 +3019,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: - return (u64)counter->limit * PAGE_SIZE; + return (u64)counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_FAILCNT: @@ -2871,24 +3133,24 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } #endif /* !CONFIG_SLOB */ -static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long limit) +static int memcg_update_kmem_max(struct mem_cgroup *memcg, + unsigned long max) { int ret; - mutex_lock(&memcg_limit_mutex); - ret = page_counter_limit(&memcg->kmem, limit); - mutex_unlock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); + ret = page_counter_set_max(&memcg->kmem, max); + mutex_unlock(&memcg_max_mutex); return ret; } -static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) +static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) { int ret; - mutex_lock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); - ret = page_counter_limit(&memcg->tcpmem, limit); + ret = page_counter_set_max(&memcg->tcpmem, max); if (ret) goto out; @@ -2913,7 +3175,7 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) memcg->tcpmem_active = true; } out: - mutex_unlock(&memcg_limit_mutex); + mutex_unlock(&memcg_max_mutex); return ret; } @@ -2941,16 +3203,16 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, } switch (MEMFILE_TYPE(of_cft(of)->private)) { case _MEM: - ret = mem_cgroup_resize_limit(memcg, nr_pages, false); + ret = mem_cgroup_resize_max(memcg, nr_pages, false); break; case _MEMSWAP: - ret = mem_cgroup_resize_limit(memcg, nr_pages, true); + ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; case _KMEM: - ret = memcg_update_kmem_limit(memcg, nr_pages); + ret = memcg_update_kmem_max(memcg, nr_pages); break; case _TCP: - ret = memcg_update_tcp_limit(memcg, nr_pages); + ret = memcg_update_tcp_max(memcg, nr_pages); break; } break; @@ -3083,7 +3345,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) #endif /* CONFIG_NUMA */ /* Universal VM events cgroup1 shows, original sort order */ -unsigned int memcg1_events[] = { +static const unsigned int memcg1_events[] = { PGPGIN, PGPGOUT, PGFAULT, @@ -3126,8 +3388,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) /* Hierarchical information */ memory = memsw = PAGE_COUNTER_MAX; for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { - memory = min(memory, mi->memory.limit); - memsw = min(memsw, mi->memsw.limit); + memory = min(memory, mi->memory.max); + memsw = min(memsw, mi->memsw.max); } seq_printf(m, "hierarchical_memory_limit %llu\n", (u64)memory * PAGE_SIZE); @@ -3562,11 +3824,6 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, #ifdef CONFIG_CGROUP_WRITEBACK -struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) -{ - return &memcg->cgwb_list; -} - static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) { return wb_domain_init(&memcg->cgwb_domain, gfp); @@ -3626,7 +3883,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, *pheadroom = PAGE_COUNTER_MAX; while ((parent = parent_mem_cgroup(memcg))) { - unsigned long ceiling = min(memcg->memory.limit, memcg->high); + unsigned long ceiling = min(memcg->memory.max, memcg->high); unsigned long used = page_counter_read(&memcg->memory); *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); @@ -4031,6 +4288,14 @@ static struct cftype mem_cgroup_legacy_files[] = { static DEFINE_IDR(mem_cgroup_idr); +static void mem_cgroup_id_remove(struct mem_cgroup *memcg) +{ + if (memcg->id.id > 0) { + idr_remove(&mem_cgroup_idr, memcg->id.id); + memcg->id.id = 0; + } +} + static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); @@ -4041,8 +4306,7 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { VM_BUG_ON(atomic_read(&memcg->id.ref) < n); if (atomic_sub_and_test(n, &memcg->id.ref)) { - idr_remove(&mem_cgroup_idr, memcg->id.id); - memcg->id.id = 0; + mem_cgroup_id_remove(memcg); /* Memcg ID pins CSS */ css_put(&memcg->css); @@ -4179,8 +4443,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; fail: - if (memcg->id.id > 0) - idr_remove(&mem_cgroup_idr, memcg->id.id); + mem_cgroup_id_remove(memcg); __mem_cgroup_free(memcg); return NULL; } @@ -4239,6 +4502,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; fail: + mem_cgroup_id_remove(memcg); mem_cgroup_free(memcg); return ERR_PTR(-ENOMEM); } @@ -4270,7 +4534,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); - memcg->low = 0; + page_counter_set_min(&memcg->memory, 0); + page_counter_set_low(&memcg->memory, 0); memcg_offline_kmem(memcg); wb_memcg_offline(memcg); @@ -4319,12 +4584,13 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX); - memcg->low = 0; + page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); + page_counter_set_min(&memcg->memory, 0); + page_counter_set_low(&memcg->memory, 0); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; memcg_wb_domain_size_changed(memcg); @@ -4834,8 +5100,9 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) mm = get_task_mm(p); if (!mm) return 0; - /* We move charges only when we move a owner of the mm */ - if (mm->owner == p) { + + /* We move charges except for creative uses of CLONE_VM */ + if (mm->memcg == from) { VM_BUG_ON(mc.from); VM_BUG_ON(mc.to); VM_BUG_ON(mc.precharge); @@ -5035,6 +5302,58 @@ static void mem_cgroup_move_task(void) } #endif +/** + * mm_update_memcg - Update the memory cgroup of a mm_struct + * @mm: mm struct + * @new: new memory cgroup value + * + * Called whenever mm->memcg needs to change. Consumes a reference + * to new (unless new is NULL). The reference to the old memory + * cgroup is decreased. + */ +void mm_update_memcg(struct mm_struct *mm, struct mem_cgroup *new) +{ + /* This is the only place where mm->memcg is changed */ + struct mem_cgroup *old; + + old = xchg(&mm->memcg, new); + if (old) + css_put(&old->css); +} + +static void task_update_memcg(struct task_struct *tsk, struct mem_cgroup *new) +{ + struct mm_struct *mm; + task_lock(tsk); + mm = tsk->mm; + if (mm && !(tsk->flags & PF_KTHREAD)) + mm_update_memcg(mm, new); + task_unlock(tsk); +} + +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *tsk; + + cgroup_taskset_for_each(tsk, css, tset) { + struct mem_cgroup *new = mem_cgroup_from_css(css); + css_get(css); + task_update_memcg(tsk, new); + } +} + +void mm_sync_memcg_from_task(struct task_struct *tsk) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + css = task_css(tsk, memory_cgrp_id); + if (css && css_tryget(css)) + task_update_memcg(tsk, mem_cgroup_from_css(css)); + rcu_read_unlock(); +} + /* * Cgroup retains root cgroups across [un]mount cycles making it necessary * to verify whether we're attached to the default hierarchy on each mount @@ -5061,10 +5380,40 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } +static int memory_min_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long min = READ_ONCE(memcg->memory.min); + + if (min == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long min; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &min); + if (err) + return err; + + page_counter_set_min(&memcg->memory, min); + + return nbytes; +} + static int memory_low_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long low = READ_ONCE(memcg->low); + unsigned long low = READ_ONCE(memcg->memory.low); if (low == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5086,7 +5435,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, if (err) return err; - memcg->low = low; + page_counter_set_low(&memcg->memory, low); return nbytes; } @@ -5131,7 +5480,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, static int memory_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->memory.limit); + unsigned long max = READ_ONCE(memcg->memory.max); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5155,7 +5504,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; - xchg(&memcg->memory.limit, max); + xchg(&memcg->memory.max, max); for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); @@ -5190,6 +5539,39 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, return nbytes; } +static int memory_oom_group_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + bool oom_group = memcg->oom_group; + + if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM)) + return -ENOTSUPP; + + seq_printf(m, "%d\n", oom_group); + + return 0; +} + +static ssize_t memory_oom_group_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int oom_group; + int err; + + if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM)) + return -ENOTSUPP; + + err = kstrtoint(strstrip(buf), 0, &oom_group); + if (err) + return err; + + memcg->oom_group = oom_group; + + return nbytes; +} + static int memory_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); @@ -5296,6 +5678,12 @@ static struct cftype memory_files[] = { .read_u64 = memory_current_read, }, { + .name = "min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_min_show, + .write = memory_min_write, + }, + { .name = "low", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = memory_low_show, @@ -5314,6 +5702,12 @@ static struct cftype memory_files[] = { .write = memory_max_write, }, { + .name = "oom_group", + .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, + .seq_show = memory_oom_group_show, + .write = memory_oom_group_write, + }, + { .name = "events", .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct mem_cgroup, events_file), @@ -5335,8 +5729,10 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_free = mem_cgroup_css_free, .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, + .attach = mem_cgroup_attach, .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, + .fork = mm_sync_memcg_from_task, .bind = mem_cgroup_bind, .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files, @@ -5344,54 +5740,140 @@ struct cgroup_subsys memory_cgrp_subsys = { }; /** - * mem_cgroup_low - check if memory consumption is below the normal range + * mem_cgroup_protected - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * - * Returns %true if memory consumption of @memcg, and that of all - * ancestors up to (but not including) @root, is below the normal range. + * WARNING: This function is not stateless! It can only be used as part + * of a top-down tree iteration, not for isolated queries. + * + * Returns one of the following: + * MEMCG_PROT_NONE: cgroup memory is not protected + * MEMCG_PROT_LOW: cgroup memory is protected as long there is + * an unprotected supply of reclaimable memory from other cgroups. + * MEMCG_PROT_MIN: cgroup memory is protected + * + * @root is exclusive; it is never protected when looked at directly + * + * To provide a proper hierarchical behavior, effective memory.min/low values + * are used. Below is the description of how effective memory.low is calculated. + * Effective memory.min values is calculated in the same way. + * + * Effective memory.low is always equal or less than the original memory.low. + * If there is no memory.low overcommittment (which is always true for + * top-level memory cgroups), these two values are equal. + * Otherwise, it's a part of parent's effective memory.low, + * calculated as a cgroup's memory.low usage divided by sum of sibling's + * memory.low usages, where memory.low usage is the size of actually + * protected memory. + * + * low_usage + * elow = min( memory.low, parent->elow * ------------------ ), + * siblings_low_usage + * + * | memory.current, if memory.current < memory.low + * low_usage = | + | 0, otherwise. * - * @root is exclusive; it is never low when looked at directly and isn't - * checked when traversing the hierarchy. * - * Excluding @root enables using memory.low to prioritize memory usage - * between cgroups within a subtree of the hierarchy that is limited by - * memory.high or memory.max. + * Such definition of the effective memory.low provides the expected + * hierarchical behavior: parent's memory.low value is limiting + * children, unprotected memory is reclaimed first and cgroups, + * which are not using their guarantee do not affect actual memory + * distribution. * - * For example, given cgroup A with children B and C: + * For example, if there are memcgs A, A/B, A/C, A/D and A/E: * - * A - * / \ - * B C + * A A/memory.low = 2G, A/memory.current = 6G + * //\\ + * BC DE B/memory.low = 3G B/memory.current = 2G + * C/memory.low = 1G C/memory.current = 2G + * D/memory.low = 0 D/memory.current = 2G + * E/memory.low = 10G E/memory.current = 0 * - * and + * and the memory pressure is applied, the following memory distribution + * is expected (approximately): * - * 1. A/memory.current > A/memory.high - * 2. A/B/memory.current < A/B/memory.low - * 3. A/C/memory.current >= A/C/memory.low + * A/memory.current = 2G * - * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we - * should reclaim from 'C' until 'A' is no longer high or until we can - * no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by - * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered - * low and we will reclaim indiscriminately from both 'B' and 'C'. + * B/memory.current = 1.3G + * C/memory.current = 0.6G + * D/memory.current = 0 + * E/memory.current = 0 + * + * These calculations require constant tracking of the actual low usages + * (see propagate_protected_usage()), as well as recursive calculation of + * effective memory.low values. But as we do call mem_cgroup_protected() + * path for each memory cgroup top-down from the reclaim, + * it's possible to optimize this part, and save calculated elow + * for next usage. This part is intentionally racy, but it's ok, + * as memory.low is a best-effort mechanism. */ -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) +enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, + struct mem_cgroup *memcg) { + struct mem_cgroup *parent; + unsigned long emin, parent_emin; + unsigned long elow, parent_elow; + unsigned long usage; + if (mem_cgroup_disabled()) - return false; + return MEMCG_PROT_NONE; if (!root) root = root_mem_cgroup; if (memcg == root) - return false; + return MEMCG_PROT_NONE; + + usage = page_counter_read(&memcg->memory); + if (!usage) + return MEMCG_PROT_NONE; + + emin = memcg->memory.min; + elow = memcg->memory.low; + + parent = parent_mem_cgroup(memcg); + if (parent == root) + goto exit; + + parent_emin = READ_ONCE(parent->memory.emin); + emin = min(emin, parent_emin); + if (emin && parent_emin) { + unsigned long min_usage, siblings_min_usage; + + min_usage = min(usage, memcg->memory.min); + siblings_min_usage = atomic_long_read( + &parent->memory.children_min_usage); - for (; memcg != root; memcg = parent_mem_cgroup(memcg)) { - if (page_counter_read(&memcg->memory) >= memcg->low) - return false; + if (min_usage && siblings_min_usage) + emin = min(emin, parent_emin * min_usage / + siblings_min_usage); } - return true; + parent_elow = READ_ONCE(parent->memory.elow); + elow = min(elow, parent_elow); + if (elow && parent_elow) { + unsigned long low_usage, siblings_low_usage; + + low_usage = min(usage, memcg->memory.low); + siblings_low_usage = atomic_long_read( + &parent->memory.children_low_usage); + + if (low_usage && siblings_low_usage) + elow = min(elow, parent_elow * low_usage / + siblings_low_usage); + } + +exit: + memcg->memory.emin = emin; + memcg->memory.elow = elow; + + if (usage <= emin) + return MEMCG_PROT_MIN; + else if (usage <= elow) + return MEMCG_PROT_LOW; + else + return MEMCG_PROT_NONE; } /** @@ -6012,10 +6494,17 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!memcg) return 0; + if (!entry.val) { + memcg_memory_event(memcg, MEMCG_SWAP_FAIL); + return 0; + } + memcg = mem_cgroup_id_get_online(memcg); if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { + memcg_memory_event(memcg, MEMCG_SWAP_MAX); + memcg_memory_event(memcg, MEMCG_SWAP_FAIL); mem_cgroup_id_put(memcg); return -ENOMEM; } @@ -6067,7 +6556,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, - READ_ONCE(memcg->swap.limit) - + READ_ONCE(memcg->swap.max) - page_counter_read(&memcg->swap)); return nr_swap_pages; } @@ -6088,7 +6577,7 @@ bool mem_cgroup_swap_full(struct page *page) return false; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) - if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit) + if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max) return true; return false; @@ -6122,7 +6611,7 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, static int swap_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->swap.limit); + unsigned long max = READ_ONCE(memcg->swap.max); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -6144,15 +6633,23 @@ static ssize_t swap_max_write(struct kernfs_open_file *of, if (err) return err; - mutex_lock(&memcg_limit_mutex); - err = page_counter_limit(&memcg->swap, max); - mutex_unlock(&memcg_limit_mutex); - if (err) - return err; + xchg(&memcg->swap.max, max); return nbytes; } +static int swap_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "max %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); + seq_printf(m, "fail %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); + + return 0; +} + static struct cftype swap_files[] = { { .name = "swap.current", @@ -6165,6 +6662,12 @@ static struct cftype swap_files[] = { .seq_show = swap_max_show, .write = swap_max_write, }, + { + .name = "swap.events", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, swap_events_file), + .seq_show = swap_events_show, + }, { } /* terminate */ }; diff --git a/mm/memfd.c b/mm/memfd.c new file mode 100644 index 000000000000..27069518e3c5 --- /dev/null +++ b/mm/memfd.c @@ -0,0 +1,345 @@ +/* + * memfd_create system call and file sealing support + * + * Code was originally included in shmem.c, and broken out to facilitate + * use by hugetlbfs as well as tmpfs. + * + * This file is released under the GPL. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/sched/signal.h> +#include <linux/khugepaged.h> +#include <linux/syscalls.h> +#include <linux/hugetlb.h> +#include <linux/shmem_fs.h> +#include <linux/memfd.h> +#include <uapi/linux/memfd.h> + +/* + * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, + * so reuse a tag which we firmly believe is never set or cleared on tmpfs + * or hugetlbfs because they are memory only filesystems. + */ +#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE +#define LAST_SCAN 4 /* about 150ms max */ + +static void memfd_tag_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void __rcu **slot; + pgoff_t start; + struct page *page; + + lru_add_drain(); + start = 0; + rcu_read_lock(); + + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { + page = radix_tree_deref_slot(slot); + if (!page || radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + } else if (page_count(page) - page_mapcount(page) > 1) { + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_set(&mapping->i_pages, iter.index, + MEMFD_TAG_PINNED); + xa_unlock_irq(&mapping->i_pages); + } + + if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + } + rcu_read_unlock(); +} + +/* + * Setting SEAL_WRITE requires us to verify there's no pending writer. However, + * via get_user_pages(), drivers might have some pending I/O without any active + * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages + * and see whether it has an elevated ref-count. If so, we tag them and wait for + * them to be dropped. + * The caller must guarantee that no new user will acquire writable references + * to those pages to avoid races. + */ +static int memfd_wait_for_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void __rcu **slot; + pgoff_t start; + struct page *page; + int error, scan; + + memfd_tag_pins(mapping); + + error = 0; + for (scan = 0; scan <= LAST_SCAN; scan++) { + if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) + break; + + if (!scan) + lru_add_drain_all(); + else if (schedule_timeout_killable((HZ << scan) / 200)) + scan = LAST_SCAN; + + start = 0; + rcu_read_lock(); + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, + start, MEMFD_TAG_PINNED) { + + page = radix_tree_deref_slot(slot); + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + page = NULL; + } + + if (page && + page_count(page) - page_mapcount(page) != 1) { + if (scan < LAST_SCAN) + goto continue_resched; + + /* + * On the last scan, we clean up all those tags + * we inserted; but make a note that we still + * found pages pinned. + */ + error = -EBUSY; + } + + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_clear(&mapping->i_pages, + iter.index, MEMFD_TAG_PINNED); + xa_unlock_irq(&mapping->i_pages); +continue_resched: + if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + } + rcu_read_unlock(); + } + + return error; +} + +static unsigned int *memfd_file_seals_ptr(struct file *file) +{ + if (shmem_file(file)) + return &SHMEM_I(file_inode(file))->seals; + +#ifdef CONFIG_HUGETLBFS + if (is_file_hugepages(file)) + return &HUGETLBFS_I(file_inode(file))->seals; +#endif + + return NULL; +} + +#define F_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE) + +static int memfd_add_seals(struct file *file, unsigned int seals) +{ + struct inode *inode = file_inode(file); + unsigned int *file_seals; + int error; + + /* + * SEALING + * Sealing allows multiple parties to share a tmpfs or hugetlbfs file + * but restrict access to a specific subset of file operations. Seals + * can only be added, but never removed. This way, mutually untrusted + * parties can share common memory regions with a well-defined policy. + * A malicious peer can thus never perform unwanted operations on a + * shared object. + * + * Seals are only supported on special tmpfs or hugetlbfs files and + * always affect the whole underlying inode. Once a seal is set, it + * may prevent some kinds of access to the file. Currently, the + * following seals are defined: + * SEAL_SEAL: Prevent further seals from being set on this file + * SEAL_SHRINK: Prevent the file from shrinking + * SEAL_GROW: Prevent the file from growing + * SEAL_WRITE: Prevent write access to the file + * + * As we don't require any trust relationship between two parties, we + * must prevent seals from being removed. Therefore, sealing a file + * only adds a given set of seals to the file, it never touches + * existing seals. Furthermore, the "setting seals"-operation can be + * sealed itself, which basically prevents any further seal from being + * added. + * + * Semantics of sealing are only defined on volatile files. Only + * anonymous tmpfs and hugetlbfs files support sealing. More + * importantly, seals are never written to disk. Therefore, there's + * no plan to support it on other file types. + */ + + if (!(file->f_mode & FMODE_WRITE)) + return -EPERM; + if (seals & ~(unsigned int)F_ALL_SEALS) + return -EINVAL; + + inode_lock(inode); + + file_seals = memfd_file_seals_ptr(file); + if (!file_seals) { + error = -EINVAL; + goto unlock; + } + + if (*file_seals & F_SEAL_SEAL) { + error = -EPERM; + goto unlock; + } + + if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { + error = mapping_deny_writable(file->f_mapping); + if (error) + goto unlock; + + error = memfd_wait_for_pins(file->f_mapping); + if (error) { + mapping_allow_writable(file->f_mapping); + goto unlock; + } + } + + *file_seals |= seals; + error = 0; + +unlock: + inode_unlock(inode); + return error; +} + +static int memfd_get_seals(struct file *file) +{ + unsigned int *seals = memfd_file_seals_ptr(file); + + return seals ? *seals : -EINVAL; +} + +long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long error; + + switch (cmd) { + case F_ADD_SEALS: + /* disallow upper 32bit */ + if (arg > UINT_MAX) + return -EINVAL; + + error = memfd_add_seals(file, arg); + break; + case F_GET_SEALS: + error = memfd_get_seals(file); + break; + default: + error = -EINVAL; + break; + } + + return error; +} + +#define MFD_NAME_PREFIX "memfd:" +#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) +#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) + +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + unsigned int *file_seals; + struct file *file; + int fd, error; + char *name; + long len; + + if (!(flags & MFD_HUGETLB)) { + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + } else { + /* Allow huge page size encoding in flags. */ + if (flags & ~(unsigned int)(MFD_ALL_FLAGS | + (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) + return -EINVAL; + } + + /* length includes terminating zero */ + len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); + if (len <= 0) + return -EFAULT; + if (len > MFD_NAME_MAX_LEN + 1) + return -EINVAL; + + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); + if (!name) + return -ENOMEM; + + strcpy(name, MFD_NAME_PREFIX); + if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + error = -EFAULT; + goto err_name; + } + + /* terminating-zero may have changed after strnlen_user() returned */ + if (name[len + MFD_NAME_PREFIX_LEN - 1]) { + error = -EFAULT; + goto err_name; + } + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + if (flags & MFD_HUGETLB) { + struct user_struct *user = NULL; + + file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, + HUGETLB_ANONHUGE_INODE, + (flags >> MFD_HUGE_SHIFT) & + MFD_HUGE_MASK); + } else + file = shmem_file_setup(name, 0, VM_NORESERVE); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + file->f_flags |= O_RDWR | O_LARGEFILE; + + if (flags & MFD_ALLOW_SEALING) { + file_seals = memfd_file_seals_ptr(file); + *file_seals &= ~F_SEAL_SEAL; + } + + fd_install(fd, file); + kfree(name); + return fd; + +err_fd: + put_unused_fd(fd); +err_name: + kfree(name); + return error; +} diff --git a/mm/memory.c b/mm/memory.c index 01f5464e0fd2..14578158ed20 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -817,17 +817,12 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * PFNMAP mappings in order to support COWable mappings. * */ -#ifdef __HAVE_ARCH_PTE_SPECIAL -# define HAVE_PTE_SPECIAL 1 -#else -# define HAVE_PTE_SPECIAL 0 -#endif struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte, bool with_public_device) { unsigned long pfn = pte_pfn(pte); - if (HAVE_PTE_SPECIAL) { + if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { if (likely(!pte_special(pte))) goto check_pfn; if (vma->vm_ops && vma->vm_ops->find_special_page) @@ -862,7 +857,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return NULL; } - /* !HAVE_PTE_SPECIAL case follows: */ + /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { @@ -881,6 +876,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, if (is_zero_pfn(pfn)) return NULL; + check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); @@ -904,7 +900,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, /* * There is no pmd_special() but there may be special pmds, e.g. * in a direct-access (dax) mapping, so let's just replicate the - * !HAVE_PTE_SPECIAL case from vm_normal_page() here. + * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here. */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { @@ -1933,7 +1929,8 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ - if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && + !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { struct page *page; /* @@ -1955,12 +1952,25 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_mixed); -int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) +/* + * If the insertion of PTE failed because someone else already added a + * different entry in the mean time, we treat that as success as we assume + * the same entry was actually inserted. + */ + +vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn) { - return __vm_insert_mixed(vma, addr, pfn, true); + int err; + + err = __vm_insert_mixed(vma, addr, pfn, true); + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL(vm_insert_mixed_mkwrite); +EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); /* * maps a range of physical memory into the requested pages. the old @@ -2925,7 +2935,7 @@ int do_swap_page(struct vm_fault *vmf) struct swap_info_struct *si = swp_swap_info(entry); if (si->flags & SWP_SYNCHRONOUS_IO && - __swap_count(si, entry) == 1) { + __swap_count(entry) == 1) { /* skip swapcache */ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); @@ -3035,7 +3045,6 @@ int do_swap_page(struct vm_fault *vmf) flush_icache_page(vma, page); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte; @@ -3049,6 +3058,7 @@ int do_swap_page(struct vm_fault *vmf) mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); swap_free(entry); if (mem_cgroup_swap_full(page) || diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 25982467800b..7deb49f69e27 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1237,6 +1237,29 @@ static struct page *next_active_pageblock(struct page *page) return page + pageblock_nr_pages; } +static bool is_pageblock_removable_nolock(struct page *page) +{ + struct zone *zone; + unsigned long pfn; + + /* + * We have to be careful here because we are iterating over memory + * sections which are not zone aware so we might end up outside of + * the zone but still within the section. + * We have to take care about the node as well. If the node is offline + * its NODE_DATA will be NULL - see page_zone. + */ + if (!node_online(page_to_nid(page))) + return false; + + zone = page_zone(page); + pfn = page_to_pfn(page); + if (!zone_spans_pfn(zone, pfn)) + return false; + + return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); +} + /* Checks if this range of memory is likely to be hot-removable. */ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { diff --git a/mm/mincore.c b/mm/mincore.c index fc37afe226e6..a66f2052c7b1 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -68,8 +68,16 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) */ if (radix_tree_exceptional_entry(page)) { swp_entry_t swp = radix_to_swp_entry(page); - page = find_get_page(swap_address_space(swp), - swp_offset(swp)); + struct swap_info_struct *si; + + /* Prevent swap device to being swapoff under us */ + si = get_swap_device(swp); + if (si) { + page = find_get_page(swap_address_space(swp), + swp_offset(swp)); + put_swap_device(si); + } else + page = NULL; } } else page = find_get_page(mapping, pgoff); diff --git a/mm/mmap.c b/mm/mmap.c index d817764a9974..d1eb87ef4b1a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3277,7 +3277,7 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) mm->data_vm += npages; } -static int special_mapping_fault(struct vm_fault *vmf); +static vm_fault_t special_mapping_fault(struct vm_fault *vmf); /* * Having a close hook prevents vma merging regardless of flags. @@ -3316,7 +3316,7 @@ static const struct vm_operations_struct legacy_special_mapping_vmops = { .fault = special_mapping_fault, }; -static int special_mapping_fault(struct vm_fault *vmf) +static vm_fault_t special_mapping_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; diff --git a/mm/nommu.c b/mm/nommu.c index 13723736d38f..4452d8bd9ae4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1763,7 +1763,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -int filemap_fault(struct vm_fault *vmf) +vm_fault_t filemap_fault(struct vm_fault *vmf) { BUG(); return 0; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8ba6cb88cf58..310babe45e93 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -256,7 +256,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) int nid; if (is_memcg_oom(oc)) { - oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1; + oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1; return CONSTRAINT_MEMCG; } @@ -304,7 +304,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) return CONSTRAINT_NONE; } -static int oom_evaluate_task(struct task_struct *task, void *arg) +int oom_evaluate_task(struct task_struct *task, void *arg) { struct oom_control *oc = arg; unsigned long points; @@ -338,26 +338,26 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) goto next; /* Prefer thread group leaders for display purposes */ - if (points == oc->chosen_points && thread_group_leader(oc->chosen)) + if (points == oc->chosen_points && thread_group_leader(oc->chosen_task)) goto next; select: - if (oc->chosen) - put_task_struct(oc->chosen); + if (oc->chosen_task) + put_task_struct(oc->chosen_task); get_task_struct(task); - oc->chosen = task; + oc->chosen_task = task; oc->chosen_points = points; next: return 0; abort: - if (oc->chosen) - put_task_struct(oc->chosen); - oc->chosen = (void *)-1UL; + if (oc->chosen_task) + put_task_struct(oc->chosen_task); + oc->chosen_task = INFLIGHT_VICTIM; return 1; } /* * Simple selection loop. We choose the process with the highest number of - * 'points'. In case scan was aborted, oc->chosen is set to -1. + * 'points'. In case scan was aborted, oc->chosen_task is set to -1. */ static void select_bad_process(struct oom_control *oc) { @@ -421,6 +421,8 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p) { + enum oom_constraint constraint = constrained_alloc(oc); + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", current->comm, oc->gfp_mask, &oc->gfp_mask, nodemask_pr_args(oc->nodemask), oc->order, @@ -430,8 +432,26 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) cpuset_print_current_mems_allowed(); dump_stack(); + pr_info("oom-kill: constraint=CONSTRAINT_"); + switch (constraint) { + case CONSTRAINT_NONE: + pr_cont("NONE "); + break; + case CONSTRAINT_CPUSET: + pr_cont("CPUSET "); + break; + case CONSTRAINT_MEMORY_POLICY: + pr_cont("MEMORY_POLICY "); + break; + default: + pr_cont("MEMCG "); + break; + } + pr_cont("nodemask=%*pbl ", nodemask_pr_args(oc->nodemask)); + mem_cgroup_print_oom_context(oc->memcg, p); + pr_cont("\n"); if (is_memcg_oom(oc)) - mem_cgroup_print_oom_info(oc->memcg, p); + mem_cgroup_print_oom_meminfo(oc->memcg); else { show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); if (is_dump_unreclaim_slabs()) @@ -835,67 +855,22 @@ static bool task_will_free_mem(struct task_struct *task) return ret; } -static void oom_kill_process(struct oom_control *oc, const char *message) +static void __oom_kill_process(struct task_struct *victim) { - struct task_struct *p = oc->chosen; - unsigned int points = oc->chosen_points; - struct task_struct *victim = p; - struct task_struct *child; - struct task_struct *t; + struct task_struct *p; struct mm_struct *mm; - unsigned int victim_points = 0; - static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); bool can_oom_reap = true; /* - * If the task is already exiting, don't alarm the sysadmin or kill - * its children or threads, just give it access to memory reserves - * so it can die quickly + * __oom_kill_process() is used to kill all tasks belonging to + * the selected memory cgroup, so we should check that we're not + * trying to kill an unkillable task. */ - task_lock(p); - if (task_will_free_mem(p)) { - mark_oom_victim(p); - wake_oom_reaper(p); - task_unlock(p); - put_task_struct(p); + if (is_global_init(victim) || (victim->flags & PF_KTHREAD) || + victim->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + put_task_struct(victim); return; } - task_unlock(p); - - if (__ratelimit(&oom_rs)) - dump_header(oc, p); - - pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", - message, task_pid_nr(p), p->comm, points); - - /* - * If any of p's children has a different mm and is eligible for kill, - * the one with the highest oom_badness() score is sacrificed for its - * parent. This attempts to lose the minimal amount of work done while - * still freeing memory. - */ - read_lock(&tasklist_lock); - for_each_thread(p, t) { - list_for_each_entry(child, &t->children, sibling) { - unsigned int child_points; - - if (process_shares_mm(child, p->mm)) - continue; - /* - * oom_badness() returns 0 if the thread is unkillable - */ - child_points = oom_badness(child, - oc->memcg, oc->nodemask, oc->totalpages); - if (child_points > victim_points) { - put_task_struct(victim); - victim = child; - victim_points = child_points; - get_task_struct(victim); - } - } - } - read_unlock(&tasklist_lock); p = find_lock_task_mm(victim); if (!p) { @@ -970,6 +945,108 @@ static void oom_kill_process(struct oom_control *oc, const char *message) } #undef K +static void oom_kill_process(struct oom_control *oc, const char *message) +{ + struct task_struct *p = oc->chosen_task; + unsigned int points = oc->chosen_points; + struct task_struct *victim = p; + struct task_struct *child; + struct task_struct *t; + unsigned int victim_points = 0; + static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + /* + * If the task is already exiting, don't alarm the sysadmin or kill + * its children or threads, just give it access to memory reserves + * so it can die quickly + */ + task_lock(p); + if (task_will_free_mem(p)) { + mark_oom_victim(p); + wake_oom_reaper(p); + task_unlock(p); + put_task_struct(p); + return; + } + task_unlock(p); + + if (__ratelimit(&oom_rs)) + dump_header(oc, p); + + pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", + message, task_pid_nr(p), p->comm, points); + + /* + * If any of p's children has a different mm and is eligible for kill, + * the one with the highest oom_badness() score is sacrificed for its + * parent. This attempts to lose the minimal amount of work done while + * still freeing memory. + */ + read_lock(&tasklist_lock); + for_each_thread(p, t) { + list_for_each_entry(child, &t->children, sibling) { + unsigned int child_points; + + if (process_shares_mm(child, p->mm)) + continue; + /* + * oom_badness() returns 0 if the thread is unkillable + */ + child_points = oom_badness(child, + oc->memcg, oc->nodemask, oc->totalpages); + if (child_points > victim_points) { + put_task_struct(victim); + victim = child; + victim_points = child_points; + get_task_struct(victim); + } + } + } + read_unlock(&tasklist_lock); + + __oom_kill_process(victim); +} + +static int oom_kill_memcg_member(struct task_struct *task, void *unused) +{ + get_task_struct(task); + __oom_kill_process(task); + return 0; +} + +static bool oom_kill_memcg_victim(struct oom_control *oc) +{ + if (oc->chosen_memcg == NULL || oc->chosen_memcg == INFLIGHT_VICTIM) + return oc->chosen_memcg; + + /* + * If memory.oom_group is set, kill all tasks belonging to the sub-tree + * of the chosen memory cgroup, otherwise kill the task with the biggest + * memory footprint. + */ + if (mem_cgroup_oom_group(oc->chosen_memcg)) { + mem_cgroup_scan_tasks(oc->chosen_memcg, oom_kill_memcg_member, + NULL); + /* We have one or more terminating processes at this point. */ + oc->chosen_task = INFLIGHT_VICTIM; + } else { + oc->chosen_points = 0; + oc->chosen_task = NULL; + mem_cgroup_scan_tasks(oc->chosen_memcg, oom_evaluate_task, oc); + + if (oc->chosen_task == NULL || + oc->chosen_task == INFLIGHT_VICTIM) + goto out; + + __oom_kill_process(oc->chosen_task); + } + +out: + mem_cgroup_put(oc->chosen_memcg); + return oc->chosen_task; +} + /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ @@ -1022,6 +1099,7 @@ bool out_of_memory(struct oom_control *oc) { unsigned long freed = 0; enum oom_constraint constraint = CONSTRAINT_NONE; + bool delay = false; /* if set, delay next allocation attempt */ if (oom_killer_disabled) return false; @@ -1066,27 +1144,37 @@ bool out_of_memory(struct oom_control *oc) current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); - oc->chosen = current; + oc->chosen_task = current; oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); return true; } + if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc)) { + delay = true; + goto out; + } + select_bad_process(oc); /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { + if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { dump_header(oc, NULL); panic("Out of memory and no killable processes...\n"); } - if (oc->chosen && oc->chosen != (void *)-1UL) { + if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) { oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : "Memory cgroup out of memory"); - /* - * Give the killed process a good chance to exit before trying - * to allocate memory again. - */ - schedule_timeout_killable(1); + delay = true; } - return !!oc->chosen; + +out: + /* + * Give the killed process a good chance to exit before trying + * to allocate memory again. + */ + if (delay) + schedule_timeout_killable(1); + + return !!oc->chosen_task; } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 86a0dcf29e2e..ccf80e97181a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -705,16 +705,14 @@ static inline void rmv_page_order(struct page *page) /* * This function checks whether a page is free && is the buddy - * we can do coalesce a page and its buddy if + * we can coalesce a page and its buddy if * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we set ->_mapcount - * PAGE_BUDDY_MAPCOUNT_VALUE. - * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is - * serialized by zone->lock. + * For recording whether a page is in the buddy system, we set PageBuddy. + * Setting, clearing, and testing PageBuddy is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ @@ -759,9 +757,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with _mapcount - * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) - * field. + * free pages of length of (1 << order) and marked with PageBuddy. + * Page's order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. @@ -946,7 +943,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) } switch (page - head_page) { case 1: - /* the first tail page: ->mapping is compound_mapcount() */ + /* the first tail page: ->mapping may be compound_mapcount() */ if (unlikely(compound_mapcount(page))) { bad_page(page, "nonzero compound_mapcount", 0); goto out; @@ -955,7 +952,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) case 2: /* * the second tail page: ->mapping is - * page_deferred_list().next -- ignore value. + * deferred_list.next -- ignore value. */ break; default: @@ -3701,7 +3698,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla #endif /* CONFIG_COMPACTION */ #ifdef CONFIG_LOCKDEP -struct lockdep_map __fs_reclaim_map = +static struct lockdep_map __fs_reclaim_map = STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); static bool __need_fs_reclaim(gfp_t gfp_mask) @@ -3726,17 +3723,27 @@ static bool __need_fs_reclaim(gfp_t gfp_mask) return true; } +void __fs_reclaim_acquire(void) +{ + lock_map_acquire(&__fs_reclaim_map); +} + +void __fs_reclaim_release(void) +{ + lock_map_release(&__fs_reclaim_map); +} + void fs_reclaim_acquire(gfp_t gfp_mask) { if (__need_fs_reclaim(gfp_mask)) - lock_map_acquire(&__fs_reclaim_map); + __fs_reclaim_acquire(); } EXPORT_SYMBOL_GPL(fs_reclaim_acquire); void fs_reclaim_release(gfp_t gfp_mask) { if (__need_fs_reclaim(gfp_mask)) - lock_map_release(&__fs_reclaim_map); + __fs_reclaim_release(); } EXPORT_SYMBOL_GPL(fs_reclaim_release); #endif @@ -3754,8 +3761,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - noreclaim_flag = memalloc_noreclaim_save(); fs_reclaim_acquire(gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3763,8 +3770,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, ac->nodemask); current->reclaim_state = NULL; - fs_reclaim_release(gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(gfp_mask); cond_resched(); @@ -4162,7 +4169,6 @@ retry: * orientated. */ if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { - ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); } @@ -4326,8 +4332,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, } /* Determine whether to spread dirty pages and what the first usable zone */ -static inline void finalise_ac(gfp_t gfp_mask, - unsigned int order, struct alloc_context *ac) +static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) { /* Dirty zone balancing only done in the fast path */ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); @@ -4358,7 +4363,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; - finalise_ac(gfp_mask, order, &ac); + finalise_ac(gfp_mask, &ac); /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); @@ -6326,18 +6331,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long size, realsize, freesize, memmap_pages; + unsigned long size, freesize, memmap_pages; unsigned long zone_start_pfn = zone->zone_start_pfn; size = zone->spanned_pages; - realsize = freesize = zone->present_pages; + freesize = zone->present_pages; /* * Adjust freesize so that it accounts for how much memory * is used by this zone for memmap. This affects the watermark * and per-cpu initialisations */ - memmap_pages = calc_memmap_size(size, realsize); + memmap_pages = calc_memmap_size(size, freesize); if (!is_highmem_idx(j)) { if (freesize >= memmap_pages) { freesize -= memmap_pages; @@ -6369,7 +6374,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) * when the bootmem allocator frees pages into the buddy system. * And all highmem pages will be managed by the buddy system. */ - zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; + zone->managed_pages = freesize; #ifdef CONFIG_NUMA zone->node = nid; #endif @@ -7779,29 +7784,6 @@ unmovable: return true; } -bool is_pageblock_removable_nolock(struct page *page) -{ - struct zone *zone; - unsigned long pfn; - - /* - * We have to be careful here because we are iterating over memory - * sections which are not zone aware so we might end up outside of - * the zone but still within the section. - * We have to take care about the node as well. If the node is offline - * its NODE_DATA will be NULL - see page_zone. - */ - if (!node_online(page_to_nid(page))) - return false; - - zone = page_zone(page); - pfn = page_to_pfn(page); - if (!zone_spans_pfn(zone, pfn)) - return false; - - return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); -} - #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) static unsigned long pfn_max_align_down(unsigned long pfn) diff --git a/mm/page_counter.c b/mm/page_counter.c index 2a8df3ad60a4..de31470655f6 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -13,6 +13,40 @@ #include <linux/bug.h> #include <asm/page.h> +static void propagate_protected_usage(struct page_counter *c, + unsigned long usage) +{ + unsigned long protected, old_protected; + long delta; + + if (!c->parent) + return; + + if (c->min || atomic_long_read(&c->min_usage)) { + if (usage <= c->min) + protected = usage; + else + protected = 0; + + old_protected = atomic_long_xchg(&c->min_usage, protected); + delta = protected - old_protected; + if (delta) + atomic_long_add(delta, &c->parent->children_min_usage); + } + + if (c->low || atomic_long_read(&c->low_usage)) { + if (usage <= c->low) + protected = usage; + else + protected = 0; + + old_protected = atomic_long_xchg(&c->low_usage, protected); + delta = protected - old_protected; + if (delta) + atomic_long_add(delta, &c->parent->children_low_usage); + } +} + /** * page_counter_cancel - take pages out of the local counter * @counter: counter @@ -22,7 +56,8 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) { long new; - new = atomic_long_sub_return(nr_pages, &counter->count); + new = atomic_long_sub_return(nr_pages, &counter->usage); + propagate_protected_usage(counter, new); /* More uncharges than charges? */ WARN_ON_ONCE(new < 0); } @@ -41,7 +76,8 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) for (c = counter; c; c = c->parent) { long new; - new = atomic_long_add_return(nr_pages, &c->count); + new = atomic_long_add_return(nr_pages, &c->usage); + propagate_protected_usage(counter, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. @@ -82,9 +118,10 @@ bool page_counter_try_charge(struct page_counter *counter, * we either see the new limit or the setter sees the * counter has changed and retries. */ - new = atomic_long_add_return(nr_pages, &c->count); - if (new > c->limit) { - atomic_long_sub(nr_pages, &c->count); + new = atomic_long_add_return(nr_pages, &c->usage); + if (new > c->max) { + atomic_long_sub(nr_pages, &c->usage); + propagate_protected_usage(counter, new); /* * This is racy, but we can live with some * inaccuracy in the failcnt. @@ -93,6 +130,7 @@ bool page_counter_try_charge(struct page_counter *counter, *fail = c; goto failed; } + propagate_protected_usage(counter, new); /* * Just like with failcnt, we can live with some * inaccuracy in the watermark. @@ -123,20 +161,20 @@ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) } /** - * page_counter_limit - limit the number of pages allowed + * page_counter_set_max - set the maximum number of pages allowed * @counter: counter - * @limit: limit to set + * @nr_pages: limit to set * * Returns 0 on success, -EBUSY if the current number of pages on the * counter already exceeds the specified limit. * * The caller must serialize invocations on the same counter. */ -int page_counter_limit(struct page_counter *counter, unsigned long limit) +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) { for (;;) { unsigned long old; - long count; + long usage; /* * Update the limit while making sure that it's not @@ -149,22 +187,56 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit) * the limit, so if it sees the old limit, we see the * modified counter and retry. */ - count = atomic_long_read(&counter->count); + usage = atomic_long_read(&counter->usage); - if (count > limit) + if (usage > nr_pages) return -EBUSY; - old = xchg(&counter->limit, limit); + old = xchg(&counter->max, nr_pages); - if (atomic_long_read(&counter->count) <= count) + if (atomic_long_read(&counter->usage) <= usage) return 0; - counter->limit = old; + counter->max = old; cond_resched(); } } /** + * page_counter_set_min - set the amount of protected memory + * @counter: counter + * @nr_pages: value to set + * + * The caller must serialize invocations on the same counter. + */ +void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + counter->min = nr_pages; + + for (c = counter; c; c = c->parent) + propagate_protected_usage(c, atomic_long_read(&c->usage)); +} + +/** + * page_counter_set_low - set the amount of protected memory + * @counter: counter + * @nr_pages: value to set + * + * The caller must serialize invocations on the same counter. + */ +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + counter->low = nr_pages; + + for (c = counter; c; c = c->parent) + propagate_protected_usage(c, atomic_long_read(&c->usage)); +} + +/** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse * @max: string meaning maximum possible value diff --git a/mm/page_owner.c b/mm/page_owner.c index 75d21a2259b3..77d9e791ae8a 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -274,7 +274,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } @@ -541,7 +541,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) unsigned long block_end_pfn; if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } diff --git a/mm/shmem.c b/mm/shmem.c index 9d6c7e595415..2b686e3f53ad 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -327,7 +327,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { struct radix_tree_node *node; - void **pslot; + void __rcu **pslot; void *item; VM_BUG_ON(!expected); @@ -395,7 +395,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE /* ifdef here to avoid bloating shmem.o when not necessary */ -int shmem_huge __read_mostly; +static int shmem_huge __read_mostly; #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static int shmem_parse_huge(const char *str) @@ -571,6 +571,15 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, } #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) +{ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) && + shmem_huge != SHMEM_HUGE_DENY) + return true; + return false; +} + /* * Like add_to_page_cache_locked, but error if expected item has gone. */ @@ -682,7 +691,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct radix_tree_iter iter; - void **slot; + void __rcu **slot; struct page *page; unsigned long swapped = 0; @@ -988,6 +997,7 @@ static int shmem_getattr(const struct path *path, struct kstat *stat, { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb); if (info->alloced - info->swapped != inode->i_mapping->nrpages) { spin_lock_irq(&info->lock); @@ -995,6 +1005,10 @@ static int shmem_getattr(const struct path *path, struct kstat *stat, spin_unlock_irq(&info->lock); } generic_fillattr(inode, stat); + + if (is_huge_enabled(sb_info)) + stat->blksize = HPAGE_PMD_SIZE; + return 0; } @@ -1098,13 +1112,19 @@ static void shmem_evict_inode(struct inode *inode) static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) { struct radix_tree_iter iter; - void **slot; + void __rcu **slot; unsigned long found = -1; unsigned int checked = 0; rcu_read_lock(); radix_tree_for_each_slot(slot, root, &iter, 0) { - if (*slot == item) { + void *entry = radix_tree_deref_slot(slot); + + if (radix_tree_deref_retry(entry)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + if (entry == item) { found = iter.index; break; } @@ -1322,9 +1342,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (!swap.val) goto redirty; - if (mem_cgroup_try_charge_swap(page, swap)) - goto free_swap; - /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. Do it now before the page is @@ -1353,7 +1370,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } mutex_unlock(&shmem_swaplist_mutex); -free_swap: put_swap_page(page, swap); redirty: set_page_dirty(page); @@ -1931,14 +1947,14 @@ static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, in return ret; } -static int shmem_fault(struct vm_fault *vmf) +static vm_fault_t shmem_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); enum sgp_type sgp; - int error; - int ret = VM_FAULT_LOCKED; + int err; + vm_fault_t ret = VM_FAULT_LOCKED; /* * Trinity finds that probing a hole which tmpfs is punching can @@ -2006,10 +2022,10 @@ static int shmem_fault(struct vm_fault *vmf) else if (vma->vm_flags & VM_HUGEPAGE) sgp = SGP_HUGE; - error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, + err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, gfp, vma, vmf, &ret); - if (error) - return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); + if (err) + return vmf_error(err); return ret; } @@ -2616,241 +2632,6 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) return offset; } -/* - * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, - * so reuse a tag which we firmly believe is never set or cleared on shmem. - */ -#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE -#define LAST_SCAN 4 /* about 150ms max */ - -static void shmem_tag_pins(struct address_space *mapping) -{ - struct radix_tree_iter iter; - void **slot; - pgoff_t start; - struct page *page; - - lru_add_drain(); - start = 0; - rcu_read_lock(); - - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - page = radix_tree_deref_slot(slot); - if (!page || radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - } else if (page_count(page) - page_mapcount(page) > 1) { - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_set(&mapping->i_pages, iter.index, - SHMEM_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); - } - - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } - } - rcu_read_unlock(); -} - -/* - * Setting SEAL_WRITE requires us to verify there's no pending writer. However, - * via get_user_pages(), drivers might have some pending I/O without any active - * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages - * and see whether it has an elevated ref-count. If so, we tag them and wait for - * them to be dropped. - * The caller must guarantee that no new user will acquire writable references - * to those pages to avoid races. - */ -static int shmem_wait_for_pins(struct address_space *mapping) -{ - struct radix_tree_iter iter; - void **slot; - pgoff_t start; - struct page *page; - int error, scan; - - shmem_tag_pins(mapping); - - error = 0; - for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->i_pages, SHMEM_TAG_PINNED)) - break; - - if (!scan) - lru_add_drain_all(); - else if (schedule_timeout_killable((HZ << scan) / 200)) - scan = LAST_SCAN; - - start = 0; - rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, - start, SHMEM_TAG_PINNED) { - - page = radix_tree_deref_slot(slot); - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - page = NULL; - } - - if (page && - page_count(page) - page_mapcount(page) != 1) { - if (scan < LAST_SCAN) - goto continue_resched; - - /* - * On the last scan, we clean up all those tags - * we inserted; but make a note that we still - * found pages pinned. - */ - error = -EBUSY; - } - - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_clear(&mapping->i_pages, - iter.index, SHMEM_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); -continue_resched: - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } - } - rcu_read_unlock(); - } - - return error; -} - -static unsigned int *memfd_file_seals_ptr(struct file *file) -{ - if (file->f_op == &shmem_file_operations) - return &SHMEM_I(file_inode(file))->seals; - -#ifdef CONFIG_HUGETLBFS - if (file->f_op == &hugetlbfs_file_operations) - return &HUGETLBFS_I(file_inode(file))->seals; -#endif - - return NULL; -} - -#define F_ALL_SEALS (F_SEAL_SEAL | \ - F_SEAL_SHRINK | \ - F_SEAL_GROW | \ - F_SEAL_WRITE) - -static int memfd_add_seals(struct file *file, unsigned int seals) -{ - struct inode *inode = file_inode(file); - unsigned int *file_seals; - int error; - - /* - * SEALING - * Sealing allows multiple parties to share a shmem-file but restrict - * access to a specific subset of file operations. Seals can only be - * added, but never removed. This way, mutually untrusted parties can - * share common memory regions with a well-defined policy. A malicious - * peer can thus never perform unwanted operations on a shared object. - * - * Seals are only supported on special shmem-files and always affect - * the whole underlying inode. Once a seal is set, it may prevent some - * kinds of access to the file. Currently, the following seals are - * defined: - * SEAL_SEAL: Prevent further seals from being set on this file - * SEAL_SHRINK: Prevent the file from shrinking - * SEAL_GROW: Prevent the file from growing - * SEAL_WRITE: Prevent write access to the file - * - * As we don't require any trust relationship between two parties, we - * must prevent seals from being removed. Therefore, sealing a file - * only adds a given set of seals to the file, it never touches - * existing seals. Furthermore, the "setting seals"-operation can be - * sealed itself, which basically prevents any further seal from being - * added. - * - * Semantics of sealing are only defined on volatile files. Only - * anonymous shmem files support sealing. More importantly, seals are - * never written to disk. Therefore, there's no plan to support it on - * other file types. - */ - - if (!(file->f_mode & FMODE_WRITE)) - return -EPERM; - if (seals & ~(unsigned int)F_ALL_SEALS) - return -EINVAL; - - inode_lock(inode); - - file_seals = memfd_file_seals_ptr(file); - if (!file_seals) { - error = -EINVAL; - goto unlock; - } - - if (*file_seals & F_SEAL_SEAL) { - error = -EPERM; - goto unlock; - } - - if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { - error = mapping_deny_writable(file->f_mapping); - if (error) - goto unlock; - - error = shmem_wait_for_pins(file->f_mapping); - if (error) { - mapping_allow_writable(file->f_mapping); - goto unlock; - } - } - - *file_seals |= seals; - error = 0; - -unlock: - inode_unlock(inode); - return error; -} - -static int memfd_get_seals(struct file *file) -{ - unsigned int *seals = memfd_file_seals_ptr(file); - - return seals ? *seals : -EINVAL; -} - -long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) -{ - long error; - - switch (cmd) { - case F_ADD_SEALS: - /* disallow upper 32bit */ - if (arg > UINT_MAX) - return -EINVAL; - - error = memfd_add_seals(file, arg); - break; - case F_GET_SEALS: - error = memfd_get_seals(file); - break; - default: - error = -EINVAL; - break; - } - - return error; -} - static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -3428,6 +3209,15 @@ static int shmem_match(struct inode *ino, void *vfh) return ino->i_ino == inum && fh[0] == ino->i_generation; } +/* Find any alias of inode, but prefer a hashed alias */ +static struct dentry *shmem_find_alias(struct inode *inode) +{ + struct dentry *alias = d_find_alias(inode); + + return alias ?: d_find_any_alias(inode); +} + + static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { @@ -3444,7 +3234,7 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { - dentry = d_find_alias(inode); + dentry = shmem_find_alias(inode); iput(inode); } @@ -3673,93 +3463,6 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) return 0; } -#define MFD_NAME_PREFIX "memfd:" -#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) -#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) - -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) - -SYSCALL_DEFINE2(memfd_create, - const char __user *, uname, - unsigned int, flags) -{ - unsigned int *file_seals; - struct file *file; - int fd, error; - char *name; - long len; - - if (!(flags & MFD_HUGETLB)) { - if (flags & ~(unsigned int)MFD_ALL_FLAGS) - return -EINVAL; - } else { - /* Allow huge page size encoding in flags. */ - if (flags & ~(unsigned int)(MFD_ALL_FLAGS | - (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) - return -EINVAL; - } - - /* length includes terminating zero */ - len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); - if (len <= 0) - return -EFAULT; - if (len > MFD_NAME_MAX_LEN + 1) - return -EINVAL; - - name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); - if (!name) - return -ENOMEM; - - strcpy(name, MFD_NAME_PREFIX); - if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { - error = -EFAULT; - goto err_name; - } - - /* terminating-zero may have changed after strnlen_user() returned */ - if (name[len + MFD_NAME_PREFIX_LEN - 1]) { - error = -EFAULT; - goto err_name; - } - - fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); - if (fd < 0) { - error = fd; - goto err_name; - } - - if (flags & MFD_HUGETLB) { - struct user_struct *user = NULL; - - file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, - HUGETLB_ANONHUGE_INODE, - (flags >> MFD_HUGE_SHIFT) & - MFD_HUGE_MASK); - } else - file = shmem_file_setup(name, 0, VM_NORESERVE); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_fd; - } - file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - file->f_flags |= O_RDWR | O_LARGEFILE; - - if (flags & MFD_ALLOW_SEALING) { - file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; - } - - fd_install(fd, file); - kfree(name); - return fd; - -err_fd: - put_unused_fd(fd); -err_name: - kfree(name); - return error; -} - #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) diff --git a/mm/slab.c b/mm/slab.c index 2f308253c3d7..4191d0582039 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1235,8 +1235,6 @@ void __init kmem_cache_init(void) { int i; - BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < - sizeof(struct rcu_head)); kmem_cache = &kmem_cache_boot; if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1) @@ -1883,8 +1881,8 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, struct kmem_cache *cachep; cachep = find_mergeable(size, align, flags, name, ctor); - if (cachep) { - cachep->refcount++; + if (cachep && kmem_cache_tryget(cachep)) { + cachep->shared_count++; /* * Adjust the object sizes so that we clear @@ -2665,6 +2663,7 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, invalid_mask, &invalid_mask, flags, &flags); dump_stack(); } + WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); check_irq_off(); @@ -3071,6 +3070,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { + WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); if (!objp) return objp; if (cachep->flags & SLAB_POISON) { diff --git a/mm/slab.h b/mm/slab.h index 68bdf498da3b..10b99dc08d59 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -25,7 +25,8 @@ struct kmem_cache { unsigned int useroffset;/* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ const char *name; /* Slab name for sysfs */ - int refcount; /* Use counter */ + refcount_t refcount; /* Refcount for root kmem cache */ + int shared_count; /* Number of kmem caches sharing this cache */ void (*ctor)(void *); /* Called on object slot creation */ struct list_head list; /* List of all slab caches on the system */ }; @@ -203,6 +204,8 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); +extern void kmem_cache_put_locked(struct kmem_cache *s); + #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) /* List of all root caches. */ @@ -295,7 +298,6 @@ extern void slab_init_memcg_params(struct kmem_cache *); extern void memcg_link_cache(struct kmem_cache *s); extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, void (*deact_fn)(struct kmem_cache *)); - #else /* CONFIG_MEMCG && !CONFIG_SLOB */ /* If !memcg, all caches are root. */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 98dcdc352062..b383149e04ec 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -306,7 +306,7 @@ int slab_unmergeable(struct kmem_cache *s) /* * We may have set a slab to be unmergeable during bootstrap. */ - if (s->refcount < 0) + if (s->shared_count < 0) return 1; return 0; @@ -391,7 +391,8 @@ static struct kmem_cache *create_cache(const char *name, if (err) goto out_free_cache; - s->refcount = 1; + s->shared_count = 1; + refcount_set(&s->refcount, 1); list_add(&s->list, &slab_caches); memcg_link_cache(s); out: @@ -611,6 +612,18 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, if (memcg->kmem_state != KMEM_ONLINE) goto out_unlock; + /* + * The root cache has been requested to be destroyed while its memcg + * cache was in creation queue. + * + * The shared_count can be out-dated or can be incremented after return. + * No big worries, at worst the creation of memcg kmem_cache is delayed. + * The next allocation will again trigger the memcg kmem_cache creation + * request. + */ + if (!root_cache->shared_count) + goto out_unlock; + idx = memcg_cache_id(memcg); arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches, lockdep_is_held(&slab_mutex)); @@ -663,6 +676,8 @@ static void kmemcg_deactivate_workfn(struct work_struct *work) { struct kmem_cache *s = container_of(work, struct kmem_cache, memcg_params.deact_work); + struct kmem_cache *root = s->memcg_params.root_cache; + struct mem_cgroup *memcg = s->memcg_params.memcg; get_online_cpus(); get_online_mems(); @@ -677,7 +692,8 @@ static void kmemcg_deactivate_workfn(struct work_struct *work) put_online_cpus(); /* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */ - css_put(&s->memcg_params.memcg->css); + css_put(&memcg->css); + kmem_cache_put(root); } static void kmemcg_deactivate_rcufn(struct rcu_head *head) @@ -712,6 +728,10 @@ void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, WARN_ON_ONCE(s->memcg_params.deact_fn)) return; + /* Make sure root kmem_cache does not get destroyed in the middle */ + if (!kmem_cache_tryget(s->memcg_params.root_cache)) + return; + /* pin memcg so that @s doesn't get destroyed in the middle */ css_get(&s->memcg_params.memcg->css); @@ -838,21 +858,17 @@ void slab_kmem_cache_release(struct kmem_cache *s) kmem_cache_free(kmem_cache, s); } -void kmem_cache_destroy(struct kmem_cache *s) +static void __kmem_cache_destroy(struct kmem_cache *s, bool lock) { int err; - if (unlikely(!s)) - return; - - get_online_cpus(); - get_online_mems(); - - mutex_lock(&slab_mutex); + if (lock) { + get_online_cpus(); + get_online_mems(); + mutex_lock(&slab_mutex); + } - s->refcount--; - if (s->refcount) - goto out_unlock; + VM_BUG_ON(s->shared_count); err = shutdown_memcg_caches(s); if (!err) @@ -863,11 +879,75 @@ void kmem_cache_destroy(struct kmem_cache *s) s->name); dump_stack(); } -out_unlock: - mutex_unlock(&slab_mutex); - put_online_mems(); - put_online_cpus(); + if (lock) { + mutex_unlock(&slab_mutex); + put_online_mems(); + put_online_cpus(); + } +} + +/* + * kmem_cache_tryget - Try to get a reference on a kmem_cache + * @s: target kmem_cache + * + * Obtain a reference on a kmem_cache unless it already has reached zero and is + * being released. The caller needs to ensure that kmem_cache is accessible. + * Currently only root kmem_cache supports reference counting. + */ +bool kmem_cache_tryget(struct kmem_cache *s) +{ + if (is_root_cache(s)) + return refcount_inc_not_zero(&s->refcount); + return false; +} + +/* + * kmem_cache_put - Put a reference on a kmem_cache + * @s: target kmem_cache + * + * Put a reference obtained via kmem_cache_tryget(). This function can not be + * called within slab_mutex as it can trigger a destruction of a kmem_cache + * which requires slab_mutex. + */ +void kmem_cache_put(struct kmem_cache *s) +{ + if (is_root_cache(s) && + refcount_dec_and_test(&s->refcount)) + __kmem_cache_destroy(s, true); +} + +/* + * kmem_cache_put_locked - Put a reference on a kmem_cache while holding + * slab_mutex + * @s: target kmem_cache + * + * Put a reference obtained via kmem_cache_tryget(). Use this function instead + * of kmem_cache_put if the caller has already acquired slab_mutex. + * + * At the moment this function is not exposed externally and is used by SLUB. + */ +void kmem_cache_put_locked(struct kmem_cache *s) +{ + if (is_root_cache(s) && + refcount_dec_and_test(&s->refcount)) + __kmem_cache_destroy(s, false); +} + +void kmem_cache_destroy(struct kmem_cache *s) +{ + if (unlikely(!s)) + return; + + /* + * It is safe to decrement shared_count without any lock. In + * __kmem_cache_alias the kmem_cache's refcount is elevated before + * incrementing shared_count and below the reference is dropped after + * decrementing shared_count. At worst shared_count can be outdated for + * a small window but that is tolerable. + */ + s->shared_count--; + kmem_cache_put(s); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -919,7 +999,8 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n", name, size, err); - s->refcount = -1; /* Exempt from merging for now */ + s->shared_count = -1; /* Exempt from merging for now */ + refcount_set(&s->refcount, 1); } struct kmem_cache *__init create_kmalloc_cache(const char *name, @@ -934,7 +1015,8 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, create_boot_cache(s, name, size, flags, useroffset, usersize); list_add(&s->list, &slab_caches); memcg_link_cache(s); - s->refcount = 1; + s->shared_count = 1; + refcount_set(&s->refcount, 1); return s; } diff --git a/mm/slob.c b/mm/slob.c index 623e8a5c46ce..307c2c9feb44 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -555,8 +555,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) flags, node); } - if (b && c->ctor) + if (b && c->ctor) { + WARN_ON_ONCE(flags & __GFP_ZERO); c->ctor(b); + } kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); return b; diff --git a/mm/slub.c b/mm/slub.c index 44aa7847324a..92ada5a3346b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -52,11 +52,11 @@ * and to synchronize major metadata changes to slab cache structures. * * The slab_lock is only used for debugging and on arches that do not - * have the ability to do a cmpxchg_double. It only protects the second - * double word in the page struct. Meaning + * have the ability to do a cmpxchg_double. It only protects: * A. page->freelist -> List of object free in a page - * B. page->counters -> Counters of objects - * C. page->frozen -> frozen state + * B. page->inuse -> Number of objects in use + * C. page->objects -> Number of objects in page + * D. page->frozen -> frozen state * * If a slab is frozen then it is exempt from list management. It is not * on any list. The processor that froze the slab is the one who can @@ -316,16 +316,16 @@ static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } -static inline unsigned int order_objects(unsigned int order, unsigned int size, unsigned int reserved) +static inline unsigned int order_objects(unsigned int order, unsigned int size) { - return (((unsigned int)PAGE_SIZE << order) - reserved) / size; + return ((unsigned int)PAGE_SIZE << order) / size; } static inline struct kmem_cache_order_objects oo_make(unsigned int order, - unsigned int size, unsigned int reserved) + unsigned int size) { struct kmem_cache_order_objects x = { - (order << OO_SHIFT) + order_objects(order, size, reserved) + (order << OO_SHIFT) + order_objects(order, size) }; return x; @@ -356,21 +356,6 @@ static __always_inline void slab_unlock(struct page *page) __bit_spin_unlock(PG_locked, &page->flags); } -static inline void set_page_slub_counters(struct page *page, unsigned long counters_new) -{ - struct page tmp; - tmp.counters = counters_new; - /* - * page->counters can cover frozen/inuse/objects as well - * as page->_refcount. If we assign to ->counters directly - * we run the risk of losing updates to page->_refcount, so - * be careful and only assign to the fields we need. - */ - page->frozen = tmp.frozen; - page->inuse = tmp.inuse; - page->objects = tmp.objects; -} - /* Interrupts must be disabled (for the fallback code to work right) */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, @@ -392,7 +377,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; - set_page_slub_counters(page, counters_new); + page->counters = counters_new; slab_unlock(page); return true; } @@ -431,7 +416,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; - set_page_slub_counters(page, counters_new); + page->counters = counters_new; slab_unlock(page); local_irq_restore(flags); return true; @@ -711,7 +696,7 @@ void object_err(struct kmem_cache *s, struct page *page, print_trailer(s, page, object); } -static void slab_err(struct kmem_cache *s, struct page *page, +static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...) { va_list args; @@ -847,7 +832,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = (PAGE_SIZE << compound_order(page)) - s->reserved; + length = PAGE_SIZE << compound_order(page); end = start + length; remainder = length % s->size; if (!remainder) @@ -936,7 +921,7 @@ static int check_slab(struct kmem_cache *s, struct page *page) return 0; } - maxobj = order_objects(compound_order(page), s->size, s->reserved); + maxobj = order_objects(compound_order(page), s->size); if (page->objects > maxobj) { slab_err(s, page, "objects %u > max %u", page->objects, maxobj); @@ -986,7 +971,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - max_objects = order_objects(compound_order(page), s->size, s->reserved); + max_objects = order_objects(compound_order(page), s->size); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -1694,24 +1679,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); - page_mapcount_reset(page); + page->mapping = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; memcg_uncharge_slab(page, order, s); __free_pages(page, order); } -#define need_reserve_slab_rcu \ - (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) - static void rcu_free_slab(struct rcu_head *h) { - struct page *page; - - if (need_reserve_slab_rcu) - page = virt_to_head_page(h); - else - page = container_of((struct list_head *)h, struct page, lru); + struct page *page = container_of(h, struct page, rcu_head); __free_slab(page->slab_cache, page); } @@ -1719,19 +1696,7 @@ static void rcu_free_slab(struct rcu_head *h) static void free_slab(struct kmem_cache *s, struct page *page) { if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { - struct rcu_head *head; - - if (need_reserve_slab_rcu) { - int order = compound_order(page); - int offset = (PAGE_SIZE << order) - s->reserved; - - VM_BUG_ON(s->reserved != sizeof(*head)); - head = page_address(page) + offset; - } else { - head = &page->rcu_head; - } - - call_rcu(head, rcu_free_slab); + call_rcu(&page->rcu_head, rcu_free_slab); } else __free_slab(s, page); } @@ -2444,6 +2409,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, struct kmem_cache_cpu *c = *pc; struct page *page; + WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); + freelist = get_partial(s, flags, node, c); if (freelist) @@ -3226,21 +3193,21 @@ static unsigned int slub_min_objects; */ static inline unsigned int slab_order(unsigned int size, unsigned int min_objects, unsigned int max_order, - unsigned int fract_leftover, unsigned int reserved) + unsigned int fract_leftover) { unsigned int min_order = slub_min_order; unsigned int order; - if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) + if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; - for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved)); + for (order = max(min_order, (unsigned int)get_order(min_objects * size)); order <= max_order; order++) { unsigned int slab_size = (unsigned int)PAGE_SIZE << order; unsigned int rem; - rem = (slab_size - reserved) % size; + rem = slab_size % size; if (rem <= slab_size / fract_leftover) break; @@ -3249,7 +3216,7 @@ static inline unsigned int slab_order(unsigned int size, return order; } -static inline int calculate_order(unsigned int size, unsigned int reserved) +static inline int calculate_order(unsigned int size) { unsigned int order; unsigned int min_objects; @@ -3266,7 +3233,7 @@ static inline int calculate_order(unsigned int size, unsigned int reserved) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); - max_objects = order_objects(slub_max_order, size, reserved); + max_objects = order_objects(slub_max_order, size); min_objects = min(min_objects, max_objects); while (min_objects > 1) { @@ -3275,7 +3242,7 @@ static inline int calculate_order(unsigned int size, unsigned int reserved) fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, - slub_max_order, fraction, reserved); + slub_max_order, fraction); if (order <= slub_max_order) return order; fraction /= 2; @@ -3287,14 +3254,14 @@ static inline int calculate_order(unsigned int size, unsigned int reserved) * We were unable to place multiple objects in a slab. Now * lets see if we can place a single object there. */ - order = slab_order(size, 1, slub_max_order, 1, reserved); + order = slab_order(size, 1, slub_max_order, 1); if (order <= slub_max_order) return order; /* * Doh this slab cannot be placed using slub_max_order. */ - order = slab_order(size, 1, MAX_ORDER, 1, reserved); + order = slab_order(size, 1, MAX_ORDER, 1); if (order < MAX_ORDER) return order; return -ENOSYS; @@ -3562,7 +3529,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) if (forced_order >= 0) order = forced_order; else - order = calculate_order(size, s->reserved); + order = calculate_order(size); if ((int)order < 0) return 0; @@ -3580,8 +3547,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) /* * Determine the number of objects per slab */ - s->oo = oo_make(order, size, s->reserved); - s->min = oo_make(get_order(size), size, s->reserved); + s->oo = oo_make(order, size); + s->min = oo_make(get_order(size), size); if (oo_objects(s->oo) > oo_objects(s->max)) s->max = s->oo; @@ -3591,14 +3558,10 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); - s->reserved = 0; #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); #endif - if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) - s->reserved = sizeof(struct rcu_head); - if (!calculate_sizes(s, -1)) goto error; if (disable_higher_order_debug) { @@ -4239,12 +4202,6 @@ void __init kmem_cache_init(void) SLAB_HWCACHE_ALIGN, 0, 0); kmem_cache = bootstrap(&boot_kmem_cache); - - /* - * Allocate kmem_cache_node properly from the kmem_cache slab. - * kmem_cache_node is separately allocated so no need to - * update any list pointers. - */ kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ @@ -4274,8 +4231,8 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, struct kmem_cache *s, *c; s = find_mergeable(size, align, flags, name, ctor); - if (s) { - s->refcount++; + if (s && kmem_cache_tryget(s)) { + s->shared_count++; /* * Adjust the object sizes so that we clear @@ -4290,7 +4247,8 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, } if (sysfs_slab_alias(s, name)) { - s->refcount--; + s->shared_count--; + kmem_cache_put_locked(s); s = NULL; } } @@ -5013,7 +4971,8 @@ SLAB_ATTR_RO(ctor); static ssize_t aliases_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1); + return sprintf(buf, "%d\n", + s->shared_count < 0 ? 0 : s->shared_count - 1); } SLAB_ATTR_RO(aliases); @@ -5117,12 +5076,6 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(destroy_by_rcu); -static ssize_t reserved_show(struct kmem_cache *s, char *buf) -{ - return sprintf(buf, "%u\n", s->reserved); -} -SLAB_ATTR_RO(reserved); - #ifdef CONFIG_SLUB_DEBUG static ssize_t slabs_show(struct kmem_cache *s, char *buf) { @@ -5166,7 +5119,7 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, * as well as cause other issues like converting a mergeable * cache into an umergeable one. */ - if (s->refcount > 1) + if (s->shared_count > 1) return -EINVAL; s->flags &= ~SLAB_TRACE; @@ -5284,7 +5237,7 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf) static ssize_t failslab_store(struct kmem_cache *s, const char *buf, size_t length) { - if (s->refcount > 1) + if (s->shared_count > 1) return -EINVAL; s->flags &= ~SLAB_FAILSLAB; @@ -5435,7 +5388,6 @@ static struct attribute *slab_attrs[] = { &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, &shrink_attr.attr, - &reserved_attr.attr, &slabs_cpu_partial_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index bd0276d5f66b..640e68f8324b 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -303,7 +303,6 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, ms = __nr_to_section(pnum); pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); - ms->section_mem_map = 0; } if (vmemmap_buf_start) { diff --git a/mm/sparse.c b/mm/sparse.c index 73dc2fcc0eab..b2848cc6e32a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -190,18 +190,22 @@ static inline int next_present_section_nr(int section_nr) section_nr++; if (present_section_nr(section_nr)) return section_nr; - } while ((section_nr < NR_MEM_SECTIONS) && - (section_nr <= __highest_present_section_nr)); + } while ((section_nr <= __highest_present_section_nr)); return -1; } #define for_each_present_section_nr(start, section_nr) \ for (section_nr = next_present_section_nr(start-1); \ ((section_nr >= 0) && \ - (section_nr < NR_MEM_SECTIONS) && \ (section_nr <= __highest_present_section_nr)); \ section_nr = next_present_section_nr(section_nr)) +/* + * Record how many memory sections are marked as present + * during system bootup. + */ +static int __initdata nr_present_sections; + /* Record a memory area against a node. */ void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -231,6 +235,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) ms->section_mem_map = sparse_encode_early_nid(nid) | SECTION_IS_ONLINE; section_mark_present(ms); + nr_present_sections++; } } } @@ -446,7 +451,6 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, ms = __nr_to_section(pnum); pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); - ms->section_mem_map = 0; } } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ @@ -474,7 +478,6 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); - ms->section_mem_map = 0; return NULL; } #endif @@ -486,10 +489,12 @@ void __weak __meminit vmemmap_populate_print_last(void) /** * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap * @map: usemap_map for pageblock flags or mmap_map for vmemmap + * @unit_size: size of map unit */ static void __init alloc_usemap_and_memmap(void (*alloc_func) (void *, unsigned long, unsigned long, - unsigned long, int), void *data) + unsigned long, int), void *data, + int data_unit_size) { unsigned long pnum; unsigned long map_count; @@ -524,7 +529,7 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func) map_count = 1; } /* ok, last chunk */ - alloc_func(data, pnum_begin, NR_MEM_SECTIONS, + alloc_func(data, pnum_begin, __highest_present_section_nr+1, map_count, nodeid_begin); } @@ -566,7 +571,8 @@ void __init sparse_init(void) if (!usemap_map) panic("can not allocate usemap_map\n"); alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, - (void *)usemap_map); + (void *)usemap_map, + sizeof(usemap_map[0])); #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER size2 = sizeof(struct page *) * NR_MEM_SECTIONS; @@ -574,21 +580,28 @@ void __init sparse_init(void) if (!map_map) panic("can not allocate map_map\n"); alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, - (void *)map_map); + (void *)map_map, + sizeof(map_map[0])); #endif for_each_present_section_nr(0, pnum) { + struct mem_section *ms; + ms = __nr_to_section(pnum); usemap = usemap_map[pnum]; - if (!usemap) + if (!usemap) { + ms->section_mem_map = 0; continue; + } #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER map = map_map[pnum]; #else map = sparse_early_mem_map_alloc(pnum); #endif - if (!map) + if (!map) { + ms->section_mem_map = 0; continue; + } sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap); diff --git a/mm/swap_slots.c b/mm/swap_slots.c index f2641894f440..f51ac051c0c9 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -317,7 +317,7 @@ swp_entry_t get_swap_page(struct page *page) if (PageTransHuge(page)) { if (IS_ENABLED(CONFIG_THP_SWAP)) get_swap_pages(1, true, &entry); - return entry; + goto out; } /* @@ -347,10 +347,14 @@ repeat: } mutex_unlock(&cache->alloc_lock); if (entry.val) - return entry; + goto out; } get_swap_pages(1, false, &entry); - +out: + if (mem_cgroup_try_charge_swap(page, entry)) { + put_swap_page(page, entry); + entry.val = 0; + } return entry; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 07f9aa2340c3..c6b3eab73fde 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -216,9 +216,6 @@ int add_to_swap(struct page *page) if (!entry.val) return 0; - if (mem_cgroup_try_charge_swap(page, entry)) - goto fail; - /* * Radix-tree node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC @@ -337,8 +334,13 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { struct page *page; + struct swap_info_struct *si; + si = get_swap_device(entry); + if (!si) + return NULL; page = find_get_page(swap_address_space(entry), swp_offset(entry)); + put_swap_device(si); INC_CACHE_INFO(find_total); if (page) { @@ -381,8 +383,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) { - struct page *found_page, *new_page = NULL; - struct address_space *swapper_space = swap_address_space(entry); + struct page *found_page = NULL, *new_page = NULL; + struct swap_info_struct *si; int err; *new_page_allocated = false; @@ -392,7 +394,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ - found_page = find_get_page(swapper_space, swp_offset(entry)); + si = get_swap_device(entry); + if (!si) + break; + found_page = find_get_page(swap_address_space(entry), + swp_offset(entry)); + put_swap_device(si); if (found_page) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index 78a015fcec3b..744d66666f82 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -38,6 +38,7 @@ #include <linux/export.h> #include <linux/swap_slots.h> #include <linux/sort.h> +#include <linux/stop_machine.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -1107,6 +1108,64 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, return p; } +/* + * Check whether swap entry is valid in the swap device. If so, + * return pointer to swap_info_struct, and keep the swap entry valid + * via preventing the swap device from being swapoff, until + * put_swap_device() is called. Otherwise return NULL. + * + * Notice that swapoff or swapoff+swapon can still happen before the + * preempt_disable() in get_swap_device() or after the + * preempt_enable() in put_swap_device() if there isn't any other way + * to prevent swapoff, such as page lock, page table lock, etc. The + * caller must be prepared for that. For example, the following + * situation is possible. + * + * CPU1 CPU2 + * do_swap_page() + * ... swapoff+swapon + * __read_swap_cache_async() + * swapcache_prepare() + * __swap_duplicate() + * // check swap_map + * // verify PTE not changed + * + * In __swap_duplicate(), the swap_map need to be checked before + * changing partly because the specified swap entry may be for another + * swap device which has been swapoff. And in do_swap_page(), after + * the page is read from the swap device, the PTE is verified not + * changed with the page table locked to check whether the swap device + * has been swapoff or swapoff+swapon. + */ +struct swap_info_struct *get_swap_device(swp_entry_t entry) +{ + struct swap_info_struct *si; + unsigned long type, offset; + + if (!entry.val) + goto out; + type = swp_type(entry); + if (type >= nr_swapfiles) + goto bad_nofile; + si = swap_info[type]; + + preempt_disable(); + if (!(si->flags & SWP_VALID)) + goto unlock_out; + offset = swp_offset(entry); + if (offset >= si->max) + goto unlock_out; + + return si; +bad_nofile: + pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); +out: + return NULL; +unlock_out: + preempt_enable(); + return NULL; +} + static unsigned char __swap_entry_free(struct swap_info_struct *p, swp_entry_t entry, unsigned char usage) { @@ -1328,11 +1387,18 @@ int page_swapcount(struct page *page) return count; } -int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +int __swap_count(swp_entry_t entry) { + struct swap_info_struct *si; pgoff_t offset = swp_offset(entry); + int count = 0; - return swap_count(si->swap_map[offset]); + si = get_swap_device(entry); + if (si) { + count = swap_count(si->swap_map[offset]); + put_swap_device(si); + } + return count; } static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) @@ -1357,9 +1423,11 @@ int __swp_swapcount(swp_entry_t entry) int count = 0; struct swap_info_struct *si; - si = __swap_info_get(entry); - if (si) + si = get_swap_device(entry); + if (si) { count = swap_swapcount(si, entry); + put_swap_device(si); + } return count; } @@ -1800,8 +1868,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, - pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); mem_cgroup_commit_charge(page, memcg, true, false); @@ -1810,6 +1876,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } + set_pte_at(vma->vm_mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); swap_free(entry); /* * Move the page to the active list so it is not @@ -2451,9 +2519,9 @@ static int swap_node(struct swap_info_struct *p) return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; } -static void _enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map, - struct swap_cluster_info *cluster_info) +static void setup_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) { int i; @@ -2478,7 +2546,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, } p->swap_map = swap_map; p->cluster_info = cluster_info; - p->flags |= SWP_WRITEOK; +} + +static void _enable_swap_info(struct swap_info_struct *p) +{ + p->flags |= SWP_WRITEOK | SWP_VALID; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; @@ -2497,6 +2569,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, add_to_avail_list(p); } +static int swap_onoff_stop(void *arg) +{ + return 0; +} + static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, @@ -2505,7 +2582,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, frontswap_init(p->type, frontswap_map); spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, prio, swap_map, cluster_info); + setup_swap_info(p, prio, swap_map, cluster_info); + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + /* + * Guarantee swap_map, cluster_info, etc. fields are used + * between get/put_swap_device() only if SWP_VALID bit is set + */ + stop_machine(swap_onoff_stop, NULL, cpu_online_mask); + spin_lock(&swap_lock); + spin_lock(&p->lock); + _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -2514,7 +2601,8 @@ static void reinsert_swap_info(struct swap_info_struct *p) { spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); + setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); + _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -2617,6 +2705,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) reenable_swap_slots_cache_unlock(); + spin_lock(&swap_lock); + spin_lock(&p->lock); + p->flags &= ~SWP_VALID; /* mark swap device as invalid */ + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + /* + * wait for swap operations protected by get/put_swap_device() + * to complete + */ + stop_machine(swap_onoff_stop, NULL, cpu_online_mask); + flush_work(&p->discard_work); destroy_swap_extents(p); @@ -3365,22 +3464,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { struct swap_info_struct *p; struct swap_cluster_info *ci; - unsigned long offset, type; + unsigned long offset; unsigned char count; unsigned char has_cache; int err = -EINVAL; - if (non_swap_entry(entry)) + p = get_swap_device(entry); + if (!p) goto out; - type = swp_type(entry); - if (type >= nr_swapfiles) - goto bad_file; - p = swap_info[type]; offset = swp_offset(entry); - if (unlikely(offset >= p->max)) - goto out; - ci = lock_cluster_or_swap_info(p, offset); count = p->swap_map[offset]; @@ -3426,11 +3519,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unlock_out: unlock_cluster_or_swap_info(p, ci); out: + if (p) + put_swap_device(p); return err; - -bad_file: - pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); - goto out; } /* @@ -3522,6 +3613,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) struct page *list_page; pgoff_t offset; unsigned char count; + int ret = 0; /* * When debugging, it's easier to use __GFP_ZERO here; but it's better @@ -3529,15 +3621,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) */ page = alloc_page(gfp_mask | __GFP_HIGHMEM); - si = swap_info_get(entry); + si = get_swap_device(entry); if (!si) { /* * An acceptable race has occurred since the failing - * __swap_duplicate(): the swap entry has been freed, - * perhaps even the whole swap_map cleared for swapoff. + * __swap_duplicate(): the swap device may be swapoff */ goto outer; } + spin_lock(&si->lock); offset = swp_offset(entry); @@ -3555,9 +3647,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } if (!page) { - unlock_cluster(ci); - spin_unlock(&si->lock); - return -ENOMEM; + ret = -ENOMEM; + goto out; } /* @@ -3609,10 +3700,11 @@ out_unlock_cont: out: unlock_cluster(ci); spin_unlock(&si->lock); + put_swap_device(si); outer: if (page) __free_page(page); - return 0; + return ret; } /* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 39791b81ede7..5029f241908f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -404,7 +404,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage) + bool zeropage, + bool *mmap_changing) { struct vm_area_struct *dst_vma; ssize_t err; @@ -431,6 +432,15 @@ retry: down_read(&dst_mm->mmap_sem); /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + err = -EAGAIN; + if (mmap_changing && READ_ONCE(*mmap_changing)) + goto out_unlock; + + /* * Make sure the vma is not shared, that the dst range is * both valid and fully within a single existing vma. */ @@ -563,13 +573,15 @@ out: } ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long src_start, unsigned long len) + unsigned long src_start, unsigned long len, + bool *mmap_changing) { - return __mcopy_atomic(dst_mm, dst_start, src_start, len, false); + return __mcopy_atomic(dst_mm, dst_start, src_start, len, false, + mmap_changing); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, - unsigned long len) + unsigned long len, bool *mmap_changing) { - return __mcopy_atomic(dst_mm, start, 0, len, true); + return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 63a5f502da08..89efac3a020e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -603,26 +603,6 @@ static void unmap_vmap_area(struct vmap_area *va) vunmap_page_range(va->va_start, va->va_end); } -static void vmap_debug_free_range(unsigned long start, unsigned long end) -{ - /* - * Unmap page tables and force a TLB flush immediately if pagealloc - * debugging is enabled. This catches use after free bugs similarly to - * those in linear kernel virtual address space after a page has been - * freed. - * - * All the lazy freeing logic is still retained, in order to minimise - * intrusiveness of this debugging feature. - * - * This is going to be *slow* (linear kernel virtual address debugging - * doesn't do a broadcast TLB flush so it is a lot faster). - */ - if (debug_pagealloc_enabled()) { - vunmap_page_range(start, end); - flush_tlb_kernel_range(start, end); - } -} - /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. @@ -756,6 +736,9 @@ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); unmap_vmap_area(va); + if (debug_pagealloc_enabled()) + flush_tlb_kernel_range(va->va_start, va->va_end); + free_vmap_area_noflush(va); } @@ -1053,6 +1036,10 @@ static void vb_free(const void *addr, unsigned long size) vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + if (debug_pagealloc_enabled()) + flush_tlb_kernel_range((unsigned long)addr, + (unsigned long)addr + size); + spin_lock(&vb->lock); /* Expand dirty range */ @@ -1141,16 +1128,16 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(addr > VMALLOC_END); BUG_ON(!PAGE_ALIGNED(addr)); - debug_check_no_locks_freed(mem, size); - vmap_debug_free_range(addr, addr+size); - if (likely(count <= VMAP_MAX_ALLOC)) { + debug_check_no_locks_freed(mem, size); vb_free(mem, size); return; } va = find_vmap_area(addr); BUG_ON(!va); + debug_check_no_locks_freed((void *)va->va_start, + (va->va_end - va->va_start)); free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); @@ -1499,7 +1486,6 @@ struct vm_struct *remove_vm_area(const void *addr) va->flags |= VM_LAZY_FREE; spin_unlock(&vmap_area_lock); - vmap_debug_free_range(va->va_start, va->va_end); kasan_free_shadow(vm); free_unmap_vmap_area(va); @@ -1519,16 +1505,17 @@ static void __vunmap(const void *addr, int deallocate_pages) addr)) return; - area = remove_vm_area(addr); + area = find_vmap_area((unsigned long)addr)->vm; if (unlikely(!area)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); return; } - debug_check_no_locks_freed(addr, get_vm_area_size(area)); - debug_check_no_obj_freed(addr, get_vm_area_size(area)); + debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); + debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); + remove_vm_area(addr); if (deallocate_pages) { int i; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 85350ce2d25d..4854584ec436 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -342,26 +342,6 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) vmpressure(gfp, memcg, true, vmpressure_win, 0); } -static enum vmpressure_levels str_to_level(const char *arg) -{ - enum vmpressure_levels level; - - for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) - if (!strcmp(vmpressure_str_levels[level], arg)) - return level; - return -1; -} - -static enum vmpressure_modes str_to_mode(const char *arg) -{ - enum vmpressure_modes mode; - - for (mode = 0; mode < VMPRESSURE_NUM_MODES; mode++) - if (!strcmp(vmpressure_str_modes[mode], arg)) - return mode; - return -1; -} - #define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) /** @@ -390,27 +370,26 @@ int vmpressure_register_event(struct mem_cgroup *memcg, char *token; int ret = 0; - spec_orig = spec = kzalloc(MAX_VMPRESSURE_ARGS_LEN + 1, GFP_KERNEL); + spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL); if (!spec) { ret = -ENOMEM; goto out; } - strncpy(spec, args, MAX_VMPRESSURE_ARGS_LEN); /* Find required level */ token = strsep(&spec, ","); - level = str_to_level(token); - if (level == -1) { - ret = -EINVAL; + level = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token); + if (level < 0) { + ret = level; goto out; } /* Find optional mode */ token = strsep(&spec, ","); if (token) { - mode = str_to_mode(token); - if (mode == -1) { - ret = -EINVAL; + mode = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token); + if (mode < 0) { + ret = mode; goto out; } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9b697323a88c..50055d72f294 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2544,12 +2544,28 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long reclaimed; unsigned long scanned; - if (mem_cgroup_low(root, memcg)) { + switch (mem_cgroup_protected(root, memcg)) { + case MEMCG_PROT_MIN: + /* + * Hard protection. + * If there is no reclaimable memory, OOM. + */ + continue; + case MEMCG_PROT_LOW: + /* + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. + */ if (!sc->memcg_low_reclaim) { sc->memcg_low_skipped = 1; continue; } memcg_memory_event(memcg, MEMCG_LOW); + break; + case MEMCG_PROT_NONE: + break; } reclaimed = sc->nr_reclaimed; @@ -3318,11 +3334,15 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) .may_unmap = 1, .may_swap = 1, }; + + __fs_reclaim_acquire(); + count_vm_event(PAGEOUTRUN); do { unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; + bool ret; sc.reclaim_idx = classzone_idx; @@ -3395,7 +3415,10 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ - if (try_to_freeze() || kthread_should_stop()) + __fs_reclaim_release(); + ret = try_to_freeze(); + __fs_reclaim_acquire(); + if (ret || kthread_should_stop()) break; /* @@ -3412,6 +3435,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) out: snapshot_refaults(NULL, pgdat); + __fs_reclaim_release(); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller @@ -3600,9 +3624,7 @@ kswapd_try_sleep: */ trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, alloc_order); - fs_reclaim_acquire(GFP_KERNEL); reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); - fs_reclaim_release(GFP_KERNEL); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } @@ -3684,16 +3706,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) unsigned long nr_reclaimed; unsigned int noreclaim_flag; - noreclaim_flag = memalloc_noreclaim_save(); fs_reclaim_acquire(sc.gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; nr_reclaimed = do_try_to_free_pages(zonelist, &sc); p->reclaim_state = NULL; - fs_reclaim_release(sc.gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(sc.gfp_mask); return nr_reclaimed; } @@ -3870,6 +3892,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in }; cond_resched(); + fs_reclaim_acquire(sc.gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP * and we also need to be able to write out pages for RECLAIM_WRITE @@ -3877,7 +3900,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - fs_reclaim_acquire(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3892,9 +3914,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } p->reclaim_state = NULL; - fs_reclaim_release(gfp_mask); current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(sc.gfp_mask); return sc.nr_reclaimed >= nr_pages; } |