summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig.debug10
-rw-r--r--mm/Makefile2
-rw-r--r--mm/cma.c25
-rw-r--r--mm/debug-pagealloc.c45
-rw-r--r--mm/fadvise.c6
-rw-r--r--mm/filemap.c10
-rw-r--r--mm/filemap_xip.c23
-rw-r--r--mm/fremap.c4
-rw-r--r--mm/hugetlb.c26
-rw-r--r--mm/memblock.c43
-rw-r--r--mm/memcontrol.c180
-rw-r--r--mm/memory-failure.c15
-rw-r--r--mm/memory.c9
-rw-r--r--mm/migrate.c28
-rw-r--r--mm/mincore.c7
-rw-r--r--mm/mmap.c24
-rw-r--r--mm/mremap.c6
-rw-r--r--mm/nommu.c50
-rw-r--r--mm/oom_kill.c15
-rw-r--r--mm/page_alloc.c137
-rw-r--r--mm/page_ext.c403
-rw-r--r--mm/page_owner.c311
-rw-r--r--mm/rmap.c18
-rw-r--r--mm/slab.c4
-rw-r--r--mm/slub.c17
-rw-r--r--mm/vmacache.c2
-rw-r--r--mm/vmalloc.c4
-rw-r--r--mm/vmscan.c216
-rw-r--r--mm/vmstat.c102
-rw-r--r--mm/zbud.c2
-rw-r--r--mm/zsmalloc.c180
-rw-r--r--mm/zswap.c9
32 files changed, 1409 insertions, 524 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 4b2443254de2..56badfc4810a 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,8 +1,18 @@
+config PAGE_EXTENSION
+ bool "Extend memmap on extra space for more information on page"
+ ---help---
+ Extend memmap on extra space for more information on page. This
+ could be used for debugging features that need to insert extra
+ field for every page. This extension enables us to save memory
+ by not allocating this extra memory according to boottime
+ configuration.
+
config DEBUG_PAGEALLOC
bool "Debug page memory allocations"
depends on DEBUG_KERNEL
depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
depends on !KMEMCHECK
+ select PAGE_EXTENSION
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
diff --git a/mm/Makefile b/mm/Makefile
index b3c6ce932c64..4bf586e66378 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
+obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL) += zpool.o
@@ -71,3 +72,4 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
+obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
diff --git a/mm/cma.c b/mm/cma.c
index 8e9ec13d31db..f8917629cbdd 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -33,6 +33,7 @@
#include <linux/log2.h>
#include <linux/cma.h>
#include <linux/highmem.h>
+#include <linux/io.h>
struct cma {
unsigned long base_pfn;
@@ -63,6 +64,17 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
return (1UL << (align_order - cma->order_per_bit)) - 1;
}
+static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
+{
+ unsigned int alignment;
+
+ if (align_order <= cma->order_per_bit)
+ return 0;
+ alignment = 1UL << (align_order - cma->order_per_bit);
+ return ALIGN(cma->base_pfn, alignment) -
+ (cma->base_pfn >> cma->order_per_bit);
+}
+
static unsigned long cma_bitmap_maxno(struct cma *cma)
{
return cma->count >> cma->order_per_bit;
@@ -313,6 +325,11 @@ int __init cma_declare_contiguous(phys_addr_t base,
}
}
+ /*
+ * kmemleak scans/reads tracked objects for pointers to other
+ * objects but this address isn't mapped and accessible
+ */
+ kmemleak_ignore(phys_to_virt(addr));
base = addr;
}
@@ -340,7 +357,7 @@ err:
*/
struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
{
- unsigned long mask, pfn, start = 0;
+ unsigned long mask, offset, pfn, start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
struct page *page = NULL;
int ret;
@@ -355,13 +372,15 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
return NULL;
mask = cma_bitmap_aligned_mask(cma, align);
+ offset = cma_bitmap_aligned_offset(cma, align);
bitmap_maxno = cma_bitmap_maxno(cma);
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
for (;;) {
mutex_lock(&cma->lock);
- bitmap_no = bitmap_find_next_zero_area(cma->bitmap,
- bitmap_maxno, start, bitmap_count, mask);
+ bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
+ bitmap_maxno, start, bitmap_count, mask,
+ offset);
if (bitmap_no >= bitmap_maxno) {
mutex_unlock(&cma->lock);
break;
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
index 789ff70c8a4a..5bf5906ce13b 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/debug-pagealloc.c
@@ -2,23 +2,55 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
-#include <linux/page-debug-flags.h>
+#include <linux/page_ext.h>
#include <linux/poison.h>
#include <linux/ratelimit.h>
+static bool page_poisoning_enabled __read_mostly;
+
+static bool need_page_poisoning(void)
+{
+ if (!debug_pagealloc_enabled())
+ return false;
+
+ return true;
+}
+
+static void init_page_poisoning(void)
+{
+ if (!debug_pagealloc_enabled())
+ return;
+
+ page_poisoning_enabled = true;
+}
+
+struct page_ext_operations page_poisoning_ops = {
+ .need = need_page_poisoning,
+ .init = init_page_poisoning,
+};
+
static inline void set_page_poison(struct page *page)
{
- __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+ struct page_ext *page_ext;
+
+ page_ext = lookup_page_ext(page);
+ __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
}
static inline void clear_page_poison(struct page *page)
{
- __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+ struct page_ext *page_ext;
+
+ page_ext = lookup_page_ext(page);
+ __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
}
static inline bool page_poison(struct page *page)
{
- return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+ struct page_ext *page_ext;
+
+ page_ext = lookup_page_ext(page);
+ return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
}
static void poison_page(struct page *page)
@@ -93,8 +125,11 @@ static void unpoison_pages(struct page *page, int n)
unpoison_page(page + i);
}
-void kernel_map_pages(struct page *page, int numpages, int enable)
+void __kernel_map_pages(struct page *page, int numpages, int enable)
{
+ if (!page_poisoning_enabled)
+ return;
+
if (enable)
unpoison_pages(page, numpages);
else
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 3bcfd81db45e..2ad7adf4f0a4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -117,7 +117,11 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
__filemap_fdatawrite_range(mapping, offset, endbyte,
WB_SYNC_NONE);
- /* First and last FULL page! */
+ /*
+ * First and last FULL page! Partial pages are deliberately
+ * preserved on the expectation that it is better to preserve
+ * needed memory than to discard unneeded memory.
+ */
start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
end_index = (endbyte >> PAGE_CACHE_SHIFT);
diff --git a/mm/filemap.c b/mm/filemap.c
index 14b4642279f1..e8905bc3cbd7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -62,16 +62,16 @@
/*
* Lock ordering:
*
- * ->i_mmap_mutex (truncate_pagecache)
+ * ->i_mmap_rwsem (truncate_pagecache)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_lock (exclusive_swap_page, others)
* ->mapping->tree_lock
*
* ->i_mutex
- * ->i_mmap_mutex (truncate->unmap_mapping_range)
+ * ->i_mmap_rwsem (truncate->unmap_mapping_range)
*
* ->mmap_sem
- * ->i_mmap_mutex
+ * ->i_mmap_rwsem
* ->page_table_lock or pte_lock (various, mainly in memory.c)
* ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
*
@@ -85,7 +85,7 @@
* sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode)
*
- * ->i_mmap_mutex
+ * ->i_mmap_rwsem
* ->anon_vma.lock (vma_adjust)
*
* ->anon_vma.lock
@@ -105,7 +105,7 @@
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
- * ->i_mmap_mutex
+ * ->i_mmap_rwsem
* ->tasklist_lock (memory_failure, collect_procs_ao)
*/
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index d8d9fe3f685c..0d105aeff82f 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -155,22 +155,14 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
EXPORT_SYMBOL_GPL(xip_file_read);
/*
- * __xip_unmap is invoked from xip_unmap and
- * xip_write
+ * __xip_unmap is invoked from xip_unmap and xip_write
*
* This function walks all vmas of the address_space and unmaps the
* __xip_sparse_page when found at pgoff.
*/
-static void
-__xip_unmap (struct address_space * mapping,
- unsigned long pgoff)
+static void __xip_unmap(struct address_space * mapping, unsigned long pgoff)
{
struct vm_area_struct *vma;
- struct mm_struct *mm;
- unsigned long address;
- pte_t *pte;
- pte_t pteval;
- spinlock_t *ptl;
struct page *page;
unsigned count;
int locked = 0;
@@ -182,11 +174,14 @@ __xip_unmap (struct address_space * mapping,
return;
retry:
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- mm = vma->vm_mm;
- address = vma->vm_start +
+ pte_t *pte, pteval;
+ spinlock_t *ptl;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
pte = page_check_address(page, mm, address, &ptl, 1);
if (pte) {
@@ -202,7 +197,7 @@ retry:
page_cache_release(page);
}
}
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_read(mapping);
if (locked) {
mutex_unlock(&xip_sparse_mutex);
diff --git a/mm/fremap.c b/mm/fremap.c
index 72b8fa361433..11ef7ec40d13 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -238,13 +238,13 @@ get_write_lock:
}
goto out_freed;
}
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
flush_dcache_mmap_lock(mapping);
vma->vm_flags |= VM_NONLINEAR;
vma_interval_tree_remove(vma, &mapping->i_mmap);
vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
flush_dcache_mmap_unlock(mapping);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
if (vma->vm_flags & VM_LOCKED) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 919b86a2164d..47f6070d7c46 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1457,7 +1457,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
return 0;
found:
- BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
+ BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
/* Put them into a private list first because mem_map is not up yet */
list_add(&m->list, &huge_boot_pages);
m->hstate = h;
@@ -2083,7 +2083,7 @@ static void hugetlb_register_node(struct node *node)
* devices of nodes that have memory. All on-line nodes should have
* registered their associated device by this time.
*/
-static void hugetlb_register_all_nodes(void)
+static void __init hugetlb_register_all_nodes(void)
{
int nid;
@@ -2726,9 +2726,9 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
* on its way out. We're lucky that the flag has such an appropriate
* name, and can in fact be safely cleared here. We could clear it
* before the __unmap_hugepage_range above, but all that's necessary
- * is to clear it before releasing the i_mmap_mutex. This works
+ * is to clear it before releasing the i_mmap_rwsem. This works
* because in the context this is called, the VMA is about to be
- * destroyed and the i_mmap_mutex is held.
+ * destroyed and the i_mmap_rwsem is held.
*/
vma->vm_flags &= ~VM_MAYSHARE;
}
@@ -2774,7 +2774,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* this mapping should be shared between all the VMAs,
* __unmap_hugepage_range() is called as the lock is already held
*/
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
/* Do not unmap the current VMA */
if (iter_vma == vma)
@@ -2791,7 +2791,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
unmap_hugepage_range(iter_vma, address,
address + huge_page_size(h), page);
}
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
/*
@@ -3348,7 +3348,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
flush_cache_range(vma, address, end);
mmu_notifier_invalidate_range_start(mm, start, end);
- mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
for (; address < end; address += huge_page_size(h)) {
spinlock_t *ptl;
ptep = huge_pte_offset(mm, address);
@@ -3370,13 +3370,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
spin_unlock(ptl);
}
/*
- * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+ * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
* may have cleared our pud entry and done put_page on the page table:
- * once we release i_mmap_mutex, another task can do the final put_page
+ * once we release i_mmap_rwsem, another task can do the final put_page
* and that page table be reused and filled with junk.
*/
flush_tlb_range(vma, start, end);
- mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
mmu_notifier_invalidate_range_end(mm, start, end);
return pages << h->order;
@@ -3525,7 +3525,7 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
* code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
* racing tasks could either miss the sharing (see huge_pte_offset) or select a
* bad pmd for sharing.
*/
@@ -3544,7 +3544,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -3572,7 +3572,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
return pte;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 6ecb0d937fb5..252b77bdf65e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -715,16 +715,13 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
}
/**
- * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
- * @base: the base phys addr of the region
- * @size: the size of the region
*
- * This function isolates region [@base, @base + @size), and mark it with flag
- * MEMBLOCK_HOTPLUG.
+ * This function isolates region [@base, @base + @size), and sets/clears flag
*
* Return 0 on succees, -errno on failure.
*/
-int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+static int __init_memblock memblock_setclr_flag(phys_addr_t base,
+ phys_addr_t size, int set, int flag)
{
struct memblock_type *type = &memblock.memory;
int i, ret, start_rgn, end_rgn;
@@ -734,37 +731,37 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
return ret;
for (i = start_rgn; i < end_rgn; i++)
- memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG);
+ if (set)
+ memblock_set_region_flags(&type->regions[i], flag);
+ else
+ memblock_clear_region_flags(&type->regions[i], flag);
memblock_merge_regions(type);
return 0;
}
/**
- * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
* @base: the base phys addr of the region
* @size: the size of the region
*
- * This function isolates region [@base, @base + @size), and clear flag
- * MEMBLOCK_HOTPLUG for the isolated regions.
+ * Return 0 on succees, -errno on failure.
+ */
+int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG);
+}
+
+/**
+ * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
*
* Return 0 on succees, -errno on failure.
*/
int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
{
- struct memblock_type *type = &memblock.memory;
- int i, ret, start_rgn, end_rgn;
-
- ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
- if (ret)
- return ret;
-
- for (i = start_rgn; i < end_rgn; i++)
- memblock_clear_region_flags(&type->regions[i],
- MEMBLOCK_HOTPLUG);
-
- memblock_merge_regions(type);
- return 0;
+ return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG);
}
/**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 85df503ec023..ef91e856c7e4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -296,7 +296,6 @@ struct mem_cgroup {
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
- unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
bool oom_lock;
atomic_t under_oom;
@@ -366,22 +365,11 @@ struct mem_cgroup {
/* WARNING: nodeinfo must be the last member here */
};
-/* internal only representation about the status of kmem accounting. */
-enum {
- KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
-};
-
#ifdef CONFIG_MEMCG_KMEM
-static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
-{
- set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
-}
-
static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
- return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
+ return memcg->kmemcg_id >= 0;
}
-
#endif
/* Stuffs for move charges at task migration. */
@@ -1571,7 +1559,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
*/
- if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
+ if (fatal_signal_pending(current) || task_will_free_mem(current)) {
set_thread_flag(TIF_MEMDIE);
return;
}
@@ -1628,6 +1616,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
NULL, "Memory cgroup out of memory");
}
+#if MAX_NUMNODES > 1
+
/**
* test_mem_cgroup_node_reclaimable
* @memcg: the target memcg
@@ -1650,7 +1640,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
return false;
}
-#if MAX_NUMNODES > 1
/*
* Always updating the nodemask is not very good - even if we have an empty
@@ -2646,7 +2635,6 @@ static void memcg_register_cache(struct mem_cgroup *memcg,
if (!cachep)
return;
- css_get(&memcg->css);
list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
/*
@@ -2680,40 +2668,6 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
list_del(&cachep->memcg_params->list);
kmem_cache_destroy(cachep);
-
- /* drop the reference taken in memcg_register_cache */
- css_put(&memcg->css);
-}
-
-/*
- * During the creation a new cache, we need to disable our accounting mechanism
- * altogether. This is true even if we are not creating, but rather just
- * enqueing new caches to be created.
- *
- * This is because that process will trigger allocations; some visible, like
- * explicit kmallocs to auxiliary data structures, name strings and internal
- * cache structures; some well concealed, like INIT_WORK() that can allocate
- * objects during debug.
- *
- * If any allocation happens during memcg_kmem_get_cache, we will recurse back
- * to it. This may not be a bounded recursion: since the first cache creation
- * failed to complete (waiting on the allocation), we'll just try to create the
- * cache again, failing at the same point.
- *
- * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
- * memcg_kmem_skip_account. So we enclose anything that might allocate memory
- * inside the following two functions.
- */
-static inline void memcg_stop_kmem_account(void)
-{
- VM_BUG_ON(!current->mm);
- current->memcg_kmem_skip_account++;
-}
-
-static inline void memcg_resume_kmem_account(void)
-{
- VM_BUG_ON(!current->mm);
- current->memcg_kmem_skip_account--;
}
int __memcg_cleanup_cache_params(struct kmem_cache *s)
@@ -2747,9 +2701,7 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
mutex_lock(&memcg_slab_mutex);
list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
cachep = memcg_params_to_cache(params);
- kmem_cache_shrink(cachep);
- if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
- memcg_unregister_cache(cachep);
+ memcg_unregister_cache(cachep);
}
mutex_unlock(&memcg_slab_mutex);
}
@@ -2784,10 +2736,10 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
struct memcg_register_cache_work *cw;
cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
- if (cw == NULL) {
- css_put(&memcg->css);
+ if (!cw)
return;
- }
+
+ css_get(&memcg->css);
cw->memcg = memcg;
cw->cachep = cachep;
@@ -2810,20 +2762,16 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
* this point we can't allow ourselves back into memcg_kmem_get_cache,
* the safest choice is to do it like this, wrapping the whole function.
*/
- memcg_stop_kmem_account();
+ current->memcg_kmem_skip_account = 1;
__memcg_schedule_register_cache(memcg, cachep);
- memcg_resume_kmem_account();
+ current->memcg_kmem_skip_account = 0;
}
int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
{
unsigned int nr_pages = 1 << order;
- int res;
- res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
- if (!res)
- atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
- return res;
+ return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
}
void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
@@ -2831,7 +2779,6 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
unsigned int nr_pages = 1 << order;
memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
- atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
}
/*
@@ -2847,8 +2794,7 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
* Can't be called in interrupt context or from kernel threads.
* This function needs to be called with rcu_read_lock() held.
*/
-struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
- gfp_t gfp)
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
@@ -2856,25 +2802,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
- if (!current->mm || current->memcg_kmem_skip_account)
+ if (current->memcg_kmem_skip_account)
return cachep;
- rcu_read_lock();
- memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
-
+ memcg = get_mem_cgroup_from_mm(current->mm);
if (!memcg_kmem_is_active(memcg))
goto out;
memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
- if (likely(memcg_cachep)) {
- cachep = memcg_cachep;
- goto out;
- }
-
- /* The corresponding put will be done in the workqueue. */
- if (!css_tryget_online(&memcg->css))
- goto out;
- rcu_read_unlock();
+ if (likely(memcg_cachep))
+ return memcg_cachep;
/*
* If we are in a safe context (can wait, and not in interrupt
@@ -2889,12 +2826,17 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
* defer everything.
*/
memcg_schedule_register_cache(memcg, cachep);
- return cachep;
out:
- rcu_read_unlock();
+ css_put(&memcg->css);
return cachep;
}
+void __memcg_kmem_put_cache(struct kmem_cache *cachep)
+{
+ if (!is_root_cache(cachep))
+ css_put(&cachep->memcg_params->memcg->css);
+}
+
/*
* We need to verify if the allocation against current->mm->owner's memcg is
* possible for the given order. But the page is not allocated yet, so we'll
@@ -2917,34 +2859,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
*_memcg = NULL;
- /*
- * Disabling accounting is only relevant for some specific memcg
- * internal allocations. Therefore we would initially not have such
- * check here, since direct calls to the page allocator that are
- * accounted to kmemcg (alloc_kmem_pages and friends) only happen
- * outside memcg core. We are mostly concerned with cache allocations,
- * and by having this test at memcg_kmem_get_cache, we are already able
- * to relay the allocation to the root cache and bypass the memcg cache
- * altogether.
- *
- * There is one exception, though: the SLUB allocator does not create
- * large order caches, but rather service large kmallocs directly from
- * the page allocator. Therefore, the following sequence when backed by
- * the SLUB allocator:
- *
- * memcg_stop_kmem_account();
- * kmalloc(<large_number>)
- * memcg_resume_kmem_account();
- *
- * would effectively ignore the fact that we should skip accounting,
- * since it will drive us directly to this function without passing
- * through the cache selector memcg_kmem_get_cache. Such large
- * allocations are extremely rare but can happen, for instance, for the
- * cache arrays. We bring this test here.
- */
- if (!current->mm || current->memcg_kmem_skip_account)
- return true;
-
memcg = get_mem_cgroup_from_mm(current->mm);
if (!memcg_kmem_is_active(memcg)) {
@@ -2985,10 +2899,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
memcg_uncharge_kmem(memcg, 1 << order);
page->mem_cgroup = NULL;
}
-#else
-static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
-{
-}
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -3539,12 +3449,6 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
return 0;
/*
- * We are going to allocate memory for data shared by all memory
- * cgroups so let's stop accounting here.
- */
- memcg_stop_kmem_account();
-
- /*
* For simplicity, we won't allow this to be disabled. It also can't
* be changed if the cgroup has children already, or if tasks had
* already joined.
@@ -3570,25 +3474,22 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
goto out;
}
- memcg->kmemcg_id = memcg_id;
- INIT_LIST_HEAD(&memcg->memcg_slab_caches);
-
/*
- * We couldn't have accounted to this cgroup, because it hasn't got the
- * active bit set yet, so this should succeed.
+ * We couldn't have accounted to this cgroup, because it hasn't got
+ * activated yet, so this should succeed.
*/
err = page_counter_limit(&memcg->kmem, nr_pages);
VM_BUG_ON(err);
static_key_slow_inc(&memcg_kmem_enabled_key);
/*
- * Setting the active bit after enabling static branching will
+ * A memory cgroup is considered kmem-active as soon as it gets
+ * kmemcg_id. Setting the id after enabling static branching will
* guarantee no one starts accounting before all call sites are
* patched.
*/
- memcg_kmem_set_active(memcg);
+ memcg->kmemcg_id = memcg_id;
out:
- memcg_resume_kmem_account();
return err;
}
@@ -3791,11 +3692,6 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
}
#endif /* CONFIG_NUMA */
-static inline void mem_cgroup_lru_names_not_uptodate(void)
-{
- BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
-}
-
static int memcg_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -3803,6 +3699,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
struct mem_cgroup *mi;
unsigned int i;
+ BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
+
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
continue;
@@ -4259,7 +4157,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
int ret;
- memcg->kmemcg_id = -1;
ret = memcg_propagate_kmem(memcg);
if (ret)
return ret;
@@ -4269,6 +4166,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
+ memcg_unregister_all_caches(memcg);
mem_cgroup_sockets_destroy(memcg);
}
#else
@@ -4724,17 +4622,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
free_percpu(memcg->stat);
- /*
- * We need to make sure that (at least for now), the jump label
- * destruction code runs outside of the cgroup lock. This is because
- * get_online_cpus(), which is called from the static_branch update,
- * can't be called inside the cgroup_lock. cpusets are the ones
- * enforcing this dependency, so if they ever change, we might as well.
- *
- * schedule_work() will guarantee this happens. Be careful if you need
- * to move this code around, and make sure it is outside
- * the cgroup_lock.
- */
disarm_static_keys(memcg);
kfree(memcg);
}
@@ -4804,6 +4691,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
vmpressure_init(&memcg->vmpressure);
INIT_LIST_HEAD(&memcg->event_list);
spin_lock_init(&memcg->event_list_lock);
+#ifdef CONFIG_MEMCG_KMEM
+ memcg->kmemcg_id = -1;
+ INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+#endif
return &memcg->css;
@@ -4885,7 +4776,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);
- memcg_unregister_all_caches(memcg);
vmpressure_cleanup(&memcg->vmpressure);
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e5ee0ca7ae85..feb803bf3443 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -239,19 +239,14 @@ void shake_page(struct page *p, int access)
}
/*
- * Only call shrink_slab here (which would also shrink other caches) if
- * access is not potentially fatal.
+ * Only call shrink_node_slabs here (which would also shrink
+ * other caches) if access is not potentially fatal.
*/
if (access) {
int nr;
int nid = page_to_nid(p);
do {
- struct shrink_control shrink = {
- .gfp_mask = GFP_KERNEL,
- };
- node_set(nid, shrink.nodes_to_scan);
-
- nr = shrink_slab(&shrink, 1000, 1000);
+ nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
if (page_count(p) == 1)
break;
} while (nr > 10);
@@ -466,7 +461,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
struct task_struct *tsk;
struct address_space *mapping = page->mapping;
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_read(mapping);
read_lock(&tasklist_lock);
for_each_process(tsk) {
pgoff_t pgoff = page_to_pgoff(page);
@@ -488,7 +483,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
}
}
read_unlock(&tasklist_lock);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_read(mapping);
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index 4b5a282e1107..fbf74112de5b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1326,9 +1326,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* safe to do nothing in this case.
*/
if (vma->vm_file) {
- mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
- mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
}
} else
unmap_page_range(tlb, vma, start, end, details);
@@ -2377,12 +2377,12 @@ void unmap_mapping_range(struct address_space *mapping,
details.last_index = ULONG_MAX;
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_read(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_read(mapping);
}
EXPORT_SYMBOL(unmap_mapping_range);
@@ -3365,6 +3365,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return ret;
}
+EXPORT_SYMBOL_GPL(handle_mm_fault);
#ifndef __PAGETABLE_PUD_FOLDED
/*
diff --git a/mm/migrate.c b/mm/migrate.c
index 01439953abf5..253474c22239 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -746,7 +746,7 @@ static int fallback_migrate_page(struct address_space *mapping,
* MIGRATEPAGE_SUCCESS - success
*/
static int move_to_new_page(struct page *newpage, struct page *page,
- int remap_swapcache, enum migrate_mode mode)
+ int page_was_mapped, enum migrate_mode mode)
{
struct address_space *mapping;
int rc;
@@ -784,7 +784,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
newpage->mapping = NULL;
} else {
mem_cgroup_migrate(page, newpage, false);
- if (remap_swapcache)
+ if (page_was_mapped)
remove_migration_ptes(page, newpage);
page->mapping = NULL;
}
@@ -798,7 +798,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
int force, enum migrate_mode mode)
{
int rc = -EAGAIN;
- int remap_swapcache = 1;
+ int page_was_mapped = 0;
struct anon_vma *anon_vma = NULL;
if (!trylock_page(page)) {
@@ -870,7 +870,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* migrated but are not remapped when migration
* completes
*/
- remap_swapcache = 0;
} else {
goto out_unlock;
}
@@ -910,13 +909,17 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
}
/* Establish migration ptes or remove ptes */
- try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ if (page_mapped(page)) {
+ try_to_unmap(page,
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ page_was_mapped = 1;
+ }
skip_unmap:
if (!page_mapped(page))
- rc = move_to_new_page(newpage, page, remap_swapcache, mode);
+ rc = move_to_new_page(newpage, page, page_was_mapped, mode);
- if (rc && remap_swapcache)
+ if (rc && page_was_mapped)
remove_migration_ptes(page, page);
/* Drop an anon_vma reference if we took one */
@@ -1017,6 +1020,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
{
int rc = 0;
int *result = NULL;
+ int page_was_mapped = 0;
struct page *new_hpage;
struct anon_vma *anon_vma = NULL;
@@ -1047,12 +1051,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (PageAnon(hpage))
anon_vma = page_get_anon_vma(hpage);
- try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ if (page_mapped(hpage)) {
+ try_to_unmap(hpage,
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ page_was_mapped = 1;
+ }
if (!page_mapped(hpage))
- rc = move_to_new_page(new_hpage, hpage, 1, mode);
+ rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode);
- if (rc != MIGRATEPAGE_SUCCESS)
+ if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped)
remove_migration_ptes(hpage, hpage);
if (anon_vma)
diff --git a/mm/mincore.c b/mm/mincore.c
index 725c80961048..c8c528b36641 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -137,8 +137,11 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
} else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte);
- if (is_migration_entry(entry)) {
- /* migration entries are always uptodate */
+ if (non_swap_entry(entry)) {
+ /*
+ * migration or hwpoison entries are always
+ * uptodate
+ */
*vec = 1;
} else {
#ifdef CONFIG_SWAP
diff --git a/mm/mmap.c b/mm/mmap.c
index b6c0a77fc1c8..7b36aa7cc89a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -232,7 +232,7 @@ error:
}
/*
- * Requires inode->i_mapping->i_mmap_mutex
+ * Requires inode->i_mapping->i_mmap_rwsem
*/
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
@@ -260,9 +260,9 @@ void unlink_file_vma(struct vm_area_struct *vma)
if (file) {
struct address_space *mapping = file->f_mapping;
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
__remove_shared_vm_struct(vma, file, mapping);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
}
@@ -674,14 +674,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
}
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
if (mapping)
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
mm->map_count++;
validate_mm(mm);
@@ -796,7 +796,7 @@ again: remove_next = 1 + (end > next->vm_end);
next->vm_end);
}
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
if (insert) {
/*
* Put into interval tree now, so instantiated pages
@@ -883,7 +883,7 @@ again: remove_next = 1 + (end > next->vm_end);
anon_vma_unlock_write(anon_vma);
}
if (mapping)
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
if (root) {
uprobe_mmap(vma);
@@ -2362,6 +2362,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
}
#endif
+EXPORT_SYMBOL_GPL(find_extend_vma);
+
/*
* Ok - we have the memory areas we should free on the vma list,
* so release them, and do the vma updates.
@@ -2791,7 +2793,7 @@ void exit_mmap(struct mm_struct *mm)
/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap tree. If vm_file is non-NULL
- * then i_mmap_mutex is taken here.
+ * then i_mmap_rwsem is taken here.
*/
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
@@ -3086,7 +3088,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
*/
if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
BUG();
- mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
+ down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
}
}
@@ -3113,7 +3115,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* vma in this mm is backed by the same anon_vma or address_space.
*
* We can take all the locks in random order because the VM code
- * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
+ * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never
* takes more than one of them in a row. Secondly we're protected
* against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
*
@@ -3182,7 +3184,7 @@ static void vm_unlock_mapping(struct address_space *mapping)
* AS_MM_ALL_LOCKS can't change to 0 from under us
* because we hold the mm_all_locks_mutex.
*/
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
&mapping->flags))
BUG();
diff --git a/mm/mremap.c b/mm/mremap.c
index b147f66f4c40..84aa36f9f308 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -99,7 +99,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
spinlock_t *old_ptl, *new_ptl;
/*
- * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+ * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
* locks to ensure that rmap will always observe either the old or the
* new ptes. This is the easiest way to avoid races with
* truncate_pagecache(), page migration, etc...
@@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
if (need_rmap_locks) {
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
}
if (vma->anon_vma) {
anon_vma = vma->anon_vma;
@@ -156,7 +156,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
if (anon_vma)
anon_vma_unlock_write(anon_vma);
if (mapping)
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
#define LATENCY_LIMIT (64 * PAGE_SIZE)
diff --git a/mm/nommu.c b/mm/nommu.c
index bd1808e194a7..b51eadf6d952 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -722,11 +722,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
flush_dcache_mmap_lock(mapping);
vma_interval_tree_insert(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
/* add the VMA to the tree */
@@ -795,11 +795,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
flush_dcache_mmap_lock(mapping);
vma_interval_tree_remove(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
/* remove from the MM's tree and list */
@@ -1149,8 +1149,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
unsigned long len,
unsigned long capabilities)
{
- struct page *pages;
- unsigned long total, point, n;
+ unsigned long total, point;
void *base;
int ret, order;
@@ -1182,33 +1181,23 @@ static int do_mmap_private(struct vm_area_struct *vma,
order = get_order(len);
kdebug("alloc order %d for %lx", order, len);
- pages = alloc_pages(GFP_KERNEL, order);
- if (!pages)
- goto enomem;
-
total = 1 << order;
- atomic_long_add(total, &mmap_pages_allocated);
-
point = len >> PAGE_SHIFT;
- /* we allocated a power-of-2 sized page set, so we may want to trim off
- * the excess */
+ /* we don't want to allocate a power-of-2 sized page set */
if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
- while (total > point) {
- order = ilog2(total - point);
- n = 1 << order;
- kdebug("shave %lu/%lu @%lu", n, total - point, total);
- atomic_long_sub(n, &mmap_pages_allocated);
- total -= n;
- set_page_refcounted(pages + total);
- __free_pages(pages + total, order);
- }
+ total = point;
+ kdebug("try to alloc exact %lu pages", total);
+ base = alloc_pages_exact(len, GFP_KERNEL);
+ } else {
+ base = (void *)__get_free_pages(GFP_KERNEL, order);
}
- for (point = 1; point < total; point++)
- set_page_refcounted(&pages[point]);
+ if (!base)
+ goto enomem;
+
+ atomic_long_add(total, &mmap_pages_allocated);
- base = page_address(pages);
region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
region->vm_start = (unsigned long) base;
region->vm_end = region->vm_start + len;
@@ -2094,14 +2083,14 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
down_write(&nommu_region_sem);
- mutex_lock(&inode->i_mapping->i_mmap_mutex);
+ i_mmap_lock_read(inode->i_mapping);
/* search for VMAs that fall within the dead zone */
vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
/* found one - only interested if it's shared out of the page
* cache */
if (vma->vm_flags & VM_SHARED) {
- mutex_unlock(&inode->i_mapping->i_mmap_mutex);
+ i_mmap_unlock_read(inode->i_mapping);
up_write(&nommu_region_sem);
return -ETXTBSY; /* not quite true, but near enough */
}
@@ -2113,8 +2102,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
* we don't check for any regions that start beyond the EOF as there
* shouldn't be any
*/
- vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap,
- 0, ULONG_MAX) {
+ vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) {
if (!(vma->vm_flags & VM_SHARED))
continue;
@@ -2129,7 +2117,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
}
}
- mutex_unlock(&inode->i_mapping->i_mmap_mutex);
+ i_mmap_unlock_read(inode->i_mapping);
up_write(&nommu_region_sem);
return 0;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 864bba992735..d503e9ce1c7b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -281,14 +281,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
if (oom_task_origin(task))
return OOM_SCAN_SELECT;
- if (task->flags & PF_EXITING && !force_kill) {
- /*
- * If this task is not being ptraced on exit, then wait for it
- * to finish before killing some other task unnecessarily.
- */
- if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
- return OOM_SCAN_ABORT;
- }
+ if (task_will_free_mem(task) && !force_kill)
+ return OOM_SCAN_ABORT;
+
return OOM_SCAN_OK;
}
@@ -443,7 +438,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* If the task is already exiting, don't alarm the sysadmin or kill
* its children or threads, just set TIF_MEMDIE so it can die quickly
*/
- if (p->flags & PF_EXITING) {
+ if (task_will_free_mem(p)) {
set_tsk_thread_flag(p, TIF_MEMDIE);
put_task_struct(p);
return;
@@ -649,7 +644,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
*/
- if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
+ if (fatal_signal_pending(current) || task_will_free_mem(current)) {
set_thread_flag(TIF_MEMDIE);
return;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df542feaac3b..fa974d87f60d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
+#include <linux/page_ext.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
@@ -55,9 +56,10 @@
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/migrate.h>
-#include <linux/page-debug-flags.h>
+#include <linux/page_ext.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
+#include <linux/page_owner.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -424,6 +426,42 @@ static inline void prep_zero_page(struct page *page, unsigned int order,
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
+bool _debug_pagealloc_enabled __read_mostly;
+bool _debug_guardpage_enabled __read_mostly;
+
+static int __init early_debug_pagealloc(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ if (strcmp(buf, "on") == 0)
+ _debug_pagealloc_enabled = true;
+
+ return 0;
+}
+early_param("debug_pagealloc", early_debug_pagealloc);
+
+static bool need_debug_guardpage(void)
+{
+ /* If we don't use debug_pagealloc, we don't need guard page */
+ if (!debug_pagealloc_enabled())
+ return false;
+
+ return true;
+}
+
+static void init_debug_guardpage(void)
+{
+ if (!debug_pagealloc_enabled())
+ return;
+
+ _debug_guardpage_enabled = true;
+}
+
+struct page_ext_operations debug_guardpage_ops = {
+ .need = need_debug_guardpage,
+ .init = init_debug_guardpage,
+};
static int __init debug_guardpage_minorder_setup(char *buf)
{
@@ -439,18 +477,44 @@ static int __init debug_guardpage_minorder_setup(char *buf)
}
__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
-static inline void set_page_guard_flag(struct page *page)
+static inline void set_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype)
{
- __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+ struct page_ext *page_ext;
+
+ if (!debug_guardpage_enabled())
+ return;
+
+ page_ext = lookup_page_ext(page);
+ __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
+
+ INIT_LIST_HEAD(&page->lru);
+ set_page_private(page, order);
+ /* Guard pages are not available for any usage */
+ __mod_zone_freepage_state(zone, -(1 << order), migratetype);
}
-static inline void clear_page_guard_flag(struct page *page)
+static inline void clear_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype)
{
- __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+ struct page_ext *page_ext;
+
+ if (!debug_guardpage_enabled())
+ return;
+
+ page_ext = lookup_page_ext(page);
+ __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
+
+ set_page_private(page, 0);
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
-static inline void set_page_guard_flag(struct page *page) { }
-static inline void clear_page_guard_flag(struct page *page) { }
+struct page_ext_operations debug_guardpage_ops = { NULL, };
+static inline void set_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype) {}
+static inline void clear_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype) {}
#endif
static inline void set_page_order(struct page *page, unsigned int order)
@@ -581,12 +645,7 @@ static inline void __free_one_page(struct page *page,
* merge with it and move up one order.
*/
if (page_is_guard(buddy)) {
- clear_page_guard_flag(buddy);
- set_page_private(buddy, 0);
- if (!is_migrate_isolate(migratetype)) {
- __mod_zone_freepage_state(zone, 1 << order,
- migratetype);
- }
+ clear_page_guard(zone, buddy, order, migratetype);
} else {
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
@@ -755,6 +814,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
if (bad)
return false;
+ reset_page_owner(page, order);
+
if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),
PAGE_SIZE << order);
@@ -861,23 +922,18 @@ static inline void expand(struct zone *zone, struct page *page,
size >>= 1;
VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (high < debug_guardpage_minorder()) {
+ if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
+ debug_guardpage_enabled() &&
+ high < debug_guardpage_minorder()) {
/*
* Mark as guard pages (or page), that will allow to
* merge back to allocator when buddy will be freed.
* Corresponding page table entries will not be touched,
* pages will stay not present in virtual address space
*/
- INIT_LIST_HEAD(&page[size].lru);
- set_page_guard_flag(&page[size]);
- set_page_private(&page[size], high);
- /* Guard pages are not available for any usage */
- __mod_zone_freepage_state(zone, -(1 << high),
- migratetype);
+ set_page_guard(zone, &page[size], high, migratetype);
continue;
}
-#endif
list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;
set_page_order(&page[size], high);
@@ -935,6 +991,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
+ set_page_owner(page, order, gfp_flags);
+
return 0;
}
@@ -1507,8 +1565,11 @@ void split_page(struct page *page, unsigned int order)
split_page(virt_to_page(page[0].shadow), order);
#endif
- for (i = 1; i < (1 << order); i++)
+ set_page_owner(page, 0, 0);
+ for (i = 1; i < (1 << order); i++) {
set_page_refcounted(page + i);
+ set_page_owner(page + i, 0, 0);
+ }
}
EXPORT_SYMBOL_GPL(split_page);
@@ -1548,6 +1609,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
}
}
+ set_page_owner(page, order, 0);
return 1UL << order;
}
@@ -4856,6 +4918,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
+ pgdat_page_ext_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
@@ -4874,16 +4937,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
* and per-cpu initialisations
*/
memmap_pages = calc_memmap_size(size, realsize);
- if (freesize >= memmap_pages) {
- freesize -= memmap_pages;
- if (memmap_pages)
- printk(KERN_DEBUG
- " %s zone: %lu pages used for memmap\n",
- zone_names[j], memmap_pages);
- } else
- printk(KERN_WARNING
- " %s zone: %lu pages exceeds freesize %lu\n",
- zone_names[j], memmap_pages, freesize);
+ if (!is_highmem_idx(j)) {
+ if (freesize >= memmap_pages) {
+ freesize -= memmap_pages;
+ if (memmap_pages)
+ printk(KERN_DEBUG
+ " %s zone: %lu pages used for memmap\n",
+ zone_names[j], memmap_pages);
+ } else
+ printk(KERN_WARNING
+ " %s zone: %lu pages exceeds freesize %lu\n",
+ zone_names[j], memmap_pages, freesize);
+ }
/* Account for reserved pages */
if (j == 0 && freesize > dma_reserve) {
@@ -6221,9 +6286,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
if (!PageLRU(page))
found++;
/*
- * If there are RECLAIMABLE pages, we need to check it.
- * But now, memory offline itself doesn't call shrink_slab()
- * and it still to be fixed.
+ * If there are RECLAIMABLE pages, we need to check
+ * it. But now, memory offline itself doesn't call
+ * shrink_node_slabs() and it still to be fixed.
*/
/*
* If the page is not RAM, page_count()should be 0.
diff --git a/mm/page_ext.c b/mm/page_ext.c
new file mode 100644
index 000000000000..d86fd2f5353f
--- /dev/null
+++ b/mm/page_ext.c
@@ -0,0 +1,403 @@
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/bootmem.h>
+#include <linux/page_ext.h>
+#include <linux/memory.h>
+#include <linux/vmalloc.h>
+#include <linux/kmemleak.h>
+#include <linux/page_owner.h>
+
+/*
+ * struct page extension
+ *
+ * This is the feature to manage memory for extended data per page.
+ *
+ * Until now, we must modify struct page itself to store extra data per page.
+ * This requires rebuilding the kernel and it is really time consuming process.
+ * And, sometimes, rebuild is impossible due to third party module dependency.
+ * At last, enlarging struct page could cause un-wanted system behaviour change.
+ *
+ * This feature is intended to overcome above mentioned problems. This feature
+ * allocates memory for extended data per page in certain place rather than
+ * the struct page itself. This memory can be accessed by the accessor
+ * functions provided by this code. During the boot process, it checks whether
+ * allocation of huge chunk of memory is needed or not. If not, it avoids
+ * allocating memory at all. With this advantage, we can include this feature
+ * into the kernel in default and can avoid rebuild and solve related problems.
+ *
+ * To help these things to work well, there are two callbacks for clients. One
+ * is the need callback which is mandatory if user wants to avoid useless
+ * memory allocation at boot-time. The other is optional, init callback, which
+ * is used to do proper initialization after memory is allocated.
+ *
+ * The need callback is used to decide whether extended memory allocation is
+ * needed or not. Sometimes users want to deactivate some features in this
+ * boot and extra memory would be unneccessary. In this case, to avoid
+ * allocating huge chunk of memory, each clients represent their need of
+ * extra memory through the need callback. If one of the need callbacks
+ * returns true, it means that someone needs extra memory so that
+ * page extension core should allocates memory for page extension. If
+ * none of need callbacks return true, memory isn't needed at all in this boot
+ * and page extension core can skip to allocate memory. As result,
+ * none of memory is wasted.
+ *
+ * The init callback is used to do proper initialization after page extension
+ * is completely initialized. In sparse memory system, extra memory is
+ * allocated some time later than memmap is allocated. In other words, lifetime
+ * of memory for page extension isn't same with memmap for struct page.
+ * Therefore, clients can't store extra data until page extension is
+ * initialized, even if pages are allocated and used freely. This could
+ * cause inadequate state of extra data per page, so, to prevent it, client
+ * can utilize this callback to initialize the state of it correctly.
+ */
+
+static struct page_ext_operations *page_ext_ops[] = {
+ &debug_guardpage_ops,
+#ifdef CONFIG_PAGE_POISONING
+ &page_poisoning_ops,
+#endif
+#ifdef CONFIG_PAGE_OWNER
+ &page_owner_ops,
+#endif
+};
+
+static unsigned long total_usage;
+
+static bool __init invoke_need_callbacks(void)
+{
+ int i;
+ int entries = ARRAY_SIZE(page_ext_ops);
+
+ for (i = 0; i < entries; i++) {
+ if (page_ext_ops[i]->need && page_ext_ops[i]->need())
+ return true;
+ }
+
+ return false;
+}
+
+static void __init invoke_init_callbacks(void)
+{
+ int i;
+ int entries = ARRAY_SIZE(page_ext_ops);
+
+ for (i = 0; i < entries; i++) {
+ if (page_ext_ops[i]->init)
+ page_ext_ops[i]->init();
+ }
+}
+
+#if !defined(CONFIG_SPARSEMEM)
+
+
+void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
+{
+ pgdat->node_page_ext = NULL;
+}
+
+struct page_ext *lookup_page_ext(struct page *page)
+{
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long offset;
+ struct page_ext *base;
+
+ base = NODE_DATA(page_to_nid(page))->node_page_ext;
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * The sanity checks the page allocator does upon freeing a
+ * page can reach here before the page_ext arrays are
+ * allocated when feeding a range of pages to the allocator
+ * for the first time during bootup or memory hotplug.
+ */
+ if (unlikely(!base))
+ return NULL;
+#endif
+ offset = pfn - round_down(node_start_pfn(page_to_nid(page)),
+ MAX_ORDER_NR_PAGES);
+ return base + offset;
+}
+
+static int __init alloc_node_page_ext(int nid)
+{
+ struct page_ext *base;
+ unsigned long table_size;
+ unsigned long nr_pages;
+
+ nr_pages = NODE_DATA(nid)->node_spanned_pages;
+ if (!nr_pages)
+ return 0;
+
+ /*
+ * Need extra space if node range is not aligned with
+ * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm
+ * checks buddy's status, range could be out of exact node range.
+ */
+ if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
+ !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
+ nr_pages += MAX_ORDER_NR_PAGES;
+
+ table_size = sizeof(struct page_ext) * nr_pages;
+
+ base = memblock_virt_alloc_try_nid_nopanic(
+ table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
+ if (!base)
+ return -ENOMEM;
+ NODE_DATA(nid)->node_page_ext = base;
+ total_usage += table_size;
+ return 0;
+}
+
+void __init page_ext_init_flatmem(void)
+{
+
+ int nid, fail;
+
+ if (!invoke_need_callbacks())
+ return;
+
+ for_each_online_node(nid) {
+ fail = alloc_node_page_ext(nid);
+ if (fail)
+ goto fail;
+ }
+ pr_info("allocated %ld bytes of page_ext\n", total_usage);
+ invoke_init_callbacks();
+ return;
+
+fail:
+ pr_crit("allocation of page_ext failed.\n");
+ panic("Out of memory");
+}
+
+#else /* CONFIG_FLAT_NODE_MEM_MAP */
+
+struct page_ext *lookup_page_ext(struct page *page)
+{
+ unsigned long pfn = page_to_pfn(page);
+ struct mem_section *section = __pfn_to_section(pfn);
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * The sanity checks the page allocator does upon freeing a
+ * page can reach here before the page_ext arrays are
+ * allocated when feeding a range of pages to the allocator
+ * for the first time during bootup or memory hotplug.
+ */
+ if (!section->page_ext)
+ return NULL;
+#endif
+ return section->page_ext + pfn;
+}
+
+static void *__meminit alloc_page_ext(size_t size, int nid)
+{
+ gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
+ void *addr = NULL;
+
+ addr = alloc_pages_exact_nid(nid, size, flags);
+ if (addr) {
+ kmemleak_alloc(addr, size, 1, flags);
+ return addr;
+ }
+
+ if (node_state(nid, N_HIGH_MEMORY))
+ addr = vzalloc_node(size, nid);
+ else
+ addr = vzalloc(size);
+
+ return addr;
+}
+
+static int __meminit init_section_page_ext(unsigned long pfn, int nid)
+{
+ struct mem_section *section;
+ struct page_ext *base;
+ unsigned long table_size;
+
+ section = __pfn_to_section(pfn);
+
+ if (section->page_ext)
+ return 0;
+
+ table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
+ base = alloc_page_ext(table_size, nid);
+
+ /*
+ * The value stored in section->page_ext is (base - pfn)
+ * and it does not point to the memory block allocated above,
+ * causing kmemleak false positives.
+ */
+ kmemleak_not_leak(base);
+
+ if (!base) {
+ pr_err("page ext allocation failure\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * The passed "pfn" may not be aligned to SECTION. For the calculation
+ * we need to apply a mask.
+ */
+ pfn &= PAGE_SECTION_MASK;
+ section->page_ext = base - pfn;
+ total_usage += table_size;
+ return 0;
+}
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_page_ext(void *addr)
+{
+ if (is_vmalloc_addr(addr)) {
+ vfree(addr);
+ } else {
+ struct page *page = virt_to_page(addr);
+ size_t table_size;
+
+ table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
+
+ BUG_ON(PageReserved(page));
+ free_pages_exact(addr, table_size);
+ }
+}
+
+static void __free_page_ext(unsigned long pfn)
+{
+ struct mem_section *ms;
+ struct page_ext *base;
+
+ ms = __pfn_to_section(pfn);
+ if (!ms || !ms->page_ext)
+ return;
+ base = ms->page_ext + pfn;
+ free_page_ext(base);
+ ms->page_ext = NULL;
+}
+
+static int __meminit online_page_ext(unsigned long start_pfn,
+ unsigned long nr_pages,
+ int nid)
+{
+ unsigned long start, end, pfn;
+ int fail = 0;
+
+ start = SECTION_ALIGN_DOWN(start_pfn);
+ end = SECTION_ALIGN_UP(start_pfn + nr_pages);
+
+ if (nid == -1) {
+ /*
+ * In this case, "nid" already exists and contains valid memory.
+ * "start_pfn" passed to us is a pfn which is an arg for
+ * online__pages(), and start_pfn should exist.
+ */
+ nid = pfn_to_nid(start_pfn);
+ VM_BUG_ON(!node_state(nid, N_ONLINE));
+ }
+
+ for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
+ if (!pfn_present(pfn))
+ continue;
+ fail = init_section_page_ext(pfn, nid);
+ }
+ if (!fail)
+ return 0;
+
+ /* rollback */
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+ __free_page_ext(pfn);
+
+ return -ENOMEM;
+}
+
+static int __meminit offline_page_ext(unsigned long start_pfn,
+ unsigned long nr_pages, int nid)
+{
+ unsigned long start, end, pfn;
+
+ start = SECTION_ALIGN_DOWN(start_pfn);
+ end = SECTION_ALIGN_UP(start_pfn + nr_pages);
+
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+ __free_page_ext(pfn);
+ return 0;
+
+}
+
+static int __meminit page_ext_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct memory_notify *mn = arg;
+ int ret = 0;
+
+ switch (action) {
+ case MEM_GOING_ONLINE:
+ ret = online_page_ext(mn->start_pfn,
+ mn->nr_pages, mn->status_change_nid);
+ break;
+ case MEM_OFFLINE:
+ offline_page_ext(mn->start_pfn,
+ mn->nr_pages, mn->status_change_nid);
+ break;
+ case MEM_CANCEL_ONLINE:
+ offline_page_ext(mn->start_pfn,
+ mn->nr_pages, mn->status_change_nid);
+ break;
+ case MEM_GOING_OFFLINE:
+ break;
+ case MEM_ONLINE:
+ case MEM_CANCEL_OFFLINE:
+ break;
+ }
+
+ return notifier_from_errno(ret);
+}
+
+#endif
+
+void __init page_ext_init(void)
+{
+ unsigned long pfn;
+ int nid;
+
+ if (!invoke_need_callbacks())
+ return;
+
+ for_each_node_state(nid, N_MEMORY) {
+ unsigned long start_pfn, end_pfn;
+
+ start_pfn = node_start_pfn(nid);
+ end_pfn = node_end_pfn(nid);
+ /*
+ * start_pfn and end_pfn may not be aligned to SECTION and the
+ * page->flags of out of node pages are not initialized. So we
+ * scan [start_pfn, the biggest section's pfn < end_pfn) here.
+ */
+ for (pfn = start_pfn; pfn < end_pfn;
+ pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
+
+ if (!pfn_valid(pfn))
+ continue;
+ /*
+ * Nodes's pfns can be overlapping.
+ * We know some arch can have a nodes layout such as
+ * -------------pfn-------------->
+ * N0 | N1 | N2 | N0 | N1 | N2|....
+ */
+ if (pfn_to_nid(pfn) != nid)
+ continue;
+ if (init_section_page_ext(pfn, nid))
+ goto oom;
+ }
+ }
+ hotplug_memory_notifier(page_ext_callback, 0);
+ pr_info("allocated %ld bytes of page_ext\n", total_usage);
+ invoke_init_callbacks();
+ return;
+
+oom:
+ panic("Out of memory");
+}
+
+void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
+{
+}
+
+#endif
diff --git a/mm/page_owner.c b/mm/page_owner.c
new file mode 100644
index 000000000000..9ab4a9b5bc09
--- /dev/null
+++ b/mm/page_owner.c
@@ -0,0 +1,311 @@
+#include <linux/debugfs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/bootmem.h>
+#include <linux/stacktrace.h>
+#include <linux/page_owner.h>
+#include "internal.h"
+
+static bool page_owner_disabled = true;
+bool page_owner_inited __read_mostly;
+
+static void init_early_allocated_pages(void);
+
+static int early_page_owner_param(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ if (strcmp(buf, "on") == 0)
+ page_owner_disabled = false;
+
+ return 0;
+}
+early_param("page_owner", early_page_owner_param);
+
+static bool need_page_owner(void)
+{
+ if (page_owner_disabled)
+ return false;
+
+ return true;
+}
+
+static void init_page_owner(void)
+{
+ if (page_owner_disabled)
+ return;
+
+ page_owner_inited = true;
+ init_early_allocated_pages();
+}
+
+struct page_ext_operations page_owner_ops = {
+ .need = need_page_owner,
+ .init = init_page_owner,
+};
+
+void __reset_page_owner(struct page *page, unsigned int order)
+{
+ int i;
+ struct page_ext *page_ext;
+
+ for (i = 0; i < (1 << order); i++) {
+ page_ext = lookup_page_ext(page + i);
+ __clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
+ }
+}
+
+void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
+{
+ struct page_ext *page_ext;
+ struct stack_trace *trace;
+
+ page_ext = lookup_page_ext(page);
+
+ trace = &page_ext->trace;
+ trace->nr_entries = 0;
+ trace->max_entries = ARRAY_SIZE(page_ext->trace_entries);
+ trace->entries = &page_ext->trace_entries[0];
+ trace->skip = 3;
+ save_stack_trace(&page_ext->trace);
+
+ page_ext->order = order;
+ page_ext->gfp_mask = gfp_mask;
+
+ __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+}
+
+static ssize_t
+print_page_owner(char __user *buf, size_t count, unsigned long pfn,
+ struct page *page, struct page_ext *page_ext)
+{
+ int ret;
+ int pageblock_mt, page_mt;
+ char *kbuf;
+
+ kbuf = kmalloc(count, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ ret = snprintf(kbuf, count,
+ "Page allocated via order %u, mask 0x%x\n",
+ page_ext->order, page_ext->gfp_mask);
+
+ if (ret >= count)
+ goto err;
+
+ /* Print information relevant to grouping pages by mobility */
+ pageblock_mt = get_pfnblock_migratetype(page, pfn);
+ page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
+ ret += snprintf(kbuf + ret, count - ret,
+ "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n",
+ pfn,
+ pfn >> pageblock_order,
+ pageblock_mt,
+ pageblock_mt != page_mt ? "Fallback" : " ",
+ PageLocked(page) ? "K" : " ",
+ PageError(page) ? "E" : " ",
+ PageReferenced(page) ? "R" : " ",
+ PageUptodate(page) ? "U" : " ",
+ PageDirty(page) ? "D" : " ",
+ PageLRU(page) ? "L" : " ",
+ PageActive(page) ? "A" : " ",
+ PageSlab(page) ? "S" : " ",
+ PageWriteback(page) ? "W" : " ",
+ PageCompound(page) ? "C" : " ",
+ PageSwapCache(page) ? "B" : " ",
+ PageMappedToDisk(page) ? "M" : " ");
+
+ if (ret >= count)
+ goto err;
+
+ ret += snprint_stack_trace(kbuf + ret, count - ret,
+ &page_ext->trace, 0);
+ if (ret >= count)
+ goto err;
+
+ ret += snprintf(kbuf + ret, count - ret, "\n");
+ if (ret >= count)
+ goto err;
+
+ if (copy_to_user(buf, kbuf, ret))
+ ret = -EFAULT;
+
+ kfree(kbuf);
+ return ret;
+
+err:
+ kfree(kbuf);
+ return -ENOMEM;
+}
+
+static ssize_t
+read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ unsigned long pfn;
+ struct page *page;
+ struct page_ext *page_ext;
+
+ if (!page_owner_inited)
+ return -EINVAL;
+
+ page = NULL;
+ pfn = min_low_pfn + *ppos;
+
+ /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
+ while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
+ pfn++;
+
+ drain_all_pages(NULL);
+
+ /* Find an allocated page */
+ for (; pfn < max_pfn; pfn++) {
+ /*
+ * If the new page is in a new MAX_ORDER_NR_PAGES area,
+ * validate the area as existing, skip it if not
+ */
+ if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
+ pfn += MAX_ORDER_NR_PAGES - 1;
+ continue;
+ }
+
+ /* Check for holes within a MAX_ORDER area */
+ if (!pfn_valid_within(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+ if (PageBuddy(page)) {
+ unsigned long freepage_order = page_order_unsafe(page);
+
+ if (freepage_order < MAX_ORDER)
+ pfn += (1UL << freepage_order) - 1;
+ continue;
+ }
+
+ page_ext = lookup_page_ext(page);
+
+ /*
+ * Some pages could be missed by concurrent allocation or free,
+ * because we don't hold the zone lock.
+ */
+ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+ continue;
+
+ /* Record the next PFN to read in the file offset */
+ *ppos = (pfn - min_low_pfn) + 1;
+
+ return print_page_owner(buf, count, pfn, page, page_ext);
+ }
+
+ return 0;
+}
+
+static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
+{
+ struct page *page;
+ struct page_ext *page_ext;
+ unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
+ unsigned long end_pfn = pfn + zone->spanned_pages;
+ unsigned long count = 0;
+
+ /* Scan block by block. First and last block may be incomplete */
+ pfn = zone->zone_start_pfn;
+
+ /*
+ * Walk the zone in pageblock_nr_pages steps. If a page block spans
+ * a zone boundary, it will be double counted between zones. This does
+ * not matter as the mixed block count will still be correct
+ */
+ for (; pfn < end_pfn; ) {
+ if (!pfn_valid(pfn)) {
+ pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ continue;
+ }
+
+ block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = min(block_end_pfn, end_pfn);
+
+ page = pfn_to_page(pfn);
+
+ for (; pfn < block_end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+
+ /*
+ * We are safe to check buddy flag and order, because
+ * this is init stage and only single thread runs.
+ */
+ if (PageBuddy(page)) {
+ pfn += (1UL << page_order(page)) - 1;
+ continue;
+ }
+
+ if (PageReserved(page))
+ continue;
+
+ page_ext = lookup_page_ext(page);
+
+ /* Maybe overraping zone */
+ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+ continue;
+
+ /* Found early allocated page */
+ set_page_owner(page, 0, 0);
+ count++;
+ }
+ }
+
+ pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
+ pgdat->node_id, zone->name, count);
+}
+
+static void init_zones_in_node(pg_data_t *pgdat)
+{
+ struct zone *zone;
+ struct zone *node_zones = pgdat->node_zones;
+ unsigned long flags;
+
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ if (!populated_zone(zone))
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ init_pages_in_zone(pgdat, zone);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+}
+
+static void init_early_allocated_pages(void)
+{
+ pg_data_t *pgdat;
+
+ drain_all_pages(NULL);
+ for_each_online_pgdat(pgdat)
+ init_zones_in_node(pgdat);
+}
+
+static const struct file_operations proc_page_owner_operations = {
+ .read = read_page_owner,
+};
+
+static int __init pageowner_init(void)
+{
+ struct dentry *dentry;
+
+ if (!page_owner_inited) {
+ pr_info("page_owner is disabled\n");
+ return 0;
+ }
+
+ dentry = debugfs_create_file("page_owner", S_IRUSR, NULL,
+ NULL, &proc_page_owner_operations);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ return 0;
+}
+module_init(pageowner_init)
diff --git a/mm/rmap.c b/mm/rmap.c
index 45eba36fd673..c52f43a69eea 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,7 +23,7 @@
* inode->i_mutex (while writing or truncating, not reading or faulting)
* mm->mmap_sem
* page->flags PG_locked (lock_page)
- * mapping->i_mmap_mutex
+ * mapping->i_mmap_rwsem
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -1260,7 +1260,7 @@ out_mlock:
/*
* We need mmap_sem locking, Otherwise VM_LOCKED check makes
* unstable result and race. Plus, We can't wait here because
- * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
+ * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem.
* if trylock failed, the page remain in evictable lru and later
* vmscan could retry to move the page to unevictable lru if the
* page is actually mlocked.
@@ -1635,7 +1635,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma;
- pgoff_t pgoff = page_to_pgoff(page);
+ pgoff_t pgoff;
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
@@ -1643,6 +1643,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
if (!anon_vma)
return ret;
+ pgoff = page_to_pgoff(page);
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
@@ -1676,7 +1677,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
{
struct address_space *mapping = page->mapping;
- pgoff_t pgoff = page_to_pgoff(page);
+ pgoff_t pgoff;
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
@@ -1684,13 +1685,15 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
* The page lock not only makes sure that page->mapping cannot
* suddenly be NULLified by truncation, it makes sure that the
* structure at mapping cannot be freed and reused yet,
- * so we can safely take mapping->i_mmap_mutex.
+ * so we can safely take mapping->i_mmap_rwsem.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (!mapping)
return ret;
- mutex_lock(&mapping->i_mmap_mutex);
+
+ pgoff = page_to_pgoff(page);
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
@@ -1711,9 +1714,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
goto done;
ret = rwc->file_nonlinear(page, mapping, rwc->arg);
-
done:
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_read(mapping);
return ret;
}
diff --git a/mm/slab.c b/mm/slab.c
index fee275b5b6b7..65b5dcb6f671 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3015,7 +3015,7 @@ retry:
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
nid = zone_to_nid(zone);
- if (cpuset_zone_allowed(zone, flags | __GFP_HARDWALL) &&
+ if (cpuset_zone_allowed(zone, flags) &&
get_node(cache, nid) &&
get_node(cache, nid)->free_objects) {
obj = ____cache_alloc_node(cache,
@@ -3182,6 +3182,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
memset(ptr, 0, cachep->object_size);
}
+ memcg_kmem_put_cache(cachep);
return ptr;
}
@@ -3247,6 +3248,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
memset(objp, 0, cachep->object_size);
}
+ memcg_kmem_put_cache(cachep);
return objp;
}
diff --git a/mm/slub.c b/mm/slub.c
index 765c5884d03d..fe376fe1f4fe 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1233,13 +1233,17 @@ static inline void kfree_hook(const void *x)
kmemleak_free(x);
}
-static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
+static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+ gfp_t flags)
{
flags &= gfp_allowed_mask;
lockdep_trace_alloc(flags);
might_sleep_if(flags & __GFP_WAIT);
- return should_failslab(s->object_size, flags, s->flags);
+ if (should_failslab(s->object_size, flags, s->flags))
+ return NULL;
+
+ return memcg_kmem_get_cache(s, flags);
}
static inline void slab_post_alloc_hook(struct kmem_cache *s,
@@ -1248,6 +1252,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
flags &= gfp_allowed_mask;
kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
+ memcg_kmem_put_cache(s);
}
static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -1665,8 +1670,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
n = get_node(s, zone_to_nid(zone));
- if (n && cpuset_zone_allowed(zone,
- flags | __GFP_HARDWALL) &&
+ if (n && cpuset_zone_allowed(zone, flags) &&
n->nr_partial > s->min_partial) {
object = get_partial_node(s, n, c, flags);
if (object) {
@@ -2384,10 +2388,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
struct page *page;
unsigned long tid;
- if (slab_pre_alloc_hook(s, gfpflags))
+ s = slab_pre_alloc_hook(s, gfpflags);
+ if (!s)
return NULL;
-
- s = memcg_kmem_get_cache(s, gfpflags);
redo:
/*
* Must read kmem_cache cpu data via this cpu ptr. Preemption is
diff --git a/mm/vmacache.c b/mm/vmacache.c
index 9f25af825dec..b6e3662fe339 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -17,6 +17,8 @@ void vmacache_flush_all(struct mm_struct *mm)
{
struct task_struct *g, *p;
+ count_vm_vmacache_event(VMACACHE_FULL_FLUSHES);
+
/*
* Single threaded tasks need not iterate the entire
* list of process. We can avoid the flushing as well
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8a18196fcdff..39c338896416 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2574,10 +2574,10 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
if (!counters)
return;
- /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
- smp_rmb();
if (v->flags & VM_UNINITIALIZED)
return;
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
memset(counters, 0, nr_node_ids * sizeof(unsigned int));
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a384339bf718..bd9a72bc4a1b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -229,9 +229,10 @@ EXPORT_SYMBOL(unregister_shrinker);
#define SHRINK_BATCH 128
-static unsigned long
-shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
- unsigned long nr_pages_scanned, unsigned long lru_pages)
+static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
+ struct shrinker *shrinker,
+ unsigned long nr_scanned,
+ unsigned long nr_eligible)
{
unsigned long freed = 0;
unsigned long long delta;
@@ -255,9 +256,9 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
total_scan = nr;
- delta = (4 * nr_pages_scanned) / shrinker->seeks;
+ delta = (4 * nr_scanned) / shrinker->seeks;
delta *= freeable;
- do_div(delta, lru_pages + 1);
+ do_div(delta, nr_eligible + 1);
total_scan += delta;
if (total_scan < 0) {
pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
@@ -289,8 +290,8 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
total_scan = freeable * 2;
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
- nr_pages_scanned, lru_pages,
- freeable, delta, total_scan);
+ nr_scanned, nr_eligible,
+ freeable, delta, total_scan);
/*
* Normally, we should not scan less than batch_size objects in one
@@ -339,34 +340,37 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
return freed;
}
-/*
- * Call the shrink functions to age shrinkable caches
- *
- * Here we assume it costs one seek to replace a lru page and that it also
- * takes a seek to recreate a cache object. With this in mind we age equal
- * percentages of the lru and ageable caches. This should balance the seeks
- * generated by these structures.
+/**
+ * shrink_node_slabs - shrink slab caches of a given node
+ * @gfp_mask: allocation context
+ * @nid: node whose slab caches to target
+ * @nr_scanned: pressure numerator
+ * @nr_eligible: pressure denominator
*
- * If the vm encountered mapped pages on the LRU it increase the pressure on
- * slab to avoid swapping.
+ * Call the shrink functions to age shrinkable caches.
*
- * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
+ * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
+ * unaware shrinkers will receive a node id of 0 instead.
*
- * `lru_pages' represents the number of on-LRU pages in all the zones which
- * are eligible for the caller's allocation attempt. It is used for balancing
- * slab reclaim versus page reclaim.
+ * @nr_scanned and @nr_eligible form a ratio that indicate how much of
+ * the available objects should be scanned. Page reclaim for example
+ * passes the number of pages scanned and the number of pages on the
+ * LRU lists that it considered on @nid, plus a bias in @nr_scanned
+ * when it encountered mapped pages. The ratio is further biased by
+ * the ->seeks setting of the shrink function, which indicates the
+ * cost to recreate an object relative to that of an LRU page.
*
- * Returns the number of slab objects which we shrunk.
+ * Returns the number of reclaimed slab objects.
*/
-unsigned long shrink_slab(struct shrink_control *shrinkctl,
- unsigned long nr_pages_scanned,
- unsigned long lru_pages)
+unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
+ unsigned long nr_scanned,
+ unsigned long nr_eligible)
{
struct shrinker *shrinker;
unsigned long freed = 0;
- if (nr_pages_scanned == 0)
- nr_pages_scanned = SWAP_CLUSTER_MAX;
+ if (nr_scanned == 0)
+ nr_scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem)) {
/*
@@ -380,20 +384,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
}
list_for_each_entry(shrinker, &shrinker_list, list) {
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
- shrinkctl->nid = 0;
- freed += shrink_slab_node(shrinkctl, shrinker,
- nr_pages_scanned, lru_pages);
- continue;
- }
+ struct shrink_control sc = {
+ .gfp_mask = gfp_mask,
+ .nid = nid,
+ };
- for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
- if (node_online(shrinkctl->nid))
- freed += shrink_slab_node(shrinkctl, shrinker,
- nr_pages_scanned, lru_pages);
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+ sc.nid = 0;
- }
+ freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible);
}
+
up_read(&shrinker_rwsem);
out:
cond_resched();
@@ -1876,7 +1877,8 @@ enum scan_balance {
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
static void get_scan_count(struct lruvec *lruvec, int swappiness,
- struct scan_control *sc, unsigned long *nr)
+ struct scan_control *sc, unsigned long *nr,
+ unsigned long *lru_pages)
{
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2];
@@ -2022,6 +2024,7 @@ out:
some_scanned = false;
/* Only use force_scan on second pass. */
for (pass = 0; !some_scanned && pass < 2; pass++) {
+ *lru_pages = 0;
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
unsigned long size;
@@ -2048,14 +2051,19 @@ out:
case SCAN_FILE:
case SCAN_ANON:
/* Scan one type exclusively */
- if ((scan_balance == SCAN_FILE) != file)
+ if ((scan_balance == SCAN_FILE) != file) {
+ size = 0;
scan = 0;
+ }
break;
default:
/* Look ma, no brain */
BUG();
}
+
+ *lru_pages += size;
nr[lru] = scan;
+
/*
* Skip the second pass and don't force_scan,
* if we found something to scan.
@@ -2069,7 +2077,7 @@ out:
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
- struct scan_control *sc)
+ struct scan_control *sc, unsigned long *lru_pages)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long targets[NR_LRU_LISTS];
@@ -2080,7 +2088,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
struct blk_plug plug;
bool scan_adjusted;
- get_scan_count(lruvec, swappiness, sc, nr);
+ get_scan_count(lruvec, swappiness, sc, nr, lru_pages);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
@@ -2258,7 +2266,8 @@ static inline bool should_continue_reclaim(struct zone *zone,
}
}
-static bool shrink_zone(struct zone *zone, struct scan_control *sc)
+static bool shrink_zone(struct zone *zone, struct scan_control *sc,
+ bool is_classzone)
{
unsigned long nr_reclaimed, nr_scanned;
bool reclaimable = false;
@@ -2269,6 +2278,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
.zone = zone,
.priority = sc->priority,
};
+ unsigned long zone_lru_pages = 0;
struct mem_cgroup *memcg;
nr_reclaimed = sc->nr_reclaimed;
@@ -2276,13 +2286,15 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
+ unsigned long lru_pages;
struct lruvec *lruvec;
int swappiness;
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
swappiness = mem_cgroup_swappiness(memcg);
- shrink_lruvec(lruvec, swappiness, sc);
+ shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
+ zone_lru_pages += lru_pages;
/*
* Direct reclaim and kswapd have to scan all memory
@@ -2302,6 +2314,25 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
memcg = mem_cgroup_iter(root, memcg, &reclaim);
} while (memcg);
+ /*
+ * Shrink the slab caches in the same proportion that
+ * the eligible LRU pages were scanned.
+ */
+ if (global_reclaim(sc) && is_classzone) {
+ struct reclaim_state *reclaim_state;
+
+ shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone),
+ sc->nr_scanned - nr_scanned,
+ zone_lru_pages);
+
+ reclaim_state = current->reclaim_state;
+ if (reclaim_state) {
+ sc->nr_reclaimed +=
+ reclaim_state->reclaimed_slab;
+ reclaim_state->reclaimed_slab = 0;
+ }
+ }
+
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
@@ -2376,12 +2407,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
- unsigned long lru_pages = 0;
- struct reclaim_state *reclaim_state = current->reclaim_state;
gfp_t orig_mask;
- struct shrink_control shrink = {
- .gfp_mask = sc->gfp_mask,
- };
enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
bool reclaimable = false;
@@ -2394,12 +2420,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (buffer_heads_over_limit)
sc->gfp_mask |= __GFP_HIGHMEM;
- nodes_clear(shrink.nodes_to_scan);
-
for_each_zone_zonelist_nodemask(zone, z, zonelist,
- gfp_zone(sc->gfp_mask), sc->nodemask) {
+ requested_highidx, sc->nodemask) {
+ enum zone_type classzone_idx;
+
if (!populated_zone(zone))
continue;
+
+ classzone_idx = requested_highidx;
+ while (!populated_zone(zone->zone_pgdat->node_zones +
+ classzone_idx))
+ classzone_idx--;
+
/*
* Take care memory controller reclaiming has small influence
* to global LRU.
@@ -2409,9 +2441,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
GFP_KERNEL | __GFP_HARDWALL))
continue;
- lru_pages += zone_reclaimable_pages(zone);
- node_set(zone_to_nid(zone), shrink.nodes_to_scan);
-
if (sc->priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue; /* Let kswapd poll it */
@@ -2450,7 +2479,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
/* need some check for avoid more shrink_zone() */
}
- if (shrink_zone(zone, sc))
+ if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx))
reclaimable = true;
if (global_reclaim(sc) &&
@@ -2459,20 +2488,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
}
/*
- * Don't shrink slabs when reclaiming memory from over limit cgroups
- * but do shrink slab at least once when aborting reclaim for
- * compaction to avoid unevenly scanning file/anon LRU pages over slab
- * pages.
- */
- if (global_reclaim(sc)) {
- shrink_slab(&shrink, sc->nr_scanned, lru_pages);
- if (reclaim_state) {
- sc->nr_reclaimed += reclaim_state->reclaimed_slab;
- reclaim_state->reclaimed_slab = 0;
- }
- }
-
- /*
* Restore to original mask to avoid the impact on the caller if we
* promoted it to __GFP_HIGHMEM.
*/
@@ -2736,6 +2751,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
};
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
int swappiness = mem_cgroup_swappiness(memcg);
+ unsigned long lru_pages;
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2751,7 +2767,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_lruvec(lruvec, swappiness, &sc);
+ shrink_lruvec(lruvec, swappiness, &sc, &lru_pages);
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
@@ -2932,15 +2948,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
static bool kswapd_shrink_zone(struct zone *zone,
int classzone_idx,
struct scan_control *sc,
- unsigned long lru_pages,
unsigned long *nr_attempted)
{
int testorder = sc->order;
unsigned long balance_gap;
- struct reclaim_state *reclaim_state = current->reclaim_state;
- struct shrink_control shrink = {
- .gfp_mask = sc->gfp_mask,
- };
bool lowmem_pressure;
/* Reclaim above the high watermark. */
@@ -2975,13 +2986,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
balance_gap, classzone_idx))
return true;
- shrink_zone(zone, sc);
- nodes_clear(shrink.nodes_to_scan);
- node_set(zone_to_nid(zone), shrink.nodes_to_scan);
-
- reclaim_state->reclaimed_slab = 0;
- shrink_slab(&shrink, sc->nr_scanned, lru_pages);
- sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+ shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
/* Account for the number of pages attempted to reclaim */
*nr_attempted += sc->nr_to_reclaim;
@@ -3042,7 +3047,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
count_vm_event(PAGEOUTRUN);
do {
- unsigned long lru_pages = 0;
unsigned long nr_attempted = 0;
bool raise_priority = true;
bool pgdat_needs_compaction = (order > 0);
@@ -3102,8 +3106,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
if (!populated_zone(zone))
continue;
- lru_pages += zone_reclaimable_pages(zone);
-
/*
* If any zone is currently balanced then kswapd will
* not call compaction as it is expected that the
@@ -3159,8 +3161,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
* that that high watermark would be met at 100%
* efficiency.
*/
- if (kswapd_shrink_zone(zone, end_zone, &sc,
- lru_pages, &nr_attempted))
+ if (kswapd_shrink_zone(zone, end_zone,
+ &sc, &nr_attempted))
raise_priority = false;
}
@@ -3612,10 +3614,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
.may_swap = 1,
};
- struct shrink_control shrink = {
- .gfp_mask = sc.gfp_mask,
- };
- unsigned long nr_slab_pages0, nr_slab_pages1;
cond_resched();
/*
@@ -3634,44 +3632,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* priorities until we have enough memory freed.
*/
do {
- shrink_zone(zone, &sc);
+ shrink_zone(zone, &sc, true);
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
- nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
- if (nr_slab_pages0 > zone->min_slab_pages) {
- /*
- * shrink_slab() does not currently allow us to determine how
- * many pages were freed in this zone. So we take the current
- * number of slab pages and shake the slab until it is reduced
- * by the same nr_pages that we used for reclaiming unmapped
- * pages.
- */
- nodes_clear(shrink.nodes_to_scan);
- node_set(zone_to_nid(zone), shrink.nodes_to_scan);
- for (;;) {
- unsigned long lru_pages = zone_reclaimable_pages(zone);
-
- /* No reclaimable slab or very low memory pressure */
- if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
- break;
-
- /* Freed enough memory */
- nr_slab_pages1 = zone_page_state(zone,
- NR_SLAB_RECLAIMABLE);
- if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
- break;
- }
-
- /*
- * Update nr_reclaimed by the number of slab pages we
- * reclaimed from this zone.
- */
- nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
- if (nr_slab_pages1 < nr_slab_pages0)
- sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
- }
-
p->reclaim_state = NULL;
current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
lockdep_clear_current_reclaim_state();
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1b12d390dc68..1284f89fca08 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -22,6 +22,8 @@
#include <linux/writeback.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
+#include <linux/page_ext.h>
+#include <linux/page_owner.h>
#include "internal.h"
@@ -898,6 +900,7 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_DEBUG_VM_VMACACHE
"vmacache_find_calls",
"vmacache_find_hits",
+ "vmacache_full_flushes",
#endif
#endif /* CONFIG_VM_EVENTS_COUNTERS */
};
@@ -1017,6 +1020,104 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
return 0;
}
+#ifdef CONFIG_PAGE_OWNER
+static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
+ pg_data_t *pgdat,
+ struct zone *zone)
+{
+ struct page *page;
+ struct page_ext *page_ext;
+ unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
+ unsigned long end_pfn = pfn + zone->spanned_pages;
+ unsigned long count[MIGRATE_TYPES] = { 0, };
+ int pageblock_mt, page_mt;
+ int i;
+
+ /* Scan block by block. First and last block may be incomplete */
+ pfn = zone->zone_start_pfn;
+
+ /*
+ * Walk the zone in pageblock_nr_pages steps. If a page block spans
+ * a zone boundary, it will be double counted between zones. This does
+ * not matter as the mixed block count will still be correct
+ */
+ for (; pfn < end_pfn; ) {
+ if (!pfn_valid(pfn)) {
+ pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ continue;
+ }
+
+ block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = min(block_end_pfn, end_pfn);
+
+ page = pfn_to_page(pfn);
+ pageblock_mt = get_pfnblock_migratetype(page, pfn);
+
+ for (; pfn < block_end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+ if (PageBuddy(page)) {
+ pfn += (1UL << page_order(page)) - 1;
+ continue;
+ }
+
+ if (PageReserved(page))
+ continue;
+
+ page_ext = lookup_page_ext(page);
+
+ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+ continue;
+
+ page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
+ if (pageblock_mt != page_mt) {
+ if (is_migrate_cma(pageblock_mt))
+ count[MIGRATE_MOVABLE]++;
+ else
+ count[pageblock_mt]++;
+
+ pfn = block_end_pfn;
+ break;
+ }
+ pfn += (1UL << page_ext->order) - 1;
+ }
+ }
+
+ /* Print counts */
+ seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+ for (i = 0; i < MIGRATE_TYPES; i++)
+ seq_printf(m, "%12lu ", count[i]);
+ seq_putc(m, '\n');
+}
+#endif /* CONFIG_PAGE_OWNER */
+
+/*
+ * Print out the number of pageblocks for each migratetype that contain pages
+ * of other types. This gives an indication of how well fallbacks are being
+ * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
+ * to determine what is going on
+ */
+static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
+{
+#ifdef CONFIG_PAGE_OWNER
+ int mtype;
+
+ if (!page_owner_inited)
+ return;
+
+ drain_all_pages(NULL);
+
+ seq_printf(m, "\n%-23s", "Number of mixed blocks ");
+ for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
+ seq_printf(m, "%12s ", migratetype_names[mtype]);
+ seq_putc(m, '\n');
+
+ walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print);
+#endif /* CONFIG_PAGE_OWNER */
+}
+
/*
* This prints out statistics in relation to grouping pages by mobility.
* It is expensive to collect so do not constantly read the file.
@@ -1034,6 +1135,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
seq_putc(m, '\n');
pagetypeinfo_showfree(m, pgdat);
pagetypeinfo_showblockcount(m, pgdat);
+ pagetypeinfo_showmixedcount(m, pgdat);
return 0;
}
diff --git a/mm/zbud.c b/mm/zbud.c
index ec71b37fb06c..4e387bea702e 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -132,7 +132,7 @@ static struct zbud_ops zbud_zpool_ops = {
static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
{
- return zbud_create_pool(gfp, &zbud_zpool_ops);
+ return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
}
static void zbud_zpool_destroy(void *pool)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 839a48c3ca27..4d0a063145ec 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -155,8 +155,6 @@
* (reason above)
*/
#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8)
-#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
- ZS_SIZE_CLASS_DELTA + 1)
/*
* We do not maintain any list for completely empty or full pages
@@ -171,6 +169,11 @@ enum fullness_group {
};
/*
+ * number of size_classes
+ */
+static int zs_size_classes;
+
+/*
* We assign a page to ZS_ALMOST_EMPTY fullness group when:
* n <= N / f, where
* n = number of allocated objects
@@ -214,7 +217,7 @@ struct link_free {
};
struct zs_pool {
- struct size_class size_class[ZS_SIZE_CLASSES];
+ struct size_class **size_class;
gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
@@ -468,7 +471,7 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool,
if (newfg == currfg)
goto out;
- class = &pool->size_class[class_idx];
+ class = pool->size_class[class_idx];
remove_zspage(page, class, currfg);
insert_zspage(page, class, newfg);
set_zspage_mapping(page, class_idx, newfg);
@@ -629,6 +632,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
struct page *next_page;
struct link_free *link;
unsigned int i = 1;
+ void *vaddr;
/*
* page->index stores offset of first object starting
@@ -639,8 +643,8 @@ static void init_zspage(struct page *first_page, struct size_class *class)
if (page != first_page)
page->index = off;
- link = (struct link_free *)kmap_atomic(page) +
- off / sizeof(*link);
+ vaddr = kmap_atomic(page);
+ link = (struct link_free *)vaddr + off / sizeof(*link);
while ((off += class->size) < PAGE_SIZE) {
link->next = obj_location_to_handle(page, i++);
@@ -654,7 +658,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
*/
next_page = get_next_page(page);
link->next = obj_location_to_handle(next_page, 0);
- kunmap_atomic(link);
+ kunmap_atomic(vaddr);
page = next_page;
off %= PAGE_SIZE;
}
@@ -784,7 +788,7 @@ static inline int __zs_cpu_up(struct mapping_area *area)
*/
if (area->vm_buf)
return 0;
- area->vm_buf = (char *)__get_free_page(GFP_KERNEL);
+ area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
if (!area->vm_buf)
return -ENOMEM;
return 0;
@@ -792,8 +796,7 @@ static inline int __zs_cpu_up(struct mapping_area *area)
static inline void __zs_cpu_down(struct mapping_area *area)
{
- if (area->vm_buf)
- free_page((unsigned long)area->vm_buf);
+ kfree(area->vm_buf);
area->vm_buf = NULL;
}
@@ -881,14 +884,10 @@ static struct notifier_block zs_cpu_nb = {
.notifier_call = zs_cpu_notifier
};
-static void zs_exit(void)
+static void zs_unregister_cpu_notifier(void)
{
int cpu;
-#ifdef CONFIG_ZPOOL
- zpool_unregister_driver(&zs_zpool_driver);
-#endif
-
cpu_notifier_register_begin();
for_each_online_cpu(cpu)
@@ -898,31 +897,74 @@ static void zs_exit(void)
cpu_notifier_register_done();
}
-static int zs_init(void)
+static int zs_register_cpu_notifier(void)
{
- int cpu, ret;
+ int cpu, uninitialized_var(ret);
cpu_notifier_register_begin();
__register_cpu_notifier(&zs_cpu_nb);
for_each_online_cpu(cpu) {
ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
- if (notifier_to_errno(ret)) {
- cpu_notifier_register_done();
- goto fail;
- }
+ if (notifier_to_errno(ret))
+ break;
}
cpu_notifier_register_done();
+ return notifier_to_errno(ret);
+}
+
+static void init_zs_size_classes(void)
+{
+ int nr;
+ nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
+ if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
+ nr += 1;
+
+ zs_size_classes = nr;
+}
+
+static void __exit zs_exit(void)
+{
#ifdef CONFIG_ZPOOL
- zpool_register_driver(&zs_zpool_driver);
+ zpool_unregister_driver(&zs_zpool_driver);
#endif
+ zs_unregister_cpu_notifier();
+}
+static int __init zs_init(void)
+{
+ int ret = zs_register_cpu_notifier();
+
+ if (ret) {
+ zs_unregister_cpu_notifier();
+ return ret;
+ }
+
+ init_zs_size_classes();
+
+#ifdef CONFIG_ZPOOL
+ zpool_register_driver(&zs_zpool_driver);
+#endif
return 0;
-fail:
- zs_exit();
- return notifier_to_errno(ret);
+}
+
+static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
+{
+ return pages_per_zspage * PAGE_SIZE / size;
+}
+
+static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
+{
+ if (prev->pages_per_zspage != pages_per_zspage)
+ return false;
+
+ if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
+ != get_maxobj_per_zspage(size, pages_per_zspage))
+ return false;
+
+ return true;
}
/**
@@ -937,33 +979,71 @@ fail:
*/
struct zs_pool *zs_create_pool(gfp_t flags)
{
- int i, ovhd_size;
+ int i;
struct zs_pool *pool;
+ struct size_class *prev_class = NULL;
- ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
- pool = kzalloc(ovhd_size, GFP_KERNEL);
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool)
return NULL;
- for (i = 0; i < ZS_SIZE_CLASSES; i++) {
+ pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
+ GFP_KERNEL);
+ if (!pool->size_class) {
+ kfree(pool);
+ return NULL;
+ }
+
+ /*
+ * Iterate reversly, because, size of size_class that we want to use
+ * for merging should be larger or equal to current size.
+ */
+ for (i = zs_size_classes - 1; i >= 0; i--) {
int size;
+ int pages_per_zspage;
struct size_class *class;
size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
if (size > ZS_MAX_ALLOC_SIZE)
size = ZS_MAX_ALLOC_SIZE;
+ pages_per_zspage = get_pages_per_zspage(size);
+
+ /*
+ * size_class is used for normal zsmalloc operation such
+ * as alloc/free for that size. Although it is natural that we
+ * have one size_class for each size, there is a chance that we
+ * can get more memory utilization if we use one size_class for
+ * many different sizes whose size_class have same
+ * characteristics. So, we makes size_class point to
+ * previous size_class if possible.
+ */
+ if (prev_class) {
+ if (can_merge(prev_class, size, pages_per_zspage)) {
+ pool->size_class[i] = prev_class;
+ continue;
+ }
+ }
+
+ class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
+ if (!class)
+ goto err;
- class = &pool->size_class[i];
class->size = size;
class->index = i;
+ class->pages_per_zspage = pages_per_zspage;
spin_lock_init(&class->lock);
- class->pages_per_zspage = get_pages_per_zspage(size);
+ pool->size_class[i] = class;
+ prev_class = class;
}
pool->flags = flags;
return pool;
+
+err:
+ zs_destroy_pool(pool);
+ return NULL;
}
EXPORT_SYMBOL_GPL(zs_create_pool);
@@ -971,9 +1051,15 @@ void zs_destroy_pool(struct zs_pool *pool)
{
int i;
- for (i = 0; i < ZS_SIZE_CLASSES; i++) {
+ for (i = 0; i < zs_size_classes; i++) {
int fg;
- struct size_class *class = &pool->size_class[i];
+ struct size_class *class = pool->size_class[i];
+
+ if (!class)
+ continue;
+
+ if (class->index != i)
+ continue;
for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
if (class->fullness_list[fg]) {
@@ -981,7 +1067,10 @@ void zs_destroy_pool(struct zs_pool *pool)
class->size, fg);
}
}
+ kfree(class);
}
+
+ kfree(pool->size_class);
kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
@@ -999,8 +1088,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
{
unsigned long obj;
struct link_free *link;
- int class_idx;
struct size_class *class;
+ void *vaddr;
struct page *first_page, *m_page;
unsigned long m_objidx, m_offset;
@@ -1008,9 +1097,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
return 0;
- class_idx = get_size_class_index(size);
- class = &pool->size_class[class_idx];
- BUG_ON(class_idx != class->index);
+ class = pool->size_class[get_size_class_index(size)];
spin_lock(&class->lock);
first_page = find_get_zspage(class);
@@ -1031,11 +1118,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
obj_handle_to_location(obj, &m_page, &m_objidx);
m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
- link = (struct link_free *)kmap_atomic(m_page) +
- m_offset / sizeof(*link);
+ vaddr = kmap_atomic(m_page);
+ link = (struct link_free *)vaddr + m_offset / sizeof(*link);
first_page->freelist = link->next;
memset(link, POISON_INUSE, sizeof(*link));
- kunmap_atomic(link);
+ kunmap_atomic(vaddr);
first_page->inuse++;
/* Now move the zspage to another fullness group, if required */
@@ -1051,6 +1138,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
struct link_free *link;
struct page *first_page, *f_page;
unsigned long f_objidx, f_offset;
+ void *vaddr;
int class_idx;
struct size_class *class;
@@ -1063,16 +1151,16 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
first_page = get_first_page(f_page);
get_zspage_mapping(first_page, &class_idx, &fullness);
- class = &pool->size_class[class_idx];
+ class = pool->size_class[class_idx];
f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
spin_lock(&class->lock);
/* Insert this object in containing zspage's freelist */
- link = (struct link_free *)((unsigned char *)kmap_atomic(f_page)
- + f_offset);
+ vaddr = kmap_atomic(f_page);
+ link = (struct link_free *)(vaddr + f_offset);
link->next = first_page->freelist;
- kunmap_atomic(link);
+ kunmap_atomic(vaddr);
first_page->freelist = (void *)obj;
first_page->inuse--;
@@ -1124,7 +1212,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
obj_handle_to_location(handle, &page, &obj_idx);
get_zspage_mapping(get_first_page(page), &class_idx, &fg);
- class = &pool->size_class[class_idx];
+ class = pool->size_class[class_idx];
off = obj_idx_to_offset(page, obj_idx, class->size);
area = &get_cpu_var(zs_map_area);
@@ -1158,7 +1246,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
obj_handle_to_location(handle, &page, &obj_idx);
get_zspage_mapping(get_first_page(page), &class_idx, &fg);
- class = &pool->size_class[class_idx];
+ class = pool->size_class[class_idx];
off = obj_idx_to_offset(page, obj_idx, class->size);
area = this_cpu_ptr(&zs_map_area);
diff --git a/mm/zswap.c b/mm/zswap.c
index c1543061a192..0cfce9bc51e4 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -149,11 +149,10 @@ static int __init zswap_comp_init(void)
return 0;
}
-static void zswap_comp_exit(void)
+static void __init zswap_comp_exit(void)
{
/* free percpu transforms */
- if (zswap_comp_pcpu_tfms)
- free_percpu(zswap_comp_pcpu_tfms);
+ free_percpu(zswap_comp_pcpu_tfms);
}
/*********************************
@@ -206,7 +205,7 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
**********************************/
static struct kmem_cache *zswap_entry_cache;
-static int zswap_entry_cache_create(void)
+static int __init zswap_entry_cache_create(void)
{
zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
return zswap_entry_cache == NULL;
@@ -389,7 +388,7 @@ static struct notifier_block zswap_cpu_notifier_block = {
.notifier_call = zswap_cpu_notifier
};
-static int zswap_cpu_init(void)
+static int __init zswap_cpu_init(void)
{
unsigned long cpu;