summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig17
-rw-r--r--mm/Makefile1
-rw-r--r--mm/bootmem.c6
-rw-r--r--mm/bounce.c8
-rw-r--r--mm/cleancache.c6
-rw-r--r--mm/compaction.c156
-rw-r--r--mm/filemap.c69
-rw-r--r--mm/filemap_xip.c4
-rw-r--r--mm/frontswap.c344
-rw-r--r--mm/huge_memory.c8
-rw-r--r--mm/hugetlb.c3
-rw-r--r--mm/internal.h13
-rw-r--r--mm/madvise.c18
-rw-r--r--mm/memblock.c115
-rw-r--r--mm/memcontrol.c523
-rw-r--r--mm/memory-failure.c14
-rw-r--r--mm/memory.c12
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/migrate.c5
-rw-r--r--mm/mmap.c54
-rw-r--r--mm/mmzone.c14
-rw-r--r--mm/mremap.c26
-rw-r--r--mm/nobootmem.c40
-rw-r--r--mm/nommu.c35
-rw-r--r--mm/oom_kill.c21
-rw-r--r--mm/page_alloc.c23
-rw-r--r--mm/page_cgroup.c4
-rw-r--r--mm/page_io.c12
-rw-r--r--mm/pagewalk.c1
-rw-r--r--mm/percpu-vm.c1
-rw-r--r--mm/process_vm_access.c16
-rw-r--r--mm/shmem.c259
-rw-r--r--mm/slub.c23
-rw-r--r--mm/sparse.c20
-rw-r--r--mm/swap.c86
-rw-r--r--mm/swapfile.c66
-rw-r--r--mm/util.c30
-rw-r--r--mm/vmscan.c462
39 files changed, 1289 insertions, 1230 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index b2176374b98e..82fed4eb2b6f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -389,3 +389,20 @@ config CLEANCACHE
in a negligible performance hit.
If unsure, say Y to enable cleancache
+
+config FRONTSWAP
+ bool "Enable frontswap to cache swap pages if tmem is present"
+ depends on SWAP
+ default n
+ help
+ Frontswap is so named because it can be thought of as the opposite
+ of a "backing" store for a swap device. The data is stored into
+ "transcendent memory", memory that is not directly accessible or
+ addressable by the kernel and is of unknown and possibly
+ time-varying size. When space in transcendent memory is available,
+ a significant swap I/O reduction may be achieved. When none is
+ available, all frontswap calls are reduced to a single pointer-
+ compare-against-NULL resulting in a negligible performance hit
+ and swap data is stored as normal on the matching swap device.
+
+ If unsure, say Y to enable frontswap.
diff --git a/mm/Makefile b/mm/Makefile
index a156285ce88d..2e2fbbefb99f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_FRONTSWAP) += frontswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ec4fcb7a56c8..bcb63ac48cc5 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -698,7 +698,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
return ___alloc_bootmem(size, align, goal, limit);
}
-static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
{
@@ -710,6 +710,10 @@ again:
if (ptr)
return ptr;
+ /* do not panic in alloc_bootmem_bdata() */
+ if (limit && goal + size > limit)
+ limit = 0;
+
ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
if (ptr)
return ptr;
diff --git a/mm/bounce.c b/mm/bounce.c
index d1be02ca1889..042086775561 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -24,23 +24,25 @@
static mempool_t *page_pool, *isa_page_pool;
-#ifdef CONFIG_HIGHMEM
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL)
static __init int init_emergency_pool(void)
{
-#ifndef CONFIG_MEMORY_HOTPLUG
+#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
if (max_pfn <= max_low_pfn)
return 0;
#endif
page_pool = mempool_create_page_pool(POOL_SIZE, 0);
BUG_ON(!page_pool);
- printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
+ printk("bounce pool size: %d pages\n", POOL_SIZE);
return 0;
}
__initcall(init_emergency_pool);
+#endif
+#ifdef CONFIG_HIGHMEM
/*
* highmem version, map in to vec
*/
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 5646c740f613..32e6f4136fa2 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL(__cleancache_init_shared_fs);
static int cleancache_get_key(struct inode *inode,
struct cleancache_filekey *key)
{
- int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
+ int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *);
int len = 0, maxlen = CLEANCACHE_KEY_MAX;
struct super_block *sb = inode->i_sb;
@@ -88,9 +88,7 @@ static int cleancache_get_key(struct inode *inode,
if (sb->s_export_op != NULL) {
fhfn = sb->s_export_op->encode_fh;
if (fhfn) {
- struct dentry d;
- d.d_inode = inode;
- len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
+ len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL);
if (len <= 0 || len == 255)
return -1;
if (maxlen > CLEANCACHE_KEY_MAX)
diff --git a/mm/compaction.c b/mm/compaction.c
index 840ee288e296..2f42d9528539 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -226,7 +226,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
unsigned long last_pageblock_nr = 0, pageblock_nr;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
- isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
+ isolate_mode_t mode = 0;
+ struct lruvec *lruvec;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -235,7 +236,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
*/
while (unlikely(too_many_isolated(zone))) {
/* async migration should just abort */
- if (cc->mode != COMPACT_SYNC)
+ if (!cc->sync)
return 0;
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -303,8 +304,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
* satisfies the allocation
*/
pageblock_nr = low_pfn >> pageblock_order;
- if (cc->mode != COMPACT_SYNC &&
- last_pageblock_nr != pageblock_nr &&
+ if (!cc->sync && last_pageblock_nr != pageblock_nr &&
!migrate_async_suitable(get_pageblock_migratetype(page))) {
low_pfn += pageblock_nr_pages;
low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
@@ -325,17 +325,19 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
continue;
}
- if (cc->mode != COMPACT_SYNC)
+ if (!cc->sync)
mode |= ISOLATE_ASYNC_MIGRATE;
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+
/* Try isolate the page */
- if (__isolate_lru_page(page, mode, 0) != 0)
+ if (__isolate_lru_page(page, mode) != 0)
continue;
VM_BUG_ON(PageTransCompound(page));
/* Successfully isolated */
- del_page_from_lru_list(zone, page, page_lru(page));
+ del_page_from_lru_list(page, lruvec, page_lru(page));
list_add(&page->lru, migratelist);
cc->nr_migratepages++;
nr_isolated++;
@@ -358,90 +360,27 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION
-/*
- * Returns true if MIGRATE_UNMOVABLE pageblock was successfully
- * converted to MIGRATE_MOVABLE type, false otherwise.
- */
-static bool rescue_unmovable_pageblock(struct page *page)
-{
- unsigned long pfn, start_pfn, end_pfn;
- struct page *start_page, *end_page;
-
- pfn = page_to_pfn(page);
- start_pfn = pfn & ~(pageblock_nr_pages - 1);
- end_pfn = start_pfn + pageblock_nr_pages;
-
- start_page = pfn_to_page(start_pfn);
- end_page = pfn_to_page(end_pfn);
-
- /* Do not deal with pageblocks that overlap zones */
- if (page_zone(start_page) != page_zone(end_page))
- return false;
-
- for (page = start_page, pfn = start_pfn; page < end_page; pfn++,
- page++) {
- if (!pfn_valid_within(pfn))
- continue;
-
- if (PageBuddy(page)) {
- int order = page_order(page);
-
- pfn += (1 << order) - 1;
- page += (1 << order) - 1;
-
- continue;
- } else if (page_count(page) == 0 || PageLRU(page))
- continue;
-
- return false;
- }
-
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- move_freepages_block(page_zone(page), page, MIGRATE_MOVABLE);
- return true;
-}
-enum smt_result {
- GOOD_AS_MIGRATION_TARGET,
- FAIL_UNMOVABLE_TARGET,
- FAIL_BAD_TARGET,
-};
-
-/*
- * Returns GOOD_AS_MIGRATION_TARGET if the page is within a block
- * suitable for migration to, FAIL_UNMOVABLE_TARGET if the page
- * is within a MIGRATE_UNMOVABLE block, FAIL_BAD_TARGET otherwise.
- */
-static enum smt_result suitable_migration_target(struct page *page,
- struct compact_control *cc)
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct page *page)
{
int migratetype = get_pageblock_migratetype(page);
/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
- return FAIL_BAD_TARGET;
+ return false;
/* If the page is a large free page, then allow migration */
if (PageBuddy(page) && page_order(page) >= pageblock_order)
- return GOOD_AS_MIGRATION_TARGET;
+ return true;
/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
- if (cc->mode != COMPACT_ASYNC_UNMOVABLE &&
- migrate_async_suitable(migratetype))
- return GOOD_AS_MIGRATION_TARGET;
-
- if (cc->mode == COMPACT_ASYNC_MOVABLE &&
- migratetype == MIGRATE_UNMOVABLE)
- return FAIL_UNMOVABLE_TARGET;
-
- if (cc->mode != COMPACT_ASYNC_MOVABLE &&
- migratetype == MIGRATE_UNMOVABLE &&
- rescue_unmovable_pageblock(page))
- return GOOD_AS_MIGRATION_TARGET;
+ if (migrate_async_suitable(migratetype))
+ return true;
/* Otherwise skip the block */
- return FAIL_BAD_TARGET;
+ return false;
}
/*
@@ -475,13 +414,6 @@ static void isolate_freepages(struct zone *zone,
zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
/*
- * isolate_freepages() may be called more than once during
- * compact_zone_order() run and we want only the most recent
- * count.
- */
- cc->nr_pageblocks_skipped = 0;
-
- /*
* Isolate free pages until enough are available to migrate the
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
@@ -489,7 +421,6 @@ static void isolate_freepages(struct zone *zone,
for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
pfn -= pageblock_nr_pages) {
unsigned long isolated;
- enum smt_result ret;
if (!pfn_valid(pfn))
continue;
@@ -506,12 +437,9 @@ static void isolate_freepages(struct zone *zone,
continue;
/* Check the block is suitable for migration */
- ret = suitable_migration_target(page, cc);
- if (ret != GOOD_AS_MIGRATION_TARGET) {
- if (ret == FAIL_UNMOVABLE_TARGET)
- cc->nr_pageblocks_skipped++;
+ if (!suitable_migration_target(page))
continue;
- }
+
/*
* Found a block suitable for isolating free pages from. Now
* we disabled interrupts, double check things are ok and
@@ -520,14 +448,12 @@ static void isolate_freepages(struct zone *zone,
*/
isolated = 0;
spin_lock_irqsave(&zone->lock, flags);
- ret = suitable_migration_target(page, cc);
- if (ret == GOOD_AS_MIGRATION_TARGET) {
+ if (suitable_migration_target(page)) {
end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
isolated = isolate_freepages_block(pfn, end_pfn,
freelist, false);
nr_freepages += isolated;
- } else if (ret == FAIL_UNMOVABLE_TARGET)
- cc->nr_pageblocks_skipped++;
+ }
spin_unlock_irqrestore(&zone->lock, flags);
/*
@@ -759,9 +685,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
nr_migrate = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
- (unsigned long)&cc->freepages, false,
- (cc->mode == COMPACT_SYNC) ? MIGRATE_SYNC_LIGHT
- : MIGRATE_ASYNC);
+ (unsigned long)cc, false,
+ cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;
@@ -776,8 +701,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
if (err) {
putback_lru_pages(&cc->migratepages);
cc->nr_migratepages = 0;
+ if (err == -ENOMEM) {
+ ret = COMPACT_PARTIAL;
+ goto out;
+ }
}
-
}
out:
@@ -790,8 +718,7 @@ out:
static unsigned long compact_zone_order(struct zone *zone,
int order, gfp_t gfp_mask,
- enum compact_mode mode,
- unsigned long *nr_pageblocks_skipped)
+ bool sync)
{
struct compact_control cc = {
.nr_freepages = 0,
@@ -799,17 +726,12 @@ static unsigned long compact_zone_order(struct zone *zone,
.order = order,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
- .mode = mode,
+ .sync = sync,
};
- unsigned long rc;
-
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
- rc = compact_zone(zone, &cc);
- *nr_pageblocks_skipped = cc.nr_pageblocks_skipped;
-
- return rc;
+ return compact_zone(zone, &cc);
}
int sysctl_extfrag_threshold = 500;
@@ -834,8 +756,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
struct zoneref *z;
struct zone *zone;
int rc = COMPACT_SKIPPED;
- unsigned long nr_pageblocks_skipped;
- enum compact_mode mode;
/*
* Check whether it is worth even starting compaction. The order check is
@@ -852,22 +772,12 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
nodemask) {
int status;
- mode = sync ? COMPACT_SYNC : COMPACT_ASYNC_MOVABLE;
-retry:
- status = compact_zone_order(zone, order, gfp_mask, mode,
- &nr_pageblocks_skipped);
+ status = compact_zone_order(zone, order, gfp_mask, sync);
rc = max(status, rc);
/* If a normal allocation would succeed, stop compacting */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
break;
-
- if (rc == COMPACT_COMPLETE && mode == COMPACT_ASYNC_MOVABLE) {
- if (nr_pageblocks_skipped) {
- mode = COMPACT_ASYNC_UNMOVABLE;
- goto retry;
- }
- }
}
return rc;
@@ -901,7 +811,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
if (ok && cc->order > zone->compact_order_failed)
zone->compact_order_failed = cc->order + 1;
/* Currently async compaction is never deferred. */
- else if (!ok && cc->mode == COMPACT_SYNC)
+ else if (!ok && cc->sync)
defer_compaction(zone, cc->order);
}
@@ -916,7 +826,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
{
struct compact_control cc = {
.order = order,
- .mode = COMPACT_ASYNC_MOVABLE,
+ .sync = false,
};
return __compact_pgdat(pgdat, &cc);
@@ -926,7 +836,7 @@ static int compact_node(int nid)
{
struct compact_control cc = {
.order = -1,
- .mode = COMPACT_SYNC,
+ .sync = true,
};
return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/filemap.c b/mm/filemap.c
index 64b48f934b89..a4a5260b0279 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1899,71 +1899,6 @@ struct page *read_cache_page(struct address_space *mapping,
}
EXPORT_SYMBOL(read_cache_page);
-/*
- * The logic we want is
- *
- * if suid or (sgid and xgrp)
- * remove privs
- */
-int should_remove_suid(struct dentry *dentry)
-{
- umode_t mode = dentry->d_inode->i_mode;
- int kill = 0;
-
- /* suid always must be killed */
- if (unlikely(mode & S_ISUID))
- kill = ATTR_KILL_SUID;
-
- /*
- * sgid without any exec bits is just a mandatory locking mark; leave
- * it alone. If some exec bits are set, it's a real sgid; kill it.
- */
- if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
- kill |= ATTR_KILL_SGID;
-
- if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
- return kill;
-
- return 0;
-}
-EXPORT_SYMBOL(should_remove_suid);
-
-static int __remove_suid(struct dentry *dentry, int kill)
-{
- struct iattr newattrs;
-
- newattrs.ia_valid = ATTR_FORCE | kill;
- return notify_change(dentry, &newattrs);
-}
-
-int file_remove_suid(struct file *file)
-{
- struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- int killsuid;
- int killpriv;
- int error = 0;
-
- /* Fast path for nothing security related */
- if (IS_NOSEC(inode))
- return 0;
-
- killsuid = should_remove_suid(dentry);
- killpriv = security_inode_need_killpriv(dentry);
-
- if (killpriv < 0)
- return killpriv;
- if (killpriv)
- error = security_inode_killpriv(dentry);
- if (!error && killsuid)
- error = __remove_suid(dentry, killsuid);
- if (!error && (inode->i_sb->s_flags & MS_NOSEC))
- inode->i_flags |= S_NOSEC;
-
- return error;
-}
-EXPORT_SYMBOL(file_remove_suid);
-
static size_t __iovec_copy_from_user_inatomic(char *vaddr,
const struct iovec *iov, size_t base, size_t bytes)
{
@@ -2489,7 +2424,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err)
goto out;
- file_update_time(file);
+ err = file_update_time(file);
+ if (err)
+ goto out;
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (unlikely(file->f_flags & O_DIRECT)) {
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index a4eb31132229..213ca1f53409 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -426,7 +426,9 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
if (ret)
goto out_backing;
- file_update_time(filp);
+ ret = file_update_time(filp);
+ if (ret)
+ goto out_backing;
ret = __xip_file_write (filp, buf, count, pos, ppos);
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 000000000000..6b3e71a2cd48
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,344 @@
+/*
+ * Frontswap frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of frontswap. See
+ * Documentation/vm/frontswap.txt for more information.
+ *
+ * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/security.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+
+/*
+ * frontswap_ops is set by frontswap_register_ops to contain the pointers
+ * to the frontswap "backend" implementation functions.
+ */
+static struct frontswap_ops frontswap_ops __read_mostly;
+
+/*
+ * This global enablement flag reduces overhead on systems where frontswap_ops
+ * has not been registered, so is preferred to the slower alternative: a
+ * function call that checks a non-global.
+ */
+bool frontswap_enabled __read_mostly;
+EXPORT_SYMBOL(frontswap_enabled);
+
+/*
+ * If enabled, frontswap_store will return failure even on success. As
+ * a result, the swap subsystem will always write the page to swap, in
+ * effect converting frontswap into a writethrough cache. In this mode,
+ * there is no direct reduction in swap writes, but a frontswap backend
+ * can unilaterally "reclaim" any pages in use with no data loss, thus
+ * providing increases control over maximum memory usage due to frontswap.
+ */
+static bool frontswap_writethrough_enabled __read_mostly;
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Counters available via /sys/kernel/debug/frontswap (if debugfs is
+ * properly configured). These are for information only so are not protected
+ * against increment races.
+ */
+static u64 frontswap_loads;
+static u64 frontswap_succ_stores;
+static u64 frontswap_failed_stores;
+static u64 frontswap_invalidates;
+
+static inline void inc_frontswap_loads(void) {
+ frontswap_loads++;
+}
+static inline void inc_frontswap_succ_stores(void) {
+ frontswap_succ_stores++;
+}
+static inline void inc_frontswap_failed_stores(void) {
+ frontswap_failed_stores++;
+}
+static inline void inc_frontswap_invalidates(void) {
+ frontswap_invalidates++;
+}
+#else
+static inline void inc_frontswap_loads(void) { }
+static inline void inc_frontswap_succ_stores(void) { }
+static inline void inc_frontswap_failed_stores(void) { }
+static inline void inc_frontswap_invalidates(void) { }
+#endif
+/*
+ * Register operations for frontswap, returning previous thus allowing
+ * detection of multiple backends and possible nesting.
+ */
+struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
+{
+ struct frontswap_ops old = frontswap_ops;
+
+ frontswap_ops = *ops;
+ frontswap_enabled = true;
+ return old;
+}
+EXPORT_SYMBOL(frontswap_register_ops);
+
+/*
+ * Enable/disable frontswap writethrough (see above).
+ */
+void frontswap_writethrough(bool enable)
+{
+ frontswap_writethrough_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_writethrough);
+
+/*
+ * Called when a swap device is swapon'd.
+ */
+void __frontswap_init(unsigned type)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+ frontswap_ops.init(type);
+}
+EXPORT_SYMBOL(__frontswap_init);
+
+static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
+{
+ frontswap_clear(sis, offset);
+ atomic_dec(&sis->frontswap_pages);
+}
+
+/*
+ * "Store" data from a page to frontswap and associate it with the page's
+ * swaptype and offset. Page must be locked and in the swap cache.
+ * If frontswap already contains a page with matching swaptype and
+ * offset, the frontswap implementation may either overwrite the data and
+ * return success or invalidate the page from frontswap and return failure.
+ */
+int __frontswap_store(struct page *page)
+{
+ int ret = -1, dup = 0;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset))
+ dup = 1;
+ ret = frontswap_ops.store(type, offset, page);
+ if (ret == 0) {
+ frontswap_set(sis, offset);
+ inc_frontswap_succ_stores();
+ if (!dup)
+ atomic_inc(&sis->frontswap_pages);
+ } else {
+ /*
+ failed dup always results in automatic invalidate of
+ the (older) page from frontswap
+ */
+ inc_frontswap_failed_stores();
+ if (dup)
+ __frontswap_clear(sis, offset);
+ }
+ if (frontswap_writethrough_enabled)
+ /* report failure so swap also writes to swap device */
+ ret = -1;
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_store);
+
+/*
+ * "Get" data from frontswap associated with swaptype and offset that were
+ * specified when the data was put to frontswap and use it to fill the
+ * specified page with data. Page must be locked and in the swap cache.
+ */
+int __frontswap_load(struct page *page)
+{
+ int ret = -1;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset))
+ ret = frontswap_ops.load(type, offset, page);
+ if (ret == 0)
+ inc_frontswap_loads();
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_load);
+
+/*
+ * Invalidate any data from frontswap associated with the specified swaptype
+ * and offset so that a subsequent "get" will fail.
+ */
+void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset)) {
+ frontswap_ops.invalidate_page(type, offset);
+ __frontswap_clear(sis, offset);
+ inc_frontswap_invalidates();
+ }
+}
+EXPORT_SYMBOL(__frontswap_invalidate_page);
+
+/*
+ * Invalidate all data from frontswap associated with all offsets for the
+ * specified swaptype.
+ */
+void __frontswap_invalidate_area(unsigned type)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+ frontswap_ops.invalidate_area(type);
+ atomic_set(&sis->frontswap_pages, 0);
+ memset(sis->frontswap_map, 0, sis->max / sizeof(long));
+}
+EXPORT_SYMBOL(__frontswap_invalidate_area);
+
+static unsigned long __frontswap_curr_pages(void)
+{
+ int type;
+ unsigned long totalpages = 0;
+ struct swap_info_struct *si = NULL;
+
+ assert_spin_locked(&swap_lock);
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ totalpages += atomic_read(&si->frontswap_pages);
+ }
+ return totalpages;
+}
+
+static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
+ int *swapid)
+{
+ int ret = -EINVAL;
+ struct swap_info_struct *si = NULL;
+ int si_frontswap_pages;
+ unsigned long total_pages_to_unuse = total;
+ unsigned long pages = 0, pages_to_unuse = 0;
+ int type;
+
+ assert_spin_locked(&swap_lock);
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ si_frontswap_pages = atomic_read(&si->frontswap_pages);
+ if (total_pages_to_unuse < si_frontswap_pages) {
+ pages = pages_to_unuse = total_pages_to_unuse;
+ } else {
+ pages = si_frontswap_pages;
+ pages_to_unuse = 0; /* unuse all */
+ }
+ /* ensure there is enough RAM to fetch pages from frontswap */
+ if (security_vm_enough_memory_mm(current->mm, pages)) {
+ ret = -ENOMEM;
+ continue;
+ }
+ vm_unacct_memory(pages);
+ *unused = pages_to_unuse;
+ *swapid = type;
+ ret = 0;
+ break;
+ }
+
+ return ret;
+}
+
+static int __frontswap_shrink(unsigned long target_pages,
+ unsigned long *pages_to_unuse,
+ int *type)
+{
+ unsigned long total_pages = 0, total_pages_to_unuse;
+
+ assert_spin_locked(&swap_lock);
+
+ total_pages = __frontswap_curr_pages();
+ if (total_pages <= target_pages) {
+ /* Nothing to do */
+ *pages_to_unuse = 0;
+ return 0;
+ }
+ total_pages_to_unuse = total_pages - target_pages;
+ return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
+}
+
+/*
+ * Frontswap, like a true swap device, may unnecessarily retain pages
+ * under certain circumstances; "shrink" frontswap is essentially a
+ * "partial swapoff" and works by calling try_to_unuse to attempt to
+ * unuse enough frontswap pages to attempt to -- subject to memory
+ * constraints -- reduce the number of pages in frontswap to the
+ * number given in the parameter target_pages.
+ */
+void frontswap_shrink(unsigned long target_pages)
+{
+ unsigned long pages_to_unuse = 0;
+ int type, ret;
+
+ /*
+ * we don't want to hold swap_lock while doing a very
+ * lengthy try_to_unuse, but swap_list may change
+ * so restart scan from swap_list.head each time
+ */
+ spin_lock(&swap_lock);
+ ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
+ spin_unlock(&swap_lock);
+ if (ret == 0 && pages_to_unuse)
+ try_to_unuse(type, true, pages_to_unuse);
+ return;
+}
+EXPORT_SYMBOL(frontswap_shrink);
+
+/*
+ * Count and return the number of frontswap pages across all
+ * swap devices. This is exported so that backend drivers can
+ * determine current usage without reading debugfs.
+ */
+unsigned long frontswap_curr_pages(void)
+{
+ unsigned long totalpages = 0;
+
+ spin_lock(&swap_lock);
+ totalpages = __frontswap_curr_pages();
+ spin_unlock(&swap_lock);
+
+ return totalpages;
+}
+EXPORT_SYMBOL(frontswap_curr_pages);
+
+static int __init init_frontswap(void)
+{
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *root = debugfs_create_dir("frontswap", NULL);
+ if (root == NULL)
+ return -ENXIO;
+ debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
+ debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
+ debugfs_create_u64("failed_stores", S_IRUGO, root,
+ &frontswap_failed_stores);
+ debugfs_create_u64("invalidates", S_IRUGO,
+ root, &frontswap_invalidates);
+#endif
+ return 0;
+}
+
+module_init(init_frontswap);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d0def42c121b..57c4b9309015 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1231,10 +1231,13 @@ static void __split_huge_page_refcount(struct page *page)
{
int i;
struct zone *zone = page_zone(page);
+ struct lruvec *lruvec;
int tail_count = 0;
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irq(&zone->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+
compound_lock(page);
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(page);
@@ -1309,13 +1312,12 @@ static void __split_huge_page_refcount(struct page *page)
BUG_ON(!PageDirty(page_tail));
BUG_ON(!PageSwapBacked(page_tail));
-
- lru_add_page_tail(zone, page, page_tail);
+ lru_add_page_tail(page, page_tail, lruvec);
}
atomic_sub(tail_count, &page->_count);
BUG_ON(atomic_read(&page->_count) <= 0);
- __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
ClearPageCompound(page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 285a81e87ec8..e198831276a3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3036,7 +3036,8 @@ int hugetlb_reserve_pages(struct inode *inode,
region_add(&inode->i_mapping->private_list, from, to);
return 0;
out_err:
- resv_map_put(vma);
+ if (vma)
+ resv_map_put(vma);
return ret;
}
diff --git a/mm/internal.h b/mm/internal.h
index 4194ab9dc19b..2ba87fbfb75b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -94,9 +94,6 @@ extern void putback_lru_page(struct page *page);
/*
* in mm/page_alloc.c
*/
-extern void set_pageblock_migratetype(struct page *page, int migratetype);
-extern int move_freepages_block(struct zone *zone, struct page *page,
- int migratetype);
extern void __free_pages_bootmem(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned long order);
#ifdef CONFIG_MEMORY_FAILURE
@@ -104,7 +101,6 @@ extern bool is_free_buddy_page(struct page *page);
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
-#include <linux/compaction.h>
/*
* in mm/compaction.c
@@ -123,14 +119,11 @@ struct compact_control {
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
- enum compact_mode mode; /* Compaction mode */
+ bool sync; /* Synchronous migration */
int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
-
- /* Number of UNMOVABLE destination pageblocks skipped during scan */
- unsigned long nr_pageblocks_skipped;
};
unsigned long
@@ -350,3 +343,7 @@ extern u64 hwpoison_filter_flags_mask;
extern u64 hwpoison_filter_flags_value;
extern u64 hwpoison_filter_memcg;
extern u32 hwpoison_filter_enable;
+
+extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
+ unsigned long, unsigned long,
+ unsigned long, unsigned long);
diff --git a/mm/madvise.c b/mm/madvise.c
index deff1b64a08c..14d260fa0d17 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -15,6 +15,7 @@
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
+#include <linux/file.h>
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
@@ -204,14 +205,16 @@ static long madvise_remove(struct vm_area_struct *vma,
{
loff_t offset;
int error;
+ struct file *f;
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
return -EINVAL;
- if (!vma->vm_file || !vma->vm_file->f_mapping
- || !vma->vm_file->f_mapping->host) {
+ f = vma->vm_file;
+
+ if (!f || !f->f_mapping || !f->f_mapping->host) {
return -EINVAL;
}
@@ -221,11 +224,18 @@ static long madvise_remove(struct vm_area_struct *vma,
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- /* filesystem's fallocate may need to take i_mutex */
+ /*
+ * Filesystem's fallocate may need to take i_mutex. We need to
+ * explicitly grab a reference because the vma (and hence the
+ * vma's reference to the file) can go away as soon as we drop
+ * mmap_sem.
+ */
+ get_file(f);
up_read(&current->mm->mmap_sem);
- error = do_fallocate(vma->vm_file,
+ error = do_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
+ fput(f);
down_read(&current->mm->mmap_sem);
return error;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 952123eba433..5cc6731b00cc 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -143,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
MAX_NUMNODES);
}
-/*
- * Free memblock.reserved.regions
- */
-int __init_memblock memblock_free_reserved_regions(void)
-{
- if (memblock.reserved.regions == memblock_reserved_init_regions)
- return 0;
-
- return memblock_free(__pa(memblock.reserved.regions),
- sizeof(struct memblock_region) * memblock.reserved.max);
-}
-
-/*
- * Reserve memblock.reserved.regions
- */
-int __init_memblock memblock_reserve_reserved_regions(void)
-{
- if (memblock.reserved.regions == memblock_reserved_init_regions)
- return 0;
-
- return memblock_reserve(__pa(memblock.reserved.regions),
- sizeof(struct memblock_region) * memblock.reserved.max);
-}
-
static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
{
type->total_size -= type->regions[r].size;
@@ -184,9 +160,39 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
}
}
-static int __init_memblock memblock_double_array(struct memblock_type *type)
+phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
+ phys_addr_t *addr)
+{
+ if (memblock.reserved.regions == memblock_reserved_init_regions)
+ return 0;
+
+ *addr = __pa(memblock.reserved.regions);
+
+ return PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.reserved.max);
+}
+
+/**
+ * memblock_double_array - double the size of the memblock regions array
+ * @type: memblock type of the regions array being doubled
+ * @new_area_start: starting address of memory range to avoid overlap with
+ * @new_area_size: size of memory range to avoid overlap with
+ *
+ * Double the size of the @type regions array. If memblock is being used to
+ * allocate memory for a new reserved regions array and there is a previously
+ * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
+ * waiting to be reserved, ensure the memory used by the new array does
+ * not overlap.
+ *
+ * RETURNS:
+ * 0 on success, -1 on failure.
+ */
+static int __init_memblock memblock_double_array(struct memblock_type *type,
+ phys_addr_t new_area_start,
+ phys_addr_t new_area_size)
{
struct memblock_region *new_array, *old_array;
+ phys_addr_t old_alloc_size, new_alloc_size;
phys_addr_t old_size, new_size, addr;
int use_slab = slab_is_available();
int *in_slab;
@@ -200,6 +206,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
/* Calculate new doubled size */
old_size = type->max * sizeof(struct memblock_region);
new_size = old_size << 1;
+ /*
+ * We need to allocated new one align to PAGE_SIZE,
+ * so we can free them completely later.
+ */
+ old_alloc_size = PAGE_ALIGN(old_size);
+ new_alloc_size = PAGE_ALIGN(new_size);
/* Retrieve the slab flag */
if (type == &memblock.memory)
@@ -222,7 +234,18 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
new_array = kmalloc(new_size, GFP_KERNEL);
addr = new_array ? __pa(new_array) : 0;
} else {
- addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t));
+ /* only exclude range when trying to double reserved.regions */
+ if (type != &memblock.reserved)
+ new_area_start = new_area_size = 0;
+
+ addr = memblock_find_in_range(new_area_start + new_area_size,
+ memblock.current_limit,
+ new_alloc_size, PAGE_SIZE);
+ if (!addr && new_area_size)
+ addr = memblock_find_in_range(0,
+ min(new_area_start, memblock.current_limit),
+ new_alloc_size, PAGE_SIZE);
+
new_array = addr ? __va(addr) : 0;
}
if (!addr) {
@@ -251,13 +274,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
kfree(old_array);
else if (old_array != memblock_memory_init_regions &&
old_array != memblock_reserved_init_regions)
- memblock_free(__pa(old_array), old_size);
+ memblock_free(__pa(old_array), old_alloc_size);
/* Reserve the new array if that comes from the memblock.
* Otherwise, we needn't do it
*/
if (!use_slab)
- BUG_ON(memblock_reserve(addr, new_size));
+ BUG_ON(memblock_reserve(addr, new_alloc_size));
/* Update slab flag */
*in_slab = use_slab;
@@ -399,7 +422,7 @@ repeat:
*/
if (!insert) {
while (type->cnt + nr_new > type->max)
- if (memblock_double_array(type) < 0)
+ if (memblock_double_array(type, obase, size) < 0)
return -ENOMEM;
insert = true;
goto repeat;
@@ -450,7 +473,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
/* we'll create at most two more regions */
while (type->cnt + 2 > type->max)
- if (memblock_double_array(type) < 0)
+ if (memblock_double_array(type, base, size) < 0)
return -ENOMEM;
for (i = 0; i < type->cnt; i++) {
@@ -540,9 +563,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
* __next_free_mem_range - next function for for_each_free_mem_range()
* @idx: pointer to u64 loop variable
* @nid: nid: node selector, %MAX_NUMNODES for all nodes
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
*
* Find the first free area from *@idx which matches @nid, fill the out
* parameters, and update *@idx for the next iteration. The lower 32bit of
@@ -616,9 +639,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
* __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
* @idx: pointer to u64 loop variable
* @nid: nid: node selector, %MAX_NUMNODES for all nodes
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
*
* Reverse of __next_free_mem_range().
*/
@@ -867,6 +890,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
return memblock_search(&memblock.memory, addr) != -1;
}
+/**
+ * memblock_is_region_memory - check if a region is a subset of memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base+@size) is a subset of a memory block.
+ *
+ * RETURNS:
+ * 0 if false, non-zero if true
+ */
int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
{
int idx = memblock_search(&memblock.memory, base);
@@ -879,6 +912,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
memblock.memory.regions[idx].size) >= end;
}
+/**
+ * memblock_is_region_reserved - check if a region intersects reserved memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base+@size) intersects a reserved memory block.
+ *
+ * RETURNS:
+ * 0 if false, non-zero if true
+ */
int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
{
memblock_cap_size(base, &size);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 00c8898dbb81..f72b5e52451a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,7 +59,7 @@
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
-struct mem_cgroup *root_mem_cgroup __read_mostly;
+static struct mem_cgroup *root_mem_cgroup __read_mostly;
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -73,7 +73,7 @@ static int really_do_swap_account __initdata = 0;
#endif
#else
-#define do_swap_account (0)
+#define do_swap_account 0
#endif
@@ -88,18 +88,31 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
- MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
MEM_CGROUP_STAT_NSTATS,
};
+static const char * const mem_cgroup_stat_names[] = {
+ "cache",
+ "rss",
+ "mapped_file",
+ "swap",
+};
+
enum mem_cgroup_events_index {
MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
- MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
MEM_CGROUP_EVENTS_NSTATS,
};
+
+static const char * const mem_cgroup_events_names[] = {
+ "pgpgin",
+ "pgpgout",
+ "pgfault",
+ "pgmajfault",
+};
+
/*
* Per memcg event counter is incremented at every pagein/pageout. With THP,
* it will be incremated by the number of pages. This counter is used for
@@ -112,13 +125,14 @@ enum mem_cgroup_events_target {
MEM_CGROUP_TARGET_NUMAINFO,
MEM_CGROUP_NTARGETS,
};
-#define THRESHOLDS_EVENTS_TARGET (128)
-#define SOFTLIMIT_EVENTS_TARGET (1024)
-#define NUMAINFO_EVENTS_TARGET (1024)
+#define THRESHOLDS_EVENTS_TARGET 128
+#define SOFTLIMIT_EVENTS_TARGET 1024
+#define NUMAINFO_EVENTS_TARGET 1024
struct mem_cgroup_stat_cpu {
long count[MEM_CGROUP_STAT_NSTATS];
unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
+ unsigned long nr_page_events;
unsigned long targets[MEM_CGROUP_NTARGETS];
};
@@ -181,7 +195,7 @@ struct mem_cgroup_threshold {
/* For threshold */
struct mem_cgroup_threshold_ary {
- /* An array index points to threshold just below usage. */
+ /* An array index points to threshold just below or equal to usage. */
int current_threshold;
/* Size of entries[] */
unsigned int size;
@@ -244,8 +258,8 @@ struct mem_cgroup {
*/
struct rcu_head rcu_freeing;
/*
- * But when using vfree(), that cannot be done at
- * interrupt time, so we must then queue the work.
+ * We also need some space for a worker in deferred freeing.
+ * By the time we call it, rcu_freeing is no longer in use.
*/
struct work_struct work_freeing;
};
@@ -304,7 +318,7 @@ struct mem_cgroup {
/*
* percpu counter.
*/
- struct mem_cgroup_stat_cpu *stat;
+ struct mem_cgroup_stat_cpu __percpu *stat;
/*
* used when a cpu is offlined or other synchronizations
* See mem_cgroup_read_stat().
@@ -359,8 +373,8 @@ static bool move_file(void)
* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
*/
-#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
-#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
+#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -376,8 +390,8 @@ enum charge_type {
#define _MEM (0)
#define _MEMSWAP (1)
#define _OOM_TYPE (2)
-#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
-#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
+#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
+#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
/* Used for OOM nofiier */
#define OOM_CONTROL (0)
@@ -403,6 +417,7 @@ void sock_update_memcg(struct sock *sk)
{
if (mem_cgroup_sockets_enabled) {
struct mem_cgroup *memcg;
+ struct cg_proto *cg_proto;
BUG_ON(!sk->sk_prot->proto_cgroup);
@@ -422,9 +437,10 @@ void sock_update_memcg(struct sock *sk)
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
- if (!mem_cgroup_is_root(memcg)) {
+ cg_proto = sk->sk_prot->proto_cgroup(memcg);
+ if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
mem_cgroup_get(memcg);
- sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg);
+ sk->sk_cgrp = cg_proto;
}
rcu_read_unlock();
}
@@ -453,6 +469,19 @@ EXPORT_SYMBOL(tcp_proto_cgroup);
#endif /* CONFIG_INET */
#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
+#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
+static void disarm_sock_keys(struct mem_cgroup *memcg)
+{
+ if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
+ return;
+ static_key_slow_dec(&memcg_socket_limit_enabled);
+}
+#else
+static void disarm_sock_keys(struct mem_cgroup *memcg)
+{
+}
+#endif
+
static void drain_all_stock_async(struct mem_cgroup *memcg);
static struct mem_cgroup_per_zone *
@@ -717,12 +746,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
nr_pages = -nr_pages; /* for event */
}
- __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
+ __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
preempt_enable();
}
unsigned long
+mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+{
+ struct mem_cgroup_per_zone *mz;
+
+ mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ return mz->lru_size[lru];
+}
+
+static unsigned long
mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
unsigned int lru_mask)
{
@@ -769,7 +807,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
{
unsigned long val, next;
- val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
+ val = __this_cpu_read(memcg->stat->nr_page_events);
next = __this_cpu_read(memcg->stat->targets[target]);
/* from time_after() in jiffies.h */
if ((long)next - (long)val < 0) {
@@ -1012,7 +1050,7 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event);
/**
* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
* @zone: zone of the wanted lruvec
- * @mem: memcg of the wanted lruvec
+ * @memcg: memcg of the wanted lruvec
*
* Returns the lru list vector holding pages for the given @zone and
* @mem. This can be the global zone lruvec, if the memory controller
@@ -1045,19 +1083,11 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
*/
/**
- * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec
- * @zone: zone of the page
+ * mem_cgroup_page_lruvec - return lruvec for adding an lru page
* @page: the page
- * @lru: current lru
- *
- * This function accounts for @page being added to @lru, and returns
- * the lruvec for the given @zone and the memcg @page is charged to.
- *
- * The callsite is then responsible for physically linking the page to
- * the returned lruvec->lists[@lru].
+ * @zone: zone of the page
*/
-struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
- enum lru_list lru)
+struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
struct mem_cgroup_per_zone *mz;
struct mem_cgroup *memcg;
@@ -1070,7 +1100,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
memcg = pc->mem_cgroup;
/*
- * Surreptitiously switch any uncharged page to root:
+ * Surreptitiously switch any uncharged offlist page to root:
* an uncharged page off lru does nothing to secure
* its former mem_cgroup from sudden removal.
*
@@ -1078,70 +1108,35 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
* under page_cgroup lock: between them, they make all uses
* of pc->mem_cgroup safe.
*/
- if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
+ if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
pc->mem_cgroup = memcg = root_mem_cgroup;
mz = page_cgroup_zoneinfo(memcg, page);
- /* compound_order() is stabilized through lru_lock */
- mz->lru_size[lru] += 1 << compound_order(page);
return &mz->lruvec;
}
/**
- * mem_cgroup_lru_del_list - account for removing an lru page
- * @page: the page
- * @lru: target lru
+ * mem_cgroup_update_lru_size - account for adding or removing an lru page
+ * @lruvec: mem_cgroup per zone lru vector
+ * @lru: index of lru list the page is sitting on
+ * @nr_pages: positive when adding or negative when removing
*
- * This function accounts for @page being removed from @lru.
- *
- * The callsite is then responsible for physically unlinking
- * @page->lru.
+ * This function must be called when a page is added to or removed from an
+ * lru list.
*/
-void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
+void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
+ int nr_pages)
{
struct mem_cgroup_per_zone *mz;
- struct mem_cgroup *memcg;
- struct page_cgroup *pc;
+ unsigned long *lru_size;
if (mem_cgroup_disabled())
return;
- pc = lookup_page_cgroup(page);
- memcg = pc->mem_cgroup;
- VM_BUG_ON(!memcg);
- mz = page_cgroup_zoneinfo(memcg, page);
- /* huge page split is done under lru_lock. so, we have no races. */
- VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page)));
- mz->lru_size[lru] -= 1 << compound_order(page);
-}
-
-void mem_cgroup_lru_del(struct page *page)
-{
- mem_cgroup_lru_del_list(page, page_lru(page));
-}
-
-/**
- * mem_cgroup_lru_move_lists - account for moving a page between lrus
- * @zone: zone of the page
- * @page: the page
- * @from: current lru
- * @to: target lru
- *
- * This function accounts for @page being moved between the lrus @from
- * and @to, and returns the lruvec for the given @zone and the memcg
- * @page is charged to.
- *
- * The callsite is then responsible for physically relinking
- * @page->lru to the returned lruvec->lists[@to].
- */
-struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
- struct page *page,
- enum lru_list from,
- enum lru_list to)
-{
- /* XXX: Optimize this, especially for @from == @to */
- mem_cgroup_lru_del_list(page, from);
- return mem_cgroup_lru_add_list(zone, page, to);
+ mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ lru_size = mz->lru_size + lru;
+ *lru_size += nr_pages;
+ VM_BUG_ON((long)(*lru_size) < 0);
}
/*
@@ -1153,7 +1148,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
{
if (root_memcg == memcg)
return true;
- if (!root_memcg->use_hierarchy)
+ if (!root_memcg->use_hierarchy || !memcg)
return false;
return css_is_ancestor(&memcg->css, &root_memcg->css);
}
@@ -1204,19 +1199,15 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
return ret;
}
-int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
+int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
{
unsigned long inactive_ratio;
- int nid = zone_to_nid(zone);
- int zid = zone_idx(zone);
unsigned long inactive;
unsigned long active;
unsigned long gb;
- inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
- BIT(LRU_INACTIVE_ANON));
- active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
- BIT(LRU_ACTIVE_ANON));
+ inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
+ active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
@@ -1227,45 +1218,23 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
return inactive * inactive_ratio < active;
}
-int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
+int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
{
unsigned long active;
unsigned long inactive;
- int zid = zone_idx(zone);
- int nid = zone_to_nid(zone);
- inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
- BIT(LRU_INACTIVE_FILE));
- active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
- BIT(LRU_ACTIVE_FILE));
+ inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
+ active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
return (active > inactive);
}
-struct zone_reclaim_stat *
-mem_cgroup_get_reclaim_stat_from_page(struct page *page)
-{
- struct page_cgroup *pc;
- struct mem_cgroup_per_zone *mz;
-
- if (mem_cgroup_disabled())
- return NULL;
-
- pc = lookup_page_cgroup(page);
- if (!PageCgroupUsed(pc))
- return NULL;
- /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
- smp_rmb();
- mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
- return &mz->lruvec.reclaim_stat;
-}
-
#define mem_cgroup_from_res_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
- * @mem: the memory cgroup
+ * @memcg: the memory cgroup
*
* Returns the maximum amount of memory @mem can be charged with, in
* pages.
@@ -1539,7 +1508,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
/**
* test_mem_cgroup_node_reclaimable
- * @mem: the target memcg
+ * @memcg: the target memcg
* @nid: the node ID to be checked.
* @noswap : specify true here if the user wants flle only information.
*
@@ -1633,7 +1602,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
* unused nodes. But scan_nodes is lazily updated and may not cotain
* enough new information. We need to do double check.
*/
-bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
int nid;
@@ -1668,7 +1637,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
return 0;
}
-bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
}
@@ -1842,7 +1811,8 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
/*
* try to call OOM killer. returns false if we should exit memory-reclaim loop.
*/
-bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
+ int order)
{
struct oom_wait_info owait;
bool locked, need_to_kill;
@@ -1991,7 +1961,7 @@ struct memcg_stock_pcp {
unsigned int nr_pages;
struct work_struct work;
unsigned long flags;
-#define FLUSHING_CACHED_CHARGE (0)
+#define FLUSHING_CACHED_CHARGE 0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
@@ -2138,7 +2108,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
int i;
spin_lock(&memcg->pcp_counter_lock);
- for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
+ for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
long x = per_cpu(memcg->stat->count[i], cpu);
per_cpu(memcg->stat->count[i], cpu) = 0;
@@ -2426,6 +2396,24 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
}
/*
+ * Cancel chrages in this cgroup....doesn't propagate to parent cgroup.
+ * This is useful when moving usage to parent cgroup.
+ */
+static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{
+ unsigned long bytes = nr_pages * PAGE_SIZE;
+
+ if (mem_cgroup_is_root(memcg))
+ return;
+
+ res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
+ if (do_swap_account)
+ res_counter_uncharge_until(&memcg->memsw,
+ memcg->memsw.parent, bytes);
+}
+
+/*
* A helper function to get mem_cgroup from ID. must be called under
* rcu_read_lock(). The caller must check css_is_removed() or some if
* it's concern. (dropping refcnt from swap can be called against removed
@@ -2480,6 +2468,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
{
struct page_cgroup *pc = lookup_page_cgroup(page);
struct zone *uninitialized_var(zone);
+ struct lruvec *lruvec;
bool was_on_lru = false;
bool anon;
@@ -2502,8 +2491,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
if (PageLRU(page)) {
+ lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
ClearPageLRU(page);
- del_page_from_lru_list(zone, page, page_lru(page));
+ del_page_from_lru_list(page, lruvec, page_lru(page));
was_on_lru = true;
}
}
@@ -2521,9 +2511,10 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
if (lrucare) {
if (was_on_lru) {
+ lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
- add_page_to_lru_list(zone, page, page_lru(page));
+ add_page_to_lru_list(page, lruvec, page_lru(page));
}
spin_unlock_irq(&zone->lru_lock);
}
@@ -2546,7 +2537,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION))
+#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
/*
* Because tail pages are not marked as "used", set it. We're under
* zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2577,23 +2568,19 @@ void mem_cgroup_split_huge_fixup(struct page *head)
* @pc: page_cgroup of the page.
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
- * @uncharge: whether we should call uncharge and css_put against @from.
*
* The caller must confirm following.
* - page is not on LRU (isolate_page() is useful.)
* - compound_lock is held when nr_pages > 1
*
- * This function doesn't do "charge" nor css_get to new cgroup. It should be
- * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is
- * true, this function does "uncharge" from old cgroup, but it doesn't if
- * @uncharge is false, so a caller should do "uncharge".
+ * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
+ * from old cgroup.
*/
static int mem_cgroup_move_account(struct page *page,
unsigned int nr_pages,
struct page_cgroup *pc,
struct mem_cgroup *from,
- struct mem_cgroup *to,
- bool uncharge)
+ struct mem_cgroup *to)
{
unsigned long flags;
int ret;
@@ -2627,9 +2614,6 @@ static int mem_cgroup_move_account(struct page *page,
preempt_enable();
}
mem_cgroup_charge_statistics(from, anon, -nr_pages);
- if (uncharge)
- /* This is not "cancel", but cancel_charge does all we need. */
- __mem_cgroup_cancel_charge(from, nr_pages);
/* caller should have done css_get */
pc->mem_cgroup = to;
@@ -2663,15 +2647,13 @@ static int mem_cgroup_move_parent(struct page *page,
struct mem_cgroup *child,
gfp_t gfp_mask)
{
- struct cgroup *cg = child->css.cgroup;
- struct cgroup *pcg = cg->parent;
struct mem_cgroup *parent;
unsigned int nr_pages;
unsigned long uninitialized_var(flags);
int ret;
/* Is ROOT ? */
- if (!pcg)
+ if (mem_cgroup_is_root(child))
return -EINVAL;
ret = -EBUSY;
@@ -2682,21 +2664,23 @@ static int mem_cgroup_move_parent(struct page *page,
nr_pages = hpage_nr_pages(page);
- parent = mem_cgroup_from_cont(pcg);
- ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
- if (ret)
- goto put_back;
+ parent = parent_mem_cgroup(child);
+ /*
+ * If no parent, move charges to root cgroup.
+ */
+ if (!parent)
+ parent = root_mem_cgroup;
if (nr_pages > 1)
flags = compound_lock_irqsave(page);
- ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
- if (ret)
- __mem_cgroup_cancel_charge(parent, nr_pages);
+ ret = mem_cgroup_move_account(page, nr_pages,
+ pc, child, parent);
+ if (!ret)
+ __mem_cgroup_cancel_local_charge(child, nr_pages);
if (nr_pages > 1)
compound_unlock_irqrestore(page, flags);
-put_back:
putback_lru_page(page);
put:
put_page(page);
@@ -3772,7 +3756,7 @@ try_to_free:
goto move_account;
}
-int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
+static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
}
@@ -4030,103 +4014,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
}
#endif
-
-/* For read statistics */
-enum {
- MCS_CACHE,
- MCS_RSS,
- MCS_FILE_MAPPED,
- MCS_PGPGIN,
- MCS_PGPGOUT,
- MCS_SWAP,
- MCS_PGFAULT,
- MCS_PGMAJFAULT,
- MCS_INACTIVE_ANON,
- MCS_ACTIVE_ANON,
- MCS_INACTIVE_FILE,
- MCS_ACTIVE_FILE,
- MCS_UNEVICTABLE,
- NR_MCS_STAT,
-};
-
-struct mcs_total_stat {
- s64 stat[NR_MCS_STAT];
-};
-
-struct {
- char *local_name;
- char *total_name;
-} memcg_stat_strings[NR_MCS_STAT] = {
- {"cache", "total_cache"},
- {"rss", "total_rss"},
- {"mapped_file", "total_mapped_file"},
- {"pgpgin", "total_pgpgin"},
- {"pgpgout", "total_pgpgout"},
- {"swap", "total_swap"},
- {"pgfault", "total_pgfault"},
- {"pgmajfault", "total_pgmajfault"},
- {"inactive_anon", "total_inactive_anon"},
- {"active_anon", "total_active_anon"},
- {"inactive_file", "total_inactive_file"},
- {"active_file", "total_active_file"},
- {"unevictable", "total_unevictable"}
-};
-
-
-static void
-mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
-{
- s64 val;
-
- /* per cpu stat */
- val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
- s->stat[MCS_CACHE] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
- s->stat[MCS_RSS] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
- s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
- val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
- s->stat[MCS_PGPGIN] += val;
- val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
- s->stat[MCS_PGPGOUT] += val;
- if (do_swap_account) {
- val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
- s->stat[MCS_SWAP] += val * PAGE_SIZE;
- }
- val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
- s->stat[MCS_PGFAULT] += val;
- val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
- s->stat[MCS_PGMAJFAULT] += val;
-
- /* per zone stat */
- val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
- s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
- val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
- s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
- val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
- s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
- val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
- s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
- val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
- s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
-}
-
-static void
-mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
-{
- struct mem_cgroup *iter;
-
- for_each_mem_cgroup_tree(iter, memcg)
- mem_cgroup_get_local_stat(iter, s);
-}
-
#ifdef CONFIG_NUMA
-static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
+static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft,
+ struct seq_file *m)
{
int nid;
unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
unsigned long node_nr;
- struct cgroup *cont = m->private;
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
@@ -4167,38 +4061,76 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
}
#endif /* CONFIG_NUMA */
+static const char * const mem_cgroup_lru_names[] = {
+ "inactive_anon",
+ "active_anon",
+ "inactive_file",
+ "active_file",
+ "unevictable",
+};
+
+static inline void mem_cgroup_lru_names_not_uptodate(void)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
+}
+
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
- struct cgroup_map_cb *cb)
+ struct seq_file *m)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- struct mcs_total_stat mystat;
- int i;
-
- memset(&mystat, 0, sizeof(mystat));
- mem_cgroup_get_local_stat(memcg, &mystat);
+ struct mem_cgroup *mi;
+ unsigned int i;
-
- for (i = 0; i < NR_MCS_STAT; i++) {
- if (i == MCS_SWAP && !do_swap_account)
+ for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+ if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account)
continue;
- cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
+ seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
+ mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
}
+ for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
+ seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
+ mem_cgroup_read_events(memcg, i));
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
+ mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
+
/* Hierarchical information */
{
unsigned long long limit, memsw_limit;
memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
- cb->fill(cb, "hierarchical_memory_limit", limit);
+ seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
if (do_swap_account)
- cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
+ seq_printf(m, "hierarchical_memsw_limit %llu\n",
+ memsw_limit);
}
- memset(&mystat, 0, sizeof(mystat));
- mem_cgroup_get_total_stat(memcg, &mystat);
- for (i = 0; i < NR_MCS_STAT; i++) {
- if (i == MCS_SWAP && !do_swap_account)
+ for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+ long long val = 0;
+
+ if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account)
continue;
- cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
+ for_each_mem_cgroup_tree(mi, memcg)
+ val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
+ seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
+ }
+
+ for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
+ unsigned long long val = 0;
+
+ for_each_mem_cgroup_tree(mi, memcg)
+ val += mem_cgroup_read_events(mi, i);
+ seq_printf(m, "total_%s %llu\n",
+ mem_cgroup_events_names[i], val);
+ }
+
+ for (i = 0; i < NR_LRU_LISTS; i++) {
+ unsigned long long val = 0;
+
+ for_each_mem_cgroup_tree(mi, memcg)
+ val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
+ seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
}
#ifdef CONFIG_DEBUG_VM
@@ -4219,10 +4151,10 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
recent_scanned[0] += rstat->recent_scanned[0];
recent_scanned[1] += rstat->recent_scanned[1];
}
- cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
- cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
- cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
- cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
+ seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
+ seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
+ seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
+ seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
}
#endif
@@ -4284,7 +4216,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
usage = mem_cgroup_usage(memcg, swap);
/*
- * current_threshold points to threshold just below usage.
+ * current_threshold points to threshold just below or equal to usage.
* If it's not true, a threshold was crossed after last
* call of __mem_cgroup_threshold().
*/
@@ -4410,14 +4342,15 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
/* Find current threshold */
new->current_threshold = -1;
for (i = 0; i < size; i++) {
- if (new->entries[i].threshold < usage) {
+ if (new->entries[i].threshold <= usage) {
/*
* new->current_threshold will not be used until
* rcu_assign_pointer(), so it's safe to increment
* it here.
*/
++new->current_threshold;
- }
+ } else
+ break;
}
/* Free old spare buffer and save old primary buffer as spare */
@@ -4486,7 +4419,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
continue;
new->entries[j] = thresholds->primary->entries[i];
- if (new->entries[j].threshold < usage) {
+ if (new->entries[j].threshold <= usage) {
/*
* new->current_threshold will not be used
* until rcu_assign_pointer(), so it's safe to increment
@@ -4600,22 +4533,6 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
return 0;
}
-#ifdef CONFIG_NUMA
-static const struct file_operations mem_control_numa_stat_file_operations = {
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
-{
- struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
-
- file->f_op = &mem_control_numa_stat_file_operations;
- return single_open(file, mem_control_numa_stat_show, cont);
-}
-#endif /* CONFIG_NUMA */
-
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
@@ -4671,7 +4588,7 @@ static struct cftype mem_cgroup_files[] = {
},
{
.name = "stat",
- .read_map = mem_control_stat_show,
+ .read_seq_string = mem_control_stat_show,
},
{
.name = "force_empty",
@@ -4703,8 +4620,7 @@ static struct cftype mem_cgroup_files[] = {
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
- .open = mem_control_numa_stat_open,
- .mode = S_IRUGO,
+ .read_seq_string = mem_control_numa_stat_show,
},
#endif
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4741,7 +4657,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup_per_zone *mz;
- enum lru_list lru;
int zone, tmp = node;
/*
* This routine is called against possible nodes.
@@ -4759,8 +4674,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
- for_each_lru(lru)
- INIT_LIST_HEAD(&mz->lruvec.lists[lru]);
+ lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
mz->usage_in_excess = 0;
mz->on_tree = false;
mz->memcg = memcg;
@@ -4803,23 +4717,40 @@ out_free:
}
/*
- * Helpers for freeing a vzalloc()ed mem_cgroup by RCU,
+ * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
* but in process context. The work_freeing structure is overlaid
* on the rcu_freeing structure, which itself is overlaid on memsw.
*/
-static void vfree_work(struct work_struct *work)
+static void free_work(struct work_struct *work)
{
struct mem_cgroup *memcg;
+ int size = sizeof(struct mem_cgroup);
memcg = container_of(work, struct mem_cgroup, work_freeing);
- vfree(memcg);
+ /*
+ * We need to make sure that (at least for now), the jump label
+ * destruction code runs outside of the cgroup lock. This is because
+ * get_online_cpus(), which is called from the static_branch update,
+ * can't be called inside the cgroup_lock. cpusets are the ones
+ * enforcing this dependency, so if they ever change, we might as well.
+ *
+ * schedule_work() will guarantee this happens. Be careful if you need
+ * to move this code around, and make sure it is outside
+ * the cgroup_lock.
+ */
+ disarm_sock_keys(memcg);
+ if (size < PAGE_SIZE)
+ kfree(memcg);
+ else
+ vfree(memcg);
}
-static void vfree_rcu(struct rcu_head *rcu_head)
+
+static void free_rcu(struct rcu_head *rcu_head)
{
struct mem_cgroup *memcg;
memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
- INIT_WORK(&memcg->work_freeing, vfree_work);
+ INIT_WORK(&memcg->work_freeing, free_work);
schedule_work(&memcg->work_freeing);
}
@@ -4845,10 +4776,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
free_mem_cgroup_per_zone_info(memcg, node);
free_percpu(memcg->stat);
- if (sizeof(struct mem_cgroup) < PAGE_SIZE)
- kfree_rcu(memcg, rcu_freeing);
- else
- call_rcu(&memcg->rcu_freeing, vfree_rcu);
+ call_rcu(&memcg->rcu_freeing, free_rcu);
}
static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -5154,7 +5082,6 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
struct page *page = NULL;
- struct inode *inode;
struct address_space *mapping;
pgoff_t pgoff;
@@ -5163,7 +5090,6 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
if (!move_file())
return NULL;
- inode = vma->vm_file->f_path.dentry->d_inode;
mapping = vma->vm_file->f_mapping;
if (pte_none(ptent))
pgoff = linear_page_index(vma, addr);
@@ -5462,8 +5388,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
if (!isolate_lru_page(page)) {
pc = lookup_page_cgroup(page);
if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
- pc, mc.from, mc.to,
- false)) {
+ pc, mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
@@ -5493,7 +5418,7 @@ retry:
goto put;
pc = lookup_page_cgroup(page);
if (!mem_cgroup_move_account(page, 1, pc,
- mc.from, mc.to, false)) {
+ mc.from, mc.to)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ab1e7145e290..de4ce7058450 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -345,14 +345,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* Also when FAIL is set do a force kill because something went
* wrong earlier.
*/
-static void kill_procs(struct list_head *to_kill, int doit, int trapno,
+static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
int fail, struct page *page, unsigned long pfn,
int flags)
{
struct to_kill *tk, *next;
list_for_each_entry_safe (tk, next, to_kill, nd) {
- if (doit) {
+ if (forcekill) {
/*
* In case something went wrong with munmapping
* make sure the process doesn't catch the
@@ -858,7 +858,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
- int kill = 1;
+ int kill = 1, forcekill;
struct page *hpage = compound_head(p);
struct page *ppage;
@@ -888,7 +888,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* be called inside page lock (it's recommended but not enforced).
*/
mapping = page_mapping(hpage);
- if (!PageDirty(hpage) && mapping &&
+ if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
mapping_cap_writeback_dirty(mapping)) {
if (page_mkclean(hpage)) {
SetPageDirty(hpage);
@@ -965,12 +965,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* Now that the dirty bit has been propagated to the
* struct page and all unmaps done we can decide if
* killing is needed or not. Only kill when the page
- * was dirty, otherwise the tokill list is merely
+ * was dirty or the process is not restartable,
+ * otherwise the tokill list is merely
* freed. When there was a problem unmapping earlier
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- kill_procs(&tokill, !!PageDirty(ppage), trapno,
+ forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
+ kill_procs(&tokill, forcekill, trapno,
ret != SWAP_SUCCESS, p, pfn, flags);
return ret;
diff --git a/mm/memory.c b/mm/memory.c
index 1b7dc662bf9f..2466d1250231 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1225,7 +1225,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
- VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+#ifdef CONFIG_DEBUG_VM
+ if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
+ pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
+ __func__, addr, end,
+ vma->vm_start,
+ vma->vm_end);
+ BUG();
+ }
+#endif
split_huge_page_pmd(vma->vm_mm, pmd);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
@@ -1366,7 +1374,7 @@ void unmap_vmas(struct mmu_gather *tlb,
/**
* zap_page_range - remove user pages in a given range
* @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
+ * @start: starting address of pages to zap
* @size: number of bytes to zap
* @details: details of nonlinear truncation or shared cache invalidation
*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0d7e3ec8e0f3..427bb291dd0f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -618,7 +618,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
pgdat = hotadd_new_pgdat(nid, start);
ret = -ENOMEM;
if (!pgdat)
- goto out;
+ goto error;
new_pgdat = 1;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f15c1b24ca18..1d771e4200d2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1177,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist)) {
nr_failed = migrate_pages(&pagelist, new_vma_page,
(unsigned long)vma,
- false, true);
+ false, MIGRATE_SYNC);
if (nr_failed)
putback_lru_pages(&pagelist);
}
diff --git a/mm/migrate.c b/mm/migrate.c
index ab81d482ae6f..be26d5cbe56b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -436,7 +436,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
* is actually a signal that all of the page has become dirty.
* Whereas only part of our page may be dirty.
*/
- __set_page_dirty_nobuffers(newpage);
+ if (PageSwapBacked(page))
+ SetPageDirty(newpage);
+ else
+ __set_page_dirty_nobuffers(newpage);
}
mlock_migrate_page(newpage, page);
diff --git a/mm/mmap.c b/mm/mmap.c
index 4a9c2a391e28..3edfcdfa42d9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -971,15 +971,13 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
* The caller must hold down_write(&current->mm->mmap_sem).
*/
-static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, unsigned long pgoff)
{
struct mm_struct * mm = current->mm;
struct inode *inode;
vm_flags_t vm_flags;
- int error;
- unsigned long reqprot = prot;
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1101,39 +1099,9 @@ static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
}
}
- error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
- if (error)
- return error;
-
return mmap_region(file, addr, len, flags, vm_flags, pgoff);
}
-unsigned long do_mmap(struct file *file, unsigned long addr,
- unsigned long len, unsigned long prot,
- unsigned long flag, unsigned long offset)
-{
- if (unlikely(offset + PAGE_ALIGN(len) < offset))
- return -EINVAL;
- if (unlikely(offset & ~PAGE_MASK))
- return -EINVAL;
- return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
-}
-EXPORT_SYMBOL(do_mmap);
-
-unsigned long vm_mmap(struct file *file, unsigned long addr,
- unsigned long len, unsigned long prot,
- unsigned long flag, unsigned long offset)
-{
- unsigned long ret;
- struct mm_struct *mm = current->mm;
-
- down_write(&mm->mmap_sem);
- ret = do_mmap(file, addr, len, prot, flag, offset);
- up_write(&mm->mmap_sem);
- return ret;
-}
-EXPORT_SYMBOL(vm_mmap);
-
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, pgoff)
@@ -1165,10 +1133,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
- down_write(&current->mm->mmap_sem);
- retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
- up_write(&current->mm->mmap_sem);
-
+ retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
if (file)
fput(file);
out:
@@ -1629,7 +1594,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
if (addr & ~PAGE_MASK)
return -EINVAL;
- return arch_rebalance_pgtables(addr, len);
+ addr = arch_rebalance_pgtables(addr, len);
+ error = security_mmap_addr(addr);
+ return error ? error : addr;
}
EXPORT_SYMBOL(get_unmapped_area);
@@ -1819,7 +1786,7 @@ int expand_downwards(struct vm_area_struct *vma,
return -ENOMEM;
address &= PAGE_MASK;
- error = security_file_mmap(NULL, 0, 0, 0, address, 1);
+ error = security_mmap_addr(address);
if (error)
return error;
@@ -2159,7 +2126,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
return 0;
}
-EXPORT_SYMBOL(do_munmap);
int vm_munmap(unsigned long start, size_t len)
{
@@ -2207,10 +2173,6 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
if (!len)
return addr;
- error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
- if (error)
- return error;
-
flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
@@ -2563,10 +2525,6 @@ int install_special_mapping(struct mm_struct *mm,
vma->vm_ops = &special_mapping_vmops;
vma->vm_private_data = pages;
- ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
- if (ret)
- goto out;
-
ret = insert_vm_struct(mm, vma);
if (ret)
goto out;
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 7cf7b7ddc7c5..6830eab5bf09 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -86,3 +86,17 @@ int memmap_valid_within(unsigned long pfn,
return 1;
}
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
+
+void lruvec_init(struct lruvec *lruvec, struct zone *zone)
+{
+ enum lru_list lru;
+
+ memset(lruvec, 0, sizeof(struct lruvec));
+
+ for_each_lru(lru)
+ INIT_LIST_HEAD(&lruvec->lists[lru]);
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+ lruvec->zone = zone;
+#endif
+}
diff --git a/mm/mremap.c b/mm/mremap.c
index db8d983b5a7d..21fed202ddad 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -371,10 +371,6 @@ static unsigned long mremap_to(unsigned long addr,
if ((addr <= new_addr) && (addr+old_len) > new_addr)
goto out;
- ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
- if (ret)
- goto out;
-
ret = do_munmap(mm, new_addr, new_len);
if (ret)
goto out;
@@ -432,15 +428,17 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
* MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
* This option implies MREMAP_MAYMOVE.
*/
-unsigned long do_mremap(unsigned long addr,
- unsigned long old_len, unsigned long new_len,
- unsigned long flags, unsigned long new_addr)
+SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
+ unsigned long, new_len, unsigned long, flags,
+ unsigned long, new_addr)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
unsigned long charged = 0;
+ down_write(&current->mm->mmap_sem);
+
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
goto out;
@@ -530,25 +528,11 @@ unsigned long do_mremap(unsigned long addr,
goto out;
}
- ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
- if (ret)
- goto out;
ret = move_vma(vma, addr, old_len, new_len, new_addr);
}
out:
if (ret & ~PAGE_MASK)
vm_unacct_memory(charged);
- return ret;
-}
-
-SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
- unsigned long, new_len, unsigned long, flags,
- unsigned long, new_addr)
-{
- unsigned long ret;
-
- down_write(&current->mm->mmap_sem);
- ret = do_mremap(addr, old_len, new_len, flags, new_addr);
up_write(&current->mm->mmap_sem);
return ret;
}
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index d23415c001bc..405573010f99 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
__free_pages_bootmem(pfn_to_page(i), 0);
}
+static unsigned long __init __free_memory_core(phys_addr_t start,
+ phys_addr_t end)
+{
+ unsigned long start_pfn = PFN_UP(start);
+ unsigned long end_pfn = min_t(unsigned long,
+ PFN_DOWN(end), max_low_pfn);
+
+ if (start_pfn > end_pfn)
+ return 0;
+
+ __free_pages_memory(start_pfn, end_pfn);
+
+ return end_pfn - start_pfn;
+}
+
unsigned long __init free_low_memory_core_early(int nodeid)
{
unsigned long count = 0;
- phys_addr_t start, end;
+ phys_addr_t start, end, size;
u64 i;
- /* free reserved array temporarily so that it's treated as free area */
- memblock_free_reserved_regions();
-
- for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
- unsigned long start_pfn = PFN_UP(start);
- unsigned long end_pfn = min_t(unsigned long,
- PFN_DOWN(end), max_low_pfn);
- if (start_pfn < end_pfn) {
- __free_pages_memory(start_pfn, end_pfn);
- count += end_pfn - start_pfn;
- }
- }
+ for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
+ count += __free_memory_core(start, end);
+
+ /* free range that is used for reserved array if we allocate it */
+ size = get_allocated_memblock_reserved_regions_info(&start);
+ if (size)
+ count += __free_memory_core(start, start + size);
- /* put region array back? */
- memblock_reserve_reserved_regions();
return count;
}
@@ -274,7 +282,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
return ___alloc_bootmem(size, align, goal, limit);
}
-static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal,
diff --git a/mm/nommu.c b/mm/nommu.c
index bb8f4f004a82..d4b0c10872de 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -889,7 +889,6 @@ static int validate_mmap_request(struct file *file,
unsigned long *_capabilities)
{
unsigned long capabilities, rlen;
- unsigned long reqprot = prot;
int ret;
/* do the simple checks first */
@@ -1047,7 +1046,7 @@ static int validate_mmap_request(struct file *file,
}
/* allow the security API to have its say */
- ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
+ ret = security_mmap_addr(addr);
if (ret < 0)
return ret;
@@ -1233,7 +1232,7 @@ enomem:
/*
* handle mapping creation for uClinux
*/
-static unsigned long do_mmap_pgoff(struct file *file,
+unsigned long do_mmap_pgoff(struct file *file,
unsigned long addr,
unsigned long len,
unsigned long prot,
@@ -1471,32 +1470,6 @@ error_getting_region:
return -ENOMEM;
}
-unsigned long do_mmap(struct file *file, unsigned long addr,
- unsigned long len, unsigned long prot,
- unsigned long flag, unsigned long offset)
-{
- if (unlikely(offset + PAGE_ALIGN(len) < offset))
- return -EINVAL;
- if (unlikely(offset & ~PAGE_MASK))
- return -EINVAL;
- return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
-}
-EXPORT_SYMBOL(do_mmap);
-
-unsigned long vm_mmap(struct file *file, unsigned long addr,
- unsigned long len, unsigned long prot,
- unsigned long flag, unsigned long offset)
-{
- unsigned long ret;
- struct mm_struct *mm = current->mm;
-
- down_write(&mm->mmap_sem);
- ret = do_mmap(file, addr, len, prot, flag, offset);
- up_write(&mm->mmap_sem);
- return ret;
-}
-EXPORT_SYMBOL(vm_mmap);
-
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, pgoff)
@@ -1513,9 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
- down_write(&current->mm->mmap_sem);
- retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
- up_write(&current->mm->mmap_sem);
+ retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
if (file)
fput(file);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ed0e19677360..ac300c99baf6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -183,7 +183,8 @@ static bool oom_unkillable_task(struct task_struct *p,
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
const nodemask_t *nodemask, unsigned long totalpages)
{
- unsigned long points;
+ long points;
+ long adj;
if (oom_unkillable_task(p, memcg, nodemask))
return 0;
@@ -192,7 +193,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
if (!p)
return 0;
- if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ adj = p->signal->oom_score_adj;
+ if (adj == OOM_SCORE_ADJ_MIN) {
task_unlock(p);
return 0;
}
@@ -210,20 +212,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* implementation used by LSMs.
*/
if (has_capability_noaudit(p, CAP_SYS_ADMIN))
- points -= 30 * totalpages / 1000;
+ adj -= 30;
- /*
- * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
- * either completely disable oom killing or always prefer a certain
- * task.
- */
- points += p->signal->oom_score_adj * totalpages / 1000;
+ /* Normalize to oom_score_adj units */
+ adj *= totalpages / 1000;
+ points += adj;
/*
* Never return 0 for an eligible task regardless of the root bonus and
* oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
*/
- return points ? points : 1;
+ return points > 0 ? points : 1;
}
/*
@@ -366,7 +365,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
/**
* dump_tasks - dump current memory state of all system tasks
- * @mem: current's memory controller, if constrained
+ * @memcg: current's memory controller, if constrained
* @nodemask: nodemask passed to page allocator for mempolicy ooms
*
* Dumps the current memory state of all eligible tasks. Tasks not in the same
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8cbfc38e68ac..4a4f9219683f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -219,7 +219,7 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
-void set_pageblock_migratetype(struct page *page, int migratetype)
+static void set_pageblock_migratetype(struct page *page, int migratetype)
{
if (unlikely(page_group_by_mobility_disabled))
@@ -954,8 +954,8 @@ static int move_freepages(struct zone *zone,
return pages_moved;
}
-int move_freepages_block(struct zone *zone, struct page *page,
- int migratetype)
+static int move_freepages_block(struct zone *zone, struct page *page,
+ int migratetype)
{
unsigned long start_pfn, end_pfn;
struct page *start_page, *end_page;
@@ -4358,7 +4358,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, memmap_pages;
- enum lru_list lru;
size = zone_spanned_pages_in_node(nid, j, zones_size);
realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4408,12 +4407,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
- for_each_lru(lru)
- INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
- zone->lruvec.reclaim_stat.recent_rotated[0] = 0;
- zone->lruvec.reclaim_stat.recent_rotated[1] = 0;
- zone->lruvec.reclaim_stat.recent_scanned[0] = 0;
- zone->lruvec.reclaim_stat.recent_scanned[1] = 0;
+ lruvec_init(&zone->lruvec, zone);
zap_zone_vm_stats(zone);
zone->flags = 0;
if (!size)
@@ -5641,7 +5635,12 @@ static struct page *
__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
int **resultp)
{
- return alloc_page(GFP_HIGHUSER_MOVABLE);
+ gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
+
+ if (PageHighMem(page))
+ gfp_mask |= __GFP_HIGHMEM;
+
+ return alloc_page(gfp_mask);
}
/* [start, end) must belong to a single zone. */
@@ -5657,7 +5656,7 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
.nr_migratepages = 0,
.order = -1,
.zone = page_zone(pfn_to_page(start)),
- .mode = COMPACT_SYNC,
+ .sync = true,
};
INIT_LIST_HEAD(&cc.migratepages);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 1ccbd714059c..eb750f851395 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
/**
* swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
- * @end: swap entry to be cmpxchged
+ * @ent: swap entry to be cmpxchged
* @old: old id
* @new: new id
*
@@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
/**
* swap_cgroup_record - record mem_cgroup for this swp_entry.
* @ent: swap entry to be recorded into
- * @mem: mem_cgroup to be recorded
+ * @id: mem_cgroup to be recorded
*
* Returns old value at success, 0 at failure.
* (Of course, old value can be 0.)
diff --git a/mm/page_io.c b/mm/page_io.c
index dc76b4d0611e..34f02923744c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,6 +18,7 @@
#include <linux/bio.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
+#include <linux/frontswap.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
unlock_page(page);
goto out;
}
+ if (frontswap_store(page) == 0) {
+ set_page_writeback(page);
+ unlock_page(page);
+ end_page_writeback(page);
+ goto out;
+ }
bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
if (bio == NULL) {
set_page_dirty(page);
@@ -122,6 +129,11 @@ int swap_readpage(struct page *page)
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageUptodate(page));
+ if (frontswap_load(page) == 0) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ goto out;
+ }
bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index aa9701e12714..6c118d012bb5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
/**
* walk_page_range - walk a memory map's page tables with a callback
- * @mm: memory map to walk
* @addr: starting address
* @end: ending address
* @walk: set of callbacks to invoke for each level of the tree
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 405d331804c3..3707c71ae4cd 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -360,7 +360,6 @@ err_free:
* @chunk: chunk to depopulate
* @off: offset to the area to depopulate
* @size: size of the area to depopulate in bytes
- * @flush: whether to flush cache and tlb or not
*
* For each cpu, depopulate and unmap pages [@page_start,@page_end)
* from @chunk. If @flush is true, vcache is flushed before unmapping
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index c20ff48994c2..926b46649749 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -371,15 +371,15 @@ static ssize_t process_vm_rw(pid_t pid,
/* Check iovecs */
if (vm_write)
rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV,
- iovstack_l, &iov_l, 1);
+ iovstack_l, &iov_l);
else
rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV,
- iovstack_l, &iov_l, 1);
+ iovstack_l, &iov_l);
if (rc <= 0)
goto free_iovecs;
- rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV,
- iovstack_r, &iov_r, 0);
+ rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
+ iovstack_r, &iov_r);
if (rc <= 0)
goto free_iovecs;
@@ -438,16 +438,16 @@ compat_process_vm_rw(compat_pid_t pid,
if (vm_write)
rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
UIO_FASTIOV, iovstack_l,
- &iov_l, 1);
+ &iov_l);
else
rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
UIO_FASTIOV, iovstack_l,
- &iov_l, 1);
+ &iov_l);
if (rc <= 0)
goto free_iovecs;
- rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt,
+ rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
UIO_FASTIOV, iovstack_r,
- &iov_r, 0);
+ &iov_r);
if (rc <= 0)
goto free_iovecs;
diff --git a/mm/shmem.c b/mm/shmem.c
index d576b84d913c..c15b998e5a86 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -264,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
}
/*
+ * Sometimes, before we decide whether to proceed or to fail, we must check
+ * that an entry was not already brought back from swap by a racing thread.
+ *
+ * Checking page is not enough: by the time a SwapCache page is locked, it
+ * might be reused, and again be SwapCache, using the same swap as before.
+ */
+static bool shmem_confirm_swap(struct address_space *mapping,
+ pgoff_t index, swp_entry_t swap)
+{
+ void *item;
+
+ rcu_read_lock();
+ item = radix_tree_lookup(&mapping->page_tree, index);
+ rcu_read_unlock();
+ return item == swp_to_radix_entry(swap);
+}
+
+/*
* Like add_to_page_cache_locked, but error if expected item has gone.
*/
static int shmem_add_to_page_cache(struct page *page,
struct address_space *mapping,
pgoff_t index, gfp_t gfp, void *expected)
{
- int error = 0;
+ int error;
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(!PageSwapBacked(page));
+ page_cache_get(page);
+ page->mapping = mapping;
+ page->index = index;
+
+ spin_lock_irq(&mapping->tree_lock);
if (!expected)
- error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+ error = radix_tree_insert(&mapping->page_tree, index, page);
+ else
+ error = shmem_radix_tree_replace(mapping, index, expected,
+ page);
if (!error) {
- page_cache_get(page);
- page->mapping = mapping;
- page->index = index;
-
- spin_lock_irq(&mapping->tree_lock);
- if (!expected)
- error = radix_tree_insert(&mapping->page_tree,
- index, page);
- else
- error = shmem_radix_tree_replace(mapping, index,
- expected, page);
- if (!error) {
- mapping->nrpages++;
- __inc_zone_page_state(page, NR_FILE_PAGES);
- __inc_zone_page_state(page, NR_SHMEM);
- spin_unlock_irq(&mapping->tree_lock);
- } else {
- page->mapping = NULL;
- spin_unlock_irq(&mapping->tree_lock);
- page_cache_release(page);
- }
- if (!expected)
- radix_tree_preload_end();
+ mapping->nrpages++;
+ __inc_zone_page_state(page, NR_FILE_PAGES);
+ __inc_zone_page_state(page, NR_SHMEM);
+ spin_unlock_irq(&mapping->tree_lock);
+ } else {
+ page->mapping = NULL;
+ spin_unlock_irq(&mapping->tree_lock);
+ page_cache_release(page);
}
- if (error)
- mem_cgroup_uncharge_cache_page(page);
return error;
}
@@ -683,10 +692,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
mutex_lock(&shmem_swaplist_mutex);
/*
* We needed to drop mutex to make that restrictive page
- * allocation; but the inode might already be freed by now,
- * and we cannot refer to inode or mapping or info to check.
- * However, we do hold page lock on the PageSwapCache page,
- * so can check if that still has our reference remaining.
+ * allocation, but the inode might have been freed while we
+ * dropped it: although a racing shmem_evict_inode() cannot
+ * complete without emptying the radix_tree, our page lock
+ * on this swapcache page is not enough to prevent that -
+ * free_swap_and_cache() of our swap entry will only
+ * trylock_page(), removing swap from radix_tree whatever.
+ *
+ * We must not proceed to shmem_add_to_page_cache() if the
+ * inode has been freed, but of course we cannot rely on
+ * inode or mapping or info to check that. However, we can
+ * safely check if our swap entry is still in use (and here
+ * it can't have got reused for another page): if it's still
+ * in use, then the inode cannot have been freed yet, and we
+ * can safely proceed (if it's no longer in use, that tells
+ * nothing about the inode, but we don't need to unuse swap).
*/
if (!page_swapcount(*pagep))
error = -ENOENT;
@@ -730,9 +750,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
/*
* There's a faint possibility that swap page was replaced before
- * caller locked it: it will come back later with the right page.
+ * caller locked it: caller will come back later with the right page.
*/
- if (unlikely(!PageSwapCache(page)))
+ if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
goto out;
/*
@@ -995,21 +1015,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
newpage = shmem_alloc_page(gfp, info, index);
if (!newpage)
return -ENOMEM;
- VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
- *pagep = newpage;
page_cache_get(newpage);
copy_highpage(newpage, oldpage);
+ flush_dcache_page(newpage);
- VM_BUG_ON(!PageLocked(oldpage));
__set_page_locked(newpage);
- VM_BUG_ON(!PageUptodate(oldpage));
SetPageUptodate(newpage);
- VM_BUG_ON(!PageSwapBacked(oldpage));
SetPageSwapBacked(newpage);
- VM_BUG_ON(!swap_index);
set_page_private(newpage, swap_index);
- VM_BUG_ON(!PageSwapCache(oldpage));
SetPageSwapCache(newpage);
/*
@@ -1019,13 +1033,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
spin_lock_irq(&swap_mapping->tree_lock);
error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
newpage);
- __inc_zone_page_state(newpage, NR_FILE_PAGES);
- __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+ if (!error) {
+ __inc_zone_page_state(newpage, NR_FILE_PAGES);
+ __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+ }
spin_unlock_irq(&swap_mapping->tree_lock);
- BUG_ON(error);
- mem_cgroup_replace_page_cache(oldpage, newpage);
- lru_cache_add_anon(newpage);
+ if (unlikely(error)) {
+ /*
+ * Is this possible? I think not, now that our callers check
+ * both PageSwapCache and page_private after getting page lock;
+ * but be defensive. Reverse old to newpage for clear and free.
+ */
+ oldpage = newpage;
+ } else {
+ mem_cgroup_replace_page_cache(oldpage, newpage);
+ lru_cache_add_anon(newpage);
+ *pagep = newpage;
+ }
ClearPageSwapCache(oldpage);
set_page_private(oldpage, 0);
@@ -1033,7 +1058,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
unlock_page(oldpage);
page_cache_release(oldpage);
page_cache_release(oldpage);
- return 0;
+ return error;
}
/*
@@ -1107,9 +1132,10 @@ repeat:
/* We have to do this with page locked to prevent races */
lock_page(page);
- if (!PageSwapCache(page) || page->mapping) {
+ if (!PageSwapCache(page) || page_private(page) != swap.val ||
+ !shmem_confirm_swap(mapping, index, swap)) {
error = -EEXIST; /* try again */
- goto failed;
+ goto unlock;
}
if (!PageUptodate(page)) {
error = -EIO;
@@ -1125,9 +1151,12 @@ repeat:
error = mem_cgroup_cache_charge(page, current->mm,
gfp & GFP_RECLAIM_MASK);
- if (!error)
+ if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
gfp, swp_to_radix_entry(swap));
+ /* We already confirmed swap, and make no allocation */
+ VM_BUG_ON(error);
+ }
if (error)
goto failed;
@@ -1164,11 +1193,18 @@ repeat:
__set_page_locked(page);
error = mem_cgroup_cache_charge(page, current->mm,
gfp & GFP_RECLAIM_MASK);
- if (!error)
- error = shmem_add_to_page_cache(page, mapping, index,
- gfp, NULL);
if (error)
goto decused;
+ error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+ if (!error) {
+ error = shmem_add_to_page_cache(page, mapping, index,
+ gfp, NULL);
+ radix_tree_preload_end();
+ }
+ if (error) {
+ mem_cgroup_uncharge_cache_page(page);
+ goto decused;
+ }
lru_cache_add_anon(page);
spin_lock(&info->lock);
@@ -1228,14 +1264,10 @@ decused:
unacct:
shmem_unacct_blocks(info->flags, 1);
failed:
- if (swap.val && error != -EINVAL) {
- struct page *test = find_get_page(mapping, index);
- if (test && !radix_tree_exceptional_entry(test))
- page_cache_release(test);
- /* Have another try if the entry has changed */
- if (test != swp_to_radix_entry(swap))
- error = -EEXIST;
- }
+ if (swap.val && error != -EINVAL &&
+ !shmem_confirm_swap(mapping, index, swap))
+ error = -EEXIST;
+unlock:
if (page) {
unlock_page(page);
page_cache_release(page);
@@ -1247,7 +1279,7 @@ failed:
spin_unlock(&info->lock);
goto repeat;
}
- if (error == -EEXIST)
+ if (error == -EEXIST) /* from above or from radix_tree_insert */
goto repeat;
return error;
}
@@ -1577,6 +1609,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
struct splice_pipe_desc spd = {
.pages = pages,
.partial = partial,
+ .nr_pages_max = PIPE_DEF_BUFFERS,
.flags = flags,
.ops = &page_cache_pipe_buf_ops,
.spd_release = spd_release_page,
@@ -1665,7 +1698,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
if (spd.nr_pages)
error = splice_to_pipe(pipe, &spd);
- splice_shrink_spd(pipe, &spd);
+ splice_shrink_spd(&spd);
if (error > 0) {
*ppos += error;
@@ -1674,98 +1707,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
return error;
}
-/*
- * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
- */
-static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
- pgoff_t index, pgoff_t end, int origin)
-{
- struct page *page;
- struct pagevec pvec;
- pgoff_t indices[PAGEVEC_SIZE];
- bool done = false;
- int i;
-
- pagevec_init(&pvec, 0);
- pvec.nr = 1; /* start small: we may be there already */
- while (!done) {
- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
- pvec.nr, pvec.pages, indices);
- if (!pvec.nr) {
- if (origin == SEEK_DATA)
- index = end;
- break;
- }
- for (i = 0; i < pvec.nr; i++, index++) {
- if (index < indices[i]) {
- if (origin == SEEK_HOLE) {
- done = true;
- break;
- }
- index = indices[i];
- }
- page = pvec.pages[i];
- if (page && !radix_tree_exceptional_entry(page)) {
- if (!PageUptodate(page))
- page = NULL;
- }
- if (index >= end ||
- (page && origin == SEEK_DATA) ||
- (!page && origin == SEEK_HOLE)) {
- done = true;
- break;
- }
- }
- shmem_deswap_pagevec(&pvec);
- pagevec_release(&pvec);
- pvec.nr = PAGEVEC_SIZE;
- cond_resched();
- }
- return index;
-}
-
-static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
-{
- struct address_space *mapping;
- struct inode *inode;
- pgoff_t start, end;
- loff_t new_offset;
-
- if (origin != SEEK_DATA && origin != SEEK_HOLE)
- return generic_file_llseek_size(file, offset, origin,
- MAX_LFS_FILESIZE);
- mapping = file->f_mapping;
- inode = mapping->host;
- mutex_lock(&inode->i_mutex);
- /* We're holding i_mutex so we can access i_size directly */
-
- if (offset < 0)
- offset = -EINVAL;
- else if (offset >= inode->i_size)
- offset = -ENXIO;
- else {
- start = offset >> PAGE_CACHE_SHIFT;
- end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- new_offset = shmem_seek_hole_data(mapping, start, end, origin);
- new_offset <<= PAGE_CACHE_SHIFT;
- if (new_offset > offset) {
- if (new_offset < inode->i_size)
- offset = new_offset;
- else if (origin == SEEK_DATA)
- offset = -ENXIO;
- else
- offset = inode->i_size;
- }
- }
-
- if (offset >= 0 && offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
- mutex_unlock(&inode->i_mutex);
- return offset;
-}
-
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
@@ -1936,7 +1877,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
}
static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ bool excl)
{
return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
}
@@ -2439,11 +2380,9 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
return dentry;
}
-static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
- int connectable)
+static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
+ struct inode *parent)
{
- struct inode *inode = dentry->d_inode;
-
if (*len < 3) {
*len = 3;
return 255;
@@ -2771,7 +2710,7 @@ static const struct address_space_operations shmem_aops = {
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
#ifdef CONFIG_TMPFS
- .llseek = shmem_file_llseek,
+ .llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = shmem_file_aio_read,
diff --git a/mm/slub.c b/mm/slub.c
index 80848cd3901c..8c691fa1cf3c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1369,7 +1369,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
inc_slabs_node(s, page_to_nid(page), page->objects);
page->slab = s;
- page->flags |= 1 << PG_slab;
+ __SetPageSlab(page);
start = page_address(page);
@@ -1514,15 +1514,19 @@ static inline void *acquire_slab(struct kmem_cache *s,
freelist = page->freelist;
counters = page->counters;
new.counters = counters;
- if (mode)
+ if (mode) {
new.inuse = page->objects;
+ new.freelist = NULL;
+ } else {
+ new.freelist = freelist;
+ }
VM_BUG_ON(new.frozen);
new.frozen = 1;
} while (!__cmpxchg_double_slab(s, page,
freelist, counters,
- NULL, new.counters,
+ new.freelist, new.counters,
"lock and freeze"));
remove_partial(n, page);
@@ -1564,7 +1568,6 @@ static void *get_partial_node(struct kmem_cache *s,
object = t;
available = page->objects - page->inuse;
} else {
- page->freelist = t;
available = put_cpu_partial(s, page, 0);
stat(s, CPU_PARTIAL_NODE);
}
@@ -1579,7 +1582,7 @@ static void *get_partial_node(struct kmem_cache *s,
/*
* Get a page from somewhere. Search in increasing NUMA distances.
*/
-static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
+static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
struct kmem_cache_cpu *c)
{
#ifdef CONFIG_NUMA
@@ -2766,7 +2769,7 @@ static unsigned long calculate_alignment(unsigned long flags,
}
static void
-init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
+init_kmem_cache_node(struct kmem_cache_node *n)
{
n->nr_partial = 0;
spin_lock_init(&n->list_lock);
@@ -2836,7 +2839,7 @@ static void early_kmem_cache_node_alloc(int node)
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
init_tracking(kmem_cache_node, n);
#endif
- init_kmem_cache_node(n, kmem_cache_node);
+ init_kmem_cache_node(n);
inc_slabs_node(kmem_cache_node, node, page->objects);
add_partial(n, page, DEACTIVATE_TO_HEAD);
@@ -2876,7 +2879,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
}
s->node[node] = n;
- init_kmem_cache_node(n, s);
+ init_kmem_cache_node(n);
}
return 1;
}
@@ -3625,7 +3628,7 @@ static int slab_mem_going_online_callback(void *arg)
ret = -ENOMEM;
goto out;
}
- init_kmem_cache_node(n, s);
+ init_kmem_cache_node(n);
s->node[nid] = n;
}
out:
@@ -3968,9 +3971,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
}
return s;
}
- kfree(n);
kfree(s);
}
+ kfree(n);
err:
up_write(&slub_lock);
diff --git a/mm/sparse.c b/mm/sparse.c
index 6a4bf9160e85..c7bb952400c8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -275,8 +275,9 @@ static unsigned long * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
{
- pg_data_t *host_pgdat;
- unsigned long goal;
+ unsigned long goal, limit;
+ unsigned long *p;
+ int nid;
/*
* A page may contain usemaps for other sections preventing the
* page being freed and making a section unremovable while
@@ -287,10 +288,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
* from the same section as the pgdat where possible to avoid
* this problem.
*/
- goal = __pa(pgdat) & PAGE_SECTION_MASK;
- host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT));
- return __alloc_bootmem_node_nopanic(host_pgdat, size,
- SMP_CACHE_BYTES, goal);
+ goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
+ limit = goal + (1UL << PA_SECTION_SHIFT);
+ nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
+again:
+ p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
+ SMP_CACHE_BYTES, goal, limit);
+ if (!p && limit) {
+ limit = 0;
+ goto again;
+ }
+ return p;
}
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
diff --git a/mm/swap.c b/mm/swap.c
index 0503ad705e7c..4e7e2ec67078 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,13 +47,15 @@ static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
static void __page_cache_release(struct page *page)
{
if (PageLRU(page)) {
- unsigned long flags;
struct zone *zone = page_zone(page);
+ struct lruvec *lruvec;
+ unsigned long flags;
spin_lock_irqsave(&zone->lru_lock, flags);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
VM_BUG_ON(!PageLRU(page));
__ClearPageLRU(page);
- del_page_from_lru_list(zone, page, page_off_lru(page));
+ del_page_from_lru_list(page, lruvec, page_off_lru(page));
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
}
@@ -235,11 +237,12 @@ void put_pages_list(struct list_head *pages)
EXPORT_SYMBOL(put_pages_list);
static void pagevec_lru_move_fn(struct pagevec *pvec,
- void (*move_fn)(struct page *page, void *arg),
- void *arg)
+ void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
+ void *arg)
{
int i;
struct zone *zone = NULL;
+ struct lruvec *lruvec;
unsigned long flags = 0;
for (i = 0; i < pagevec_count(pvec); i++) {
@@ -253,7 +256,8 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
spin_lock_irqsave(&zone->lru_lock, flags);
}
- (*move_fn)(page, arg);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+ (*move_fn)(page, lruvec, arg);
}
if (zone)
spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -261,16 +265,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
pagevec_reinit(pvec);
}
-static void pagevec_move_tail_fn(struct page *page, void *arg)
+static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
{
int *pgmoved = arg;
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
enum lru_list lru = page_lru_base_type(page);
- struct lruvec *lruvec;
-
- lruvec = mem_cgroup_lru_move_lists(page_zone(page),
- page, lru, lru);
list_move_tail(&page->lru, &lruvec->lists[lru]);
(*pgmoved)++;
}
@@ -309,35 +310,30 @@ void rotate_reclaimable_page(struct page *page)
}
}
-static void update_page_reclaim_stat(struct zone *zone, struct page *page,
+static void update_page_reclaim_stat(struct lruvec *lruvec,
int file, int rotated)
{
- struct zone_reclaim_stat *reclaim_stat;
-
- reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);
- if (!reclaim_stat)
- reclaim_stat = &zone->lruvec.reclaim_stat;
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
reclaim_stat->recent_scanned[file]++;
if (rotated)
reclaim_stat->recent_rotated[file]++;
}
-static void __activate_page(struct page *page, void *arg)
+static void __activate_page(struct page *page, struct lruvec *lruvec,
+ void *arg)
{
- struct zone *zone = page_zone(page);
-
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
int file = page_is_file_cache(page);
int lru = page_lru_base_type(page);
- del_page_from_lru_list(zone, page, lru);
+ del_page_from_lru_list(page, lruvec, lru);
SetPageActive(page);
lru += LRU_ACTIVE;
- add_page_to_lru_list(zone, page, lru);
- __count_vm_event(PGACTIVATE);
+ add_page_to_lru_list(page, lruvec, lru);
- update_page_reclaim_stat(zone, page, file, 1);
+ __count_vm_event(PGACTIVATE);
+ update_page_reclaim_stat(lruvec, file, 1);
}
}
@@ -374,7 +370,7 @@ void activate_page(struct page *page)
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
- __activate_page(page, NULL);
+ __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
spin_unlock_irq(&zone->lru_lock);
}
#endif
@@ -441,11 +437,13 @@ void lru_cache_add_lru(struct page *page, enum lru_list lru)
void add_page_to_unevictable_list(struct page *page)
{
struct zone *zone = page_zone(page);
+ struct lruvec *lruvec;
spin_lock_irq(&zone->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
SetPageUnevictable(page);
SetPageLRU(page);
- add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
+ add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
spin_unlock_irq(&zone->lru_lock);
}
@@ -470,11 +468,11 @@ void add_page_to_unevictable_list(struct page *page)
* be write it out by flusher threads as this is much more effective
* than the single-page writeout from reclaim.
*/
-static void lru_deactivate_fn(struct page *page, void *arg)
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
{
int lru, file;
bool active;
- struct zone *zone = page_zone(page);
if (!PageLRU(page))
return;
@@ -487,13 +485,13 @@ static void lru_deactivate_fn(struct page *page, void *arg)
return;
active = PageActive(page);
-
file = page_is_file_cache(page);
lru = page_lru_base_type(page);
- del_page_from_lru_list(zone, page, lru + active);
+
+ del_page_from_lru_list(page, lruvec, lru + active);
ClearPageActive(page);
ClearPageReferenced(page);
- add_page_to_lru_list(zone, page, lru);
+ add_page_to_lru_list(page, lruvec, lru);
if (PageWriteback(page) || PageDirty(page)) {
/*
@@ -503,19 +501,17 @@ static void lru_deactivate_fn(struct page *page, void *arg)
*/
SetPageReclaim(page);
} else {
- struct lruvec *lruvec;
/*
* The page's writeback ends up during pagevec
* We moves tha page into tail of inactive.
*/
- lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru);
list_move_tail(&page->lru, &lruvec->lists[lru]);
__count_vm_event(PGROTATED);
}
if (active)
__count_vm_event(PGDEACTIVATE);
- update_page_reclaim_stat(zone, page, file, 0);
+ update_page_reclaim_stat(lruvec, file, 0);
}
/*
@@ -615,6 +611,7 @@ void release_pages(struct page **pages, int nr, int cold)
int i;
LIST_HEAD(pages_to_free);
struct zone *zone = NULL;
+ struct lruvec *lruvec;
unsigned long uninitialized_var(flags);
for (i = 0; i < nr; i++) {
@@ -642,9 +639,11 @@ void release_pages(struct page **pages, int nr, int cold)
zone = pagezone;
spin_lock_irqsave(&zone->lru_lock, flags);
}
+
+ lruvec = mem_cgroup_page_lruvec(page, zone);
VM_BUG_ON(!PageLRU(page));
__ClearPageLRU(page);
- del_page_from_lru_list(zone, page, page_off_lru(page));
+ del_page_from_lru_list(page, lruvec, page_off_lru(page));
}
list_add(&page->lru, &pages_to_free);
@@ -676,8 +675,8 @@ EXPORT_SYMBOL(__pagevec_release);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* used by __split_huge_page_refcount() */
-void lru_add_page_tail(struct zone* zone,
- struct page *page, struct page *page_tail)
+void lru_add_page_tail(struct page *page, struct page *page_tail,
+ struct lruvec *lruvec)
{
int uninitialized_var(active);
enum lru_list lru;
@@ -686,7 +685,8 @@ void lru_add_page_tail(struct zone* zone,
VM_BUG_ON(!PageHead(page));
VM_BUG_ON(PageCompound(page_tail));
VM_BUG_ON(PageLRU(page_tail));
- VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock));
+ VM_BUG_ON(NR_CPUS != 1 &&
+ !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
SetPageLRU(page_tail);
@@ -715,20 +715,20 @@ void lru_add_page_tail(struct zone* zone,
* Use the standard add function to put page_tail on the list,
* but then correct its position so they all end up in order.
*/
- add_page_to_lru_list(zone, page_tail, lru);
+ add_page_to_lru_list(page_tail, lruvec, lru);
list_head = page_tail->lru.prev;
list_move_tail(&page_tail->lru, list_head);
}
if (!PageUnevictable(page))
- update_page_reclaim_stat(zone, page_tail, file, active);
+ update_page_reclaim_stat(lruvec, file, active);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static void __pagevec_lru_add_fn(struct page *page, void *arg)
+static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
{
enum lru_list lru = (enum lru_list)arg;
- struct zone *zone = page_zone(page);
int file = is_file_lru(lru);
int active = is_active_lru(lru);
@@ -739,8 +739,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg)
SetPageLRU(page);
if (active)
SetPageActive(page);
- add_page_to_lru_list(zone, page, lru);
- update_page_reclaim_stat(zone, page, file, active);
+ add_page_to_lru_list(page, lruvec, lru);
+ update_page_reclaim_stat(lruvec, file, active);
}
/*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 457b10baef59..71373d03fcee 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,8 @@
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);
-static DEFINE_SPINLOCK(swap_lock);
+DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
long nr_swap_pages;
long total_swap_pages;
@@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";
-static struct swap_list_t swap_list = {-1, -1};
+struct swap_list_t swap_list = {-1, -1};
-static struct swap_info_struct *swap_info[MAX_SWAPFILES];
+struct swap_info_struct *swap_info[MAX_SWAPFILES];
static DEFINE_MUTEX(swapon_mutex);
@@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
swap_list.next = p->type;
nr_swap_pages++;
p->inuse_pages--;
+ frontswap_invalidate_page(p->type, offset);
if ((p->flags & SWP_BLKDEV) &&
disk->fops->swap_slot_free_notify)
disk->fops->swap_slot_free_notify(p->bdev, offset);
@@ -985,11 +988,12 @@ static int unuse_mm(struct mm_struct *mm,
}
/*
- * Scan swap_map from current position to next entry still in use.
+ * Scan swap_map (or frontswap_map if frontswap parameter is true)
+ * from current position to next entry still in use.
* Recycle to start on reaching the end, returning 0 when empty.
*/
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
- unsigned int prev)
+ unsigned int prev, bool frontswap)
{
unsigned int max = si->max;
unsigned int i = prev;
@@ -1015,6 +1019,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
prev = 0;
i = 1;
}
+ if (frontswap) {
+ if (frontswap_test(si, i))
+ break;
+ else
+ continue;
+ }
count = si->swap_map[i];
if (count && swap_count(count) != SWAP_MAP_BAD)
break;
@@ -1026,8 +1036,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
* We completely avoid races by reading each swap page in advance,
* and then search for the process using it. All the necessary
* page table adjustments can then be made atomically.
+ *
+ * if the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * pages_to_unuse==0 means all pages; ignored if frontswap is false
*/
-static int try_to_unuse(unsigned int type)
+int try_to_unuse(unsigned int type, bool frontswap,
+ unsigned long pages_to_unuse)
{
struct swap_info_struct *si = swap_info[type];
struct mm_struct *start_mm;
@@ -1060,7 +1074,7 @@ static int try_to_unuse(unsigned int type)
* one pass through swap_map is enough, but not necessarily:
* there are races when an instance of an entry might be missed.
*/
- while ((i = find_next_to_unuse(si, i)) != 0) {
+ while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
if (signal_pending(current)) {
retval = -EINTR;
break;
@@ -1227,6 +1241,10 @@ static int try_to_unuse(unsigned int type)
* interactive performance.
*/
cond_resched();
+ if (frontswap && pages_to_unuse > 0) {
+ if (!--pages_to_unuse)
+ break;
+ }
}
mmput(start_mm);
@@ -1486,7 +1504,8 @@ bad_bmap:
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
- unsigned char *swap_map)
+ unsigned char *swap_map,
+ unsigned long *frontswap_map)
{
int i, prev;
@@ -1496,6 +1515,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
else
p->prio = --least_priority;
p->swap_map = swap_map;
+ frontswap_map_set(p, frontswap_map);
p->flags |= SWP_WRITEOK;
nr_swap_pages += p->pages;
total_swap_pages += p->pages;
@@ -1512,6 +1532,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
swap_list.head = swap_list.next = p->type;
else
swap_info[prev]->next = p->type;
+ frontswap_init(p->type);
spin_unlock(&swap_lock);
}
@@ -1585,7 +1606,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
- err = try_to_unuse(type);
+ err = try_to_unuse(type, false, 0); /* force all pages to be unused */
compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
if (err) {
@@ -1596,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
* sys_swapoff for this swap_info_struct at this point.
*/
/* re-insert swap space back into swap_list */
- enable_swap_info(p, p->prio, p->swap_map);
+ enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
goto out_dput;
}
@@ -1622,9 +1643,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
swap_map = p->swap_map;
p->swap_map = NULL;
p->flags = 0;
+ frontswap_invalidate_area(type);
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
vfree(swap_map);
+ vfree(frontswap_map_get(p));
/* Destroy swap account informatin */
swap_cgroup_swapoff(type);
@@ -1893,24 +1916,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
/*
* Find out how many pages are allowed for a single swap
- * device. There are three limiting factors: 1) the number
+ * device. There are two limiting factors: 1) the number
* of bits for the swap offset in the swp_entry_t type, and
* 2) the number of bits in the swap pte as defined by the
- * the different architectures, and 3) the number of free bits
- * in an exceptional radix_tree entry. In order to find the
+ * different architectures. In order to find the
* largest possible bit mask, a swap entry with swap type 0
* and swap offset ~0UL is created, encoded to a swap pte,
* decoded to a swp_entry_t again, and finally the swap
* offset is extracted. This will mask all the bits from
* the initial ~0UL mask that can't be encoded in either
* the swp_entry_t or the architecture definition of a
- * swap pte. Then the same is done for a radix_tree entry.
+ * swap pte.
*/
maxpages = swp_offset(pte_to_swp_entry(
- swp_entry_to_pte(swp_entry(0, ~0UL))));
- maxpages = swp_offset(radix_to_swp_entry(
- swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
-
+ swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
if (maxpages > swap_header->info.last_page) {
maxpages = swap_header->info.last_page + 1;
/* p->max is an unsigned int: don't overflow it */
@@ -1988,6 +2007,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
sector_t span;
unsigned long maxpages;
unsigned char *swap_map = NULL;
+ unsigned long *frontswap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
@@ -2071,6 +2091,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = nr_extents;
goto bad_swap;
}
+ /* frontswap enabled? set up bit-per-page map for frontswap */
+ if (frontswap_enabled)
+ frontswap_map = vzalloc(maxpages / sizeof(long));
if (p->bdev) {
if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -2086,14 +2109,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (swap_flags & SWAP_FLAG_PREFER)
prio =
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
- enable_swap_info(p, prio, swap_map);
+ enable_swap_info(p, prio, swap_map, frontswap_map);
printk(KERN_INFO "Adding %uk swap on %s. "
- "Priority:%d extents:%d across:%lluk %s%s\n",
+ "Priority:%d extents:%d across:%lluk %s%s%s\n",
p->pages<<(PAGE_SHIFT-10), name, p->prio,
nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
- (p->flags & SWP_DISCARDABLE) ? "D" : "");
+ (p->flags & SWP_DISCARDABLE) ? "D" : "",
+ (frontswap_map) ? "FS" : "");
mutex_unlock(&swapon_mutex);
atomic_inc(&proc_poll_event);
diff --git a/mm/util.c b/mm/util.c
index ae962b31de88..8c7265afa29f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,6 +4,7 @@
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
+#include <linux/security.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -341,6 +342,35 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
+unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flag, unsigned long pgoff)
+{
+ unsigned long ret;
+ struct mm_struct *mm = current->mm;
+
+ ret = security_mmap_file(file, prot, flag);
+ if (!ret) {
+ down_write(&mm->mmap_sem);
+ ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
+ up_write(&mm->mmap_sem);
+ }
+ return ret;
+}
+
+unsigned long vm_mmap(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flag, unsigned long offset)
+{
+ if (unlikely(offset + PAGE_ALIGN(len) < offset))
+ return -EINVAL;
+ if (unlikely(offset & ~PAGE_MASK))
+ return -EINVAL;
+
+ return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(vm_mmap);
+
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8deb5f4da4d9..347b3ff2a478 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -78,6 +78,9 @@ struct scan_control {
int order;
+ /* Scan (total_size >> priority) pages at once */
+ int priority;
+
/*
* The memory cgroup that hit its limit and as a result is the
* primary target of this reclaim invocation.
@@ -91,11 +94,6 @@ struct scan_control {
nodemask_t *nodemask;
};
-struct mem_cgroup_zone {
- struct mem_cgroup *mem_cgroup;
- struct zone *zone;
-};
-
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
#ifdef ARCH_HAS_PREFETCH
@@ -147,24 +145,14 @@ static bool global_reclaim(struct scan_control *sc)
}
#endif
-static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
-{
- return &mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup)->reclaim_stat;
-}
-
-static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
- enum lru_list lru)
+static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
if (!mem_cgroup_disabled())
- return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
- zone_to_nid(mz->zone),
- zone_idx(mz->zone),
- BIT(lru));
+ return mem_cgroup_get_lru_size(lruvec, lru);
- return zone_page_state(mz->zone, NR_LRU_BASE + lru);
+ return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
}
-
/*
* Add a shrinker callback to be called from the vm
*/
@@ -626,7 +614,6 @@ enum page_references {
};
static enum page_references page_check_references(struct page *page,
- struct mem_cgroup_zone *mz,
struct scan_control *sc)
{
int referenced_ptes, referenced_page;
@@ -685,9 +672,8 @@ static enum page_references page_check_references(struct page *page,
* shrink_page_list() returns the number of reclaimed pages
*/
static unsigned long shrink_page_list(struct list_head *page_list,
- struct mem_cgroup_zone *mz,
+ struct zone *zone,
struct scan_control *sc,
- int priority,
unsigned long *ret_nr_dirty,
unsigned long *ret_nr_writeback)
{
@@ -716,7 +702,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep;
VM_BUG_ON(PageActive(page));
- VM_BUG_ON(page_zone(page) != mz->zone);
+ VM_BUG_ON(page_zone(page) != zone);
sc->nr_scanned++;
@@ -739,7 +725,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep;
}
- references = page_check_references(page, mz, sc);
+ references = page_check_references(page, sc);
switch (references) {
case PAGEREF_ACTIVATE:
goto activate_locked;
@@ -790,7 +776,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* unless under significant pressure.
*/
if (page_is_file_cache(page) &&
- (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
+ (!current_is_kswapd() ||
+ sc->priority >= DEF_PRIORITY - 2)) {
/*
* Immediately reclaim when written back.
* Similar in principal to deactivate_page()
@@ -928,7 +915,7 @@ keep:
* will encounter the same problem
*/
if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
- zone_set_flag(mz->zone, ZONE_CONGESTED);
+ zone_set_flag(zone, ZONE_CONGESTED);
free_hot_cold_page_list(&free_pages, 1);
@@ -949,29 +936,14 @@ keep:
*
* returns 0 on success, -ve errno on failure.
*/
-int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode)
{
- bool all_lru_mode;
int ret = -EINVAL;
/* Only take pages on the LRU. */
if (!PageLRU(page))
return ret;
- all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
- (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
-
- /*
- * When checking the active state, we need to be sure we are
- * dealing with comparible boolean values. Take the logical not
- * of each.
- */
- if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
- return ret;
-
- if (!all_lru_mode && !!page_is_file_cache(page) != file)
- return ret;
-
/* Do not give back unevictable pages for compaction */
if (PageUnevictable(page))
return ret;
@@ -1039,47 +1011,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
* Appropriate locks must be held before calling this function.
*
* @nr_to_scan: The number of pages to look through on the list.
- * @mz: The mem_cgroup_zone to pull pages from.
+ * @lruvec: The LRU vector to pull pages from.
* @dst: The temp list to put pages on to.
* @nr_scanned: The number of pages that were scanned.
* @sc: The scan_control struct for this reclaim session
* @mode: One of the LRU isolation modes
- * @active: True [1] if isolating active pages
- * @file: True [1] if isolating file [!anon] pages
+ * @lru: LRU list id for isolating
*
* returns how many pages were moved onto *@dst.
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
- struct mem_cgroup_zone *mz, struct list_head *dst,
+ struct lruvec *lruvec, struct list_head *dst,
unsigned long *nr_scanned, struct scan_control *sc,
- isolate_mode_t mode, int active, int file)
+ isolate_mode_t mode, enum lru_list lru)
{
- struct lruvec *lruvec;
- struct list_head *src;
+ struct list_head *src = &lruvec->lists[lru];
unsigned long nr_taken = 0;
unsigned long scan;
- int lru = LRU_BASE;
-
- lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
- if (active)
- lru += LRU_ACTIVE;
- if (file)
- lru += LRU_FILE;
- src = &lruvec->lists[lru];
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
struct page *page;
+ int nr_pages;
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
VM_BUG_ON(!PageLRU(page));
- switch (__isolate_lru_page(page, mode, file)) {
+ switch (__isolate_lru_page(page, mode)) {
case 0:
- mem_cgroup_lru_del(page);
+ nr_pages = hpage_nr_pages(page);
+ mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
list_move(&page->lru, dst);
- nr_taken += hpage_nr_pages(page);
+ nr_taken += nr_pages;
break;
case -EBUSY:
@@ -1093,11 +1057,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
}
*nr_scanned = scan;
-
- trace_mm_vmscan_lru_isolate(sc->order,
- nr_to_scan, scan,
- nr_taken,
- mode, file);
+ trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
+ nr_taken, mode, is_file_lru(lru));
return nr_taken;
}
@@ -1134,15 +1095,16 @@ int isolate_lru_page(struct page *page)
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
+ struct lruvec *lruvec;
spin_lock_irq(&zone->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
if (PageLRU(page)) {
int lru = page_lru(page);
- ret = 0;
get_page(page);
ClearPageLRU(page);
-
- del_page_from_lru_list(zone, page, lru);
+ del_page_from_lru_list(page, lruvec, lru);
+ ret = 0;
}
spin_unlock_irq(&zone->lru_lock);
}
@@ -1175,11 +1137,10 @@ static int too_many_isolated(struct zone *zone, int file,
}
static noinline_for_stack void
-putback_inactive_pages(struct mem_cgroup_zone *mz,
- struct list_head *page_list)
+putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
{
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
- struct zone *zone = mz->zone;
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+ struct zone *zone = lruvec_zone(lruvec);
LIST_HEAD(pages_to_free);
/*
@@ -1197,9 +1158,13 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
spin_lock_irq(&zone->lru_lock);
continue;
}
+
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+
SetPageLRU(page);
lru = page_lru(page);
- add_page_to_lru_list(zone, page, lru);
+ add_page_to_lru_list(page, lruvec, lru);
+
if (is_active_lru(lru)) {
int file = is_file_lru(lru);
int numpages = hpage_nr_pages(page);
@@ -1208,7 +1173,7 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
- del_page_from_lru_list(zone, page, lru);
+ del_page_from_lru_list(page, lruvec, lru);
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&zone->lru_lock);
@@ -1225,71 +1190,24 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
list_splice(&pages_to_free, page_list);
}
-static noinline_for_stack void
-update_isolated_counts(struct mem_cgroup_zone *mz,
- struct list_head *page_list,
- unsigned long *nr_anon,
- unsigned long *nr_file)
-{
- struct zone *zone = mz->zone;
- unsigned int count[NR_LRU_LISTS] = { 0, };
- unsigned long nr_active = 0;
- struct page *page;
- int lru;
-
- /*
- * Count pages and clear active flags
- */
- list_for_each_entry(page, page_list, lru) {
- int numpages = hpage_nr_pages(page);
- lru = page_lru_base_type(page);
- if (PageActive(page)) {
- lru += LRU_ACTIVE;
- ClearPageActive(page);
- nr_active += numpages;
- }
- count[lru] += numpages;
- }
-
- preempt_disable();
- __count_vm_events(PGDEACTIVATE, nr_active);
-
- __mod_zone_page_state(zone, NR_ACTIVE_FILE,
- -count[LRU_ACTIVE_FILE]);
- __mod_zone_page_state(zone, NR_INACTIVE_FILE,
- -count[LRU_INACTIVE_FILE]);
- __mod_zone_page_state(zone, NR_ACTIVE_ANON,
- -count[LRU_ACTIVE_ANON]);
- __mod_zone_page_state(zone, NR_INACTIVE_ANON,
- -count[LRU_INACTIVE_ANON]);
-
- *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
- *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
-
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
- preempt_enable();
-}
-
/*
* shrink_inactive_list() is a helper for shrink_zone(). It returns the number
* of reclaimed pages
*/
static noinline_for_stack unsigned long
-shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
- struct scan_control *sc, int priority, int file)
+shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, enum lru_list lru)
{
LIST_HEAD(page_list);
unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
unsigned long nr_taken;
- unsigned long nr_anon;
- unsigned long nr_file;
unsigned long nr_dirty = 0;
unsigned long nr_writeback = 0;
- isolate_mode_t isolate_mode = ISOLATE_INACTIVE;
- struct zone *zone = mz->zone;
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
+ isolate_mode_t isolate_mode = 0;
+ int file = is_file_lru(lru);
+ struct zone *zone = lruvec_zone(lruvec);
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
while (unlikely(too_many_isolated(zone, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1308,31 +1226,30 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
spin_lock_irq(&zone->lru_lock);
- nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned,
- sc, isolate_mode, 0, file);
+ nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
+ &nr_scanned, sc, isolate_mode, lru);
+
+ __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+
if (global_reclaim(sc)) {
zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
- __count_zone_vm_events(PGSCAN_KSWAPD, zone,
- nr_scanned);
+ __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
else
- __count_zone_vm_events(PGSCAN_DIRECT, zone,
- nr_scanned);
+ __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
}
spin_unlock_irq(&zone->lru_lock);
if (nr_taken == 0)
return 0;
- update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
-
- nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
+ nr_reclaimed = shrink_page_list(&page_list, zone, sc,
&nr_dirty, &nr_writeback);
spin_lock_irq(&zone->lru_lock);
- reclaim_stat->recent_scanned[0] += nr_anon;
- reclaim_stat->recent_scanned[1] += nr_file;
+ reclaim_stat->recent_scanned[file] += nr_taken;
if (global_reclaim(sc)) {
if (current_is_kswapd())
@@ -1343,10 +1260,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
nr_reclaimed);
}
- putback_inactive_pages(mz, &page_list);
+ putback_inactive_pages(lruvec, &page_list);
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
@@ -1375,13 +1291,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
* DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
* isolated page is PageWriteback
*/
- if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
+ if (nr_writeback && nr_writeback >=
+ (nr_taken >> (DEF_PRIORITY - sc->priority)))
wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
zone_idx(zone),
nr_scanned, nr_reclaimed,
- priority,
+ sc->priority,
trace_shrink_flags(file));
return nr_reclaimed;
}
@@ -1404,30 +1321,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
* But we had to alter page->flags anyway.
*/
-static void move_active_pages_to_lru(struct zone *zone,
+static void move_active_pages_to_lru(struct lruvec *lruvec,
struct list_head *list,
struct list_head *pages_to_free,
enum lru_list lru)
{
+ struct zone *zone = lruvec_zone(lruvec);
unsigned long pgmoved = 0;
struct page *page;
+ int nr_pages;
while (!list_empty(list)) {
- struct lruvec *lruvec;
-
page = lru_to_page(list);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
- lruvec = mem_cgroup_lru_add_list(zone, page, lru);
+ nr_pages = hpage_nr_pages(page);
+ mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
list_move(&page->lru, &lruvec->lists[lru]);
- pgmoved += hpage_nr_pages(page);
+ pgmoved += nr_pages;
if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
- del_page_from_lru_list(zone, page, lru);
+ del_page_from_lru_list(page, lruvec, lru);
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&zone->lru_lock);
@@ -1443,9 +1362,9 @@ static void move_active_pages_to_lru(struct zone *zone,
}
static void shrink_active_list(unsigned long nr_to_scan,
- struct mem_cgroup_zone *mz,
+ struct lruvec *lruvec,
struct scan_control *sc,
- int priority, int file)
+ enum lru_list lru)
{
unsigned long nr_taken;
unsigned long nr_scanned;
@@ -1454,10 +1373,11 @@ static void shrink_active_list(unsigned long nr_to_scan,
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
struct page *page;
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
unsigned long nr_rotated = 0;
- isolate_mode_t isolate_mode = ISOLATE_ACTIVE;
- struct zone *zone = mz->zone;
+ isolate_mode_t isolate_mode = 0;
+ int file = is_file_lru(lru);
+ struct zone *zone = lruvec_zone(lruvec);
lru_add_drain();
@@ -1468,18 +1388,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
spin_lock_irq(&zone->lru_lock);
- nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc,
- isolate_mode, 1, file);
+ nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
+ &nr_scanned, sc, isolate_mode, lru);
if (global_reclaim(sc))
zone->pages_scanned += nr_scanned;
reclaim_stat->recent_scanned[file] += nr_taken;
__count_zone_vm_events(PGREFILL, zone, nr_scanned);
- if (file)
- __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
- else
- __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
+ __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
spin_unlock_irq(&zone->lru_lock);
@@ -1535,10 +1452,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
- move_active_pages_to_lru(zone, &l_active, &l_hold,
- LRU_ACTIVE + file * LRU_FILE);
- move_active_pages_to_lru(zone, &l_inactive, &l_hold,
- LRU_BASE + file * LRU_FILE);
+ move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
+ move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
@@ -1561,13 +1476,12 @@ static int inactive_anon_is_low_global(struct zone *zone)
/**
* inactive_anon_is_low - check if anonymous pages need to be deactivated
- * @zone: zone to check
- * @sc: scan control of this context
+ * @lruvec: LRU vector to check
*
* Returns true if the zone does not have enough inactive anon pages,
* meaning some active anon pages need to be deactivated.
*/
-static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
+static int inactive_anon_is_low(struct lruvec *lruvec)
{
/*
* If we don't have swap space, anonymous page deactivation
@@ -1577,13 +1491,12 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
return 0;
if (!mem_cgroup_disabled())
- return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
- mz->zone);
+ return mem_cgroup_inactive_anon_is_low(lruvec);
- return inactive_anon_is_low_global(mz->zone);
+ return inactive_anon_is_low_global(lruvec_zone(lruvec));
}
#else
-static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz)
+static inline int inactive_anon_is_low(struct lruvec *lruvec)
{
return 0;
}
@@ -1601,7 +1514,7 @@ static int inactive_file_is_low_global(struct zone *zone)
/**
* inactive_file_is_low - check if file pages need to be deactivated
- * @mz: memory cgroup and zone to check
+ * @lruvec: LRU vector to check
*
* When the system is doing streaming IO, memory pressure here
* ensures that active file pages get deactivated, until more
@@ -1613,44 +1526,39 @@ static int inactive_file_is_low_global(struct zone *zone)
* This uses a different ratio than the anonymous pages, because
* the page cache uses a use-once replacement algorithm.
*/
-static int inactive_file_is_low(struct mem_cgroup_zone *mz)
+static int inactive_file_is_low(struct lruvec *lruvec)
{
if (!mem_cgroup_disabled())
- return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
- mz->zone);
+ return mem_cgroup_inactive_file_is_low(lruvec);
- return inactive_file_is_low_global(mz->zone);
+ return inactive_file_is_low_global(lruvec_zone(lruvec));
}
-static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file)
+static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
{
- if (file)
- return inactive_file_is_low(mz);
+ if (is_file_lru(lru))
+ return inactive_file_is_low(lruvec);
else
- return inactive_anon_is_low(mz);
+ return inactive_anon_is_low(lruvec);
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
- struct mem_cgroup_zone *mz,
- struct scan_control *sc, int priority)
+ struct lruvec *lruvec, struct scan_control *sc)
{
- int file = is_file_lru(lru);
-
if (is_active_lru(lru)) {
- if (inactive_list_is_low(mz, file))
- shrink_active_list(nr_to_scan, mz, sc, priority, file);
+ if (inactive_list_is_low(lruvec, lru))
+ shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
- return shrink_inactive_list(nr_to_scan, mz, sc, priority, file);
+ return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}
-static int vmscan_swappiness(struct mem_cgroup_zone *mz,
- struct scan_control *sc)
+static int vmscan_swappiness(struct scan_control *sc)
{
if (global_reclaim(sc))
return vm_swappiness;
- return mem_cgroup_swappiness(mz->mem_cgroup);
+ return mem_cgroup_swappiness(sc->target_mem_cgroup);
}
/*
@@ -1662,17 +1570,18 @@ static int vmscan_swappiness(struct mem_cgroup_zone *mz,
* nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
-static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
- unsigned long *nr, int priority)
+static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ unsigned long *nr)
{
unsigned long anon, file, free;
unsigned long anon_prio, file_prio;
unsigned long ap, fp;
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2], denominator;
enum lru_list lru;
int noswap = 0;
bool force_scan = false;
+ struct zone *zone = lruvec_zone(lruvec);
/*
* If the zone or memcg is small, nr[l] can be 0. This
@@ -1684,7 +1593,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
* latencies, so it's better to scan a minimum amount there as
* well.
*/
- if (current_is_kswapd() && mz->zone->all_unreclaimable)
+ if (current_is_kswapd() && zone->all_unreclaimable)
force_scan = true;
if (!global_reclaim(sc))
force_scan = true;
@@ -1698,16 +1607,16 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
goto out;
}
- anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) +
- zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
- file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) +
- zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
+ anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
+ get_lru_size(lruvec, LRU_INACTIVE_ANON);
+ file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
+ get_lru_size(lruvec, LRU_INACTIVE_FILE);
if (global_reclaim(sc)) {
- free = zone_page_state(mz->zone, NR_FREE_PAGES);
+ free = zone_page_state(zone, NR_FREE_PAGES);
/* If we have very few page cache pages,
force-scan anon pages. */
- if (unlikely(file + free <= high_wmark_pages(mz->zone))) {
+ if (unlikely(file + free <= high_wmark_pages(zone))) {
fraction[0] = 1;
fraction[1] = 0;
denominator = 1;
@@ -1719,8 +1628,8 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
* With swappiness at 100, anonymous and file have the same priority.
* This scanning priority is essentially the inverse of IO cost.
*/
- anon_prio = vmscan_swappiness(mz, sc);
- file_prio = 200 - vmscan_swappiness(mz, sc);
+ anon_prio = vmscan_swappiness(sc);
+ file_prio = 200 - anon_prio;
/*
* OK, so we have swap space and a fair amount of page cache
@@ -1733,7 +1642,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
*
* anon in [0], file in [1]
*/
- spin_lock_irq(&mz->zone->lru_lock);
+ spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
reclaim_stat->recent_scanned[0] /= 2;
reclaim_stat->recent_rotated[0] /= 2;
@@ -1754,7 +1663,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;
- spin_unlock_irq(&mz->zone->lru_lock);
+ spin_unlock_irq(&zone->lru_lock);
fraction[0] = ap;
fraction[1] = fp;
@@ -1764,9 +1673,9 @@ out:
int file = is_file_lru(lru);
unsigned long scan;
- scan = zone_nr_lru_pages(mz, lru);
- if (priority || noswap || !vmscan_swappiness(mz, sc)) {
- scan >>= priority;
+ scan = get_lru_size(lruvec, lru);
+ if (sc->priority || noswap || !vmscan_swappiness(sc)) {
+ scan >>= sc->priority;
if (!scan && force_scan)
scan = SWAP_CLUSTER_MAX;
scan = div64_u64(scan * fraction[file], denominator);
@@ -1776,11 +1685,11 @@ out:
}
/* Use reclaim/compaction for costly allocs or under memory pressure */
-static bool in_reclaim_compaction(int priority, struct scan_control *sc)
+static bool in_reclaim_compaction(struct scan_control *sc)
{
if (COMPACTION_BUILD && sc->order &&
(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
- priority < DEF_PRIORITY - 2))
+ sc->priority < DEF_PRIORITY - 2))
return true;
return false;
@@ -1793,17 +1702,16 @@ static bool in_reclaim_compaction(int priority, struct scan_control *sc)
* calls try_to_compact_zone() that it will have enough free pages to succeed.
* It will give up earlier than that if there is difficulty reclaiming pages.
*/
-static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
+static inline bool should_continue_reclaim(struct lruvec *lruvec,
unsigned long nr_reclaimed,
unsigned long nr_scanned,
- int priority,
struct scan_control *sc)
{
unsigned long pages_for_compaction;
unsigned long inactive_lru_pages;
/* If not in reclaim/compaction mode, stop */
- if (!in_reclaim_compaction(priority, sc))
+ if (!in_reclaim_compaction(sc))
return false;
/* Consider stopping depending on scan and reclaim activity */
@@ -1834,15 +1742,15 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = (2UL << sc->order);
- inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
+ inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
if (nr_swap_pages > 0)
- inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
+ inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
if (sc->nr_reclaimed < pages_for_compaction &&
inactive_lru_pages > pages_for_compaction)
return true;
/* If compaction would go ahead or the allocation would succeed, stop */
- switch (compaction_suitable(mz->zone, sc->order)) {
+ switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) {
case COMPACT_PARTIAL:
case COMPACT_CONTINUE:
return false;
@@ -1854,8 +1762,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
- struct scan_control *sc)
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
@@ -1867,7 +1774,7 @@ static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
restart:
nr_reclaimed = 0;
nr_scanned = sc->nr_scanned;
- get_scan_count(mz, sc, nr, priority);
+ get_scan_count(lruvec, sc, nr);
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1879,7 +1786,7 @@ restart:
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
- mz, sc, priority);
+ lruvec, sc);
}
}
/*
@@ -1890,7 +1797,8 @@ restart:
* with multiple processes reclaiming pages, the total
* freeing target can get unreasonably large.
*/
- if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
+ if (nr_reclaimed >= nr_to_reclaim &&
+ sc->priority < DEF_PRIORITY)
break;
}
blk_finish_plug(&plug);
@@ -1900,36 +1808,33 @@ restart:
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_anon_is_low(mz))
- shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0);
+ if (inactive_anon_is_low(lruvec))
+ shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+ sc, LRU_ACTIVE_ANON);
/* reclaim/compaction might need reclaim to continue */
- if (should_continue_reclaim(mz, nr_reclaimed,
- sc->nr_scanned - nr_scanned,
- priority, sc))
+ if (should_continue_reclaim(lruvec, nr_reclaimed,
+ sc->nr_scanned - nr_scanned, sc))
goto restart;
throttle_vm_writeout(sc->gfp_mask);
}
-static void shrink_zone(int priority, struct zone *zone,
- struct scan_control *sc)
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
{
struct mem_cgroup *root = sc->target_mem_cgroup;
struct mem_cgroup_reclaim_cookie reclaim = {
.zone = zone,
- .priority = priority,
+ .priority = sc->priority,
};
struct mem_cgroup *memcg;
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
- struct mem_cgroup_zone mz = {
- .mem_cgroup = memcg,
- .zone = zone,
- };
+ struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+
+ shrink_lruvec(lruvec, sc);
- shrink_mem_cgroup_zone(priority, &mz, sc);
/*
* Limit reclaim has historically picked one memcg and
* scanned it with decreasing priority levels until
@@ -2005,8 +1910,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
* the caller that it should consider retrying the allocation instead of
* further reclaim.
*/
-static bool shrink_zones(int priority, struct zonelist *zonelist,
- struct scan_control *sc)
+static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
@@ -2033,7 +1937,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
if (global_reclaim(sc)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable &&
+ sc->priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
if (COMPACTION_BUILD) {
/*
@@ -2065,7 +1970,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
/* need some check for avoid more shrink_zone() */
}
- shrink_zone(priority, zone, sc);
+ shrink_zone(zone, sc);
}
return aborted_reclaim;
@@ -2116,7 +2021,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc,
struct shrink_control *shrink)
{
- int priority;
unsigned long total_scanned = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct zoneref *z;
@@ -2129,9 +2033,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
if (global_reclaim(sc))
count_vm_event(ALLOCSTALL);
- for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+ do {
sc->nr_scanned = 0;
- aborted_reclaim = shrink_zones(priority, zonelist, sc);
+ aborted_reclaim = shrink_zones(zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
@@ -2173,7 +2077,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
/* Take a nap, wait for some writeback to complete */
if (!sc->hibernation_mode && sc->nr_scanned &&
- priority < DEF_PRIORITY - 2) {
+ sc->priority < DEF_PRIORITY - 2) {
struct zone *preferred_zone;
first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
@@ -2181,7 +2085,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
&preferred_zone);
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
}
- }
+ } while (--sc->priority >= 0);
out:
delayacct_freepages_end();
@@ -2219,6 +2123,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.may_unmap = 1,
.may_swap = 1,
.order = order,
+ .priority = DEF_PRIORITY,
.target_mem_cgroup = NULL,
.nodemask = nodemask,
};
@@ -2251,17 +2156,15 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !noswap,
.order = 0,
+ .priority = 0,
.target_mem_cgroup = memcg,
};
- struct mem_cgroup_zone mz = {
- .mem_cgroup = memcg,
- .zone = zone,
- };
+ struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
+ trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
sc.may_writepage,
sc.gfp_mask);
@@ -2272,7 +2175,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_mem_cgroup_zone(0, &mz, &sc);
+ shrink_lruvec(lruvec, &sc);
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
@@ -2293,6 +2196,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_swap = !noswap,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.order = 0,
+ .priority = DEF_PRIORITY,
.target_mem_cgroup = memcg,
.nodemask = NULL, /* we don't care the placement */
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2323,8 +2227,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
}
#endif
-static void age_active_anon(struct zone *zone, struct scan_control *sc,
- int priority)
+static void age_active_anon(struct zone *zone, struct scan_control *sc)
{
struct mem_cgroup *memcg;
@@ -2333,14 +2236,11 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc,
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
- struct mem_cgroup_zone mz = {
- .mem_cgroup = memcg,
- .zone = zone,
- };
+ struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- if (inactive_anon_is_low(&mz))
- shrink_active_list(SWAP_CLUSTER_MAX, &mz,
- sc, priority, 0);
+ if (inactive_anon_is_low(lruvec))
+ shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+ sc, LRU_ACTIVE_ANON);
memcg = mem_cgroup_iter(NULL, memcg, NULL);
} while (memcg);
@@ -2449,7 +2349,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
{
int all_zones_ok;
unsigned long balanced;
- int priority;
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long total_scanned;
@@ -2473,11 +2372,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
};
loop_again:
total_scanned = 0;
+ sc.priority = DEF_PRIORITY;
sc.nr_reclaimed = 0;
sc.may_writepage = !laptop_mode;
count_vm_event(PAGEOUTRUN);
- for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+ do {
unsigned long lru_pages = 0;
int has_under_min_watermark_zone = 0;
@@ -2494,14 +2394,15 @@ loop_again:
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable &&
+ sc.priority != DEF_PRIORITY)
continue;
/*
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming.
*/
- age_active_anon(zone, &sc, priority);
+ age_active_anon(zone, &sc);
/*
* If the number of buffer_heads in the machine
@@ -2549,7 +2450,8 @@ loop_again:
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable &&
+ sc.priority != DEF_PRIORITY)
continue;
sc.nr_scanned = 0;
@@ -2593,7 +2495,7 @@ loop_again:
!zone_watermark_ok_safe(zone, testorder,
high_wmark_pages(zone) + balance_gap,
end_zone, 0)) {
- shrink_zone(priority, zone, &sc);
+ shrink_zone(zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
@@ -2650,7 +2552,7 @@ loop_again:
* OK, kswapd is getting into trouble. Take a nap, then take
* another pass across the zones.
*/
- if (total_scanned && (priority < DEF_PRIORITY - 2)) {
+ if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
if (has_under_min_watermark_zone)
count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
else
@@ -2665,7 +2567,7 @@ loop_again:
*/
if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
break;
- }
+ } while (--sc.priority >= 0);
out:
/*
@@ -2715,7 +2617,8 @@ out:
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable &&
+ sc.priority != DEF_PRIORITY)
continue;
/* Would compaction fail due to lack of free memory? */
@@ -2786,7 +2689,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
* them before going back to sleep.
*/
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
- schedule();
+
+ if (!kthread_should_stop())
+ schedule();
+
set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
} else {
if (remaining)
@@ -2982,6 +2888,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
.nr_to_reclaim = nr_to_reclaim,
.hibernation_mode = 1,
.order = 0,
+ .priority = DEF_PRIORITY,
};
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
@@ -3052,14 +2959,17 @@ int kswapd_run(int nid)
}
/*
- * Called by memory hotplug when all memory in a node is offlined.
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold lock_memory_hotplug().
*/
void kswapd_stop(int nid)
{
struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
- if (kswapd)
+ if (kswapd) {
kthread_stop(kswapd);
+ NODE_DATA(nid)->kswapd = NULL;
+ }
}
static int __init kswapd_init(void)
@@ -3159,7 +3069,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
struct reclaim_state reclaim_state;
- int priority;
struct scan_control sc = {
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -3168,6 +3077,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
SWAP_CLUSTER_MAX),
.gfp_mask = gfp_mask,
.order = order,
+ .priority = ZONE_RECLAIM_PRIORITY,
};
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
@@ -3190,11 +3100,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* Free memory by calling shrink zone with increasing
* priorities until we have enough memory freed.
*/
- priority = ZONE_RECLAIM_PRIORITY;
do {
- shrink_zone(priority, zone, &sc);
- priority--;
- } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
+ shrink_zone(zone, &sc);
+ } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -3345,6 +3253,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
+ lruvec = mem_cgroup_page_lruvec(page, zone);
if (!PageLRU(page) || !PageUnevictable(page))
continue;
@@ -3354,11 +3263,8 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
VM_BUG_ON(PageActive(page));
ClearPageUnevictable(page);
- __dec_zone_state(zone, NR_UNEVICTABLE);
- lruvec = mem_cgroup_lru_move_lists(zone, page,
- LRU_UNEVICTABLE, lru);
- list_move(&page->lru, &lruvec->lists[lru]);
- __inc_zone_state(zone, NR_INACTIVE_ANON + lru);
+ del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
+ add_page_to_lru_list(page, lruvec, lru);
pgrescued++;
}
}