diff options
author | Steven Rostedt (Red Hat) <rostedt@goodmis.org> | 2014-10-28 16:38:11 -0400 |
---|---|---|
committer | Steven Rostedt <rostedt@goodmis.org> | 2014-10-28 16:38:11 -0400 |
commit | c1a3bd301a943469f9b575edb26e195a9c987390 (patch) | |
tree | 83ce790eb3ffc8e73a45c9966317fbe09cfa7947 /mm | |
parent | 77014e82b50da13ab801c9cfeb5cb8c29881cc9c (diff) | |
parent | b0807bc10a6ac95ab8bf3bbf57703a0f2edd9aa9 (diff) | |
download | linux-rt-c1a3bd301a943469f9b575edb26e195a9c987390.tar.gz |
Merge tag 'v3.12.30' into v3.12-rt
This is the 3.12.30 stable release
Conflicts:
fs/exec.c
include/linux/radix-tree.h
mm/filemap.c
mm/page_alloc.c
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/compaction.c | 347 | ||||
-rw-r--r-- | mm/filemap.c | 470 | ||||
-rw-r--r-- | mm/fremap.c | 28 | ||||
-rw-r--r-- | mm/frontswap.c | 13 | ||||
-rw-r--r-- | mm/huge_memory.c | 93 | ||||
-rw-r--r-- | mm/hugetlb.c | 17 | ||||
-rw-r--r-- | mm/internal.h | 22 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memory-failure.c | 4 | ||||
-rw-r--r-- | mm/memory.c | 4 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 2 | ||||
-rw-r--r-- | mm/mempolicy.c | 16 | ||||
-rw-r--r-- | mm/migrate.c | 56 | ||||
-rw-r--r-- | mm/mincore.c | 20 | ||||
-rw-r--r-- | mm/mmap.c | 55 | ||||
-rw-r--r-- | mm/nommu.c | 24 | ||||
-rw-r--r-- | mm/page_alloc.c | 423 | ||||
-rw-r--r-- | mm/readahead.c | 37 | ||||
-rw-r--r-- | mm/shmem.c | 133 | ||||
-rw-r--r-- | mm/slab.c | 12 | ||||
-rw-r--r-- | mm/slub.c | 16 | ||||
-rw-r--r-- | mm/swap.c | 101 | ||||
-rw-r--r-- | mm/swap_state.c | 65 | ||||
-rw-r--r-- | mm/swapfile.c | 224 | ||||
-rw-r--r-- | mm/truncate.c | 74 | ||||
-rw-r--r-- | mm/vmacache.c | 114 | ||||
-rw-r--r-- | mm/vmalloc.c | 6 | ||||
-rw-r--r-- | mm/vmscan.c | 144 | ||||
-rw-r--r-- | mm/vmstat.c | 13 |
30 files changed, 1617 insertions, 920 deletions
diff --git a/mm/Makefile b/mm/Makefile index 305d10acd081..fb51bc61d80a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o balloon_compaction.o \ + compaction.o balloon_compaction.o vmacache.o \ interval_tree.o list_lru.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/compaction.c b/mm/compaction.c index 6441083e76d3..adb6d0560e96 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone) unsigned long end_pfn = zone_end_pfn(zone); unsigned long pfn; - zone->compact_cached_migrate_pfn = start_pfn; + zone->compact_cached_migrate_pfn[0] = start_pfn; + zone->compact_cached_migrate_pfn[1] = start_pfn; zone->compact_cached_free_pfn = end_pfn; zone->compact_blockskip_flush = false; @@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat) */ static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + bool set_unsuitable, bool migrate_scanner) { struct zone *zone = cc->zone; + unsigned long pfn; if (cc->ignore_skip_hint) return; @@ -141,20 +143,32 @@ static void update_pageblock_skip(struct compact_control *cc, if (!page) return; - if (!nr_isolated) { - unsigned long pfn = page_to_pfn(page); + if (nr_isolated) + return; + + /* + * Only skip pageblocks when all forms of compaction will be known to + * fail in the near future. + */ + if (set_unsuitable) set_pageblock_skip(page); - /* Update where compaction should restart */ - if (migrate_scanner) { - if (!cc->finished_update_migrate && - pfn > zone->compact_cached_migrate_pfn) - zone->compact_cached_migrate_pfn = pfn; - } else { - if (!cc->finished_update_free && - pfn < zone->compact_cached_free_pfn) - zone->compact_cached_free_pfn = pfn; - } + pfn = page_to_pfn(page); + + /* Update where async and sync compaction should restart */ + if (migrate_scanner) { + if (cc->finished_update_migrate) + return; + if (pfn > zone->compact_cached_migrate_pfn[0]) + zone->compact_cached_migrate_pfn[0] = pfn; + if (cc->mode != MIGRATE_ASYNC && + pfn > zone->compact_cached_migrate_pfn[1]) + zone->compact_cached_migrate_pfn[1] = pfn; + } else { + if (cc->finished_update_free) + return; + if (pfn < zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = pfn; } } #else @@ -166,7 +180,7 @@ static inline bool isolation_suitable(struct compact_control *cc, static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + bool set_unsuitable, bool migrate_scanner) { } #endif /* CONFIG_COMPACTION */ @@ -195,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, } /* async aborts if taking too long or contended */ - if (!cc->sync) { + if (cc->mode == MIGRATE_ASYNC) { cc->contended = true; return false; } @@ -208,30 +222,39 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, return true; } -static inline bool compact_trylock_irqsave(spinlock_t *lock, - unsigned long *flags, struct compact_control *cc) +/* + * Aside from avoiding lock contention, compaction also periodically checks + * need_resched() and either schedules in sync compaction or aborts async + * compaction. This is similar to what compact_checklock_irqsave() does, but + * is used where no lock is concerned. + * + * Returns false when no scheduling was needed, or sync compaction scheduled. + * Returns true when async compaction should abort. + */ +static inline bool compact_should_abort(struct compact_control *cc) { - return compact_checklock_irqsave(lock, flags, false, cc); + /* async compaction aborts if contended */ + if (need_resched()) { + if (cc->mode == MIGRATE_ASYNC) { + cc->contended = true; + return true; + } + + cond_resched(); + } + + return false; } /* Returns true if the page is within a block suitable for migration to */ static bool suitable_migration_target(struct page *page) { - int migratetype = get_pageblock_migratetype(page); - - /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ - if (migratetype == MIGRATE_RESERVE) - return false; - - if (is_migrate_isolate(migratetype)) - return false; - - /* If the page is a large free page, then allow migration */ + /* If the page is a large free page, then disallow migration */ if (PageBuddy(page) && page_order(page) >= pageblock_order) - return true; + return false; /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(migratetype)) + if (migrate_async_suitable(get_pageblock_migratetype(page))) return true; /* Otherwise skip the block */ @@ -254,6 +277,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, struct page *cursor, *valid_page = NULL; unsigned long flags; bool locked = false; + bool checked_pageblock = false; cursor = pfn_to_page(blockpfn); @@ -285,8 +309,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, break; /* Recheck this is a suitable migration target under lock */ - if (!strict && !suitable_migration_target(page)) - break; + if (!strict && !checked_pageblock) { + /* + * We need to check suitability of pageblock only once + * and this isolate_freepages_block() is called with + * pageblock range, so just check once is sufficient. + */ + checked_pageblock = true; + if (!suitable_migration_target(page)) + break; + } /* Recheck this is a buddy page under lock */ if (!PageBuddy(page)) @@ -330,7 +362,8 @@ isolate_fail: /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) - update_pageblock_skip(cc, valid_page, total_isolated, false); + update_pageblock_skip(cc, valid_page, total_isolated, true, + false); count_compact_events(COMPACTFREE_SCANNED, nr_scanned); if (total_isolated) @@ -461,11 +494,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, unsigned long last_pageblock_nr = 0, pageblock_nr; unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; - isolate_mode_t mode = 0; struct lruvec *lruvec; unsigned long flags; bool locked = false; struct page *page = NULL, *valid_page = NULL; + bool set_unsuitable = true; + const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ? + ISOLATE_ASYNC_MIGRATE : 0) | + (unevictable ? ISOLATE_UNEVICTABLE : 0); /* * Ensure that there are not too many pages isolated from the LRU @@ -474,7 +510,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ while (unlikely(too_many_isolated(zone))) { /* async migration should just abort */ - if (!cc->sync) + if (cc->mode == MIGRATE_ASYNC) return 0; congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -483,11 +519,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, return 0; } + if (compact_should_abort(cc)) + return 0; + /* Time to isolate some pages for migration */ - cond_resched(); for (; low_pfn < end_pfn; low_pfn++) { /* give a chance to irqs before checking need_resched() */ - if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { + if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { if (should_release_lock(&zone->lru_lock)) { spin_unlock_irqrestore(&zone->lru_lock, flags); locked = false; @@ -526,25 +564,31 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* If isolation recently failed, do not retry */ pageblock_nr = low_pfn >> pageblock_order; - if (!isolation_suitable(cc, page)) - goto next_pageblock; + if (last_pageblock_nr != pageblock_nr) { + int mt; + + last_pageblock_nr = pageblock_nr; + if (!isolation_suitable(cc, page)) + goto next_pageblock; + + /* + * For async migration, also only scan in MOVABLE + * blocks. Async migration is optimistic to see if + * the minimum amount of work satisfies the allocation + */ + mt = get_pageblock_migratetype(page); + if (cc->mode == MIGRATE_ASYNC && + !migrate_async_suitable(mt)) { + set_unsuitable = false; + goto next_pageblock; + } + } /* Skip if free */ if (PageBuddy(page)) continue; /* - * For async migration, also only scan in MOVABLE blocks. Async - * migration is optimistic to see if the minimum amount of work - * satisfies the allocation - */ - if (!cc->sync && last_pageblock_nr != pageblock_nr && - !migrate_async_suitable(get_pageblock_migratetype(page))) { - cc->finished_update_migrate = true; - goto next_pageblock; - } - - /* * Check may be lockless but that's ok as we recheck later. * It's possible to migrate LRU pages and balloon pages * Skip any other type of page @@ -553,11 +597,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (unlikely(balloon_page_movable(page))) { if (locked && balloon_page_isolate(page)) { /* Successfully isolated */ - cc->finished_update_migrate = true; - list_add(&page->lru, migratelist); - cc->nr_migratepages++; - nr_isolated++; - goto check_compact_cluster; + goto isolate_success; } } continue; @@ -580,6 +620,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; } + /* + * Migration will fail if an anonymous page is pinned in memory, + * so avoid taking lru_lock and isolating it unnecessarily in an + * admittedly racy check. + */ + if (!page_mapping(page) && + page_count(page) > page_mapcount(page)) + continue; + /* Check if it is ok to still hold the lock */ locked = compact_checklock_irqsave(&zone->lru_lock, &flags, locked, cc); @@ -594,12 +643,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; } - if (!cc->sync) - mode |= ISOLATE_ASYNC_MIGRATE; - - if (unevictable) - mode |= ISOLATE_UNEVICTABLE; - lruvec = mem_cgroup_page_lruvec(page, zone); /* Try isolate the page */ @@ -609,13 +652,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, VM_BUG_ON(PageTransCompound(page)); /* Successfully isolated */ - cc->finished_update_migrate = true; del_page_from_lru_list(page, lruvec, page_lru(page)); + +isolate_success: + cc->finished_update_migrate = true; list_add(&page->lru, migratelist); cc->nr_migratepages++; nr_isolated++; -check_compact_cluster: /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { ++low_pfn; @@ -626,7 +670,6 @@ check_compact_cluster: next_pageblock: low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; - last_pageblock_nr = pageblock_nr; } acct_isolated(zone, locked, cc); @@ -634,9 +677,13 @@ next_pageblock: if (locked) spin_unlock_irqrestore(&zone->lru_lock, flags); - /* Update the pageblock-skip if the whole pageblock was scanned */ + /* + * Update the pageblock-skip information and cached scanner pfn, + * if the whole pageblock was scanned without isolating any page. + */ if (low_pfn == end_pfn) - update_pageblock_skip(cc, valid_page, nr_isolated, true); + update_pageblock_skip(cc, valid_page, nr_isolated, + set_unsuitable, true); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); @@ -657,7 +704,9 @@ static void isolate_freepages(struct zone *zone, struct compact_control *cc) { struct page *page; - unsigned long high_pfn, low_pfn, pfn, z_end_pfn; + unsigned long block_start_pfn; /* start of current pageblock */ + unsigned long block_end_pfn; /* end of current pageblock */ + unsigned long low_pfn; /* lowest pfn scanner is able to scan */ int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; @@ -665,41 +714,38 @@ static void isolate_freepages(struct zone *zone, * Initialise the free scanner. The starting point is where we last * successfully isolated from, zone-cached value, or the end of the * zone when isolating for the first time. We need this aligned to - * the pageblock boundary, because we do pfn -= pageblock_nr_pages - * in the for loop. + * the pageblock boundary, because we do + * block_start_pfn -= pageblock_nr_pages in the for loop. + * For ending point, take care when isolating in last pageblock of a + * a zone which ends in the middle of a pageblock. * The low boundary is the end of the pageblock the migration scanner * is using. */ - pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + block_end_pfn = min(block_start_pfn + pageblock_nr_pages, + zone_end_pfn(zone)); low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); /* - * Take care that if the migration scanner is at the end of the zone - * that the free scanner does not accidentally move to the next zone - * in the next isolation cycle. - */ - high_pfn = min(low_pfn, pfn); - - z_end_pfn = zone_end_pfn(zone); - - /* * Isolate free pages until enough are available to migrate the * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; - pfn -= pageblock_nr_pages) { + for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; + block_end_pfn = block_start_pfn, + block_start_pfn -= pageblock_nr_pages) { unsigned long isolated; - unsigned long end_pfn; /* * This can iterate a massively long zone without finding any * suitable migration targets, so periodically check if we need - * to schedule. + * to schedule, or even abort async compaction. */ - cond_resched(); + if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) + && compact_should_abort(cc)) + break; - if (!pfn_valid(pfn)) + if (!pfn_valid(block_start_pfn)) continue; /* @@ -709,7 +755,7 @@ static void isolate_freepages(struct zone *zone, * i.e. it's possible that all pages within a zones range of * pages do not belong to a single zone. */ - page = pfn_to_page(pfn); + page = pfn_to_page(block_start_pfn); if (page_zone(page) != zone) continue; @@ -722,26 +768,26 @@ static void isolate_freepages(struct zone *zone, continue; /* Found a block suitable for isolating free pages from */ - isolated = 0; + cc->free_pfn = block_start_pfn; + isolated = isolate_freepages_block(cc, block_start_pfn, + block_end_pfn, freelist, false); + nr_freepages += isolated; /* - * Take care when isolating in last pageblock of a zone which - * ends in the middle of a pageblock. + * Set a flag that we successfully isolated in this pageblock. + * In the next loop iteration, zone->compact_cached_free_pfn + * will not be updated and thus it will effectively contain the + * highest pageblock we isolated pages from. */ - end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn); - isolated = isolate_freepages_block(cc, pfn, end_pfn, - freelist, false); - nr_freepages += isolated; + if (isolated) + cc->finished_update_free = true; /* - * Record the highest PFN we isolated pages from. When next - * looking for free pages, the search will restart here as - * page migration may have returned some pages to the allocator + * isolate_freepages_block() might have aborted due to async + * compaction being contended */ - if (isolated) { - cc->finished_update_free = true; - high_pfn = max(high_pfn, pfn); - } + if (cc->contended) + break; } /* split_free_page does not map the pages */ @@ -751,10 +797,9 @@ static void isolate_freepages(struct zone *zone, * If we crossed the migrate scanner, we want to keep it that way * so that compact_finished() may detect this */ - if (pfn < low_pfn) - cc->free_pfn = max(pfn, zone->zone_start_pfn); - else - cc->free_pfn = high_pfn; + if (block_start_pfn < low_pfn) + cc->free_pfn = cc->migrate_pfn; + cc->nr_freepages = nr_freepages; } @@ -769,9 +814,13 @@ static struct page *compaction_alloc(struct page *migratepage, struct compact_control *cc = (struct compact_control *)data; struct page *freepage; - /* Isolate free pages if necessary */ + /* + * Isolate free pages if necessary, and if we are not aborting due to + * contention. + */ if (list_empty(&cc->freepages)) { - isolate_freepages(cc->zone, cc); + if (!cc->contended) + isolate_freepages(cc->zone, cc); if (list_empty(&cc->freepages)) return NULL; @@ -785,23 +834,16 @@ static struct page *compaction_alloc(struct page *migratepage, } /* - * We cannot control nr_migratepages and nr_freepages fully when migration is - * running as migrate_pages() has no knowledge of compact_control. When - * migration is complete, we count the number of pages on the lists by hand. + * This is a migrate-callback that "frees" freepages back to the isolated + * freelist. All pages on the freelist are from the same zone, so there is no + * special handling needed for NUMA. */ -static void update_nr_listpages(struct compact_control *cc) +static void compaction_free(struct page *page, unsigned long data) { - int nr_migratepages = 0; - int nr_freepages = 0; - struct page *page; - - list_for_each_entry(page, &cc->migratepages, lru) - nr_migratepages++; - list_for_each_entry(page, &cc->freepages, lru) - nr_freepages++; + struct compact_control *cc = (struct compact_control *)data; - cc->nr_migratepages = nr_migratepages; - cc->nr_freepages = nr_freepages; + list_add(&page->lru, &cc->freepages); + cc->nr_freepages++; } /* possible outcome of isolate_migratepages */ @@ -848,11 +890,16 @@ static int compact_finished(struct zone *zone, unsigned int order; unsigned long watermark; - if (fatal_signal_pending(current)) + if (cc->contended || fatal_signal_pending(current)) return COMPACT_PARTIAL; /* Compaction run completes if the migrate and free scanner meet */ if (cc->free_pfn <= cc->migrate_pfn) { + /* Let the next compaction start anew. */ + zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; + zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; + zone->compact_cached_free_pfn = zone_end_pfn(zone); + /* * Mark that the PG_migrate_skip information should be cleared * by kswapd when it goes to sleep. kswapd does not set the @@ -950,6 +997,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) int ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); + const bool sync = cc->mode != MIGRATE_ASYNC; ret = compaction_suitable(zone, cc->order); switch (ret) { @@ -975,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) * information on where the scanners should start but check that it * is initialised by ensuring the values are within zone boundaries. */ - cc->migrate_pfn = zone->compact_cached_migrate_pfn; + cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); @@ -983,13 +1031,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { cc->migrate_pfn = start_pfn; - zone->compact_cached_migrate_pfn = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } + trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); + migrate_prep_local(); while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { - unsigned long nr_migrate, nr_remaining; int err; switch (isolate_migratepages(zone, cc)) { @@ -1004,21 +1054,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ; } - nr_migrate = cc->nr_migratepages; + if (!cc->nr_migratepages) + continue; + err = migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, - cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, + compaction_free, (unsigned long)cc, cc->mode, MR_COMPACTION); - update_nr_listpages(cc); - nr_remaining = cc->nr_migratepages; - trace_mm_compaction_migratepages(nr_migrate - nr_remaining, - nr_remaining); + trace_mm_compaction_migratepages(cc->nr_migratepages, err, + &cc->migratepages); - /* Release isolated pages not migrated */ + /* All pages were either migrated or will be released */ + cc->nr_migratepages = 0; if (err) { putback_movable_pages(&cc->migratepages); - cc->nr_migratepages = 0; /* * migrate_pages() may return -ENOMEM when scanners meet * and we want compact_finished() to detect it @@ -1035,12 +1084,13 @@ out: cc->nr_freepages -= release_freepages(&cc->freepages); VM_BUG_ON(cc->nr_freepages != 0); + trace_mm_compaction_end(ret); + return ret; } -static unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask, - bool sync, bool *contended) +static unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask, enum migrate_mode mode, bool *contended) { unsigned long ret; struct compact_control cc = { @@ -1049,7 +1099,7 @@ static unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, - .sync = sync, + .mode = mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1071,7 +1121,7 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from - * @sync: Whether migration is synchronous or not + * @mode: The migration mode for async, sync light, or sync migration * @contended: Return value that is true if compaction was aborted due to lock contention * @page: Optionally capture a free page of the requested order during compaction * @@ -1079,7 +1129,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended) + enum migrate_mode mode, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -1104,7 +1154,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync, + status = compact_zone_order(zone, order, gfp_mask, mode, contended); rc = max(status, rc); @@ -1140,13 +1190,9 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) compact_zone(zone, cc); if (cc->order > 0) { - int ok = zone_watermark_ok(zone, cc->order, - low_wmark_pages(zone), 0, 0); - if (ok && cc->order >= zone->compact_order_failed) - zone->compact_order_failed = cc->order + 1; - /* Currently async compaction is never deferred. */ - else if (!ok && cc->sync) - defer_compaction(zone, cc->order); + if (zone_watermark_ok(zone, cc->order, + low_wmark_pages(zone), 0, 0)) + compaction_defer_reset(zone, cc->order, false); } VM_BUG_ON(!list_empty(&cc->freepages)); @@ -1158,7 +1204,7 @@ void compact_pgdat(pg_data_t *pgdat, int order) { struct compact_control cc = { .order = order, - .sync = false, + .mode = MIGRATE_ASYNC, }; if (!order) @@ -1171,7 +1217,8 @@ static void compact_node(int nid) { struct compact_control cc = { .order = -1, - .sync = true, + .mode = MIGRATE_SYNC, + .ignore_skip_hint = true, }; __compact_pgdat(NODE_DATA(nid), &cc); diff --git a/mm/filemap.c b/mm/filemap.c index 3d2d39a5a508..b012daefc2d7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -192,9 +192,11 @@ static int filemap_check_errors(struct address_space *mapping) { int ret = 0; /* Check for outstanding write errors */ - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + if (test_bit(AS_ENOSPC, &mapping->flags) && + test_and_clear_bit(AS_ENOSPC, &mapping->flags)) ret = -ENOSPC; - if (test_and_clear_bit(AS_EIO, &mapping->flags)) + if (test_bit(AS_EIO, &mapping->flags) && + test_and_clear_bit(AS_EIO, &mapping->flags)) ret = -EIO; return ret; } @@ -446,6 +448,29 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(replace_page_cache_page); +static int page_cache_tree_insert(struct address_space *mapping, + struct page *page) +{ + void **slot; + int error; + + slot = radix_tree_lookup_slot(&mapping->page_tree, page->index); + if (slot) { + void *p; + + p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + if (!radix_tree_exceptional_entry(p)) + return -EEXIST; + radix_tree_replace_slot(slot, page); + mapping->nrpages++; + return 0; + } + error = radix_tree_insert(&mapping->page_tree, page->index, page); + if (!error) + mapping->nrpages++; + return error; +} + /** * add_to_page_cache_locked - add a locked page to the pagecache * @page: page to add @@ -480,11 +505,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, page->index = offset; spin_lock_irq(&mapping->tree_lock); - error = radix_tree_insert(&mapping->page_tree, offset, page); + error = page_cache_tree_insert(mapping, page); radix_tree_preload_end(); if (unlikely(error)) goto err_insert; - mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); trace_mm_filemap_add_to_page_cache(page); @@ -520,10 +544,10 @@ struct page *__page_cache_alloc(gfp_t gfp) if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); page = alloc_pages_exact_node(n, gfp, 0); - } while (!put_mems_allowed(cpuset_mems_cookie) && !page); + } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); return page; } @@ -620,8 +644,17 @@ EXPORT_SYMBOL(unlock_page); */ void end_page_writeback(struct page *page) { - if (TestClearPageReclaim(page)) + /* + * TestClearPageReclaim could be used here but it is an atomic + * operation and overkill in this particular case. Failing to + * shuffle a page marked for immediate reclaim is too mild to + * justify taking an atomic operation penalty at the end of + * ever page writeback. + */ + if (PageReclaim(page)) { + ClearPageReclaim(page); rotate_reclaimable_page(page); + } if (!test_clear_page_writeback(page)) BUG(); @@ -686,14 +719,101 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, } /** - * find_get_page - find and get a page reference + * page_cache_next_hole - find the next hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the + * lowest indexed hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'return - index >= + * max_scan' will be true). In rare cases of index wrap-around, 0 will + * be returned. + * + * page_cache_next_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 5, then subsequently a hole is created at + * index 10, page_cache_next_hole covering both indexes may return 10 + * if called under rcu_read_lock. + */ +pgoff_t page_cache_next_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index++; + if (index == 0) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_next_hole); + +/** + * page_cache_prev_hole - find the prev hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search backwards in the range [max(index-max_scan+1, 0), index] for + * the first hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'index - return >= + * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX + * will be returned. + * + * page_cache_prev_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 10, then subsequently a hole is created at + * index 5, page_cache_prev_hole covering both indexes may return 5 if + * called under rcu_read_lock. + */ +pgoff_t page_cache_prev_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index--; + if (index == ULONG_MAX) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_prev_hole); + +/** + * find_get_entry - find and get a page cache entry * @mapping: the address_space to search - * @offset: the page index + * @offset: the page cache index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned with an increased refcount. * - * Is there a pagecache struct page at the given (mapping, offset) tuple? - * If yes, increment its refcount and return it; if no, return NULL. + * If the slot holds a shadow entry of a previously evicted page, it + * is returned. + * + * Otherwise, %NULL is returned. */ -struct page *find_get_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { void **pagep; struct page *page; @@ -734,24 +854,30 @@ out: return page; } -EXPORT_SYMBOL(find_get_page); +EXPORT_SYMBOL(find_get_entry); /** - * find_lock_page - locate, pin and lock a pagecache page + * find_lock_entry - locate, pin and lock a page cache entry * @mapping: the address_space to search - * @offset: the page index + * @offset: the page cache index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. * - * Locates the desired pagecache page, locks it, increments its reference - * count and returns its address. + * If the slot holds a shadow entry of a previously evicted page, it + * is returned. * - * Returns zero if the page was not present. find_lock_page() may sleep. + * Otherwise, %NULL is returned. + * + * find_lock_entry() may sleep. */ -struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) +struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) { struct page *page; repeat: - page = find_get_page(mapping, offset); + page = find_get_entry(mapping, offset); if (page && !radix_tree_exception(page)) { lock_page(page); /* Has the page been truncated? */ @@ -764,44 +890,87 @@ repeat: } return page; } -EXPORT_SYMBOL(find_lock_page); +EXPORT_SYMBOL(find_lock_entry); /** - * find_or_create_page - locate or add a pagecache page - * @mapping: the page's address_space - * @index: the page's index into the mapping - * @gfp_mask: page allocation mode + * pagecache_get_page - find and get a page reference + * @mapping: the address_space to search + * @offset: the page index + * @fgp_flags: PCG flags + * @gfp_mask: gfp mask to use if a page is to be allocated + * + * Looks up the page cache slot at @mapping & @offset. + * + * PCG flags modify how the page is returned * - * Locates a page in the pagecache. If the page is not present, a new page - * is allocated using @gfp_mask and is added to the pagecache and to the VM's - * LRU list. The returned page is locked and has its reference count - * incremented. + * FGP_ACCESSED: the page will be marked accessed + * FGP_LOCK: Page is return locked + * FGP_CREAT: If page is not present then a new page is allocated using + * @gfp_mask and added to the page cache and the VM's LRU + * list. The page is returned locked and with an increased + * refcount. Otherwise, %NULL is returned. * - * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic - * allocation! + * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even + * if the GFP flags specified for FGP_CREAT are atomic. * - * find_or_create_page() returns the desired page's address, or zero on - * memory exhaustion. + * If there is a page cache page, it is returned with an increased refcount. */ -struct page *find_or_create_page(struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask) +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, + int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask) { struct page *page; - int err; + repeat: - page = find_lock_page(mapping, index); - if (!page) { - page = __page_cache_alloc(gfp_mask); + page = find_get_entry(mapping, offset); + if (radix_tree_exceptional_entry(page)) + page = NULL; + if (!page) + goto no_page; + + if (fgp_flags & FGP_LOCK) { + if (fgp_flags & FGP_NOWAIT) { + if (!trylock_page(page)) { + page_cache_release(page); + return NULL; + } + } else { + lock_page(page); + } + + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + VM_BUG_ON(page->index != offset); + } + + if (page && (fgp_flags & FGP_ACCESSED)) + mark_page_accessed(page); + +no_page: + if (!page && (fgp_flags & FGP_CREAT)) { + int err; + if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) + cache_gfp_mask |= __GFP_WRITE; + if (fgp_flags & FGP_NOFS) { + cache_gfp_mask &= ~__GFP_FS; + radix_gfp_mask &= ~__GFP_FS; + } + + page = __page_cache_alloc(cache_gfp_mask); if (!page) return NULL; - /* - * We want a regular kernel memory (not highmem or DMA etc) - * allocation for the radix tree nodes, but we need to honour - * the context-specific requirements the caller has asked for. - * GFP_RECLAIM_MASK collects those requirements. - */ - err = add_to_page_cache_lru(page, mapping, index, - (gfp_mask & GFP_RECLAIM_MASK)); + + if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) + fgp_flags |= FGP_LOCK; + + /* Init accessed so avoit atomic mark_page_accessed later */ + if (fgp_flags & FGP_ACCESSED) + init_page_accessed(page); + + err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); if (unlikely(err)) { page_cache_release(page); page = NULL; @@ -809,9 +978,80 @@ repeat: goto repeat; } } + return page; } -EXPORT_SYMBOL(find_or_create_page); +EXPORT_SYMBOL(pagecache_get_page); + +/** + * find_get_entries - gang pagecache lookup + * @mapping: The address_space to search + * @start: The starting page cache index + * @nr_entries: The maximum number of entries + * @entries: Where the resulting entries are placed + * @indices: The cache indices corresponding to the entries in @entries + * + * find_get_entries() will search for and return a group of up to + * @nr_entries entries in the mapping. The entries are placed at + * @entries. find_get_entries() takes a reference against any actual + * pages it returns. + * + * The search returns a group of mapping-contiguous page cache entries + * with ascending indexes. There may be holes in the indices due to + * not-present pages. + * + * Any shadow entries of evicted pages are included in the returned + * array. + * + * find_get_entries() returns the number of pages and shadow entries + * which were found. + */ +unsigned find_get_entries(struct address_space *mapping, + pgoff_t start, unsigned int nr_entries, + struct page **entries, pgoff_t *indices) +{ + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_entries) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + /* + * Otherwise, we must be storing a swap entry + * here as an exceptional entry: so return it + * without attempting to raise page count. + */ + goto export; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } +export: + indices[ret] = iter.index; + entries[ret] = page; + if (++ret == nr_entries) + break; + } + rcu_read_unlock(); + return ret; +} /** * find_get_pages - gang pagecache lookup @@ -1031,39 +1271,6 @@ repeat: } EXPORT_SYMBOL(find_get_pages_tag); -/** - * grab_cache_page_nowait - returns locked page at given index in given cache - * @mapping: target address_space - * @index: the page index - * - * Same as grab_cache_page(), but do not wait if the page is unavailable. - * This is intended for speculative data generators, where the data can - * be regenerated if the page couldn't be grabbed. This routine should - * be safe to call while holding the lock for another page. - * - * Clear __GFP_FS when allocating the page to avoid recursion into the fs - * and deadlock against the caller's locked page. - */ -struct page * -grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) -{ - struct page *page = find_get_page(mapping, index); - - if (page) { - if (trylock_page(page)) - return page; - page_cache_release(page); - return NULL; - } - page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); - if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { - page_cache_release(page); - page = NULL; - } - return page; -} -EXPORT_SYMBOL(grab_cache_page_nowait); - /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: @@ -1797,6 +2004,18 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); +static struct page *wait_on_page_read(struct page *page) +{ + if (!IS_ERR(page)) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + } + return page; +} + static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), @@ -1823,6 +2042,8 @@ repeat: if (err < 0) { page_cache_release(page); page = ERR_PTR(err); + } else { + page = wait_on_page_read(page); } } return page; @@ -1859,6 +2080,10 @@ retry: if (err < 0) { page_cache_release(page); return ERR_PTR(err); + } else { + page = wait_on_page_read(page); + if (IS_ERR(page)) + return page; } out: mark_page_accessed(page); @@ -1866,40 +2091,25 @@ out: } /** - * read_cache_page_async - read into page cache, fill it if needed + * read_cache_page - read into page cache, fill it if needed * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read * @data: first arg to filler(data, page) function, often left as NULL * - * Same as read_cache_page, but don't wait for page to become unlocked - * after submitting it to the filler. - * * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page but don't wait for it to become unlocked. + * not set, try to fill the page and wait for it to become unlocked. * * If the page does not get brought uptodate, return -EIO. */ -struct page *read_cache_page_async(struct address_space *mapping, +struct page *read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); } -EXPORT_SYMBOL(read_cache_page_async); - -static struct page *wait_on_page_read(struct page *page) -{ - if (!IS_ERR(page)) { - wait_on_page_locked(page); - if (!PageUptodate(page)) { - page_cache_release(page); - page = ERR_PTR(-EIO); - } - } - return page; -} +EXPORT_SYMBOL(read_cache_page); /** * read_cache_page_gfp - read into page cache, using specified page allocation flags. @@ -1918,31 +2128,10 @@ struct page *read_cache_page_gfp(struct address_space *mapping, { filler_t *filler = (filler_t *)mapping->a_ops->readpage; - return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); + return do_read_cache_page(mapping, index, filler, NULL, gfp); } EXPORT_SYMBOL(read_cache_page_gfp); -/** - * read_cache_page - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: first arg to filler(data, page) function, often left as NULL - * - * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page then wait for it to become unlocked. - * - * If the page does not get brought uptodate, return -EIO. - */ -struct page *read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data) -{ - return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); -} -EXPORT_SYMBOL(read_cache_page); - static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { @@ -1976,7 +2165,6 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, char *kaddr; size_t copied; - BUG_ON(!pagefault_disabled()); kaddr = kmap_atomic(page); if (likely(i->nr_segs == 1)) { int left; @@ -2186,7 +2374,6 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, { const struct address_space_operations *aops = mapping->a_ops; - mark_page_accessed(page); return aops->write_end(file, mapping, pos, len, copied, page, fsdata); } EXPORT_SYMBOL(pagecache_write_end); @@ -2268,34 +2455,18 @@ EXPORT_SYMBOL(generic_file_direct_write); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags) { - int status; - gfp_t gfp_mask; struct page *page; - gfp_t gfp_notmask = 0; + int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; - gfp_mask = mapping_gfp_mask(mapping); - if (mapping_cap_account_dirty(mapping)) - gfp_mask |= __GFP_WRITE; if (flags & AOP_FLAG_NOFS) - gfp_notmask = __GFP_FS; -repeat: - page = find_lock_page(mapping, index); + fgp_flags |= FGP_NOFS; + + page = pagecache_get_page(mapping, index, fgp_flags, + mapping_gfp_mask(mapping), + GFP_KERNEL); if (page) - goto found; + wait_for_stable_page(page); - page = __page_cache_alloc(gfp_mask & ~gfp_notmask); - if (!page) - return NULL; - status = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL & ~gfp_notmask); - if (unlikely(status)) { - page_cache_release(page); - if (status == -EEXIST) - goto repeat; - return NULL; - } -found: - wait_for_stable_page(page); return page; } EXPORT_SYMBOL(grab_cache_page_write_begin); @@ -2344,18 +2515,15 @@ again: status = a_ops->write_begin(file, mapping, pos, bytes, flags, &page, &fsdata); - if (unlikely(status)) + if (unlikely(status < 0)) break; if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - pagefault_enable(); flush_dcache_page(page); - mark_page_accessed(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); if (unlikely(status < 0)) diff --git a/mm/fremap.c b/mm/fremap.c index bbc4d660221a..34feba60a17e 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -23,28 +23,44 @@ #include "internal.h" +static int mm_counter(struct page *page) +{ + return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES; +} + static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; + struct page *page; + swp_entry_t entry; if (pte_present(pte)) { - struct page *page; - flush_cache_page(vma, addr, pte_pfn(pte)); pte = ptep_clear_flush(vma, addr, ptep); page = vm_normal_page(vma, addr, pte); if (page) { if (pte_dirty(pte)) set_page_dirty(page); + update_hiwater_rss(mm); + dec_mm_counter(mm, mm_counter(page)); page_remove_rmap(page); page_cache_release(page); + } + } else { /* zap_pte() is not called when pte_none() */ + if (!pte_file(pte)) { update_hiwater_rss(mm); - dec_mm_counter(mm, MM_FILEPAGES); + entry = pte_to_swp_entry(pte); + if (non_swap_entry(entry)) { + if (is_migration_entry(entry)) { + page = migration_entry_to_page(entry); + dec_mm_counter(mm, mm_counter(page)); + } + } else { + free_swap_and_cache(entry); + dec_mm_counter(mm, MM_SWAPENTS); + } } - } else { - if (!pte_file(pte)) - free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear_not_present_full(mm, addr, ptep, 0); } } diff --git a/mm/frontswap.c b/mm/frontswap.c index 1b24bdcb3197..c30eec536f03 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area); static unsigned long __frontswap_curr_pages(void) { - int type; unsigned long totalpages = 0; struct swap_info_struct *si = NULL; assert_spin_locked(&swap_lock); - for (type = swap_list.head; type >= 0; type = si->next) { - si = swap_info[type]; + plist_for_each_entry(si, &swap_active_head, list) totalpages += atomic_read(&si->frontswap_pages); - } return totalpages; } @@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, int si_frontswap_pages; unsigned long total_pages_to_unuse = total; unsigned long pages = 0, pages_to_unuse = 0; - int type; assert_spin_locked(&swap_lock); - for (type = swap_list.head; type >= 0; type = si->next) { - si = swap_info[type]; + plist_for_each_entry(si, &swap_active_head, list) { si_frontswap_pages = atomic_read(&si->frontswap_pages); if (total_pages_to_unuse < si_frontswap_pages) { pages = pages_to_unuse = total_pages_to_unuse; @@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, } vm_unacct_memory(pages); *unused = pages_to_unuse; - *swapid = type; + *swapid = si->type; ret = 0; break; } @@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages) /* * we don't want to hold swap_lock while doing a very * lengthy try_to_unuse, but swap_list may change - * so restart scan from swap_list.head each time + * so restart scan from swap_active_head each time */ spin_lock(&swap_lock); ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 389973fd6bb7..2ee53749eb48 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -758,14 +758,6 @@ static inline struct page *alloc_hugepage_vma(int defrag, HPAGE_PMD_ORDER, vma, haddr, nd); } -#ifndef CONFIG_NUMA -static inline struct page *alloc_hugepage(int defrag) -{ - return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), - HPAGE_PMD_ORDER); -} -#endif - static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *zero_page) @@ -2197,7 +2189,58 @@ static void khugepaged_alloc_sleep(void) msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); } +static int khugepaged_node_load[MAX_NUMNODES]; + +static bool khugepaged_scan_abort(int nid) +{ + int i; + + /* + * If zone_reclaim_mode is disabled, then no extra effort is made to + * allocate memory locally. + */ + if (!zone_reclaim_mode) + return false; + + /* If there is a count for this node already, it must be acceptable */ + if (khugepaged_node_load[nid]) + return false; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (!khugepaged_node_load[i]) + continue; + if (node_distance(nid, i) > RECLAIM_DISTANCE) + return true; + } + return false; +} + #ifdef CONFIG_NUMA +static int khugepaged_find_target_node(void) +{ + static int last_khugepaged_target_node = NUMA_NO_NODE; + int nid, target_node = 0, max_value = 0; + + /* find first node with max normal pages hit */ + for (nid = 0; nid < MAX_NUMNODES; nid++) + if (khugepaged_node_load[nid] > max_value) { + max_value = khugepaged_node_load[nid]; + target_node = nid; + } + + /* do some balance if several nodes have the same hit record */ + if (target_node <= last_khugepaged_target_node) + for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; + nid++) + if (max_value == khugepaged_node_load[nid]) { + target_node = nid; + break; + } + + last_khugepaged_target_node = target_node; + return target_node; +} + static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) { if (IS_ERR(*hpage)) { @@ -2231,9 +2274,8 @@ static struct page * mmap_sem in read mode is good idea also to allow greater * scalability. */ - *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, - node, __GFP_OTHER_NODE); - + *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( + khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); /* * After allocating the hugepage, release the mmap_sem read lock in * preparation for taking it in write mode. @@ -2249,6 +2291,17 @@ static struct page return *hpage; } #else +static int khugepaged_find_target_node(void) +{ + return 0; +} + +static inline struct page *alloc_hugepage(int defrag) +{ + return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), + HPAGE_PMD_ORDER); +} + static struct page *khugepaged_alloc_hugepage(bool *wait) { struct page *hpage; @@ -2455,6 +2508,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (pmd_trans_huge(*pmd)) goto out; + memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { @@ -2471,12 +2525,15 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (unlikely(!page)) goto out_unmap; /* - * Chose the node of the first page. This could - * be more sophisticated and look at more pages, - * but isn't for now. + * Record which node the original page is from and save this + * information to khugepaged_node_load[]. + * Khupaged will allocate hugepage from the node has the max + * hit record. */ - if (node == NUMA_NO_NODE) - node = page_to_nid(page); + node = page_to_nid(page); + if (khugepaged_scan_abort(node)) + goto out_unmap; + khugepaged_node_load[node]++; VM_BUG_ON(PageCompound(page)); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; @@ -2491,9 +2548,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, ret = 1; out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) + if (ret) { + node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_sem released */ collapse_huge_page(mm, address, hpage, vma, node); + } out: return ret; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f80b17106d24..c33d8a65298c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -574,7 +574,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, goto err; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = huge_zonelist(vma, address, htlb_alloc_mask(h), &mpol, &nodemask); @@ -596,7 +596,7 @@ retry_cpuset: } mpol_cond_put(mpol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; @@ -2114,6 +2114,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->max_huge_pages; if (write && h->order >= MAX_ORDER) @@ -2167,6 +2170,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->nr_overcommit_huge_pages; if (write && h->order >= MAX_ORDER) @@ -2192,6 +2198,8 @@ out: void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return; seq_printf(m, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" @@ -2208,6 +2216,8 @@ void hugetlb_report_meminfo(struct seq_file *m) int hugetlb_report_node_meminfo(int nid, char *buf) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return 0; return sprintf(buf, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" @@ -2222,6 +2232,9 @@ void hugetlb_show_meminfo(void) struct hstate *h; int nid; + if (!hugepages_supported()) + return; + for_each_node_state(nid, N_MEMORY) for_each_hstate(h) pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", diff --git a/mm/internal.h b/mm/internal.h index fdddbc83ac5f..d610f7ce4e9c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -11,6 +11,7 @@ #ifndef __MM_INTERNAL_H #define __MM_INTERNAL_H +#include <linux/fs.h> #include <linux/mm.h> void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, @@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v) atomic_set(&page->_count, v); } +extern int __do_page_cache_readahead(struct address_space *mapping, + struct file *filp, pgoff_t offset, unsigned long nr_to_read, + unsigned long lookahead_size); + +/* + * Submit IO for the read-ahead request in file_ra_state. + */ +static inline unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, struct file *filp) +{ + return __do_page_cache_readahead(mapping, filp, + ra->start, ra->size, ra->async_size); +} + /* * Turn a non-refcounted page (->_count == 0) into refcounted with * a count of one. @@ -120,7 +135,7 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ - bool sync; /* Synchronous migration */ + enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool finished_update_free; /* True when the zone cached pfns are * no longer being updated @@ -130,7 +145,10 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - bool contended; /* True if a lock was contended */ + bool contended; /* True if a lock was contended, or + * need_resched() true during async + * compaction + */ }; unsigned long diff --git a/mm/madvise.c b/mm/madvise.c index 539eeb96b323..a402f8fdc68e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -195,7 +195,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, for (; start < end; start += PAGE_SIZE) { index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - page = find_get_page(mapping, index); + page = find_get_entry(mapping, index); if (!radix_tree_exceptional_entry(page)) { if (page) page_cache_release(page); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6e3f9c39bc22..4ab233d4714a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1554,7 +1554,7 @@ static int soft_offline_huge_page(struct page *page, int flags) /* Keep page count to indicate a given hugepage is isolated. */ list_move(&hpage->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", @@ -1635,7 +1635,7 @@ static int __soft_offline_page(struct page *page, int flags) inc_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { putback_lru_pages(&pagelist); diff --git a/mm/memory.c b/mm/memory.c index f37b609503bb..4e1d48e8d900 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -878,7 +878,7 @@ out_set_pte: return 0; } -int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { @@ -3698,7 +3698,7 @@ static int handle_pte_fault(struct mm_struct *mm, pte_t entry; spinlock_t *ptl; - entry = *pte; + entry = ACCESS_ONCE(*pte); if (!pte_present(entry)) { if (pte_none(entry)) { if (vma->vm_ops) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ed85fe3870e2..d31730564617 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1321,7 +1321,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * alloc_migrate_target should be improooooved!! * migrate_pages returns # of failed pages. */ - ret = migrate_pages(&source, alloc_migrate_target, 0, + ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) putback_movable_pages(&source); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0437f3595b32..cc61c7a7d6a1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1060,7 +1060,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, + err = migrate_pages(&pagelist, new_node_page, NULL, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1306,7 +1306,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); - nr_failed = migrate_pages(&pagelist, new_page, + nr_failed = migrate_pages(&pagelist, new_page, NULL, start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) putback_movable_pages(&pagelist); @@ -1873,7 +1873,7 @@ int node_random(const nodemask_t *maskp) * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. * - * Must be protected by get_mems_allowed() + * Must be protected by read_mems_allowed_begin() */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, @@ -2037,7 +2037,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, retry_cpuset: pol = get_vma_policy(current, vma, addr); - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; @@ -2045,7 +2045,7 @@ retry_cpuset: nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; @@ -2055,7 +2055,7 @@ retry_cpuset: policy_nodemask(gfp, pol)); if (unlikely(mpol_needs_cond_ref(pol))) __mpol_put(pol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; } @@ -2089,7 +2089,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) pol = &default_policy; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); /* * No reference counting needed for current->mempolicy @@ -2102,7 +2102,7 @@ retry_cpuset: policy_zonelist(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; diff --git a/mm/migrate.c b/mm/migrate.c index e3cf71dd1288..96d4d814ae2f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -867,8 +867,9 @@ out: * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, enum migrate_mode mode) +static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, + unsigned long private, struct page *page, int force, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -912,11 +913,18 @@ out: page_is_file_cache(page)); putback_lru_page(page); } + /* - * Move the new page to the LRU. If migration was not successful - * then this will free the page. + * If migration was not successful and there's a freeing callback, use + * it. Otherwise, putback_lru_page() will drop the reference grabbed + * during isolation. */ - putback_lru_page(newpage); + if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { + ClearPageSwapBacked(newpage); + put_new_page(newpage, private); + } else + putback_lru_page(newpage); + if (result) { if (rc) *result = rc; @@ -945,8 +953,9 @@ out: * will wait in the page fault for migration to complete. */ static int unmap_and_move_huge_page(new_page_t get_new_page, - unsigned long private, struct page *hpage, - int force, enum migrate_mode mode) + free_page_t put_new_page, unsigned long private, + struct page *hpage, int force, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -982,20 +991,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (!page_mapped(hpage)) rc = move_to_new_page(new_hpage, hpage, 1, mode); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) remove_migration_ptes(hpage, hpage); if (anon_vma) put_anon_vma(anon_vma); - if (!rc) + if (rc == MIGRATEPAGE_SUCCESS) hugetlb_cgroup_migrate(hpage, new_hpage); unlock_page(hpage); out: if (rc != -EAGAIN) putback_active_hugepage(hpage); - put_page(new_hpage); + + /* + * If migration was not successful and there's a freeing callback, use + * it. Otherwise, put_page() will drop the reference grabbed during + * isolation. + */ + if (rc != MIGRATEPAGE_SUCCESS && put_new_page) + put_new_page(new_hpage, private); + else + put_page(new_hpage); + if (result) { if (rc) *result = rc; @@ -1012,6 +1031,8 @@ out: * @from: The list of pages to be migrated. * @get_new_page: The function used to allocate free pages to be used * as the target of the page migration. + * @put_new_page: The function used to free target pages if migration + * fails, or NULL if no special handling is necessary. * @private: Private data to be passed on to get_new_page() * @mode: The migration mode that specifies the constraints for * page migration, if any. @@ -1025,7 +1046,8 @@ out: * Returns the number of pages that were not migrated, or an error code. */ int migrate_pages(struct list_head *from, new_page_t get_new_page, - unsigned long private, enum migrate_mode mode, int reason) + free_page_t put_new_page, unsigned long private, + enum migrate_mode mode, int reason) { int retry = 1; int nr_failed = 0; @@ -1047,10 +1069,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, - private, page, pass > 2, mode); + put_new_page, private, page, + pass > 2, mode); else - rc = unmap_and_move(get_new_page, private, - page, pass > 2, mode); + rc = unmap_and_move(get_new_page, put_new_page, + private, page, pass > 2, mode); switch(rc) { case -ENOMEM: @@ -1194,7 +1217,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_page_node, + err = migrate_pages(&pagelist, new_page_node, NULL, (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1643,7 +1666,8 @@ int migrate_misplaced_page(struct page *page, int node) list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, - node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); + NULL, node, MIGRATE_ASYNC, + MR_NUMA_MISPLACED); if (nr_remaining) { putback_lru_pages(&migratepages); isolated = 0; diff --git a/mm/mincore.c b/mm/mincore.c index da2be56a7b8f..06cb81005c77 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ - page = find_get_page(mapping, pgoff); #ifdef CONFIG_SWAP - /* shmem/tmpfs may return swap: account for swapcache page too. */ - if (radix_tree_exceptional_entry(page)) { - swp_entry_t swap = radix_to_swp_entry(page); - page = find_get_page(swap_address_space(swap), swap.val); - } + if (shmem_mapping(mapping)) { + page = find_get_entry(mapping, pgoff); + /* + * shmem/tmpfs may return swap: account for swapcache + * page too. + */ + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swp = radix_to_swp_entry(page); + page = find_get_page(swap_address_space(swp), swp.val); + } + } else + page = find_get_page(mapping, pgoff); +#else + page = find_get_page(mapping, pgoff); #endif if (page) { present = PageUptodate(page); diff --git a/mm/mmap.c b/mm/mmap.c index af99b9ed2007..c1249cb7dc15 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -10,6 +10,7 @@ #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/pagemap.h> @@ -682,8 +683,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, prev->vm_next = next = vma->vm_next; if (next) next->vm_prev = prev; - if (mm->mmap_cache == vma) - mm->mmap_cache = prev; + + /* Kill the cache */ + vmacache_invalidate(mm); } /* @@ -1980,34 +1982,33 @@ EXPORT_SYMBOL(get_unmapped_area); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = NULL; + struct rb_node *rb_node; + struct vm_area_struct *vma; /* Check the cache first. */ - /* (Cache hit rate is typically around 35%.) */ - vma = ACCESS_ONCE(mm->mmap_cache); - if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { - struct rb_node *rb_node; + vma = vmacache_find(mm, addr); + if (likely(vma)) + return vma; - rb_node = mm->mm_rb.rb_node; - vma = NULL; + rb_node = mm->mm_rb.rb_node; + vma = NULL; - while (rb_node) { - struct vm_area_struct *vma_tmp; - - vma_tmp = rb_entry(rb_node, - struct vm_area_struct, vm_rb); - - if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - break; - rb_node = rb_node->rb_left; - } else - rb_node = rb_node->rb_right; - } - if (vma) - mm->mmap_cache = vma; + while (rb_node) { + struct vm_area_struct *tmp; + + tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (tmp->vm_end > addr) { + vma = tmp; + if (tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; } + + if (vma) + vmacache_update(addr, vma); return vma; } @@ -2379,7 +2380,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } else mm->highest_vm_end = prev ? prev->vm_end : 0; tail_vma->vm_next = NULL; - mm->mmap_cache = NULL; /* Kill the cache. */ + + /* Kill the cache */ + vmacache_invalidate(mm); } /* diff --git a/mm/nommu.c b/mm/nommu.c index ecd1f158548e..1221d2b66e97 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -15,6 +15,7 @@ #include <linux/export.h> #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/file.h> @@ -767,16 +768,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) */ static void delete_vma_from_mm(struct vm_area_struct *vma) { + int i; struct address_space *mapping; struct mm_struct *mm = vma->vm_mm; + struct task_struct *curr = current; kenter("%p", vma); protect_vma(vma, 0); mm->map_count--; - if (mm->mmap_cache == vma) - mm->mmap_cache = NULL; + for (i = 0; i < VMACACHE_SIZE; i++) { + /* if the vma is cached, invalidate the entire cache */ + if (curr->vmacache[i] == vma) { + vmacache_invalidate(curr->mm); + break; + } + } /* remove the VMA from the mapping */ if (vma->vm_file) { @@ -824,8 +832,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) struct vm_area_struct *vma; /* check the cache first */ - vma = ACCESS_ONCE(mm->mmap_cache); - if (vma && vma->vm_start <= addr && vma->vm_end > addr) + vma = vmacache_find(mm, addr); + if (likely(vma)) return vma; /* trawl the list (there may be multiple mappings in which addr @@ -834,7 +842,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) if (vma->vm_start > addr) return NULL; if (vma->vm_end > addr) { - mm->mmap_cache = vma; + vmacache_update(addr, vma); return vma; } } @@ -873,8 +881,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, unsigned long end = addr + len; /* check the cache first */ - vma = mm->mmap_cache; - if (vma && vma->vm_start == addr && vma->vm_end == end) + vma = vmacache_find_exact(mm, addr, end); + if (vma) return vma; /* trawl the list (there may be multiple mappings in which addr @@ -885,7 +893,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, if (vma->vm_start > addr) return NULL; if (vma->vm_end == end) { - mm->mmap_cache = vma; + vmacache_update(addr, vma); return vma; } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 32fbe4d7577c..b43aa10fd623 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -418,7 +418,8 @@ static int destroy_compound_page(struct page *page, unsigned long order) return bad; } -static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +static inline void prep_zero_page(struct page *page, unsigned int order, + gfp_t gfp_flags) { int i; @@ -462,7 +463,7 @@ static inline void set_page_guard_flag(struct page *page) { } static inline void clear_page_guard_flag(struct page *page) { } #endif -static inline void set_page_order(struct page *page, int order) +static inline void set_page_order(struct page *page, unsigned int order) { set_page_private(page, order); __SetPageBuddy(page); @@ -513,21 +514,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) * For recording page's order, we use page_private(page). */ static inline int page_is_buddy(struct page *page, struct page *buddy, - int order) + unsigned int order) { if (!pfn_valid_within(page_to_pfn(buddy))) return 0; - if (page_zone_id(page) != page_zone_id(buddy)) - return 0; - if (page_is_guard(buddy) && page_order(buddy) == order) { VM_BUG_ON(page_count(buddy) != 0); + + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + return 1; } if (PageBuddy(buddy) && page_order(buddy) == order) { VM_BUG_ON(page_count(buddy) != 0); + + /* + * zone check is done late to avoid uselessly + * calculating zone/node ids for pages that could + * never merge. + */ + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + return 1; } return 0; @@ -559,6 +570,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, */ static inline void __free_one_page(struct page *page, + unsigned long pfn, struct zone *zone, unsigned int order, int migratetype) { @@ -575,7 +587,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(migratetype == -1); - page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + page_idx = pfn & ((1 << MAX_ORDER) - 1); VM_BUG_ON(page_idx & ((1 << order) - 1)); VM_BUG_ON(bad_range(zone, page)); @@ -663,10 +675,13 @@ static void free_pcppages_bulk(struct zone *zone, int count, struct list_head *list) { int to_free = count; + unsigned long nr_scanned; unsigned long flags; spin_lock_irqsave(&zone->lock, flags); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); while (!list_empty(list)) { struct page *page = list_first_entry(list, struct page, lru); @@ -677,7 +692,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, mt = get_freepage_migratetype(page); /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ - __free_one_page(page, zone, 0, mt); + __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); if (likely(!is_migrate_isolate_page(page))) { __mod_zone_page_state(zone, NR_FREE_PAGES, 1); @@ -733,15 +748,20 @@ static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src, } } -static void free_one_page(struct zone *zone, struct page *page, int order, +static void free_one_page(struct zone *zone, + struct page *page, unsigned long pfn, + unsigned int order, int migratetype) { + unsigned long nr_scanned; unsigned long flags; spin_lock_irqsave(&zone->lock, flags); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); - __free_one_page(page, zone, order, migratetype); + __free_one_page(page, pfn, zone, order, migratetype); if (unlikely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock_irqrestore(&zone->lock, flags); @@ -778,15 +798,16 @@ static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int migratetype; + unsigned long pfn = page_to_pfn(page); if (!free_pages_prepare(page, order)) return; + migratetype = get_pfnblock_migratetype(page, pfn); local_lock_irqsave(pa_lock, flags); __count_vm_events(PGFREE, 1 << order); - migratetype = get_pageblock_migratetype(page); set_freepage_migratetype(page, migratetype); - free_one_page(page_zone(page), page, order, migratetype); + free_one_page(page_zone(page), page, pfn, order, migratetype); local_unlock_irqrestore(pa_lock, flags); } @@ -906,7 +927,7 @@ static inline int check_new_page(struct page *page) return 0; } -static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) { int i; @@ -955,6 +976,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, rmv_page_order(page); area->nr_free--; expand(zone, page, order, current_order, area, migratetype); + set_freepage_migratetype(page, migratetype); return page; } @@ -1079,6 +1101,12 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, { int current_order = page_order(page); + /* + * When borrowing from MIGRATE_CMA, we need to release the excess + * buddy pages to CMA itself. We also ensure the freepage_migratetype + * is set to CMA so it is returned to the correct freelist in case + * the page ends up being not actually allocated from the pcp lists. + */ if (is_migrate_cma(fallback_type)) return fallback_type; @@ -1110,16 +1138,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) +__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) { struct free_area *area; - int current_order; + unsigned int current_order; struct page *page; int migratetype, new_type, i; /* Find the largest possible block of pages in the other list */ - for (current_order = MAX_ORDER-1; current_order >= order; - --current_order) { + for (current_order = MAX_ORDER-1; + current_order >= order && current_order <= MAX_ORDER-1; + --current_order) { for (i = 0;; i++) { migratetype = fallbacks[start_migratetype][i]; @@ -1143,21 +1172,17 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) list_del(&page->lru); rmv_page_order(page); - /* - * Borrow the excess buddy pages as well, irrespective - * of whether we stole freepages, or took ownership of - * the pageblock or not. - * - * Exception: When borrowing from MIGRATE_CMA, release - * the excess buddy pages to CMA itself. - */ expand(zone, page, order, current_order, area, - is_migrate_cma(migratetype) - ? migratetype : start_migratetype); + new_type); + /* The freepage_migratetype may differ from pageblock's + * migratetype depending on the decisions in + * try_to_steal_freepages. This is OK as long as it does + * not differ for MIGRATE_CMA type. + */ + set_freepage_migratetype(page, new_type); - trace_mm_page_alloc_extfrag(page, order, - current_order, start_migratetype, migratetype, - new_type == start_migratetype); + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, migratetype, new_type); return page; } @@ -1203,9 +1228,9 @@ retry_reserve: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype, int cold) + int migratetype, bool cold) { - int mt = migratetype, i; + int i; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -1222,18 +1247,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. */ - if (likely(cold == 0)) + if (likely(!cold)) list_add(&page->lru, list); else list_add_tail(&page->lru, list); - if (IS_ENABLED(CONFIG_CMA)) { - mt = get_pageblock_migratetype(page); - if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) - mt = migratetype; - } - set_freepage_migratetype(page, mt); list = &page->lru; - if (is_migrate_cma(mt)) + if (is_migrate_cma(get_freepage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } @@ -1369,7 +1388,7 @@ void mark_free_pages(struct zone *zone) { unsigned long pfn, max_zone_pfn; unsigned long flags; - int order, t; + unsigned int order, t; struct list_head *curr; if (zone_is_empty(zone)) @@ -1401,19 +1420,20 @@ void mark_free_pages(struct zone *zone) /* * Free a 0-order page - * cold == 1 ? free a cold page : free a hot page + * cold == true ? free a cold page : free a hot page */ -void free_hot_cold_page(struct page *page, int cold) +void free_hot_cold_page(struct page *page, bool cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + unsigned long pfn = page_to_pfn(page); int migratetype; if (!free_pages_prepare(page, 0)) return; - migratetype = get_pageblock_migratetype(page); + migratetype = get_pfnblock_migratetype(page, pfn); set_freepage_migratetype(page, migratetype); local_lock_irqsave(pa_lock, flags); __count_vm_event(PGFREE); @@ -1427,17 +1447,17 @@ void free_hot_cold_page(struct page *page, int cold) */ if (migratetype >= MIGRATE_PCPTYPES) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, 0, migratetype); + free_one_page(zone, page, pfn, 0, migratetype); goto out; } migratetype = MIGRATE_MOVABLE; } pcp = &this_cpu_ptr(zone->pageset)->pcp; - if (cold) - list_add_tail(&page->lru, &pcp->lists[migratetype]); - else + if (!cold) list_add(&page->lru, &pcp->lists[migratetype]); + else + list_add_tail(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { unsigned long batch = ACCESS_ONCE(pcp->batch); @@ -1457,7 +1477,7 @@ out: /* * Free a list of 0-order pages */ -void free_hot_cold_page_list(struct list_head *list, int cold) +void free_hot_cold_page_list(struct list_head *list, bool cold) { struct page *page, *next; @@ -1569,12 +1589,12 @@ int split_free_page(struct page *page) */ static inline struct page *buffered_rmqueue(struct zone *preferred_zone, - struct zone *zone, int order, gfp_t gfp_flags, - int migratetype) + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype) { unsigned long flags; struct page *page; - int cold = !!(gfp_flags & __GFP_COLD); + bool cold = ((gfp_flags & __GFP_COLD) != 0); again: if (likely(order == 0)) { @@ -1620,11 +1640,14 @@ again: goto failed; } __mod_zone_freepage_state(zone, -(1 << order), - get_pageblock_migratetype(page)); + get_freepage_migratetype(page)); spin_unlock(&zone->lock); } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && + !zone_is_fair_depleted(zone)) + zone_set_flag(zone, ZONE_FAIR_DEPLETED); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1721,12 +1744,12 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * Return true if free pages are above 'mark'. This takes into account the order * of the allocation. */ -static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags, long free_pages) +static bool __zone_watermark_ok(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags, + long free_pages) { /* free_pages my go negative - that's OK */ long min = mark; - long lowmem_reserve = z->lowmem_reserve[classzone_idx]; int o; long free_cma = 0; @@ -1741,7 +1764,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages - free_cma <= min + lowmem_reserve) + if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ @@ -1756,15 +1779,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, return true; } -bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, +bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags) { return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } -bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags) +bool zone_watermark_ok_safe(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); @@ -1906,7 +1929,7 @@ static void __paginginit init_zone_allows_reclaim(int nid) { int i; - for_each_online_node(i) + for_each_node_state(i, N_MEMORY) if (node_distance(nid, i) <= RECLAIM_DISTANCE) node_set(i, NODE_DATA(nid)->reclaim_nodes); else @@ -1949,6 +1972,18 @@ static inline void init_zone_allows_reclaim(int nid) } #endif /* CONFIG_NUMA */ +static void reset_alloc_batches(struct zone *preferred_zone) +{ + struct zone *zone = preferred_zone->zone_pgdat->node_zones; + + do { + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); + zone_clear_flag(zone, ZONE_FAIR_DEPLETED); + } while (zone++ != preferred_zone); +} + /* * get_page_from_freelist goes through the zonelist trying to allocate * a page. @@ -1956,18 +1991,22 @@ static inline void init_zone_allows_reclaim(int nid) static struct page * get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, struct zonelist *zonelist, int high_zoneidx, int alloc_flags, - struct zone *preferred_zone, int migratetype) + struct zone *preferred_zone, int classzone_idx, int migratetype) { struct zoneref *z; struct page *page = NULL; - int classzone_idx; struct zone *zone; nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ + bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && + (gfp_mask & __GFP_WRITE); + int nr_fair_skipped = 0; + bool zonelist_rescan; - classzone_idx = zone_idx(preferred_zone); zonelist_scan: + zonelist_rescan = false; + /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. @@ -1979,12 +2018,10 @@ zonelist_scan: if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; - if ((alloc_flags & ALLOC_CPUSET) && + if (cpusets_enabled() && + (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; - BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); - if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) - goto try_this_zone; /* * Distribute pages in proportion to the individual * zone size to ensure fair page aging. The zone a @@ -1993,9 +2030,11 @@ zonelist_scan: */ if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) + break; + if (zone_is_fair_depleted(zone)) { + nr_fair_skipped++; continue; - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) - continue; + } } /* * When allocating a page cache page for writing, we @@ -2023,15 +2062,19 @@ zonelist_scan: * will require awareness of zones in the * dirty-throttling and the flusher threads. */ - if ((alloc_flags & ALLOC_WMARK_LOW) && - (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) - goto this_zone_full; + if (consider_zone_dirty && !zone_dirty_ok(zone)) + continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags)) { int ret; + /* Checked here to keep the fast path fast */ + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); + if (alloc_flags & ALLOC_NO_WATERMARKS) + goto try_this_zone; + if (IS_ENABLED(CONFIG_NUMA) && !did_zlc_setup && nr_online_nodes > 1) { /* @@ -2093,17 +2136,11 @@ try_this_zone: if (page) break; this_zone_full: - if (IS_ENABLED(CONFIG_NUMA)) + if (IS_ENABLED(CONFIG_NUMA) && zlc_active) zlc_mark_zone_full(zonelist, z); } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { - /* Disable zlc cache for second zonelist scan */ - zlc_active = 0; - goto zonelist_scan; - } - - if (page) + if (page) { /* * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was * necessary to allocate the page. The expectation is @@ -2112,8 +2149,37 @@ this_zone_full: * for !PFMEMALLOC purposes. */ page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return page; + } - return page; + /* + * The first pass makes sure allocations are spread fairly within the + * local node. However, the local node might have free pages left + * after the fairness batches are exhausted, and remote zones haven't + * even been considered yet. Try once more without fairness, and + * include remote zones now, before entering the slowpath and waking + * kswapd: prefer spilling to a remote zone over swapping locally. + */ + if (alloc_flags & ALLOC_FAIR) { + alloc_flags &= ~ALLOC_FAIR; + if (nr_fair_skipped) { + zonelist_rescan = true; + reset_alloc_batches(preferred_zone); + } + if (nr_online_nodes > 1) + zonelist_rescan = true; + } + + if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { + /* Disable zlc cache for second zonelist scan */ + zlc_active = 0; + zonelist_rescan = true; + } + + if (zonelist_rescan) + goto zonelist_scan; + + return NULL; } /* @@ -2229,7 +2295,7 @@ static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { struct page *page; @@ -2247,7 +2313,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) goto out; @@ -2282,7 +2348,7 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, bool sync_migration, + int classzone_idx, int migratetype, enum migrate_mode mode, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { @@ -2296,7 +2362,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask, sync_migration, + nodemask, mode, contended_compaction); current->flags &= ~PF_MEMALLOC; @@ -2310,13 +2376,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) { preferred_zone->compact_blockskip_flush = false; - preferred_zone->compact_considered = 0; - preferred_zone->compact_defer_shift = 0; - if (order >= preferred_zone->compact_order_failed) - preferred_zone->compact_order_failed = order + 1; + compaction_defer_reset(preferred_zone, order, true); count_vm_event(COMPACTSUCCESS); return page; } @@ -2332,7 +2395,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * As async compaction considers a subset of pageblocks, only * defer if the failure was a sync compaction failure. */ - if (sync_migration) + if (mode != MIGRATE_ASYNC) defer_compaction(preferred_zone, order); cond_resched(); @@ -2345,9 +2408,9 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, bool sync_migration, - bool *contended_compaction, bool *deferred_compaction, - unsigned long *did_some_progress) + int classzone_idx, int migratetype, + enum migrate_mode mode, bool *contended_compaction, + bool *deferred_compaction, unsigned long *did_some_progress) { return NULL; } @@ -2386,7 +2449,7 @@ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int classzone_idx, int migratetype, unsigned long *did_some_progress) { struct page *page = NULL; bool drained = false; @@ -2404,7 +2467,8 @@ retry: page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, + migratetype); /* * If an allocation failed after direct reclaim, it could be because @@ -2427,14 +2491,14 @@ static inline struct page * __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { struct page *page; do { page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); @@ -2443,28 +2507,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static void reset_alloc_batches(struct zonelist *zonelist, - enum zone_type high_zoneidx, - struct zone *preferred_zone) -{ - struct zoneref *z; - struct zone *zone; - - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - /* - * Only reset the batches of zones that were actually - * considered in the fairness pass, we don't want to - * trash fairness information for zones that are not - * actually part of this zonelist's round-robin cycle. - */ - if (!zone_local(preferred_zone, zone)) - continue; - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - } -} - static void wake_all_kswapds(unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -2535,14 +2577,14 @@ static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { const gfp_t wait = gfp_mask & __GFP_WAIT; struct page *page = NULL; int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; - bool sync_migration = false; + enum migrate_mode migration_mode = MIGRATE_ASYNC; bool deferred_compaction = false; bool contended_compaction = false; @@ -2584,15 +2626,19 @@ restart: * Find the true preferred zone if the allocation is unconstrained by * cpusets. */ - if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) - first_zones_zonelist(zonelist, high_zoneidx, NULL, - &preferred_zone); + if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { + struct zoneref *preferred_zoneref; + preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, + NULL, + &preferred_zone); + classzone_idx = zonelist_zone_idx(preferred_zoneref); + } rebalance: /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) goto got_pg; @@ -2607,7 +2653,7 @@ rebalance: page = __alloc_pages_high_priority(gfp_mask, order, zonelist, high_zoneidx, nodemask, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) { goto got_pg; } @@ -2629,17 +2675,16 @@ rebalance: * Try direct compaction. The first pass is asynchronous. Subsequent * attempts after direct reclaim are synchronous */ - page = __alloc_pages_direct_compact(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - migratetype, sync_migration, - &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, + high_zoneidx, nodemask, alloc_flags, + preferred_zone, + classzone_idx, migratetype, + migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); if (page) goto got_pg; - sync_migration = true; + migration_mode = MIGRATE_SYNC_LIGHT; /* * If compaction is deferred for high-order allocations, it is because @@ -2656,7 +2701,8 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + classzone_idx, migratetype, + &did_some_progress); if (page) goto got_pg; @@ -2675,7 +2721,7 @@ rebalance: page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, - migratetype); + classzone_idx, migratetype); if (page) goto got_pg; @@ -2714,12 +2760,11 @@ rebalance: * direct reclaim and reclaim/compaction depends on compaction * being called after reclaim so call directly if necessary */ - page = __alloc_pages_direct_compact(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - migratetype, sync_migration, - &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, + high_zoneidx, nodemask, alloc_flags, + preferred_zone, + classzone_idx, migratetype, + migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); if (page) @@ -2745,11 +2790,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zone *preferred_zone; + struct zoneref *preferred_zoneref; struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; struct mem_cgroup *memcg = NULL; + int classzone_idx; gfp_mask &= gfp_allowed_mask; @@ -2776,42 +2823,26 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, return NULL; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); /* The preferred zone is used for statistics later */ - first_zones_zonelist(zonelist, high_zoneidx, + preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, nodemask ? : &cpuset_current_mems_allowed, &preferred_zone); if (!preferred_zone) goto out; + classzone_idx = zonelist_zone_idx(preferred_zoneref); #ifdef CONFIG_CMA if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif -retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (unlikely(!page)) { /* - * The first pass makes sure allocations are spread - * fairly within the local node. However, the local - * node might have free pages left after the fairness - * batches are exhausted, and remote zones haven't - * even been considered yet. Try once more without - * fairness, and include remote zones now, before - * entering the slowpath and waking kswapd: prefer - * spilling to a remote zone over swapping locally. - */ - if (alloc_flags & ALLOC_FAIR) { - reset_alloc_batches(zonelist, high_zoneidx, - preferred_zone); - alloc_flags &= ~ALLOC_FAIR; - goto retry; - } - /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not * complete. @@ -2819,7 +2850,7 @@ retry: gfp_mask = memalloc_noio_flags(gfp_mask); page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); } trace_mm_page_alloc(page, order, gfp_mask, migratetype); @@ -2831,7 +2862,7 @@ out: * the mask is being updated. If a page allocation is about to fail, * check if the cpuset changed during allocation and if so, retry. */ - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; memcg_kmem_commit_charge(page, memcg, order); @@ -2870,7 +2901,7 @@ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) - free_hot_cold_page(page, 0); + free_hot_cold_page(page, false); else __free_pages_ok(page, order); } @@ -3099,9 +3130,9 @@ bool skip_free_areas_node(unsigned int flags, int nid) goto out; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); ret = !node_isset(nid, cpuset_current_mems_allowed); - } while (!put_mems_allowed(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); out: return ret; } @@ -3254,12 +3285,12 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_BOUNCE)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), - zone->pages_scanned, + K(zone_page_state(zone, NR_PAGES_SCANNED)), (!zone_reclaimable(zone) ? "yes" : "no") ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) - printk(" %lu", zone->lowmem_reserve[i]); + printk(" %ld", zone->lowmem_reserve[i]); printk("\n"); } @@ -3999,6 +4030,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) struct page *page; unsigned long block_migratetype; int reserve; + int old_reserve; /* * Get the start pfn, end pfn and the number of blocks to reserve @@ -4020,6 +4052,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) * future allocation of hugepages at runtime. */ reserve = min(2, reserve); + old_reserve = zone->nr_migrate_reserve_block; + + /* When memory hot-add, we almost always need to do nothing */ + if (reserve == old_reserve) + return; + zone->nr_migrate_reserve_block = reserve; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { if (!pfn_valid(pfn)) @@ -4057,6 +4095,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) reserve--; continue; } + } else if (!old_reserve) { + /* + * At boot time we don't need to scan the whole zone + * for turning off MIGRATE_RESERVE. + */ + break; } /* @@ -4136,7 +4180,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, static void __meminit zone_init_free_lists(struct zone *zone) { - int order, t; + unsigned int order, t; for_each_migratetype_order(order, t) { INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); zone->free_area[order].nr_free = 0; @@ -4959,7 +5003,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; - init_zone_allows_reclaim(nid); + if (node_state(nid, N_MEMORY)) + init_zone_allows_reclaim(nid); #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); #endif @@ -5549,7 +5594,7 @@ static void calculate_totalreserve_pages(void) for_each_online_pgdat(pgdat) { for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; - unsigned long max = 0; + long max = 0; /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { @@ -5791,7 +5836,12 @@ module_init(init_per_zone_wmark_min) int min_free_kbytes_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, buffer, length, ppos); + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + if (write) { user_min_free_kbytes = min_free_kbytes; setup_per_zone_wmarks(); @@ -6033,17 +6083,16 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) * @end_bitidx: The last bit of interest * returns pageblock_bits flags */ -unsigned long get_pageblock_flags_mask(struct page *page, +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, unsigned long end_bitidx, unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx, word_bitidx; + unsigned long bitidx, word_bitidx; unsigned long word; zone = page_zone(page); - pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); word_bitidx = bitidx / BITS_PER_LONG; @@ -6055,25 +6104,25 @@ unsigned long get_pageblock_flags_mask(struct page *page, } /** - * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages * @page: The page within the block of interest * @start_bitidx: The first bit of interest * @end_bitidx: The last bit of interest * @flags: The flags to set */ -void set_pageblock_flags_mask(struct page *page, unsigned long flags, +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, unsigned long end_bitidx, unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx, word_bitidx; + unsigned long bitidx, word_bitidx; unsigned long old_word, word; BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); zone = page_zone(page); - pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); word_bitidx = bitidx / BITS_PER_LONG; @@ -6251,7 +6300,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - 0, MIGRATE_SYNC, MR_CMA); + NULL, 0, cc->mode, MR_CMA); } if (ret < 0) { putback_movable_pages(&cc->migratepages); @@ -6290,7 +6339,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .nr_migratepages = 0, .order = -1, .zone = page_zone(pfn_to_page(start)), - .sync = true, + .mode = MIGRATE_SYNC, .ignore_skip_hint = true, }; INIT_LIST_HEAD(&cc.migratepages); @@ -6445,7 +6494,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { struct page *page; struct zone *zone; - int order, i; + unsigned int order, i; unsigned long pfn; unsigned long flags; /* find the first valid pfn */ @@ -6497,7 +6546,7 @@ bool is_free_buddy_page(struct page *page) struct zone *zone = page_zone(page); unsigned long pfn = page_to_pfn(page); unsigned long flags; - int order; + unsigned int order; spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { diff --git a/mm/readahead.c b/mm/readahead.c index e4ed04149785..0f35e983bffb 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -8,9 +8,7 @@ */ #include <linux/kernel.h> -#include <linux/fs.h> #include <linux/gfp.h> -#include <linux/mm.h> #include <linux/export.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> @@ -20,6 +18,8 @@ #include <linux/syscalls.h> #include <linux/file.h> +#include "internal.h" + /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. @@ -149,8 +149,7 @@ out: * * Returns the number of pages requested, or the maximum amount of I/O allowed. */ -static int -__do_page_cache_readahead(struct address_space *mapping, struct file *filp, +int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size) { @@ -179,7 +178,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, page_offset); rcu_read_unlock(); - if (page) + if (page && !radix_tree_exceptional_entry(page)) continue; page = page_cache_alloc_readahead(mapping); @@ -237,28 +236,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, return ret; } +#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE) /* * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a * sensible upper limit. */ unsigned long max_sane_readahead(unsigned long nr) { - return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) - + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); -} - -/* - * Submit IO for the read-ahead request in file_ra_state. - */ -unsigned long ra_submit(struct file_ra_state *ra, - struct address_space *mapping, struct file *filp) -{ - int actual; - - actual = __do_page_cache_readahead(mapping, filp, - ra->start, ra->size, ra->async_size); - - return actual; + return min(nr, MAX_READAHEAD); } /* @@ -351,7 +336,7 @@ static pgoff_t count_history_pages(struct address_space *mapping, pgoff_t head; rcu_read_lock(); - head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); + head = page_cache_prev_hole(mapping, offset - 1, max); rcu_read_unlock(); return offset - 1 - head; @@ -401,6 +386,7 @@ ondemand_readahead(struct address_space *mapping, unsigned long req_size) { unsigned long max = max_sane_readahead(ra->ra_pages); + pgoff_t prev_offset; /* * start of file @@ -430,7 +416,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); + start = page_cache_next_hole(mapping, offset + 1, max); rcu_read_unlock(); if (!start || start - offset > max) @@ -452,8 +438,11 @@ ondemand_readahead(struct address_space *mapping, /* * sequential cache miss + * trivial case: (offset - prev_offset) == 1 + * unaligned reads: (offset - prev_offset) == 0 */ - if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) + prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT; + if (offset - prev_offset <= 1UL) goto initial_readahead; /* diff --git a/mm/shmem.c b/mm/shmem.c index 0da81aaeb4cc..ab05681f41cd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -243,19 +243,17 @@ static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { void **pslot; - void *item = NULL; + void *item; VM_BUG_ON(!expected); + VM_BUG_ON(!replacement); pslot = radix_tree_lookup_slot(&mapping->page_tree, index); - if (pslot) - item = radix_tree_deref_slot_protected(pslot, - &mapping->tree_lock); + if (!pslot) + return -ENOENT; + item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock); if (item != expected) return -ENOENT; - if (replacement) - radix_tree_replace_slot(pslot, replacement); - else - radix_tree_delete(&mapping->page_tree, index); + radix_tree_replace_slot(pslot, replacement); return 0; } @@ -332,84 +330,20 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) } /* - * Like find_get_pages, but collecting swap entries as well as pages. - */ -static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, - pgoff_t start, unsigned int nr_pages, - struct page **pages, pgoff_t *indices) -{ - void **slot; - unsigned int ret = 0; - struct radix_tree_iter iter; - - if (!nr_pages) - return 0; - - rcu_read_lock(); -restart: - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - struct page *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) - continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; - /* - * Otherwise, we must be storing a swap entry - * here as an exceptional entry: so return it - * without attempting to raise page count. - */ - goto export; - } - if (!page_cache_get_speculative(page)) - goto repeat; - - /* Has the page moved? */ - if (unlikely(page != *slot)) { - page_cache_release(page); - goto repeat; - } -export: - indices[ret] = iter.index; - pages[ret] = page; - if (++ret == nr_pages) - break; - } - rcu_read_unlock(); - return ret; -} - -/* * Remove swap entry from radix tree, free the swap and its page cache. */ static int shmem_free_swap(struct address_space *mapping, pgoff_t index, void *radswap) { - int error; + void *old; spin_lock_irq(&mapping->tree_lock); - error = shmem_radix_tree_replace(mapping, index, radswap, NULL); + old = radix_tree_delete_item(&mapping->page_tree, index, radswap); spin_unlock_irq(&mapping->tree_lock); - if (!error) - free_swap_and_cache(radix_to_swp_entry(radswap)); - return error; -} - -/* - * Pagevec may contain swap entries, so shuffle up pages before releasing. - */ -static void shmem_deswap_pagevec(struct pagevec *pvec) -{ - int i, j; - - for (i = 0, j = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - if (!radix_tree_exceptional_entry(page)) - pvec->pages[j++] = page; - } - pvec->nr = j; + if (old != radswap) + return -ENOENT; + free_swap_and_cache(radix_to_swp_entry(radswap)); + return 0; } /* @@ -430,12 +364,12 @@ void shmem_unlock_mapping(struct address_space *mapping) * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it * has finished, if it hits a row of PAGEVEC_SIZE swap entries. */ - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - PAGEVEC_SIZE, pvec.pages, indices); + pvec.nr = find_get_entries(mapping, index, + PAGEVEC_SIZE, pvec.pages, indices); if (!pvec.nr) break; index = indices[pvec.nr - 1] + 1; - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); check_move_unevictable_pages(pvec.pages, pvec.nr); pagevec_release(&pvec); cond_resched(); @@ -467,9 +401,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, pagevec_init(&pvec, 0); index = start; while (index < end) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); + pvec.nr = find_get_entries(mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + pvec.pages, indices); if (!pvec.nr) break; mem_cgroup_uncharge_start(); @@ -498,7 +432,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } unlock_page(page); } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -536,9 +470,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, index = start; while (index < end) { cond_resched(); - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + + pvec.nr = find_get_entries(mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); + pvec.pages, indices); if (!pvec.nr) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) @@ -581,7 +516,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } unlock_page(page); } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); index++; @@ -1090,7 +1025,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return -EFBIG; repeat: swap.val = 0; - page = find_lock_page(mapping, index); + page = find_lock_entry(mapping, index); if (radix_tree_exceptional_entry(page)) { swap = radix_to_swp_entry(page); page = NULL; @@ -1102,6 +1037,9 @@ repeat: goto failed; } + if (page && sgp == SGP_WRITE) + mark_page_accessed(page); + /* fallocated page? */ if (page && !PageUptodate(page)) { if (sgp != SGP_READ) @@ -1183,6 +1121,9 @@ repeat: shmem_recalc_inode(inode); spin_unlock(&info->lock); + if (sgp == SGP_WRITE) + mark_page_accessed(page); + delete_from_swap_cache(page); set_page_dirty(page); swap_free(swap); @@ -1207,8 +1148,11 @@ repeat: goto decused; } - SetPageSwapBacked(page); + __SetPageSwapBacked(page); __set_page_locked(page); + if (sgp == SGP_WRITE) + init_page_accessed(page); + error = mem_cgroup_cache_charge(page, current->mm, gfp & GFP_RECLAIM_MASK); if (error) @@ -1485,6 +1429,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; } +bool shmem_mapping(struct address_space *mapping) +{ + return mapping->backing_dev_info == &shmem_backing_dev_info; +} + #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; @@ -1797,7 +1746,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, pagevec_init(&pvec, 0); pvec.nr = 1; /* start small: we may be there already */ while (!done) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + pvec.nr = find_get_entries(mapping, index, pvec.nr, pvec.pages, indices); if (!pvec.nr) { if (whence == SEEK_DATA) @@ -1824,7 +1773,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, break; } } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); pvec.nr = PAGEVEC_SIZE; cond_resched(); diff --git a/mm/slab.c b/mm/slab.c index 2580db062df9..eb4078c7d183 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -930,7 +930,8 @@ static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, { if (unlikely(pfmemalloc_active)) { /* Some pfmemalloc slabs exist, check if this is one */ - struct page *page = virt_to_head_page(objp); + struct slab *slabp = virt_to_slab(objp); + struct page *page = virt_to_head_page(slabp->s_mem); if (PageSlabPfmemalloc(page)) set_obj_pfmemalloc(&objp); } @@ -1776,7 +1777,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) __SetPageSlab(page + i); if (page->pfmemalloc) - SetPageSlabPfmemalloc(page + i); + SetPageSlabPfmemalloc(page); } memcg_bind_pages(cachep, cachep->gfporder); @@ -1809,9 +1810,10 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) else sub_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_freed); + + __ClearPageSlabPfmemalloc(page); while (i--) { BUG_ON(!PageSlab(page)); - __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); page++; } @@ -3220,7 +3222,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(slab_node(), flags); retry: @@ -3276,7 +3278,7 @@ retry: } } - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) + if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return obj; } diff --git a/mm/slub.c b/mm/slub.c index a1646484f76e..90154580b304 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1664,7 +1664,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, return NULL; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(slab_node(), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; @@ -1676,19 +1676,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, object = get_partial_node(s, n, c, flags); if (object) { /* - * Return the object even if - * put_mems_allowed indicated that - * the cpuset mems_allowed was - * updated in parallel. It's a - * harmless race between the alloc - * and the cpuset update. + * Don't check read_mems_allowed_retry() + * here - if mems_allowed was updated in + * parallel, that was a harmless race + * between allocation and the cpuset + * update */ - put_mems_allowed(cpuset_mems_cookie); return object; } } } - } while (!put_mems_allowed(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); #endif return NULL; } diff --git a/mm/swap.c b/mm/swap.c index 05a695114c43..8ab73ba62a68 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -72,7 +72,7 @@ static void __page_cache_release(struct page *page) static void __put_single_page(struct page *page) { __page_cache_release(page); - free_hot_cold_page(page, 0); + free_hot_cold_page(page, false); } static void __put_compound_page(struct page *page) @@ -441,7 +441,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, SetPageActive(page); lru += LRU_ACTIVE; add_page_to_lru_list(page, lruvec, lru); - trace_mm_lru_activate(page, page_to_pfn(page)); + trace_mm_lru_activate(page); __count_vm_event(PGACTIVATE); update_page_reclaim_stat(lruvec, file, 1); @@ -554,12 +554,17 @@ void mark_page_accessed(struct page *page) EXPORT_SYMBOL(mark_page_accessed); /* - * Queue the page for addition to the LRU via pagevec. The decision on whether - * to add the page to the [in]active [file|anon] list is deferred until the - * pagevec is drained. This gives a chance for the caller of __lru_cache_add() - * have the page added to the active list using mark_page_accessed(). + * Used to mark_page_accessed(page) that is not visible yet and when it is + * still safe to use non-atomic ops */ -void __lru_cache_add(struct page *page) +void init_page_accessed(struct page *page) +{ + if (!PageReferenced(page)) + __SetPageReferenced(page); +} +EXPORT_SYMBOL(init_page_accessed); + +static void __lru_cache_add(struct page *page) { struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec); @@ -569,11 +574,34 @@ void __lru_cache_add(struct page *page) pagevec_add(pvec, page); put_locked_var(swapvec_lock, lru_add_pvec); } -EXPORT_SYMBOL(__lru_cache_add); + +/** + * lru_cache_add: add a page to the page lists + * @page: the page to add + */ +void lru_cache_add_anon(struct page *page) +{ + if (PageActive(page)) + ClearPageActive(page); + __lru_cache_add(page); +} + +void lru_cache_add_file(struct page *page) +{ + if (PageActive(page)) + ClearPageActive(page); + __lru_cache_add(page); +} +EXPORT_SYMBOL(lru_cache_add_file); /** * lru_cache_add - add a page to a page list * @page: the page to be added to the LRU. + * + * Queue the page for addition to the LRU via pagevec. The decision on whether + * to add the page to the [in]active [file|anon] list is deferred until the + * pagevec is drained. This gives a chance for the caller of lru_cache_add() + * have the page added to the active list using mark_page_accessed(). */ void lru_cache_add(struct page *page) { @@ -785,7 +813,7 @@ void lru_add_drain_all(void) * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() * will free it. */ -void release_pages(struct page **pages, int nr, int cold) +void release_pages(struct page **pages, int nr, bool cold) { int i; LIST_HEAD(pages_to_free); @@ -826,7 +854,7 @@ void release_pages(struct page **pages, int nr, int cold) } /* Clear Active bit in case of parallel mark_page_accessed */ - ClearPageActive(page); + __ClearPageActive(page); list_add(&page->lru, &pages_to_free); } @@ -908,7 +936,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, SetPageLRU(page); add_page_to_lru_list(page, lruvec, lru); update_page_reclaim_stat(lruvec, file, active); - trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); + trace_mm_lru_insertion(page, lru); } /* @@ -922,6 +950,57 @@ void __pagevec_lru_add(struct pagevec *pvec) EXPORT_SYMBOL(__pagevec_lru_add); /** + * pagevec_lookup_entries - gang pagecache lookup + * @pvec: Where the resulting entries are placed + * @mapping: The address_space to search + * @start: The starting entry index + * @nr_entries: The maximum number of entries + * @indices: The cache indices corresponding to the entries in @pvec + * + * pagevec_lookup_entries() will search for and return a group of up + * to @nr_entries pages and shadow entries in the mapping. All + * entries are placed in @pvec. pagevec_lookup_entries() takes a + * reference against actual pages in @pvec. + * + * The search returns a group of mapping-contiguous entries with + * ascending indexes. There may be holes in the indices due to + * not-present entries. + * + * pagevec_lookup_entries() returns the number of entries which were + * found. + */ +unsigned pagevec_lookup_entries(struct pagevec *pvec, + struct address_space *mapping, + pgoff_t start, unsigned nr_pages, + pgoff_t *indices) +{ + pvec->nr = find_get_entries(mapping, start, nr_pages, + pvec->pages, indices); + return pagevec_count(pvec); +} + +/** + * pagevec_remove_exceptionals - pagevec exceptionals pruning + * @pvec: The pagevec to prune + * + * pagevec_lookup_entries() fills both pages and exceptional radix + * tree entries into the pagevec. This function prunes all + * exceptionals from @pvec without leaving holes, so that it can be + * passed on to page-only pagevec operations. + */ +void pagevec_remove_exceptionals(struct pagevec *pvec) +{ + int i, j; + + for (i = 0, j = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + if (!radix_tree_exceptional_entry(page)) + pvec->pages[j++] = page; + } + pvec->nr = j; +} + +/** * pagevec_lookup - gang pagecache lookup * @pvec: Where the resulting pages are placed * @mapping: The address_space to search diff --git a/mm/swap_state.c b/mm/swap_state.c index e6f15f8ca2af..4079edfff2cc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void) return ret; } +static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); + void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); @@ -268,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr) for (i = 0; i < todo; i++) free_swap_cache(pagep[i]); - release_pages(pagep, todo, 0); + release_pages(pagep, todo, false); pagep += todo; nr -= todo; } @@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry) page = find_get_page(swap_address_space(entry), entry.val); - if (page) + if (page) { INC_CACHE_INFO(find_success); + if (TestClearPageReadahead(page)) + atomic_inc(&swapin_readahead_hits); + } INC_CACHE_INFO(find_total); return page; @@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return found_page; } +static unsigned long swapin_nr_pages(unsigned long offset) +{ + static unsigned long prev_offset; + unsigned int pages, max_pages, last_ra; + static atomic_t last_readahead_pages; + + max_pages = 1 << ACCESS_ONCE(page_cluster); + if (max_pages <= 1) + return 1; + + /* + * This heuristic has been found to work well on both sequential and + * random loads, swapping to hard disk or to SSD: please don't ask + * what the "+ 2" means, it just happens to work well, that's all. + */ + pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; + if (pages == 2) { + /* + * We can have no readahead hits to judge by: but must not get + * stuck here forever, so check for an adjacent offset instead + * (and don't even bother to check whether swap type is same). + */ + if (offset != prev_offset + 1 && offset != prev_offset - 1) + pages = 1; + prev_offset = offset; + } else { + unsigned int roundup = 4; + while (roundup < pages) + roundup <<= 1; + pages = roundup; + } + + if (pages > max_pages) + pages = max_pages; + + /* Don't shrink readahead too fast */ + last_ra = atomic_read(&last_readahead_pages) / 2; + if (pages < last_ra) + pages = last_ra; + atomic_set(&last_readahead_pages, pages); + + return pages; +} + /** * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory @@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { struct page *page; - unsigned long offset = swp_offset(entry); + unsigned long entry_offset = swp_offset(entry); + unsigned long offset = entry_offset; unsigned long start_offset, end_offset; - unsigned long mask = (1UL << page_cluster) - 1; + unsigned long mask; struct blk_plug plug; + mask = swapin_nr_pages(offset) - 1; + if (!mask) + goto skip; + /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; @@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, gfp_mask, vma, addr); if (!page) continue; + if (offset != entry_offset) + SetPageReadahead(page); page_cache_release(page); } blk_finish_plug(&plug); lru_add_drain(); /* Push any new pages onto the LRU now */ +skip: return read_swap_cache_async(entry, gfp_mask, vma, addr); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 0ec2eaf3ccfd..660b9c0e2e40 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages; /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority; -static atomic_t highest_priority_index = ATOMIC_INIT(-1); static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -struct swap_list_t swap_list = {-1, -1}; +/* + * all active swap_info_structs + * protected with swap_lock, and ordered by priority. + */ +PLIST_HEAD(swap_active_head); + +/* + * all available (active, not full) swap_info_structs + * protected with swap_avail_lock, ordered by priority. + * This is used by get_swap_page() instead of swap_active_head + * because swap_active_head includes all swap_info_structs, + * but get_swap_page() doesn't need to look at full ones. + * This uses its own lock instead of swap_lock because when a + * swap_info_struct changes between not-full/full, it needs to + * add/remove itself to/from this list, but the swap_info_struct->lock + * is held and the locking order requires swap_lock to be taken + * before any swap_info_struct->lock. + */ +static PLIST_HEAD(swap_avail_head); +static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -591,6 +609,9 @@ checks: if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; + spin_lock(&swap_avail_lock); + plist_del(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); } si->swap_map[offset] = usage; inc_cluster_info_page(si, si->cluster_info, offset); @@ -639,71 +660,65 @@ no_page: swp_entry_t get_swap_page(void) { - struct swap_info_struct *si; + struct swap_info_struct *si, *next; pgoff_t offset; - int type, next; - int wrapped = 0; - int hp_index; - spin_lock(&swap_lock); if (atomic_long_read(&nr_swap_pages) <= 0) goto noswap; atomic_long_dec(&nr_swap_pages); - for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { - hp_index = atomic_xchg(&highest_priority_index, -1); - /* - * highest_priority_index records current highest priority swap - * type which just frees swap entries. If its priority is - * higher than that of swap_list.next swap type, we use it. It - * isn't protected by swap_lock, so it can be an invalid value - * if the corresponding swap type is swapoff. We double check - * the flags here. It's even possible the swap type is swapoff - * and swapon again and its priority is changed. In such rare - * case, low prority swap type might be used, but eventually - * high priority swap will be used after several rounds of - * swap. - */ - if (hp_index != -1 && hp_index != type && - swap_info[type]->prio < swap_info[hp_index]->prio && - (swap_info[hp_index]->flags & SWP_WRITEOK)) { - type = hp_index; - swap_list.next = type; - } - - si = swap_info[type]; - next = si->next; - if (next < 0 || - (!wrapped && si->prio != swap_info[next]->prio)) { - next = swap_list.head; - wrapped++; - } + spin_lock(&swap_avail_lock); +start_over: + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { + /* requeue si to after same-priority siblings */ + plist_requeue(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); spin_lock(&si->lock); - if (!si->highest_bit) { + if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + if (plist_node_empty(&si->avail_list)) { + spin_unlock(&si->lock); + goto nextsi; + } + WARN(!si->highest_bit, + "swap_info %d in list but !highest_bit\n", + si->type); + WARN(!(si->flags & SWP_WRITEOK), + "swap_info %d in list but !SWP_WRITEOK\n", + si->type); + plist_del(&si->avail_list, &swap_avail_head); spin_unlock(&si->lock); - continue; + goto nextsi; } - if (!(si->flags & SWP_WRITEOK)) { - spin_unlock(&si->lock); - continue; - } - - swap_list.next = next; - spin_unlock(&swap_lock); /* This is called for allocating swap entry for cache */ offset = scan_swap_map(si, SWAP_HAS_CACHE); spin_unlock(&si->lock); if (offset) - return swp_entry(type, offset); - spin_lock(&swap_lock); - next = swap_list.next; + return swp_entry(si->type, offset); + pr_debug("scan_swap_map of si %d failed to find offset\n", + si->type); + spin_lock(&swap_avail_lock); +nextsi: + /* + * if we got here, it's likely that si was almost full before, + * and since scan_swap_map() can drop the si->lock, multiple + * callers probably all tried to get a page from the same si + * and it filled up before we could get one; or, the si filled + * up between us dropping swap_avail_lock and taking si->lock. + * Since we dropped the swap_avail_lock, the swap_avail_head + * list may have been modified; so if next is still in the + * swap_avail_head list then try it, otherwise start over. + */ + if (plist_node_empty(&next->avail_list)) + goto start_over; } + spin_unlock(&swap_avail_lock); + atomic_long_inc(&nr_swap_pages); noswap: - spin_unlock(&swap_lock); return (swp_entry_t) {0}; } @@ -765,27 +780,6 @@ out: return NULL; } -/* - * This swap type frees swap entry, check if it is the highest priority swap - * type which just frees swap entry. get_swap_page() uses - * highest_priority_index to search highest priority swap type. The - * swap_info_struct.lock can't protect us if there are multiple swap types - * active, so we use atomic_cmpxchg. - */ -static void set_highest_priority_index(int type) -{ - int old_hp_index, new_hp_index; - - do { - old_hp_index = atomic_read(&highest_priority_index); - if (old_hp_index != -1 && - swap_info[old_hp_index]->prio >= swap_info[type]->prio) - break; - new_hp_index = type; - } while (atomic_cmpxchg(&highest_priority_index, - old_hp_index, new_hp_index) != old_hp_index); -} - static unsigned char swap_entry_free(struct swap_info_struct *p, swp_entry_t entry, unsigned char usage) { @@ -827,9 +821,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, dec_cluster_info_page(p, p->cluster_info, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; - if (offset > p->highest_bit) + if (offset > p->highest_bit) { + bool was_full = !p->highest_bit; p->highest_bit = offset; - set_highest_priority_index(p->type); + if (was_full && (p->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + WARN_ON(!plist_node_empty(&p->avail_list)); + if (plist_node_empty(&p->avail_list)) + plist_add(&p->avail_list, + &swap_avail_head); + spin_unlock(&swap_avail_lock); + } + } atomic_long_inc(&nr_swap_pages); p->inuse_pages--; frontswap_invalidate_page(p->type, offset); @@ -1764,30 +1767,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { - int i, prev; - if (prio >= 0) p->prio = prio; else p->prio = --least_priority; + /* + * the plist prio is negated because plist ordering is + * low-to-high, while swap ordering is high-to-low + */ + p->list.prio = -p->prio; + p->avail_list.prio = -p->prio; p->swap_map = swap_map; p->cluster_info = cluster_info; p->flags |= SWP_WRITEOK; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; - /* insert swap space into swap_list: */ - prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { - if (p->prio >= swap_info[i]->prio) - break; - prev = i; - } - p->next = i; - if (prev < 0) - swap_list.head = swap_list.next = p->type; - else - swap_info[prev]->next = p->type; + assert_spin_locked(&swap_lock); + /* + * both lists are plists, and thus priority ordered. + * swap_active_head needs to be priority ordered for swapoff(), + * which on removal of any swap_info_struct with an auto-assigned + * (i.e. negative) priority increments the auto-assigned priority + * of any lower-priority swap_info_structs. + * swap_avail_head needs to be priority ordered for get_swap_page(), + * which allocates swap pages from the highest available priority + * swap_info_struct. + */ + plist_add(&p->list, &swap_active_head); + spin_lock(&swap_avail_lock); + plist_add(&p->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); } static void enable_swap_info(struct swap_info_struct *p, int prio, @@ -1822,8 +1832,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct address_space *mapping; struct inode *inode; struct filename *pathname; - int i, type, prev; - int err; + int err, found = 0; unsigned int old_block_size; if (!capable(CAP_SYS_ADMIN)) @@ -1841,17 +1850,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) goto out; mapping = victim->f_mapping; - prev = -1; spin_lock(&swap_lock); - for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { - p = swap_info[type]; + plist_for_each_entry(p, &swap_active_head, list) { if (p->flags & SWP_WRITEOK) { - if (p->swap_file->f_mapping == mapping) + if (p->swap_file->f_mapping == mapping) { + found = 1; break; + } } - prev = type; } - if (type < 0) { + if (!found) { err = -EINVAL; spin_unlock(&swap_lock); goto out_dput; @@ -1863,20 +1871,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - if (prev < 0) - swap_list.head = p->next; - else - swap_info[prev]->next = p->next; - if (type == swap_list.next) { - /* just pick something that's safe... */ - swap_list.next = swap_list.head; - } + spin_lock(&swap_avail_lock); + plist_del(&p->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); spin_lock(&p->lock); if (p->prio < 0) { - for (i = p->next; i >= 0; i = swap_info[i]->next) - swap_info[i]->prio = p->prio--; + struct swap_info_struct *si = p; + + plist_for_each_entry_continue(si, &swap_active_head, list) { + si->prio++; + si->list.prio--; + si->avail_list.prio--; + } least_priority++; } + plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; @@ -1884,7 +1893,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); set_current_oom_origin(); - err = try_to_unuse(type, false, 0); /* force all pages to be unused */ + err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ clear_current_oom_origin(); if (err) { @@ -1926,7 +1935,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) frontswap_map_set(p, NULL); spin_unlock(&p->lock); spin_unlock(&swap_lock); - frontswap_invalidate_area(type); + frontswap_invalidate_area(p->type); mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; @@ -1934,7 +1943,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) vfree(cluster_info); vfree(frontswap_map); /* Destroy swap account informatin */ - swap_cgroup_swapoff(type); + swap_cgroup_swapoff(p->type); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { @@ -2141,8 +2150,9 @@ static struct swap_info_struct *alloc_swap_info(void) */ } INIT_LIST_HEAD(&p->first_swap_extent.list); + plist_node_init(&p->list, 0); + plist_node_init(&p->avail_list, 0); p->flags = SWP_USED; - p->next = -1; spin_unlock(&swap_lock); spin_lock_init(&p->lock); diff --git a/mm/truncate.c b/mm/truncate.c index 353b683afd6e..2e84fe59190b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -22,6 +22,22 @@ #include <linux/cleancache.h> #include "internal.h" +static void clear_exceptional_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + /* Handled by shmem itself */ + if (shmem_mapping(mapping)) + return; + + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + radix_tree_delete_item(&mapping->page_tree, index, entry); + spin_unlock_irq(&mapping->tree_lock); +} /** * do_invalidatepage - invalidate part or all of a page @@ -208,6 +224,7 @@ void truncate_inode_pages_range(struct address_space *mapping, unsigned int partial_start; /* inclusive */ unsigned int partial_end; /* exclusive */ struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; @@ -238,17 +255,23 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_init(&pvec, 0); index = start; - while (index < end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE))) { + while (index < end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index >= end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + if (!trylock_page(page)) continue; WARN_ON(page->index != index); @@ -259,6 +282,7 @@ void truncate_inode_pages_range(struct address_space *mapping, truncate_inode_page(mapping, page); unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -307,14 +331,16 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE))) { + if (!pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { if (index == start) break; index = start; continue; } - if (index == start && pvec.pages[0]->index >= end) { + if (index == start && indices[0] >= end) { + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); break; } @@ -323,16 +349,22 @@ void truncate_inode_pages_range(struct address_space *mapping, struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index >= end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + lock_page(page); WARN_ON(page->index != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); index++; @@ -375,6 +407,7 @@ EXPORT_SYMBOL(truncate_inode_pages); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { + pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index = start; unsigned long ret; @@ -390,17 +423,23 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, */ pagevec_init(&pvec, 0); - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index > end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + if (!trylock_page(page)) continue; WARN_ON(page->index != index); @@ -414,6 +453,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, deactivate_page(page); count += ret; } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -481,6 +521,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { + pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index; int i; @@ -491,17 +532,23 @@ int invalidate_inode_pages2_range(struct address_space *mapping, cleancache_invalidate_inode(mapping); pagevec_init(&pvec, 0); index = start; - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index > end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + lock_page(page); WARN_ON(page->index != index); if (page->mapping != mapping) { @@ -539,6 +586,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, ret = ret2; unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); diff --git a/mm/vmacache.c b/mm/vmacache.c new file mode 100644 index 000000000000..1037a3bab505 --- /dev/null +++ b/mm/vmacache.c @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2014 Davidlohr Bueso. + */ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/vmacache.h> + +/* + * Flush vma caches for threads that share a given mm. + * + * The operation is safe because the caller holds the mmap_sem + * exclusively and other threads accessing the vma cache will + * have mmap_sem held at least for read, so no extra locking + * is required to maintain the vma cache. + */ +void vmacache_flush_all(struct mm_struct *mm) +{ + struct task_struct *g, *p; + + rcu_read_lock(); + for_each_process_thread(g, p) { + /* + * Only flush the vmacache pointers as the + * mm seqnum is already set and curr's will + * be set upon invalidation when the next + * lookup is done. + */ + if (mm == p->mm) + vmacache_flush(p); + } + rcu_read_unlock(); +} + +/* + * This task may be accessing a foreign mm via (for example) + * get_user_pages()->find_vma(). The vmacache is task-local and this + * task's vmacache pertains to a different mm (ie, its own). There is + * nothing we can do here. + * + * Also handle the case where a kernel thread has adopted this mm via use_mm(). + * That kernel thread's vmacache is not applicable to this mm. + */ +static bool vmacache_valid_mm(struct mm_struct *mm) +{ + return current->mm == mm && !(current->flags & PF_KTHREAD); +} + +void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) +{ + if (vmacache_valid_mm(newvma->vm_mm)) + current->vmacache[VMACACHE_HASH(addr)] = newvma; +} + +static bool vmacache_valid(struct mm_struct *mm) +{ + struct task_struct *curr; + + if (!vmacache_valid_mm(mm)) + return false; + + curr = current; + if (mm->vmacache_seqnum != curr->vmacache_seqnum) { + /* + * First attempt will always be invalid, initialize + * the new cache for this task here. + */ + curr->vmacache_seqnum = mm->vmacache_seqnum; + vmacache_flush(curr); + return false; + } + return true; +} + +struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) +{ + int i; + + if (!vmacache_valid(mm)) + return NULL; + + for (i = 0; i < VMACACHE_SIZE; i++) { + struct vm_area_struct *vma = current->vmacache[i]; + + if (!vma) + continue; + if (WARN_ON_ONCE(vma->vm_mm != mm)) + break; + if (vma->vm_start <= addr && vma->vm_end > addr) + return vma; + } + + return NULL; +} + +#ifndef CONFIG_MMU +struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + int i; + + if (!vmacache_valid(mm)) + return NULL; + + for (i = 0; i < VMACACHE_SIZE; i++) { + struct vm_area_struct *vma = current->vmacache[i]; + + if (vma && vma->vm_start == start && vma->vm_end == end) + return vma; + } + + return NULL; +} +#endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4283488e9b1c..83e33b8c2178 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2688,14 +2688,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi) prev_end = VMALLOC_START; - spin_lock(&vmap_area_lock); + rcu_read_lock(); if (list_empty(&vmap_area_list)) { vmi->largest_chunk = VMALLOC_TOTAL; goto out; } - list_for_each_entry(va, &vmap_area_list, list) { + list_for_each_entry_rcu(va, &vmap_area_list, list) { unsigned long addr = va->va_start; /* @@ -2722,7 +2722,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi) vmi->largest_chunk = VMALLOC_END - prev_end; out: - spin_unlock(&vmap_area_lock); + rcu_read_unlock(); } #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 5ad29b2925a0..5461d02ea718 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -163,7 +163,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) bool zone_reclaimable(struct zone *zone) { - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; + return zone_page_state(zone, NR_PAGES_SCANNED) < + zone_reclaimable_pages(zone) * 6; } static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) @@ -224,15 +225,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, unsigned long freed = 0; unsigned long long delta; long total_scan; - long max_pass; + long freeable; long nr; long new_nr; int nid = shrinkctl->nid; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; - max_pass = shrinker->count_objects(shrinker, shrinkctl); - if (max_pass == 0) + freeable = shrinker->count_objects(shrinker, shrinkctl); + if (freeable == 0) return 0; /* @@ -244,14 +245,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, total_scan = nr; delta = (4 * nr_pages_scanned) / shrinker->seeks; - delta *= max_pass; + delta *= freeable; do_div(delta, lru_pages + 1); total_scan += delta; if (total_scan < 0) { printk(KERN_ERR "shrink_slab: %pF negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); - total_scan = max_pass; + total_scan = freeable; } /* @@ -260,38 +261,55 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, * shrinkers to return -1 all the time. This results in a large * nr being built up so when a shrink that can do some work * comes along it empties the entire cache due to nr >>> - * max_pass. This is bad for sustaining a working set in + * freeable. This is bad for sustaining a working set in * memory. * * Hence only allow the shrinker to scan the entire cache when * a large delta change is calculated directly. */ - if (delta < max_pass / 4) - total_scan = min(total_scan, max_pass / 2); + if (delta < freeable / 4) + total_scan = min(total_scan, freeable / 2); /* * Avoid risking looping forever due to too large nr value: * never try to free more than twice the estimate number of * freeable entries. */ - if (total_scan > max_pass * 2) - total_scan = max_pass * 2; + if (total_scan > freeable * 2) + total_scan = freeable * 2; trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, nr_pages_scanned, lru_pages, - max_pass, delta, total_scan); + freeable, delta, total_scan); - while (total_scan >= batch_size) { + /* + * Normally, we should not scan less than batch_size objects in one + * pass to avoid too frequent shrinker calls, but if the slab has less + * than batch_size objects in total and we are really tight on memory, + * we will try to reclaim all available objects, otherwise we can end + * up failing allocations although there are plenty of reclaimable + * objects spread over several slabs with usage less than the + * batch_size. + * + * We detect the "tight on memory" situations by looking at the total + * number of objects we want to scan (total_scan). If it is greater + * than the total number of objects on slab (freeable), we must be + * scanning at high prio and therefore should try to reclaim as much as + * possible. + */ + while (total_scan >= batch_size || + total_scan >= freeable) { unsigned long ret; + unsigned long nr_to_scan = min(batch_size, total_scan); - shrinkctl->nr_to_scan = batch_size; + shrinkctl->nr_to_scan = nr_to_scan; ret = shrinker->scan_objects(shrinker, shrinkctl); if (ret == SHRINK_STOP) break; freed += ret; - count_vm_events(SLABS_SCANNED, batch_size); - total_scan -= batch_size; + count_vm_events(SLABS_SCANNED, nr_to_scan); + total_scan -= nr_to_scan; cond_resched(); } @@ -352,16 +370,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl, } list_for_each_entry(shrinker, &shrinker_list, list) { - for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { - if (!node_online(shrinkctl->nid)) - continue; - - if (!(shrinker->flags & SHRINKER_NUMA_AWARE) && - (shrinkctl->nid != 0)) - break; - + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { + shrinkctl->nid = 0; freed += shrink_slab_node(shrinkctl, shrinker, - nr_pages_scanned, lru_pages); + nr_pages_scanned, lru_pages); + continue; + } + + for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { + if (node_online(shrinkctl->nid)) + freed += shrink_slab_node(shrinkctl, shrinker, + nr_pages_scanned, lru_pages); } } @@ -1089,7 +1108,7 @@ keep: VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } - free_hot_cold_page_list(&free_pages, 1); + free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); @@ -1126,7 +1145,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, TTU_UNMAP|TTU_IGNORE_ACCESS, &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); list_splice(&clean_pages, page_list); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); return ret; } @@ -1452,7 +1471,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); if (global_reclaim(sc)) { - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); else @@ -1487,7 +1506,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&page_list, 1); + free_hot_cold_page_list(&page_list, true); /* * If reclaim is isolating dirty pages under writeback, it implies @@ -1641,7 +1660,7 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); if (global_reclaim(sc)) - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); reclaim_stat->recent_scanned[file] += nr_taken; @@ -1707,7 +1726,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&l_hold, 1); + free_hot_cold_page_list(&l_hold, true); } #ifdef CONFIG_SWAP @@ -1829,7 +1848,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, struct zone *zone = lruvec_zone(lruvec); unsigned long anon_prio, file_prio; enum scan_balance scan_balance; - unsigned long anon, file, free; + unsigned long anon, file; bool force_scan = false; unsigned long ap, fp; enum lru_list lru; @@ -1877,11 +1896,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, goto out; } - anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + - get_lru_size(lruvec, LRU_INACTIVE_ANON); - file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + - get_lru_size(lruvec, LRU_INACTIVE_FILE); - /* * If it's foreseeable that reclaiming the file cache won't be * enough to get the zone back into a desirable shape, we have @@ -1889,8 +1903,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * thrashing - remaining file pages alone. */ if (global_reclaim(sc)) { - free = zone_page_state(zone, NR_FREE_PAGES); - if (unlikely(file + free <= high_wmark_pages(zone))) { + unsigned long zonefile; + unsigned long zonefree; + + zonefree = zone_page_state(zone, NR_FREE_PAGES); + zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { scan_balance = SCAN_ANON; goto out; } @@ -1925,6 +1945,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * * anon in [0], file in [1] */ + + anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + + get_lru_size(lruvec, LRU_INACTIVE_ANON); + file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + + get_lru_size(lruvec, LRU_INACTIVE_FILE); + spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; @@ -2000,13 +2026,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; - bool scan_adjusted = false; + bool scan_adjusted; get_scan_count(lruvec, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); + /* + * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal + * event that can occur when there is little memory pressure e.g. + * multiple streaming readers/writers. Hence, we do not abort scanning + * when the requested number of pages are reclaimed when scanning at + * DEF_PRIORITY on the assumption that the fact we are direct + * reclaiming implies that kswapd is not keeping up and it is best to + * do a batch of work at once. For memcg reclaim one check is made to + * abort proportional reclaim if either the file or anon lru has already + * dropped to zero at the first pass. + */ + scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { @@ -2027,17 +2067,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) continue; /* - * For global direct reclaim, reclaim only the number of pages - * requested. Less care is taken to scan proportionally as it - * is more important to minimise direct reclaim stall latency - * than it is to properly age the LRU lists. - */ - if (global_reclaim(sc) && !current_is_kswapd()) - break; - - /* * For kswapd and memcg, reclaim at least the number of pages - * requested. Ensure that the anon and file LRUs shrink + * requested. Ensure that the anon and file LRUs are scanned * proportionally what was requested by get_scan_count(). We * stop reclaiming one LRU and reduce the amount scanning * proportional to the original scan target. @@ -2045,6 +2076,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + /* + * It's just vindictive to attack the larger once the smaller + * has gone to zero. And given the way we stop scanning the + * smaller below, this makes sure that we only make one nudge + * towards proportionality once we've got nr_to_reclaim. + */ + if (!nr_file || !nr_anon) + break; + if (nr_file > nr_anon) { unsigned long scan_target = targets[LRU_INACTIVE_ANON] + targets[LRU_ACTIVE_ANON] + 1; @@ -2406,8 +2446,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, unsigned long lru_pages = 0; nodes_clear(shrink->nodes_to_scan); - for_each_zone_zonelist(zone, z, zonelist, - gfp_zone(sc->gfp_mask)) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(sc->gfp_mask), sc->nodemask) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; diff --git a/mm/vmstat.c b/mm/vmstat.c index efea337fb16e..9f16fd3ee575 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, continue; threshold = (*calculate_pressure)(zone); - for_each_possible_cpu(cpu) + for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; } @@ -767,6 +767,7 @@ const char * const vmstat_text[] = { "nr_shmem", "nr_dirtied", "nr_written", + "nr_pages_scanned", #ifdef CONFIG_NUMA "numa_hit", @@ -857,12 +858,14 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif +#ifdef CONFIG_DEBUG_TLBFLUSH #ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", -#endif +#endif /* CONFIG_SMP */ "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", +#endif /* CONFIG_DEBUG_TLBFLUSH */ #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; @@ -1059,7 +1062,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - zone->pages_scanned, + zone_page_state(zone, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); @@ -1069,10 +1072,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone_page_state(zone, i)); seq_printf(m, - "\n protection: (%lu", + "\n protection: (%ld", zone->lowmem_reserve[0]); for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) - seq_printf(m, ", %lu", zone->lowmem_reserve[i]); + seq_printf(m, ", %ld", zone->lowmem_reserve[i]); seq_printf(m, ")" "\n pagesets"); |