diff options
Diffstat (limited to 'mm')
63 files changed, 4885 insertions, 3090 deletions
diff --git a/mm/Makefile b/mm/Makefile index 6f9ffa968a1a..8083fa85a348 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -52,7 +52,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o percpu.o slab_common.o \ - compaction.o vmacache.o \ + compaction.o \ interval_tree.o list_lru.o workingset.o \ debug.o gup.o mmap_lock.o $(mmu-y) @@ -133,3 +133,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o +obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o diff --git a/mm/compaction.c b/mm/compaction.c index a2c53fcf933e..f1dfeb6a4090 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -505,6 +505,25 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags, return true; } +static struct lruvec * +compact_folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags, + struct compact_control *cc) +{ + struct lruvec *lruvec; + + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); + compact_lock_irqsave(&lruvec->lru_lock, flags, cc); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock_irqrestore(&lruvec->lru_lock, *flags); + goto retry; + } + rcu_read_unlock(); + + return lruvec; +} + /* * Compaction requires the taking of some coarse locks that are potentially * very heavily contended. The lock should be periodically unlocked to avoid @@ -831,6 +850,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Time to isolate some pages for migration */ for (; low_pfn < end_pfn; low_pfn++) { + struct folio *folio; if (skip_on_failure && low_pfn >= next_skip_pfn) { /* @@ -861,7 +881,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (!(low_pfn % COMPACT_CLUSTER_MAX)) { if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } @@ -974,7 +994,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (unlikely(__PageMovable(page)) && !PageIsolated(page)) { if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } @@ -1053,18 +1073,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!TestClearPageLRU(page)) goto isolate_fail_put; - lruvec = folio_lruvec(page_folio(page)); + folio = page_folio(page); + lruvec = folio_lruvec(folio); /* If we already hold the lock, we can skip some rechecking */ if (lruvec != locked) { if (locked) - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); - compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); + lruvec = compact_folio_lruvec_lock_irqsave(folio, &flags, cc); locked = lruvec; - lruvec_memcg_debug(lruvec, page_folio(page)); - /* Try get exclusive access under lock */ if (!skip_updated) { skip_updated = true; @@ -1117,7 +1136,7 @@ isolate_success_no_list: isolate_fail_put: /* Avoid potential deadlock in freeing page under lru_lock */ if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } put_page(page); @@ -1133,7 +1152,7 @@ isolate_fail: */ if (nr_isolated) { if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } putback_movable_pages(&cc->migratepages); @@ -1165,7 +1184,7 @@ isolate_fail: isolate_abort: if (locked) - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); if (page) { SetPageLRU(page); put_page(page); @@ -3009,7 +3028,7 @@ void kcompactd_run(int nid) /* * Called by memory hotplug when all memory in a node is offlined. Caller must - * hold mem_hotplug_begin/end(). + * be holding mem_hotplug_begin/done(). */ void kcompactd_stop(int nid) { diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 9b559c76d6dd..66265e3a9c65 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -92,4 +92,12 @@ config DAMON_RECLAIM reclamation under light memory pressure, while the traditional page scanning-based reclamation is used for heavy pressure. +config DAMON_LRU_SORT + bool "Build DAMON-based LRU-lists sorting (DAMON_LRU_SORT)" + depends on DAMON_PADDR + help + This builds the DAMON-based LRU-lists sorting subsystem. It tries to + protect frequently accessed (hot) pages while rarely accessed (cold) + pages reclaimed first under memory pressure. + endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index dbf7190b4144..3e6b8ad73858 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o obj-$(CONFIG_DAMON_SYSFS) += sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o +obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index a0dab8b5e45f..cb8a7e9926a4 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -97,6 +97,31 @@ out: return ret; } +/* + * Return corresponding dbgfs' scheme action value (int) for the given + * damos_action if the given damos_action value is valid and supported by + * dbgfs, negative error code otherwise. + */ +static int damos_action_to_dbgfs_scheme_action(enum damos_action action) +{ + switch (action) { + case DAMOS_WILLNEED: + return 0; + case DAMOS_COLD: + return 1; + case DAMOS_PAGEOUT: + return 2; + case DAMOS_HUGEPAGE: + return 3; + case DAMOS_NOHUGEPAGE: + return 4; + case DAMOS_STAT: + return 5; + default: + return -EINVAL; + } +} + static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) { struct damos *s; @@ -109,7 +134,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, - s->action, + damos_action_to_dbgfs_scheme_action(s->action), s->quota.ms, s->quota.sz, s->quota.reset_interval, s->quota.weight_sz, @@ -160,18 +185,27 @@ static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes) kfree(schemes); } -static bool damos_action_valid(int action) +/* + * Return corresponding damos_action for the given dbgfs input for a scheme + * action if the input is valid, negative error code otherwise. + */ +static enum damos_action dbgfs_scheme_action_to_damos_action(int dbgfs_action) { - switch (action) { - case DAMOS_WILLNEED: - case DAMOS_COLD: - case DAMOS_PAGEOUT: - case DAMOS_HUGEPAGE: - case DAMOS_NOHUGEPAGE: - case DAMOS_STAT: - return true; + switch (dbgfs_action) { + case 0: + return DAMOS_WILLNEED; + case 1: + return DAMOS_COLD; + case 2: + return DAMOS_PAGEOUT; + case 3: + return DAMOS_HUGEPAGE; + case 4: + return DAMOS_NOHUGEPAGE; + case 5: + return DAMOS_STAT; default: - return false; + return -EINVAL; } } @@ -189,7 +223,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, int pos = 0, parsed, ret; unsigned long min_sz, max_sz; unsigned int min_nr_a, max_nr_a, min_age, max_age; - unsigned int action; + unsigned int action_input; + enum damos_action action; schemes = kmalloc_array(max_nr_schemes, sizeof(scheme), GFP_KERNEL); @@ -204,7 +239,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n", &min_sz, &max_sz, &min_nr_a, &max_nr_a, - &min_age, &max_age, &action, "a.ms, + &min_age, &max_age, &action_input, "a.ms, "a.sz, "a.reset_interval, "a.weight_sz, "a.weight_nr_accesses, "a.weight_age, &wmarks.metric, @@ -212,7 +247,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, &wmarks.low, &parsed); if (ret != 18) break; - if (!damos_action_valid(action)) + action = dbgfs_scheme_action_to_damos_action(action_input); + if ((int)action < 0) goto fail; if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age) @@ -275,11 +311,6 @@ out: return ret; } -static inline bool target_has_pid(const struct damon_ctx *ctx) -{ - return ctx->ops.id == DAMON_OPS_VADDR; -} - static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) { struct damon_target *t; @@ -288,7 +319,7 @@ static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) int rc; damon_for_each_target(t, ctx) { - if (target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) /* Show pid numbers to debugfs users */ id = pid_vnr(t->pid); else @@ -415,7 +446,7 @@ static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, struct damon_target *t, *next; damon_for_each_target_safe(t, next, ctx) { - if (target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) put_pid(t->pid); damon_destroy_target(t); } @@ -425,11 +456,11 @@ static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, if (!t) { damon_for_each_target_safe(t, next, ctx) damon_destroy_target(t); - if (target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) dbgfs_put_pids(pids, nr_targets); return -ENOMEM; } - if (target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) t->pid = pids[i]; damon_add_target(ctx, t); } @@ -722,7 +753,7 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - if (!target_has_pid(ctx)) + if (!damon_target_has_pid(ctx)) return; mutex_lock(&ctx->kdamond_lock); diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c new file mode 100644 index 000000000000..c276736a071c --- /dev/null +++ b/mm/damon/lru_sort.c @@ -0,0 +1,546 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON-based LRU-lists Sorting + * + * Author: SeongJae Park <sj@kernel.org> + */ + +#define pr_fmt(fmt) "damon-lru-sort: " fmt + +#include <linux/damon.h> +#include <linux/ioport.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/workqueue.h> + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "damon_lru_sort." + +/* + * Enable or disable DAMON_LRU_SORT. + * + * You can enable DAMON_LRU_SORT by setting the value of this parameter as + * ``Y``. Setting it as ``N`` disables DAMON_LRU_SORT. Note that + * DAMON_LRU_SORT could do no real monitoring and LRU-lists sorting due to the + * watermarks-based activation condition. Refer to below descriptions for the + * watermarks parameter for this. + */ +static bool enabled __read_mostly; + +/* + * Make DAMON_LRU_SORT reads the input parameters again, except ``enabled``. + * + * Input parameters that updated while DAMON_LRU_SORT is running are not + * applied by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT + * reads values of parametrs except ``enabled`` again. Once the re-reading is + * done, this parameter is set as ``N``. If invalid parameters are found while + * the re-reading, DAMON_LRU_SORT will be disabled. + */ +static bool commit_inputs __read_mostly; +module_param(commit_inputs, bool, 0600); + +/* + * Access frequency threshold for hot memory regions identification in permil. + * + * If a memory region is accessed in frequency of this or higher, + * DAMON_LRU_SORT identifies the region as hot, and mark it as accessed on the + * LRU list, so that it could not be reclaimed under memory pressure. 50% by + * default. + */ +static unsigned long hot_thres_access_freq = 500; +module_param(hot_thres_access_freq, ulong, 0600); + +/* + * Time threshold for cold memory regions identification in microseconds. + * + * If a memory region is not accessed for this or longer time, DAMON_LRU_SORT + * identifies the region as cold, and mark it as unaccessed on the LRU list, so + * that it could be reclaimed first under memory pressure. 120 seconds by + * default. + */ +static unsigned long cold_min_age __read_mostly = 120000000; +module_param(cold_min_age, ulong, 0600); + +/* + * Limit of time for trying the LRU lists sorting in milliseconds. + * + * DAMON_LRU_SORT tries to use only up to this time within a time window + * (quota_reset_interval_ms) for trying LRU lists sorting. This can be used + * for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the + * limit is disabled. + * + * 10 ms by default. + */ +static unsigned long quota_ms __read_mostly = 10; +module_param(quota_ms, ulong, 0600); + +/* + * The time quota charge reset interval in milliseconds. + * + * The charge reset interval for the quota of time (quota_ms). That is, + * DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms + * milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds. + * + * 1 second by default. + */ +static unsigned long quota_reset_interval_ms __read_mostly = 1000; +module_param(quota_reset_interval_ms, ulong, 0600); + +/* + * The watermarks check time interval in microseconds. + * + * Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is + * enabled but inactive due to its watermarks rule. 5 seconds by default. + */ +static unsigned long wmarks_interval __read_mostly = 5000000; +module_param(wmarks_interval, ulong, 0600); + +/* + * Free memory rate (per thousand) for the high watermark. + * + * If free memory of the system in bytes per thousand bytes is higher than + * this, DAMON_LRU_SORT becomes inactive, so it does nothing but periodically + * checks the watermarks. 200 (20%) by default. + */ +static unsigned long wmarks_high __read_mostly = 200; +module_param(wmarks_high, ulong, 0600); + +/* + * Free memory rate (per thousand) for the middle watermark. + * + * If free memory of the system in bytes per thousand bytes is between this and + * the low watermark, DAMON_LRU_SORT becomes active, so starts the monitoring + * and the LRU-lists sorting. 150 (15%) by default. + */ +static unsigned long wmarks_mid __read_mostly = 150; +module_param(wmarks_mid, ulong, 0600); + +/* + * Free memory rate (per thousand) for the low watermark. + * + * If free memory of the system in bytes per thousand bytes is lower than this, + * DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks + * the watermarks. 50 (5%) by default. + */ +static unsigned long wmarks_low __read_mostly = 50; +module_param(wmarks_low, ulong, 0600); + +/* + * Sampling interval for the monitoring in microseconds. + * + * The sampling interval of DAMON for the hot/cold memory monitoring. Please + * refer to the DAMON documentation for more detail. 5 ms by default. + */ +static unsigned long sample_interval __read_mostly = 5000; +module_param(sample_interval, ulong, 0600); + +/* + * Aggregation interval for the monitoring in microseconds. + * + * The aggregation interval of DAMON for the hot/cold memory monitoring. + * Please refer to the DAMON documentation for more detail. 100 ms by default. + */ +static unsigned long aggr_interval __read_mostly = 100000; +module_param(aggr_interval, ulong, 0600); + +/* + * Minimum number of monitoring regions. + * + * The minimal number of monitoring regions of DAMON for the hot/cold memory + * monitoring. This can be used to set lower-bound of the monitoring quality. + * But, setting this too high could result in increased monitoring overhead. + * Please refer to the DAMON documentation for more detail. 10 by default. + */ +static unsigned long min_nr_regions __read_mostly = 10; +module_param(min_nr_regions, ulong, 0600); + +/* + * Maximum number of monitoring regions. + * + * The maximum number of monitoring regions of DAMON for the hot/cold memory + * monitoring. This can be used to set upper-bound of the monitoring overhead. + * However, setting this too low could result in bad monitoring quality. + * Please refer to the DAMON documentation for more detail. 1000 by default. + */ +static unsigned long max_nr_regions __read_mostly = 1000; +module_param(max_nr_regions, ulong, 0600); + +/* + * Start of the target memory region in physical address. + * + * The start physical address of memory region that DAMON_LRU_SORT will do work + * against. By default, biggest System RAM is used as the region. + */ +static unsigned long monitor_region_start __read_mostly; +module_param(monitor_region_start, ulong, 0600); + +/* + * End of the target memory region in physical address. + * + * The end physical address of memory region that DAMON_LRU_SORT will do work + * against. By default, biggest System RAM is used as the region. + */ +static unsigned long monitor_region_end __read_mostly; +module_param(monitor_region_end, ulong, 0600); + +/* + * PID of the DAMON thread + * + * If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread. + * Else, -1. + */ +static int kdamond_pid __read_mostly = -1; +module_param(kdamond_pid, int, 0400); + +/* + * Number of hot memory regions that tried to be LRU-sorted. + */ +static unsigned long nr_lru_sort_tried_hot_regions __read_mostly; +module_param(nr_lru_sort_tried_hot_regions, ulong, 0400); + +/* + * Total bytes of hot memory regions that tried to be LRU-sorted. + */ +static unsigned long bytes_lru_sort_tried_hot_regions __read_mostly; +module_param(bytes_lru_sort_tried_hot_regions, ulong, 0400); + +/* + * Number of hot memory regions that successfully be LRU-sorted. + */ +static unsigned long nr_lru_sorted_hot_regions __read_mostly; +module_param(nr_lru_sorted_hot_regions, ulong, 0400); + +/* + * Total bytes of hot memory regions that successfully be LRU-sorted. + */ +static unsigned long bytes_lru_sorted_hot_regions __read_mostly; +module_param(bytes_lru_sorted_hot_regions, ulong, 0400); + +/* + * Number of times that the time quota limit for hot regions have exceeded + */ +static unsigned long nr_hot_quota_exceeds __read_mostly; +module_param(nr_hot_quota_exceeds, ulong, 0400); + +/* + * Number of cold memory regions that tried to be LRU-sorted. + */ +static unsigned long nr_lru_sort_tried_cold_regions __read_mostly; +module_param(nr_lru_sort_tried_cold_regions, ulong, 0400); + +/* + * Total bytes of cold memory regions that tried to be LRU-sorted. + */ +static unsigned long bytes_lru_sort_tried_cold_regions __read_mostly; +module_param(bytes_lru_sort_tried_cold_regions, ulong, 0400); + +/* + * Number of cold memory regions that successfully be LRU-sorted. + */ +static unsigned long nr_lru_sorted_cold_regions __read_mostly; +module_param(nr_lru_sorted_cold_regions, ulong, 0400); + +/* + * Total bytes of cold memory regions that successfully be LRU-sorted. + */ +static unsigned long bytes_lru_sorted_cold_regions __read_mostly; +module_param(bytes_lru_sorted_cold_regions, ulong, 0400); + +/* + * Number of times that the time quota limit for cold regions have exceeded + */ +static unsigned long nr_cold_quota_exceeds __read_mostly; +module_param(nr_cold_quota_exceeds, ulong, 0400); + +static struct damon_ctx *ctx; +static struct damon_target *target; + +struct damon_lru_sort_ram_walk_arg { + unsigned long start; + unsigned long end; +}; + +static int walk_system_ram(struct resource *res, void *arg) +{ + struct damon_lru_sort_ram_walk_arg *a = arg; + + if (a->end - a->start < resource_size(res)) { + a->start = res->start; + a->end = res->end; + } + return 0; +} + +/* + * Find biggest 'System RAM' resource and store its start and end address in + * @start and @end, respectively. If no System RAM is found, returns false. + */ +static bool get_monitoring_region(unsigned long *start, unsigned long *end) +{ + struct damon_lru_sort_ram_walk_arg arg = {}; + + walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); + if (arg.end <= arg.start) + return false; + + *start = arg.start; + *end = arg.end; + return true; +} + +/* Create a DAMON-based operation scheme for hot memory regions */ +static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) +{ + struct damos_watermarks wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = wmarks_interval, + .high = wmarks_high, + .mid = wmarks_mid, + .low = wmarks_low, + }; + struct damos_quota quota = { + /* + * Do not try LRU-lists sorting of hot pages for more than half + * of quota_ms milliseconds within quota_reset_interval_ms. + */ + .ms = quota_ms / 2, + .sz = 0, + .reset_interval = quota_reset_interval_ms, + /* Within the quota, mark hotter regions accessed first. */ + .weight_sz = 0, + .weight_nr_accesses = 1, + .weight_age = 0, + }; + struct damos *scheme = damon_new_scheme( + /* Find regions having PAGE_SIZE or larger size */ + PAGE_SIZE, ULONG_MAX, + /* and accessed for more than the threshold */ + hot_thres, UINT_MAX, + /* no matter its age */ + 0, UINT_MAX, + /* prioritize those on LRU lists, as soon as found */ + DAMOS_LRU_PRIO, + /* under the quota. */ + "a, + /* (De)activate this according to the watermarks. */ + &wmarks); + + return scheme; +} + +/* Create a DAMON-based operation scheme for cold memory regions */ +static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) +{ + struct damos_watermarks wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = wmarks_interval, + .high = wmarks_high, + .mid = wmarks_mid, + .low = wmarks_low, + }; + struct damos_quota quota = { + /* + * Do not try LRU-lists sorting of cold pages for more than + * half of quota_ms milliseconds within + * quota_reset_interval_ms. + */ + .ms = quota_ms / 2, + .sz = 0, + .reset_interval = quota_reset_interval_ms, + /* Within the quota, mark colder regions not accessed first. */ + .weight_sz = 0, + .weight_nr_accesses = 0, + .weight_age = 1, + }; + struct damos *scheme = damon_new_scheme( + /* Find regions having PAGE_SIZE or larger size */ + PAGE_SIZE, ULONG_MAX, + /* and not accessed at all */ + 0, 0, + /* for cold_thres or more micro-seconds, and */ + cold_thres, UINT_MAX, + /* mark those as not accessed, as soon as found */ + DAMOS_LRU_DEPRIO, + /* under the quota. */ + "a, + /* (De)activate this according to the watermarks. */ + &wmarks); + + return scheme; +} + +static int damon_lru_sort_apply_parameters(void) +{ + struct damos *scheme, *next_scheme; + struct damon_addr_range addr_range; + unsigned int hot_thres, cold_thres; + int err = 0; + + err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, + min_nr_regions, max_nr_regions); + if (err) + return err; + + /* free previously set schemes */ + damon_for_each_scheme_safe(scheme, next_scheme, ctx) + damon_destroy_scheme(scheme); + + /* aggr_interval / sample_interval is the maximum nr_accesses */ + hot_thres = aggr_interval / sample_interval * hot_thres_access_freq / + 1000; + scheme = damon_lru_sort_new_hot_scheme(hot_thres); + if (!scheme) + return -ENOMEM; + damon_add_scheme(ctx, scheme); + + cold_thres = cold_min_age / aggr_interval; + scheme = damon_lru_sort_new_cold_scheme(cold_thres); + if (!scheme) + return -ENOMEM; + damon_add_scheme(ctx, scheme); + + if (monitor_region_start > monitor_region_end) + return -EINVAL; + if (!monitor_region_start && !monitor_region_end && + !get_monitoring_region(&monitor_region_start, + &monitor_region_end)) + return -EINVAL; + addr_range.start = monitor_region_start; + addr_range.end = monitor_region_end; + return damon_set_regions(target, &addr_range, 1); +} + +static int damon_lru_sort_turn(bool on) +{ + int err; + + if (!on) { + err = damon_stop(&ctx, 1); + if (!err) + kdamond_pid = -1; + return err; + } + + err = damon_lru_sort_apply_parameters(); + if (err) + return err; + + err = damon_start(&ctx, 1, true); + if (err) + return err; + kdamond_pid = ctx->kdamond->pid; + return 0; +} + +static struct delayed_work damon_lru_sort_timer; +static void damon_lru_sort_timer_fn(struct work_struct *work) +{ + static bool last_enabled; + bool now_enabled; + + now_enabled = enabled; + if (last_enabled != now_enabled) { + if (!damon_lru_sort_turn(now_enabled)) + last_enabled = now_enabled; + else + enabled = last_enabled; + } +} +static DECLARE_DELAYED_WORK(damon_lru_sort_timer, damon_lru_sort_timer_fn); + +static bool damon_lru_sort_initialized; + +static int damon_lru_sort_enabled_store(const char *val, + const struct kernel_param *kp) +{ + int rc = param_set_bool(val, kp); + + if (rc < 0) + return rc; + + if (!damon_lru_sort_initialized) + return rc; + + schedule_delayed_work(&damon_lru_sort_timer, 0); + + return 0; +} + +static const struct kernel_param_ops enabled_param_ops = { + .set = damon_lru_sort_enabled_store, + .get = param_get_bool, +}; + +module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); +MODULE_PARM_DESC(enabled, + "Enable or disable DAMON_LRU_SORT (default: disabled)"); + +static int damon_lru_sort_handle_commit_inputs(void) +{ + int err; + + if (!commit_inputs) + return 0; + + err = damon_lru_sort_apply_parameters(); + commit_inputs = false; + return err; +} + +static int damon_lru_sort_after_aggregation(struct damon_ctx *c) +{ + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) { + if (s->action == DAMOS_LRU_PRIO) { + nr_lru_sort_tried_hot_regions = s->stat.nr_tried; + bytes_lru_sort_tried_hot_regions = s->stat.sz_tried; + nr_lru_sorted_hot_regions = s->stat.nr_applied; + bytes_lru_sorted_hot_regions = s->stat.sz_applied; + nr_hot_quota_exceeds = s->stat.qt_exceeds; + } else if (s->action == DAMOS_LRU_DEPRIO) { + nr_lru_sort_tried_cold_regions = s->stat.nr_tried; + bytes_lru_sort_tried_cold_regions = s->stat.sz_tried; + nr_lru_sorted_cold_regions = s->stat.nr_applied; + bytes_lru_sorted_cold_regions = s->stat.sz_applied; + nr_cold_quota_exceeds = s->stat.qt_exceeds; + } + } + + return damon_lru_sort_handle_commit_inputs(); +} + +static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c) +{ + return damon_lru_sort_handle_commit_inputs(); +} + +static int __init damon_lru_sort_init(void) +{ + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) + return -EINVAL; + + ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check; + ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; + + target = damon_new_target(); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + + schedule_delayed_work(&damon_lru_sort_timer, 0); + + damon_lru_sort_initialized = true; + return 0; +} + +module_init(damon_lru_sort_init); diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 10ef20b2003f..b1335de200e7 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -130,3 +130,45 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, /* Return coldness of the region */ return DAMOS_MAX_SCORE - hotness; } + +int damon_hot_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s) +{ + unsigned int max_nr_accesses; + int freq_subscore; + unsigned int age_in_sec; + int age_in_log, age_subscore; + unsigned int freq_weight = s->quota.weight_nr_accesses; + unsigned int age_weight = s->quota.weight_age; + int hotness; + + max_nr_accesses = c->aggr_interval / c->sample_interval; + freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; + + age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; + for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; + age_in_log++, age_in_sec >>= 1) + ; + + /* If frequency is 0, higher age means it's colder */ + if (freq_subscore == 0) + age_in_log *= -1; + + /* + * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG]. + * Scale it to be in [0, 100] and set it as age subscore. + */ + age_in_log += DAMON_MAX_AGE_IN_LOG; + age_subscore = age_in_log * DAMON_MAX_SUBSCORE / + DAMON_MAX_AGE_IN_LOG / 2; + + hotness = (freq_weight * freq_subscore + age_weight * age_subscore); + if (freq_weight + age_weight) + hotness /= freq_weight + age_weight; + /* + * Transform it to fit in [0, DAMOS_MAX_SCORE] + */ + hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; + + return hotness; +} diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index e790cb5f8fe0..52329ff361cd 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -14,3 +14,5 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); +int damon_hot_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index b40ff5811bb2..dc131c6a5403 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -204,16 +204,11 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } -static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, - struct damon_target *t, struct damon_region *r, - struct damos *scheme) +static unsigned long damon_pa_pageout(struct damon_region *r) { unsigned long addr, applied; LIST_HEAD(page_list); - if (scheme->action != DAMOS_PAGEOUT) - return 0; - for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { struct page *page = damon_get_page(PHYS_PFN(addr)); @@ -238,6 +233,55 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, return applied * PAGE_SIZE; } +static unsigned long damon_pa_mark_accessed(struct damon_region *r) +{ + unsigned long addr, applied = 0; + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct page *page = damon_get_page(PHYS_PFN(addr)); + + if (!page) + continue; + mark_page_accessed(page); + put_page(page); + applied++; + } + return applied * PAGE_SIZE; +} + +static unsigned long damon_pa_deactivate_pages(struct damon_region *r) +{ + unsigned long addr, applied = 0; + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct page *page = damon_get_page(PHYS_PFN(addr)); + + if (!page) + continue; + deactivate_page(page); + put_page(page); + applied++; + } + return applied * PAGE_SIZE; +} + +static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) +{ + switch (scheme->action) { + case DAMOS_PAGEOUT: + return damon_pa_pageout(r); + case DAMOS_LRU_PRIO: + return damon_pa_mark_accessed(r); + case DAMOS_LRU_DEPRIO: + return damon_pa_deactivate_pages(r); + default: + break; + } + return 0; +} + static int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme) @@ -245,6 +289,10 @@ static int damon_pa_scheme_score(struct damon_ctx *context, switch (scheme->action) { case DAMOS_PAGEOUT: return damon_pageout_score(context, r, scheme); + case DAMOS_LRU_PRIO: + return damon_hot_score(context, r, scheme); + case DAMOS_LRU_DEPRIO: + return damon_pageout_score(context, r, scheme); default: break; } diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 4b07c29effe9..e69b807fefe4 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -353,7 +353,6 @@ static int damon_reclaim_turn(bool on) return 0; } -#define ENABLE_CHECK_INTERVAL_MS 1000 static struct delayed_work damon_reclaim_timer; static void damon_reclaim_timer_fn(struct work_struct *work) { @@ -367,16 +366,12 @@ static void damon_reclaim_timer_fn(struct work_struct *work) else enabled = last_enabled; } - - if (enabled) - schedule_delayed_work(&damon_reclaim_timer, - msecs_to_jiffies(ENABLE_CHECK_INTERVAL_MS)); } static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); static bool damon_reclaim_initialized; -static int enabled_store(const char *val, +static int damon_reclaim_enabled_store(const char *val, const struct kernel_param *kp) { int rc = param_set_bool(val, kp); @@ -388,14 +383,12 @@ static int enabled_store(const char *val, if (!damon_reclaim_initialized) return rc; - if (enabled) - schedule_delayed_work(&damon_reclaim_timer, 0); - + schedule_delayed_work(&damon_reclaim_timer, 0); return 0; } static const struct kernel_param_ops enabled_param_ops = { - .set = enabled_store, + .set = damon_reclaim_enabled_store, .get = param_get_bool, }; @@ -403,10 +396,21 @@ module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); MODULE_PARM_DESC(enabled, "Enable or disable DAMON_RECLAIM (default: disabled)"); +static int damon_reclaim_handle_commit_inputs(void) +{ + int err; + + if (!commit_inputs) + return 0; + + err = damon_reclaim_apply_parameters(); + commit_inputs = false; + return err; +} + static int damon_reclaim_after_aggregation(struct damon_ctx *c) { struct damos *s; - int err = 0; /* update the stats parameter */ damon_for_each_scheme(s, c) { @@ -417,22 +421,12 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c) nr_quota_exceeds = s->stat.qt_exceeds; } - if (commit_inputs) { - err = damon_reclaim_apply_parameters(); - commit_inputs = false; - } - return err; + return damon_reclaim_handle_commit_inputs(); } static int damon_reclaim_after_wmarks_check(struct damon_ctx *c) { - int err = 0; - - if (commit_inputs) { - err = damon_reclaim_apply_parameters(); - commit_inputs = false; - } - return err; + return damon_reclaim_handle_commit_inputs(); } static int __init damon_reclaim_init(void) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 09f9e8ca3d1f..7488e27c87c3 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -762,6 +762,8 @@ static const char * const damon_sysfs_damos_action_strs[] = { "pageout", "hugepage", "nohugepage", + "lru_prio", + "lru_deprio", "stat", }; @@ -2136,8 +2138,7 @@ static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) struct damon_target *t, *next; damon_for_each_target_safe(t, next, ctx) { - if (ctx->ops.id == DAMON_OPS_VADDR || - ctx->ops.id == DAMON_OPS_FVADDR) + if (damon_target_has_pid(ctx)) put_pid(t->pid); damon_destroy_target(t); } @@ -2181,8 +2182,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, if (!t) return -ENOMEM; - if (ctx->ops.id == DAMON_OPS_VADDR || - ctx->ops.id == DAMON_OPS_FVADDR) { + if (damon_target_has_pid(ctx)) { t->pid = find_get_pid(sys_target->pid); if (!t->pid) goto destroy_targets_out; @@ -2210,7 +2210,7 @@ static struct damon_target *damon_sysfs_existing_target( struct pid *pid; struct damon_target *t; - if (ctx->ops.id == DAMON_OPS_PADDR) { + if (!damon_target_has_pid(ctx)) { /* Up to only one target for paddr could exist */ damon_for_each_target(t, ctx) return t; @@ -2359,6 +2359,23 @@ static inline bool damon_sysfs_kdamond_running( damon_sysfs_ctx_running(kdamond->damon_ctx); } +static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, + struct damon_sysfs_context *sys_ctx) +{ + int err; + + err = damon_select_ops(ctx, sys_ctx->ops_id); + if (err) + return err; + err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); + if (err) + return err; + err = damon_sysfs_set_targets(ctx, sys_ctx->targets); + if (err) + return err; + return damon_sysfs_set_schemes(ctx, sys_ctx->schemes); +} + /* * damon_sysfs_commit_input() - Commit user inputs to a running kdamond. * @kdamond: The kobject wrapper for the associated kdamond. @@ -2367,31 +2384,14 @@ static inline bool damon_sysfs_kdamond_running( */ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) { - struct damon_ctx *ctx = kdamond->damon_ctx; - struct damon_sysfs_context *sys_ctx; - int err = 0; - if (!damon_sysfs_kdamond_running(kdamond)) return -EINVAL; /* TODO: Support multiple contexts per kdamond */ if (kdamond->contexts->nr != 1) return -EINVAL; - sys_ctx = kdamond->contexts->contexts_arr[0]; - - err = damon_select_ops(ctx, sys_ctx->ops_id); - if (err) - return err; - err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); - if (err) - return err; - err = damon_sysfs_set_targets(ctx, sys_ctx->targets); - if (err) - return err; - err = damon_sysfs_set_schemes(ctx, sys_ctx->schemes); - if (err) - return err; - return err; + return damon_sysfs_apply_inputs(kdamond->damon_ctx, + kdamond->contexts->contexts_arr[0]); } /* @@ -2438,27 +2438,16 @@ static struct damon_ctx *damon_sysfs_build_ctx( if (!ctx) return ERR_PTR(-ENOMEM); - err = damon_select_ops(ctx, sys_ctx->ops_id); - if (err) - goto out; - err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); - if (err) - goto out; - err = damon_sysfs_set_targets(ctx, sys_ctx->targets); - if (err) - goto out; - err = damon_sysfs_set_schemes(ctx, sys_ctx->schemes); - if (err) - goto out; + err = damon_sysfs_apply_inputs(ctx, sys_ctx); + if (err) { + damon_destroy_ctx(ctx); + return ERR_PTR(err); + } ctx->callback.after_wmarks_check = damon_sysfs_cmd_request_callback; ctx->callback.after_aggregation = damon_sysfs_cmd_request_callback; ctx->callback.before_terminate = damon_sysfs_before_terminate; return ctx; - -out: - damon_destroy_ctx(ctx); - return ERR_PTR(err); } static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index d4f55f349100..bce37c487540 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -14,33 +14,19 @@ #include <kunit/test.h> -static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas) +static void __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, + ssize_t nr_vmas) { - int i, j; - unsigned long largest_gap, gap; + int i; + MA_STATE(mas, mt, 0, 0); if (!nr_vmas) return; - for (i = 0; i < nr_vmas - 1; i++) { - vmas[i].vm_next = &vmas[i + 1]; - - vmas[i].vm_rb.rb_left = NULL; - vmas[i].vm_rb.rb_right = &vmas[i + 1].vm_rb; - - largest_gap = 0; - for (j = i; j < nr_vmas; j++) { - if (j == 0) - continue; - gap = vmas[j].vm_start - vmas[j - 1].vm_end; - if (gap > largest_gap) - largest_gap = gap; - } - vmas[i].rb_subtree_gap = largest_gap; - } - vmas[i].vm_next = NULL; - vmas[i].vm_rb.rb_right = NULL; - vmas[i].rb_subtree_gap = 0; + mas_lock(&mas); + for (i = 0; i < nr_vmas; i++) + vma_mas_store(&vmas[i], &mas); + mas_unlock(&mas); } /* @@ -72,6 +58,7 @@ static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas) */ static void damon_test_three_regions_in_vmas(struct kunit *test) { + static struct mm_struct mm; struct damon_addr_range regions[3] = {0,}; /* 10-20-25, 200-210-220, 300-305, 307-330 */ struct vm_area_struct vmas[] = { @@ -83,9 +70,10 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) (struct vm_area_struct) {.vm_start = 307, .vm_end = 330}, }; - __link_vmas(vmas, 6); + mt_init_flags(&mm.mm_mt, MM_MT_FLAGS); + __link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas)); - __damon_va_three_regions(&vmas[0], regions); + __damon_va_three_regions(&mm, regions); KUNIT_EXPECT_EQ(test, 10ul, regions[0].start); KUNIT_EXPECT_EQ(test, 25ul, regions[0].end); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 3c7b9d6dca95..d24148a8149f 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -113,37 +113,38 @@ static unsigned long sz_range(struct damon_addr_range *r) * * Returns 0 if success, or negative error code otherwise. */ -static int __damon_va_three_regions(struct vm_area_struct *vma, +static int __damon_va_three_regions(struct mm_struct *mm, struct damon_addr_range regions[3]) { - struct damon_addr_range gap = {0}, first_gap = {0}, second_gap = {0}; - struct vm_area_struct *last_vma = NULL; - unsigned long start = 0; - struct rb_root rbroot; - - /* Find two biggest gaps so that first_gap > second_gap > others */ - for (; vma; vma = vma->vm_next) { - if (!last_vma) { - start = vma->vm_start; - goto next; - } + struct damon_addr_range first_gap = {0}, second_gap = {0}; + VMA_ITERATOR(vmi, mm, 0); + struct vm_area_struct *vma, *prev = NULL; + unsigned long start; - if (vma->rb_subtree_gap <= sz_range(&second_gap)) { - rbroot.rb_node = &vma->vm_rb; - vma = rb_entry(rb_last(&rbroot), - struct vm_area_struct, vm_rb); + /* + * Find the two biggest gaps so that first_gap > second_gap > others. + * If this is too slow, it can be optimised to examine the maple + * tree gaps. + */ + for_each_vma(vmi, vma) { + unsigned long gap; + + if (!prev) { + start = vma->vm_start; goto next; } - - gap.start = last_vma->vm_end; - gap.end = vma->vm_start; - if (sz_range(&gap) > sz_range(&second_gap)) { - swap(gap, second_gap); - if (sz_range(&second_gap) > sz_range(&first_gap)) - swap(second_gap, first_gap); + gap = vma->vm_start - prev->vm_end; + + if (gap > sz_range(&first_gap)) { + second_gap = first_gap; + first_gap.start = prev->vm_end; + first_gap.end = vma->vm_start; + } else if (gap > sz_range(&second_gap)) { + second_gap.start = prev->vm_end; + second_gap.end = vma->vm_start; } next: - last_vma = vma; + prev = vma; } if (!sz_range(&second_gap) || !sz_range(&first_gap)) @@ -159,7 +160,7 @@ next: regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION); regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION); regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION); - regions[2].end = ALIGN(last_vma->vm_end, DAMON_MIN_REGION); + regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION); return 0; } @@ -180,7 +181,7 @@ static int damon_va_three_regions(struct damon_target *t, return -EINVAL; mmap_read_lock(mm); - rc = __damon_va_three_regions(mm->mmap, regions); + rc = __damon_va_three_regions(mm, regions); mmap_read_unlock(mm); mmput(mm); diff --git a/mm/debug.c b/mm/debug.c index bef329bf28f0..0fd15ba70d16 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -139,13 +139,11 @@ EXPORT_SYMBOL(dump_page); void dump_vma(const struct vm_area_struct *vma) { - pr_emerg("vma %px start %px end %px\n" - "next %px prev %px mm %px\n" + pr_emerg("vma %px start %px end %px mm %px\n" "prot %lx anon_vma %px vm_ops %px\n" "pgoff %lx file %px private_data %px\n" "flags: %#lx(%pGv)\n", - vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, - vma->vm_prev, vma->vm_mm, + vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm, (unsigned long)pgprot_val(vma->vm_page_prot), vma->anon_vma, vma->vm_ops, vma->vm_pgoff, vma->vm_file, vma->vm_private_data, @@ -155,11 +153,11 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { - pr_emerg("mm %px mmap %px seqnum %llu task_size %lu\n" + pr_emerg("mm %px task_size %lu\n" #ifdef CONFIG_MMU "get_unmapped_area %px\n" #endif - "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" + "mmap_base %lu mmap_legacy_base %lu\n" "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n" @@ -183,11 +181,11 @@ void dump_mm(const struct mm_struct *mm) "tlb_flush_pending %d\n" "def_flags: %#lx(%pGv)\n", - mm, mm->mmap, (long long) mm->vmacache_seqnum, mm->task_size, + mm, mm->task_size, #ifdef CONFIG_MMU mm->get_unmapped_area, #endif - mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, + mm->mmap_base, mm->mmap_legacy_base, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), mm_pgtables_bytes(mm), diff --git a/mm/filemap.c b/mm/filemap.c index 8ccb868c3d95..b561619cfafe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -667,7 +667,7 @@ EXPORT_SYMBOL_GPL(filemap_range_has_writeback); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { - int err = 0; + int err = 0, err2; if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, @@ -678,18 +678,12 @@ int filemap_write_and_wait_range(struct address_space *mapping, * But the -EIO is special case, it may indicate the worst * thing (e.g. bug) happened, so we avoid waiting for it. */ - if (err != -EIO) { - int err2 = filemap_fdatawait_range(mapping, - lstart, lend); - if (!err) - err = err2; - } else { - /* Clear any previously stored errors */ - filemap_check_errors(mapping); - } - } else { - err = filemap_check_errors(mapping); + if (err != -EIO) + __filemap_fdatawait_range(mapping, lstart, lend); } + err2 = filemap_check_errors(mapping); + if (!err) + err = err2; return err; } EXPORT_SYMBOL(filemap_write_and_wait_range); @@ -532,7 +532,11 @@ retry: } page = vm_normal_page(vma, address, pte); - if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { + if ((flags & FOLL_LRU) && ((page && is_zone_device_page(page)) || + (!page && pte_devmap(pte)))) { + page = ERR_PTR(-EEXIST); + goto out; + } else if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { /* * Only return device mapping pages in the FOLL_GET or FOLL_PIN * case since they are only valid while holding the pgmap @@ -1641,10 +1645,11 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) if (!locked) { locked = 1; mmap_read_lock(mm); - vma = find_vma(mm, nstart); + vma = find_vma_intersection(mm, nstart, end); } else if (nstart >= vma->vm_end) - vma = vma->vm_next; - if (!vma || vma->vm_start >= end) + vma = find_vma_intersection(mm, vma->vm_end, end); + + if (!vma) break; /* * Set [nstart; nend) to intersection of desired address @@ -1923,14 +1928,48 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, continue; prev_folio = folio; - if (folio_is_pinnable(folio)) + /* + * Device private pages will get faulted in during gup so it + * shouldn't be possible to see one here. + */ + if (WARN_ON_ONCE(folio_is_device_private(folio))) { + ret = -EFAULT; + goto unpin_pages; + } + + /* + * Device coherent pages are managed by a driver and should not + * be pinned indefinitely as it prevents the driver moving the + * page. So when trying to pin with FOLL_LONGTERM instead try + * to migrate the page out of device memory. + */ + if (folio_is_device_coherent(folio)) { + WARN_ON_ONCE(PageCompound(&folio->page)); + + /* + * Migration will fail if the page is pinned, so convert + * the pin on the source page to a normal reference. + */ + if (gup_flags & FOLL_PIN) { + get_page(&folio->page); + unpin_user_page(&folio->page); + } + + pages[i] = migrate_device_page(&folio->page, gup_flags); + if (!pages[i]) { + ret = -EBUSY; + goto unpin_pages; + } continue; + } + if (folio_is_pinnable(folio)) + continue; /* * Try to move out any movable page before pinning the range. */ if (folio_test_hugetlb(folio)) { - if (!isolate_huge_page(&folio->page, + if (isolate_hugetlb(&folio->page, &movable_page_list)) isolation_error_count++; continue; @@ -1961,10 +2000,13 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, return nr_pages; unpin_pages: - if (gup_flags & FOLL_PIN) { - unpin_user_pages(pages, nr_pages); - } else { - for (i = 0; i < nr_pages; i++) + for (i = 0; i < nr_pages; i++) { + if (!pages[i]) + continue; + + if (gup_flags & FOLL_PIN) + unpin_user_page(pages[i]); + else put_page(pages[i]); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f9b90a8d7dfa..7b7eedc6d5dd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -69,21 +69,85 @@ static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; -bool transparent_hugepage_active(struct vm_area_struct *vma) +bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags, + bool smaps, bool in_pf) { - /* The addr is used to check if the vma size fits */ - unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE; + if (!vma->vm_mm) /* vdso */ + return false; + + /* + * Explicitly disabled through madvise or prctl, or some + * architectures may disable THP for some mappings, for + * example, s390 kvm. + * */ + if ((vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + return false; + /* + * If the hardware/firmware marked hugepage support disabled. + */ + if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) + return false; - if (!transhuge_vma_suitable(vma, addr)) + /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ + if (vma_is_dax(vma)) + return in_pf; + + /* + * Special VMA and hugetlb VMA. + * Must be checked after dax since some dax mappings may have + * VM_MIXEDMAP set. + */ + if (vm_flags & VM_NO_KHUGEPAGED) + return false; + + /* + * Check alignment for file vma and size for both file and anon vma. + * + * Skip the check for page fault. Huge fault does the check in fault + * handlers. And this check is not suitable for huge PUD fault. + */ + if (!in_pf && + !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) return false; - if (vma_is_anonymous(vma)) - return __transparent_hugepage_enabled(vma); - if (vma_is_shmem(vma)) + + /* + * Enabled via shmem mount options or sysfs settings. + * Must be done before hugepage flags check since shmem has its + * own flags. + */ + if (!in_pf && shmem_file(vma->vm_file)) return shmem_huge_enabled(vma); - if (transhuge_vma_enabled(vma, vma->vm_flags) && file_thp_enabled(vma)) + + if (!hugepage_flags_enabled()) + return false; + + /* THP settings require madvise. */ + if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always()) + return false; + + /* Only regular file is valid */ + if (!in_pf && file_thp_enabled(vma)) return true; - return false; + if (!vma_is_anonymous(vma)) + return false; + + if (vma_is_temporary_stack(vma)) + return false; + + /* + * THPeligible bit of smaps should show 1 for proper VMAs even + * though anon_vma is not initialized yet. + * + * Allow page fault since anon_vma may be not initialized until + * the first page fault. + */ + if (!vma->anon_vma) + return (smaps || in_pf); + + return true; } static bool get_huge_zero_page(void) @@ -423,10 +487,10 @@ static int __init hugepage_init(void) if (err) goto err_slab; - err = register_shrinker(&huge_zero_page_shrinker); + err = register_shrinker(&huge_zero_page_shrinker, "thp-zero"); if (err) goto err_hzp_shrinker; - err = register_shrinker(&deferred_split_shrinker); + err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split"); if (err) goto err_split_shrinker; @@ -497,25 +561,125 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) } #ifdef CONFIG_MEMCG -static inline struct deferred_split *get_deferred_split_queue(struct page *page) +static struct shrinker deferred_split_shrinker; + +static inline struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, + struct deferred_split *queue) { - struct mem_cgroup *memcg = page_memcg(compound_head(page)); - struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + if (mem_cgroup_disabled()) + return NULL; + if (&NODE_DATA(folio_nid(folio))->deferred_split_queue == queue) + return NULL; + return container_of(queue, struct mem_cgroup, deferred_split_queue); +} - if (memcg) - return &memcg->deferred_split_queue; - else - return &pgdat->deferred_split_queue; +static inline struct deferred_split *folio_memcg_split_queue(struct folio *folio) +{ + struct mem_cgroup *memcg = folio_memcg(folio); + + return memcg ? &memcg->deferred_split_queue : NULL; +} + +static void thp_sq_reparent_lock(struct mem_cgroup *src, struct mem_cgroup *dst) +{ + spin_lock(&src->deferred_split_queue.split_queue_lock); + spin_lock_nested(&dst->deferred_split_queue.split_queue_lock, + SINGLE_DEPTH_NESTING); } + +static void thp_sq_reparent_relocate(struct mem_cgroup *src, struct mem_cgroup *dst) +{ + int nid; + struct deferred_split *src_queue, *dst_queue; + + src_queue = &src->deferred_split_queue; + dst_queue = &dst->deferred_split_queue; + + if (!src_queue->split_queue_len) + return; + + list_splice_tail_init(&src_queue->split_queue, &dst_queue->split_queue); + dst_queue->split_queue_len += src_queue->split_queue_len; + src_queue->split_queue_len = 0; + + for_each_node(nid) + set_shrinker_bit(dst, nid, deferred_split_shrinker.id); +} + +static void thp_sq_reparent_unlock(struct mem_cgroup *src, struct mem_cgroup *dst) +{ + spin_unlock(&dst->deferred_split_queue.split_queue_lock); + spin_unlock(&src->deferred_split_queue.split_queue_lock); +} +DEFINE_MEMCG_REPARENT_OPS(thp_sq); #else -static inline struct deferred_split *get_deferred_split_queue(struct page *page) +static inline struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, + struct deferred_split *queue) { - struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + return NULL; +} - return &pgdat->deferred_split_queue; +static inline struct deferred_split *folio_memcg_split_queue(struct folio *folio) +{ + return NULL; } #endif +static struct deferred_split *folio_split_queue(struct folio *folio) +{ + struct deferred_split *queue = folio_memcg_split_queue(folio); + + return queue ? : &NODE_DATA(folio_nid(folio))->deferred_split_queue; +} + +static struct deferred_split *folio_split_queue_lock(struct folio *folio) +{ + struct deferred_split *queue; + + rcu_read_lock(); +retry: + queue = folio_split_queue(folio); + spin_lock(&queue->split_queue_lock); + + if (unlikely(folio_split_queue_memcg(folio, queue) != folio_memcg(folio))) { + spin_unlock(&queue->split_queue_lock); + goto retry; + } + rcu_read_unlock(); + + return queue; +} + +static struct deferred_split * +folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags) +{ + struct deferred_split *queue; + + rcu_read_lock(); +retry: + queue = folio_split_queue(folio); + spin_lock_irqsave(&queue->split_queue_lock, *flags); + + if (unlikely(folio_split_queue_memcg(folio, queue) != folio_memcg(folio))) { + spin_unlock_irqrestore(&queue->split_queue_lock, *flags); + goto retry; + } + rcu_read_unlock(); + + return queue; +} + +static inline void split_queue_unlock(struct deferred_split *queue) +{ + spin_unlock(&queue->split_queue_lock); +} + +static inline void split_queue_unlock_irqrestore(struct deferred_split *queue, + unsigned long flags) +{ + spin_unlock_irqrestore(&queue->split_queue_lock, flags); +} + void prep_transhuge_page(struct page *page) { /* @@ -726,7 +890,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - khugepaged_enter(vma, vma->vm_flags); + khugepaged_enter_vma(vma, vma->vm_flags); if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && @@ -2266,11 +2430,11 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, split_huge_pmd_if_needed(vma, end); /* - * If we're also updating the vma->vm_next->vm_start, + * If we're also updating the next vma vm_start, * check if we need to split it. */ if (adjust_next > 0) { - struct vm_area_struct *next = vma->vm_next; + struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); unsigned long nstart = next->vm_start; nstart += adjust_next; split_huge_pmd_if_needed(next, nstart); @@ -2455,7 +2619,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } ClearPageCompound(head); - unlock_page_lruvec(lruvec); + lruvec_unlock(lruvec); /* Caller disabled irqs, so they are still disabled here */ split_page_owner(head, nr); @@ -2540,7 +2704,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) { struct folio *folio = page_folio(page); struct page *head = &folio->page; - struct deferred_split *ds_queue = get_deferred_split_queue(head); + struct deferred_split *ds_queue; XA_STATE(xas, &head->mapping->i_pages, head->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; @@ -2632,13 +2796,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } /* Prevent deferred_split_scan() touching ->_refcount */ - spin_lock(&ds_queue->split_queue_lock); + ds_queue = folio_split_queue_lock(folio); if (page_ref_freeze(head, 1 + extra_pins)) { if (!list_empty(page_deferred_list(head))) { ds_queue->split_queue_len--; list_del(page_deferred_list(head)); } - spin_unlock(&ds_queue->split_queue_lock); + split_queue_unlock(ds_queue); if (mapping) { int nr = thp_nr_pages(head); @@ -2656,7 +2820,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) __split_huge_page(page, list, end); ret = 0; } else { - spin_unlock(&ds_queue->split_queue_lock); + split_queue_unlock(ds_queue); fail: if (mapping) xas_unlock(&xas); @@ -2680,25 +2844,23 @@ out: void free_transhuge_page(struct page *page) { - struct deferred_split *ds_queue = get_deferred_split_queue(page); + struct deferred_split *ds_queue; unsigned long flags; - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + ds_queue = folio_split_queue_lock_irqsave(page_folio(page), &flags); if (!list_empty(page_deferred_list(page))) { ds_queue->split_queue_len--; list_del(page_deferred_list(page)); } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + split_queue_unlock_irqrestore(ds_queue, flags); free_compound_page(page); } void deferred_split_huge_page(struct page *page) { - struct deferred_split *ds_queue = get_deferred_split_queue(page); -#ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = page_memcg(compound_head(page)); -#endif + struct deferred_split *ds_queue; unsigned long flags; + struct folio *folio = page_folio(page); VM_BUG_ON_PAGE(!PageTransHuge(page), page); @@ -2715,18 +2877,19 @@ void deferred_split_huge_page(struct page *page) if (PageSwapCache(page)) return; - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + ds_queue = folio_split_queue_lock_irqsave(folio, &flags); if (list_empty(page_deferred_list(page))) { + struct mem_cgroup *memcg; + + memcg = folio_split_queue_memcg(folio, ds_queue); count_vm_event(THP_DEFERRED_SPLIT_PAGE); list_add_tail(page_deferred_list(page), &ds_queue->split_queue); ds_queue->split_queue_len++; -#ifdef CONFIG_MEMCG if (memcg) set_shrinker_bit(memcg, page_to_nid(page), - deferred_split_shrinker.id); -#endif + shrinker_id(&deferred_split_shrinker)); } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + split_queue_unlock_irqrestore(ds_queue, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, @@ -2906,7 +3069,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, } /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU); if (IS_ERR(page)) continue; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index da59aece4756..87d875e5e0a9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -66,12 +66,6 @@ static bool hugetlb_cma_page(struct page *page, unsigned int order) #endif static unsigned long hugetlb_cma_size __initdata; -/* - * Minimum page order among possible hugepage sizes, set to a proper value - * at boot time. - */ -static unsigned int minimum_order __read_mostly = UINT_MAX; - __initdata LIST_HEAD(huge_boot_pages); /* for command line parsing */ @@ -2152,11 +2146,17 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) unsigned long pfn; struct page *page; int rc = 0; + unsigned int order; + struct hstate *h; if (!hugepages_supported()) return rc; - for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) { + order = huge_page_order(&default_hstate); + for_each_hstate(h) + order = min(order, huge_page_order(h)); + + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) { page = pfn_to_page(pfn); rc = dissolve_free_huge_page(page); if (rc) @@ -2766,8 +2766,7 @@ retry: * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - if (!isolate_huge_page(old_page, list)) - ret = -EBUSY; + ret = isolate_hugetlb(old_page, list); spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!HPageFreed(old_page)) { @@ -2843,7 +2842,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (page_count(head) && isolate_huge_page(head, list)) + if (page_count(head) && !isolate_hugetlb(head, list)) ret = 0; else if (!page_count(head)) ret = alloc_and_dissolve_huge_page(h, head, list); @@ -3149,9 +3148,6 @@ static void __init hugetlb_init_hstates(void) struct hstate *h, *h2; for_each_hstate(h) { - if (minimum_order > huge_page_order(h)) - minimum_order = huge_page_order(h); - /* oversize hugepages were init'ed in early boot */ if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); @@ -3176,7 +3172,6 @@ static void __init hugetlb_init_hstates(void) h->demote_order = h2->order; } } - VM_BUG_ON(minimum_order == UINT_MAX); } static void __init report_hugepages(void) @@ -4732,6 +4727,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, unsigned long npages = pages_per_huge_page(h); struct address_space *mapping = src_vma->vm_file->f_mapping; struct mmu_notifier_range range; + unsigned long last_addr_mask; int ret = 0; if (cow) { @@ -4751,11 +4747,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, i_mmap_lock_read(mapping); } + last_addr_mask = hugetlb_mask_last_page(h); for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; src_pte = huge_pte_offset(src, addr, sz); - if (!src_pte) + if (!src_pte) { + addr |= last_addr_mask; continue; + } dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz); if (!dst_pte) { ret = -ENOMEM; @@ -4772,8 +4771,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, * after taking the lock below. */ dst_entry = huge_ptep_get(dst_pte); - if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) + if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) { + addr |= last_addr_mask; continue; + } dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); @@ -4803,12 +4804,11 @@ again: entry = swp_entry_to_pte(swp_entry); if (userfaultfd_wp(src_vma) && uffd_wp) entry = huge_pte_mkuffd_wp(entry); - set_huge_swap_pte_at(src, addr, src_pte, - entry, sz); + set_huge_pte_at(src, addr, src_pte, entry); } if (!userfaultfd_wp(dst_vma) && uffd_wp) entry = huge_pte_clear_uffd_wp(entry); - set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); + set_huge_pte_at(dst, addr, dst_pte, entry); } else if (unlikely(is_pte_marker(entry))) { /* * We copy the pte marker only if the dst vma has @@ -4934,7 +4934,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; unsigned long old_end = old_addr + len; - unsigned long old_addr_copy; + unsigned long last_addr_mask; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; bool shared_pmd = false; @@ -4949,23 +4949,23 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); + last_addr_mask = hugetlb_mask_last_page(h); /* Prevent race with file truncation */ i_mmap_lock_write(mapping); for (; old_addr < old_end; old_addr += sz, new_addr += sz) { src_pte = huge_pte_offset(mm, old_addr, sz); - if (!src_pte) + if (!src_pte) { + old_addr |= last_addr_mask; + new_addr |= last_addr_mask; continue; + } if (huge_pte_none(huge_ptep_get(src_pte))) continue; - /* old_addr arg to huge_pmd_unshare() is a pointer and so the - * arg may be modified. Pass a copy instead to preserve the - * value in old_addr. - */ - old_addr_copy = old_addr; - - if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) { + if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { shared_pmd = true; + old_addr |= last_addr_mask; + new_addr |= last_addr_mask; continue; } @@ -4999,6 +4999,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); struct mmu_notifier_range range; + unsigned long last_addr_mask; bool force_flush = false; WARN_ON(!is_vm_hugetlb_page(vma)); @@ -5019,17 +5020,21 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); mmu_notifier_invalidate_range_start(&range); + last_addr_mask = hugetlb_mask_last_page(h); address = start; for (; address < end; address += sz) { ptep = huge_pte_offset(mm, address, sz); - if (!ptep) + if (!ptep) { + address |= last_addr_mask; continue; + } ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, vma, &address, ptep)) { + if (huge_pmd_unshare(mm, vma, address, ptep)) { spin_unlock(ptl); tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); force_flush = true; + address |= last_addr_mask; continue; } @@ -5709,7 +5714,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait_huge(vma, mm, ptep); + migration_entry_wait_huge(vma, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | @@ -6046,8 +6051,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); - (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, - dst_vma->vm_flags & VM_WRITE); hugetlb_count_add(pages_per_huge_page(h), dst_mm); /* No need to invalidate - it was non-present before */ @@ -6299,6 +6302,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long pages = 0, psize = huge_page_size(h); bool shared_pmd = false; struct mmu_notifier_range range; + unsigned long last_addr_mask; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; @@ -6315,14 +6319,17 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); + last_addr_mask = hugetlb_mask_last_page(h); i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += psize) { spinlock_t *ptl; ptep = huge_pte_offset(mm, address, psize); - if (!ptep) + if (!ptep) { + address |= last_addr_mask; continue; + } ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, vma, &address, ptep)) { + if (huge_pmd_unshare(mm, vma, address, ptep)) { /* * When uffd-wp is enabled on the vma, unshare * shouldn't happen at all. Warn about it if it @@ -6332,6 +6339,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pages++; spin_unlock(ptl); shared_pmd = true; + address |= last_addr_mask; continue; } pte = huge_ptep_get(ptep); @@ -6357,8 +6365,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, newpte = pte_swp_mkuffd_wp(newpte); else if (uffd_wp_resolve) newpte = pte_swp_clear_uffd_wp(newpte); - set_huge_swap_pte_at(mm, address, ptep, - newpte, psize); + set_huge_pte_at(mm, address, ptep, newpte); pages++; } spin_unlock(ptl); @@ -6755,11 +6762,11 @@ out: * 0 the underlying pte page is not shared, or it is the last user */ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long *addr, pte_t *ptep) + unsigned long addr, pte_t *ptep) { - pgd_t *pgd = pgd_offset(mm, *addr); - p4d_t *p4d = p4d_offset(pgd, *addr); - pud_t *pud = pud_offset(p4d, *addr); + pgd_t *pgd = pgd_offset(mm, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); BUG_ON(page_count(virt_to_page(ptep)) == 0); @@ -6769,14 +6776,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, pud_clear(pud); put_page(virt_to_page(ptep)); mm_dec_nr_pmds(mm); - /* - * This update of passed address optimizes loops sequentially - * processing addresses in increments of huge page size (PMD_SIZE - * in this case). By clearing the pud, a PUD_SIZE area is unmapped. - * Update address to the 'last page' in the cleared area so that - * calling loop can move to first page past this area. - */ - *addr |= PUD_SIZE - PMD_SIZE; return 1; } @@ -6788,7 +6787,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, } int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long *addr, pte_t *ptep) + unsigned long addr, pte_t *ptep) { return 0; } @@ -6871,6 +6870,37 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return (pte_t *)pmd; } +/* + * Return a mask that can be used to update an address to the last huge + * page in a page table page mapping size. Used to skip non-present + * page table entries when linearly scanning address ranges. Architectures + * with unique huge page to page table relationships can define their own + * version of this routine. + */ +unsigned long hugetlb_mask_last_page(struct hstate *h) +{ + unsigned long hp_size = huge_page_size(h); + + if (hp_size == PUD_SIZE) + return P4D_SIZE - PUD_SIZE; + else if (hp_size == PMD_SIZE) + return PUD_SIZE - PMD_SIZE; + else + return 0UL; +} + +#else + +/* See description above. Architectures can provide their own version. */ +__weak unsigned long hugetlb_mask_last_page(struct hstate *h) +{ +#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE + if (huge_page_size(h) == PMD_SIZE) + return PUD_SIZE - PMD_SIZE; +#endif + return 0UL; +} + #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ /* @@ -6934,7 +6964,7 @@ retry: } else { if (is_hugetlb_entry_migration(pte)) { spin_unlock(ptl); - __migration_entry_wait(mm, (pte_t *)pmd, ptl); + __migration_entry_wait_huge((pte_t *)pmd, ptl); goto retry; } /* @@ -6966,15 +6996,15 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); } -bool isolate_huge_page(struct page *page, struct list_head *list) +int isolate_hugetlb(struct page *page, struct list_head *list) { - bool ret = true; + int ret = 0; spin_lock_irq(&hugetlb_lock); if (!PageHeadHuge(page) || !HPageMigratable(page) || !get_page_unless_zero(page)) { - ret = false; + ret = -EBUSY; goto unlock; } ClearHPageMigratable(page); @@ -7094,14 +7124,11 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) mmu_notifier_invalidate_range_start(&range); i_mmap_lock_write(vma->vm_file->f_mapping); for (address = start; address < end; address += PUD_SIZE) { - unsigned long tmp = address; - ptep = huge_pte_offset(mm, address, sz); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); - /* We don't want 'address' to be changed */ - huge_pmd_unshare(mm, vma, &tmp, ptep); + huge_pmd_unshare(mm, vma, address, ptep); spin_unlock(ptl); } flush_hugetlb_tlb_range(vma, start, end); diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index ba29c15c53d6..1362feb3c6c9 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -10,7 +10,7 @@ */ #define pr_fmt(fmt) "HugeTLB: " fmt -#include <linux/memory_hotplug.h> +#include <linux/memory.h> #include "hugetlb_vmemmap.h" /* @@ -97,18 +97,68 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) return ret; } +static unsigned int vmemmap_optimizable_pages(struct hstate *h, + struct page *head) +{ + if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) + return 0; + + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { + pmd_t *pmdp, pmd; + struct page *vmemmap_page; + unsigned long vaddr = (unsigned long)head; + + /* + * Only the vmemmap page's vmemmap page can be self-hosted. + * Walking the page tables to find the backing page of the + * vmemmap page. + */ + pmdp = pmd_off_k(vaddr); + /* + * The READ_ONCE() is used to stabilize *pmdp in a register or + * on the stack so that it will stop changing under the code. + * The only concurrent operation where it can be changed is + * split_vmemmap_huge_pmd() (*pmdp will be stable after this + * operation). + */ + pmd = READ_ONCE(*pmdp); + if (pmd_leaf(pmd)) + vmemmap_page = pmd_page(pmd) + pte_index(vaddr); + else + vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr)); + /* + * Due to HugeTLB alignment requirements and the vmemmap pages + * being at the start of the hotplugged memory region in + * memory_hotplug.memmap_on_memory case. Checking any vmemmap + * page's vmemmap page if it is marked as VmemmapSelfHosted is + * sufficient. + * + * [ hotplugged memory ] + * [ section ][...][ section ] + * [ vmemmap ][ usable memory ] + * ^ | | | + * +---+ | | + * ^ | | + * +-------+ | + * ^ | + * +-------------------------------------------+ + */ + if (PageVmemmapSelfHosted(vmemmap_page)) + return 0; + } + + return hugetlb_optimize_vmemmap_pages(h); +} + void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { unsigned long vmemmap_addr = (unsigned long)head; unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; - vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + vmemmap_pages = vmemmap_optimizable_pages(h, head); if (!vmemmap_pages) return; - if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) - return; - static_branch_inc(&hugetlb_optimize_vmemmap_key); vmemmap_addr += RESERVE_VMEMMAP_SIZE; @@ -199,10 +249,10 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = { static __init int hugetlb_vmemmap_sysctls_init(void) { /* - * If "memory_hotplug.memmap_on_memory" is enabled or "struct page" - * crosses page boundaries, the vmemmap pages cannot be optimized. + * If "struct page" crosses page boundaries, the vmemmap pages cannot + * be optimized. */ - if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page))) + if (is_power_of_2(sizeof(struct page))) register_sysctl_init("vm", hugetlb_vmemmap_sysctls); return 0; diff --git a/mm/init-mm.c b/mm/init-mm.c index fbe7844d0912..c9327abb771c 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/mm_types.h> -#include <linux/rbtree.h> +#include <linux/maple_tree.h> #include <linux/rwsem.h> #include <linux/spinlock.h> #include <linux/list.h> @@ -28,7 +28,7 @@ * and size this cpu_bitmask to NR_CPUS. */ struct mm_struct init_mm = { - .mm_rb = RB_ROOT, + .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), diff --git a/mm/internal.h b/mm/internal.h index ddd2d6a46f1b..39b6dd400a15 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -24,7 +24,7 @@ struct folio_batch; #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC|__GFP_NOLOCKDEP) + __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) @@ -84,8 +84,9 @@ void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); -void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, - unsigned long floor, unsigned long ceiling); +void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long floor, + unsigned long ceiling); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); struct zap_details; @@ -479,9 +480,6 @@ static inline bool is_data_mapping(vm_flags_t flags) } /* mm/util.c */ -void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev); -void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); struct anon_vma *folio_anon_vma(struct folio *folio); #ifdef CONFIG_MMU @@ -853,6 +851,7 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); void free_zone_device_page(struct page *page); +struct page *migrate_device_page(struct page *page, unsigned int gup_flags); /* * mm/gup.c diff --git a/mm/kasan/common.c b/mm/kasan/common.c index c40c0e7b3b5f..707c3a527fcb 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -343,7 +343,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, if (unlikely(nearest_obj(cache, virt_to_slab(object), object) != object)) { - kasan_report_invalid_free(tagged_object, ip); + kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_INVALID_FREE); return true; } @@ -352,7 +352,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, return false; if (!kasan_byte_accessible(tagged_object)) { - kasan_report_invalid_free(tagged_object, ip); + kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_DOUBLE_FREE); return true; } @@ -377,12 +377,12 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) { if (ptr != page_address(virt_to_head_page(ptr))) { - kasan_report_invalid_free(ptr, ip); + kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE); return true; } if (!kasan_byte_accessible(ptr)) { - kasan_report_invalid_free(ptr, ip); + kasan_report_invalid_free(ptr, ip, KASAN_REPORT_DOUBLE_FREE); return true; } diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 9e1b6544bfa8..9ad8eff71b28 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -257,27 +257,37 @@ static void unpoison_vmalloc_pages(const void *addr, u8 tag) } } +static void init_vmalloc_pages(const void *start, unsigned long size) +{ + const void *addr; + + for (addr = start; addr < start + size; addr += PAGE_SIZE) { + struct page *page = virt_to_page(addr); + + clear_highpage_kasan_tagged(page); + } +} + void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, kasan_vmalloc_flags_t flags) { u8 tag; unsigned long redzone_start, redzone_size; - if (!kasan_vmalloc_enabled()) - return (void *)start; - - if (!is_vmalloc_or_module_addr(start)) + if (!kasan_vmalloc_enabled() || !is_vmalloc_or_module_addr(start)) { + if (flags & KASAN_VMALLOC_INIT) + init_vmalloc_pages(start, size); return (void *)start; + } /* - * Skip unpoisoning and assigning a pointer tag for non-VM_ALLOC - * mappings as: + * Don't tag non-VM_ALLOC mappings, as: * * 1. Unlike the software KASAN modes, hardware tag-based KASAN only * supports tagging physical memory. Therefore, it can only tag a * single mapping of normal physical pages. * 2. Hardware tag-based KASAN can only tag memory mapped with special - * mapping protection bits, see arch_vmalloc_pgprot_modify(). + * mapping protection bits, see arch_vmap_pgprot_tagged(). * As non-VM_ALLOC mappings can be mapped outside of vmalloc code, * providing these bits would require tracking all non-VM_ALLOC * mappers. @@ -289,15 +299,19 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, * * For non-VM_ALLOC allocations, page_alloc memory is tagged as usual. */ - if (!(flags & KASAN_VMALLOC_VM_ALLOC)) + if (!(flags & KASAN_VMALLOC_VM_ALLOC)) { + WARN_ON(flags & KASAN_VMALLOC_INIT); return (void *)start; + } /* * Don't tag executable memory. * The kernel doesn't tolerate having the PC register tagged. */ - if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) + if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) { + WARN_ON(flags & KASAN_VMALLOC_INIT); return (void *)start; + } tag = kasan_random_tag(); start = set_tag(start, tag); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 610d60d6e5b8..01c03e45acd4 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -125,6 +125,7 @@ static inline bool kasan_sync_fault_possible(void) enum kasan_report_type { KASAN_REPORT_ACCESS, KASAN_REPORT_INVALID_FREE, + KASAN_REPORT_DOUBLE_FREE, }; struct kasan_report_info { @@ -277,7 +278,7 @@ static inline void kasan_print_address_stack_frame(const void *addr) { } bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); -void kasan_report_invalid_free(void *object, unsigned long ip); +void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type); struct page *kasan_addr_to_page(const void *addr); struct slab *kasan_addr_to_slab(const void *addr); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index b341a191651d..fe3f606b3a98 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -176,8 +176,12 @@ static void end_report(unsigned long *flags, void *addr) static void print_error_description(struct kasan_report_info *info) { if (info->type == KASAN_REPORT_INVALID_FREE) { - pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", - (void *)info->ip); + pr_err("BUG: KASAN: invalid-free in %pS\n", (void *)info->ip); + return; + } + + if (info->type == KASAN_REPORT_DOUBLE_FREE) { + pr_err("BUG: KASAN: double-free in %pS\n", (void *)info->ip); return; } @@ -433,7 +437,7 @@ static void print_report(struct kasan_report_info *info) } } -void kasan_report_invalid_free(void *ptr, unsigned long ip) +void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type) { unsigned long flags; struct kasan_report_info info; @@ -448,7 +452,7 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip) start_report(&flags, true); - info.type = KASAN_REPORT_INVALID_FREE; + info.type = type; info.access_addr = ptr; info.first_bad_addr = kasan_reset_tag(ptr); info.access_size = 0; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 16be62d493cd..cfe231c5958f 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -147,8 +147,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, return count; } static struct kobj_attribute scan_sleep_millisecs_attr = - __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, - scan_sleep_millisecs_store); + __ATTR_RW(scan_sleep_millisecs); static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -175,8 +174,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, return count; } static struct kobj_attribute alloc_sleep_millisecs_attr = - __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, - alloc_sleep_millisecs_store); + __ATTR_RW(alloc_sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -200,8 +198,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj, return count; } static struct kobj_attribute pages_to_scan_attr = - __ATTR(pages_to_scan, 0644, pages_to_scan_show, - pages_to_scan_store); + __ATTR_RW(pages_to_scan); static ssize_t pages_collapsed_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -221,22 +218,21 @@ static ssize_t full_scans_show(struct kobject *kobj, static struct kobj_attribute full_scans_attr = __ATTR_RO(full_scans); -static ssize_t khugepaged_defrag_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { return single_hugepage_flag_show(kobj, attr, buf, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } -static ssize_t khugepaged_defrag_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { return single_hugepage_flag_store(kobj, attr, buf, count, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static struct kobj_attribute khugepaged_defrag_attr = - __ATTR(defrag, 0644, khugepaged_defrag_show, - khugepaged_defrag_store); + __ATTR_RW(defrag); /* * max_ptes_none controls if khugepaged should collapse hugepages over @@ -246,21 +242,21 @@ static struct kobj_attribute khugepaged_defrag_attr = * runs. Increasing max_ptes_none will instead potentially reduce the * free memory in the system during the khugepaged scan. */ -static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) +static ssize_t max_ptes_none_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none); } -static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t max_ptes_none_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { int err; unsigned long max_ptes_none; err = kstrtoul(buf, 10, &max_ptes_none); - if (err || max_ptes_none > HPAGE_PMD_NR-1) + if (err || max_ptes_none > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_none = max_ptes_none; @@ -268,25 +264,24 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, return count; } static struct kobj_attribute khugepaged_max_ptes_none_attr = - __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, - khugepaged_max_ptes_none_store); + __ATTR_RW(max_ptes_none); -static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) +static ssize_t max_ptes_swap_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap); } -static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t max_ptes_swap_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { int err; unsigned long max_ptes_swap; err = kstrtoul(buf, 10, &max_ptes_swap); - if (err || max_ptes_swap > HPAGE_PMD_NR-1) + if (err || max_ptes_swap > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_swap = max_ptes_swap; @@ -295,25 +290,24 @@ static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj, } static struct kobj_attribute khugepaged_max_ptes_swap_attr = - __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show, - khugepaged_max_ptes_swap_store); + __ATTR_RW(max_ptes_swap); -static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) +static ssize_t max_ptes_shared_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared); } -static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t max_ptes_shared_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { int err; unsigned long max_ptes_shared; err = kstrtoul(buf, 10, &max_ptes_shared); - if (err || max_ptes_shared > HPAGE_PMD_NR-1) + if (err || max_ptes_shared > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_shared = max_ptes_shared; @@ -322,8 +316,7 @@ static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj, } static struct kobj_attribute khugepaged_max_ptes_shared_attr = - __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show, - khugepaged_max_ptes_shared_store); + __ATTR_RW(max_ptes_shared); static struct attribute *khugepaged_attr[] = { &khugepaged_defrag_attr.attr, @@ -437,43 +430,6 @@ static inline int khugepaged_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } -bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - if (!transhuge_vma_enabled(vma, vm_flags)) - return false; - - if (vm_flags & VM_NO_KHUGEPAGED) - return false; - - /* Don't run khugepaged against DAX vma */ - if (vma_is_dax(vma)) - return false; - - if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - - vma->vm_pgoff, HPAGE_PMD_NR)) - return false; - - /* Enabled via shmem mount options or sysfs settings. */ - if (shmem_file(vma->vm_file)) - return shmem_huge_enabled(vma); - - /* THP settings require madvise. */ - if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) - return false; - - /* Only regular file is valid */ - if (file_thp_enabled(vma)) - return true; - - if (!vma->anon_vma || !vma_is_anonymous(vma)) - return false; - if (vma_is_temporary_stack(vma)) - return false; - - return true; -} - void __khugepaged_enter(struct mm_struct *mm) { struct mm_slot *mm_slot; @@ -509,10 +465,8 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && - khugepaged_enabled() && - (((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < - (vma->vm_end & HPAGE_PMD_MASK))) { - if (hugepage_vma_check(vma, vm_flags)) + hugepage_flags_enabled()) { + if (hugepage_vma_check(vma, vm_flags, false, false)) __khugepaged_enter(vma->vm_mm); } } @@ -599,7 +553,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, int none_or_zero = 0, shared = 0, result = 0, referenced = 0; bool writable = false; - for (_pte = pte; _pte < pte+HPAGE_PMD_NR; + for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || (pte_present(pteval) && @@ -618,7 +572,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } page = vm_normal_page(vma, address, pteval); - if (unlikely(!page)) { + if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out; } @@ -762,7 +716,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { list_del(&src_page->lru); - release_pte_page(src_page); + mod_node_page_state(page_pgdat(src_page), + NR_ISOLATED_ANON + page_is_file_lru(src_page), + -compound_nr(src_page)); + unlock_page(src_page); + free_swap_cache(src_page); + putback_lru_page(src_page); } } @@ -802,6 +761,10 @@ static bool khugepaged_scan_abort(int nid) return false; } +#define khugepaged_defrag() \ + (transparent_hugepage_flags & \ + (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)) + /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) { @@ -899,7 +862,7 @@ static struct page *khugepaged_alloc_hugepage(bool *wait) khugepaged_alloc_sleep(); } else count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && likely(khugepaged_enabled())); + } while (unlikely(!hpage) && likely(hugepage_flags_enabled())); return hpage; } @@ -947,7 +910,6 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct vm_area_struct **vmap) { struct vm_area_struct *vma; - unsigned long hstart, hend; if (unlikely(khugepaged_test_exit(mm))) return SCAN_ANY_PROCESS; @@ -956,13 +918,17 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!vma) return SCAN_VMA_NULL; - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = vma->vm_end & HPAGE_PMD_MASK; - if (address < hstart || address + HPAGE_PMD_SIZE > hend) + if (!transhuge_vma_suitable(vma, address)) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma, vma->vm_flags)) + if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) return SCAN_VMA_CHECK; - /* Anon VMA expected */ + /* + * Anon VMA expected, the address may be unmapped then + * remapped to file after khugepaged reaquired the mmap_lock. + * + * hugepage_vma_check may return true for qualified file + * vmas. + */ if (!vma->anon_vma || !vma_is_anonymous(vma)) return SCAN_VMA_CHECK; return 0; @@ -972,8 +938,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, * Bring missing pages in from swap, to complete THP collapse. * Only done if khugepaged_scan_pmd believes it is worthwhile. * - * Called and returns without pte mapped or spinlocks held, - * but with mmap_lock held to protect against vma changes. + * Called and returns without pte mapped or spinlocks held. + * Note that if false is returned, mmap_lock will be released. */ static bool __collapse_huge_page_swapin(struct mm_struct *mm, @@ -1000,27 +966,24 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, pte_unmap(vmf.pte); continue; } - swapped_in++; ret = do_swap_page(&vmf); - /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */ + /* + * do_swap_page returns VM_FAULT_RETRY with released mmap_lock. + * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because + * we do not retry here and swap entry will remain in pagetable + * resulting in later failure. + */ if (ret & VM_FAULT_RETRY) { - mmap_read_lock(mm); - if (hugepage_vma_revalidate(mm, haddr, &vma)) { - /* vma is no longer available, don't continue to swapin */ - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; - } - /* check if the pmd is still valid */ - if (mm_find_pmd(mm, haddr) != pmd) { - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; - } + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); + return false; } if (ret & VM_FAULT_ERROR) { + mmap_read_unlock(mm); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } + swapped_in++; } /* Drain LRU add pagevec to remove extra pin on the swapped in pages */ @@ -1086,13 +1049,12 @@ static void collapse_huge_page(struct mm_struct *mm, } /* - * __collapse_huge_page_swapin always returns with mmap_lock locked. - * If it fails, we release mmap_lock and jump out_nolock. + * __collapse_huge_page_swapin will return with mmap_lock released + * when it fails. So we jump out_nolock directly in that case. * Continuing to collapse causes inconsistency. */ if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) { - mmap_read_unlock(mm); goto out_nolock; } @@ -1219,7 +1181,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); - for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (is_swap_pte(pteval)) { @@ -1267,7 +1229,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, writable = true; page = vm_normal_page(vma, _address, pteval); - if (unlikely(!page)) { + if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out_unmap; } @@ -1309,7 +1271,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, /* * Check if the page has any GUP (or other external) pins. * - * Here the check is racy it may see totmal_mapcount > refcount + * Here the check is racy it may see total_mapcount > refcount * in some cases. * For example, one process with one forked child process. * The parent has the PMD split due to MADV_DONTNEED, then @@ -1382,8 +1344,8 @@ static void collect_mm_slot(struct mm_slot *mm_slot) * Notify khugepaged that given addr of the mm is pte-mapped THP. Then * khugepaged should try to collapse the page table. */ -static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr) +static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) { struct mm_slot *mm_slot; @@ -1394,7 +1356,6 @@ static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; spin_unlock(&khugepaged_mm_lock); - return 0; } static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, @@ -1426,7 +1387,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) { unsigned long haddr = addr & HPAGE_PMD_MASK; - struct vm_area_struct *vma = find_vma(mm, haddr); + struct vm_area_struct *vma = vma_lookup(mm, haddr); struct page *hpage; pte_t *start_pte, *pte; pmd_t *pmd; @@ -1444,7 +1405,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check() * will not fail the vma for missing VM_HUGEPAGE */ - if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE)) + if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false)) return; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -1479,7 +1440,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) goto abort; page = vm_normal_page(vma, addr, *pte); - + if (WARN_ON_ONCE(page && is_zone_device_page(page))) + page = NULL; /* * Note that uprobe, debugger, or MAP_PRIVATE may change the * page table, but the new page will not be a subpage of hpage. @@ -1497,6 +1459,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (pte_none(*pte)) continue; page = vm_normal_page(vma, addr, *pte); + if (WARN_ON_ONCE(page && is_zone_device_page(page))) + goto abort; page_remove_rmap(page, vma, false); } @@ -1557,7 +1521,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * mmap_write_lock(mm) as PMD-mapping is likely to be split * later. * - * Not that vma->anon_vma check is racy: it can be set up after + * Note that vma->anon_vma check is racy: it can be set up after * the check but before we took mmap_lock by the fault path. * But page lock would prevent establishing any new ptes of the * page, so we are safe. @@ -1885,8 +1849,8 @@ out_unlock: if (nr_none) { __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); - if (is_shmem) - __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); + /* nr_none is always 0 for non-shmem. */ + __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); } /* Join all the small entries into a single multi-index entry */ @@ -1950,10 +1914,10 @@ xa_unlocked: /* Something went wrong: roll back page cache changes */ xas_lock_irq(&xas); - mapping->nrpages -= nr_none; - - if (is_shmem) + if (nr_none) { + mapping->nrpages -= nr_none; shmem_uncharge(mapping->host, nr_none); + } xas_set(&xas, start); xas_for_each(&xas, page, end - 1) { @@ -2092,10 +2056,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { + struct vma_iterator vmi; struct mm_slot *mm_slot; struct mm_struct *mm; struct vm_area_struct *vma; int progress = 0; + unsigned long address; VM_BUG_ON(!pages); lockdep_assert_held(&khugepaged_mm_lock); @@ -2119,11 +2085,14 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, vma = NULL; if (unlikely(!mmap_read_trylock(mm))) goto breakouterloop_mmap_lock; - if (likely(!khugepaged_test_exit(mm))) - vma = find_vma(mm, khugepaged_scan.address); progress++; - for (; vma; vma = vma->vm_next) { + if (unlikely(khugepaged_test_exit(mm))) + goto breakouterloop; + + address = khugepaged_scan.address; + vma_iter_init(&vmi, mm, address); + for_each_vma(vmi, vma) { unsigned long hstart, hend; cond_resched(); @@ -2131,22 +2100,18 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, progress++; break; } - if (!hugepage_vma_check(vma, vma->vm_flags)) { + if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) { skip: progress++; continue; } - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = vma->vm_end & HPAGE_PMD_MASK; - if (hstart >= hend) - goto skip; + hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); + hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); if (khugepaged_scan.address > hend) goto skip; if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); - if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma)) - goto skip; while (khugepaged_scan.address < hend) { int ret; @@ -2216,7 +2181,7 @@ breakouterloop_mmap_lock: static int khugepaged_has_work(void) { return !list_empty(&khugepaged_scan.mm_head) && - khugepaged_enabled(); + hugepage_flags_enabled(); } static int khugepaged_wait_event(void) @@ -2281,7 +2246,7 @@ static void khugepaged_wait_work(void) return; } - if (khugepaged_enabled()) + if (hugepage_flags_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } @@ -2312,7 +2277,7 @@ static void set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; - if (!khugepaged_enabled()) { + if (!hugepage_flags_enabled()) { calculate_min_free_kbytes(); goto update_wmarks; } @@ -2362,7 +2327,7 @@ int start_stop_khugepaged(void) int err = 0; mutex_lock(&khugepaged_mutex); - if (khugepaged_enabled()) { + if (hugepage_flags_enabled()) { if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); @@ -2388,7 +2353,7 @@ fail: void khugepaged_min_free_kbytes_update(void) { mutex_lock(&khugepaged_mutex); - if (khugepaged_enabled() && khugepaged_thread) + if (hugepage_flags_enabled() && khugepaged_thread) set_recommended_min_free_kbytes(); mutex_unlock(&khugepaged_mutex); } @@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) do { cond_resched(); page = follow_page(vma, addr, - FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); + FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU); if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) @@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) if (!vma) goto out; - page = follow_page(vma, addr, FOLL_GET); + page = follow_page(vma, addr, FOLL_GET | FOLL_LRU); if (IS_ERR_OR_NULL(page)) goto out; if (PageAnon(page)) { @@ -981,11 +981,13 @@ static int unmerge_and_remove_all_rmap_items(void) struct mm_slot, mm_list); spin_unlock(&ksm_mmlist_lock); - for (mm_slot = ksm_scan.mm_slot; - mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { + for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; + mm_slot = ksm_scan.mm_slot) { + VMA_ITERATOR(vmi, mm_slot->mm, 0); + mm = mm_slot->mm; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (ksm_test_exit(mm)) break; if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) @@ -2232,6 +2234,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) struct mm_slot *slot; struct vm_area_struct *vma; struct rmap_item *rmap_item; + struct vma_iterator vmi; int nid; if (list_empty(&ksm_mm_head.mm_list)) @@ -2290,13 +2293,13 @@ next_mm: } mm = slot->mm; + vma_iter_init(&vmi, mm, ksm_scan.address); + mmap_read_lock(mm); if (ksm_test_exit(mm)) - vma = NULL; - else - vma = find_vma(mm, ksm_scan.address); + goto no_vmas; - for (; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE)) continue; if (ksm_scan.address < vma->vm_start) @@ -2307,7 +2310,7 @@ next_mm: while (ksm_scan.address < vma->vm_end) { if (ksm_test_exit(mm)) break; - *page = follow_page(vma, ksm_scan.address, FOLL_GET); + *page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU); if (IS_ERR_OR_NULL(*page)) { ksm_scan.address += PAGE_SIZE; cond_resched(); @@ -2334,6 +2337,7 @@ next_mm: } if (ksm_test_exit(mm)) { +no_vmas: ksm_scan.address = 0; ksm_scan.rmap_list = &slot->rmap_list; } diff --git a/mm/madvise.c b/mm/madvise.c index 0316bbc6441b..851fa4e134bc 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -195,7 +195,6 @@ success: static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) { - pte_t *orig_pte; struct vm_area_struct *vma = walk->private; unsigned long index; struct swap_iocb *splug = NULL; @@ -208,12 +207,13 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, swp_entry_t entry; struct page *page; spinlock_t *ptl; + pte_t *ptep; - orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); - pte = *(orig_pte + ((index - start) / PAGE_SIZE)); - pte_unmap_unlock(orig_pte, ptl); + ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl); + pte = *ptep; + pte_unmap_unlock(ptep, ptl); - if (pte_present(pte) || pte_none(pte)) + if (!is_swap_pte(pte)) continue; entry = pte_to_swp_entry(pte); if (unlikely(non_swap_entry(entry))) @@ -421,7 +421,7 @@ regular_page: continue; page = vm_normal_page(vma, addr, ptent); - if (!page) + if (!page || is_zone_device_page(page)) continue; /* @@ -639,7 +639,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, } page = vm_normal_page(vma, addr, ptent); - if (!page) + if (!page || is_zone_device_page(page)) continue; /* @@ -1238,7 +1238,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, if (start >= end) break; if (prev) - vma = prev->vm_next; + vma = find_vma(mm, prev->vm_end); else /* madvise_remove dropped mmap_lock */ vma = find_vma(mm, start); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 655c09393ad5..a04903fd733b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -77,6 +77,7 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly; EXPORT_SYMBOL(memory_cgrp_subsys); struct mem_cgroup *root_mem_cgroup __read_mostly; +static struct obj_cgroup *root_obj_cgroup __read_mostly; /* Active memory cgroup to use from an interrupt context */ DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); @@ -252,9 +253,14 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) return container_of(vmpr, struct mem_cgroup, vmpressure); } -#ifdef CONFIG_MEMCG_KMEM static DEFINE_SPINLOCK(objcg_lock); +static inline bool obj_cgroup_is_root(struct obj_cgroup *objcg) +{ + return objcg == root_obj_cgroup; +} + +#ifdef CONFIG_MEMCG_KMEM bool mem_cgroup_kmem_disabled(void) { return cgroup_memory_nokmem; @@ -263,12 +269,10 @@ bool mem_cgroup_kmem_disabled(void) static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, unsigned int nr_pages); -static void obj_cgroup_release(struct percpu_ref *ref) +static void obj_cgroup_release_bytes(struct obj_cgroup *objcg) { - struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt); unsigned int nr_bytes; unsigned int nr_pages; - unsigned long flags; /* * At this point all allocated objects are freed, and @@ -282,9 +286,9 @@ static void obj_cgroup_release(struct percpu_ref *ref) * 3) CPU1: a process from another memcg is allocating something, * the stock if flushed, * objcg->nr_charged_bytes = PAGE_SIZE - 92 - * 5) CPU0: we do release this object, + * 4) CPU0: we do release this object, * 92 bytes are added to stock->nr_bytes - * 6) CPU0: stock is flushed, + * 5) CPU0: stock is flushed, * 92 bytes are added to objcg->nr_charged_bytes * * In the result, nr_charged_bytes == PAGE_SIZE. @@ -296,6 +300,19 @@ static void obj_cgroup_release(struct percpu_ref *ref) if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); +} +#else +static inline void obj_cgroup_release_bytes(struct obj_cgroup *objcg) +{ +} +#endif + +static void obj_cgroup_release(struct percpu_ref *ref) +{ + struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt); + unsigned long flags; + + obj_cgroup_release_bytes(objcg); spin_lock_irqsave(&objcg_lock, flags); list_del(&objcg->list); @@ -324,28 +341,135 @@ static struct obj_cgroup *obj_cgroup_alloc(void) return objcg; } -static void memcg_reparent_objcgs(struct mem_cgroup *memcg, - struct mem_cgroup *parent) +static void objcg_reparent_lock(struct mem_cgroup *src, struct mem_cgroup *dst) +{ + spin_lock(&objcg_lock); +} + +static void objcg_reparent_relocate(struct mem_cgroup *src, struct mem_cgroup *dst) { struct obj_cgroup *objcg, *iter; - objcg = rcu_replace_pointer(memcg->objcg, NULL, true); + objcg = rcu_replace_pointer(src->objcg, NULL, true); + /* 1) Ready to reparent active objcg. */ + list_add(&objcg->list, &src->objcg_list); + /* 2) Reparent active objcg and already reparented objcgs to dst. */ + list_for_each_entry(iter, &src->objcg_list, list) + WRITE_ONCE(iter->memcg, dst); + /* 3) Move already reparented objcgs to the dst's list */ + list_splice(&src->objcg_list, &dst->objcg_list); +} - spin_lock_irq(&objcg_lock); +static void objcg_reparent_unlock(struct mem_cgroup *src, struct mem_cgroup *dst) +{ + spin_unlock(&objcg_lock); +} - /* 1) Ready to reparent active objcg. */ - list_add(&objcg->list, &memcg->objcg_list); - /* 2) Reparent active objcg and already reparented objcgs to parent. */ - list_for_each_entry(iter, &memcg->objcg_list, list) - WRITE_ONCE(iter->memcg, parent); - /* 3) Move already reparented objcgs to the parent's list */ - list_splice(&memcg->objcg_list, &parent->objcg_list); +static DEFINE_MEMCG_REPARENT_OPS(objcg); + +static void lruvec_reparent_lock(struct mem_cgroup *src, struct mem_cgroup *dst) +{ + int nid, nest = 0; + + for_each_node(nid) { + spin_lock_nested(&mem_cgroup_lruvec(src, + NODE_DATA(nid))->lru_lock, nest++); + spin_lock_nested(&mem_cgroup_lruvec(dst, + NODE_DATA(nid))->lru_lock, nest++); + } +} + +static void lruvec_reparent_lru(struct lruvec *src, struct lruvec *dst, + enum lru_list lru) +{ + int zid; + struct mem_cgroup_per_node *mz_src, *mz_dst; + + mz_src = container_of(src, struct mem_cgroup_per_node, lruvec); + mz_dst = container_of(dst, struct mem_cgroup_per_node, lruvec); - spin_unlock_irq(&objcg_lock); + if (lru != LRU_UNEVICTABLE) + list_splice_tail_init(&src->lists[lru], &dst->lists[lru]); + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz_dst->lru_zone_size[zid][lru] += mz_src->lru_zone_size[zid][lru]; + mz_src->lru_zone_size[zid][lru] = 0; + } +} + +static void lruvec_reparent_relocate(struct mem_cgroup *src, struct mem_cgroup *dst) +{ + int nid; + + for_each_node(nid) { + enum lru_list lru; + struct lruvec *src_lruvec, *dst_lruvec; + + src_lruvec = mem_cgroup_lruvec(src, NODE_DATA(nid)); + dst_lruvec = mem_cgroup_lruvec(dst, NODE_DATA(nid)); + + dst_lruvec->anon_cost += src_lruvec->anon_cost; + dst_lruvec->file_cost += src_lruvec->file_cost; + + for_each_lru(lru) + lruvec_reparent_lru(src_lruvec, dst_lruvec, lru); + } +} + +static void lruvec_reparent_unlock(struct mem_cgroup *src, struct mem_cgroup *dst) +{ + int nid; + + for_each_node(nid) { + spin_unlock(&mem_cgroup_lruvec(dst, NODE_DATA(nid))->lru_lock); + spin_unlock(&mem_cgroup_lruvec(src, NODE_DATA(nid))->lru_lock); + } +} + +static DEFINE_MEMCG_REPARENT_OPS(lruvec); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +DECLARE_MEMCG_REPARENT_OPS(thp_sq); +#endif + +/* The lock order depends on the order of elements in this array. */ +static const struct memcg_reparent_ops *memcg_reparent_ops[] = { + &memcg_objcg_reparent_ops, + &memcg_lruvec_reparent_ops, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + &memcg_thp_sq_reparent_ops, +#endif +}; + +#define DEFINE_MEMCG_REPARENT_FUNC(phase) \ + static void memcg_reparent_##phase(struct mem_cgroup *src, \ + struct mem_cgroup *dst) \ + { \ + int i; \ + \ + for (i = 0; i < ARRAY_SIZE(memcg_reparent_ops); i++) \ + memcg_reparent_ops[i]->phase(src, dst); \ + } + +DEFINE_MEMCG_REPARENT_FUNC(lock) +DEFINE_MEMCG_REPARENT_FUNC(relocate) +DEFINE_MEMCG_REPARENT_FUNC(unlock) + +static void memcg_reparent_objcgs(struct mem_cgroup *src) +{ + struct mem_cgroup *dst = parent_mem_cgroup(src); + struct obj_cgroup *objcg = rcu_dereference_protected(src->objcg, true); + + local_irq_disable(); + memcg_reparent_lock(src, dst); + memcg_reparent_relocate(src, dst); + memcg_reparent_unlock(src, dst); + local_irq_enable(); percpu_ref_kill(&objcg->refcnt); } +#ifdef CONFIG_MEMCG_KMEM /* * A lot of the calls to the cache allocation functions are expected to be * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are @@ -357,7 +481,7 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif /** - * mem_cgroup_css_from_page - css of the memcg associated with a page + * get_mem_cgroup_css_from_page - get css of the memcg associated with a page * @page: page of interest * * If memcg is bound to the default hierarchy, css of the memcg associated @@ -367,13 +491,15 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup * is returned. */ -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) +struct cgroup_subsys_state *get_mem_cgroup_css_from_page(struct page *page) { struct mem_cgroup *memcg; - memcg = page_memcg(page); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return &root_mem_cgroup->css; - if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + memcg = get_mem_cgroup_from_page(page); + if (!memcg) memcg = root_mem_cgroup; return &memcg->css; @@ -756,13 +882,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { - struct page *head = compound_head(page); /* rmap on tail pages */ + struct folio *folio = page_folio(page); /* rmap on tail pages */ struct mem_cgroup *memcg; pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; rcu_read_lock(); - memcg = page_memcg(head); + memcg = folio_memcg(folio); /* Untracked pages have no memcg, no lruvec. Update only the node */ if (!memcg) { rcu_read_unlock(); @@ -1183,23 +1309,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, return ret; } -#ifdef CONFIG_DEBUG_VM -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) -{ - struct mem_cgroup *memcg; - - if (mem_cgroup_disabled()) - return; - - memcg = folio_memcg(folio); - - if (!memcg) - VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio); - else - VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio); -} -#endif - /** * folio_lruvec_lock - Lock the lruvec for a folio. * @folio: Pointer to the folio. @@ -1214,10 +1323,18 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) */ struct lruvec *folio_lruvec_lock(struct folio *folio) { - struct lruvec *lruvec = folio_lruvec(folio); + struct lruvec *lruvec; + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); spin_lock(&lruvec->lru_lock); - lruvec_memcg_debug(lruvec, folio); + + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock(&lruvec->lru_lock); + goto retry; + } + rcu_read_unlock(); return lruvec; } @@ -1237,10 +1354,18 @@ struct lruvec *folio_lruvec_lock(struct folio *folio) */ struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { - struct lruvec *lruvec = folio_lruvec(folio); + struct lruvec *lruvec; + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); spin_lock_irq(&lruvec->lru_lock); - lruvec_memcg_debug(lruvec, folio); + + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock_irq(&lruvec->lru_lock); + goto retry; + } + rcu_read_unlock(); return lruvec; } @@ -1262,10 +1387,18 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio) struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags) { - struct lruvec *lruvec = folio_lruvec(folio); + struct lruvec *lruvec; + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); spin_lock_irqsave(&lruvec->lru_lock, *flags); - lruvec_memcg_debug(lruvec, folio); + + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock_irqrestore(&lruvec->lru_lock, *flags); + goto retry; + } + rcu_read_unlock(); return lruvec; } @@ -2037,7 +2170,9 @@ void folio_memcg_lock(struct folio *folio) * The RCU lock is held throughout the transaction. The fast * path can get away without acquiring the memcg->move_lock * because page moving starts with an RCU grace period. - */ + * + * The RCU lock also protects the memcg from being freed. + */ rcu_read_lock(); if (mem_cgroup_disabled()) @@ -2766,18 +2901,33 @@ static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages page_counter_uncharge(&memcg->memsw, nr_pages); } -static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) +static void commit_charge(struct folio *folio, struct obj_cgroup *objcg) { - VM_BUG_ON_FOLIO(folio_memcg(folio), folio); + VM_BUG_ON_FOLIO(folio_objcg(folio), folio); /* - * Any of the following ensures page's memcg stability: + * Any of the following ensures page's objcg stability: * * - the page lock * - LRU isolation * - lock_page_memcg() * - exclusive reference */ - folio->memcg_data = (unsigned long)memcg; + folio->memcg_data = (unsigned long)objcg; +} + +static struct obj_cgroup *get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) +{ + struct obj_cgroup *objcg = NULL; + + rcu_read_lock(); + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + objcg = rcu_dereference(memcg->objcg); + if (objcg && obj_cgroup_tryget(objcg)) + break; + } + rcu_read_unlock(); + + return objcg; } #ifdef CONFIG_MEMCG_KMEM @@ -2923,19 +3073,6 @@ struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) return mem_cgroup_from_obj_folio(virt_to_folio(p), p); } -static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) -{ - struct obj_cgroup *objcg = NULL; - - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { - objcg = rcu_dereference(memcg->objcg); - if (objcg && obj_cgroup_tryget(objcg)) - break; - objcg = NULL; - } - return objcg; -} - __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) { struct obj_cgroup *objcg = NULL; @@ -2949,7 +3086,16 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) memcg = active_memcg(); else memcg = mem_cgroup_from_task(current); - objcg = __get_obj_cgroup_from_memcg(memcg); + + if (mem_cgroup_is_root(memcg)) + goto out; + + objcg = get_obj_cgroup_from_memcg(memcg); + if (obj_cgroup_is_root(objcg)) { + obj_cgroup_put(objcg); + objcg = NULL; + } +out: rcu_read_unlock(); return objcg; } @@ -2961,20 +3107,10 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) if (!memcg_kmem_enabled() || memcg_kmem_bypass()) return NULL; - if (PageMemcgKmem(page)) { - objcg = __folio_objcg(page_folio(page)); + objcg = folio_objcg(page_folio(page)); + if (objcg) obj_cgroup_get(objcg); - } else { - struct mem_cgroup *memcg; - rcu_read_lock(); - memcg = __folio_memcg(page_folio(page)); - if (memcg) - objcg = __get_obj_cgroup_from_memcg(memcg); - else - objcg = NULL; - rcu_read_unlock(); - } return objcg; } @@ -3069,13 +3205,13 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) void __memcg_kmem_uncharge_page(struct page *page, int order) { struct folio *folio = page_folio(page); - struct obj_cgroup *objcg; + struct obj_cgroup *objcg = folio_objcg(folio); unsigned int nr_pages = 1 << order; - if (!folio_memcg_kmem(folio)) + if (!objcg) return; - objcg = __folio_objcg(folio); + VM_BUG_ON_FOLIO(!folio_memcg_kmem(folio), folio); obj_cgroup_uncharge_pages(objcg, nr_pages); folio->memcg_data = 0; obj_cgroup_put(objcg); @@ -3329,24 +3465,21 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) #endif /* CONFIG_MEMCG_KMEM */ /* - * Because page_memcg(head) is not set on tails, set it now. + * Because page_objcg(head) is not set on tails, set it now. */ void split_page_memcg(struct page *head, unsigned int nr) { struct folio *folio = page_folio(head); - struct mem_cgroup *memcg = folio_memcg(folio); + struct obj_cgroup *objcg = folio_objcg(folio); int i; - if (mem_cgroup_disabled() || !memcg) + if (mem_cgroup_disabled() || !objcg) return; for (i = 1; i < nr; i++) folio_page(folio, i)->memcg_data = folio->memcg_data; - if (folio_memcg_kmem(folio)) - obj_cgroup_get_many(__folio_objcg(folio), nr - 1); - else - css_get_many(&memcg->css, nr - 1); + obj_cgroup_get_many(objcg, nr - 1); } #ifdef CONFIG_MEMCG_SWAP @@ -3651,21 +3784,12 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, #ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { - struct obj_cgroup *objcg; - - if (cgroup_memory_nokmem) + if (mem_cgroup_kmem_disabled()) return 0; if (unlikely(mem_cgroup_is_root(memcg))) return 0; - objcg = obj_cgroup_alloc(); - if (!objcg) - return -ENOMEM; - - objcg->memcg = memcg; - rcu_assign_pointer(memcg->objcg, objcg); - static_branch_enable(&memcg_kmem_enabled_key); memcg->kmemcg_id = memcg->id.id; @@ -3675,27 +3799,13 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static void memcg_offline_kmem(struct mem_cgroup *memcg) { - struct mem_cgroup *parent; - - if (cgroup_memory_nokmem) + if (mem_cgroup_kmem_disabled()) return; if (unlikely(mem_cgroup_is_root(memcg))) return; - parent = parent_mem_cgroup(memcg); - if (!parent) - parent = root_mem_cgroup; - - memcg_reparent_objcgs(memcg, parent); - - /* - * After we have finished memcg_reparent_objcgs(), all list_lrus - * corresponding to this cgroup are guaranteed to remain empty. - * The ordering is imposed by list_lru_node->lock taken by - * memcg_reparent_list_lrus(). - */ - memcg_reparent_list_lrus(memcg, parent); + memcg_reparent_list_lrus(memcg, parent_mem_cgroup(memcg)); } #else static int memcg_online_kmem(struct mem_cgroup *memcg) @@ -4562,7 +4672,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, struct bdi_writeback *wb) { - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio); struct memcg_cgwb_frn *frn; u64 now = get_jiffies_64(); u64 oldest_at = now; @@ -4609,6 +4719,7 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, frn->memcg_id = wb->memcg_css->id; frn->at = now; } + css_put(&memcg->css); } /* issue foreign writeback flushes for recorded foreign dirtying events */ @@ -5088,6 +5199,29 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return idr_find(&mem_cgroup_idr, id); } +#ifdef CONFIG_SHRINKER_DEBUG +struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) +{ + struct cgroup *cgrp; + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; + + cgrp = cgroup_get_from_id(ino); + if (!cgrp) + return ERR_PTR(-ENOENT); + + css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys); + if (css) + memcg = container_of(css, struct mem_cgroup, css); + else + memcg = ERR_PTR(-ENOENT); + + cgroup_put(cgrp); + + return memcg; +} +#endif + static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; @@ -5152,6 +5286,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void) 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL); if (memcg->id.id < 0) { error = memcg->id.id; + if (error == -ENOSPC) + pr_notice_ratelimited("mem_cgroup_id space is exhausted\n"); goto fail; } @@ -5177,8 +5313,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memcg->socket_pressure = jiffies; #ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; - INIT_LIST_HEAD(&memcg->objcg_list); #endif + INIT_LIST_HEAD(&memcg->objcg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) @@ -5243,6 +5379,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct obj_cgroup *objcg; if (memcg_online_kmem(memcg)) goto remove_id; @@ -5255,6 +5392,16 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (alloc_shrinker_info(memcg)) goto offline_kmem; + objcg = obj_cgroup_alloc(); + if (!objcg) + goto free_shrinker; + + objcg->memcg = memcg; + rcu_assign_pointer(memcg->objcg, objcg); + + if (unlikely(mem_cgroup_is_root(memcg))) + root_obj_cgroup = objcg; + /* Online state pins memcg ID, memcg ID pins CSS */ refcount_set(&memcg->id.ref, 1); css_get(css); @@ -5263,6 +5410,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); return 0; +free_shrinker: + free_shrinker_info(memcg); offline_kmem: memcg_offline_kmem(memcg); remove_id: @@ -5290,6 +5439,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); + memcg_reparent_objcgs(memcg); memcg_offline_kmem(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); @@ -5656,10 +5806,12 @@ static int mem_cgroup_move_account(struct page *page, */ smp_mb(); - css_get(&to->css); - css_put(&from->css); + rcu_read_lock(); + obj_cgroup_get(rcu_dereference(to->objcg)); + obj_cgroup_put(rcu_dereference(from->objcg)); + rcu_read_unlock(); - folio->memcg_data = (unsigned long)to; + folio->memcg_data = (unsigned long)rcu_access_pointer(to->objcg); __folio_memcg_unlock(from); @@ -5693,8 +5845,8 @@ out: * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a * target for charge migration. if @target is not NULL, the entry is stored * in target->ent. - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE - * (so ZONE_DEVICE page and thus not on the lru). + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and + * thus not on the lru. * For now we such page is charge like a regular page would be as for all * intent and purposes it is just special memory taking the place of a * regular page. @@ -5732,7 +5884,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, */ if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; - if (is_device_private_page(page)) + if (is_device_private_page(page) || + is_device_coherent_page(page)) ret = MC_TARGET_DEVICE; if (target) target->page = page; @@ -5833,7 +5986,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; mmap_read_lock(mm); - walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); + walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL); mmap_read_unlock(mm); precharge = mc.precharge; @@ -6131,13 +6284,55 @@ retry: * When we have consumed all precharges and failed in doing * additional charge, the page walk just aborts. */ - walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, - NULL); - + walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL); mmap_read_unlock(mc.mm); atomic_dec(&mc.from->moving_account); + + /* + * Moving its pages to another memcg is finished. Wait for already + * started RCU-only updates to finish to make sure that the caller + * of lock_page_memcg() can unlock the correct move_lock. The + * possible bad scenario would like: + * + * CPU0: CPU1: + * mem_cgroup_move_charge() + * walk_page_range() + * + * lock_page_memcg(page) + * memcg = folio_memcg() + * spin_lock_irqsave(&memcg->move_lock) + * memcg->move_lock_task = current + * + * atomic_dec(&mc.from->moving_account) + * + * mem_cgroup_css_offline() + * memcg_offline_kmem() + * memcg_reparent_objcgs() <== reparented + * + * unlock_page_memcg(page) + * memcg = folio_memcg() <== memcg has been changed + * if (memcg->move_lock_task == current) <== false + * spin_unlock_irqrestore(&memcg->move_lock) + * + * Once mem_cgroup_move_charge() returns (it means that the cgroup_mutex + * would be released soon), the page can be reparented to its parent + * memcg. When the unlock_page_memcg() is called for the page, we will + * miss unlock the move_lock. So using synchronize_rcu to wait for + * already started RCU-only updates to finish before this function + * returns (mem_cgroup_move_charge() and mem_cgroup_css_offline() are + * serialized by cgroup_mutex). + */ + synchronize_rcu(); } +/* + * The cgroup migration and memory cgroup offlining are serialized by + * @cgroup_mutex. If we reach here, it means that the LRU pages cannot + * be reparented to its parent memory cgroup. So during the whole process + * of mem_cgroup_move_task(), page_memcg(page) is stable. So we do not + * need to worry about the memcg (returned from page_memcg()) being + * released even if we do not hold an rcu read lock. + */ static void mem_cgroup_move_task(void) { if (mc.to) { @@ -6742,21 +6937,26 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, gfp_t gfp) { + struct obj_cgroup *objcg; long nr_pages = folio_nr_pages(folio); - int ret; + int ret = 0; - ret = try_charge(memcg, gfp, nr_pages); + objcg = get_obj_cgroup_from_memcg(memcg); + /* Do not account at the root objcg level. */ + if (!obj_cgroup_is_root(objcg)) + ret = try_charge(memcg, gfp, nr_pages); if (ret) goto out; - css_get(&memcg->css); - commit_charge(folio, memcg); + obj_cgroup_get(objcg); + commit_charge(folio, objcg); local_irq_disable(); mem_cgroup_charge_statistics(memcg, nr_pages); memcg_check_events(memcg, folio_nid(folio)); local_irq_enable(); out: + obj_cgroup_put(objcg); return ret; } @@ -6842,7 +7042,7 @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) } struct uncharge_gather { - struct mem_cgroup *memcg; + struct obj_cgroup *objcg; unsigned long nr_memory; unsigned long pgpgout; unsigned long nr_kmem; @@ -6857,63 +7057,56 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { unsigned long flags; + struct mem_cgroup *memcg; + rcu_read_lock(); + memcg = obj_cgroup_memcg(ug->objcg); if (ug->nr_memory) { - page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); + page_counter_uncharge(&memcg->memory, ug->nr_memory); if (do_memsw_account()) - page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); + page_counter_uncharge(&memcg->memsw, ug->nr_memory); if (ug->nr_kmem) - memcg_account_kmem(ug->memcg, -ug->nr_kmem); - memcg_oom_recover(ug->memcg); + memcg_account_kmem(memcg, -ug->nr_kmem); + memcg_oom_recover(memcg); } local_irq_save(flags); - __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); - memcg_check_events(ug->memcg, ug->nid); + __count_memcg_events(memcg, PGPGOUT, ug->pgpgout); + __this_cpu_add(memcg->vmstats_percpu->nr_page_events, ug->nr_memory); + memcg_check_events(memcg, ug->nid); local_irq_restore(flags); + rcu_read_unlock(); /* drop reference from uncharge_folio */ - css_put(&ug->memcg->css); + obj_cgroup_put(ug->objcg); } static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) { long nr_pages; - struct mem_cgroup *memcg; struct obj_cgroup *objcg; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Nobody should be changing or seriously looking at - * folio memcg or objcg at this point, we have fully - * exclusive access to the folio. + * folio objcg at this point, we have fully exclusive + * access to the folio. */ - if (folio_memcg_kmem(folio)) { - objcg = __folio_objcg(folio); - /* - * This get matches the put at the end of the function and - * kmem pages do not hold memcg references anymore. - */ - memcg = get_mem_cgroup_from_objcg(objcg); - } else { - memcg = __folio_memcg(folio); - } - - if (!memcg) + objcg = folio_objcg(folio); + if (!objcg) return; - if (ug->memcg != memcg) { - if (ug->memcg) { + if (ug->objcg != objcg) { + if (ug->objcg) { uncharge_batch(ug); uncharge_gather_clear(ug); } - ug->memcg = memcg; + ug->objcg = objcg; ug->nid = folio_nid(folio); - /* pairs with css_put in uncharge_batch */ - css_get(&memcg->css); + /* pairs with obj_cgroup_put in uncharge_batch */ + obj_cgroup_get(objcg); } nr_pages = folio_nr_pages(folio); @@ -6921,19 +7114,15 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) if (folio_memcg_kmem(folio)) { ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; - - folio->memcg_data = 0; - obj_cgroup_put(objcg); } else { /* LRU pages aren't accounted at the root level */ - if (!mem_cgroup_is_root(memcg)) + if (!obj_cgroup_is_root(objcg)) ug->nr_memory += nr_pages; ug->pgpgout++; - - folio->memcg_data = 0; } - css_put(&memcg->css); + folio->memcg_data = 0; + obj_cgroup_put(objcg); } void __mem_cgroup_uncharge(struct folio *folio) @@ -6941,7 +7130,7 @@ void __mem_cgroup_uncharge(struct folio *folio) struct uncharge_gather ug; /* Don't touch folio->lru of any random page, pre-check: */ - if (!folio_memcg(folio)) + if (!folio_objcg(folio)) return; uncharge_gather_clear(&ug); @@ -6964,7 +7153,7 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list) uncharge_gather_clear(&ug); list_for_each_entry(folio, page_list, lru) uncharge_folio(folio, &ug); - if (ug.memcg) + if (ug.objcg) uncharge_batch(&ug); } @@ -6981,6 +7170,7 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list) void mem_cgroup_migrate(struct folio *old, struct folio *new) { struct mem_cgroup *memcg; + struct obj_cgroup *objcg; long nr_pages = folio_nr_pages(new); unsigned long flags; @@ -6993,28 +7183,33 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) return; /* Page cache replacement: new folio already charged? */ - if (folio_memcg(new)) + if (folio_objcg(new)) return; - memcg = folio_memcg(old); - VM_WARN_ON_ONCE_FOLIO(!memcg, old); - if (!memcg) + objcg = folio_objcg(old); + VM_WARN_ON_ONCE_FOLIO(!objcg, old); + if (!objcg) return; + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); + /* Force-charge the new page. The old one will be freed soon */ - if (!mem_cgroup_is_root(memcg)) { + if (!obj_cgroup_is_root(objcg)) { page_counter_charge(&memcg->memory, nr_pages); if (do_memsw_account()) page_counter_charge(&memcg->memsw, nr_pages); } - css_get(&memcg->css); - commit_charge(new, memcg); + obj_cgroup_get(objcg); + commit_charge(new, objcg); local_irq_save(flags); mem_cgroup_charge_statistics(memcg, nr_pages); memcg_check_events(memcg, folio_nid(new)); local_irq_restore(flags); + + rcu_read_unlock(); } DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); @@ -7173,8 +7368,6 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) break; } memcg = parent_mem_cgroup(memcg); - if (!memcg) - memcg = root_mem_cgroup; } return memcg; } @@ -7189,6 +7382,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) { struct mem_cgroup *memcg, *swap_memcg; + struct obj_cgroup *objcg; unsigned int nr_entries; unsigned short oldid; @@ -7201,13 +7395,18 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; - memcg = folio_memcg(folio); - - VM_WARN_ON_ONCE_FOLIO(!memcg, folio); - if (!memcg) + objcg = folio_objcg(folio); + VM_WARN_ON_ONCE_FOLIO(!objcg, folio); + if (!objcg) return; /* + * Interrupts should be disabled by the caller (see the comments below), + * which can serve as RCU read-side critical sections. + */ + memcg = obj_cgroup_memcg(objcg); + + /* * In case the memcg owning these pages has been offlined and doesn't * have an ID allocated to it anymore, charge the closest online * ancestor for the swap instead and transfer the memory+swap charge. @@ -7224,7 +7423,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) folio->memcg_data = 0; - if (!mem_cgroup_is_root(memcg)) + if (!obj_cgroup_is_root(objcg)) page_counter_uncharge(&memcg->memory, nr_entries); if (!cgroup_memory_noswap && memcg != swap_memcg) { @@ -7244,7 +7443,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) memcg_stats_unlock(); memcg_check_events(memcg, folio_nid(folio)); - css_put(&memcg->css); + obj_cgroup_put(objcg); } /** @@ -7262,19 +7461,21 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) struct page_counter *counter; struct mem_cgroup *memcg; unsigned short oldid; + int ret = 0; if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; + rcu_read_lock(); memcg = folio_memcg(folio); VM_WARN_ON_ONCE_FOLIO(!memcg, folio); if (!memcg) - return 0; + goto out; if (!entry.val) { memcg_memory_event(memcg, MEMCG_SWAP_FAIL); - return 0; + goto out; } memcg = mem_cgroup_id_get_online(memcg); @@ -7284,7 +7485,8 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); mem_cgroup_id_put(memcg); - return -ENOMEM; + ret = -ENOMEM; + goto out; } /* Get references for the tail pages, too */ @@ -7293,8 +7495,10 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); VM_BUG_ON_FOLIO(oldid, folio); mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); +out: + rcu_read_unlock(); - return 0; + return ret; } /** @@ -7339,6 +7543,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) bool mem_cgroup_swap_full(struct page *page) { struct mem_cgroup *memcg; + bool ret = false; VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -7347,19 +7552,24 @@ bool mem_cgroup_swap_full(struct page *page) if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; + rcu_read_lock(); memcg = page_memcg(page); if (!memcg) - return false; + goto out; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { unsigned long usage = page_counter_read(&memcg->swap); if (usage * 2 >= READ_ONCE(memcg->swap.high) || - usage * 2 >= READ_ONCE(memcg->swap.max)) - return true; + usage * 2 >= READ_ONCE(memcg->swap.max)) { + ret = true; + goto out; + } } +out: + rcu_read_unlock(); - return false; + return ret; } static int __init setup_swap_account(char *s) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index da39ec8afca8..001fead45f30 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -297,10 +297,9 @@ void shake_page(struct page *p) } EXPORT_SYMBOL_GPL(shake_page); -static unsigned long dev_pagemap_mapping_shift(struct page *page, - struct vm_area_struct *vma) +static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, + unsigned long address) { - unsigned long address = vma_address(page, vma); unsigned long ret = 0; pgd_t *pgd; p4d_t *p4d; @@ -340,10 +339,14 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page, /* * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. + * + * Notice: @fsdax_pgoff is used only when @p is a fsdax page. + * In other cases, such as anonymous and file-backend page, the address to be + * killed can be caculated by @p itself. */ static void add_to_kill(struct task_struct *tsk, struct page *p, - struct vm_area_struct *vma, - struct list_head *to_kill) + pgoff_t fsdax_pgoff, struct vm_area_struct *vma, + struct list_head *to_kill) { struct to_kill *tk; @@ -354,9 +357,15 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, } tk->addr = page_address_in_vma(p, vma); - if (is_zone_device_page(p)) - tk->size_shift = dev_pagemap_mapping_shift(p, vma); - else + if (is_zone_device_page(p)) { + /* + * Since page->mapping is not used for fsdax, we need + * calculate the address based on the vma. + */ + if (p->pgmap->type == MEMORY_DEVICE_FS_DAX) + tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma); + tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); + } else tk->size_shift = page_shift(compound_head(p)); /* @@ -505,7 +514,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, if (!page_mapped_in_vma(page, vma)) continue; if (vma->vm_mm == t->mm) - add_to_kill(t, page, vma, to_kill); + add_to_kill(t, page, 0, vma, to_kill); } } read_unlock(&tasklist_lock); @@ -541,12 +550,40 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * to be informed of all such data corruptions. */ if (vma->vm_mm == t->mm) - add_to_kill(t, page, vma, to_kill); + add_to_kill(t, page, 0, vma, to_kill); + } + } + read_unlock(&tasklist_lock); + i_mmap_unlock_read(mapping); +} + +#ifdef CONFIG_FS_DAX +/* + * Collect processes when the error hit a fsdax page. + */ +static void collect_procs_fsdax(struct page *page, + struct address_space *mapping, pgoff_t pgoff, + struct list_head *to_kill) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + + i_mmap_lock_read(mapping); + read_lock(&tasklist_lock); + for_each_process(tsk) { + struct task_struct *t = task_early_kill(tsk, true); + + if (!t) + continue; + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { + if (vma->vm_mm == t->mm) + add_to_kill(t, page, pgoff, vma, to_kill); } } read_unlock(&tasklist_lock); i_mmap_unlock_read(mapping); } +#endif /* CONFIG_FS_DAX */ /* * Collect the processes who have the corrupted page mapped to kill. @@ -1007,12 +1044,13 @@ static int me_swapcache_dirty(struct page_state *ps, struct page *p) static int me_swapcache_clean(struct page_state *ps, struct page *p) { + struct folio *folio = page_folio(p); int ret; - delete_from_swap_cache(p); + delete_from_swap_cache(folio); ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; - unlock_page(p); + folio_unlock(folio); if (has_extra_refcount(ps, p, false)) ret = MF_FAILED; @@ -1498,6 +1536,134 @@ static int try_to_split_thp_page(struct page *page, const char *msg) return 0; } +static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn, + struct address_space *mapping, pgoff_t index, int flags) +{ + struct to_kill *tk; + unsigned long size = 0; + + list_for_each_entry(tk, to_kill, nd) + if (tk->size_shift) + size = max(size, 1UL << tk->size_shift); + + if (size) { + /* + * Unmap the largest mapping to avoid breaking up device-dax + * mappings which are constant size. The actual size of the + * mapping being torn down is communicated in siginfo, see + * kill_proc() + */ + loff_t start = (index << PAGE_SHIFT) & ~(size - 1); + + unmap_mapping_range(mapping, start, size, 0); + } + + kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags); +} + +static int mf_generic_kill_procs(unsigned long long pfn, int flags, + struct dev_pagemap *pgmap) +{ + struct page *page = pfn_to_page(pfn); + LIST_HEAD(to_kill); + dax_entry_t cookie; + int rc = 0; + + /* + * Pages instantiated by device-dax (not filesystem-dax) + * may be compound pages. + */ + page = compound_head(page); + + /* + * Prevent the inode from being freed while we are interrogating + * the address_space, typically this would be handled by + * lock_page(), but dax pages do not use the page lock. This + * also prevents changes to the mapping of this pfn until + * poison signaling is complete. + */ + cookie = dax_lock_page(page); + if (!cookie) + return -EBUSY; + + if (hwpoison_filter(page)) { + rc = -EOPNOTSUPP; + goto unlock; + } + + switch (pgmap->type) { + case MEMORY_DEVICE_PRIVATE: + case MEMORY_DEVICE_COHERENT: + /* + * TODO: Handle device pages which may need coordination + * with device-side memory. + */ + rc = -ENXIO; + goto unlock; + default: + break; + } + + /* + * Use this flag as an indication that the dax page has been + * remapped UC to prevent speculative consumption of poison. + */ + SetPageHWPoison(page); + + /* + * Unlike System-RAM there is no possibility to swap in a + * different physical page at a given virtual address, so all + * userspace consumption of ZONE_DEVICE memory necessitates + * SIGBUS (i.e. MF_MUST_KILL) + */ + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; + collect_procs(page, &to_kill, true); + + unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags); +unlock: + dax_unlock_page(page, cookie); + return rc; +} + +#ifdef CONFIG_FS_DAX +/** + * mf_dax_kill_procs - Collect and kill processes who are using this file range + * @mapping: address_space of the file in use + * @index: start pgoff of the range within the file + * @count: length of the range, in unit of PAGE_SIZE + * @mf_flags: memory failure flags + */ +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, + unsigned long count, int mf_flags) +{ + LIST_HEAD(to_kill); + dax_entry_t cookie; + struct page *page; + size_t end = index + count; + + mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; + + for (; index < end; index++) { + page = NULL; + cookie = dax_lock_mapping_entry(mapping, index, &page); + if (!cookie) + return -EBUSY; + if (!page) + goto unlock; + + SetPageHWPoison(page); + + collect_procs_fsdax(page, mapping, index, &to_kill); + unmap_and_kill(&to_kill, page_to_pfn(page), mapping, + index, mf_flags); +unlock: + dax_unlock_mapping_entry(mapping, index, cookie); + } + return 0; +} +EXPORT_SYMBOL_GPL(mf_dax_kill_procs); +#endif /* CONFIG_FS_DAX */ + /* * Called from hugetlb code with hugetlb_lock held. * @@ -1633,23 +1799,27 @@ out: unlock_page(head); return res; } + #else + +static inline int mf_generic_kill_procs(unsigned long long pfn, int flags, + struct dev_pagemap *pgmap) +{ + return 0; +} + static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) { return 0; } -#endif + +#endif /* CONFIG_HUGETLB_PAGE */ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, struct dev_pagemap *pgmap) { struct page *page = pfn_to_page(pfn); - unsigned long size = 0; - struct to_kill *tk; - LIST_HEAD(tokill); - int rc = -EBUSY; - loff_t start; - dax_entry_t cookie; + int rc = -ENXIO; if (flags & MF_COUNT_INCREASED) /* @@ -1658,73 +1828,24 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, put_page(page); /* device metadata space is not recoverable */ - if (!pgmap_pfn_valid(pgmap, pfn)) { - rc = -ENXIO; + if (!pgmap_pfn_valid(pgmap, pfn)) goto out; - } /* - * Pages instantiated by device-dax (not filesystem-dax) - * may be compound pages. + * Call driver's implementation to handle the memory failure, otherwise + * fall back to generic handler. */ - page = compound_head(page); - - /* - * Prevent the inode from being freed while we are interrogating - * the address_space, typically this would be handled by - * lock_page(), but dax pages do not use the page lock. This - * also prevents changes to the mapping of this pfn until - * poison signaling is complete. - */ - cookie = dax_lock_page(page); - if (!cookie) - goto out; - - if (hwpoison_filter(page)) { - rc = -EOPNOTSUPP; - goto unlock; - } - - if (pgmap->type == MEMORY_DEVICE_PRIVATE) { + if (pgmap->ops->memory_failure) { + rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags); /* - * TODO: Handle HMM pages which may need coordination - * with device-side memory. + * Fall back to generic handler too if operation is not + * supported inside the driver/device/filesystem. */ - goto unlock; + if (rc != -EOPNOTSUPP) + goto out; } - /* - * Use this flag as an indication that the dax page has been - * remapped UC to prevent speculative consumption of poison. - */ - SetPageHWPoison(page); - - /* - * Unlike System-RAM there is no possibility to swap in a - * different physical page at a given virtual address, so all - * userspace consumption of ZONE_DEVICE memory necessitates - * SIGBUS (i.e. MF_MUST_KILL) - */ - flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; - collect_procs(page, &tokill, true); - - list_for_each_entry(tk, &tokill, nd) - if (tk->size_shift) - size = max(size, 1UL << tk->size_shift); - if (size) { - /* - * Unmap the largest mapping to avoid breaking up - * device-dax mappings which are constant size. The - * actual size of the mapping being torn down is - * communicated in siginfo, see kill_proc() - */ - start = (page->index << PAGE_SHIFT) & ~(size - 1); - unmap_mapping_range(page->mapping, start, size, 0); - } - kill_procs(&tokill, true, false, pfn, flags); - rc = 0; -unlock: - dax_unlock_page(page, cookie); + rc = mf_generic_kill_procs(pfn, flags, pgmap); out: /* drop pgmap ref acquired in caller */ put_dev_pagemap(pgmap); @@ -2178,7 +2299,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) bool lru = PageLRU(page); if (PageHuge(page)) { - isolated = isolate_huge_page(page, pagelist); + isolated = !isolate_hugetlb(page, pagelist); } else { if (lru) isolated = !isolate_lru_page(page); diff --git a/mm/memory.c b/mm/memory.c index ed45dbf9c31e..e3d3596068f2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -402,12 +402,21 @@ void free_pgd_range(struct mmu_gather *tlb, } while (pgd++, addr = next, addr != end); } -void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, - unsigned long floor, unsigned long ceiling) +void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *vma, unsigned long floor, + unsigned long ceiling) { - while (vma) { - struct vm_area_struct *next = vma->vm_next; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); + + do { unsigned long addr = vma->vm_start; + struct vm_area_struct *next; + + /* + * Note: USER_PGTABLES_CEILING may be passed as ceiling and may + * be 0. This will underflow and is okay. + */ + next = mas_find(&mas, ceiling - 1); /* * Hide vma from rmap and truncate_pagecache before freeing @@ -426,7 +435,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; - next = vma->vm_next; + next = mas_find(&mas, ceiling - 1); unlink_anon_vmas(vma); unlink_file_vma(vma); } @@ -434,7 +443,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, floor, next ? next->vm_start : ceiling); } vma = next; - } + } while (vma); } void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) @@ -623,6 +632,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return NULL; if (is_zero_pfn(pfn)) return NULL; + /* + * NOTE: New users of ZONE_DEVICE will not set pte_devmap() and + * will have refcounts incremented on their struct pages when + * they are inserted into PTEs, thus they are safe to return + * here. Legacy ZONE_DEVICE pages that set pte_devmap() do not + * have refcounts. Example of legacy ZONE_DEVICE is + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. + */ if (pte_devmap(pte)) return NULL; @@ -1245,7 +1262,7 @@ vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) if (userfaultfd_wp(dst_vma)) return true; - if (src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) + if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return true; if (src_vma->anon_vma) @@ -1705,7 +1722,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -void unmap_vmas(struct mmu_gather *tlb, +void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { @@ -1715,12 +1732,14 @@ void unmap_vmas(struct mmu_gather *tlb, /* Careful - we need to zap private pages too! */ .even_cows = true, }; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); - for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) + do { unmap_single_vma(tlb, vma, start_addr, end_addr, &details); + } while ((vma = mas_find(&mas, end_addr - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); } @@ -1735,8 +1754,11 @@ void unmap_vmas(struct mmu_gather *tlb, void zap_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long size) { + struct maple_tree *mt = &vma->vm_mm->mm_mt; + unsigned long end = start + size; struct mmu_notifier_range range; struct mmu_gather tlb; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, @@ -1744,8 +1766,9 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); - for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next) + do { unmap_single_vma(&tlb, vma, start, range.end, NULL); + } while ((vma = mas_find(&mas, end - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } @@ -4693,7 +4716,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) pte = pte_modify(old_pte, vma->vm_page_prot); page = vm_normal_page(vma, vmf->address, pte); - if (!page) + if (!page || is_zone_device_page(page)) goto out_map; /* TODO: handle PTE-mapped THP */ @@ -4963,6 +4986,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; + unsigned long vm_flags = vma->vm_flags; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; @@ -4976,7 +5000,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, if (!vmf.pud) return VM_FAULT_OOM; retry_pud: - if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) { + if (pud_none(*vmf.pud) && + hugepage_vma_check(vma, vm_flags, false, true)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5009,7 +5034,8 @@ retry_pud: if (pud_trans_unstable(vmf.pud)) goto retry_pud; - if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) { + if (pmd_none(*vmf.pmd) && + hugepage_vma_check(vma, vm_flags, false, true)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1f1a730c4499..99ecb2b3ff53 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -43,30 +43,22 @@ #include "shuffle.h" #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY -static int memmap_on_memory_set(const char *val, const struct kernel_param *kp) -{ - if (hugetlb_optimize_vmemmap_enabled()) - return 0; - return param_set_bool(val, kp); -} - -static const struct kernel_param_ops memmap_on_memory_ops = { - .flags = KERNEL_PARAM_OPS_FL_NOARG, - .set = memmap_on_memory_set, - .get = param_get_bool, -}; - /* * memory_hotplug.memmap_on_memory parameter */ static bool memmap_on_memory __ro_after_init; -module_param_cb(memmap_on_memory, &memmap_on_memory_ops, &memmap_on_memory, 0444); +module_param(memmap_on_memory, bool, 0444); MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); -bool mhp_memmap_on_memory(void) +static inline bool mhp_memmap_on_memory(void) { return memmap_on_memory; } +#else +static inline bool mhp_memmap_on_memory(void) +{ + return false; +} #endif enum { @@ -670,12 +662,18 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon } +#ifdef CONFIG_ZONE_DEVICE static void section_taint_zone_device(unsigned long pfn) { struct mem_section *ms = __pfn_to_section(pfn); ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE; } +#else +static inline void section_taint_zone_device(unsigned long pfn) +{ +} +#endif /* * Associate the pfn range with the given zone, initializing the memmaps @@ -1029,7 +1027,7 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, struct zone *zone) { unsigned long end_pfn = pfn + nr_pages; - int ret; + int ret, i; ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); if (ret) @@ -1037,6 +1035,9 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); + for (i = 0; i < nr_pages; i++) + SetPageVmemmapSelfHosted(pfn_to_page(pfn + i)); + /* * It might be that the vmemmap_pages fully span sections. If that is * the case, mark those sections online here as otherwise they will be @@ -1641,7 +1642,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHuge(page)) { pfn = page_to_pfn(head) + compound_nr(head) - 1; - isolate_huge_page(head, &source); + isolate_hugetlb(head, &source); continue; } else if (PageTransHuge(page)) pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d39b01fd52fe..dc74239d1ac7 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -381,9 +381,10 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) + for_each_vma(vmi, vma) mpol_rebind_policy(vma->vm_policy, new); mmap_write_unlock(mm); } @@ -523,7 +524,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, if (!pte_present(*pte)) continue; page = vm_normal_page(vma, addr, *pte); - if (!page) + if (!page || is_zone_device_page(page)) continue; /* * vm_normal_page() filters out zero pages, but there might @@ -602,7 +603,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ if (flags & (MPOL_MF_MOVE_ALL) || (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) { - if (!isolate_huge_page(page, qp->pagelist) && + if (isolate_hugetlb(page, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* * Failed to isolate page but allow migrating pages @@ -656,7 +657,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->vma; + struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; @@ -671,9 +672,10 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, /* hole at head side of range */ return -EFAULT; } + next = find_vma(vma->vm_mm, vma->vm_end); if (!(flags & MPOL_MF_DISCONTIG_OK) && ((vma->vm_end < qp->end) && - (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start))) + (!next || vma->vm_end < next->vm_start))) /* hole at middle or tail of range */ return -EFAULT; @@ -787,26 +789,24 @@ static int vma_replace_policy(struct vm_area_struct *vma, static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) { + MA_STATE(mas, &mm->mm_mt, start - 1, start - 1); struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; pgoff_t pgoff; - unsigned long vmstart; - unsigned long vmend; - - vma = find_vma(mm, start); - VM_BUG_ON(!vma); - prev = vma->vm_prev; - if (start > vma->vm_start) - prev = vma; + prev = mas_find_rev(&mas, 0); + if (prev && (start < prev->vm_end)) + vma = prev; + else + vma = mas_next(&mas, end - 1); - for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) { - vmstart = max(start, vma->vm_start); - vmend = min(end, vma->vm_end); + for (; vma; vma = mas_next(&mas, end - 1)) { + unsigned long vmstart = max(start, vma->vm_start); + unsigned long vmend = min(end, vma->vm_end); if (mpol_equal(vma_policy(vma), new_pol)) - continue; + goto next; pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); @@ -815,6 +815,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (prev) { + /* vma_merge() invalidated the mas */ + mas_pause(&mas); vma = prev; goto replace; } @@ -822,19 +824,25 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, err = split_vma(vma->vm_mm, vma, vmstart, 1); if (err) goto out; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } if (vma->vm_end != vmend) { err = split_vma(vma->vm_mm, vma, vmend, 0); if (err) goto out; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } - replace: +replace: err = vma_replace_policy(vma, new_pol); if (err) goto out; +next: + prev = vma; } - out: +out: return err; } @@ -1049,6 +1057,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) { nodemask_t nmask; + struct vm_area_struct *vma; LIST_HEAD(pagelist); int err = 0; struct migration_target_control mtc = { @@ -1064,8 +1073,9 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, * need migration. Between passing in the full user address * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. */ + vma = find_vma(mm, 0); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); - queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { @@ -1195,14 +1205,13 @@ static struct page *new_page(struct page *page, unsigned long start) struct folio *dst, *src = page_folio(page); struct vm_area_struct *vma; unsigned long address; + VMA_ITERATOR(vmi, current->mm, start); gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL; - vma = find_vma(current->mm, start); - while (vma) { + for_each_vma(vmi, vma) { address = page_address_in_vma(page, vma); if (address != -EFAULT) break; - vma = vma->vm_next; } if (folio_test_hugetlb(src)) @@ -1388,7 +1397,7 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); unsigned long t; - if (get_bitmap(&t, &nmask[maxnode / BITS_PER_LONG], bits)) + if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits)) return -EFAULT; if (maxnode - bits >= MAX_NUMNODES) { @@ -1480,6 +1489,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le unsigned long vmend; unsigned long end; int err = -ENOENT; + VMA_ITERATOR(vmi, mm, start); start = untagged_addr(start); if (start & ~PAGE_MASK) @@ -1505,9 +1515,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le if (end == start) return 0; mmap_write_lock(mm); - vma = find_vma(mm, start); - for (; vma && vma->vm_start < end; vma = vma->vm_next) { - + for_each_vma_range(vmi, vma, end) { vmstart = max(start, vma->vm_start); vmend = min(end, vma->vm_end); new = mpol_dup(vma_policy(vma)); diff --git a/mm/memremap.c b/mm/memremap.c index 8b5c8fd4ea8e..f0955785150f 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -315,6 +315,16 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) return ERR_PTR(-EINVAL); } break; + case MEMORY_DEVICE_COHERENT: + if (!pgmap->ops->page_free) { + WARN(1, "Missing page_free method\n"); + return ERR_PTR(-EINVAL); + } + if (!pgmap->owner) { + WARN(1, "Missing owner\n"); + return ERR_PTR(-EINVAL); + } + break; case MEMORY_DEVICE_FS_DAX: if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { WARN(1, "File system DAX not supported\n"); diff --git a/mm/migrate.c b/mm/migrate.c index 0b4995f768d0..ddfb8c5f9bc9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -132,7 +132,7 @@ static void putback_movable_page(struct page *page) * * This function shall be used whenever the isolated pageset has been * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() - * and isolate_huge_page(). + * and isolate_hugetlb(). */ void putback_movable_pages(struct list_head *l) { @@ -314,13 +314,28 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, __migration_entry_wait(mm, ptep, ptl); } -void migration_entry_wait_huge(struct vm_area_struct *vma, - struct mm_struct *mm, pte_t *pte) +#ifdef CONFIG_HUGETLB_PAGE +void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { - spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte); - __migration_entry_wait(mm, pte, ptl); + pte_t pte; + + spin_lock(ptl); + pte = huge_ptep_get(ptep); + + if (unlikely(!is_hugetlb_entry_migration(pte))) + spin_unlock(ptl); + else + migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl); } +void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) +{ + spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte); + + __migration_entry_wait_huge(pte, ptl); +} +#endif + #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) { @@ -440,6 +455,10 @@ int folio_migrate_mapping(struct address_space *mapping, struct lruvec *old_lruvec, *new_lruvec; struct mem_cgroup *memcg; + /* + * Irq is disabled, which can serve as RCU read-side critical + * sections. + */ memcg = folio_memcg(folio); old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); @@ -1132,15 +1151,10 @@ static int unmap_and_move(new_page_t get_new_page, return -ENOSYS; if (page_count(page) == 1) { - /* page was freed from under us. So we are done. */ + /* Page was freed from under us. So we are done. */ ClearPageActive(page); ClearPageUnevictable(page); - if (unlikely(__PageMovable(page))) { - lock_page(page); - if (!PageMovable(page)) - ClearPageIsolated(page); - unlock_page(page); - } + /* free_pages_prepare() will clear PG_isolated. */ goto out; } @@ -1655,7 +1669,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, goto out; /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU); err = PTR_ERR(page); if (IS_ERR(page)) @@ -1675,8 +1689,9 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { - isolate_huge_page(page, pagelist); - err = 1; + err = isolate_hugetlb(page, pagelist); + if (!err) + err = 1; } } else { struct page *head; @@ -1846,7 +1861,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, goto set_status; /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU); err = PTR_ERR(page); if (IS_ERR(page)) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 5dd97c39ca6a..ad593d5754cf 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -148,15 +148,21 @@ again: if (is_writable_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { - if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) - goto next; pfn = pte_pfn(pte); - if (is_zero_pfn(pfn)) { + if (is_zero_pfn(pfn) && + (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) { mpfn = MIGRATE_PFN_MIGRATE; migrate->cpages++; goto next; } page = vm_normal_page(migrate->vma, addr, pte); + if (page && !is_zone_device_page(page) && + !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) + goto next; + else if (page && is_device_coherent_page(page) && + (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) || + page->pgmap->owner != migrate->pgmap_owner)) + goto next; mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; } @@ -480,24 +486,24 @@ int migrate_vma_setup(struct migrate_vma *args) args->start &= PAGE_MASK; args->end &= PAGE_MASK; - if (!args->vma || is_vm_hugetlb_page(args->vma) || - (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) - return -EINVAL; - if (nr_pages <= 0) - return -EINVAL; - if (args->start < args->vma->vm_start || - args->start >= args->vma->vm_end) - return -EINVAL; - if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) - return -EINVAL; if (!args->src || !args->dst) return -EINVAL; - - memset(args->src, 0, sizeof(*args->src) * nr_pages); - args->cpages = 0; - args->npages = 0; - - migrate_vma_collect(args); + if (args->vma) { + if (is_vm_hugetlb_page(args->vma) || + (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) + return -EINVAL; + if (args->start < args->vma->vm_start || + args->start >= args->vma->vm_end) + return -EINVAL; + if (args->end <= args->vma->vm_start || + args->end > args->vma->vm_end) + return -EINVAL; + memset(args->src, 0, sizeof(*args->src) * nr_pages); + args->cpages = 0; + args->npages = 0; + + migrate_vma_collect(args); + } if (args->cpages) migrate_vma_unmap(args); @@ -518,7 +524,7 @@ EXPORT_SYMBOL(migrate_vma_setup); * handle_pte_fault() * do_anonymous_page() * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE - * private page. + * private or coherent page. */ static void migrate_vma_insert_page(struct migrate_vma *migrate, unsigned long addr, @@ -594,11 +600,8 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, page_to_pfn(page)); entry = swp_entry_to_pte(swp_entry); } else { - /* - * For now we only support migrating to un-addressable device - * memory. - */ - if (is_zone_device_page(page)) { + if (is_zone_device_page(page) && + !is_device_coherent_page(page)) { pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); goto abort; } @@ -682,7 +685,7 @@ void migrate_vma_pages(struct migrate_vma *migrate) continue; } - if (!page) { + if (!page && migrate->vma) { if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) continue; if (!notified) { @@ -701,10 +704,11 @@ void migrate_vma_pages(struct migrate_vma *migrate) mapping = page_mapping(page); - if (is_device_private_page(newpage)) { + if (is_device_private_page(newpage) || + is_device_coherent_page(newpage)) { /* - * For now only support private anonymous when migrating - * to un-addressable device memory. + * For now only support anonymous memory migrating to + * device private or coherent memory. */ if (mapping) { migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; @@ -791,3 +795,56 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } } EXPORT_SYMBOL(migrate_vma_finalize); + +/* + * Migrate a device coherent page back to normal memory. The caller should have + * a reference on page which will be copied to the new page if migration is + * successful or dropped on failure. + */ +struct page *migrate_device_page(struct page *page, unsigned int gup_flags) +{ + unsigned long src_pfn, dst_pfn = 0; + struct migrate_vma args; + struct page *dpage; + + lock_page(page); + src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; + args.src = &src_pfn; + args.dst = &dst_pfn; + args.cpages = 1; + args.npages = 1; + args.vma = NULL; + migrate_vma_setup(&args); + if (!(src_pfn & MIGRATE_PFN_MIGRATE)) + return NULL; + + dpage = alloc_pages(GFP_USER | __GFP_NOWARN, 0); + + /* + * get/pin the new page now so we don't have to retry gup after + * migrating. We already have a reference so this should never fail. + */ + if (dpage && WARN_ON_ONCE(!try_grab_page(dpage, gup_flags))) { + __free_pages(dpage, 0); + dpage = NULL; + } + + if (dpage) { + lock_page(dpage); + dst_pfn = migrate_pfn(page_to_pfn(dpage)); + } + + migrate_vma_pages(&args); + if (src_pfn & MIGRATE_PFN_MIGRATE) + copy_highpage(dpage, page); + migrate_vma_finalize(&args); + if (dpage && !(src_pfn & MIGRATE_PFN_MIGRATE)) { + if (gup_flags & FOLL_PIN) + unpin_user_page(dpage); + else + put_page(dpage); + dpage = NULL; + } + + return dpage; +} diff --git a/mm/mlock.c b/mm/mlock.c index 716caf851043..d9039fb9c56b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -205,7 +205,7 @@ static void mlock_pagevec(struct pagevec *pvec) } if (lruvec) - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } @@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, if (!pte_present(*pte)) continue; page = vm_normal_page(vma, addr, *pte); - if (!page) + if (!page || is_zone_device_page(page)) continue; if (PageTransCompound(page)) continue; @@ -471,6 +471,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, unsigned long nstart, end, tmp; struct vm_area_struct *vma, *prev; int error; + MA_STATE(mas, ¤t->mm->mm_mt, start, start); VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); @@ -479,13 +480,14 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, return -EINVAL; if (end == start) return 0; - vma = find_vma(current->mm, start); - if (!vma || vma->vm_start > start) + vma = mas_walk(&mas); + if (!vma) return -ENOMEM; - prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; + else + prev = mas_prev(&mas, 0); for (nstart = start ; ; ) { vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; @@ -505,7 +507,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, if (nstart >= end) break; - vma = prev->vm_next; + vma = find_vma(prev->vm_mm, prev->vm_end); if (!vma || vma->vm_start != nstart) { error = -ENOMEM; break; @@ -526,24 +528,21 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, { struct vm_area_struct *vma; unsigned long count = 0; + unsigned long end; + VMA_ITERATOR(vmi, mm, start); - if (mm == NULL) - mm = current->mm; + /* Don't overflow past ULONG_MAX */ + if (unlikely(ULONG_MAX - len < start)) + end = ULONG_MAX; + else + end = start + len; - vma = find_vma(mm, start); - if (vma == NULL) - return 0; - - for (; vma ; vma = vma->vm_next) { - if (start >= vma->vm_end) - continue; - if (start + len <= vma->vm_start) - break; + for_each_vma_range(vmi, vma, end) { if (vma->vm_flags & VM_LOCKED) { if (start > vma->vm_start) count -= (start - vma->vm_start); - if (start + len < vma->vm_end) { - count += start + len - vma->vm_start; + if (end < vma->vm_end) { + count += end - vma->vm_start; break; } count += vma->vm_end - vma->vm_start; @@ -659,6 +658,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) */ static int apply_mlockall_flags(int flags) { + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; @@ -679,7 +679,7 @@ static int apply_mlockall_flags(int flags) to_add |= VM_LOCKONFAULT; } - for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { vm_flags_t newflags; newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; @@ -687,6 +687,7 @@ static int apply_mlockall_flags(int flags) /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); + mas_pause(&mas); cond_resched(); } out: diff --git a/mm/mmap.c b/mm/mmap.c index c14d7286a379..8f776f257893 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -14,7 +14,6 @@ #include <linux/backing-dev.h> #include <linux/mm.h> #include <linux/mm_inline.h> -#include <linux/vmacache.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/pagemap.h> @@ -39,7 +38,6 @@ #include <linux/audit.h> #include <linux/khugepaged.h> #include <linux/uprobes.h> -#include <linux/rbtree_augmented.h> #include <linux/notifier.h> #include <linux/memory.h> #include <linux/printk.h> @@ -77,9 +75,10 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); -static void unmap_region(struct mm_struct *mm, +static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, - unsigned long start, unsigned long end); + struct vm_area_struct *next, unsigned long start, + unsigned long end); /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected @@ -179,12 +178,10 @@ void unlink_file_vma(struct vm_area_struct *vma) } /* - * Close a vm structure and free it, returning the next. + * Close a vm structure and free it. */ -static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) +static void remove_vma(struct vm_area_struct *vma) { - struct vm_area_struct *next = vma->vm_next; - might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); @@ -192,20 +189,41 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) fput(vma->vm_file); mpol_put(vma_policy(vma)); vm_area_free(vma); - return next; } -static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, - struct list_head *uf); +/* + * check_brk_limits() - Use platform specific check of range & verify mlock + * limits. + * @addr: The address to check + * @len: The size of increase. + * + * Return: 0 on success. + */ +static int check_brk_limits(unsigned long addr, unsigned long len) +{ + unsigned long mapped_addr; + + mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (IS_ERR_VALUE(mapped_addr)) + return mapped_addr; + + return mlock_future_check(current->mm, current->mm->def_flags, len); +} +static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long newbrk, unsigned long oldbrk, + struct list_head *uf); +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma, + unsigned long addr, unsigned long request, unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; - struct vm_area_struct *next; + struct vm_area_struct *brkvma, *next = NULL; unsigned long min_brk; bool populate; bool downgraded = false; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, 0, 0); if (mmap_write_lock_killable(mm)) return -EINTR; @@ -228,6 +246,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) if (brk < min_brk) goto out; + /* * Check against rlimit here. If this check is done later after the test * of oldbrk with newbrk then it can escape the test and let the data @@ -247,35 +266,51 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Always allow shrinking brk. - * __do_munmap() may downgrade mmap_lock to read. + * do_brk_munmap() may downgrade mmap_lock to read. */ if (brk <= mm->brk) { int ret; + /* Search one past newbrk */ + mas_set(&mas, newbrk); + brkvma = mas_find(&mas, oldbrk); + BUG_ON(brkvma == NULL); + if (brkvma->vm_start >= oldbrk) + goto out; /* mapping intersects with an existing non-brk vma. */ /* - * mm->brk must to be protected by write mmap_lock so update it - * before downgrading mmap_lock. When __do_munmap() fails, - * mm->brk will be restored from origbrk. + * mm->brk must be protected by write mmap_lock. + * do_brk_munmap() may downgrade the lock, so update it + * before calling do_brk_munmap(). */ mm->brk = brk; - ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true); - if (ret < 0) { - mm->brk = origbrk; - goto out; - } else if (ret == 1) { + ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf); + if (ret == 1) { downgraded = true; - } - goto success; + goto success; + } else if (!ret) + goto success; + + mm->brk = origbrk; + goto out; } - /* Check against existing mmap mappings. */ - next = find_vma(mm, oldbrk); + if (check_brk_limits(oldbrk, newbrk - oldbrk)) + goto out; + + /* + * Only check if the next VMA is within the stack_guard_gap of the + * expansion area + */ + mas_set(&mas, oldbrk); + next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; + brkvma = mas_prev(&mas, mm->start_brk); /* Ok, looks good - let it rip. */ - if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) + if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; + mm->brk = brk; success: @@ -288,121 +323,64 @@ success: if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; - out: mmap_write_unlock(mm); return origbrk; } -static inline unsigned long vma_compute_gap(struct vm_area_struct *vma) -{ - unsigned long gap, prev_end; +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) +extern void mt_validate(struct maple_tree *mt); +extern void mt_dump(const struct maple_tree *mt); - /* - * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we - * allow two stack_guard_gaps between them here, and when choosing - * an unmapped area; whereas when expanding we only require one. - * That's a little inconsistent, but keeps the code here simpler. - */ - gap = vm_start_gap(vma); - if (vma->vm_prev) { - prev_end = vm_end_gap(vma->vm_prev); - if (gap > prev_end) - gap -= prev_end; - else - gap = 0; - } - return gap; -} +/* Validate the maple tree */ +static void validate_mm_mt(struct mm_struct *mm) +{ + struct maple_tree *mt = &mm->mm_mt; + struct vm_area_struct *vma_mt; -#ifdef CONFIG_DEBUG_VM_RB -static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma) -{ - unsigned long max = vma_compute_gap(vma), subtree_gap; - if (vma->vm_rb.rb_left) { - subtree_gap = rb_entry(vma->vm_rb.rb_left, - struct vm_area_struct, vm_rb)->rb_subtree_gap; - if (subtree_gap > max) - max = subtree_gap; - } - if (vma->vm_rb.rb_right) { - subtree_gap = rb_entry(vma->vm_rb.rb_right, - struct vm_area_struct, vm_rb)->rb_subtree_gap; - if (subtree_gap > max) - max = subtree_gap; - } - return max; -} - -static int browse_rb(struct mm_struct *mm) -{ - struct rb_root *root = &mm->mm_rb; - int i = 0, j, bug = 0; - struct rb_node *nd, *pn = NULL; - unsigned long prev = 0, pend = 0; - - for (nd = rb_first(root); nd; nd = rb_next(nd)) { - struct vm_area_struct *vma; - vma = rb_entry(nd, struct vm_area_struct, vm_rb); - if (vma->vm_start < prev) { - pr_emerg("vm_start %lx < prev %lx\n", - vma->vm_start, prev); - bug = 1; - } - if (vma->vm_start < pend) { - pr_emerg("vm_start %lx < pend %lx\n", - vma->vm_start, pend); - bug = 1; - } - if (vma->vm_start > vma->vm_end) { - pr_emerg("vm_start %lx > vm_end %lx\n", - vma->vm_start, vma->vm_end); - bug = 1; - } - spin_lock(&mm->page_table_lock); - if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { - pr_emerg("free gap %lx, correct %lx\n", - vma->rb_subtree_gap, - vma_compute_subtree_gap(vma)); - bug = 1; - } - spin_unlock(&mm->page_table_lock); - i++; - pn = nd; - prev = vma->vm_start; - pend = vma->vm_end; - } - j = 0; - for (nd = pn; nd; nd = rb_prev(nd)) - j++; - if (i != j) { - pr_emerg("backwards %d, forwards %d\n", j, i); - bug = 1; - } - return bug ? -1 : i; -} + MA_STATE(mas, mt, 0, 0); -static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) -{ - struct rb_node *nd; + mas_for_each(&mas, vma_mt, ULONG_MAX) { + if ((vma_mt->vm_start != mas.index) || + (vma_mt->vm_end - 1 != mas.last)) { + pr_emerg("issue in %s\n", current->comm); + dump_stack(); + dump_vma(vma_mt); + pr_emerg("mt piv: %px %lu - %lu\n", vma_mt, + mas.index, mas.last); + pr_emerg("mt vma: %px %lu - %lu\n", vma_mt, + vma_mt->vm_start, vma_mt->vm_end); - for (nd = rb_first(root); nd; nd = rb_next(nd)) { - struct vm_area_struct *vma; - vma = rb_entry(nd, struct vm_area_struct, vm_rb); - VM_BUG_ON_VMA(vma != ignore && - vma->rb_subtree_gap != vma_compute_subtree_gap(vma), - vma); + mt_dump(mas.tree); + if (vma_mt->vm_end != mas.last + 1) { + pr_err("vma: %px vma_mt %lu-%lu\tmt %lu-%lu\n", + mm, vma_mt->vm_start, vma_mt->vm_end, + mas.index, mas.last); + mt_dump(mas.tree); + } + VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm); + if (vma_mt->vm_start != mas.index) { + pr_err("vma: %px vma_mt %px %lu - %lu doesn't match\n", + mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end); + mt_dump(mas.tree); + } + VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm); + } } + mt_validate(&mm->mm_mt); } static void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; - unsigned long highest_address = 0; - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma; + MA_STATE(mas, &mm->mm_mt, 0, 0); + + validate_mm_mt(mm); - while (vma) { + mas_for_each(&mas, vma, ULONG_MAX) { +#ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; @@ -412,93 +390,20 @@ static void validate_mm(struct mm_struct *mm) anon_vma_interval_tree_verify(avc); anon_vma_unlock_read(anon_vma); } - - highest_address = vm_end_gap(vma); - vma = vma->vm_next; +#endif i++; } if (i != mm->map_count) { - pr_emerg("map_count %d vm_next %d\n", mm->map_count, i); - bug = 1; - } - if (highest_address != mm->highest_vm_end) { - pr_emerg("mm->highest_vm_end %lx, found %lx\n", - mm->highest_vm_end, highest_address); - bug = 1; - } - i = browse_rb(mm); - if (i != mm->map_count) { - if (i != -1) - pr_emerg("map_count %d rb %d\n", mm->map_count, i); + pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); } -#else -#define validate_mm_rb(root, ignore) do { } while (0) -#define validate_mm(mm) do { } while (0) -#endif - -RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks, - struct vm_area_struct, vm_rb, - unsigned long, rb_subtree_gap, vma_compute_gap) - -/* - * Update augmented rbtree rb_subtree_gap values after vma->vm_start or - * vma->vm_prev->vm_end values changed, without modifying the vma's position - * in the rbtree. - */ -static void vma_gap_update(struct vm_area_struct *vma) -{ - /* - * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created - * a callback function that does exactly what we want. - */ - vma_gap_callbacks_propagate(&vma->vm_rb, NULL); -} - -static inline void vma_rb_insert(struct vm_area_struct *vma, - struct rb_root *root) -{ - /* All rb_subtree_gap values must be consistent prior to insertion */ - validate_mm_rb(root, NULL); - - rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); -} - -static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) -{ - /* - * Note rb_erase_augmented is a fairly large inline function, - * so make sure we instantiate it only once with our desired - * augmented rbtree callbacks. - */ - rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); -} - -static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma, - struct rb_root *root, - struct vm_area_struct *ignore) -{ - /* - * All rb_subtree_gap values must be consistent prior to erase, - * with the possible exception of - * - * a. the "next" vma being erased if next->vm_start was reduced in - * __vma_adjust() -> __vma_unlink() - * b. the vma being erased in detach_vmas_to_be_unmapped() -> - * vma_rb_erase() - */ - validate_mm_rb(root, ignore); - __vma_rb_erase(vma, root); -} - -static __always_inline void vma_rb_erase(struct vm_area_struct *vma, - struct rb_root *root) -{ - vma_rb_erase_ignore(vma, root, vma); -} +#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ +#define validate_mm_mt(root) do { } while (0) +#define validate_mm(mm) do { } while (0) +#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ /* * vma has some anon_vma assigned, and is already inserted on that @@ -532,208 +437,221 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); } -static int find_vma_links(struct mm_struct *mm, unsigned long addr, - unsigned long end, struct vm_area_struct **pprev, - struct rb_node ***rb_link, struct rb_node **rb_parent) +static unsigned long count_vma_pages_range(struct mm_struct *mm, + unsigned long addr, unsigned long end) { - struct rb_node **__rb_link, *__rb_parent, *rb_prev; + VMA_ITERATOR(vmi, mm, addr); + struct vm_area_struct *vma; + unsigned long nr_pages = 0; - mmap_assert_locked(mm); - __rb_link = &mm->mm_rb.rb_node; - rb_prev = __rb_parent = NULL; + for_each_vma_range(vmi, vma, end) { + unsigned long vm_start = max(addr, vma->vm_start); + unsigned long vm_end = min(end, vma->vm_end); - while (*__rb_link) { - struct vm_area_struct *vma_tmp; + nr_pages += (vm_end - vm_start) / PAGE_SIZE; + } - __rb_parent = *__rb_link; - vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); + return nr_pages; +} - if (vma_tmp->vm_end > addr) { - /* Fail if an existing vma overlaps the area */ - if (vma_tmp->vm_start < end) - return -ENOMEM; - __rb_link = &__rb_parent->rb_left; - } else { - rb_prev = __rb_parent; - __rb_link = &__rb_parent->rb_right; - } - } +static void __vma_link_file(struct vm_area_struct *vma, + struct address_space *mapping) +{ + if (vma->vm_flags & VM_SHARED) + mapping_allow_writable(mapping); - *pprev = NULL; - if (rb_prev) - *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); - *rb_link = __rb_link; - *rb_parent = __rb_parent; - return 0; + flush_dcache_mmap_lock(mapping); + vma_interval_tree_insert(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); } /* - * vma_next() - Get the next VMA. - * @mm: The mm_struct. - * @vma: The current vma. + * vma_mas_store() - Store a VMA in the maple tree. + * @vma: The vm_area_struct + * @mas: The maple state * - * If @vma is NULL, return the first vma in the mm. + * Efficient way to store a VMA in the maple tree when the @mas has already + * walked to the correct location. * - * Returns: The next VMA after @vma. + * Note: the end address is inclusive in the maple tree. */ -static inline struct vm_area_struct *vma_next(struct mm_struct *mm, - struct vm_area_struct *vma) +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) { - if (!vma) - return mm->mmap; - - return vma->vm_next; + trace_vma_store(mas->tree, vma); + mas_set_range(mas, vma->vm_start, vma->vm_end - 1); + mas_store_prealloc(mas, vma); } /* - * munmap_vma_range() - munmap VMAs that overlap a range. - * @mm: The mm struct - * @start: The start of the range. - * @len: The length of the range. - * @pprev: pointer to the pointer that will be set to previous vm_area_struct - * @rb_link: the rb_node - * @rb_parent: the parent rb_node - * - * Find all the vm_area_struct that overlap from @start to - * @end and munmap them. Set @pprev to the previous vm_area_struct. + * vma_mas_remove() - Remove a VMA from the maple tree. + * @vma: The vm_area_struct + * @mas: The maple state * - * Returns: -ENOMEM on munmap failure or 0 on success. + * Efficient way to remove a VMA from the maple tree when the @mas has already + * been established and points to the correct location. + * Note: the end address is inclusive in the maple tree. */ -static inline int -munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len, - struct vm_area_struct **pprev, struct rb_node ***link, - struct rb_node **parent, struct list_head *uf) +void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) { - - while (find_vma_links(mm, start, start + len, pprev, link, parent)) - if (do_munmap(mm, start, len, uf)) - return -ENOMEM; - - return 0; + trace_vma_mas_szero(mas->tree, vma->vm_start, vma->vm_end - 1); + mas->index = vma->vm_start; + mas->last = vma->vm_end - 1; + mas_store_prealloc(mas, NULL); } -static unsigned long count_vma_pages_range(struct mm_struct *mm, - unsigned long addr, unsigned long end) + +/* + * vma_mas_szero() - Set a given range to zero. Used when modifying a + * vm_area_struct start or end. + * + * @mm: The struct_mm + * @start: The start address to zero + * @end: The end address to zero. + */ +static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, + unsigned long end) { - unsigned long nr_pages = 0; - struct vm_area_struct *vma; + trace_vma_mas_szero(mas->tree, start, end - 1); + mas_set_range(mas, start, end - 1); + mas_store_prealloc(mas, NULL); +} - /* Find first overlapping mapping */ - vma = find_vma_intersection(mm, addr, end); - if (!vma) - return 0; +static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) +{ + MA_STATE(mas, &mm->mm_mt, 0, 0); + struct address_space *mapping = NULL; - nr_pages = (min(end, vma->vm_end) - - max(addr, vma->vm_start)) >> PAGE_SHIFT; + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; - /* Iterate over the rest of the overlaps */ - for (vma = vma->vm_next; vma; vma = vma->vm_next) { - unsigned long overlap_len; + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + i_mmap_lock_write(mapping); + } - if (vma->vm_start > end) - break; + vma_mas_store(vma, &mas); - overlap_len = min(end, vma->vm_end) - vma->vm_start; - nr_pages += overlap_len >> PAGE_SHIFT; + if (mapping) { + __vma_link_file(vma, mapping); + i_mmap_unlock_write(mapping); } - return nr_pages; + mm->map_count++; + validate_mm(mm); + return 0; } -void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, - struct rb_node **rb_link, struct rb_node *rb_parent) +/* + * vma_expand - Expand an existing VMA + * + * @mas: The maple state + * @vma: The vma to expand + * @start: The start of the vma + * @end: The exclusive end of the vma + * @pgoff: The page offset of vma + * @next: The current of next vma. + * + * Expand @vma to @start and @end. Can expand off the start and end. Will + * expand over @next if it's different from @vma and @end == @next->vm_end. + * Checking if the @vma can expand and merge with @next needs to be handled by + * the caller. + * + * Returns: 0 on success + */ +inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next) { - /* Update tracking information for the gap following the new vma. */ - if (vma->vm_next) - vma_gap_update(vma->vm_next); - else - mm->highest_vm_end = vm_end_gap(vma); + struct mm_struct *mm = vma->vm_mm; + struct address_space *mapping = NULL; + struct rb_root_cached *root = NULL; + struct anon_vma *anon_vma = vma->anon_vma; + struct file *file = vma->vm_file; + bool remove_next = false; + bool anon_cloned = false; - /* - * vma->vm_prev wasn't known when we followed the rbtree to find the - * correct insertion point for that vma. As a result, we could not - * update the vma vm_rb parents rb_subtree_gap values on the way down. - * So, we first insert the vma with a zero rb_subtree_gap value - * (to be consistent with what we did on the way down), and then - * immediately update the gap to the correct value. Finally we - * rebalance the rbtree after all augmented values have been set. - */ - rb_link_node(&vma->vm_rb, rb_parent, rb_link); - vma->rb_subtree_gap = 0; - vma_gap_update(vma); - vma_rb_insert(vma, &mm->mm_rb); -} + if (next && (vma != next) && (end == next->vm_end)) { + remove_next = true; + if (next->anon_vma && !vma->anon_vma) { + int error; -static void __vma_link_file(struct vm_area_struct *vma) -{ - struct file *file; + vma->anon_vma = next->anon_vma; + error = anon_vma_clone(vma, next); + if (error) + return error; + anon_cloned = true; + } + } - file = vma->vm_file; - if (file) { - struct address_space *mapping = file->f_mapping; + /* Not merging but overwriting any part of next is not handled. */ + VM_BUG_ON(next && !remove_next && next != vma && end > next->vm_start); + /* Only handles expanding */ + VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); - if (vma->vm_flags & VM_SHARED) - mapping_allow_writable(mapping); + if (mas_preallocate(mas, vma, GFP_KERNEL)) + goto nomem; - flush_dcache_mmap_lock(mapping); - vma_interval_tree_insert(vma, &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); + vma_adjust_trans_huge(vma, start, end, 0); + + if (anon_vma) { + anon_vma_lock_write(anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); } -} -static void -__vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node **rb_link, - struct rb_node *rb_parent) -{ - __vma_link_list(mm, vma, prev); - __vma_link_rb(mm, vma, rb_link, rb_parent); -} + if (file) { + mapping = file->f_mapping; + root = &mapping->i_mmap; + uprobe_munmap(vma, vma->vm_start, vma->vm_end); + i_mmap_lock_write(mapping); + flush_dcache_mmap_lock(mapping); + vma_interval_tree_remove(vma, root); + } -static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node **rb_link, - struct rb_node *rb_parent) -{ - struct address_space *mapping = NULL; + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + /* Note: mas must be pointing to the expanding VMA */ + vma_mas_store(vma, mas); - if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; - i_mmap_lock_write(mapping); + if (file) { + vma_interval_tree_insert(vma, root); + flush_dcache_mmap_unlock(mapping); } - __vma_link(mm, vma, prev, rb_link, rb_parent); - __vma_link_file(vma); + /* Expanding over the next vma */ + if (remove_next && file) { + __remove_shared_vm_struct(next, file, mapping); + } - if (mapping) + if (file) { i_mmap_unlock_write(mapping); + uprobe_mmap(vma); + } - mm->map_count++; - validate_mm(mm); -} + if (anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(anon_vma); + } -/* - * Helper for vma_adjust() in the split_vma insert case: insert a vma into the - * mm's list and rbtree. It has already been inserted into the interval tree. - */ -static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) -{ - struct vm_area_struct *prev; - struct rb_node **rb_link, *rb_parent; - if (find_vma_links(mm, vma->vm_start, vma->vm_end, - &prev, &rb_link, &rb_parent)) - BUG(); - __vma_link(mm, vma, prev, rb_link, rb_parent); - mm->map_count++; -} + if (remove_next) { + if (file) { + uprobe_munmap(next, next->vm_start, next->vm_end); + fput(file); + } + if (next->anon_vma) + anon_vma_merge(vma, next); + mm->map_count--; + mpol_put(vma_policy(next)); + vm_area_free(next); + } -static __always_inline void __vma_unlink(struct mm_struct *mm, - struct vm_area_struct *vma, - struct vm_area_struct *ignore) -{ - vma_rb_erase_ignore(vma, &mm->mm_rb, ignore); - __vma_unlink_list(mm, vma); - /* Kill the cache */ - vmacache_invalidate(mm); + validate_mm(mm); + return 0; + +nomem: + if (anon_cloned) + unlink_anon_vmas(vma); + return -ENOMEM; } /* @@ -748,18 +666,19 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; + struct vm_area_struct *next_next, *next = find_vma(mm, vma->vm_end); + struct vm_area_struct *orig_vma = vma; struct address_space *mapping = NULL; struct rb_root_cached *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; - bool start_changed = false, end_changed = false; + bool vma_changed = false; long adjust_next = 0; int remove_next = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vm_area_struct *exporter = NULL, *importer = NULL; if (next && !insert) { - struct vm_area_struct *exporter = NULL, *importer = NULL; - if (end >= next->vm_end) { /* * vma expands, overlapping all the next, and @@ -788,8 +707,9 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * remove_next == 1 is case 1 or 7. */ remove_next = 1 + (end > next->vm_end); + next_next = find_vma(mm, next->vm_end); VM_WARN_ON(remove_next == 2 && - end != next->vm_next->vm_end); + end != next_next->vm_end); /* trim end to next, for case 6 first pass */ end = next->vm_end; } @@ -802,7 +722,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * next, if the vma overlaps with it. */ if (remove_next == 2 && !next->anon_vma) - exporter = next->vm_next; + exporter = find_vma(mm, next->vm_end); } else if (end > next->vm_start) { /* @@ -842,6 +762,12 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, again: vma_adjust_trans_huge(orig_vma, start, end, adjust_next); + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (exporter && exporter->anon_vma) + unlink_anon_vmas(importer); + return -ENOMEM; + } + if (file) { mapping = file->f_mapping; root = &mapping->i_mmap; @@ -851,14 +777,14 @@ again: uprobe_munmap(next, next->vm_start, next->vm_end); i_mmap_lock_write(mapping); - if (insert) { + if (insert && insert->vm_file) { /* * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. */ - __vma_link_file(insert); + __vma_link_file(insert, insert->vm_file->f_mapping); } } @@ -882,17 +808,37 @@ again: } if (start != vma->vm_start) { + if ((vma->vm_start < start) && + (!insert || (insert->vm_end != start))) { + vma_mas_szero(&mas, vma->vm_start, start); + VM_WARN_ON(insert && insert->vm_start > vma->vm_start); + } else { + vma_changed = true; + } vma->vm_start = start; - start_changed = true; } if (end != vma->vm_end) { + if (vma->vm_end > end) { + if (!insert || (insert->vm_start != end)) { + vma_mas_szero(&mas, end, vma->vm_end); + mas_reset(&mas); + VM_WARN_ON(insert && + insert->vm_end < vma->vm_end); + } + } else { + vma_changed = true; + } vma->vm_end = end; - end_changed = true; } + + if (vma_changed) + vma_mas_store(vma, &mas); + vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; + vma_mas_store(next, &mas); } if (file) { @@ -902,42 +848,17 @@ again: flush_dcache_mmap_unlock(mapping); } - if (remove_next) { - /* - * vma_merge has merged next into vma, and needs - * us to remove next before dropping the locks. - */ - if (remove_next != 3) - __vma_unlink(mm, next, next); - else - /* - * vma is not before next if they've been - * swapped. - * - * pre-swap() next->vm_start was reduced so - * tell validate_mm_rb to ignore pre-swap() - * "next" (which is stored in post-swap() - * "vma"). - */ - __vma_unlink(mm, next, vma); - if (file) - __remove_shared_vm_struct(next, file, mapping); + if (remove_next && file) { + __remove_shared_vm_struct(next, file, mapping); } else if (insert) { /* * split_vma has split insert from vma, and needs * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - __insert_vm_struct(mm, insert); - } else { - if (start_changed) - vma_gap_update(vma); - if (end_changed) { - if (!next) - mm->highest_vm_end = vm_end_gap(vma); - else if (!adjust_next) - vma_gap_update(next); - } + mas_reset(&mas); + vma_mas_store(insert, &mas); + mm->map_count++; } if (anon_vma) { @@ -964,7 +885,9 @@ again: anon_vma_merge(vma, next); mm->map_count--; mpol_put(vma_policy(next)); + BUG_ON(vma->vm_end < next->vm_end); vm_area_free(next); + /* * In mprotect's case 6 (see comments on vma_merge), * we must remove another next too. It would clutter @@ -974,10 +897,10 @@ again: /* * If "next" was removed and vma->vm_end was * expanded (up) over it, in turn - * "next->vm_prev->vm_end" changed and the - * "vma->vm_next" gap must be updated. + * "next->prev->vm_end" changed and the + * "vma->next" gap must be updated. */ - next = vma->vm_next; + next = next_next; } else { /* * For the scope of the comment "next" and @@ -992,38 +915,17 @@ again: next = vma; } if (remove_next == 2) { + mas_reset(&mas); remove_next = 1; end = next->vm_end; goto again; } - else if (next) - vma_gap_update(next); - else { - /* - * If remove_next == 2 we obviously can't - * reach this path. - * - * If remove_next == 3 we can't reach this - * path because pre-swap() next is always not - * NULL. pre-swap() "next" is not being - * removed and its next->vm_end is not altered - * (and furthermore "end" already matches - * next->vm_end in remove_next == 3). - * - * We reach this only in the remove_next == 1 - * case if the "next" vma that was removed was - * the highest vma of the mm. However in such - * case next->vm_end == "end" and the extended - * "vma" has vma->vm_end == next->vm_end so - * mm->highest_vm_end doesn't need any update - * in remove_next == 1 case. - */ - VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma)); - } } - if (insert && file) + if (insert && file) { uprobe_mmap(insert); + } + mas_destroy(&mas); validate_mm(mm); return 0; @@ -1175,8 +1077,10 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; - struct vm_area_struct *area, *next; - int err; + struct vm_area_struct *mid, *next, *res; + int err = -1; + bool merge_prev = false; + bool merge_next = false; /* * We later require that vma->vm_flags == vm_flags, @@ -1185,76 +1089,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (vm_flags & VM_SPECIAL) return NULL; - next = vma_next(mm, prev); - area = next; - if (area && area->vm_end == end) /* cases 6, 7, 8 */ - next = next->vm_next; + next = find_vma(mm, prev ? prev->vm_end : 0); + mid = next; + if (next && next->vm_end == end) /* cases 6, 7, 8 */ + next = find_vma(mm, next->vm_end); /* verify some invariant that must be enforced by the caller */ VM_WARN_ON(prev && addr <= prev->vm_start); - VM_WARN_ON(area && end > area->vm_end); + VM_WARN_ON(mid && end > mid->vm_end); VM_WARN_ON(addr >= end); - /* - * Can it merge with the predecessor? - */ + /* Can we merge the predecessor? */ if (prev && prev->vm_end == addr && mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, vm_userfaultfd_ctx, anon_name)) { - /* - * OK, it can. Can we now merge in the successor as well? - */ - if (next && end == next->vm_start && - mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, - anon_vma, file, - pgoff+pglen, - vm_userfaultfd_ctx, anon_name) && - is_mergeable_anon_vma(prev->anon_vma, - next->anon_vma, NULL)) { - /* cases 1, 6 */ - err = __vma_adjust(prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, NULL, - prev); - } else /* cases 2, 5, 7 */ - err = __vma_adjust(prev, prev->vm_start, - end, prev->vm_pgoff, NULL, prev); - if (err) - return NULL; - khugepaged_enter_vma(prev, vm_flags); - return prev; + merge_prev = true; } - - /* - * Can this new request be merged in front of next? - */ + /* Can we merge the successor? */ if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, vm_userfaultfd_ctx, anon_name)) { + merge_next = true; + } + /* Can we merge both the predecessor and the successor? */ + if (merge_prev && merge_next && + is_mergeable_anon_vma(prev->anon_vma, + next->anon_vma, NULL)) { /* cases 1, 6 */ + err = __vma_adjust(prev, prev->vm_start, + next->vm_end, prev->vm_pgoff, NULL, + prev); + res = prev; + } else if (merge_prev) { /* cases 2, 5, 7 */ + err = __vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL, prev); + res = prev; + } else if (merge_next) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(prev, prev->vm_start, - addr, prev->vm_pgoff, NULL, next); - else { /* cases 3, 8 */ - err = __vma_adjust(area, addr, next->vm_end, - next->vm_pgoff - pglen, NULL, next); - /* - * In case 3 area is already equal to next and - * this is a noop, but in case 8 "area" has - * been removed and next was expanded over it. - */ - area = next; - } - if (err) - return NULL; - khugepaged_enter_vma(area, vm_flags); - return area; + addr, prev->vm_pgoff, NULL, next); + else /* cases 3, 8 */ + err = __vma_adjust(mid, addr, next->vm_end, + next->vm_pgoff - pglen, NULL, next); + res = next; } - return NULL; + /* + * Cannot merge with predecessor or successor or error in __vma_adjust? + */ + if (err) + return NULL; + khugepaged_enter_vma(res, vm_flags); + return res; } /* @@ -1322,18 +1211,24 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_ */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { + MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end); struct anon_vma *anon_vma = NULL; + struct vm_area_struct *prev, *next; /* Try next first. */ - if (vma->vm_next) { - anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next); + next = mas_walk(&mas); + if (next) { + anon_vma = reusable_anon_vma(next, vma, next); if (anon_vma) return anon_vma; } + prev = mas_prev(&mas, 0); + VM_BUG_ON_VMA(prev != vma, vma); + prev = mas_prev(&mas, 0); /* Try prev next. */ - if (vma->vm_prev) - anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma); + if (prev) + anon_vma = reusable_anon_vma(prev, prev, vma); /* * We might reach here with anon_vma == NULL if we can't find @@ -1422,6 +1317,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags_t vm_flags; int pkey = 0; + validate_mm(mm); *populate = 0; if (!len) @@ -1722,389 +1618,62 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } -unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev, *merge; - int error; - struct rb_node **rb_link, *rb_parent; - unsigned long charged = 0; - - /* Check against address space limit. */ - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { - unsigned long nr_pages; - - /* - * MAP_FIXED may remove pages of mappings that intersects with - * requested mapping. Account for the pages it would unmap. - */ - nr_pages = count_vma_pages_range(mm, addr, addr + len); - - if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) - return -ENOMEM; - } - - /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ - if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) - return -ENOMEM; - /* - * Private writable mapping: check memory availability - */ - if (accountable_mapping(file, vm_flags)) { - charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) - return -ENOMEM; - vm_flags |= VM_ACCOUNT; - } - - /* - * Can we just expand an old mapping? - */ - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); - if (vma) - goto out; - - /* - * Determine the object being mapped and call the appropriate - * specific mapper. the address has already been validated, but - * not unmapped, but the maps are removed from the list. - */ - vma = vm_area_alloc(mm); - if (!vma) { - error = -ENOMEM; - goto unacct_error; - } - - vma->vm_start = addr; - vma->vm_end = addr + len; - vma->vm_flags = vm_flags; - vma->vm_page_prot = vm_get_page_prot(vm_flags); - vma->vm_pgoff = pgoff; - - if (file) { - if (vm_flags & VM_SHARED) { - error = mapping_map_writable(file->f_mapping); - if (error) - goto free_vma; - } - - vma->vm_file = get_file(file); - error = call_mmap(file, vma); - if (error) - goto unmap_and_free_vma; - - /* Can addr have changed?? - * - * Answer: Yes, several device drivers can do it in their - * f_op->mmap method. -DaveM - * Bug: If addr is changed, prev, rb_link, rb_parent should - * be updated for vma_link() - */ - WARN_ON_ONCE(addr != vma->vm_start); - - addr = vma->vm_start; - - /* If vm_flags changed after call_mmap(), we should try merge vma again - * as we may succeed this time. - */ - if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); - if (merge) { - /* ->mmap() can change vma->vm_file and fput the original file. So - * fput the vma->vm_file here or we would add an extra fput for file - * and cause general protection fault ultimately. - */ - fput(vma->vm_file); - vm_area_free(vma); - vma = merge; - /* Update vm_flags to pick up the change. */ - vm_flags = vma->vm_flags; - goto unmap_writable; - } - } - - vm_flags = vma->vm_flags; - } else if (vm_flags & VM_SHARED) { - error = shmem_zero_setup(vma); - if (error) - goto free_vma; - } else { - vma_set_anonymous(vma); - } - - /* Allow architectures to sanity-check the vm_flags */ - if (!arch_validate_flags(vma->vm_flags)) { - error = -EINVAL; - if (file) - goto unmap_and_free_vma; - else - goto free_vma; - } - - vma_link(mm, vma, prev, rb_link, rb_parent); - - /* - * vma_merge() calls khugepaged_enter_vma() either, the below - * call covers the non-merge case. - */ - khugepaged_enter_vma(vma, vma->vm_flags); - - /* Once vma denies write, undo our temporary denial count */ -unmap_writable: - if (file && vm_flags & VM_SHARED) - mapping_unmap_writable(file->f_mapping); - file = vma->vm_file; -out: - perf_event_mmap(vma); - - vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); - if (vm_flags & VM_LOCKED) { - if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current->mm)) - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; - else - mm->locked_vm += (len >> PAGE_SHIFT); - } - - if (file) - uprobe_mmap(vma); - - /* - * New (or expanded) vma always get soft dirty status. - * Otherwise user-space soft-dirty page tracker won't - * be able to distinguish situation when vma area unmapped, - * then new mapped in-place (which must be aimed as - * a completely new data area). - */ - vma->vm_flags |= VM_SOFTDIRTY; - - vma_set_page_prot(vma); - - return addr; - -unmap_and_free_vma: - fput(vma->vm_file); - vma->vm_file = NULL; - - /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); - charged = 0; - if (vm_flags & VM_SHARED) - mapping_unmap_writable(file->f_mapping); -free_vma: - vm_area_free(vma); -unacct_error: - if (charged) - vm_unacct_memory(charged); - return error; -} - +/* + * unmapped_area() Find an area between the low_limit and the high_limit with + * the correct alignment and offset, all from @info. Note: current->mm is used + * for the search. + * + * @info: The unmapped area information including the range (low_limit - + * hight_limit), the alignment offset and mask. + * + * Return: A memory address or -ENOMEM. + */ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) { - /* - * We implement the search by looking for an rbtree node that - * immediately follows a suitable gap. That is, - * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; - * - gap_end = vma->vm_start >= info->low_limit + length; - * - gap_end - gap_start >= length - */ + unsigned long length, gap; - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long length, low_limit, high_limit, gap_start, gap_end; + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask; if (length < info->length) return -ENOMEM; - /* Adjust search limits by the desired length */ - if (info->high_limit < length) - return -ENOMEM; - high_limit = info->high_limit - length; - - if (info->low_limit > high_limit) + if (mas_empty_area(&mas, info->low_limit, info->high_limit - 1, + length)) return -ENOMEM; - low_limit = info->low_limit + length; - - /* Check if rbtree root looks promising */ - if (RB_EMPTY_ROOT(&mm->mm_rb)) - goto check_highest; - vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); - if (vma->rb_subtree_gap < length) - goto check_highest; - - while (true) { - /* Visit left subtree if it looks promising */ - gap_end = vm_start_gap(vma); - if (gap_end >= low_limit && vma->vm_rb.rb_left) { - struct vm_area_struct *left = - rb_entry(vma->vm_rb.rb_left, - struct vm_area_struct, vm_rb); - if (left->rb_subtree_gap >= length) { - vma = left; - continue; - } - } - - gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0; -check_current: - /* Check if current node has a suitable gap */ - if (gap_start > high_limit) - return -ENOMEM; - if (gap_end >= low_limit && - gap_end > gap_start && gap_end - gap_start >= length) - goto found; - - /* Visit right subtree if it looks promising */ - if (vma->vm_rb.rb_right) { - struct vm_area_struct *right = - rb_entry(vma->vm_rb.rb_right, - struct vm_area_struct, vm_rb); - if (right->rb_subtree_gap >= length) { - vma = right; - continue; - } - } - - /* Go back up the rbtree to find next candidate node */ - while (true) { - struct rb_node *prev = &vma->vm_rb; - if (!rb_parent(prev)) - goto check_highest; - vma = rb_entry(rb_parent(prev), - struct vm_area_struct, vm_rb); - if (prev == vma->vm_rb.rb_left) { - gap_start = vm_end_gap(vma->vm_prev); - gap_end = vm_start_gap(vma); - goto check_current; - } - } - } -check_highest: - /* Check highest gap, which does not precede any rbtree node */ - gap_start = mm->highest_vm_end; - gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ - if (gap_start > high_limit) - return -ENOMEM; - -found: - /* We found a suitable gap. Clip it with the original low_limit. */ - if (gap_start < info->low_limit) - gap_start = info->low_limit; - - /* Adjust gap address to the desired alignment */ - gap_start += (info->align_offset - gap_start) & info->align_mask; - - VM_BUG_ON(gap_start + info->length > info->high_limit); - VM_BUG_ON(gap_start + info->length > gap_end); - return gap_start; + gap = mas.index; + gap += (info->align_offset - gap) & info->align_mask; + return gap; } +/* unmapped_area_topdown() Find an area between the low_limit and the + * high_limit with * the correct alignment and offset at the highest available + * address, all from * @info. Note: current->mm is used for the search. + * + * @info: The unmapped area information including the range (low_limit - + * hight_limit), the alignment offset and mask. + * + * Return: A memory address or -ENOMEM. + */ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long length, low_limit, high_limit, gap_start, gap_end; + unsigned long length, gap; + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask; if (length < info->length) return -ENOMEM; - /* - * Adjust search limits by the desired length. - * See implementation comment at top of unmapped_area(). - */ - gap_end = info->high_limit; - if (gap_end < length) - return -ENOMEM; - high_limit = gap_end - length; - - if (info->low_limit > high_limit) + if (mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1, + length)) return -ENOMEM; - low_limit = info->low_limit + length; - - /* Check highest gap, which does not precede any rbtree node */ - gap_start = mm->highest_vm_end; - if (gap_start <= high_limit) - goto found_highest; - - /* Check if rbtree root looks promising */ - if (RB_EMPTY_ROOT(&mm->mm_rb)) - return -ENOMEM; - vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); - if (vma->rb_subtree_gap < length) - return -ENOMEM; - - while (true) { - /* Visit right subtree if it looks promising */ - gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0; - if (gap_start <= high_limit && vma->vm_rb.rb_right) { - struct vm_area_struct *right = - rb_entry(vma->vm_rb.rb_right, - struct vm_area_struct, vm_rb); - if (right->rb_subtree_gap >= length) { - vma = right; - continue; - } - } - -check_current: - /* Check if current node has a suitable gap */ - gap_end = vm_start_gap(vma); - if (gap_end < low_limit) - return -ENOMEM; - if (gap_start <= high_limit && - gap_end > gap_start && gap_end - gap_start >= length) - goto found; - - /* Visit left subtree if it looks promising */ - if (vma->vm_rb.rb_left) { - struct vm_area_struct *left = - rb_entry(vma->vm_rb.rb_left, - struct vm_area_struct, vm_rb); - if (left->rb_subtree_gap >= length) { - vma = left; - continue; - } - } - - /* Go back up the rbtree to find next candidate node */ - while (true) { - struct rb_node *prev = &vma->vm_rb; - if (!rb_parent(prev)) - return -ENOMEM; - vma = rb_entry(rb_parent(prev), - struct vm_area_struct, vm_rb); - if (prev == vma->vm_rb.rb_right) { - gap_start = vma->vm_prev ? - vm_end_gap(vma->vm_prev) : 0; - goto check_current; - } - } - } - -found: - /* We found a suitable gap. Clip it with the original high_limit. */ - if (gap_end > info->high_limit) - gap_end = info->high_limit; -found_highest: - /* Compute highest gap address at the desired alignment */ - gap_end -= info->length; - gap_end -= (gap_end - info->align_offset) & info->align_mask; - - VM_BUG_ON(gap_end < info->low_limit); - VM_BUG_ON(gap_end < gap_start); - return gap_end; + gap = mas.last + 1 - info->length; + gap -= (gap - info->align_offset) & info->align_mask; + return gap; } /* @@ -2294,58 +1863,67 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); -/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +/** + * find_vma_intersection() - Look up the first VMA which intersects the interval + * @mm: The process address space. + * @start_addr: The inclusive start user address. + * @end_addr: The exclusive end user address. + * + * Returns: The first VMA within the provided range, %NULL otherwise. Assumes + * start_addr < end_addr. + */ +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) { - struct rb_node *rb_node; - struct vm_area_struct *vma; + unsigned long index = start_addr; mmap_assert_locked(mm); - /* Check the cache first. */ - vma = vmacache_find(mm, addr); - if (likely(vma)) - return vma; - - rb_node = mm->mm_rb.rb_node; - - while (rb_node) { - struct vm_area_struct *tmp; - - tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + return mt_find(&mm->mm_mt, &index, end_addr - 1); +} +EXPORT_SYMBOL(find_vma_intersection); - if (tmp->vm_end > addr) { - vma = tmp; - if (tmp->vm_start <= addr) - break; - rb_node = rb_node->rb_left; - } else - rb_node = rb_node->rb_right; - } +/** + * find_vma() - Find the VMA for a given address, or the next vma. + * @mm: The mm_struct to check + * @addr: The address + * + * Returns: The VMA associated with addr, or the next vma. + * May return %NULL in the case of no vma at addr or above. + */ +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + unsigned long index = addr; - if (vma) - vmacache_update(addr, vma); - return vma; + mmap_assert_locked(mm); + return mt_find(&mm->mm_mt, &index, ULONG_MAX); } - EXPORT_SYMBOL(find_vma); -/* - * Same as find_vma, but also return a pointer to the previous VMA in *pprev. +/** + * find_vma_prev() - Find the VMA for a given address, or the next vma and + * set %pprev to the previous VMA, if any. + * @mm: The mm_struct to check + * @addr: The address + * @pprev: The pointer to set to the previous VMA + * + * Note that RCU lock is missing here since the external mmap_lock() is used + * instead. + * + * Returns: The VMA associated with @addr, or the next vma. + * May return %NULL in the case of no vma at addr or above. */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { struct vm_area_struct *vma; + MA_STATE(mas, &mm->mm_mt, addr, addr); - vma = find_vma(mm, addr); - if (vma) { - *pprev = vma->vm_prev; - } else { - struct rb_node *rb_node = rb_last(&mm->mm_rb); - - *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL; - } + vma = mas_walk(&mas); + *pprev = mas_prev(&mas, 0); + if (!vma) + vma = mas_next(&mas, ULONG_MAX); return vma; } @@ -2399,6 +1977,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) struct vm_area_struct *next; unsigned long gap_addr; int error = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; @@ -2416,16 +1995,21 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (gap_addr < address || gap_addr > TASK_SIZE) gap_addr = TASK_SIZE; - next = vma->vm_next; - if (next && next->vm_start < gap_addr && vma_is_accessible(next)) { + next = find_vma_intersection(mm, vma->vm_end, gap_addr); + if (next && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ } + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; + /* We must make sure the anon_vma is allocated. */ - if (unlikely(anon_vma_prepare(vma))) + if (unlikely(anon_vma_prepare(vma))) { + mas_destroy(&mas); return -ENOMEM; + } /* * vma->vm_start/vm_end cannot change under us because the caller @@ -2446,15 +2030,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) error = acct_stack_growth(vma, size, grow); if (!error) { /* - * vma_gap_update() doesn't support concurrent - * updates, but we only hold a shared mmap_lock - * lock here, so we need to protect against - * concurrent vma expansions. - * anon_vma_lock_write() doesn't help here, as - * we don't guarantee that all growable vmas - * in a mm share the same root anon vma. - * So, we reuse mm->page_table_lock to guard - * against concurrent vma expansions. + * We only hold a shared mmap_lock lock here, so + * we need to protect against concurrent vma + * expansions. anon_vma_lock_write() doesn't + * help here, as we don't guarantee that all + * growable vmas in a mm share the same root + * anon vma. So, we reuse mm->page_table_lock + * to guard against concurrent vma expansions. */ spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) @@ -2462,11 +2044,9 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; + /* Overwrite old entry in mtree. */ + vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); - if (vma->vm_next) - vma_gap_update(vma->vm_next); - else - mm->highest_vm_end = vm_end_gap(vma); spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); @@ -2475,7 +2055,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); - validate_mm(mm); + mas_destroy(&mas); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -2483,10 +2063,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* * vma is the first one with address < vma->vm_start. Have to extend vma. */ -int expand_downwards(struct vm_area_struct *vma, - unsigned long address) +int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); struct vm_area_struct *prev; int error = 0; @@ -2495,7 +2075,7 @@ int expand_downwards(struct vm_area_struct *vma, return -EPERM; /* Enforce stack_guard_gap */ - prev = vma->vm_prev; + prev = mas_prev(&mas, 0); /* Check that both stack segments have the same anon_vma? */ if (prev && !(prev->vm_flags & VM_GROWSDOWN) && vma_is_accessible(prev)) { @@ -2503,9 +2083,14 @@ int expand_downwards(struct vm_area_struct *vma, return -ENOMEM; } + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; + /* We must make sure the anon_vma is allocated. */ - if (unlikely(anon_vma_prepare(vma))) + if (unlikely(anon_vma_prepare(vma))) { + mas_destroy(&mas); return -ENOMEM; + } /* * vma->vm_start/vm_end cannot change under us because the caller @@ -2526,15 +2111,13 @@ int expand_downwards(struct vm_area_struct *vma, error = acct_stack_growth(vma, size, grow); if (!error) { /* - * vma_gap_update() doesn't support concurrent - * updates, but we only hold a shared mmap_lock - * lock here, so we need to protect against - * concurrent vma expansions. - * anon_vma_lock_write() doesn't help here, as - * we don't guarantee that all growable vmas - * in a mm share the same root anon vma. - * So, we reuse mm->page_table_lock to guard - * against concurrent vma expansions. + * We only hold a shared mmap_lock lock here, so + * we need to protect against concurrent vma + * expansions. anon_vma_lock_write() doesn't + * help here, as we don't guarantee that all + * growable vmas in a mm share the same root + * anon vma. So, we reuse mm->page_table_lock + * to guard against concurrent vma expansions. */ spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) @@ -2543,8 +2126,9 @@ int expand_downwards(struct vm_area_struct *vma, anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; + /* Overwrite old entry in mtree. */ + vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); - vma_gap_update(vma); spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); @@ -2553,7 +2137,7 @@ int expand_downwards(struct vm_area_struct *vma, } anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); - validate_mm(mm); + mas_destroy(&mas); return error; } @@ -2627,25 +2211,26 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL_GPL(find_extend_vma); /* - * Ok - we have the memory areas we should free on the vma list, - * so release them, and do the vma updates. + * Ok - we have the memory areas we should free on a maple tree so release them, + * and do the vma updates. * * Called with the mm semaphore held. */ -static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) +static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) { unsigned long nr_accounted = 0; + struct vm_area_struct *vma; /* Update high watermark before we lower total_vm */ update_hiwater_vm(mm); - do { + mas_for_each(mas, vma, ULONG_MAX) { long nrpages = vma_pages(vma); if (vma->vm_flags & VM_ACCOUNT) nr_accounted += nrpages; vm_stat_account(mm, vma->vm_flags, -nrpages); - vma = remove_vma(vma); - } while (vma); + remove_vma(vma); + } vm_unacct_memory(nr_accounted); validate_mm(mm); } @@ -2655,67 +2240,23 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) * * Called with the mm semaphore held. */ -static void unmap_region(struct mm_struct *mm, +static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, + struct vm_area_struct *next, unsigned long start, unsigned long end) { - struct vm_area_struct *next = vma_next(mm, prev); struct mmu_gather tlb; lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); - unmap_vmas(&tlb, vma, start, end); - free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, + unmap_vmas(&tlb, mt, vma, start, end); + free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); } /* - * Create a list of vma's touched by the unmap, removing them from the mm's - * vma list as we go.. - */ -static bool -detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, unsigned long end) -{ - struct vm_area_struct **insertion_point; - struct vm_area_struct *tail_vma = NULL; - - insertion_point = (prev ? &prev->vm_next : &mm->mmap); - vma->vm_prev = NULL; - do { - vma_rb_erase(vma, &mm->mm_rb); - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm -= vma_pages(vma); - mm->map_count--; - tail_vma = vma; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); - *insertion_point = vma; - if (vma) { - vma->vm_prev = prev; - vma_gap_update(vma); - } else - mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; - tail_vma->vm_next = NULL; - - /* Kill the cache */ - vmacache_invalidate(mm); - - /* - * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or - * VM_GROWSUP VMA. Such VMAs can change their size under - * down_read(mmap_lock) and collide with the VMA we are about to unmap. - */ - if (vma && (vma->vm_flags & VM_GROWSDOWN)) - return false; - if (prev && (prev->vm_flags & VM_GROWSUP)) - return false; - return true; -} - -/* * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. */ @@ -2724,6 +2265,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, { struct vm_area_struct *new; int err; + validate_mm_mt(mm); if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); @@ -2776,6 +2318,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, mpol_put(vma_policy(new)); out_free_vma: vm_area_free(new); + validate_mm_mt(mm); return err; } @@ -2792,38 +2335,48 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } -/* Munmap is split into 2 main parts -- this part which finds - * what needs doing, and the areas themselves, which do the - * work. This now handles partial unmappings. - * Jeremy Fitzhardinge <jeremy@goop.org> - */ -int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, - struct list_head *uf, bool downgrade) +static inline int munmap_sidetree(struct vm_area_struct *vma, + struct ma_state *mas_detach) { - unsigned long end; - struct vm_area_struct *vma, *prev, *last; - - if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) - return -EINVAL; + mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1); + if (mas_store_gfp(mas_detach, vma, GFP_KERNEL)) + return -ENOMEM; - len = PAGE_ALIGN(len); - end = start + len; - if (len == 0) - return -EINVAL; + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm -= vma_pages(vma); - /* - * arch_unmap() might do unmaps itself. It must be called - * and finish any rbtree manipulation before this code - * runs and also starts to manipulate the rbtree. - */ - arch_unmap(mm, start, end); + return 0; +} - /* Find the first overlapping VMA where start < vma->vm_end */ - vma = find_vma_intersection(mm, start, end); - if (!vma) - return 0; - prev = vma->vm_prev; +/* + * do_mas_align_munmap() - munmap the aligned region from @start to @end. + * @mas: The maple_state, ideally set up to alter the correct tree location. + * @vma: The starting vm_area_struct + * @mm: The mm_struct + * @start: The aligned start address to munmap. + * @end: The aligned end address to munmap. + * @uf: The userfaultfd list_head + * @downgrade: Set to true to attempt a write downgrade of the mmap_sem + * + * If @downgrade is true, check return code for potential release of the lock. + */ +static int +do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, + struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf, bool downgrade) +{ + struct vm_area_struct *prev, *next = NULL; + struct maple_tree mt_detach; + int count = 0; + int error = -ENOMEM; + MA_STATE(mas_detach, &mt_detach, 0, 0); + mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt_detach, &mm->mmap_lock); + + if (mas_preallocate(mas, vma, GFP_KERNEL)) + return -ENOMEM; + mas->last = end - 1; /* * If we need to split any vma, do it now to save pain later. * @@ -2831,8 +2384,9 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, * unmapped vm_area_struct will remain in use: so lower split_vma * places tmp vma above, and higher split_vma places tmp vma below. */ + + /* Does it split the first one? */ if (start > vma->vm_start) { - int error; /* * Make sure that map_count on return from munmap() will @@ -2840,22 +2394,61 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, * its limit temporarily, to help free resources as expected. */ if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) - return -ENOMEM; + goto map_count_exceeded; + /* + * mas_pause() is not needed since mas->index needs to be set + * differently than vma->vm_end anyways. + */ error = __split_vma(mm, vma, start, 0); if (error) - return error; - prev = vma; + goto start_split_failed; + + mas_set(mas, start); + vma = mas_walk(mas); } - /* Does it split the last one? */ - last = find_vma(mm, end); - if (last && end > last->vm_start) { - int error = __split_vma(mm, last, end, 1); + prev = mas_prev(mas, 0); + if (unlikely((!prev))) + mas_set(mas, start); + + /* + * Detach a range of VMAs from the mm. Using next as a temp variable as + * it is always overwritten. + */ + mas_for_each(mas, next, end - 1) { + /* Does it split the end? */ + if (next->vm_end > end) { + struct vm_area_struct *split; + + error = __split_vma(mm, next, end, 1); + if (error) + goto end_split_failed; + + mas_set(mas, end); + split = mas_prev(mas, 0); + error = munmap_sidetree(split, &mas_detach); + if (error) + goto munmap_sidetree_failed; + + count++; + if (vma == next) + vma = split; + break; + } + error = munmap_sidetree(next, &mas_detach); if (error) - return error; + goto munmap_sidetree_failed; + + count++; +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE + BUG_ON(next->vm_start < start); + BUG_ON(next->vm_start > end); +#endif } - vma = vma_next(mm, prev); + + if (!next) + next = mas_next(mas, ULONG_MAX); if (unlikely(uf)) { /* @@ -2867,30 +2460,366 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, * split, despite we could. This is unlikely enough * failure that it's not worth optimizing it for. */ - int error = userfaultfd_unmap_prep(vma, start, end, uf); + error = userfaultfd_unmap_prep(mm, start, end, uf); + if (error) - return error; + goto userfaultfd_error; + } + + /* Point of no return */ + mas_set_range(mas, start, end - 1); +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + /* Make sure no VMAs are about to be lost. */ + { + MA_STATE(test, &mt_detach, start, end - 1); + struct vm_area_struct *vma_mas, *vma_test; + int test_count = 0; + + rcu_read_lock(); + vma_test = mas_find(&test, end - 1); + mas_for_each(mas, vma_mas, end - 1) { + BUG_ON(vma_mas != vma_test); + test_count++; + vma_test = mas_next(&test, end - 1); + } + rcu_read_unlock(); + BUG_ON(count != test_count); + mas_set_range(mas, start, end - 1); + } +#endif + mas_store_prealloc(mas, NULL); + mm->map_count -= count; + /* + * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or + * VM_GROWSUP VMA. Such VMAs can change their size under + * down_read(mmap_lock) and collide with the VMA we are about to unmap. + */ + if (downgrade) { + if (next && (next->vm_flags & VM_GROWSDOWN)) + downgrade = false; + else if (prev && (prev->vm_flags & VM_GROWSUP)) + downgrade = false; + else + mmap_write_downgrade(mm); } - /* Detach vmas from rbtree */ - if (!detach_vmas_to_be_unmapped(mm, vma, prev, end)) - downgrade = false; + unmap_region(mm, &mt_detach, vma, prev, next, start, end); + /* Statistics and freeing VMAs */ + mas_set(&mas_detach, start); + remove_mt(mm, &mas_detach); + __mt_destroy(&mt_detach); + + + validate_mm(mm); + return downgrade ? 1 : 0; + +userfaultfd_error: +munmap_sidetree_failed: +end_split_failed: + __mt_destroy(&mt_detach); +start_split_failed: +map_count_exceeded: + mas_destroy(mas); + return error; +} + +/* + * do_mas_munmap() - munmap a given range. + * @mas: The maple state + * @mm: The mm_struct + * @start: The start address to munmap + * @len: The length of the range to munmap + * @uf: The userfaultfd list_head + * @downgrade: set to true if the user wants to attempt to write_downgrade the + * mmap_sem + * + * This function takes a @mas that is either pointing to the previous VMA or set + * to MA_START and sets it up to remove the mapping(s). The @len will be + * aligned and any arch_unmap work will be preformed. + * + * Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise. + */ +int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool downgrade) +{ + unsigned long end; + struct vm_area_struct *vma; + + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) + return -EINVAL; - if (downgrade) - mmap_write_downgrade(mm); + end = start + PAGE_ALIGN(len); + if (end == start) + return -EINVAL; - unmap_region(mm, vma, prev, start, end); + /* arch_unmap() might do unmaps itself. */ + arch_unmap(mm, start, end); - /* Fix up all other VM information */ - remove_vma_list(mm, vma); + /* Find the first overlapping VMA */ + vma = mas_find(mas, end - 1); + if (!vma) + return 0; - return downgrade ? 1 : 0; + return do_mas_align_munmap(mas, vma, mm, start, end, uf, downgrade); } +/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. + * @mm: The mm_struct + * @start: The start address to munmap + * @len: The length to be munmapped. + * @uf: The userfaultfd list_head + */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { - return __do_munmap(mm, start, len, uf, false); + MA_STATE(mas, &mm->mm_mt, start, start); + + return do_mas_munmap(&mas, mm, start, len, uf, false); +} + +unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + struct vm_area_struct *next, *prev, *merge; + pgoff_t pglen = len >> PAGE_SHIFT; + unsigned long charged = 0; + unsigned long end = addr + len; + unsigned long merge_start = addr, merge_end = end; + pgoff_t vm_pgoff; + int error; + MA_STATE(mas, &mm->mm_mt, addr, end - 1); + + /* Check against address space limit. */ + if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { + unsigned long nr_pages; + + /* + * MAP_FIXED may remove pages of mappings that intersects with + * requested mapping. Account for the pages it would unmap. + */ + nr_pages = count_vma_pages_range(mm, addr, end); + + if (!may_expand_vm(mm, vm_flags, + (len >> PAGE_SHIFT) - nr_pages)) + return -ENOMEM; + } + + /* Unmap any existing mapping in the area */ + if (do_mas_munmap(&mas, mm, addr, len, uf, false)) + return -ENOMEM; + + /* + * Private writable mapping: check memory availability + */ + if (accountable_mapping(file, vm_flags)) { + charged = len >> PAGE_SHIFT; + if (security_vm_enough_memory_mm(mm, charged)) + return -ENOMEM; + vm_flags |= VM_ACCOUNT; + } + + next = mas_next(&mas, ULONG_MAX); + prev = mas_prev(&mas, 0); + if (vm_flags & VM_SPECIAL) + goto cannot_expand; + + /* Attempt to expand an old mapping */ + /* Check next */ + if (next && next->vm_start == end && !vma_policy(next) && + can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, + NULL_VM_UFFD_CTX, NULL)) { + merge_end = next->vm_end; + vma = next; + vm_pgoff = next->vm_pgoff - pglen; + } + + /* Check prev */ + if (prev && prev->vm_end == addr && !vma_policy(prev) && + (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, + pgoff, vma->vm_userfaultfd_ctx, NULL) : + can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, + NULL_VM_UFFD_CTX , NULL))) { + merge_start = prev->vm_start; + vma = prev; + vm_pgoff = prev->vm_pgoff; + } + + + /* Actually expand, if possible */ + if (vma && + !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) { + khugepaged_enter_vma(vma, vm_flags); + goto expanded; + } + + mas.index = addr; + mas.last = end - 1; +cannot_expand: + /* + * Determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. + */ + vma = vm_area_alloc(mm); + if (!vma) { + error = -ENOMEM; + goto unacct_error; + } + + vma->vm_start = addr; + vma->vm_end = end; + vma->vm_flags = vm_flags; + vma->vm_page_prot = vm_get_page_prot(vm_flags); + vma->vm_pgoff = pgoff; + + if (file) { + if (vm_flags & VM_SHARED) { + error = mapping_map_writable(file->f_mapping); + if (error) + goto free_vma; + } + + vma->vm_file = get_file(file); + error = call_mmap(file, vma); + if (error) + goto unmap_and_free_vma; + + /* Can addr have changed?? + * + * Answer: Yes, several device drivers can do it in their + * f_op->mmap method. -DaveM + */ + WARN_ON_ONCE(addr != vma->vm_start); + + addr = vma->vm_start; + mas_reset(&mas); + + /* + * If vm_flags changed after call_mmap(), we should try merge + * vma again as we may succeed this time. + */ + if (unlikely(vm_flags != vma->vm_flags && prev)) { + merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, + NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); + if (merge) { + /* + * ->mmap() can change vma->vm_file and fput + * the original file. So fput the vma->vm_file + * here or we would add an extra fput for file + * and cause general protection fault + * ultimately. + */ + fput(vma->vm_file); + vm_area_free(vma); + vma = merge; + /* Update vm_flags to pick up the change. */ + addr = vma->vm_start; + vm_flags = vma->vm_flags; + goto unmap_writable; + } + } + + vm_flags = vma->vm_flags; + } else if (vm_flags & VM_SHARED) { + error = shmem_zero_setup(vma); + if (error) + goto free_vma; + } else { + vma_set_anonymous(vma); + } + + /* Allow architectures to sanity-check the vm_flags */ + if (!arch_validate_flags(vma->vm_flags)) { + error = -EINVAL; + if (file) + goto unmap_and_free_vma; + else + goto free_vma; + } + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + error = -ENOMEM; + if (file) + goto unmap_and_free_vma; + else + goto free_vma; + } + + if (vma->vm_file) + i_mmap_lock_write(vma->vm_file->f_mapping); + + vma_mas_store(vma, &mas); + mm->map_count++; + if (vma->vm_file) { + if (vma->vm_flags & VM_SHARED) + mapping_allow_writable(vma->vm_file->f_mapping); + + flush_dcache_mmap_lock(vma->vm_file->f_mapping); + vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); + flush_dcache_mmap_unlock(vma->vm_file->f_mapping); + i_mmap_unlock_write(vma->vm_file->f_mapping); + } + + /* + * vma_merge() calls khugepaged_enter_vma() either, the below + * call covers the non-merge case. + */ + khugepaged_enter_vma(vma, vma->vm_flags); + + /* Once vma denies write, undo our temporary denial count */ +unmap_writable: + if (file && vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); + file = vma->vm_file; +expanded: + perf_event_mmap(vma); + + vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); + if (vm_flags & VM_LOCKED) { + if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current->mm)) + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + else + mm->locked_vm += (len >> PAGE_SHIFT); + } + + if (file) + uprobe_mmap(vma); + + /* + * New (or expanded) vma always get soft dirty status. + * Otherwise user-space soft-dirty page tracker won't + * be able to distinguish situation when vma area unmapped, + * then new mapped in-place (which must be aimed as + * a completely new data area). + */ + vma->vm_flags |= VM_SOFTDIRTY; + + vma_set_page_prot(vma); + + validate_mm(mm); + return addr; + +unmap_and_free_vma: + fput(vma->vm_file); + vma->vm_file = NULL; + + /* Undo any partial mapping done by a device driver. */ + unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end); + if (vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); +free_vma: + vm_area_free(vma); +unacct_error: + if (charged) + vm_unacct_memory(charged); + validate_mm(mm); + return error; } static int __vm_munmap(unsigned long start, size_t len, bool downgrade) @@ -2898,11 +2827,12 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade) int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, start, start); if (mmap_write_lock_killable(mm)) return -EINTR; - ret = __do_munmap(mm, start, len, &uf, downgrade); + ret = do_mas_munmap(&mas, mm, start, len, &uf, downgrade); /* * Returning 1 indicates mmap_lock is downgraded. * But 1 is not legal return value of vm_munmap() and munmap(), reset @@ -2968,11 +2898,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, goto out; if (start + size > vma->vm_end) { - struct vm_area_struct *next; + VMA_ITERATOR(vmi, mm, vma->vm_end); + struct vm_area_struct *next, *prev = vma; - for (next = vma->vm_next; next; next = next->vm_next) { + for_each_vma_range(vmi, next, start + size) { /* hole between vmas ? */ - if (next->vm_start != next->vm_prev->vm_end) + if (next->vm_start != prev->vm_end) goto out; if (next->vm_file != vma->vm_file) @@ -2981,8 +2912,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (next->vm_flags != vma->vm_flags) goto out; - if (start + size <= next->vm_end) - break; + prev = next; } if (!next) @@ -3012,37 +2942,106 @@ out: } /* - * this is really a simplified "do_mmap". it only handles - * anonymous maps. eventually we may be able to do some - * brk-specific accounting here. + * brk_munmap() - Unmap a parital vma. + * @mas: The maple tree state. + * @vma: The vma to be modified + * @newbrk: the start of the address to unmap + * @oldbrk: The end of the address to unmap + * @uf: The userfaultfd list_head + * + * Returns: 1 on success. + * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if + * possible. */ -static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf) +static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long newbrk, unsigned long oldbrk, + struct list_head *uf) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev; - struct rb_node **rb_link, *rb_parent; - pgoff_t pgoff = addr >> PAGE_SHIFT; - int error; - unsigned long mapped_addr; + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct unmap, *next; + unsigned long unmap_pages; + int ret; - /* Until we need other flags, refuse anything except VM_EXEC. */ - if ((flags & (~VM_EXEC)) != 0) - return -EINVAL; - flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + arch_unmap(mm, newbrk, oldbrk); - mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (IS_ERR_VALUE(mapped_addr)) - return mapped_addr; + if (likely((vma->vm_end < oldbrk) || (vma->vm_start >= newbrk))) { + /* remove entire mapping(s) */ + ret = do_mas_align_munmap(mas, vma, mm, newbrk, oldbrk, uf, + true); + goto munmap_full_vma; + } - error = mlock_future_check(mm, mm->def_flags, len); - if (error) - return error; + ret = userfaultfd_unmap_prep(mm, newbrk, oldbrk, uf); + if (ret) + return ret; + + ret = 1; - /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ - if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) + /* Change the oldbrk of vma to the newbrk of the munmap area */ + vma_adjust_trans_huge(vma, vma->vm_start, newbrk, 0); + if (mas_preallocate(mas, vma, GFP_KERNEL)) return -ENOMEM; - /* Check against address space limits *after* clearing old maps... */ + if (vma->anon_vma) { + anon_vma_lock_write(vma->anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + } + + vma->vm_end = newbrk; + vma_init(&unmap, mm); + unmap.vm_start = newbrk; + unmap.vm_end = oldbrk; + if (vma->anon_vma) + vma_set_anonymous(&unmap); + + vma_mas_remove(&unmap, mas); + + vma->vm_end = newbrk; + if (vma->anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(vma->anon_vma); + } + + unmap_pages = vma_pages(&unmap); + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm -= unmap_pages; + + next = mas_next(mas, ULONG_MAX); + mmap_write_downgrade(mm); + unmap_region(mm, mas->tree, &unmap, vma, next, newbrk, oldbrk); + /* Statistics */ + vm_stat_account(mm, vma->vm_flags, -unmap_pages); + if (vma->vm_flags & VM_ACCOUNT) + vm_unacct_memory(unmap_pages); + +munmap_full_vma: + validate_mm_mt(mm); + return ret; +} + +/* + * do_brk_flags() - Increase the brk vma if the flags match. + * @mas: The maple tree state. + * @addr: The start address + * @len: The length of the increase + * @vma: The vma, + * @flags: The VMA Flags + * + * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags + * do not match then create a new anonymous VMA. Eventually we may be able to + * do some brk-specific accounting here. + */ +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long addr, unsigned long len, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + validate_mm_mt(mm); + + /* + * Check against address space limits by the changed size + * Note: This happens *after* clearing old mappings in some code paths. + */ + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; @@ -3052,28 +3051,50 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) return -ENOMEM; - /* Can we just expand an old private anonymous mapping? */ - vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); - if (vma) - goto out; - /* - * create a vma struct for an anonymous mapping + * Expand the existing vma if possible; Note that singular lists do not + * occur after forking, so the expand will only happen on new VMAs. */ - vma = vm_area_alloc(mm); - if (!vma) { - vm_unacct_memory(len >> PAGE_SHIFT); - return -ENOMEM; + if (vma && + (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) && + ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) { + mas->index = vma->vm_start; + mas->last = addr + len - 1; + vma_adjust_trans_huge(vma, addr, addr + len, 0); + if (vma->anon_vma) { + anon_vma_lock_write(vma->anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + } + vma->vm_end = addr + len; + vma->vm_flags |= VM_SOFTDIRTY; + if (mas_store_gfp(mas, vma, GFP_KERNEL)) + return -ENOMEM; + + if (vma->anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(vma->anon_vma); + } + khugepaged_enter_vma(vma, flags); + goto out; } + /* create a vma struct for an anonymous mapping */ + vma = vm_area_alloc(mm); + if (!vma) + goto vma_alloc_fail; + vma_set_anonymous(vma); vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_pgoff = pgoff; + vma->vm_pgoff = addr >> PAGE_SHIFT; vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); - vma_link(mm, vma, prev, rb_link, rb_parent); + mas_set_range(mas, vma->vm_start, addr + len - 1); + if ( mas_store_gfp(mas, vma, GFP_KERNEL)) + goto mas_store_fail; + + mm->map_count++; + out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; @@ -3081,16 +3102,25 @@ out: if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vma->vm_flags |= VM_SOFTDIRTY; + validate_mm(mm); return 0; + +mas_store_fail: + vm_area_free(vma); +vma_alloc_fail: + vm_unacct_memory(len >> PAGE_SHIFT); + return -ENOMEM; } int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; unsigned long len; int ret; bool populate; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, addr, addr); len = PAGE_ALIGN(request); if (len < request) @@ -3101,13 +3131,36 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (mmap_write_lock_killable(mm)) return -EINTR; - ret = do_brk_flags(addr, len, flags, &uf); + /* Until we need other flags, refuse anything except VM_EXEC. */ + if ((flags & (~VM_EXEC)) != 0) + return -EINVAL; + + ret = check_brk_limits(addr, len); + if (ret) + goto limits_failed; + + ret = do_mas_munmap(&mas, mm, addr, len, &uf, 0); + if (ret) + goto munmap_failed; + + vma = mas_prev(&mas, 0); + if (!vma || vma->vm_end != addr || vma_policy(vma) || + !can_vma_merge_after(vma, flags, NULL, NULL, + addr >> PAGE_SHIFT,NULL_VM_UFFD_CTX, NULL)) + vma = NULL; + + ret = do_brk_flags(&mas, vma, addr, len, flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); return ret; + +munmap_failed: +limits_failed: + mmap_write_unlock(mm); + return ret; } EXPORT_SYMBOL(vm_brk_flags); @@ -3123,34 +3176,19 @@ void exit_mmap(struct mm_struct *mm) struct mmu_gather tlb; struct vm_area_struct *vma; unsigned long nr_accounted = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + int count = 0; /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); - if (unlikely(mm_is_oom_victim(mm))) { - /* - * Manually reap the mm to free as much memory as possible. - * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard - * this mm from further consideration. Taking mm->mmap_lock for - * write after setting MMF_OOM_SKIP will guarantee that the oom - * reaper will not run on this mm again after mmap_lock is - * dropped. - * - * Nothing can be holding mm->mmap_lock here and the above call - * to mmu_notifier_release(mm) ensures mmu notifier callbacks in - * __oom_reap_task_mm() will not block. - */ - (void)__oom_reap_task_mm(mm); - set_bit(MMF_OOM_SKIP, &mm->flags); - } - - mmap_write_lock(mm); + mmap_read_lock(mm); arch_exit_mmap(mm); - vma = mm->mmap; + vma = mas_find(&mas, ULONG_MAX); if (!vma) { /* Can happen if dup_mmap() received an OOM */ - mmap_write_unlock(mm); + mmap_read_unlock(mm); return; } @@ -3158,19 +3196,38 @@ void exit_mmap(struct mm_struct *mm) flush_cache_mm(mm); tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ - /* Use -1 here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, vma, 0, -1); - free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); + /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ + unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX); + mmap_read_unlock(mm); + + /* + * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper + * because the memory has been already freed. Do not bother checking + * mm_is_oom_victim because setting a bit unconditionally is cheaper. + */ + set_bit(MMF_OOM_SKIP, &mm->flags); + mmap_write_lock(mm); + free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, + USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); - /* Walk the list again, actually closing and freeing it. */ - while (vma) { + /* + * Walk the list again, actually closing and freeing it, with preemption + * enabled, without holding any MM locks besides the unreachable + * mmap_write_lock. + */ + do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); - vma = remove_vma(vma); + remove_vma(vma); + count++; cond_resched(); - } - mm->mmap = NULL; + } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + + BUG_ON(count != mm->map_count); + + trace_exit_mmap(mm); + __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } @@ -3181,12 +3238,9 @@ void exit_mmap(struct mm_struct *mm) */ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *prev; - struct rb_node **rb_link, *rb_parent; - - if (find_vma_links(mm, vma->vm_start, vma->vm_end, - &prev, &rb_link, &rb_parent)) + if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; + if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, vma_pages(vma))) return -ENOMEM; @@ -3208,7 +3262,9 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - vma_link(mm, vma, prev, rb_link, rb_parent); + if (vma_link(mm, vma)) + return -ENOMEM; + return 0; } @@ -3224,9 +3280,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long vma_start = vma->vm_start; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *prev; - struct rb_node **rb_link, *rb_parent; bool faulted_in_anon_vma = true; + validate_mm_mt(mm); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. @@ -3236,8 +3292,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } - if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) + new_vma = find_vma_prev(mm, addr, &prev); + if (new_vma->vm_start < addr + len) return NULL; /* should never get here */ + new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); @@ -3278,16 +3336,22 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); - vma_link(mm, new_vma, prev, rb_link, rb_parent); + if (vma_link(mm, new_vma)) + goto out_vma_link; *need_rmap_locks = false; } + validate_mm_mt(mm); return new_vma; +out_vma_link: + if (new_vma->vm_ops && new_vma->vm_ops->close) + new_vma->vm_ops->close(new_vma); out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: vm_area_free(new_vma); out: + validate_mm_mt(mm); return NULL; } @@ -3424,6 +3488,7 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; + validate_mm_mt(mm); vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); @@ -3446,10 +3511,12 @@ static struct vm_area_struct *__install_special_mapping( perf_event_mmap(vma); + validate_mm_mt(mm); return vma; out: vm_area_free(vma); + validate_mm_mt(mm); return ERR_PTR(ret); } @@ -3574,12 +3641,13 @@ int mm_take_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; + MA_STATE(mas, &mm->mm_mt, 0, 0); mmap_assert_write_locked(mm); mutex_lock(&mm_all_locks_mutex); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && @@ -3587,7 +3655,8 @@ int mm_take_all_locks(struct mm_struct *mm) vm_lock_mapping(mm, vma->vm_file->f_mapping); } - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_set(&mas, 0); + mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && @@ -3595,7 +3664,8 @@ int mm_take_all_locks(struct mm_struct *mm) vm_lock_mapping(mm, vma->vm_file->f_mapping); } - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_set(&mas, 0); + mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) @@ -3654,11 +3724,12 @@ void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; + MA_STATE(mas, &mm->mm_mt, 0, 0); mmap_assert_write_locked(mm); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_unlock_anon_vma(avc->anon_vma); diff --git a/mm/mprotect.c b/mm/mprotect.c index ba5592655ee3..041beeb5fad6 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -38,6 +38,39 @@ #include "internal.h" +static inline bool can_change_pte_writable(struct vm_area_struct *vma, + unsigned long addr, pte_t pte) +{ + struct page *page; + + VM_BUG_ON(!(vma->vm_flags & VM_WRITE) || pte_write(pte)); + + if (pte_protnone(pte) || !pte_dirty(pte)) + return false; + + /* Do we need write faults for softdirty tracking? */ + if ((vma->vm_flags & VM_SOFTDIRTY) && !pte_soft_dirty(pte)) + return false; + + /* Do we need write faults for uffd-wp tracking? */ + if (userfaultfd_pte_wp(vma, pte)) + return false; + + if (!(vma->vm_flags & VM_SHARED)) { + /* + * We can only special-case on exclusive anonymous pages, + * because we know that our write-fault handler similarly would + * map them writable without any additional checks while holding + * the PT lock. + */ + page = vm_normal_page(vma, addr, pte); + if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + return false; + } + + return true; +} + static unsigned long change_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) @@ -46,7 +79,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; unsigned long pages = 0; int target_node = NUMA_NO_NODE; - bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; @@ -95,7 +127,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, continue; page = vm_normal_page(vma, addr, oldpte); - if (!page || PageKsm(page)) + if (!page || is_zone_device_page(page) || PageKsm(page)) continue; /* Also skip shared copy-on-write pages */ @@ -137,21 +169,27 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, ptent = pte_wrprotect(ptent); ptent = pte_mkuffd_wp(ptent); } else if (uffd_wp_resolve) { - /* - * Leave the write bit to be handled - * by PF interrupt handler, then - * things like COW could be properly - * handled. - */ ptent = pte_clear_uffd_wp(ptent); } - /* Avoid taking write faults for known dirty pages */ - if (dirty_accountable && pte_dirty(ptent) && - (pte_soft_dirty(ptent) || - !(vma->vm_flags & VM_SOFTDIRTY))) { + /* + * In some writable, shared mappings, we might want + * to catch actual write access -- see + * vma_wants_writenotify(). + * + * In all writable, private mappings, we have to + * properly handle COW. + * + * In both cases, we can sometimes still change PTEs + * writable and avoid the write-fault handler, for + * example, if a PTE is already dirty and no other + * COW or special handling is required. + */ + if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && + !pte_write(ptent) && + can_change_pte_writable(vma, addr, ptent)) ptent = pte_mkwrite(ptent); - } + ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); if (pte_needs_flush(oldpte, ptent)) tlb_flush_pte_range(tlb, addr, PAGE_SIZE); @@ -505,9 +543,9 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long oldflags = vma->vm_flags; long nrpages = (end - start) >> PAGE_SHIFT; unsigned long charged = 0; + bool try_change_writable; pgoff_t pgoff; int error; - int dirty_accountable = 0; if (newflags == oldflags) { *pprev = vma; @@ -583,11 +621,20 @@ success: * held in write mode. */ vma->vm_flags = newflags; - dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot); + /* + * We want to check manually if we can change individual PTEs writable + * if we can't do that automatically for all PTEs in a mapping. For + * private mappings, that's always the case when we have write + * permissions as we properly have to handle COW. + */ + if (vma->vm_flags & VM_SHARED) + try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot); + else + try_change_writable = !!(vma->vm_flags & VM_WRITE); vma_set_page_prot(vma); change_protection(tlb, vma, start, end, vma->vm_page_prot, - dirty_accountable ? MM_CP_DIRTY_ACCT : 0); + try_change_writable ? MM_CP_TRY_CHANGE_WRITABLE : 0); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major @@ -621,6 +668,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); struct mmu_gather tlb; + MA_STATE(mas, ¤t->mm->mm_mt, start, start); start = untagged_addr(start); @@ -652,7 +700,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey)) goto out; - vma = find_vma(current->mm, start); + vma = mas_find(&mas, ULONG_MAX); error = -ENOMEM; if (!vma) goto out; @@ -678,7 +726,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (start > vma->vm_start) prev = vma; else - prev = vma->vm_prev; + prev = mas_prev(&mas, 0); tlb_gather_mmu(&tlb, current->mm); for (nstart = start ; ; ) { @@ -741,7 +789,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (nstart >= end) break; - vma = prev->vm_next; + vma = find_vma(current->mm, prev->vm_end); if (!vma || vma->vm_start != nstart) { error = -ENOMEM; break; diff --git a/mm/mremap.c b/mm/mremap.c index b522cd0259a0..e465ffe279bb 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -9,6 +9,7 @@ */ #include <linux/mm.h> +#include <linux/mm_inline.h> #include <linux/hugetlb.h> #include <linux/shm.h> #include <linux/ksm.h> @@ -23,6 +24,7 @@ #include <linux/mmu_notifier.h> #include <linux/uaccess.h> #include <linux/userfaultfd_k.h> +#include <linux/mempolicy.h> #include <asm/cacheflush.h> #include <asm/tlb.h> @@ -716,7 +718,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (excess) { vma->vm_flags |= VM_ACCOUNT; if (split) - vma->vm_next->vm_flags |= VM_ACCOUNT; + find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT; } return new_addr; @@ -866,9 +868,10 @@ out: static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) { unsigned long end = vma->vm_end + delta; + if (end < vma->vm_end) /* overflow */ return 0; - if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */ + if (find_vma_intersection(vma->vm_mm, vma->vm_end, end)) return 0; if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, 0, MAP_FIXED) & ~PAGE_MASK) @@ -975,20 +978,23 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. - * __do_munmap does all the needed commit accounting, and + * do_mas_munmap does all the needed commit accounting, and * downgrades mmap_lock to read if so directed. */ if (old_len >= new_len) { int retval; + MA_STATE(mas, &mm->mm_mt, addr + new_len, addr + new_len); - retval = __do_munmap(mm, addr+new_len, old_len - new_len, - &uf_unmap, true); - if (retval < 0 && old_len != new_len) { - ret = retval; - goto out; + retval = do_mas_munmap(&mas, mm, addr + new_len, + old_len - new_len, &uf_unmap, true); /* Returning 1 indicates mmap_lock is downgraded to read. */ - } else if (retval == 1) + if (retval == 1) { downgraded = true; + } else if (retval < 0 && old_len != new_len) { + ret = retval; + goto out; + } + ret = addr; goto out; } @@ -1008,6 +1014,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* can we just expand the current mapping? */ if (vma_expandable(vma, new_len - old_len)) { long pages = (new_len - old_len) >> PAGE_SHIFT; + unsigned long extension_start = addr + old_len; + unsigned long extension_end = addr + new_len; + pgoff_t extension_pgoff = vma->vm_pgoff + (old_len >> PAGE_SHIFT); if (vma->vm_flags & VM_ACCOUNT) { if (security_vm_enough_memory_mm(mm, pages)) { @@ -1016,8 +1025,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } } - if (vma_adjust(vma, vma->vm_start, addr + new_len, - vma->vm_pgoff, NULL)) { + /* + * Function vma_merge() is called on the extension we are adding to + * the already existing vma, vma_merge() will merge this extension with + * the already existing vma (expand operation itself) and possibly also + * with the next vma if it becomes adjacent to the expanded vma and + * otherwise compatible. + */ + vma = vma_merge(mm, vma, extension_start, extension_end, + vma->vm_flags, vma->anon_vma, vma->vm_file, + extension_pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + if (!vma) { vm_unacct_memory(pages); ret = -ENOMEM; goto out; diff --git a/mm/msync.c b/mm/msync.c index 137d1c104f3e..ac4c9bfea2e7 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -104,7 +104,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) error = 0; goto out_unlock; } - vma = vma->vm_next; + vma = find_vma(mm, vma->vm_end); } } out_unlock: diff --git a/mm/nommu.c b/mm/nommu.c index 9d7afc2d959e..7b3fad611895 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -19,7 +19,6 @@ #include <linux/export.h> #include <linux/mm.h> #include <linux/sched/mm.h> -#include <linux/vmacache.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/file.h> @@ -545,26 +544,27 @@ static void put_nommu_region(struct vm_region *region) __put_nommu_region(region); } -/* - * add a VMA into a process's mm_struct in the appropriate place in the list - * and tree and add to the address space's page tree also if not an anonymous - * page - * - should be called with mm->mmap_lock held writelocked - */ -static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) { - struct vm_area_struct *pvma, *prev; - struct address_space *mapping; - struct rb_node **p, *parent, *rb_prev; + mas_set_range(mas, vma->vm_start, vma->vm_end - 1); + mas_store_prealloc(mas, vma); +} - BUG_ON(!vma->vm_region); +void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) +{ + mas->index = vma->vm_start; + mas->last = vma->vm_end - 1; + mas_store_prealloc(mas, NULL); +} +static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) +{ mm->map_count++; vma->vm_mm = mm; /* add the VMA to the mapping */ if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); flush_dcache_mmap_lock(mapping); @@ -572,67 +572,51 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } +} - /* add the VMA to the tree */ - parent = rb_prev = NULL; - p = &mm->mm_rb.rb_node; - while (*p) { - parent = *p; - pvma = rb_entry(parent, struct vm_area_struct, vm_rb); - - /* sort by: start addr, end addr, VMA struct addr in that order - * (the latter is necessary as we may get identical VMAs) */ - if (vma->vm_start < pvma->vm_start) - p = &(*p)->rb_left; - else if (vma->vm_start > pvma->vm_start) { - rb_prev = parent; - p = &(*p)->rb_right; - } else if (vma->vm_end < pvma->vm_end) - p = &(*p)->rb_left; - else if (vma->vm_end > pvma->vm_end) { - rb_prev = parent; - p = &(*p)->rb_right; - } else if (vma < pvma) - p = &(*p)->rb_left; - else if (vma > pvma) { - rb_prev = parent; - p = &(*p)->rb_right; - } else - BUG(); - } - - rb_link_node(&vma->vm_rb, parent, p); - rb_insert_color(&vma->vm_rb, &mm->mm_rb); +/* + * mas_add_vma_to_mm() - Maple state variant of add_mas_to_mm(). + * @mas: The maple state with preallocations. + * @mm: The mm_struct + * @vma: The vma to add + * + */ +static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, + struct vm_area_struct *vma) +{ + BUG_ON(!vma->vm_region); - /* add VMA to the VMA list also */ - prev = NULL; - if (rb_prev) - prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); + setup_vma_to_mm(vma, mm); - __vma_link_list(mm, vma, prev); + /* add the VMA to the tree */ + vma_mas_store(vma, mas); } /* - * delete a VMA from its owning mm_struct and address space + * add a VMA into a process's mm_struct in the appropriate place in the list + * and tree and add to the address space's page tree also if not an anonymous + * page + * - should be called with mm->mmap_lock held writelocked */ -static void delete_vma_from_mm(struct vm_area_struct *vma) -{ - int i; - struct address_space *mapping; - struct mm_struct *mm = vma->vm_mm; - struct task_struct *curr = current; - - mm->map_count--; - for (i = 0; i < VMACACHE_SIZE; i++) { - /* if the vma is cached, invalidate the entire cache */ - if (curr->vmacache.vmas[i] == vma) { - vmacache_invalidate(mm); - break; - } +static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) +{ + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + return -ENOMEM; } + mas_add_vma_to_mm(&mas, mm, vma); + return 0; +} +static void cleanup_vma_from_mm(struct vm_area_struct *vma) +{ + vma->vm_mm->map_count--; /* remove the VMA from the mapping */ if (vma->vm_file) { + struct address_space *mapping; mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); @@ -641,11 +625,24 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } +} +/* + * delete a VMA from its owning mm_struct and address space + */ +static int delete_vma_from_mm(struct vm_area_struct *vma) +{ + MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); - /* remove from the MM's tree and list */ - rb_erase(&vma->vm_rb, &mm->mm_rb); + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + return -ENOMEM; + } + cleanup_vma_from_mm(vma); - __vma_unlink_list(mm, vma); + /* remove from the MM's tree and list */ + vma_mas_remove(vma, &mas); + return 0; } /* @@ -661,31 +658,26 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) vm_area_free(vma); } +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) +{ + unsigned long index = start_addr; + + mmap_assert_locked(mm); + return mt_find(&mm->mm_mt, &index, end_addr - 1); +} +EXPORT_SYMBOL(find_vma_intersection); + /* * look up the first VMA in which addr resides, NULL if none * - should be called with mm->mmap_lock at least held readlocked */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma; - - /* check the cache first */ - vma = vmacache_find(mm, addr); - if (likely(vma)) - return vma; - - /* trawl the list (there may be multiple mappings in which addr - * resides) */ - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_start > addr) - return NULL; - if (vma->vm_end > addr) { - vmacache_update(addr, vma); - return vma; - } - } + MA_STATE(mas, &mm->mm_mt, addr, addr); - return NULL; + return mas_walk(&mas); } EXPORT_SYMBOL(find_vma); @@ -717,26 +709,17 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, { struct vm_area_struct *vma; unsigned long end = addr + len; + MA_STATE(mas, &mm->mm_mt, addr, addr); - /* check the cache first */ - vma = vmacache_find_exact(mm, addr, end); - if (vma) - return vma; - - /* trawl the list (there may be multiple mappings in which addr - * resides) */ - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_start < addr) - continue; - if (vma->vm_start > addr) - return NULL; - if (vma->vm_end == end) { - vmacache_update(addr, vma); - return vma; - } - } + vma = mas_walk(&mas); + if (!vma) + return NULL; + if (vma->vm_start != addr) + return NULL; + if (vma->vm_end != end) + return NULL; - return NULL; + return vma; } /* @@ -1069,6 +1052,7 @@ unsigned long do_mmap(struct file *file, vm_flags_t vm_flags; unsigned long capabilities, result; int ret; + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); *populate = 0; @@ -1087,6 +1071,7 @@ unsigned long do_mmap(struct file *file, * now know into VMA flags */ vm_flags = determine_vm_flags(file, prot, flags, capabilities); + /* we're going to need to record the mapping */ region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); if (!region) @@ -1096,6 +1081,9 @@ unsigned long do_mmap(struct file *file, if (!vma) goto error_getting_vma; + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + goto error_maple_preallocate; + region->vm_usage = 1; region->vm_flags = vm_flags; region->vm_pgoff = pgoff; @@ -1236,7 +1224,7 @@ unsigned long do_mmap(struct file *file, current->mm->total_vm += len >> PAGE_SHIFT; share: - add_vma_to_mm(current->mm, vma); + mas_add_vma_to_mm(&mas, current->mm, vma); /* we flush the region from the icache only when the first executable * mapping of it is made */ @@ -1262,6 +1250,7 @@ error: sharing_violation: up_write(&nommu_region_sem); + mas_destroy(&mas); pr_warn("Attempt to share mismatched mappings\n"); ret = -EINVAL; goto error; @@ -1278,6 +1267,14 @@ error_getting_region: len, current->pid); show_free_areas(0, NULL); return -ENOMEM; + +error_maple_preallocate: + kmem_cache_free(vm_region_jar, region); + vm_area_free(vma); + pr_warn("Allocation of vma tree for process %d failed\n", current->pid); + show_free_areas(0, NULL); + return -ENOMEM; + } unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, @@ -1343,6 +1340,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *new; struct vm_region *region; unsigned long npages; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ @@ -1378,7 +1376,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); - delete_vma_from_mm(vma); down_write(&nommu_region_sem); delete_nommu_region(vma->vm_region); if (new_below) { @@ -1391,8 +1388,17 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, add_nommu_region(vma->vm_region); add_nommu_region(new->vm_region); up_write(&nommu_region_sem); - add_vma_to_mm(mm, vma); - add_vma_to_mm(mm, new); + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + return -ENOMEM; + } + + setup_vma_to_mm(vma, mm); + setup_vma_to_mm(new, mm); + mas_set_range(&mas, vma->vm_start, vma->vm_end - 1); + mas_store(&mas, vma); + vma_mas_store(new, &mas); return 0; } @@ -1408,12 +1414,14 @@ static int shrink_vma(struct mm_struct *mm, /* adjust the VMA's pointers, which may reposition it in the MM's tree * and list */ - delete_vma_from_mm(vma); + if (delete_vma_from_mm(vma)) + return -ENOMEM; if (from > vma->vm_start) vma->vm_end = from; else vma->vm_start = to; - add_vma_to_mm(mm, vma); + if (add_vma_to_mm(mm, vma)) + return -ENOMEM; /* cut the backing region down to size */ region = vma->vm_region; @@ -1441,9 +1449,10 @@ static int shrink_vma(struct mm_struct *mm, */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { + MA_STATE(mas, &mm->mm_mt, start, start); struct vm_area_struct *vma; unsigned long end; - int ret; + int ret = 0; len = PAGE_ALIGN(len); if (len == 0) @@ -1452,7 +1461,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list end = start + len; /* find the first potentially overlapping VMA */ - vma = find_vma(mm, start); + vma = mas_find(&mas, end - 1); if (!vma) { static int limit; if (limit < 5) { @@ -1471,7 +1480,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list return -EINVAL; if (end == vma->vm_end) goto erase_whole_vma; - vma = vma->vm_next; + vma = mas_next(&mas, end - 1); } while (vma); return -EINVAL; } else { @@ -1493,9 +1502,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list } erase_whole_vma: - delete_vma_from_mm(vma); + if (delete_vma_from_mm(vma)) + ret = -ENOMEM; delete_vma(mm, vma); - return 0; + return ret; } int vm_munmap(unsigned long addr, size_t len) @@ -1520,6 +1530,7 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) */ void exit_mmap(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; if (!mm) @@ -1527,12 +1538,18 @@ void exit_mmap(struct mm_struct *mm) mm->total_vm = 0; - while ((vma = mm->mmap)) { - mm->mmap = vma->vm_next; - delete_vma_from_mm(vma); + /* + * Lock the mm to avoid assert complaining even though this is the only + * user of the mm + */ + mmap_write_lock(mm); + for_each_vma(vmi, vma) { + cleanup_vma_from_mm(vma); delete_vma(mm, vma); cond_resched(); } + __mt_destroy(&mm->mm_mt); + mmap_write_unlock(mm); } int vm_brk(unsigned long addr, unsigned long len) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3c6cf9e3cd66..35ec75cdfee2 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -509,10 +509,11 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); -bool __oom_reap_task_mm(struct mm_struct *mm) +static bool __oom_reap_task_mm(struct mm_struct *mm) { struct vm_area_struct *vma; bool ret = true; + VMA_ITERATOR(vmi, mm, 0); /* * Tell all users of get_user/copy_from_user etc... that the content @@ -522,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) */ set_bit(MMF_UNSTABLE, &mm->flags); - for (vma = mm->mmap ; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) continue; @@ -764,10 +765,8 @@ static void mark_oom_victim(struct task_struct *tsk) return; /* oom_mm is bound to the signal struct life time. */ - if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) { + if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) mmgrab(tsk->signal->oom_mm); - set_bit(MMF_OOM_VICTIM, &mm->flags); - } /* * Make sure that the task is woken up from uninterruptible sleep diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5abeb349c391..2eb6ad5a650a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -126,13 +126,97 @@ typedef int __bitwise fpi_t; static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) -struct pagesets { - local_lock_t lock; -}; -static DEFINE_PER_CPU(struct pagesets, pagesets) = { - .lock = INIT_LOCAL_LOCK(lock), -}; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) +/* + * On SMP, spin_trylock is sufficient protection. + * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP. + */ +#define pcp_trylock_prepare(flags) do { } while (0) +#define pcp_trylock_finish(flag) do { } while (0) +#else +/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */ +#define pcp_trylock_prepare(flags) local_irq_save(flags) +#define pcp_trylock_finish(flags) local_irq_restore(flags) +#endif + +/* + * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid + * a migration causing the wrong PCP to be locked and remote memory being + * potentially allocated, pin the task to the CPU for the lookup+lock. + * preempt_disable is used on !RT because it is faster than migrate_disable. + * migrate_disable is used on RT because otherwise RT spinlock usage is + * interfered with and a high priority task cannot preempt the allocator. + */ +#ifndef CONFIG_PREEMPT_RT +#define pcpu_task_pin() preempt_disable() +#define pcpu_task_unpin() preempt_enable() +#else +#define pcpu_task_pin() migrate_disable() +#define pcpu_task_unpin() migrate_enable() +#endif + +/* + * Generic helper to lookup and a per-cpu variable with an embedded spinlock. + * Return value should be used with equivalent unlock helper. + */ +#define pcpu_spin_lock(type, member, ptr) \ +({ \ + type *_ret; \ + pcpu_task_pin(); \ + _ret = this_cpu_ptr(ptr); \ + spin_lock(&_ret->member); \ + _ret; \ +}) + +#define pcpu_spin_lock_irqsave(type, member, ptr, flags) \ +({ \ + type *_ret; \ + pcpu_task_pin(); \ + _ret = this_cpu_ptr(ptr); \ + spin_lock_irqsave(&_ret->member, flags); \ + _ret; \ +}) + +#define pcpu_spin_trylock_irqsave(type, member, ptr, flags) \ +({ \ + type *_ret; \ + pcpu_task_pin(); \ + _ret = this_cpu_ptr(ptr); \ + if (!spin_trylock_irqsave(&_ret->member, flags)) { \ + pcpu_task_unpin(); \ + _ret = NULL; \ + } \ + _ret; \ +}) + +#define pcpu_spin_unlock(member, ptr) \ +({ \ + spin_unlock(&ptr->member); \ + pcpu_task_unpin(); \ +}) + +#define pcpu_spin_unlock_irqrestore(member, ptr, flags) \ +({ \ + spin_unlock_irqrestore(&ptr->member, flags); \ + pcpu_task_unpin(); \ +}) + +/* struct per_cpu_pages specific helpers. */ +#define pcp_spin_lock(ptr) \ + pcpu_spin_lock(struct per_cpu_pages, lock, ptr) + +#define pcp_spin_lock_irqsave(ptr, flags) \ + pcpu_spin_lock_irqsave(struct per_cpu_pages, lock, ptr, flags) + +#define pcp_spin_trylock_irqsave(ptr, flags) \ + pcpu_spin_trylock_irqsave(struct per_cpu_pages, lock, ptr, flags) + +#define pcp_spin_unlock(ptr) \ + pcpu_spin_unlock(lock, ptr) + +#define pcp_spin_unlock_irqrestore(ptr, flags) \ + pcpu_spin_unlock_irqrestore(lock, ptr, flags) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -151,13 +235,7 @@ DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ EXPORT_PER_CPU_SYMBOL(_numa_mem_); #endif -/* work_structs for global per-cpu drains */ -struct pcpu_drain { - struct zone *zone; - struct work_struct work; -}; static DEFINE_MUTEX(pcpu_drain_mutex); -static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY volatile unsigned long latent_entropy __latent_entropy; @@ -653,7 +731,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order) #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order > PAGE_ALLOC_COSTLY_ORDER) { VM_BUG_ON(order != pageblock_order); - base = PAGE_ALLOC_COSTLY_ORDER + 1; + return NR_LOWORDER_PCP_LISTS; } #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); @@ -667,7 +745,7 @@ static inline int pindex_to_order(unsigned int pindex) int order = pindex / MIGRATE_PCPTYPES; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (order > PAGE_ALLOC_COSTLY_ORDER) + if (pindex == NR_LOWORDER_PCP_LISTS) order = pageblock_order; #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); @@ -744,6 +822,14 @@ void prep_compound_page(struct page *page, unsigned int order) prep_compound_head(page, order); } +void destroy_large_folio(struct folio *folio) +{ + enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor; + + VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); + compound_page_dtors[dtor](&folio->page); +} + #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; @@ -785,7 +871,7 @@ static inline bool set_page_guard(struct zone *zone, struct page *page, return false; __SetPageGuard(page); - INIT_LIST_HEAD(&page->lru); + INIT_LIST_HEAD(&page->buddy_list); set_page_private(page, order); /* Guard pages are not available for any usage */ __mod_zone_freepage_state(zone, -(1 << order), migratetype); @@ -928,7 +1014,7 @@ static inline void add_to_free_list(struct page *page, struct zone *zone, { struct free_area *area = &zone->free_area[order]; - list_add(&page->lru, &area->free_list[migratetype]); + list_add(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; } @@ -938,7 +1024,7 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, { struct free_area *area = &zone->free_area[order]; - list_add_tail(&page->lru, &area->free_list[migratetype]); + list_add_tail(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; } @@ -952,7 +1038,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone, { struct free_area *area = &zone->free_area[order]; - list_move_tail(&page->lru, &area->free_list[migratetype]); + list_move_tail(&page->buddy_list, &area->free_list[migratetype]); } static inline void del_page_from_free_list(struct page *page, struct zone *zone, @@ -962,7 +1048,7 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone, if (page_reported(page)) __ClearPageReported(page); - list_del(&page->lru); + list_del(&page->buddy_list); __ClearPageBuddy(page); set_page_private(page, 0); zone->free_area[order].nr_free--; @@ -1296,18 +1382,14 @@ static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) PageSkipKASanPoison(page); } -static void kernel_init_free_pages(struct page *page, int numpages) +static void kernel_init_pages(struct page *page, int numpages) { int i; /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); - for (i = 0; i < numpages; i++) { - u8 tag = page_kasan_tag(page + i); - page_kasan_tag_reset(page + i); - clear_highpage(page + i); - page_kasan_tag_set(page + i, tag); - } + for (i = 0; i < numpages; i++) + clear_highpage_kasan_tagged(page + i); kasan_enable_current(); } @@ -1396,7 +1478,7 @@ static __always_inline bool free_pages_prepare(struct page *page, init = false; } if (init) - kernel_init_free_pages(page, 1 << order); + kernel_init_pages(page, 1 << order); /* * arch_free_page() can make the page's contents inaccessible. s390 @@ -1473,10 +1555,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* Ensure requested pindex is drained first. */ pindex = pindex - 1; - /* - * local_lock_irq held so equivalent to spin_lock_irqsave for - * both PREEMPT_RT and non-PREEMPT_RT configurations. - */ + /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */ spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); @@ -1504,11 +1583,11 @@ static void free_pcppages_bulk(struct zone *zone, int count, do { int mt; - page = list_last_entry(list, struct page, lru); + page = list_last_entry(list, struct page, pcp_list); mt = get_pcppage_migratetype(page); /* must delete to avoid corrupting pcp list */ - list_del(&page->lru); + list_del(&page->pcp_list); count -= nr_pages; pcp->count -= nr_pages; @@ -2441,7 +2520,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, } /* If memory is still not initialized, do it now. */ if (init) - kernel_init_free_pages(page, 1 << order); + kernel_init_pages(page, 1 << order); /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON)) SetPageSkipKASanPoison(page); @@ -3044,10 +3123,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, { int i, allocated = 0; - /* - * local_lock_irq held so equivalent to spin_lock_irqsave for - * both PREEMPT_RT and non-PREEMPT_RT configurations. - */ + /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */ spin_lock(&zone->lock); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, @@ -3068,7 +3144,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * for IO devices that can merge IO requests if the physical * pages are ordered properly. */ - list_add_tail(&page->lru, list); + list_add_tail(&page->pcp_list, list); allocated++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, @@ -3091,51 +3167,48 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * Called from the vmstat counter updater to drain pagesets of this * currently executing processor on remote nodes after they have * expired. - * - * Note that this function must be called with the thread pinned to - * a single processor. */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { - unsigned long flags; int to_drain, batch; - local_lock_irqsave(&pagesets.lock, flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); - if (to_drain > 0) + if (to_drain > 0) { + unsigned long flags; + + /* + * free_pcppages_bulk expects IRQs disabled for zone->lock + * so even though pcp->lock is not intended to be IRQ-safe, + * it's needed in this context. + */ + spin_lock_irqsave(&pcp->lock, flags); free_pcppages_bulk(zone, to_drain, pcp, 0); - local_unlock_irqrestore(&pagesets.lock, flags); + spin_unlock_irqrestore(&pcp->lock, flags); + } } #endif /* * Drain pcplists of the indicated processor and zone. - * - * The processor must either be the current processor and the - * thread pinned to the current processor or a processor that - * is not online. */ static void drain_pages_zone(unsigned int cpu, struct zone *zone) { - unsigned long flags; struct per_cpu_pages *pcp; - local_lock_irqsave(&pagesets.lock, flags); - pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - if (pcp->count) - free_pcppages_bulk(zone, pcp->count, pcp, 0); + if (pcp->count) { + unsigned long flags; - local_unlock_irqrestore(&pagesets.lock, flags); + /* See drain_zone_pages on why this is disabling IRQs */ + spin_lock_irqsave(&pcp->lock, flags); + free_pcppages_bulk(zone, pcp->count, pcp, 0); + spin_unlock_irqrestore(&pcp->lock, flags); + } } /* * Drain pcplists of all zones on the indicated processor. - * - * The processor must either be the current processor and the - * thread pinned to the current processor or a processor that - * is not online. */ static void drain_pages(unsigned int cpu) { @@ -3148,9 +3221,6 @@ static void drain_pages(unsigned int cpu) /* * Spill all of this CPU's per-cpu pages back into the buddy allocator. - * - * The CPU has to be pinned. When zone parameter is non-NULL, spill just - * the single zone's pages. */ void drain_local_pages(struct zone *zone) { @@ -3162,24 +3232,6 @@ void drain_local_pages(struct zone *zone) drain_pages(cpu); } -static void drain_local_pages_wq(struct work_struct *work) -{ - struct pcpu_drain *drain; - - drain = container_of(work, struct pcpu_drain, work); - - /* - * drain_all_pages doesn't use proper cpu hotplug protection so - * we can race with cpu offline when the WQ can move this from - * a cpu pinned worker to an unbound one. We can operate on a different - * cpu which is alright but we also have to make sure to not move to - * a different one. - */ - migrate_disable(); - drain_local_pages(drain->zone); - migrate_enable(); -} - /* * The implementation of drain_all_pages(), exposing an extra parameter to * drain on all cpus. @@ -3201,13 +3253,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) static cpumask_t cpus_with_pcps; /* - * Make sure nobody triggers this path before mm_percpu_wq is fully - * initialized. - */ - if (WARN_ON_ONCE(!mm_percpu_wq)) - return; - - /* * Do not drain if one is already in progress unless it's specific to * a zone. Such callers are primarily CMA and memory hotplug and need * the drain to be complete when the call returns. @@ -3256,14 +3301,11 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) } for_each_cpu(cpu, &cpus_with_pcps) { - struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu); - - drain->zone = zone; - INIT_WORK(&drain->work, drain_local_pages_wq); - queue_work_on(cpu, mm_percpu_wq, &drain->work); + if (zone) + drain_pages_zone(cpu, zone); + else + drain_pages(cpu); } - for_each_cpu(cpu, &cpus_with_pcps) - flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work); mutex_unlock(&pcpu_drain_mutex); } @@ -3272,8 +3314,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) * Spill all the per-cpu pages from all CPUs back into the buddy allocator. * * When zone parameter is non-NULL, spill just the single zone's pages. - * - * Note that this can be extremely slow as the draining happens in a workqueue. */ void drain_all_pages(struct zone *zone) { @@ -3318,7 +3358,7 @@ void mark_free_pages(struct zone *zone) for_each_migratetype_order(order, t) { list_for_each_entry(page, - &zone->free_area[order].free_list[t], lru) { + &zone->free_area[order].free_list[t], buddy_list) { unsigned long i; pfn = page_to_pfn(page); @@ -3395,19 +3435,17 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, return min(READ_ONCE(pcp->batch) << 2, high); } -static void free_unref_page_commit(struct page *page, int migratetype, +static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, + struct page *page, int migratetype, unsigned int order) { - struct zone *zone = page_zone(page); - struct per_cpu_pages *pcp; int high; int pindex; bool free_high; __count_vm_event(PGFREE); - pcp = this_cpu_ptr(zone->per_cpu_pageset); pindex = order_to_pindex(migratetype, order); - list_add(&page->lru, &pcp->lists[pindex]); + list_add(&page->pcp_list, &pcp->lists[pindex]); pcp->count += 1 << order; /* @@ -3432,6 +3470,9 @@ static void free_unref_page_commit(struct page *page, int migratetype, void free_unref_page(struct page *page, unsigned int order) { unsigned long flags; + unsigned long __maybe_unused UP_flags; + struct per_cpu_pages *pcp; + struct zone *zone; unsigned long pfn = page_to_pfn(page); int migratetype; @@ -3454,9 +3495,16 @@ void free_unref_page(struct page *page, unsigned int order) migratetype = MIGRATE_MOVABLE; } - local_lock_irqsave(&pagesets.lock, flags); - free_unref_page_commit(page, migratetype, order); - local_unlock_irqrestore(&pagesets.lock, flags); + zone = page_zone(page); + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + if (pcp) { + free_unref_page_commit(zone, pcp, page, migratetype, order); + pcp_spin_unlock_irqrestore(pcp, flags); + } else { + free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); + } + pcp_trylock_finish(UP_flags); } /* @@ -3465,6 +3513,8 @@ void free_unref_page(struct page *page, unsigned int order) void free_unref_page_list(struct list_head *list) { struct page *page, *next; + struct per_cpu_pages *pcp = NULL; + struct zone *locked_zone = NULL; unsigned long flags; int batch_count = 0; int migratetype; @@ -3489,8 +3539,18 @@ void free_unref_page_list(struct list_head *list) } } - local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { + struct zone *zone = page_zone(page); + + /* Different zone, different pcp lock. */ + if (zone != locked_zone) { + if (pcp) + pcp_spin_unlock_irqrestore(pcp, flags); + + locked_zone = zone; + pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags); + } + /* * Non-isolated types over MIGRATE_PCPTYPES get added * to the MIGRATE_MOVABLE pcp list. @@ -3500,19 +3560,21 @@ void free_unref_page_list(struct list_head *list) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(page); - free_unref_page_commit(page, migratetype, 0); + free_unref_page_commit(zone, pcp, page, migratetype, 0); /* * Guard against excessive IRQ disabled times when we get * a large list of pages to free. */ if (++batch_count == SWAP_CLUSTER_MAX) { - local_unlock_irqrestore(&pagesets.lock, flags); + pcp_spin_unlock_irqrestore(pcp, flags); batch_count = 0; - local_lock_irqsave(&pagesets.lock, flags); + pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags); } } - local_unlock_irqrestore(&pagesets.lock, flags); + + if (pcp) + pcp_spin_unlock_irqrestore(pcp, flags); } /* @@ -3637,6 +3699,43 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, #endif } +static __always_inline +struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, + unsigned int order, unsigned int alloc_flags, + int migratetype) +{ + struct page *page; + unsigned long flags; + + do { + page = NULL; + spin_lock_irqsave(&zone->lock, flags); + /* + * order-0 request can reach here when the pcplist is skipped + * due to non-CMA allocation context. HIGHATOMIC area is + * reserved for high-order atomic allocation, so order-0 + * request should skip it. + */ + if (order > 0 && alloc_flags & ALLOC_HARDER) + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (!page) { + page = __rmqueue(zone, order, migratetype, alloc_flags); + if (!page) { + spin_unlock_irqrestore(&zone->lock, flags); + return NULL; + } + } + __mod_zone_freepage_state(zone, -(1 << order), + get_pcppage_migratetype(page)); + spin_unlock_irqrestore(&zone->lock, flags); + } while (check_new_pages(page, order)); + + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + zone_statistics(preferred_zone, zone, 1); + + return page; +} + /* Remove page from the per-cpu list, caller must protect the list */ static inline struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, @@ -3670,8 +3769,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, return NULL; } - page = list_first_entry(list, struct page, lru); - list_del(&page->lru); + page = list_first_entry(list, struct page, pcp_list); + list_del(&page->pcp_list); pcp->count -= 1 << order; } while (check_new_pcp(page, order)); @@ -3688,19 +3787,29 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct list_head *list; struct page *page; unsigned long flags; + unsigned long __maybe_unused UP_flags; - local_lock_irqsave(&pagesets.lock, flags); + /* + * spin_trylock may fail due to a parallel drain. In the future, the + * trylock will also protect against IRQ reentrancy. + */ + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + if (!pcp) { + pcp_trylock_finish(UP_flags); + return NULL; + } /* * On allocation, reduce the number of pages that are batch freed. * See nr_pcp_free() where free_factor is increased for subsequent * frees. */ - pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp->free_factor >>= 1; list = &pcp->lists[order_to_pindex(migratetype, order)]; page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); - local_unlock_irqrestore(&pagesets.lock, flags); + pcp_spin_unlock_irqrestore(pcp, flags); + pcp_trylock_finish(UP_flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone, 1); @@ -3717,9 +3826,14 @@ struct page *rmqueue(struct zone *preferred_zone, gfp_t gfp_flags, unsigned int alloc_flags, int migratetype) { - unsigned long flags; struct page *page; + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); + if (likely(pcp_allowed_order(order))) { /* * MIGRATE_MOVABLE pcplist could have the pages on CMA area and @@ -3729,53 +3843,23 @@ struct page *rmqueue(struct zone *preferred_zone, migratetype != MIGRATE_MOVABLE) { page = rmqueue_pcplist(preferred_zone, zone, order, gfp_flags, migratetype, alloc_flags); - goto out; + if (likely(page)) + goto out; } } - /* - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with __GFP_NOFAIL. - */ - WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); - - do { - page = NULL; - spin_lock_irqsave(&zone->lock, flags); - /* - * order-0 request can reach here when the pcplist is skipped - * due to non-CMA allocation context. HIGHATOMIC area is - * reserved for high-order atomic allocation, so order-0 - * request should skip it. - */ - if (order > 0 && alloc_flags & ALLOC_HARDER) - page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); - if (!page) { - page = __rmqueue(zone, order, migratetype, alloc_flags); - if (!page) - goto failed; - } - __mod_zone_freepage_state(zone, -(1 << order), - get_pcppage_migratetype(page)); - spin_unlock_irqrestore(&zone->lock, flags); - } while (check_new_pages(page, order)); - - __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone, 1); + page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags, + migratetype); out: /* Separate test+clear to avoid unnecessary atomics */ - if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { + if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); wakeup_kswapd(zone, 0, 0, zone_idx(zone)); } VM_BUG_ON_PAGE(page && bad_range(zone, page), page); return page; - -failed: - spin_unlock_irqrestore(&zone->lock, flags); - return NULL; } #ifdef CONFIG_FAIL_PAGE_ALLOC @@ -3980,12 +4064,12 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, free_pages)) return true; /* - * Ignore watermark boosting for GFP_ATOMIC order-0 allocations + * Ignore watermark boosting for GFP_HIGH order-0 allocations * when checking the min watermark. The min watermark is the * point where boosting is ignored so that kswapd is woken up * when below the low watermark. */ - if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost + if (unlikely(!order && (alloc_flags & ALLOC_HARDER) && z->watermark_boost && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { mark = z->_watermark[WMARK_MIN]; return __zone_watermark_ok(z, order, mark, highest_zoneidx, @@ -4720,12 +4804,12 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (unless __GFP_NOMEMALLOC) and ALLOC_HIGH. */ alloc_flags |= (__force int) (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); - if (gfp_mask & __GFP_ATOMIC) { + if (gfp_mask & __GFP_HIGH) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. @@ -4918,14 +5002,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; int reserve_flags; - /* - * We also sanity check to catch abuse of atomic reserves being used by - * callers that are not in atomic context. - */ - if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == - (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) - gfp_mask &= ~__GFP_ATOMIC; - retry_cpuset: compaction_retries = 0; no_progress_loops = 0; @@ -5245,6 +5321,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, { struct page *page; unsigned long flags; + unsigned long __maybe_unused UP_flags; struct zone *zone; struct zoneref *z; struct per_cpu_pages *pcp; @@ -5325,11 +5402,14 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, if (unlikely(!zone)) goto failed; + /* Is a parallel drain in progress? */ + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + if (!pcp) + goto failed_irq; + /* Attempt the batch allocation */ - local_lock_irqsave(&pagesets.lock, flags); - pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; - while (nr_populated < nr_pages) { /* Skip existing pages */ @@ -5342,8 +5422,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, pcp, pcp_list); if (unlikely(!page)) { /* Try and allocate at least one page */ - if (!nr_account) + if (!nr_account) { + pcp_spin_unlock_irqrestore(pcp, flags); goto failed_irq; + } break; } nr_account++; @@ -5356,7 +5438,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } - local_unlock_irqrestore(&pagesets.lock, flags); + pcp_spin_unlock_irqrestore(pcp, flags); + pcp_trylock_finish(UP_flags); __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); @@ -5365,7 +5448,7 @@ out: return nr_populated; failed_irq: - local_unlock_irqrestore(&pagesets.lock, flags); + pcp_trylock_finish(UP_flags); failed: page = __alloc_pages(gfp, 0, preferred_nid, nodemask); @@ -5796,14 +5879,14 @@ long si_mem_available(void) /* * Estimate the amount of memory available for userspace allocations, - * without causing swapping. + * without causing swapping or OOM. */ available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will - * start swapping. Assume at least half of the page cache, or the - * low watermark worth of cache, needs to stay. + * start swapping or thrashing. Assume at least half of the page + * cache, or the low watermark worth of cache, needs to stay. */ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; pagecache -= min(pagecache / 2, wmark_low); @@ -6690,13 +6773,18 @@ static void __ref memmap_init_compound(struct page *head, set_page_count(page, 0); /* - * The first tail page stores compound_mapcount_ptr() and - * compound_order() and the second tail page stores - * compound_pincount_ptr(). Call prep_compound_head() after - * the first and second tail pages have been initialized to - * not have the data overwritten. + * The first tail page stores compound_mapcount_ptr(), + * compound_order() and compound_pincount_ptr(). Call + * prep_compound_head() after the first tail page have + * been initialized to not have the data overwritten. + * + * Note the idea to make this right after we initialize + * the offending tail pages is trying to take advantage + * of the likelihood of those tail struct pages being + * cached given that we will read them right after in + * prep_compound_head(). */ - if (pfn == head_pfn + 2) + if (unlikely(pfn == head_pfn + 1)) prep_compound_head(head, order); } } @@ -7005,6 +7093,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta memset(pcp, 0, sizeof(*pcp)); memset(pzstats, 0, sizeof(*pzstats)); + spin_lock_init(&pcp->lock); for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) INIT_LIST_HEAD(&pcp->lists[pindex]); diff --git a/mm/page_io.c b/mm/page_io.c index 68318134dc92..f75ebbc95ee6 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -222,13 +222,14 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) struct cgroup_subsys_state *css; struct mem_cgroup *memcg; + rcu_read_lock(); memcg = page_memcg(page); if (!memcg) - return; + goto out; - rcu_read_lock(); css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys); bio_associate_blkg_from_css(bio, css); +out: rcu_read_unlock(); } #else diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index c10f839fc410..e971a467fcdf 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -243,7 +243,7 @@ restart: * cleared *pmd but not decremented compound_mapcount(). */ if ((pvmw->flags & PVMW_SYNC) && - transparent_hugepage_active(vma) && + transhuge_vma_suitable(vma, pvmw->address) && (pvmw->nr_pages >= HPAGE_PMD_NR)) { spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 9b3db11a4d1d..53e5c145fcce 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -456,7 +456,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } else { /* inside vma */ walk.vma = vma; next = min(end, vma->vm_end); - vma = vma->vm_next; + vma = find_vma(mm, vma->vm_end); err = walk_page_test(start, next, &walk); if (err > 0) { diff --git a/mm/rmap.c b/mm/rmap.c index f22d7578ac02..edc06c52bc82 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1537,6 +1537,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, PageAnonExclusive(subpage); if (folio_test_hugetlb(folio)) { + bool anon = folio_test_anon(folio); + /* * The try_to_unmap() is only passed a hugetlb page * in the case where the hugetlb page is poisoned. @@ -1551,31 +1553,28 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ flush_cache_range(vma, range.start, range.end); - if (!folio_test_anon(folio)) { + /* + * To call huge_pmd_unshare, i_mmap_rwsem must be + * held in write mode. Caller needs to explicitly + * do this outside rmap routines. + */ + VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED)); + if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + flush_tlb_range(vma, range.start, range.end); + mmu_notifier_invalidate_range(mm, range.start, + range.end); + /* - * To call huge_pmd_unshare, i_mmap_rwsem must be - * held in write mode. Caller needs to explicitly - * do this outside rmap routines. + * The ref count of the PMD page was dropped + * which is part of the way map counting + * is done for shared PMDs. Return 'true' + * here. When there is no other sharing, + * huge_pmd_unshare returns false and we will + * unmap the actual page and drop map count + * to zero. */ - VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); - - if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { - flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, range.start, - range.end); - - /* - * The ref count of the PMD page was dropped - * which is part of the way map counting - * is done for shared PMDs. Return 'true' - * here. When there is no other sharing, - * huge_pmd_unshare returns false and we will - * unmap the actual page and drop map count - * to zero. - */ - page_vma_mapped_walk_done(&pvmw); - break; - } + page_vma_mapped_walk_done(&pvmw); + break; } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { @@ -1619,9 +1618,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); - set_huge_swap_pte_at(mm, address, - pvmw.pte, pteval, - vma_mmu_pagesize(vma)); + set_huge_pte_at(mm, address, pvmw.pte, pteval); } else { dec_mm_counter(mm, mm_counter(&folio->page)); set_pte_at(mm, address, pvmw.pte, pteval); @@ -1921,6 +1918,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, PageAnonExclusive(subpage); if (folio_test_hugetlb(folio)) { + bool anon = folio_test_anon(folio); + /* * huge_pmd_unshare may unmap an entire PMD page. * There is no way of knowing exactly which PMDs may @@ -1930,31 +1929,28 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ flush_cache_range(vma, range.start, range.end); - if (!folio_test_anon(folio)) { + /* + * To call huge_pmd_unshare, i_mmap_rwsem must be + * held in write mode. Caller needs to explicitly + * do this outside rmap routines. + */ + VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED)); + if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + flush_tlb_range(vma, range.start, range.end); + mmu_notifier_invalidate_range(mm, range.start, + range.end); + /* - * To call huge_pmd_unshare, i_mmap_rwsem must be - * held in write mode. Caller needs to explicitly - * do this outside rmap routines. + * The ref count of the PMD page was dropped + * which is part of the way map counting + * is done for shared PMDs. Return 'true' + * here. When there is no other sharing, + * huge_pmd_unshare returns false and we will + * unmap the actual page and drop map count + * to zero. */ - VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); - - if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { - flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, range.start, - range.end); - - /* - * The ref count of the PMD page was dropped - * which is part of the way map counting - * is done for shared PMDs. Return 'true' - * here. When there is no other sharing, - * huge_pmd_unshare returns false and we will - * unmap the actual page and drop map count - * to zero. - */ - page_vma_mapped_walk_done(&pvmw); - break; - } + page_vma_mapped_walk_done(&pvmw); + break; } /* Nuke the hugetlb page table entry */ @@ -1972,7 +1968,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (folio_is_zone_device(folio)) { + if (folio_is_device_private(folio)) { unsigned long pfn = folio_pfn(folio); swp_entry_t entry; pte_t swp_pte; @@ -2013,9 +2009,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); - set_huge_swap_pte_at(mm, address, - pvmw.pte, pteval, - vma_mmu_pagesize(vma)); + set_huge_pte_at(mm, address, pvmw.pte, pteval); } else { dec_mm_counter(mm, mm_counter(&folio->page)); set_pte_at(mm, address, pvmw.pte, pteval); @@ -2083,8 +2077,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); if (folio_test_hugetlb(folio)) - set_huge_swap_pte_at(mm, address, pvmw.pte, - swp_pte, vma_mmu_pagesize(vma)); + set_huge_pte_at(mm, address, pvmw.pte, swp_pte); else set_pte_at(mm, address, pvmw.pte, swp_pte); trace_set_migration_pte(address, pte_val(swp_pte), @@ -2138,7 +2131,8 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags) TTU_SYNC))) return; - if (folio_is_zone_device(folio) && !folio_is_device_private(folio)) + if (folio_is_zone_device(folio) && + (!folio_is_device_private(folio) && !folio_is_device_coherent(folio))) return; /* diff --git a/mm/shmem.c b/mm/shmem.c index 133c67057d41..7331ed1be014 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1690,7 +1690,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, return; folio_wait_writeback(folio); - delete_from_swap_cache(&folio->page); + delete_from_swap_cache(folio); spin_lock_irq(&info->lock); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't @@ -1788,7 +1788,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (sgp == SGP_WRITE) folio_mark_accessed(folio); - delete_from_swap_cache(&folio->page); + delete_from_swap_cache(folio); folio_mark_dirty(folio); swap_free(swap); diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c new file mode 100644 index 000000000000..e5b40c43221d --- /dev/null +++ b/mm/shrinker_debug.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/idr.h> +#include <linux/slab.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/shrinker.h> +#include <linux/memcontrol.h> + +/* defined in vmscan.c */ +extern struct rw_semaphore shrinker_rwsem; +extern struct list_head shrinker_list; + +static DEFINE_IDA(shrinker_debugfs_ida); +static struct dentry *shrinker_debugfs_root; + +static unsigned long shrinker_count_objects(struct shrinker *shrinker, + struct mem_cgroup *memcg, + unsigned long *count_per_node) +{ + unsigned long nr, total = 0; + int nid; + + for_each_node(nid) { + if (nid == 0 || (shrinker->flags & SHRINKER_NUMA_AWARE)) { + struct shrink_control sc = { + .gfp_mask = GFP_KERNEL, + .nid = nid, + .memcg = memcg, + }; + + nr = shrinker->count_objects(shrinker, &sc); + if (nr == SHRINK_EMPTY) + nr = 0; + } else { + nr = 0; + } + + count_per_node[nid] = nr; + total += nr; + } + + return total; +} + +static int shrinker_debugfs_count_show(struct seq_file *m, void *v) +{ + struct shrinker *shrinker = m->private; + unsigned long *count_per_node; + struct mem_cgroup *memcg; + unsigned long total; + bool memcg_aware; + int ret, nid; + + count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); + if (!count_per_node) + return -ENOMEM; + + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + kfree(count_per_node); + return ret; + } + rcu_read_lock(); + + memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + if (memcg && !mem_cgroup_online(memcg)) + continue; + + total = shrinker_count_objects(shrinker, + memcg_aware ? memcg : NULL, + count_per_node); + if (total) { + seq_printf(m, "%lu", mem_cgroup_ino(memcg)); + for_each_node(nid) + seq_printf(m, " %lu", count_per_node[nid]); + seq_putc(m, '\n'); + } + + if (!memcg_aware) { + mem_cgroup_iter_break(NULL, memcg); + break; + } + + if (signal_pending(current)) { + mem_cgroup_iter_break(NULL, memcg); + ret = -EINTR; + break; + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); + + rcu_read_unlock(); + up_read(&shrinker_rwsem); + + kfree(count_per_node); + return ret; +} +DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count); + +static int shrinker_debugfs_scan_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return nonseekable_open(inode, file); +} + +static ssize_t shrinker_debugfs_scan_write(struct file *file, + const char __user *buf, + size_t size, loff_t *pos) +{ + struct shrinker *shrinker = file->private_data; + unsigned long nr_to_scan = 0, ino, read_len; + struct shrink_control sc = { + .gfp_mask = GFP_KERNEL, + }; + struct mem_cgroup *memcg = NULL; + int nid; + char kbuf[72]; + ssize_t ret; + + read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1); + if (copy_from_user(kbuf, buf, read_len)) + return -EFAULT; + kbuf[read_len] = '\0'; + + if (sscanf(kbuf, "%lu %d %lu", &ino, &nid, &nr_to_scan) != 3) + return -EINVAL; + + if (nid < 0 || nid >= nr_node_ids) + return -EINVAL; + + if (nr_to_scan == 0) + return size; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + memcg = mem_cgroup_get_from_ino(ino); + if (!memcg || IS_ERR(memcg)) + return -ENOENT; + + if (!mem_cgroup_online(memcg)) { + mem_cgroup_put(memcg); + return -ENOENT; + } + } else if (ino != 0) { + return -EINVAL; + } + + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + mem_cgroup_put(memcg); + return ret; + } + + sc.nid = nid; + sc.memcg = memcg; + sc.nr_to_scan = nr_to_scan; + sc.nr_scanned = nr_to_scan; + + shrinker->scan_objects(shrinker, &sc); + + up_read(&shrinker_rwsem); + mem_cgroup_put(memcg); + + return size; +} + +static const struct file_operations shrinker_debugfs_scan_fops = { + .owner = THIS_MODULE, + .open = shrinker_debugfs_scan_open, + .write = shrinker_debugfs_scan_write, +}; + +int shrinker_debugfs_add(struct shrinker *shrinker) +{ + struct dentry *entry; + char buf[128]; + int id; + + lockdep_assert_held(&shrinker_rwsem); + + /* debugfs isn't initialized yet, add debugfs entries later. */ + if (!shrinker_debugfs_root) + return 0; + + id = ida_alloc(&shrinker_debugfs_ida, GFP_KERNEL); + if (id < 0) + return id; + shrinker->debugfs_id = id; + + snprintf(buf, sizeof(buf), "%s-%d", shrinker->name, id); + + /* create debugfs entry */ + entry = debugfs_create_dir(buf, shrinker_debugfs_root); + if (IS_ERR(entry)) { + ida_free(&shrinker_debugfs_ida, id); + return PTR_ERR(entry); + } + shrinker->debugfs_entry = entry; + + debugfs_create_file("count", 0220, entry, shrinker, + &shrinker_debugfs_count_fops); + debugfs_create_file("scan", 0440, entry, shrinker, + &shrinker_debugfs_scan_fops); + return 0; +} + +int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) +{ + struct dentry *entry; + char buf[128]; + const char *new, *old; + va_list ap; + int ret = 0; + + va_start(ap, fmt); + new = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + + if (!new) + return -ENOMEM; + + down_write(&shrinker_rwsem); + + old = shrinker->name; + shrinker->name = new; + + if (shrinker->debugfs_entry) { + snprintf(buf, sizeof(buf), "%s-%d", shrinker->name, + shrinker->debugfs_id); + + entry = debugfs_rename(shrinker_debugfs_root, + shrinker->debugfs_entry, + shrinker_debugfs_root, buf); + if (IS_ERR(entry)) + ret = PTR_ERR(entry); + else + shrinker->debugfs_entry = entry; + } + + up_write(&shrinker_rwsem); + + kfree_const(old); + + return ret; +} +EXPORT_SYMBOL(shrinker_debugfs_rename); + +void shrinker_debugfs_remove(struct shrinker *shrinker) +{ + lockdep_assert_held(&shrinker_rwsem); + + kfree_const(shrinker->name); + + if (!shrinker->debugfs_entry) + return; + + debugfs_remove_recursive(shrinker->debugfs_entry); + ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id); +} + +static int __init shrinker_debugfs_init(void) +{ + struct shrinker *shrinker; + struct dentry *dentry; + int ret = 0; + + dentry = debugfs_create_dir("shrinker", NULL); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + shrinker_debugfs_root = dentry; + + /* Create debugfs entries for shrinkers registered at boot */ + down_write(&shrinker_rwsem); + list_for_each_entry(shrinker, &shrinker_list, list) + if (!shrinker->debugfs_entry) { + ret = shrinker_debugfs_add(shrinker); + if (ret) + break; + } + up_write(&shrinker_rwsem); + + return ret; +} +late_initcall(shrinker_debugfs_init); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index d8bb32fd58fc..5f0ed4717ed0 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -556,7 +556,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, } else { /* * When a PTE/PMD entry is freed from the init_mm - * there's a a free_pages() call to this page allocated + * there's a free_pages() call to this page allocated * above. Thus this get_page() is paired with the * put_page_testzero() on the freeing path. * This can only called by certain ZONE_DEVICE path, diff --git a/mm/sparse.c b/mm/sparse.c index cb3bfae64036..e5a8a3a0edd7 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -281,7 +281,7 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p { unsigned long coded_mem_map = (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); - BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT)); + BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT); BUG_ON(coded_mem_map & ~SECTION_MAP_MASK); return coded_mem_map; } diff --git a/mm/swap.c b/mm/swap.c index 275a4ea1bc66..ae256e333713 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -46,30 +46,30 @@ /* How many pages do we try to swap or page in/out together? */ int page_cluster; -/* Protecting only lru_rotate.pvec which requires disabling interrupts */ +/* Protecting only lru_rotate.fbatch which requires disabling interrupts */ struct lru_rotate { local_lock_t lock; - struct pagevec pvec; + struct folio_batch fbatch; }; static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { .lock = INIT_LOCAL_LOCK(lock), }; /* - * The following struct pagevec are grouped together because they are protected + * The following folio batches are grouped together because they are protected * by disabling preemption (and interrupts remain enabled). */ -struct lru_pvecs { +struct cpu_fbatches { local_lock_t lock; - struct pagevec lru_add; - struct pagevec lru_deactivate_file; - struct pagevec lru_deactivate; - struct pagevec lru_lazyfree; + struct folio_batch lru_add; + struct folio_batch lru_deactivate_file; + struct folio_batch lru_deactivate; + struct folio_batch lru_lazyfree; #ifdef CONFIG_SMP - struct pagevec activate_page; + struct folio_batch activate; #endif }; -static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { +static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), }; @@ -77,36 +77,35 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { * This path almost never happens for VM activity - pages are normally freed * via pagevecs. But it gets used by networking - and for compound pages. */ -static void __page_cache_release(struct page *page) +static void __page_cache_release(struct folio *folio) { - if (PageLRU(page)) { - struct folio *folio = page_folio(page); + if (folio_test_lru(folio)) { struct lruvec *lruvec; unsigned long flags; lruvec = folio_lruvec_lock_irqsave(folio, &flags); - del_page_from_lru_list(page, lruvec); - __clear_page_lru_flags(page); - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_del_folio(lruvec, folio); + __folio_clear_lru_flags(folio); + lruvec_unlock_irqrestore(lruvec, flags); } - /* See comment on PageMlocked in release_pages() */ - if (unlikely(PageMlocked(page))) { - int nr_pages = thp_nr_pages(page); + /* See comment on folio_test_mlocked in release_pages() */ + if (unlikely(folio_test_mlocked(folio))) { + long nr_pages = folio_nr_pages(folio); - __ClearPageMlocked(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + __folio_clear_mlocked(folio); + zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages); count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); } } -static void __put_single_page(struct page *page) +static void __folio_put_small(struct folio *folio) { - __page_cache_release(page); - mem_cgroup_uncharge(page_folio(page)); - free_unref_page(page, 0); + __page_cache_release(folio); + mem_cgroup_uncharge(folio); + free_unref_page(&folio->page, 0); } -static void __put_compound_page(struct page *page) +static void __folio_put_large(struct folio *folio) { /* * __page_cache_release() is supposed to be called for thp, not for @@ -114,21 +113,21 @@ static void __put_compound_page(struct page *page) * (it's never listed to any LRU lists) and no memcg routines should * be called for hugetlb (it has a separate hugetlb_cgroup.) */ - if (!PageHuge(page)) - __page_cache_release(page); - destroy_compound_page(page); + if (!folio_test_hugetlb(folio)) + __page_cache_release(folio); + destroy_large_folio(folio); } -void __put_page(struct page *page) +void __folio_put(struct folio *folio) { - if (unlikely(is_zone_device_page(page))) - free_zone_device_page(page); - else if (unlikely(PageCompound(page))) - __put_compound_page(page); + if (unlikely(folio_is_zone_device(folio))) + free_zone_device_page(&folio->page); + else if (unlikely(folio_test_large(folio))) + __folio_put_large(folio); else - __put_single_page(page); + __folio_put_small(folio); } -EXPORT_SYMBOL(__put_page); +EXPORT_SYMBOL(__folio_put); /** * put_pages_list() - release a list of pages @@ -138,19 +137,19 @@ EXPORT_SYMBOL(__put_page); */ void put_pages_list(struct list_head *pages) { - struct page *page, *next; + struct folio *folio, *next; - list_for_each_entry_safe(page, next, pages, lru) { - if (!put_page_testzero(page)) { - list_del(&page->lru); + list_for_each_entry_safe(folio, next, pages, lru) { + if (!folio_put_testzero(folio)) { + list_del(&folio->lru); continue; } - if (PageHead(page)) { - list_del(&page->lru); - __put_compound_page(page); + if (folio_test_large(folio)) { + list_del(&folio->lru); + __folio_put_large(folio); continue; } - /* Cannot be PageLRU because it's passed to us using the lru */ + /* LRU flag must be clear because it's passed using the lru */ } free_unref_page_list(pages); @@ -188,36 +187,84 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, } EXPORT_SYMBOL_GPL(get_kernel_pages); -static void pagevec_lru_move_fn(struct pagevec *pvec, - void (*move_fn)(struct page *page, struct lruvec *lruvec)) +typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); + +static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) +{ + int was_unevictable = folio_test_clear_unevictable(folio); + long nr_pages = folio_nr_pages(folio); + + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + + /* + * Is an smp_mb__after_atomic() still required here, before + * folio_evictable() tests the mlocked flag, to rule out the possibility + * of stranding an evictable folio on an unevictable LRU? I think + * not, because __munlock_page() only clears the mlocked flag + * while the LRU lock is held. + * + * (That is not true of __page_cache_release(), and not necessarily + * true of release_pages(): but those only clear the mlocked flag after + * folio_put_testzero() has excluded any other users of the folio.) + */ + if (folio_evictable(folio)) { + if (was_unevictable) + __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); + } else { + folio_clear_active(folio); + folio_set_unevictable(folio); + /* + * folio->mlock_count = !!folio_test_mlocked(folio)? + * But that leaves __mlock_page() in doubt whether another + * actor has already counted the mlock or not. Err on the + * safe side, underestimate, let page reclaim fix it, rather + * than leaving a page on the unevictable LRU indefinitely. + */ + folio->mlock_count = 0; + if (!was_unevictable) + __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); + } + + lruvec_add_folio(lruvec, folio); + trace_mm_lru_insertion(folio); +} + +static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) { int i; struct lruvec *lruvec = NULL; unsigned long flags = 0; - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - struct folio *folio = page_folio(page); + for (i = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; - /* block memcg migration during page moving between lru */ - if (!TestClearPageLRU(page)) + /* block memcg migration while the folio moves between lru */ + if (move_fn != lru_add_fn && !folio_test_clear_lru(folio)) continue; lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); - (*move_fn)(page, lruvec); + move_fn(lruvec, folio); - SetPageLRU(page); + folio_set_lru(folio); } + if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); - release_pages(pvec->pages, pvec->nr); - pagevec_reinit(pvec); + lruvec_unlock_irqrestore(lruvec, flags); + folios_put(fbatch->folios, folio_batch_count(fbatch)); + folio_batch_init(fbatch); } -static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec) +static void folio_batch_add_and_move(struct folio_batch *fbatch, + struct folio *folio, move_fn_t move_fn) { - struct folio *folio = page_folio(page); + if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) && + !lru_cache_disabled()) + return; + folio_batch_move_lru(fbatch, move_fn); +} +static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio) +{ if (!folio_test_unevictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_active(folio); @@ -226,18 +273,6 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec) } } -/* return true if pagevec needs to drain */ -static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page) -{ - bool ret = false; - - if (!pagevec_add(pvec, page) || PageCompound(page) || - lru_cache_disabled()) - ret = true; - - return ret; -} - /* * Writeback is about to end against a folio which has been marked for * immediate reclaim. If it still appears to be reclaimable, move it @@ -249,14 +284,13 @@ void folio_rotate_reclaimable(struct folio *folio) { if (!folio_test_locked(folio) && !folio_test_dirty(folio) && !folio_test_unevictable(folio) && folio_test_lru(folio)) { - struct pagevec *pvec; + struct folio_batch *fbatch; unsigned long flags; folio_get(folio); local_lock_irqsave(&lru_rotate.lock, flags); - pvec = this_cpu_ptr(&lru_rotate.pvec); - if (pagevec_add_and_need_flush(pvec, &folio->page)) - pagevec_lru_move_fn(pvec, pagevec_move_tail_fn); + fbatch = this_cpu_ptr(&lru_rotate.fbatch); + folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } } @@ -303,11 +337,16 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) void lru_note_cost_folio(struct folio *folio) { + WARN_ON_ONCE(!rcu_read_lock_held()); + /* + * The rcu read lock is held by the caller, so we do not need to + * care about the lruvec returned by folio_lruvec() being released. + */ lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), folio_nr_pages(folio)); } -static void __folio_activate(struct folio *folio, struct lruvec *lruvec) +static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio) { if (!folio_test_active(folio) && !folio_test_unevictable(folio)) { long nr_pages = folio_nr_pages(folio); @@ -324,41 +363,30 @@ static void __folio_activate(struct folio *folio, struct lruvec *lruvec) } #ifdef CONFIG_SMP -static void __activate_page(struct page *page, struct lruvec *lruvec) +static void folio_activate_drain(int cpu) { - return __folio_activate(page_folio(page), lruvec); -} + struct folio_batch *fbatch = &per_cpu(cpu_fbatches.activate, cpu); -static void activate_page_drain(int cpu) -{ - struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu); - - if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, __activate_page); -} - -static bool need_activate_page_drain(int cpu) -{ - return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; + if (folio_batch_count(fbatch)) + folio_batch_move_lru(fbatch, folio_activate_fn); } static void folio_activate(struct folio *folio) { if (folio_test_lru(folio) && !folio_test_active(folio) && !folio_test_unevictable(folio)) { - struct pagevec *pvec; + struct folio_batch *fbatch; folio_get(folio); - local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.activate_page); - if (pagevec_add_and_need_flush(pvec, &folio->page)) - pagevec_lru_move_fn(pvec, __activate_page); - local_unlock(&lru_pvecs.lock); + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.activate); + folio_batch_add_and_move(fbatch, folio, folio_activate_fn); + local_unlock(&cpu_fbatches.lock); } } #else -static inline void activate_page_drain(int cpu) +static inline void folio_activate_drain(int cpu) { } @@ -368,8 +396,8 @@ static void folio_activate(struct folio *folio) if (folio_test_clear_lru(folio)) { lruvec = folio_lruvec_lock_irq(folio); - __folio_activate(folio, lruvec); - unlock_page_lruvec_irq(lruvec); + folio_activate_fn(lruvec, folio); + lruvec_unlock_irq(lruvec); folio_set_lru(folio); } } @@ -377,32 +405,32 @@ static void folio_activate(struct folio *folio) static void __lru_cache_activate_folio(struct folio *folio) { - struct pagevec *pvec; + struct folio_batch *fbatch; int i; - local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_add); + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); /* - * Search backwards on the optimistic assumption that the page being - * activated has just been added to this pagevec. Note that only - * the local pagevec is examined as a !PageLRU page could be in the + * Search backwards on the optimistic assumption that the folio being + * activated has just been added to this batch. Note that only + * the local batch is examined as a !LRU folio could be in the * process of being released, reclaimed, migrated or on a remote - * pagevec that is currently being drained. Furthermore, marking - * a remote pagevec's page PageActive potentially hits a race where - * a page is marked PageActive just after it is added to the inactive + * batch that is currently being drained. Furthermore, marking + * a remote batch's folio active potentially hits a race where + * a folio is marked active just after it is added to the inactive * list causing accounting errors and BUG_ON checks to trigger. */ - for (i = pagevec_count(pvec) - 1; i >= 0; i--) { - struct page *pagevec_page = pvec->pages[i]; + for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) { + struct folio *batch_folio = fbatch->folios[i]; - if (pagevec_page == &folio->page) { + if (batch_folio == folio) { folio_set_active(folio); break; } } - local_unlock(&lru_pvecs.lock); + local_unlock(&cpu_fbatches.lock); } /* @@ -427,9 +455,9 @@ void folio_mark_accessed(struct folio *folio) */ } else if (!folio_test_active(folio)) { /* - * If the page is on the LRU, queue it for activation via - * lru_pvecs.activate_page. Otherwise, assume the page is on a - * pagevec, mark it active and it'll be moved to the active + * If the folio is on the LRU, queue it for activation via + * cpu_fbatches.activate. Otherwise, assume the folio is in a + * folio_batch, mark it active and it'll be moved to the active * LRU on the next drain. */ if (folio_test_lru(folio)) @@ -450,22 +478,22 @@ EXPORT_SYMBOL(folio_mark_accessed); * * Queue the folio for addition to the LRU. The decision on whether * to add the page to the [in]active [file|anon] list is deferred until the - * pagevec is drained. This gives a chance for the caller of folio_add_lru() + * folio_batch is drained. This gives a chance for the caller of folio_add_lru() * have the folio added to the active list using folio_mark_accessed(). */ void folio_add_lru(struct folio *folio) { - struct pagevec *pvec; + struct folio_batch *fbatch; - VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); + VM_BUG_ON_FOLIO(folio_test_active(folio) && + folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); folio_get(folio); - local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_add); - if (pagevec_add_and_need_flush(pvec, &folio->page)) - __pagevec_lru_add(pvec); - local_unlock(&lru_pvecs.lock); + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); + folio_batch_add_and_move(fbatch, folio, lru_add_fn); + local_unlock(&cpu_fbatches.lock); } EXPORT_SYMBOL(folio_add_lru); @@ -489,56 +517,57 @@ void lru_cache_add_inactive_or_unevictable(struct page *page, } /* - * If the page can not be invalidated, it is moved to the + * If the folio cannot be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the * head of the list, rather than the tail, to give the flusher * threads some time to write it out, as this is much more * effective than the single-page writeout from reclaim. * - * If the page isn't page_mapped and dirty/writeback, the page - * could reclaim asap using PG_reclaim. + * If the folio isn't mapped and dirty/writeback, the folio + * could be reclaimed asap using the reclaim flag. * - * 1. active, mapped page -> none - * 2. active, dirty/writeback page -> inactive, head, PG_reclaim - * 3. inactive, mapped page -> none - * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim + * 1. active, mapped folio -> none + * 2. active, dirty/writeback folio -> inactive, head, reclaim + * 3. inactive, mapped folio -> none + * 4. inactive, dirty/writeback folio -> inactive, head, reclaim * 5. inactive, clean -> inactive, tail * 6. Others -> none * - * In 4, why it moves inactive's head, the VM expects the page would - * be write it out by flusher threads as this is much more effective + * In 4, it moves to the head of the inactive list so the folio is + * written out by flusher threads as this is much more efficient * than the single-page writeout from reclaim. */ -static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) +static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio) { - bool active = PageActive(page); - int nr_pages = thp_nr_pages(page); + bool active = folio_test_active(folio); + long nr_pages = folio_nr_pages(folio); - if (PageUnevictable(page)) + if (folio_test_unevictable(folio)) return; - /* Some processes are using the page */ - if (page_mapped(page)) + /* Some processes are using the folio */ + if (folio_mapped(folio)) return; - del_page_from_lru_list(page, lruvec); - ClearPageActive(page); - ClearPageReferenced(page); + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + folio_clear_referenced(folio); - if (PageWriteback(page) || PageDirty(page)) { + if (folio_test_writeback(folio) || folio_test_dirty(folio)) { /* - * PG_reclaim could be raced with end_page_writeback - * It can make readahead confusing. But race window - * is _really_ small and it's non-critical problem. + * Setting the reclaim flag could race with + * folio_end_writeback() and confuse readahead. But the + * race window is _really_ small and it's not a critical + * problem. */ - add_page_to_lru_list(page, lruvec); - SetPageReclaim(page); + lruvec_add_folio(lruvec, folio); + folio_set_reclaim(folio); } else { /* - * The page's writeback ends up during pagevec - * We move that page into tail of inactive. + * The folio's writeback ended while it was in the batch. + * We move that folio to the tail of the inactive list. */ - add_page_to_lru_list_tail(page, lruvec); + lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, nr_pages); } @@ -549,15 +578,15 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) } } -static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) +static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio) { - if (PageActive(page) && !PageUnevictable(page)) { - int nr_pages = thp_nr_pages(page); + if (folio_test_active(folio) && !folio_test_unevictable(folio)) { + long nr_pages = folio_nr_pages(folio); - del_page_from_lru_list(page, lruvec); - ClearPageActive(page); - ClearPageReferenced(page); - add_page_to_lru_list(page, lruvec); + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + folio_clear_referenced(folio); + lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, @@ -565,22 +594,22 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) } } -static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec) +static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio) { - if (PageAnon(page) && PageSwapBacked(page) && - !PageSwapCache(page) && !PageUnevictable(page)) { - int nr_pages = thp_nr_pages(page); + if (folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { + long nr_pages = folio_nr_pages(folio); - del_page_from_lru_list(page, lruvec); - ClearPageActive(page); - ClearPageReferenced(page); + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + folio_clear_referenced(folio); /* - * Lazyfree pages are clean anonymous pages. They have - * PG_swapbacked flag cleared, to distinguish them from normal - * anonymous pages + * Lazyfree folios are clean anonymous folios. They have + * the swapbacked flag cleared, to distinguish them from normal + * anonymous folios */ - ClearPageSwapBacked(page); - add_page_to_lru_list(page, lruvec); + folio_clear_swapbacked(folio); + lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, @@ -589,71 +618,67 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec) } /* - * Drain pages out of the cpu's pagevecs. + * Drain pages out of the cpu's folio_batch. * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. */ void lru_add_drain_cpu(int cpu) { - struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu); + struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); + struct folio_batch *fbatch = &fbatches->lru_add; - if (pagevec_count(pvec)) - __pagevec_lru_add(pvec); + if (folio_batch_count(fbatch)) + folio_batch_move_lru(fbatch, lru_add_fn); - pvec = &per_cpu(lru_rotate.pvec, cpu); + fbatch = &per_cpu(lru_rotate.fbatch, cpu); /* Disabling interrupts below acts as a compiler barrier. */ - if (data_race(pagevec_count(pvec))) { + if (data_race(folio_batch_count(fbatch))) { unsigned long flags; /* No harm done if a racing interrupt already did this */ local_lock_irqsave(&lru_rotate.lock, flags); - pagevec_lru_move_fn(pvec, pagevec_move_tail_fn); + folio_batch_move_lru(fbatch, lru_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } - pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu); - if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); + fbatch = &fbatches->lru_deactivate_file; + if (folio_batch_count(fbatch)) + folio_batch_move_lru(fbatch, lru_deactivate_file_fn); - pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu); - if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn); + fbatch = &fbatches->lru_deactivate; + if (folio_batch_count(fbatch)) + folio_batch_move_lru(fbatch, lru_deactivate_fn); - pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu); - if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_lazyfree_fn); + fbatch = &fbatches->lru_lazyfree; + if (folio_batch_count(fbatch)) + folio_batch_move_lru(fbatch, lru_lazyfree_fn); - activate_page_drain(cpu); + folio_activate_drain(cpu); } /** - * deactivate_file_folio() - Forcefully deactivate a file folio. + * deactivate_file_folio() - Deactivate a file folio. * @folio: Folio to deactivate. * * This function hints to the VM that @folio is a good reclaim candidate, * for example if its invalidation fails due to the folio being dirty * or under writeback. * - * Context: Caller holds a reference on the page. + * Context: Caller holds a reference on the folio. */ void deactivate_file_folio(struct folio *folio) { - struct pagevec *pvec; + struct folio_batch *fbatch; - /* - * In a workload with many unevictable pages such as mprotect, - * unevictable folio deactivation for accelerating reclaim is pointless. - */ + /* Deactivating an unevictable folio will not accelerate reclaim */ if (folio_test_unevictable(folio)) return; folio_get(folio); - local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); - - if (pagevec_add_and_need_flush(pvec, &folio->page)) - pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); - local_unlock(&lru_pvecs.lock); + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file); + folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn); + local_unlock(&cpu_fbatches.lock); } /* @@ -666,15 +691,17 @@ void deactivate_file_folio(struct folio *folio) */ void deactivate_page(struct page *page) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec; - - local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate); - get_page(page); - if (pagevec_add_and_need_flush(pvec, page)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn); - local_unlock(&lru_pvecs.lock); + struct folio *folio = page_folio(page); + + if (folio_test_lru(folio) && folio_test_active(folio) && + !folio_test_unevictable(folio)) { + struct folio_batch *fbatch; + + folio_get(folio); + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate); + folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn); + local_unlock(&cpu_fbatches.lock); } } @@ -687,24 +714,26 @@ void deactivate_page(struct page *page) */ void mark_page_lazyfree(struct page *page) { - if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && - !PageSwapCache(page) && !PageUnevictable(page)) { - struct pagevec *pvec; - - local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree); - get_page(page); - if (pagevec_add_and_need_flush(pvec, page)) - pagevec_lru_move_fn(pvec, lru_lazyfree_fn); - local_unlock(&lru_pvecs.lock); + struct folio *folio = page_folio(page); + + if (folio_test_lru(folio) && folio_test_anon(folio) && + folio_test_swapbacked(folio) && !folio_test_swapcache(folio) && + !folio_test_unevictable(folio)) { + struct folio_batch *fbatch; + + folio_get(folio); + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree); + folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn); + local_unlock(&cpu_fbatches.lock); } } void lru_add_drain(void) { - local_lock(&lru_pvecs.lock); + local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); - local_unlock(&lru_pvecs.lock); + local_unlock(&cpu_fbatches.lock); mlock_page_drain_local(); } @@ -716,19 +745,19 @@ void lru_add_drain(void) */ static void lru_add_and_bh_lrus_drain(void) { - local_lock(&lru_pvecs.lock); + local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); - local_unlock(&lru_pvecs.lock); + local_unlock(&cpu_fbatches.lock); invalidate_bh_lrus_cpu(); mlock_page_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) { - local_lock(&lru_pvecs.lock); + local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); - local_unlock(&lru_pvecs.lock); + local_unlock(&cpu_fbatches.lock); mlock_page_drain_local(); } @@ -741,6 +770,21 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) lru_add_and_bh_lrus_drain(); } +static bool cpu_needs_drain(unsigned int cpu) +{ + struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); + + /* Check these in order of likelihood that they're not zero */ + return folio_batch_count(&fbatches->lru_add) || + data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) || + folio_batch_count(&fbatches->lru_deactivate_file) || + folio_batch_count(&fbatches->lru_deactivate) || + folio_batch_count(&fbatches->lru_lazyfree) || + folio_batch_count(&fbatches->activate) || + need_mlock_page_drain(cpu) || + has_bh_in_lru(cpu, NULL); +} + /* * Doesn't need any cpu hotplug locking because we do rely on per-cpu * kworkers being shut down before our page_alloc_cpu_dead callback is @@ -773,8 +817,9 @@ static inline void __lru_add_drain_all(bool force_all_cpus) return; /* - * Guarantee pagevec counter stores visible by this CPU are visible to - * other CPUs before loading the current drain generation. + * Guarantee folio_batch counter stores visible by this CPU + * are visible to other CPUs before loading the current drain + * generation. */ smp_mb(); @@ -800,8 +845,9 @@ static inline void __lru_add_drain_all(bool force_all_cpus) * (D) Increment global generation number * * Pairs with smp_load_acquire() at (B), outside of the critical - * section. Use a full memory barrier to guarantee that the new global - * drain generation number is stored before loading pagevec counters. + * section. Use a full memory barrier to guarantee that the + * new global drain generation number is stored before loading + * folio_batch counters. * * This pairing must be done here, before the for_each_online_cpu loop * below which drains the page vectors. @@ -823,14 +869,7 @@ static inline void __lru_add_drain_all(bool force_all_cpus) for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); - if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || - data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) || - pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || - pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || - pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || - need_activate_page_drain(cpu) || - need_mlock_page_drain(cpu) || - has_bh_in_lru(cpu, NULL)) { + if (cpu_needs_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); @@ -906,8 +945,7 @@ void release_pages(struct page **pages, int nr) unsigned int lock_batch; for (i = 0; i < nr; i++) { - struct page *page = pages[i]; - struct folio *folio = page_folio(page); + struct folio *folio = page_folio(pages[i]); /* * Make sure the IRQ-safe lock-holding time does not get @@ -915,39 +953,38 @@ void release_pages(struct page **pages, int nr) * same lruvec. The lock is held only if lruvec != NULL. */ if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); lruvec = NULL; } - page = &folio->page; - if (is_huge_zero_page(page)) + if (is_huge_zero_page(&folio->page)) continue; - if (is_zone_device_page(page)) { + if (folio_is_zone_device(folio)) { if (lruvec) { - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); lruvec = NULL; } - if (put_devmap_managed_page(page)) + if (put_devmap_managed_page(&folio->page)) continue; - if (put_page_testzero(page)) - free_zone_device_page(page); + if (folio_put_testzero(folio)) + free_zone_device_page(&folio->page); continue; } - if (!put_page_testzero(page)) + if (!folio_put_testzero(folio)) continue; - if (PageCompound(page)) { + if (folio_test_large(folio)) { if (lruvec) { - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); lruvec = NULL; } - __put_compound_page(page); + __folio_put_large(folio); continue; } - if (PageLRU(page)) { + if (folio_test_lru(folio)) { struct lruvec *prev_lruvec = lruvec; lruvec = folio_lruvec_relock_irqsave(folio, lruvec, @@ -955,8 +992,8 @@ void release_pages(struct page **pages, int nr) if (prev_lruvec != lruvec) lock_batch = 0; - del_page_from_lru_list(page, lruvec); - __clear_page_lru_flags(page); + lruvec_del_folio(lruvec, folio); + __folio_clear_lru_flags(folio); } /* @@ -965,16 +1002,16 @@ void release_pages(struct page **pages, int nr) * found set here. This does not indicate a problem, unless * "unevictable_pgs_cleared" appears worryingly large. */ - if (unlikely(PageMlocked(page))) { - __ClearPageMlocked(page); - dec_zone_page_state(page, NR_MLOCK); + if (unlikely(folio_test_mlocked(folio))) { + __folio_clear_mlocked(folio); + zone_stat_sub_folio(folio, NR_MLOCK); count_vm_event(UNEVICTABLE_PGCLEARED); } - list_add(&page->lru, &pages_to_free); + list_add(&folio->lru, &pages_to_free); } if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); mem_cgroup_uncharge_list(&pages_to_free); free_unref_page_list(&pages_to_free); @@ -987,8 +1024,8 @@ EXPORT_SYMBOL(release_pages); * OK from a correctness point of view but is inefficient - those pages may be * cache-warm and we want to give them back to the page allocator ASAP. * - * So __pagevec_release() will drain those queues here. __pagevec_lru_add() - * and __pagevec_lru_add_active() call release_pages() directly to avoid + * So __pagevec_release() will drain those queues here. + * folio_batch_move_lru() calls folios_put() directly to avoid * mutual recursion. */ void __pagevec_release(struct pagevec *pvec) @@ -1002,69 +1039,6 @@ void __pagevec_release(struct pagevec *pvec) } EXPORT_SYMBOL(__pagevec_release); -static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) -{ - int was_unevictable = folio_test_clear_unevictable(folio); - long nr_pages = folio_nr_pages(folio); - - VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - - folio_set_lru(folio); - /* - * Is an smp_mb__after_atomic() still required here, before - * folio_evictable() tests PageMlocked, to rule out the possibility - * of stranding an evictable folio on an unevictable LRU? I think - * not, because __munlock_page() only clears PageMlocked while the LRU - * lock is held. - * - * (That is not true of __page_cache_release(), and not necessarily - * true of release_pages(): but those only clear PageMlocked after - * put_page_testzero() has excluded any other users of the page.) - */ - if (folio_evictable(folio)) { - if (was_unevictable) - __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); - } else { - folio_clear_active(folio); - folio_set_unevictable(folio); - /* - * folio->mlock_count = !!folio_test_mlocked(folio)? - * But that leaves __mlock_page() in doubt whether another - * actor has already counted the mlock or not. Err on the - * safe side, underestimate, let page reclaim fix it, rather - * than leaving a page on the unevictable LRU indefinitely. - */ - folio->mlock_count = 0; - if (!was_unevictable) - __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); - } - - lruvec_add_folio(lruvec, folio); - trace_mm_lru_insertion(folio); -} - -/* - * Add the passed pages to the LRU, then drop the caller's refcount - * on them. Reinitialises the caller's pagevec. - */ -void __pagevec_lru_add(struct pagevec *pvec) -{ - int i; - struct lruvec *lruvec = NULL; - unsigned long flags = 0; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct folio *folio = page_folio(pvec->pages[i]); - - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); - __pagevec_lru_add_fn(folio, lruvec); - } - if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); - release_pages(pvec->pages, pvec->nr); - pagevec_reinit(pvec); -} - /** * folio_batch_remove_exceptionals() - Prune non-folios from a batch. * @fbatch: The batch to prune diff --git a/mm/swap.h b/mm/swap.h index 0193797b0c92..17936e068c1c 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -36,12 +36,11 @@ bool add_to_swap(struct folio *folio); void *get_shadow_from_swap_cache(swp_entry_t entry); int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp, void **shadowp); -void __delete_from_swap_cache(struct page *page, +void __delete_from_swap_cache(struct folio *folio, swp_entry_t entry, void *shadow); -void delete_from_swap_cache(struct page *page); +void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); -void free_swap_cache(struct page *page); struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); @@ -61,9 +60,9 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); -static inline unsigned int page_swap_flags(struct page *page) +static inline unsigned int folio_swap_flags(struct folio *folio) { - return page_swap_info(page)->flags; + return page_swap_info(&folio->page)->flags; } #else /* CONFIG_SWAP */ struct swap_iocb; @@ -81,10 +80,6 @@ static inline struct address_space *swap_address_space(swp_entry_t entry) return NULL; } -static inline void free_swap_cache(struct page *page) -{ -} - static inline void show_swap_cache_info(void) { } @@ -135,12 +130,12 @@ static inline int add_to_swap_cache(struct page *page, swp_entry_t entry, return -1; } -static inline void __delete_from_swap_cache(struct page *page, +static inline void __delete_from_swap_cache(struct folio *folio, swp_entry_t entry, void *shadow) { } -static inline void delete_from_swap_cache(struct page *page) +static inline void delete_from_swap_cache(struct folio *folio) { } @@ -149,7 +144,7 @@ static inline void clear_shadow_from_swap_cache(int type, unsigned long begin, { } -static inline unsigned int page_swap_flags(struct page *page) +static inline unsigned int folio_swap_flags(struct folio *folio) { return 0; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 0a2021fc55ad..e166051566f4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -59,24 +59,11 @@ static bool enable_vma_readahead __read_mostly = true; #define GET_SWAP_RA_VAL(vma) \ (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) -#define INC_CACHE_INFO(x) data_race(swap_cache_info.x++) -#define ADD_CACHE_INFO(x, nr) data_race(swap_cache_info.x += (nr)) - -static struct { - unsigned long add_total; - unsigned long del_total; - unsigned long find_success; - unsigned long find_total; -} swap_cache_info; - static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); - printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", - swap_cache_info.add_total, swap_cache_info.del_total, - swap_cache_info.find_success, swap_cache_info.find_total); printk("Free swap = %ldkB\n", get_nr_swap_pages() << (PAGE_SHIFT - 10)); printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); @@ -133,7 +120,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, address_space->nrpages += nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_lruvec_page_state(page, NR_SWAPCACHE, nr); - ADD_CACHE_INFO(add_total, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); @@ -147,32 +133,32 @@ unlock: } /* - * This must be called only on pages that have + * This must be called only on folios that have * been verified to be in the swap cache. */ -void __delete_from_swap_cache(struct page *page, +void __delete_from_swap_cache(struct folio *folio, swp_entry_t entry, void *shadow) { struct address_space *address_space = swap_address_space(entry); - int i, nr = thp_nr_pages(page); + int i; + long nr = folio_nr_pages(folio); pgoff_t idx = swp_offset(entry); XA_STATE(xas, &address_space->i_pages, idx); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageSwapCache(page), page); - VM_BUG_ON_PAGE(PageWriteback(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); + VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, shadow); - VM_BUG_ON_PAGE(entry != page, entry); - set_page_private(page + i, 0); + VM_BUG_ON_FOLIO(entry != folio, folio); + set_page_private(folio_page(folio, i), 0); xas_next(&xas); } - ClearPageSwapCache(page); + folio_clear_swapcache(folio); address_space->nrpages -= nr; - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); - __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr); - ADD_CACHE_INFO(del_total, nr); + __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); } /** @@ -237,22 +223,22 @@ fail: } /* - * This must be called only on pages that have + * This must be called only on folios that have * been verified to be in the swap cache and locked. - * It will never put the page into the free list, - * the caller has a reference on the page. + * It will never put the folio into the free list, + * the caller has a reference on the folio. */ -void delete_from_swap_cache(struct page *page) +void delete_from_swap_cache(struct folio *folio) { - swp_entry_t entry = { .val = page_private(page) }; + swp_entry_t entry = folio_swap_entry(folio); struct address_space *address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); - __delete_from_swap_cache(page, entry, NULL); + __delete_from_swap_cache(folio, entry, NULL); xa_unlock_irq(&address_space->i_pages); - put_swap_page(page, entry); - page_ref_sub(page, thp_nr_pages(page)); + put_swap_page(&folio->page, entry); + folio_ref_sub(folio, folio_nr_pages(folio)); } void clear_shadow_from_swap_cache(int type, unsigned long begin, @@ -348,12 +334,10 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, page = find_get_page(swap_address_space(entry), swp_offset(entry)); put_swap_device(si); - INC_CACHE_INFO(find_total); if (page) { bool vma_ra = swap_use_vma_readahead(); bool readahead; - INC_CACHE_INFO(find_success); /* * At the moment, we don't support PG_readahead for anon THP * so let's bail out rather than confusing the readahead stat. diff --git a/mm/swapfile.c b/mm/swapfile.c index a2e66d855b19..5c8681a3f1d9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -695,7 +695,7 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, si->lowest_bit += nr_entries; if (end == si->highest_bit) WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); - si->inuse_pages += nr_entries; + WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries); if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; @@ -732,7 +732,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, add_to_avail_list(si); } atomic_long_add(nr_entries, &nr_swap_pages); - si->inuse_pages -= nr_entries; + WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; @@ -1568,16 +1568,15 @@ unlock_out: return ret; } -static bool page_swapped(struct page *page) +static bool folio_swapped(struct folio *folio) { swp_entry_t entry; struct swap_info_struct *si; - if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) - return page_swapcount(page) != 0; + if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) + return page_swapcount(&folio->page) != 0; - page = compound_head(page); - entry.val = page_private(page); + entry = folio_swap_entry(folio); si = _swap_info_get(entry); if (si) return swap_page_trans_huge_swapped(si, entry); @@ -1590,13 +1589,14 @@ static bool page_swapped(struct page *page) */ int try_to_free_swap(struct page *page) { - VM_BUG_ON_PAGE(!PageLocked(page), page); + struct folio *folio = page_folio(page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - if (!PageSwapCache(page)) + if (!folio_test_swapcache(folio)) return 0; - if (PageWriteback(page)) + if (folio_test_writeback(folio)) return 0; - if (page_swapped(page)) + if (folio_swapped(folio)) return 0; /* @@ -1617,9 +1617,8 @@ int try_to_free_swap(struct page *page) if (pm_suspended_storage()) return 0; - page = compound_head(page); - delete_from_swap_cache(page); - SetPageDirty(page); + delete_from_swap_cache(folio); + folio_set_dirty(folio); return 1; } @@ -1991,14 +1990,16 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) { struct vm_area_struct *vma; int ret = 0; + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (vma->anon_vma) { ret = unuse_vma(vma, type); if (ret) break; } + cond_resched(); } mmap_read_unlock(mm); @@ -2640,7 +2641,7 @@ static int swap_show(struct seq_file *swap, void *v) } bytes = si->pages << (PAGE_SHIFT - 10); - inuse = si->inuse_pages << (PAGE_SHIFT - 10); + inuse = READ_ONCE(si->inuse_pages) << (PAGE_SHIFT - 10); file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); @@ -3259,7 +3260,7 @@ void si_swapinfo(struct sysinfo *val) struct swap_info_struct *si = swap_info[type]; if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) - nr_to_be_unused += si->inuse_pages; + nr_to_be_unused += READ_ONCE(si->inuse_pages); } val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; diff --git a/mm/util.c b/mm/util.c index c9439c66d8cf..1266a33a49ea 100644 --- a/mm/util.c +++ b/mm/util.c @@ -272,38 +272,6 @@ void *memdup_user_nul(const void __user *src, size_t len) } EXPORT_SYMBOL(memdup_user_nul); -void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev) -{ - struct vm_area_struct *next; - - vma->vm_prev = prev; - if (prev) { - next = prev->vm_next; - prev->vm_next = vma; - } else { - next = mm->mmap; - mm->mmap = vma; - } - vma->vm_next = next; - if (next) - next->vm_prev = vma; -} - -void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma) -{ - struct vm_area_struct *prev, *next; - - next = vma->vm_next; - prev = vma->vm_prev; - if (prev) - prev->vm_next = next; - else - mm->mmap = next; - if (next) - next->vm_prev = prev; -} - /* Check if the vma is being used as a stack by this task */ int vma_is_stack_for_current(struct vm_area_struct *vma) { diff --git a/mm/vmacache.c b/mm/vmacache.c deleted file mode 100644 index 01a6e6688ec1..000000000000 --- a/mm/vmacache.c +++ /dev/null @@ -1,117 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2014 Davidlohr Bueso. - */ -#include <linux/sched/signal.h> -#include <linux/sched/task.h> -#include <linux/mm.h> -#include <linux/vmacache.h> - -/* - * Hash based on the pmd of addr if configured with MMU, which provides a good - * hit rate for workloads with spatial locality. Otherwise, use pages. - */ -#ifdef CONFIG_MMU -#define VMACACHE_SHIFT PMD_SHIFT -#else -#define VMACACHE_SHIFT PAGE_SHIFT -#endif -#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK) - -/* - * This task may be accessing a foreign mm via (for example) - * get_user_pages()->find_vma(). The vmacache is task-local and this - * task's vmacache pertains to a different mm (ie, its own). There is - * nothing we can do here. - * - * Also handle the case where a kernel thread has adopted this mm via - * kthread_use_mm(). That kernel thread's vmacache is not applicable to this mm. - */ -static inline bool vmacache_valid_mm(struct mm_struct *mm) -{ - return current->mm == mm && !(current->flags & PF_KTHREAD); -} - -void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) -{ - if (vmacache_valid_mm(newvma->vm_mm)) - current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma; -} - -static bool vmacache_valid(struct mm_struct *mm) -{ - struct task_struct *curr; - - if (!vmacache_valid_mm(mm)) - return false; - - curr = current; - if (mm->vmacache_seqnum != curr->vmacache.seqnum) { - /* - * First attempt will always be invalid, initialize - * the new cache for this task here. - */ - curr->vmacache.seqnum = mm->vmacache_seqnum; - vmacache_flush(curr); - return false; - } - return true; -} - -struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) -{ - int idx = VMACACHE_HASH(addr); - int i; - - count_vm_vmacache_event(VMACACHE_FIND_CALLS); - - if (!vmacache_valid(mm)) - return NULL; - - for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[idx]; - - if (vma) { -#ifdef CONFIG_DEBUG_VM_VMACACHE - if (WARN_ON_ONCE(vma->vm_mm != mm)) - break; -#endif - if (vma->vm_start <= addr && vma->vm_end > addr) { - count_vm_vmacache_event(VMACACHE_FIND_HITS); - return vma; - } - } - if (++idx == VMACACHE_SIZE) - idx = 0; - } - - return NULL; -} - -#ifndef CONFIG_MMU -struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, - unsigned long start, - unsigned long end) -{ - int idx = VMACACHE_HASH(start); - int i; - - count_vm_vmacache_event(VMACACHE_FIND_CALLS); - - if (!vmacache_valid(mm)) - return NULL; - - for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[idx]; - - if (vma && vma->vm_start == start && vma->vm_end == end) { - count_vm_vmacache_event(VMACACHE_FIND_HITS); - return vma; - } - if (++idx == VMACACHE_SIZE) - idx = 0; - } - - return NULL; -} -#endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5977b178694d..dd6cdb201195 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -815,9 +815,9 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) return va; } -static struct vmap_area *__find_vmap_area(unsigned long addr) +static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) { - struct rb_node *n = vmap_area_root.rb_node; + struct rb_node *n = root->rb_node; addr = (unsigned long)kasan_reset_tag((void *)addr); @@ -910,8 +910,9 @@ get_va_next_sibling(struct rb_node *parent, struct rb_node **link) } static __always_inline void -link_va(struct vmap_area *va, struct rb_root *root, - struct rb_node *parent, struct rb_node **link, struct list_head *head) +__link_va(struct vmap_area *va, struct rb_root *root, + struct rb_node *parent, struct rb_node **link, + struct list_head *head, bool augment) { /* * VA is still not in the list, but we can @@ -925,7 +926,7 @@ link_va(struct vmap_area *va, struct rb_root *root, /* Insert to the rb-tree */ rb_link_node(&va->rb_node, parent, link); - if (root == &free_vmap_area_root) { + if (augment) { /* * Some explanation here. Just perform simple insertion * to the tree. We do not set va->subtree_max_size to @@ -949,21 +950,49 @@ link_va(struct vmap_area *va, struct rb_root *root, } static __always_inline void -unlink_va(struct vmap_area *va, struct rb_root *root) +link_va(struct vmap_area *va, struct rb_root *root, + struct rb_node *parent, struct rb_node **link, + struct list_head *head) +{ + __link_va(va, root, parent, link, head, false); +} + +static __always_inline void +link_va_augment(struct vmap_area *va, struct rb_root *root, + struct rb_node *parent, struct rb_node **link, + struct list_head *head) +{ + __link_va(va, root, parent, link, head, true); +} + +static __always_inline void +__unlink_va(struct vmap_area *va, struct rb_root *root, bool augment) { if (WARN_ON(RB_EMPTY_NODE(&va->rb_node))) return; - if (root == &free_vmap_area_root) + if (augment) rb_erase_augmented(&va->rb_node, root, &free_vmap_area_rb_augment_cb); else rb_erase(&va->rb_node, root); - list_del(&va->list); + list_del_init(&va->list); RB_CLEAR_NODE(&va->rb_node); } +static __always_inline void +unlink_va(struct vmap_area *va, struct rb_root *root) +{ + __unlink_va(va, root, false); +} + +static __always_inline void +unlink_va_augment(struct vmap_area *va, struct rb_root *root) +{ + __unlink_va(va, root, true); +} + #if DEBUG_AUGMENT_PROPAGATE_CHECK /* * Gets called when remove the node and rotate. @@ -1059,7 +1088,7 @@ insert_vmap_area_augment(struct vmap_area *va, link = find_va_links(va, root, NULL, &parent); if (link) { - link_va(va, root, parent, link, head); + link_va_augment(va, root, parent, link, head); augment_tree_propagate_from(va); } } @@ -1076,8 +1105,8 @@ insert_vmap_area_augment(struct vmap_area *va, * ongoing. */ static __always_inline struct vmap_area * -merge_or_add_vmap_area(struct vmap_area *va, - struct rb_root *root, struct list_head *head) +__merge_or_add_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head, bool augment) { struct vmap_area *sibling; struct list_head *next; @@ -1139,7 +1168,7 @@ merge_or_add_vmap_area(struct vmap_area *va, * "normalized" because of rotation operations. */ if (merged) - unlink_va(va, root); + __unlink_va(va, root, augment); sibling->va_end = va->va_end; @@ -1154,16 +1183,23 @@ merge_or_add_vmap_area(struct vmap_area *va, insert: if (!merged) - link_va(va, root, parent, link, head); + __link_va(va, root, parent, link, head, augment); return va; } static __always_inline struct vmap_area * +merge_or_add_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + return __merge_or_add_vmap_area(va, root, head, false); +} + +static __always_inline struct vmap_area * merge_or_add_vmap_area_augment(struct vmap_area *va, struct rb_root *root, struct list_head *head) { - va = merge_or_add_vmap_area(va, root, head); + va = __merge_or_add_vmap_area(va, root, head, true); if (va) augment_tree_propagate_from(va); @@ -1197,15 +1233,15 @@ is_within_this_va(struct vmap_area *va, unsigned long size, * overhead. */ static __always_inline struct vmap_area * -find_vmap_lowest_match(unsigned long size, unsigned long align, - unsigned long vstart, bool adjust_search_size) +find_vmap_lowest_match(struct rb_root *root, unsigned long size, + unsigned long align, unsigned long vstart, bool adjust_search_size) { struct vmap_area *va; struct rb_node *node; unsigned long length; /* Start from the root. */ - node = free_vmap_area_root.rb_node; + node = root->rb_node; /* Adjust the search size for alignment overhead. */ length = adjust_search_size ? size + align - 1 : size; @@ -1333,8 +1369,9 @@ classify_va_fit_type(struct vmap_area *va, } static __always_inline int -adjust_va_to_fit_type(struct vmap_area *va, - unsigned long nva_start_addr, unsigned long size) +adjust_va_to_fit_type(struct rb_root *root, struct list_head *head, + struct vmap_area *va, unsigned long nva_start_addr, + unsigned long size) { struct vmap_area *lva = NULL; enum fit_type type = classify_va_fit_type(va, nva_start_addr, size); @@ -1347,7 +1384,7 @@ adjust_va_to_fit_type(struct vmap_area *va, * V NVA V * |---------------| */ - unlink_va(va, &free_vmap_area_root); + unlink_va_augment(va, root); kmem_cache_free(vmap_area_cachep, va); } else if (type == LE_FIT_TYPE) { /* @@ -1425,8 +1462,7 @@ adjust_va_to_fit_type(struct vmap_area *va, augment_tree_propagate_from(va); if (lva) /* type == NE_FIT_TYPE */ - insert_vmap_area_augment(lva, &va->rb_node, - &free_vmap_area_root, &free_vmap_area_list); + insert_vmap_area_augment(lva, &va->rb_node, root, head); } return 0; @@ -1437,7 +1473,8 @@ adjust_va_to_fit_type(struct vmap_area *va, * Otherwise a vend is returned that indicates failure. */ static __always_inline unsigned long -__alloc_vmap_area(unsigned long size, unsigned long align, +__alloc_vmap_area(struct rb_root *root, struct list_head *head, + unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { bool adjust_search_size = true; @@ -1457,7 +1494,7 @@ __alloc_vmap_area(unsigned long size, unsigned long align, if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size)) adjust_search_size = false; - va = find_vmap_lowest_match(size, align, vstart, adjust_search_size); + va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size); if (unlikely(!va)) return vend; @@ -1471,7 +1508,7 @@ __alloc_vmap_area(unsigned long size, unsigned long align, return vend; /* Update the free vmap_area. */ - ret = adjust_va_to_fit_type(va, nva_start_addr, size); + ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size); if (WARN_ON_ONCE(ret)) return vend; @@ -1562,7 +1599,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, retry: preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); - addr = __alloc_vmap_area(size, align, vstart, vend); + addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, + size, align, vstart, vend); spin_unlock(&free_vmap_area_lock); /* @@ -1796,7 +1834,7 @@ struct vmap_area *find_vmap_area(unsigned long addr) struct vmap_area *va; spin_lock(&vmap_area_lock); - va = __find_vmap_area(addr); + va = __find_vmap_area(addr, &vmap_area_root); spin_unlock(&vmap_area_lock); return va; @@ -2539,7 +2577,7 @@ struct vm_struct *remove_vm_area(const void *addr) might_sleep(); spin_lock(&vmap_area_lock); - va = __find_vmap_area((unsigned long)addr); + va = __find_vmap_area((unsigned long)addr, &vmap_area_root); if (va && va->vm) { struct vm_struct *vm = va->vm; @@ -3161,15 +3199,15 @@ again: /* * Mark the pages as accessible, now that they are mapped. - * The init condition should match the one in post_alloc_hook() - * (except for the should_skip_init() check) to make sure that memory - * is initialized under the same conditions regardless of the enabled - * KASAN mode. + * The condition for setting KASAN_VMALLOC_INIT should complement the + * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check + * to make sure that memory is initialized under the same conditions. * Tag-based KASAN modes only assign tags to normal non-executable * allocations, see __kasan_unpoison_vmalloc(). */ kasan_flags |= KASAN_VMALLOC_VM_ALLOC; - if (!want_init_on_free() && want_init_on_alloc(gfp_mask)) + if (!want_init_on_free() && want_init_on_alloc(gfp_mask) && + (gfp_mask & __GFP_SKIP_ZERO)) kasan_flags |= KASAN_VMALLOC_INIT; /* KASAN_VMALLOC_PROT_NORMAL already set if required. */ area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); @@ -3838,7 +3876,9 @@ retry: /* It is a BUG(), but trigger recovery instead. */ goto recovery; - ret = adjust_va_to_fit_type(va, start, size); + ret = adjust_va_to_fit_type(&free_vmap_area_root, + &free_vmap_area_list, + va, start, size); if (WARN_ON_ONCE(unlikely(ret))) /* It is a BUG(), but trigger recovery instead. */ goto recovery; diff --git a/mm/vmscan.c b/mm/vmscan.c index 04f8671caad9..0070c7fb600a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -26,8 +26,7 @@ #include <linux/file.h> #include <linux/writeback.h> #include <linux/blkdev.h> -#include <linux/buffer_head.h> /* for try_to_release_page(), - buffer_heads_over_limit */ +#include <linux/buffer_head.h> /* for buffer_heads_over_limit */ #include <linux/mm_inline.h> #include <linux/backing-dev.h> #include <linux/rmap.h> @@ -160,17 +159,17 @@ struct scan_control { }; #ifdef ARCH_HAS_PREFETCHW -#define prefetchw_prev_lru_page(_page, _base, _field) \ +#define prefetchw_prev_lru_folio(_folio, _base, _field) \ do { \ - if ((_page)->lru.prev != _base) { \ - struct page *prev; \ + if ((_folio)->lru.prev != _base) { \ + struct folio *prev; \ \ - prev = lru_to_page(&(_page->lru)); \ + prev = lru_to_folio(&(_folio->lru)); \ prefetchw(&prev->_field); \ } \ } while (0) #else -#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) +#define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) #endif /* @@ -190,8 +189,8 @@ static void set_task_reclaim_state(struct task_struct *task, task->reclaim_state = rs; } -static LIST_HEAD(shrinker_list); -static DECLARE_RWSEM(shrinker_rwsem); +LIST_HEAD(shrinker_list); +DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -410,13 +409,9 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) { int i, nid; long nr; - struct mem_cgroup *parent; + struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct shrinker_info *child_info, *parent_info; - parent = parent_mem_cgroup(memcg); - if (!parent) - parent = root_mem_cgroup; - /* Prevent from concurrent shrinker_info expand */ down_read(&shrinker_rwsem); for_each_node(nid) { @@ -608,7 +603,7 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, /* * Add a shrinker callback to be called from the vm. */ -int prealloc_shrinker(struct shrinker *shrinker) +static int __prealloc_shrinker(struct shrinker *shrinker) { unsigned int size; int err; @@ -632,8 +627,36 @@ int prealloc_shrinker(struct shrinker *shrinker) return 0; } +#ifdef CONFIG_SHRINKER_DEBUG +int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + va_list ap; + int err; + + va_start(ap, fmt); + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!shrinker->name) + return -ENOMEM; + + err = __prealloc_shrinker(shrinker); + if (err) + kfree_const(shrinker->name); + + return err; +} +#else +int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + return __prealloc_shrinker(shrinker); +} +#endif + void free_prealloced_shrinker(struct shrinker *shrinker) { +#ifdef CONFIG_SHRINKER_DEBUG + kfree_const(shrinker->name); +#endif if (shrinker->flags & SHRINKER_MEMCG_AWARE) { down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); @@ -650,18 +673,43 @@ void register_shrinker_prepared(struct shrinker *shrinker) down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; + shrinker_debugfs_add(shrinker); up_write(&shrinker_rwsem); } -int register_shrinker(struct shrinker *shrinker) +static int __register_shrinker(struct shrinker *shrinker) { - int err = prealloc_shrinker(shrinker); + int err = __prealloc_shrinker(shrinker); if (err) return err; register_shrinker_prepared(shrinker); return 0; } + +#ifdef CONFIG_SHRINKER_DEBUG +int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + va_list ap; + int err; + + va_start(ap, fmt); + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!shrinker->name) + return -ENOMEM; + + err = __register_shrinker(shrinker); + if (err) + kfree_const(shrinker->name); + return err; +} +#else +int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + return __register_shrinker(shrinker); +} +#endif EXPORT_SYMBOL(register_shrinker); /* @@ -677,6 +725,7 @@ void unregister_shrinker(struct shrinker *shrinker) shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); + shrinker_debugfs_remove(shrinker); up_write(&shrinker_rwsem); kfree(shrinker->nr_deferred); @@ -1276,7 +1325,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, mem_cgroup_swapout(folio, swap); if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - __delete_from_swap_cache(&folio->page, swap, shadow); + __delete_from_swap_cache(folio, swap, shadow); xa_unlock_irq(&mapping->i_pages); put_swap_page(&folio->page, swap); } else { @@ -1519,7 +1568,7 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) * but that will never affect SWP_FS_OPS, so the data_race * is safe. */ - return !data_race(page_swap_flags(&folio->page) & SWP_FS_OPS); + return !data_race(folio_swap_flags(folio) & SWP_FS_OPS); } /* @@ -1556,13 +1605,19 @@ retry: folio = lru_to_folio(page_list); list_del(&folio->lru); + nr_pages = folio_nr_pages(folio); + + if (folio_ref_count(folio) == 1 && + folio_ref_freeze(folio, 1)) { + /* folio was freed from under us. So we are done. */ + goto free_it; + } + if (!folio_trylock(folio)) goto keep; VM_BUG_ON_FOLIO(folio_test_active(folio), folio); - nr_pages = folio_nr_pages(folio); - /* Account the number of base pages */ sc->nr_scanned += nr_pages; @@ -1926,7 +1981,7 @@ free_it: * appear not as the counts should be low */ if (unlikely(folio_test_large(folio))) - destroy_compound_page(&folio->page); + destroy_large_folio(folio); else list_add(&folio->lru, &free_pages); continue; @@ -1987,7 +2042,7 @@ keep: } unsigned int reclaim_clean_pages_from_list(struct zone *zone, - struct list_head *page_list) + struct list_head *folio_list) { struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -1995,16 +2050,16 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, }; struct reclaim_stat stat; unsigned int nr_reclaimed; - struct page *page, *next; - LIST_HEAD(clean_pages); + struct folio *folio, *next; + LIST_HEAD(clean_folios); unsigned int noreclaim_flag; - list_for_each_entry_safe(page, next, page_list, lru) { - if (!PageHuge(page) && page_is_file_lru(page) && - !PageDirty(page) && !__PageMovable(page) && - !PageUnevictable(page)) { - ClearPageActive(page); - list_move(&page->lru, &clean_pages); + list_for_each_entry_safe(folio, next, folio_list, lru) { + if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) && + !folio_test_dirty(folio) && !__folio_test_movable(folio) && + !folio_test_unevictable(folio)) { + folio_clear_active(folio); + list_move(&folio->lru, &clean_folios); } } @@ -2015,11 +2070,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, * change in the future. */ noreclaim_flag = memalloc_noreclaim_save(); - nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, + nr_reclaimed = shrink_page_list(&clean_folios, zone->zone_pgdat, &sc, &stat, true); memalloc_noreclaim_restore(noreclaim_flag); - list_splice(&clean_pages, page_list); + list_splice(&clean_folios, folio_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -(long)nr_reclaimed); /* @@ -2085,72 +2140,72 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; unsigned long skipped = 0; unsigned long scan, total_scan, nr_pages; - LIST_HEAD(pages_skipped); + LIST_HEAD(folios_skipped); total_scan = 0; scan = 0; while (scan < nr_to_scan && !list_empty(src)) { struct list_head *move_to = src; - struct page *page; + struct folio *folio; - page = lru_to_page(src); - prefetchw_prev_lru_page(page, src, flags); + folio = lru_to_folio(src); + prefetchw_prev_lru_folio(folio, src, flags); - nr_pages = compound_nr(page); + nr_pages = folio_nr_pages(folio); total_scan += nr_pages; - if (page_zonenum(page) > sc->reclaim_idx) { - nr_skipped[page_zonenum(page)] += nr_pages; - move_to = &pages_skipped; + if (folio_zonenum(folio) > sc->reclaim_idx) { + nr_skipped[folio_zonenum(folio)] += nr_pages; + move_to = &folios_skipped; goto move; } /* - * Do not count skipped pages because that makes the function - * return with no isolated pages if the LRU mostly contains - * ineligible pages. This causes the VM to not reclaim any - * pages, triggering a premature OOM. - * Account all tail pages of THP. + * Do not count skipped folios because that makes the function + * return with no isolated folios if the LRU mostly contains + * ineligible folios. This causes the VM to not reclaim any + * folios, triggering a premature OOM. + * Account all pages in a folio. */ scan += nr_pages; - if (!PageLRU(page)) + if (!folio_test_lru(folio)) goto move; - if (!sc->may_unmap && page_mapped(page)) + if (!sc->may_unmap && folio_mapped(folio)) goto move; /* - * Be careful not to clear PageLRU until after we're - * sure the page is not being freed elsewhere -- the - * page release code relies on it. + * Be careful not to clear the lru flag until after we're + * sure the folio is not being freed elsewhere -- the + * folio release code relies on it. */ - if (unlikely(!get_page_unless_zero(page))) + if (unlikely(!folio_try_get(folio))) goto move; - if (!TestClearPageLRU(page)) { - /* Another thread is already isolating this page */ - put_page(page); + if (!folio_test_clear_lru(folio)) { + /* Another thread is already isolating this folio */ + folio_put(folio); goto move; } nr_taken += nr_pages; - nr_zone_taken[page_zonenum(page)] += nr_pages; + nr_zone_taken[folio_zonenum(folio)] += nr_pages; move_to = dst; move: - list_move(&page->lru, move_to); + list_move(&folio->lru, move_to); } /* - * Splice any skipped pages to the start of the LRU list. Note that + * Splice any skipped folios to the start of the LRU list. Note that * this disrupts the LRU order when reclaiming for lower zones but * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX - * scanning would soon rescan the same pages to skip and waste lots + * scanning would soon rescan the same folios to skip and waste lots * of cpu cycles. */ - if (!list_empty(&pages_skipped)) { + if (!list_empty(&folios_skipped)) { int zid; - list_splice(&pages_skipped, src); + list_splice(&folios_skipped, src); for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue; @@ -2202,7 +2257,7 @@ int folio_isolate_lru(struct folio *folio) folio_get(folio); lruvec = folio_lruvec_lock_irq(folio); lruvec_del_folio(lruvec, folio); - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); ret = 0; } @@ -2254,71 +2309,71 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, } /* - * move_pages_to_lru() moves pages from private @list to appropriate LRU list. - * On return, @list is reused as a list of pages to be freed by the caller. + * move_pages_to_lru() moves folios from private @list to appropriate LRU list. + * On return, @list is reused as a list of folios to be freed by the caller. * - * Returns the number of pages moved to the given lruvec. + * Returns the number of pages moved to the appropriate LRU list. + * + * Note: The caller must not hold any lruvec lock. */ -static unsigned int move_pages_to_lru(struct lruvec *lruvec, - struct list_head *list) +static unsigned int move_pages_to_lru(struct list_head *list) { int nr_pages, nr_moved = 0; - LIST_HEAD(pages_to_free); - struct page *page; + struct lruvec *lruvec = NULL; + LIST_HEAD(folios_to_free); while (!list_empty(list)) { - page = lru_to_page(list); - VM_BUG_ON_PAGE(PageLRU(page), page); - list_del(&page->lru); - if (unlikely(!page_evictable(page))) { - spin_unlock_irq(&lruvec->lru_lock); - putback_lru_page(page); - spin_lock_irq(&lruvec->lru_lock); + struct folio *folio = lru_to_folio(list); + + lruvec = folio_lruvec_relock_irq(folio, lruvec); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + list_del(&folio->lru); + if (unlikely(!folio_evictable(folio))) { + lruvec_unlock_irq(lruvec); + folio_putback_lru(folio); + lruvec = NULL; continue; } /* - * The SetPageLRU needs to be kept here for list integrity. + * The folio_set_lru needs to be kept here for list integrity. * Otherwise: * #0 move_pages_to_lru #1 release_pages - * if !put_page_testzero - * if (put_page_testzero()) - * !PageLRU //skip lru_lock - * SetPageLRU() - * list_add(&page->lru,) - * list_add(&page->lru,) + * if (!folio_put_testzero()) + * if (folio_put_testzero()) + * !lru //skip lru_lock + * folio_set_lru() + * list_add(&folio->lru,) + * list_add(&folio->lru,) */ - SetPageLRU(page); + folio_set_lru(folio); - if (unlikely(put_page_testzero(page))) { - __clear_page_lru_flags(page); + if (unlikely(folio_put_testzero(folio))) { + __folio_clear_lru_flags(folio); - if (unlikely(PageCompound(page))) { - spin_unlock_irq(&lruvec->lru_lock); - destroy_compound_page(page); - spin_lock_irq(&lruvec->lru_lock); + if (unlikely(folio_test_large(folio))) { + lruvec_unlock_irq(lruvec); + destroy_large_folio(folio); + lruvec = NULL; } else - list_add(&page->lru, &pages_to_free); + list_add(&folio->lru, &folios_to_free); continue; } - /* - * All pages were isolated from the same lruvec (and isolation - * inhibits memcg migration). - */ - VM_BUG_ON_PAGE(!folio_matches_lruvec(page_folio(page), lruvec), page); - add_page_to_lru_list(page, lruvec); - nr_pages = thp_nr_pages(page); + lruvec_add_folio(lruvec, folio); + nr_pages = folio_nr_pages(folio); nr_moved += nr_pages; - if (PageActive(page)) + if (folio_test_active(folio)) workingset_age_nonresident(lruvec, nr_pages); } + if (lruvec) + lruvec_unlock_irq(lruvec); /* * To save our caller's stack, now use input list for pages to free. */ - list_splice(&pages_to_free, list); + list_splice(&folios_to_free, list); return nr_moved; } @@ -2385,16 +2440,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false); - spin_lock_irq(&lruvec->lru_lock); - move_pages_to_lru(lruvec, &page_list); + move_pages_to_lru(&page_list); + local_irq_disable(); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); - spin_unlock_irq(&lruvec->lru_lock); + local_irq_enable(); lru_note_cost(lruvec, file, stat.nr_pageout); mem_cgroup_uncharge_list(&page_list); @@ -2429,21 +2484,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, } /* - * shrink_active_list() moves pages from the active LRU to the inactive LRU. + * shrink_active_list() moves folios from the active LRU to the inactive LRU. * - * We move them the other way if the page is referenced by one or more + * We move them the other way if the folio is referenced by one or more * processes. * - * If the pages are mostly unmapped, the processing is fast and it is + * If the folios are mostly unmapped, the processing is fast and it is * appropriate to hold lru_lock across the whole operation. But if - * the pages are mapped, the processing is slow (folio_referenced()), so - * we should drop lru_lock around each page. It's impossible to balance - * this, so instead we remove the pages from the LRU while processing them. - * It is safe to rely on PG_active against the non-LRU pages in here because - * nobody will play with that bit on a non-LRU page. + * the folios are mapped, the processing is slow (folio_referenced()), so + * we should drop lru_lock around each folio. It's impossible to balance + * this, so instead we remove the folios from the LRU while processing them. + * It is safe to rely on the active flag against the non-LRU folios in here + * because nobody will play with that bit on a non-LRU folio. * - * The downside is that we have to touch page->_refcount against each page. - * But we had to alter page->flags anyway. + * The downside is that we have to touch folio->_refcount against each folio. + * But we had to alter folio->flags anyway. */ static void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, @@ -2453,7 +2508,7 @@ static void shrink_active_list(unsigned long nr_to_scan, unsigned long nr_taken; unsigned long nr_scanned; unsigned long vm_flags; - LIST_HEAD(l_hold); /* The pages which were snipped off */ + LIST_HEAD(l_hold); /* The folios which were snipped off */ LIST_HEAD(l_active); LIST_HEAD(l_inactive); unsigned nr_deactivate, nr_activate; @@ -2478,23 +2533,21 @@ static void shrink_active_list(unsigned long nr_to_scan, while (!list_empty(&l_hold)) { struct folio *folio; - struct page *page; cond_resched(); folio = lru_to_folio(&l_hold); list_del(&folio->lru); - page = &folio->page; - if (unlikely(!page_evictable(page))) { - putback_lru_page(page); + if (unlikely(!folio_evictable(folio))) { + folio_putback_lru(folio); continue; } if (unlikely(buffer_heads_over_limit)) { - if (page_has_private(page) && trylock_page(page)) { - if (page_has_private(page)) - try_to_release_page(page, 0); - unlock_page(page); + if (folio_get_private(folio) && folio_trylock(folio)) { + if (folio_get_private(folio)) + filemap_release_folio(folio, 0); + folio_unlock(folio); } } @@ -2502,41 +2555,39 @@ static void shrink_active_list(unsigned long nr_to_scan, if (folio_referenced(folio, 0, sc->target_mem_cgroup, &vm_flags) != 0) { /* - * Identify referenced, file-backed active pages and + * Identify referenced, file-backed active folios and * give them one more trip around the active list. So * that executable code get better chances to stay in - * memory under moderate memory pressure. Anon pages + * memory under moderate memory pressure. Anon folios * are not likely to be evicted by use-once streaming - * IO, plus JVM can create lots of anon VM_EXEC pages, + * IO, plus JVM can create lots of anon VM_EXEC folios, * so we ignore them here. */ - if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) { - nr_rotated += thp_nr_pages(page); - list_add(&page->lru, &l_active); + if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) { + nr_rotated += folio_nr_pages(folio); + list_add(&folio->lru, &l_active); continue; } } - ClearPageActive(page); /* we are de-activating */ - SetPageWorkingset(page); - list_add(&page->lru, &l_inactive); + folio_clear_active(folio); /* we are de-activating */ + folio_set_workingset(folio); + list_add(&folio->lru, &l_inactive); } /* - * Move pages back to the lru list. + * Move folios back to the lru list. */ - spin_lock_irq(&lruvec->lru_lock); - - nr_activate = move_pages_to_lru(lruvec, &l_active); - nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); - /* Keep all free pages in l_active list */ + nr_activate = move_pages_to_lru(&l_active); + nr_deactivate = move_pages_to_lru(&l_inactive); + /* Keep all free folios in l_active list */ list_splice(&l_inactive, &l_active); + local_irq_disable(); __count_vm_events(PGDEACTIVATE, nr_deactivate); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); - __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&lruvec->lru_lock); + local_irq_enable(); mem_cgroup_uncharge_list(&l_active); free_unref_page_list(&l_active); @@ -2568,34 +2619,33 @@ static unsigned int reclaim_page_list(struct list_head *page_list, return nr_reclaimed; } -unsigned long reclaim_pages(struct list_head *page_list) +unsigned long reclaim_pages(struct list_head *folio_list) { int nid; unsigned int nr_reclaimed = 0; - LIST_HEAD(node_page_list); - struct page *page; + LIST_HEAD(node_folio_list); unsigned int noreclaim_flag; - if (list_empty(page_list)) + if (list_empty(folio_list)) return nr_reclaimed; noreclaim_flag = memalloc_noreclaim_save(); - nid = page_to_nid(lru_to_page(page_list)); + nid = folio_nid(lru_to_folio(folio_list)); do { - page = lru_to_page(page_list); + struct folio *folio = lru_to_folio(folio_list); - if (nid == page_to_nid(page)) { - ClearPageActive(page); - list_move(&page->lru, &node_page_list); + if (nid == folio_nid(folio)) { + folio_clear_active(folio); + list_move(&folio->lru, &node_folio_list); continue; } - nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); - nid = page_to_nid(lru_to_page(page_list)); - } while (!list_empty(page_list)); + nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid)); + nid = folio_nid(lru_to_folio(folio_list)); + } while (!list_empty(folio_list)); - nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); + nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid)); memalloc_noreclaim_restore(noreclaim_flag); @@ -4595,7 +4645,7 @@ void kswapd_run(int nid) /* * Called by memory hotplug when all memory in a node is offlined. Caller must - * hold mem_hotplug_begin/end(). + * be holding mem_hotplug_begin/done(). */ void kswapd_stop(int nid) { @@ -4846,7 +4896,7 @@ void check_move_unevictable_folios(struct folio_batch *fbatch) if (lruvec) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); } else if (pgscanned) { count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 373d2730fcf2..da7e389cf33c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1382,10 +1382,6 @@ const char * const vmstat_text[] = { "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ -#ifdef CONFIG_DEBUG_VM_VMACACHE - "vmacache_find_calls", - "vmacache_find_hits", -#endif #ifdef CONFIG_SWAP "swap_ra", "swap_ra_hit", diff --git a/mm/workingset.c b/mm/workingset.c index 592569a8974c..a5e84862fc86 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -625,7 +625,7 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = prealloc_shrinker(&workingset_shadow_shrinker); + ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow"); if (ret) goto err; ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key, diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 71d6edcbea48..a7bad79cbbed 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2169,7 +2169,8 @@ static int zs_register_shrinker(struct zs_pool *pool) pool->shrinker.batch = 0; pool->shrinker.seeks = DEFAULT_SEEKS; - return register_shrinker(&pool->shrinker); + return register_shrinker(&pool->shrinker, "mm-zspool:%s", + pool->name); } /** |