summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Makefile3
-rw-r--r--mm/compaction.c41
-rw-r--r--mm/damon/Kconfig8
-rw-r--r--mm/damon/Makefile1
-rw-r--r--mm/damon/dbgfs.c79
-rw-r--r--mm/damon/lru_sort.c546
-rw-r--r--mm/damon/ops-common.c42
-rw-r--r--mm/damon/ops-common.h2
-rw-r--r--mm/damon/paddr.c60
-rw-r--r--mm/damon/reclaim.c40
-rw-r--r--mm/damon/sysfs.c69
-rw-r--r--mm/damon/vaddr-test.h36
-rw-r--r--mm/damon/vaddr.c53
-rw-r--r--mm/debug.c14
-rw-r--r--mm/filemap.c18
-rw-r--r--mm/gup.c62
-rw-r--r--mm/huge_memory.c247
-rw-r--r--mm/hugetlb.c143
-rw-r--r--mm/hugetlb_vmemmap.c66
-rw-r--r--mm/init-mm.c4
-rw-r--r--mm/internal.h11
-rw-r--r--mm/kasan/common.c8
-rw-r--r--mm/kasan/hw_tags.c32
-rw-r--r--mm/kasan/kasan.h3
-rw-r--r--mm/kasan/report.c12
-rw-r--r--mm/khugepaged.c243
-rw-r--r--mm/ksm.c24
-rw-r--r--mm/madvise.c16
-rw-r--r--mm/memcontrol.c592
-rw-r--r--mm/memory-failure.c279
-rw-r--r--mm/memory.c52
-rw-r--r--mm/memory_hotplug.c35
-rw-r--r--mm/mempolicy.c62
-rw-r--r--mm/memremap.c10
-rw-r--r--mm/migrate.c47
-rw-r--r--mm/migrate_device.c115
-rw-r--r--mm/mlock.c41
-rw-r--r--mm/mmap.c2265
-rw-r--r--mm/mprotect.c86
-rw-r--r--mm/mremap.c41
-rw-r--r--mm/msync.c2
-rw-r--r--mm/nommu.c249
-rw-r--r--mm/oom_kill.c9
-rw-r--r--mm/page_alloc.c447
-rw-r--r--mm/page_io.c5
-rw-r--r--mm/page_vma_mapped.c2
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/rmap.c106
-rw-r--r--mm/shmem.c4
-rw-r--r--mm/shrinker_debug.c285
-rw-r--r--mm/sparse-vmemmap.c2
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap.c618
-rw-r--r--mm/swap.h19
-rw-r--r--mm/swap_state.c56
-rw-r--r--mm/swapfile.c35
-rw-r--r--mm/util.c32
-rw-r--r--mm/vmacache.c117
-rw-r--r--mm/vmalloc.c108
-rw-r--r--mm/vmscan.c358
-rw-r--r--mm/vmstat.c4
-rw-r--r--mm/workingset.c2
-rw-r--r--mm/zsmalloc.c3
63 files changed, 4885 insertions, 3090 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 6f9ffa968a1a..8083fa85a348 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -52,7 +52,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o percpu.o slab_common.o \
- compaction.o vmacache.o \
+ compaction.o \
interval_tree.o list_lru.o workingset.o \
debug.o gup.o mmap_lock.o $(mmu-y)
@@ -133,3 +133,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
obj-$(CONFIG_IO_MAPPING) += io-mapping.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
+obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
diff --git a/mm/compaction.c b/mm/compaction.c
index a2c53fcf933e..f1dfeb6a4090 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -505,6 +505,25 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
return true;
}
+static struct lruvec *
+compact_folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags,
+ struct compact_control *cc)
+{
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+retry:
+ lruvec = folio_lruvec(folio);
+ compact_lock_irqsave(&lruvec->lru_lock, flags, cc);
+ if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) {
+ spin_unlock_irqrestore(&lruvec->lru_lock, *flags);
+ goto retry;
+ }
+ rcu_read_unlock();
+
+ return lruvec;
+}
+
/*
* Compaction requires the taking of some coarse locks that are potentially
* very heavily contended. The lock should be periodically unlocked to avoid
@@ -831,6 +850,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {
+ struct folio *folio;
if (skip_on_failure && low_pfn >= next_skip_pfn) {
/*
@@ -861,7 +881,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
if (!(low_pfn % COMPACT_CLUSTER_MAX)) {
if (locked) {
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
locked = NULL;
}
@@ -974,7 +994,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
locked = NULL;
}
@@ -1053,18 +1073,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!TestClearPageLRU(page))
goto isolate_fail_put;
- lruvec = folio_lruvec(page_folio(page));
+ folio = page_folio(page);
+ lruvec = folio_lruvec(folio);
/* If we already hold the lock, we can skip some rechecking */
if (lruvec != locked) {
if (locked)
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
- compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
+ lruvec = compact_folio_lruvec_lock_irqsave(folio, &flags, cc);
locked = lruvec;
- lruvec_memcg_debug(lruvec, page_folio(page));
-
/* Try get exclusive access under lock */
if (!skip_updated) {
skip_updated = true;
@@ -1117,7 +1136,7 @@ isolate_success_no_list:
isolate_fail_put:
/* Avoid potential deadlock in freeing page under lru_lock */
if (locked) {
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
locked = NULL;
}
put_page(page);
@@ -1133,7 +1152,7 @@ isolate_fail:
*/
if (nr_isolated) {
if (locked) {
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
locked = NULL;
}
putback_movable_pages(&cc->migratepages);
@@ -1165,7 +1184,7 @@ isolate_fail:
isolate_abort:
if (locked)
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
if (page) {
SetPageLRU(page);
put_page(page);
@@ -3009,7 +3028,7 @@ void kcompactd_run(int nid)
/*
* Called by memory hotplug when all memory in a node is offlined. Caller must
- * hold mem_hotplug_begin/end().
+ * be holding mem_hotplug_begin/done().
*/
void kcompactd_stop(int nid)
{
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 9b559c76d6dd..66265e3a9c65 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -92,4 +92,12 @@ config DAMON_RECLAIM
reclamation under light memory pressure, while the traditional page
scanning-based reclamation is used for heavy pressure.
+config DAMON_LRU_SORT
+ bool "Build DAMON-based LRU-lists sorting (DAMON_LRU_SORT)"
+ depends on DAMON_PADDR
+ help
+ This builds the DAMON-based LRU-lists sorting subsystem. It tries to
+ protect frequently accessed (hot) pages while rarely accessed (cold)
+ pages reclaimed first under memory pressure.
+
endmenu
diff --git a/mm/damon/Makefile b/mm/damon/Makefile
index dbf7190b4144..3e6b8ad73858 100644
--- a/mm/damon/Makefile
+++ b/mm/damon/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o
obj-$(CONFIG_DAMON_SYSFS) += sysfs.o
obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o
obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o
+obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index a0dab8b5e45f..cb8a7e9926a4 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -97,6 +97,31 @@ out:
return ret;
}
+/*
+ * Return corresponding dbgfs' scheme action value (int) for the given
+ * damos_action if the given damos_action value is valid and supported by
+ * dbgfs, negative error code otherwise.
+ */
+static int damos_action_to_dbgfs_scheme_action(enum damos_action action)
+{
+ switch (action) {
+ case DAMOS_WILLNEED:
+ return 0;
+ case DAMOS_COLD:
+ return 1;
+ case DAMOS_PAGEOUT:
+ return 2;
+ case DAMOS_HUGEPAGE:
+ return 3;
+ case DAMOS_NOHUGEPAGE:
+ return 4;
+ case DAMOS_STAT:
+ return 5;
+ default:
+ return -EINVAL;
+ }
+}
+
static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
{
struct damos *s;
@@ -109,7 +134,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
s->min_sz_region, s->max_sz_region,
s->min_nr_accesses, s->max_nr_accesses,
s->min_age_region, s->max_age_region,
- s->action,
+ damos_action_to_dbgfs_scheme_action(s->action),
s->quota.ms, s->quota.sz,
s->quota.reset_interval,
s->quota.weight_sz,
@@ -160,18 +185,27 @@ static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes)
kfree(schemes);
}
-static bool damos_action_valid(int action)
+/*
+ * Return corresponding damos_action for the given dbgfs input for a scheme
+ * action if the input is valid, negative error code otherwise.
+ */
+static enum damos_action dbgfs_scheme_action_to_damos_action(int dbgfs_action)
{
- switch (action) {
- case DAMOS_WILLNEED:
- case DAMOS_COLD:
- case DAMOS_PAGEOUT:
- case DAMOS_HUGEPAGE:
- case DAMOS_NOHUGEPAGE:
- case DAMOS_STAT:
- return true;
+ switch (dbgfs_action) {
+ case 0:
+ return DAMOS_WILLNEED;
+ case 1:
+ return DAMOS_COLD;
+ case 2:
+ return DAMOS_PAGEOUT;
+ case 3:
+ return DAMOS_HUGEPAGE;
+ case 4:
+ return DAMOS_NOHUGEPAGE;
+ case 5:
+ return DAMOS_STAT;
default:
- return false;
+ return -EINVAL;
}
}
@@ -189,7 +223,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
int pos = 0, parsed, ret;
unsigned long min_sz, max_sz;
unsigned int min_nr_a, max_nr_a, min_age, max_age;
- unsigned int action;
+ unsigned int action_input;
+ enum damos_action action;
schemes = kmalloc_array(max_nr_schemes, sizeof(scheme),
GFP_KERNEL);
@@ -204,7 +239,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
ret = sscanf(&str[pos],
"%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n",
&min_sz, &max_sz, &min_nr_a, &max_nr_a,
- &min_age, &max_age, &action, &quota.ms,
+ &min_age, &max_age, &action_input, &quota.ms,
&quota.sz, &quota.reset_interval,
&quota.weight_sz, &quota.weight_nr_accesses,
&quota.weight_age, &wmarks.metric,
@@ -212,7 +247,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
&wmarks.low, &parsed);
if (ret != 18)
break;
- if (!damos_action_valid(action))
+ action = dbgfs_scheme_action_to_damos_action(action_input);
+ if ((int)action < 0)
goto fail;
if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age)
@@ -275,11 +311,6 @@ out:
return ret;
}
-static inline bool target_has_pid(const struct damon_ctx *ctx)
-{
- return ctx->ops.id == DAMON_OPS_VADDR;
-}
-
static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len)
{
struct damon_target *t;
@@ -288,7 +319,7 @@ static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len)
int rc;
damon_for_each_target(t, ctx) {
- if (target_has_pid(ctx))
+ if (damon_target_has_pid(ctx))
/* Show pid numbers to debugfs users */
id = pid_vnr(t->pid);
else
@@ -415,7 +446,7 @@ static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets,
struct damon_target *t, *next;
damon_for_each_target_safe(t, next, ctx) {
- if (target_has_pid(ctx))
+ if (damon_target_has_pid(ctx))
put_pid(t->pid);
damon_destroy_target(t);
}
@@ -425,11 +456,11 @@ static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets,
if (!t) {
damon_for_each_target_safe(t, next, ctx)
damon_destroy_target(t);
- if (target_has_pid(ctx))
+ if (damon_target_has_pid(ctx))
dbgfs_put_pids(pids, nr_targets);
return -ENOMEM;
}
- if (target_has_pid(ctx))
+ if (damon_target_has_pid(ctx))
t->pid = pids[i];
damon_add_target(ctx, t);
}
@@ -722,7 +753,7 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx)
{
struct damon_target *t, *next;
- if (!target_has_pid(ctx))
+ if (!damon_target_has_pid(ctx))
return;
mutex_lock(&ctx->kdamond_lock);
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
new file mode 100644
index 000000000000..c276736a071c
--- /dev/null
+++ b/mm/damon/lru_sort.c
@@ -0,0 +1,546 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON-based LRU-lists Sorting
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#define pr_fmt(fmt) "damon-lru-sort: " fmt
+
+#include <linux/damon.h>
+#include <linux/ioport.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "damon_lru_sort."
+
+/*
+ * Enable or disable DAMON_LRU_SORT.
+ *
+ * You can enable DAMON_LRU_SORT by setting the value of this parameter as
+ * ``Y``. Setting it as ``N`` disables DAMON_LRU_SORT. Note that
+ * DAMON_LRU_SORT could do no real monitoring and LRU-lists sorting due to the
+ * watermarks-based activation condition. Refer to below descriptions for the
+ * watermarks parameter for this.
+ */
+static bool enabled __read_mostly;
+
+/*
+ * Make DAMON_LRU_SORT reads the input parameters again, except ``enabled``.
+ *
+ * Input parameters that updated while DAMON_LRU_SORT is running are not
+ * applied by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT
+ * reads values of parametrs except ``enabled`` again. Once the re-reading is
+ * done, this parameter is set as ``N``. If invalid parameters are found while
+ * the re-reading, DAMON_LRU_SORT will be disabled.
+ */
+static bool commit_inputs __read_mostly;
+module_param(commit_inputs, bool, 0600);
+
+/*
+ * Access frequency threshold for hot memory regions identification in permil.
+ *
+ * If a memory region is accessed in frequency of this or higher,
+ * DAMON_LRU_SORT identifies the region as hot, and mark it as accessed on the
+ * LRU list, so that it could not be reclaimed under memory pressure. 50% by
+ * default.
+ */
+static unsigned long hot_thres_access_freq = 500;
+module_param(hot_thres_access_freq, ulong, 0600);
+
+/*
+ * Time threshold for cold memory regions identification in microseconds.
+ *
+ * If a memory region is not accessed for this or longer time, DAMON_LRU_SORT
+ * identifies the region as cold, and mark it as unaccessed on the LRU list, so
+ * that it could be reclaimed first under memory pressure. 120 seconds by
+ * default.
+ */
+static unsigned long cold_min_age __read_mostly = 120000000;
+module_param(cold_min_age, ulong, 0600);
+
+/*
+ * Limit of time for trying the LRU lists sorting in milliseconds.
+ *
+ * DAMON_LRU_SORT tries to use only up to this time within a time window
+ * (quota_reset_interval_ms) for trying LRU lists sorting. This can be used
+ * for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the
+ * limit is disabled.
+ *
+ * 10 ms by default.
+ */
+static unsigned long quota_ms __read_mostly = 10;
+module_param(quota_ms, ulong, 0600);
+
+/*
+ * The time quota charge reset interval in milliseconds.
+ *
+ * The charge reset interval for the quota of time (quota_ms). That is,
+ * DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms
+ * milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds.
+ *
+ * 1 second by default.
+ */
+static unsigned long quota_reset_interval_ms __read_mostly = 1000;
+module_param(quota_reset_interval_ms, ulong, 0600);
+
+/*
+ * The watermarks check time interval in microseconds.
+ *
+ * Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is
+ * enabled but inactive due to its watermarks rule. 5 seconds by default.
+ */
+static unsigned long wmarks_interval __read_mostly = 5000000;
+module_param(wmarks_interval, ulong, 0600);
+
+/*
+ * Free memory rate (per thousand) for the high watermark.
+ *
+ * If free memory of the system in bytes per thousand bytes is higher than
+ * this, DAMON_LRU_SORT becomes inactive, so it does nothing but periodically
+ * checks the watermarks. 200 (20%) by default.
+ */
+static unsigned long wmarks_high __read_mostly = 200;
+module_param(wmarks_high, ulong, 0600);
+
+/*
+ * Free memory rate (per thousand) for the middle watermark.
+ *
+ * If free memory of the system in bytes per thousand bytes is between this and
+ * the low watermark, DAMON_LRU_SORT becomes active, so starts the monitoring
+ * and the LRU-lists sorting. 150 (15%) by default.
+ */
+static unsigned long wmarks_mid __read_mostly = 150;
+module_param(wmarks_mid, ulong, 0600);
+
+/*
+ * Free memory rate (per thousand) for the low watermark.
+ *
+ * If free memory of the system in bytes per thousand bytes is lower than this,
+ * DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks
+ * the watermarks. 50 (5%) by default.
+ */
+static unsigned long wmarks_low __read_mostly = 50;
+module_param(wmarks_low, ulong, 0600);
+
+/*
+ * Sampling interval for the monitoring in microseconds.
+ *
+ * The sampling interval of DAMON for the hot/cold memory monitoring. Please
+ * refer to the DAMON documentation for more detail. 5 ms by default.
+ */
+static unsigned long sample_interval __read_mostly = 5000;
+module_param(sample_interval, ulong, 0600);
+
+/*
+ * Aggregation interval for the monitoring in microseconds.
+ *
+ * The aggregation interval of DAMON for the hot/cold memory monitoring.
+ * Please refer to the DAMON documentation for more detail. 100 ms by default.
+ */
+static unsigned long aggr_interval __read_mostly = 100000;
+module_param(aggr_interval, ulong, 0600);
+
+/*
+ * Minimum number of monitoring regions.
+ *
+ * The minimal number of monitoring regions of DAMON for the hot/cold memory
+ * monitoring. This can be used to set lower-bound of the monitoring quality.
+ * But, setting this too high could result in increased monitoring overhead.
+ * Please refer to the DAMON documentation for more detail. 10 by default.
+ */
+static unsigned long min_nr_regions __read_mostly = 10;
+module_param(min_nr_regions, ulong, 0600);
+
+/*
+ * Maximum number of monitoring regions.
+ *
+ * The maximum number of monitoring regions of DAMON for the hot/cold memory
+ * monitoring. This can be used to set upper-bound of the monitoring overhead.
+ * However, setting this too low could result in bad monitoring quality.
+ * Please refer to the DAMON documentation for more detail. 1000 by default.
+ */
+static unsigned long max_nr_regions __read_mostly = 1000;
+module_param(max_nr_regions, ulong, 0600);
+
+/*
+ * Start of the target memory region in physical address.
+ *
+ * The start physical address of memory region that DAMON_LRU_SORT will do work
+ * against. By default, biggest System RAM is used as the region.
+ */
+static unsigned long monitor_region_start __read_mostly;
+module_param(monitor_region_start, ulong, 0600);
+
+/*
+ * End of the target memory region in physical address.
+ *
+ * The end physical address of memory region that DAMON_LRU_SORT will do work
+ * against. By default, biggest System RAM is used as the region.
+ */
+static unsigned long monitor_region_end __read_mostly;
+module_param(monitor_region_end, ulong, 0600);
+
+/*
+ * PID of the DAMON thread
+ *
+ * If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread.
+ * Else, -1.
+ */
+static int kdamond_pid __read_mostly = -1;
+module_param(kdamond_pid, int, 0400);
+
+/*
+ * Number of hot memory regions that tried to be LRU-sorted.
+ */
+static unsigned long nr_lru_sort_tried_hot_regions __read_mostly;
+module_param(nr_lru_sort_tried_hot_regions, ulong, 0400);
+
+/*
+ * Total bytes of hot memory regions that tried to be LRU-sorted.
+ */
+static unsigned long bytes_lru_sort_tried_hot_regions __read_mostly;
+module_param(bytes_lru_sort_tried_hot_regions, ulong, 0400);
+
+/*
+ * Number of hot memory regions that successfully be LRU-sorted.
+ */
+static unsigned long nr_lru_sorted_hot_regions __read_mostly;
+module_param(nr_lru_sorted_hot_regions, ulong, 0400);
+
+/*
+ * Total bytes of hot memory regions that successfully be LRU-sorted.
+ */
+static unsigned long bytes_lru_sorted_hot_regions __read_mostly;
+module_param(bytes_lru_sorted_hot_regions, ulong, 0400);
+
+/*
+ * Number of times that the time quota limit for hot regions have exceeded
+ */
+static unsigned long nr_hot_quota_exceeds __read_mostly;
+module_param(nr_hot_quota_exceeds, ulong, 0400);
+
+/*
+ * Number of cold memory regions that tried to be LRU-sorted.
+ */
+static unsigned long nr_lru_sort_tried_cold_regions __read_mostly;
+module_param(nr_lru_sort_tried_cold_regions, ulong, 0400);
+
+/*
+ * Total bytes of cold memory regions that tried to be LRU-sorted.
+ */
+static unsigned long bytes_lru_sort_tried_cold_regions __read_mostly;
+module_param(bytes_lru_sort_tried_cold_regions, ulong, 0400);
+
+/*
+ * Number of cold memory regions that successfully be LRU-sorted.
+ */
+static unsigned long nr_lru_sorted_cold_regions __read_mostly;
+module_param(nr_lru_sorted_cold_regions, ulong, 0400);
+
+/*
+ * Total bytes of cold memory regions that successfully be LRU-sorted.
+ */
+static unsigned long bytes_lru_sorted_cold_regions __read_mostly;
+module_param(bytes_lru_sorted_cold_regions, ulong, 0400);
+
+/*
+ * Number of times that the time quota limit for cold regions have exceeded
+ */
+static unsigned long nr_cold_quota_exceeds __read_mostly;
+module_param(nr_cold_quota_exceeds, ulong, 0400);
+
+static struct damon_ctx *ctx;
+static struct damon_target *target;
+
+struct damon_lru_sort_ram_walk_arg {
+ unsigned long start;
+ unsigned long end;
+};
+
+static int walk_system_ram(struct resource *res, void *arg)
+{
+ struct damon_lru_sort_ram_walk_arg *a = arg;
+
+ if (a->end - a->start < resource_size(res)) {
+ a->start = res->start;
+ a->end = res->end;
+ }
+ return 0;
+}
+
+/*
+ * Find biggest 'System RAM' resource and store its start and end address in
+ * @start and @end, respectively. If no System RAM is found, returns false.
+ */
+static bool get_monitoring_region(unsigned long *start, unsigned long *end)
+{
+ struct damon_lru_sort_ram_walk_arg arg = {};
+
+ walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram);
+ if (arg.end <= arg.start)
+ return false;
+
+ *start = arg.start;
+ *end = arg.end;
+ return true;
+}
+
+/* Create a DAMON-based operation scheme for hot memory regions */
+static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres)
+{
+ struct damos_watermarks wmarks = {
+ .metric = DAMOS_WMARK_FREE_MEM_RATE,
+ .interval = wmarks_interval,
+ .high = wmarks_high,
+ .mid = wmarks_mid,
+ .low = wmarks_low,
+ };
+ struct damos_quota quota = {
+ /*
+ * Do not try LRU-lists sorting of hot pages for more than half
+ * of quota_ms milliseconds within quota_reset_interval_ms.
+ */
+ .ms = quota_ms / 2,
+ .sz = 0,
+ .reset_interval = quota_reset_interval_ms,
+ /* Within the quota, mark hotter regions accessed first. */
+ .weight_sz = 0,
+ .weight_nr_accesses = 1,
+ .weight_age = 0,
+ };
+ struct damos *scheme = damon_new_scheme(
+ /* Find regions having PAGE_SIZE or larger size */
+ PAGE_SIZE, ULONG_MAX,
+ /* and accessed for more than the threshold */
+ hot_thres, UINT_MAX,
+ /* no matter its age */
+ 0, UINT_MAX,
+ /* prioritize those on LRU lists, as soon as found */
+ DAMOS_LRU_PRIO,
+ /* under the quota. */
+ &quota,
+ /* (De)activate this according to the watermarks. */
+ &wmarks);
+
+ return scheme;
+}
+
+/* Create a DAMON-based operation scheme for cold memory regions */
+static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres)
+{
+ struct damos_watermarks wmarks = {
+ .metric = DAMOS_WMARK_FREE_MEM_RATE,
+ .interval = wmarks_interval,
+ .high = wmarks_high,
+ .mid = wmarks_mid,
+ .low = wmarks_low,
+ };
+ struct damos_quota quota = {
+ /*
+ * Do not try LRU-lists sorting of cold pages for more than
+ * half of quota_ms milliseconds within
+ * quota_reset_interval_ms.
+ */
+ .ms = quota_ms / 2,
+ .sz = 0,
+ .reset_interval = quota_reset_interval_ms,
+ /* Within the quota, mark colder regions not accessed first. */
+ .weight_sz = 0,
+ .weight_nr_accesses = 0,
+ .weight_age = 1,
+ };
+ struct damos *scheme = damon_new_scheme(
+ /* Find regions having PAGE_SIZE or larger size */
+ PAGE_SIZE, ULONG_MAX,
+ /* and not accessed at all */
+ 0, 0,
+ /* for cold_thres or more micro-seconds, and */
+ cold_thres, UINT_MAX,
+ /* mark those as not accessed, as soon as found */
+ DAMOS_LRU_DEPRIO,
+ /* under the quota. */
+ &quota,
+ /* (De)activate this according to the watermarks. */
+ &wmarks);
+
+ return scheme;
+}
+
+static int damon_lru_sort_apply_parameters(void)
+{
+ struct damos *scheme, *next_scheme;
+ struct damon_addr_range addr_range;
+ unsigned int hot_thres, cold_thres;
+ int err = 0;
+
+ err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0,
+ min_nr_regions, max_nr_regions);
+ if (err)
+ return err;
+
+ /* free previously set schemes */
+ damon_for_each_scheme_safe(scheme, next_scheme, ctx)
+ damon_destroy_scheme(scheme);
+
+ /* aggr_interval / sample_interval is the maximum nr_accesses */
+ hot_thres = aggr_interval / sample_interval * hot_thres_access_freq /
+ 1000;
+ scheme = damon_lru_sort_new_hot_scheme(hot_thres);
+ if (!scheme)
+ return -ENOMEM;
+ damon_add_scheme(ctx, scheme);
+
+ cold_thres = cold_min_age / aggr_interval;
+ scheme = damon_lru_sort_new_cold_scheme(cold_thres);
+ if (!scheme)
+ return -ENOMEM;
+ damon_add_scheme(ctx, scheme);
+
+ if (monitor_region_start > monitor_region_end)
+ return -EINVAL;
+ if (!monitor_region_start && !monitor_region_end &&
+ !get_monitoring_region(&monitor_region_start,
+ &monitor_region_end))
+ return -EINVAL;
+ addr_range.start = monitor_region_start;
+ addr_range.end = monitor_region_end;
+ return damon_set_regions(target, &addr_range, 1);
+}
+
+static int damon_lru_sort_turn(bool on)
+{
+ int err;
+
+ if (!on) {
+ err = damon_stop(&ctx, 1);
+ if (!err)
+ kdamond_pid = -1;
+ return err;
+ }
+
+ err = damon_lru_sort_apply_parameters();
+ if (err)
+ return err;
+
+ err = damon_start(&ctx, 1, true);
+ if (err)
+ return err;
+ kdamond_pid = ctx->kdamond->pid;
+ return 0;
+}
+
+static struct delayed_work damon_lru_sort_timer;
+static void damon_lru_sort_timer_fn(struct work_struct *work)
+{
+ static bool last_enabled;
+ bool now_enabled;
+
+ now_enabled = enabled;
+ if (last_enabled != now_enabled) {
+ if (!damon_lru_sort_turn(now_enabled))
+ last_enabled = now_enabled;
+ else
+ enabled = last_enabled;
+ }
+}
+static DECLARE_DELAYED_WORK(damon_lru_sort_timer, damon_lru_sort_timer_fn);
+
+static bool damon_lru_sort_initialized;
+
+static int damon_lru_sort_enabled_store(const char *val,
+ const struct kernel_param *kp)
+{
+ int rc = param_set_bool(val, kp);
+
+ if (rc < 0)
+ return rc;
+
+ if (!damon_lru_sort_initialized)
+ return rc;
+
+ schedule_delayed_work(&damon_lru_sort_timer, 0);
+
+ return 0;
+}
+
+static const struct kernel_param_ops enabled_param_ops = {
+ .set = damon_lru_sort_enabled_store,
+ .get = param_get_bool,
+};
+
+module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
+MODULE_PARM_DESC(enabled,
+ "Enable or disable DAMON_LRU_SORT (default: disabled)");
+
+static int damon_lru_sort_handle_commit_inputs(void)
+{
+ int err;
+
+ if (!commit_inputs)
+ return 0;
+
+ err = damon_lru_sort_apply_parameters();
+ commit_inputs = false;
+ return err;
+}
+
+static int damon_lru_sort_after_aggregation(struct damon_ctx *c)
+{
+ struct damos *s;
+
+ /* update the stats parameter */
+ damon_for_each_scheme(s, c) {
+ if (s->action == DAMOS_LRU_PRIO) {
+ nr_lru_sort_tried_hot_regions = s->stat.nr_tried;
+ bytes_lru_sort_tried_hot_regions = s->stat.sz_tried;
+ nr_lru_sorted_hot_regions = s->stat.nr_applied;
+ bytes_lru_sorted_hot_regions = s->stat.sz_applied;
+ nr_hot_quota_exceeds = s->stat.qt_exceeds;
+ } else if (s->action == DAMOS_LRU_DEPRIO) {
+ nr_lru_sort_tried_cold_regions = s->stat.nr_tried;
+ bytes_lru_sort_tried_cold_regions = s->stat.sz_tried;
+ nr_lru_sorted_cold_regions = s->stat.nr_applied;
+ bytes_lru_sorted_cold_regions = s->stat.sz_applied;
+ nr_cold_quota_exceeds = s->stat.qt_exceeds;
+ }
+ }
+
+ return damon_lru_sort_handle_commit_inputs();
+}
+
+static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c)
+{
+ return damon_lru_sort_handle_commit_inputs();
+}
+
+static int __init damon_lru_sort_init(void)
+{
+ ctx = damon_new_ctx();
+ if (!ctx)
+ return -ENOMEM;
+
+ if (damon_select_ops(ctx, DAMON_OPS_PADDR))
+ return -EINVAL;
+
+ ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check;
+ ctx->callback.after_aggregation = damon_lru_sort_after_aggregation;
+
+ target = damon_new_target();
+ if (!target) {
+ damon_destroy_ctx(ctx);
+ return -ENOMEM;
+ }
+ damon_add_target(ctx, target);
+
+ schedule_delayed_work(&damon_lru_sort_timer, 0);
+
+ damon_lru_sort_initialized = true;
+ return 0;
+}
+
+module_init(damon_lru_sort_init);
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index 10ef20b2003f..b1335de200e7 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -130,3 +130,45 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
/* Return coldness of the region */
return DAMOS_MAX_SCORE - hotness;
}
+
+int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
+ struct damos *s)
+{
+ unsigned int max_nr_accesses;
+ int freq_subscore;
+ unsigned int age_in_sec;
+ int age_in_log, age_subscore;
+ unsigned int freq_weight = s->quota.weight_nr_accesses;
+ unsigned int age_weight = s->quota.weight_age;
+ int hotness;
+
+ max_nr_accesses = c->aggr_interval / c->sample_interval;
+ freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses;
+
+ age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000;
+ for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
+ age_in_log++, age_in_sec >>= 1)
+ ;
+
+ /* If frequency is 0, higher age means it's colder */
+ if (freq_subscore == 0)
+ age_in_log *= -1;
+
+ /*
+ * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG].
+ * Scale it to be in [0, 100] and set it as age subscore.
+ */
+ age_in_log += DAMON_MAX_AGE_IN_LOG;
+ age_subscore = age_in_log * DAMON_MAX_SUBSCORE /
+ DAMON_MAX_AGE_IN_LOG / 2;
+
+ hotness = (freq_weight * freq_subscore + age_weight * age_subscore);
+ if (freq_weight + age_weight)
+ hotness /= freq_weight + age_weight;
+ /*
+ * Transform it to fit in [0, DAMOS_MAX_SCORE]
+ */
+ hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE;
+
+ return hotness;
+}
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index e790cb5f8fe0..52329ff361cd 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -14,3 +14,5 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr);
int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s);
+int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
+ struct damos *s);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index b40ff5811bb2..dc131c6a5403 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -204,16 +204,11 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
return max_nr_accesses;
}
-static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
- struct damon_target *t, struct damon_region *r,
- struct damos *scheme)
+static unsigned long damon_pa_pageout(struct damon_region *r)
{
unsigned long addr, applied;
LIST_HEAD(page_list);
- if (scheme->action != DAMOS_PAGEOUT)
- return 0;
-
for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
struct page *page = damon_get_page(PHYS_PFN(addr));
@@ -238,6 +233,55 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
return applied * PAGE_SIZE;
}
+static unsigned long damon_pa_mark_accessed(struct damon_region *r)
+{
+ unsigned long addr, applied = 0;
+
+ for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+ struct page *page = damon_get_page(PHYS_PFN(addr));
+
+ if (!page)
+ continue;
+ mark_page_accessed(page);
+ put_page(page);
+ applied++;
+ }
+ return applied * PAGE_SIZE;
+}
+
+static unsigned long damon_pa_deactivate_pages(struct damon_region *r)
+{
+ unsigned long addr, applied = 0;
+
+ for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+ struct page *page = damon_get_page(PHYS_PFN(addr));
+
+ if (!page)
+ continue;
+ deactivate_page(page);
+ put_page(page);
+ applied++;
+ }
+ return applied * PAGE_SIZE;
+}
+
+static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
+ struct damon_target *t, struct damon_region *r,
+ struct damos *scheme)
+{
+ switch (scheme->action) {
+ case DAMOS_PAGEOUT:
+ return damon_pa_pageout(r);
+ case DAMOS_LRU_PRIO:
+ return damon_pa_mark_accessed(r);
+ case DAMOS_LRU_DEPRIO:
+ return damon_pa_deactivate_pages(r);
+ default:
+ break;
+ }
+ return 0;
+}
+
static int damon_pa_scheme_score(struct damon_ctx *context,
struct damon_target *t, struct damon_region *r,
struct damos *scheme)
@@ -245,6 +289,10 @@ static int damon_pa_scheme_score(struct damon_ctx *context,
switch (scheme->action) {
case DAMOS_PAGEOUT:
return damon_pageout_score(context, r, scheme);
+ case DAMOS_LRU_PRIO:
+ return damon_hot_score(context, r, scheme);
+ case DAMOS_LRU_DEPRIO:
+ return damon_pageout_score(context, r, scheme);
default:
break;
}
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 4b07c29effe9..e69b807fefe4 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -353,7 +353,6 @@ static int damon_reclaim_turn(bool on)
return 0;
}
-#define ENABLE_CHECK_INTERVAL_MS 1000
static struct delayed_work damon_reclaim_timer;
static void damon_reclaim_timer_fn(struct work_struct *work)
{
@@ -367,16 +366,12 @@ static void damon_reclaim_timer_fn(struct work_struct *work)
else
enabled = last_enabled;
}
-
- if (enabled)
- schedule_delayed_work(&damon_reclaim_timer,
- msecs_to_jiffies(ENABLE_CHECK_INTERVAL_MS));
}
static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn);
static bool damon_reclaim_initialized;
-static int enabled_store(const char *val,
+static int damon_reclaim_enabled_store(const char *val,
const struct kernel_param *kp)
{
int rc = param_set_bool(val, kp);
@@ -388,14 +383,12 @@ static int enabled_store(const char *val,
if (!damon_reclaim_initialized)
return rc;
- if (enabled)
- schedule_delayed_work(&damon_reclaim_timer, 0);
-
+ schedule_delayed_work(&damon_reclaim_timer, 0);
return 0;
}
static const struct kernel_param_ops enabled_param_ops = {
- .set = enabled_store,
+ .set = damon_reclaim_enabled_store,
.get = param_get_bool,
};
@@ -403,10 +396,21 @@ module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
MODULE_PARM_DESC(enabled,
"Enable or disable DAMON_RECLAIM (default: disabled)");
+static int damon_reclaim_handle_commit_inputs(void)
+{
+ int err;
+
+ if (!commit_inputs)
+ return 0;
+
+ err = damon_reclaim_apply_parameters();
+ commit_inputs = false;
+ return err;
+}
+
static int damon_reclaim_after_aggregation(struct damon_ctx *c)
{
struct damos *s;
- int err = 0;
/* update the stats parameter */
damon_for_each_scheme(s, c) {
@@ -417,22 +421,12 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c)
nr_quota_exceeds = s->stat.qt_exceeds;
}
- if (commit_inputs) {
- err = damon_reclaim_apply_parameters();
- commit_inputs = false;
- }
- return err;
+ return damon_reclaim_handle_commit_inputs();
}
static int damon_reclaim_after_wmarks_check(struct damon_ctx *c)
{
- int err = 0;
-
- if (commit_inputs) {
- err = damon_reclaim_apply_parameters();
- commit_inputs = false;
- }
- return err;
+ return damon_reclaim_handle_commit_inputs();
}
static int __init damon_reclaim_init(void)
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 09f9e8ca3d1f..7488e27c87c3 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -762,6 +762,8 @@ static const char * const damon_sysfs_damos_action_strs[] = {
"pageout",
"hugepage",
"nohugepage",
+ "lru_prio",
+ "lru_deprio",
"stat",
};
@@ -2136,8 +2138,7 @@ static void damon_sysfs_destroy_targets(struct damon_ctx *ctx)
struct damon_target *t, *next;
damon_for_each_target_safe(t, next, ctx) {
- if (ctx->ops.id == DAMON_OPS_VADDR ||
- ctx->ops.id == DAMON_OPS_FVADDR)
+ if (damon_target_has_pid(ctx))
put_pid(t->pid);
damon_destroy_target(t);
}
@@ -2181,8 +2182,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
if (!t)
return -ENOMEM;
- if (ctx->ops.id == DAMON_OPS_VADDR ||
- ctx->ops.id == DAMON_OPS_FVADDR) {
+ if (damon_target_has_pid(ctx)) {
t->pid = find_get_pid(sys_target->pid);
if (!t->pid)
goto destroy_targets_out;
@@ -2210,7 +2210,7 @@ static struct damon_target *damon_sysfs_existing_target(
struct pid *pid;
struct damon_target *t;
- if (ctx->ops.id == DAMON_OPS_PADDR) {
+ if (!damon_target_has_pid(ctx)) {
/* Up to only one target for paddr could exist */
damon_for_each_target(t, ctx)
return t;
@@ -2359,6 +2359,23 @@ static inline bool damon_sysfs_kdamond_running(
damon_sysfs_ctx_running(kdamond->damon_ctx);
}
+static int damon_sysfs_apply_inputs(struct damon_ctx *ctx,
+ struct damon_sysfs_context *sys_ctx)
+{
+ int err;
+
+ err = damon_select_ops(ctx, sys_ctx->ops_id);
+ if (err)
+ return err;
+ err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
+ if (err)
+ return err;
+ err = damon_sysfs_set_targets(ctx, sys_ctx->targets);
+ if (err)
+ return err;
+ return damon_sysfs_set_schemes(ctx, sys_ctx->schemes);
+}
+
/*
* damon_sysfs_commit_input() - Commit user inputs to a running kdamond.
* @kdamond: The kobject wrapper for the associated kdamond.
@@ -2367,31 +2384,14 @@ static inline bool damon_sysfs_kdamond_running(
*/
static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond)
{
- struct damon_ctx *ctx = kdamond->damon_ctx;
- struct damon_sysfs_context *sys_ctx;
- int err = 0;
-
if (!damon_sysfs_kdamond_running(kdamond))
return -EINVAL;
/* TODO: Support multiple contexts per kdamond */
if (kdamond->contexts->nr != 1)
return -EINVAL;
- sys_ctx = kdamond->contexts->contexts_arr[0];
-
- err = damon_select_ops(ctx, sys_ctx->ops_id);
- if (err)
- return err;
- err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
- if (err)
- return err;
- err = damon_sysfs_set_targets(ctx, sys_ctx->targets);
- if (err)
- return err;
- err = damon_sysfs_set_schemes(ctx, sys_ctx->schemes);
- if (err)
- return err;
- return err;
+ return damon_sysfs_apply_inputs(kdamond->damon_ctx,
+ kdamond->contexts->contexts_arr[0]);
}
/*
@@ -2438,27 +2438,16 @@ static struct damon_ctx *damon_sysfs_build_ctx(
if (!ctx)
return ERR_PTR(-ENOMEM);
- err = damon_select_ops(ctx, sys_ctx->ops_id);
- if (err)
- goto out;
- err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
- if (err)
- goto out;
- err = damon_sysfs_set_targets(ctx, sys_ctx->targets);
- if (err)
- goto out;
- err = damon_sysfs_set_schemes(ctx, sys_ctx->schemes);
- if (err)
- goto out;
+ err = damon_sysfs_apply_inputs(ctx, sys_ctx);
+ if (err) {
+ damon_destroy_ctx(ctx);
+ return ERR_PTR(err);
+ }
ctx->callback.after_wmarks_check = damon_sysfs_cmd_request_callback;
ctx->callback.after_aggregation = damon_sysfs_cmd_request_callback;
ctx->callback.before_terminate = damon_sysfs_before_terminate;
return ctx;
-
-out:
- damon_destroy_ctx(ctx);
- return ERR_PTR(err);
}
static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
index d4f55f349100..bce37c487540 100644
--- a/mm/damon/vaddr-test.h
+++ b/mm/damon/vaddr-test.h
@@ -14,33 +14,19 @@
#include <kunit/test.h>
-static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas)
+static void __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas,
+ ssize_t nr_vmas)
{
- int i, j;
- unsigned long largest_gap, gap;
+ int i;
+ MA_STATE(mas, mt, 0, 0);
if (!nr_vmas)
return;
- for (i = 0; i < nr_vmas - 1; i++) {
- vmas[i].vm_next = &vmas[i + 1];
-
- vmas[i].vm_rb.rb_left = NULL;
- vmas[i].vm_rb.rb_right = &vmas[i + 1].vm_rb;
-
- largest_gap = 0;
- for (j = i; j < nr_vmas; j++) {
- if (j == 0)
- continue;
- gap = vmas[j].vm_start - vmas[j - 1].vm_end;
- if (gap > largest_gap)
- largest_gap = gap;
- }
- vmas[i].rb_subtree_gap = largest_gap;
- }
- vmas[i].vm_next = NULL;
- vmas[i].vm_rb.rb_right = NULL;
- vmas[i].rb_subtree_gap = 0;
+ mas_lock(&mas);
+ for (i = 0; i < nr_vmas; i++)
+ vma_mas_store(&vmas[i], &mas);
+ mas_unlock(&mas);
}
/*
@@ -72,6 +58,7 @@ static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas)
*/
static void damon_test_three_regions_in_vmas(struct kunit *test)
{
+ static struct mm_struct mm;
struct damon_addr_range regions[3] = {0,};
/* 10-20-25, 200-210-220, 300-305, 307-330 */
struct vm_area_struct vmas[] = {
@@ -83,9 +70,10 @@ static void damon_test_three_regions_in_vmas(struct kunit *test)
(struct vm_area_struct) {.vm_start = 307, .vm_end = 330},
};
- __link_vmas(vmas, 6);
+ mt_init_flags(&mm.mm_mt, MM_MT_FLAGS);
+ __link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas));
- __damon_va_three_regions(&vmas[0], regions);
+ __damon_va_three_regions(&mm, regions);
KUNIT_EXPECT_EQ(test, 10ul, regions[0].start);
KUNIT_EXPECT_EQ(test, 25ul, regions[0].end);
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 3c7b9d6dca95..d24148a8149f 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -113,37 +113,38 @@ static unsigned long sz_range(struct damon_addr_range *r)
*
* Returns 0 if success, or negative error code otherwise.
*/
-static int __damon_va_three_regions(struct vm_area_struct *vma,
+static int __damon_va_three_regions(struct mm_struct *mm,
struct damon_addr_range regions[3])
{
- struct damon_addr_range gap = {0}, first_gap = {0}, second_gap = {0};
- struct vm_area_struct *last_vma = NULL;
- unsigned long start = 0;
- struct rb_root rbroot;
-
- /* Find two biggest gaps so that first_gap > second_gap > others */
- for (; vma; vma = vma->vm_next) {
- if (!last_vma) {
- start = vma->vm_start;
- goto next;
- }
+ struct damon_addr_range first_gap = {0}, second_gap = {0};
+ VMA_ITERATOR(vmi, mm, 0);
+ struct vm_area_struct *vma, *prev = NULL;
+ unsigned long start;
- if (vma->rb_subtree_gap <= sz_range(&second_gap)) {
- rbroot.rb_node = &vma->vm_rb;
- vma = rb_entry(rb_last(&rbroot),
- struct vm_area_struct, vm_rb);
+ /*
+ * Find the two biggest gaps so that first_gap > second_gap > others.
+ * If this is too slow, it can be optimised to examine the maple
+ * tree gaps.
+ */
+ for_each_vma(vmi, vma) {
+ unsigned long gap;
+
+ if (!prev) {
+ start = vma->vm_start;
goto next;
}
-
- gap.start = last_vma->vm_end;
- gap.end = vma->vm_start;
- if (sz_range(&gap) > sz_range(&second_gap)) {
- swap(gap, second_gap);
- if (sz_range(&second_gap) > sz_range(&first_gap))
- swap(second_gap, first_gap);
+ gap = vma->vm_start - prev->vm_end;
+
+ if (gap > sz_range(&first_gap)) {
+ second_gap = first_gap;
+ first_gap.start = prev->vm_end;
+ first_gap.end = vma->vm_start;
+ } else if (gap > sz_range(&second_gap)) {
+ second_gap.start = prev->vm_end;
+ second_gap.end = vma->vm_start;
}
next:
- last_vma = vma;
+ prev = vma;
}
if (!sz_range(&second_gap) || !sz_range(&first_gap))
@@ -159,7 +160,7 @@ next:
regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION);
regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION);
regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION);
- regions[2].end = ALIGN(last_vma->vm_end, DAMON_MIN_REGION);
+ regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION);
return 0;
}
@@ -180,7 +181,7 @@ static int damon_va_three_regions(struct damon_target *t,
return -EINVAL;
mmap_read_lock(mm);
- rc = __damon_va_three_regions(mm->mmap, regions);
+ rc = __damon_va_three_regions(mm, regions);
mmap_read_unlock(mm);
mmput(mm);
diff --git a/mm/debug.c b/mm/debug.c
index bef329bf28f0..0fd15ba70d16 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -139,13 +139,11 @@ EXPORT_SYMBOL(dump_page);
void dump_vma(const struct vm_area_struct *vma)
{
- pr_emerg("vma %px start %px end %px\n"
- "next %px prev %px mm %px\n"
+ pr_emerg("vma %px start %px end %px mm %px\n"
"prot %lx anon_vma %px vm_ops %px\n"
"pgoff %lx file %px private_data %px\n"
"flags: %#lx(%pGv)\n",
- vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
- vma->vm_prev, vma->vm_mm,
+ vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm,
(unsigned long)pgprot_val(vma->vm_page_prot),
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
vma->vm_file, vma->vm_private_data,
@@ -155,11 +153,11 @@ EXPORT_SYMBOL(dump_vma);
void dump_mm(const struct mm_struct *mm)
{
- pr_emerg("mm %px mmap %px seqnum %llu task_size %lu\n"
+ pr_emerg("mm %px task_size %lu\n"
#ifdef CONFIG_MMU
"get_unmapped_area %px\n"
#endif
- "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
+ "mmap_base %lu mmap_legacy_base %lu\n"
"pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
"pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n"
@@ -183,11 +181,11 @@ void dump_mm(const struct mm_struct *mm)
"tlb_flush_pending %d\n"
"def_flags: %#lx(%pGv)\n",
- mm, mm->mmap, (long long) mm->vmacache_seqnum, mm->task_size,
+ mm, mm->task_size,
#ifdef CONFIG_MMU
mm->get_unmapped_area,
#endif
- mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
+ mm->mmap_base, mm->mmap_legacy_base,
mm->pgd, atomic_read(&mm->mm_users),
atomic_read(&mm->mm_count),
mm_pgtables_bytes(mm),
diff --git a/mm/filemap.c b/mm/filemap.c
index 8ccb868c3d95..b561619cfafe 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -667,7 +667,7 @@ EXPORT_SYMBOL_GPL(filemap_range_has_writeback);
int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
- int err = 0;
+ int err = 0, err2;
if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
@@ -678,18 +678,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
* But the -EIO is special case, it may indicate the worst
* thing (e.g. bug) happened, so we avoid waiting for it.
*/
- if (err != -EIO) {
- int err2 = filemap_fdatawait_range(mapping,
- lstart, lend);
- if (!err)
- err = err2;
- } else {
- /* Clear any previously stored errors */
- filemap_check_errors(mapping);
- }
- } else {
- err = filemap_check_errors(mapping);
+ if (err != -EIO)
+ __filemap_fdatawait_range(mapping, lstart, lend);
}
+ err2 = filemap_check_errors(mapping);
+ if (!err)
+ err = err2;
return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);
diff --git a/mm/gup.c b/mm/gup.c
index 407a81d5ca03..18f6953a74e3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -532,7 +532,11 @@ retry:
}
page = vm_normal_page(vma, address, pte);
- if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
+ if ((flags & FOLL_LRU) && ((page && is_zone_device_page(page)) ||
+ (!page && pte_devmap(pte)))) {
+ page = ERR_PTR(-EEXIST);
+ goto out;
+ } else if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
/*
* Only return device mapping pages in the FOLL_GET or FOLL_PIN
* case since they are only valid while holding the pgmap
@@ -1641,10 +1645,11 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
if (!locked) {
locked = 1;
mmap_read_lock(mm);
- vma = find_vma(mm, nstart);
+ vma = find_vma_intersection(mm, nstart, end);
} else if (nstart >= vma->vm_end)
- vma = vma->vm_next;
- if (!vma || vma->vm_start >= end)
+ vma = find_vma_intersection(mm, vma->vm_end, end);
+
+ if (!vma)
break;
/*
* Set [nstart; nend) to intersection of desired address
@@ -1923,14 +1928,48 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
continue;
prev_folio = folio;
- if (folio_is_pinnable(folio))
+ /*
+ * Device private pages will get faulted in during gup so it
+ * shouldn't be possible to see one here.
+ */
+ if (WARN_ON_ONCE(folio_is_device_private(folio))) {
+ ret = -EFAULT;
+ goto unpin_pages;
+ }
+
+ /*
+ * Device coherent pages are managed by a driver and should not
+ * be pinned indefinitely as it prevents the driver moving the
+ * page. So when trying to pin with FOLL_LONGTERM instead try
+ * to migrate the page out of device memory.
+ */
+ if (folio_is_device_coherent(folio)) {
+ WARN_ON_ONCE(PageCompound(&folio->page));
+
+ /*
+ * Migration will fail if the page is pinned, so convert
+ * the pin on the source page to a normal reference.
+ */
+ if (gup_flags & FOLL_PIN) {
+ get_page(&folio->page);
+ unpin_user_page(&folio->page);
+ }
+
+ pages[i] = migrate_device_page(&folio->page, gup_flags);
+ if (!pages[i]) {
+ ret = -EBUSY;
+ goto unpin_pages;
+ }
continue;
+ }
+ if (folio_is_pinnable(folio))
+ continue;
/*
* Try to move out any movable page before pinning the range.
*/
if (folio_test_hugetlb(folio)) {
- if (!isolate_huge_page(&folio->page,
+ if (isolate_hugetlb(&folio->page,
&movable_page_list))
isolation_error_count++;
continue;
@@ -1961,10 +2000,13 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
return nr_pages;
unpin_pages:
- if (gup_flags & FOLL_PIN) {
- unpin_user_pages(pages, nr_pages);
- } else {
- for (i = 0; i < nr_pages; i++)
+ for (i = 0; i < nr_pages; i++) {
+ if (!pages[i])
+ continue;
+
+ if (gup_flags & FOLL_PIN)
+ unpin_user_page(pages[i]);
+ else
put_page(pages[i]);
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f9b90a8d7dfa..7b7eedc6d5dd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -69,21 +69,85 @@ static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
-bool transparent_hugepage_active(struct vm_area_struct *vma)
+bool hugepage_vma_check(struct vm_area_struct *vma,
+ unsigned long vm_flags,
+ bool smaps, bool in_pf)
{
- /* The addr is used to check if the vma size fits */
- unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
+ if (!vma->vm_mm) /* vdso */
+ return false;
+
+ /*
+ * Explicitly disabled through madvise or prctl, or some
+ * architectures may disable THP for some mappings, for
+ * example, s390 kvm.
+ * */
+ if ((vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ return false;
+ /*
+ * If the hardware/firmware marked hugepage support disabled.
+ */
+ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
+ return false;
- if (!transhuge_vma_suitable(vma, addr))
+ /* khugepaged doesn't collapse DAX vma, but page fault is fine. */
+ if (vma_is_dax(vma))
+ return in_pf;
+
+ /*
+ * Special VMA and hugetlb VMA.
+ * Must be checked after dax since some dax mappings may have
+ * VM_MIXEDMAP set.
+ */
+ if (vm_flags & VM_NO_KHUGEPAGED)
+ return false;
+
+ /*
+ * Check alignment for file vma and size for both file and anon vma.
+ *
+ * Skip the check for page fault. Huge fault does the check in fault
+ * handlers. And this check is not suitable for huge PUD fault.
+ */
+ if (!in_pf &&
+ !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
return false;
- if (vma_is_anonymous(vma))
- return __transparent_hugepage_enabled(vma);
- if (vma_is_shmem(vma))
+
+ /*
+ * Enabled via shmem mount options or sysfs settings.
+ * Must be done before hugepage flags check since shmem has its
+ * own flags.
+ */
+ if (!in_pf && shmem_file(vma->vm_file))
return shmem_huge_enabled(vma);
- if (transhuge_vma_enabled(vma, vma->vm_flags) && file_thp_enabled(vma))
+
+ if (!hugepage_flags_enabled())
+ return false;
+
+ /* THP settings require madvise. */
+ if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always())
+ return false;
+
+ /* Only regular file is valid */
+ if (!in_pf && file_thp_enabled(vma))
return true;
- return false;
+ if (!vma_is_anonymous(vma))
+ return false;
+
+ if (vma_is_temporary_stack(vma))
+ return false;
+
+ /*
+ * THPeligible bit of smaps should show 1 for proper VMAs even
+ * though anon_vma is not initialized yet.
+ *
+ * Allow page fault since anon_vma may be not initialized until
+ * the first page fault.
+ */
+ if (!vma->anon_vma)
+ return (smaps || in_pf);
+
+ return true;
}
static bool get_huge_zero_page(void)
@@ -423,10 +487,10 @@ static int __init hugepage_init(void)
if (err)
goto err_slab;
- err = register_shrinker(&huge_zero_page_shrinker);
+ err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
if (err)
goto err_hzp_shrinker;
- err = register_shrinker(&deferred_split_shrinker);
+ err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split");
if (err)
goto err_split_shrinker;
@@ -497,25 +561,125 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
}
#ifdef CONFIG_MEMCG
-static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+static struct shrinker deferred_split_shrinker;
+
+static inline struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
+ struct deferred_split *queue)
{
- struct mem_cgroup *memcg = page_memcg(compound_head(page));
- struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+ if (mem_cgroup_disabled())
+ return NULL;
+ if (&NODE_DATA(folio_nid(folio))->deferred_split_queue == queue)
+ return NULL;
+ return container_of(queue, struct mem_cgroup, deferred_split_queue);
+}
- if (memcg)
- return &memcg->deferred_split_queue;
- else
- return &pgdat->deferred_split_queue;
+static inline struct deferred_split *folio_memcg_split_queue(struct folio *folio)
+{
+ struct mem_cgroup *memcg = folio_memcg(folio);
+
+ return memcg ? &memcg->deferred_split_queue : NULL;
+}
+
+static void thp_sq_reparent_lock(struct mem_cgroup *src, struct mem_cgroup *dst)
+{
+ spin_lock(&src->deferred_split_queue.split_queue_lock);
+ spin_lock_nested(&dst->deferred_split_queue.split_queue_lock,
+ SINGLE_DEPTH_NESTING);
}
+
+static void thp_sq_reparent_relocate(struct mem_cgroup *src, struct mem_cgroup *dst)
+{
+ int nid;
+ struct deferred_split *src_queue, *dst_queue;
+
+ src_queue = &src->deferred_split_queue;
+ dst_queue = &dst->deferred_split_queue;
+
+ if (!src_queue->split_queue_len)
+ return;
+
+ list_splice_tail_init(&src_queue->split_queue, &dst_queue->split_queue);
+ dst_queue->split_queue_len += src_queue->split_queue_len;
+ src_queue->split_queue_len = 0;
+
+ for_each_node(nid)
+ set_shrinker_bit(dst, nid, deferred_split_shrinker.id);
+}
+
+static void thp_sq_reparent_unlock(struct mem_cgroup *src, struct mem_cgroup *dst)
+{
+ spin_unlock(&dst->deferred_split_queue.split_queue_lock);
+ spin_unlock(&src->deferred_split_queue.split_queue_lock);
+}
+DEFINE_MEMCG_REPARENT_OPS(thp_sq);
#else
-static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+static inline struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
+ struct deferred_split *queue)
{
- struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+ return NULL;
+}
- return &pgdat->deferred_split_queue;
+static inline struct deferred_split *folio_memcg_split_queue(struct folio *folio)
+{
+ return NULL;
}
#endif
+static struct deferred_split *folio_split_queue(struct folio *folio)
+{
+ struct deferred_split *queue = folio_memcg_split_queue(folio);
+
+ return queue ? : &NODE_DATA(folio_nid(folio))->deferred_split_queue;
+}
+
+static struct deferred_split *folio_split_queue_lock(struct folio *folio)
+{
+ struct deferred_split *queue;
+
+ rcu_read_lock();
+retry:
+ queue = folio_split_queue(folio);
+ spin_lock(&queue->split_queue_lock);
+
+ if (unlikely(folio_split_queue_memcg(folio, queue) != folio_memcg(folio))) {
+ spin_unlock(&queue->split_queue_lock);
+ goto retry;
+ }
+ rcu_read_unlock();
+
+ return queue;
+}
+
+static struct deferred_split *
+folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
+{
+ struct deferred_split *queue;
+
+ rcu_read_lock();
+retry:
+ queue = folio_split_queue(folio);
+ spin_lock_irqsave(&queue->split_queue_lock, *flags);
+
+ if (unlikely(folio_split_queue_memcg(folio, queue) != folio_memcg(folio))) {
+ spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
+ goto retry;
+ }
+ rcu_read_unlock();
+
+ return queue;
+}
+
+static inline void split_queue_unlock(struct deferred_split *queue)
+{
+ spin_unlock(&queue->split_queue_lock);
+}
+
+static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
+ unsigned long flags)
+{
+ spin_unlock_irqrestore(&queue->split_queue_lock, flags);
+}
+
void prep_transhuge_page(struct page *page)
{
/*
@@ -726,7 +890,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
return VM_FAULT_FALLBACK;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
- khugepaged_enter(vma, vma->vm_flags);
+ khugepaged_enter_vma(vma, vma->vm_flags);
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
@@ -2266,11 +2430,11 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
split_huge_pmd_if_needed(vma, end);
/*
- * If we're also updating the vma->vm_next->vm_start,
+ * If we're also updating the next vma vm_start,
* check if we need to split it.
*/
if (adjust_next > 0) {
- struct vm_area_struct *next = vma->vm_next;
+ struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
unsigned long nstart = next->vm_start;
nstart += adjust_next;
split_huge_pmd_if_needed(next, nstart);
@@ -2455,7 +2619,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
ClearPageCompound(head);
- unlock_page_lruvec(lruvec);
+ lruvec_unlock(lruvec);
/* Caller disabled irqs, so they are still disabled here */
split_page_owner(head, nr);
@@ -2540,7 +2704,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct folio *folio = page_folio(page);
struct page *head = &folio->page;
- struct deferred_split *ds_queue = get_deferred_split_queue(head);
+ struct deferred_split *ds_queue;
XA_STATE(xas, &head->mapping->i_pages, head->index);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
@@ -2632,13 +2796,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
/* Prevent deferred_split_scan() touching ->_refcount */
- spin_lock(&ds_queue->split_queue_lock);
+ ds_queue = folio_split_queue_lock(folio);
if (page_ref_freeze(head, 1 + extra_pins)) {
if (!list_empty(page_deferred_list(head))) {
ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
}
- spin_unlock(&ds_queue->split_queue_lock);
+ split_queue_unlock(ds_queue);
if (mapping) {
int nr = thp_nr_pages(head);
@@ -2656,7 +2820,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
__split_huge_page(page, list, end);
ret = 0;
} else {
- spin_unlock(&ds_queue->split_queue_lock);
+ split_queue_unlock(ds_queue);
fail:
if (mapping)
xas_unlock(&xas);
@@ -2680,25 +2844,23 @@ out:
void free_transhuge_page(struct page *page)
{
- struct deferred_split *ds_queue = get_deferred_split_queue(page);
+ struct deferred_split *ds_queue;
unsigned long flags;
- spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ ds_queue = folio_split_queue_lock_irqsave(page_folio(page), &flags);
if (!list_empty(page_deferred_list(page))) {
ds_queue->split_queue_len--;
list_del(page_deferred_list(page));
}
- spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+ split_queue_unlock_irqrestore(ds_queue, flags);
free_compound_page(page);
}
void deferred_split_huge_page(struct page *page)
{
- struct deferred_split *ds_queue = get_deferred_split_queue(page);
-#ifdef CONFIG_MEMCG
- struct mem_cgroup *memcg = page_memcg(compound_head(page));
-#endif
+ struct deferred_split *ds_queue;
unsigned long flags;
+ struct folio *folio = page_folio(page);
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
@@ -2715,18 +2877,19 @@ void deferred_split_huge_page(struct page *page)
if (PageSwapCache(page))
return;
- spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
if (list_empty(page_deferred_list(page))) {
+ struct mem_cgroup *memcg;
+
+ memcg = folio_split_queue_memcg(folio, ds_queue);
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
ds_queue->split_queue_len++;
-#ifdef CONFIG_MEMCG
if (memcg)
set_shrinker_bit(memcg, page_to_nid(page),
- deferred_split_shrinker.id);
-#endif
+ shrinker_id(&deferred_split_shrinker));
}
- spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+ split_queue_unlock_irqrestore(ds_queue, flags);
}
static unsigned long deferred_split_count(struct shrinker *shrink,
@@ -2906,7 +3069,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
}
/* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
if (IS_ERR(page))
continue;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index da59aece4756..87d875e5e0a9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,12 +66,6 @@ static bool hugetlb_cma_page(struct page *page, unsigned int order)
#endif
static unsigned long hugetlb_cma_size __initdata;
-/*
- * Minimum page order among possible hugepage sizes, set to a proper value
- * at boot time.
- */
-static unsigned int minimum_order __read_mostly = UINT_MAX;
-
__initdata LIST_HEAD(huge_boot_pages);
/* for command line parsing */
@@ -2152,11 +2146,17 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
unsigned long pfn;
struct page *page;
int rc = 0;
+ unsigned int order;
+ struct hstate *h;
if (!hugepages_supported())
return rc;
- for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
+ order = huge_page_order(&default_hstate);
+ for_each_hstate(h)
+ order = min(order, huge_page_order(h));
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
page = pfn_to_page(pfn);
rc = dissolve_free_huge_page(page);
if (rc)
@@ -2766,8 +2766,7 @@ retry:
* Fail with -EBUSY if not possible.
*/
spin_unlock_irq(&hugetlb_lock);
- if (!isolate_huge_page(old_page, list))
- ret = -EBUSY;
+ ret = isolate_hugetlb(old_page, list);
spin_lock_irq(&hugetlb_lock);
goto free_new;
} else if (!HPageFreed(old_page)) {
@@ -2843,7 +2842,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
if (hstate_is_gigantic(h))
return -ENOMEM;
- if (page_count(head) && isolate_huge_page(head, list))
+ if (page_count(head) && !isolate_hugetlb(head, list))
ret = 0;
else if (!page_count(head))
ret = alloc_and_dissolve_huge_page(h, head, list);
@@ -3149,9 +3148,6 @@ static void __init hugetlb_init_hstates(void)
struct hstate *h, *h2;
for_each_hstate(h) {
- if (minimum_order > huge_page_order(h))
- minimum_order = huge_page_order(h);
-
/* oversize hugepages were init'ed in early boot */
if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
@@ -3176,7 +3172,6 @@ static void __init hugetlb_init_hstates(void)
h->demote_order = h2->order;
}
}
- VM_BUG_ON(minimum_order == UINT_MAX);
}
static void __init report_hugepages(void)
@@ -4732,6 +4727,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
unsigned long npages = pages_per_huge_page(h);
struct address_space *mapping = src_vma->vm_file->f_mapping;
struct mmu_notifier_range range;
+ unsigned long last_addr_mask;
int ret = 0;
if (cow) {
@@ -4751,11 +4747,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
i_mmap_lock_read(mapping);
}
+ last_addr_mask = hugetlb_mask_last_page(h);
for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
src_pte = huge_pte_offset(src, addr, sz);
- if (!src_pte)
+ if (!src_pte) {
+ addr |= last_addr_mask;
continue;
+ }
dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
if (!dst_pte) {
ret = -ENOMEM;
@@ -4772,8 +4771,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
* after taking the lock below.
*/
dst_entry = huge_ptep_get(dst_pte);
- if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
+ if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
+ addr |= last_addr_mask;
continue;
+ }
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
@@ -4803,12 +4804,11 @@ again:
entry = swp_entry_to_pte(swp_entry);
if (userfaultfd_wp(src_vma) && uffd_wp)
entry = huge_pte_mkuffd_wp(entry);
- set_huge_swap_pte_at(src, addr, src_pte,
- entry, sz);
+ set_huge_pte_at(src, addr, src_pte, entry);
}
if (!userfaultfd_wp(dst_vma) && uffd_wp)
entry = huge_pte_clear_uffd_wp(entry);
- set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+ set_huge_pte_at(dst, addr, dst_pte, entry);
} else if (unlikely(is_pte_marker(entry))) {
/*
* We copy the pte marker only if the dst vma has
@@ -4934,7 +4934,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
unsigned long old_end = old_addr + len;
- unsigned long old_addr_copy;
+ unsigned long last_addr_mask;
pte_t *src_pte, *dst_pte;
struct mmu_notifier_range range;
bool shared_pmd = false;
@@ -4949,23 +4949,23 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
flush_cache_range(vma, range.start, range.end);
mmu_notifier_invalidate_range_start(&range);
+ last_addr_mask = hugetlb_mask_last_page(h);
/* Prevent race with file truncation */
i_mmap_lock_write(mapping);
for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
src_pte = huge_pte_offset(mm, old_addr, sz);
- if (!src_pte)
+ if (!src_pte) {
+ old_addr |= last_addr_mask;
+ new_addr |= last_addr_mask;
continue;
+ }
if (huge_pte_none(huge_ptep_get(src_pte)))
continue;
- /* old_addr arg to huge_pmd_unshare() is a pointer and so the
- * arg may be modified. Pass a copy instead to preserve the
- * value in old_addr.
- */
- old_addr_copy = old_addr;
-
- if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) {
+ if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
shared_pmd = true;
+ old_addr |= last_addr_mask;
+ new_addr |= last_addr_mask;
continue;
}
@@ -4999,6 +4999,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct mmu_notifier_range range;
+ unsigned long last_addr_mask;
bool force_flush = false;
WARN_ON(!is_vm_hugetlb_page(vma));
@@ -5019,17 +5020,21 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
mmu_notifier_invalidate_range_start(&range);
+ last_addr_mask = hugetlb_mask_last_page(h);
address = start;
for (; address < end; address += sz) {
ptep = huge_pte_offset(mm, address, sz);
- if (!ptep)
+ if (!ptep) {
+ address |= last_addr_mask;
continue;
+ }
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+ if (huge_pmd_unshare(mm, vma, address, ptep)) {
spin_unlock(ptl);
tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
force_flush = true;
+ address |= last_addr_mask;
continue;
}
@@ -5709,7 +5714,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
*/
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait_huge(vma, mm, ptep);
+ migration_entry_wait_huge(vma, ptep);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
@@ -6046,8 +6051,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
- (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
- dst_vma->vm_flags & VM_WRITE);
hugetlb_count_add(pages_per_huge_page(h), dst_mm);
/* No need to invalidate - it was non-present before */
@@ -6299,6 +6302,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long pages = 0, psize = huge_page_size(h);
bool shared_pmd = false;
struct mmu_notifier_range range;
+ unsigned long last_addr_mask;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
@@ -6315,14 +6319,17 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
flush_cache_range(vma, range.start, range.end);
mmu_notifier_invalidate_range_start(&range);
+ last_addr_mask = hugetlb_mask_last_page(h);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (; address < end; address += psize) {
spinlock_t *ptl;
ptep = huge_pte_offset(mm, address, psize);
- if (!ptep)
+ if (!ptep) {
+ address |= last_addr_mask;
continue;
+ }
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+ if (huge_pmd_unshare(mm, vma, address, ptep)) {
/*
* When uffd-wp is enabled on the vma, unshare
* shouldn't happen at all. Warn about it if it
@@ -6332,6 +6339,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
pages++;
spin_unlock(ptl);
shared_pmd = true;
+ address |= last_addr_mask;
continue;
}
pte = huge_ptep_get(ptep);
@@ -6357,8 +6365,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
newpte = pte_swp_mkuffd_wp(newpte);
else if (uffd_wp_resolve)
newpte = pte_swp_clear_uffd_wp(newpte);
- set_huge_swap_pte_at(mm, address, ptep,
- newpte, psize);
+ set_huge_pte_at(mm, address, ptep, newpte);
pages++;
}
spin_unlock(ptl);
@@ -6755,11 +6762,11 @@ out:
* 0 the underlying pte page is not shared, or it is the last user
*/
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long *addr, pte_t *ptep)
+ unsigned long addr, pte_t *ptep)
{
- pgd_t *pgd = pgd_offset(mm, *addr);
- p4d_t *p4d = p4d_offset(pgd, *addr);
- pud_t *pud = pud_offset(p4d, *addr);
+ pgd_t *pgd = pgd_offset(mm, addr);
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ pud_t *pud = pud_offset(p4d, addr);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
@@ -6769,14 +6776,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
pud_clear(pud);
put_page(virt_to_page(ptep));
mm_dec_nr_pmds(mm);
- /*
- * This update of passed address optimizes loops sequentially
- * processing addresses in increments of huge page size (PMD_SIZE
- * in this case). By clearing the pud, a PUD_SIZE area is unmapped.
- * Update address to the 'last page' in the cleared area so that
- * calling loop can move to first page past this area.
- */
- *addr |= PUD_SIZE - PMD_SIZE;
return 1;
}
@@ -6788,7 +6787,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
}
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long *addr, pte_t *ptep)
+ unsigned long addr, pte_t *ptep)
{
return 0;
}
@@ -6871,6 +6870,37 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
return (pte_t *)pmd;
}
+/*
+ * Return a mask that can be used to update an address to the last huge
+ * page in a page table page mapping size. Used to skip non-present
+ * page table entries when linearly scanning address ranges. Architectures
+ * with unique huge page to page table relationships can define their own
+ * version of this routine.
+ */
+unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+ unsigned long hp_size = huge_page_size(h);
+
+ if (hp_size == PUD_SIZE)
+ return P4D_SIZE - PUD_SIZE;
+ else if (hp_size == PMD_SIZE)
+ return PUD_SIZE - PMD_SIZE;
+ else
+ return 0UL;
+}
+
+#else
+
+/* See description above. Architectures can provide their own version. */
+__weak unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+ if (huge_page_size(h) == PMD_SIZE)
+ return PUD_SIZE - PMD_SIZE;
+#endif
+ return 0UL;
+}
+
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
/*
@@ -6934,7 +6964,7 @@ retry:
} else {
if (is_hugetlb_entry_migration(pte)) {
spin_unlock(ptl);
- __migration_entry_wait(mm, (pte_t *)pmd, ptl);
+ __migration_entry_wait_huge((pte_t *)pmd, ptl);
goto retry;
}
/*
@@ -6966,15 +6996,15 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
}
-bool isolate_huge_page(struct page *page, struct list_head *list)
+int isolate_hugetlb(struct page *page, struct list_head *list)
{
- bool ret = true;
+ int ret = 0;
spin_lock_irq(&hugetlb_lock);
if (!PageHeadHuge(page) ||
!HPageMigratable(page) ||
!get_page_unless_zero(page)) {
- ret = false;
+ ret = -EBUSY;
goto unlock;
}
ClearHPageMigratable(page);
@@ -7094,14 +7124,11 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
mmu_notifier_invalidate_range_start(&range);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (address = start; address < end; address += PUD_SIZE) {
- unsigned long tmp = address;
-
ptep = huge_pte_offset(mm, address, sz);
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- /* We don't want 'address' to be changed */
- huge_pmd_unshare(mm, vma, &tmp, ptep);
+ huge_pmd_unshare(mm, vma, address, ptep);
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index ba29c15c53d6..1362feb3c6c9 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -10,7 +10,7 @@
*/
#define pr_fmt(fmt) "HugeTLB: " fmt
-#include <linux/memory_hotplug.h>
+#include <linux/memory.h>
#include "hugetlb_vmemmap.h"
/*
@@ -97,18 +97,68 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
return ret;
}
+static unsigned int vmemmap_optimizable_pages(struct hstate *h,
+ struct page *head)
+{
+ if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
+ return 0;
+
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
+ pmd_t *pmdp, pmd;
+ struct page *vmemmap_page;
+ unsigned long vaddr = (unsigned long)head;
+
+ /*
+ * Only the vmemmap page's vmemmap page can be self-hosted.
+ * Walking the page tables to find the backing page of the
+ * vmemmap page.
+ */
+ pmdp = pmd_off_k(vaddr);
+ /*
+ * The READ_ONCE() is used to stabilize *pmdp in a register or
+ * on the stack so that it will stop changing under the code.
+ * The only concurrent operation where it can be changed is
+ * split_vmemmap_huge_pmd() (*pmdp will be stable after this
+ * operation).
+ */
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_leaf(pmd))
+ vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
+ else
+ vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
+ /*
+ * Due to HugeTLB alignment requirements and the vmemmap pages
+ * being at the start of the hotplugged memory region in
+ * memory_hotplug.memmap_on_memory case. Checking any vmemmap
+ * page's vmemmap page if it is marked as VmemmapSelfHosted is
+ * sufficient.
+ *
+ * [ hotplugged memory ]
+ * [ section ][...][ section ]
+ * [ vmemmap ][ usable memory ]
+ * ^ | | |
+ * +---+ | |
+ * ^ | |
+ * +-------+ |
+ * ^ |
+ * +-------------------------------------------+
+ */
+ if (PageVmemmapSelfHosted(vmemmap_page))
+ return 0;
+ }
+
+ return hugetlb_optimize_vmemmap_pages(h);
+}
+
void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
{
unsigned long vmemmap_addr = (unsigned long)head;
unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
- vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
+ vmemmap_pages = vmemmap_optimizable_pages(h, head);
if (!vmemmap_pages)
return;
- if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
- return;
-
static_branch_inc(&hugetlb_optimize_vmemmap_key);
vmemmap_addr += RESERVE_VMEMMAP_SIZE;
@@ -199,10 +249,10 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
static __init int hugetlb_vmemmap_sysctls_init(void)
{
/*
- * If "memory_hotplug.memmap_on_memory" is enabled or "struct page"
- * crosses page boundaries, the vmemmap pages cannot be optimized.
+ * If "struct page" crosses page boundaries, the vmemmap pages cannot
+ * be optimized.
*/
- if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page)))
+ if (is_power_of_2(sizeof(struct page)))
register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
return 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index fbe7844d0912..c9327abb771c 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm_types.h>
-#include <linux/rbtree.h>
+#include <linux/maple_tree.h>
#include <linux/rwsem.h>
#include <linux/spinlock.h>
#include <linux/list.h>
@@ -28,7 +28,7 @@
* and size this cpu_bitmask to NR_CPUS.
*/
struct mm_struct init_mm = {
- .mm_rb = RB_ROOT,
+ .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock),
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
diff --git a/mm/internal.h b/mm/internal.h
index ddd2d6a46f1b..39b6dd400a15 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,7 +24,7 @@ struct folio_batch;
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
__GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
- __GFP_ATOMIC|__GFP_NOLOCKDEP)
+ __GFP_NOLOCKDEP)
/* The GFP flags allowed during early boot */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
@@ -84,8 +84,9 @@ void folio_rotate_reclaimable(struct folio *folio);
bool __folio_end_writeback(struct folio *folio);
void deactivate_file_folio(struct folio *folio);
-void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
- unsigned long floor, unsigned long ceiling);
+void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+ struct vm_area_struct *start_vma, unsigned long floor,
+ unsigned long ceiling);
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
struct zap_details;
@@ -479,9 +480,6 @@ static inline bool is_data_mapping(vm_flags_t flags)
}
/* mm/util.c */
-void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev);
-void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
struct anon_vma *folio_anon_vma(struct folio *folio);
#ifdef CONFIG_MMU
@@ -853,6 +851,7 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid, int *flags);
void free_zone_device_page(struct page *page);
+struct page *migrate_device_page(struct page *page, unsigned int gup_flags);
/*
* mm/gup.c
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index c40c0e7b3b5f..707c3a527fcb 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -343,7 +343,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
if (unlikely(nearest_obj(cache, virt_to_slab(object), object) !=
object)) {
- kasan_report_invalid_free(tagged_object, ip);
+ kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_INVALID_FREE);
return true;
}
@@ -352,7 +352,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
return false;
if (!kasan_byte_accessible(tagged_object)) {
- kasan_report_invalid_free(tagged_object, ip);
+ kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_DOUBLE_FREE);
return true;
}
@@ -377,12 +377,12 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object,
static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
{
if (ptr != page_address(virt_to_head_page(ptr))) {
- kasan_report_invalid_free(ptr, ip);
+ kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE);
return true;
}
if (!kasan_byte_accessible(ptr)) {
- kasan_report_invalid_free(ptr, ip);
+ kasan_report_invalid_free(ptr, ip, KASAN_REPORT_DOUBLE_FREE);
return true;
}
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 9e1b6544bfa8..9ad8eff71b28 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -257,27 +257,37 @@ static void unpoison_vmalloc_pages(const void *addr, u8 tag)
}
}
+static void init_vmalloc_pages(const void *start, unsigned long size)
+{
+ const void *addr;
+
+ for (addr = start; addr < start + size; addr += PAGE_SIZE) {
+ struct page *page = virt_to_page(addr);
+
+ clear_highpage_kasan_tagged(page);
+ }
+}
+
void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
kasan_vmalloc_flags_t flags)
{
u8 tag;
unsigned long redzone_start, redzone_size;
- if (!kasan_vmalloc_enabled())
- return (void *)start;
-
- if (!is_vmalloc_or_module_addr(start))
+ if (!kasan_vmalloc_enabled() || !is_vmalloc_or_module_addr(start)) {
+ if (flags & KASAN_VMALLOC_INIT)
+ init_vmalloc_pages(start, size);
return (void *)start;
+ }
/*
- * Skip unpoisoning and assigning a pointer tag for non-VM_ALLOC
- * mappings as:
+ * Don't tag non-VM_ALLOC mappings, as:
*
* 1. Unlike the software KASAN modes, hardware tag-based KASAN only
* supports tagging physical memory. Therefore, it can only tag a
* single mapping of normal physical pages.
* 2. Hardware tag-based KASAN can only tag memory mapped with special
- * mapping protection bits, see arch_vmalloc_pgprot_modify().
+ * mapping protection bits, see arch_vmap_pgprot_tagged().
* As non-VM_ALLOC mappings can be mapped outside of vmalloc code,
* providing these bits would require tracking all non-VM_ALLOC
* mappers.
@@ -289,15 +299,19 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
*
* For non-VM_ALLOC allocations, page_alloc memory is tagged as usual.
*/
- if (!(flags & KASAN_VMALLOC_VM_ALLOC))
+ if (!(flags & KASAN_VMALLOC_VM_ALLOC)) {
+ WARN_ON(flags & KASAN_VMALLOC_INIT);
return (void *)start;
+ }
/*
* Don't tag executable memory.
* The kernel doesn't tolerate having the PC register tagged.
*/
- if (!(flags & KASAN_VMALLOC_PROT_NORMAL))
+ if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) {
+ WARN_ON(flags & KASAN_VMALLOC_INIT);
return (void *)start;
+ }
tag = kasan_random_tag();
start = set_tag(start, tag);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 610d60d6e5b8..01c03e45acd4 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -125,6 +125,7 @@ static inline bool kasan_sync_fault_possible(void)
enum kasan_report_type {
KASAN_REPORT_ACCESS,
KASAN_REPORT_INVALID_FREE,
+ KASAN_REPORT_DOUBLE_FREE,
};
struct kasan_report_info {
@@ -277,7 +278,7 @@ static inline void kasan_print_address_stack_frame(const void *addr) { }
bool kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
-void kasan_report_invalid_free(void *object, unsigned long ip);
+void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type);
struct page *kasan_addr_to_page(const void *addr);
struct slab *kasan_addr_to_slab(const void *addr);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index b341a191651d..fe3f606b3a98 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -176,8 +176,12 @@ static void end_report(unsigned long *flags, void *addr)
static void print_error_description(struct kasan_report_info *info)
{
if (info->type == KASAN_REPORT_INVALID_FREE) {
- pr_err("BUG: KASAN: double-free or invalid-free in %pS\n",
- (void *)info->ip);
+ pr_err("BUG: KASAN: invalid-free in %pS\n", (void *)info->ip);
+ return;
+ }
+
+ if (info->type == KASAN_REPORT_DOUBLE_FREE) {
+ pr_err("BUG: KASAN: double-free in %pS\n", (void *)info->ip);
return;
}
@@ -433,7 +437,7 @@ static void print_report(struct kasan_report_info *info)
}
}
-void kasan_report_invalid_free(void *ptr, unsigned long ip)
+void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type)
{
unsigned long flags;
struct kasan_report_info info;
@@ -448,7 +452,7 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip)
start_report(&flags, true);
- info.type = KASAN_REPORT_INVALID_FREE;
+ info.type = type;
info.access_addr = ptr;
info.first_bad_addr = kasan_reset_tag(ptr);
info.access_size = 0;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 16be62d493cd..cfe231c5958f 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -147,8 +147,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
return count;
}
static struct kobj_attribute scan_sleep_millisecs_attr =
- __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
- scan_sleep_millisecs_store);
+ __ATTR_RW(scan_sleep_millisecs);
static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -175,8 +174,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
return count;
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
- __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
- alloc_sleep_millisecs_store);
+ __ATTR_RW(alloc_sleep_millisecs);
static ssize_t pages_to_scan_show(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -200,8 +198,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
return count;
}
static struct kobj_attribute pages_to_scan_attr =
- __ATTR(pages_to_scan, 0644, pages_to_scan_show,
- pages_to_scan_store);
+ __ATTR_RW(pages_to_scan);
static ssize_t pages_collapsed_show(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -221,22 +218,21 @@ static ssize_t full_scans_show(struct kobject *kobj,
static struct kobj_attribute full_scans_attr =
__ATTR_RO(full_scans);
-static ssize_t khugepaged_defrag_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t defrag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
return single_hugepage_flag_show(kobj, attr, buf,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
-static ssize_t khugepaged_defrag_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t defrag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
return single_hugepage_flag_store(kobj, attr, buf, count,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static struct kobj_attribute khugepaged_defrag_attr =
- __ATTR(defrag, 0644, khugepaged_defrag_show,
- khugepaged_defrag_store);
+ __ATTR_RW(defrag);
/*
* max_ptes_none controls if khugepaged should collapse hugepages over
@@ -246,21 +242,21 @@ static struct kobj_attribute khugepaged_defrag_attr =
* runs. Increasing max_ptes_none will instead potentially reduce the
* free memory in the system during the khugepaged scan.
*/
-static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
+static ssize_t max_ptes_none_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
{
return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
}
-static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t max_ptes_none_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
int err;
unsigned long max_ptes_none;
err = kstrtoul(buf, 10, &max_ptes_none);
- if (err || max_ptes_none > HPAGE_PMD_NR-1)
+ if (err || max_ptes_none > HPAGE_PMD_NR - 1)
return -EINVAL;
khugepaged_max_ptes_none = max_ptes_none;
@@ -268,25 +264,24 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
return count;
}
static struct kobj_attribute khugepaged_max_ptes_none_attr =
- __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
- khugepaged_max_ptes_none_store);
+ __ATTR_RW(max_ptes_none);
-static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
+static ssize_t max_ptes_swap_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
{
return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
}
-static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t max_ptes_swap_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
int err;
unsigned long max_ptes_swap;
err = kstrtoul(buf, 10, &max_ptes_swap);
- if (err || max_ptes_swap > HPAGE_PMD_NR-1)
+ if (err || max_ptes_swap > HPAGE_PMD_NR - 1)
return -EINVAL;
khugepaged_max_ptes_swap = max_ptes_swap;
@@ -295,25 +290,24 @@ static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
}
static struct kobj_attribute khugepaged_max_ptes_swap_attr =
- __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
- khugepaged_max_ptes_swap_store);
+ __ATTR_RW(max_ptes_swap);
-static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
+static ssize_t max_ptes_shared_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
{
return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
}
-static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t max_ptes_shared_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
int err;
unsigned long max_ptes_shared;
err = kstrtoul(buf, 10, &max_ptes_shared);
- if (err || max_ptes_shared > HPAGE_PMD_NR-1)
+ if (err || max_ptes_shared > HPAGE_PMD_NR - 1)
return -EINVAL;
khugepaged_max_ptes_shared = max_ptes_shared;
@@ -322,8 +316,7 @@ static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
}
static struct kobj_attribute khugepaged_max_ptes_shared_attr =
- __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
- khugepaged_max_ptes_shared_store);
+ __ATTR_RW(max_ptes_shared);
static struct attribute *khugepaged_attr[] = {
&khugepaged_defrag_attr.attr,
@@ -437,43 +430,6 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
-bool hugepage_vma_check(struct vm_area_struct *vma,
- unsigned long vm_flags)
-{
- if (!transhuge_vma_enabled(vma, vm_flags))
- return false;
-
- if (vm_flags & VM_NO_KHUGEPAGED)
- return false;
-
- /* Don't run khugepaged against DAX vma */
- if (vma_is_dax(vma))
- return false;
-
- if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) -
- vma->vm_pgoff, HPAGE_PMD_NR))
- return false;
-
- /* Enabled via shmem mount options or sysfs settings. */
- if (shmem_file(vma->vm_file))
- return shmem_huge_enabled(vma);
-
- /* THP settings require madvise. */
- if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
- return false;
-
- /* Only regular file is valid */
- if (file_thp_enabled(vma))
- return true;
-
- if (!vma->anon_vma || !vma_is_anonymous(vma))
- return false;
- if (vma_is_temporary_stack(vma))
- return false;
-
- return true;
-}
-
void __khugepaged_enter(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
@@ -509,10 +465,8 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
unsigned long vm_flags)
{
if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
- khugepaged_enabled() &&
- (((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
- (vma->vm_end & HPAGE_PMD_MASK))) {
- if (hugepage_vma_check(vma, vm_flags))
+ hugepage_flags_enabled()) {
+ if (hugepage_vma_check(vma, vm_flags, false, false))
__khugepaged_enter(vma->vm_mm);
}
}
@@ -599,7 +553,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
bool writable = false;
- for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+ for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || (pte_present(pteval) &&
@@ -618,7 +572,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
page = vm_normal_page(vma, address, pteval);
- if (unlikely(!page)) {
+ if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out;
}
@@ -762,7 +716,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
list_del(&src_page->lru);
- release_pte_page(src_page);
+ mod_node_page_state(page_pgdat(src_page),
+ NR_ISOLATED_ANON + page_is_file_lru(src_page),
+ -compound_nr(src_page));
+ unlock_page(src_page);
+ free_swap_cache(src_page);
+ putback_lru_page(src_page);
}
}
@@ -802,6 +761,10 @@ static bool khugepaged_scan_abort(int nid)
return false;
}
+#define khugepaged_defrag() \
+ (transparent_hugepage_flags & \
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
+
/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
@@ -899,7 +862,7 @@ static struct page *khugepaged_alloc_hugepage(bool *wait)
khugepaged_alloc_sleep();
} else
count_vm_event(THP_COLLAPSE_ALLOC);
- } while (unlikely(!hpage) && likely(khugepaged_enabled()));
+ } while (unlikely(!hpage) && likely(hugepage_flags_enabled()));
return hpage;
}
@@ -947,7 +910,6 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
struct vm_area_struct **vmap)
{
struct vm_area_struct *vma;
- unsigned long hstart, hend;
if (unlikely(khugepaged_test_exit(mm)))
return SCAN_ANY_PROCESS;
@@ -956,13 +918,17 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
if (!vma)
return SCAN_VMA_NULL;
- hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
- hend = vma->vm_end & HPAGE_PMD_MASK;
- if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ if (!transhuge_vma_suitable(vma, address))
return SCAN_ADDRESS_RANGE;
- if (!hugepage_vma_check(vma, vma->vm_flags))
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false))
return SCAN_VMA_CHECK;
- /* Anon VMA expected */
+ /*
+ * Anon VMA expected, the address may be unmapped then
+ * remapped to file after khugepaged reaquired the mmap_lock.
+ *
+ * hugepage_vma_check may return true for qualified file
+ * vmas.
+ */
if (!vma->anon_vma || !vma_is_anonymous(vma))
return SCAN_VMA_CHECK;
return 0;
@@ -972,8 +938,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
* Bring missing pages in from swap, to complete THP collapse.
* Only done if khugepaged_scan_pmd believes it is worthwhile.
*
- * Called and returns without pte mapped or spinlocks held,
- * but with mmap_lock held to protect against vma changes.
+ * Called and returns without pte mapped or spinlocks held.
+ * Note that if false is returned, mmap_lock will be released.
*/
static bool __collapse_huge_page_swapin(struct mm_struct *mm,
@@ -1000,27 +966,24 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
pte_unmap(vmf.pte);
continue;
}
- swapped_in++;
ret = do_swap_page(&vmf);
- /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
+ /*
+ * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
+ * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
+ * we do not retry here and swap entry will remain in pagetable
+ * resulting in later failure.
+ */
if (ret & VM_FAULT_RETRY) {
- mmap_read_lock(mm);
- if (hugepage_vma_revalidate(mm, haddr, &vma)) {
- /* vma is no longer available, don't continue to swapin */
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
- }
- /* check if the pmd is still valid */
- if (mm_find_pmd(mm, haddr) != pmd) {
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
- }
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
+ return false;
}
if (ret & VM_FAULT_ERROR) {
+ mmap_read_unlock(mm);
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
}
+ swapped_in++;
}
/* Drain LRU add pagevec to remove extra pin on the swapped in pages */
@@ -1086,13 +1049,12 @@ static void collapse_huge_page(struct mm_struct *mm,
}
/*
- * __collapse_huge_page_swapin always returns with mmap_lock locked.
- * If it fails, we release mmap_lock and jump out_nolock.
+ * __collapse_huge_page_swapin will return with mmap_lock released
+ * when it fails. So we jump out_nolock directly in that case.
* Continuing to collapse causes inconsistency.
*/
if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
pmd, referenced)) {
- mmap_read_unlock(mm);
goto out_nolock;
}
@@ -1219,7 +1181,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+ for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (is_swap_pte(pteval)) {
@@ -1267,7 +1229,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
writable = true;
page = vm_normal_page(vma, _address, pteval);
- if (unlikely(!page)) {
+ if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out_unmap;
}
@@ -1309,7 +1271,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
/*
* Check if the page has any GUP (or other external) pins.
*
- * Here the check is racy it may see totmal_mapcount > refcount
+ * Here the check is racy it may see total_mapcount > refcount
* in some cases.
* For example, one process with one forked child process.
* The parent has the PMD split due to MADV_DONTNEED, then
@@ -1382,8 +1344,8 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
* Notify khugepaged that given addr of the mm is pte-mapped THP. Then
* khugepaged should try to collapse the page table.
*/
-static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
- unsigned long addr)
+static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr)
{
struct mm_slot *mm_slot;
@@ -1394,7 +1356,6 @@ static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
spin_unlock(&khugepaged_mm_lock);
- return 0;
}
static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -1426,7 +1387,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
{
unsigned long haddr = addr & HPAGE_PMD_MASK;
- struct vm_area_struct *vma = find_vma(mm, haddr);
+ struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct page *hpage;
pte_t *start_pte, *pte;
pmd_t *pmd;
@@ -1444,7 +1405,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
* the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
* will not fail the vma for missing VM_HUGEPAGE
*/
- if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
+ if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false))
return;
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -1479,7 +1440,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
goto abort;
page = vm_normal_page(vma, addr, *pte);
-
+ if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+ page = NULL;
/*
* Note that uprobe, debugger, or MAP_PRIVATE may change the
* page table, but the new page will not be a subpage of hpage.
@@ -1497,6 +1459,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (pte_none(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
+ if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+ goto abort;
page_remove_rmap(page, vma, false);
}
@@ -1557,7 +1521,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* mmap_write_lock(mm) as PMD-mapping is likely to be split
* later.
*
- * Not that vma->anon_vma check is racy: it can be set up after
+ * Note that vma->anon_vma check is racy: it can be set up after
* the check but before we took mmap_lock by the fault path.
* But page lock would prevent establishing any new ptes of the
* page, so we are safe.
@@ -1885,8 +1849,8 @@ out_unlock:
if (nr_none) {
__mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
- if (is_shmem)
- __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
+ /* nr_none is always 0 for non-shmem. */
+ __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
}
/* Join all the small entries into a single multi-index entry */
@@ -1950,10 +1914,10 @@ xa_unlocked:
/* Something went wrong: roll back page cache changes */
xas_lock_irq(&xas);
- mapping->nrpages -= nr_none;
-
- if (is_shmem)
+ if (nr_none) {
+ mapping->nrpages -= nr_none;
shmem_uncharge(mapping->host, nr_none);
+ }
xas_set(&xas, start);
xas_for_each(&xas, page, end - 1) {
@@ -2092,10 +2056,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
__releases(&khugepaged_mm_lock)
__acquires(&khugepaged_mm_lock)
{
+ struct vma_iterator vmi;
struct mm_slot *mm_slot;
struct mm_struct *mm;
struct vm_area_struct *vma;
int progress = 0;
+ unsigned long address;
VM_BUG_ON(!pages);
lockdep_assert_held(&khugepaged_mm_lock);
@@ -2119,11 +2085,14 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
vma = NULL;
if (unlikely(!mmap_read_trylock(mm)))
goto breakouterloop_mmap_lock;
- if (likely(!khugepaged_test_exit(mm)))
- vma = find_vma(mm, khugepaged_scan.address);
progress++;
- for (; vma; vma = vma->vm_next) {
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto breakouterloop;
+
+ address = khugepaged_scan.address;
+ vma_iter_init(&vmi, mm, address);
+ for_each_vma(vmi, vma) {
unsigned long hstart, hend;
cond_resched();
@@ -2131,22 +2100,18 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
progress++;
break;
}
- if (!hugepage_vma_check(vma, vma->vm_flags)) {
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) {
skip:
progress++;
continue;
}
- hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
- hend = vma->vm_end & HPAGE_PMD_MASK;
- if (hstart >= hend)
- goto skip;
+ hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
+ hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
if (khugepaged_scan.address > hend)
goto skip;
if (khugepaged_scan.address < hstart)
khugepaged_scan.address = hstart;
VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
- if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma))
- goto skip;
while (khugepaged_scan.address < hend) {
int ret;
@@ -2216,7 +2181,7 @@ breakouterloop_mmap_lock:
static int khugepaged_has_work(void)
{
return !list_empty(&khugepaged_scan.mm_head) &&
- khugepaged_enabled();
+ hugepage_flags_enabled();
}
static int khugepaged_wait_event(void)
@@ -2281,7 +2246,7 @@ static void khugepaged_wait_work(void)
return;
}
- if (khugepaged_enabled())
+ if (hugepage_flags_enabled())
wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}
@@ -2312,7 +2277,7 @@ static void set_recommended_min_free_kbytes(void)
int nr_zones = 0;
unsigned long recommended_min;
- if (!khugepaged_enabled()) {
+ if (!hugepage_flags_enabled()) {
calculate_min_free_kbytes();
goto update_wmarks;
}
@@ -2362,7 +2327,7 @@ int start_stop_khugepaged(void)
int err = 0;
mutex_lock(&khugepaged_mutex);
- if (khugepaged_enabled()) {
+ if (hugepage_flags_enabled()) {
if (!khugepaged_thread)
khugepaged_thread = kthread_run(khugepaged, NULL,
"khugepaged");
@@ -2388,7 +2353,7 @@ fail:
void khugepaged_min_free_kbytes_update(void)
{
mutex_lock(&khugepaged_mutex);
- if (khugepaged_enabled() && khugepaged_thread)
+ if (hugepage_flags_enabled() && khugepaged_thread)
set_recommended_min_free_kbytes();
mutex_unlock(&khugepaged_mutex);
}
diff --git a/mm/ksm.c b/mm/ksm.c
index fe3e0a39f73a..75da1d1a4c22 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
do {
cond_resched();
page = follow_page(vma, addr,
- FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
+ FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU);
if (IS_ERR_OR_NULL(page))
break;
if (PageKsm(page))
@@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
if (!vma)
goto out;
- page = follow_page(vma, addr, FOLL_GET);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
if (IS_ERR_OR_NULL(page))
goto out;
if (PageAnon(page)) {
@@ -981,11 +981,13 @@ static int unmerge_and_remove_all_rmap_items(void)
struct mm_slot, mm_list);
spin_unlock(&ksm_mmlist_lock);
- for (mm_slot = ksm_scan.mm_slot;
- mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
+ for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head;
+ mm_slot = ksm_scan.mm_slot) {
+ VMA_ITERATOR(vmi, mm_slot->mm, 0);
+
mm = mm_slot->mm;
mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (ksm_test_exit(mm))
break;
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
@@ -2232,6 +2234,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
struct mm_slot *slot;
struct vm_area_struct *vma;
struct rmap_item *rmap_item;
+ struct vma_iterator vmi;
int nid;
if (list_empty(&ksm_mm_head.mm_list))
@@ -2290,13 +2293,13 @@ next_mm:
}
mm = slot->mm;
+ vma_iter_init(&vmi, mm, ksm_scan.address);
+
mmap_read_lock(mm);
if (ksm_test_exit(mm))
- vma = NULL;
- else
- vma = find_vma(mm, ksm_scan.address);
+ goto no_vmas;
- for (; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!(vma->vm_flags & VM_MERGEABLE))
continue;
if (ksm_scan.address < vma->vm_start)
@@ -2307,7 +2310,7 @@ next_mm:
while (ksm_scan.address < vma->vm_end) {
if (ksm_test_exit(mm))
break;
- *page = follow_page(vma, ksm_scan.address, FOLL_GET);
+ *page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU);
if (IS_ERR_OR_NULL(*page)) {
ksm_scan.address += PAGE_SIZE;
cond_resched();
@@ -2334,6 +2337,7 @@ next_mm:
}
if (ksm_test_exit(mm)) {
+no_vmas:
ksm_scan.address = 0;
ksm_scan.rmap_list = &slot->rmap_list;
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 0316bbc6441b..851fa4e134bc 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -195,7 +195,6 @@ success:
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
unsigned long end, struct mm_walk *walk)
{
- pte_t *orig_pte;
struct vm_area_struct *vma = walk->private;
unsigned long index;
struct swap_iocb *splug = NULL;
@@ -208,12 +207,13 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
swp_entry_t entry;
struct page *page;
spinlock_t *ptl;
+ pte_t *ptep;
- orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
- pte = *(orig_pte + ((index - start) / PAGE_SIZE));
- pte_unmap_unlock(orig_pte, ptl);
+ ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl);
+ pte = *ptep;
+ pte_unmap_unlock(ptep, ptl);
- if (pte_present(pte) || pte_none(pte))
+ if (!is_swap_pte(pte))
continue;
entry = pte_to_swp_entry(pte);
if (unlikely(non_swap_entry(entry)))
@@ -421,7 +421,7 @@ regular_page:
continue;
page = vm_normal_page(vma, addr, ptent);
- if (!page)
+ if (!page || is_zone_device_page(page))
continue;
/*
@@ -639,7 +639,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
}
page = vm_normal_page(vma, addr, ptent);
- if (!page)
+ if (!page || is_zone_device_page(page))
continue;
/*
@@ -1238,7 +1238,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
if (start >= end)
break;
if (prev)
- vma = prev->vm_next;
+ vma = find_vma(mm, prev->vm_end);
else /* madvise_remove dropped mmap_lock */
vma = find_vma(mm, start);
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 655c09393ad5..a04903fd733b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -77,6 +77,7 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);
struct mem_cgroup *root_mem_cgroup __read_mostly;
+static struct obj_cgroup *root_obj_cgroup __read_mostly;
/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
@@ -252,9 +253,14 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
return container_of(vmpr, struct mem_cgroup, vmpressure);
}
-#ifdef CONFIG_MEMCG_KMEM
static DEFINE_SPINLOCK(objcg_lock);
+static inline bool obj_cgroup_is_root(struct obj_cgroup *objcg)
+{
+ return objcg == root_obj_cgroup;
+}
+
+#ifdef CONFIG_MEMCG_KMEM
bool mem_cgroup_kmem_disabled(void)
{
return cgroup_memory_nokmem;
@@ -263,12 +269,10 @@ bool mem_cgroup_kmem_disabled(void)
static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
unsigned int nr_pages);
-static void obj_cgroup_release(struct percpu_ref *ref)
+static void obj_cgroup_release_bytes(struct obj_cgroup *objcg)
{
- struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
unsigned int nr_bytes;
unsigned int nr_pages;
- unsigned long flags;
/*
* At this point all allocated objects are freed, and
@@ -282,9 +286,9 @@ static void obj_cgroup_release(struct percpu_ref *ref)
* 3) CPU1: a process from another memcg is allocating something,
* the stock if flushed,
* objcg->nr_charged_bytes = PAGE_SIZE - 92
- * 5) CPU0: we do release this object,
+ * 4) CPU0: we do release this object,
* 92 bytes are added to stock->nr_bytes
- * 6) CPU0: stock is flushed,
+ * 5) CPU0: stock is flushed,
* 92 bytes are added to objcg->nr_charged_bytes
*
* In the result, nr_charged_bytes == PAGE_SIZE.
@@ -296,6 +300,19 @@ static void obj_cgroup_release(struct percpu_ref *ref)
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
+}
+#else
+static inline void obj_cgroup_release_bytes(struct obj_cgroup *objcg)
+{
+}
+#endif
+
+static void obj_cgroup_release(struct percpu_ref *ref)
+{
+ struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
+ unsigned long flags;
+
+ obj_cgroup_release_bytes(objcg);
spin_lock_irqsave(&objcg_lock, flags);
list_del(&objcg->list);
@@ -324,28 +341,135 @@ static struct obj_cgroup *obj_cgroup_alloc(void)
return objcg;
}
-static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
- struct mem_cgroup *parent)
+static void objcg_reparent_lock(struct mem_cgroup *src, struct mem_cgroup *dst)
+{
+ spin_lock(&objcg_lock);
+}
+
+static void objcg_reparent_relocate(struct mem_cgroup *src, struct mem_cgroup *dst)
{
struct obj_cgroup *objcg, *iter;
- objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
+ objcg = rcu_replace_pointer(src->objcg, NULL, true);
+ /* 1) Ready to reparent active objcg. */
+ list_add(&objcg->list, &src->objcg_list);
+ /* 2) Reparent active objcg and already reparented objcgs to dst. */
+ list_for_each_entry(iter, &src->objcg_list, list)
+ WRITE_ONCE(iter->memcg, dst);
+ /* 3) Move already reparented objcgs to the dst's list */
+ list_splice(&src->objcg_list, &dst->objcg_list);
+}
- spin_lock_irq(&objcg_lock);
+static void objcg_reparent_unlock(struct mem_cgroup *src, struct mem_cgroup *dst)
+{
+ spin_unlock(&objcg_lock);
+}
- /* 1) Ready to reparent active objcg. */
- list_add(&objcg->list, &memcg->objcg_list);
- /* 2) Reparent active objcg and already reparented objcgs to parent. */
- list_for_each_entry(iter, &memcg->objcg_list, list)
- WRITE_ONCE(iter->memcg, parent);
- /* 3) Move already reparented objcgs to the parent's list */
- list_splice(&memcg->objcg_list, &parent->objcg_list);
+static DEFINE_MEMCG_REPARENT_OPS(objcg);
+
+static void lruvec_reparent_lock(struct mem_cgroup *src, struct mem_cgroup *dst)
+{
+ int nid, nest = 0;
+
+ for_each_node(nid) {
+ spin_lock_nested(&mem_cgroup_lruvec(src,
+ NODE_DATA(nid))->lru_lock, nest++);
+ spin_lock_nested(&mem_cgroup_lruvec(dst,
+ NODE_DATA(nid))->lru_lock, nest++);
+ }
+}
+
+static void lruvec_reparent_lru(struct lruvec *src, struct lruvec *dst,
+ enum lru_list lru)
+{
+ int zid;
+ struct mem_cgroup_per_node *mz_src, *mz_dst;
+
+ mz_src = container_of(src, struct mem_cgroup_per_node, lruvec);
+ mz_dst = container_of(dst, struct mem_cgroup_per_node, lruvec);
- spin_unlock_irq(&objcg_lock);
+ if (lru != LRU_UNEVICTABLE)
+ list_splice_tail_init(&src->lists[lru], &dst->lists[lru]);
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz_dst->lru_zone_size[zid][lru] += mz_src->lru_zone_size[zid][lru];
+ mz_src->lru_zone_size[zid][lru] = 0;
+ }
+}
+
+static void lruvec_reparent_relocate(struct mem_cgroup *src, struct mem_cgroup *dst)
+{
+ int nid;
+
+ for_each_node(nid) {
+ enum lru_list lru;
+ struct lruvec *src_lruvec, *dst_lruvec;
+
+ src_lruvec = mem_cgroup_lruvec(src, NODE_DATA(nid));
+ dst_lruvec = mem_cgroup_lruvec(dst, NODE_DATA(nid));
+
+ dst_lruvec->anon_cost += src_lruvec->anon_cost;
+ dst_lruvec->file_cost += src_lruvec->file_cost;
+
+ for_each_lru(lru)
+ lruvec_reparent_lru(src_lruvec, dst_lruvec, lru);
+ }
+}
+
+static void lruvec_reparent_unlock(struct mem_cgroup *src, struct mem_cgroup *dst)
+{
+ int nid;
+
+ for_each_node(nid) {
+ spin_unlock(&mem_cgroup_lruvec(dst, NODE_DATA(nid))->lru_lock);
+ spin_unlock(&mem_cgroup_lruvec(src, NODE_DATA(nid))->lru_lock);
+ }
+}
+
+static DEFINE_MEMCG_REPARENT_OPS(lruvec);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+DECLARE_MEMCG_REPARENT_OPS(thp_sq);
+#endif
+
+/* The lock order depends on the order of elements in this array. */
+static const struct memcg_reparent_ops *memcg_reparent_ops[] = {
+ &memcg_objcg_reparent_ops,
+ &memcg_lruvec_reparent_ops,
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ &memcg_thp_sq_reparent_ops,
+#endif
+};
+
+#define DEFINE_MEMCG_REPARENT_FUNC(phase) \
+ static void memcg_reparent_##phase(struct mem_cgroup *src, \
+ struct mem_cgroup *dst) \
+ { \
+ int i; \
+ \
+ for (i = 0; i < ARRAY_SIZE(memcg_reparent_ops); i++) \
+ memcg_reparent_ops[i]->phase(src, dst); \
+ }
+
+DEFINE_MEMCG_REPARENT_FUNC(lock)
+DEFINE_MEMCG_REPARENT_FUNC(relocate)
+DEFINE_MEMCG_REPARENT_FUNC(unlock)
+
+static void memcg_reparent_objcgs(struct mem_cgroup *src)
+{
+ struct mem_cgroup *dst = parent_mem_cgroup(src);
+ struct obj_cgroup *objcg = rcu_dereference_protected(src->objcg, true);
+
+ local_irq_disable();
+ memcg_reparent_lock(src, dst);
+ memcg_reparent_relocate(src, dst);
+ memcg_reparent_unlock(src, dst);
+ local_irq_enable();
percpu_ref_kill(&objcg->refcnt);
}
+#ifdef CONFIG_MEMCG_KMEM
/*
* A lot of the calls to the cache allocation functions are expected to be
* inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
@@ -357,7 +481,7 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
#endif
/**
- * mem_cgroup_css_from_page - css of the memcg associated with a page
+ * get_mem_cgroup_css_from_page - get css of the memcg associated with a page
* @page: page of interest
*
* If memcg is bound to the default hierarchy, css of the memcg associated
@@ -367,13 +491,15 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
* If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
* is returned.
*/
-struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
+struct cgroup_subsys_state *get_mem_cgroup_css_from_page(struct page *page)
{
struct mem_cgroup *memcg;
- memcg = page_memcg(page);
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return &root_mem_cgroup->css;
- if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ memcg = get_mem_cgroup_from_page(page);
+ if (!memcg)
memcg = root_mem_cgroup;
return &memcg->css;
@@ -756,13 +882,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
int val)
{
- struct page *head = compound_head(page); /* rmap on tail pages */
+ struct folio *folio = page_folio(page); /* rmap on tail pages */
struct mem_cgroup *memcg;
pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
rcu_read_lock();
- memcg = page_memcg(head);
+ memcg = folio_memcg(folio);
/* Untracked pages have no memcg, no lruvec. Update only the node */
if (!memcg) {
rcu_read_unlock();
@@ -1183,23 +1309,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
return ret;
}
-#ifdef CONFIG_DEBUG_VM
-void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
-{
- struct mem_cgroup *memcg;
-
- if (mem_cgroup_disabled())
- return;
-
- memcg = folio_memcg(folio);
-
- if (!memcg)
- VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio);
- else
- VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
-}
-#endif
-
/**
* folio_lruvec_lock - Lock the lruvec for a folio.
* @folio: Pointer to the folio.
@@ -1214,10 +1323,18 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
*/
struct lruvec *folio_lruvec_lock(struct folio *folio)
{
- struct lruvec *lruvec = folio_lruvec(folio);
+ struct lruvec *lruvec;
+ rcu_read_lock();
+retry:
+ lruvec = folio_lruvec(folio);
spin_lock(&lruvec->lru_lock);
- lruvec_memcg_debug(lruvec, folio);
+
+ if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) {
+ spin_unlock(&lruvec->lru_lock);
+ goto retry;
+ }
+ rcu_read_unlock();
return lruvec;
}
@@ -1237,10 +1354,18 @@ struct lruvec *folio_lruvec_lock(struct folio *folio)
*/
struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
{
- struct lruvec *lruvec = folio_lruvec(folio);
+ struct lruvec *lruvec;
+ rcu_read_lock();
+retry:
+ lruvec = folio_lruvec(folio);
spin_lock_irq(&lruvec->lru_lock);
- lruvec_memcg_debug(lruvec, folio);
+
+ if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ goto retry;
+ }
+ rcu_read_unlock();
return lruvec;
}
@@ -1262,10 +1387,18 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
unsigned long *flags)
{
- struct lruvec *lruvec = folio_lruvec(folio);
+ struct lruvec *lruvec;
+ rcu_read_lock();
+retry:
+ lruvec = folio_lruvec(folio);
spin_lock_irqsave(&lruvec->lru_lock, *flags);
- lruvec_memcg_debug(lruvec, folio);
+
+ if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) {
+ spin_unlock_irqrestore(&lruvec->lru_lock, *flags);
+ goto retry;
+ }
+ rcu_read_unlock();
return lruvec;
}
@@ -2037,7 +2170,9 @@ void folio_memcg_lock(struct folio *folio)
* The RCU lock is held throughout the transaction. The fast
* path can get away without acquiring the memcg->move_lock
* because page moving starts with an RCU grace period.
- */
+ *
+ * The RCU lock also protects the memcg from being freed.
+ */
rcu_read_lock();
if (mem_cgroup_disabled())
@@ -2766,18 +2901,33 @@ static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages
page_counter_uncharge(&memcg->memsw, nr_pages);
}
-static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
+static void commit_charge(struct folio *folio, struct obj_cgroup *objcg)
{
- VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
+ VM_BUG_ON_FOLIO(folio_objcg(folio), folio);
/*
- * Any of the following ensures page's memcg stability:
+ * Any of the following ensures page's objcg stability:
*
* - the page lock
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
*/
- folio->memcg_data = (unsigned long)memcg;
+ folio->memcg_data = (unsigned long)objcg;
+}
+
+static struct obj_cgroup *get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
+{
+ struct obj_cgroup *objcg = NULL;
+
+ rcu_read_lock();
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ objcg = rcu_dereference(memcg->objcg);
+ if (objcg && obj_cgroup_tryget(objcg))
+ break;
+ }
+ rcu_read_unlock();
+
+ return objcg;
}
#ifdef CONFIG_MEMCG_KMEM
@@ -2923,19 +3073,6 @@ struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
}
-static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
-{
- struct obj_cgroup *objcg = NULL;
-
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
- objcg = rcu_dereference(memcg->objcg);
- if (objcg && obj_cgroup_tryget(objcg))
- break;
- objcg = NULL;
- }
- return objcg;
-}
-
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
{
struct obj_cgroup *objcg = NULL;
@@ -2949,7 +3086,16 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
memcg = active_memcg();
else
memcg = mem_cgroup_from_task(current);
- objcg = __get_obj_cgroup_from_memcg(memcg);
+
+ if (mem_cgroup_is_root(memcg))
+ goto out;
+
+ objcg = get_obj_cgroup_from_memcg(memcg);
+ if (obj_cgroup_is_root(objcg)) {
+ obj_cgroup_put(objcg);
+ objcg = NULL;
+ }
+out:
rcu_read_unlock();
return objcg;
}
@@ -2961,20 +3107,10 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
if (!memcg_kmem_enabled() || memcg_kmem_bypass())
return NULL;
- if (PageMemcgKmem(page)) {
- objcg = __folio_objcg(page_folio(page));
+ objcg = folio_objcg(page_folio(page));
+ if (objcg)
obj_cgroup_get(objcg);
- } else {
- struct mem_cgroup *memcg;
- rcu_read_lock();
- memcg = __folio_memcg(page_folio(page));
- if (memcg)
- objcg = __get_obj_cgroup_from_memcg(memcg);
- else
- objcg = NULL;
- rcu_read_unlock();
- }
return objcg;
}
@@ -3069,13 +3205,13 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
struct folio *folio = page_folio(page);
- struct obj_cgroup *objcg;
+ struct obj_cgroup *objcg = folio_objcg(folio);
unsigned int nr_pages = 1 << order;
- if (!folio_memcg_kmem(folio))
+ if (!objcg)
return;
- objcg = __folio_objcg(folio);
+ VM_BUG_ON_FOLIO(!folio_memcg_kmem(folio), folio);
obj_cgroup_uncharge_pages(objcg, nr_pages);
folio->memcg_data = 0;
obj_cgroup_put(objcg);
@@ -3329,24 +3465,21 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
#endif /* CONFIG_MEMCG_KMEM */
/*
- * Because page_memcg(head) is not set on tails, set it now.
+ * Because page_objcg(head) is not set on tails, set it now.
*/
void split_page_memcg(struct page *head, unsigned int nr)
{
struct folio *folio = page_folio(head);
- struct mem_cgroup *memcg = folio_memcg(folio);
+ struct obj_cgroup *objcg = folio_objcg(folio);
int i;
- if (mem_cgroup_disabled() || !memcg)
+ if (mem_cgroup_disabled() || !objcg)
return;
for (i = 1; i < nr; i++)
folio_page(folio, i)->memcg_data = folio->memcg_data;
- if (folio_memcg_kmem(folio))
- obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
- else
- css_get_many(&memcg->css, nr - 1);
+ obj_cgroup_get_many(objcg, nr - 1);
}
#ifdef CONFIG_MEMCG_SWAP
@@ -3651,21 +3784,12 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
- struct obj_cgroup *objcg;
-
- if (cgroup_memory_nokmem)
+ if (mem_cgroup_kmem_disabled())
return 0;
if (unlikely(mem_cgroup_is_root(memcg)))
return 0;
- objcg = obj_cgroup_alloc();
- if (!objcg)
- return -ENOMEM;
-
- objcg->memcg = memcg;
- rcu_assign_pointer(memcg->objcg, objcg);
-
static_branch_enable(&memcg_kmem_enabled_key);
memcg->kmemcg_id = memcg->id.id;
@@ -3675,27 +3799,13 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
- struct mem_cgroup *parent;
-
- if (cgroup_memory_nokmem)
+ if (mem_cgroup_kmem_disabled())
return;
if (unlikely(mem_cgroup_is_root(memcg)))
return;
- parent = parent_mem_cgroup(memcg);
- if (!parent)
- parent = root_mem_cgroup;
-
- memcg_reparent_objcgs(memcg, parent);
-
- /*
- * After we have finished memcg_reparent_objcgs(), all list_lrus
- * corresponding to this cgroup are guaranteed to remain empty.
- * The ordering is imposed by list_lru_node->lock taken by
- * memcg_reparent_list_lrus().
- */
- memcg_reparent_list_lrus(memcg, parent);
+ memcg_reparent_list_lrus(memcg, parent_mem_cgroup(memcg));
}
#else
static int memcg_online_kmem(struct mem_cgroup *memcg)
@@ -4562,7 +4672,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
struct bdi_writeback *wb)
{
- struct mem_cgroup *memcg = folio_memcg(folio);
+ struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
struct memcg_cgwb_frn *frn;
u64 now = get_jiffies_64();
u64 oldest_at = now;
@@ -4609,6 +4719,7 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
frn->memcg_id = wb->memcg_css->id;
frn->at = now;
}
+ css_put(&memcg->css);
}
/* issue foreign writeback flushes for recorded foreign dirtying events */
@@ -5088,6 +5199,29 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
return idr_find(&mem_cgroup_idr, id);
}
+#ifdef CONFIG_SHRINKER_DEBUG
+struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
+{
+ struct cgroup *cgrp;
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *memcg;
+
+ cgrp = cgroup_get_from_id(ino);
+ if (!cgrp)
+ return ERR_PTR(-ENOENT);
+
+ css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
+ if (css)
+ memcg = container_of(css, struct mem_cgroup, css);
+ else
+ memcg = ERR_PTR(-ENOENT);
+
+ cgroup_put(cgrp);
+
+ return memcg;
+}
+#endif
+
static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
@@ -5152,6 +5286,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL);
if (memcg->id.id < 0) {
error = memcg->id.id;
+ if (error == -ENOSPC)
+ pr_notice_ratelimited("mem_cgroup_id space is exhausted\n");
goto fail;
}
@@ -5177,8 +5313,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memcg->socket_pressure = jiffies;
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
- INIT_LIST_HEAD(&memcg->objcg_list);
#endif
+ INIT_LIST_HEAD(&memcg->objcg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
@@ -5243,6 +5379,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct obj_cgroup *objcg;
if (memcg_online_kmem(memcg))
goto remove_id;
@@ -5255,6 +5392,16 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (alloc_shrinker_info(memcg))
goto offline_kmem;
+ objcg = obj_cgroup_alloc();
+ if (!objcg)
+ goto free_shrinker;
+
+ objcg->memcg = memcg;
+ rcu_assign_pointer(memcg->objcg, objcg);
+
+ if (unlikely(mem_cgroup_is_root(memcg)))
+ root_obj_cgroup = objcg;
+
/* Online state pins memcg ID, memcg ID pins CSS */
refcount_set(&memcg->id.ref, 1);
css_get(css);
@@ -5263,6 +5410,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
2UL*HZ);
return 0;
+free_shrinker:
+ free_shrinker_info(memcg);
offline_kmem:
memcg_offline_kmem(memcg);
remove_id:
@@ -5290,6 +5439,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
+ memcg_reparent_objcgs(memcg);
memcg_offline_kmem(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
@@ -5656,10 +5806,12 @@ static int mem_cgroup_move_account(struct page *page,
*/
smp_mb();
- css_get(&to->css);
- css_put(&from->css);
+ rcu_read_lock();
+ obj_cgroup_get(rcu_dereference(to->objcg));
+ obj_cgroup_put(rcu_dereference(from->objcg));
+ rcu_read_unlock();
- folio->memcg_data = (unsigned long)to;
+ folio->memcg_data = (unsigned long)rcu_access_pointer(to->objcg);
__folio_memcg_unlock(from);
@@ -5693,8 +5845,8 @@ out:
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
- * (so ZONE_DEVICE page and thus not on the lru).
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and
+ * thus not on the lru.
* For now we such page is charge like a regular page would be as for all
* intent and purposes it is just special memory taking the place of a
* regular page.
@@ -5732,7 +5884,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/
if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
- if (is_device_private_page(page))
+ if (is_device_private_page(page) ||
+ is_device_coherent_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
@@ -5833,7 +5986,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
unsigned long precharge;
mmap_read_lock(mm);
- walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
+ walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL);
mmap_read_unlock(mm);
precharge = mc.precharge;
@@ -6131,13 +6284,55 @@ retry:
* When we have consumed all precharges and failed in doing
* additional charge, the page walk just aborts.
*/
- walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
- NULL);
-
+ walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL);
mmap_read_unlock(mc.mm);
atomic_dec(&mc.from->moving_account);
+
+ /*
+ * Moving its pages to another memcg is finished. Wait for already
+ * started RCU-only updates to finish to make sure that the caller
+ * of lock_page_memcg() can unlock the correct move_lock. The
+ * possible bad scenario would like:
+ *
+ * CPU0: CPU1:
+ * mem_cgroup_move_charge()
+ * walk_page_range()
+ *
+ * lock_page_memcg(page)
+ * memcg = folio_memcg()
+ * spin_lock_irqsave(&memcg->move_lock)
+ * memcg->move_lock_task = current
+ *
+ * atomic_dec(&mc.from->moving_account)
+ *
+ * mem_cgroup_css_offline()
+ * memcg_offline_kmem()
+ * memcg_reparent_objcgs() <== reparented
+ *
+ * unlock_page_memcg(page)
+ * memcg = folio_memcg() <== memcg has been changed
+ * if (memcg->move_lock_task == current) <== false
+ * spin_unlock_irqrestore(&memcg->move_lock)
+ *
+ * Once mem_cgroup_move_charge() returns (it means that the cgroup_mutex
+ * would be released soon), the page can be reparented to its parent
+ * memcg. When the unlock_page_memcg() is called for the page, we will
+ * miss unlock the move_lock. So using synchronize_rcu to wait for
+ * already started RCU-only updates to finish before this function
+ * returns (mem_cgroup_move_charge() and mem_cgroup_css_offline() are
+ * serialized by cgroup_mutex).
+ */
+ synchronize_rcu();
}
+/*
+ * The cgroup migration and memory cgroup offlining are serialized by
+ * @cgroup_mutex. If we reach here, it means that the LRU pages cannot
+ * be reparented to its parent memory cgroup. So during the whole process
+ * of mem_cgroup_move_task(), page_memcg(page) is stable. So we do not
+ * need to worry about the memcg (returned from page_memcg()) being
+ * released even if we do not hold an rcu read lock.
+ */
static void mem_cgroup_move_task(void)
{
if (mc.to) {
@@ -6742,21 +6937,26 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
gfp_t gfp)
{
+ struct obj_cgroup *objcg;
long nr_pages = folio_nr_pages(folio);
- int ret;
+ int ret = 0;
- ret = try_charge(memcg, gfp, nr_pages);
+ objcg = get_obj_cgroup_from_memcg(memcg);
+ /* Do not account at the root objcg level. */
+ if (!obj_cgroup_is_root(objcg))
+ ret = try_charge(memcg, gfp, nr_pages);
if (ret)
goto out;
- css_get(&memcg->css);
- commit_charge(folio, memcg);
+ obj_cgroup_get(objcg);
+ commit_charge(folio, objcg);
local_irq_disable();
mem_cgroup_charge_statistics(memcg, nr_pages);
memcg_check_events(memcg, folio_nid(folio));
local_irq_enable();
out:
+ obj_cgroup_put(objcg);
return ret;
}
@@ -6842,7 +7042,7 @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
}
struct uncharge_gather {
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
unsigned long nr_memory;
unsigned long pgpgout;
unsigned long nr_kmem;
@@ -6857,63 +7057,56 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
static void uncharge_batch(const struct uncharge_gather *ug)
{
unsigned long flags;
+ struct mem_cgroup *memcg;
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(ug->objcg);
if (ug->nr_memory) {
- page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
+ page_counter_uncharge(&memcg->memory, ug->nr_memory);
if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
+ page_counter_uncharge(&memcg->memsw, ug->nr_memory);
if (ug->nr_kmem)
- memcg_account_kmem(ug->memcg, -ug->nr_kmem);
- memcg_oom_recover(ug->memcg);
+ memcg_account_kmem(memcg, -ug->nr_kmem);
+ memcg_oom_recover(memcg);
}
local_irq_save(flags);
- __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
- memcg_check_events(ug->memcg, ug->nid);
+ __count_memcg_events(memcg, PGPGOUT, ug->pgpgout);
+ __this_cpu_add(memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
+ memcg_check_events(memcg, ug->nid);
local_irq_restore(flags);
+ rcu_read_unlock();
/* drop reference from uncharge_folio */
- css_put(&ug->memcg->css);
+ obj_cgroup_put(ug->objcg);
}
static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
{
long nr_pages;
- struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
/*
* Nobody should be changing or seriously looking at
- * folio memcg or objcg at this point, we have fully
- * exclusive access to the folio.
+ * folio objcg at this point, we have fully exclusive
+ * access to the folio.
*/
- if (folio_memcg_kmem(folio)) {
- objcg = __folio_objcg(folio);
- /*
- * This get matches the put at the end of the function and
- * kmem pages do not hold memcg references anymore.
- */
- memcg = get_mem_cgroup_from_objcg(objcg);
- } else {
- memcg = __folio_memcg(folio);
- }
-
- if (!memcg)
+ objcg = folio_objcg(folio);
+ if (!objcg)
return;
- if (ug->memcg != memcg) {
- if (ug->memcg) {
+ if (ug->objcg != objcg) {
+ if (ug->objcg) {
uncharge_batch(ug);
uncharge_gather_clear(ug);
}
- ug->memcg = memcg;
+ ug->objcg = objcg;
ug->nid = folio_nid(folio);
- /* pairs with css_put in uncharge_batch */
- css_get(&memcg->css);
+ /* pairs with obj_cgroup_put in uncharge_batch */
+ obj_cgroup_get(objcg);
}
nr_pages = folio_nr_pages(folio);
@@ -6921,19 +7114,15 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
if (folio_memcg_kmem(folio)) {
ug->nr_memory += nr_pages;
ug->nr_kmem += nr_pages;
-
- folio->memcg_data = 0;
- obj_cgroup_put(objcg);
} else {
/* LRU pages aren't accounted at the root level */
- if (!mem_cgroup_is_root(memcg))
+ if (!obj_cgroup_is_root(objcg))
ug->nr_memory += nr_pages;
ug->pgpgout++;
-
- folio->memcg_data = 0;
}
- css_put(&memcg->css);
+ folio->memcg_data = 0;
+ obj_cgroup_put(objcg);
}
void __mem_cgroup_uncharge(struct folio *folio)
@@ -6941,7 +7130,7 @@ void __mem_cgroup_uncharge(struct folio *folio)
struct uncharge_gather ug;
/* Don't touch folio->lru of any random page, pre-check: */
- if (!folio_memcg(folio))
+ if (!folio_objcg(folio))
return;
uncharge_gather_clear(&ug);
@@ -6964,7 +7153,7 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list)
uncharge_gather_clear(&ug);
list_for_each_entry(folio, page_list, lru)
uncharge_folio(folio, &ug);
- if (ug.memcg)
+ if (ug.objcg)
uncharge_batch(&ug);
}
@@ -6981,6 +7170,7 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list)
void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
long nr_pages = folio_nr_pages(new);
unsigned long flags;
@@ -6993,28 +7183,33 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
return;
/* Page cache replacement: new folio already charged? */
- if (folio_memcg(new))
+ if (folio_objcg(new))
return;
- memcg = folio_memcg(old);
- VM_WARN_ON_ONCE_FOLIO(!memcg, old);
- if (!memcg)
+ objcg = folio_objcg(old);
+ VM_WARN_ON_ONCE_FOLIO(!objcg, old);
+ if (!objcg)
return;
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+
/* Force-charge the new page. The old one will be freed soon */
- if (!mem_cgroup_is_root(memcg)) {
+ if (!obj_cgroup_is_root(objcg)) {
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
}
- css_get(&memcg->css);
- commit_charge(new, memcg);
+ obj_cgroup_get(objcg);
+ commit_charge(new, objcg);
local_irq_save(flags);
mem_cgroup_charge_statistics(memcg, nr_pages);
memcg_check_events(memcg, folio_nid(new));
local_irq_restore(flags);
+
+ rcu_read_unlock();
}
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
@@ -7173,8 +7368,6 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
break;
}
memcg = parent_mem_cgroup(memcg);
- if (!memcg)
- memcg = root_mem_cgroup;
}
return memcg;
}
@@ -7189,6 +7382,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
{
struct mem_cgroup *memcg, *swap_memcg;
+ struct obj_cgroup *objcg;
unsigned int nr_entries;
unsigned short oldid;
@@ -7201,13 +7395,18 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
- memcg = folio_memcg(folio);
-
- VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
- if (!memcg)
+ objcg = folio_objcg(folio);
+ VM_WARN_ON_ONCE_FOLIO(!objcg, folio);
+ if (!objcg)
return;
/*
+ * Interrupts should be disabled by the caller (see the comments below),
+ * which can serve as RCU read-side critical sections.
+ */
+ memcg = obj_cgroup_memcg(objcg);
+
+ /*
* In case the memcg owning these pages has been offlined and doesn't
* have an ID allocated to it anymore, charge the closest online
* ancestor for the swap instead and transfer the memory+swap charge.
@@ -7224,7 +7423,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
folio->memcg_data = 0;
- if (!mem_cgroup_is_root(memcg))
+ if (!obj_cgroup_is_root(objcg))
page_counter_uncharge(&memcg->memory, nr_entries);
if (!cgroup_memory_noswap && memcg != swap_memcg) {
@@ -7244,7 +7443,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
memcg_stats_unlock();
memcg_check_events(memcg, folio_nid(folio));
- css_put(&memcg->css);
+ obj_cgroup_put(objcg);
}
/**
@@ -7262,19 +7461,21 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
struct page_counter *counter;
struct mem_cgroup *memcg;
unsigned short oldid;
+ int ret = 0;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
+ rcu_read_lock();
memcg = folio_memcg(folio);
VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
if (!memcg)
- return 0;
+ goto out;
if (!entry.val) {
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
- return 0;
+ goto out;
}
memcg = mem_cgroup_id_get_online(memcg);
@@ -7284,7 +7485,8 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
memcg_memory_event(memcg, MEMCG_SWAP_MAX);
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
mem_cgroup_id_put(memcg);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
/* Get references for the tail pages, too */
@@ -7293,8 +7495,10 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
+out:
+ rcu_read_unlock();
- return 0;
+ return ret;
}
/**
@@ -7339,6 +7543,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
bool mem_cgroup_swap_full(struct page *page)
{
struct mem_cgroup *memcg;
+ bool ret = false;
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -7347,19 +7552,24 @@ bool mem_cgroup_swap_full(struct page *page)
if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
return false;
+ rcu_read_lock();
memcg = page_memcg(page);
if (!memcg)
- return false;
+ goto out;
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
unsigned long usage = page_counter_read(&memcg->swap);
if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
- usage * 2 >= READ_ONCE(memcg->swap.max))
- return true;
+ usage * 2 >= READ_ONCE(memcg->swap.max)) {
+ ret = true;
+ goto out;
+ }
}
+out:
+ rcu_read_unlock();
- return false;
+ return ret;
}
static int __init setup_swap_account(char *s)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index da39ec8afca8..001fead45f30 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -297,10 +297,9 @@ void shake_page(struct page *p)
}
EXPORT_SYMBOL_GPL(shake_page);
-static unsigned long dev_pagemap_mapping_shift(struct page *page,
- struct vm_area_struct *vma)
+static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
+ unsigned long address)
{
- unsigned long address = vma_address(page, vma);
unsigned long ret = 0;
pgd_t *pgd;
p4d_t *p4d;
@@ -340,10 +339,14 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
/*
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
+ *
+ * Notice: @fsdax_pgoff is used only when @p is a fsdax page.
+ * In other cases, such as anonymous and file-backend page, the address to be
+ * killed can be caculated by @p itself.
*/
static void add_to_kill(struct task_struct *tsk, struct page *p,
- struct vm_area_struct *vma,
- struct list_head *to_kill)
+ pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
+ struct list_head *to_kill)
{
struct to_kill *tk;
@@ -354,9 +357,15 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
}
tk->addr = page_address_in_vma(p, vma);
- if (is_zone_device_page(p))
- tk->size_shift = dev_pagemap_mapping_shift(p, vma);
- else
+ if (is_zone_device_page(p)) {
+ /*
+ * Since page->mapping is not used for fsdax, we need
+ * calculate the address based on the vma.
+ */
+ if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
+ tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
+ tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
+ } else
tk->size_shift = page_shift(compound_head(p));
/*
@@ -505,7 +514,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
if (!page_mapped_in_vma(page, vma))
continue;
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ add_to_kill(t, page, 0, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -541,12 +550,40 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* to be informed of all such data corruptions.
*/
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ add_to_kill(t, page, 0, vma, to_kill);
+ }
+ }
+ read_unlock(&tasklist_lock);
+ i_mmap_unlock_read(mapping);
+}
+
+#ifdef CONFIG_FS_DAX
+/*
+ * Collect processes when the error hit a fsdax page.
+ */
+static void collect_procs_fsdax(struct page *page,
+ struct address_space *mapping, pgoff_t pgoff,
+ struct list_head *to_kill)
+{
+ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+
+ i_mmap_lock_read(mapping);
+ read_lock(&tasklist_lock);
+ for_each_process(tsk) {
+ struct task_struct *t = task_early_kill(tsk, true);
+
+ if (!t)
+ continue;
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ if (vma->vm_mm == t->mm)
+ add_to_kill(t, page, pgoff, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
i_mmap_unlock_read(mapping);
}
+#endif /* CONFIG_FS_DAX */
/*
* Collect the processes who have the corrupted page mapped to kill.
@@ -1007,12 +1044,13 @@ static int me_swapcache_dirty(struct page_state *ps, struct page *p)
static int me_swapcache_clean(struct page_state *ps, struct page *p)
{
+ struct folio *folio = page_folio(p);
int ret;
- delete_from_swap_cache(p);
+ delete_from_swap_cache(folio);
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
- unlock_page(p);
+ folio_unlock(folio);
if (has_extra_refcount(ps, p, false))
ret = MF_FAILED;
@@ -1498,6 +1536,134 @@ static int try_to_split_thp_page(struct page *page, const char *msg)
return 0;
}
+static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
+ struct address_space *mapping, pgoff_t index, int flags)
+{
+ struct to_kill *tk;
+ unsigned long size = 0;
+
+ list_for_each_entry(tk, to_kill, nd)
+ if (tk->size_shift)
+ size = max(size, 1UL << tk->size_shift);
+
+ if (size) {
+ /*
+ * Unmap the largest mapping to avoid breaking up device-dax
+ * mappings which are constant size. The actual size of the
+ * mapping being torn down is communicated in siginfo, see
+ * kill_proc()
+ */
+ loff_t start = (index << PAGE_SHIFT) & ~(size - 1);
+
+ unmap_mapping_range(mapping, start, size, 0);
+ }
+
+ kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
+}
+
+static int mf_generic_kill_procs(unsigned long long pfn, int flags,
+ struct dev_pagemap *pgmap)
+{
+ struct page *page = pfn_to_page(pfn);
+ LIST_HEAD(to_kill);
+ dax_entry_t cookie;
+ int rc = 0;
+
+ /*
+ * Pages instantiated by device-dax (not filesystem-dax)
+ * may be compound pages.
+ */
+ page = compound_head(page);
+
+ /*
+ * Prevent the inode from being freed while we are interrogating
+ * the address_space, typically this would be handled by
+ * lock_page(), but dax pages do not use the page lock. This
+ * also prevents changes to the mapping of this pfn until
+ * poison signaling is complete.
+ */
+ cookie = dax_lock_page(page);
+ if (!cookie)
+ return -EBUSY;
+
+ if (hwpoison_filter(page)) {
+ rc = -EOPNOTSUPP;
+ goto unlock;
+ }
+
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_COHERENT:
+ /*
+ * TODO: Handle device pages which may need coordination
+ * with device-side memory.
+ */
+ rc = -ENXIO;
+ goto unlock;
+ default:
+ break;
+ }
+
+ /*
+ * Use this flag as an indication that the dax page has been
+ * remapped UC to prevent speculative consumption of poison.
+ */
+ SetPageHWPoison(page);
+
+ /*
+ * Unlike System-RAM there is no possibility to swap in a
+ * different physical page at a given virtual address, so all
+ * userspace consumption of ZONE_DEVICE memory necessitates
+ * SIGBUS (i.e. MF_MUST_KILL)
+ */
+ flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+ collect_procs(page, &to_kill, true);
+
+ unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags);
+unlock:
+ dax_unlock_page(page, cookie);
+ return rc;
+}
+
+#ifdef CONFIG_FS_DAX
+/**
+ * mf_dax_kill_procs - Collect and kill processes who are using this file range
+ * @mapping: address_space of the file in use
+ * @index: start pgoff of the range within the file
+ * @count: length of the range, in unit of PAGE_SIZE
+ * @mf_flags: memory failure flags
+ */
+int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
+ unsigned long count, int mf_flags)
+{
+ LIST_HEAD(to_kill);
+ dax_entry_t cookie;
+ struct page *page;
+ size_t end = index + count;
+
+ mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+
+ for (; index < end; index++) {
+ page = NULL;
+ cookie = dax_lock_mapping_entry(mapping, index, &page);
+ if (!cookie)
+ return -EBUSY;
+ if (!page)
+ goto unlock;
+
+ SetPageHWPoison(page);
+
+ collect_procs_fsdax(page, mapping, index, &to_kill);
+ unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
+ index, mf_flags);
+unlock:
+ dax_unlock_mapping_entry(mapping, index, cookie);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
+#endif /* CONFIG_FS_DAX */
+
/*
* Called from hugetlb code with hugetlb_lock held.
*
@@ -1633,23 +1799,27 @@ out:
unlock_page(head);
return res;
}
+
#else
+
+static inline int mf_generic_kill_procs(unsigned long long pfn, int flags,
+ struct dev_pagemap *pgmap)
+{
+ return 0;
+}
+
static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
{
return 0;
}
-#endif
+
+#endif /* CONFIG_HUGETLB_PAGE */
static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
struct dev_pagemap *pgmap)
{
struct page *page = pfn_to_page(pfn);
- unsigned long size = 0;
- struct to_kill *tk;
- LIST_HEAD(tokill);
- int rc = -EBUSY;
- loff_t start;
- dax_entry_t cookie;
+ int rc = -ENXIO;
if (flags & MF_COUNT_INCREASED)
/*
@@ -1658,73 +1828,24 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
put_page(page);
/* device metadata space is not recoverable */
- if (!pgmap_pfn_valid(pgmap, pfn)) {
- rc = -ENXIO;
+ if (!pgmap_pfn_valid(pgmap, pfn))
goto out;
- }
/*
- * Pages instantiated by device-dax (not filesystem-dax)
- * may be compound pages.
+ * Call driver's implementation to handle the memory failure, otherwise
+ * fall back to generic handler.
*/
- page = compound_head(page);
-
- /*
- * Prevent the inode from being freed while we are interrogating
- * the address_space, typically this would be handled by
- * lock_page(), but dax pages do not use the page lock. This
- * also prevents changes to the mapping of this pfn until
- * poison signaling is complete.
- */
- cookie = dax_lock_page(page);
- if (!cookie)
- goto out;
-
- if (hwpoison_filter(page)) {
- rc = -EOPNOTSUPP;
- goto unlock;
- }
-
- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ if (pgmap->ops->memory_failure) {
+ rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
/*
- * TODO: Handle HMM pages which may need coordination
- * with device-side memory.
+ * Fall back to generic handler too if operation is not
+ * supported inside the driver/device/filesystem.
*/
- goto unlock;
+ if (rc != -EOPNOTSUPP)
+ goto out;
}
- /*
- * Use this flag as an indication that the dax page has been
- * remapped UC to prevent speculative consumption of poison.
- */
- SetPageHWPoison(page);
-
- /*
- * Unlike System-RAM there is no possibility to swap in a
- * different physical page at a given virtual address, so all
- * userspace consumption of ZONE_DEVICE memory necessitates
- * SIGBUS (i.e. MF_MUST_KILL)
- */
- flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
- collect_procs(page, &tokill, true);
-
- list_for_each_entry(tk, &tokill, nd)
- if (tk->size_shift)
- size = max(size, 1UL << tk->size_shift);
- if (size) {
- /*
- * Unmap the largest mapping to avoid breaking up
- * device-dax mappings which are constant size. The
- * actual size of the mapping being torn down is
- * communicated in siginfo, see kill_proc()
- */
- start = (page->index << PAGE_SHIFT) & ~(size - 1);
- unmap_mapping_range(page->mapping, start, size, 0);
- }
- kill_procs(&tokill, true, false, pfn, flags);
- rc = 0;
-unlock:
- dax_unlock_page(page, cookie);
+ rc = mf_generic_kill_procs(pfn, flags, pgmap);
out:
/* drop pgmap ref acquired in caller */
put_dev_pagemap(pgmap);
@@ -2178,7 +2299,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist)
bool lru = PageLRU(page);
if (PageHuge(page)) {
- isolated = isolate_huge_page(page, pagelist);
+ isolated = !isolate_hugetlb(page, pagelist);
} else {
if (lru)
isolated = !isolate_lru_page(page);
diff --git a/mm/memory.c b/mm/memory.c
index ed45dbf9c31e..e3d3596068f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -402,12 +402,21 @@ void free_pgd_range(struct mmu_gather *tlb,
} while (pgd++, addr = next, addr != end);
}
-void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
- unsigned long floor, unsigned long ceiling)
+void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+ struct vm_area_struct *vma, unsigned long floor,
+ unsigned long ceiling)
{
- while (vma) {
- struct vm_area_struct *next = vma->vm_next;
+ MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
+
+ do {
unsigned long addr = vma->vm_start;
+ struct vm_area_struct *next;
+
+ /*
+ * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
+ * be 0. This will underflow and is okay.
+ */
+ next = mas_find(&mas, ceiling - 1);
/*
* Hide vma from rmap and truncate_pagecache before freeing
@@ -426,7 +435,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
&& !is_vm_hugetlb_page(next)) {
vma = next;
- next = vma->vm_next;
+ next = mas_find(&mas, ceiling - 1);
unlink_anon_vmas(vma);
unlink_file_vma(vma);
}
@@ -434,7 +443,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
floor, next ? next->vm_start : ceiling);
}
vma = next;
- }
+ } while (vma);
}
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
@@ -623,6 +632,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return NULL;
if (is_zero_pfn(pfn))
return NULL;
+ /*
+ * NOTE: New users of ZONE_DEVICE will not set pte_devmap() and
+ * will have refcounts incremented on their struct pages when
+ * they are inserted into PTEs, thus they are safe to return
+ * here. Legacy ZONE_DEVICE pages that set pte_devmap() do not
+ * have refcounts. Example of legacy ZONE_DEVICE is
+ * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
+ */
if (pte_devmap(pte))
return NULL;
@@ -1245,7 +1262,7 @@ vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
if (userfaultfd_wp(dst_vma))
return true;
- if (src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP))
+ if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return true;
if (src_vma->anon_vma)
@@ -1705,7 +1722,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-void unmap_vmas(struct mmu_gather *tlb,
+void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr)
{
@@ -1715,12 +1732,14 @@ void unmap_vmas(struct mmu_gather *tlb,
/* Careful - we need to zap private pages too! */
.even_cows = true,
};
+ MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
start_addr, end_addr);
mmu_notifier_invalidate_range_start(&range);
- for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
+ do {
unmap_single_vma(tlb, vma, start_addr, end_addr, &details);
+ } while ((vma = mas_find(&mas, end_addr - 1)) != NULL);
mmu_notifier_invalidate_range_end(&range);
}
@@ -1735,8 +1754,11 @@ void unmap_vmas(struct mmu_gather *tlb,
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long size)
{
+ struct maple_tree *mt = &vma->vm_mm->mm_mt;
+ unsigned long end = start + size;
struct mmu_notifier_range range;
struct mmu_gather tlb;
+ MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
@@ -1744,8 +1766,9 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
- for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
+ do {
unmap_single_vma(&tlb, vma, start, range.end, NULL);
+ } while ((vma = mas_find(&mas, end - 1)) != NULL);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
@@ -4693,7 +4716,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
pte = pte_modify(old_pte, vma->vm_page_prot);
page = vm_normal_page(vma, vmf->address, pte);
- if (!page)
+ if (!page || is_zone_device_page(page))
goto out_map;
/* TODO: handle PTE-mapped THP */
@@ -4963,6 +4986,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
.gfp_mask = __get_fault_gfp_mask(vma),
};
struct mm_struct *mm = vma->vm_mm;
+ unsigned long vm_flags = vma->vm_flags;
pgd_t *pgd;
p4d_t *p4d;
vm_fault_t ret;
@@ -4976,7 +5000,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
if (!vmf.pud)
return VM_FAULT_OOM;
retry_pud:
- if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
+ if (pud_none(*vmf.pud) &&
+ hugepage_vma_check(vma, vm_flags, false, true)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -5009,7 +5034,8 @@ retry_pud:
if (pud_trans_unstable(vmf.pud))
goto retry_pud;
- if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
+ if (pmd_none(*vmf.pmd) &&
+ hugepage_vma_check(vma, vm_flags, false, true)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1f1a730c4499..99ecb2b3ff53 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -43,30 +43,22 @@
#include "shuffle.h"
#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
-static int memmap_on_memory_set(const char *val, const struct kernel_param *kp)
-{
- if (hugetlb_optimize_vmemmap_enabled())
- return 0;
- return param_set_bool(val, kp);
-}
-
-static const struct kernel_param_ops memmap_on_memory_ops = {
- .flags = KERNEL_PARAM_OPS_FL_NOARG,
- .set = memmap_on_memory_set,
- .get = param_get_bool,
-};
-
/*
* memory_hotplug.memmap_on_memory parameter
*/
static bool memmap_on_memory __ro_after_init;
-module_param_cb(memmap_on_memory, &memmap_on_memory_ops, &memmap_on_memory, 0444);
+module_param(memmap_on_memory, bool, 0444);
MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
-bool mhp_memmap_on_memory(void)
+static inline bool mhp_memmap_on_memory(void)
{
return memmap_on_memory;
}
+#else
+static inline bool mhp_memmap_on_memory(void)
+{
+ return false;
+}
#endif
enum {
@@ -670,12 +662,18 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
}
+#ifdef CONFIG_ZONE_DEVICE
static void section_taint_zone_device(unsigned long pfn)
{
struct mem_section *ms = __pfn_to_section(pfn);
ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE;
}
+#else
+static inline void section_taint_zone_device(unsigned long pfn)
+{
+}
+#endif
/*
* Associate the pfn range with the given zone, initializing the memmaps
@@ -1029,7 +1027,7 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
struct zone *zone)
{
unsigned long end_pfn = pfn + nr_pages;
- int ret;
+ int ret, i;
ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
if (ret)
@@ -1037,6 +1035,9 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+ for (i = 0; i < nr_pages; i++)
+ SetPageVmemmapSelfHosted(pfn_to_page(pfn + i));
+
/*
* It might be that the vmemmap_pages fully span sections. If that is
* the case, mark those sections online here as otherwise they will be
@@ -1641,7 +1642,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (PageHuge(page)) {
pfn = page_to_pfn(head) + compound_nr(head) - 1;
- isolate_huge_page(head, &source);
+ isolate_hugetlb(head, &source);
continue;
} else if (PageTransHuge(page))
pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d39b01fd52fe..dc74239d1ac7 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -381,9 +381,10 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
mmap_write_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
+ for_each_vma(vmi, vma)
mpol_rebind_policy(vma->vm_policy, new);
mmap_write_unlock(mm);
}
@@ -523,7 +524,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
if (!pte_present(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
- if (!page)
+ if (!page || is_zone_device_page(page))
continue;
/*
* vm_normal_page() filters out zero pages, but there might
@@ -602,7 +603,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
(flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
- if (!isolate_huge_page(page, qp->pagelist) &&
+ if (isolate_hugetlb(page, qp->pagelist) &&
(flags & MPOL_MF_STRICT))
/*
* Failed to isolate page but allow migrating pages
@@ -656,7 +657,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
static int queue_pages_test_walk(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->vma;
+ struct vm_area_struct *next, *vma = walk->vma;
struct queue_pages *qp = walk->private;
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
@@ -671,9 +672,10 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
/* hole at head side of range */
return -EFAULT;
}
+ next = find_vma(vma->vm_mm, vma->vm_end);
if (!(flags & MPOL_MF_DISCONTIG_OK) &&
((vma->vm_end < qp->end) &&
- (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
+ (!next || vma->vm_end < next->vm_start)))
/* hole at middle or tail of range */
return -EFAULT;
@@ -787,26 +789,24 @@ static int vma_replace_policy(struct vm_area_struct *vma,
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
+ MA_STATE(mas, &mm->mm_mt, start - 1, start - 1);
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
pgoff_t pgoff;
- unsigned long vmstart;
- unsigned long vmend;
-
- vma = find_vma(mm, start);
- VM_BUG_ON(!vma);
- prev = vma->vm_prev;
- if (start > vma->vm_start)
- prev = vma;
+ prev = mas_find_rev(&mas, 0);
+ if (prev && (start < prev->vm_end))
+ vma = prev;
+ else
+ vma = mas_next(&mas, end - 1);
- for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
- vmstart = max(start, vma->vm_start);
- vmend = min(end, vma->vm_end);
+ for (; vma; vma = mas_next(&mas, end - 1)) {
+ unsigned long vmstart = max(start, vma->vm_start);
+ unsigned long vmend = min(end, vma->vm_end);
if (mpol_equal(vma_policy(vma), new_pol))
- continue;
+ goto next;
pgoff = vma->vm_pgoff +
((vmstart - vma->vm_start) >> PAGE_SHIFT);
@@ -815,6 +815,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
new_pol, vma->vm_userfaultfd_ctx,
anon_vma_name(vma));
if (prev) {
+ /* vma_merge() invalidated the mas */
+ mas_pause(&mas);
vma = prev;
goto replace;
}
@@ -822,19 +824,25 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
err = split_vma(vma->vm_mm, vma, vmstart, 1);
if (err)
goto out;
+ /* split_vma() invalidated the mas */
+ mas_pause(&mas);
}
if (vma->vm_end != vmend) {
err = split_vma(vma->vm_mm, vma, vmend, 0);
if (err)
goto out;
+ /* split_vma() invalidated the mas */
+ mas_pause(&mas);
}
- replace:
+replace:
err = vma_replace_policy(vma, new_pol);
if (err)
goto out;
+next:
+ prev = vma;
}
- out:
+out:
return err;
}
@@ -1049,6 +1057,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
int flags)
{
nodemask_t nmask;
+ struct vm_area_struct *vma;
LIST_HEAD(pagelist);
int err = 0;
struct migration_target_control mtc = {
@@ -1064,8 +1073,9 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
* need migration. Between passing in the full user address
* space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
*/
+ vma = find_vma(mm, 0);
VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
- queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
+ queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
@@ -1195,14 +1205,13 @@ static struct page *new_page(struct page *page, unsigned long start)
struct folio *dst, *src = page_folio(page);
struct vm_area_struct *vma;
unsigned long address;
+ VMA_ITERATOR(vmi, current->mm, start);
gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
- vma = find_vma(current->mm, start);
- while (vma) {
+ for_each_vma(vmi, vma) {
address = page_address_in_vma(page, vma);
if (address != -EFAULT)
break;
- vma = vma->vm_next;
}
if (folio_test_hugetlb(src))
@@ -1388,7 +1397,7 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
unsigned long t;
- if (get_bitmap(&t, &nmask[maxnode / BITS_PER_LONG], bits))
+ if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
return -EFAULT;
if (maxnode - bits >= MAX_NUMNODES) {
@@ -1480,6 +1489,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
unsigned long vmend;
unsigned long end;
int err = -ENOENT;
+ VMA_ITERATOR(vmi, mm, start);
start = untagged_addr(start);
if (start & ~PAGE_MASK)
@@ -1505,9 +1515,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
if (end == start)
return 0;
mmap_write_lock(mm);
- vma = find_vma(mm, start);
- for (; vma && vma->vm_start < end; vma = vma->vm_next) {
-
+ for_each_vma_range(vmi, vma, end) {
vmstart = max(start, vma->vm_start);
vmend = min(end, vma->vm_end);
new = mpol_dup(vma_policy(vma));
diff --git a/mm/memremap.c b/mm/memremap.c
index 8b5c8fd4ea8e..f0955785150f 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -315,6 +315,16 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
return ERR_PTR(-EINVAL);
}
break;
+ case MEMORY_DEVICE_COHERENT:
+ if (!pgmap->ops->page_free) {
+ WARN(1, "Missing page_free method\n");
+ return ERR_PTR(-EINVAL);
+ }
+ if (!pgmap->owner) {
+ WARN(1, "Missing owner\n");
+ return ERR_PTR(-EINVAL);
+ }
+ break;
case MEMORY_DEVICE_FS_DAX:
if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
WARN(1, "File system DAX not supported\n");
diff --git a/mm/migrate.c b/mm/migrate.c
index 0b4995f768d0..ddfb8c5f9bc9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -132,7 +132,7 @@ static void putback_movable_page(struct page *page)
*
* This function shall be used whenever the isolated pageset has been
* built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
- * and isolate_huge_page().
+ * and isolate_hugetlb().
*/
void putback_movable_pages(struct list_head *l)
{
@@ -314,13 +314,28 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
__migration_entry_wait(mm, ptep, ptl);
}
-void migration_entry_wait_huge(struct vm_area_struct *vma,
- struct mm_struct *mm, pte_t *pte)
+#ifdef CONFIG_HUGETLB_PAGE
+void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl)
{
- spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
- __migration_entry_wait(mm, pte, ptl);
+ pte_t pte;
+
+ spin_lock(ptl);
+ pte = huge_ptep_get(ptep);
+
+ if (unlikely(!is_hugetlb_entry_migration(pte)))
+ spin_unlock(ptl);
+ else
+ migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
}
+void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
+{
+ spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
+
+ __migration_entry_wait_huge(pte, ptl);
+}
+#endif
+
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
{
@@ -440,6 +455,10 @@ int folio_migrate_mapping(struct address_space *mapping,
struct lruvec *old_lruvec, *new_lruvec;
struct mem_cgroup *memcg;
+ /*
+ * Irq is disabled, which can serve as RCU read-side critical
+ * sections.
+ */
memcg = folio_memcg(folio);
old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
@@ -1132,15 +1151,10 @@ static int unmap_and_move(new_page_t get_new_page,
return -ENOSYS;
if (page_count(page) == 1) {
- /* page was freed from under us. So we are done. */
+ /* Page was freed from under us. So we are done. */
ClearPageActive(page);
ClearPageUnevictable(page);
- if (unlikely(__PageMovable(page))) {
- lock_page(page);
- if (!PageMovable(page))
- ClearPageIsolated(page);
- unlock_page(page);
- }
+ /* free_pages_prepare() will clear PG_isolated. */
goto out;
}
@@ -1655,7 +1669,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
goto out;
/* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
err = PTR_ERR(page);
if (IS_ERR(page))
@@ -1675,8 +1689,9 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
if (PageHuge(page)) {
if (PageHead(page)) {
- isolate_huge_page(page, pagelist);
- err = 1;
+ err = isolate_hugetlb(page, pagelist);
+ if (!err)
+ err = 1;
}
} else {
struct page *head;
@@ -1846,7 +1861,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
goto set_status;
/* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
err = PTR_ERR(page);
if (IS_ERR(page))
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 5dd97c39ca6a..ad593d5754cf 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -148,15 +148,21 @@ again:
if (is_writable_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
- if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
- goto next;
pfn = pte_pfn(pte);
- if (is_zero_pfn(pfn)) {
+ if (is_zero_pfn(pfn) &&
+ (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
goto next;
}
page = vm_normal_page(migrate->vma, addr, pte);
+ if (page && !is_zone_device_page(page) &&
+ !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+ goto next;
+ else if (page && is_device_coherent_page(page) &&
+ (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+ page->pgmap->owner != migrate->pgmap_owner))
+ goto next;
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
@@ -480,24 +486,24 @@ int migrate_vma_setup(struct migrate_vma *args)
args->start &= PAGE_MASK;
args->end &= PAGE_MASK;
- if (!args->vma || is_vm_hugetlb_page(args->vma) ||
- (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
- return -EINVAL;
- if (nr_pages <= 0)
- return -EINVAL;
- if (args->start < args->vma->vm_start ||
- args->start >= args->vma->vm_end)
- return -EINVAL;
- if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
- return -EINVAL;
if (!args->src || !args->dst)
return -EINVAL;
-
- memset(args->src, 0, sizeof(*args->src) * nr_pages);
- args->cpages = 0;
- args->npages = 0;
-
- migrate_vma_collect(args);
+ if (args->vma) {
+ if (is_vm_hugetlb_page(args->vma) ||
+ (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
+ return -EINVAL;
+ if (args->start < args->vma->vm_start ||
+ args->start >= args->vma->vm_end)
+ return -EINVAL;
+ if (args->end <= args->vma->vm_start ||
+ args->end > args->vma->vm_end)
+ return -EINVAL;
+ memset(args->src, 0, sizeof(*args->src) * nr_pages);
+ args->cpages = 0;
+ args->npages = 0;
+
+ migrate_vma_collect(args);
+ }
if (args->cpages)
migrate_vma_unmap(args);
@@ -518,7 +524,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
* handle_pte_fault()
* do_anonymous_page()
* to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
- * private page.
+ * private or coherent page.
*/
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
@@ -594,11 +600,8 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
page_to_pfn(page));
entry = swp_entry_to_pte(swp_entry);
} else {
- /*
- * For now we only support migrating to un-addressable device
- * memory.
- */
- if (is_zone_device_page(page)) {
+ if (is_zone_device_page(page) &&
+ !is_device_coherent_page(page)) {
pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
goto abort;
}
@@ -682,7 +685,7 @@ void migrate_vma_pages(struct migrate_vma *migrate)
continue;
}
- if (!page) {
+ if (!page && migrate->vma) {
if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
continue;
if (!notified) {
@@ -701,10 +704,11 @@ void migrate_vma_pages(struct migrate_vma *migrate)
mapping = page_mapping(page);
- if (is_device_private_page(newpage)) {
+ if (is_device_private_page(newpage) ||
+ is_device_coherent_page(newpage)) {
/*
- * For now only support private anonymous when migrating
- * to un-addressable device memory.
+ * For now only support anonymous memory migrating to
+ * device private or coherent memory.
*/
if (mapping) {
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
@@ -791,3 +795,56 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
}
}
EXPORT_SYMBOL(migrate_vma_finalize);
+
+/*
+ * Migrate a device coherent page back to normal memory. The caller should have
+ * a reference on page which will be copied to the new page if migration is
+ * successful or dropped on failure.
+ */
+struct page *migrate_device_page(struct page *page, unsigned int gup_flags)
+{
+ unsigned long src_pfn, dst_pfn = 0;
+ struct migrate_vma args;
+ struct page *dpage;
+
+ lock_page(page);
+ src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
+ args.src = &src_pfn;
+ args.dst = &dst_pfn;
+ args.cpages = 1;
+ args.npages = 1;
+ args.vma = NULL;
+ migrate_vma_setup(&args);
+ if (!(src_pfn & MIGRATE_PFN_MIGRATE))
+ return NULL;
+
+ dpage = alloc_pages(GFP_USER | __GFP_NOWARN, 0);
+
+ /*
+ * get/pin the new page now so we don't have to retry gup after
+ * migrating. We already have a reference so this should never fail.
+ */
+ if (dpage && WARN_ON_ONCE(!try_grab_page(dpage, gup_flags))) {
+ __free_pages(dpage, 0);
+ dpage = NULL;
+ }
+
+ if (dpage) {
+ lock_page(dpage);
+ dst_pfn = migrate_pfn(page_to_pfn(dpage));
+ }
+
+ migrate_vma_pages(&args);
+ if (src_pfn & MIGRATE_PFN_MIGRATE)
+ copy_highpage(dpage, page);
+ migrate_vma_finalize(&args);
+ if (dpage && !(src_pfn & MIGRATE_PFN_MIGRATE)) {
+ if (gup_flags & FOLL_PIN)
+ unpin_user_page(dpage);
+ else
+ put_page(dpage);
+ dpage = NULL;
+ }
+
+ return dpage;
+}
diff --git a/mm/mlock.c b/mm/mlock.c
index 716caf851043..d9039fb9c56b 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -205,7 +205,7 @@ static void mlock_pagevec(struct pagevec *pvec)
}
if (lruvec)
- unlock_page_lruvec_irq(lruvec);
+ lruvec_unlock_irq(lruvec);
release_pages(pvec->pages, pvec->nr);
pagevec_reinit(pvec);
}
@@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
if (!pte_present(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
- if (!page)
+ if (!page || is_zone_device_page(page))
continue;
if (PageTransCompound(page))
continue;
@@ -471,6 +471,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
unsigned long nstart, end, tmp;
struct vm_area_struct *vma, *prev;
int error;
+ MA_STATE(mas, &current->mm->mm_mt, start, start);
VM_BUG_ON(offset_in_page(start));
VM_BUG_ON(len != PAGE_ALIGN(len));
@@ -479,13 +480,14 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
return -EINVAL;
if (end == start)
return 0;
- vma = find_vma(current->mm, start);
- if (!vma || vma->vm_start > start)
+ vma = mas_walk(&mas);
+ if (!vma)
return -ENOMEM;
- prev = vma->vm_prev;
if (start > vma->vm_start)
prev = vma;
+ else
+ prev = mas_prev(&mas, 0);
for (nstart = start ; ; ) {
vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
@@ -505,7 +507,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
if (nstart >= end)
break;
- vma = prev->vm_next;
+ vma = find_vma(prev->vm_mm, prev->vm_end);
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
break;
@@ -526,24 +528,21 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
{
struct vm_area_struct *vma;
unsigned long count = 0;
+ unsigned long end;
+ VMA_ITERATOR(vmi, mm, start);
- if (mm == NULL)
- mm = current->mm;
+ /* Don't overflow past ULONG_MAX */
+ if (unlikely(ULONG_MAX - len < start))
+ end = ULONG_MAX;
+ else
+ end = start + len;
- vma = find_vma(mm, start);
- if (vma == NULL)
- return 0;
-
- for (; vma ; vma = vma->vm_next) {
- if (start >= vma->vm_end)
- continue;
- if (start + len <= vma->vm_start)
- break;
+ for_each_vma_range(vmi, vma, end) {
if (vma->vm_flags & VM_LOCKED) {
if (start > vma->vm_start)
count -= (start - vma->vm_start);
- if (start + len < vma->vm_end) {
- count += start + len - vma->vm_start;
+ if (end < vma->vm_end) {
+ count += end - vma->vm_start;
break;
}
count += vma->vm_end - vma->vm_start;
@@ -659,6 +658,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
*/
static int apply_mlockall_flags(int flags)
{
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
struct vm_area_struct *vma, *prev = NULL;
vm_flags_t to_add = 0;
@@ -679,7 +679,7 @@ static int apply_mlockall_flags(int flags)
to_add |= VM_LOCKONFAULT;
}
- for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
vm_flags_t newflags;
newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
@@ -687,6 +687,7 @@ static int apply_mlockall_flags(int flags)
/* Ignore errors */
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+ mas_pause(&mas);
cond_resched();
}
out:
diff --git a/mm/mmap.c b/mm/mmap.c
index c14d7286a379..8f776f257893 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -14,7 +14,6 @@
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
-#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
@@ -39,7 +38,6 @@
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
-#include <linux/rbtree_augmented.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
@@ -77,9 +75,10 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
-static void unmap_region(struct mm_struct *mm,
+static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
struct vm_area_struct *vma, struct vm_area_struct *prev,
- unsigned long start, unsigned long end);
+ struct vm_area_struct *next, unsigned long start,
+ unsigned long end);
/* description of effects of mapping type and prot in current implementation.
* this is due to the limited x86 page protection hardware. The expected
@@ -179,12 +178,10 @@ void unlink_file_vma(struct vm_area_struct *vma)
}
/*
- * Close a vm structure and free it, returning the next.
+ * Close a vm structure and free it.
*/
-static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+static void remove_vma(struct vm_area_struct *vma)
{
- struct vm_area_struct *next = vma->vm_next;
-
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
@@ -192,20 +189,41 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
vm_area_free(vma);
- return next;
}
-static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
- struct list_head *uf);
+/*
+ * check_brk_limits() - Use platform specific check of range & verify mlock
+ * limits.
+ * @addr: The address to check
+ * @len: The size of increase.
+ *
+ * Return: 0 on success.
+ */
+static int check_brk_limits(unsigned long addr, unsigned long len)
+{
+ unsigned long mapped_addr;
+
+ mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+ if (IS_ERR_VALUE(mapped_addr))
+ return mapped_addr;
+
+ return mlock_future_check(current->mm, current->mm->def_flags, len);
+}
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long newbrk, unsigned long oldbrk,
+ struct list_head *uf);
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma,
+ unsigned long addr, unsigned long request, unsigned long flags);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm;
- struct vm_area_struct *next;
+ struct vm_area_struct *brkvma, *next = NULL;
unsigned long min_brk;
bool populate;
bool downgraded = false;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
if (mmap_write_lock_killable(mm))
return -EINTR;
@@ -228,6 +246,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
if (brk < min_brk)
goto out;
+
/*
* Check against rlimit here. If this check is done later after the test
* of oldbrk with newbrk then it can escape the test and let the data
@@ -247,35 +266,51 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/*
* Always allow shrinking brk.
- * __do_munmap() may downgrade mmap_lock to read.
+ * do_brk_munmap() may downgrade mmap_lock to read.
*/
if (brk <= mm->brk) {
int ret;
+ /* Search one past newbrk */
+ mas_set(&mas, newbrk);
+ brkvma = mas_find(&mas, oldbrk);
+ BUG_ON(brkvma == NULL);
+ if (brkvma->vm_start >= oldbrk)
+ goto out; /* mapping intersects with an existing non-brk vma. */
/*
- * mm->brk must to be protected by write mmap_lock so update it
- * before downgrading mmap_lock. When __do_munmap() fails,
- * mm->brk will be restored from origbrk.
+ * mm->brk must be protected by write mmap_lock.
+ * do_brk_munmap() may downgrade the lock, so update it
+ * before calling do_brk_munmap().
*/
mm->brk = brk;
- ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
- if (ret < 0) {
- mm->brk = origbrk;
- goto out;
- } else if (ret == 1) {
+ ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf);
+ if (ret == 1) {
downgraded = true;
- }
- goto success;
+ goto success;
+ } else if (!ret)
+ goto success;
+
+ mm->brk = origbrk;
+ goto out;
}
- /* Check against existing mmap mappings. */
- next = find_vma(mm, oldbrk);
+ if (check_brk_limits(oldbrk, newbrk - oldbrk))
+ goto out;
+
+ /*
+ * Only check if the next VMA is within the stack_guard_gap of the
+ * expansion area
+ */
+ mas_set(&mas, oldbrk);
+ next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap);
if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
goto out;
+ brkvma = mas_prev(&mas, mm->start_brk);
/* Ok, looks good - let it rip. */
- if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
+ if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
goto out;
+
mm->brk = brk;
success:
@@ -288,121 +323,64 @@ success:
if (populate)
mm_populate(oldbrk, newbrk - oldbrk);
return brk;
-
out:
mmap_write_unlock(mm);
return origbrk;
}
-static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
-{
- unsigned long gap, prev_end;
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+extern void mt_validate(struct maple_tree *mt);
+extern void mt_dump(const struct maple_tree *mt);
- /*
- * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
- * allow two stack_guard_gaps between them here, and when choosing
- * an unmapped area; whereas when expanding we only require one.
- * That's a little inconsistent, but keeps the code here simpler.
- */
- gap = vm_start_gap(vma);
- if (vma->vm_prev) {
- prev_end = vm_end_gap(vma->vm_prev);
- if (gap > prev_end)
- gap -= prev_end;
- else
- gap = 0;
- }
- return gap;
-}
+/* Validate the maple tree */
+static void validate_mm_mt(struct mm_struct *mm)
+{
+ struct maple_tree *mt = &mm->mm_mt;
+ struct vm_area_struct *vma_mt;
-#ifdef CONFIG_DEBUG_VM_RB
-static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
-{
- unsigned long max = vma_compute_gap(vma), subtree_gap;
- if (vma->vm_rb.rb_left) {
- subtree_gap = rb_entry(vma->vm_rb.rb_left,
- struct vm_area_struct, vm_rb)->rb_subtree_gap;
- if (subtree_gap > max)
- max = subtree_gap;
- }
- if (vma->vm_rb.rb_right) {
- subtree_gap = rb_entry(vma->vm_rb.rb_right,
- struct vm_area_struct, vm_rb)->rb_subtree_gap;
- if (subtree_gap > max)
- max = subtree_gap;
- }
- return max;
-}
-
-static int browse_rb(struct mm_struct *mm)
-{
- struct rb_root *root = &mm->mm_rb;
- int i = 0, j, bug = 0;
- struct rb_node *nd, *pn = NULL;
- unsigned long prev = 0, pend = 0;
-
- for (nd = rb_first(root); nd; nd = rb_next(nd)) {
- struct vm_area_struct *vma;
- vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- if (vma->vm_start < prev) {
- pr_emerg("vm_start %lx < prev %lx\n",
- vma->vm_start, prev);
- bug = 1;
- }
- if (vma->vm_start < pend) {
- pr_emerg("vm_start %lx < pend %lx\n",
- vma->vm_start, pend);
- bug = 1;
- }
- if (vma->vm_start > vma->vm_end) {
- pr_emerg("vm_start %lx > vm_end %lx\n",
- vma->vm_start, vma->vm_end);
- bug = 1;
- }
- spin_lock(&mm->page_table_lock);
- if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
- pr_emerg("free gap %lx, correct %lx\n",
- vma->rb_subtree_gap,
- vma_compute_subtree_gap(vma));
- bug = 1;
- }
- spin_unlock(&mm->page_table_lock);
- i++;
- pn = nd;
- prev = vma->vm_start;
- pend = vma->vm_end;
- }
- j = 0;
- for (nd = pn; nd; nd = rb_prev(nd))
- j++;
- if (i != j) {
- pr_emerg("backwards %d, forwards %d\n", j, i);
- bug = 1;
- }
- return bug ? -1 : i;
-}
+ MA_STATE(mas, mt, 0, 0);
-static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
-{
- struct rb_node *nd;
+ mas_for_each(&mas, vma_mt, ULONG_MAX) {
+ if ((vma_mt->vm_start != mas.index) ||
+ (vma_mt->vm_end - 1 != mas.last)) {
+ pr_emerg("issue in %s\n", current->comm);
+ dump_stack();
+ dump_vma(vma_mt);
+ pr_emerg("mt piv: %px %lu - %lu\n", vma_mt,
+ mas.index, mas.last);
+ pr_emerg("mt vma: %px %lu - %lu\n", vma_mt,
+ vma_mt->vm_start, vma_mt->vm_end);
- for (nd = rb_first(root); nd; nd = rb_next(nd)) {
- struct vm_area_struct *vma;
- vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- VM_BUG_ON_VMA(vma != ignore &&
- vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
- vma);
+ mt_dump(mas.tree);
+ if (vma_mt->vm_end != mas.last + 1) {
+ pr_err("vma: %px vma_mt %lu-%lu\tmt %lu-%lu\n",
+ mm, vma_mt->vm_start, vma_mt->vm_end,
+ mas.index, mas.last);
+ mt_dump(mas.tree);
+ }
+ VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm);
+ if (vma_mt->vm_start != mas.index) {
+ pr_err("vma: %px vma_mt %px %lu - %lu doesn't match\n",
+ mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end);
+ mt_dump(mas.tree);
+ }
+ VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm);
+ }
}
+ mt_validate(&mm->mm_mt);
}
static void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
- unsigned long highest_address = 0;
- struct vm_area_struct *vma = mm->mmap;
+ struct vm_area_struct *vma;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+
+ validate_mm_mt(mm);
- while (vma) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
+#ifdef CONFIG_DEBUG_VM_RB
struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;
@@ -412,93 +390,20 @@ static void validate_mm(struct mm_struct *mm)
anon_vma_interval_tree_verify(avc);
anon_vma_unlock_read(anon_vma);
}
-
- highest_address = vm_end_gap(vma);
- vma = vma->vm_next;
+#endif
i++;
}
if (i != mm->map_count) {
- pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
- bug = 1;
- }
- if (highest_address != mm->highest_vm_end) {
- pr_emerg("mm->highest_vm_end %lx, found %lx\n",
- mm->highest_vm_end, highest_address);
- bug = 1;
- }
- i = browse_rb(mm);
- if (i != mm->map_count) {
- if (i != -1)
- pr_emerg("map_count %d rb %d\n", mm->map_count, i);
+ pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i);
bug = 1;
}
VM_BUG_ON_MM(bug, mm);
}
-#else
-#define validate_mm_rb(root, ignore) do { } while (0)
-#define validate_mm(mm) do { } while (0)
-#endif
-
-RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
- struct vm_area_struct, vm_rb,
- unsigned long, rb_subtree_gap, vma_compute_gap)
-
-/*
- * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
- * vma->vm_prev->vm_end values changed, without modifying the vma's position
- * in the rbtree.
- */
-static void vma_gap_update(struct vm_area_struct *vma)
-{
- /*
- * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
- * a callback function that does exactly what we want.
- */
- vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
-}
-
-static inline void vma_rb_insert(struct vm_area_struct *vma,
- struct rb_root *root)
-{
- /* All rb_subtree_gap values must be consistent prior to insertion */
- validate_mm_rb(root, NULL);
-
- rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
-}
-
-static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
-{
- /*
- * Note rb_erase_augmented is a fairly large inline function,
- * so make sure we instantiate it only once with our desired
- * augmented rbtree callbacks.
- */
- rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
-}
-
-static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
- struct rb_root *root,
- struct vm_area_struct *ignore)
-{
- /*
- * All rb_subtree_gap values must be consistent prior to erase,
- * with the possible exception of
- *
- * a. the "next" vma being erased if next->vm_start was reduced in
- * __vma_adjust() -> __vma_unlink()
- * b. the vma being erased in detach_vmas_to_be_unmapped() ->
- * vma_rb_erase()
- */
- validate_mm_rb(root, ignore);
- __vma_rb_erase(vma, root);
-}
-
-static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
- struct rb_root *root)
-{
- vma_rb_erase_ignore(vma, root, vma);
-}
+#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */
+#define validate_mm_mt(root) do { } while (0)
+#define validate_mm(mm) do { } while (0)
+#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
/*
* vma has some anon_vma assigned, and is already inserted on that
@@ -532,208 +437,221 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}
-static int find_vma_links(struct mm_struct *mm, unsigned long addr,
- unsigned long end, struct vm_area_struct **pprev,
- struct rb_node ***rb_link, struct rb_node **rb_parent)
+static unsigned long count_vma_pages_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end)
{
- struct rb_node **__rb_link, *__rb_parent, *rb_prev;
+ VMA_ITERATOR(vmi, mm, addr);
+ struct vm_area_struct *vma;
+ unsigned long nr_pages = 0;
- mmap_assert_locked(mm);
- __rb_link = &mm->mm_rb.rb_node;
- rb_prev = __rb_parent = NULL;
+ for_each_vma_range(vmi, vma, end) {
+ unsigned long vm_start = max(addr, vma->vm_start);
+ unsigned long vm_end = min(end, vma->vm_end);
- while (*__rb_link) {
- struct vm_area_struct *vma_tmp;
+ nr_pages += (vm_end - vm_start) / PAGE_SIZE;
+ }
- __rb_parent = *__rb_link;
- vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
+ return nr_pages;
+}
- if (vma_tmp->vm_end > addr) {
- /* Fail if an existing vma overlaps the area */
- if (vma_tmp->vm_start < end)
- return -ENOMEM;
- __rb_link = &__rb_parent->rb_left;
- } else {
- rb_prev = __rb_parent;
- __rb_link = &__rb_parent->rb_right;
- }
- }
+static void __vma_link_file(struct vm_area_struct *vma,
+ struct address_space *mapping)
+{
+ if (vma->vm_flags & VM_SHARED)
+ mapping_allow_writable(mapping);
- *pprev = NULL;
- if (rb_prev)
- *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
- *rb_link = __rb_link;
- *rb_parent = __rb_parent;
- return 0;
+ flush_dcache_mmap_lock(mapping);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
}
/*
- * vma_next() - Get the next VMA.
- * @mm: The mm_struct.
- * @vma: The current vma.
+ * vma_mas_store() - Store a VMA in the maple tree.
+ * @vma: The vm_area_struct
+ * @mas: The maple state
*
- * If @vma is NULL, return the first vma in the mm.
+ * Efficient way to store a VMA in the maple tree when the @mas has already
+ * walked to the correct location.
*
- * Returns: The next VMA after @vma.
+ * Note: the end address is inclusive in the maple tree.
*/
-static inline struct vm_area_struct *vma_next(struct mm_struct *mm,
- struct vm_area_struct *vma)
+void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas)
{
- if (!vma)
- return mm->mmap;
-
- return vma->vm_next;
+ trace_vma_store(mas->tree, vma);
+ mas_set_range(mas, vma->vm_start, vma->vm_end - 1);
+ mas_store_prealloc(mas, vma);
}
/*
- * munmap_vma_range() - munmap VMAs that overlap a range.
- * @mm: The mm struct
- * @start: The start of the range.
- * @len: The length of the range.
- * @pprev: pointer to the pointer that will be set to previous vm_area_struct
- * @rb_link: the rb_node
- * @rb_parent: the parent rb_node
- *
- * Find all the vm_area_struct that overlap from @start to
- * @end and munmap them. Set @pprev to the previous vm_area_struct.
+ * vma_mas_remove() - Remove a VMA from the maple tree.
+ * @vma: The vm_area_struct
+ * @mas: The maple state
*
- * Returns: -ENOMEM on munmap failure or 0 on success.
+ * Efficient way to remove a VMA from the maple tree when the @mas has already
+ * been established and points to the correct location.
+ * Note: the end address is inclusive in the maple tree.
*/
-static inline int
-munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
- struct vm_area_struct **pprev, struct rb_node ***link,
- struct rb_node **parent, struct list_head *uf)
+void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas)
{
-
- while (find_vma_links(mm, start, start + len, pprev, link, parent))
- if (do_munmap(mm, start, len, uf))
- return -ENOMEM;
-
- return 0;
+ trace_vma_mas_szero(mas->tree, vma->vm_start, vma->vm_end - 1);
+ mas->index = vma->vm_start;
+ mas->last = vma->vm_end - 1;
+ mas_store_prealloc(mas, NULL);
}
-static unsigned long count_vma_pages_range(struct mm_struct *mm,
- unsigned long addr, unsigned long end)
+
+/*
+ * vma_mas_szero() - Set a given range to zero. Used when modifying a
+ * vm_area_struct start or end.
+ *
+ * @mm: The struct_mm
+ * @start: The start address to zero
+ * @end: The end address to zero.
+ */
+static inline void vma_mas_szero(struct ma_state *mas, unsigned long start,
+ unsigned long end)
{
- unsigned long nr_pages = 0;
- struct vm_area_struct *vma;
+ trace_vma_mas_szero(mas->tree, start, end - 1);
+ mas_set_range(mas, start, end - 1);
+ mas_store_prealloc(mas, NULL);
+}
- /* Find first overlapping mapping */
- vma = find_vma_intersection(mm, addr, end);
- if (!vma)
- return 0;
+static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ struct address_space *mapping = NULL;
- nr_pages = (min(end, vma->vm_end) -
- max(addr, vma->vm_start)) >> PAGE_SHIFT;
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ return -ENOMEM;
- /* Iterate over the rest of the overlaps */
- for (vma = vma->vm_next; vma; vma = vma->vm_next) {
- unsigned long overlap_len;
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+ i_mmap_lock_write(mapping);
+ }
- if (vma->vm_start > end)
- break;
+ vma_mas_store(vma, &mas);
- overlap_len = min(end, vma->vm_end) - vma->vm_start;
- nr_pages += overlap_len >> PAGE_SHIFT;
+ if (mapping) {
+ __vma_link_file(vma, mapping);
+ i_mmap_unlock_write(mapping);
}
- return nr_pages;
+ mm->map_count++;
+ validate_mm(mm);
+ return 0;
}
-void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
- struct rb_node **rb_link, struct rb_node *rb_parent)
+/*
+ * vma_expand - Expand an existing VMA
+ *
+ * @mas: The maple state
+ * @vma: The vma to expand
+ * @start: The start of the vma
+ * @end: The exclusive end of the vma
+ * @pgoff: The page offset of vma
+ * @next: The current of next vma.
+ *
+ * Expand @vma to @start and @end. Can expand off the start and end. Will
+ * expand over @next if it's different from @vma and @end == @next->vm_end.
+ * Checking if the @vma can expand and merge with @next needs to be handled by
+ * the caller.
+ *
+ * Returns: 0 on success
+ */
+inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, pgoff_t pgoff,
+ struct vm_area_struct *next)
{
- /* Update tracking information for the gap following the new vma. */
- if (vma->vm_next)
- vma_gap_update(vma->vm_next);
- else
- mm->highest_vm_end = vm_end_gap(vma);
+ struct mm_struct *mm = vma->vm_mm;
+ struct address_space *mapping = NULL;
+ struct rb_root_cached *root = NULL;
+ struct anon_vma *anon_vma = vma->anon_vma;
+ struct file *file = vma->vm_file;
+ bool remove_next = false;
+ bool anon_cloned = false;
- /*
- * vma->vm_prev wasn't known when we followed the rbtree to find the
- * correct insertion point for that vma. As a result, we could not
- * update the vma vm_rb parents rb_subtree_gap values on the way down.
- * So, we first insert the vma with a zero rb_subtree_gap value
- * (to be consistent with what we did on the way down), and then
- * immediately update the gap to the correct value. Finally we
- * rebalance the rbtree after all augmented values have been set.
- */
- rb_link_node(&vma->vm_rb, rb_parent, rb_link);
- vma->rb_subtree_gap = 0;
- vma_gap_update(vma);
- vma_rb_insert(vma, &mm->mm_rb);
-}
+ if (next && (vma != next) && (end == next->vm_end)) {
+ remove_next = true;
+ if (next->anon_vma && !vma->anon_vma) {
+ int error;
-static void __vma_link_file(struct vm_area_struct *vma)
-{
- struct file *file;
+ vma->anon_vma = next->anon_vma;
+ error = anon_vma_clone(vma, next);
+ if (error)
+ return error;
+ anon_cloned = true;
+ }
+ }
- file = vma->vm_file;
- if (file) {
- struct address_space *mapping = file->f_mapping;
+ /* Not merging but overwriting any part of next is not handled. */
+ VM_BUG_ON(next && !remove_next && next != vma && end > next->vm_start);
+ /* Only handles expanding */
+ VM_BUG_ON(vma->vm_start < start || vma->vm_end > end);
- if (vma->vm_flags & VM_SHARED)
- mapping_allow_writable(mapping);
+ if (mas_preallocate(mas, vma, GFP_KERNEL))
+ goto nomem;
- flush_dcache_mmap_lock(mapping);
- vma_interval_tree_insert(vma, &mapping->i_mmap);
- flush_dcache_mmap_unlock(mapping);
+ vma_adjust_trans_huge(vma, start, end, 0);
+
+ if (anon_vma) {
+ anon_vma_lock_write(anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
}
-}
-static void
-__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node **rb_link,
- struct rb_node *rb_parent)
-{
- __vma_link_list(mm, vma, prev);
- __vma_link_rb(mm, vma, rb_link, rb_parent);
-}
+ if (file) {
+ mapping = file->f_mapping;
+ root = &mapping->i_mmap;
+ uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+ i_mmap_lock_write(mapping);
+ flush_dcache_mmap_lock(mapping);
+ vma_interval_tree_remove(vma, root);
+ }
-static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node **rb_link,
- struct rb_node *rb_parent)
-{
- struct address_space *mapping = NULL;
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+ /* Note: mas must be pointing to the expanding VMA */
+ vma_mas_store(vma, mas);
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_write(mapping);
+ if (file) {
+ vma_interval_tree_insert(vma, root);
+ flush_dcache_mmap_unlock(mapping);
}
- __vma_link(mm, vma, prev, rb_link, rb_parent);
- __vma_link_file(vma);
+ /* Expanding over the next vma */
+ if (remove_next && file) {
+ __remove_shared_vm_struct(next, file, mapping);
+ }
- if (mapping)
+ if (file) {
i_mmap_unlock_write(mapping);
+ uprobe_mmap(vma);
+ }
- mm->map_count++;
- validate_mm(mm);
-}
+ if (anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(anon_vma);
+ }
-/*
- * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
- * mm's list and rbtree. It has already been inserted into the interval tree.
- */
-static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- struct vm_area_struct *prev;
- struct rb_node **rb_link, *rb_parent;
- if (find_vma_links(mm, vma->vm_start, vma->vm_end,
- &prev, &rb_link, &rb_parent))
- BUG();
- __vma_link(mm, vma, prev, rb_link, rb_parent);
- mm->map_count++;
-}
+ if (remove_next) {
+ if (file) {
+ uprobe_munmap(next, next->vm_start, next->vm_end);
+ fput(file);
+ }
+ if (next->anon_vma)
+ anon_vma_merge(vma, next);
+ mm->map_count--;
+ mpol_put(vma_policy(next));
+ vm_area_free(next);
+ }
-static __always_inline void __vma_unlink(struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct vm_area_struct *ignore)
-{
- vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
- __vma_unlink_list(mm, vma);
- /* Kill the cache */
- vmacache_invalidate(mm);
+ validate_mm(mm);
+ return 0;
+
+nomem:
+ if (anon_cloned)
+ unlink_anon_vmas(vma);
+ return -ENOMEM;
}
/*
@@ -748,18 +666,19 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
struct vm_area_struct *expand)
{
struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
+ struct vm_area_struct *next_next, *next = find_vma(mm, vma->vm_end);
+ struct vm_area_struct *orig_vma = vma;
struct address_space *mapping = NULL;
struct rb_root_cached *root = NULL;
struct anon_vma *anon_vma = NULL;
struct file *file = vma->vm_file;
- bool start_changed = false, end_changed = false;
+ bool vma_changed = false;
long adjust_next = 0;
int remove_next = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ struct vm_area_struct *exporter = NULL, *importer = NULL;
if (next && !insert) {
- struct vm_area_struct *exporter = NULL, *importer = NULL;
-
if (end >= next->vm_end) {
/*
* vma expands, overlapping all the next, and
@@ -788,8 +707,9 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* remove_next == 1 is case 1 or 7.
*/
remove_next = 1 + (end > next->vm_end);
+ next_next = find_vma(mm, next->vm_end);
VM_WARN_ON(remove_next == 2 &&
- end != next->vm_next->vm_end);
+ end != next_next->vm_end);
/* trim end to next, for case 6 first pass */
end = next->vm_end;
}
@@ -802,7 +722,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* next, if the vma overlaps with it.
*/
if (remove_next == 2 && !next->anon_vma)
- exporter = next->vm_next;
+ exporter = find_vma(mm, next->vm_end);
} else if (end > next->vm_start) {
/*
@@ -842,6 +762,12 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
again:
vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ if (exporter && exporter->anon_vma)
+ unlink_anon_vmas(importer);
+ return -ENOMEM;
+ }
+
if (file) {
mapping = file->f_mapping;
root = &mapping->i_mmap;
@@ -851,14 +777,14 @@ again:
uprobe_munmap(next, next->vm_start, next->vm_end);
i_mmap_lock_write(mapping);
- if (insert) {
+ if (insert && insert->vm_file) {
/*
* Put into interval tree now, so instantiated pages
* are visible to arm/parisc __flush_dcache_page
* throughout; but we cannot insert into address
* space until vma start or end is updated.
*/
- __vma_link_file(insert);
+ __vma_link_file(insert, insert->vm_file->f_mapping);
}
}
@@ -882,17 +808,37 @@ again:
}
if (start != vma->vm_start) {
+ if ((vma->vm_start < start) &&
+ (!insert || (insert->vm_end != start))) {
+ vma_mas_szero(&mas, vma->vm_start, start);
+ VM_WARN_ON(insert && insert->vm_start > vma->vm_start);
+ } else {
+ vma_changed = true;
+ }
vma->vm_start = start;
- start_changed = true;
}
if (end != vma->vm_end) {
+ if (vma->vm_end > end) {
+ if (!insert || (insert->vm_start != end)) {
+ vma_mas_szero(&mas, end, vma->vm_end);
+ mas_reset(&mas);
+ VM_WARN_ON(insert &&
+ insert->vm_end < vma->vm_end);
+ }
+ } else {
+ vma_changed = true;
+ }
vma->vm_end = end;
- end_changed = true;
}
+
+ if (vma_changed)
+ vma_mas_store(vma, &mas);
+
vma->vm_pgoff = pgoff;
if (adjust_next) {
next->vm_start += adjust_next;
next->vm_pgoff += adjust_next >> PAGE_SHIFT;
+ vma_mas_store(next, &mas);
}
if (file) {
@@ -902,42 +848,17 @@ again:
flush_dcache_mmap_unlock(mapping);
}
- if (remove_next) {
- /*
- * vma_merge has merged next into vma, and needs
- * us to remove next before dropping the locks.
- */
- if (remove_next != 3)
- __vma_unlink(mm, next, next);
- else
- /*
- * vma is not before next if they've been
- * swapped.
- *
- * pre-swap() next->vm_start was reduced so
- * tell validate_mm_rb to ignore pre-swap()
- * "next" (which is stored in post-swap()
- * "vma").
- */
- __vma_unlink(mm, next, vma);
- if (file)
- __remove_shared_vm_struct(next, file, mapping);
+ if (remove_next && file) {
+ __remove_shared_vm_struct(next, file, mapping);
} else if (insert) {
/*
* split_vma has split insert from vma, and needs
* us to insert it before dropping the locks
* (it may either follow vma or precede it).
*/
- __insert_vm_struct(mm, insert);
- } else {
- if (start_changed)
- vma_gap_update(vma);
- if (end_changed) {
- if (!next)
- mm->highest_vm_end = vm_end_gap(vma);
- else if (!adjust_next)
- vma_gap_update(next);
- }
+ mas_reset(&mas);
+ vma_mas_store(insert, &mas);
+ mm->map_count++;
}
if (anon_vma) {
@@ -964,7 +885,9 @@ again:
anon_vma_merge(vma, next);
mm->map_count--;
mpol_put(vma_policy(next));
+ BUG_ON(vma->vm_end < next->vm_end);
vm_area_free(next);
+
/*
* In mprotect's case 6 (see comments on vma_merge),
* we must remove another next too. It would clutter
@@ -974,10 +897,10 @@ again:
/*
* If "next" was removed and vma->vm_end was
* expanded (up) over it, in turn
- * "next->vm_prev->vm_end" changed and the
- * "vma->vm_next" gap must be updated.
+ * "next->prev->vm_end" changed and the
+ * "vma->next" gap must be updated.
*/
- next = vma->vm_next;
+ next = next_next;
} else {
/*
* For the scope of the comment "next" and
@@ -992,38 +915,17 @@ again:
next = vma;
}
if (remove_next == 2) {
+ mas_reset(&mas);
remove_next = 1;
end = next->vm_end;
goto again;
}
- else if (next)
- vma_gap_update(next);
- else {
- /*
- * If remove_next == 2 we obviously can't
- * reach this path.
- *
- * If remove_next == 3 we can't reach this
- * path because pre-swap() next is always not
- * NULL. pre-swap() "next" is not being
- * removed and its next->vm_end is not altered
- * (and furthermore "end" already matches
- * next->vm_end in remove_next == 3).
- *
- * We reach this only in the remove_next == 1
- * case if the "next" vma that was removed was
- * the highest vma of the mm. However in such
- * case next->vm_end == "end" and the extended
- * "vma" has vma->vm_end == next->vm_end so
- * mm->highest_vm_end doesn't need any update
- * in remove_next == 1 case.
- */
- VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
- }
}
- if (insert && file)
+ if (insert && file) {
uprobe_mmap(insert);
+ }
+ mas_destroy(&mas);
validate_mm(mm);
return 0;
@@ -1175,8 +1077,10 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct anon_vma_name *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
- struct vm_area_struct *area, *next;
- int err;
+ struct vm_area_struct *mid, *next, *res;
+ int err = -1;
+ bool merge_prev = false;
+ bool merge_next = false;
/*
* We later require that vma->vm_flags == vm_flags,
@@ -1185,76 +1089,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
if (vm_flags & VM_SPECIAL)
return NULL;
- next = vma_next(mm, prev);
- area = next;
- if (area && area->vm_end == end) /* cases 6, 7, 8 */
- next = next->vm_next;
+ next = find_vma(mm, prev ? prev->vm_end : 0);
+ mid = next;
+ if (next && next->vm_end == end) /* cases 6, 7, 8 */
+ next = find_vma(mm, next->vm_end);
/* verify some invariant that must be enforced by the caller */
VM_WARN_ON(prev && addr <= prev->vm_start);
- VM_WARN_ON(area && end > area->vm_end);
+ VM_WARN_ON(mid && end > mid->vm_end);
VM_WARN_ON(addr >= end);
- /*
- * Can it merge with the predecessor?
- */
+ /* Can we merge the predecessor? */
if (prev && prev->vm_end == addr &&
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff,
vm_userfaultfd_ctx, anon_name)) {
- /*
- * OK, it can. Can we now merge in the successor as well?
- */
- if (next && end == next->vm_start &&
- mpol_equal(policy, vma_policy(next)) &&
- can_vma_merge_before(next, vm_flags,
- anon_vma, file,
- pgoff+pglen,
- vm_userfaultfd_ctx, anon_name) &&
- is_mergeable_anon_vma(prev->anon_vma,
- next->anon_vma, NULL)) {
- /* cases 1, 6 */
- err = __vma_adjust(prev, prev->vm_start,
- next->vm_end, prev->vm_pgoff, NULL,
- prev);
- } else /* cases 2, 5, 7 */
- err = __vma_adjust(prev, prev->vm_start,
- end, prev->vm_pgoff, NULL, prev);
- if (err)
- return NULL;
- khugepaged_enter_vma(prev, vm_flags);
- return prev;
+ merge_prev = true;
}
-
- /*
- * Can this new request be merged in front of next?
- */
+ /* Can we merge the successor? */
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen,
vm_userfaultfd_ctx, anon_name)) {
+ merge_next = true;
+ }
+ /* Can we merge both the predecessor and the successor? */
+ if (merge_prev && merge_next &&
+ is_mergeable_anon_vma(prev->anon_vma,
+ next->anon_vma, NULL)) { /* cases 1, 6 */
+ err = __vma_adjust(prev, prev->vm_start,
+ next->vm_end, prev->vm_pgoff, NULL,
+ prev);
+ res = prev;
+ } else if (merge_prev) { /* cases 2, 5, 7 */
+ err = __vma_adjust(prev, prev->vm_start,
+ end, prev->vm_pgoff, NULL, prev);
+ res = prev;
+ } else if (merge_next) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
- addr, prev->vm_pgoff, NULL, next);
- else { /* cases 3, 8 */
- err = __vma_adjust(area, addr, next->vm_end,
- next->vm_pgoff - pglen, NULL, next);
- /*
- * In case 3 area is already equal to next and
- * this is a noop, but in case 8 "area" has
- * been removed and next was expanded over it.
- */
- area = next;
- }
- if (err)
- return NULL;
- khugepaged_enter_vma(area, vm_flags);
- return area;
+ addr, prev->vm_pgoff, NULL, next);
+ else /* cases 3, 8 */
+ err = __vma_adjust(mid, addr, next->vm_end,
+ next->vm_pgoff - pglen, NULL, next);
+ res = next;
}
- return NULL;
+ /*
+ * Cannot merge with predecessor or successor or error in __vma_adjust?
+ */
+ if (err)
+ return NULL;
+ khugepaged_enter_vma(res, vm_flags);
+ return res;
}
/*
@@ -1322,18 +1211,24 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_
*/
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
+ MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end);
struct anon_vma *anon_vma = NULL;
+ struct vm_area_struct *prev, *next;
/* Try next first. */
- if (vma->vm_next) {
- anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
+ next = mas_walk(&mas);
+ if (next) {
+ anon_vma = reusable_anon_vma(next, vma, next);
if (anon_vma)
return anon_vma;
}
+ prev = mas_prev(&mas, 0);
+ VM_BUG_ON_VMA(prev != vma, vma);
+ prev = mas_prev(&mas, 0);
/* Try prev next. */
- if (vma->vm_prev)
- anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);
+ if (prev)
+ anon_vma = reusable_anon_vma(prev, prev, vma);
/*
* We might reach here with anon_vma == NULL if we can't find
@@ -1422,6 +1317,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags_t vm_flags;
int pkey = 0;
+ validate_mm(mm);
*populate = 0;
if (!len)
@@ -1722,389 +1618,62 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
}
-unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
- struct list_head *uf)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev, *merge;
- int error;
- struct rb_node **rb_link, *rb_parent;
- unsigned long charged = 0;
-
- /* Check against address space limit. */
- if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
- unsigned long nr_pages;
-
- /*
- * MAP_FIXED may remove pages of mappings that intersects with
- * requested mapping. Account for the pages it would unmap.
- */
- nr_pages = count_vma_pages_range(mm, addr, addr + len);
-
- if (!may_expand_vm(mm, vm_flags,
- (len >> PAGE_SHIFT) - nr_pages))
- return -ENOMEM;
- }
-
- /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
- if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
- return -ENOMEM;
- /*
- * Private writable mapping: check memory availability
- */
- if (accountable_mapping(file, vm_flags)) {
- charged = len >> PAGE_SHIFT;
- if (security_vm_enough_memory_mm(mm, charged))
- return -ENOMEM;
- vm_flags |= VM_ACCOUNT;
- }
-
- /*
- * Can we just expand an old mapping?
- */
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
- NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
- if (vma)
- goto out;
-
- /*
- * Determine the object being mapped and call the appropriate
- * specific mapper. the address has already been validated, but
- * not unmapped, but the maps are removed from the list.
- */
- vma = vm_area_alloc(mm);
- if (!vma) {
- error = -ENOMEM;
- goto unacct_error;
- }
-
- vma->vm_start = addr;
- vma->vm_end = addr + len;
- vma->vm_flags = vm_flags;
- vma->vm_page_prot = vm_get_page_prot(vm_flags);
- vma->vm_pgoff = pgoff;
-
- if (file) {
- if (vm_flags & VM_SHARED) {
- error = mapping_map_writable(file->f_mapping);
- if (error)
- goto free_vma;
- }
-
- vma->vm_file = get_file(file);
- error = call_mmap(file, vma);
- if (error)
- goto unmap_and_free_vma;
-
- /* Can addr have changed??
- *
- * Answer: Yes, several device drivers can do it in their
- * f_op->mmap method. -DaveM
- * Bug: If addr is changed, prev, rb_link, rb_parent should
- * be updated for vma_link()
- */
- WARN_ON_ONCE(addr != vma->vm_start);
-
- addr = vma->vm_start;
-
- /* If vm_flags changed after call_mmap(), we should try merge vma again
- * as we may succeed this time.
- */
- if (unlikely(vm_flags != vma->vm_flags && prev)) {
- merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
- NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
- if (merge) {
- /* ->mmap() can change vma->vm_file and fput the original file. So
- * fput the vma->vm_file here or we would add an extra fput for file
- * and cause general protection fault ultimately.
- */
- fput(vma->vm_file);
- vm_area_free(vma);
- vma = merge;
- /* Update vm_flags to pick up the change. */
- vm_flags = vma->vm_flags;
- goto unmap_writable;
- }
- }
-
- vm_flags = vma->vm_flags;
- } else if (vm_flags & VM_SHARED) {
- error = shmem_zero_setup(vma);
- if (error)
- goto free_vma;
- } else {
- vma_set_anonymous(vma);
- }
-
- /* Allow architectures to sanity-check the vm_flags */
- if (!arch_validate_flags(vma->vm_flags)) {
- error = -EINVAL;
- if (file)
- goto unmap_and_free_vma;
- else
- goto free_vma;
- }
-
- vma_link(mm, vma, prev, rb_link, rb_parent);
-
- /*
- * vma_merge() calls khugepaged_enter_vma() either, the below
- * call covers the non-merge case.
- */
- khugepaged_enter_vma(vma, vma->vm_flags);
-
- /* Once vma denies write, undo our temporary denial count */
-unmap_writable:
- if (file && vm_flags & VM_SHARED)
- mapping_unmap_writable(file->f_mapping);
- file = vma->vm_file;
-out:
- perf_event_mmap(vma);
-
- vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
- if (vm_flags & VM_LOCKED) {
- if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
- is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current->mm))
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
- else
- mm->locked_vm += (len >> PAGE_SHIFT);
- }
-
- if (file)
- uprobe_mmap(vma);
-
- /*
- * New (or expanded) vma always get soft dirty status.
- * Otherwise user-space soft-dirty page tracker won't
- * be able to distinguish situation when vma area unmapped,
- * then new mapped in-place (which must be aimed as
- * a completely new data area).
- */
- vma->vm_flags |= VM_SOFTDIRTY;
-
- vma_set_page_prot(vma);
-
- return addr;
-
-unmap_and_free_vma:
- fput(vma->vm_file);
- vma->vm_file = NULL;
-
- /* Undo any partial mapping done by a device driver. */
- unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
- charged = 0;
- if (vm_flags & VM_SHARED)
- mapping_unmap_writable(file->f_mapping);
-free_vma:
- vm_area_free(vma);
-unacct_error:
- if (charged)
- vm_unacct_memory(charged);
- return error;
-}
-
+/*
+ * unmapped_area() Find an area between the low_limit and the high_limit with
+ * the correct alignment and offset, all from @info. Note: current->mm is used
+ * for the search.
+ *
+ * @info: The unmapped area information including the range (low_limit -
+ * hight_limit), the alignment offset and mask.
+ *
+ * Return: A memory address or -ENOMEM.
+ */
static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
- /*
- * We implement the search by looking for an rbtree node that
- * immediately follows a suitable gap. That is,
- * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
- * - gap_end = vma->vm_start >= info->low_limit + length;
- * - gap_end - gap_start >= length
- */
+ unsigned long length, gap;
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long length, low_limit, high_limit, gap_start, gap_end;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;
- /* Adjust search limits by the desired length */
- if (info->high_limit < length)
- return -ENOMEM;
- high_limit = info->high_limit - length;
-
- if (info->low_limit > high_limit)
+ if (mas_empty_area(&mas, info->low_limit, info->high_limit - 1,
+ length))
return -ENOMEM;
- low_limit = info->low_limit + length;
-
- /* Check if rbtree root looks promising */
- if (RB_EMPTY_ROOT(&mm->mm_rb))
- goto check_highest;
- vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
- if (vma->rb_subtree_gap < length)
- goto check_highest;
-
- while (true) {
- /* Visit left subtree if it looks promising */
- gap_end = vm_start_gap(vma);
- if (gap_end >= low_limit && vma->vm_rb.rb_left) {
- struct vm_area_struct *left =
- rb_entry(vma->vm_rb.rb_left,
- struct vm_area_struct, vm_rb);
- if (left->rb_subtree_gap >= length) {
- vma = left;
- continue;
- }
- }
-
- gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
-check_current:
- /* Check if current node has a suitable gap */
- if (gap_start > high_limit)
- return -ENOMEM;
- if (gap_end >= low_limit &&
- gap_end > gap_start && gap_end - gap_start >= length)
- goto found;
-
- /* Visit right subtree if it looks promising */
- if (vma->vm_rb.rb_right) {
- struct vm_area_struct *right =
- rb_entry(vma->vm_rb.rb_right,
- struct vm_area_struct, vm_rb);
- if (right->rb_subtree_gap >= length) {
- vma = right;
- continue;
- }
- }
-
- /* Go back up the rbtree to find next candidate node */
- while (true) {
- struct rb_node *prev = &vma->vm_rb;
- if (!rb_parent(prev))
- goto check_highest;
- vma = rb_entry(rb_parent(prev),
- struct vm_area_struct, vm_rb);
- if (prev == vma->vm_rb.rb_left) {
- gap_start = vm_end_gap(vma->vm_prev);
- gap_end = vm_start_gap(vma);
- goto check_current;
- }
- }
- }
-check_highest:
- /* Check highest gap, which does not precede any rbtree node */
- gap_start = mm->highest_vm_end;
- gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
- if (gap_start > high_limit)
- return -ENOMEM;
-
-found:
- /* We found a suitable gap. Clip it with the original low_limit. */
- if (gap_start < info->low_limit)
- gap_start = info->low_limit;
-
- /* Adjust gap address to the desired alignment */
- gap_start += (info->align_offset - gap_start) & info->align_mask;
-
- VM_BUG_ON(gap_start + info->length > info->high_limit);
- VM_BUG_ON(gap_start + info->length > gap_end);
- return gap_start;
+ gap = mas.index;
+ gap += (info->align_offset - gap) & info->align_mask;
+ return gap;
}
+/* unmapped_area_topdown() Find an area between the low_limit and the
+ * high_limit with * the correct alignment and offset at the highest available
+ * address, all from * @info. Note: current->mm is used for the search.
+ *
+ * @info: The unmapped area information including the range (low_limit -
+ * hight_limit), the alignment offset and mask.
+ *
+ * Return: A memory address or -ENOMEM.
+ */
static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long length, low_limit, high_limit, gap_start, gap_end;
+ unsigned long length, gap;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;
- /*
- * Adjust search limits by the desired length.
- * See implementation comment at top of unmapped_area().
- */
- gap_end = info->high_limit;
- if (gap_end < length)
- return -ENOMEM;
- high_limit = gap_end - length;
-
- if (info->low_limit > high_limit)
+ if (mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1,
+ length))
return -ENOMEM;
- low_limit = info->low_limit + length;
-
- /* Check highest gap, which does not precede any rbtree node */
- gap_start = mm->highest_vm_end;
- if (gap_start <= high_limit)
- goto found_highest;
-
- /* Check if rbtree root looks promising */
- if (RB_EMPTY_ROOT(&mm->mm_rb))
- return -ENOMEM;
- vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
- if (vma->rb_subtree_gap < length)
- return -ENOMEM;
-
- while (true) {
- /* Visit right subtree if it looks promising */
- gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
- if (gap_start <= high_limit && vma->vm_rb.rb_right) {
- struct vm_area_struct *right =
- rb_entry(vma->vm_rb.rb_right,
- struct vm_area_struct, vm_rb);
- if (right->rb_subtree_gap >= length) {
- vma = right;
- continue;
- }
- }
-
-check_current:
- /* Check if current node has a suitable gap */
- gap_end = vm_start_gap(vma);
- if (gap_end < low_limit)
- return -ENOMEM;
- if (gap_start <= high_limit &&
- gap_end > gap_start && gap_end - gap_start >= length)
- goto found;
-
- /* Visit left subtree if it looks promising */
- if (vma->vm_rb.rb_left) {
- struct vm_area_struct *left =
- rb_entry(vma->vm_rb.rb_left,
- struct vm_area_struct, vm_rb);
- if (left->rb_subtree_gap >= length) {
- vma = left;
- continue;
- }
- }
-
- /* Go back up the rbtree to find next candidate node */
- while (true) {
- struct rb_node *prev = &vma->vm_rb;
- if (!rb_parent(prev))
- return -ENOMEM;
- vma = rb_entry(rb_parent(prev),
- struct vm_area_struct, vm_rb);
- if (prev == vma->vm_rb.rb_right) {
- gap_start = vma->vm_prev ?
- vm_end_gap(vma->vm_prev) : 0;
- goto check_current;
- }
- }
- }
-
-found:
- /* We found a suitable gap. Clip it with the original high_limit. */
- if (gap_end > info->high_limit)
- gap_end = info->high_limit;
-found_highest:
- /* Compute highest gap address at the desired alignment */
- gap_end -= info->length;
- gap_end -= (gap_end - info->align_offset) & info->align_mask;
-
- VM_BUG_ON(gap_end < info->low_limit);
- VM_BUG_ON(gap_end < gap_start);
- return gap_end;
+ gap = mas.last + 1 - info->length;
+ gap -= (gap - info->align_offset) & info->align_mask;
+ return gap;
}
/*
@@ -2294,58 +1863,67 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
EXPORT_SYMBOL(get_unmapped_area);
-/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+/**
+ * find_vma_intersection() - Look up the first VMA which intersects the interval
+ * @mm: The process address space.
+ * @start_addr: The inclusive start user address.
+ * @end_addr: The exclusive end user address.
+ *
+ * Returns: The first VMA within the provided range, %NULL otherwise. Assumes
+ * start_addr < end_addr.
+ */
+struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+ unsigned long start_addr,
+ unsigned long end_addr)
{
- struct rb_node *rb_node;
- struct vm_area_struct *vma;
+ unsigned long index = start_addr;
mmap_assert_locked(mm);
- /* Check the cache first. */
- vma = vmacache_find(mm, addr);
- if (likely(vma))
- return vma;
-
- rb_node = mm->mm_rb.rb_node;
-
- while (rb_node) {
- struct vm_area_struct *tmp;
-
- tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+ return mt_find(&mm->mm_mt, &index, end_addr - 1);
+}
+EXPORT_SYMBOL(find_vma_intersection);
- if (tmp->vm_end > addr) {
- vma = tmp;
- if (tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
- }
+/**
+ * find_vma() - Find the VMA for a given address, or the next vma.
+ * @mm: The mm_struct to check
+ * @addr: The address
+ *
+ * Returns: The VMA associated with addr, or the next vma.
+ * May return %NULL in the case of no vma at addr or above.
+ */
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long index = addr;
- if (vma)
- vmacache_update(addr, vma);
- return vma;
+ mmap_assert_locked(mm);
+ return mt_find(&mm->mm_mt, &index, ULONG_MAX);
}
-
EXPORT_SYMBOL(find_vma);
-/*
- * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
+/**
+ * find_vma_prev() - Find the VMA for a given address, or the next vma and
+ * set %pprev to the previous VMA, if any.
+ * @mm: The mm_struct to check
+ * @addr: The address
+ * @pprev: The pointer to set to the previous VMA
+ *
+ * Note that RCU lock is missing here since the external mmap_lock() is used
+ * instead.
+ *
+ * Returns: The VMA associated with @addr, or the next vma.
+ * May return %NULL in the case of no vma at addr or above.
*/
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
struct vm_area_struct *vma;
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
- vma = find_vma(mm, addr);
- if (vma) {
- *pprev = vma->vm_prev;
- } else {
- struct rb_node *rb_node = rb_last(&mm->mm_rb);
-
- *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
- }
+ vma = mas_walk(&mas);
+ *pprev = mas_prev(&mas, 0);
+ if (!vma)
+ vma = mas_next(&mas, ULONG_MAX);
return vma;
}
@@ -2399,6 +1977,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
struct vm_area_struct *next;
unsigned long gap_addr;
int error = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
@@ -2416,16 +1995,21 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (gap_addr < address || gap_addr > TASK_SIZE)
gap_addr = TASK_SIZE;
- next = vma->vm_next;
- if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
+ next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+ if (next && vma_is_accessible(next)) {
if (!(next->vm_flags & VM_GROWSUP))
return -ENOMEM;
/* Check that both stack segments have the same anon_vma? */
}
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+
/* We must make sure the anon_vma is allocated. */
- if (unlikely(anon_vma_prepare(vma)))
+ if (unlikely(anon_vma_prepare(vma))) {
+ mas_destroy(&mas);
return -ENOMEM;
+ }
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -2446,15 +2030,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
error = acct_stack_growth(vma, size, grow);
if (!error) {
/*
- * vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_lock
- * lock here, so we need to protect against
- * concurrent vma expansions.
- * anon_vma_lock_write() doesn't help here, as
- * we don't guarantee that all growable vmas
- * in a mm share the same root anon vma.
- * So, we reuse mm->page_table_lock to guard
- * against concurrent vma expansions.
+ * We only hold a shared mmap_lock lock here, so
+ * we need to protect against concurrent vma
+ * expansions. anon_vma_lock_write() doesn't
+ * help here, as we don't guarantee that all
+ * growable vmas in a mm share the same root
+ * anon vma. So, we reuse mm->page_table_lock
+ * to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
@@ -2462,11 +2044,9 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
+ /* Overwrite old entry in mtree. */
+ vma_mas_store(vma, &mas);
anon_vma_interval_tree_post_update_vma(vma);
- if (vma->vm_next)
- vma_gap_update(vma->vm_next);
- else
- mm->highest_vm_end = vm_end_gap(vma);
spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
@@ -2475,7 +2055,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma(vma, vma->vm_flags);
- validate_mm(mm);
+ mas_destroy(&mas);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -2483,10 +2063,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
*/
-int expand_downwards(struct vm_area_struct *vma,
- unsigned long address)
+int expand_downwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
struct vm_area_struct *prev;
int error = 0;
@@ -2495,7 +2075,7 @@ int expand_downwards(struct vm_area_struct *vma,
return -EPERM;
/* Enforce stack_guard_gap */
- prev = vma->vm_prev;
+ prev = mas_prev(&mas, 0);
/* Check that both stack segments have the same anon_vma? */
if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
vma_is_accessible(prev)) {
@@ -2503,9 +2083,14 @@ int expand_downwards(struct vm_area_struct *vma,
return -ENOMEM;
}
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+
/* We must make sure the anon_vma is allocated. */
- if (unlikely(anon_vma_prepare(vma)))
+ if (unlikely(anon_vma_prepare(vma))) {
+ mas_destroy(&mas);
return -ENOMEM;
+ }
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -2526,15 +2111,13 @@ int expand_downwards(struct vm_area_struct *vma,
error = acct_stack_growth(vma, size, grow);
if (!error) {
/*
- * vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_lock
- * lock here, so we need to protect against
- * concurrent vma expansions.
- * anon_vma_lock_write() doesn't help here, as
- * we don't guarantee that all growable vmas
- * in a mm share the same root anon vma.
- * So, we reuse mm->page_table_lock to guard
- * against concurrent vma expansions.
+ * We only hold a shared mmap_lock lock here, so
+ * we need to protect against concurrent vma
+ * expansions. anon_vma_lock_write() doesn't
+ * help here, as we don't guarantee that all
+ * growable vmas in a mm share the same root
+ * anon vma. So, we reuse mm->page_table_lock
+ * to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
@@ -2543,8 +2126,9 @@ int expand_downwards(struct vm_area_struct *vma,
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
+ /* Overwrite old entry in mtree. */
+ vma_mas_store(vma, &mas);
anon_vma_interval_tree_post_update_vma(vma);
- vma_gap_update(vma);
spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
@@ -2553,7 +2137,7 @@ int expand_downwards(struct vm_area_struct *vma,
}
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma(vma, vma->vm_flags);
- validate_mm(mm);
+ mas_destroy(&mas);
return error;
}
@@ -2627,25 +2211,26 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
EXPORT_SYMBOL_GPL(find_extend_vma);
/*
- * Ok - we have the memory areas we should free on the vma list,
- * so release them, and do the vma updates.
+ * Ok - we have the memory areas we should free on a maple tree so release them,
+ * and do the vma updates.
*
* Called with the mm semaphore held.
*/
-static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
{
unsigned long nr_accounted = 0;
+ struct vm_area_struct *vma;
/* Update high watermark before we lower total_vm */
update_hiwater_vm(mm);
- do {
+ mas_for_each(mas, vma, ULONG_MAX) {
long nrpages = vma_pages(vma);
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += nrpages;
vm_stat_account(mm, vma->vm_flags, -nrpages);
- vma = remove_vma(vma);
- } while (vma);
+ remove_vma(vma);
+ }
vm_unacct_memory(nr_accounted);
validate_mm(mm);
}
@@ -2655,67 +2240,23 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
*
* Called with the mm semaphore held.
*/
-static void unmap_region(struct mm_struct *mm,
+static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
struct vm_area_struct *vma, struct vm_area_struct *prev,
+ struct vm_area_struct *next,
unsigned long start, unsigned long end)
{
- struct vm_area_struct *next = vma_next(mm, prev);
struct mmu_gather tlb;
lru_add_drain();
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
- unmap_vmas(&tlb, vma, start, end);
- free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+ unmap_vmas(&tlb, mt, vma, start, end);
+ free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
}
/*
- * Create a list of vma's touched by the unmap, removing them from the mm's
- * vma list as we go..
- */
-static bool
-detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, unsigned long end)
-{
- struct vm_area_struct **insertion_point;
- struct vm_area_struct *tail_vma = NULL;
-
- insertion_point = (prev ? &prev->vm_next : &mm->mmap);
- vma->vm_prev = NULL;
- do {
- vma_rb_erase(vma, &mm->mm_rb);
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm -= vma_pages(vma);
- mm->map_count--;
- tail_vma = vma;
- vma = vma->vm_next;
- } while (vma && vma->vm_start < end);
- *insertion_point = vma;
- if (vma) {
- vma->vm_prev = prev;
- vma_gap_update(vma);
- } else
- mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
- tail_vma->vm_next = NULL;
-
- /* Kill the cache */
- vmacache_invalidate(mm);
-
- /*
- * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
- * VM_GROWSUP VMA. Such VMAs can change their size under
- * down_read(mmap_lock) and collide with the VMA we are about to unmap.
- */
- if (vma && (vma->vm_flags & VM_GROWSDOWN))
- return false;
- if (prev && (prev->vm_flags & VM_GROWSUP))
- return false;
- return true;
-}
-
-/*
* __split_vma() bypasses sysctl_max_map_count checking. We use this where it
* has already been checked or doesn't make sense to fail.
*/
@@ -2724,6 +2265,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct vm_area_struct *new;
int err;
+ validate_mm_mt(mm);
if (vma->vm_ops && vma->vm_ops->may_split) {
err = vma->vm_ops->may_split(vma, addr);
@@ -2776,6 +2318,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
mpol_put(vma_policy(new));
out_free_vma:
vm_area_free(new);
+ validate_mm_mt(mm);
return err;
}
@@ -2792,38 +2335,48 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
return __split_vma(mm, vma, addr, new_below);
}
-/* Munmap is split into 2 main parts -- this part which finds
- * what needs doing, and the areas themselves, which do the
- * work. This now handles partial unmappings.
- * Jeremy Fitzhardinge <jeremy@goop.org>
- */
-int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
- struct list_head *uf, bool downgrade)
+static inline int munmap_sidetree(struct vm_area_struct *vma,
+ struct ma_state *mas_detach)
{
- unsigned long end;
- struct vm_area_struct *vma, *prev, *last;
-
- if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
- return -EINVAL;
+ mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1);
+ if (mas_store_gfp(mas_detach, vma, GFP_KERNEL))
+ return -ENOMEM;
- len = PAGE_ALIGN(len);
- end = start + len;
- if (len == 0)
- return -EINVAL;
+ if (vma->vm_flags & VM_LOCKED)
+ vma->vm_mm->locked_vm -= vma_pages(vma);
- /*
- * arch_unmap() might do unmaps itself. It must be called
- * and finish any rbtree manipulation before this code
- * runs and also starts to manipulate the rbtree.
- */
- arch_unmap(mm, start, end);
+ return 0;
+}
- /* Find the first overlapping VMA where start < vma->vm_end */
- vma = find_vma_intersection(mm, start, end);
- if (!vma)
- return 0;
- prev = vma->vm_prev;
+/*
+ * do_mas_align_munmap() - munmap the aligned region from @start to @end.
+ * @mas: The maple_state, ideally set up to alter the correct tree location.
+ * @vma: The starting vm_area_struct
+ * @mm: The mm_struct
+ * @start: The aligned start address to munmap.
+ * @end: The aligned end address to munmap.
+ * @uf: The userfaultfd list_head
+ * @downgrade: Set to true to attempt a write downgrade of the mmap_sem
+ *
+ * If @downgrade is true, check return code for potential release of the lock.
+ */
+static int
+do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct list_head *uf, bool downgrade)
+{
+ struct vm_area_struct *prev, *next = NULL;
+ struct maple_tree mt_detach;
+ int count = 0;
+ int error = -ENOMEM;
+ MA_STATE(mas_detach, &mt_detach, 0, 0);
+ mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
+ mt_set_external_lock(&mt_detach, &mm->mmap_lock);
+
+ if (mas_preallocate(mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+ mas->last = end - 1;
/*
* If we need to split any vma, do it now to save pain later.
*
@@ -2831,8 +2384,9 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
* unmapped vm_area_struct will remain in use: so lower split_vma
* places tmp vma above, and higher split_vma places tmp vma below.
*/
+
+ /* Does it split the first one? */
if (start > vma->vm_start) {
- int error;
/*
* Make sure that map_count on return from munmap() will
@@ -2840,22 +2394,61 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
* its limit temporarily, to help free resources as expected.
*/
if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
- return -ENOMEM;
+ goto map_count_exceeded;
+ /*
+ * mas_pause() is not needed since mas->index needs to be set
+ * differently than vma->vm_end anyways.
+ */
error = __split_vma(mm, vma, start, 0);
if (error)
- return error;
- prev = vma;
+ goto start_split_failed;
+
+ mas_set(mas, start);
+ vma = mas_walk(mas);
}
- /* Does it split the last one? */
- last = find_vma(mm, end);
- if (last && end > last->vm_start) {
- int error = __split_vma(mm, last, end, 1);
+ prev = mas_prev(mas, 0);
+ if (unlikely((!prev)))
+ mas_set(mas, start);
+
+ /*
+ * Detach a range of VMAs from the mm. Using next as a temp variable as
+ * it is always overwritten.
+ */
+ mas_for_each(mas, next, end - 1) {
+ /* Does it split the end? */
+ if (next->vm_end > end) {
+ struct vm_area_struct *split;
+
+ error = __split_vma(mm, next, end, 1);
+ if (error)
+ goto end_split_failed;
+
+ mas_set(mas, end);
+ split = mas_prev(mas, 0);
+ error = munmap_sidetree(split, &mas_detach);
+ if (error)
+ goto munmap_sidetree_failed;
+
+ count++;
+ if (vma == next)
+ vma = split;
+ break;
+ }
+ error = munmap_sidetree(next, &mas_detach);
if (error)
- return error;
+ goto munmap_sidetree_failed;
+
+ count++;
+#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
+ BUG_ON(next->vm_start < start);
+ BUG_ON(next->vm_start > end);
+#endif
}
- vma = vma_next(mm, prev);
+
+ if (!next)
+ next = mas_next(mas, ULONG_MAX);
if (unlikely(uf)) {
/*
@@ -2867,30 +2460,366 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
* split, despite we could. This is unlikely enough
* failure that it's not worth optimizing it for.
*/
- int error = userfaultfd_unmap_prep(vma, start, end, uf);
+ error = userfaultfd_unmap_prep(mm, start, end, uf);
+
if (error)
- return error;
+ goto userfaultfd_error;
+ }
+
+ /* Point of no return */
+ mas_set_range(mas, start, end - 1);
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+ /* Make sure no VMAs are about to be lost. */
+ {
+ MA_STATE(test, &mt_detach, start, end - 1);
+ struct vm_area_struct *vma_mas, *vma_test;
+ int test_count = 0;
+
+ rcu_read_lock();
+ vma_test = mas_find(&test, end - 1);
+ mas_for_each(mas, vma_mas, end - 1) {
+ BUG_ON(vma_mas != vma_test);
+ test_count++;
+ vma_test = mas_next(&test, end - 1);
+ }
+ rcu_read_unlock();
+ BUG_ON(count != test_count);
+ mas_set_range(mas, start, end - 1);
+ }
+#endif
+ mas_store_prealloc(mas, NULL);
+ mm->map_count -= count;
+ /*
+ * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
+ * VM_GROWSUP VMA. Such VMAs can change their size under
+ * down_read(mmap_lock) and collide with the VMA we are about to unmap.
+ */
+ if (downgrade) {
+ if (next && (next->vm_flags & VM_GROWSDOWN))
+ downgrade = false;
+ else if (prev && (prev->vm_flags & VM_GROWSUP))
+ downgrade = false;
+ else
+ mmap_write_downgrade(mm);
}
- /* Detach vmas from rbtree */
- if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
- downgrade = false;
+ unmap_region(mm, &mt_detach, vma, prev, next, start, end);
+ /* Statistics and freeing VMAs */
+ mas_set(&mas_detach, start);
+ remove_mt(mm, &mas_detach);
+ __mt_destroy(&mt_detach);
+
+
+ validate_mm(mm);
+ return downgrade ? 1 : 0;
+
+userfaultfd_error:
+munmap_sidetree_failed:
+end_split_failed:
+ __mt_destroy(&mt_detach);
+start_split_failed:
+map_count_exceeded:
+ mas_destroy(mas);
+ return error;
+}
+
+/*
+ * do_mas_munmap() - munmap a given range.
+ * @mas: The maple state
+ * @mm: The mm_struct
+ * @start: The start address to munmap
+ * @len: The length of the range to munmap
+ * @uf: The userfaultfd list_head
+ * @downgrade: set to true if the user wants to attempt to write_downgrade the
+ * mmap_sem
+ *
+ * This function takes a @mas that is either pointing to the previous VMA or set
+ * to MA_START and sets it up to remove the mapping(s). The @len will be
+ * aligned and any arch_unmap work will be preformed.
+ *
+ * Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise.
+ */
+int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm,
+ unsigned long start, size_t len, struct list_head *uf,
+ bool downgrade)
+{
+ unsigned long end;
+ struct vm_area_struct *vma;
+
+ if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
+ return -EINVAL;
- if (downgrade)
- mmap_write_downgrade(mm);
+ end = start + PAGE_ALIGN(len);
+ if (end == start)
+ return -EINVAL;
- unmap_region(mm, vma, prev, start, end);
+ /* arch_unmap() might do unmaps itself. */
+ arch_unmap(mm, start, end);
- /* Fix up all other VM information */
- remove_vma_list(mm, vma);
+ /* Find the first overlapping VMA */
+ vma = mas_find(mas, end - 1);
+ if (!vma)
+ return 0;
- return downgrade ? 1 : 0;
+ return do_mas_align_munmap(mas, vma, mm, start, end, uf, downgrade);
}
+/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls.
+ * @mm: The mm_struct
+ * @start: The start address to munmap
+ * @len: The length to be munmapped.
+ * @uf: The userfaultfd list_head
+ */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
struct list_head *uf)
{
- return __do_munmap(mm, start, len, uf, false);
+ MA_STATE(mas, &mm->mm_mt, start, start);
+
+ return do_mas_munmap(&mas, mm, start, len, uf, false);
+}
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+ struct list_head *uf)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
+ struct vm_area_struct *next, *prev, *merge;
+ pgoff_t pglen = len >> PAGE_SHIFT;
+ unsigned long charged = 0;
+ unsigned long end = addr + len;
+ unsigned long merge_start = addr, merge_end = end;
+ pgoff_t vm_pgoff;
+ int error;
+ MA_STATE(mas, &mm->mm_mt, addr, end - 1);
+
+ /* Check against address space limit. */
+ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
+ unsigned long nr_pages;
+
+ /*
+ * MAP_FIXED may remove pages of mappings that intersects with
+ * requested mapping. Account for the pages it would unmap.
+ */
+ nr_pages = count_vma_pages_range(mm, addr, end);
+
+ if (!may_expand_vm(mm, vm_flags,
+ (len >> PAGE_SHIFT) - nr_pages))
+ return -ENOMEM;
+ }
+
+ /* Unmap any existing mapping in the area */
+ if (do_mas_munmap(&mas, mm, addr, len, uf, false))
+ return -ENOMEM;
+
+ /*
+ * Private writable mapping: check memory availability
+ */
+ if (accountable_mapping(file, vm_flags)) {
+ charged = len >> PAGE_SHIFT;
+ if (security_vm_enough_memory_mm(mm, charged))
+ return -ENOMEM;
+ vm_flags |= VM_ACCOUNT;
+ }
+
+ next = mas_next(&mas, ULONG_MAX);
+ prev = mas_prev(&mas, 0);
+ if (vm_flags & VM_SPECIAL)
+ goto cannot_expand;
+
+ /* Attempt to expand an old mapping */
+ /* Check next */
+ if (next && next->vm_start == end && !vma_policy(next) &&
+ can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
+ NULL_VM_UFFD_CTX, NULL)) {
+ merge_end = next->vm_end;
+ vma = next;
+ vm_pgoff = next->vm_pgoff - pglen;
+ }
+
+ /* Check prev */
+ if (prev && prev->vm_end == addr && !vma_policy(prev) &&
+ (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
+ pgoff, vma->vm_userfaultfd_ctx, NULL) :
+ can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
+ NULL_VM_UFFD_CTX , NULL))) {
+ merge_start = prev->vm_start;
+ vma = prev;
+ vm_pgoff = prev->vm_pgoff;
+ }
+
+
+ /* Actually expand, if possible */
+ if (vma &&
+ !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) {
+ khugepaged_enter_vma(vma, vm_flags);
+ goto expanded;
+ }
+
+ mas.index = addr;
+ mas.last = end - 1;
+cannot_expand:
+ /*
+ * Determine the object being mapped and call the appropriate
+ * specific mapper. the address has already been validated, but
+ * not unmapped, but the maps are removed from the list.
+ */
+ vma = vm_area_alloc(mm);
+ if (!vma) {
+ error = -ENOMEM;
+ goto unacct_error;
+ }
+
+ vma->vm_start = addr;
+ vma->vm_end = end;
+ vma->vm_flags = vm_flags;
+ vma->vm_page_prot = vm_get_page_prot(vm_flags);
+ vma->vm_pgoff = pgoff;
+
+ if (file) {
+ if (vm_flags & VM_SHARED) {
+ error = mapping_map_writable(file->f_mapping);
+ if (error)
+ goto free_vma;
+ }
+
+ vma->vm_file = get_file(file);
+ error = call_mmap(file, vma);
+ if (error)
+ goto unmap_and_free_vma;
+
+ /* Can addr have changed??
+ *
+ * Answer: Yes, several device drivers can do it in their
+ * f_op->mmap method. -DaveM
+ */
+ WARN_ON_ONCE(addr != vma->vm_start);
+
+ addr = vma->vm_start;
+ mas_reset(&mas);
+
+ /*
+ * If vm_flags changed after call_mmap(), we should try merge
+ * vma again as we may succeed this time.
+ */
+ if (unlikely(vm_flags != vma->vm_flags && prev)) {
+ merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
+ NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
+ if (merge) {
+ /*
+ * ->mmap() can change vma->vm_file and fput
+ * the original file. So fput the vma->vm_file
+ * here or we would add an extra fput for file
+ * and cause general protection fault
+ * ultimately.
+ */
+ fput(vma->vm_file);
+ vm_area_free(vma);
+ vma = merge;
+ /* Update vm_flags to pick up the change. */
+ addr = vma->vm_start;
+ vm_flags = vma->vm_flags;
+ goto unmap_writable;
+ }
+ }
+
+ vm_flags = vma->vm_flags;
+ } else if (vm_flags & VM_SHARED) {
+ error = shmem_zero_setup(vma);
+ if (error)
+ goto free_vma;
+ } else {
+ vma_set_anonymous(vma);
+ }
+
+ /* Allow architectures to sanity-check the vm_flags */
+ if (!arch_validate_flags(vma->vm_flags)) {
+ error = -EINVAL;
+ if (file)
+ goto unmap_and_free_vma;
+ else
+ goto free_vma;
+ }
+
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ error = -ENOMEM;
+ if (file)
+ goto unmap_and_free_vma;
+ else
+ goto free_vma;
+ }
+
+ if (vma->vm_file)
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+
+ vma_mas_store(vma, &mas);
+ mm->map_count++;
+ if (vma->vm_file) {
+ if (vma->vm_flags & VM_SHARED)
+ mapping_allow_writable(vma->vm_file->f_mapping);
+
+ flush_dcache_mmap_lock(vma->vm_file->f_mapping);
+ vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap);
+ flush_dcache_mmap_unlock(vma->vm_file->f_mapping);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ }
+
+ /*
+ * vma_merge() calls khugepaged_enter_vma() either, the below
+ * call covers the non-merge case.
+ */
+ khugepaged_enter_vma(vma, vma->vm_flags);
+
+ /* Once vma denies write, undo our temporary denial count */
+unmap_writable:
+ if (file && vm_flags & VM_SHARED)
+ mapping_unmap_writable(file->f_mapping);
+ file = vma->vm_file;
+expanded:
+ perf_event_mmap(vma);
+
+ vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
+ if (vm_flags & VM_LOCKED) {
+ if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
+ is_vm_hugetlb_page(vma) ||
+ vma == get_gate_vma(current->mm))
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ else
+ mm->locked_vm += (len >> PAGE_SHIFT);
+ }
+
+ if (file)
+ uprobe_mmap(vma);
+
+ /*
+ * New (or expanded) vma always get soft dirty status.
+ * Otherwise user-space soft-dirty page tracker won't
+ * be able to distinguish situation when vma area unmapped,
+ * then new mapped in-place (which must be aimed as
+ * a completely new data area).
+ */
+ vma->vm_flags |= VM_SOFTDIRTY;
+
+ vma_set_page_prot(vma);
+
+ validate_mm(mm);
+ return addr;
+
+unmap_and_free_vma:
+ fput(vma->vm_file);
+ vma->vm_file = NULL;
+
+ /* Undo any partial mapping done by a device driver. */
+ unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end);
+ if (vm_flags & VM_SHARED)
+ mapping_unmap_writable(file->f_mapping);
+free_vma:
+ vm_area_free(vma);
+unacct_error:
+ if (charged)
+ vm_unacct_memory(charged);
+ validate_mm(mm);
+ return error;
}
static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
@@ -2898,11 +2827,12 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
int ret;
struct mm_struct *mm = current->mm;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, start, start);
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = __do_munmap(mm, start, len, &uf, downgrade);
+ ret = do_mas_munmap(&mas, mm, start, len, &uf, downgrade);
/*
* Returning 1 indicates mmap_lock is downgraded.
* But 1 is not legal return value of vm_munmap() and munmap(), reset
@@ -2968,11 +2898,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
goto out;
if (start + size > vma->vm_end) {
- struct vm_area_struct *next;
+ VMA_ITERATOR(vmi, mm, vma->vm_end);
+ struct vm_area_struct *next, *prev = vma;
- for (next = vma->vm_next; next; next = next->vm_next) {
+ for_each_vma_range(vmi, next, start + size) {
/* hole between vmas ? */
- if (next->vm_start != next->vm_prev->vm_end)
+ if (next->vm_start != prev->vm_end)
goto out;
if (next->vm_file != vma->vm_file)
@@ -2981,8 +2912,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (next->vm_flags != vma->vm_flags)
goto out;
- if (start + size <= next->vm_end)
- break;
+ prev = next;
}
if (!next)
@@ -3012,37 +2942,106 @@ out:
}
/*
- * this is really a simplified "do_mmap". it only handles
- * anonymous maps. eventually we may be able to do some
- * brk-specific accounting here.
+ * brk_munmap() - Unmap a parital vma.
+ * @mas: The maple tree state.
+ * @vma: The vma to be modified
+ * @newbrk: the start of the address to unmap
+ * @oldbrk: The end of the address to unmap
+ * @uf: The userfaultfd list_head
+ *
+ * Returns: 1 on success.
+ * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if
+ * possible.
*/
-static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long newbrk, unsigned long oldbrk,
+ struct list_head *uf)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
- struct rb_node **rb_link, *rb_parent;
- pgoff_t pgoff = addr >> PAGE_SHIFT;
- int error;
- unsigned long mapped_addr;
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct unmap, *next;
+ unsigned long unmap_pages;
+ int ret;
- /* Until we need other flags, refuse anything except VM_EXEC. */
- if ((flags & (~VM_EXEC)) != 0)
- return -EINVAL;
- flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+ arch_unmap(mm, newbrk, oldbrk);
- mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
- if (IS_ERR_VALUE(mapped_addr))
- return mapped_addr;
+ if (likely((vma->vm_end < oldbrk) || (vma->vm_start >= newbrk))) {
+ /* remove entire mapping(s) */
+ ret = do_mas_align_munmap(mas, vma, mm, newbrk, oldbrk, uf,
+ true);
+ goto munmap_full_vma;
+ }
- error = mlock_future_check(mm, mm->def_flags, len);
- if (error)
- return error;
+ ret = userfaultfd_unmap_prep(mm, newbrk, oldbrk, uf);
+ if (ret)
+ return ret;
+
+ ret = 1;
- /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
- if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
+ /* Change the oldbrk of vma to the newbrk of the munmap area */
+ vma_adjust_trans_huge(vma, vma->vm_start, newbrk, 0);
+ if (mas_preallocate(mas, vma, GFP_KERNEL))
return -ENOMEM;
- /* Check against address space limits *after* clearing old maps... */
+ if (vma->anon_vma) {
+ anon_vma_lock_write(vma->anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ }
+
+ vma->vm_end = newbrk;
+ vma_init(&unmap, mm);
+ unmap.vm_start = newbrk;
+ unmap.vm_end = oldbrk;
+ if (vma->anon_vma)
+ vma_set_anonymous(&unmap);
+
+ vma_mas_remove(&unmap, mas);
+
+ vma->vm_end = newbrk;
+ if (vma->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
+ }
+
+ unmap_pages = vma_pages(&unmap);
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm -= unmap_pages;
+
+ next = mas_next(mas, ULONG_MAX);
+ mmap_write_downgrade(mm);
+ unmap_region(mm, mas->tree, &unmap, vma, next, newbrk, oldbrk);
+ /* Statistics */
+ vm_stat_account(mm, vma->vm_flags, -unmap_pages);
+ if (vma->vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(unmap_pages);
+
+munmap_full_vma:
+ validate_mm_mt(mm);
+ return ret;
+}
+
+/*
+ * do_brk_flags() - Increase the brk vma if the flags match.
+ * @mas: The maple tree state.
+ * @addr: The start address
+ * @len: The length of the increase
+ * @vma: The vma,
+ * @flags: The VMA Flags
+ *
+ * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
+ * do not match then create a new anonymous VMA. Eventually we may be able to
+ * do some brk-specific accounting here.
+ */
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long len, unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ validate_mm_mt(mm);
+
+ /*
+ * Check against address space limits by the changed size
+ * Note: This happens *after* clearing old mappings in some code paths.
+ */
+ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
return -ENOMEM;
@@ -3052,28 +3051,50 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
- /* Can we just expand an old private anonymous mapping? */
- vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
- if (vma)
- goto out;
-
/*
- * create a vma struct for an anonymous mapping
+ * Expand the existing vma if possible; Note that singular lists do not
+ * occur after forking, so the expand will only happen on new VMAs.
*/
- vma = vm_area_alloc(mm);
- if (!vma) {
- vm_unacct_memory(len >> PAGE_SHIFT);
- return -ENOMEM;
+ if (vma &&
+ (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) &&
+ ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) {
+ mas->index = vma->vm_start;
+ mas->last = addr + len - 1;
+ vma_adjust_trans_huge(vma, addr, addr + len, 0);
+ if (vma->anon_vma) {
+ anon_vma_lock_write(vma->anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ }
+ vma->vm_end = addr + len;
+ vma->vm_flags |= VM_SOFTDIRTY;
+ if (mas_store_gfp(mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (vma->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
+ }
+ khugepaged_enter_vma(vma, flags);
+ goto out;
}
+ /* create a vma struct for an anonymous mapping */
+ vma = vm_area_alloc(mm);
+ if (!vma)
+ goto vma_alloc_fail;
+
vma_set_anonymous(vma);
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_pgoff = pgoff;
+ vma->vm_pgoff = addr >> PAGE_SHIFT;
vma->vm_flags = flags;
vma->vm_page_prot = vm_get_page_prot(flags);
- vma_link(mm, vma, prev, rb_link, rb_parent);
+ mas_set_range(mas, vma->vm_start, addr + len - 1);
+ if ( mas_store_gfp(mas, vma, GFP_KERNEL))
+ goto mas_store_fail;
+
+ mm->map_count++;
+
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
@@ -3081,16 +3102,25 @@ out:
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
vma->vm_flags |= VM_SOFTDIRTY;
+ validate_mm(mm);
return 0;
+
+mas_store_fail:
+ vm_area_free(vma);
+vma_alloc_fail:
+ vm_unacct_memory(len >> PAGE_SHIFT);
+ return -ENOMEM;
}
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
unsigned long len;
int ret;
bool populate;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
len = PAGE_ALIGN(request);
if (len < request)
@@ -3101,13 +3131,36 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = do_brk_flags(addr, len, flags, &uf);
+ /* Until we need other flags, refuse anything except VM_EXEC. */
+ if ((flags & (~VM_EXEC)) != 0)
+ return -EINVAL;
+
+ ret = check_brk_limits(addr, len);
+ if (ret)
+ goto limits_failed;
+
+ ret = do_mas_munmap(&mas, mm, addr, len, &uf, 0);
+ if (ret)
+ goto munmap_failed;
+
+ vma = mas_prev(&mas, 0);
+ if (!vma || vma->vm_end != addr || vma_policy(vma) ||
+ !can_vma_merge_after(vma, flags, NULL, NULL,
+ addr >> PAGE_SHIFT,NULL_VM_UFFD_CTX, NULL))
+ vma = NULL;
+
+ ret = do_brk_flags(&mas, vma, addr, len, flags);
populate = ((mm->def_flags & VM_LOCKED) != 0);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate && !ret)
mm_populate(addr, len);
return ret;
+
+munmap_failed:
+limits_failed:
+ mmap_write_unlock(mm);
+ return ret;
}
EXPORT_SYMBOL(vm_brk_flags);
@@ -3123,34 +3176,19 @@ void exit_mmap(struct mm_struct *mm)
struct mmu_gather tlb;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ int count = 0;
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);
- if (unlikely(mm_is_oom_victim(mm))) {
- /*
- * Manually reap the mm to free as much memory as possible.
- * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
- * this mm from further consideration. Taking mm->mmap_lock for
- * write after setting MMF_OOM_SKIP will guarantee that the oom
- * reaper will not run on this mm again after mmap_lock is
- * dropped.
- *
- * Nothing can be holding mm->mmap_lock here and the above call
- * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
- * __oom_reap_task_mm() will not block.
- */
- (void)__oom_reap_task_mm(mm);
- set_bit(MMF_OOM_SKIP, &mm->flags);
- }
-
- mmap_write_lock(mm);
+ mmap_read_lock(mm);
arch_exit_mmap(mm);
- vma = mm->mmap;
+ vma = mas_find(&mas, ULONG_MAX);
if (!vma) {
/* Can happen if dup_mmap() received an OOM */
- mmap_write_unlock(mm);
+ mmap_read_unlock(mm);
return;
}
@@ -3158,19 +3196,38 @@ void exit_mmap(struct mm_struct *mm)
flush_cache_mm(mm);
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
- /* Use -1 here to ensure all VMAs in the mm are unmapped */
- unmap_vmas(&tlb, vma, 0, -1);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
+ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
+ unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX);
+ mmap_read_unlock(mm);
+
+ /*
+ * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
+ * because the memory has been already freed. Do not bother checking
+ * mm_is_oom_victim because setting a bit unconditionally is cheaper.
+ */
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ mmap_write_lock(mm);
+ free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
+ USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
- /* Walk the list again, actually closing and freeing it. */
- while (vma) {
+ /*
+ * Walk the list again, actually closing and freeing it, with preemption
+ * enabled, without holding any MM locks besides the unreachable
+ * mmap_write_lock.
+ */
+ do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
- vma = remove_vma(vma);
+ remove_vma(vma);
+ count++;
cond_resched();
- }
- mm->mmap = NULL;
+ } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
+
+ BUG_ON(count != mm->map_count);
+
+ trace_exit_mmap(mm);
+ __mt_destroy(&mm->mm_mt);
mmap_write_unlock(mm);
vm_unacct_memory(nr_accounted);
}
@@ -3181,12 +3238,9 @@ void exit_mmap(struct mm_struct *mm)
*/
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct *prev;
- struct rb_node **rb_link, *rb_parent;
-
- if (find_vma_links(mm, vma->vm_start, vma->vm_end,
- &prev, &rb_link, &rb_parent))
+ if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
return -ENOMEM;
+
if ((vma->vm_flags & VM_ACCOUNT) &&
security_vm_enough_memory_mm(mm, vma_pages(vma)))
return -ENOMEM;
@@ -3208,7 +3262,9 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
- vma_link(mm, vma, prev, rb_link, rb_parent);
+ if (vma_link(mm, vma))
+ return -ENOMEM;
+
return 0;
}
@@ -3224,9 +3280,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
unsigned long vma_start = vma->vm_start;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma, *prev;
- struct rb_node **rb_link, *rb_parent;
bool faulted_in_anon_vma = true;
+ validate_mm_mt(mm);
/*
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
@@ -3236,8 +3292,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
faulted_in_anon_vma = false;
}
- if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
+ new_vma = find_vma_prev(mm, addr, &prev);
+ if (new_vma->vm_start < addr + len)
return NULL; /* should never get here */
+
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
@@ -3278,16 +3336,22 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
- vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ if (vma_link(mm, new_vma))
+ goto out_vma_link;
*need_rmap_locks = false;
}
+ validate_mm_mt(mm);
return new_vma;
+out_vma_link:
+ if (new_vma->vm_ops && new_vma->vm_ops->close)
+ new_vma->vm_ops->close(new_vma);
out_free_mempol:
mpol_put(vma_policy(new_vma));
out_free_vma:
vm_area_free(new_vma);
out:
+ validate_mm_mt(mm);
return NULL;
}
@@ -3424,6 +3488,7 @@ static struct vm_area_struct *__install_special_mapping(
int ret;
struct vm_area_struct *vma;
+ validate_mm_mt(mm);
vma = vm_area_alloc(mm);
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
@@ -3446,10 +3511,12 @@ static struct vm_area_struct *__install_special_mapping(
perf_event_mmap(vma);
+ validate_mm_mt(mm);
return vma;
out:
vm_area_free(vma);
+ validate_mm_mt(mm);
return ERR_PTR(ret);
}
@@ -3574,12 +3641,13 @@ int mm_take_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
mmap_assert_write_locked(mm);
mutex_lock(&mm_all_locks_mutex);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
@@ -3587,7 +3655,8 @@ int mm_take_all_locks(struct mm_struct *mm)
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_set(&mas, 0);
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
@@ -3595,7 +3664,8 @@ int mm_take_all_locks(struct mm_struct *mm)
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_set(&mas, 0);
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
if (vma->anon_vma)
@@ -3654,11 +3724,12 @@ void mm_drop_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
mmap_assert_write_locked(mm);
BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (vma->anon_vma)
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
vm_unlock_anon_vma(avc->anon_vma);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ba5592655ee3..041beeb5fad6 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -38,6 +38,39 @@
#include "internal.h"
+static inline bool can_change_pte_writable(struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte)
+{
+ struct page *page;
+
+ VM_BUG_ON(!(vma->vm_flags & VM_WRITE) || pte_write(pte));
+
+ if (pte_protnone(pte) || !pte_dirty(pte))
+ return false;
+
+ /* Do we need write faults for softdirty tracking? */
+ if ((vma->vm_flags & VM_SOFTDIRTY) && !pte_soft_dirty(pte))
+ return false;
+
+ /* Do we need write faults for uffd-wp tracking? */
+ if (userfaultfd_pte_wp(vma, pte))
+ return false;
+
+ if (!(vma->vm_flags & VM_SHARED)) {
+ /*
+ * We can only special-case on exclusive anonymous pages,
+ * because we know that our write-fault handler similarly would
+ * map them writable without any additional checks while holding
+ * the PT lock.
+ */
+ page = vm_normal_page(vma, addr, pte);
+ if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ return false;
+ }
+
+ return true;
+}
+
static unsigned long change_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
@@ -46,7 +79,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
spinlock_t *ptl;
unsigned long pages = 0;
int target_node = NUMA_NO_NODE;
- bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
@@ -95,7 +127,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
continue;
page = vm_normal_page(vma, addr, oldpte);
- if (!page || PageKsm(page))
+ if (!page || is_zone_device_page(page) || PageKsm(page))
continue;
/* Also skip shared copy-on-write pages */
@@ -137,21 +169,27 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
ptent = pte_wrprotect(ptent);
ptent = pte_mkuffd_wp(ptent);
} else if (uffd_wp_resolve) {
- /*
- * Leave the write bit to be handled
- * by PF interrupt handler, then
- * things like COW could be properly
- * handled.
- */
ptent = pte_clear_uffd_wp(ptent);
}
- /* Avoid taking write faults for known dirty pages */
- if (dirty_accountable && pte_dirty(ptent) &&
- (pte_soft_dirty(ptent) ||
- !(vma->vm_flags & VM_SOFTDIRTY))) {
+ /*
+ * In some writable, shared mappings, we might want
+ * to catch actual write access -- see
+ * vma_wants_writenotify().
+ *
+ * In all writable, private mappings, we have to
+ * properly handle COW.
+ *
+ * In both cases, we can sometimes still change PTEs
+ * writable and avoid the write-fault handler, for
+ * example, if a PTE is already dirty and no other
+ * COW or special handling is required.
+ */
+ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
+ !pte_write(ptent) &&
+ can_change_pte_writable(vma, addr, ptent))
ptent = pte_mkwrite(ptent);
- }
+
ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
if (pte_needs_flush(oldpte, ptent))
tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
@@ -505,9 +543,9 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long oldflags = vma->vm_flags;
long nrpages = (end - start) >> PAGE_SHIFT;
unsigned long charged = 0;
+ bool try_change_writable;
pgoff_t pgoff;
int error;
- int dirty_accountable = 0;
if (newflags == oldflags) {
*pprev = vma;
@@ -583,11 +621,20 @@ success:
* held in write mode.
*/
vma->vm_flags = newflags;
- dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
+ /*
+ * We want to check manually if we can change individual PTEs writable
+ * if we can't do that automatically for all PTEs in a mapping. For
+ * private mappings, that's always the case when we have write
+ * permissions as we properly have to handle COW.
+ */
+ if (vma->vm_flags & VM_SHARED)
+ try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot);
+ else
+ try_change_writable = !!(vma->vm_flags & VM_WRITE);
vma_set_page_prot(vma);
change_protection(tlb, vma, start, end, vma->vm_page_prot,
- dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
+ try_change_writable ? MM_CP_TRY_CHANGE_WRITABLE : 0);
/*
* Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
@@ -621,6 +668,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
(prot & PROT_READ);
struct mmu_gather tlb;
+ MA_STATE(mas, &current->mm->mm_mt, start, start);
start = untagged_addr(start);
@@ -652,7 +700,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
goto out;
- vma = find_vma(current->mm, start);
+ vma = mas_find(&mas, ULONG_MAX);
error = -ENOMEM;
if (!vma)
goto out;
@@ -678,7 +726,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if (start > vma->vm_start)
prev = vma;
else
- prev = vma->vm_prev;
+ prev = mas_prev(&mas, 0);
tlb_gather_mmu(&tlb, current->mm);
for (nstart = start ; ; ) {
@@ -741,7 +789,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if (nstart >= end)
break;
- vma = prev->vm_next;
+ vma = find_vma(current->mm, prev->vm_end);
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
break;
diff --git a/mm/mremap.c b/mm/mremap.c
index b522cd0259a0..e465ffe279bb 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -9,6 +9,7 @@
*/
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
@@ -23,6 +24,7 @@
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/userfaultfd_k.h>
+#include <linux/mempolicy.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
@@ -716,7 +718,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (excess) {
vma->vm_flags |= VM_ACCOUNT;
if (split)
- vma->vm_next->vm_flags |= VM_ACCOUNT;
+ find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT;
}
return new_addr;
@@ -866,9 +868,10 @@ out:
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
unsigned long end = vma->vm_end + delta;
+
if (end < vma->vm_end) /* overflow */
return 0;
- if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
+ if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
return 0;
if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
0, MAP_FIXED) & ~PAGE_MASK)
@@ -975,20 +978,23 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Always allow a shrinking remap: that just unmaps
* the unnecessary pages..
- * __do_munmap does all the needed commit accounting, and
+ * do_mas_munmap does all the needed commit accounting, and
* downgrades mmap_lock to read if so directed.
*/
if (old_len >= new_len) {
int retval;
+ MA_STATE(mas, &mm->mm_mt, addr + new_len, addr + new_len);
- retval = __do_munmap(mm, addr+new_len, old_len - new_len,
- &uf_unmap, true);
- if (retval < 0 && old_len != new_len) {
- ret = retval;
- goto out;
+ retval = do_mas_munmap(&mas, mm, addr + new_len,
+ old_len - new_len, &uf_unmap, true);
/* Returning 1 indicates mmap_lock is downgraded to read. */
- } else if (retval == 1)
+ if (retval == 1) {
downgraded = true;
+ } else if (retval < 0 && old_len != new_len) {
+ ret = retval;
+ goto out;
+ }
+
ret = addr;
goto out;
}
@@ -1008,6 +1014,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/* can we just expand the current mapping? */
if (vma_expandable(vma, new_len - old_len)) {
long pages = (new_len - old_len) >> PAGE_SHIFT;
+ unsigned long extension_start = addr + old_len;
+ unsigned long extension_end = addr + new_len;
+ pgoff_t extension_pgoff = vma->vm_pgoff + (old_len >> PAGE_SHIFT);
if (vma->vm_flags & VM_ACCOUNT) {
if (security_vm_enough_memory_mm(mm, pages)) {
@@ -1016,8 +1025,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}
}
- if (vma_adjust(vma, vma->vm_start, addr + new_len,
- vma->vm_pgoff, NULL)) {
+ /*
+ * Function vma_merge() is called on the extension we are adding to
+ * the already existing vma, vma_merge() will merge this extension with
+ * the already existing vma (expand operation itself) and possibly also
+ * with the next vma if it becomes adjacent to the expanded vma and
+ * otherwise compatible.
+ */
+ vma = vma_merge(mm, vma, extension_start, extension_end,
+ vma->vm_flags, vma->anon_vma, vma->vm_file,
+ extension_pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+ if (!vma) {
vm_unacct_memory(pages);
ret = -ENOMEM;
goto out;
diff --git a/mm/msync.c b/mm/msync.c
index 137d1c104f3e..ac4c9bfea2e7 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -104,7 +104,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
error = 0;
goto out_unlock;
}
- vma = vma->vm_next;
+ vma = find_vma(mm, vma->vm_end);
}
}
out_unlock:
diff --git a/mm/nommu.c b/mm/nommu.c
index 9d7afc2d959e..7b3fad611895 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -19,7 +19,6 @@
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
-#include <linux/vmacache.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/file.h>
@@ -545,26 +544,27 @@ static void put_nommu_region(struct vm_region *region)
__put_nommu_region(region);
}
-/*
- * add a VMA into a process's mm_struct in the appropriate place in the list
- * and tree and add to the address space's page tree also if not an anonymous
- * page
- * - should be called with mm->mmap_lock held writelocked
- */
-static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
+void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas)
{
- struct vm_area_struct *pvma, *prev;
- struct address_space *mapping;
- struct rb_node **p, *parent, *rb_prev;
+ mas_set_range(mas, vma->vm_start, vma->vm_end - 1);
+ mas_store_prealloc(mas, vma);
+}
- BUG_ON(!vma->vm_region);
+void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas)
+{
+ mas->index = vma->vm_start;
+ mas->last = vma->vm_end - 1;
+ mas_store_prealloc(mas, NULL);
+}
+static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm)
+{
mm->map_count++;
vma->vm_mm = mm;
/* add the VMA to the mapping */
if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
+ struct address_space *mapping = vma->vm_file->f_mapping;
i_mmap_lock_write(mapping);
flush_dcache_mmap_lock(mapping);
@@ -572,67 +572,51 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
flush_dcache_mmap_unlock(mapping);
i_mmap_unlock_write(mapping);
}
+}
- /* add the VMA to the tree */
- parent = rb_prev = NULL;
- p = &mm->mm_rb.rb_node;
- while (*p) {
- parent = *p;
- pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
-
- /* sort by: start addr, end addr, VMA struct addr in that order
- * (the latter is necessary as we may get identical VMAs) */
- if (vma->vm_start < pvma->vm_start)
- p = &(*p)->rb_left;
- else if (vma->vm_start > pvma->vm_start) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else if (vma->vm_end < pvma->vm_end)
- p = &(*p)->rb_left;
- else if (vma->vm_end > pvma->vm_end) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else if (vma < pvma)
- p = &(*p)->rb_left;
- else if (vma > pvma) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else
- BUG();
- }
-
- rb_link_node(&vma->vm_rb, parent, p);
- rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+/*
+ * mas_add_vma_to_mm() - Maple state variant of add_mas_to_mm().
+ * @mas: The maple state with preallocations.
+ * @mm: The mm_struct
+ * @vma: The vma to add
+ *
+ */
+static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ BUG_ON(!vma->vm_region);
- /* add VMA to the VMA list also */
- prev = NULL;
- if (rb_prev)
- prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
+ setup_vma_to_mm(vma, mm);
- __vma_link_list(mm, vma, prev);
+ /* add the VMA to the tree */
+ vma_mas_store(vma, mas);
}
/*
- * delete a VMA from its owning mm_struct and address space
+ * add a VMA into a process's mm_struct in the appropriate place in the list
+ * and tree and add to the address space's page tree also if not an anonymous
+ * page
+ * - should be called with mm->mmap_lock held writelocked
*/
-static void delete_vma_from_mm(struct vm_area_struct *vma)
-{
- int i;
- struct address_space *mapping;
- struct mm_struct *mm = vma->vm_mm;
- struct task_struct *curr = current;
-
- mm->map_count--;
- for (i = 0; i < VMACACHE_SIZE; i++) {
- /* if the vma is cached, invalidate the entire cache */
- if (curr->vmacache.vmas[i] == vma) {
- vmacache_invalidate(mm);
- break;
- }
+static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end);
+
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ return -ENOMEM;
}
+ mas_add_vma_to_mm(&mas, mm, vma);
+ return 0;
+}
+static void cleanup_vma_from_mm(struct vm_area_struct *vma)
+{
+ vma->vm_mm->map_count--;
/* remove the VMA from the mapping */
if (vma->vm_file) {
+ struct address_space *mapping;
mapping = vma->vm_file->f_mapping;
i_mmap_lock_write(mapping);
@@ -641,11 +625,24 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
flush_dcache_mmap_unlock(mapping);
i_mmap_unlock_write(mapping);
}
+}
+/*
+ * delete a VMA from its owning mm_struct and address space
+ */
+static int delete_vma_from_mm(struct vm_area_struct *vma)
+{
+ MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0);
- /* remove from the MM's tree and list */
- rb_erase(&vma->vm_rb, &mm->mm_rb);
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ return -ENOMEM;
+ }
+ cleanup_vma_from_mm(vma);
- __vma_unlink_list(mm, vma);
+ /* remove from the MM's tree and list */
+ vma_mas_remove(vma, &mas);
+ return 0;
}
/*
@@ -661,31 +658,26 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
vm_area_free(vma);
}
+struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+ unsigned long start_addr,
+ unsigned long end_addr)
+{
+ unsigned long index = start_addr;
+
+ mmap_assert_locked(mm);
+ return mt_find(&mm->mm_mt, &index, end_addr - 1);
+}
+EXPORT_SYMBOL(find_vma_intersection);
+
/*
* look up the first VMA in which addr resides, NULL if none
* - should be called with mm->mmap_lock at least held readlocked
*/
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma;
-
- /* check the cache first */
- vma = vmacache_find(mm, addr);
- if (likely(vma))
- return vma;
-
- /* trawl the list (there may be multiple mappings in which addr
- * resides) */
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->vm_start > addr)
- return NULL;
- if (vma->vm_end > addr) {
- vmacache_update(addr, vma);
- return vma;
- }
- }
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
- return NULL;
+ return mas_walk(&mas);
}
EXPORT_SYMBOL(find_vma);
@@ -717,26 +709,17 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
{
struct vm_area_struct *vma;
unsigned long end = addr + len;
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
- /* check the cache first */
- vma = vmacache_find_exact(mm, addr, end);
- if (vma)
- return vma;
-
- /* trawl the list (there may be multiple mappings in which addr
- * resides) */
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->vm_start < addr)
- continue;
- if (vma->vm_start > addr)
- return NULL;
- if (vma->vm_end == end) {
- vmacache_update(addr, vma);
- return vma;
- }
- }
+ vma = mas_walk(&mas);
+ if (!vma)
+ return NULL;
+ if (vma->vm_start != addr)
+ return NULL;
+ if (vma->vm_end != end)
+ return NULL;
- return NULL;
+ return vma;
}
/*
@@ -1069,6 +1052,7 @@ unsigned long do_mmap(struct file *file,
vm_flags_t vm_flags;
unsigned long capabilities, result;
int ret;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
*populate = 0;
@@ -1087,6 +1071,7 @@ unsigned long do_mmap(struct file *file,
* now know into VMA flags */
vm_flags = determine_vm_flags(file, prot, flags, capabilities);
+
/* we're going to need to record the mapping */
region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
if (!region)
@@ -1096,6 +1081,9 @@ unsigned long do_mmap(struct file *file,
if (!vma)
goto error_getting_vma;
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ goto error_maple_preallocate;
+
region->vm_usage = 1;
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;
@@ -1236,7 +1224,7 @@ unsigned long do_mmap(struct file *file,
current->mm->total_vm += len >> PAGE_SHIFT;
share:
- add_vma_to_mm(current->mm, vma);
+ mas_add_vma_to_mm(&mas, current->mm, vma);
/* we flush the region from the icache only when the first executable
* mapping of it is made */
@@ -1262,6 +1250,7 @@ error:
sharing_violation:
up_write(&nommu_region_sem);
+ mas_destroy(&mas);
pr_warn("Attempt to share mismatched mappings\n");
ret = -EINVAL;
goto error;
@@ -1278,6 +1267,14 @@ error_getting_region:
len, current->pid);
show_free_areas(0, NULL);
return -ENOMEM;
+
+error_maple_preallocate:
+ kmem_cache_free(vm_region_jar, region);
+ vm_area_free(vma);
+ pr_warn("Allocation of vma tree for process %d failed\n", current->pid);
+ show_free_areas(0, NULL);
+ return -ENOMEM;
+
}
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
@@ -1343,6 +1340,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *new;
struct vm_region *region;
unsigned long npages;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end);
/* we're only permitted to split anonymous regions (these should have
* only a single usage on the region) */
@@ -1378,7 +1376,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
- delete_vma_from_mm(vma);
down_write(&nommu_region_sem);
delete_nommu_region(vma->vm_region);
if (new_below) {
@@ -1391,8 +1388,17 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
add_nommu_region(vma->vm_region);
add_nommu_region(new->vm_region);
up_write(&nommu_region_sem);
- add_vma_to_mm(mm, vma);
- add_vma_to_mm(mm, new);
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ return -ENOMEM;
+ }
+
+ setup_vma_to_mm(vma, mm);
+ setup_vma_to_mm(new, mm);
+ mas_set_range(&mas, vma->vm_start, vma->vm_end - 1);
+ mas_store(&mas, vma);
+ vma_mas_store(new, &mas);
return 0;
}
@@ -1408,12 +1414,14 @@ static int shrink_vma(struct mm_struct *mm,
/* adjust the VMA's pointers, which may reposition it in the MM's tree
* and list */
- delete_vma_from_mm(vma);
+ if (delete_vma_from_mm(vma))
+ return -ENOMEM;
if (from > vma->vm_start)
vma->vm_end = from;
else
vma->vm_start = to;
- add_vma_to_mm(mm, vma);
+ if (add_vma_to_mm(mm, vma))
+ return -ENOMEM;
/* cut the backing region down to size */
region = vma->vm_region;
@@ -1441,9 +1449,10 @@ static int shrink_vma(struct mm_struct *mm,
*/
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf)
{
+ MA_STATE(mas, &mm->mm_mt, start, start);
struct vm_area_struct *vma;
unsigned long end;
- int ret;
+ int ret = 0;
len = PAGE_ALIGN(len);
if (len == 0)
@@ -1452,7 +1461,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
end = start + len;
/* find the first potentially overlapping VMA */
- vma = find_vma(mm, start);
+ vma = mas_find(&mas, end - 1);
if (!vma) {
static int limit;
if (limit < 5) {
@@ -1471,7 +1480,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
return -EINVAL;
if (end == vma->vm_end)
goto erase_whole_vma;
- vma = vma->vm_next;
+ vma = mas_next(&mas, end - 1);
} while (vma);
return -EINVAL;
} else {
@@ -1493,9 +1502,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
}
erase_whole_vma:
- delete_vma_from_mm(vma);
+ if (delete_vma_from_mm(vma))
+ ret = -ENOMEM;
delete_vma(mm, vma);
- return 0;
+ return ret;
}
int vm_munmap(unsigned long addr, size_t len)
@@ -1520,6 +1530,7 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
*/
void exit_mmap(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
if (!mm)
@@ -1527,12 +1538,18 @@ void exit_mmap(struct mm_struct *mm)
mm->total_vm = 0;
- while ((vma = mm->mmap)) {
- mm->mmap = vma->vm_next;
- delete_vma_from_mm(vma);
+ /*
+ * Lock the mm to avoid assert complaining even though this is the only
+ * user of the mm
+ */
+ mmap_write_lock(mm);
+ for_each_vma(vmi, vma) {
+ cleanup_vma_from_mm(vma);
delete_vma(mm, vma);
cond_resched();
}
+ __mt_destroy(&mm->mm_mt);
+ mmap_write_unlock(mm);
}
int vm_brk(unsigned long addr, unsigned long len)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3c6cf9e3cd66..35ec75cdfee2 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -509,10 +509,11 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
-bool __oom_reap_task_mm(struct mm_struct *mm)
+static bool __oom_reap_task_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma;
bool ret = true;
+ VMA_ITERATOR(vmi, mm, 0);
/*
* Tell all users of get_user/copy_from_user etc... that the content
@@ -522,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
*/
set_bit(MMF_UNSTABLE, &mm->flags);
- for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
continue;
@@ -764,10 +765,8 @@ static void mark_oom_victim(struct task_struct *tsk)
return;
/* oom_mm is bound to the signal struct life time. */
- if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
+ if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
mmgrab(tsk->signal->oom_mm);
- set_bit(MMF_OOM_VICTIM, &mm->flags);
- }
/*
* Make sure that the task is woken up from uninterruptible sleep
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5abeb349c391..2eb6ad5a650a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -126,13 +126,97 @@ typedef int __bitwise fpi_t;
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
-struct pagesets {
- local_lock_t lock;
-};
-static DEFINE_PER_CPU(struct pagesets, pagesets) = {
- .lock = INIT_LOCAL_LOCK(lock),
-};
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * On SMP, spin_trylock is sufficient protection.
+ * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ */
+#define pcp_trylock_prepare(flags) do { } while (0)
+#define pcp_trylock_finish(flag) do { } while (0)
+#else
+/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
+#define pcp_trylock_prepare(flags) local_irq_save(flags)
+#define pcp_trylock_finish(flags) local_irq_restore(flags)
+#endif
+
+/*
+ * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
+ * a migration causing the wrong PCP to be locked and remote memory being
+ * potentially allocated, pin the task to the CPU for the lookup+lock.
+ * preempt_disable is used on !RT because it is faster than migrate_disable.
+ * migrate_disable is used on RT because otherwise RT spinlock usage is
+ * interfered with and a high priority task cannot preempt the allocator.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define pcpu_task_pin() preempt_disable()
+#define pcpu_task_unpin() preempt_enable()
+#else
+#define pcpu_task_pin() migrate_disable()
+#define pcpu_task_unpin() migrate_enable()
+#endif
+
+/*
+ * Generic helper to lookup and a per-cpu variable with an embedded spinlock.
+ * Return value should be used with equivalent unlock helper.
+ */
+#define pcpu_spin_lock(type, member, ptr) \
+({ \
+ type *_ret; \
+ pcpu_task_pin(); \
+ _ret = this_cpu_ptr(ptr); \
+ spin_lock(&_ret->member); \
+ _ret; \
+})
+
+#define pcpu_spin_lock_irqsave(type, member, ptr, flags) \
+({ \
+ type *_ret; \
+ pcpu_task_pin(); \
+ _ret = this_cpu_ptr(ptr); \
+ spin_lock_irqsave(&_ret->member, flags); \
+ _ret; \
+})
+
+#define pcpu_spin_trylock_irqsave(type, member, ptr, flags) \
+({ \
+ type *_ret; \
+ pcpu_task_pin(); \
+ _ret = this_cpu_ptr(ptr); \
+ if (!spin_trylock_irqsave(&_ret->member, flags)) { \
+ pcpu_task_unpin(); \
+ _ret = NULL; \
+ } \
+ _ret; \
+})
+
+#define pcpu_spin_unlock(member, ptr) \
+({ \
+ spin_unlock(&ptr->member); \
+ pcpu_task_unpin(); \
+})
+
+#define pcpu_spin_unlock_irqrestore(member, ptr, flags) \
+({ \
+ spin_unlock_irqrestore(&ptr->member, flags); \
+ pcpu_task_unpin(); \
+})
+
+/* struct per_cpu_pages specific helpers. */
+#define pcp_spin_lock(ptr) \
+ pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
+
+#define pcp_spin_lock_irqsave(ptr, flags) \
+ pcpu_spin_lock_irqsave(struct per_cpu_pages, lock, ptr, flags)
+
+#define pcp_spin_trylock_irqsave(ptr, flags) \
+ pcpu_spin_trylock_irqsave(struct per_cpu_pages, lock, ptr, flags)
+
+#define pcp_spin_unlock(ptr) \
+ pcpu_spin_unlock(lock, ptr)
+
+#define pcp_spin_unlock_irqrestore(ptr, flags) \
+ pcpu_spin_unlock_irqrestore(lock, ptr, flags)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -151,13 +235,7 @@ DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif
-/* work_structs for global per-cpu drains */
-struct pcpu_drain {
- struct zone *zone;
- struct work_struct work;
-};
static DEFINE_MUTEX(pcpu_drain_mutex);
-static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
@@ -653,7 +731,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (order > PAGE_ALLOC_COSTLY_ORDER) {
VM_BUG_ON(order != pageblock_order);
- base = PAGE_ALLOC_COSTLY_ORDER + 1;
+ return NR_LOWORDER_PCP_LISTS;
}
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -667,7 +745,7 @@ static inline int pindex_to_order(unsigned int pindex)
int order = pindex / MIGRATE_PCPTYPES;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (order > PAGE_ALLOC_COSTLY_ORDER)
+ if (pindex == NR_LOWORDER_PCP_LISTS)
order = pageblock_order;
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -744,6 +822,14 @@ void prep_compound_page(struct page *page, unsigned int order)
prep_compound_head(page, order);
}
+void destroy_large_folio(struct folio *folio)
+{
+ enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor;
+
+ VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
+ compound_page_dtors[dtor](&folio->page);
+}
+
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
@@ -785,7 +871,7 @@ static inline bool set_page_guard(struct zone *zone, struct page *page,
return false;
__SetPageGuard(page);
- INIT_LIST_HEAD(&page->lru);
+ INIT_LIST_HEAD(&page->buddy_list);
set_page_private(page, order);
/* Guard pages are not available for any usage */
__mod_zone_freepage_state(zone, -(1 << order), migratetype);
@@ -928,7 +1014,7 @@ static inline void add_to_free_list(struct page *page, struct zone *zone,
{
struct free_area *area = &zone->free_area[order];
- list_add(&page->lru, &area->free_list[migratetype]);
+ list_add(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
}
@@ -938,7 +1024,7 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
{
struct free_area *area = &zone->free_area[order];
- list_add_tail(&page->lru, &area->free_list[migratetype]);
+ list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
}
@@ -952,7 +1038,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
{
struct free_area *area = &zone->free_area[order];
- list_move_tail(&page->lru, &area->free_list[migratetype]);
+ list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -962,7 +1048,7 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
if (page_reported(page))
__ClearPageReported(page);
- list_del(&page->lru);
+ list_del(&page->buddy_list);
__ClearPageBuddy(page);
set_page_private(page, 0);
zone->free_area[order].nr_free--;
@@ -1296,18 +1382,14 @@ static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
PageSkipKASanPoison(page);
}
-static void kernel_init_free_pages(struct page *page, int numpages)
+static void kernel_init_pages(struct page *page, int numpages)
{
int i;
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
- for (i = 0; i < numpages; i++) {
- u8 tag = page_kasan_tag(page + i);
- page_kasan_tag_reset(page + i);
- clear_highpage(page + i);
- page_kasan_tag_set(page + i, tag);
- }
+ for (i = 0; i < numpages; i++)
+ clear_highpage_kasan_tagged(page + i);
kasan_enable_current();
}
@@ -1396,7 +1478,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
init = false;
}
if (init)
- kernel_init_free_pages(page, 1 << order);
+ kernel_init_pages(page, 1 << order);
/*
* arch_free_page() can make the page's contents inaccessible. s390
@@ -1473,10 +1555,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* Ensure requested pindex is drained first. */
pindex = pindex - 1;
- /*
- * local_lock_irq held so equivalent to spin_lock_irqsave for
- * both PREEMPT_RT and non-PREEMPT_RT configurations.
- */
+ /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
@@ -1504,11 +1583,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
do {
int mt;
- page = list_last_entry(list, struct page, lru);
+ page = list_last_entry(list, struct page, pcp_list);
mt = get_pcppage_migratetype(page);
/* must delete to avoid corrupting pcp list */
- list_del(&page->lru);
+ list_del(&page->pcp_list);
count -= nr_pages;
pcp->count -= nr_pages;
@@ -2441,7 +2520,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
}
/* If memory is still not initialized, do it now. */
if (init)
- kernel_init_free_pages(page, 1 << order);
+ kernel_init_pages(page, 1 << order);
/* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
SetPageSkipKASanPoison(page);
@@ -3044,10 +3123,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
{
int i, allocated = 0;
- /*
- * local_lock_irq held so equivalent to spin_lock_irqsave for
- * both PREEMPT_RT and non-PREEMPT_RT configurations.
- */
+ /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
@@ -3068,7 +3144,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* for IO devices that can merge IO requests if the physical
* pages are ordered properly.
*/
- list_add_tail(&page->lru, list);
+ list_add_tail(&page->pcp_list, list);
allocated++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
@@ -3091,51 +3167,48 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* Called from the vmstat counter updater to drain pagesets of this
* currently executing processor on remote nodes after they have
* expired.
- *
- * Note that this function must be called with the thread pinned to
- * a single processor.
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
- unsigned long flags;
int to_drain, batch;
- local_lock_irqsave(&pagesets.lock, flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
- if (to_drain > 0)
+ if (to_drain > 0) {
+ unsigned long flags;
+
+ /*
+ * free_pcppages_bulk expects IRQs disabled for zone->lock
+ * so even though pcp->lock is not intended to be IRQ-safe,
+ * it's needed in this context.
+ */
+ spin_lock_irqsave(&pcp->lock, flags);
free_pcppages_bulk(zone, to_drain, pcp, 0);
- local_unlock_irqrestore(&pagesets.lock, flags);
+ spin_unlock_irqrestore(&pcp->lock, flags);
+ }
}
#endif
/*
* Drain pcplists of the indicated processor and zone.
- *
- * The processor must either be the current processor and the
- * thread pinned to the current processor or a processor that
- * is not online.
*/
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
- unsigned long flags;
struct per_cpu_pages *pcp;
- local_lock_irqsave(&pagesets.lock, flags);
-
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- if (pcp->count)
- free_pcppages_bulk(zone, pcp->count, pcp, 0);
+ if (pcp->count) {
+ unsigned long flags;
- local_unlock_irqrestore(&pagesets.lock, flags);
+ /* See drain_zone_pages on why this is disabling IRQs */
+ spin_lock_irqsave(&pcp->lock, flags);
+ free_pcppages_bulk(zone, pcp->count, pcp, 0);
+ spin_unlock_irqrestore(&pcp->lock, flags);
+ }
}
/*
* Drain pcplists of all zones on the indicated processor.
- *
- * The processor must either be the current processor and the
- * thread pinned to the current processor or a processor that
- * is not online.
*/
static void drain_pages(unsigned int cpu)
{
@@ -3148,9 +3221,6 @@ static void drain_pages(unsigned int cpu)
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
- *
- * The CPU has to be pinned. When zone parameter is non-NULL, spill just
- * the single zone's pages.
*/
void drain_local_pages(struct zone *zone)
{
@@ -3162,24 +3232,6 @@ void drain_local_pages(struct zone *zone)
drain_pages(cpu);
}
-static void drain_local_pages_wq(struct work_struct *work)
-{
- struct pcpu_drain *drain;
-
- drain = container_of(work, struct pcpu_drain, work);
-
- /*
- * drain_all_pages doesn't use proper cpu hotplug protection so
- * we can race with cpu offline when the WQ can move this from
- * a cpu pinned worker to an unbound one. We can operate on a different
- * cpu which is alright but we also have to make sure to not move to
- * a different one.
- */
- migrate_disable();
- drain_local_pages(drain->zone);
- migrate_enable();
-}
-
/*
* The implementation of drain_all_pages(), exposing an extra parameter to
* drain on all cpus.
@@ -3201,13 +3253,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
static cpumask_t cpus_with_pcps;
/*
- * Make sure nobody triggers this path before mm_percpu_wq is fully
- * initialized.
- */
- if (WARN_ON_ONCE(!mm_percpu_wq))
- return;
-
- /*
* Do not drain if one is already in progress unless it's specific to
* a zone. Such callers are primarily CMA and memory hotplug and need
* the drain to be complete when the call returns.
@@ -3256,14 +3301,11 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
}
for_each_cpu(cpu, &cpus_with_pcps) {
- struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
-
- drain->zone = zone;
- INIT_WORK(&drain->work, drain_local_pages_wq);
- queue_work_on(cpu, mm_percpu_wq, &drain->work);
+ if (zone)
+ drain_pages_zone(cpu, zone);
+ else
+ drain_pages(cpu);
}
- for_each_cpu(cpu, &cpus_with_pcps)
- flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
mutex_unlock(&pcpu_drain_mutex);
}
@@ -3272,8 +3314,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
* Spill all the per-cpu pages from all CPUs back into the buddy allocator.
*
* When zone parameter is non-NULL, spill just the single zone's pages.
- *
- * Note that this can be extremely slow as the draining happens in a workqueue.
*/
void drain_all_pages(struct zone *zone)
{
@@ -3318,7 +3358,7 @@ void mark_free_pages(struct zone *zone)
for_each_migratetype_order(order, t) {
list_for_each_entry(page,
- &zone->free_area[order].free_list[t], lru) {
+ &zone->free_area[order].free_list[t], buddy_list) {
unsigned long i;
pfn = page_to_pfn(page);
@@ -3395,19 +3435,17 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
return min(READ_ONCE(pcp->batch) << 2, high);
}
-static void free_unref_page_commit(struct page *page, int migratetype,
+static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
+ struct page *page, int migratetype,
unsigned int order)
{
- struct zone *zone = page_zone(page);
- struct per_cpu_pages *pcp;
int high;
int pindex;
bool free_high;
__count_vm_event(PGFREE);
- pcp = this_cpu_ptr(zone->per_cpu_pageset);
pindex = order_to_pindex(migratetype, order);
- list_add(&page->lru, &pcp->lists[pindex]);
+ list_add(&page->pcp_list, &pcp->lists[pindex]);
pcp->count += 1 << order;
/*
@@ -3432,6 +3470,9 @@ static void free_unref_page_commit(struct page *page, int migratetype,
void free_unref_page(struct page *page, unsigned int order)
{
unsigned long flags;
+ unsigned long __maybe_unused UP_flags;
+ struct per_cpu_pages *pcp;
+ struct zone *zone;
unsigned long pfn = page_to_pfn(page);
int migratetype;
@@ -3454,9 +3495,16 @@ void free_unref_page(struct page *page, unsigned int order)
migratetype = MIGRATE_MOVABLE;
}
- local_lock_irqsave(&pagesets.lock, flags);
- free_unref_page_commit(page, migratetype, order);
- local_unlock_irqrestore(&pagesets.lock, flags);
+ zone = page_zone(page);
+ pcp_trylock_prepare(UP_flags);
+ pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+ if (pcp) {
+ free_unref_page_commit(zone, pcp, page, migratetype, order);
+ pcp_spin_unlock_irqrestore(pcp, flags);
+ } else {
+ free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
+ }
+ pcp_trylock_finish(UP_flags);
}
/*
@@ -3465,6 +3513,8 @@ void free_unref_page(struct page *page, unsigned int order)
void free_unref_page_list(struct list_head *list)
{
struct page *page, *next;
+ struct per_cpu_pages *pcp = NULL;
+ struct zone *locked_zone = NULL;
unsigned long flags;
int batch_count = 0;
int migratetype;
@@ -3489,8 +3539,18 @@ void free_unref_page_list(struct list_head *list)
}
}
- local_lock_irqsave(&pagesets.lock, flags);
list_for_each_entry_safe(page, next, list, lru) {
+ struct zone *zone = page_zone(page);
+
+ /* Different zone, different pcp lock. */
+ if (zone != locked_zone) {
+ if (pcp)
+ pcp_spin_unlock_irqrestore(pcp, flags);
+
+ locked_zone = zone;
+ pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
+ }
+
/*
* Non-isolated types over MIGRATE_PCPTYPES get added
* to the MIGRATE_MOVABLE pcp list.
@@ -3500,19 +3560,21 @@ void free_unref_page_list(struct list_head *list)
migratetype = MIGRATE_MOVABLE;
trace_mm_page_free_batched(page);
- free_unref_page_commit(page, migratetype, 0);
+ free_unref_page_commit(zone, pcp, page, migratetype, 0);
/*
* Guard against excessive IRQ disabled times when we get
* a large list of pages to free.
*/
if (++batch_count == SWAP_CLUSTER_MAX) {
- local_unlock_irqrestore(&pagesets.lock, flags);
+ pcp_spin_unlock_irqrestore(pcp, flags);
batch_count = 0;
- local_lock_irqsave(&pagesets.lock, flags);
+ pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
}
}
- local_unlock_irqrestore(&pagesets.lock, flags);
+
+ if (pcp)
+ pcp_spin_unlock_irqrestore(pcp, flags);
}
/*
@@ -3637,6 +3699,43 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
#endif
}
+static __always_inline
+struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
+ unsigned int order, unsigned int alloc_flags,
+ int migratetype)
+{
+ struct page *page;
+ unsigned long flags;
+
+ do {
+ page = NULL;
+ spin_lock_irqsave(&zone->lock, flags);
+ /*
+ * order-0 request can reach here when the pcplist is skipped
+ * due to non-CMA allocation context. HIGHATOMIC area is
+ * reserved for high-order atomic allocation, so order-0
+ * request should skip it.
+ */
+ if (order > 0 && alloc_flags & ALLOC_HARDER)
+ page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+ if (!page) {
+ page = __rmqueue(zone, order, migratetype, alloc_flags);
+ if (!page) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return NULL;
+ }
+ }
+ __mod_zone_freepage_state(zone, -(1 << order),
+ get_pcppage_migratetype(page));
+ spin_unlock_irqrestore(&zone->lock, flags);
+ } while (check_new_pages(page, order));
+
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ zone_statistics(preferred_zone, zone, 1);
+
+ return page;
+}
+
/* Remove page from the per-cpu list, caller must protect the list */
static inline
struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
@@ -3670,8 +3769,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
return NULL;
}
- page = list_first_entry(list, struct page, lru);
- list_del(&page->lru);
+ page = list_first_entry(list, struct page, pcp_list);
+ list_del(&page->pcp_list);
pcp->count -= 1 << order;
} while (check_new_pcp(page, order));
@@ -3688,19 +3787,29 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct list_head *list;
struct page *page;
unsigned long flags;
+ unsigned long __maybe_unused UP_flags;
- local_lock_irqsave(&pagesets.lock, flags);
+ /*
+ * spin_trylock may fail due to a parallel drain. In the future, the
+ * trylock will also protect against IRQ reentrancy.
+ */
+ pcp_trylock_prepare(UP_flags);
+ pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+ if (!pcp) {
+ pcp_trylock_finish(UP_flags);
+ return NULL;
+ }
/*
* On allocation, reduce the number of pages that are batch freed.
* See nr_pcp_free() where free_factor is increased for subsequent
* frees.
*/
- pcp = this_cpu_ptr(zone->per_cpu_pageset);
pcp->free_factor >>= 1;
list = &pcp->lists[order_to_pindex(migratetype, order)];
page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
- local_unlock_irqrestore(&pagesets.lock, flags);
+ pcp_spin_unlock_irqrestore(pcp, flags);
+ pcp_trylock_finish(UP_flags);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone, 1);
@@ -3717,9 +3826,14 @@ struct page *rmqueue(struct zone *preferred_zone,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
- unsigned long flags;
struct page *page;
+ /*
+ * We most definitely don't want callers attempting to
+ * allocate greater than order-1 page units with __GFP_NOFAIL.
+ */
+ WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+
if (likely(pcp_allowed_order(order))) {
/*
* MIGRATE_MOVABLE pcplist could have the pages on CMA area and
@@ -3729,53 +3843,23 @@ struct page *rmqueue(struct zone *preferred_zone,
migratetype != MIGRATE_MOVABLE) {
page = rmqueue_pcplist(preferred_zone, zone, order,
gfp_flags, migratetype, alloc_flags);
- goto out;
+ if (likely(page))
+ goto out;
}
}
- /*
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with __GFP_NOFAIL.
- */
- WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-
- do {
- page = NULL;
- spin_lock_irqsave(&zone->lock, flags);
- /*
- * order-0 request can reach here when the pcplist is skipped
- * due to non-CMA allocation context. HIGHATOMIC area is
- * reserved for high-order atomic allocation, so order-0
- * request should skip it.
- */
- if (order > 0 && alloc_flags & ALLOC_HARDER)
- page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
- if (!page) {
- page = __rmqueue(zone, order, migratetype, alloc_flags);
- if (!page)
- goto failed;
- }
- __mod_zone_freepage_state(zone, -(1 << order),
- get_pcppage_migratetype(page));
- spin_unlock_irqrestore(&zone->lock, flags);
- } while (check_new_pages(page, order));
-
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
- zone_statistics(preferred_zone, zone, 1);
+ page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
+ migratetype);
out:
/* Separate test+clear to avoid unnecessary atomics */
- if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
+ if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
wakeup_kswapd(zone, 0, 0, zone_idx(zone));
}
VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
-
-failed:
- spin_unlock_irqrestore(&zone->lock, flags);
- return NULL;
}
#ifdef CONFIG_FAIL_PAGE_ALLOC
@@ -3980,12 +4064,12 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
free_pages))
return true;
/*
- * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
+ * Ignore watermark boosting for GFP_HIGH order-0 allocations
* when checking the min watermark. The min watermark is the
* point where boosting is ignored so that kswapd is woken up
* when below the low watermark.
*/
- if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
+ if (unlikely(!order && (alloc_flags & ALLOC_HARDER) && z->watermark_boost
&& ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
mark = z->_watermark[WMARK_MIN];
return __zone_watermark_ok(z, order, mark, highest_zoneidx,
@@ -4720,12 +4804,12 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
+ * set both ALLOC_HARDER (unless __GFP_NOMEMALLOC) and ALLOC_HIGH.
*/
alloc_flags |= (__force int)
(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
- if (gfp_mask & __GFP_ATOMIC) {
+ if (gfp_mask & __GFP_HIGH) {
/*
* Not worth trying to allocate harder for __GFP_NOMEMALLOC even
* if it can't schedule.
@@ -4918,14 +5002,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned int cpuset_mems_cookie;
int reserve_flags;
- /*
- * We also sanity check to catch abuse of atomic reserves being used by
- * callers that are not in atomic context.
- */
- if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
- (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
- gfp_mask &= ~__GFP_ATOMIC;
-
retry_cpuset:
compaction_retries = 0;
no_progress_loops = 0;
@@ -5245,6 +5321,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
{
struct page *page;
unsigned long flags;
+ unsigned long __maybe_unused UP_flags;
struct zone *zone;
struct zoneref *z;
struct per_cpu_pages *pcp;
@@ -5325,11 +5402,14 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
if (unlikely(!zone))
goto failed;
+ /* Is a parallel drain in progress? */
+ pcp_trylock_prepare(UP_flags);
+ pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+ if (!pcp)
+ goto failed_irq;
+
/* Attempt the batch allocation */
- local_lock_irqsave(&pagesets.lock, flags);
- pcp = this_cpu_ptr(zone->per_cpu_pageset);
pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
-
while (nr_populated < nr_pages) {
/* Skip existing pages */
@@ -5342,8 +5422,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
pcp, pcp_list);
if (unlikely(!page)) {
/* Try and allocate at least one page */
- if (!nr_account)
+ if (!nr_account) {
+ pcp_spin_unlock_irqrestore(pcp, flags);
goto failed_irq;
+ }
break;
}
nr_account++;
@@ -5356,7 +5438,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
nr_populated++;
}
- local_unlock_irqrestore(&pagesets.lock, flags);
+ pcp_spin_unlock_irqrestore(pcp, flags);
+ pcp_trylock_finish(UP_flags);
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
@@ -5365,7 +5448,7 @@ out:
return nr_populated;
failed_irq:
- local_unlock_irqrestore(&pagesets.lock, flags);
+ pcp_trylock_finish(UP_flags);
failed:
page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
@@ -5796,14 +5879,14 @@ long si_mem_available(void)
/*
* Estimate the amount of memory available for userspace allocations,
- * without causing swapping.
+ * without causing swapping or OOM.
*/
available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
/*
* Not all the page cache can be freed, otherwise the system will
- * start swapping. Assume at least half of the page cache, or the
- * low watermark worth of cache, needs to stay.
+ * start swapping or thrashing. Assume at least half of the page
+ * cache, or the low watermark worth of cache, needs to stay.
*/
pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
pagecache -= min(pagecache / 2, wmark_low);
@@ -6690,13 +6773,18 @@ static void __ref memmap_init_compound(struct page *head,
set_page_count(page, 0);
/*
- * The first tail page stores compound_mapcount_ptr() and
- * compound_order() and the second tail page stores
- * compound_pincount_ptr(). Call prep_compound_head() after
- * the first and second tail pages have been initialized to
- * not have the data overwritten.
+ * The first tail page stores compound_mapcount_ptr(),
+ * compound_order() and compound_pincount_ptr(). Call
+ * prep_compound_head() after the first tail page have
+ * been initialized to not have the data overwritten.
+ *
+ * Note the idea to make this right after we initialize
+ * the offending tail pages is trying to take advantage
+ * of the likelihood of those tail struct pages being
+ * cached given that we will read them right after in
+ * prep_compound_head().
*/
- if (pfn == head_pfn + 2)
+ if (unlikely(pfn == head_pfn + 1))
prep_compound_head(head, order);
}
}
@@ -7005,6 +7093,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
memset(pcp, 0, sizeof(*pcp));
memset(pzstats, 0, sizeof(*pzstats));
+ spin_lock_init(&pcp->lock);
for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
INIT_LIST_HEAD(&pcp->lists[pindex]);
diff --git a/mm/page_io.c b/mm/page_io.c
index 68318134dc92..f75ebbc95ee6 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -222,13 +222,14 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
struct cgroup_subsys_state *css;
struct mem_cgroup *memcg;
+ rcu_read_lock();
memcg = page_memcg(page);
if (!memcg)
- return;
+ goto out;
- rcu_read_lock();
css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
bio_associate_blkg_from_css(bio, css);
+out:
rcu_read_unlock();
}
#else
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index c10f839fc410..e971a467fcdf 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -243,7 +243,7 @@ restart:
* cleared *pmd but not decremented compound_mapcount().
*/
if ((pvmw->flags & PVMW_SYNC) &&
- transparent_hugepage_active(vma) &&
+ transhuge_vma_suitable(vma, pvmw->address) &&
(pvmw->nr_pages >= HPAGE_PMD_NR)) {
spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 9b3db11a4d1d..53e5c145fcce 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -456,7 +456,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
} else { /* inside vma */
walk.vma = vma;
next = min(end, vma->vm_end);
- vma = vma->vm_next;
+ vma = find_vma(mm, vma->vm_end);
err = walk_page_test(start, next, &walk);
if (err > 0) {
diff --git a/mm/rmap.c b/mm/rmap.c
index f22d7578ac02..edc06c52bc82 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1537,6 +1537,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
PageAnonExclusive(subpage);
if (folio_test_hugetlb(folio)) {
+ bool anon = folio_test_anon(folio);
+
/*
* The try_to_unmap() is only passed a hugetlb page
* in the case where the hugetlb page is poisoned.
@@ -1551,31 +1553,28 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
flush_cache_range(vma, range.start, range.end);
- if (!folio_test_anon(folio)) {
+ /*
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
+ */
+ VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
+ if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+ flush_tlb_range(vma, range.start, range.end);
+ mmu_notifier_invalidate_range(mm, range.start,
+ range.end);
+
/*
- * To call huge_pmd_unshare, i_mmap_rwsem must be
- * held in write mode. Caller needs to explicitly
- * do this outside rmap routines.
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
*/
- VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
-
- if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
- flush_tlb_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range(mm, range.start,
- range.end);
-
- /*
- * The ref count of the PMD page was dropped
- * which is part of the way map counting
- * is done for shared PMDs. Return 'true'
- * here. When there is no other sharing,
- * huge_pmd_unshare returns false and we will
- * unmap the actual page and drop map count
- * to zero.
- */
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
+ page_vma_mapped_walk_done(&pvmw);
+ break;
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
} else {
@@ -1619,9 +1618,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (folio_test_hugetlb(folio)) {
hugetlb_count_sub(folio_nr_pages(folio), mm);
- set_huge_swap_pte_at(mm, address,
- pvmw.pte, pteval,
- vma_mmu_pagesize(vma));
+ set_huge_pte_at(mm, address, pvmw.pte, pteval);
} else {
dec_mm_counter(mm, mm_counter(&folio->page));
set_pte_at(mm, address, pvmw.pte, pteval);
@@ -1921,6 +1918,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
PageAnonExclusive(subpage);
if (folio_test_hugetlb(folio)) {
+ bool anon = folio_test_anon(folio);
+
/*
* huge_pmd_unshare may unmap an entire PMD page.
* There is no way of knowing exactly which PMDs may
@@ -1930,31 +1929,28 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
*/
flush_cache_range(vma, range.start, range.end);
- if (!folio_test_anon(folio)) {
+ /*
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
+ */
+ VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
+ if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+ flush_tlb_range(vma, range.start, range.end);
+ mmu_notifier_invalidate_range(mm, range.start,
+ range.end);
+
/*
- * To call huge_pmd_unshare, i_mmap_rwsem must be
- * held in write mode. Caller needs to explicitly
- * do this outside rmap routines.
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
*/
- VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
-
- if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
- flush_tlb_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range(mm, range.start,
- range.end);
-
- /*
- * The ref count of the PMD page was dropped
- * which is part of the way map counting
- * is done for shared PMDs. Return 'true'
- * here. When there is no other sharing,
- * huge_pmd_unshare returns false and we will
- * unmap the actual page and drop map count
- * to zero.
- */
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
+ page_vma_mapped_walk_done(&pvmw);
+ break;
}
/* Nuke the hugetlb page table entry */
@@ -1972,7 +1968,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
- if (folio_is_zone_device(folio)) {
+ if (folio_is_device_private(folio)) {
unsigned long pfn = folio_pfn(folio);
swp_entry_t entry;
pte_t swp_pte;
@@ -2013,9 +2009,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (folio_test_hugetlb(folio)) {
hugetlb_count_sub(folio_nr_pages(folio), mm);
- set_huge_swap_pte_at(mm, address,
- pvmw.pte, pteval,
- vma_mmu_pagesize(vma));
+ set_huge_pte_at(mm, address, pvmw.pte, pteval);
} else {
dec_mm_counter(mm, mm_counter(&folio->page));
set_pte_at(mm, address, pvmw.pte, pteval);
@@ -2083,8 +2077,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
if (folio_test_hugetlb(folio))
- set_huge_swap_pte_at(mm, address, pvmw.pte,
- swp_pte, vma_mmu_pagesize(vma));
+ set_huge_pte_at(mm, address, pvmw.pte, swp_pte);
else
set_pte_at(mm, address, pvmw.pte, swp_pte);
trace_set_migration_pte(address, pte_val(swp_pte),
@@ -2138,7 +2131,8 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
TTU_SYNC)))
return;
- if (folio_is_zone_device(folio) && !folio_is_device_private(folio))
+ if (folio_is_zone_device(folio) &&
+ (!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
return;
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 133c67057d41..7331ed1be014 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1690,7 +1690,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
return;
folio_wait_writeback(folio);
- delete_from_swap_cache(&folio->page);
+ delete_from_swap_cache(folio);
spin_lock_irq(&info->lock);
/*
* Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
@@ -1788,7 +1788,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
- delete_from_swap_cache(&folio->page);
+ delete_from_swap_cache(folio);
folio_mark_dirty(folio);
swap_free(swap);
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
new file mode 100644
index 000000000000..e5b40c43221d
--- /dev/null
+++ b/mm/shrinker_debug.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/shrinker.h>
+#include <linux/memcontrol.h>
+
+/* defined in vmscan.c */
+extern struct rw_semaphore shrinker_rwsem;
+extern struct list_head shrinker_list;
+
+static DEFINE_IDA(shrinker_debugfs_ida);
+static struct dentry *shrinker_debugfs_root;
+
+static unsigned long shrinker_count_objects(struct shrinker *shrinker,
+ struct mem_cgroup *memcg,
+ unsigned long *count_per_node)
+{
+ unsigned long nr, total = 0;
+ int nid;
+
+ for_each_node(nid) {
+ if (nid == 0 || (shrinker->flags & SHRINKER_NUMA_AWARE)) {
+ struct shrink_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .nid = nid,
+ .memcg = memcg,
+ };
+
+ nr = shrinker->count_objects(shrinker, &sc);
+ if (nr == SHRINK_EMPTY)
+ nr = 0;
+ } else {
+ nr = 0;
+ }
+
+ count_per_node[nid] = nr;
+ total += nr;
+ }
+
+ return total;
+}
+
+static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
+{
+ struct shrinker *shrinker = m->private;
+ unsigned long *count_per_node;
+ struct mem_cgroup *memcg;
+ unsigned long total;
+ bool memcg_aware;
+ int ret, nid;
+
+ count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
+ if (!count_per_node)
+ return -ENOMEM;
+
+ ret = down_read_killable(&shrinker_rwsem);
+ if (ret) {
+ kfree(count_per_node);
+ return ret;
+ }
+ rcu_read_lock();
+
+ memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ if (memcg && !mem_cgroup_online(memcg))
+ continue;
+
+ total = shrinker_count_objects(shrinker,
+ memcg_aware ? memcg : NULL,
+ count_per_node);
+ if (total) {
+ seq_printf(m, "%lu", mem_cgroup_ino(memcg));
+ for_each_node(nid)
+ seq_printf(m, " %lu", count_per_node[nid]);
+ seq_putc(m, '\n');
+ }
+
+ if (!memcg_aware) {
+ mem_cgroup_iter_break(NULL, memcg);
+ break;
+ }
+
+ if (signal_pending(current)) {
+ mem_cgroup_iter_break(NULL, memcg);
+ ret = -EINTR;
+ break;
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+
+ rcu_read_unlock();
+ up_read(&shrinker_rwsem);
+
+ kfree(count_per_node);
+ return ret;
+}
+DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count);
+
+static int shrinker_debugfs_scan_open(struct inode *inode, struct file *file)
+{
+ file->private_data = inode->i_private;
+ return nonseekable_open(inode, file);
+}
+
+static ssize_t shrinker_debugfs_scan_write(struct file *file,
+ const char __user *buf,
+ size_t size, loff_t *pos)
+{
+ struct shrinker *shrinker = file->private_data;
+ unsigned long nr_to_scan = 0, ino, read_len;
+ struct shrink_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ };
+ struct mem_cgroup *memcg = NULL;
+ int nid;
+ char kbuf[72];
+ ssize_t ret;
+
+ read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1);
+ if (copy_from_user(kbuf, buf, read_len))
+ return -EFAULT;
+ kbuf[read_len] = '\0';
+
+ if (sscanf(kbuf, "%lu %d %lu", &ino, &nid, &nr_to_scan) != 3)
+ return -EINVAL;
+
+ if (nid < 0 || nid >= nr_node_ids)
+ return -EINVAL;
+
+ if (nr_to_scan == 0)
+ return size;
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+ memcg = mem_cgroup_get_from_ino(ino);
+ if (!memcg || IS_ERR(memcg))
+ return -ENOENT;
+
+ if (!mem_cgroup_online(memcg)) {
+ mem_cgroup_put(memcg);
+ return -ENOENT;
+ }
+ } else if (ino != 0) {
+ return -EINVAL;
+ }
+
+ ret = down_read_killable(&shrinker_rwsem);
+ if (ret) {
+ mem_cgroup_put(memcg);
+ return ret;
+ }
+
+ sc.nid = nid;
+ sc.memcg = memcg;
+ sc.nr_to_scan = nr_to_scan;
+ sc.nr_scanned = nr_to_scan;
+
+ shrinker->scan_objects(shrinker, &sc);
+
+ up_read(&shrinker_rwsem);
+ mem_cgroup_put(memcg);
+
+ return size;
+}
+
+static const struct file_operations shrinker_debugfs_scan_fops = {
+ .owner = THIS_MODULE,
+ .open = shrinker_debugfs_scan_open,
+ .write = shrinker_debugfs_scan_write,
+};
+
+int shrinker_debugfs_add(struct shrinker *shrinker)
+{
+ struct dentry *entry;
+ char buf[128];
+ int id;
+
+ lockdep_assert_held(&shrinker_rwsem);
+
+ /* debugfs isn't initialized yet, add debugfs entries later. */
+ if (!shrinker_debugfs_root)
+ return 0;
+
+ id = ida_alloc(&shrinker_debugfs_ida, GFP_KERNEL);
+ if (id < 0)
+ return id;
+ shrinker->debugfs_id = id;
+
+ snprintf(buf, sizeof(buf), "%s-%d", shrinker->name, id);
+
+ /* create debugfs entry */
+ entry = debugfs_create_dir(buf, shrinker_debugfs_root);
+ if (IS_ERR(entry)) {
+ ida_free(&shrinker_debugfs_ida, id);
+ return PTR_ERR(entry);
+ }
+ shrinker->debugfs_entry = entry;
+
+ debugfs_create_file("count", 0220, entry, shrinker,
+ &shrinker_debugfs_count_fops);
+ debugfs_create_file("scan", 0440, entry, shrinker,
+ &shrinker_debugfs_scan_fops);
+ return 0;
+}
+
+int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
+{
+ struct dentry *entry;
+ char buf[128];
+ const char *new, *old;
+ va_list ap;
+ int ret = 0;
+
+ va_start(ap, fmt);
+ new = kvasprintf_const(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+
+ if (!new)
+ return -ENOMEM;
+
+ down_write(&shrinker_rwsem);
+
+ old = shrinker->name;
+ shrinker->name = new;
+
+ if (shrinker->debugfs_entry) {
+ snprintf(buf, sizeof(buf), "%s-%d", shrinker->name,
+ shrinker->debugfs_id);
+
+ entry = debugfs_rename(shrinker_debugfs_root,
+ shrinker->debugfs_entry,
+ shrinker_debugfs_root, buf);
+ if (IS_ERR(entry))
+ ret = PTR_ERR(entry);
+ else
+ shrinker->debugfs_entry = entry;
+ }
+
+ up_write(&shrinker_rwsem);
+
+ kfree_const(old);
+
+ return ret;
+}
+EXPORT_SYMBOL(shrinker_debugfs_rename);
+
+void shrinker_debugfs_remove(struct shrinker *shrinker)
+{
+ lockdep_assert_held(&shrinker_rwsem);
+
+ kfree_const(shrinker->name);
+
+ if (!shrinker->debugfs_entry)
+ return;
+
+ debugfs_remove_recursive(shrinker->debugfs_entry);
+ ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id);
+}
+
+static int __init shrinker_debugfs_init(void)
+{
+ struct shrinker *shrinker;
+ struct dentry *dentry;
+ int ret = 0;
+
+ dentry = debugfs_create_dir("shrinker", NULL);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ shrinker_debugfs_root = dentry;
+
+ /* Create debugfs entries for shrinkers registered at boot */
+ down_write(&shrinker_rwsem);
+ list_for_each_entry(shrinker, &shrinker_list, list)
+ if (!shrinker->debugfs_entry) {
+ ret = shrinker_debugfs_add(shrinker);
+ if (ret)
+ break;
+ }
+ up_write(&shrinker_rwsem);
+
+ return ret;
+}
+late_initcall(shrinker_debugfs_init);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index d8bb32fd58fc..5f0ed4717ed0 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -556,7 +556,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
} else {
/*
* When a PTE/PMD entry is freed from the init_mm
- * there's a a free_pages() call to this page allocated
+ * there's a free_pages() call to this page allocated
* above. Thus this get_page() is paired with the
* put_page_testzero() on the freeing path.
* This can only called by certain ZONE_DEVICE path,
diff --git a/mm/sparse.c b/mm/sparse.c
index cb3bfae64036..e5a8a3a0edd7 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -281,7 +281,7 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p
{
unsigned long coded_mem_map =
(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
- BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
+ BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT);
BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
return coded_mem_map;
}
diff --git a/mm/swap.c b/mm/swap.c
index 275a4ea1bc66..ae256e333713 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -46,30 +46,30 @@
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
-/* Protecting only lru_rotate.pvec which requires disabling interrupts */
+/* Protecting only lru_rotate.fbatch which requires disabling interrupts */
struct lru_rotate {
local_lock_t lock;
- struct pagevec pvec;
+ struct folio_batch fbatch;
};
static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
.lock = INIT_LOCAL_LOCK(lock),
};
/*
- * The following struct pagevec are grouped together because they are protected
+ * The following folio batches are grouped together because they are protected
* by disabling preemption (and interrupts remain enabled).
*/
-struct lru_pvecs {
+struct cpu_fbatches {
local_lock_t lock;
- struct pagevec lru_add;
- struct pagevec lru_deactivate_file;
- struct pagevec lru_deactivate;
- struct pagevec lru_lazyfree;
+ struct folio_batch lru_add;
+ struct folio_batch lru_deactivate_file;
+ struct folio_batch lru_deactivate;
+ struct folio_batch lru_lazyfree;
#ifdef CONFIG_SMP
- struct pagevec activate_page;
+ struct folio_batch activate;
#endif
};
-static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
+static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
.lock = INIT_LOCAL_LOCK(lock),
};
@@ -77,36 +77,35 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
* This path almost never happens for VM activity - pages are normally freed
* via pagevecs. But it gets used by networking - and for compound pages.
*/
-static void __page_cache_release(struct page *page)
+static void __page_cache_release(struct folio *folio)
{
- if (PageLRU(page)) {
- struct folio *folio = page_folio(page);
+ if (folio_test_lru(folio)) {
struct lruvec *lruvec;
unsigned long flags;
lruvec = folio_lruvec_lock_irqsave(folio, &flags);
- del_page_from_lru_list(page, lruvec);
- __clear_page_lru_flags(page);
- unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec_del_folio(lruvec, folio);
+ __folio_clear_lru_flags(folio);
+ lruvec_unlock_irqrestore(lruvec, flags);
}
- /* See comment on PageMlocked in release_pages() */
- if (unlikely(PageMlocked(page))) {
- int nr_pages = thp_nr_pages(page);
+ /* See comment on folio_test_mlocked in release_pages() */
+ if (unlikely(folio_test_mlocked(folio))) {
+ long nr_pages = folio_nr_pages(folio);
- __ClearPageMlocked(page);
- mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ __folio_clear_mlocked(folio);
+ zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
}
}
-static void __put_single_page(struct page *page)
+static void __folio_put_small(struct folio *folio)
{
- __page_cache_release(page);
- mem_cgroup_uncharge(page_folio(page));
- free_unref_page(page, 0);
+ __page_cache_release(folio);
+ mem_cgroup_uncharge(folio);
+ free_unref_page(&folio->page, 0);
}
-static void __put_compound_page(struct page *page)
+static void __folio_put_large(struct folio *folio)
{
/*
* __page_cache_release() is supposed to be called for thp, not for
@@ -114,21 +113,21 @@ static void __put_compound_page(struct page *page)
* (it's never listed to any LRU lists) and no memcg routines should
* be called for hugetlb (it has a separate hugetlb_cgroup.)
*/
- if (!PageHuge(page))
- __page_cache_release(page);
- destroy_compound_page(page);
+ if (!folio_test_hugetlb(folio))
+ __page_cache_release(folio);
+ destroy_large_folio(folio);
}
-void __put_page(struct page *page)
+void __folio_put(struct folio *folio)
{
- if (unlikely(is_zone_device_page(page)))
- free_zone_device_page(page);
- else if (unlikely(PageCompound(page)))
- __put_compound_page(page);
+ if (unlikely(folio_is_zone_device(folio)))
+ free_zone_device_page(&folio->page);
+ else if (unlikely(folio_test_large(folio)))
+ __folio_put_large(folio);
else
- __put_single_page(page);
+ __folio_put_small(folio);
}
-EXPORT_SYMBOL(__put_page);
+EXPORT_SYMBOL(__folio_put);
/**
* put_pages_list() - release a list of pages
@@ -138,19 +137,19 @@ EXPORT_SYMBOL(__put_page);
*/
void put_pages_list(struct list_head *pages)
{
- struct page *page, *next;
+ struct folio *folio, *next;
- list_for_each_entry_safe(page, next, pages, lru) {
- if (!put_page_testzero(page)) {
- list_del(&page->lru);
+ list_for_each_entry_safe(folio, next, pages, lru) {
+ if (!folio_put_testzero(folio)) {
+ list_del(&folio->lru);
continue;
}
- if (PageHead(page)) {
- list_del(&page->lru);
- __put_compound_page(page);
+ if (folio_test_large(folio)) {
+ list_del(&folio->lru);
+ __folio_put_large(folio);
continue;
}
- /* Cannot be PageLRU because it's passed to us using the lru */
+ /* LRU flag must be clear because it's passed using the lru */
}
free_unref_page_list(pages);
@@ -188,36 +187,84 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
}
EXPORT_SYMBOL_GPL(get_kernel_pages);
-static void pagevec_lru_move_fn(struct pagevec *pvec,
- void (*move_fn)(struct page *page, struct lruvec *lruvec))
+typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio);
+
+static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
+{
+ int was_unevictable = folio_test_clear_unevictable(folio);
+ long nr_pages = folio_nr_pages(folio);
+
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+
+ /*
+ * Is an smp_mb__after_atomic() still required here, before
+ * folio_evictable() tests the mlocked flag, to rule out the possibility
+ * of stranding an evictable folio on an unevictable LRU? I think
+ * not, because __munlock_page() only clears the mlocked flag
+ * while the LRU lock is held.
+ *
+ * (That is not true of __page_cache_release(), and not necessarily
+ * true of release_pages(): but those only clear the mlocked flag after
+ * folio_put_testzero() has excluded any other users of the folio.)
+ */
+ if (folio_evictable(folio)) {
+ if (was_unevictable)
+ __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
+ } else {
+ folio_clear_active(folio);
+ folio_set_unevictable(folio);
+ /*
+ * folio->mlock_count = !!folio_test_mlocked(folio)?
+ * But that leaves __mlock_page() in doubt whether another
+ * actor has already counted the mlock or not. Err on the
+ * safe side, underestimate, let page reclaim fix it, rather
+ * than leaving a page on the unevictable LRU indefinitely.
+ */
+ folio->mlock_count = 0;
+ if (!was_unevictable)
+ __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
+ }
+
+ lruvec_add_folio(lruvec, folio);
+ trace_mm_lru_insertion(folio);
+}
+
+static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
{
int i;
struct lruvec *lruvec = NULL;
unsigned long flags = 0;
- for (i = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- struct folio *folio = page_folio(page);
+ for (i = 0; i < folio_batch_count(fbatch); i++) {
+ struct folio *folio = fbatch->folios[i];
- /* block memcg migration during page moving between lru */
- if (!TestClearPageLRU(page))
+ /* block memcg migration while the folio moves between lru */
+ if (move_fn != lru_add_fn && !folio_test_clear_lru(folio))
continue;
lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
- (*move_fn)(page, lruvec);
+ move_fn(lruvec, folio);
- SetPageLRU(page);
+ folio_set_lru(folio);
}
+
if (lruvec)
- unlock_page_lruvec_irqrestore(lruvec, flags);
- release_pages(pvec->pages, pvec->nr);
- pagevec_reinit(pvec);
+ lruvec_unlock_irqrestore(lruvec, flags);
+ folios_put(fbatch->folios, folio_batch_count(fbatch));
+ folio_batch_init(fbatch);
}
-static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
+static void folio_batch_add_and_move(struct folio_batch *fbatch,
+ struct folio *folio, move_fn_t move_fn)
{
- struct folio *folio = page_folio(page);
+ if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) &&
+ !lru_cache_disabled())
+ return;
+ folio_batch_move_lru(fbatch, move_fn);
+}
+static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio)
+{
if (!folio_test_unevictable(folio)) {
lruvec_del_folio(lruvec, folio);
folio_clear_active(folio);
@@ -226,18 +273,6 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
}
}
-/* return true if pagevec needs to drain */
-static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
-{
- bool ret = false;
-
- if (!pagevec_add(pvec, page) || PageCompound(page) ||
- lru_cache_disabled())
- ret = true;
-
- return ret;
-}
-
/*
* Writeback is about to end against a folio which has been marked for
* immediate reclaim. If it still appears to be reclaimable, move it
@@ -249,14 +284,13 @@ void folio_rotate_reclaimable(struct folio *folio)
{
if (!folio_test_locked(folio) && !folio_test_dirty(folio) &&
!folio_test_unevictable(folio) && folio_test_lru(folio)) {
- struct pagevec *pvec;
+ struct folio_batch *fbatch;
unsigned long flags;
folio_get(folio);
local_lock_irqsave(&lru_rotate.lock, flags);
- pvec = this_cpu_ptr(&lru_rotate.pvec);
- if (pagevec_add_and_need_flush(pvec, &folio->page))
- pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
+ fbatch = this_cpu_ptr(&lru_rotate.fbatch);
+ folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn);
local_unlock_irqrestore(&lru_rotate.lock, flags);
}
}
@@ -303,11 +337,16 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
void lru_note_cost_folio(struct folio *folio)
{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ /*
+ * The rcu read lock is held by the caller, so we do not need to
+ * care about the lruvec returned by folio_lruvec() being released.
+ */
lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
folio_nr_pages(folio));
}
-static void __folio_activate(struct folio *folio, struct lruvec *lruvec)
+static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio)
{
if (!folio_test_active(folio) && !folio_test_unevictable(folio)) {
long nr_pages = folio_nr_pages(folio);
@@ -324,41 +363,30 @@ static void __folio_activate(struct folio *folio, struct lruvec *lruvec)
}
#ifdef CONFIG_SMP
-static void __activate_page(struct page *page, struct lruvec *lruvec)
+static void folio_activate_drain(int cpu)
{
- return __folio_activate(page_folio(page), lruvec);
-}
+ struct folio_batch *fbatch = &per_cpu(cpu_fbatches.activate, cpu);
-static void activate_page_drain(int cpu)
-{
- struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
-
- if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, __activate_page);
-}
-
-static bool need_activate_page_drain(int cpu)
-{
- return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
+ if (folio_batch_count(fbatch))
+ folio_batch_move_lru(fbatch, folio_activate_fn);
}
static void folio_activate(struct folio *folio)
{
if (folio_test_lru(folio) && !folio_test_active(folio) &&
!folio_test_unevictable(folio)) {
- struct pagevec *pvec;
+ struct folio_batch *fbatch;
folio_get(folio);
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.activate_page);
- if (pagevec_add_and_need_flush(pvec, &folio->page))
- pagevec_lru_move_fn(pvec, __activate_page);
- local_unlock(&lru_pvecs.lock);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.activate);
+ folio_batch_add_and_move(fbatch, folio, folio_activate_fn);
+ local_unlock(&cpu_fbatches.lock);
}
}
#else
-static inline void activate_page_drain(int cpu)
+static inline void folio_activate_drain(int cpu)
{
}
@@ -368,8 +396,8 @@ static void folio_activate(struct folio *folio)
if (folio_test_clear_lru(folio)) {
lruvec = folio_lruvec_lock_irq(folio);
- __folio_activate(folio, lruvec);
- unlock_page_lruvec_irq(lruvec);
+ folio_activate_fn(lruvec, folio);
+ lruvec_unlock_irq(lruvec);
folio_set_lru(folio);
}
}
@@ -377,32 +405,32 @@ static void folio_activate(struct folio *folio)
static void __lru_cache_activate_folio(struct folio *folio)
{
- struct pagevec *pvec;
+ struct folio_batch *fbatch;
int i;
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_add);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
/*
- * Search backwards on the optimistic assumption that the page being
- * activated has just been added to this pagevec. Note that only
- * the local pagevec is examined as a !PageLRU page could be in the
+ * Search backwards on the optimistic assumption that the folio being
+ * activated has just been added to this batch. Note that only
+ * the local batch is examined as a !LRU folio could be in the
* process of being released, reclaimed, migrated or on a remote
- * pagevec that is currently being drained. Furthermore, marking
- * a remote pagevec's page PageActive potentially hits a race where
- * a page is marked PageActive just after it is added to the inactive
+ * batch that is currently being drained. Furthermore, marking
+ * a remote batch's folio active potentially hits a race where
+ * a folio is marked active just after it is added to the inactive
* list causing accounting errors and BUG_ON checks to trigger.
*/
- for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
- struct page *pagevec_page = pvec->pages[i];
+ for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) {
+ struct folio *batch_folio = fbatch->folios[i];
- if (pagevec_page == &folio->page) {
+ if (batch_folio == folio) {
folio_set_active(folio);
break;
}
}
- local_unlock(&lru_pvecs.lock);
+ local_unlock(&cpu_fbatches.lock);
}
/*
@@ -427,9 +455,9 @@ void folio_mark_accessed(struct folio *folio)
*/
} else if (!folio_test_active(folio)) {
/*
- * If the page is on the LRU, queue it for activation via
- * lru_pvecs.activate_page. Otherwise, assume the page is on a
- * pagevec, mark it active and it'll be moved to the active
+ * If the folio is on the LRU, queue it for activation via
+ * cpu_fbatches.activate. Otherwise, assume the folio is in a
+ * folio_batch, mark it active and it'll be moved to the active
* LRU on the next drain.
*/
if (folio_test_lru(folio))
@@ -450,22 +478,22 @@ EXPORT_SYMBOL(folio_mark_accessed);
*
* Queue the folio for addition to the LRU. The decision on whether
* to add the page to the [in]active [file|anon] list is deferred until the
- * pagevec is drained. This gives a chance for the caller of folio_add_lru()
+ * folio_batch is drained. This gives a chance for the caller of folio_add_lru()
* have the folio added to the active list using folio_mark_accessed().
*/
void folio_add_lru(struct folio *folio)
{
- struct pagevec *pvec;
+ struct folio_batch *fbatch;
- VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_active(folio) &&
+ folio_test_unevictable(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
folio_get(folio);
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_add);
- if (pagevec_add_and_need_flush(pvec, &folio->page))
- __pagevec_lru_add(pvec);
- local_unlock(&lru_pvecs.lock);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
+ folio_batch_add_and_move(fbatch, folio, lru_add_fn);
+ local_unlock(&cpu_fbatches.lock);
}
EXPORT_SYMBOL(folio_add_lru);
@@ -489,56 +517,57 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
}
/*
- * If the page can not be invalidated, it is moved to the
+ * If the folio cannot be invalidated, it is moved to the
* inactive list to speed up its reclaim. It is moved to the
* head of the list, rather than the tail, to give the flusher
* threads some time to write it out, as this is much more
* effective than the single-page writeout from reclaim.
*
- * If the page isn't page_mapped and dirty/writeback, the page
- * could reclaim asap using PG_reclaim.
+ * If the folio isn't mapped and dirty/writeback, the folio
+ * could be reclaimed asap using the reclaim flag.
*
- * 1. active, mapped page -> none
- * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
- * 3. inactive, mapped page -> none
- * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
+ * 1. active, mapped folio -> none
+ * 2. active, dirty/writeback folio -> inactive, head, reclaim
+ * 3. inactive, mapped folio -> none
+ * 4. inactive, dirty/writeback folio -> inactive, head, reclaim
* 5. inactive, clean -> inactive, tail
* 6. Others -> none
*
- * In 4, why it moves inactive's head, the VM expects the page would
- * be write it out by flusher threads as this is much more effective
+ * In 4, it moves to the head of the inactive list so the folio is
+ * written out by flusher threads as this is much more efficient
* than the single-page writeout from reclaim.
*/
-static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
+static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio)
{
- bool active = PageActive(page);
- int nr_pages = thp_nr_pages(page);
+ bool active = folio_test_active(folio);
+ long nr_pages = folio_nr_pages(folio);
- if (PageUnevictable(page))
+ if (folio_test_unevictable(folio))
return;
- /* Some processes are using the page */
- if (page_mapped(page))
+ /* Some processes are using the folio */
+ if (folio_mapped(folio))
return;
- del_page_from_lru_list(page, lruvec);
- ClearPageActive(page);
- ClearPageReferenced(page);
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_active(folio);
+ folio_clear_referenced(folio);
- if (PageWriteback(page) || PageDirty(page)) {
+ if (folio_test_writeback(folio) || folio_test_dirty(folio)) {
/*
- * PG_reclaim could be raced with end_page_writeback
- * It can make readahead confusing. But race window
- * is _really_ small and it's non-critical problem.
+ * Setting the reclaim flag could race with
+ * folio_end_writeback() and confuse readahead. But the
+ * race window is _really_ small and it's not a critical
+ * problem.
*/
- add_page_to_lru_list(page, lruvec);
- SetPageReclaim(page);
+ lruvec_add_folio(lruvec, folio);
+ folio_set_reclaim(folio);
} else {
/*
- * The page's writeback ends up during pagevec
- * We move that page into tail of inactive.
+ * The folio's writeback ended while it was in the batch.
+ * We move that folio to the tail of the inactive list.
*/
- add_page_to_lru_list_tail(page, lruvec);
+ lruvec_add_folio_tail(lruvec, folio);
__count_vm_events(PGROTATED, nr_pages);
}
@@ -549,15 +578,15 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
}
}
-static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
+static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio)
{
- if (PageActive(page) && !PageUnevictable(page)) {
- int nr_pages = thp_nr_pages(page);
+ if (folio_test_active(folio) && !folio_test_unevictable(folio)) {
+ long nr_pages = folio_nr_pages(folio);
- del_page_from_lru_list(page, lruvec);
- ClearPageActive(page);
- ClearPageReferenced(page);
- add_page_to_lru_list(page, lruvec);
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_active(folio);
+ folio_clear_referenced(folio);
+ lruvec_add_folio(lruvec, folio);
__count_vm_events(PGDEACTIVATE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
@@ -565,22 +594,22 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
}
}
-static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
+static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio)
{
- if (PageAnon(page) && PageSwapBacked(page) &&
- !PageSwapCache(page) && !PageUnevictable(page)) {
- int nr_pages = thp_nr_pages(page);
+ if (folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) {
+ long nr_pages = folio_nr_pages(folio);
- del_page_from_lru_list(page, lruvec);
- ClearPageActive(page);
- ClearPageReferenced(page);
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_active(folio);
+ folio_clear_referenced(folio);
/*
- * Lazyfree pages are clean anonymous pages. They have
- * PG_swapbacked flag cleared, to distinguish them from normal
- * anonymous pages
+ * Lazyfree folios are clean anonymous folios. They have
+ * the swapbacked flag cleared, to distinguish them from normal
+ * anonymous folios
*/
- ClearPageSwapBacked(page);
- add_page_to_lru_list(page, lruvec);
+ folio_clear_swapbacked(folio);
+ lruvec_add_folio(lruvec, folio);
__count_vm_events(PGLAZYFREE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
@@ -589,71 +618,67 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
}
/*
- * Drain pages out of the cpu's pagevecs.
+ * Drain pages out of the cpu's folio_batch.
* Either "cpu" is the current CPU, and preemption has already been
* disabled; or "cpu" is being hot-unplugged, and is already dead.
*/
void lru_add_drain_cpu(int cpu)
{
- struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu);
+ struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
+ struct folio_batch *fbatch = &fbatches->lru_add;
- if (pagevec_count(pvec))
- __pagevec_lru_add(pvec);
+ if (folio_batch_count(fbatch))
+ folio_batch_move_lru(fbatch, lru_add_fn);
- pvec = &per_cpu(lru_rotate.pvec, cpu);
+ fbatch = &per_cpu(lru_rotate.fbatch, cpu);
/* Disabling interrupts below acts as a compiler barrier. */
- if (data_race(pagevec_count(pvec))) {
+ if (data_race(folio_batch_count(fbatch))) {
unsigned long flags;
/* No harm done if a racing interrupt already did this */
local_lock_irqsave(&lru_rotate.lock, flags);
- pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
+ folio_batch_move_lru(fbatch, lru_move_tail_fn);
local_unlock_irqrestore(&lru_rotate.lock, flags);
}
- pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
- if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
+ fbatch = &fbatches->lru_deactivate_file;
+ if (folio_batch_count(fbatch))
+ folio_batch_move_lru(fbatch, lru_deactivate_file_fn);
- pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu);
- if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn);
+ fbatch = &fbatches->lru_deactivate;
+ if (folio_batch_count(fbatch))
+ folio_batch_move_lru(fbatch, lru_deactivate_fn);
- pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu);
- if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
+ fbatch = &fbatches->lru_lazyfree;
+ if (folio_batch_count(fbatch))
+ folio_batch_move_lru(fbatch, lru_lazyfree_fn);
- activate_page_drain(cpu);
+ folio_activate_drain(cpu);
}
/**
- * deactivate_file_folio() - Forcefully deactivate a file folio.
+ * deactivate_file_folio() - Deactivate a file folio.
* @folio: Folio to deactivate.
*
* This function hints to the VM that @folio is a good reclaim candidate,
* for example if its invalidation fails due to the folio being dirty
* or under writeback.
*
- * Context: Caller holds a reference on the page.
+ * Context: Caller holds a reference on the folio.
*/
void deactivate_file_folio(struct folio *folio)
{
- struct pagevec *pvec;
+ struct folio_batch *fbatch;
- /*
- * In a workload with many unevictable pages such as mprotect,
- * unevictable folio deactivation for accelerating reclaim is pointless.
- */
+ /* Deactivating an unevictable folio will not accelerate reclaim */
if (folio_test_unevictable(folio))
return;
folio_get(folio);
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
-
- if (pagevec_add_and_need_flush(pvec, &folio->page))
- pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
- local_unlock(&lru_pvecs.lock);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file);
+ folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn);
+ local_unlock(&cpu_fbatches.lock);
}
/*
@@ -666,15 +691,17 @@ void deactivate_file_folio(struct folio *folio)
*/
void deactivate_page(struct page *page)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- struct pagevec *pvec;
-
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
- get_page(page);
- if (pagevec_add_and_need_flush(pvec, page))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn);
- local_unlock(&lru_pvecs.lock);
+ struct folio *folio = page_folio(page);
+
+ if (folio_test_lru(folio) && folio_test_active(folio) &&
+ !folio_test_unevictable(folio)) {
+ struct folio_batch *fbatch;
+
+ folio_get(folio);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate);
+ folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn);
+ local_unlock(&cpu_fbatches.lock);
}
}
@@ -687,24 +714,26 @@ void deactivate_page(struct page *page)
*/
void mark_page_lazyfree(struct page *page)
{
- if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
- !PageSwapCache(page) && !PageUnevictable(page)) {
- struct pagevec *pvec;
-
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
- get_page(page);
- if (pagevec_add_and_need_flush(pvec, page))
- pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
- local_unlock(&lru_pvecs.lock);
+ struct folio *folio = page_folio(page);
+
+ if (folio_test_lru(folio) && folio_test_anon(folio) &&
+ folio_test_swapbacked(folio) && !folio_test_swapcache(folio) &&
+ !folio_test_unevictable(folio)) {
+ struct folio_batch *fbatch;
+
+ folio_get(folio);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree);
+ folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn);
+ local_unlock(&cpu_fbatches.lock);
}
}
void lru_add_drain(void)
{
- local_lock(&lru_pvecs.lock);
+ local_lock(&cpu_fbatches.lock);
lru_add_drain_cpu(smp_processor_id());
- local_unlock(&lru_pvecs.lock);
+ local_unlock(&cpu_fbatches.lock);
mlock_page_drain_local();
}
@@ -716,19 +745,19 @@ void lru_add_drain(void)
*/
static void lru_add_and_bh_lrus_drain(void)
{
- local_lock(&lru_pvecs.lock);
+ local_lock(&cpu_fbatches.lock);
lru_add_drain_cpu(smp_processor_id());
- local_unlock(&lru_pvecs.lock);
+ local_unlock(&cpu_fbatches.lock);
invalidate_bh_lrus_cpu();
mlock_page_drain_local();
}
void lru_add_drain_cpu_zone(struct zone *zone)
{
- local_lock(&lru_pvecs.lock);
+ local_lock(&cpu_fbatches.lock);
lru_add_drain_cpu(smp_processor_id());
drain_local_pages(zone);
- local_unlock(&lru_pvecs.lock);
+ local_unlock(&cpu_fbatches.lock);
mlock_page_drain_local();
}
@@ -741,6 +770,21 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
lru_add_and_bh_lrus_drain();
}
+static bool cpu_needs_drain(unsigned int cpu)
+{
+ struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
+
+ /* Check these in order of likelihood that they're not zero */
+ return folio_batch_count(&fbatches->lru_add) ||
+ data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) ||
+ folio_batch_count(&fbatches->lru_deactivate_file) ||
+ folio_batch_count(&fbatches->lru_deactivate) ||
+ folio_batch_count(&fbatches->lru_lazyfree) ||
+ folio_batch_count(&fbatches->activate) ||
+ need_mlock_page_drain(cpu) ||
+ has_bh_in_lru(cpu, NULL);
+}
+
/*
* Doesn't need any cpu hotplug locking because we do rely on per-cpu
* kworkers being shut down before our page_alloc_cpu_dead callback is
@@ -773,8 +817,9 @@ static inline void __lru_add_drain_all(bool force_all_cpus)
return;
/*
- * Guarantee pagevec counter stores visible by this CPU are visible to
- * other CPUs before loading the current drain generation.
+ * Guarantee folio_batch counter stores visible by this CPU
+ * are visible to other CPUs before loading the current drain
+ * generation.
*/
smp_mb();
@@ -800,8 +845,9 @@ static inline void __lru_add_drain_all(bool force_all_cpus)
* (D) Increment global generation number
*
* Pairs with smp_load_acquire() at (B), outside of the critical
- * section. Use a full memory barrier to guarantee that the new global
- * drain generation number is stored before loading pagevec counters.
+ * section. Use a full memory barrier to guarantee that the
+ * new global drain generation number is stored before loading
+ * folio_batch counters.
*
* This pairing must be done here, before the for_each_online_cpu loop
* below which drains the page vectors.
@@ -823,14 +869,7 @@ static inline void __lru_add_drain_all(bool force_all_cpus)
for_each_online_cpu(cpu) {
struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
- if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
- data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
- pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
- pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
- pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
- need_activate_page_drain(cpu) ||
- need_mlock_page_drain(cpu) ||
- has_bh_in_lru(cpu, NULL)) {
+ if (cpu_needs_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
queue_work_on(cpu, mm_percpu_wq, work);
__cpumask_set_cpu(cpu, &has_work);
@@ -906,8 +945,7 @@ void release_pages(struct page **pages, int nr)
unsigned int lock_batch;
for (i = 0; i < nr; i++) {
- struct page *page = pages[i];
- struct folio *folio = page_folio(page);
+ struct folio *folio = page_folio(pages[i]);
/*
* Make sure the IRQ-safe lock-holding time does not get
@@ -915,39 +953,38 @@ void release_pages(struct page **pages, int nr)
* same lruvec. The lock is held only if lruvec != NULL.
*/
if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
- unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec_unlock_irqrestore(lruvec, flags);
lruvec = NULL;
}
- page = &folio->page;
- if (is_huge_zero_page(page))
+ if (is_huge_zero_page(&folio->page))
continue;
- if (is_zone_device_page(page)) {
+ if (folio_is_zone_device(folio)) {
if (lruvec) {
- unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec_unlock_irqrestore(lruvec, flags);
lruvec = NULL;
}
- if (put_devmap_managed_page(page))
+ if (put_devmap_managed_page(&folio->page))
continue;
- if (put_page_testzero(page))
- free_zone_device_page(page);
+ if (folio_put_testzero(folio))
+ free_zone_device_page(&folio->page);
continue;
}
- if (!put_page_testzero(page))
+ if (!folio_put_testzero(folio))
continue;
- if (PageCompound(page)) {
+ if (folio_test_large(folio)) {
if (lruvec) {
- unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec_unlock_irqrestore(lruvec, flags);
lruvec = NULL;
}
- __put_compound_page(page);
+ __folio_put_large(folio);
continue;
}
- if (PageLRU(page)) {
+ if (folio_test_lru(folio)) {
struct lruvec *prev_lruvec = lruvec;
lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
@@ -955,8 +992,8 @@ void release_pages(struct page **pages, int nr)
if (prev_lruvec != lruvec)
lock_batch = 0;
- del_page_from_lru_list(page, lruvec);
- __clear_page_lru_flags(page);
+ lruvec_del_folio(lruvec, folio);
+ __folio_clear_lru_flags(folio);
}
/*
@@ -965,16 +1002,16 @@ void release_pages(struct page **pages, int nr)
* found set here. This does not indicate a problem, unless
* "unevictable_pgs_cleared" appears worryingly large.
*/
- if (unlikely(PageMlocked(page))) {
- __ClearPageMlocked(page);
- dec_zone_page_state(page, NR_MLOCK);
+ if (unlikely(folio_test_mlocked(folio))) {
+ __folio_clear_mlocked(folio);
+ zone_stat_sub_folio(folio, NR_MLOCK);
count_vm_event(UNEVICTABLE_PGCLEARED);
}
- list_add(&page->lru, &pages_to_free);
+ list_add(&folio->lru, &pages_to_free);
}
if (lruvec)
- unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec_unlock_irqrestore(lruvec, flags);
mem_cgroup_uncharge_list(&pages_to_free);
free_unref_page_list(&pages_to_free);
@@ -987,8 +1024,8 @@ EXPORT_SYMBOL(release_pages);
* OK from a correctness point of view but is inefficient - those pages may be
* cache-warm and we want to give them back to the page allocator ASAP.
*
- * So __pagevec_release() will drain those queues here. __pagevec_lru_add()
- * and __pagevec_lru_add_active() call release_pages() directly to avoid
+ * So __pagevec_release() will drain those queues here.
+ * folio_batch_move_lru() calls folios_put() directly to avoid
* mutual recursion.
*/
void __pagevec_release(struct pagevec *pvec)
@@ -1002,69 +1039,6 @@ void __pagevec_release(struct pagevec *pvec)
}
EXPORT_SYMBOL(__pagevec_release);
-static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec)
-{
- int was_unevictable = folio_test_clear_unevictable(folio);
- long nr_pages = folio_nr_pages(folio);
-
- VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
-
- folio_set_lru(folio);
- /*
- * Is an smp_mb__after_atomic() still required here, before
- * folio_evictable() tests PageMlocked, to rule out the possibility
- * of stranding an evictable folio on an unevictable LRU? I think
- * not, because __munlock_page() only clears PageMlocked while the LRU
- * lock is held.
- *
- * (That is not true of __page_cache_release(), and not necessarily
- * true of release_pages(): but those only clear PageMlocked after
- * put_page_testzero() has excluded any other users of the page.)
- */
- if (folio_evictable(folio)) {
- if (was_unevictable)
- __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
- } else {
- folio_clear_active(folio);
- folio_set_unevictable(folio);
- /*
- * folio->mlock_count = !!folio_test_mlocked(folio)?
- * But that leaves __mlock_page() in doubt whether another
- * actor has already counted the mlock or not. Err on the
- * safe side, underestimate, let page reclaim fix it, rather
- * than leaving a page on the unevictable LRU indefinitely.
- */
- folio->mlock_count = 0;
- if (!was_unevictable)
- __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
- }
-
- lruvec_add_folio(lruvec, folio);
- trace_mm_lru_insertion(folio);
-}
-
-/*
- * Add the passed pages to the LRU, then drop the caller's refcount
- * on them. Reinitialises the caller's pagevec.
- */
-void __pagevec_lru_add(struct pagevec *pvec)
-{
- int i;
- struct lruvec *lruvec = NULL;
- unsigned long flags = 0;
-
- for (i = 0; i < pagevec_count(pvec); i++) {
- struct folio *folio = page_folio(pvec->pages[i]);
-
- lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
- __pagevec_lru_add_fn(folio, lruvec);
- }
- if (lruvec)
- unlock_page_lruvec_irqrestore(lruvec, flags);
- release_pages(pvec->pages, pvec->nr);
- pagevec_reinit(pvec);
-}
-
/**
* folio_batch_remove_exceptionals() - Prune non-folios from a batch.
* @fbatch: The batch to prune
diff --git a/mm/swap.h b/mm/swap.h
index 0193797b0c92..17936e068c1c 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -36,12 +36,11 @@ bool add_to_swap(struct folio *folio);
void *get_shadow_from_swap_cache(swp_entry_t entry);
int add_to_swap_cache(struct page *page, swp_entry_t entry,
gfp_t gfp, void **shadowp);
-void __delete_from_swap_cache(struct page *page,
+void __delete_from_swap_cache(struct folio *folio,
swp_entry_t entry, void *shadow);
-void delete_from_swap_cache(struct page *page);
+void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
-void free_swap_cache(struct page *page);
struct page *lookup_swap_cache(swp_entry_t entry,
struct vm_area_struct *vma,
unsigned long addr);
@@ -61,9 +60,9 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
struct vm_fault *vmf);
-static inline unsigned int page_swap_flags(struct page *page)
+static inline unsigned int folio_swap_flags(struct folio *folio)
{
- return page_swap_info(page)->flags;
+ return page_swap_info(&folio->page)->flags;
}
#else /* CONFIG_SWAP */
struct swap_iocb;
@@ -81,10 +80,6 @@ static inline struct address_space *swap_address_space(swp_entry_t entry)
return NULL;
}
-static inline void free_swap_cache(struct page *page)
-{
-}
-
static inline void show_swap_cache_info(void)
{
}
@@ -135,12 +130,12 @@ static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
return -1;
}
-static inline void __delete_from_swap_cache(struct page *page,
+static inline void __delete_from_swap_cache(struct folio *folio,
swp_entry_t entry, void *shadow)
{
}
-static inline void delete_from_swap_cache(struct page *page)
+static inline void delete_from_swap_cache(struct folio *folio)
{
}
@@ -149,7 +144,7 @@ static inline void clear_shadow_from_swap_cache(int type, unsigned long begin,
{
}
-static inline unsigned int page_swap_flags(struct page *page)
+static inline unsigned int folio_swap_flags(struct folio *folio)
{
return 0;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0a2021fc55ad..e166051566f4 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -59,24 +59,11 @@ static bool enable_vma_readahead __read_mostly = true;
#define GET_SWAP_RA_VAL(vma) \
(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
-#define INC_CACHE_INFO(x) data_race(swap_cache_info.x++)
-#define ADD_CACHE_INFO(x, nr) data_race(swap_cache_info.x += (nr))
-
-static struct {
- unsigned long add_total;
- unsigned long del_total;
- unsigned long find_success;
- unsigned long find_total;
-} swap_cache_info;
-
static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
void show_swap_cache_info(void)
{
printk("%lu pages in swap cache\n", total_swapcache_pages());
- printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
- swap_cache_info.add_total, swap_cache_info.del_total,
- swap_cache_info.find_success, swap_cache_info.find_total);
printk("Free swap = %ldkB\n",
get_nr_swap_pages() << (PAGE_SHIFT - 10));
printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
@@ -133,7 +120,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
address_space->nrpages += nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
__mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
- ADD_CACHE_INFO(add_total, nr);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
@@ -147,32 +133,32 @@ unlock:
}
/*
- * This must be called only on pages that have
+ * This must be called only on folios that have
* been verified to be in the swap cache.
*/
-void __delete_from_swap_cache(struct page *page,
+void __delete_from_swap_cache(struct folio *folio,
swp_entry_t entry, void *shadow)
{
struct address_space *address_space = swap_address_space(entry);
- int i, nr = thp_nr_pages(page);
+ int i;
+ long nr = folio_nr_pages(folio);
pgoff_t idx = swp_offset(entry);
XA_STATE(xas, &address_space->i_pages, idx);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- VM_BUG_ON_PAGE(PageWriteback(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
for (i = 0; i < nr; i++) {
void *entry = xas_store(&xas, shadow);
- VM_BUG_ON_PAGE(entry != page, entry);
- set_page_private(page + i, 0);
+ VM_BUG_ON_FOLIO(entry != folio, folio);
+ set_page_private(folio_page(folio, i), 0);
xas_next(&xas);
}
- ClearPageSwapCache(page);
+ folio_clear_swapcache(folio);
address_space->nrpages -= nr;
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
- __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr);
- ADD_CACHE_INFO(del_total, nr);
+ __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
+ __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
}
/**
@@ -237,22 +223,22 @@ fail:
}
/*
- * This must be called only on pages that have
+ * This must be called only on folios that have
* been verified to be in the swap cache and locked.
- * It will never put the page into the free list,
- * the caller has a reference on the page.
+ * It will never put the folio into the free list,
+ * the caller has a reference on the folio.
*/
-void delete_from_swap_cache(struct page *page)
+void delete_from_swap_cache(struct folio *folio)
{
- swp_entry_t entry = { .val = page_private(page) };
+ swp_entry_t entry = folio_swap_entry(folio);
struct address_space *address_space = swap_address_space(entry);
xa_lock_irq(&address_space->i_pages);
- __delete_from_swap_cache(page, entry, NULL);
+ __delete_from_swap_cache(folio, entry, NULL);
xa_unlock_irq(&address_space->i_pages);
- put_swap_page(page, entry);
- page_ref_sub(page, thp_nr_pages(page));
+ put_swap_page(&folio->page, entry);
+ folio_ref_sub(folio, folio_nr_pages(folio));
}
void clear_shadow_from_swap_cache(int type, unsigned long begin,
@@ -348,12 +334,10 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
page = find_get_page(swap_address_space(entry), swp_offset(entry));
put_swap_device(si);
- INC_CACHE_INFO(find_total);
if (page) {
bool vma_ra = swap_use_vma_readahead();
bool readahead;
- INC_CACHE_INFO(find_success);
/*
* At the moment, we don't support PG_readahead for anon THP
* so let's bail out rather than confusing the readahead stat.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a2e66d855b19..5c8681a3f1d9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -695,7 +695,7 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
si->lowest_bit += nr_entries;
if (end == si->highest_bit)
WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
- si->inuse_pages += nr_entries;
+ WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
if (si->inuse_pages == si->pages) {
si->lowest_bit = si->max;
si->highest_bit = 0;
@@ -732,7 +732,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
add_to_avail_list(si);
}
atomic_long_add(nr_entries, &nr_swap_pages);
- si->inuse_pages -= nr_entries;
+ WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
if (si->flags & SWP_BLKDEV)
swap_slot_free_notify =
si->bdev->bd_disk->fops->swap_slot_free_notify;
@@ -1568,16 +1568,15 @@ unlock_out:
return ret;
}
-static bool page_swapped(struct page *page)
+static bool folio_swapped(struct folio *folio)
{
swp_entry_t entry;
struct swap_info_struct *si;
- if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
- return page_swapcount(page) != 0;
+ if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
+ return page_swapcount(&folio->page) != 0;
- page = compound_head(page);
- entry.val = page_private(page);
+ entry = folio_swap_entry(folio);
si = _swap_info_get(entry);
if (si)
return swap_page_trans_huge_swapped(si, entry);
@@ -1590,13 +1589,14 @@ static bool page_swapped(struct page *page)
*/
int try_to_free_swap(struct page *page)
{
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ struct folio *folio = page_folio(page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- if (!PageSwapCache(page))
+ if (!folio_test_swapcache(folio))
return 0;
- if (PageWriteback(page))
+ if (folio_test_writeback(folio))
return 0;
- if (page_swapped(page))
+ if (folio_swapped(folio))
return 0;
/*
@@ -1617,9 +1617,8 @@ int try_to_free_swap(struct page *page)
if (pm_suspended_storage())
return 0;
- page = compound_head(page);
- delete_from_swap_cache(page);
- SetPageDirty(page);
+ delete_from_swap_cache(folio);
+ folio_set_dirty(folio);
return 1;
}
@@ -1991,14 +1990,16 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type)
{
struct vm_area_struct *vma;
int ret = 0;
+ VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (vma->anon_vma) {
ret = unuse_vma(vma, type);
if (ret)
break;
}
+
cond_resched();
}
mmap_read_unlock(mm);
@@ -2640,7 +2641,7 @@ static int swap_show(struct seq_file *swap, void *v)
}
bytes = si->pages << (PAGE_SHIFT - 10);
- inuse = si->inuse_pages << (PAGE_SHIFT - 10);
+ inuse = READ_ONCE(si->inuse_pages) << (PAGE_SHIFT - 10);
file = si->swap_file;
len = seq_file_path(swap, file, " \t\n\\");
@@ -3259,7 +3260,7 @@ void si_swapinfo(struct sysinfo *val)
struct swap_info_struct *si = swap_info[type];
if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
- nr_to_be_unused += si->inuse_pages;
+ nr_to_be_unused += READ_ONCE(si->inuse_pages);
}
val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
val->totalswap = total_swap_pages + nr_to_be_unused;
diff --git a/mm/util.c b/mm/util.c
index c9439c66d8cf..1266a33a49ea 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -272,38 +272,6 @@ void *memdup_user_nul(const void __user *src, size_t len)
}
EXPORT_SYMBOL(memdup_user_nul);
-void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev)
-{
- struct vm_area_struct *next;
-
- vma->vm_prev = prev;
- if (prev) {
- next = prev->vm_next;
- prev->vm_next = vma;
- } else {
- next = mm->mmap;
- mm->mmap = vma;
- }
- vma->vm_next = next;
- if (next)
- next->vm_prev = vma;
-}
-
-void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- struct vm_area_struct *prev, *next;
-
- next = vma->vm_next;
- prev = vma->vm_prev;
- if (prev)
- prev->vm_next = next;
- else
- mm->mmap = next;
- if (next)
- next->vm_prev = prev;
-}
-
/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
diff --git a/mm/vmacache.c b/mm/vmacache.c
deleted file mode 100644
index 01a6e6688ec1..000000000000
--- a/mm/vmacache.c
+++ /dev/null
@@ -1,117 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2014 Davidlohr Bueso.
- */
-#include <linux/sched/signal.h>
-#include <linux/sched/task.h>
-#include <linux/mm.h>
-#include <linux/vmacache.h>
-
-/*
- * Hash based on the pmd of addr if configured with MMU, which provides a good
- * hit rate for workloads with spatial locality. Otherwise, use pages.
- */
-#ifdef CONFIG_MMU
-#define VMACACHE_SHIFT PMD_SHIFT
-#else
-#define VMACACHE_SHIFT PAGE_SHIFT
-#endif
-#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK)
-
-/*
- * This task may be accessing a foreign mm via (for example)
- * get_user_pages()->find_vma(). The vmacache is task-local and this
- * task's vmacache pertains to a different mm (ie, its own). There is
- * nothing we can do here.
- *
- * Also handle the case where a kernel thread has adopted this mm via
- * kthread_use_mm(). That kernel thread's vmacache is not applicable to this mm.
- */
-static inline bool vmacache_valid_mm(struct mm_struct *mm)
-{
- return current->mm == mm && !(current->flags & PF_KTHREAD);
-}
-
-void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
-{
- if (vmacache_valid_mm(newvma->vm_mm))
- current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma;
-}
-
-static bool vmacache_valid(struct mm_struct *mm)
-{
- struct task_struct *curr;
-
- if (!vmacache_valid_mm(mm))
- return false;
-
- curr = current;
- if (mm->vmacache_seqnum != curr->vmacache.seqnum) {
- /*
- * First attempt will always be invalid, initialize
- * the new cache for this task here.
- */
- curr->vmacache.seqnum = mm->vmacache_seqnum;
- vmacache_flush(curr);
- return false;
- }
- return true;
-}
-
-struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
-{
- int idx = VMACACHE_HASH(addr);
- int i;
-
- count_vm_vmacache_event(VMACACHE_FIND_CALLS);
-
- if (!vmacache_valid(mm))
- return NULL;
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache.vmas[idx];
-
- if (vma) {
-#ifdef CONFIG_DEBUG_VM_VMACACHE
- if (WARN_ON_ONCE(vma->vm_mm != mm))
- break;
-#endif
- if (vma->vm_start <= addr && vma->vm_end > addr) {
- count_vm_vmacache_event(VMACACHE_FIND_HITS);
- return vma;
- }
- }
- if (++idx == VMACACHE_SIZE)
- idx = 0;
- }
-
- return NULL;
-}
-
-#ifndef CONFIG_MMU
-struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
-{
- int idx = VMACACHE_HASH(start);
- int i;
-
- count_vm_vmacache_event(VMACACHE_FIND_CALLS);
-
- if (!vmacache_valid(mm))
- return NULL;
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache.vmas[idx];
-
- if (vma && vma->vm_start == start && vma->vm_end == end) {
- count_vm_vmacache_event(VMACACHE_FIND_HITS);
- return vma;
- }
- if (++idx == VMACACHE_SIZE)
- idx = 0;
- }
-
- return NULL;
-}
-#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5977b178694d..dd6cdb201195 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -815,9 +815,9 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
return va;
}
-static struct vmap_area *__find_vmap_area(unsigned long addr)
+static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
{
- struct rb_node *n = vmap_area_root.rb_node;
+ struct rb_node *n = root->rb_node;
addr = (unsigned long)kasan_reset_tag((void *)addr);
@@ -910,8 +910,9 @@ get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
}
static __always_inline void
-link_va(struct vmap_area *va, struct rb_root *root,
- struct rb_node *parent, struct rb_node **link, struct list_head *head)
+__link_va(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link,
+ struct list_head *head, bool augment)
{
/*
* VA is still not in the list, but we can
@@ -925,7 +926,7 @@ link_va(struct vmap_area *va, struct rb_root *root,
/* Insert to the rb-tree */
rb_link_node(&va->rb_node, parent, link);
- if (root == &free_vmap_area_root) {
+ if (augment) {
/*
* Some explanation here. Just perform simple insertion
* to the tree. We do not set va->subtree_max_size to
@@ -949,21 +950,49 @@ link_va(struct vmap_area *va, struct rb_root *root,
}
static __always_inline void
-unlink_va(struct vmap_area *va, struct rb_root *root)
+link_va(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link,
+ struct list_head *head)
+{
+ __link_va(va, root, parent, link, head, false);
+}
+
+static __always_inline void
+link_va_augment(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link,
+ struct list_head *head)
+{
+ __link_va(va, root, parent, link, head, true);
+}
+
+static __always_inline void
+__unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
{
if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
return;
- if (root == &free_vmap_area_root)
+ if (augment)
rb_erase_augmented(&va->rb_node,
root, &free_vmap_area_rb_augment_cb);
else
rb_erase(&va->rb_node, root);
- list_del(&va->list);
+ list_del_init(&va->list);
RB_CLEAR_NODE(&va->rb_node);
}
+static __always_inline void
+unlink_va(struct vmap_area *va, struct rb_root *root)
+{
+ __unlink_va(va, root, false);
+}
+
+static __always_inline void
+unlink_va_augment(struct vmap_area *va, struct rb_root *root)
+{
+ __unlink_va(va, root, true);
+}
+
#if DEBUG_AUGMENT_PROPAGATE_CHECK
/*
* Gets called when remove the node and rotate.
@@ -1059,7 +1088,7 @@ insert_vmap_area_augment(struct vmap_area *va,
link = find_va_links(va, root, NULL, &parent);
if (link) {
- link_va(va, root, parent, link, head);
+ link_va_augment(va, root, parent, link, head);
augment_tree_propagate_from(va);
}
}
@@ -1076,8 +1105,8 @@ insert_vmap_area_augment(struct vmap_area *va,
* ongoing.
*/
static __always_inline struct vmap_area *
-merge_or_add_vmap_area(struct vmap_area *va,
- struct rb_root *root, struct list_head *head)
+__merge_or_add_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head, bool augment)
{
struct vmap_area *sibling;
struct list_head *next;
@@ -1139,7 +1168,7 @@ merge_or_add_vmap_area(struct vmap_area *va,
* "normalized" because of rotation operations.
*/
if (merged)
- unlink_va(va, root);
+ __unlink_va(va, root, augment);
sibling->va_end = va->va_end;
@@ -1154,16 +1183,23 @@ merge_or_add_vmap_area(struct vmap_area *va,
insert:
if (!merged)
- link_va(va, root, parent, link, head);
+ __link_va(va, root, parent, link, head, augment);
return va;
}
static __always_inline struct vmap_area *
+merge_or_add_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ return __merge_or_add_vmap_area(va, root, head, false);
+}
+
+static __always_inline struct vmap_area *
merge_or_add_vmap_area_augment(struct vmap_area *va,
struct rb_root *root, struct list_head *head)
{
- va = merge_or_add_vmap_area(va, root, head);
+ va = __merge_or_add_vmap_area(va, root, head, true);
if (va)
augment_tree_propagate_from(va);
@@ -1197,15 +1233,15 @@ is_within_this_va(struct vmap_area *va, unsigned long size,
* overhead.
*/
static __always_inline struct vmap_area *
-find_vmap_lowest_match(unsigned long size, unsigned long align,
- unsigned long vstart, bool adjust_search_size)
+find_vmap_lowest_match(struct rb_root *root, unsigned long size,
+ unsigned long align, unsigned long vstart, bool adjust_search_size)
{
struct vmap_area *va;
struct rb_node *node;
unsigned long length;
/* Start from the root. */
- node = free_vmap_area_root.rb_node;
+ node = root->rb_node;
/* Adjust the search size for alignment overhead. */
length = adjust_search_size ? size + align - 1 : size;
@@ -1333,8 +1369,9 @@ classify_va_fit_type(struct vmap_area *va,
}
static __always_inline int
-adjust_va_to_fit_type(struct vmap_area *va,
- unsigned long nva_start_addr, unsigned long size)
+adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
+ struct vmap_area *va, unsigned long nva_start_addr,
+ unsigned long size)
{
struct vmap_area *lva = NULL;
enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);
@@ -1347,7 +1384,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
* V NVA V
* |---------------|
*/
- unlink_va(va, &free_vmap_area_root);
+ unlink_va_augment(va, root);
kmem_cache_free(vmap_area_cachep, va);
} else if (type == LE_FIT_TYPE) {
/*
@@ -1425,8 +1462,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
augment_tree_propagate_from(va);
if (lva) /* type == NE_FIT_TYPE */
- insert_vmap_area_augment(lva, &va->rb_node,
- &free_vmap_area_root, &free_vmap_area_list);
+ insert_vmap_area_augment(lva, &va->rb_node, root, head);
}
return 0;
@@ -1437,7 +1473,8 @@ adjust_va_to_fit_type(struct vmap_area *va,
* Otherwise a vend is returned that indicates failure.
*/
static __always_inline unsigned long
-__alloc_vmap_area(unsigned long size, unsigned long align,
+__alloc_vmap_area(struct rb_root *root, struct list_head *head,
+ unsigned long size, unsigned long align,
unsigned long vstart, unsigned long vend)
{
bool adjust_search_size = true;
@@ -1457,7 +1494,7 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
adjust_search_size = false;
- va = find_vmap_lowest_match(size, align, vstart, adjust_search_size);
+ va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
if (unlikely(!va))
return vend;
@@ -1471,7 +1508,7 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
return vend;
/* Update the free vmap_area. */
- ret = adjust_va_to_fit_type(va, nva_start_addr, size);
+ ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size);
if (WARN_ON_ONCE(ret))
return vend;
@@ -1562,7 +1599,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
retry:
preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
- addr = __alloc_vmap_area(size, align, vstart, vend);
+ addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
+ size, align, vstart, vend);
spin_unlock(&free_vmap_area_lock);
/*
@@ -1796,7 +1834,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
struct vmap_area *va;
spin_lock(&vmap_area_lock);
- va = __find_vmap_area(addr);
+ va = __find_vmap_area(addr, &vmap_area_root);
spin_unlock(&vmap_area_lock);
return va;
@@ -2539,7 +2577,7 @@ struct vm_struct *remove_vm_area(const void *addr)
might_sleep();
spin_lock(&vmap_area_lock);
- va = __find_vmap_area((unsigned long)addr);
+ va = __find_vmap_area((unsigned long)addr, &vmap_area_root);
if (va && va->vm) {
struct vm_struct *vm = va->vm;
@@ -3161,15 +3199,15 @@ again:
/*
* Mark the pages as accessible, now that they are mapped.
- * The init condition should match the one in post_alloc_hook()
- * (except for the should_skip_init() check) to make sure that memory
- * is initialized under the same conditions regardless of the enabled
- * KASAN mode.
+ * The condition for setting KASAN_VMALLOC_INIT should complement the
+ * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
+ * to make sure that memory is initialized under the same conditions.
* Tag-based KASAN modes only assign tags to normal non-executable
* allocations, see __kasan_unpoison_vmalloc().
*/
kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
- if (!want_init_on_free() && want_init_on_alloc(gfp_mask))
+ if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
+ (gfp_mask & __GFP_SKIP_ZERO))
kasan_flags |= KASAN_VMALLOC_INIT;
/* KASAN_VMALLOC_PROT_NORMAL already set if required. */
area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
@@ -3838,7 +3876,9 @@ retry:
/* It is a BUG(), but trigger recovery instead. */
goto recovery;
- ret = adjust_va_to_fit_type(va, start, size);
+ ret = adjust_va_to_fit_type(&free_vmap_area_root,
+ &free_vmap_area_list,
+ va, start, size);
if (WARN_ON_ONCE(unlikely(ret)))
/* It is a BUG(), but trigger recovery instead. */
goto recovery;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 04f8671caad9..0070c7fb600a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -26,8 +26,7 @@
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
-#include <linux/buffer_head.h> /* for try_to_release_page(),
- buffer_heads_over_limit */
+#include <linux/buffer_head.h> /* for buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
@@ -160,17 +159,17 @@ struct scan_control {
};
#ifdef ARCH_HAS_PREFETCHW
-#define prefetchw_prev_lru_page(_page, _base, _field) \
+#define prefetchw_prev_lru_folio(_folio, _base, _field) \
do { \
- if ((_page)->lru.prev != _base) { \
- struct page *prev; \
+ if ((_folio)->lru.prev != _base) { \
+ struct folio *prev; \
\
- prev = lru_to_page(&(_page->lru)); \
+ prev = lru_to_folio(&(_folio->lru)); \
prefetchw(&prev->_field); \
} \
} while (0)
#else
-#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
+#define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
#endif
/*
@@ -190,8 +189,8 @@ static void set_task_reclaim_state(struct task_struct *task,
task->reclaim_state = rs;
}
-static LIST_HEAD(shrinker_list);
-static DECLARE_RWSEM(shrinker_rwsem);
+LIST_HEAD(shrinker_list);
+DECLARE_RWSEM(shrinker_rwsem);
#ifdef CONFIG_MEMCG
static int shrinker_nr_max;
@@ -410,13 +409,9 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
int i, nid;
long nr;
- struct mem_cgroup *parent;
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
struct shrinker_info *child_info, *parent_info;
- parent = parent_mem_cgroup(memcg);
- if (!parent)
- parent = root_mem_cgroup;
-
/* Prevent from concurrent shrinker_info expand */
down_read(&shrinker_rwsem);
for_each_node(nid) {
@@ -608,7 +603,7 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
/*
* Add a shrinker callback to be called from the vm.
*/
-int prealloc_shrinker(struct shrinker *shrinker)
+static int __prealloc_shrinker(struct shrinker *shrinker)
{
unsigned int size;
int err;
@@ -632,8 +627,36 @@ int prealloc_shrinker(struct shrinker *shrinker)
return 0;
}
+#ifdef CONFIG_SHRINKER_DEBUG
+int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+ va_list ap;
+ int err;
+
+ va_start(ap, fmt);
+ shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+ if (!shrinker->name)
+ return -ENOMEM;
+
+ err = __prealloc_shrinker(shrinker);
+ if (err)
+ kfree_const(shrinker->name);
+
+ return err;
+}
+#else
+int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+ return __prealloc_shrinker(shrinker);
+}
+#endif
+
void free_prealloced_shrinker(struct shrinker *shrinker)
{
+#ifdef CONFIG_SHRINKER_DEBUG
+ kfree_const(shrinker->name);
+#endif
if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
down_write(&shrinker_rwsem);
unregister_memcg_shrinker(shrinker);
@@ -650,18 +673,43 @@ void register_shrinker_prepared(struct shrinker *shrinker)
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
shrinker->flags |= SHRINKER_REGISTERED;
+ shrinker_debugfs_add(shrinker);
up_write(&shrinker_rwsem);
}
-int register_shrinker(struct shrinker *shrinker)
+static int __register_shrinker(struct shrinker *shrinker)
{
- int err = prealloc_shrinker(shrinker);
+ int err = __prealloc_shrinker(shrinker);
if (err)
return err;
register_shrinker_prepared(shrinker);
return 0;
}
+
+#ifdef CONFIG_SHRINKER_DEBUG
+int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+ va_list ap;
+ int err;
+
+ va_start(ap, fmt);
+ shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+ if (!shrinker->name)
+ return -ENOMEM;
+
+ err = __register_shrinker(shrinker);
+ if (err)
+ kfree_const(shrinker->name);
+ return err;
+}
+#else
+int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+ return __register_shrinker(shrinker);
+}
+#endif
EXPORT_SYMBOL(register_shrinker);
/*
@@ -677,6 +725,7 @@ void unregister_shrinker(struct shrinker *shrinker)
shrinker->flags &= ~SHRINKER_REGISTERED;
if (shrinker->flags & SHRINKER_MEMCG_AWARE)
unregister_memcg_shrinker(shrinker);
+ shrinker_debugfs_remove(shrinker);
up_write(&shrinker_rwsem);
kfree(shrinker->nr_deferred);
@@ -1276,7 +1325,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
mem_cgroup_swapout(folio, swap);
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
- __delete_from_swap_cache(&folio->page, swap, shadow);
+ __delete_from_swap_cache(folio, swap, shadow);
xa_unlock_irq(&mapping->i_pages);
put_swap_page(&folio->page, swap);
} else {
@@ -1519,7 +1568,7 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
* but that will never affect SWP_FS_OPS, so the data_race
* is safe.
*/
- return !data_race(page_swap_flags(&folio->page) & SWP_FS_OPS);
+ return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
}
/*
@@ -1556,13 +1605,19 @@ retry:
folio = lru_to_folio(page_list);
list_del(&folio->lru);
+ nr_pages = folio_nr_pages(folio);
+
+ if (folio_ref_count(folio) == 1 &&
+ folio_ref_freeze(folio, 1)) {
+ /* folio was freed from under us. So we are done. */
+ goto free_it;
+ }
+
if (!folio_trylock(folio))
goto keep;
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
- nr_pages = folio_nr_pages(folio);
-
/* Account the number of base pages */
sc->nr_scanned += nr_pages;
@@ -1926,7 +1981,7 @@ free_it:
* appear not as the counts should be low
*/
if (unlikely(folio_test_large(folio)))
- destroy_compound_page(&folio->page);
+ destroy_large_folio(folio);
else
list_add(&folio->lru, &free_pages);
continue;
@@ -1987,7 +2042,7 @@ keep:
}
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
- struct list_head *page_list)
+ struct list_head *folio_list)
{
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
@@ -1995,16 +2050,16 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
};
struct reclaim_stat stat;
unsigned int nr_reclaimed;
- struct page *page, *next;
- LIST_HEAD(clean_pages);
+ struct folio *folio, *next;
+ LIST_HEAD(clean_folios);
unsigned int noreclaim_flag;
- list_for_each_entry_safe(page, next, page_list, lru) {
- if (!PageHuge(page) && page_is_file_lru(page) &&
- !PageDirty(page) && !__PageMovable(page) &&
- !PageUnevictable(page)) {
- ClearPageActive(page);
- list_move(&page->lru, &clean_pages);
+ list_for_each_entry_safe(folio, next, folio_list, lru) {
+ if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
+ !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
+ !folio_test_unevictable(folio)) {
+ folio_clear_active(folio);
+ list_move(&folio->lru, &clean_folios);
}
}
@@ -2015,11 +2070,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
* change in the future.
*/
noreclaim_flag = memalloc_noreclaim_save();
- nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
+ nr_reclaimed = shrink_page_list(&clean_folios, zone->zone_pgdat, &sc,
&stat, true);
memalloc_noreclaim_restore(noreclaim_flag);
- list_splice(&clean_pages, page_list);
+ list_splice(&clean_folios, folio_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
-(long)nr_reclaimed);
/*
@@ -2085,72 +2140,72 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
unsigned long skipped = 0;
unsigned long scan, total_scan, nr_pages;
- LIST_HEAD(pages_skipped);
+ LIST_HEAD(folios_skipped);
total_scan = 0;
scan = 0;
while (scan < nr_to_scan && !list_empty(src)) {
struct list_head *move_to = src;
- struct page *page;
+ struct folio *folio;
- page = lru_to_page(src);
- prefetchw_prev_lru_page(page, src, flags);
+ folio = lru_to_folio(src);
+ prefetchw_prev_lru_folio(folio, src, flags);
- nr_pages = compound_nr(page);
+ nr_pages = folio_nr_pages(folio);
total_scan += nr_pages;
- if (page_zonenum(page) > sc->reclaim_idx) {
- nr_skipped[page_zonenum(page)] += nr_pages;
- move_to = &pages_skipped;
+ if (folio_zonenum(folio) > sc->reclaim_idx) {
+ nr_skipped[folio_zonenum(folio)] += nr_pages;
+ move_to = &folios_skipped;
goto move;
}
/*
- * Do not count skipped pages because that makes the function
- * return with no isolated pages if the LRU mostly contains
- * ineligible pages. This causes the VM to not reclaim any
- * pages, triggering a premature OOM.
- * Account all tail pages of THP.
+ * Do not count skipped folios because that makes the function
+ * return with no isolated folios if the LRU mostly contains
+ * ineligible folios. This causes the VM to not reclaim any
+ * folios, triggering a premature OOM.
+ * Account all pages in a folio.
*/
scan += nr_pages;
- if (!PageLRU(page))
+ if (!folio_test_lru(folio))
goto move;
- if (!sc->may_unmap && page_mapped(page))
+ if (!sc->may_unmap && folio_mapped(folio))
goto move;
/*
- * Be careful not to clear PageLRU until after we're
- * sure the page is not being freed elsewhere -- the
- * page release code relies on it.
+ * Be careful not to clear the lru flag until after we're
+ * sure the folio is not being freed elsewhere -- the
+ * folio release code relies on it.
*/
- if (unlikely(!get_page_unless_zero(page)))
+ if (unlikely(!folio_try_get(folio)))
goto move;
- if (!TestClearPageLRU(page)) {
- /* Another thread is already isolating this page */
- put_page(page);
+ if (!folio_test_clear_lru(folio)) {
+ /* Another thread is already isolating this folio */
+ folio_put(folio);
goto move;
}
nr_taken += nr_pages;
- nr_zone_taken[page_zonenum(page)] += nr_pages;
+ nr_zone_taken[folio_zonenum(folio)] += nr_pages;
move_to = dst;
move:
- list_move(&page->lru, move_to);
+ list_move(&folio->lru, move_to);
}
/*
- * Splice any skipped pages to the start of the LRU list. Note that
+ * Splice any skipped folios to the start of the LRU list. Note that
* this disrupts the LRU order when reclaiming for lower zones but
* we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
- * scanning would soon rescan the same pages to skip and waste lots
+ * scanning would soon rescan the same folios to skip and waste lots
* of cpu cycles.
*/
- if (!list_empty(&pages_skipped)) {
+ if (!list_empty(&folios_skipped)) {
int zid;
- list_splice(&pages_skipped, src);
+ list_splice(&folios_skipped, src);
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
if (!nr_skipped[zid])
continue;
@@ -2202,7 +2257,7 @@ int folio_isolate_lru(struct folio *folio)
folio_get(folio);
lruvec = folio_lruvec_lock_irq(folio);
lruvec_del_folio(lruvec, folio);
- unlock_page_lruvec_irq(lruvec);
+ lruvec_unlock_irq(lruvec);
ret = 0;
}
@@ -2254,71 +2309,71 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
}
/*
- * move_pages_to_lru() moves pages from private @list to appropriate LRU list.
- * On return, @list is reused as a list of pages to be freed by the caller.
+ * move_pages_to_lru() moves folios from private @list to appropriate LRU list.
+ * On return, @list is reused as a list of folios to be freed by the caller.
*
- * Returns the number of pages moved to the given lruvec.
+ * Returns the number of pages moved to the appropriate LRU list.
+ *
+ * Note: The caller must not hold any lruvec lock.
*/
-static unsigned int move_pages_to_lru(struct lruvec *lruvec,
- struct list_head *list)
+static unsigned int move_pages_to_lru(struct list_head *list)
{
int nr_pages, nr_moved = 0;
- LIST_HEAD(pages_to_free);
- struct page *page;
+ struct lruvec *lruvec = NULL;
+ LIST_HEAD(folios_to_free);
while (!list_empty(list)) {
- page = lru_to_page(list);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- list_del(&page->lru);
- if (unlikely(!page_evictable(page))) {
- spin_unlock_irq(&lruvec->lru_lock);
- putback_lru_page(page);
- spin_lock_irq(&lruvec->lru_lock);
+ struct folio *folio = lru_to_folio(list);
+
+ lruvec = folio_lruvec_relock_irq(folio, lruvec);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ list_del(&folio->lru);
+ if (unlikely(!folio_evictable(folio))) {
+ lruvec_unlock_irq(lruvec);
+ folio_putback_lru(folio);
+ lruvec = NULL;
continue;
}
/*
- * The SetPageLRU needs to be kept here for list integrity.
+ * The folio_set_lru needs to be kept here for list integrity.
* Otherwise:
* #0 move_pages_to_lru #1 release_pages
- * if !put_page_testzero
- * if (put_page_testzero())
- * !PageLRU //skip lru_lock
- * SetPageLRU()
- * list_add(&page->lru,)
- * list_add(&page->lru,)
+ * if (!folio_put_testzero())
+ * if (folio_put_testzero())
+ * !lru //skip lru_lock
+ * folio_set_lru()
+ * list_add(&folio->lru,)
+ * list_add(&folio->lru,)
*/
- SetPageLRU(page);
+ folio_set_lru(folio);
- if (unlikely(put_page_testzero(page))) {
- __clear_page_lru_flags(page);
+ if (unlikely(folio_put_testzero(folio))) {
+ __folio_clear_lru_flags(folio);
- if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&lruvec->lru_lock);
- destroy_compound_page(page);
- spin_lock_irq(&lruvec->lru_lock);
+ if (unlikely(folio_test_large(folio))) {
+ lruvec_unlock_irq(lruvec);
+ destroy_large_folio(folio);
+ lruvec = NULL;
} else
- list_add(&page->lru, &pages_to_free);
+ list_add(&folio->lru, &folios_to_free);
continue;
}
- /*
- * All pages were isolated from the same lruvec (and isolation
- * inhibits memcg migration).
- */
- VM_BUG_ON_PAGE(!folio_matches_lruvec(page_folio(page), lruvec), page);
- add_page_to_lru_list(page, lruvec);
- nr_pages = thp_nr_pages(page);
+ lruvec_add_folio(lruvec, folio);
+ nr_pages = folio_nr_pages(folio);
nr_moved += nr_pages;
- if (PageActive(page))
+ if (folio_test_active(folio))
workingset_age_nonresident(lruvec, nr_pages);
}
+ if (lruvec)
+ lruvec_unlock_irq(lruvec);
/*
* To save our caller's stack, now use input list for pages to free.
*/
- list_splice(&pages_to_free, list);
+ list_splice(&folios_to_free, list);
return nr_moved;
}
@@ -2385,16 +2440,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);
- spin_lock_irq(&lruvec->lru_lock);
- move_pages_to_lru(lruvec, &page_list);
+ move_pages_to_lru(&page_list);
+ local_irq_disable();
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
- spin_unlock_irq(&lruvec->lru_lock);
+ local_irq_enable();
lru_note_cost(lruvec, file, stat.nr_pageout);
mem_cgroup_uncharge_list(&page_list);
@@ -2429,21 +2484,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
}
/*
- * shrink_active_list() moves pages from the active LRU to the inactive LRU.
+ * shrink_active_list() moves folios from the active LRU to the inactive LRU.
*
- * We move them the other way if the page is referenced by one or more
+ * We move them the other way if the folio is referenced by one or more
* processes.
*
- * If the pages are mostly unmapped, the processing is fast and it is
+ * If the folios are mostly unmapped, the processing is fast and it is
* appropriate to hold lru_lock across the whole operation. But if
- * the pages are mapped, the processing is slow (folio_referenced()), so
- * we should drop lru_lock around each page. It's impossible to balance
- * this, so instead we remove the pages from the LRU while processing them.
- * It is safe to rely on PG_active against the non-LRU pages in here because
- * nobody will play with that bit on a non-LRU page.
+ * the folios are mapped, the processing is slow (folio_referenced()), so
+ * we should drop lru_lock around each folio. It's impossible to balance
+ * this, so instead we remove the folios from the LRU while processing them.
+ * It is safe to rely on the active flag against the non-LRU folios in here
+ * because nobody will play with that bit on a non-LRU folio.
*
- * The downside is that we have to touch page->_refcount against each page.
- * But we had to alter page->flags anyway.
+ * The downside is that we have to touch folio->_refcount against each folio.
+ * But we had to alter folio->flags anyway.
*/
static void shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
@@ -2453,7 +2508,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
unsigned long nr_taken;
unsigned long nr_scanned;
unsigned long vm_flags;
- LIST_HEAD(l_hold); /* The pages which were snipped off */
+ LIST_HEAD(l_hold); /* The folios which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
unsigned nr_deactivate, nr_activate;
@@ -2478,23 +2533,21 @@ static void shrink_active_list(unsigned long nr_to_scan,
while (!list_empty(&l_hold)) {
struct folio *folio;
- struct page *page;
cond_resched();
folio = lru_to_folio(&l_hold);
list_del(&folio->lru);
- page = &folio->page;
- if (unlikely(!page_evictable(page))) {
- putback_lru_page(page);
+ if (unlikely(!folio_evictable(folio))) {
+ folio_putback_lru(folio);
continue;
}
if (unlikely(buffer_heads_over_limit)) {
- if (page_has_private(page) && trylock_page(page)) {
- if (page_has_private(page))
- try_to_release_page(page, 0);
- unlock_page(page);
+ if (folio_get_private(folio) && folio_trylock(folio)) {
+ if (folio_get_private(folio))
+ filemap_release_folio(folio, 0);
+ folio_unlock(folio);
}
}
@@ -2502,41 +2555,39 @@ static void shrink_active_list(unsigned long nr_to_scan,
if (folio_referenced(folio, 0, sc->target_mem_cgroup,
&vm_flags) != 0) {
/*
- * Identify referenced, file-backed active pages and
+ * Identify referenced, file-backed active folios and
* give them one more trip around the active list. So
* that executable code get better chances to stay in
- * memory under moderate memory pressure. Anon pages
+ * memory under moderate memory pressure. Anon folios
* are not likely to be evicted by use-once streaming
- * IO, plus JVM can create lots of anon VM_EXEC pages,
+ * IO, plus JVM can create lots of anon VM_EXEC folios,
* so we ignore them here.
*/
- if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
- nr_rotated += thp_nr_pages(page);
- list_add(&page->lru, &l_active);
+ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) {
+ nr_rotated += folio_nr_pages(folio);
+ list_add(&folio->lru, &l_active);
continue;
}
}
- ClearPageActive(page); /* we are de-activating */
- SetPageWorkingset(page);
- list_add(&page->lru, &l_inactive);
+ folio_clear_active(folio); /* we are de-activating */
+ folio_set_workingset(folio);
+ list_add(&folio->lru, &l_inactive);
}
/*
- * Move pages back to the lru list.
+ * Move folios back to the lru list.
*/
- spin_lock_irq(&lruvec->lru_lock);
-
- nr_activate = move_pages_to_lru(lruvec, &l_active);
- nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
- /* Keep all free pages in l_active list */
+ nr_activate = move_pages_to_lru(&l_active);
+ nr_deactivate = move_pages_to_lru(&l_inactive);
+ /* Keep all free folios in l_active list */
list_splice(&l_inactive, &l_active);
+ local_irq_disable();
__count_vm_events(PGDEACTIVATE, nr_deactivate);
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
-
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&lruvec->lru_lock);
+ local_irq_enable();
mem_cgroup_uncharge_list(&l_active);
free_unref_page_list(&l_active);
@@ -2568,34 +2619,33 @@ static unsigned int reclaim_page_list(struct list_head *page_list,
return nr_reclaimed;
}
-unsigned long reclaim_pages(struct list_head *page_list)
+unsigned long reclaim_pages(struct list_head *folio_list)
{
int nid;
unsigned int nr_reclaimed = 0;
- LIST_HEAD(node_page_list);
- struct page *page;
+ LIST_HEAD(node_folio_list);
unsigned int noreclaim_flag;
- if (list_empty(page_list))
+ if (list_empty(folio_list))
return nr_reclaimed;
noreclaim_flag = memalloc_noreclaim_save();
- nid = page_to_nid(lru_to_page(page_list));
+ nid = folio_nid(lru_to_folio(folio_list));
do {
- page = lru_to_page(page_list);
+ struct folio *folio = lru_to_folio(folio_list);
- if (nid == page_to_nid(page)) {
- ClearPageActive(page);
- list_move(&page->lru, &node_page_list);
+ if (nid == folio_nid(folio)) {
+ folio_clear_active(folio);
+ list_move(&folio->lru, &node_folio_list);
continue;
}
- nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid));
- nid = page_to_nid(lru_to_page(page_list));
- } while (!list_empty(page_list));
+ nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid));
+ nid = folio_nid(lru_to_folio(folio_list));
+ } while (!list_empty(folio_list));
- nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid));
+ nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid));
memalloc_noreclaim_restore(noreclaim_flag);
@@ -4595,7 +4645,7 @@ void kswapd_run(int nid)
/*
* Called by memory hotplug when all memory in a node is offlined. Caller must
- * hold mem_hotplug_begin/end().
+ * be holding mem_hotplug_begin/done().
*/
void kswapd_stop(int nid)
{
@@ -4846,7 +4896,7 @@ void check_move_unevictable_folios(struct folio_batch *fbatch)
if (lruvec) {
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
- unlock_page_lruvec_irq(lruvec);
+ lruvec_unlock_irq(lruvec);
} else if (pgscanned) {
count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 373d2730fcf2..da7e389cf33c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1382,10 +1382,6 @@ const char * const vmstat_text[] = {
"nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */
-#ifdef CONFIG_DEBUG_VM_VMACACHE
- "vmacache_find_calls",
- "vmacache_find_hits",
-#endif
#ifdef CONFIG_SWAP
"swap_ra",
"swap_ra_hit",
diff --git a/mm/workingset.c b/mm/workingset.c
index 592569a8974c..a5e84862fc86 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -625,7 +625,7 @@ static int __init workingset_init(void)
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
timestamp_bits, max_order, bucket_order);
- ret = prealloc_shrinker(&workingset_shadow_shrinker);
+ ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow");
if (ret)
goto err;
ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 71d6edcbea48..a7bad79cbbed 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2169,7 +2169,8 @@ static int zs_register_shrinker(struct zs_pool *pool)
pool->shrinker.batch = 0;
pool->shrinker.seeks = DEFAULT_SEEKS;
- return register_shrinker(&pool->shrinker);
+ return register_shrinker(&pool->shrinker, "mm-zspool:%s",
+ pool->name);
}
/**