summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2021-04-13 18:58:33 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2021-04-13 18:58:33 +1000
commitc03156f8d8b7420c9c53d5d37b5c1696a931cb6e (patch)
treeebfc85df4d9673aa974668b51aa2638a41f116da /include
parentf195cda9b95d03d27906f98a61456d47ced1c35b (diff)
parent3ad29d0ccd4c42afa271bc8529fb357adff944ec (diff)
downloadlinux-next-c03156f8d8b7420c9c53d5d37b5c1696a931cb6e.tar.gz
Merge branch 'akpm-current/current'
Diffstat (limited to 'include')
-rw-r--r--include/asm-generic/bitops/find.h108
-rw-r--r--include/asm-generic/bitops/le.h38
-rw-r--r--include/asm-generic/bitsperlong.h12
-rw-r--r--include/linux/align.h15
-rw-r--r--include/linux/async.h1
-rw-r--r--include/linux/bitmap.h11
-rw-r--r--include/linux/bitops.h12
-rw-r--r--include/linux/blkdev.h1
-rw-r--r--include/linux/buffer_head.h4
-rw-r--r--include/linux/cma.h4
-rw-r--r--include/linux/compaction.h1
-rw-r--r--include/linux/compat.h1
-rw-r--r--include/linux/compiler-gcc.h8
-rw-r--r--include/linux/crc8.h2
-rw-r--r--include/linux/cred.h1
-rw-r--r--include/linux/fs.h4
-rw-r--r--include/linux/gfp.h35
-rw-r--r--include/linux/highmem.h35
-rw-r--r--include/linux/huge_mm.h3
-rw-r--r--include/linux/hugetlb.h26
-rw-r--r--include/linux/init_task.h1
-rw-r--r--include/linux/initrd.h2
-rw-r--r--include/linux/io-mapping.h3
-rw-r--r--include/linux/io.h9
-rw-r--r--include/linux/kasan.h53
-rw-r--r--include/linux/kernel.h9
-rw-r--r--include/linux/memcontrol.h294
-rw-r--r--include/linux/memory.h8
-rw-r--r--include/linux/memory_hotplug.h21
-rw-r--r--include/linux/memremap.h8
-rw-r--r--include/linux/migrate.h7
-rw-r--r--include/linux/mm.h76
-rw-r--r--include/linux/mmzone.h18
-rw-r--r--include/linux/page-flags-layout.h62
-rw-r--r--include/linux/pagemap.h15
-rw-r--r--include/linux/pagewalk.h4
-rw-r--r--include/linux/pgtable.h12
-rw-r--r--include/linux/proc_fs.h1
-rw-r--r--include/linux/profile.h3
-rw-r--r--include/linux/sched.h14
-rw-r--r--include/linux/sched/mm.h27
-rw-r--r--include/linux/shrinker.h7
-rw-r--r--include/linux/slab.h2
-rw-r--r--include/linux/slub_def.h2
-rw-r--r--include/linux/smp.h8
-rw-r--r--include/linux/swap.h22
-rw-r--r--include/linux/userfaultfd_k.h49
-rw-r--r--include/linux/vm_event_item.h8
-rw-r--r--include/linux/vmalloc.h65
-rw-r--r--include/linux/vmstat.h24
-rw-r--r--include/linux/wait.h4
-rw-r--r--include/net/page_pool.h2
-rw-r--r--include/trace/events/cma.h68
-rw-r--r--include/trace/events/kmem.h24
-rw-r--r--include/trace/events/migrate.h25
-rw-r--r--include/trace/events/mmflags.h7
-rw-r--r--include/uapi/linux/mempolicy.h7
-rw-r--r--include/uapi/linux/userfaultfd.h36
58 files changed, 928 insertions, 401 deletions
diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h
index 9fdf21302fdf..0d132ee2a291 100644
--- a/include/asm-generic/bitops/find.h
+++ b/include/asm-generic/bitops/find.h
@@ -2,6 +2,13 @@
#ifndef _ASM_GENERIC_BITOPS_FIND_H_
#define _ASM_GENERIC_BITOPS_FIND_H_
+extern unsigned long _find_next_bit(const unsigned long *addr1,
+ const unsigned long *addr2, unsigned long nbits,
+ unsigned long start, unsigned long invert, unsigned long le);
+extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);
+
#ifndef find_next_bit
/**
* find_next_bit - find the next set bit in a memory region
@@ -12,8 +19,22 @@
* Returns the bit number for the next set bit
* If no bits are set, returns @size.
*/
-extern unsigned long find_next_bit(const unsigned long *addr, unsigned long
- size, unsigned long offset);
+static inline
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ if (small_const_nbits(size)) {
+ unsigned long val;
+
+ if (unlikely(offset >= size))
+ return size;
+
+ val = *addr & GENMASK(size - 1, offset);
+ return val ? __ffs(val) : size;
+ }
+
+ return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
+}
#endif
#ifndef find_next_and_bit
@@ -27,9 +48,23 @@ extern unsigned long find_next_bit(const unsigned long *addr, unsigned long
* Returns the bit number for the next set bit
* If no bits are set, returns @size.
*/
-extern unsigned long find_next_and_bit(const unsigned long *addr1,
+static inline
+unsigned long find_next_and_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long size,
- unsigned long offset);
+ unsigned long offset)
+{
+ if (small_const_nbits(size)) {
+ unsigned long val;
+
+ if (unlikely(offset >= size))
+ return size;
+
+ val = *addr1 & *addr2 & GENMASK(size - 1, offset);
+ return val ? __ffs(val) : size;
+ }
+
+ return _find_next_bit(addr1, addr2, size, offset, 0UL, 0);
+}
#endif
#ifndef find_next_zero_bit
@@ -42,8 +77,22 @@ extern unsigned long find_next_and_bit(const unsigned long *addr1,
* Returns the bit number of the next zero bit
* If no bits are zero, returns @size.
*/
-extern unsigned long find_next_zero_bit(const unsigned long *addr, unsigned
- long size, unsigned long offset);
+static inline
+unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ if (small_const_nbits(size)) {
+ unsigned long val;
+
+ if (unlikely(offset >= size))
+ return size;
+
+ val = *addr | ~GENMASK(size - 1, offset);
+ return val == ~0UL ? size : ffz(val);
+ }
+
+ return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
+}
#endif
#ifdef CONFIG_GENERIC_FIND_FIRST_BIT
@@ -56,8 +105,17 @@ extern unsigned long find_next_zero_bit(const unsigned long *addr, unsigned
* Returns the bit number of the first set bit.
* If no bits are set, returns @size.
*/
-extern unsigned long find_first_bit(const unsigned long *addr,
- unsigned long size);
+static inline
+unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
+{
+ if (small_const_nbits(size)) {
+ unsigned long val = *addr & GENMASK(size - 1, 0);
+
+ return val ? __ffs(val) : size;
+ }
+
+ return _find_first_bit(addr, size);
+}
/**
* find_first_zero_bit - find the first cleared bit in a memory region
@@ -67,8 +125,17 @@ extern unsigned long find_first_bit(const unsigned long *addr,
* Returns the bit number of the first cleared bit.
* If no bits are zero, returns @size.
*/
-extern unsigned long find_first_zero_bit(const unsigned long *addr,
- unsigned long size);
+static inline
+unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
+{
+ if (small_const_nbits(size)) {
+ unsigned long val = *addr | ~GENMASK(size - 1, 0);
+
+ return val == ~0UL ? size : ffz(val);
+ }
+
+ return _find_first_zero_bit(addr, size);
+}
#else /* CONFIG_GENERIC_FIND_FIRST_BIT */
#ifndef find_first_bit
@@ -80,6 +147,27 @@ extern unsigned long find_first_zero_bit(const unsigned long *addr,
#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */
+#ifndef find_last_bit
+/**
+ * find_last_bit - find the last set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The number of bits to search
+ *
+ * Returns the bit number of the last set bit, or size.
+ */
+static inline
+unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
+{
+ if (small_const_nbits(size)) {
+ unsigned long val = *addr & GENMASK(size - 1, 0);
+
+ return val ? __fls(val) : size;
+ }
+
+ return _find_last_bit(addr, size);
+}
+#endif
+
/**
* find_next_clump8 - find next 8-bit clump with set bits in a memory region
* @clump: location to store copy of found clump
diff --git a/include/asm-generic/bitops/le.h b/include/asm-generic/bitops/le.h
index 188d3eba3ace..5a28629cbf4d 100644
--- a/include/asm-generic/bitops/le.h
+++ b/include/asm-generic/bitops/le.h
@@ -2,8 +2,10 @@
#ifndef _ASM_GENERIC_BITOPS_LE_H_
#define _ASM_GENERIC_BITOPS_LE_H_
+#include <asm-generic/bitops/find.h>
#include <asm/types.h>
#include <asm/byteorder.h>
+#include <linux/swab.h>
#if defined(__LITTLE_ENDIAN)
@@ -32,13 +34,41 @@ static inline unsigned long find_first_zero_bit_le(const void *addr,
#define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7)
#ifndef find_next_zero_bit_le
-extern unsigned long find_next_zero_bit_le(const void *addr,
- unsigned long size, unsigned long offset);
+static inline
+unsigned long find_next_zero_bit_le(const void *addr, unsigned
+ long size, unsigned long offset)
+{
+ if (small_const_nbits(size)) {
+ unsigned long val = *(const unsigned long *)addr;
+
+ if (unlikely(offset >= size))
+ return size;
+
+ val = swab(val) | ~GENMASK(size - 1, offset);
+ return val == ~0UL ? size : ffz(val);
+ }
+
+ return _find_next_bit(addr, NULL, size, offset, ~0UL, 1);
+}
#endif
#ifndef find_next_bit_le
-extern unsigned long find_next_bit_le(const void *addr,
- unsigned long size, unsigned long offset);
+static inline
+unsigned long find_next_bit_le(const void *addr, unsigned
+ long size, unsigned long offset)
+{
+ if (small_const_nbits(size)) {
+ unsigned long val = *(const unsigned long *)addr;
+
+ if (unlikely(offset >= size))
+ return size;
+
+ val = swab(val) & GENMASK(size - 1, offset);
+ return val ? __ffs(val) : size;
+ }
+
+ return _find_next_bit(addr, NULL, size, offset, 0UL, 1);
+}
#endif
#ifndef find_first_zero_bit_le
diff --git a/include/asm-generic/bitsperlong.h b/include/asm-generic/bitsperlong.h
index 3905c1c93dc2..1023e2a4bd37 100644
--- a/include/asm-generic/bitsperlong.h
+++ b/include/asm-generic/bitsperlong.h
@@ -23,4 +23,16 @@
#define BITS_PER_LONG_LONG 64
#endif
+/*
+ * small_const_nbits(n) is true precisely when it is known at compile-time
+ * that BITMAP_SIZE(n) is 1, i.e. 1 <= n <= BITS_PER_LONG. This allows
+ * various bit/bitmap APIs to provide a fast inline implementation. Bitmaps
+ * of size 0 are very rare, and a compile-time-known-size 0 is most likely
+ * a sign of error. They will be handled correctly by the bit/bitmap APIs,
+ * but using the out-of-line functions, so that the inline implementations
+ * can unconditionally dereference the pointer(s).
+ */
+#define small_const_nbits(nbits) \
+ (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)
+
#endif /* __ASM_GENERIC_BITS_PER_LONG */
diff --git a/include/linux/align.h b/include/linux/align.h
new file mode 100644
index 000000000000..2b4acec7b95a
--- /dev/null
+++ b/include/linux/align.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_ALIGN_H
+#define _LINUX_ALIGN_H
+
+#include <linux/const.h>
+
+/* @a is a power of 2 value */
+#define ALIGN(x, a) __ALIGN_KERNEL((x), (a))
+#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a))
+#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
+#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a)))
+#define PTR_ALIGN_DOWN(p, a) ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a)))
+#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0)
+
+#endif /* _LINUX_ALIGN_H */
diff --git a/include/linux/async.h b/include/linux/async.h
index 0a17cd27f348..cce4ad31e8fc 100644
--- a/include/linux/async.h
+++ b/include/linux/async.h
@@ -112,7 +112,6 @@ async_schedule_dev_domain(async_func_t func, struct device *dev,
return async_schedule_node_domain(func, dev, dev_to_node(dev), domain);
}
-void async_unregister_domain(struct async_domain *domain);
extern void async_synchronize_full(void);
extern void async_synchronize_full_domain(struct async_domain *domain);
extern void async_synchronize_cookie(async_cookie_t cookie);
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 73d039476fa4..a36cfcec4e77 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -4,8 +4,9 @@
#ifndef __ASSEMBLY__
+#include <linux/align.h>
#include <linux/bitops.h>
-#include <linux/kernel.h>
+#include <linux/limits.h>
#include <linux/string.h>
#include <linux/types.h>
@@ -229,14 +230,6 @@ int bitmap_print_to_pagebuf(bool list, char *buf,
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
-/*
- * The static inlines below do not handle constant nbits==0 correctly,
- * so make such users (should any ever turn up) call the out-of-line
- * versions.
- */
-#define small_const_nbits(nbits) \
- (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)
-
static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
{
unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index a5a48303b0f1..26bf15e6cd35 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -286,17 +286,5 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr,
})
#endif
-#ifndef find_last_bit
-/**
- * find_last_bit - find the last set bit in a memory region
- * @addr: The address to start the search at
- * @size: The number of bits to search
- *
- * Returns the bit number of the last set bit, or size.
- */
-extern unsigned long find_last_bit(const unsigned long *addr,
- unsigned long size);
-#endif
-
#endif /* __KERNEL__ */
#endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9a57cae9672e..d99011c4548f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -11,7 +11,6 @@
#include <linux/minmax.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
-#include <linux/pagemap.h>
#include <linux/backing-dev-defs.h>
#include <linux/wait.h>
#include <linux/mempool.h>
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 6b47f94378c5..e7e99da31349 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -194,6 +194,8 @@ void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size,
struct buffer_head *__bread_gfp(struct block_device *,
sector_t block, unsigned size, gfp_t gfp);
void invalidate_bh_lrus(void);
+void invalidate_bh_lrus_cpu(int cpu);
+bool has_bh_in_lru(int cpu, void *dummy);
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
void free_buffer_head(struct buffer_head * bh);
void unlock_buffer(struct buffer_head *bh);
@@ -406,6 +408,8 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; }
static inline void invalidate_inode_buffers(struct inode *inode) {}
static inline int remove_inode_buffers(struct inode *inode) { return 1; }
static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
+static inline void invalidate_bh_lrus_cpu(int cpu) {}
+static inline bool has_bh_in_lru(int cpu, void *dummy) { return 0; }
#define buffer_heads_over_limit 0
#endif /* CONFIG_BLOCK */
diff --git a/include/linux/cma.h b/include/linux/cma.h
index 217999c8a762..53fd8c3cdbd0 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -44,9 +44,9 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
unsigned int order_per_bit,
const char *name,
struct cma **res_cma);
-extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
+extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align,
bool no_warn);
-extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count);
+extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count);
extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
#endif
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index ed4070ed41ef..4221888bdcd6 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -81,7 +81,6 @@ static inline unsigned long compact_gap(unsigned int order)
}
#ifdef CONFIG_COMPACTION
-extern int sysctl_compact_memory;
extern unsigned int sysctl_compaction_proactiveness;
extern int sysctl_compaction_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos);
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 5112c3e35782..197e0a180406 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -75,7 +75,6 @@
__diag_push(); \
__diag_ignore(GCC, 8, "-Wattribute-alias", \
"Type aliasing is used to sanitize syscall arguments");\
- asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
__attribute__((alias(__stringify(__se_compat_sys##name)))); \
ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO); \
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 48750243db4c..5d97ef738a57 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -90,15 +90,11 @@
*/
#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0)
-/*
- * sparse (__CHECKER__) pretends to be gcc, but can't do constant
- * folding in __builtin_bswap*() (yet), so don't set these for it.
- */
-#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) && !defined(__CHECKER__)
+#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP)
#define __HAVE_BUILTIN_BSWAP32__
#define __HAVE_BUILTIN_BSWAP64__
#define __HAVE_BUILTIN_BSWAP16__
-#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP && !__CHECKER__ */
+#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
#if GCC_VERSION >= 70000
#define KASAN_ABI_VERSION 5
diff --git a/include/linux/crc8.h b/include/linux/crc8.h
index 13c8dabb0441..674045c59a04 100644
--- a/include/linux/crc8.h
+++ b/include/linux/crc8.h
@@ -96,6 +96,6 @@ void crc8_populate_msb(u8 table[CRC8_TABLE_SIZE], u8 polynomial);
* Williams, Ross N., ross<at>ross.net
* (see URL http://www.ross.net/crc/download/crc_v3.txt).
*/
-u8 crc8(const u8 table[CRC8_TABLE_SIZE], u8 *pdata, size_t nbytes, u8 crc);
+u8 crc8(const u8 table[CRC8_TABLE_SIZE], const u8 *pdata, size_t nbytes, u8 crc);
#endif /* __CRC8_H_ */
diff --git a/include/linux/cred.h b/include/linux/cred.h
index ac0e5f97d7d8..14971322e1a0 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -53,7 +53,6 @@ do { \
groups_free(group_info); \
} while (0)
-extern struct group_info init_groups;
#ifdef CONFIG_MULTIUSER
extern struct group_info *groups_alloc(int);
extern void groups_free(struct group_info *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bf4e90d3ab18..acef282b97c6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -442,7 +442,6 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
* @i_mmap: Tree of private and shared mappings.
* @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
* @nrpages: Number of page entries, protected by the i_pages lock.
- * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
* @writeback_index: Writeback starts here.
* @a_ops: Methods.
* @flags: Error bits and flags (AS_*).
@@ -463,7 +462,6 @@ struct address_space {
struct rb_root_cached i_mmap;
struct rw_semaphore i_mmap_rwsem;
unsigned long nrpages;
- unsigned long nrexceptional;
pgoff_t writeback_index;
const struct address_space_operations *a_ops;
unsigned long flags;
@@ -2878,6 +2876,8 @@ static inline int filemap_fdatawait(struct address_space *mapping)
extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
loff_t lend);
+extern bool filemap_range_needs_writeback(struct address_space *,
+ loff_t lstart, loff_t lend);
extern int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
extern int __filemap_fdatawrite_range(struct address_space *mapping,
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 8572a1474e16..8a5f6c3d7dba 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -515,14 +515,25 @@ static inline int arch_make_page_accessible(struct page *page)
}
#endif
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
- nodemask_t *nodemask);
+struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
+ nodemask_t *nodemask);
-static inline struct page *
-__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid)
+unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+ nodemask_t *nodemask, int nr_pages,
+ struct list_head *page_list,
+ struct page **page_array);
+
+/* Bulk allocate order-0 pages */
+static inline unsigned long
+alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
+{
+ return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL);
+}
+
+static inline unsigned long
+alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array)
{
- return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);
+ return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array);
}
/*
@@ -535,7 +546,7 @@ __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid));
- return __alloc_pages(gfp_mask, order, nid);
+ return __alloc_pages(gfp_mask, order, nid, NULL);
}
/*
@@ -553,13 +564,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
}
#ifdef CONFIG_NUMA
-extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order);
-
-static inline struct page *
-alloc_pages(gfp_t gfp_mask, unsigned int order)
-{
- return alloc_pages_current(gfp_mask, order);
-}
+struct page *alloc_pages(gfp_t gfp, unsigned int order);
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
struct vm_area_struct *vma, unsigned long addr,
int node, bool hugepage);
@@ -652,7 +657,7 @@ extern int alloc_contig_range(unsigned long start, unsigned long end,
extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
int nid, nodemask_t *nodemask);
#endif
-void free_contig_range(unsigned long pfn, unsigned int nr_pages);
+void free_contig_range(unsigned long pfn, unsigned long nr_pages);
#ifdef CONFIG_CMA
/* CMA stuff */
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 44170f312ae7..b06eeeaee975 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -146,9 +146,9 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
#ifndef clear_user_highpage
static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
{
- void *addr = kmap_atomic(page);
+ void *addr = kmap_local_page(page);
clear_user_page(addr, vaddr, page);
- kunmap_atomic(addr);
+ kunmap_local(addr);
}
#endif
@@ -199,9 +199,9 @@ alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
static inline void clear_highpage(struct page *page)
{
- void *kaddr = kmap_atomic(page);
+ void *kaddr = kmap_local_page(page);
clear_page(kaddr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
}
/*
@@ -216,7 +216,7 @@ static inline void zero_user_segments(struct page *page,
unsigned start1, unsigned end1,
unsigned start2, unsigned end2)
{
- void *kaddr = kmap_atomic(page);
+ void *kaddr = kmap_local_page(page);
unsigned int i;
BUG_ON(end1 > page_size(page) || end2 > page_size(page));
@@ -227,7 +227,7 @@ static inline void zero_user_segments(struct page *page,
if (end2 > start2)
memset(kaddr + start2, 0, end2 - start2);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
for (i = 0; i < compound_nr(page); i++)
flush_dcache_page(page + i);
}
@@ -252,11 +252,11 @@ static inline void copy_user_highpage(struct page *to, struct page *from,
{
char *vfrom, *vto;
- vfrom = kmap_atomic(from);
- vto = kmap_atomic(to);
+ vfrom = kmap_local_page(from);
+ vto = kmap_local_page(to);
copy_user_page(vto, vfrom, vaddr, to);
- kunmap_atomic(vto);
- kunmap_atomic(vfrom);
+ kunmap_local(vto);
+ kunmap_local(vfrom);
}
#endif
@@ -267,11 +267,11 @@ static inline void copy_highpage(struct page *to, struct page *from)
{
char *vfrom, *vto;
- vfrom = kmap_atomic(from);
- vto = kmap_atomic(to);
+ vfrom = kmap_local_page(from);
+ vto = kmap_local_page(to);
copy_page(vto, vfrom);
- kunmap_atomic(vto);
- kunmap_atomic(vfrom);
+ kunmap_local(vto);
+ kunmap_local(vfrom);
}
#endif
@@ -332,4 +332,11 @@ static inline void memcpy_to_page(struct page *page, size_t offset,
kunmap_local(to);
}
+static inline void memzero_page(struct page *page, size_t offset, size_t len)
+{
+ char *addr = kmap_atomic(page);
+ memset(addr + offset, 0, len);
+ kunmap_atomic(addr);
+}
+
#endif /* _LINUX_HIGHMEM_H */
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ba973efcd369..9626fda5efce 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -87,9 +87,6 @@ enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
-#ifdef CONFIG_DEBUG_VM
- TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
-#endif
};
struct kobject;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index cccd1aab69dd..09f1fd12a6fa 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -11,6 +11,7 @@
#include <linux/kref.h>
#include <linux/pgtable.h>
#include <linux/gfp.h>
+#include <linux/userfaultfd_k.h>
struct ctl_table;
struct user_struct;
@@ -134,11 +135,14 @@ void hugetlb_show_meminfo(void);
unsigned long hugetlb_total_pages(void);
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags);
+#ifdef CONFIG_USERFAULTFD
int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
+ enum mcopy_atomic_mode mode,
struct page **pagep);
+#endif /* CONFIG_USERFAULTFD */
bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
struct vm_area_struct *vma,
vm_flags_t vm_flags);
@@ -152,7 +156,8 @@ void hugetlb_fix_reserve_counts(struct inode *inode);
extern struct mutex *hugetlb_fault_mutex_table;
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pud);
struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);
@@ -161,7 +166,7 @@ extern struct list_head huge_boot_pages;
/* arch callbacks */
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz);
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz);
@@ -187,6 +192,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot);
bool is_hugetlb_entry_migration(pte_t pte);
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
#else /* !CONFIG_HUGETLB_PAGE */
@@ -308,16 +314,19 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
BUG();
}
+#ifdef CONFIG_USERFAULTFD
static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
pte_t *dst_pte,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
+ enum mcopy_atomic_mode mode,
struct page **pagep)
{
BUG();
return 0;
}
+#endif /* CONFIG_USERFAULTFD */
static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
unsigned long sz)
@@ -368,6 +377,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
return 0;
}
+static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
+
#endif /* !CONFIG_HUGETLB_PAGE */
/*
* hugepages at page global directory. If arch support
@@ -555,6 +566,7 @@ HPAGEFLAG(Freed, freed)
#define HSTATE_NAME_LEN 32
/* Defines one hugetlb page size */
struct hstate {
+ struct mutex resize_lock;
int next_nid_to_alloc;
int next_nid_to_free;
unsigned int order;
@@ -1039,4 +1051,14 @@ static inline __init void hugetlb_cma_check(void)
}
#endif
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
+
+#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
+/*
+ * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
+ * implement this.
+ */
+#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
+#endif
+
#endif /* _LINUX_HUGETLB_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index b2412b4d4c20..40fc5813cf93 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -25,7 +25,6 @@
extern struct files_struct init_files;
extern struct fs_struct init_fs;
extern struct nsproxy init_nsproxy;
-extern struct group_info init_groups;
extern struct cred init_cred;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/include/linux/initrd.h b/include/linux/initrd.h
index 85c15717af34..1bbe9af48dc3 100644
--- a/include/linux/initrd.h
+++ b/include/linux/initrd.h
@@ -20,8 +20,10 @@ extern void free_initrd_mem(unsigned long, unsigned long);
#ifdef CONFIG_BLK_DEV_INITRD
extern void __init reserve_initrd_mem(void);
+extern void wait_for_initramfs(void);
#else
static inline void __init reserve_initrd_mem(void) {}
+static inline void wait_for_initramfs(void) {}
#endif
extern phys_addr_t phys_initrd_start;
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index c093e81310a9..e9743cfd8585 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -220,3 +220,6 @@ io_mapping_free(struct io_mapping *iomap)
}
#endif /* _LINUX_IO_MAPPING_H */
+
+int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pfn, unsigned long size);
diff --git a/include/linux/io.h b/include/linux/io.h
index 61ff7d6278b6..9595151d800d 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -31,15 +31,6 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end,
}
#endif
-#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-void __init ioremap_huge_init(void);
-int arch_ioremap_p4d_supported(void);
-int arch_ioremap_pud_supported(void);
-int arch_ioremap_pmd_supported(void);
-#else
-static inline void ioremap_huge_init(void) { }
-#endif
-
/*
* Managed iomap interface
*/
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index e7c20c79b342..b1678a61e6a7 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -30,7 +30,8 @@ struct kunit_kasan_expectation {
/* Software KASAN implementations use shadow memory. */
#ifdef CONFIG_KASAN_SW_TAGS
-#define KASAN_SHADOW_INIT 0xFF
+/* This matches KASAN_TAG_INVALID. */
+#define KASAN_SHADOW_INIT 0xFE
#else
#define KASAN_SHADOW_INIT 0
#endif
@@ -95,6 +96,11 @@ static __always_inline bool kasan_enabled(void)
return static_branch_likely(&kasan_flag_enabled);
}
+static inline bool kasan_has_integrated_init(void)
+{
+ return kasan_enabled();
+}
+
#else /* CONFIG_KASAN_HW_TAGS */
static inline bool kasan_enabled(void)
@@ -102,6 +108,11 @@ static inline bool kasan_enabled(void)
return true;
}
+static inline bool kasan_has_integrated_init(void)
+{
+ return false;
+}
+
#endif /* CONFIG_KASAN_HW_TAGS */
slab_flags_t __kasan_never_merge(void);
@@ -119,20 +130,20 @@ static __always_inline void kasan_unpoison_range(const void *addr, size_t size)
__kasan_unpoison_range(addr, size);
}
-void __kasan_alloc_pages(struct page *page, unsigned int order);
+void __kasan_alloc_pages(struct page *page, unsigned int order, bool init);
static __always_inline void kasan_alloc_pages(struct page *page,
- unsigned int order)
+ unsigned int order, bool init)
{
if (kasan_enabled())
- __kasan_alloc_pages(page, order);
+ __kasan_alloc_pages(page, order, init);
}
-void __kasan_free_pages(struct page *page, unsigned int order);
+void __kasan_free_pages(struct page *page, unsigned int order, bool init);
static __always_inline void kasan_free_pages(struct page *page,
- unsigned int order)
+ unsigned int order, bool init)
{
if (kasan_enabled())
- __kasan_free_pages(page, order);
+ __kasan_free_pages(page, order, init);
}
void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
@@ -192,11 +203,13 @@ static __always_inline void * __must_check kasan_init_slab_obj(
return (void *)object;
}
-bool __kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip);
-static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object)
+bool __kasan_slab_free(struct kmem_cache *s, void *object,
+ unsigned long ip, bool init);
+static __always_inline bool kasan_slab_free(struct kmem_cache *s,
+ void *object, bool init)
{
if (kasan_enabled())
- return __kasan_slab_free(s, object, _RET_IP_);
+ return __kasan_slab_free(s, object, _RET_IP_, init);
return false;
}
@@ -215,12 +228,12 @@ static __always_inline void kasan_slab_free_mempool(void *ptr)
}
void * __must_check __kasan_slab_alloc(struct kmem_cache *s,
- void *object, gfp_t flags);
+ void *object, gfp_t flags, bool init);
static __always_inline void * __must_check kasan_slab_alloc(
- struct kmem_cache *s, void *object, gfp_t flags)
+ struct kmem_cache *s, void *object, gfp_t flags, bool init)
{
if (kasan_enabled())
- return __kasan_slab_alloc(s, object, flags);
+ return __kasan_slab_alloc(s, object, flags, init);
return object;
}
@@ -276,13 +289,17 @@ static inline bool kasan_enabled(void)
{
return false;
}
+static inline bool kasan_has_integrated_init(void)
+{
+ return false;
+}
static inline slab_flags_t kasan_never_merge(void)
{
return 0;
}
static inline void kasan_unpoison_range(const void *address, size_t size) {}
-static inline void kasan_alloc_pages(struct page *page, unsigned int order) {}
-static inline void kasan_free_pages(struct page *page, unsigned int order) {}
+static inline void kasan_alloc_pages(struct page *page, unsigned int order, bool init) {}
+static inline void kasan_free_pages(struct page *page, unsigned int order, bool init) {}
static inline void kasan_cache_create(struct kmem_cache *cache,
unsigned int *size,
slab_flags_t *flags) {}
@@ -298,14 +315,14 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache,
{
return (void *)object;
}
-static inline bool kasan_slab_free(struct kmem_cache *s, void *object)
+static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init)
{
return false;
}
static inline void kasan_kfree_large(void *ptr) {}
static inline void kasan_slab_free_mempool(void *ptr) {}
static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object,
- gfp_t flags)
+ gfp_t flags, bool init)
{
return object;
}
@@ -330,7 +347,7 @@ static inline bool kasan_check_byte(const void *address)
#endif /* CONFIG_KASAN */
-#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK
+#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK)
void kasan_unpoison_task_stack(struct task_struct *task);
#else
static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 5b7ed6dc99ac..09035ac67d4b 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -3,6 +3,7 @@
#define _LINUX_KERNEL_H
#include <stdarg.h>
+#include <linux/align.h>
#include <linux/limits.h>
#include <linux/linkage.h>
#include <linux/stddef.h>
@@ -30,14 +31,6 @@
*/
#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x))
-/* @a is a power of 2 value */
-#define ALIGN(x, a) __ALIGN_KERNEL((x), (a))
-#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a))
-#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
-#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a)))
-#define PTR_ALIGN_DOWN(p, a) ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a)))
-#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0)
-
/* generic data direction definitions */
#define READ 0
#define WRITE 1
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0c04d39a7967..c193be760709 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -76,10 +76,27 @@ enum mem_cgroup_events_target {
};
struct memcg_vmstats_percpu {
- long stat[MEMCG_NR_STAT];
- unsigned long events[NR_VM_EVENT_ITEMS];
- unsigned long nr_page_events;
- unsigned long targets[MEM_CGROUP_NTARGETS];
+ /* Local (CPU and cgroup) page state & events */
+ long state[MEMCG_NR_STAT];
+ unsigned long events[NR_VM_EVENT_ITEMS];
+
+ /* Delta calculation for lockless upward propagation */
+ long state_prev[MEMCG_NR_STAT];
+ unsigned long events_prev[NR_VM_EVENT_ITEMS];
+
+ /* Cgroup1: threshold notifications & softlimit tree updates */
+ unsigned long nr_page_events;
+ unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+struct memcg_vmstats {
+ /* Aggregated (CPU and subtree) page state & events */
+ long state[MEMCG_NR_STAT];
+ unsigned long events[NR_VM_EVENT_ITEMS];
+
+ /* Pending child counts during tree propagation */
+ long state_pending[MEMCG_NR_STAT];
+ unsigned long events_pending[NR_VM_EVENT_ITEMS];
};
struct mem_cgroup_reclaim_iter {
@@ -97,12 +114,13 @@ struct batched_lruvec_stat {
};
/*
- * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
- * which have elements charged to this memcg.
+ * Bitmap and deferred work of shrinker::id corresponding to memcg-aware
+ * shrinkers, which have elements charged to this memcg.
*/
-struct memcg_shrinker_map {
+struct shrinker_info {
struct rcu_head rcu;
- unsigned long map[];
+ atomic_long_t *nr_deferred;
+ unsigned long *map;
};
/*
@@ -128,7 +146,7 @@ struct mem_cgroup_per_node {
struct mem_cgroup_reclaim_iter iter;
- struct memcg_shrinker_map __rcu *shrinker_map;
+ struct shrinker_info __rcu *shrinker_info;
struct rb_node tree_node; /* RB tree node */
unsigned long usage_in_excess;/* Set to the value by which */
@@ -287,8 +305,8 @@ struct mem_cgroup {
MEMCG_PADDING(_pad1_);
- atomic_long_t vmstats[MEMCG_NR_STAT];
- atomic_long_t vmevents[NR_VM_EVENT_ITEMS];
+ /* memory.stat */
+ struct memcg_vmstats vmstats;
/* memory.events */
atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
@@ -315,10 +333,6 @@ struct mem_cgroup {
atomic_t moving_account;
struct task_struct *move_lock_task;
- /* Legacy local VM stats and events */
- struct memcg_vmstats_percpu __percpu *vmstats_local;
-
- /* Subtree VM stats and events (batched updates) */
struct memcg_vmstats_percpu __percpu *vmstats_percpu;
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -358,6 +372,62 @@ enum page_memcg_data_flags {
#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
+static inline bool PageMemcgKmem(struct page *page);
+
+/*
+ * After the initialization objcg->memcg is always pointing at
+ * a valid memcg, but can be atomically swapped to the parent memcg.
+ *
+ * The caller must ensure that the returned memcg won't be released:
+ * e.g. acquire the rcu_read_lock or css_set_lock.
+ */
+static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
+{
+ return READ_ONCE(objcg->memcg);
+}
+
+/*
+ * __page_memcg - get the memory cgroup associated with a non-kmem page
+ * @page: a pointer to the page struct
+ *
+ * Returns a pointer to the memory cgroup associated with the page,
+ * or NULL. This function assumes that the page is known to have a
+ * proper memory cgroup pointer. It's not safe to call this function
+ * against some type of pages, e.g. slab pages or ex-slab pages or
+ * kmem pages.
+ */
+static inline struct mem_cgroup *__page_memcg(struct page *page)
+{
+ unsigned long memcg_data = page->memcg_data;
+
+ VM_BUG_ON_PAGE(PageSlab(page), page);
+ VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
+ VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
+
+ return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+}
+
+/*
+ * __page_objcg - get the object cgroup associated with a kmem page
+ * @page: a pointer to the page struct
+ *
+ * Returns a pointer to the object cgroup associated with the page,
+ * or NULL. This function assumes that the page is known to have a
+ * proper object cgroup pointer. It's not safe to call this function
+ * against some type of pages, e.g. slab pages or ex-slab pages or
+ * LRU pages.
+ */
+static inline struct obj_cgroup *__page_objcg(struct page *page)
+{
+ unsigned long memcg_data = page->memcg_data;
+
+ VM_BUG_ON_PAGE(PageSlab(page), page);
+ VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
+ VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page);
+
+ return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+}
+
/*
* page_memcg - get the memory cgroup associated with a page
* @page: a pointer to the page struct
@@ -367,20 +437,23 @@ enum page_memcg_data_flags {
* proper memory cgroup pointer. It's not safe to call this function
* against some type of pages, e.g. slab pages or ex-slab pages.
*
- * Any of the following ensures page and memcg binding stability:
+ * For a non-kmem page any of the following ensures page and memcg binding
+ * stability:
+ *
* - the page lock
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ *
+ * For a kmem page a caller should hold an rcu read lock to protect memcg
+ * associated with a kmem page from being released.
*/
static inline struct mem_cgroup *page_memcg(struct page *page)
{
- unsigned long memcg_data = page->memcg_data;
-
- VM_BUG_ON_PAGE(PageSlab(page), page);
- VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
-
- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ if (PageMemcgKmem(page))
+ return obj_cgroup_memcg(__page_objcg(page));
+ else
+ return __page_memcg(page);
}
/*
@@ -394,11 +467,19 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
*/
static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
{
+ unsigned long memcg_data = READ_ONCE(page->memcg_data);
+
VM_BUG_ON_PAGE(PageSlab(page), page);
WARN_ON_ONCE(!rcu_read_lock_held());
- return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) &
- ~MEMCG_DATA_FLAGS_MASK);
+ if (memcg_data & MEMCG_DATA_KMEM) {
+ struct obj_cgroup *objcg;
+
+ objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ return obj_cgroup_memcg(objcg);
+ }
+
+ return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
/*
@@ -406,15 +487,21 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
* @page: a pointer to the page struct
*
* Returns a pointer to the memory cgroup associated with the page,
- * or NULL. This function unlike page_memcg() can take any page
+ * or NULL. This function unlike page_memcg() can take any page
* as an argument. It has to be used in cases when it's not known if a page
- * has an associated memory cgroup pointer or an object cgroups vector.
+ * has an associated memory cgroup pointer or an object cgroups vector or
+ * an object cgroup.
+ *
+ * For a non-kmem page any of the following ensures page and memcg binding
+ * stability:
*
- * Any of the following ensures page and memcg binding stability:
* - the page lock
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ *
+ * For a kmem page a caller should hold an rcu read lock to protect memcg
+ * associated with a kmem page from being released.
*/
static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
@@ -427,9 +514,17 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
if (memcg_data & MEMCG_DATA_OBJCGS)
return NULL;
+ if (memcg_data & MEMCG_DATA_KMEM) {
+ struct obj_cgroup *objcg;
+
+ objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ return obj_cgroup_memcg(objcg);
+ }
+
return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
+#ifdef CONFIG_MEMCG_KMEM
/*
* PageMemcgKmem - check if the page has MemcgKmem flag set
* @page: a pointer to the page struct
@@ -444,7 +539,6 @@ static inline bool PageMemcgKmem(struct page *page)
return page->memcg_data & MEMCG_DATA_KMEM;
}
-#ifdef CONFIG_MEMCG_KMEM
/*
* page_objcgs - get the object cgroups vector associated with a page
* @page: a pointer to the page struct
@@ -486,6 +580,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
}
#else
+static inline bool PageMemcgKmem(struct page *page)
+{
+ return false;
+}
+
static inline struct obj_cgroup **page_objcgs(struct page *page)
{
return NULL;
@@ -596,18 +695,15 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
}
int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
+int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
+ gfp_t gfp, swp_entry_t entry);
+void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
void mem_cgroup_uncharge(struct page *page);
void mem_cgroup_uncharge_list(struct list_head *page_list);
void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
-static struct mem_cgroup_per_node *
-mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
-{
- return memcg->nodeinfo[nid];
-}
-
/**
* mem_cgroup_lruvec - get the lru list vector for a memcg & node
* @memcg: memcg of the wanted lruvec
@@ -631,7 +727,7 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
if (!memcg)
memcg = root_mem_cgroup;
- mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+ mz = memcg->nodeinfo[pgdat->node_id];
lruvec = &mz->lruvec;
out:
/*
@@ -708,21 +804,15 @@ static inline void obj_cgroup_get(struct obj_cgroup *objcg)
percpu_ref_get(&objcg->refcnt);
}
-static inline void obj_cgroup_put(struct obj_cgroup *objcg)
+static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
+ unsigned long nr)
{
- percpu_ref_put(&objcg->refcnt);
+ percpu_ref_get_many(&objcg->refcnt, nr);
}
-/*
- * After the initialization objcg->memcg is always pointing at
- * a valid memcg, but can be atomically swapped to the parent memcg.
- *
- * The caller must ensure that the returned memcg won't be released:
- * e.g. acquire the rcu_read_lock or css_set_lock.
- */
-static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
+static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
- return READ_ONCE(objcg->memcg);
+ percpu_ref_put(&objcg->refcnt);
}
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
@@ -867,43 +957,9 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
extern bool cgroup_memory_noswap;
#endif
-struct mem_cgroup *lock_page_memcg(struct page *page);
-void __unlock_page_memcg(struct mem_cgroup *memcg);
+void lock_page_memcg(struct page *page);
void unlock_page_memcg(struct page *page);
-/*
- * idx can be of type enum memcg_stat_item or node_stat_item.
- * Keep in sync with memcg_exact_page_state().
- */
-static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
-{
- long x = atomic_long_read(&memcg->vmstats[idx]);
-#ifdef CONFIG_SMP
- if (x < 0)
- x = 0;
-#endif
- return x;
-}
-
-/*
- * idx can be of type enum memcg_stat_item or node_stat_item.
- * Keep in sync with memcg_exact_page_state().
- */
-static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
- int idx)
-{
- long x = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- x += per_cpu(memcg->vmstats_local->stat[idx], cpu);
-#ifdef CONFIG_SMP
- if (x < 0)
- x = 0;
-#endif
- return x;
-}
-
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
/* idx can be of type enum memcg_stat_item or node_stat_item */
@@ -979,10 +1035,6 @@ static inline void mod_memcg_lruvec_state(struct lruvec *lruvec,
local_irq_restore(flags);
}
-unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
- gfp_t gfp_mask,
- unsigned long *total_scanned);
-
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
unsigned long count);
@@ -1063,13 +1115,15 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
void split_page_memcg(struct page *head, unsigned int nr);
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
+ gfp_t gfp_mask,
+ unsigned long *total_scanned);
+
#else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0
#define MEM_CGROUP_ID_MAX 0
-struct mem_cgroup;
-
static inline struct mem_cgroup *page_memcg(struct page *page)
{
return NULL;
@@ -1139,6 +1193,16 @@ static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
return 0;
}
+static inline int mem_cgroup_swapin_charge_page(struct page *page,
+ struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
+{
+ return 0;
+}
+
+static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+{
+}
+
static inline void mem_cgroup_uncharge(struct page *page)
{
}
@@ -1171,6 +1235,10 @@ static inline bool lruvec_holds_page_lru_lock(struct page *page,
return lruvec == &pgdat->__lruvec;
}
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
+
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
return NULL;
@@ -1289,12 +1357,7 @@ mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
}
-static inline struct mem_cgroup *lock_page_memcg(struct page *page)
-{
- return NULL;
-}
-
-static inline void __unlock_page_memcg(struct mem_cgroup *memcg)
+static inline void lock_page_memcg(struct page *page)
{
}
@@ -1334,17 +1397,6 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
}
-static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
-{
- return 0;
-}
-
-static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
- int idx)
-{
- return 0;
-}
-
static inline void __mod_memcg_state(struct mem_cgroup *memcg,
int idx,
int nr)
@@ -1390,18 +1442,6 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
mod_node_page_state(page_pgdat(page), idx, val);
}
-static inline
-unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
- gfp_t gfp_mask,
- unsigned long *total_scanned)
-{
- return 0;
-}
-
-static inline void split_page_memcg(struct page *head, unsigned int nr)
-{
-}
-
static inline void count_memcg_events(struct mem_cgroup *memcg,
enum vm_event_item idx,
unsigned long count)
@@ -1424,8 +1464,16 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
}
-static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+static inline void split_page_memcg(struct page *head, unsigned int nr)
+{
+}
+
+static inline
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
+ gfp_t gfp_mask,
+ unsigned long *total_scanned)
{
+ return 0;
}
#endif /* CONFIG_MEMCG */
@@ -1563,10 +1611,10 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
return false;
}
-extern int memcg_expand_shrinker_maps(int new_id);
-
-extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
- int nid, int shrinker_id);
+int alloc_shrinker_info(struct mem_cgroup *memcg);
+void free_shrinker_info(struct mem_cgroup *memcg);
+void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
+void reparent_shrinker_deferred(struct mem_cgroup *memcg);
#else
#define mem_cgroup_sockets_enabled 0
static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
@@ -1576,8 +1624,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
return false;
}
-static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
- int nid, int shrinker_id)
+static inline void set_shrinker_bit(struct mem_cgroup *memcg,
+ int nid, int shrinker_id)
{
}
#endif
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 4da95e684e20..97e92e8b556a 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -29,6 +29,11 @@ struct memory_block {
int online_type; /* for passing data to online routine */
int nid; /* NID for this memory block */
struct device dev;
+ /*
+ * Number of vmemmap pages. These pages
+ * lay at the beginning of the memory block.
+ */
+ unsigned long nr_vmemmap_pages;
};
int arch_get_memory_phys_device(unsigned long start_pfn);
@@ -80,7 +85,8 @@ static inline int memory_notify(unsigned long val, void *v)
#else
extern int register_memory_notifier(struct notifier_block *nb);
extern void unregister_memory_notifier(struct notifier_block *nb);
-int create_memory_block_devices(unsigned long start, unsigned long size);
+int create_memory_block_devices(unsigned long start, unsigned long size,
+ unsigned long vmemmap_pages);
void remove_memory_block_devices(unsigned long start, unsigned long size);
extern void memory_dev_init(void);
extern int memory_notify(unsigned long val, void *v);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 7288aa5ef73b..a85d4b7d15c2 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -56,6 +56,14 @@ typedef int __bitwise mhp_t;
#define MHP_MERGE_RESOURCE ((__force mhp_t)BIT(0))
/*
+ * We want memmap (struct page array) to be self contained.
+ * To do so, we will use the beginning of the hot-added range to build
+ * the page tables for the memmap array that describes the entire range.
+ * Only selected architectures support it with SPARSE_VMEMMAP.
+ */
+#define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1))
+
+/*
* Extended parameters for memory hotplug:
* altmap: alternative allocator for memmap array (optional)
* pgprot: page protection flags to apply to newly created page tables
@@ -101,11 +109,13 @@ extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
/* VM interface that may be used by firmware interface */
extern int online_pages(unsigned long pfn, unsigned long nr_pages,
- int online_type, int nid);
+ unsigned long nr_vmemmap_pages, int online_type,
+ int nid);
extern struct zone *test_pages_in_a_zone(unsigned long start_pfn,
unsigned long end_pfn);
extern void __offline_isolated_pages(unsigned long start_pfn,
- unsigned long end_pfn);
+ unsigned long end_pfn,
+ unsigned long buddy_start_pfn);
typedef void (*online_page_callback_t)(struct page *page, unsigned int order);
@@ -307,7 +317,8 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
#ifdef CONFIG_MEMORY_HOTREMOVE
extern void try_offline_node(int nid);
-extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
+extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
+ unsigned long nr_vmemmap_pages);
extern int remove_memory(int nid, u64 start, u64 size);
extern void __remove_memory(int nid, u64 start, u64 size);
extern int offline_and_remove_memory(int nid, u64 start, u64 size);
@@ -315,7 +326,8 @@ extern int offline_and_remove_memory(int nid, u64 start, u64 size);
#else
static inline void try_offline_node(int nid) {}
-static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
+static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
+ unsigned long nr_vmemmap_pages)
{
return -EINVAL;
}
@@ -359,6 +371,7 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_
extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
struct mhp_params *params);
void arch_remove_linear_mapping(u64 start, u64 size);
+extern bool mhp_supports_memmap_on_memory(unsigned long size);
#endif /* CONFIG_MEMORY_HOTPLUG */
#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index f5b464daeeca..b46f63dcaed3 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -17,7 +17,7 @@ struct device;
* @alloc: track pages consumed, private to vmemmap_populate()
*/
struct vmem_altmap {
- const unsigned long base_pfn;
+ unsigned long base_pfn;
const unsigned long end_pfn;
const unsigned long reserve;
unsigned long free;
@@ -131,6 +131,7 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
}
#ifdef CONFIG_ZONE_DEVICE
+bool pfn_zone_device_reserved(unsigned long pfn);
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
@@ -143,6 +144,11 @@ unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
unsigned long memremap_compat_align(void);
#else
+static inline bool pfn_zone_device_reserved(unsigned long pfn)
+{
+ return false;
+}
+
static inline void *devm_memremap_pages(struct device *dev,
struct dev_pagemap *pgmap)
{
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 3a389633b68f..4bb4e519e3f5 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -27,6 +27,7 @@ enum migrate_reason {
MR_MEMPOLICY_MBIND,
MR_NUMA_MISPLACED,
MR_CONTIG_RANGE,
+ MR_LONGTERM_PIN,
MR_TYPES
};
@@ -43,10 +44,7 @@ extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
unsigned long private, enum migrate_mode mode, int reason);
extern struct page *alloc_migration_target(struct page *page, unsigned long private);
extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
-extern void putback_movable_page(struct page *page);
-extern void migrate_prep(void);
-extern void migrate_prep_local(void);
extern void migrate_page_states(struct page *newpage, struct page *page);
extern void migrate_page_copy(struct page *newpage, struct page *page);
extern int migrate_huge_page_move_mapping(struct address_space *mapping,
@@ -66,9 +64,6 @@ static inline struct page *alloc_migration_target(struct page *page,
static inline int isolate_movable_page(struct page *page, isolate_mode_t mode)
{ return -EBUSY; }
-static inline int migrate_prep(void) { return -ENOSYS; }
-static inline int migrate_prep_local(void) { return -ENOSYS; }
-
static inline void migrate_page_states(struct page *newpage, struct page *page)
{
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5e4ead59e67f..c5f323b8d974 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -372,6 +372,13 @@ extern unsigned int kobjsize(const void *objp);
# define VM_GROWSUP VM_NONE
#endif
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+# define VM_UFFD_MINOR_BIT 37
+# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */
+#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+# define VM_UFFD_MINOR VM_NONE
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+
/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ)
@@ -432,8 +439,7 @@ extern unsigned int kobjsize(const void *objp);
extern pgprot_t protection_map[16];
/**
- * Fault flag definitions.
- *
+ * enum fault_flag - Fault flag definitions.
* @FAULT_FLAG_WRITE: Fault was a write fault.
* @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
* @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
@@ -464,16 +470,18 @@ extern pgprot_t protection_map[16];
* signals before a retry to make sure the continuous page faults can still be
* interrupted if necessary.
*/
-#define FAULT_FLAG_WRITE 0x01
-#define FAULT_FLAG_MKWRITE 0x02
-#define FAULT_FLAG_ALLOW_RETRY 0x04
-#define FAULT_FLAG_RETRY_NOWAIT 0x08
-#define FAULT_FLAG_KILLABLE 0x10
-#define FAULT_FLAG_TRIED 0x20
-#define FAULT_FLAG_USER 0x40
-#define FAULT_FLAG_REMOTE 0x80
-#define FAULT_FLAG_INSTRUCTION 0x100
-#define FAULT_FLAG_INTERRUPTIBLE 0x200
+enum fault_flag {
+ FAULT_FLAG_WRITE = 1 << 0,
+ FAULT_FLAG_MKWRITE = 1 << 1,
+ FAULT_FLAG_ALLOW_RETRY = 1 << 2,
+ FAULT_FLAG_RETRY_NOWAIT = 1 << 3,
+ FAULT_FLAG_KILLABLE = 1 << 4,
+ FAULT_FLAG_TRIED = 1 << 5,
+ FAULT_FLAG_USER = 1 << 6,
+ FAULT_FLAG_REMOTE = 1 << 7,
+ FAULT_FLAG_INSTRUCTION = 1 << 8,
+ FAULT_FLAG_INTERRUPTIBLE = 1 << 9,
+};
/*
* The default fault flags that should be used by most of the
@@ -485,6 +493,7 @@ extern pgprot_t protection_map[16];
/**
* fault_flag_allow_retry_first - check ALLOW_RETRY the first time
+ * @flags: Fault flags.
*
* This is mostly used for places where we want to try to avoid taking
* the mmap_lock for too long a time when waiting for another condition
@@ -495,7 +504,7 @@ extern pgprot_t protection_map[16];
* Return: true if the page fault allows retry and this is the first
* attempt of the fault handling; false otherwise.
*/
-static inline bool fault_flag_allow_retry_first(unsigned int flags)
+static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
{
return (flags & FAULT_FLAG_ALLOW_RETRY) &&
(!(flags & FAULT_FLAG_TRIED));
@@ -530,7 +539,7 @@ struct vm_fault {
pgoff_t pgoff; /* Logical page offset based on vma */
unsigned long address; /* Faulting virtual address */
};
- unsigned int flags; /* FAULT_FLAG_xxx flags
+ enum fault_flag flags; /* FAULT_FLAG_xxx flags
* XXX: should really be 'const' */
pmd_t *pmd; /* Pointer to pmd entry matching
* the 'address' */
@@ -580,7 +589,7 @@ struct vm_operations_struct {
void (*close)(struct vm_area_struct * area);
/* Called any time before splitting to check if it's allowed */
int (*may_split)(struct vm_area_struct *area, unsigned long addr);
- int (*mremap)(struct vm_area_struct *area, unsigned long flags);
+ int (*mremap)(struct vm_area_struct *area);
/*
* Called by mprotect() to make driver-specific permission
* checks before mprotect() is finalised. The VMA must not
@@ -1132,6 +1141,11 @@ static inline bool is_zone_device_page(const struct page *page)
}
#endif
+static inline bool is_zone_movable_page(const struct page *page)
+{
+ return page_zonenum(page) == ZONE_MOVABLE;
+}
+
#ifdef CONFIG_DEV_PAGEMAP_OPS
void free_devmap_managed_page(struct page *page);
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
@@ -1265,13 +1279,16 @@ static inline void put_page(struct page *page)
void unpin_user_page(struct page *page);
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
bool make_dirty);
+void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
+ bool make_dirty);
void unpin_user_pages(struct page **pages, unsigned long npages);
/**
- * page_maybe_dma_pinned() - report if a page is pinned for DMA.
+ * page_maybe_dma_pinned - Report if a page is pinned for DMA.
+ * @page: The page.
*
* This function checks if a page has been pinned via a call to
- * pin_user_pages*().
+ * a function in the pin_user_pages() family.
*
* For non-huge pages, the return value is partially fuzzy: false is not fuzzy,
* because it means "definitely not pinned for DMA", but true means "probably
@@ -1289,9 +1306,8 @@ void unpin_user_pages(struct page **pages, unsigned long npages);
*
* For more information, please see Documentation/core-api/pin_user_pages.rst.
*
- * @page: pointer to page to be queried.
- * @Return: True, if it is likely that the page has been "dma-pinned".
- * False, if the page is definitely not dma-pinned.
+ * Return: True, if it is likely that the page has been "dma-pinned".
+ * False, if the page is definitely not dma-pinned.
*/
static inline bool page_maybe_dma_pinned(struct page *page)
{
@@ -1539,6 +1555,20 @@ static inline unsigned long page_to_section(const struct page *page)
}
#endif
+/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
+#ifdef CONFIG_MIGRATION
+static inline bool is_pinnable_page(struct page *page)
+{
+ return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) ||
+ is_zero_pfn(page_to_pfn(page));
+}
+#else
+static inline bool is_pinnable_page(struct page *page)
+{
+ return true;
+}
+#endif
+
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
@@ -1629,7 +1659,6 @@ static inline pgoff_t page_index(struct page *page)
bool page_mapped(struct page *page);
struct address_space *page_mapping(struct page *page);
-struct address_space *page_mapping_file(struct page *page);
/*
* Return true only if the page has been allocated with
@@ -2357,7 +2386,7 @@ extern unsigned long free_reserved_area(void *start, void *end,
int poison, const char *s);
extern void adjust_managed_page_count(struct page *page, long count);
-extern void mem_init_print_info(const char *str);
+extern void mem_init_print_info(void);
extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
@@ -2731,6 +2760,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t);
+int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num);
@@ -2790,7 +2821,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO
* and return without waiting upon it */
#define FOLL_POPULATE 0x40 /* fault in page */
-#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 47946cec7584..acea81553359 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -407,8 +407,13 @@ enum zone_type {
* to increase the number of THP/huge pages. Notable special cases are:
*
* 1. Pinned pages: (long-term) pinning of movable pages might
- * essentially turn such pages unmovable. Memory offlining might
- * retry a long time.
+ * essentially turn such pages unmovable. Therefore, we do not allow
+ * pinning long-term pages in ZONE_MOVABLE. When pages are pinned and
+ * faulted, they come from the right zone right away. However, it is
+ * still possible that address space already has pages in
+ * ZONE_MOVABLE at the time when pages are pinned (i.e. user has
+ * touches that memory before pinning). In such case we migrate them
+ * to a different zone. When migration fails - pinning fails.
* 2. memblock allocations: kernelcore/movablecore setups might create
* situations where ZONE_MOVABLE contains unmovable allocations
* after boot. Memory offlining and allocations fail early.
@@ -427,6 +432,15 @@ enum zone_type {
* techniques might use alloc_contig_range() to hide previously
* exposed pages from the buddy again (e.g., to implement some sort
* of memory unplug in virtio-mem).
+ * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create
+ * situations where ZERO_PAGE(0) which is allocated differently
+ * on different platforms may end up in a movable zone. ZERO_PAGE(0)
+ * cannot be migrated.
+ * 7. Memory-hotplug: when using memmap_on_memory and onlining the
+ * memory to the MOVABLE zone, the vmemmap pages are also placed in
+ * such zone. Such pages cannot be really moved around as they are
+ * self-stored in the range, but they are treated as movable when
+ * the range they describe is about to be offlined.
*
* In general, no unmovable allocations that degrade memory offlining
* should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index 7d4ec26d8a3e..ef1e3e736e14 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -21,16 +21,17 @@
#elif MAX_NR_ZONES <= 8
#define ZONES_SHIFT 3
#else
-#error ZONES_SHIFT -- too many zones configured adjust calculation
+#error ZONES_SHIFT "Too many zones configured"
#endif
+#define ZONES_WIDTH ZONES_SHIFT
+
#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
-
-/* SECTION_SHIFT #bits space required to store a section # */
#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
-
-#endif /* CONFIG_SPARSEMEM */
+#else
+#define SECTIONS_SHIFT 0
+#endif
#ifndef BUILD_VDSO32_64
/*
@@ -54,17 +55,28 @@
#define SECTIONS_WIDTH 0
#endif
-#define ZONES_WIDTH ZONES_SHIFT
-
-#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
#define NODES_WIDTH NODES_SHIFT
-#else
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
+#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
#error "Vmemmap: No space for nodes field in page flags"
-#endif
+#else
#define NODES_WIDTH 0
#endif
+/*
+ * Note that this #define MUST have a value so that it can be tested with
+ * the IS_ENABLED() macro.
+ */
+#if NODES_SHIFT != 0 && NODES_WIDTH == 0
+#define NODE_NOT_IN_PAGE_FLAGS 1
+#endif
+
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
+#define KASAN_TAG_WIDTH 8
+#else
+#define KASAN_TAG_WIDTH 0
+#endif
+
#ifdef CONFIG_NUMA_BALANCING
#define LAST__PID_SHIFT 8
#define LAST__PID_MASK ((1 << LAST__PID_SHIFT)-1)
@@ -77,36 +89,20 @@
#define LAST_CPUPID_SHIFT 0
#endif
-#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
-#define KASAN_TAG_WIDTH 8
-#else
-#define KASAN_TAG_WIDTH 0
-#endif
-
-#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \
+#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
<= BITS_PER_LONG - NR_PAGEFLAGS
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
#else
#define LAST_CPUPID_WIDTH 0
#endif
-#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH+LAST_CPUPID_WIDTH+KASAN_TAG_WIDTH \
- > BITS_PER_LONG - NR_PAGEFLAGS
-#error "Not enough bits in page flags"
-#endif
-
-/*
- * We are going to use the flags for the page to node mapping if its in
- * there. This includes the case where there is no node, so it is implicit.
- * Note that this #define MUST have a value so that it can be tested with
- * the IS_ENABLED() macro.
- */
-#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
-#define NODE_NOT_IN_PAGE_FLAGS 1
+#if LAST_CPUPID_SHIFT != 0 && LAST_CPUPID_WIDTH == 0
+#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
#endif
-#if defined(CONFIG_NUMA_BALANCING) && LAST_CPUPID_WIDTH == 0
-#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
+#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
+ > BITS_PER_LONG - NR_PAGEFLAGS
+#error "Not enough bits in page flags"
#endif
#endif
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 31dc6ceed3f4..350e1c9726f2 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -18,6 +18,11 @@
struct pagevec;
+static inline bool mapping_empty(struct address_space *mapping)
+{
+ return xa_empty(&mapping->i_pages);
+}
+
/*
* Bits in mapping->flags.
*/
@@ -158,6 +163,16 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping)
void release_pages(struct page **pages, int nr);
/*
+ * For file cache pages, return the address_space, otherwise return NULL
+ */
+static inline struct address_space *page_mapping_file(struct page *page)
+{
+ if (unlikely(PageSwapCache(page)))
+ return NULL;
+ return page_mapping(page);
+}
+
+/*
* speculatively take a reference to a page.
* If the page is free (_refcount == 0), then _refcount is untouched, and 0
* is returned. Otherwise, _refcount is incremented by 1 and 1 is returned.
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index b1cb6b753abb..ac7b38ad5903 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -7,7 +7,7 @@
struct mm_walk;
/**
- * mm_walk_ops - callbacks for walk_page_range
+ * struct mm_walk_ops - callbacks for walk_page_range
* @pgd_entry: if set, called for each non-empty PGD (top-level) entry
* @p4d_entry: if set, called for each non-empty P4D entry
* @pud_entry: if set, called for each non-empty PUD entry
@@ -71,7 +71,7 @@ enum page_walk_action {
};
/**
- * mm_walk - walk_page_range data
+ * struct mm_walk - walk_page_range data
* @ops: operation to call during the walk
* @mm: mm_struct representing the target process of page table walk
* @pgd: pointer to PGD; only valid with no_vma (otherwise set to NULL)
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 5e772392a379..2194a9cd885c 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1111,6 +1111,7 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
extern void untrack_pfn_moved(struct vm_area_struct *vma);
#endif
+#ifdef CONFIG_MMU
#ifdef __HAVE_COLOR_ZERO_PAGE
static inline int is_zero_pfn(unsigned long pfn)
{
@@ -1134,6 +1135,17 @@ static inline unsigned long my_zero_pfn(unsigned long addr)
return zero_pfn;
}
#endif
+#else
+static inline int is_zero_pfn(unsigned long pfn)
+{
+ return 0;
+}
+
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+ return 0;
+}
+#endif /* CONFIG_MMU */
#ifdef CONFIG_MMU
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 000cc0533c33..069c7fd95396 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -32,6 +32,7 @@ struct proc_ops {
ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *);
ssize_t (*proc_read_iter)(struct kiocb *, struct iov_iter *);
ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *);
+ /* mandatory unless nonseekable_open() or equivalent is used */
loff_t (*proc_lseek)(struct file *, loff_t, int);
int (*proc_release)(struct inode *, struct file *);
__poll_t (*proc_poll)(struct file *, struct poll_table_struct *);
diff --git a/include/linux/profile.h b/include/linux/profile.h
index bad18ca43150..fd18ca96f557 100644
--- a/include/linux/profile.h
+++ b/include/linux/profile.h
@@ -15,7 +15,6 @@
#define KVM_PROFILING 4
struct proc_dir_entry;
-struct pt_regs;
struct notifier_block;
#if defined(CONFIG_PROFILING) && defined(CONFIG_PROC_FS)
@@ -84,8 +83,6 @@ int task_handoff_unregister(struct notifier_block * n);
int profile_event_register(enum profile_type, struct notifier_block * n);
int profile_event_unregister(enum profile_type, struct notifier_block * n);
-struct pt_regs;
-
#else
#define prof_on 0
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4aedf9b92469..395b295bf93d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -841,6 +841,10 @@ struct task_struct {
/* Stalled due to lack of memory */
unsigned in_memstall:1;
#endif
+#ifdef CONFIG_PAGE_OWNER
+ /* Used by page_owner=on to detect recursion in page tracking. */
+ unsigned in_page_owner:1;
+#endif
unsigned long atomic_flags; /* Flags requiring atomic access. */
@@ -968,6 +972,7 @@ struct task_struct {
#ifdef CONFIG_DETECT_HUNG_TASK
unsigned long last_switch_count;
unsigned long last_switch_time;
+ unsigned long killed_time;
#endif
/* Filesystem information: */
struct fs_struct *fs;
@@ -1378,6 +1383,13 @@ struct task_struct {
struct llist_head kretprobe_instances;
#endif
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ unsigned long getblk_stamp;
+ unsigned int getblk_executed;
+ unsigned int getblk_bh_count;
+ unsigned long getblk_bh_state;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
@@ -1578,7 +1590,7 @@ extern struct pid *cad_pid;
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
-#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */
+#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 90b2a0bce11c..e24b1fe348e3 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -151,12 +151,13 @@ static inline bool in_vfork(struct task_struct *tsk)
* Applies per-task gfp context to the given allocation flags.
* PF_MEMALLOC_NOIO implies GFP_NOIO
* PF_MEMALLOC_NOFS implies GFP_NOFS
+ * PF_MEMALLOC_PIN implies !GFP_MOVABLE
*/
static inline gfp_t current_gfp_context(gfp_t flags)
{
unsigned int pflags = READ_ONCE(current->flags);
- if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))) {
+ if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) {
/*
* NOIO implies both NOIO and NOFS and it is a weaker context
* so always make sure it makes precedence
@@ -165,6 +166,9 @@ static inline gfp_t current_gfp_context(gfp_t flags)
flags &= ~(__GFP_IO | __GFP_FS);
else if (pflags & PF_MEMALLOC_NOFS)
flags &= ~__GFP_FS;
+
+ if (pflags & PF_MEMALLOC_PIN)
+ flags &= ~__GFP_MOVABLE;
}
return flags;
}
@@ -271,29 +275,18 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
current->flags = (current->flags & ~PF_MEMALLOC) | flags;
}
-#ifdef CONFIG_CMA
-static inline unsigned int memalloc_nocma_save(void)
+static inline unsigned int memalloc_pin_save(void)
{
- unsigned int flags = current->flags & PF_MEMALLOC_NOCMA;
+ unsigned int flags = current->flags & PF_MEMALLOC_PIN;
- current->flags |= PF_MEMALLOC_NOCMA;
+ current->flags |= PF_MEMALLOC_PIN;
return flags;
}
-static inline void memalloc_nocma_restore(unsigned int flags)
-{
- current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags;
-}
-#else
-static inline unsigned int memalloc_nocma_save(void)
-{
- return 0;
-}
-
-static inline void memalloc_nocma_restore(unsigned int flags)
+static inline void memalloc_pin_restore(unsigned int flags)
{
+ current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags;
}
-#endif
#ifdef CONFIG_MEMCG
DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 0f80123650e2..1eac79ce57d4 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -79,13 +79,14 @@ struct shrinker {
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
/* Flags */
-#define SHRINKER_NUMA_AWARE (1 << 0)
-#define SHRINKER_MEMCG_AWARE (1 << 1)
+#define SHRINKER_REGISTERED (1 << 0)
+#define SHRINKER_NUMA_AWARE (1 << 1)
+#define SHRINKER_MEMCG_AWARE (1 << 2)
/*
* It just makes sense when the shrinker is also MEMCG_AWARE for now,
* non-MEMCG_AWARE shrinker should not have this flag set.
*/
-#define SHRINKER_NONSLAB (1 << 2)
+#define SHRINKER_NONSLAB (1 << 3)
extern int prealloc_shrinker(struct shrinker *shrinker);
extern void register_shrinker_prepared(struct shrinker *shrinker);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 0c97d788762c..2e09a0ac5d60 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -25,6 +25,8 @@
*/
/* DEBUG: Perform (expensive) checks on alloc/free */
#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100U)
+/* DEBUG: Silent bug reports */
+#define SLAB_SILENT_ERRORS ((slab_flags_t __force)0x00000200U)
/* DEBUG: Red zone objs in a cache */
#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400U)
/* DEBUG: Poison objects */
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index dcde82a4434c..e4b51bb5bb83 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -133,6 +133,8 @@ struct kmem_cache {
unsigned int usersize; /* Usercopy region size */
struct kmem_cache_node *node[MAX_NUMNODES];
+
+ int errors; /* Number of errors in cache */
};
#ifdef CONFIG_SLUB_CPU_PARTIAL
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 84a0b4828f66..669e35c03be2 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -56,6 +56,14 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
int smp_call_function_single_async(int cpu, call_single_data_t *csd);
/*
+ * Cpus stopping functions in panic. All have default weak definitions.
+ * Architecture-dependent code may override them.
+ */
+void panic_smp_self_stop(void);
+void nmi_panic_self_stop(struct pt_regs *regs);
+void crash_smp_send_stop(void);
+
+/*
* Call a function on all processors
*/
static inline void on_each_cpu(smp_call_func_t func, void *info, int wait)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4cc6ec3bf0ab..144727041e78 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -10,8 +10,10 @@
#include <linux/sched.h>
#include <linux/node.h>
#include <linux/fs.h>
+#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/page-flags.h>
+#include <uapi/linux/mempolicy.h>
#include <asm/page.h>
struct notifier_block;
@@ -339,6 +341,20 @@ extern void lru_note_cost(struct lruvec *lruvec, bool file,
extern void lru_note_cost_page(struct page *);
extern void lru_cache_add(struct page *);
extern void mark_page_accessed(struct page *);
+
+extern atomic_t lru_disable_count;
+
+static inline bool lru_cache_disabled(void)
+{
+ return atomic_read(&lru_disable_count);
+}
+
+static inline void lru_cache_enable(void)
+{
+ atomic_dec(&lru_disable_count);
+}
+
+extern void lru_cache_disable(void);
extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_cpu_zone(struct zone *zone);
@@ -378,6 +394,12 @@ extern int sysctl_min_slab_ratio;
#define node_reclaim_mode 0
#endif
+static inline bool node_reclaim_enabled(void)
+{
+ /* Is any node_reclaim_mode bit set? */
+ return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
+}
+
extern void check_move_unevictable_pages(struct pagevec *pvec);
extern int kswapd_run(int nid);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index a8e5f3ea9bb2..794d1538b8ba 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -17,6 +17,9 @@
#include <linux/mm.h>
#include <asm-generic/pgtable_uffd.h>
+/* The set of all possible UFFD-related VM flags. */
+#define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR)
+
/*
* CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
* new flags, since they might collide with O_* ones. We want
@@ -34,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd;
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
+/*
+ * The mode of operation for __mcopy_atomic and its helpers.
+ *
+ * This is almost an implementation detail (mcopy_atomic below doesn't take this
+ * as a parameter), but it's exposed here because memory-kind-specific
+ * implementations (e.g. hugetlbfs) need to know the mode of operation.
+ */
+enum mcopy_atomic_mode {
+ /* A normal copy_from_user into the destination range. */
+ MCOPY_ATOMIC_NORMAL,
+ /* Don't copy; map the destination range to the zero page. */
+ MCOPY_ATOMIC_ZEROPAGE,
+ /* Just install pte(s) with the existing page(s) in the page cache. */
+ MCOPY_ATOMIC_CONTINUE,
+};
+
extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len,
bool *mmap_changing, __u64 mode);
@@ -41,6 +60,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long len,
bool *mmap_changing);
+extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long len, bool *mmap_changing);
extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len,
bool enable_wp, bool *mmap_changing);
@@ -52,6 +73,22 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
}
+/*
+ * Never enable huge pmd sharing on some uffd registered vmas:
+ *
+ * - VM_UFFD_WP VMAs, because write protect information is per pgtable entry.
+ *
+ * - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for
+ * VMAs which share huge pmds. (If you have two mappings to the same
+ * underlying pages, and fault in the non-UFFD-registered one with a write,
+ * with huge pmd sharing this would *also* setup the second UFFD-registered
+ * mapping, and we'd not get minor faults.)
+ */
+static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR);
+}
+
static inline bool userfaultfd_missing(struct vm_area_struct *vma)
{
return vma->vm_flags & VM_UFFD_MISSING;
@@ -62,6 +99,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma)
return vma->vm_flags & VM_UFFD_WP;
}
+static inline bool userfaultfd_minor(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & VM_UFFD_MINOR;
+}
+
static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
pte_t pte)
{
@@ -76,7 +118,7 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
static inline bool userfaultfd_armed(struct vm_area_struct *vma)
{
- return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
+ return vma->vm_flags & __VM_UFFD_FLAGS;
}
extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
@@ -123,6 +165,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma)
return false;
}
+static inline bool userfaultfd_minor(struct vm_area_struct *vma)
+{
+ return false;
+}
+
static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
pte_t pte)
{
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 18e75974d4e3..ae0dd1948c2b 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -71,6 +71,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#ifdef CONFIG_HUGETLB_PAGE
HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
#endif
+#ifdef CONFIG_CMA
+ CMA_ALLOC_SUCCESS,
+ CMA_ALLOC_FAIL,
+#endif
UNEVICTABLE_PGCULLED, /* culled to noreclaim list */
UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */
UNEVICTABLE_PGRESCUED, /* rescued from noreclaim list */
@@ -121,6 +125,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
SWAP_RA,
SWAP_RA_HIT,
#endif
+#ifdef CONFIG_X86
+ DIRECT_MAP_LEVEL2_SPLIT,
+ DIRECT_MAP_LEVEL3_SPLIT,
+#endif
NR_VM_EVENT_ITEMS
};
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 3de7be6dd17c..394d03cc0e92 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -26,6 +26,7 @@ struct notifier_block; /* in notifier.h */
#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
#define VM_FLUSH_RESET_PERMS 0x00000100 /* reset direct map and flush TLB on unmap, can't be freed in atomic context */
#define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */
+#define VM_NO_HUGE_VMAP 0x00000400 /* force PAGE_SIZE pte mapping */
/*
* VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.
@@ -54,6 +55,9 @@ struct vm_struct {
unsigned long size;
unsigned long flags;
struct page **pages;
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+ unsigned int page_order;
+#endif
unsigned int nr_pages;
phys_addr_t phys_addr;
const void *caller;
@@ -78,6 +82,28 @@ struct vmap_area {
};
};
+/* archs that select HAVE_ARCH_HUGE_VMAP should override one or more of these */
+#ifndef arch_vmap_p4d_supported
+static inline bool arch_vmap_p4d_supported(pgprot_t prot)
+{
+ return false;
+}
+#endif
+
+#ifndef arch_vmap_pud_supported
+static inline bool arch_vmap_pud_supported(pgprot_t prot)
+{
+ return false;
+}
+#endif
+
+#ifndef arch_vmap_pmd_supported
+static inline bool arch_vmap_pmd_supported(pgprot_t prot)
+{
+ return false;
+}
+#endif
+
/*
* Highlevel APIs for driver use
*/
@@ -166,13 +192,27 @@ void free_vm_area(struct vm_struct *area);
extern struct vm_struct *remove_vm_area(const void *addr);
extern struct vm_struct *find_vm_area(const void *addr);
+static inline bool is_vm_area_hugepages(const void *addr)
+{
+ /*
+ * This may not 100% tell if the area is mapped with > PAGE_SIZE
+ * page table entries, if for some reason the architecture indicates
+ * larger sizes are available but decides not to use them, nothing
+ * prevents that. This only indicates the size of the physical page
+ * allocated in the vmalloc layer.
+ */
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+ return find_vm_area(addr)->page_order > 0;
+#else
+ return false;
+#endif
+}
+
#ifdef CONFIG_MMU
-extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
- pgprot_t prot, struct page **pages);
-int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
- struct page **pages);
-extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
-extern void unmap_kernel_range(unsigned long addr, unsigned long size);
+int vmap_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift);
+void vunmap_range(unsigned long addr, unsigned long end);
static inline void set_vm_flush_reset_perms(void *addr)
{
struct vm_struct *vm = find_vm_area(addr);
@@ -180,19 +220,8 @@ static inline void set_vm_flush_reset_perms(void *addr)
if (vm)
vm->flags |= VM_FLUSH_RESET_PERMS;
}
+
#else
-static inline int
-map_kernel_range_noflush(unsigned long start, unsigned long size,
- pgprot_t prot, struct page **pages)
-{
- return size >> PAGE_SHIFT;
-}
-#define map_kernel_range map_kernel_range_noflush
-static inline void
-unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
-{
-}
-#define unmap_kernel_range unmap_kernel_range_noflush
static inline void set_vm_flush_reset_perms(void *addr)
{
}
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 506d625163a1..3299cd69e4ca 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -512,16 +512,10 @@ static inline void mod_lruvec_page_state(struct page *page,
#endif /* CONFIG_MEMCG */
-static inline void __inc_lruvec_state(struct lruvec *lruvec,
- enum node_stat_item idx)
-{
- __mod_lruvec_state(lruvec, idx, 1);
-}
-
-static inline void __dec_lruvec_state(struct lruvec *lruvec,
- enum node_stat_item idx)
+static inline void inc_lruvec_state(struct lruvec *lruvec,
+ enum node_stat_item idx)
{
- __mod_lruvec_state(lruvec, idx, -1);
+ mod_lruvec_state(lruvec, idx, 1);
}
static inline void __inc_lruvec_page_state(struct page *page,
@@ -536,18 +530,6 @@ static inline void __dec_lruvec_page_state(struct page *page,
__mod_lruvec_page_state(page, idx, -1);
}
-static inline void inc_lruvec_state(struct lruvec *lruvec,
- enum node_stat_item idx)
-{
- mod_lruvec_state(lruvec, idx, 1);
-}
-
-static inline void dec_lruvec_state(struct lruvec *lruvec,
- enum node_stat_item idx)
-{
- mod_lruvec_state(lruvec, idx, -1);
-}
-
static inline void inc_lruvec_page_state(struct page *page,
enum node_stat_item idx)
{
diff --git a/include/linux/wait.h b/include/linux/wait.h
index fe10e8570a52..cffb6f8d3b0b 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -555,7 +555,7 @@ do { \
({ \
int __ret = 0; \
might_sleep(); \
- if (!(condition)) \
+ if (!(condition) && (timeout)) \
__ret = __wait_event_hrtimeout(wq_head, condition, timeout, \
TASK_UNINTERRUPTIBLE); \
__ret; \
@@ -581,7 +581,7 @@ do { \
({ \
long __ret = 0; \
might_sleep(); \
- if (!(condition)) \
+ if (!(condition) && (timeout)) \
__ret = __wait_event_hrtimeout(wq, condition, timeout, \
TASK_INTERRUPTIBLE); \
__ret; \
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index b5b195305346..6d517a37c18b 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -65,7 +65,7 @@
#define PP_ALLOC_CACHE_REFILL 64
struct pp_alloc_cache {
u32 count;
- void *cache[PP_ALLOC_CACHE_SIZE];
+ struct page *cache[PP_ALLOC_CACHE_SIZE];
};
struct page_pool_params {
diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h
index 5017a8829270..c3d354702cb0 100644
--- a/include/trace/events/cma.h
+++ b/include/trace/events/cma.h
@@ -8,28 +8,31 @@
#include <linux/types.h>
#include <linux/tracepoint.h>
-TRACE_EVENT(cma_alloc,
+DECLARE_EVENT_CLASS(cma_alloc_class,
- TP_PROTO(unsigned long pfn, const struct page *page,
- unsigned int count, unsigned int align),
+ TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
+ unsigned long count, unsigned int align),
- TP_ARGS(pfn, page, count, align),
+ TP_ARGS(name, pfn, page, count, align),
TP_STRUCT__entry(
+ __string(name, name)
__field(unsigned long, pfn)
__field(const struct page *, page)
- __field(unsigned int, count)
+ __field(unsigned long, count)
__field(unsigned int, align)
),
TP_fast_assign(
+ __assign_str(name, name);
__entry->pfn = pfn;
__entry->page = page;
__entry->count = count;
__entry->align = align;
),
- TP_printk("pfn=%lx page=%p count=%u align=%u",
+ TP_printk("name=%s pfn=%lx page=%p count=%lu align=%u",
+ __get_str(name),
__entry->pfn,
__entry->page,
__entry->count,
@@ -38,29 +41,72 @@ TRACE_EVENT(cma_alloc,
TRACE_EVENT(cma_release,
- TP_PROTO(unsigned long pfn, const struct page *page,
- unsigned int count),
+ TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
+ unsigned long count),
- TP_ARGS(pfn, page, count),
+ TP_ARGS(name, pfn, page, count),
TP_STRUCT__entry(
+ __string(name, name)
__field(unsigned long, pfn)
__field(const struct page *, page)
- __field(unsigned int, count)
+ __field(unsigned long, count)
),
TP_fast_assign(
+ __assign_str(name, name);
__entry->pfn = pfn;
__entry->page = page;
__entry->count = count;
),
- TP_printk("pfn=%lx page=%p count=%u",
+ TP_printk("name=%s pfn=%lx page=%p count=%lu",
+ __get_str(name),
__entry->pfn,
__entry->page,
__entry->count)
);
+TRACE_EVENT(cma_alloc_start,
+
+ TP_PROTO(const char *name, unsigned long count, unsigned int align),
+
+ TP_ARGS(name, count, align),
+
+ TP_STRUCT__entry(
+ __string(name, name)
+ __field(unsigned long, count)
+ __field(unsigned int, align)
+ ),
+
+ TP_fast_assign(
+ __assign_str(name, name);
+ __entry->count = count;
+ __entry->align = align;
+ ),
+
+ TP_printk("name=%s count=%lu align=%u",
+ __get_str(name),
+ __entry->count,
+ __entry->align)
+);
+
+DEFINE_EVENT(cma_alloc_class, cma_alloc_finish,
+
+ TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
+ unsigned long count, unsigned int align),
+
+ TP_ARGS(name, pfn, page, count, align)
+);
+
+DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry,
+
+ TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
+ unsigned long count, unsigned int align),
+
+ TP_ARGS(name, pfn, page, count, align)
+);
+
#endif /* _TRACE_CMA_H */
/* This part must be outside protection */
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 3a60b6b6db32..829a75692cc0 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -343,6 +343,26 @@ static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr)
#define __PTR_TO_HASHVAL
#endif
+#define TRACE_MM_PAGES \
+ EM(MM_FILEPAGES) \
+ EM(MM_ANONPAGES) \
+ EM(MM_SWAPENTS) \
+ EMe(MM_SHMEMPAGES)
+
+#undef EM
+#undef EMe
+
+#define EM(a) TRACE_DEFINE_ENUM(a);
+#define EMe(a) TRACE_DEFINE_ENUM(a);
+
+TRACE_MM_PAGES
+
+#undef EM
+#undef EMe
+
+#define EM(a) { a, #a },
+#define EMe(a) { a, #a }
+
TRACE_EVENT(rss_stat,
TP_PROTO(struct mm_struct *mm,
@@ -365,10 +385,10 @@ TRACE_EVENT(rss_stat,
__entry->size = (count << PAGE_SHIFT);
),
- TP_printk("mm_id=%u curr=%d member=%d size=%ldB",
+ TP_printk("mm_id=%u curr=%d type=%s size=%ldB",
__entry->mm_id,
__entry->curr,
- __entry->member,
+ __print_symbolic(__entry->member, TRACE_MM_PAGES),
__entry->size)
);
#endif /* _TRACE_KMEM_H */
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 4d434398d64d..9fb2a3bbcdfb 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -20,7 +20,8 @@
EM( MR_SYSCALL, "syscall_or_cpuset") \
EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \
EM( MR_NUMA_MISPLACED, "numa_misplaced") \
- EMe(MR_CONTIG_RANGE, "contig_range")
+ EM( MR_CONTIG_RANGE, "contig_range") \
+ EMe(MR_LONGTERM_PIN, "longterm_pin")
/*
* First define the enums in the above macros to be exported to userspace
@@ -81,6 +82,28 @@ TRACE_EVENT(mm_migrate_pages,
__print_symbolic(__entry->mode, MIGRATE_MODE),
__print_symbolic(__entry->reason, MIGRATE_REASON))
);
+
+TRACE_EVENT(mm_migrate_pages_start,
+
+ TP_PROTO(enum migrate_mode mode, int reason),
+
+ TP_ARGS(mode, reason),
+
+ TP_STRUCT__entry(
+ __field(enum migrate_mode, mode)
+ __field(int, reason)
+ ),
+
+ TP_fast_assign(
+ __entry->mode = mode;
+ __entry->reason = reason;
+ ),
+
+ TP_printk("mode=%s reason=%s",
+ __print_symbolic(__entry->mode, MIGRATE_MODE),
+ __print_symbolic(__entry->reason, MIGRATE_REASON))
+);
+
#endif /* _TRACE_MIGRATE_H */
/* This part must be outside protection */
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 67018d367b9f..629c7a0eaff2 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -137,6 +137,12 @@ IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" )
#define IF_HAVE_VM_SOFTDIRTY(flag,name)
#endif
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+# define IF_HAVE_UFFD_MINOR(flag, name) {flag, name},
+#else
+# define IF_HAVE_UFFD_MINOR(flag, name)
+#endif
+
#define __def_vmaflag_names \
{VM_READ, "read" }, \
{VM_WRITE, "write" }, \
@@ -148,6 +154,7 @@ IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" )
{VM_MAYSHARE, "mayshare" }, \
{VM_GROWSDOWN, "growsdown" }, \
{VM_UFFD_MISSING, "uffd_missing" }, \
+IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR, "uffd_minor" ) \
{VM_PFNMAP, "pfnmap" }, \
{VM_DENYWRITE, "denywrite" }, \
{VM_UFFD_WP, "uffd_wp" }, \
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 8948467b3992..4832fd0b5642 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -64,5 +64,12 @@ enum {
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
#define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */
+/*
+ * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
+ * ABI. New bits are OK, but existing bits can never change.
+ */
+#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
+#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
+#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
#endif /* _UAPI_LINUX_MEMPOLICY_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 5f2d88212f7c..bafbeb1a2624 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -19,15 +19,19 @@
* means the userland is reading).
*/
#define UFFD_API ((__u64)0xAA)
+#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \
+ UFFDIO_REGISTER_MODE_WP | \
+ UFFDIO_REGISTER_MODE_MINOR)
#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
UFFD_FEATURE_EVENT_FORK | \
UFFD_FEATURE_EVENT_REMAP | \
- UFFD_FEATURE_EVENT_REMOVE | \
+ UFFD_FEATURE_EVENT_REMOVE | \
UFFD_FEATURE_EVENT_UNMAP | \
UFFD_FEATURE_MISSING_HUGETLBFS | \
UFFD_FEATURE_MISSING_SHMEM | \
UFFD_FEATURE_SIGBUS | \
- UFFD_FEATURE_THREAD_ID)
+ UFFD_FEATURE_THREAD_ID | \
+ UFFD_FEATURE_MINOR_HUGETLBFS)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
@@ -36,10 +40,12 @@
((__u64)1 << _UFFDIO_WAKE | \
(__u64)1 << _UFFDIO_COPY | \
(__u64)1 << _UFFDIO_ZEROPAGE | \
- (__u64)1 << _UFFDIO_WRITEPROTECT)
+ (__u64)1 << _UFFDIO_WRITEPROTECT | \
+ (__u64)1 << _UFFDIO_CONTINUE)
#define UFFD_API_RANGE_IOCTLS_BASIC \
((__u64)1 << _UFFDIO_WAKE | \
- (__u64)1 << _UFFDIO_COPY)
+ (__u64)1 << _UFFDIO_COPY | \
+ (__u64)1 << _UFFDIO_CONTINUE)
/*
* Valid ioctl command number range with this API is from 0x00 to
@@ -55,6 +61,7 @@
#define _UFFDIO_COPY (0x03)
#define _UFFDIO_ZEROPAGE (0x04)
#define _UFFDIO_WRITEPROTECT (0x06)
+#define _UFFDIO_CONTINUE (0x07)
#define _UFFDIO_API (0x3F)
/* userfaultfd ioctl ids */
@@ -73,6 +80,8 @@
struct uffdio_zeropage)
#define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
struct uffdio_writeprotect)
+#define UFFDIO_CONTINUE _IOR(UFFDIO, _UFFDIO_CONTINUE, \
+ struct uffdio_continue)
/* read() structure */
struct uffd_msg {
@@ -127,6 +136,7 @@ struct uffd_msg {
/* flags for UFFD_EVENT_PAGEFAULT */
#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */
+#define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */
struct uffdio_api {
/* userland asks for an API number and the features to enable */
@@ -171,6 +181,10 @@ struct uffdio_api {
*
* UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will
* be returned, if feature is not requested 0 will be returned.
+ *
+ * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults
+ * can be intercepted (via REGISTER_MODE_MINOR) for
+ * hugetlbfs-backed pages.
*/
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1)
@@ -181,6 +195,7 @@ struct uffdio_api {
#define UFFD_FEATURE_EVENT_UNMAP (1<<6)
#define UFFD_FEATURE_SIGBUS (1<<7)
#define UFFD_FEATURE_THREAD_ID (1<<8)
+#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9)
__u64 features;
__u64 ioctls;
@@ -195,6 +210,7 @@ struct uffdio_register {
struct uffdio_range range;
#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0)
#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1)
+#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2)
__u64 mode;
/*
@@ -257,6 +273,18 @@ struct uffdio_writeprotect {
__u64 mode;
};
+struct uffdio_continue {
+ struct uffdio_range range;
+#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0)
+ __u64 mode;
+
+ /*
+ * Fields below here are written by the ioctl and must be at the end:
+ * the copy_from_user will not read past here.
+ */
+ __s64 mapped;
+};
+
/*
* Flags for the userfaultfd(2) system call itself.
*/