diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2021-04-13 18:58:33 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2021-04-13 18:58:33 +1000 |
commit | c03156f8d8b7420c9c53d5d37b5c1696a931cb6e (patch) | |
tree | ebfc85df4d9673aa974668b51aa2638a41f116da /include | |
parent | f195cda9b95d03d27906f98a61456d47ced1c35b (diff) | |
parent | 3ad29d0ccd4c42afa271bc8529fb357adff944ec (diff) | |
download | linux-next-c03156f8d8b7420c9c53d5d37b5c1696a931cb6e.tar.gz |
Merge branch 'akpm-current/current'
Diffstat (limited to 'include')
58 files changed, 928 insertions, 401 deletions
diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h index 9fdf21302fdf..0d132ee2a291 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/asm-generic/bitops/find.h @@ -2,6 +2,13 @@ #ifndef _ASM_GENERIC_BITOPS_FIND_H_ #define _ASM_GENERIC_BITOPS_FIND_H_ +extern unsigned long _find_next_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long nbits, + unsigned long start, unsigned long invert, unsigned long le); +extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); + #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region @@ -12,8 +19,22 @@ * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -extern unsigned long find_next_bit(const unsigned long *addr, unsigned long - size, unsigned long offset); +static inline +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + + return _find_next_bit(addr, NULL, size, offset, 0UL, 0); +} #endif #ifndef find_next_and_bit @@ -27,9 +48,23 @@ extern unsigned long find_next_bit(const unsigned long *addr, unsigned long * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -extern unsigned long find_next_and_bit(const unsigned long *addr1, +static inline +unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, - unsigned long offset); + unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr1 & *addr2 & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + + return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); +} #endif #ifndef find_next_zero_bit @@ -42,8 +77,22 @@ extern unsigned long find_next_and_bit(const unsigned long *addr1, * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ -extern unsigned long find_next_zero_bit(const unsigned long *addr, unsigned - long size, unsigned long offset); +static inline +unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr | ~GENMASK(size - 1, offset); + return val == ~0UL ? size : ffz(val); + } + + return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); +} #endif #ifdef CONFIG_GENERIC_FIND_FIRST_BIT @@ -56,8 +105,17 @@ extern unsigned long find_next_zero_bit(const unsigned long *addr, unsigned * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ -extern unsigned long find_first_bit(const unsigned long *addr, - unsigned long size); +static inline +unsigned long find_first_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? __ffs(val) : size; + } + + return _find_first_bit(addr, size); +} /** * find_first_zero_bit - find the first cleared bit in a memory region @@ -67,8 +125,17 @@ extern unsigned long find_first_bit(const unsigned long *addr, * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. */ -extern unsigned long find_first_zero_bit(const unsigned long *addr, - unsigned long size); +static inline +unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr | ~GENMASK(size - 1, 0); + + return val == ~0UL ? size : ffz(val); + } + + return _find_first_zero_bit(addr, size); +} #else /* CONFIG_GENERIC_FIND_FIRST_BIT */ #ifndef find_first_bit @@ -80,6 +147,27 @@ extern unsigned long find_first_zero_bit(const unsigned long *addr, #endif /* CONFIG_GENERIC_FIND_FIRST_BIT */ +#ifndef find_last_bit +/** + * find_last_bit - find the last set bit in a memory region + * @addr: The address to start the search at + * @size: The number of bits to search + * + * Returns the bit number of the last set bit, or size. + */ +static inline +unsigned long find_last_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? __fls(val) : size; + } + + return _find_last_bit(addr, size); +} +#endif + /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump diff --git a/include/asm-generic/bitops/le.h b/include/asm-generic/bitops/le.h index 188d3eba3ace..5a28629cbf4d 100644 --- a/include/asm-generic/bitops/le.h +++ b/include/asm-generic/bitops/le.h @@ -2,8 +2,10 @@ #ifndef _ASM_GENERIC_BITOPS_LE_H_ #define _ASM_GENERIC_BITOPS_LE_H_ +#include <asm-generic/bitops/find.h> #include <asm/types.h> #include <asm/byteorder.h> +#include <linux/swab.h> #if defined(__LITTLE_ENDIAN) @@ -32,13 +34,41 @@ static inline unsigned long find_first_zero_bit_le(const void *addr, #define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) #ifndef find_next_zero_bit_le -extern unsigned long find_next_zero_bit_le(const void *addr, - unsigned long size, unsigned long offset); +static inline +unsigned long find_next_zero_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val = *(const unsigned long *)addr; + + if (unlikely(offset >= size)) + return size; + + val = swab(val) | ~GENMASK(size - 1, offset); + return val == ~0UL ? size : ffz(val); + } + + return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); +} #endif #ifndef find_next_bit_le -extern unsigned long find_next_bit_le(const void *addr, - unsigned long size, unsigned long offset); +static inline +unsigned long find_next_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val = *(const unsigned long *)addr; + + if (unlikely(offset >= size)) + return size; + + val = swab(val) & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + + return _find_next_bit(addr, NULL, size, offset, 0UL, 1); +} #endif #ifndef find_first_zero_bit_le diff --git a/include/asm-generic/bitsperlong.h b/include/asm-generic/bitsperlong.h index 3905c1c93dc2..1023e2a4bd37 100644 --- a/include/asm-generic/bitsperlong.h +++ b/include/asm-generic/bitsperlong.h @@ -23,4 +23,16 @@ #define BITS_PER_LONG_LONG 64 #endif +/* + * small_const_nbits(n) is true precisely when it is known at compile-time + * that BITMAP_SIZE(n) is 1, i.e. 1 <= n <= BITS_PER_LONG. This allows + * various bit/bitmap APIs to provide a fast inline implementation. Bitmaps + * of size 0 are very rare, and a compile-time-known-size 0 is most likely + * a sign of error. They will be handled correctly by the bit/bitmap APIs, + * but using the out-of-line functions, so that the inline implementations + * can unconditionally dereference the pointer(s). + */ +#define small_const_nbits(nbits) \ + (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0) + #endif /* __ASM_GENERIC_BITS_PER_LONG */ diff --git a/include/linux/align.h b/include/linux/align.h new file mode 100644 index 000000000000..2b4acec7b95a --- /dev/null +++ b/include/linux/align.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ALIGN_H +#define _LINUX_ALIGN_H + +#include <linux/const.h> + +/* @a is a power of 2 value */ +#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) +#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) +#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) +#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) +#define PTR_ALIGN_DOWN(p, a) ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a))) +#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) + +#endif /* _LINUX_ALIGN_H */ diff --git a/include/linux/async.h b/include/linux/async.h index 0a17cd27f348..cce4ad31e8fc 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -112,7 +112,6 @@ async_schedule_dev_domain(async_func_t func, struct device *dev, return async_schedule_node_domain(func, dev, dev_to_node(dev), domain); } -void async_unregister_domain(struct async_domain *domain); extern void async_synchronize_full(void); extern void async_synchronize_full_domain(struct async_domain *domain); extern void async_synchronize_cookie(async_cookie_t cookie); diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 73d039476fa4..a36cfcec4e77 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -4,8 +4,9 @@ #ifndef __ASSEMBLY__ +#include <linux/align.h> #include <linux/bitops.h> -#include <linux/kernel.h> +#include <linux/limits.h> #include <linux/string.h> #include <linux/types.h> @@ -229,14 +230,6 @@ int bitmap_print_to_pagebuf(bool list, char *buf, #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) -/* - * The static inlines below do not handle constant nbits==0 correctly, - * so make such users (should any ever turn up) call the out-of-line - * versions. - */ -#define small_const_nbits(nbits) \ - (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0) - static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); diff --git a/include/linux/bitops.h b/include/linux/bitops.h index a5a48303b0f1..26bf15e6cd35 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -286,17 +286,5 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, }) #endif -#ifndef find_last_bit -/** - * find_last_bit - find the last set bit in a memory region - * @addr: The address to start the search at - * @size: The number of bits to search - * - * Returns the bit number of the last set bit, or size. - */ -extern unsigned long find_last_bit(const unsigned long *addr, - unsigned long size); -#endif - #endif /* __KERNEL__ */ #endif diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9a57cae9672e..d99011c4548f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -11,7 +11,6 @@ #include <linux/minmax.h> #include <linux/timer.h> #include <linux/workqueue.h> -#include <linux/pagemap.h> #include <linux/backing-dev-defs.h> #include <linux/wait.h> #include <linux/mempool.h> diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6b47f94378c5..e7e99da31349 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -194,6 +194,8 @@ void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size, struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); void invalidate_bh_lrus(void); +void invalidate_bh_lrus_cpu(int cpu); +bool has_bh_in_lru(int cpu, void *dummy); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); void unlock_buffer(struct buffer_head *bh); @@ -406,6 +408,8 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } +static inline void invalidate_bh_lrus_cpu(int cpu) {} +static inline bool has_bh_in_lru(int cpu, void *dummy) { return 0; } #define buffer_heads_over_limit 0 #endif /* CONFIG_BLOCK */ diff --git a/include/linux/cma.h b/include/linux/cma.h index 217999c8a762..53fd8c3cdbd0 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -44,9 +44,9 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, const char *name, struct cma **res_cma); -extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, +extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn); -extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); +extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); #endif diff --git a/include/linux/compaction.h b/include/linux/compaction.h index ed4070ed41ef..4221888bdcd6 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -81,7 +81,6 @@ static inline unsigned long compact_gap(unsigned int order) } #ifdef CONFIG_COMPACTION -extern int sysctl_compact_memory; extern unsigned int sysctl_compaction_proactiveness; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); diff --git a/include/linux/compat.h b/include/linux/compat.h index 5112c3e35782..197e0a180406 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -75,7 +75,6 @@ __diag_push(); \ __diag_ignore(GCC, 8, "-Wattribute-alias", \ "Type aliasing is used to sanitize syscall arguments");\ - asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ __attribute__((alias(__stringify(__se_compat_sys##name)))); \ ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO); \ diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 48750243db4c..5d97ef738a57 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -90,15 +90,11 @@ */ #define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) -/* - * sparse (__CHECKER__) pretends to be gcc, but can't do constant - * folding in __builtin_bswap*() (yet), so don't set these for it. - */ -#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) && !defined(__CHECKER__) +#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) #define __HAVE_BUILTIN_BSWAP32__ #define __HAVE_BUILTIN_BSWAP64__ #define __HAVE_BUILTIN_BSWAP16__ -#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP && !__CHECKER__ */ +#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ #if GCC_VERSION >= 70000 #define KASAN_ABI_VERSION 5 diff --git a/include/linux/crc8.h b/include/linux/crc8.h index 13c8dabb0441..674045c59a04 100644 --- a/include/linux/crc8.h +++ b/include/linux/crc8.h @@ -96,6 +96,6 @@ void crc8_populate_msb(u8 table[CRC8_TABLE_SIZE], u8 polynomial); * Williams, Ross N., ross<at>ross.net * (see URL http://www.ross.net/crc/download/crc_v3.txt). */ -u8 crc8(const u8 table[CRC8_TABLE_SIZE], u8 *pdata, size_t nbytes, u8 crc); +u8 crc8(const u8 table[CRC8_TABLE_SIZE], const u8 *pdata, size_t nbytes, u8 crc); #endif /* __CRC8_H_ */ diff --git a/include/linux/cred.h b/include/linux/cred.h index ac0e5f97d7d8..14971322e1a0 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -53,7 +53,6 @@ do { \ groups_free(group_info); \ } while (0) -extern struct group_info init_groups; #ifdef CONFIG_MULTIUSER extern struct group_info *groups_alloc(int); extern void groups_free(struct group_info *); diff --git a/include/linux/fs.h b/include/linux/fs.h index bf4e90d3ab18..acef282b97c6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -442,7 +442,6 @@ int pagecache_write_end(struct file *, struct address_space *mapping, * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. - * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock. * @writeback_index: Writeback starts here. * @a_ops: Methods. * @flags: Error bits and flags (AS_*). @@ -463,7 +462,6 @@ struct address_space { struct rb_root_cached i_mmap; struct rw_semaphore i_mmap_rwsem; unsigned long nrpages; - unsigned long nrexceptional; pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; @@ -2878,6 +2876,8 @@ static inline int filemap_fdatawait(struct address_space *mapping) extern bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); +extern bool filemap_range_needs_writeback(struct address_space *, + loff_t lstart, loff_t lend); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); extern int __filemap_fdatawrite_range(struct address_space *mapping, diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 8572a1474e16..8a5f6c3d7dba 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -515,14 +515,25 @@ static inline int arch_make_page_accessible(struct page *page) } #endif -struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, - nodemask_t *nodemask); +struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, + nodemask_t *nodemask); -static inline struct page * -__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid) +unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, + nodemask_t *nodemask, int nr_pages, + struct list_head *page_list, + struct page **page_array); + +/* Bulk allocate order-0 pages */ +static inline unsigned long +alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list) +{ + return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL); +} + +static inline unsigned long +alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { - return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL); + return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array); } /* @@ -535,7 +546,7 @@ __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid)); - return __alloc_pages(gfp_mask, order, nid); + return __alloc_pages(gfp_mask, order, nid, NULL); } /* @@ -553,13 +564,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, } #ifdef CONFIG_NUMA -extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); - -static inline struct page * -alloc_pages(gfp_t gfp_mask, unsigned int order) -{ - return alloc_pages_current(gfp_mask, order); -} +struct page *alloc_pages(gfp_t gfp, unsigned int order); extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, int node, bool hugepage); @@ -652,7 +657,7 @@ extern int alloc_contig_range(unsigned long start, unsigned long end, extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask); #endif -void free_contig_range(unsigned long pfn, unsigned int nr_pages); +void free_contig_range(unsigned long pfn, unsigned long nr_pages); #ifdef CONFIG_CMA /* CMA stuff */ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 44170f312ae7..b06eeeaee975 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -146,9 +146,9 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { - void *addr = kmap_atomic(page); + void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); - kunmap_atomic(addr); + kunmap_local(addr); } #endif @@ -199,9 +199,9 @@ alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, static inline void clear_highpage(struct page *page) { - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); clear_page(kaddr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } /* @@ -216,7 +216,7 @@ static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) { - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); unsigned int i; BUG_ON(end1 > page_size(page) || end2 > page_size(page)); @@ -227,7 +227,7 @@ static inline void zero_user_segments(struct page *page, if (end2 > start2) memset(kaddr + start2, 0, end2 - start2); - kunmap_atomic(kaddr); + kunmap_local(kaddr); for (i = 0; i < compound_nr(page); i++) flush_dcache_page(page + i); } @@ -252,11 +252,11 @@ static inline void copy_user_highpage(struct page *to, struct page *from, { char *vfrom, *vto; - vfrom = kmap_atomic(from); - vto = kmap_atomic(to); + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); - kunmap_atomic(vto); - kunmap_atomic(vfrom); + kunmap_local(vto); + kunmap_local(vfrom); } #endif @@ -267,11 +267,11 @@ static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; - vfrom = kmap_atomic(from); - vto = kmap_atomic(to); + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); copy_page(vto, vfrom); - kunmap_atomic(vto); - kunmap_atomic(vfrom); + kunmap_local(vto); + kunmap_local(vfrom); } #endif @@ -332,4 +332,11 @@ static inline void memcpy_to_page(struct page *page, size_t offset, kunmap_local(to); } +static inline void memzero_page(struct page *page, size_t offset, size_t len) +{ + char *addr = kmap_atomic(page); + memset(addr + offset, 0, len); + kunmap_atomic(addr); +} + #endif /* _LINUX_HIGHMEM_H */ diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ba973efcd369..9626fda5efce 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -87,9 +87,6 @@ enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, -#ifdef CONFIG_DEBUG_VM - TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, -#endif }; struct kobject; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cccd1aab69dd..09f1fd12a6fa 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -11,6 +11,7 @@ #include <linux/kref.h> #include <linux/pgtable.h> #include <linux/gfp.h> +#include <linux/userfaultfd_k.h> struct ctl_table; struct user_struct; @@ -134,11 +135,14 @@ void hugetlb_show_meminfo(void); unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); +#ifdef CONFIG_USERFAULTFD int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep); +#endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags); @@ -152,7 +156,8 @@ void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pud_t *pud); struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage); @@ -161,7 +166,7 @@ extern struct list_head huge_boot_pages; /* arch callbacks */ -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); @@ -187,6 +192,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); bool is_hugetlb_entry_migration(pte_t pte); +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ @@ -308,16 +314,19 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, BUG(); } +#ifdef CONFIG_USERFAULTFD static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep) { BUG(); return 0; } +#endif /* CONFIG_USERFAULTFD */ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) @@ -368,6 +377,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, return 0; } +static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { } + #endif /* !CONFIG_HUGETLB_PAGE */ /* * hugepages at page global directory. If arch support @@ -555,6 +566,7 @@ HPAGEFLAG(Freed, freed) #define HSTATE_NAME_LEN 32 /* Defines one hugetlb page size */ struct hstate { + struct mutex resize_lock; int next_nid_to_alloc; int next_nid_to_free; unsigned int order; @@ -1039,4 +1051,14 @@ static inline __init void hugetlb_cma_check(void) } #endif +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); + +#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE +/* + * ARCHes with special requirements for evicting HUGETLB backing TLB entries can + * implement this. + */ +#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) +#endif + #endif /* _LINUX_HUGETLB_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index b2412b4d4c20..40fc5813cf93 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -25,7 +25,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; extern struct nsproxy init_nsproxy; -extern struct group_info init_groups; extern struct cred init_cred; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE diff --git a/include/linux/initrd.h b/include/linux/initrd.h index 85c15717af34..1bbe9af48dc3 100644 --- a/include/linux/initrd.h +++ b/include/linux/initrd.h @@ -20,8 +20,10 @@ extern void free_initrd_mem(unsigned long, unsigned long); #ifdef CONFIG_BLK_DEV_INITRD extern void __init reserve_initrd_mem(void); +extern void wait_for_initramfs(void); #else static inline void __init reserve_initrd_mem(void) {} +static inline void wait_for_initramfs(void) {} #endif extern phys_addr_t phys_initrd_start; diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index c093e81310a9..e9743cfd8585 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -220,3 +220,6 @@ io_mapping_free(struct io_mapping *iomap) } #endif /* _LINUX_IO_MAPPING_H */ + +int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, unsigned long size); diff --git a/include/linux/io.h b/include/linux/io.h index 61ff7d6278b6..9595151d800d 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -31,15 +31,6 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end, } #endif -#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -void __init ioremap_huge_init(void); -int arch_ioremap_p4d_supported(void); -int arch_ioremap_pud_supported(void); -int arch_ioremap_pmd_supported(void); -#else -static inline void ioremap_huge_init(void) { } -#endif - /* * Managed iomap interface */ diff --git a/include/linux/kasan.h b/include/linux/kasan.h index e7c20c79b342..b1678a61e6a7 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -30,7 +30,8 @@ struct kunit_kasan_expectation { /* Software KASAN implementations use shadow memory. */ #ifdef CONFIG_KASAN_SW_TAGS -#define KASAN_SHADOW_INIT 0xFF +/* This matches KASAN_TAG_INVALID. */ +#define KASAN_SHADOW_INIT 0xFE #else #define KASAN_SHADOW_INIT 0 #endif @@ -95,6 +96,11 @@ static __always_inline bool kasan_enabled(void) return static_branch_likely(&kasan_flag_enabled); } +static inline bool kasan_has_integrated_init(void) +{ + return kasan_enabled(); +} + #else /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_enabled(void) @@ -102,6 +108,11 @@ static inline bool kasan_enabled(void) return true; } +static inline bool kasan_has_integrated_init(void) +{ + return false; +} + #endif /* CONFIG_KASAN_HW_TAGS */ slab_flags_t __kasan_never_merge(void); @@ -119,20 +130,20 @@ static __always_inline void kasan_unpoison_range(const void *addr, size_t size) __kasan_unpoison_range(addr, size); } -void __kasan_alloc_pages(struct page *page, unsigned int order); +void __kasan_alloc_pages(struct page *page, unsigned int order, bool init); static __always_inline void kasan_alloc_pages(struct page *page, - unsigned int order) + unsigned int order, bool init) { if (kasan_enabled()) - __kasan_alloc_pages(page, order); + __kasan_alloc_pages(page, order, init); } -void __kasan_free_pages(struct page *page, unsigned int order); +void __kasan_free_pages(struct page *page, unsigned int order, bool init); static __always_inline void kasan_free_pages(struct page *page, - unsigned int order) + unsigned int order, bool init) { if (kasan_enabled()) - __kasan_free_pages(page, order); + __kasan_free_pages(page, order, init); } void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, @@ -192,11 +203,13 @@ static __always_inline void * __must_check kasan_init_slab_obj( return (void *)object; } -bool __kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip); -static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object) +bool __kasan_slab_free(struct kmem_cache *s, void *object, + unsigned long ip, bool init); +static __always_inline bool kasan_slab_free(struct kmem_cache *s, + void *object, bool init) { if (kasan_enabled()) - return __kasan_slab_free(s, object, _RET_IP_); + return __kasan_slab_free(s, object, _RET_IP_, init); return false; } @@ -215,12 +228,12 @@ static __always_inline void kasan_slab_free_mempool(void *ptr) } void * __must_check __kasan_slab_alloc(struct kmem_cache *s, - void *object, gfp_t flags); + void *object, gfp_t flags, bool init); static __always_inline void * __must_check kasan_slab_alloc( - struct kmem_cache *s, void *object, gfp_t flags) + struct kmem_cache *s, void *object, gfp_t flags, bool init) { if (kasan_enabled()) - return __kasan_slab_alloc(s, object, flags); + return __kasan_slab_alloc(s, object, flags, init); return object; } @@ -276,13 +289,17 @@ static inline bool kasan_enabled(void) { return false; } +static inline bool kasan_has_integrated_init(void) +{ + return false; +} static inline slab_flags_t kasan_never_merge(void) { return 0; } static inline void kasan_unpoison_range(const void *address, size_t size) {} -static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} -static inline void kasan_free_pages(struct page *page, unsigned int order) {} +static inline void kasan_alloc_pages(struct page *page, unsigned int order, bool init) {} +static inline void kasan_free_pages(struct page *page, unsigned int order, bool init) {} static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) {} @@ -298,14 +315,14 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache, { return (void *)object; } -static inline bool kasan_slab_free(struct kmem_cache *s, void *object) +static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init) { return false; } static inline void kasan_kfree_large(void *ptr) {} static inline void kasan_slab_free_mempool(void *ptr) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, - gfp_t flags) + gfp_t flags, bool init) { return object; } @@ -330,7 +347,7 @@ static inline bool kasan_check_byte(const void *address) #endif /* CONFIG_KASAN */ -#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK +#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK) void kasan_unpoison_task_stack(struct task_struct *task); #else static inline void kasan_unpoison_task_stack(struct task_struct *task) {} diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5b7ed6dc99ac..09035ac67d4b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -3,6 +3,7 @@ #define _LINUX_KERNEL_H #include <stdarg.h> +#include <linux/align.h> #include <linux/limits.h> #include <linux/linkage.h> #include <linux/stddef.h> @@ -30,14 +31,6 @@ */ #define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) -/* @a is a power of 2 value */ -#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) -#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) -#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) -#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) -#define PTR_ALIGN_DOWN(p, a) ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a))) -#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) - /* generic data direction definitions */ #define READ 0 #define WRITE 1 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0c04d39a7967..c193be760709 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -76,10 +76,27 @@ enum mem_cgroup_events_target { }; struct memcg_vmstats_percpu { - long stat[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; - unsigned long nr_page_events; - unsigned long targets[MEM_CGROUP_NTARGETS]; + /* Local (CPU and cgroup) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[MEMCG_NR_STAT]; + unsigned long events_prev[NR_VM_EVENT_ITEMS]; + + /* Cgroup1: threshold notifications & softlimit tree updates */ + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; +}; + +struct memcg_vmstats { + /* Aggregated (CPU and subtree) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + + /* Pending child counts during tree propagation */ + long state_pending[MEMCG_NR_STAT]; + unsigned long events_pending[NR_VM_EVENT_ITEMS]; }; struct mem_cgroup_reclaim_iter { @@ -97,12 +114,13 @@ struct batched_lruvec_stat { }; /* - * Bitmap of shrinker::id corresponding to memcg-aware shrinkers, - * which have elements charged to this memcg. + * Bitmap and deferred work of shrinker::id corresponding to memcg-aware + * shrinkers, which have elements charged to this memcg. */ -struct memcg_shrinker_map { +struct shrinker_info { struct rcu_head rcu; - unsigned long map[]; + atomic_long_t *nr_deferred; + unsigned long *map; }; /* @@ -128,7 +146,7 @@ struct mem_cgroup_per_node { struct mem_cgroup_reclaim_iter iter; - struct memcg_shrinker_map __rcu *shrinker_map; + struct shrinker_info __rcu *shrinker_info; struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ @@ -287,8 +305,8 @@ struct mem_cgroup { MEMCG_PADDING(_pad1_); - atomic_long_t vmstats[MEMCG_NR_STAT]; - atomic_long_t vmevents[NR_VM_EVENT_ITEMS]; + /* memory.stat */ + struct memcg_vmstats vmstats; /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; @@ -315,10 +333,6 @@ struct mem_cgroup { atomic_t moving_account; struct task_struct *move_lock_task; - /* Legacy local VM stats and events */ - struct memcg_vmstats_percpu __percpu *vmstats_local; - - /* Subtree VM stats and events (batched updates) */ struct memcg_vmstats_percpu __percpu *vmstats_percpu; #ifdef CONFIG_CGROUP_WRITEBACK @@ -358,6 +372,62 @@ enum page_memcg_data_flags { #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) +static inline bool PageMemcgKmem(struct page *page); + +/* + * After the initialization objcg->memcg is always pointing at + * a valid memcg, but can be atomically swapped to the parent memcg. + * + * The caller must ensure that the returned memcg won't be released: + * e.g. acquire the rcu_read_lock or css_set_lock. + */ +static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) +{ + return READ_ONCE(objcg->memcg); +} + +/* + * __page_memcg - get the memory cgroup associated with a non-kmem page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages or + * kmem pages. + */ +static inline struct mem_cgroup *__page_memcg(struct page *page) +{ + unsigned long memcg_data = page->memcg_data; + + VM_BUG_ON_PAGE(PageSlab(page), page); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); + + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +/* + * __page_objcg - get the object cgroup associated with a kmem page + * @page: a pointer to the page struct + * + * Returns a pointer to the object cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper object cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages or + * LRU pages. + */ +static inline struct obj_cgroup *__page_objcg(struct page *page) +{ + unsigned long memcg_data = page->memcg_data; + + VM_BUG_ON_PAGE(PageSlab(page), page); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); + VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page); + + return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + /* * page_memcg - get the memory cgroup associated with a page * @page: a pointer to the page struct @@ -367,20 +437,23 @@ enum page_memcg_data_flags { * proper memory cgroup pointer. It's not safe to call this function * against some type of pages, e.g. slab pages or ex-slab pages. * - * Any of the following ensures page and memcg binding stability: + * For a non-kmem page any of the following ensures page and memcg binding + * stability: + * * - the page lock * - LRU isolation * - lock_page_memcg() * - exclusive reference + * + * For a kmem page a caller should hold an rcu read lock to protect memcg + * associated with a kmem page from being released. */ static inline struct mem_cgroup *page_memcg(struct page *page) { - unsigned long memcg_data = page->memcg_data; - - VM_BUG_ON_PAGE(PageSlab(page), page); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); - - return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + if (PageMemcgKmem(page)) + return obj_cgroup_memcg(__page_objcg(page)); + else + return __page_memcg(page); } /* @@ -394,11 +467,19 @@ static inline struct mem_cgroup *page_memcg(struct page *page) */ static inline struct mem_cgroup *page_memcg_rcu(struct page *page) { + unsigned long memcg_data = READ_ONCE(page->memcg_data); + VM_BUG_ON_PAGE(PageSlab(page), page); WARN_ON_ONCE(!rcu_read_lock_held()); - return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) & - ~MEMCG_DATA_FLAGS_MASK); + if (memcg_data & MEMCG_DATA_KMEM) { + struct obj_cgroup *objcg; + + objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + return obj_cgroup_memcg(objcg); + } + + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* @@ -406,15 +487,21 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page) * @page: a pointer to the page struct * * Returns a pointer to the memory cgroup associated with the page, - * or NULL. This function unlike page_memcg() can take any page + * or NULL. This function unlike page_memcg() can take any page * as an argument. It has to be used in cases when it's not known if a page - * has an associated memory cgroup pointer or an object cgroups vector. + * has an associated memory cgroup pointer or an object cgroups vector or + * an object cgroup. + * + * For a non-kmem page any of the following ensures page and memcg binding + * stability: * - * Any of the following ensures page and memcg binding stability: * - the page lock * - LRU isolation * - lock_page_memcg() * - exclusive reference + * + * For a kmem page a caller should hold an rcu read lock to protect memcg + * associated with a kmem page from being released. */ static inline struct mem_cgroup *page_memcg_check(struct page *page) { @@ -427,9 +514,17 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) if (memcg_data & MEMCG_DATA_OBJCGS) return NULL; + if (memcg_data & MEMCG_DATA_KMEM) { + struct obj_cgroup *objcg; + + objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + return obj_cgroup_memcg(objcg); + } + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } +#ifdef CONFIG_MEMCG_KMEM /* * PageMemcgKmem - check if the page has MemcgKmem flag set * @page: a pointer to the page struct @@ -444,7 +539,6 @@ static inline bool PageMemcgKmem(struct page *page) return page->memcg_data & MEMCG_DATA_KMEM; } -#ifdef CONFIG_MEMCG_KMEM /* * page_objcgs - get the object cgroups vector associated with a page * @page: a pointer to the page struct @@ -486,6 +580,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) } #else +static inline bool PageMemcgKmem(struct page *page) +{ + return false; +} + static inline struct obj_cgroup **page_objcgs(struct page *page) { return NULL; @@ -596,18 +695,15 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) } int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); +int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, + gfp_t gfp, swp_entry_t entry); +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge_list(struct list_head *page_list); void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); -static struct mem_cgroup_per_node * -mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) -{ - return memcg->nodeinfo[nid]; -} - /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec @@ -631,7 +727,7 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, if (!memcg) memcg = root_mem_cgroup; - mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + mz = memcg->nodeinfo[pgdat->node_id]; lruvec = &mz->lruvec; out: /* @@ -708,21 +804,15 @@ static inline void obj_cgroup_get(struct obj_cgroup *objcg) percpu_ref_get(&objcg->refcnt); } -static inline void obj_cgroup_put(struct obj_cgroup *objcg) +static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, + unsigned long nr) { - percpu_ref_put(&objcg->refcnt); + percpu_ref_get_many(&objcg->refcnt, nr); } -/* - * After the initialization objcg->memcg is always pointing at - * a valid memcg, but can be atomically swapped to the parent memcg. - * - * The caller must ensure that the returned memcg won't be released: - * e.g. acquire the rcu_read_lock or css_set_lock. - */ -static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) +static inline void obj_cgroup_put(struct obj_cgroup *objcg) { - return READ_ONCE(objcg->memcg); + percpu_ref_put(&objcg->refcnt); } static inline void mem_cgroup_put(struct mem_cgroup *memcg) @@ -867,43 +957,9 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); extern bool cgroup_memory_noswap; #endif -struct mem_cgroup *lock_page_memcg(struct page *page); -void __unlock_page_memcg(struct mem_cgroup *memcg); +void lock_page_memcg(struct page *page); void unlock_page_memcg(struct page *page); -/* - * idx can be of type enum memcg_stat_item or node_stat_item. - * Keep in sync with memcg_exact_page_state(). - */ -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = atomic_long_read(&memcg->vmstats[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} - -/* - * idx can be of type enum memcg_stat_item or node_stat_item. - * Keep in sync with memcg_exact_page_state(). - */ -static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, - int idx) -{ - long x = 0; - int cpu; - - for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_local->stat[idx], cpu); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} - void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); /* idx can be of type enum memcg_stat_item or node_stat_item */ @@ -979,10 +1035,6 @@ static inline void mod_memcg_lruvec_state(struct lruvec *lruvec, local_irq_restore(flags); } -unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, - gfp_t gfp_mask, - unsigned long *total_scanned); - void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); @@ -1063,13 +1115,15 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, void split_page_memcg(struct page *head, unsigned int nr); +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned); + #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 #define MEM_CGROUP_ID_MAX 0 -struct mem_cgroup; - static inline struct mem_cgroup *page_memcg(struct page *page) { return NULL; @@ -1139,6 +1193,16 @@ static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, return 0; } +static inline int mem_cgroup_swapin_charge_page(struct page *page, + struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) +{ + return 0; +} + +static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) +{ +} + static inline void mem_cgroup_uncharge(struct page *page) { } @@ -1171,6 +1235,10 @@ static inline bool lruvec_holds_page_lru_lock(struct page *page, return lruvec == &pgdat->__lruvec; } +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ +} + static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return NULL; @@ -1289,12 +1357,7 @@ mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { } -static inline struct mem_cgroup *lock_page_memcg(struct page *page) -{ - return NULL; -} - -static inline void __unlock_page_memcg(struct mem_cgroup *memcg) +static inline void lock_page_memcg(struct page *page) { } @@ -1334,17 +1397,6 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { } -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - return 0; -} - -static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, - int idx) -{ - return 0; -} - static inline void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int nr) @@ -1390,18 +1442,6 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, mod_node_page_state(page_pgdat(page), idx, val); } -static inline -unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, - gfp_t gfp_mask, - unsigned long *total_scanned) -{ - return 0; -} - -static inline void split_page_memcg(struct page *head, unsigned int nr) -{ -} - static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) @@ -1424,8 +1464,16 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } -static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +static inline void split_page_memcg(struct page *head, unsigned int nr) +{ +} + +static inline +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned) { + return 0; } #endif /* CONFIG_MEMCG */ @@ -1563,10 +1611,10 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; } -extern int memcg_expand_shrinker_maps(int new_id); - -extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id); +int alloc_shrinker_info(struct mem_cgroup *memcg); +void free_shrinker_info(struct mem_cgroup *memcg); +void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); +void reparent_shrinker_deferred(struct mem_cgroup *memcg); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; @@ -1576,8 +1624,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; } -static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id) +static inline void set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id) { } #endif diff --git a/include/linux/memory.h b/include/linux/memory.h index 4da95e684e20..97e92e8b556a 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -29,6 +29,11 @@ struct memory_block { int online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ struct device dev; + /* + * Number of vmemmap pages. These pages + * lay at the beginning of the memory block. + */ + unsigned long nr_vmemmap_pages; }; int arch_get_memory_phys_device(unsigned long start_pfn); @@ -80,7 +85,8 @@ static inline int memory_notify(unsigned long val, void *v) #else extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); -int create_memory_block_devices(unsigned long start, unsigned long size); +int create_memory_block_devices(unsigned long start, unsigned long size, + unsigned long vmemmap_pages); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7288aa5ef73b..a85d4b7d15c2 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -56,6 +56,14 @@ typedef int __bitwise mhp_t; #define MHP_MERGE_RESOURCE ((__force mhp_t)BIT(0)) /* + * We want memmap (struct page array) to be self contained. + * To do so, we will use the beginning of the hot-added range to build + * the page tables for the memmap array that describes the entire range. + * Only selected architectures support it with SPARSE_VMEMMAP. + */ +#define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1)) + +/* * Extended parameters for memory hotplug: * altmap: alternative allocator for memmap array (optional) * pgprot: page protection flags to apply to newly created page tables @@ -101,11 +109,13 @@ extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); /* VM interface that may be used by firmware interface */ extern int online_pages(unsigned long pfn, unsigned long nr_pages, - int online_type, int nid); + unsigned long nr_vmemmap_pages, int online_type, + int nid); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); extern void __offline_isolated_pages(unsigned long start_pfn, - unsigned long end_pfn); + unsigned long end_pfn, + unsigned long buddy_start_pfn); typedef void (*online_page_callback_t)(struct page *page, unsigned int order); @@ -307,7 +317,8 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} #ifdef CONFIG_MEMORY_HOTREMOVE extern void try_offline_node(int nid); -extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); +extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages, + unsigned long nr_vmemmap_pages); extern int remove_memory(int nid, u64 start, u64 size); extern void __remove_memory(int nid, u64 start, u64 size); extern int offline_and_remove_memory(int nid, u64 start, u64 size); @@ -315,7 +326,8 @@ extern int offline_and_remove_memory(int nid, u64 start, u64 size); #else static inline void try_offline_node(int nid) {} -static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) +static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages, + unsigned long nr_vmemmap_pages) { return -EINVAL; } @@ -359,6 +371,7 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_ extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); +extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f5b464daeeca..b46f63dcaed3 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -17,7 +17,7 @@ struct device; * @alloc: track pages consumed, private to vmemmap_populate() */ struct vmem_altmap { - const unsigned long base_pfn; + unsigned long base_pfn; const unsigned long end_pfn; const unsigned long reserve; unsigned long free; @@ -131,6 +131,7 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) } #ifdef CONFIG_ZONE_DEVICE +bool pfn_zone_device_reserved(unsigned long pfn); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); @@ -143,6 +144,11 @@ unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); unsigned long memremap_compat_align(void); #else +static inline bool pfn_zone_device_reserved(unsigned long pfn) +{ + return false; +} + static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3a389633b68f..4bb4e519e3f5 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -27,6 +27,7 @@ enum migrate_reason { MR_MEMPOLICY_MBIND, MR_NUMA_MISPLACED, MR_CONTIG_RANGE, + MR_LONGTERM_PIN, MR_TYPES }; @@ -43,10 +44,7 @@ extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); extern struct page *alloc_migration_target(struct page *page, unsigned long private); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); -extern void putback_movable_page(struct page *page); -extern void migrate_prep(void); -extern void migrate_prep_local(void); extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); extern int migrate_huge_page_move_mapping(struct address_space *mapping, @@ -66,9 +64,6 @@ static inline struct page *alloc_migration_target(struct page *page, static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) { return -EBUSY; } -static inline int migrate_prep(void) { return -ENOSYS; } -static inline int migrate_prep_local(void) { return -ENOSYS; } - static inline void migrate_page_states(struct page *newpage, struct page *page) { } diff --git a/include/linux/mm.h b/include/linux/mm.h index 5e4ead59e67f..c5f323b8d974 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -372,6 +372,13 @@ extern unsigned int kobjsize(const void *objp); # define VM_GROWSUP VM_NONE #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +# define VM_UFFD_MINOR_BIT 37 +# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ +#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +# define VM_UFFD_MINOR VM_NONE +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) @@ -432,8 +439,7 @@ extern unsigned int kobjsize(const void *objp); extern pgprot_t protection_map[16]; /** - * Fault flag definitions. - * + * enum fault_flag - Fault flag definitions. * @FAULT_FLAG_WRITE: Fault was a write fault. * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. @@ -464,16 +470,18 @@ extern pgprot_t protection_map[16]; * signals before a retry to make sure the continuous page faults can still be * interrupted if necessary. */ -#define FAULT_FLAG_WRITE 0x01 -#define FAULT_FLAG_MKWRITE 0x02 -#define FAULT_FLAG_ALLOW_RETRY 0x04 -#define FAULT_FLAG_RETRY_NOWAIT 0x08 -#define FAULT_FLAG_KILLABLE 0x10 -#define FAULT_FLAG_TRIED 0x20 -#define FAULT_FLAG_USER 0x40 -#define FAULT_FLAG_REMOTE 0x80 -#define FAULT_FLAG_INSTRUCTION 0x100 -#define FAULT_FLAG_INTERRUPTIBLE 0x200 +enum fault_flag { + FAULT_FLAG_WRITE = 1 << 0, + FAULT_FLAG_MKWRITE = 1 << 1, + FAULT_FLAG_ALLOW_RETRY = 1 << 2, + FAULT_FLAG_RETRY_NOWAIT = 1 << 3, + FAULT_FLAG_KILLABLE = 1 << 4, + FAULT_FLAG_TRIED = 1 << 5, + FAULT_FLAG_USER = 1 << 6, + FAULT_FLAG_REMOTE = 1 << 7, + FAULT_FLAG_INSTRUCTION = 1 << 8, + FAULT_FLAG_INTERRUPTIBLE = 1 << 9, +}; /* * The default fault flags that should be used by most of the @@ -485,6 +493,7 @@ extern pgprot_t protection_map[16]; /** * fault_flag_allow_retry_first - check ALLOW_RETRY the first time + * @flags: Fault flags. * * This is mostly used for places where we want to try to avoid taking * the mmap_lock for too long a time when waiting for another condition @@ -495,7 +504,7 @@ extern pgprot_t protection_map[16]; * Return: true if the page fault allows retry and this is the first * attempt of the fault handling; false otherwise. */ -static inline bool fault_flag_allow_retry_first(unsigned int flags) +static inline bool fault_flag_allow_retry_first(enum fault_flag flags) { return (flags & FAULT_FLAG_ALLOW_RETRY) && (!(flags & FAULT_FLAG_TRIED)); @@ -530,7 +539,7 @@ struct vm_fault { pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address */ }; - unsigned int flags; /* FAULT_FLAG_xxx flags + enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ @@ -580,7 +589,7 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); - int (*mremap)(struct vm_area_struct *area, unsigned long flags); + int (*mremap)(struct vm_area_struct *area); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not @@ -1132,6 +1141,11 @@ static inline bool is_zone_device_page(const struct page *page) } #endif +static inline bool is_zone_movable_page(const struct page *page) +{ + return page_zonenum(page) == ZONE_MOVABLE; +} + #ifdef CONFIG_DEV_PAGEMAP_OPS void free_devmap_managed_page(struct page *page); DECLARE_STATIC_KEY_FALSE(devmap_managed_key); @@ -1265,13 +1279,16 @@ static inline void put_page(struct page *page) void unpin_user_page(struct page *page); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); +void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, + bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); /** - * page_maybe_dma_pinned() - report if a page is pinned for DMA. + * page_maybe_dma_pinned - Report if a page is pinned for DMA. + * @page: The page. * * This function checks if a page has been pinned via a call to - * pin_user_pages*(). + * a function in the pin_user_pages() family. * * For non-huge pages, the return value is partially fuzzy: false is not fuzzy, * because it means "definitely not pinned for DMA", but true means "probably @@ -1289,9 +1306,8 @@ void unpin_user_pages(struct page **pages, unsigned long npages); * * For more information, please see Documentation/core-api/pin_user_pages.rst. * - * @page: pointer to page to be queried. - * @Return: True, if it is likely that the page has been "dma-pinned". - * False, if the page is definitely not dma-pinned. + * Return: True, if it is likely that the page has been "dma-pinned". + * False, if the page is definitely not dma-pinned. */ static inline bool page_maybe_dma_pinned(struct page *page) { @@ -1539,6 +1555,20 @@ static inline unsigned long page_to_section(const struct page *page) } #endif +/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ +#ifdef CONFIG_MIGRATION +static inline bool is_pinnable_page(struct page *page) +{ + return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) || + is_zero_pfn(page_to_pfn(page)); +} +#else +static inline bool is_pinnable_page(struct page *page) +{ + return true; +} +#endif + static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); @@ -1629,7 +1659,6 @@ static inline pgoff_t page_index(struct page *page) bool page_mapped(struct page *page); struct address_space *page_mapping(struct page *page); -struct address_space *page_mapping_file(struct page *page); /* * Return true only if the page has been allocated with @@ -2357,7 +2386,7 @@ extern unsigned long free_reserved_area(void *start, void *end, int poison, const char *s); extern void adjust_managed_page_count(struct page *page, long count); -extern void mem_init_print_info(const char *str); +extern void mem_init_print_info(void); extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); @@ -2731,6 +2760,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); +int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); @@ -2790,7 +2821,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO * and return without waiting upon it */ #define FOLL_POPULATE 0x40 /* fault in page */ -#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 47946cec7584..acea81553359 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -407,8 +407,13 @@ enum zone_type { * to increase the number of THP/huge pages. Notable special cases are: * * 1. Pinned pages: (long-term) pinning of movable pages might - * essentially turn such pages unmovable. Memory offlining might - * retry a long time. + * essentially turn such pages unmovable. Therefore, we do not allow + * pinning long-term pages in ZONE_MOVABLE. When pages are pinned and + * faulted, they come from the right zone right away. However, it is + * still possible that address space already has pages in + * ZONE_MOVABLE at the time when pages are pinned (i.e. user has + * touches that memory before pinning). In such case we migrate them + * to a different zone. When migration fails - pinning fails. * 2. memblock allocations: kernelcore/movablecore setups might create * situations where ZONE_MOVABLE contains unmovable allocations * after boot. Memory offlining and allocations fail early. @@ -427,6 +432,15 @@ enum zone_type { * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). + * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create + * situations where ZERO_PAGE(0) which is allocated differently + * on different platforms may end up in a movable zone. ZERO_PAGE(0) + * cannot be migrated. + * 7. Memory-hotplug: when using memmap_on_memory and onlining the + * memory to the MOVABLE zone, the vmemmap pages are also placed in + * such zone. Such pages cannot be really moved around as they are + * self-stored in the range, but they are treated as movable when + * the range they describe is about to be offlined. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 7d4ec26d8a3e..ef1e3e736e14 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -21,16 +21,17 @@ #elif MAX_NR_ZONES <= 8 #define ZONES_SHIFT 3 #else -#error ZONES_SHIFT -- too many zones configured adjust calculation +#error ZONES_SHIFT "Too many zones configured" #endif +#define ZONES_WIDTH ZONES_SHIFT + #ifdef CONFIG_SPARSEMEM #include <asm/sparsemem.h> - -/* SECTION_SHIFT #bits space required to store a section # */ #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) - -#endif /* CONFIG_SPARSEMEM */ +#else +#define SECTIONS_SHIFT 0 +#endif #ifndef BUILD_VDSO32_64 /* @@ -54,17 +55,28 @@ #define SECTIONS_WIDTH 0 #endif -#define ZONES_WIDTH ZONES_SHIFT - -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS #define NODES_WIDTH NODES_SHIFT -#else -#ifdef CONFIG_SPARSEMEM_VMEMMAP +#elif defined(CONFIG_SPARSEMEM_VMEMMAP) #error "Vmemmap: No space for nodes field in page flags" -#endif +#else #define NODES_WIDTH 0 #endif +/* + * Note that this #define MUST have a value so that it can be tested with + * the IS_ENABLED() macro. + */ +#if NODES_SHIFT != 0 && NODES_WIDTH == 0 +#define NODE_NOT_IN_PAGE_FLAGS 1 +#endif + +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) +#define KASAN_TAG_WIDTH 8 +#else +#define KASAN_TAG_WIDTH 0 +#endif + #ifdef CONFIG_NUMA_BALANCING #define LAST__PID_SHIFT 8 #define LAST__PID_MASK ((1 << LAST__PID_SHIFT)-1) @@ -77,36 +89,20 @@ #define LAST_CPUPID_SHIFT 0 #endif -#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) -#define KASAN_TAG_WIDTH 8 -#else -#define KASAN_TAG_WIDTH 0 -#endif - -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \ +#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ <= BITS_PER_LONG - NR_PAGEFLAGS #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT #else #define LAST_CPUPID_WIDTH 0 #endif -#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH+LAST_CPUPID_WIDTH+KASAN_TAG_WIDTH \ - > BITS_PER_LONG - NR_PAGEFLAGS -#error "Not enough bits in page flags" -#endif - -/* - * We are going to use the flags for the page to node mapping if its in - * there. This includes the case where there is no node, so it is implicit. - * Note that this #define MUST have a value so that it can be tested with - * the IS_ENABLED() macro. - */ -#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0) -#define NODE_NOT_IN_PAGE_FLAGS 1 +#if LAST_CPUPID_SHIFT != 0 && LAST_CPUPID_WIDTH == 0 +#define LAST_CPUPID_NOT_IN_PAGE_FLAGS #endif -#if defined(CONFIG_NUMA_BALANCING) && LAST_CPUPID_WIDTH == 0 -#define LAST_CPUPID_NOT_IN_PAGE_FLAGS +#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ + > BITS_PER_LONG - NR_PAGEFLAGS +#error "Not enough bits in page flags" #endif #endif diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 31dc6ceed3f4..350e1c9726f2 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -18,6 +18,11 @@ struct pagevec; +static inline bool mapping_empty(struct address_space *mapping) +{ + return xa_empty(&mapping->i_pages); +} + /* * Bits in mapping->flags. */ @@ -158,6 +163,16 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) void release_pages(struct page **pages, int nr); /* + * For file cache pages, return the address_space, otherwise return NULL + */ +static inline struct address_space *page_mapping_file(struct page *page) +{ + if (unlikely(PageSwapCache(page))) + return NULL; + return page_mapping(page); +} + +/* * speculatively take a reference to a page. * If the page is free (_refcount == 0), then _refcount is untouched, and 0 * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned. diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index b1cb6b753abb..ac7b38ad5903 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -7,7 +7,7 @@ struct mm_walk; /** - * mm_walk_ops - callbacks for walk_page_range + * struct mm_walk_ops - callbacks for walk_page_range * @pgd_entry: if set, called for each non-empty PGD (top-level) entry * @p4d_entry: if set, called for each non-empty P4D entry * @pud_entry: if set, called for each non-empty PUD entry @@ -71,7 +71,7 @@ enum page_walk_action { }; /** - * mm_walk - walk_page_range data + * struct mm_walk - walk_page_range data * @ops: operation to call during the walk * @mm: mm_struct representing the target process of page table walk * @pgd: pointer to PGD; only valid with no_vma (otherwise set to NULL) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5e772392a379..2194a9cd885c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1111,6 +1111,7 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, extern void untrack_pfn_moved(struct vm_area_struct *vma); #endif +#ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { @@ -1134,6 +1135,17 @@ static inline unsigned long my_zero_pfn(unsigned long addr) return zero_pfn; } #endif +#else +static inline int is_zero_pfn(unsigned long pfn) +{ + return 0; +} + +static inline unsigned long my_zero_pfn(unsigned long addr) +{ + return 0; +} +#endif /* CONFIG_MMU */ #ifdef CONFIG_MMU diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 000cc0533c33..069c7fd95396 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -32,6 +32,7 @@ struct proc_ops { ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *); ssize_t (*proc_read_iter)(struct kiocb *, struct iov_iter *); ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *); + /* mandatory unless nonseekable_open() or equivalent is used */ loff_t (*proc_lseek)(struct file *, loff_t, int); int (*proc_release)(struct inode *, struct file *); __poll_t (*proc_poll)(struct file *, struct poll_table_struct *); diff --git a/include/linux/profile.h b/include/linux/profile.h index bad18ca43150..fd18ca96f557 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -15,7 +15,6 @@ #define KVM_PROFILING 4 struct proc_dir_entry; -struct pt_regs; struct notifier_block; #if defined(CONFIG_PROFILING) && defined(CONFIG_PROC_FS) @@ -84,8 +83,6 @@ int task_handoff_unregister(struct notifier_block * n); int profile_event_register(enum profile_type, struct notifier_block * n); int profile_event_unregister(enum profile_type, struct notifier_block * n); -struct pt_regs; - #else #define prof_on 0 diff --git a/include/linux/sched.h b/include/linux/sched.h index 4aedf9b92469..395b295bf93d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -841,6 +841,10 @@ struct task_struct { /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif +#ifdef CONFIG_PAGE_OWNER + /* Used by page_owner=on to detect recursion in page tracking. */ + unsigned in_page_owner:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ @@ -968,6 +972,7 @@ struct task_struct { #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; + unsigned long killed_time; #endif /* Filesystem information: */ struct fs_struct *fs; @@ -1378,6 +1383,13 @@ struct task_struct { struct llist_head kretprobe_instances; #endif +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + unsigned long getblk_stamp; + unsigned int getblk_executed; + unsigned int getblk_bh_count; + unsigned long getblk_bh_state; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. @@ -1578,7 +1590,7 @@ extern struct pid *cad_pid; #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ +#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 90b2a0bce11c..e24b1fe348e3 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -151,12 +151,13 @@ static inline bool in_vfork(struct task_struct *tsk) * Applies per-task gfp context to the given allocation flags. * PF_MEMALLOC_NOIO implies GFP_NOIO * PF_MEMALLOC_NOFS implies GFP_NOFS + * PF_MEMALLOC_PIN implies !GFP_MOVABLE */ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); - if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))) { + if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { /* * NOIO implies both NOIO and NOFS and it is a weaker context * so always make sure it makes precedence @@ -165,6 +166,9 @@ static inline gfp_t current_gfp_context(gfp_t flags) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; + + if (pflags & PF_MEMALLOC_PIN) + flags &= ~__GFP_MOVABLE; } return flags; } @@ -271,29 +275,18 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC) | flags; } -#ifdef CONFIG_CMA -static inline unsigned int memalloc_nocma_save(void) +static inline unsigned int memalloc_pin_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_NOCMA; + unsigned int flags = current->flags & PF_MEMALLOC_PIN; - current->flags |= PF_MEMALLOC_NOCMA; + current->flags |= PF_MEMALLOC_PIN; return flags; } -static inline void memalloc_nocma_restore(unsigned int flags) -{ - current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags; -} -#else -static inline unsigned int memalloc_nocma_save(void) -{ - return 0; -} - -static inline void memalloc_nocma_restore(unsigned int flags) +static inline void memalloc_pin_restore(unsigned int flags) { + current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags; } -#endif #ifdef CONFIG_MEMCG DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 0f80123650e2..1eac79ce57d4 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -79,13 +79,14 @@ struct shrinker { #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ /* Flags */ -#define SHRINKER_NUMA_AWARE (1 << 0) -#define SHRINKER_MEMCG_AWARE (1 << 1) +#define SHRINKER_REGISTERED (1 << 0) +#define SHRINKER_NUMA_AWARE (1 << 1) +#define SHRINKER_MEMCG_AWARE (1 << 2) /* * It just makes sense when the shrinker is also MEMCG_AWARE for now, * non-MEMCG_AWARE shrinker should not have this flag set. */ -#define SHRINKER_NONSLAB (1 << 2) +#define SHRINKER_NONSLAB (1 << 3) extern int prealloc_shrinker(struct shrinker *shrinker); extern void register_shrinker_prepared(struct shrinker *shrinker); diff --git a/include/linux/slab.h b/include/linux/slab.h index 0c97d788762c..2e09a0ac5d60 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -25,6 +25,8 @@ */ /* DEBUG: Perform (expensive) checks on alloc/free */ #define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100U) +/* DEBUG: Silent bug reports */ +#define SLAB_SILENT_ERRORS ((slab_flags_t __force)0x00000200U) /* DEBUG: Red zone objs in a cache */ #define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400U) /* DEBUG: Poison objects */ diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index dcde82a4434c..e4b51bb5bb83 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -133,6 +133,8 @@ struct kmem_cache { unsigned int usersize; /* Usercopy region size */ struct kmem_cache_node *node[MAX_NUMNODES]; + + int errors; /* Number of errors in cache */ }; #ifdef CONFIG_SLUB_CPU_PARTIAL diff --git a/include/linux/smp.h b/include/linux/smp.h index 84a0b4828f66..669e35c03be2 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -56,6 +56,14 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, int smp_call_function_single_async(int cpu, call_single_data_t *csd); /* + * Cpus stopping functions in panic. All have default weak definitions. + * Architecture-dependent code may override them. + */ +void panic_smp_self_stop(void); +void nmi_panic_self_stop(struct pt_regs *regs); +void crash_smp_send_stop(void); + +/* * Call a function on all processors */ static inline void on_each_cpu(smp_call_func_t func, void *info, int wait) diff --git a/include/linux/swap.h b/include/linux/swap.h index 4cc6ec3bf0ab..144727041e78 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -10,8 +10,10 @@ #include <linux/sched.h> #include <linux/node.h> #include <linux/fs.h> +#include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/page-flags.h> +#include <uapi/linux/mempolicy.h> #include <asm/page.h> struct notifier_block; @@ -339,6 +341,20 @@ extern void lru_note_cost(struct lruvec *lruvec, bool file, extern void lru_note_cost_page(struct page *); extern void lru_cache_add(struct page *); extern void mark_page_accessed(struct page *); + +extern atomic_t lru_disable_count; + +static inline bool lru_cache_disabled(void) +{ + return atomic_read(&lru_disable_count); +} + +static inline void lru_cache_enable(void) +{ + atomic_dec(&lru_disable_count); +} + +extern void lru_cache_disable(void); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); @@ -378,6 +394,12 @@ extern int sysctl_min_slab_ratio; #define node_reclaim_mode 0 #endif +static inline bool node_reclaim_enabled(void) +{ + /* Is any node_reclaim_mode bit set? */ + return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP); +} + extern void check_move_unevictable_pages(struct pagevec *pvec); extern int kswapd_run(int nid); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index a8e5f3ea9bb2..794d1538b8ba 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -17,6 +17,9 @@ #include <linux/mm.h> #include <asm-generic/pgtable_uffd.h> +/* The set of all possible UFFD-related VM flags. */ +#define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) + /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining * new flags, since they might collide with O_* ones. We want @@ -34,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd; extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); +/* + * The mode of operation for __mcopy_atomic and its helpers. + * + * This is almost an implementation detail (mcopy_atomic below doesn't take this + * as a parameter), but it's exposed here because memory-kind-specific + * implementations (e.g. hugetlbfs) need to know the mode of operation. + */ +enum mcopy_atomic_mode { + /* A normal copy_from_user into the destination range. */ + MCOPY_ATOMIC_NORMAL, + /* Don't copy; map the destination range to the zero page. */ + MCOPY_ATOMIC_ZEROPAGE, + /* Just install pte(s) with the existing page(s) in the page cache. */ + MCOPY_ATOMIC_CONTINUE, +}; + extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, bool *mmap_changing, __u64 mode); @@ -41,6 +60,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, bool *mmap_changing); +extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, + unsigned long len, bool *mmap_changing); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, bool *mmap_changing); @@ -52,6 +73,22 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx; } +/* + * Never enable huge pmd sharing on some uffd registered vmas: + * + * - VM_UFFD_WP VMAs, because write protect information is per pgtable entry. + * + * - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for + * VMAs which share huge pmds. (If you have two mappings to the same + * underlying pages, and fault in the non-UFFD-registered one with a write, + * with huge pmd sharing this would *also* setup the second UFFD-registered + * mapping, and we'd not get minor faults.) + */ +static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); +} + static inline bool userfaultfd_missing(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_MISSING; @@ -62,6 +99,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma) return vma->vm_flags & VM_UFFD_WP; } +static inline bool userfaultfd_minor(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_UFFD_MINOR; +} + static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { @@ -76,7 +118,7 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, static inline bool userfaultfd_armed(struct vm_area_struct *vma) { - return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP); + return vma->vm_flags & __VM_UFFD_FLAGS; } extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); @@ -123,6 +165,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma) return false; } +static inline bool userfaultfd_minor(struct vm_area_struct *vma) +{ + return false; +} + static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 18e75974d4e3..ae0dd1948c2b 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -71,6 +71,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, #endif +#ifdef CONFIG_CMA + CMA_ALLOC_SUCCESS, + CMA_ALLOC_FAIL, +#endif UNEVICTABLE_PGCULLED, /* culled to noreclaim list */ UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */ UNEVICTABLE_PGRESCUED, /* rescued from noreclaim list */ @@ -121,6 +125,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, SWAP_RA, SWAP_RA_HIT, #endif +#ifdef CONFIG_X86 + DIRECT_MAP_LEVEL2_SPLIT, + DIRECT_MAP_LEVEL3_SPLIT, +#endif NR_VM_EVENT_ITEMS }; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 3de7be6dd17c..394d03cc0e92 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -26,6 +26,7 @@ struct notifier_block; /* in notifier.h */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ #define VM_FLUSH_RESET_PERMS 0x00000100 /* reset direct map and flush TLB on unmap, can't be freed in atomic context */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ +#define VM_NO_HUGE_VMAP 0x00000400 /* force PAGE_SIZE pte mapping */ /* * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC. @@ -54,6 +55,9 @@ struct vm_struct { unsigned long size; unsigned long flags; struct page **pages; +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC + unsigned int page_order; +#endif unsigned int nr_pages; phys_addr_t phys_addr; const void *caller; @@ -78,6 +82,28 @@ struct vmap_area { }; }; +/* archs that select HAVE_ARCH_HUGE_VMAP should override one or more of these */ +#ifndef arch_vmap_p4d_supported +static inline bool arch_vmap_p4d_supported(pgprot_t prot) +{ + return false; +} +#endif + +#ifndef arch_vmap_pud_supported +static inline bool arch_vmap_pud_supported(pgprot_t prot) +{ + return false; +} +#endif + +#ifndef arch_vmap_pmd_supported +static inline bool arch_vmap_pmd_supported(pgprot_t prot) +{ + return false; +} +#endif + /* * Highlevel APIs for driver use */ @@ -166,13 +192,27 @@ void free_vm_area(struct vm_struct *area); extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); +static inline bool is_vm_area_hugepages(const void *addr) +{ + /* + * This may not 100% tell if the area is mapped with > PAGE_SIZE + * page table entries, if for some reason the architecture indicates + * larger sizes are available but decides not to use them, nothing + * prevents that. This only indicates the size of the physical page + * allocated in the vmalloc layer. + */ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC + return find_vm_area(addr)->page_order > 0; +#else + return false; +#endif +} + #ifdef CONFIG_MMU -extern int map_kernel_range_noflush(unsigned long start, unsigned long size, - pgprot_t prot, struct page **pages); -int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, - struct page **pages); -extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); -extern void unmap_kernel_range(unsigned long addr, unsigned long size); +int vmap_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift); +void vunmap_range(unsigned long addr, unsigned long end); static inline void set_vm_flush_reset_perms(void *addr) { struct vm_struct *vm = find_vm_area(addr); @@ -180,19 +220,8 @@ static inline void set_vm_flush_reset_perms(void *addr) if (vm) vm->flags |= VM_FLUSH_RESET_PERMS; } + #else -static inline int -map_kernel_range_noflush(unsigned long start, unsigned long size, - pgprot_t prot, struct page **pages) -{ - return size >> PAGE_SHIFT; -} -#define map_kernel_range map_kernel_range_noflush -static inline void -unmap_kernel_range_noflush(unsigned long addr, unsigned long size) -{ -} -#define unmap_kernel_range unmap_kernel_range_noflush static inline void set_vm_flush_reset_perms(void *addr) { } diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 506d625163a1..3299cd69e4ca 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -512,16 +512,10 @@ static inline void mod_lruvec_page_state(struct page *page, #endif /* CONFIG_MEMCG */ -static inline void __inc_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - __mod_lruvec_state(lruvec, idx, 1); -} - -static inline void __dec_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) +static inline void inc_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) { - __mod_lruvec_state(lruvec, idx, -1); + mod_lruvec_state(lruvec, idx, 1); } static inline void __inc_lruvec_page_state(struct page *page, @@ -536,18 +530,6 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } -static inline void inc_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - mod_lruvec_state(lruvec, idx, 1); -} - -static inline void dec_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - mod_lruvec_state(lruvec, idx, -1); -} - static inline void inc_lruvec_page_state(struct page *page, enum node_stat_item idx) { diff --git a/include/linux/wait.h b/include/linux/wait.h index fe10e8570a52..cffb6f8d3b0b 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -555,7 +555,7 @@ do { \ ({ \ int __ret = 0; \ might_sleep(); \ - if (!(condition)) \ + if (!(condition) && (timeout)) \ __ret = __wait_event_hrtimeout(wq_head, condition, timeout, \ TASK_UNINTERRUPTIBLE); \ __ret; \ @@ -581,7 +581,7 @@ do { \ ({ \ long __ret = 0; \ might_sleep(); \ - if (!(condition)) \ + if (!(condition) && (timeout)) \ __ret = __wait_event_hrtimeout(wq, condition, timeout, \ TASK_INTERRUPTIBLE); \ __ret; \ diff --git a/include/net/page_pool.h b/include/net/page_pool.h index b5b195305346..6d517a37c18b 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -65,7 +65,7 @@ #define PP_ALLOC_CACHE_REFILL 64 struct pp_alloc_cache { u32 count; - void *cache[PP_ALLOC_CACHE_SIZE]; + struct page *cache[PP_ALLOC_CACHE_SIZE]; }; struct page_pool_params { diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 5017a8829270..c3d354702cb0 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -8,28 +8,31 @@ #include <linux/types.h> #include <linux/tracepoint.h> -TRACE_EVENT(cma_alloc, +DECLARE_EVENT_CLASS(cma_alloc_class, - TP_PROTO(unsigned long pfn, const struct page *page, - unsigned int count, unsigned int align), + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, + unsigned long count, unsigned int align), - TP_ARGS(pfn, page, count, align), + TP_ARGS(name, pfn, page, count, align), TP_STRUCT__entry( + __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) - __field(unsigned int, count) + __field(unsigned long, count) __field(unsigned int, align) ), TP_fast_assign( + __assign_str(name, name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; __entry->align = align; ), - TP_printk("pfn=%lx page=%p count=%u align=%u", + TP_printk("name=%s pfn=%lx page=%p count=%lu align=%u", + __get_str(name), __entry->pfn, __entry->page, __entry->count, @@ -38,29 +41,72 @@ TRACE_EVENT(cma_alloc, TRACE_EVENT(cma_release, - TP_PROTO(unsigned long pfn, const struct page *page, - unsigned int count), + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, + unsigned long count), - TP_ARGS(pfn, page, count), + TP_ARGS(name, pfn, page, count), TP_STRUCT__entry( + __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) - __field(unsigned int, count) + __field(unsigned long, count) ), TP_fast_assign( + __assign_str(name, name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; ), - TP_printk("pfn=%lx page=%p count=%u", + TP_printk("name=%s pfn=%lx page=%p count=%lu", + __get_str(name), __entry->pfn, __entry->page, __entry->count) ); +TRACE_EVENT(cma_alloc_start, + + TP_PROTO(const char *name, unsigned long count, unsigned int align), + + TP_ARGS(name, count, align), + + TP_STRUCT__entry( + __string(name, name) + __field(unsigned long, count) + __field(unsigned int, align) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->count = count; + __entry->align = align; + ), + + TP_printk("name=%s count=%lu align=%u", + __get_str(name), + __entry->count, + __entry->align) +); + +DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, + + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, + unsigned long count, unsigned int align), + + TP_ARGS(name, pfn, page, count, align) +); + +DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, + + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, + unsigned long count, unsigned int align), + + TP_ARGS(name, pfn, page, count, align) +); + #endif /* _TRACE_CMA_H */ /* This part must be outside protection */ diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 3a60b6b6db32..829a75692cc0 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -343,6 +343,26 @@ static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr) #define __PTR_TO_HASHVAL #endif +#define TRACE_MM_PAGES \ + EM(MM_FILEPAGES) \ + EM(MM_ANONPAGES) \ + EM(MM_SWAPENTS) \ + EMe(MM_SHMEMPAGES) + +#undef EM +#undef EMe + +#define EM(a) TRACE_DEFINE_ENUM(a); +#define EMe(a) TRACE_DEFINE_ENUM(a); + +TRACE_MM_PAGES + +#undef EM +#undef EMe + +#define EM(a) { a, #a }, +#define EMe(a) { a, #a } + TRACE_EVENT(rss_stat, TP_PROTO(struct mm_struct *mm, @@ -365,10 +385,10 @@ TRACE_EVENT(rss_stat, __entry->size = (count << PAGE_SHIFT); ), - TP_printk("mm_id=%u curr=%d member=%d size=%ldB", + TP_printk("mm_id=%u curr=%d type=%s size=%ldB", __entry->mm_id, __entry->curr, - __entry->member, + __print_symbolic(__entry->member, TRACE_MM_PAGES), __entry->size) ); #endif /* _TRACE_KMEM_H */ diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 4d434398d64d..9fb2a3bbcdfb 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -20,7 +20,8 @@ EM( MR_SYSCALL, "syscall_or_cpuset") \ EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \ EM( MR_NUMA_MISPLACED, "numa_misplaced") \ - EMe(MR_CONTIG_RANGE, "contig_range") + EM( MR_CONTIG_RANGE, "contig_range") \ + EMe(MR_LONGTERM_PIN, "longterm_pin") /* * First define the enums in the above macros to be exported to userspace @@ -81,6 +82,28 @@ TRACE_EVENT(mm_migrate_pages, __print_symbolic(__entry->mode, MIGRATE_MODE), __print_symbolic(__entry->reason, MIGRATE_REASON)) ); + +TRACE_EVENT(mm_migrate_pages_start, + + TP_PROTO(enum migrate_mode mode, int reason), + + TP_ARGS(mode, reason), + + TP_STRUCT__entry( + __field(enum migrate_mode, mode) + __field(int, reason) + ), + + TP_fast_assign( + __entry->mode = mode; + __entry->reason = reason; + ), + + TP_printk("mode=%s reason=%s", + __print_symbolic(__entry->mode, MIGRATE_MODE), + __print_symbolic(__entry->reason, MIGRATE_REASON)) +); + #endif /* _TRACE_MIGRATE_H */ /* This part must be outside protection */ diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 67018d367b9f..629c7a0eaff2 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -137,6 +137,12 @@ IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) #define IF_HAVE_VM_SOFTDIRTY(flag,name) #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +# define IF_HAVE_UFFD_MINOR(flag, name) {flag, name}, +#else +# define IF_HAVE_UFFD_MINOR(flag, name) +#endif + #define __def_vmaflag_names \ {VM_READ, "read" }, \ {VM_WRITE, "write" }, \ @@ -148,6 +154,7 @@ IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) {VM_MAYSHARE, "mayshare" }, \ {VM_GROWSDOWN, "growsdown" }, \ {VM_UFFD_MISSING, "uffd_missing" }, \ +IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR, "uffd_minor" ) \ {VM_PFNMAP, "pfnmap" }, \ {VM_DENYWRITE, "denywrite" }, \ {VM_UFFD_WP, "uffd_wp" }, \ diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 8948467b3992..4832fd0b5642 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -64,5 +64,12 @@ enum { #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ +/* + * These bit locations are exposed in the vm.zone_reclaim_mode sysctl + * ABI. New bits are OK, but existing bits can never change. + */ +#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ #endif /* _UAPI_LINUX_MEMPOLICY_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 5f2d88212f7c..bafbeb1a2624 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -19,15 +19,19 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) +#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \ + UFFDIO_REGISTER_MODE_WP | \ + UFFDIO_REGISTER_MODE_MINOR) #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ - UFFD_FEATURE_EVENT_REMOVE | \ + UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_EVENT_UNMAP | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ UFFD_FEATURE_MISSING_SHMEM | \ UFFD_FEATURE_SIGBUS | \ - UFFD_FEATURE_THREAD_ID) + UFFD_FEATURE_THREAD_ID | \ + UFFD_FEATURE_MINOR_HUGETLBFS) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -36,10 +40,12 @@ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE | \ - (__u64)1 << _UFFDIO_WRITEPROTECT) + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE) #define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ - (__u64)1 << _UFFDIO_COPY) + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_CONTINUE) /* * Valid ioctl command number range with this API is from 0x00 to @@ -55,6 +61,7 @@ #define _UFFDIO_COPY (0x03) #define _UFFDIO_ZEROPAGE (0x04) #define _UFFDIO_WRITEPROTECT (0x06) +#define _UFFDIO_CONTINUE (0x07) #define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ @@ -73,6 +80,8 @@ struct uffdio_zeropage) #define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ struct uffdio_writeprotect) +#define UFFDIO_CONTINUE _IOR(UFFDIO, _UFFDIO_CONTINUE, \ + struct uffdio_continue) /* read() structure */ struct uffd_msg { @@ -127,6 +136,7 @@ struct uffd_msg { /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ #define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ +#define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */ struct uffdio_api { /* userland asks for an API number and the features to enable */ @@ -171,6 +181,10 @@ struct uffdio_api { * * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will * be returned, if feature is not requested 0 will be returned. + * + * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults + * can be intercepted (via REGISTER_MODE_MINOR) for + * hugetlbfs-backed pages. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -181,6 +195,7 @@ struct uffdio_api { #define UFFD_FEATURE_EVENT_UNMAP (1<<6) #define UFFD_FEATURE_SIGBUS (1<<7) #define UFFD_FEATURE_THREAD_ID (1<<8) +#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) __u64 features; __u64 ioctls; @@ -195,6 +210,7 @@ struct uffdio_register { struct uffdio_range range; #define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) #define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) +#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2) __u64 mode; /* @@ -257,6 +273,18 @@ struct uffdio_writeprotect { __u64 mode; }; +struct uffdio_continue { + struct uffdio_range range; +#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 mapped; +}; + /* * Flags for the userfaultfd(2) system call itself. */ |