401 files changed, 10087 insertions, 5876 deletions
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 13100fb3c26d..0535ae1f73e5 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -59,23 +59,23 @@ num_devices parameter is optional and tells zram how many devices should be pre-created. Default: 1. 2) Set max number of compression streams - Regardless the value passed to this attribute, ZRAM will always - allocate multiple compression streams - one per online CPUs - thus - allowing several concurrent compression operations. The number of - allocated compression streams goes down when some of the CPUs - become offline. There is no single-compression-stream mode anymore, - unless you are running a UP system or has only 1 CPU online. - - To find out how many streams are currently available: +Regardless the value passed to this attribute, ZRAM will always +allocate multiple compression streams - one per online CPUs - thus +allowing several concurrent compression operations. The number of +allocated compression streams goes down when some of the CPUs +become offline. There is no single-compression-stream mode anymore, +unless you are running a UP system or has only 1 CPU online. + +To find out how many streams are currently available: cat /sys/block/zram0/max_comp_streams 3) Select compression algorithm - Using comp_algorithm device attribute one can see available and - currently selected (shown in square brackets) compression algorithms, - change selected compression algorithm (once the device is initialised - there is no way to change compression algorithm). +Using comp_algorithm device attribute one can see available and +currently selected (shown in square brackets) compression algorithms, +change selected compression algorithm (once the device is initialised +there is no way to change compression algorithm). - Examples: +Examples: #show supported compression algorithms cat /sys/block/zram0/comp_algorithm lzo [lz4] @@ -83,17 +83,27 @@ pre-created. Default: 1. #select lzo compression algorithm echo lzo > /sys/block/zram0/comp_algorithm +For the time being, the `comp_algorithm' content does not necessarily +show every compression algorithm supported by the kernel. We keep this +list primarily to simplify device configuration and one can configure +a new device with a compression algorithm that is not listed in +`comp_algorithm'. The thing is that, internally, ZRAM uses Crypto API +and, if some of the algorithms were built as modules, it's impossible +to list all of them using, for instance, /proc/crypto or any other +method. This, however, has an advantage of permitting the usage of +custom crypto compression modules (implementing S/W or H/W compression). + 4) Set Disksize - Set disk size by writing the value to sysfs node 'disksize'. - The value can be either in bytes or you can use mem suffixes. - Examples: - # Initialize /dev/zram0 with 50MB disksize - echo $((50*1024*1024)) > /sys/block/zram0/disksize +Set disk size by writing the value to sysfs node 'disksize'. +The value can be either in bytes or you can use mem suffixes. 
+Examples: + # Initialize /dev/zram0 with 50MB disksize + echo $((50*1024*1024)) > /sys/block/zram0/disksize - # Using mem suffixes - echo 256K > /sys/block/zram0/disksize - echo 512M > /sys/block/zram0/disksize - echo 1G > /sys/block/zram0/disksize + # Using mem suffixes + echo 256K > /sys/block/zram0/disksize + echo 512M > /sys/block/zram0/disksize + echo 1G > /sys/block/zram0/disksize Note: There is little point creating a zram of greater than twice the size of memory @@ -101,20 +111,20 @@ since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the size of the disk when not in use so a huge zram is wasteful. 5) Set memory limit: Optional - Set memory limit by writing the value to sysfs node 'mem_limit'. - The value can be either in bytes or you can use mem suffixes. - In addition, you could change the value in runtime. - Examples: - # limit /dev/zram0 with 50MB memory - echo $((50*1024*1024)) > /sys/block/zram0/mem_limit - - # Using mem suffixes - echo 256K > /sys/block/zram0/mem_limit - echo 512M > /sys/block/zram0/mem_limit - echo 1G > /sys/block/zram0/mem_limit - - # To disable memory limit - echo 0 > /sys/block/zram0/mem_limit +Set memory limit by writing the value to sysfs node 'mem_limit'. +The value can be either in bytes or you can use mem suffixes. +In addition, you could change the value in runtime. +Examples: + # limit /dev/zram0 with 50MB memory + echo $((50*1024*1024)) > /sys/block/zram0/mem_limit + + # Using mem suffixes + echo 256K > /sys/block/zram0/mem_limit + echo 512M > /sys/block/zram0/mem_limit + echo 1G > /sys/block/zram0/mem_limit + + # To disable memory limit + echo 0 > /sys/block/zram0/mem_limit 6) Activate: mkswap /dev/zram0 diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index a38da93865c2..5b1c27e592bf 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -195,7 +195,9 @@ prototypes: int (*releasepage) (struct page *, int); void (*freepage)(struct page *); int (*direct_IO)(struct kiocb *, struct iov_iter *iter); + bool (*isolate_page) (struct page *, isolate_mode_t); int (*migratepage)(struct address_space *, struct page *, struct page *); + void (*putback_page) (struct page *); int (*launder_page)(struct page *); int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long); int (*error_remove_page)(struct address_space *, struct page *); @@ -218,7 +220,9 @@ invalidatepage: yes releasepage: yes freepage: yes direct_IO: +isolate_page: yes migratepage: yes (both) +putback_page: yes launder_page: yes is_partially_uptodate: yes error_remove_page: yes @@ -538,13 +542,13 @@ subsequent truncate), and then return with VM_FAULT_LOCKED, and the page locked. The VM will unlock the page. ->map_pages() is called when VM asks to map easy accessible pages. -Filesystem should find and map pages associated with offsets from "pgoff" -till "max_pgoff". ->map_pages() is called with page table locked and must +Filesystem should find and map pages associated with offsets from "start_pgoff" +till "end_pgoff". ->map_pages() is called with page table locked and must not block. If it's not possible to reach a page without blocking, filesystem should skip it. Filesystem should use do_set_pte() to setup -page table entry. Pointer to entry associated with offset "pgoff" is -passed in "pte" field in vm_fault structure. Pointers to entries for other -offsets should be calculated relative to "pte". +page table entry. 
Pointer to entry associated with the page is passed in +"pte" field in fault_env structure. Pointers to entries for other offsets +should be calculated relative to "pte". ->page_mkwrite() is called when a previously read-only pte is about to become writeable. The filesystem again must ensure that there are diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 5b21ef76f751..c0727dc36271 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -267,7 +267,8 @@ among NILFS2 files can be depicted as follows: `-- file (ino=yy) ( regular file, directory, or symlink ) -For detail on the format of each file, please see include/linux/nilfs2_fs.h. +For detail on the format of each file, please see nilfs2_ondisk.h +located at include/uapi/linux directory. There are no patents or other intellectual property that we protect with regard to the design of NILFS2. It is allowed to replicate the diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index e8d00759bfa5..50fcf48f4d58 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -436,6 +436,7 @@ Private_Dirty: 0 kB Referenced: 892 kB Anonymous: 0 kB AnonHugePages: 0 kB +ShmemPmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB @@ -464,6 +465,8 @@ accessed. a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE and a page is modified, the file page is replaced by a private anonymous copy. "AnonHugePages" shows the ammount of memory backed by transparent hugepage. +"ShmemPmdMapped" shows the ammount of shared (shmem/tmpfs) memory backed by +huge pages. "Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field. @@ -868,6 +871,9 @@ VmallocTotal: 112216 kB VmallocUsed: 428 kB VmallocChunk: 111088 kB AnonHugePages: 49152 kB +ShmemHugePages: 0 kB +ShmemPmdMapped: 0 kB + MemTotal: Total usable ram (i.e. 
physical ram minus a few reserved bits and the kernel binary code) @@ -912,6 +918,9 @@ MemAvailable: An estimate of how much memory is available for starting new AnonHugePages: Non-file backed huge pages mapped into userspace page tables Mapped: files which have been mmaped, such as libraries Shmem: Total memory used by shared memory (shmem) and tmpfs +ShmemHugePages: Memory used by shared memory (shmem) and tmpfs allocated + with huge pages +ShmemPmdMapped: Shared memory mapped into userspace with huge pages Slab: in-kernel data structures cache SReclaimable: Part of Slab, that might be reclaimed, such as caches SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 70a056fe51a3..c74aba40573f 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -589,9 +589,14 @@ struct address_space_operations { int (*releasepage) (struct page *, int); void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); + /* isolate a page for migration */ + bool (*isolate_page) (struct page *, isolate_mode_t); /* migrate the contents of a page to the specified target */ int (*migratepage) (struct page *, struct page *); + /* put migration-failed page back to right list */ + void (*putback_page) (struct page *); int (*launder_page) (struct page *); + int (*is_partially_uptodate) (struct page *, unsigned long, unsigned long); void (*is_dirty_writeback) (struct page *, bool *, bool *); @@ -737,6 +742,10 @@ struct address_space_operations { and transfer data directly between the storage and the application's address space. + isolate_page: Called by the VM when isolating a movable non-lru page. + If page is successfully isolated, VM marks the page as PG_isolated + via __SetPageIsolated. + migrate_page: This is used to compact the physical memory usage. If the VM wants to relocate a page (maybe off a memory card that is signalling imminent failure) it will pass a new page @@ -744,6 +753,8 @@ struct address_space_operations { transfer any private data across and update any references that it has to the page. + putback_page: Called by the VM when isolated page's migration fails. + launder_page: Called before freeing a page - it writes back the dirty page. To prevent redirtying the page, it is kept locked during the whole operation. diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 56af5e43e9c0..81c7f2bb7daf 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -248,7 +248,7 @@ Code Seq#(hex) Include File Comments 'm' 00 drivers/scsi/megaraid/megaraid_ioctl.h conflict! 'm' 00-1F net/irda/irmod.h conflict! 'n' 00-7F linux/ncp_fs.h and fs/ncpfs/ioctl.c -'n' 80-8F linux/nilfs2_fs.h NILFS2 +'n' 80-8F uapi/linux/nilfs2_api.h NILFS2 'n' E0-FF linux/matroxfb.h matroxfb 'o' 00-1F fs/ocfs2/ocfs2_fs.h OCFS2 'o' 00-03 mtd/ubi-user.h conflict! (OCFS2 and UBI overlaps) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 71a291633828..0ebd0905f2a0 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3160,8 +3160,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. 
Format: <bool> (1/Y/y=enable, 0/N/n=disable) default: disabled - printk.time= Show timing data prefixed to each printk message line - Format: <bool> (1/Y/y=enable, 0/N/n=disable) + printk.time= Show timestamp prefixed to each printk message line + Format: <string> + (0/N/n = disable, 1/Y/y = local clock, + 2 = monotonic clock, 3 = real clock) processor.max_cstate= [HW,ACPI] Limit processor to maximum C-state diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration index fea5c0864170..94bd9c11c4e0 100644 --- a/Documentation/vm/page_migration +++ b/Documentation/vm/page_migration @@ -142,5 +142,111 @@ Steps: 20. The new page is moved to the LRU and can be scanned by the swapper etc again. -Christoph Lameter, May 8, 2006. +C. Non-LRU page migration +------------------------- + +Although the original page migration aimed at reducing the latency of memory access +for NUMA, compaction, which wants to create high-order pages, is also a main customer. + +The current problem with the implementation is that it is designed to migrate only +*LRU* pages. However, there are potential non-LRU pages which can be migrated +by drivers, for example zsmalloc and virtio-balloon pages. + +For virtio-balloon pages, some parts of the migration code path have been hooked +up and virtio-balloon-specific functions added to intercept the migration logic. +This is too specific to one driver, so other drivers that want to make their pages +movable would have to add their own specific hooks to the migration path. + +To overcome this problem, the VM supports non-LRU page migration, which provides +generic functions for non-LRU movable pages without driver-specific hooks in the +migration path. + +If a driver wants to make its own pages movable, it should define three functions +which are function pointers in struct address_space_operations. + +1. bool (*isolate_page) (struct page *page, isolate_mode_t mode); + +The VM expects the driver's isolate_page function to return *true* +if the driver isolates the page successfully. On returning true, the VM marks the page +as PG_isolated so that concurrent isolation on several CPUs skips the page. +If the driver cannot isolate the page, it should return *false*. + +Once the page is successfully isolated, the VM uses the page.lru fields, so the driver +shouldn't expect the values in those fields to be preserved. + +2. int (*migratepage) (struct address_space *mapping, + struct page *newpage, struct page *oldpage, enum migrate_mode); + +After isolation, the VM calls the driver's migratepage with the isolated page. +The job of migratepage is to move the contents of the old page to the new page +and set up the fields of struct page newpage. Keep in mind that you should +indicate to the VM that the oldpage is no longer movable via __ClearPageMovable() +under page_lock if you migrated the oldpage successfully and return +MIGRATEPAGE_SUCCESS. If the driver cannot migrate the page at the moment, it +can return -EAGAIN. On -EAGAIN, the VM will retry page migration after a short time +because it interprets -EAGAIN as a "temporary migration failure". On returning +any error other than -EAGAIN, the VM will give up the page migration without +retrying this time. + +The driver shouldn't touch the page.lru field, which the VM uses, in these functions. + +3. void (*putback_page)(struct page *); + +If migration fails on an isolated page, the VM should return the isolated page +to the driver, so the VM calls the driver's putback_page with the page whose migration +failed. In this function, the driver should put the isolated page back into its own data +structure. +4. non-LRU movable page flags + +There are two page flags for supporting non-LRU movable pages. 
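
As an illustrative aid (not part of this patch), the three callbacks above fit together roughly as follows; the "mydrv" names are hypothetical and the driver's own bookkeeping is omitted:

	#include <linux/fs.h>
	#include <linux/migrate.h>
	#include <linux/mm.h>

	/* Hypothetical driver ("mydrv") implementing the non-LRU movable
	 * page callbacks described above. */
	static bool mydrv_isolate_page(struct page *page, isolate_mode_t mode)
	{
		/* Pin the page in the driver's own structures; on returning
		 * true the VM marks it PG_isolated. */
		return true;
	}

	static int mydrv_migratepage(struct address_space *mapping,
				     struct page *newpage, struct page *oldpage,
				     enum migrate_mode mode)
	{
		/* Copy contents and switch driver references to newpage,
		 * then tell the VM that oldpage is no longer movable. */
		__ClearPageMovable(oldpage);	/* under page lock */
		return MIGRATEPAGE_SUCCESS;	/* or -EAGAIN to let the VM retry */
	}

	static void mydrv_putback_page(struct page *page)
	{
		/* Migration failed: put the page back on the driver's lists. */
	}

	static const struct address_space_operations mydrv_aops = {
		.isolate_page	= mydrv_isolate_page,
		.migratepage	= mydrv_migratepage,
		.putback_page	= mydrv_putback_page,
	};

	/* When a page becomes driver-owned and movable (with the page locked):
	 *	__SetPageMovable(page, mapping);
	 */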
+ +* PG_movable + +Driver should use the below function to make page movable under page_lock. + + void __SetPageMovable(struct page *page, struct address_space *mapping) + +It needs argument of address_space for registering migration family functions +which will be called by VM. Exactly speaking, PG_movable is not a real flag of +struct page. Rather than, VM reuses page->mapping's lower bits to represent it. + + #define PAGE_MAPPING_MOVABLE 0x2 + page->mapping = page->mapping | PAGE_MAPPING_MOVABLE; + +so driver shouldn't access page->mapping directly. Instead, driver should +use page_mapping which mask off the low two bits of page->mapping under +page lock so it can get right struct address_space. + +For testing of non-lru movable page, VM supports __PageMovable function. +However, it doesn't guarantee to identify non-lru movable page because +page->mapping field is unified with other variables in struct page. +As well, if driver releases the page after isolation by VM, page->mapping +doesn't have stable value although it has PAGE_MAPPING_MOVABLE +(Look at __ClearPageMovable). But __PageMovable is cheap to catch whether +page is LRU or non-lru movable once the page has been isolated. Because +LRU pages never can have PAGE_MAPPING_MOVABLE in page->mapping. It is also +good for just peeking to test non-lru movable pages before more expensive +checking with lock_page in pfn scanning to select victim. + +For guaranteeing non-lru movable page, VM provides PageMovable function. +Unlike __PageMovable, PageMovable functions validates page->mapping and +mapping->a_ops->isolate_page under lock_page. The lock_page prevents sudden +destroying of page->mapping. + +Driver using __SetPageMovable should clear the flag via __ClearMovablePage +under page_lock before the releasing the page. + +* PG_isolated + +To prevent concurrent isolation among several CPUs, VM marks isolated page +as PG_isolated under lock_page. So if a CPU encounters PG_isolated non-lru +movable page, it can skip it. Driver doesn't need to manipulate the flag +because VM will set/clear it automatically. Keep in mind that if driver +sees PG_isolated page, it means the page have been isolated by VM so it +shouldn't touch page.lru field. +PG_isolated is alias with PG_reclaim flag so driver shouldn't use the flag +for own purpose. + +Christoph Lameter, May 8, 2006. +Minchan Kim, Mar 28, 2016. diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 7c871d6beb63..2ec6adb5a4ce 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -9,8 +9,8 @@ using huge pages for the backing of virtual memory with huge pages that supports the automatic promotion and demotion of page sizes and without the shortcomings of hugetlbfs. -Currently it only works for anonymous memory mappings but in the -future it can expand over the pagecache layer starting with tmpfs. +Currently it only works for anonymous memory mappings and tmpfs/shmem. +But in the future it can expand to other filesystems. The reason applications are running faster is because of two factors. The first factor is almost completely irrelevant and it's not @@ -57,10 +57,6 @@ miss is going to run faster. 
feature that applies to all dynamic high order allocations in the kernel) -- this initial support only offers the feature in the anonymous memory - regions but it'd be ideal to move it to tmpfs and the pagecache - later - Transparent Hugepage Support maximizes the usefulness of free memory if compared to the reservation approach of hugetlbfs by allowing all unused memory to be used as cache or other movable (or even unmovable @@ -94,21 +90,21 @@ madvise(MADV_HUGEPAGE) on their critical mmapped regions. == sysfs == -Transparent Hugepage Support can be entirely disabled (mostly for -debugging purposes) or only enabled inside MADV_HUGEPAGE regions (to -avoid the risk of consuming more memory resources) or enabled system -wide. This can be achieved with one of: +Transparent Hugepage Support for anonymous memory can be entirely disabled +(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE +regions (to avoid the risk of consuming more memory resources) or enabled +system wide. This can be achieved with one of: echo always >/sys/kernel/mm/transparent_hugepage/enabled echo madvise >/sys/kernel/mm/transparent_hugepage/enabled echo never >/sys/kernel/mm/transparent_hugepage/enabled It's also possible to limit defrag efforts in the VM to generate -hugepages in case they're not immediately free to madvise regions or -to never try to defrag memory and simply fallback to regular pages -unless hugepages are immediately available. Clearly if we spend CPU -time to defrag memory, we would expect to gain even more by the fact -we use hugepages later instead of regular pages. This isn't always +anonymous hugepages in case they're not immediately free to madvise +regions or to never try to defrag memory and simply fallback to regular +pages unless hugepages are immediately available. Clearly if we spend CPU +time to defrag memory, we would expect to gain even more by the fact we +use hugepages later instead of regular pages. This isn't always guaranteed, but it may be more likely in case the allocation is for a MADV_HUGEPAGE region. @@ -133,9 +129,9 @@ that are have used madvise(MADV_HUGEPAGE). This is the default behaviour. "never" should be self-explanatory. -By default kernel tries to use huge zero page on read page fault. -It's possible to disable huge zero page by writing 0 or enable it -back by writing 1: +By default kernel tries to use huge zero page on read page fault to +anonymous mapping. It's possible to disable huge zero page by writing 0 +or enable it back by writing 1: echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page @@ -204,21 +200,67 @@ Support by passing the parameter "transparent_hugepage=always" or "transparent_hugepage=madvise" or "transparent_hugepage=never" (without "") to the kernel command line. +== Hugepages in tmpfs/shmem == + +You can control hugepage allocation policy in tmpfs with mount option +"huge=". It can have following values: + + - "always": + Attempt to allocate huge pages every time we need a new page; + + - "never": + Do not allocate huge pages; + + - "within_size": + Only allocate huge page if it will be fully within i_size. + Also respect fadvise()/madvise() hints; + + - "advise: + Only allocate huge pages if requested with fadvise()/madvise(); + +The default policy is "never". + +"mount -o remount,huge= /mountpoint" works fine after mount: remounting +huge=never will not attempt to break up huge pages at all, just stop more +from being allocated. 
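
As a rough userspace illustration of the mount options described above (the mountpoint path is hypothetical, not taken from this patch), the same policies can be selected with mount(2):

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* Try to use huge pages for every new tmpfs page. */
		if (mount("tmpfs", "/mnt/huge_tmp", "tmpfs", 0, "huge=always")) {
			perror("mount");
			return 1;
		}

		/* Later: stop allocating new huge pages; pages that are
		 * already huge are left alone, as noted above. */
		if (mount(NULL, "/mnt/huge_tmp", NULL, MS_REMOUNT, "huge=never")) {
			perror("remount");
			return 1;
		}
		return 0;
	}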
+ +There's also sysfs knob to control hugepage allocation policy for internal +shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount +is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or +MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem. + +In addition to policies listed above, shmem_enabled allows two further +values: + + - "deny": + For use in emergencies, to force the huge option off from + all mounts; + - "force": + Force the huge option on for all - very useful for testing; + == Need of application restart == -The transparent_hugepage/enabled values only affect future -behavior. So to make them effective you need to restart any -application that could have been using hugepages. This also applies to -the regions registered in khugepaged. +The transparent_hugepage/enabled values and tmpfs mount option only affect +future behavior. So to make them effective you need to restart any +application that could have been using hugepages. This also applies to the +regions registered in khugepaged. == Monitoring usage == -The number of transparent huge pages currently used by the system is -available by reading the AnonHugePages field in /proc/meminfo. To -identify what applications are using transparent huge pages, it is -necessary to read /proc/PID/smaps and count the AnonHugePages fields -for each mapping. Note that reading the smaps file is expensive and -reading it frequently will incur overhead. +The number of anonymous transparent huge pages currently used by the +system is available by reading the AnonHugePages field in /proc/meminfo. +To identify what applications are using anonymous transparent huge pages, +it is necessary to read /proc/PID/smaps and count the AnonHugePages fields +for each mapping. + +The number of file transparent huge pages mapped to userspace is available +by reading ShmemPmdMapped and ShmemHugePages fields in /proc/meminfo. +To identify what applications are mapping file transparent huge pages, it +is necessary to read /proc/PID/smaps and count the FileHugeMapped fields +for each mapping. + +Note that reading the smaps file is expensive and reading it +frequently will incur overhead. There are a number of counters in /proc/vmstat that may be used to monitor how successfully the system is providing huge pages for use. @@ -238,6 +280,12 @@ thp_collapse_alloc_failed is incremented if khugepaged found a range of pages that should be collapsed into one huge page but failed the allocation. +thp_file_alloc is incremented every time a file huge page is successfully +i allocated. + +thp_file_mapped is incremented every time a file huge page is mapped into + user address space. + thp_split_page is incremented every time a huge page is split into base pages. This can happen for a variety of reasons but a common reason is that a huge page is old and is being reclaimed. @@ -403,19 +451,27 @@ pages: on relevant sub-page of the compound page. - map/unmap of the whole compound page accounted in compound_mapcount - (stored in first tail page). + (stored in first tail page). For file huge pages, we also increment + ->_mapcount of all sub-pages in order to have race-free detection of + last unmap of subpages. -PageDoubleMap() indicates that ->_mapcount in all subpages is offset up by one. -This additional reference is required to get race-free detection of unmap of -subpages when we have them mapped with both PMDs and PTEs. +PageDoubleMap() indicates that the page is *possibly* mapped with PTEs. 
+ +For anonymous pages PageDoubleMap() also indicates ->_mapcount in all +subpages is offset up by one. This additional reference is required to +get race-free detection of unmap of subpages when we have them mapped with +both PMDs and PTEs. This is optimization required to lower overhead of per-subpage mapcount tracking. The alternative is alter ->_mapcount in all subpages on each map/unmap of the whole compound page. -We set PG_double_map when a PMD of the page got split for the first time, -but still have PMD mapping. The additional references go away with last -compound_mapcount. +For anonymous pages, we set PG_double_map when a PMD of the page got split +for the first time, but still have PMD mapping. The additional references +go away with last compound_mapcount. + +File pages get PG_double_map set on first map of the page with PTE and +goes away when the page gets evicted from page cache. split_huge_page internally has to distribute the refcounts in the head page to the tail pages before clearing all PG_head/tail bits from the page @@ -427,7 +483,7 @@ sum of mapcount of all sub-pages plus one (split_huge_page caller must have reference for head page). split_huge_page uses migration entries to stabilize page->_refcount and -page->_mapcount. +page->_mapcount of anonymous pages. File pages just got unmapped. We safe against physical memory scanners too: the only legitimate way scanner can get reference to a page is get_page_unless_zero(). diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt index fa3b527086fa..0026a8d33fc0 100644 --- a/Documentation/vm/unevictable-lru.txt +++ b/Documentation/vm/unevictable-lru.txt @@ -461,6 +461,27 @@ unevictable LRU is enabled, the work of compaction is mostly handled by the page migration code and the same work flow as described in MIGRATING MLOCKED PAGES will apply. +MLOCKING TRANSPARENT HUGE PAGES +------------------------------- + +A transparent huge page is represented by a single entry on an LRU list. +Therefore, we can only make unevictable an entire compound page, not +individual subpages. + +If a user tries to mlock() part of a huge page, we want the rest of the +page to be reclaimable. + +We cannot just split the page on partial mlock() as split_huge_page() can +fail and new intermittent failure mode for the syscall is undesirable. + +We handle this by keeping PTE-mapped huge pages on normal LRU lists: the +PMD on border of VM_LOCKED VMA will be split into PTE table. + +This way the huge page is accessible for vmscan. Under memory pressure the +page will be split, subpages which belong to VM_LOCKED VMAs will be moved +to unevictable LRU and the rest can be reclaimed. + +See also comment in follow_trans_huge_pmd(). 
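
A small userspace sketch of the partial-mlock case described above (assumes 2MB huge pages and 4KB base pages; purely illustrative):

	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t huge = 2UL << 20;	/* assumed PMD size */
		size_t len = 4 * huge;
		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED)
			return 1;
		madvise(p, len, MADV_HUGEPAGE);	/* ask for THP backing */
		memset(p, 1, len);		/* fault the pages in */

		/* Lock only the first base page: the PMD on the VM_LOCKED
		 * boundary is split into a PTE table, the compound page
		 * stays on the normal LRU, and the unlocked remainder is
		 * still reclaimable under memory pressure. */
		mlock(p, 4096);
		return 0;
	}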
mmap(MAP_LOCKED) SYSTEM CALL HANDLING ------------------------------------- diff --git a/MAINTAINERS b/MAINTAINERS index 760c48ca9f8b..68a90d0c2d3a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8236,8 +8236,9 @@ T: git git://github.com/konis/nilfs2.git S: Supported F: Documentation/filesystems/nilfs2.txt F: fs/nilfs2/ -F: include/linux/nilfs2_fs.h F: include/trace/events/nilfs2.h +F: include/uapi/linux/nilfs2_api.h +F: include/uapi/linux/nilfs2_ondisk.h NINJA SCSI-3 / NINJA SCSI-32Bi (16bit/CardBus) PCMCIA SCSI HOST ADAPTER DRIVER M: YOKOTA Hiroshi <yokota@netlab.is.tsukuba.ac.jp> diff --git a/arch/alpha/kernel/machvec_impl.h b/arch/alpha/kernel/machvec_impl.h index f54bdf658cd0..d3398f6ab74c 100644 --- a/arch/alpha/kernel/machvec_impl.h +++ b/arch/alpha/kernel/machvec_impl.h @@ -137,7 +137,7 @@ #define __initmv __initdata #define ALIAS_MV(x) #else -#define __initmv __initdata_refok +#define __initmv __refdata /* GCC actually has a syntax for defining aliases, but is under some delusion that you shouldn't be able to declare it extern somewhere diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 4a905bd667e2..83e9eee57a55 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -147,7 +147,7 @@ retry: /* If for any reason at all we couldn't handle the fault, make sure we exit gracefully rather than endlessly redo the fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index af63f4a13e60..e94e5aa33985 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -137,7 +137,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); /* If Pagefault was interrupted by SIGKILL, exit page fault "early" */ if (unlikely(fatal_signal_pending(current))) { diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 8be930394750..399e2f223d25 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -220,7 +220,7 @@ void __init mem_init(void) /* * free_initmem: Free all the __init memory. 
*/ -void __init_refok free_initmem(void) +void __ref free_initmem(void) { free_initmem_default(-1); } diff --git a/arch/arm/boot/dts/keystone.dtsi b/arch/arm/boot/dts/keystone.dtsi index e23f46d15c80..00cb314d5e4d 100644 --- a/arch/arm/boot/dts/keystone.dtsi +++ b/arch/arm/boot/dts/keystone.dtsi @@ -70,6 +70,14 @@ cpu_on = <0x84000003>; }; + psci { + compatible = "arm,psci"; + method = "smc"; + cpu_suspend = <0x84000001>; + cpu_off = <0x84000002>; + cpu_on = <0x84000003>; + }; + soc { #address-cells = <1>; #size-cells = <1>; diff --git a/arch/arm/configs/axm55xx_defconfig b/arch/arm/configs/axm55xx_defconfig index d3260d7d5af1..55eba3576030 100644 --- a/arch/arm/configs/axm55xx_defconfig +++ b/arch/arm/configs/axm55xx_defconfig @@ -232,7 +232,7 @@ CONFIG_NFS_FSCACHE=y CONFIG_SUNRPC_DEBUG=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig index 79de828e49ad..cc55e4252fda 100644 --- a/arch/arm/configs/bcm2835_defconfig +++ b/arch/arm/configs/bcm2835_defconfig @@ -130,7 +130,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ASCII=y CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_BOOT_PRINTK_DELAY=y CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO=y diff --git a/arch/arm/configs/colibri_pxa270_defconfig b/arch/arm/configs/colibri_pxa270_defconfig index 0b9211b2b73b..99b706260084 100644 --- a/arch/arm/configs/colibri_pxa270_defconfig +++ b/arch/arm/configs/colibri_pxa270_defconfig @@ -156,7 +156,7 @@ CONFIG_NLS_ASCII=y CONFIG_NLS_ISO8859_1=m CONFIG_NLS_ISO8859_15=m CONFIG_NLS_UTF8=m -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y diff --git a/arch/arm/configs/colibri_pxa300_defconfig b/arch/arm/configs/colibri_pxa300_defconfig index be02fe2b14cb..3a0b92b9a86b 100644 --- a/arch/arm/configs/colibri_pxa300_defconfig +++ b/arch/arm/configs/colibri_pxa300_defconfig @@ -58,7 +58,7 @@ CONFIG_INOTIFY=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_INFO=y # CONFIG_RCU_CPU_STALL_DETECTOR is not set diff --git a/arch/arm/configs/dove_defconfig b/arch/arm/configs/dove_defconfig index 701677f9248c..d952d3d911b9 100644 --- a/arch/arm/configs/dove_defconfig +++ b/arch/arm/configs/dove_defconfig @@ -119,7 +119,7 @@ CONFIG_NLS_CODEPAGE_850=y CONFIG_NLS_ISO8859_1=y CONFIG_NLS_ISO8859_2=y CONFIG_NLS_UTF8=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/arm/configs/efm32_defconfig b/arch/arm/configs/efm32_defconfig index c0dac0f0f804..306d1566d259 100644 --- a/arch/arm/configs/efm32_defconfig +++ b/arch/arm/configs/efm32_defconfig @@ -93,7 +93,7 @@ CONFIG_EXT2_FS=y CONFIG_ROMFS_FS=y CONFIG_ROMFS_BACKED_BY_MTD=y # CONFIG_NETWORK_FILESYSTEMS is not set -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig index 01986deef7c5..a5aa44c56bfe 100644 --- a/arch/arm/configs/exynos_defconfig +++ b/arch/arm/configs/exynos_defconfig @@ -245,7 +245,7 @@ CONFIG_ROOT_NFS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ASCII=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_FS=y 
CONFIG_MAGIC_SYSRQ=y diff --git a/arch/arm/configs/ezx_defconfig b/arch/arm/configs/ezx_defconfig index ea316c4b890e..c4bdecd93207 100644 --- a/arch/arm/configs/ezx_defconfig +++ b/arch/arm/configs/ezx_defconfig @@ -379,7 +379,7 @@ CONFIG_NLS_ISO8859_15=m CONFIG_NLS_KOI8_R=m CONFIG_NLS_KOI8_U=m CONFIG_NLS_UTF8=m -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/arm/configs/h5000_defconfig b/arch/arm/configs/h5000_defconfig index 37903e3f0efc..e3507cb9835a 100644 --- a/arch/arm/configs/h5000_defconfig +++ b/arch/arm/configs/h5000_defconfig @@ -66,7 +66,7 @@ CONFIG_TMPFS=y CONFIG_JFFS2_FS=y CONFIG_JFFS2_COMPRESSION_OPTIONS=y # CONFIG_NETWORK_FILESYSTEMS is not set -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_KERNEL=y # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_BUGVERBOSE is not set diff --git a/arch/arm/configs/hisi_defconfig b/arch/arm/configs/hisi_defconfig index b2e340b272ee..4dc0a4a26c2f 100644 --- a/arch/arm/configs/hisi_defconfig +++ b/arch/arm/configs/hisi_defconfig @@ -82,7 +82,7 @@ CONFIG_NFS_FS=y CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_FS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y diff --git a/arch/arm/configs/imote2_defconfig b/arch/arm/configs/imote2_defconfig index 18e59feaa307..672010d8cbaa 100644 --- a/arch/arm/configs/imote2_defconfig +++ b/arch/arm/configs/imote2_defconfig @@ -350,7 +350,7 @@ CONFIG_NLS_ISO8859_15=m CONFIG_NLS_KOI8_R=m CONFIG_NLS_KOI8_U=m CONFIG_NLS_UTF8=m -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig index 3219480b9dbf..4eaf02367ae3 100644 --- a/arch/arm/configs/imx_v6_v7_defconfig +++ b/arch/arm/configs/imx_v6_v7_defconfig @@ -368,7 +368,7 @@ CONFIG_NLS_ASCII=y CONFIG_NLS_ISO8859_1=y CONFIG_NLS_ISO8859_15=m CONFIG_NLS_UTF8=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/arm/configs/keystone_defconfig b/arch/arm/configs/keystone_defconfig index 71b42e66488a..8d257bac6989 100644 --- a/arch/arm/configs/keystone_defconfig +++ b/arch/arm/configs/keystone_defconfig @@ -198,7 +198,7 @@ CONFIG_NFSD_V3=y CONFIG_NFSD_V3_ACL=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_SHIRQ=y CONFIG_DEBUG_USER=y diff --git a/arch/arm/configs/lpc18xx_defconfig b/arch/arm/configs/lpc18xx_defconfig index 2ae00b09cfc2..c25ff23e2ac4 100644 --- a/arch/arm/configs/lpc18xx_defconfig +++ b/arch/arm/configs/lpc18xx_defconfig @@ -159,7 +159,7 @@ CONFIG_EXT2_FS=y # CONFIG_INOTIFY_USER is not set CONFIG_JFFS2_FS=y # CONFIG_NETWORK_FILESYSTEMS is not set -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set diff --git a/arch/arm/configs/magician_defconfig b/arch/arm/configs/magician_defconfig index a5b4920cd6d4..accbf109abcf 100644 --- a/arch/arm/configs/magician_defconfig +++ b/arch/arm/configs/magician_defconfig @@ -167,7 +167,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_CODEPAGE_1251=m CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_KERNEL=y # CONFIG_SCHED_DEBUG is not set CONFIG_TIMER_STATS=y diff --git a/arch/arm/configs/mmp2_defconfig 
b/arch/arm/configs/mmp2_defconfig index f1cb95e58af0..0385e4a98cb1 100644 --- a/arch/arm/configs/mmp2_defconfig +++ b/arch/arm/configs/mmp2_defconfig @@ -81,7 +81,7 @@ CONFIG_NFS_V3=y CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y diff --git a/arch/arm/configs/moxart_defconfig b/arch/arm/configs/moxart_defconfig index a3cb76cfb828..32fab5a0c30e 100644 --- a/arch/arm/configs/moxart_defconfig +++ b/arch/arm/configs/moxart_defconfig @@ -124,7 +124,7 @@ CONFIG_EXT3_FS=y CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index a25565eda306..45265ff3cdcb 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -880,7 +880,7 @@ CONFIG_ROOT_NFS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y CONFIG_LOCKUP_DETECTOR=y diff --git a/arch/arm/configs/mvebu_v7_defconfig b/arch/arm/configs/mvebu_v7_defconfig index 486a4cabb0dd..d9dfcab96da9 100644 --- a/arch/arm/configs/mvebu_v7_defconfig +++ b/arch/arm/configs/mvebu_v7_defconfig @@ -147,7 +147,7 @@ CONFIG_NLS_CODEPAGE_850=y CONFIG_NLS_ISO8859_1=y CONFIG_NLS_ISO8859_2=y CONFIG_NLS_UTF8=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig index 6e0f751be229..8a3d2f21c18f 100644 --- a/arch/arm/configs/mxs_defconfig +++ b/arch/arm/configs/mxs_defconfig @@ -166,7 +166,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_CODEPAGE_850=y CONFIG_NLS_ISO8859_1=y CONFIG_NLS_ISO8859_15=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_FRAME_WARN=2048 CONFIG_UNUSED_SYMBOLS=y diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index ac717cccd2b5..506f4fa57dc9 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -460,7 +460,7 @@ CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_SPLIT=y CONFIG_DEBUG_INFO_DWARF4=y diff --git a/arch/arm/configs/pxa168_defconfig b/arch/arm/configs/pxa168_defconfig index 74d7e0104f8d..5824b89de1c3 100644 --- a/arch/arm/configs/pxa168_defconfig +++ b/arch/arm/configs/pxa168_defconfig @@ -57,7 +57,7 @@ CONFIG_NFS_V3=y CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_PREEMPT is not set diff --git a/arch/arm/configs/pxa3xx_defconfig b/arch/arm/configs/pxa3xx_defconfig index 5f337d7ceb5b..e60d65624f35 100644 --- a/arch/arm/configs/pxa3xx_defconfig +++ b/arch/arm/configs/pxa3xx_defconfig @@ -117,7 +117,7 @@ CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_NLS=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_SHIRQ=y diff --git a/arch/arm/configs/pxa910_defconfig b/arch/arm/configs/pxa910_defconfig index 3bb7771d3c19..a2f6f08e7b5e 100644 --- a/arch/arm/configs/pxa910_defconfig +++ b/arch/arm/configs/pxa910_defconfig @@ -65,7 +65,7 @@ CONFIG_NFS_V3=y CONFIG_NFS_V3_ACL=y 
CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_PREEMPT is not set diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index dc5517eaf09f..621c4dc1c642 100644 --- a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -734,7 +734,7 @@ CONFIG_NLS_ASCII=m CONFIG_NLS_ISO8859_1=m CONFIG_NLS_ISO8859_15=m CONFIG_NLS_UTF8=m -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO=y CONFIG_FRAME_WARN=0 diff --git a/arch/arm/configs/qcom_defconfig b/arch/arm/configs/qcom_defconfig index c2dff4fd5fc4..a09a03f68daf 100644 --- a/arch/arm/configs/qcom_defconfig +++ b/arch/arm/configs/qcom_defconfig @@ -189,7 +189,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ASCII=y CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO=y CONFIG_MAGIC_SYSRQ=y diff --git a/arch/arm/configs/raumfeld_defconfig b/arch/arm/configs/raumfeld_defconfig index 3d833aea545a..b836f8f0f066 100644 --- a/arch/arm/configs/raumfeld_defconfig +++ b/arch/arm/configs/raumfeld_defconfig @@ -196,7 +196,7 @@ CONFIG_NLS_ISO8859_15=y CONFIG_NLS_KOI8_R=y CONFIG_NLS_KOI8_U=y CONFIG_NLS_UTF8=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_INFO=y # CONFIG_RCU_CPU_STALL_DETECTOR is not set diff --git a/arch/arm/configs/shmobile_defconfig b/arch/arm/configs/shmobile_defconfig index baa07a46a88b..fbe95719397e 100644 --- a/arch/arm/configs/shmobile_defconfig +++ b/arch/arm/configs/shmobile_defconfig @@ -211,7 +211,7 @@ CONFIG_NFS_V4_1=y CONFIG_ROOT_NFS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set # CONFIG_ARM_UNWIND is not set diff --git a/arch/arm/configs/socfpga_defconfig b/arch/arm/configs/socfpga_defconfig index 753f1a5defc7..e277dc607aff 100644 --- a/arch/arm/configs/socfpga_defconfig +++ b/arch/arm/configs/socfpga_defconfig @@ -101,7 +101,7 @@ CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_MAGIC_SYSRQ=y CONFIG_DETECT_HUNG_TASK=y diff --git a/arch/arm/configs/stm32_defconfig b/arch/arm/configs/stm32_defconfig index 1e5ec2a0e4cf..44b6b037cd3f 100644 --- a/arch/arm/configs/stm32_defconfig +++ b/arch/arm/configs/stm32_defconfig @@ -61,7 +61,7 @@ CONFIG_STM32_DMA=y # CONFIG_DNOTIFY is not set # CONFIG_INOTIFY_USER is not set CONFIG_NLS=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set diff --git a/arch/arm/configs/sunxi_defconfig b/arch/arm/configs/sunxi_defconfig index 714da336ec86..6a1447cf8feb 100644 --- a/arch/arm/configs/sunxi_defconfig +++ b/arch/arm/configs/sunxi_defconfig @@ -148,6 +148,6 @@ CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_FS=y CONFIG_CRYPTO_DEV_SUN4I_SS=y diff --git a/arch/arm/configs/tegra_defconfig b/arch/arm/configs/tegra_defconfig index 6012a1ec779f..bc80a47957c9 100644 --- a/arch/arm/configs/tegra_defconfig +++ b/arch/arm/configs/tegra_defconfig @@ -287,7 +287,7 @@ CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y 
diff --git a/arch/arm/configs/u300_defconfig b/arch/arm/configs/u300_defconfig index aaa95ab606a8..77460061cba4 100644 --- a/arch/arm/configs/u300_defconfig +++ b/arch/arm/configs/u300_defconfig @@ -63,7 +63,7 @@ CONFIG_VFAT_FS=y CONFIG_TMPFS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_FS=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/arm/configs/u8500_defconfig b/arch/arm/configs/u8500_defconfig index b7b09189f1c5..d95796881b89 100644 --- a/arch/arm/configs/u8500_defconfig +++ b/arch/arm/configs/u8500_defconfig @@ -128,7 +128,7 @@ CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y diff --git a/arch/arm/configs/vt8500_v6_v7_defconfig b/arch/arm/configs/vt8500_v6_v7_defconfig index 1bfaa7bfc392..d5a2cc807940 100644 --- a/arch/arm/configs/vt8500_v6_v7_defconfig +++ b/arch/arm/configs/vt8500_v6_v7_defconfig @@ -84,6 +84,6 @@ CONFIG_NFS_FS=y CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_KERNEL=y CONFIG_LOCKUP_DETECTOR=y diff --git a/arch/arm/configs/xcep_defconfig b/arch/arm/configs/xcep_defconfig index 721832ffe2d7..5c8435db7ae9 100644 --- a/arch/arm/configs/xcep_defconfig +++ b/arch/arm/configs/xcep_defconfig @@ -85,7 +85,7 @@ CONFIG_NFS_V3=y CONFIG_NLS=m CONFIG_NLS_DEFAULT="utf8" CONFIG_NLS_UTF8=m -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_KERNEL=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/arm/configs/zx_defconfig b/arch/arm/configs/zx_defconfig index d6253a48a9fa..2684bfa04232 100644 --- a/arch/arm/configs/zx_defconfig +++ b/arch/arm/configs/zx_defconfig @@ -97,7 +97,7 @@ CONFIG_TMPFS_POSIX_ACL=y CONFIG_NLS_CODEPAGE_936=y CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_INFO=y diff --git a/arch/arm/include/asm/kexec.h b/arch/arm/include/asm/kexec.h index c2b9b4bdec00..1869af6bac5c 100644 --- a/arch/arm/include/asm/kexec.h +++ b/arch/arm/include/asm/kexec.h @@ -53,6 +53,30 @@ static inline void crash_setup_regs(struct pt_regs *newregs, /* Function pointer to optional machine-specific reinitialization */ extern void (*kexec_reinit)(void); +static inline unsigned long phys_to_boot_phys(phys_addr_t phys) +{ + return phys_to_idmap(phys); +} +#define phys_to_boot_phys phys_to_boot_phys + +static inline phys_addr_t boot_phys_to_phys(unsigned long entry) +{ + return idmap_to_phys(entry); +} +#define boot_phys_to_phys boot_phys_to_phys + +static inline unsigned long page_to_boot_pfn(struct page *page) +{ + return page_to_pfn(page) + (arch_phys_to_idmap_offset >> PAGE_SHIFT); +} +#define page_to_boot_pfn page_to_boot_pfn + +static inline struct page *boot_pfn_to_page(unsigned long boot_pfn) +{ + return pfn_to_page(boot_pfn - (arch_phys_to_idmap_offset >> PAGE_SHIFT)); +} +#define boot_pfn_to_page boot_pfn_to_page + #endif /* __ASSEMBLY__ */ #endif /* CONFIG_KEXEC */ diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index 4355f0ec44d6..f98baaec0a15 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -17,6 +17,8 @@ #ifndef __ASSEMBLY__ +#include <linux/personality.h> /* For READ_IMPLIES_EXEC */ + #ifndef CONFIG_MMU #include <asm/page-nommu.h> diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index 
20febb368844..b2902a5cd780 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -57,7 +57,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) static inline void clean_pte_table(pte_t *pte) { diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 3cadb726ec88..1e25cd80589e 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -209,17 +209,38 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) tlb_flush(tlb); } -static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { + if (tlb->nr == tlb->max) + return true; tlb->pages[tlb->nr++] = page; - VM_BUG_ON(tlb->nr > tlb->max); - return tlb->max - tlb->nr; + return false; } static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { - if (!__tlb_remove_page(tlb, page)) + if (__tlb_remove_page(tlb, page)) { tlb_flush_mmu(tlb); + __tlb_remove_page(tlb, page); + } +} + +static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, + struct page *page, int page_size) +{ + return __tlb_remove_page(tlb, page); +} + +static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, + struct page *page) +{ + return __tlb_remove_page(tlb, page); +} + +static inline void tlb_remove_page_size(struct mmu_gather *tlb, + struct page *page, int page_size) +{ + return tlb_remove_page(tlb, page); } static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index 59fd0e24c56b..b18c1ea56bed 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -57,7 +57,7 @@ int machine_kexec_prepare(struct kimage *image) for (i = 0; i < image->nr_segments; i++) { current_segment = &image->segment[i]; - if (!memblock_is_region_memory(current_segment->mem, + if (!memblock_is_region_memory(idmap_to_phys(current_segment->mem), current_segment->memsz)) return -EINVAL; diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 4289cf456b80..aca999e17184 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -848,10 +848,29 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) kernel_data.end = virt_to_phys(_end - 1); for_each_memblock(memory, region) { + phys_addr_t start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); + phys_addr_t end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; + unsigned long boot_alias_start; + + /* + * Some systems have a special memory alias which is only + * used for booting. We need to advertise this region to + * kexec-tools so they know where bootable RAM is located. 
+ */ + boot_alias_start = phys_to_idmap(start); + if (arm_has_idmap_alias() && boot_alias_start != IDMAP_INVALID_ADDR) { + res = memblock_virt_alloc(sizeof(*res), 0); + res->name = "System RAM (boot alias)"; + res->start = boot_alias_start; + res->end = phys_to_idmap(end); + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + request_resource(&iomem_resource, res); + } + res = memblock_virt_alloc(sizeof(*res), 0); res->name = "System RAM"; - res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); - res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; + res->start = start; + res->end = end; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; request_resource(&iomem_resource, res); @@ -1000,9 +1019,25 @@ static void __init reserve_crashkernel(void) (unsigned long)(crash_base >> 20), (unsigned long)(total_mem >> 20)); + /* The crashk resource must always be located in normal mem */ crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; insert_resource(&iomem_resource, &crashk_res); + + if (arm_has_idmap_alias()) { + /* + * If we have a special RAM alias for use at boot, we + * need to advertise to kexec tools where the alias is. + */ + static struct resource crashk_boot_res = { + .name = "Crash kernel (boot alias)", + .flags = IORESOURCE_BUSY | IORESOURCE_MEM, + }; + + crashk_boot_res.start = phys_to_idmap(crash_base); + crashk_boot_res.end = crashk_boot_res.start + crash_size - 1; + insert_resource(&iomem_resource, &crashk_boot_res); + } } #else static inline void reserve_crashkernel(void) {} diff --git a/arch/arm/mach-integrator/impd1.c b/arch/arm/mach-integrator/impd1.c index 38b0da300dd5..ed9a01484030 100644 --- a/arch/arm/mach-integrator/impd1.c +++ b/arch/arm/mach-integrator/impd1.c @@ -320,11 +320,11 @@ static struct impd1_device impd1_devs[] = { #define IMPD1_VALID_IRQS 0x00000bffU /* - * As this module is bool, it is OK to have this as __init_refok() - no + * As this module is bool, it is OK to have this as __ref() - no * probe calls will be done after the initial system bootup, as devices * are discovered as part of the machine startup. */ -static int __init_refok impd1_probe(struct lm_device *dev) +static int __ref impd1_probe(struct lm_device *dev) { struct impd1_module *impd1; int irq_base; diff --git a/arch/arm/mach-mv78xx0/common.c b/arch/arm/mach-mv78xx0/common.c index 45a05207b418..6af5430d0d97 100644 --- a/arch/arm/mach-mv78xx0/common.c +++ b/arch/arm/mach-mv78xx0/common.c @@ -343,7 +343,7 @@ void __init mv78xx0_init_early(void) DDR_WINDOW_CPU1_BASE, DDR_WINDOW_CPU_SZ); } -void __init_refok mv78xx0_timer_init(void) +void __ref mv78xx0_timer_init(void) { orion_time_init(BRIDGE_VIRT_BASE, BRIDGE_INT_TIMER1_CLR, IRQ_MV78XX0_TIMER_1, get_tclk()); diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index ff7ed5697d3e..4abc50952451 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -621,7 +621,7 @@ static void __free_from_contiguous(struct device *dev, struct page *page, dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT); } -static inline pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot) +static inline pgprot_t __get_dma_pgprot(const struct dma_attrs *attrs, pgprot_t prot) { prot = dma_get_attr(DMA_ATTR_WRITE_COMBINE, attrs) ? 
pgprot_writecombine(prot) : @@ -732,7 +732,7 @@ static struct arm_dma_allocator remap_allocator = { static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, pgprot_t prot, bool is_coherent, - struct dma_attrs *attrs, const void *caller) + const struct dma_attrs *attrs, const void *caller) { u64 mask = get_coherent_dma_mask(dev); struct page *page = NULL; @@ -878,7 +878,7 @@ int arm_dma_mmap(struct device *dev, struct vm_area_struct *vma, * Free a buffer as defined by the above mapping. */ static void __arm_dma_free(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t handle, struct dma_attrs *attrs, + dma_addr_t handle, const struct dma_attrs *attrs, bool is_coherent) { struct page *page = pfn_to_page(dma_to_pfn(dev, handle)); @@ -1253,7 +1253,8 @@ static inline void __free_iova(struct dma_iommu_mapping *mapping, static const int iommu_order_array[] = { 9, 8, 4, 0 }; static struct page **__iommu_alloc_buffer(struct device *dev, size_t size, - gfp_t gfp, struct dma_attrs *attrs) + gfp_t gfp, + const struct dma_attrs *attrs) { struct page **pages; int count = size >> PAGE_SHIFT; @@ -1342,7 +1343,7 @@ error: } static int __iommu_free_buffer(struct device *dev, struct page **pages, - size_t size, struct dma_attrs *attrs) + size_t size, const struct dma_attrs *attrs) { int count = size >> PAGE_SHIFT; int i; @@ -1439,7 +1440,8 @@ static struct page **__atomic_get_pages(void *addr) return (struct page **)page; } -static struct page **__iommu_get_pages(void *cpu_addr, struct dma_attrs *attrs) +static struct page **__iommu_get_pages(void *cpu_addr, + const struct dma_attrs *attrs) { struct vm_struct *area; @@ -1633,8 +1635,8 @@ static int __dma_direction_to_prot(enum dma_data_direction dir) */ static int __map_sg_chunk(struct device *dev, struct scatterlist *sg, size_t size, dma_addr_t *handle, - enum dma_data_direction dir, struct dma_attrs *attrs, - bool is_coherent) + enum dma_data_direction dir, + const struct dma_attrs *attrs, bool is_coherent) { struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(dev); dma_addr_t iova, iova_base; @@ -1676,8 +1678,8 @@ fail: } static int __iommu_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, struct dma_attrs *attrs, - bool is_coherent) + enum dma_data_direction dir, + const struct dma_attrs *attrs, bool is_coherent) { struct scatterlist *s = sg, *dma = sg, *start = sg; int i, count = 0; @@ -1758,8 +1760,8 @@ int arm_iommu_map_sg(struct device *dev, struct scatterlist *sg, } static void __iommu_unmap_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, struct dma_attrs *attrs, - bool is_coherent) + int nents, enum dma_data_direction dir, + const struct dma_attrs *attrs, bool is_coherent) { struct scatterlist *s; int i; diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index ad5841856007..3a2e678b8d30 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -243,7 +243,7 @@ good_area: goto out; } - return handle_mm_fault(mm, vma, addr & PAGE_MASK, flags); + return handle_mm_fault(vma, addr & PAGE_MASK, flags); check_stack: /* Don't allow expansion below FIRST_USER_ADDRESS */ diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index b8d477321730..c1c1a5c67da1 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -23,7 +23,7 @@ #define __pgd_alloc() kmalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL) #define __pgd_free(pgd) kfree(pgd) #else -#define __pgd_alloc() (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_REPEAT, 2) +#define 
__pgd_alloc() (pgd_t *)__get_free_pages(GFP_KERNEL, 2) #define __pgd_free(pgd) free_pages((unsigned long)pgd, 2) #endif diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 5c5970a1c2d2..e69051098435 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -361,7 +361,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_VIRTUALIZATION=y CONFIG_KVM=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 46a4157adc17..4499239e3ba9 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -32,7 +32,7 @@ static int swiotlb __read_mostly; -static pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot, +static pgprot_t __get_dma_pgprot(const struct dma_attrs *attrs, pgprot_t prot, bool coherent) { if (!coherent || dma_get_attr(DMA_ATTR_WRITE_COMBINE, attrs)) @@ -91,7 +91,7 @@ static int __free_from_pool(void *start, size_t size) static void *__dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flags, - struct dma_attrs *attrs) + const struct dma_attrs *attrs) { if (dev == NULL) { WARN_ONCE(1, "Use an actual device structure for DMA allocation\n"); @@ -121,7 +121,7 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size, static void __dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, - struct dma_attrs *attrs) + const struct dma_attrs *attrs) { bool freed; phys_addr_t paddr = dma_to_phys(dev, dma_handle); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index fc5a34a72c6d..ae24a481811e 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -231,7 +231,7 @@ good_area: goto out; } - return handle_mm_fault(mm, vma, addr & PAGE_MASK, mm_flags); + return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags); check_stack: if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr)) diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index c03533937a9f..a4b7edac8f10 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -134,7 +134,7 @@ good_area: * sure we exit gracefully rather than endlessly redo the * fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/blackfin/mm/init.c b/arch/blackfin/mm/init.c index 166842de3dc7..b59cd7c3261a 100644 --- a/arch/blackfin/mm/init.c +++ b/arch/blackfin/mm/init.c @@ -112,7 +112,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) } #endif -void __init_refok free_initmem(void) +void __ref free_initmem(void) { #if defined CONFIG_RAMKERNEL && !defined CONFIG_MPU free_initmem_default(-1); diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 3066d40a6db1..112ef26c7f2e 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -168,7 +168,7 @@ retry: * the fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index 61d99767fe16..614a46c413d2 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -164,7 +164,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ - fault = handle_mm_fault(mm, vma, ear0, flags); + fault = handle_mm_fault(vma, ear0, flags); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index 88977e42af0a..192584d5ac2f 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -93,7 +93,7 @@ void __init mem_init(void) * Todo: free pages between __init_begin and __init_end; possibly * some devtree related stuff as well. */ -void __init_refok free_initmem(void) +void __ref free_initmem(void) { } diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index 8704c9320032..bd7c251e2bce 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -101,7 +101,7 @@ good_area: break; } - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index 39d64e0df1de..77e541cf0e5d 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -205,17 +205,18 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) * must be delayed until after the TLB has been flushed (see comments at the beginning of * this file). */ -static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { + if (tlb->nr == tlb->max) + return true; + tlb->need_flush = 1; if (!tlb->nr && tlb->pages == tlb->local) __tlb_alloc_page(tlb); tlb->pages[tlb->nr++] = page; - VM_BUG_ON(tlb->nr > tlb->max); - - return tlb->max - tlb->nr; + return false; } static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) @@ -235,8 +236,28 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb) static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { - if (!__tlb_remove_page(tlb, page)) + if (__tlb_remove_page(tlb, page)) { tlb_flush_mmu(tlb); + __tlb_remove_page(tlb, page); + } +} + +static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, + struct page *page, int page_size) +{ + return __tlb_remove_page(tlb, page); +} + +static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, + struct page *page) +{ + return __tlb_remove_page(tlb, page); +} + +static inline void tlb_remove_page_size(struct mmu_gather *tlb, + struct page *page, int page_size) +{ + return tlb_remove_page(tlb, page); } /* diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c index b72cd7a07222..599507bcec91 100644 --- a/arch/ia64/kernel/machine_kexec.c +++ b/arch/ia64/kernel/machine_kexec.c @@ -163,7 +163,7 @@ void arch_crash_save_vmcoreinfo(void) #endif } -unsigned long paddr_vmcoreinfo_note(void) +phys_addr_t paddr_vmcoreinfo_note(void) { return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note); } diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 07a4e32ae96a..eb9220cde76c 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1831,7 +1831,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset, } /* Caller prevents this from being called after init */ -static void * __init_refok mca_bootmem(void) +static void * __ref mca_bootmem(void) { return __alloc_bootmem(sizeof(struct ia64_mca_cpu), KERNEL_STACK_SIZE, 0); diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 70b40d1205a6..fa6ad95e992e 100644 --- a/arch/ia64/mm/fault.c +++ 
b/arch/ia64/mm/fault.c @@ -159,7 +159,7 @@ retry: * sure we exit gracefully rather than endlessly redo the * fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/m32r/kernel/m32r_ksyms.c b/arch/m32r/kernel/m32r_ksyms.c index b727e693c805..23f26f4adfff 100644 --- a/arch/m32r/kernel/m32r_ksyms.c +++ b/arch/m32r/kernel/m32r_ksyms.c @@ -41,6 +41,9 @@ EXPORT_SYMBOL(cpu_data); EXPORT_SYMBOL(smp_flush_tlb_page); #endif +extern int __ucmpdi2(unsigned long long a, unsigned long long b); +EXPORT_SYMBOL(__ucmpdi2); + /* compiler generated symbol */ extern void __ashldi3(void); extern void __ashrdi3(void); diff --git a/arch/m32r/lib/Makefile b/arch/m32r/lib/Makefile index d16b4e40d1ae..5889eb9610b5 100644 --- a/arch/m32r/lib/Makefile +++ b/arch/m32r/lib/Makefile @@ -3,5 +3,5 @@ # lib-y := checksum.o ashxdi3.o memset.o memcpy.o \ - delay.o strlen.o usercopy.o csum_partial_copy.o - + delay.o strlen.o usercopy.o csum_partial_copy.o \ + ucmpdi2.o diff --git a/arch/m32r/lib/libgcc.h b/arch/m32r/lib/libgcc.h new file mode 100644 index 000000000000..b2f45d0172fe --- /dev/null +++ b/arch/m32r/lib/libgcc.h @@ -0,0 +1,24 @@ +#ifndef __ASM_LIBGCC_H +#define __ASM_LIBGCC_H + +#include <asm/byteorder.h> + +#ifdef __BIG_ENDIAN +struct DWstruct { + int high, low; +}; +#elif defined(__LITTLE_ENDIAN) +struct DWstruct { + int low, high; +}; +#else +#error I feel sick. +#endif + +typedef union { + struct DWstruct s; + long long ll; +} DWunion; + +#endif /* __ASM_LIBGCC_H */ + diff --git a/arch/m32r/lib/ucmpdi2.c b/arch/m32r/lib/ucmpdi2.c new file mode 100644 index 000000000000..9d3c682c89b5 --- /dev/null +++ b/arch/m32r/lib/ucmpdi2.c @@ -0,0 +1,17 @@ +#include "libgcc.h" + +int __ucmpdi2(unsigned long long a, unsigned long long b) +{ + const DWunion au = {.ll = a}; + const DWunion bu = {.ll = b}; + + if ((unsigned int)au.s.high < (unsigned int)bu.s.high) + return 0; + else if ((unsigned int)au.s.high > (unsigned int)bu.s.high) + return 2; + if ((unsigned int)au.s.low < (unsigned int)bu.s.low) + return 0; + else if ((unsigned int)au.s.low > (unsigned int)bu.s.low) + return 2; + return 1; +} diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index 8f9875b7933d..a3785d3644c2 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -196,7 +196,7 @@ good_area: */ addr = (address & PAGE_MASK); set_thread_fault_code(error_code); - fault = handle_mm_fault(mm, vma, addr, flags); + fault = handle_mm_fault(vma, addr, flags); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 6a94cdd0c830..bd66a0b20c6b 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -136,7 +136,7 @@ good_area: * the fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); pr_debug("handle_mm_fault returns %d\n", fault); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c index f57edca63609..372783a67dda 100644 --- a/arch/metag/mm/fault.c +++ b/arch/metag/mm/fault.c @@ -133,7 +133,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return 0; diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 177dfc003643..abb678ccde6f 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -216,7 +216,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 77bc7c7e6522..434639f9a3a6 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -414,7 +414,7 @@ void __init *early_get_page(void) #endif /* CONFIG_MMU */ -void * __init_refok alloc_maybe_bootmem(size_t size, gfp_t mask) +void * __ref alloc_maybe_bootmem(size_t size, gfp_t mask) { if (mem_init_done) return kmalloc(size, mask); @@ -422,7 +422,7 @@ void * __init_refok alloc_maybe_bootmem(size_t size, gfp_t mask) return alloc_bootmem(size); } -void * __init_refok zalloc_maybe_bootmem(size_t size, gfp_t mask) +void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask) { void *p; diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index eb99fcc76088..cc732fe357ad 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -234,7 +234,7 @@ unsigned long iopa(unsigned long addr) return pa; } -__init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, +__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { pte_t *pte; diff --git a/arch/mips/configs/bcm47xx_defconfig b/arch/mips/configs/bcm47xx_defconfig index fad8e964f14c..e4a29db1c416 100644 --- a/arch/mips/configs/bcm47xx_defconfig +++ b/arch/mips/configs/bcm47xx_defconfig @@ -74,7 +74,7 @@ CONFIG_USB_HCD_BCMA=y CONFIG_USB_HCD_SSB=y CONFIG_LEDS_TRIGGER_TIMER=y CONFIG_LEDS_TRIGGER_DEFAULT_ON=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_REDUCED=y CONFIG_STRIP_ASM_SYMS=y diff --git a/arch/mips/configs/bmips_be_defconfig b/arch/mips/configs/bmips_be_defconfig index acf7785c4cdb..cabc4ea3f87f 100644 --- a/arch/mips/configs/bmips_be_defconfig +++ b/arch/mips/configs/bmips_be_defconfig @@ -33,7 +33,7 @@ CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y # CONFIG_STANDALONE is not set # CONFIG_PREVENT_FIRMWARE_BUILD is not set -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_BRCMSTB_GISB_ARB=y CONFIG_MTD=y CONFIG_MTD_BCM63XX_PARTS=y diff --git a/arch/mips/configs/bmips_stb_defconfig b/arch/mips/configs/bmips_stb_defconfig index 4eb5d6e9cf8f..080066254b8e 100644 --- a/arch/mips/configs/bmips_stb_defconfig +++ b/arch/mips/configs/bmips_stb_defconfig @@ -34,7 +34,7 @@ CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y # CONFIG_STANDALONE is not set # CONFIG_PREVENT_FIRMWARE_BUILD is not set -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_BRCMSTB_GISB_ARB=y CONFIG_MTD=y CONFIG_MTD_CFI=y diff --git a/arch/mips/configs/ci20_defconfig b/arch/mips/configs/ci20_defconfig index 43e0ba24470c..bf164feefed2 100644 --- a/arch/mips/configs/ci20_defconfig +++ b/arch/mips/configs/ci20_defconfig @@ -148,7 +148,7 @@ CONFIG_NLS_ISO8859_15=y CONFIG_NLS_KOI8_R=y CONFIG_NLS_KOI8_U=y CONFIG_NLS_UTF8=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/configs/lemote2f_defconfig 
b/arch/mips/configs/lemote2f_defconfig index 5da76e0e120f..3b1f697e7bf1 100644 --- a/arch/mips/configs/lemote2f_defconfig +++ b/arch/mips/configs/lemote2f_defconfig @@ -404,7 +404,7 @@ CONFIG_NLS_ISO8859_15=m CONFIG_NLS_KOI8_R=m CONFIG_NLS_KOI8_U=m CONFIG_NLS_UTF8=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_FRAME_WARN=1024 CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/configs/loongson3_defconfig b/arch/mips/configs/loongson3_defconfig index 7f95c4b3ab2c..b4dfece16396 100644 --- a/arch/mips/configs/loongson3_defconfig +++ b/arch/mips/configs/loongson3_defconfig @@ -329,7 +329,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_CODEPAGE_936=y CONFIG_NLS_ASCII=y CONFIG_NLS_UTF8=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_FRAME_WARN=1024 CONFIG_STRIP_ASM_SYMS=y CONFIG_MAGIC_SYSRQ=y diff --git a/arch/mips/configs/nlm_xlp_defconfig b/arch/mips/configs/nlm_xlp_defconfig index b496c25fced6..4cfd1d4b9d5d 100644 --- a/arch/mips/configs/nlm_xlp_defconfig +++ b/arch/mips/configs/nlm_xlp_defconfig @@ -553,7 +553,7 @@ CONFIG_NLS_ISO8859_14=m CONFIG_NLS_ISO8859_15=m CONFIG_NLS_KOI8_R=m CONFIG_NLS_KOI8_U=m -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_FRAME_WARN=1024 diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig index 8e99ad807a57..16628fa58ccc 100644 --- a/arch/mips/configs/nlm_xlr_defconfig +++ b/arch/mips/configs/nlm_xlr_defconfig @@ -513,7 +513,7 @@ CONFIG_NLS_ISO8859_14=m CONFIG_NLS_ISO8859_15=m CONFIG_NLS_KOI8_R=m CONFIG_NLS_KOI8_U=m -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_UNUSED_SYMBOLS=y diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig index 8b7429127a1d..022907e3df17 100644 --- a/arch/mips/configs/pistachio_defconfig +++ b/arch/mips/configs/pistachio_defconfig @@ -299,7 +299,7 @@ CONFIG_NLS_DEFAULT="utf8" CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_ASCII=m CONFIG_NLS_ISO8859_1=m -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_MAGIC_SYSRQ=y CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0 diff --git a/arch/mips/configs/qi_lb60_defconfig b/arch/mips/configs/qi_lb60_defconfig index d7bb8cce1068..e9dd20a123ca 100644 --- a/arch/mips/configs/qi_lb60_defconfig +++ b/arch/mips/configs/qi_lb60_defconfig @@ -172,7 +172,7 @@ CONFIG_NLS_ISO8859_15=y CONFIG_NLS_KOI8_R=y CONFIG_NLS_KOI8_U=y CONFIG_NLS_UTF8=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_STRIP_ASM_SYMS=y CONFIG_READABLE_ASM=y diff --git a/arch/mips/configs/rt305x_defconfig b/arch/mips/configs/rt305x_defconfig index d14ae2fa7d13..005754cbee06 100644 --- a/arch/mips/configs/rt305x_defconfig +++ b/arch/mips/configs/rt305x_defconfig @@ -145,7 +145,7 @@ CONFIG_JFFS2_COMPRESSION_OPTIONS=y CONFIG_SQUASHFS=y # CONFIG_SQUASHFS_ZLIB is not set CONFIG_SQUASHFS_XZ=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_MAGIC_SYSRQ=y CONFIG_STRIP_ASM_SYMS=y diff --git a/arch/mips/configs/xway_defconfig b/arch/mips/configs/xway_defconfig index 8987846240f7..c293c78d7686 100644 --- a/arch/mips/configs/xway_defconfig +++ b/arch/mips/configs/xway_defconfig @@ -143,7 +143,7 @@ CONFIG_JFFS2_COMPRESSION_OPTIONS=y CONFIG_SQUASHFS=y # CONFIG_SQUASHFS_ZLIB is not set CONFIG_SQUASHFS_XZ=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y diff --git 
a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 4b88fa031891..9560ad731120 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -153,7 +153,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 9b58eb5fd0d5..a5509e7dcad2 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -504,7 +504,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) void (*free_init_pages_eva)(void *begin, void *end) = NULL; -void __init_refok free_initmem(void) +void __ref free_initmem(void) { prom_free_prom_memory(); /* diff --git a/arch/mips/txx9/generic/pci.c b/arch/mips/txx9/generic/pci.c index a77698ff2b6f..1f6bc9a3036c 100644 --- a/arch/mips/txx9/generic/pci.c +++ b/arch/mips/txx9/generic/pci.c @@ -268,7 +268,7 @@ static int txx9_i8259_irq_setup(int irq) return err; } -static void __init_refok quirk_slc90e66_bridge(struct pci_dev *dev) +static void __ref quirk_slc90e66_bridge(struct pci_dev *dev) { int irq; /* PCI/ISA Bridge interrupt */ u8 reg_64; diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index 4a1d181ed32f..f23781d6bbb3 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -254,7 +254,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index b51878b0c6b8..affc4eb3f89e 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -131,7 +131,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index e75c75d249d6..c92fe4234009 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -89,7 +89,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) } #endif -void __init_refok free_initmem(void) +void __ref free_initmem(void) { free_initmem_default(-1); } diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 230ac20ae794..e94cd225e816 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -163,7 +163,7 @@ good_area: * the fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index 5b2a95116e8f..fa60b81aee3e 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -38,7 +38,7 @@ static unsigned int fixmaps_used __initdata; * have to convert them into an offset in a page-aligned mapping, but the * caller shouldn't need to know that small detail. */ -void __iomem *__init_refok +void __iomem *__ref __ioremap(phys_addr_t addr, unsigned long size, pgprot_t prot) { phys_addr_t p; @@ -116,7 +116,7 @@ void iounmap(void *addr) * the memblock infrastructure. 
*/ -pte_t __init_refok *pte_alloc_one_kernel(struct mm_struct *mm, +pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { pte_t *pte; diff --git a/arch/parisc/configs/generic-64bit_defconfig b/arch/parisc/configs/generic-64bit_defconfig index 7e0792658952..4f5ab058b05e 100644 --- a/arch/parisc/configs/generic-64bit_defconfig +++ b/arch/parisc/configs/generic-64bit_defconfig @@ -279,7 +279,7 @@ CONFIG_NLS_ASCII=m CONFIG_NLS_ISO8859_1=m CONFIG_NLS_ISO8859_2=m CONFIG_NLS_UTF8=m -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_STRIP_ASM_SYMS=y CONFIG_UNUSED_SYMBOLS=y CONFIG_DEBUG_FS=y diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 16dbe81c97c9..163af2c31d76 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -239,7 +239,7 @@ good_area: * fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/powerpc/configs/40x/virtex_defconfig b/arch/powerpc/configs/40x/virtex_defconfig index bcb0c4d854db..bb076ef2e5da 100644 --- a/arch/powerpc/configs/40x/virtex_defconfig +++ b/arch/powerpc/configs/40x/virtex_defconfig @@ -72,7 +72,7 @@ CONFIG_CRC_CCITT=y CONFIG_FONTS=y CONFIG_FONT_8x8=y CONFIG_FONT_8x16=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_KERNEL=y # CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/44x/virtex5_defconfig b/arch/powerpc/configs/44x/virtex5_defconfig index 53d0300b3390..f5cf6571c518 100644 --- a/arch/powerpc/configs/44x/virtex5_defconfig +++ b/arch/powerpc/configs/44x/virtex5_defconfig @@ -71,7 +71,7 @@ CONFIG_CRC_CCITT=y CONFIG_FONTS=y CONFIG_FONT_8x8=y CONFIG_FONT_8x16=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_KERNEL=y # CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/44x/warp_defconfig b/arch/powerpc/configs/44x/warp_defconfig index ee434375fc24..e925959ed912 100644 --- a/arch/powerpc/configs/44x/warp_defconfig +++ b/arch/powerpc/configs/44x/warp_defconfig @@ -93,7 +93,7 @@ CONFIG_NLS_ISO8859_15=y CONFIG_NLS_UTF8=y CONFIG_CRC_CCITT=y CONFIG_CRC_T10DIF=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y diff --git a/arch/powerpc/configs/52xx/cm5200_defconfig b/arch/powerpc/configs/52xx/cm5200_defconfig index 19fad0e0016e..3cc0a427a05c 100644 --- a/arch/powerpc/configs/52xx/cm5200_defconfig +++ b/arch/powerpc/configs/52xx/cm5200_defconfig @@ -74,7 +74,7 @@ CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DETECT_HUNG_TASK=y # CONFIG_DEBUG_BUGVERBOSE is not set CONFIG_CRYPTO_ECB=y diff --git a/arch/powerpc/configs/52xx/lite5200b_defconfig b/arch/powerpc/configs/52xx/lite5200b_defconfig index 5f40ba92a39a..5a24f604e5de 100644 --- a/arch/powerpc/configs/52xx/lite5200b_defconfig +++ b/arch/powerpc/configs/52xx/lite5200b_defconfig @@ -60,7 +60,7 @@ CONFIG_TMPFS=y CONFIG_NFS_FS=y CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_DEBUG_BUGVERBOSE is not set diff --git a/arch/powerpc/configs/52xx/motionpro_defconfig b/arch/powerpc/configs/52xx/motionpro_defconfig index 909e185a88d1..e7ef6c8ae75b 100644 --- a/arch/powerpc/configs/52xx/motionpro_defconfig +++ b/arch/powerpc/configs/52xx/motionpro_defconfig @@ -86,7 +86,7 @@ CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y 
CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_DEBUG_BUGVERBOSE is not set diff --git a/arch/powerpc/configs/52xx/tqm5200_defconfig b/arch/powerpc/configs/52xx/tqm5200_defconfig index efab8388ea92..5a91b504fcd3 100644 --- a/arch/powerpc/configs/52xx/tqm5200_defconfig +++ b/arch/powerpc/configs/52xx/tqm5200_defconfig @@ -88,7 +88,7 @@ CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_DEBUG_BUGVERBOSE is not set diff --git a/arch/powerpc/configs/gamecube_defconfig b/arch/powerpc/configs/gamecube_defconfig index 6c6c60f1aba4..d4dbe36fd2d5 100644 --- a/arch/powerpc/configs/gamecube_defconfig +++ b/arch/powerpc/configs/gamecube_defconfig @@ -92,7 +92,7 @@ CONFIG_CIFS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_CRC_CCITT=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_MUTEXES=y CONFIG_LATENCYTOP=y diff --git a/arch/powerpc/configs/mpc5200_defconfig b/arch/powerpc/configs/mpc5200_defconfig index 9fd041bfd778..4b7dc3363367 100644 --- a/arch/powerpc/configs/mpc5200_defconfig +++ b/arch/powerpc/configs/mpc5200_defconfig @@ -128,7 +128,7 @@ CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y diff --git a/arch/powerpc/configs/pasemi_defconfig b/arch/powerpc/configs/pasemi_defconfig index 8f94782eb907..b025444fb2ac 100644 --- a/arch/powerpc/configs/pasemi_defconfig +++ b/arch/powerpc/configs/pasemi_defconfig @@ -171,7 +171,7 @@ CONFIG_NFSD_V4=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_CRC_CCITT=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y diff --git a/arch/powerpc/configs/wii_defconfig b/arch/powerpc/configs/wii_defconfig index 34eaf528fa87..2d9ee188fc93 100644 --- a/arch/powerpc/configs/wii_defconfig +++ b/arch/powerpc/configs/wii_defconfig @@ -113,7 +113,7 @@ CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_CRC_CCITT=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_MUTEXES=y diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index ee09e99097f0..9bd87f269d6d 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -71,10 +71,8 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, bool *is_thp, unsigned *shift) { - if (!arch_irqs_disabled()) { - pr_info("%s called with irq enabled\n", __func__); - dump_stack(); - } + VM_WARN(!arch_irqs_disabled(), + "%s called with irq enabled\n", __func__); return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift); } diff --git a/arch/powerpc/lib/alloc.c b/arch/powerpc/lib/alloc.c index 60b0b3fc8fc1..a58abe4afbd1 100644 --- a/arch/powerpc/lib/alloc.c +++ b/arch/powerpc/lib/alloc.c @@ -6,7 +6,7 @@ #include <asm/setup.h> -void * __init_refok zalloc_maybe_bootmem(size_t size, gfp_t mask) +void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask) { void *p; diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 6527882ce05e..bb0354222b11 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c 
@@ -75,7 +75,7 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, } ret = 0; - *flt = handle_mm_fault(mm, vma, ea, is_write ? FAULT_FLAG_WRITE : 0); + *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(*flt & VM_FAULT_ERROR)) { if (*flt & VM_FAULT_OOM) { ret = -ENOMEM; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index a67c6d781c52..a4db22f65021 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -429,7 +429,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { if (fault & VM_FAULT_SIGSEGV) goto bad_area; diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 7f922f557936..0ae0572bc239 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -79,7 +79,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) #endif } -__init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { pte_t *pte; diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c index 8dd78f4e1af4..774c11366a22 100644 --- a/arch/powerpc/platforms/powermac/setup.c +++ b/arch/powerpc/platforms/powermac/setup.c @@ -360,12 +360,12 @@ static int pmac_late_init(void) machine_late_initcall(powermac, pmac_late_init); /* - * This is __init_refok because we check for "initializing" before + * This is __ref because we check for "initializing" before * touching any of the __init sensitive things and "initializing" * will be false after __init time. This can't be __init because it * can be called whenever a disk is first accessed. 
*/ -void __init_refok note_bootable_part(dev_t dev, int part, int goodness) +void __ref note_bootable_part(dev_t dev, int part, int goodness) { char *p; diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c index 3f175e8aedb4..57caaf11a83f 100644 --- a/arch/powerpc/platforms/ps3/device-init.c +++ b/arch/powerpc/platforms/ps3/device-init.c @@ -189,7 +189,7 @@ fail_malloc: return result; } -static int __init_refok ps3_setup_uhc_device( +static int __ref ps3_setup_uhc_device( const struct ps3_repository_device *repo, enum ps3_match_id match_id, enum ps3_interrupt_type interrupt_type, enum ps3_reg_type reg_type) { diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c index ed5234ed8d3f..5ebd3f018295 100644 --- a/arch/powerpc/sysdev/msi_bitmap.c +++ b/arch/powerpc/sysdev/msi_bitmap.c @@ -112,7 +112,7 @@ int msi_bitmap_reserve_dt_hwirqs(struct msi_bitmap *bmp) return 0; } -int __init_refok msi_bitmap_alloc(struct msi_bitmap *bmp, unsigned int irq_count, +int __ref msi_bitmap_alloc(struct msi_bitmap *bmp, unsigned int irq_count, struct device_node *of_node) { int size; diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig index 889ea3450210..4d84551f6443 100644 --- a/arch/s390/configs/default_defconfig +++ b/arch/s390/configs/default_defconfig @@ -542,7 +542,7 @@ CONFIG_NLS_ISO8859_1=m CONFIG_NLS_ISO8859_15=m CONFIG_NLS_UTF8=m CONFIG_DLM=m -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_DWARF4=y diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig index 1bcfd764910a..c5a0eae4dbf2 100644 --- a/arch/s390/configs/gcov_defconfig +++ b/arch/s390/configs/gcov_defconfig @@ -534,7 +534,7 @@ CONFIG_NLS_ISO8859_1=m CONFIG_NLS_ISO8859_15=m CONFIG_NLS_UTF8=m CONFIG_DLM=m -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_DWARF4=y CONFIG_GDB_SCRIPTS=y diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index 13ff090139c8..4dc40af0e6e5 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -535,7 +535,7 @@ CONFIG_NLS_ISO8859_1=m CONFIG_NLS_ISO8859_15=m CONFIG_NLS_UTF8=m CONFIG_DLM=m -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_DWARF4=y CONFIG_GDB_SCRIPTS=y diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index 4366a3e3e754..573b2e9855ff 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -55,7 +55,7 @@ CONFIG_RAW_DRIVER=y # CONFIG_INOTIFY_USER is not set CONFIG_CONFIGFS_FS=y # CONFIG_MISC_FILESYSTEMS is not set -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DEBUG_INFO=y CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 7a92e69c50bc..15711de10403 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -87,10 +87,10 @@ static inline void tlb_finish_mmu(struct mmu_gather *tlb, * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page * has already been freed, so just do free_page_and_swap_cache. 
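The mmu_gather conversions in this series (ia64 earlier, the s390 hunk that follows, and the sh and um ones further down) all switch __tlb_remove_page() to a bool return: true now means the page could not be queued because the batch is full, so the caller must flush and retry, instead of the old convention of reporting how much room is left. A minimal caller-side sketch of that contract, using only the helpers introduced in these hunks; the wrapper name is illustrative and not part of the patch:

	/*
	 * Sketch of the new __tlb_remove_page() contract: a true return means
	 * "batch full, page not queued", so the caller drains the batch with
	 * tlb_flush_mmu() and queues the page again.  Illustrative only.
	 */
	static inline void queue_page_for_flush(struct mmu_gather *tlb,
						struct page *page)
	{
		if (__tlb_remove_page_size(tlb, page, PAGE_SIZE)) {
			tlb_flush_mmu(tlb);
			__tlb_remove_page_size(tlb, page, PAGE_SIZE);
		}
	}
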
*/ -static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { free_page_and_swap_cache(page); - return 1; /* avoid calling tlb_flush_mmu */ + return false; /* avoid calling tlb_flush_mmu */ } static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) @@ -98,6 +98,24 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) free_page_and_swap_cache(page); } +static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, + struct page *page, int page_size) +{ + return __tlb_remove_page(tlb, page); +} + +static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, + struct page *page) +{ + return __tlb_remove_page(tlb, page); +} + +static inline void tlb_remove_page_size(struct mmu_gather *tlb, + struct page *page, int page_size) +{ + return tlb_remove_page(tlb, page); +} + /* * pte_free_tlb frees a pte table and clears the CRSTE for the * page table from the tlb. diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index d13a2428a827..a58bca62a93b 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -458,7 +458,7 @@ retry: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); /* No reason to continue if interrupted by SIGKILL. */ if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { fault = VM_FAULT_SIGNAL; diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index 37a6c2e0e969..995b71e4db4b 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c @@ -111,7 +111,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/score/mm/init.c b/arch/score/mm/init.c index 9fbce49ad3bd..444c26c0f750 100644 --- a/arch/score/mm/init.c +++ b/arch/score/mm/init.c @@ -91,7 +91,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -void __init_refok free_initmem(void) +void __ref free_initmem(void) { free_initmem_default(POISON_FREE_INITMEM); } diff --git a/arch/sh/configs/edosk7760_defconfig b/arch/sh/configs/edosk7760_defconfig index e1077a041ac3..e588da2ba566 100644 --- a/arch/sh/configs/edosk7760_defconfig +++ b/arch/sh/configs/edosk7760_defconfig @@ -109,7 +109,7 @@ CONFIG_NLS_ASCII=y CONFIG_NLS_ISO8859_1=y CONFIG_NLS_ISO8859_15=y CONFIG_NLS_UTF8=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_MAGIC_SYSRQ=y CONFIG_UNUSED_SYMBOLS=y diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig index 36642ec2cb97..280695c64d3e 100644 --- a/arch/sh/configs/sdk7786_defconfig +++ b/arch/sh/configs/sdk7786_defconfig @@ -215,7 +215,7 @@ CONFIG_NLS_ASCII=m CONFIG_NLS_ISO8859_1=y CONFIG_NLS_ISO8859_15=m CONFIG_NLS_UTF8=m -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_MAGIC_SYSRQ=y diff --git a/arch/sh/configs/se7722_defconfig b/arch/sh/configs/se7722_defconfig index ae998c7e2ee0..09aac57eddb7 100644 --- a/arch/sh/configs/se7722_defconfig +++ b/arch/sh/configs/se7722_defconfig @@ -53,7 +53,7 @@ CONFIG_EXT3_FS=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_HUGETLBFS=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y diff --git a/arch/sh/configs/sh7785lcr_32bit_defconfig b/arch/sh/configs/sh7785lcr_32bit_defconfig index 9bdcf72ec06a..bdb7783a63ee 100644 --- a/arch/sh/configs/sh7785lcr_32bit_defconfig +++ b/arch/sh/configs/sh7785lcr_32bit_defconfig @@ -144,7 +144,7 @@ CONFIG_NFSD_V3=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_CODEPAGE_932=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig index 01c9a91ee896..79723309fcfa 100644 --- a/arch/sh/configs/urquell_defconfig +++ b/arch/sh/configs/urquell_defconfig @@ -142,7 +142,7 @@ CONFIG_ROOT_NFS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_CODEPAGE_932=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_DEBUG_FS=y diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c index d5462b7bc514..84563e39a5b8 100644 --- a/arch/sh/drivers/pci/pci.c +++ b/arch/sh/drivers/pci/pci.c @@ -221,7 +221,7 @@ pcibios_bus_report_status_early(struct pci_channel *hose, * We can't use pci_find_device() here since we are * called from interrupt context. 
*/ -static void __init_refok +static void __ref pcibios_bus_report_status(struct pci_bus *bus, unsigned int status_mask, int warn) { @@ -256,7 +256,7 @@ pcibios_bus_report_status(struct pci_bus *bus, unsigned int status_mask, pcibios_bus_report_status(dev->subordinate, status_mask, warn); } -void __init_refok pcibios_report_status(unsigned int status_mask, int warn) +void __ref pcibios_report_status(unsigned int status_mask, int warn) { struct pci_channel *hose; diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 62f80d2a9df9..025cdb1032f6 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -101,7 +101,7 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb) static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { free_page_and_swap_cache(page); - return 1; /* avoid calling tlb_flush_mmu */ + return false; /* avoid calling tlb_flush_mmu */ } static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) @@ -109,6 +109,24 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) __tlb_remove_page(tlb, page); } +static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, + struct page *page, int page_size) +{ + return __tlb_remove_page(tlb, page); +} + +static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, + struct page *page) +{ + return __tlb_remove_page(tlb, page); +} + +static inline void tlb_remove_page_size(struct mmu_gather *tlb, + struct page *page, int page_size) +{ + return tlb_remove_page(tlb, page); +} + #define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep) #define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp) #define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 79d8276377d1..9bf876780cef 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -487,7 +487,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if (unlikely(fault & (VM_FAULT_RETRY | VM_FAULT_ERROR))) if (mm_fault_error(regs, error_code, address, fault)) diff --git a/arch/sh/mm/ioremap.c b/arch/sh/mm/ioremap.c index 0c99ec2e7ed8..d09ddfe58fd8 100644 --- a/arch/sh/mm/ioremap.c +++ b/arch/sh/mm/ioremap.c @@ -34,7 +34,7 @@ * have to convert them into an offset in a page-aligned mapping, but the * caller shouldn't need to know that small detail. */ -void __iomem * __init_refok +void __iomem * __ref __ioremap_caller(phys_addr_t phys_addr, unsigned long size, pgprot_t pgprot, void *caller) { diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig index 3583d676a916..e4bbc76eb63c 100644 --- a/arch/sparc/configs/sparc64_defconfig +++ b/arch/sparc/configs/sparc64_defconfig @@ -202,7 +202,7 @@ CONFIG_EXT3_FS_SECURITY=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_HUGETLBFS=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 # CONFIG_ENABLE_WARN_DEPRECATED is not set CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index b6c559cbd64d..4714061d6cd3 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -241,7 +241,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; @@ -411,7 +411,7 @@ good_area: if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } - switch (handle_mm_fault(mm, vma, address, flags)) { + switch (handle_mm_fault(vma, address, flags)) { case VM_FAULT_SIGBUS: case VM_FAULT_OOM: goto do_sigbus; diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index cb841a33da59..6c43b924a7a2 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -436,7 +436,7 @@ good_area: goto bad_area; } - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) goto exit_exception; diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 26734214818c..beba986589e5 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -434,7 +434,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return 0; diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 16eb63fac57d..821ff0acfe17 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -102,7 +102,7 @@ static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { tlb->need_flush = 1; free_page_and_swap_cache(page); - return 1; /* avoid calling tlb_flush_mmu */ + return false; /* avoid calling tlb_flush_mmu */ } static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) @@ -110,6 +110,24 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) __tlb_remove_page(tlb, page); } +static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, + struct page *page, int page_size) +{ + return __tlb_remove_page(tlb, page); +} + +static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, + struct page *page) +{ + return __tlb_remove_page(tlb, page); +} + +static inline void tlb_remove_page_size(struct mmu_gather *tlb, + struct page *page, int page_size) +{ + return tlb_remove_page(tlb, page); +} + /** * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. * diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 98783dd0fa2e..ad8f206ab5e8 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -73,7 +73,7 @@ good_area: do { int fault; - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) goto out_nosemaphore; diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c index 2ec3d3adcefc..6c7f70bcaae3 100644 --- a/arch/unicore32/mm/fault.c +++ b/arch/unicore32/mm/fault.c @@ -194,7 +194,7 @@ good_area: * If for any reason at all we couldn't handle the fault, make * sure we exit gracefully rather than endlessly redo the fault. 
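Every fault-handler hunk in this series makes the same conversion: handle_mm_fault() drops its mm_struct argument and takes the mm from vma->vm_mm instead. A stripped-down sketch of a converted call site; the handler name and the trivial error handling are illustrative, not taken from any of the hunks:

	/*
	 * handle_mm_fault() now derives the mm from vma->vm_mm, so callers
	 * pass only the vma, the faulting address and the fault flags.
	 */
	static int example_handle_fault(struct vm_area_struct *vma,
					unsigned long address, unsigned int flags)
	{
		int fault = handle_mm_fault(vma, address & PAGE_MASK, flags);

		if (fault & VM_FAULT_ERROR)
			return -EFAULT;	/* real handlers map this to signals */
		return 0;
	}
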
*/ - fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, flags); + fault = handle_mm_fault(vma, addr & PAGE_MASK, flags); return fault; check_stack: diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 5fa6ee2c2dde..6e7906d10ede 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -285,7 +285,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ASCII=y CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 # CONFIG_ENABLE_WARN_DEPRECATED is not set CONFIG_FRAME_WARN=1024 CONFIG_MAGIC_SYSRQ=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index d28bdabcc87e..1f912dccc1d7 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -284,7 +284,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ASCII=y CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 # CONFIG_ENABLE_WARN_DEPRECATED is not set CONFIG_MAGIC_SYSRQ=y # CONFIG_UNUSED_SYMBOLS is not set diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 574c23cf761a..b6d425999f99 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -81,7 +81,11 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { struct page *page; - page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0); + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + page = alloc_pages(gfp, 0); if (!page) return NULL; if (!pgtable_pmd_page_ctor(page)) { @@ -125,7 +129,11 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pud_t *)get_zeroed_page(GFP_KERNEL); + gfp_t gfp = GFP_KERNEL_ACCOUNT; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + return (pud_t *)get_zeroed_page(gfp); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 5a294e48b185..fc3389fc47a2 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -337,6 +337,7 @@ void arch_crash_save_vmcoreinfo(void) #endif vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + VMCOREINFO_PHYS_BASE(phys_base); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7d1fa7cd2374..48c7a68f6171 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1353,7 +1353,7 @@ good_area: * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); major |= fault & VM_FAULT_MAJOR; /* diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index dffd162db0a4..02bd20d5ff59 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -207,7 +207,7 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, * adjust the page_size_mask for small range to go with * big page size instead small one if nearby are ram too. 
*/ -static void __init_refok adjust_range_page_size_mask(struct map_range *mr, +static void __ref adjust_range_page_size_mask(struct map_range *mr, int nr_range) { int i; @@ -395,7 +395,7 @@ bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) * This runs before bootmem is initialized and gets pages directly from * the physical memory. To access them they are temporarily mapped. */ -unsigned long __init_refok init_memory_mapping(unsigned long start, +unsigned long __ref init_memory_mapping(unsigned long start, unsigned long end) { struct map_range mr[NR_RANGE_MR]; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index aa0ff4b02a96..3feec5af4e67 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -6,7 +6,7 @@ #include <asm/fixmap.h> #include <asm/mtrr.h> -#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO +#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO) #ifdef CONFIG_HIGHPTE #define PGALLOC_USER_GFP __GFP_HIGHMEM @@ -18,7 +18,7 @@ gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)__get_free_page(PGALLOC_GFP); + return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT); } pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) @@ -207,9 +207,13 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; bool failed = false; + gfp_t gfp = PGALLOC_GFP; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; for(i = 0; i < PREALLOCATED_PMDS; i++) { - pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP); + pmd_t *pmd = (pmd_t *)__get_free_page(gfp); if (!pmd) failed = true; if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { diff --git a/arch/x86/platform/efi/early_printk.c b/arch/x86/platform/efi/early_printk.c index 524142117296..5fdacb322ceb 100644 --- a/arch/x86/platform/efi/early_printk.c +++ b/arch/x86/platform/efi/early_printk.c @@ -44,7 +44,7 @@ early_initcall(early_efi_map_fb); * In case earlyprintk=efi,keep we have the whole framebuffer mapped already * so just return the offset efi_fb + start. 
*/ -static __init_refok void *early_efi_map(unsigned long start, unsigned long len) +static __ref void *early_efi_map(unsigned long start, unsigned long len) { unsigned long base; @@ -56,7 +56,7 @@ static __init_refok void *early_efi_map(unsigned long start, unsigned long len) return early_ioremap(base + start, len); } -static __init_refok void early_efi_unmap(void *addr, unsigned long len) +static __ref void early_efi_unmap(void *addr, unsigned long len) { if (!efi_fb) early_iounmap(addr, len); diff --git a/arch/xtensa/configs/audio_kc705_defconfig b/arch/xtensa/configs/audio_kc705_defconfig index c4904db15582..bc0e69f64cf7 100644 --- a/arch/xtensa/configs/audio_kc705_defconfig +++ b/arch/xtensa/configs/audio_kc705_defconfig @@ -123,7 +123,7 @@ CONFIG_ROOT_NFS=y CONFIG_SUNRPC_DEBUG=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO=y CONFIG_MAGIC_SYSRQ=y diff --git a/arch/xtensa/configs/generic_kc705_defconfig b/arch/xtensa/configs/generic_kc705_defconfig index d9444f01f4da..f6d88f81e233 100644 --- a/arch/xtensa/configs/generic_kc705_defconfig +++ b/arch/xtensa/configs/generic_kc705_defconfig @@ -110,7 +110,7 @@ CONFIG_ROOT_NFS=y CONFIG_SUNRPC_DEBUG=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO=y CONFIG_MAGIC_SYSRQ=y diff --git a/arch/xtensa/configs/nommu_kc705_defconfig b/arch/xtensa/configs/nommu_kc705_defconfig index 337d5ba2d285..072733684308 100644 --- a/arch/xtensa/configs/nommu_kc705_defconfig +++ b/arch/xtensa/configs/nommu_kc705_defconfig @@ -107,7 +107,7 @@ CONFIG_ROOT_NFS=y CONFIG_SUNRPC_DEBUG=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO=y # CONFIG_FRAME_POINTER is not set diff --git a/arch/xtensa/configs/smp_lx200_defconfig b/arch/xtensa/configs/smp_lx200_defconfig index 61f943c95619..efbbaf94add1 100644 --- a/arch/xtensa/configs/smp_lx200_defconfig +++ b/arch/xtensa/configs/smp_lx200_defconfig @@ -114,7 +114,7 @@ CONFIG_ROOT_NFS=y CONFIG_SUNRPC_DEBUG=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO=y CONFIG_MAGIC_SYSRQ=y diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 7f4a1fdb1502..2725e08ef353 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -110,7 +110,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/block/genhd.c b/block/genhd.c index 9f42526b4d62..eb05dfc1dffe 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -877,7 +877,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index b108f1358a32..4305ee9db4b2 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -309,7 +309,7 @@ static void acpi_unmap(acpi_physical_address pg_off, void __iomem *vaddr) * During early init (when acpi_gbl_permanent_mmap has not been set yet) this * routine simply calls __acpi_map_table() to get the job done. */ -void __iomem *__init_refok +void __iomem *__ref acpi_os_map_iomem(acpi_physical_address phys, acpi_size size) { struct acpi_ioremap *map; @@ -362,8 +362,7 @@ out: } EXPORT_SYMBOL_GPL(acpi_os_map_iomem); -void *__init_refok -acpi_os_map_memory(acpi_physical_address phys, acpi_size size) +void *__ref acpi_os_map_memory(acpi_physical_address phys, acpi_size size) { return (void *)acpi_os_map_iomem(phys, size); } diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c index 773fc3099769..22d1760a4278 100644 --- a/drivers/base/firmware_class.c +++ b/drivers/base/firmware_class.c @@ -46,7 +46,8 @@ MODULE_LICENSE("GPL"); extern struct builtin_fw __start_builtin_fw[]; extern struct builtin_fw __end_builtin_fw[]; -static bool fw_get_builtin_firmware(struct firmware *fw, const char *name) +static bool fw_get_builtin_firmware(struct firmware *fw, const char *name, + void *buf, size_t size) { struct builtin_fw *b_fw; @@ -54,6 +55,9 @@ static bool fw_get_builtin_firmware(struct firmware *fw, const char *name) if (strcmp(name, b_fw->name) == 0) { fw->size = b_fw->size; fw->data = b_fw->data; + + if (buf && fw->size <= size) + memcpy(buf, fw->data, fw->size); return true; } } @@ -74,7 +78,9 @@ static bool fw_is_builtin_firmware(const struct firmware *fw) #else /* Module case - no builtin firmware support */ -static inline bool fw_get_builtin_firmware(struct firmware *fw, const char *name) +static inline bool fw_get_builtin_firmware(struct firmware *fw, + const char *name, void *buf, + size_t size) { return false; } @@ -112,6 +118,7 @@ static inline long firmware_loading_timeout(void) #define FW_OPT_FALLBACK 0 #endif #define FW_OPT_NO_WARN (1U << 3) +#define FW_OPT_NOCACHE (1U << 4) struct firmware_cache { /* firmware_buf instance will be added into the below list */ @@ -143,6 +150,7 @@ struct firmware_buf { unsigned long status; void *data; size_t size; + size_t allocated_size; #ifdef CONFIG_FW_LOADER_USER_HELPER bool is_paged_buf; bool need_uevent; @@ -178,7 +186,8 @@ static DEFINE_MUTEX(fw_lock); static struct firmware_cache fw_cache; static struct firmware_buf *__allocate_fw_buf(const char *fw_name, - struct firmware_cache *fwc) + struct firmware_cache *fwc, + void *dbuf, size_t size) { struct firmware_buf *buf; @@ -194,6 +203,8 @@ static struct firmware_buf *__allocate_fw_buf(const char *fw_name, kref_init(&buf->ref); buf->fwc = fwc; + buf->data = dbuf; + buf->allocated_size = size; init_completion(&buf->completion); #ifdef 
CONFIG_FW_LOADER_USER_HELPER INIT_LIST_HEAD(&buf->pending_list); @@ -217,7 +228,8 @@ static struct firmware_buf *__fw_lookup_buf(const char *fw_name) static int fw_lookup_and_allocate_buf(const char *fw_name, struct firmware_cache *fwc, - struct firmware_buf **buf) + struct firmware_buf **buf, void *dbuf, + size_t size) { struct firmware_buf *tmp; @@ -229,7 +241,7 @@ static int fw_lookup_and_allocate_buf(const char *fw_name, *buf = tmp; return 1; } - tmp = __allocate_fw_buf(fw_name, fwc); + tmp = __allocate_fw_buf(fw_name, fwc, dbuf, size); if (tmp) list_add(&tmp->list, &fwc->head); spin_unlock(&fwc->lock); @@ -261,6 +273,7 @@ static void __fw_free_buf(struct kref *ref) vfree(buf->pages); } else #endif + if (!buf->allocated_size) vfree(buf->data); kfree_const(buf->fw_id); kfree(buf); @@ -301,13 +314,21 @@ static void fw_finish_direct_load(struct device *device, mutex_unlock(&fw_lock); } -static int fw_get_filesystem_firmware(struct device *device, - struct firmware_buf *buf) +static int +fw_get_filesystem_firmware(struct device *device, struct firmware_buf *buf) { loff_t size; int i, len; int rc = -ENOENT; char *path; + enum kernel_read_file_id id = READING_FIRMWARE; + size_t msize = INT_MAX; + + /* Already populated data member means we're loading into a buffer */ + if (buf->data) { + id = READING_FIRMWARE_PREALLOC_BUFFER; + msize = buf->allocated_size; + } path = __getname(); if (!path) @@ -326,8 +347,8 @@ static int fw_get_filesystem_firmware(struct device *device, } buf->size = 0; - rc = kernel_read_file_from_path(path, &buf->data, &size, - INT_MAX, READING_FIRMWARE); + rc = kernel_read_file_from_path(path, &buf->data, &size, msize, + id); if (rc) { if (rc == -ENOENT) dev_dbg(device, "loading %s failed with error %d\n", @@ -691,6 +712,38 @@ out: static DEVICE_ATTR(loading, 0644, firmware_loading_show, firmware_loading_store); +static void firmware_rw_buf(struct firmware_buf *buf, char *buffer, + loff_t offset, size_t count, bool read) +{ + if (read) + memcpy(buffer, buf->data + offset, count); + else + memcpy(buf->data + offset, buffer, count); +} + +static void firmware_rw(struct firmware_buf *buf, char *buffer, + loff_t offset, size_t count, bool read) +{ + while (count) { + void *page_data; + int page_nr = offset >> PAGE_SHIFT; + int page_ofs = offset & (PAGE_SIZE-1); + int page_cnt = min_t(size_t, PAGE_SIZE - page_ofs, count); + + page_data = kmap(buf->pages[page_nr]); + + if (read) + memcpy(buffer, page_data + page_ofs, page_cnt); + else + memcpy(page_data + page_ofs, buffer, page_cnt); + + kunmap(buf->pages[page_nr]); + buffer += page_cnt; + offset += page_cnt; + count -= page_cnt; + } +} + static ssize_t firmware_data_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buffer, loff_t offset, size_t count) @@ -715,21 +768,11 @@ static ssize_t firmware_data_read(struct file *filp, struct kobject *kobj, ret_count = count; - while (count) { - void *page_data; - int page_nr = offset >> PAGE_SHIFT; - int page_ofs = offset & (PAGE_SIZE-1); - int page_cnt = min_t(size_t, PAGE_SIZE - page_ofs, count); - - page_data = kmap(buf->pages[page_nr]); - - memcpy(buffer, page_data + page_ofs, page_cnt); + if (buf->data) + firmware_rw_buf(buf, buffer, offset, count, true); + else + firmware_rw(buf, buffer, offset, count, true); - kunmap(buf->pages[page_nr]); - buffer += page_cnt; - offset += page_cnt; - count -= page_cnt; - } out: mutex_unlock(&fw_lock); return ret_count; @@ -804,29 +847,23 @@ static ssize_t firmware_data_write(struct file *filp, struct kobject 
*kobj, goto out; } - retval = fw_realloc_buffer(fw_priv, offset + count); - if (retval) - goto out; - - retval = count; - - while (count) { - void *page_data; - int page_nr = offset >> PAGE_SHIFT; - int page_ofs = offset & (PAGE_SIZE - 1); - int page_cnt = min_t(size_t, PAGE_SIZE - page_ofs, count); - - page_data = kmap(buf->pages[page_nr]); - - memcpy(page_data + page_ofs, buffer, page_cnt); + if (buf->data) { + if (offset + count > buf->allocated_size) { + retval = -ENOMEM; + goto out; + } + firmware_rw_buf(buf, buffer, offset, count, false); + retval = count; + } else { + retval = fw_realloc_buffer(fw_priv, offset + count); + if (retval) + goto out; - kunmap(buf->pages[page_nr]); - buffer += page_cnt; - offset += page_cnt; - count -= page_cnt; + retval = count; + firmware_rw(buf, buffer, offset, count, false); } - buf->size = max_t(size_t, offset, buf->size); + buf->size = max_t(size_t, offset + count, buf->size); out: mutex_unlock(&fw_lock); return retval; @@ -894,7 +931,8 @@ static int _request_firmware_load(struct firmware_priv *fw_priv, struct firmware_buf *buf = fw_priv->buf; /* fall back on userspace loading */ - buf->is_paged_buf = true; + if (!buf->data) + buf->is_paged_buf = true; dev_set_uevent_suppress(f_dev, true); @@ -929,7 +967,7 @@ static int _request_firmware_load(struct firmware_priv *fw_priv, if (is_fw_load_aborted(buf)) retval = -EAGAIN; - else if (!buf->data) + else if (buf->is_paged_buf && !buf->data) retval = -ENOMEM; device_del(f_dev); @@ -1012,7 +1050,7 @@ static int sync_cached_firmware_buf(struct firmware_buf *buf) */ static int _request_firmware_prepare(struct firmware **firmware_p, const char *name, - struct device *device) + struct device *device, void *dbuf, size_t size) { struct firmware *firmware; struct firmware_buf *buf; @@ -1025,12 +1063,12 @@ _request_firmware_prepare(struct firmware **firmware_p, const char *name, return -ENOMEM; } - if (fw_get_builtin_firmware(firmware, name)) { + if (fw_get_builtin_firmware(firmware, name, dbuf, size)) { dev_dbg(device, "using built-in %s\n", name); return 0; /* assigned */ } - ret = fw_lookup_and_allocate_buf(name, &fw_cache, &buf); + ret = fw_lookup_and_allocate_buf(name, &fw_cache, &buf, dbuf, size); /* * bind with 'buf' now to avoid warning in failure path @@ -1070,14 +1108,16 @@ static int assign_firmware_buf(struct firmware *fw, struct device *device, * should be fixed in devres or driver core. */ /* don't cache firmware handled without uevent */ - if (device && (opt_flags & FW_OPT_UEVENT)) + if (device && (opt_flags & FW_OPT_UEVENT) && + !(opt_flags & FW_OPT_NOCACHE)) fw_add_devm_name(device, buf->fw_id); /* * After caching firmware image is started, let it piggyback * on request firmware. 
*/ - if (buf->fwc->state == FW_LOADER_START_CACHE) { + if (!(opt_flags & FW_OPT_NOCACHE) && + buf->fwc->state == FW_LOADER_START_CACHE) { if (fw_cache_piggyback_on_request(buf->fw_id)) kref_get(&buf->ref); } @@ -1091,7 +1131,8 @@ static int assign_firmware_buf(struct firmware *fw, struct device *device, /* called from request_firmware() and request_firmware_work_func() */ static int _request_firmware(const struct firmware **firmware_p, const char *name, - struct device *device, unsigned int opt_flags) + struct device *device, void *buf, size_t size, + unsigned int opt_flags) { struct firmware *fw = NULL; long timeout; @@ -1105,7 +1146,7 @@ _request_firmware(const struct firmware **firmware_p, const char *name, goto out; } - ret = _request_firmware_prepare(&fw, name, device); + ret = _request_firmware_prepare(&fw, name, device, buf, size); if (ret <= 0) /* error or already assigned */ goto out; @@ -1184,7 +1225,7 @@ request_firmware(const struct firmware **firmware_p, const char *name, /* Need to pin this module until return */ __module_get(THIS_MODULE); - ret = _request_firmware(firmware_p, name, device, + ret = _request_firmware(firmware_p, name, device, NULL, 0, FW_OPT_UEVENT | FW_OPT_FALLBACK); module_put(THIS_MODULE); return ret; @@ -1208,7 +1249,7 @@ int request_firmware_direct(const struct firmware **firmware_p, int ret; __module_get(THIS_MODULE); - ret = _request_firmware(firmware_p, name, device, + ret = _request_firmware(firmware_p, name, device, NULL, 0, FW_OPT_UEVENT | FW_OPT_NO_WARN); module_put(THIS_MODULE); return ret; @@ -1216,6 +1257,36 @@ int request_firmware_direct(const struct firmware **firmware_p, EXPORT_SYMBOL_GPL(request_firmware_direct); /** + * request_firmware_into_buf - load firmware into a previously allocated buffer + * @firmware_p: pointer to firmware image + * @name: name of firmware file + * @device: device for which firmware is being loaded and DMA region allocated + * @buf: address of buffer to load firmware into + * @size: size of buffer + * + * This function works pretty much like request_firmware(), but it doesn't + * allocate a buffer to hold the firmware data. Instead, the firmware + * is loaded directly into the buffer pointed to by @buf and the @firmware_p + * data member is pointed at @buf. + * + * This function doesn't cache firmware either. 
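 *
 * A minimal usage sketch (editor's illustration; the firmware name and the
 * pre-allocated buffer and size below are assumptions, not taken from this
 * patch):
 *
 *	const struct firmware *fw;
 *	int err;
 *
 *	err = request_firmware_into_buf(&fw, "vendor/image.bin", dev,
 *					prealloc_buf, prealloc_size);
 *	if (err)
 *		return err;
 *	... consume fw->data (which points at prealloc_buf) for fw->size
 *	    bytes, then ...
 *	release_firmware(fw);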
+ */ +int +request_firmware_into_buf(const struct firmware **firmware_p, const char *name, + struct device *device, void *buf, size_t size) +{ + int ret; + + __module_get(THIS_MODULE); + ret = _request_firmware(firmware_p, name, device, buf, size, + FW_OPT_UEVENT | FW_OPT_FALLBACK | + FW_OPT_NOCACHE); + module_put(THIS_MODULE); + return ret; +} +EXPORT_SYMBOL(request_firmware_into_buf); + +/** * release_firmware: - release the resource associated with a firmware image * @fw: firmware resource to release **/ @@ -1247,7 +1318,7 @@ static void request_firmware_work_func(struct work_struct *work) fw_work = container_of(work, struct firmware_work, work); - _request_firmware(&fw, fw_work->name, fw_work->device, + _request_firmware(&fw, fw_work->name, fw_work->device, NULL, 0, fw_work->opt_flags); fw_work->cont(fw, fw_work->context); put_device(fw_work->device); /* taken in request_firmware_nowait() */ @@ -1380,7 +1451,7 @@ static int uncache_firmware(const char *fw_name) pr_debug("%s: %s\n", __func__, fw_name); - if (fw_get_builtin_firmware(&fw, fw_name)) + if (fw_get_builtin_firmware(&fw, fw_name, NULL, 0)) return 0; buf = fw_lookup_buf(fw_name); diff --git a/drivers/base/memory.c b/drivers/base/memory.c index f46dba8b7092..dc75de9059cd 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -391,6 +391,7 @@ static ssize_t show_valid_zones(struct device *dev, unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; struct page *first_page; struct zone *zone; + int zone_shift = 0; start_pfn = section_nr_to_pfn(mem->start_section_nr); end_pfn = start_pfn + nr_pages; @@ -402,21 +403,26 @@ static ssize_t show_valid_zones(struct device *dev, zone = page_zone(first_page); - if (zone_idx(zone) == ZONE_MOVABLE - 1) { - /*The mem block is the last memoryblock of this zone.*/ - if (end_pfn == zone_end_pfn(zone)) - return sprintf(buf, "%s %s\n", - zone->name, (zone + 1)->name); + /* MMOP_ONLINE_KEEP */ + sprintf(buf, "%s", zone->name); + + /* MMOP_ONLINE_KERNEL */ + zone_shift = zone_can_shift(start_pfn, nr_pages, ZONE_NORMAL); + if (zone_shift) { + strcat(buf, " "); + strcat(buf, (zone + zone_shift)->name); } - if (zone_idx(zone) == ZONE_MOVABLE) { - /*The mem block is the first memoryblock of ZONE_MOVABLE.*/ - if (start_pfn == zone->zone_start_pfn) - return sprintf(buf, "%s %s\n", - zone->name, (zone - 1)->name); + /* MMOP_ONLINE_MOVABLE */ + zone_shift = zone_can_shift(start_pfn, nr_pages, ZONE_MOVABLE); + if (zone_shift) { + strcat(buf, " "); + strcat(buf, (zone + zone_shift)->name); } - return sprintf(buf, "%s\n", zone->name); + strcat(buf, "\n"); + + return strlen(buf); } static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL); #endif diff --git a/drivers/base/node.c b/drivers/base/node.c index 560751bad294..ed0ef0f69489 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -113,6 +113,8 @@ static ssize_t node_read_meminfo(struct device *dev, "Node %d SUnreclaim: %8lu kB\n" #ifdef CONFIG_TRANSPARENT_HUGEPAGE "Node %d AnonHugePages: %8lu kB\n" + "Node %d ShmemHugePages: %8lu kB\n" + "Node %d ShmemPmdMapped: %8lu kB\n" #endif , nid, K(node_page_state(nid, NR_FILE_DIRTY)), @@ -131,10 +133,13 @@ static ssize_t node_read_meminfo(struct device *dev, node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE - nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)) - , nid, - K(node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) * - HPAGE_PMD_NR)); + nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), + 
nid, K(node_page_state(nid, NR_ANON_THPS) * + HPAGE_PMD_NR), + nid, K(node_page_state(nid, NR_SHMEM_THPS) * + HPAGE_PMD_NR), + nid, K(node_page_state(nid, NR_SHMEM_PMDMAPPED) * + HPAGE_PMD_NR)); #else nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))); #endif @@ -359,7 +364,7 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE #define page_initialized(page) (page->lru.next) -static int __init_refok get_nid_for_pfn(unsigned long pfn) +static int __ref get_nid_for_pfn(unsigned long pfn) { struct page *page; diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 386ba3d1a6ee..b8ecba6dcd3b 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -1,8 +1,7 @@ config ZRAM tristate "Compressed RAM block device support" - depends on BLOCK && SYSFS && ZSMALLOC - select LZO_COMPRESS - select LZO_DECOMPRESS + depends on BLOCK && SYSFS && ZSMALLOC && CRYPTO + select CRYPTO_LZO default n help Creates virtual block devices called /dev/zramX (X = 0, 1, ...). @@ -14,13 +13,3 @@ config ZRAM disks and maybe many more. See zram.txt for more information. - -config ZRAM_LZ4_COMPRESS - bool "Enable LZ4 algorithm support" - depends on ZRAM - select LZ4_COMPRESS - select LZ4_DECOMPRESS - default n - help - This option enables LZ4 compression algorithm support. Compression - algorithm can be changed using `comp_algorithm' device attribute.
\ No newline at end of file diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index be0763ff57a2..9e2b79e9a990 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,5 +1,3 @@ -zram-y := zcomp_lzo.o zcomp.o zram_drv.o - -zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o +zram-y := zcomp.o zram_drv.o obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index b51a816d766b..4b5cd3a7b2b6 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -14,108 +14,150 @@ #include <linux/wait.h> #include <linux/sched.h> #include <linux/cpu.h> +#include <linux/crypto.h> #include "zcomp.h" -#include "zcomp_lzo.h" -#ifdef CONFIG_ZRAM_LZ4_COMPRESS -#include "zcomp_lz4.h" -#endif -static struct zcomp_backend *backends[] = { - &zcomp_lzo, -#ifdef CONFIG_ZRAM_LZ4_COMPRESS - &zcomp_lz4, +static const char * const backends[] = { + "lzo", +#if IS_ENABLED(CONFIG_CRYPTO_LZ4) + "lz4", +#endif +#if IS_ENABLED(CONFIG_CRYPTO_DEFLATE) + "deflate", +#endif +#if IS_ENABLED(CONFIG_CRYPTO_LZ4HC) + "lz4hc", +#endif +#if IS_ENABLED(CONFIG_CRYPTO_842) + "842", #endif NULL }; -static struct zcomp_backend *find_backend(const char *compress) -{ - int i = 0; - while (backends[i]) { - if (sysfs_streq(compress, backends[i]->name)) - break; - i++; - } - return backends[i]; -} - -static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) +static void zcomp_strm_free(struct zcomp_strm *zstrm) { - if (zstrm->private) - comp->backend->destroy(zstrm->private); + if (!IS_ERR_OR_NULL(zstrm->tfm)) + crypto_free_comp(zstrm->tfm); free_pages((unsigned long)zstrm->buffer, 1); kfree(zstrm); } /* - * allocate new zcomp_strm structure with ->private initialized by + * allocate new zcomp_strm structure with ->tfm initialized by * backend, return NULL on error */ -static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) +static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) { - struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), flags); + struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL); if (!zstrm) return NULL; - zstrm->private = comp->backend->create(flags); + zstrm->tfm = crypto_alloc_comp(comp->name, 0, 0); /* * allocate 2 pages. 1 for compressed data, plus 1 extra for the * case when compressed size is larger than the original one */ - zstrm->buffer = (void *)__get_free_pages(flags | __GFP_ZERO, 1); - if (!zstrm->private || !zstrm->buffer) { - zcomp_strm_free(comp, zstrm); + zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + if (IS_ERR_OR_NULL(zstrm->tfm) || !zstrm->buffer) { + zcomp_strm_free(zstrm); zstrm = NULL; } return zstrm; } +bool zcomp_available_algorithm(const char *comp) +{ + int i = 0; + + while (backends[i]) { + if (sysfs_streq(comp, backends[i])) + return true; + i++; + } + + /* + * Crypto does not ignore a trailing new line symbol, + * so make sure you don't supply a string containing + * one. + * This also means that we permit zcomp initialisation + * with any compressing algorithm known to crypto api. 
+ */ + return crypto_has_comp(comp, 0, 0) == 1; +} + /* show available compressors */ ssize_t zcomp_available_show(const char *comp, char *buf) { + bool known_algorithm = false; ssize_t sz = 0; int i = 0; - while (backends[i]) { - if (!strcmp(comp, backends[i]->name)) + for (; backends[i]; i++) { + if (!strcmp(comp, backends[i])) { + known_algorithm = true; sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "[%s] ", backends[i]->name); - else + "[%s] ", backends[i]); + } else { sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "%s ", backends[i]->name); - i++; + "%s ", backends[i]); + } } + + /* + * Out-of-tree module known to crypto api or a missing + * entry in `backends'. + */ + if (!known_algorithm && crypto_has_comp(comp, 0, 0) == 1) + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "[%s] ", comp); + sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); return sz; } -bool zcomp_available_algorithm(const char *comp) -{ - return find_backend(comp) != NULL; -} - -struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) +struct zcomp_strm *zcomp_stream_get(struct zcomp *comp) { return *get_cpu_ptr(comp->stream); } -void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) +void zcomp_stream_put(struct zcomp *comp) { put_cpu_ptr(comp->stream); } -int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, - const unsigned char *src, size_t *dst_len) +int zcomp_compress(struct zcomp_strm *zstrm, + const void *src, unsigned int *dst_len) { - return comp->backend->compress(src, zstrm->buffer, dst_len, - zstrm->private); + /* + * Our dst memory (zstrm->buffer) is always `2 * PAGE_SIZE' sized + * because sometimes we can endup having a bigger compressed data + * due to various reasons: for example compression algorithms tend + * to add some padding to the compressed buffer. Speaking of padding, + * comp algorithm `842' pads the compressed length to multiple of 8 + * and returns -ENOSP when the dst memory is not big enough, which + * is not something that ZRAM wants to see. We can handle the + * `compressed_size > PAGE_SIZE' case easily in ZRAM, but when we + * receive -ERRNO from the compressing backend we can't help it + * anymore. To make `842' happy we need to tell the exact size of + * the dst buffer, zram_drv will take care of the fact that + * compressed buffer is too big. 
+ */ + *dst_len = PAGE_SIZE * 2; + + return crypto_comp_compress(zstrm->tfm, + src, PAGE_SIZE, + zstrm->buffer, dst_len); } -int zcomp_decompress(struct zcomp *comp, const unsigned char *src, - size_t src_len, unsigned char *dst) +int zcomp_decompress(struct zcomp_strm *zstrm, + const void *src, unsigned int src_len, void *dst) { - return comp->backend->decompress(src, src_len, dst); + unsigned int dst_len = PAGE_SIZE; + + return crypto_comp_decompress(zstrm->tfm, + src, src_len, + dst, &dst_len); } static int __zcomp_cpu_notifier(struct zcomp *comp, @@ -127,7 +169,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp, case CPU_UP_PREPARE: if (WARN_ON(*per_cpu_ptr(comp->stream, cpu))) break; - zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); + zstrm = zcomp_strm_alloc(comp); if (IS_ERR_OR_NULL(zstrm)) { pr_err("Can't allocate a compression stream\n"); return NOTIFY_BAD; @@ -138,7 +180,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp, case CPU_UP_CANCELED: zstrm = *per_cpu_ptr(comp->stream, cpu); if (!IS_ERR_OR_NULL(zstrm)) - zcomp_strm_free(comp, zstrm); + zcomp_strm_free(zstrm); *per_cpu_ptr(comp->stream, cpu) = NULL; break; default: @@ -209,18 +251,16 @@ void zcomp_destroy(struct zcomp *comp) struct zcomp *zcomp_create(const char *compress) { struct zcomp *comp; - struct zcomp_backend *backend; int error; - backend = find_backend(compress); - if (!backend) + if (!zcomp_available_algorithm(compress)) return ERR_PTR(-EINVAL); comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); if (!comp) return ERR_PTR(-ENOMEM); - comp->backend = backend; + comp->name = compress; error = zcomp_init(comp); if (error) { kfree(comp); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index ffd88cb747fe..478cac2ed465 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -13,33 +13,15 @@ struct zcomp_strm { /* compression/decompression buffer */ void *buffer; - /* - * The private data of the compression stream, only compression - * stream backend can touch this (e.g. 
compression algorithm - * working memory) - */ - void *private; -}; - -/* static compression backend */ -struct zcomp_backend { - int (*compress)(const unsigned char *src, unsigned char *dst, - size_t *dst_len, void *private); - - int (*decompress)(const unsigned char *src, size_t src_len, - unsigned char *dst); - - void *(*create)(gfp_t flags); - void (*destroy)(void *private); - - const char *name; + struct crypto_comp *tfm; }; /* dynamic per-device compression frontend */ struct zcomp { struct zcomp_strm * __percpu *stream; - struct zcomp_backend *backend; struct notifier_block notifier; + + const char *name; }; ssize_t zcomp_available_show(const char *comp, char *buf); @@ -48,14 +30,14 @@ bool zcomp_available_algorithm(const char *comp); struct zcomp *zcomp_create(const char *comp); void zcomp_destroy(struct zcomp *comp); -struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); -void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm); +struct zcomp_strm *zcomp_stream_get(struct zcomp *comp); +void zcomp_stream_put(struct zcomp *comp); -int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, - const unsigned char *src, size_t *dst_len); +int zcomp_compress(struct zcomp_strm *zstrm, + const void *src, unsigned int *dst_len); -int zcomp_decompress(struct zcomp *comp, const unsigned char *src, - size_t src_len, unsigned char *dst); +int zcomp_decompress(struct zcomp_strm *zstrm, + const void *src, unsigned int src_len, void *dst); bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); #endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c deleted file mode 100644 index 0110086accba..000000000000 --- a/drivers/block/zram/zcomp_lz4.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 2014 Sergey Senozhatsky. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/lz4.h> -#include <linux/vmalloc.h> -#include <linux/mm.h> - -#include "zcomp_lz4.h" - -static void *zcomp_lz4_create(gfp_t flags) -{ - void *ret; - - ret = kmalloc(LZ4_MEM_COMPRESS, flags); - if (!ret) - ret = __vmalloc(LZ4_MEM_COMPRESS, - flags | __GFP_HIGHMEM, - PAGE_KERNEL); - return ret; -} - -static void zcomp_lz4_destroy(void *private) -{ - kvfree(private); -} - -static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, - size_t *dst_len, void *private) -{ - /* return : Success if return 0 */ - return lz4_compress(src, PAGE_SIZE, dst, dst_len, private); -} - -static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len, - unsigned char *dst) -{ - size_t dst_len = PAGE_SIZE; - /* return : Success if return 0 */ - return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len); -} - -struct zcomp_backend zcomp_lz4 = { - .compress = zcomp_lz4_compress, - .decompress = zcomp_lz4_decompress, - .create = zcomp_lz4_create, - .destroy = zcomp_lz4_destroy, - .name = "lz4", -}; diff --git a/drivers/block/zram/zcomp_lz4.h b/drivers/block/zram/zcomp_lz4.h deleted file mode 100644 index 60613fb29dd8..000000000000 --- a/drivers/block/zram/zcomp_lz4.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) 2014 Sergey Senozhatsky. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _ZCOMP_LZ4_H_ -#define _ZCOMP_LZ4_H_ - -#include "zcomp.h" - -extern struct zcomp_backend zcomp_lz4; - -#endif /* _ZCOMP_LZ4_H_ */ diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c deleted file mode 100644 index ed7a1f0549ec..000000000000 --- a/drivers/block/zram/zcomp_lzo.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 2014 Sergey Senozhatsky. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/lzo.h> -#include <linux/vmalloc.h> -#include <linux/mm.h> - -#include "zcomp_lzo.h" - -static void *lzo_create(gfp_t flags) -{ - void *ret; - - ret = kmalloc(LZO1X_MEM_COMPRESS, flags); - if (!ret) - ret = __vmalloc(LZO1X_MEM_COMPRESS, - flags | __GFP_HIGHMEM, - PAGE_KERNEL); - return ret; -} - -static void lzo_destroy(void *private) -{ - kvfree(private); -} - -static int lzo_compress(const unsigned char *src, unsigned char *dst, - size_t *dst_len, void *private) -{ - int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private); - return ret == LZO_E_OK ? 0 : ret; -} - -static int lzo_decompress(const unsigned char *src, size_t src_len, - unsigned char *dst) -{ - size_t dst_len = PAGE_SIZE; - int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len); - return ret == LZO_E_OK ? 0 : ret; -} - -struct zcomp_backend zcomp_lzo = { - .compress = lzo_compress, - .decompress = lzo_decompress, - .create = lzo_create, - .destroy = lzo_destroy, - .name = "lzo", -}; diff --git a/drivers/block/zram/zcomp_lzo.h b/drivers/block/zram/zcomp_lzo.h deleted file mode 100644 index 128c5807fa14..000000000000 --- a/drivers/block/zram/zcomp_lzo.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) 2014 Sergey Senozhatsky. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#ifndef _ZCOMP_LZO_H_ -#define _ZCOMP_LZO_H_ - -#include "zcomp.h" - -extern struct zcomp_backend zcomp_lzo; - -#endif /* _ZCOMP_LZO_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e5e5d19f2172..7454cf188c8e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -342,9 +342,16 @@ static ssize_t comp_algorithm_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); + char compressor[CRYPTO_MAX_ALG_NAME]; size_t sz; - if (!zcomp_available_algorithm(buf)) + strlcpy(compressor, buf, sizeof(compressor)); + /* ignore trailing newline */ + sz = strlen(compressor); + if (sz > 0 && compressor[sz - 1] == '\n') + compressor[sz - 1] = 0x00; + + if (!zcomp_available_algorithm(compressor)) return -EINVAL; down_write(&zram->init_lock); @@ -353,13 +360,8 @@ static ssize_t comp_algorithm_store(struct device *dev, pr_info("Can't change algorithm for initialized device\n"); return -EBUSY; } - strlcpy(zram->compressor, buf, sizeof(zram->compressor)); - - /* ignore trailing newline */ - sz = strlen(zram->compressor); - if (sz > 0 && zram->compressor[sz - 1] == '\n') - zram->compressor[sz - 1] = 0x00; + strlcpy(zram->compressor, compressor, sizeof(compressor)); up_write(&zram->init_lock); return len; } @@ -563,7 +565,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) unsigned char *cmem; struct zram_meta *meta = zram->meta; unsigned long handle; - size_t size; + unsigned int size; bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); handle = meta->table[index].handle; @@ -576,10 +578,14 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) } cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); - if (size == PAGE_SIZE) + if (size == PAGE_SIZE) { copy_page(mem, cmem); - else - ret = zcomp_decompress(zram->comp, cmem, size, mem); + } else { + struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp); + + ret = zcomp_decompress(zstrm, cmem, size, mem); + zcomp_stream_put(zram->comp); + } zs_unmap_object(meta->mem_pool, handle); bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); @@ -646,7 +652,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, int offset) { int ret = 0; - size_t clen; + unsigned int clen; unsigned long handle = 0; struct page *page; unsigned char *user_mem, *cmem, *src, *uncmem = NULL; @@ -695,8 +701,8 @@ compress_again: goto out; } - zstrm = zcomp_strm_find(zram->comp); - ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); + zstrm = zcomp_stream_get(zram->comp); + ret = zcomp_compress(zstrm, uncmem, &clen); if (!is_partial_io(bvec)) { kunmap_atomic(user_mem); user_mem = NULL; @@ -732,19 +738,21 @@ compress_again: handle = zs_malloc(meta->mem_pool, clen, __GFP_KSWAPD_RECLAIM | __GFP_NOWARN | - __GFP_HIGHMEM); + __GFP_HIGHMEM | + __GFP_MOVABLE); if (!handle) { - zcomp_strm_release(zram->comp, zstrm); + zcomp_stream_put(zram->comp); zstrm = NULL; atomic64_inc(&zram->stats.writestall); handle = zs_malloc(meta->mem_pool, clen, - GFP_NOIO | __GFP_HIGHMEM); + GFP_NOIO | __GFP_HIGHMEM | + __GFP_MOVABLE); if (handle) goto compress_again; - pr_err("Error allocating memory for compressed page: %u, size=%zu\n", + pr_err("Error allocating memory for compressed page: %u, size=%u\n", index, clen); ret = -ENOMEM; goto out; @@ -769,7 +777,7 @@ compress_again: memcpy(cmem, src, clen); } - zcomp_strm_release(zram->comp, zstrm); + zcomp_stream_put(zram->comp); zstrm = NULL; 
zs_unmap_object(meta->mem_pool, handle); @@ -789,7 +797,7 @@ compress_again: atomic64_inc(&zram->stats.pages_stored); out: if (zstrm) - zcomp_strm_release(zram->comp, zstrm); + zcomp_stream_put(zram->comp); if (is_partial_io(bvec)) kfree(uncmem); return ret; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 3f5bf66a27e4..74fcf10da374 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -15,8 +15,9 @@ #ifndef _ZRAM_DRV_H_ #define _ZRAM_DRV_H_ -#include <linux/spinlock.h> +#include <linux/rwsem.h> #include <linux/zsmalloc.h> +#include <linux/crypto.h> #include "zcomp.h" @@ -113,7 +114,7 @@ struct zram { * we can store in a disk. */ u64 disksize; /* bytes */ - char compressor[10]; + char compressor[CRYPTO_MAX_ALG_NAME]; /* * zram is claimed so open request will be failed */ diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 71025c2f6bbb..9656f1095c19 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -22,6 +22,7 @@ #include <linux/device.h> #include <linux/highmem.h> #include <linux/backing-dev.h> +#include <linux/shmem_fs.h> #include <linux/splice.h> #include <linux/pfn.h> #include <linux/export.h> @@ -661,6 +662,28 @@ static int mmap_zero(struct file *file, struct vm_area_struct *vma) return 0; } +static unsigned long get_unmapped_area_zero(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ +#ifdef CONFIG_MMU + if (flags & MAP_SHARED) { + /* + * mmap_zero() will call shmem_zero_setup() to create a file, + * so use shmem's get_unmapped_area in case it can be huge; + * and pass NULL for file as in mmap.c's get_unmapped_area(), + * so as not to confuse shmem with our handle on "/dev/zero". + */ + return shmem_get_unmapped_area(NULL, addr, len, pgoff, flags); + } + + /* Otherwise flags & MAP_PRIVATE: with no shmem object beneath it */ + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); +#else + return -ENOSYS; +#endif +} + static ssize_t write_full(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -768,6 +791,7 @@ static const struct file_operations zero_fops = { .read_iter = read_iter_zero, .write_iter = write_iter_zero, .mmap = mmap_zero, + .get_unmapped_area = get_unmapped_area_zero, #ifndef CONFIG_MMU .mmap_capabilities = zero_mmap_capabilities, #endif diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c index 89cc700fbc37..97ae60fa1584 100644 --- a/drivers/clk/clkdev.c +++ b/drivers/clk/clkdev.c @@ -250,7 +250,7 @@ struct clk_lookup_alloc { char con_id[MAX_CON_ID]; }; -static struct clk_lookup * __init_refok +static struct clk_lookup * __ref vclkdev_alloc(struct clk_hw *hw, const char *con_id, const char *dev_fmt, va_list ap) { @@ -287,7 +287,7 @@ vclkdev_create(struct clk_hw *hw, const char *con_id, const char *dev_fmt, return cl; } -struct clk_lookup * __init_refok +struct clk_lookup * __ref clkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, ...) 
{ struct clk_lookup *cl; diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index b891a129b275..9eacd4d1c736 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -211,13 +211,9 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, } dax_dev->dev = dev; - rc = devm_add_action(dax_region->dev, unregister_dax_dev, dev); - if (rc) { - unregister_dax_dev(dev); - return rc; - } + rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); - return 0; + return rc; err_create: ida_simple_remove(&dax_minor_ida, minor); diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index 55d510e36cd1..f5bbbcac0735 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -102,21 +102,19 @@ static int dax_pmem_probe(struct device *dev) if (rc) return rc; - rc = devm_add_action(dev, dax_pmem_percpu_exit, &dax_pmem->ref); - if (rc) { - dax_pmem_percpu_exit(&dax_pmem->ref); + rc = devm_add_action_or_reset(dev, dax_pmem_percpu_exit, + &dax_pmem->ref); + if (rc) return rc; - } addr = devm_memremap_pages(dev, &res, &dax_pmem->ref, altmap); if (IS_ERR(addr)) return PTR_ERR(addr); - rc = devm_add_action(dev, dax_pmem_percpu_kill, &dax_pmem->ref); - if (rc) { - dax_pmem_percpu_kill(&dax_pmem->ref); + rc = devm_add_action_or_reset(dev, dax_pmem_percpu_kill, + &dax_pmem->ref); + if (rc) return rc; - } nd_region = to_nd_region(dev->parent); dax_region = alloc_dax_region(dev, nd_region->id, &res, diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 0d52ceb0846d..594849a3a9be 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -538,8 +538,7 @@ static void do_fault(struct work_struct *work) if (access_error(vma, fault)) goto out; - ret = handle_mm_fault(mm, vma, address, flags); - + ret = handle_mm_fault(vma, address, flags); out: up_read(&mm->mmap_sem); diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index d9939fa9b588..8ebb3530afa7 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -583,7 +583,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) if (access_error(vma, req)) goto invalid; - ret = handle_mm_fault(svm->mm, vma, address, + ret = handle_mm_fault(vma, address, req->wr_req ? 
FAULT_FLAG_WRITE : 0); if (ret & VM_FAULT_ERROR) goto invalid; diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 3cd68152ddf8..1efc9103bfd8 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -2340,23 +2340,11 @@ static struct memstick_driver msb_driver = { .resume = msb_resume }; -static int major; - static int __init msb_init(void) { - int rc = register_blkdev(0, DRIVER_NAME); - - if (rc < 0) { - pr_err("failed to register major (error %d)\n", rc); - return rc; - } - - major = rc; - rc = memstick_register_driver(&msb_driver); - if (rc) { - unregister_blkdev(major, DRIVER_NAME); + int rc = memstick_register_driver(&msb_driver); + if (rc) pr_err("failed to register memstick driver (error %d)\n", rc); - } return rc; } @@ -2364,7 +2352,6 @@ static int __init msb_init(void) static void __exit msb_exit(void) { memstick_unregister_driver(&msb_driver); - unregister_blkdev(major, DRIVER_NAME); idr_destroy(&msb_disk_idr); } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9d7cee463fd6..8a0267f3031f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -47,9 +47,6 @@ unsigned char shutdown_timeout = 5; module_param(shutdown_timeout, byte, 0644); MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); -static int nvme_major; -module_param(nvme_major, int, 0); - static int nvme_char_major; module_param(nvme_char_major, int, 0); @@ -1475,8 +1472,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); nvme_set_queue_limits(ctrl, ns->queue); - disk->major = nvme_major; - disk->first_minor = 0; disk->fops = &nvme_fops; disk->private_data = ns; disk->queue = ns->queue; @@ -1887,16 +1882,10 @@ int __init nvme_core_init(void) { int result; - result = register_blkdev(nvme_major, "nvme"); - if (result < 0) - return result; - else if (result > 0) - nvme_major = result; - result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", &nvme_dev_fops); if (result < 0) - goto unregister_blkdev; + return result; else if (result > 0) nvme_char_major = result; @@ -1910,8 +1899,6 @@ int __init nvme_core_init(void) unregister_chrdev: __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); - unregister_blkdev: - unregister_blkdev(nvme_major, "nvme"); return result; } @@ -1919,7 +1906,6 @@ void nvme_core_exit(void) { class_destroy(nvme_class); __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); - unregister_blkdev(nvme_major, "nvme"); } MODULE_LICENSE("GPL"); diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index 5f70fee59a94..d6ff5e82377d 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -1086,7 +1086,7 @@ out: return err; } -static void __init_refok pcifront_backend_changed(struct xenbus_device *xdev, +static void __ref pcifront_backend_changed(struct xenbus_device *xdev, enum xenbus_state be_state) { struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev); diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index e5139402e7f8..52bbd27e93ae 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -363,6 +363,7 @@ static void moom_callback(struct work_struct *ignored) struct oom_control oc = { .zonelist = node_zonelist(first_memory_node, gfp_mask), .nodemask = NULL, + .memcg = NULL, .gfp_mask = gfp_mask, .order = -1, }; diff --git a/drivers/video/logo/logo.c b/drivers/video/logo/logo.c index 10fbfd8ab963..b6bc4a0bda2a 100644 --- 
a/drivers/video/logo/logo.c +++ b/drivers/video/logo/logo.c @@ -36,11 +36,11 @@ static int __init fb_logo_late_init(void) late_initcall(fb_logo_late_init); -/* logo's are marked __initdata. Use __init_refok to tell +/* logo's are marked __initdata. Use __ref to tell * modpost that it is intended that this function uses data * marked __initdata. */ -const struct linux_logo * __init_refok fb_find_logo(int depth) +const struct linux_logo * __ref fb_find_logo(int depth) { const struct linux_logo *logo = NULL; diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 476c0e3a7150..888d5f8322ce 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -30,6 +30,7 @@ #include <linux/oom.h> #include <linux/wait.h> #include <linux/mm.h> +#include <linux/mount.h> /* * Balloon device works in 4K page units. So each page is pointed to by @@ -45,6 +46,10 @@ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES; module_param(oom_pages, int, S_IRUSR | S_IWUSR); MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); +#ifdef CONFIG_BALLOON_COMPACTION +static struct vfsmount *balloon_mnt; +#endif + struct virtio_balloon { struct virtio_device *vdev; struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; @@ -490,6 +495,24 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, return MIGRATEPAGE_SUCCESS; } + +static struct dentry *balloon_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + static const struct dentry_operations ops = { + .d_dname = simple_dname, + }; + + return mount_pseudo(fs_type, "balloon-kvm:", NULL, &ops, + BALLOON_KVM_MAGIC); +} + +static struct file_system_type balloon_fs = { + .name = "balloon-kvm", + .mount = balloon_mount, + .kill_sb = kill_anon_super, +}; + #endif /* CONFIG_BALLOON_COMPACTION */ static int virtballoon_probe(struct virtio_device *vdev) @@ -519,9 +542,6 @@ static int virtballoon_probe(struct virtio_device *vdev) vb->vdev = vdev; balloon_devinfo_init(&vb->vb_dev_info); -#ifdef CONFIG_BALLOON_COMPACTION - vb->vb_dev_info.migratepage = virtballoon_migratepage; -#endif err = init_vqs(vb); if (err) @@ -531,13 +551,33 @@ static int virtballoon_probe(struct virtio_device *vdev) vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY; err = register_oom_notifier(&vb->nb); if (err < 0) - goto out_oom_notify; + goto out_del_vqs; + +#ifdef CONFIG_BALLOON_COMPACTION + balloon_mnt = kern_mount(&balloon_fs); + if (IS_ERR(balloon_mnt)) { + err = PTR_ERR(balloon_mnt); + unregister_oom_notifier(&vb->nb); + goto out_del_vqs; + } + + vb->vb_dev_info.migratepage = virtballoon_migratepage; + vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb); + if (IS_ERR(vb->vb_dev_info.inode)) { + err = PTR_ERR(vb->vb_dev_info.inode); + kern_unmount(balloon_mnt); + unregister_oom_notifier(&vb->nb); + vb->vb_dev_info.inode = NULL; + goto out_del_vqs; + } + vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops; +#endif virtio_device_ready(vdev); return 0; -out_oom_notify: +out_del_vqs: vdev->config->del_vqs(vdev); out_free_vb: kfree(vb); @@ -571,6 +611,8 @@ static void virtballoon_remove(struct virtio_device *vdev) cancel_work_sync(&vb->update_balloon_stats_work); remove_common(vb); + if (vb->vb_dev_info.inode) + iput(vb->vb_dev_info.inode); kfree(vb); } diff --git a/drivers/w1/slaves/w1_ds2406.c b/drivers/w1/slaves/w1_ds2406.c index d488961a8c90..51f2f66d6555 100644 --- a/drivers/w1/slaves/w1_ds2406.c +++ b/drivers/w1/slaves/w1_ds2406.c @@ -153,16 +153,4 @@ static struct w1_family 
w1_family_12 = { .fid = W1_FAMILY_DS2406, .fops = &w1_f12_fops, }; - -static int __init w1_f12_init(void) -{ - return w1_register_family(&w1_family_12); -} - -static void __exit w1_f12_exit(void) -{ - w1_unregister_family(&w1_family_12); -} - -module_init(w1_f12_init); -module_exit(w1_f12_exit); +module_w1_family(w1_family_12); diff --git a/drivers/w1/slaves/w1_ds2408.c b/drivers/w1/slaves/w1_ds2408.c index 7dfa0e11688a..aec5958e66e9 100644 --- a/drivers/w1/slaves/w1_ds2408.c +++ b/drivers/w1/slaves/w1_ds2408.c @@ -351,16 +351,4 @@ static struct w1_family w1_family_29 = { .fid = W1_FAMILY_DS2408, .fops = &w1_f29_fops, }; - -static int __init w1_f29_init(void) -{ - return w1_register_family(&w1_family_29); -} - -static void __exit w1_f29_exit(void) -{ - w1_unregister_family(&w1_family_29); -} - -module_init(w1_f29_init); -module_exit(w1_f29_exit); +module_w1_family(w1_family_29); diff --git a/drivers/w1/slaves/w1_ds2413.c b/drivers/w1/slaves/w1_ds2413.c index ee28fc1ff390..f2e1c51533b9 100644 --- a/drivers/w1/slaves/w1_ds2413.c +++ b/drivers/w1/slaves/w1_ds2413.c @@ -135,16 +135,4 @@ static struct w1_family w1_family_3a = { .fid = W1_FAMILY_DS2413, .fops = &w1_f3a_fops, }; - -static int __init w1_f3a_init(void) -{ - return w1_register_family(&w1_family_3a); -} - -static void __exit w1_f3a_exit(void) -{ - w1_unregister_family(&w1_family_3a); -} - -module_init(w1_f3a_init); -module_exit(w1_f3a_exit); +module_w1_family(w1_family_3a); diff --git a/drivers/w1/slaves/w1_ds2423.c b/drivers/w1/slaves/w1_ds2423.c index 7e41b7d91fb5..4ab54fd9dde2 100644 --- a/drivers/w1/slaves/w1_ds2423.c +++ b/drivers/w1/slaves/w1_ds2423.c @@ -138,19 +138,7 @@ static struct w1_family w1_family_1d = { .fid = W1_COUNTER_DS2423, .fops = &w1_f1d_fops, }; - -static int __init w1_f1d_init(void) -{ - return w1_register_family(&w1_family_1d); -} - -static void __exit w1_f1d_exit(void) -{ - w1_unregister_family(&w1_family_1d); -} - -module_init(w1_f1d_init); -module_exit(w1_f1d_exit); +module_w1_family(w1_family_1d); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Mika Laitio <lamikr@pilppa.org>"); diff --git a/drivers/w1/slaves/w1_ds2431.c b/drivers/w1/slaves/w1_ds2431.c index 9c4ff9d28adc..80572cb63ba8 100644 --- a/drivers/w1/slaves/w1_ds2431.c +++ b/drivers/w1/slaves/w1_ds2431.c @@ -288,19 +288,7 @@ static struct w1_family w1_family_2d = { .fid = W1_EEPROM_DS2431, .fops = &w1_f2d_fops, }; - -static int __init w1_f2d_init(void) -{ - return w1_register_family(&w1_family_2d); -} - -static void __exit w1_f2d_fini(void) -{ - w1_unregister_family(&w1_family_2d); -} - -module_init(w1_f2d_init); -module_exit(w1_f2d_fini); +module_w1_family(w1_family_2d); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Bernhard Weirich <bernhard.weirich@riedel.net>"); diff --git a/drivers/w1/slaves/w1_ds2433.c b/drivers/w1/slaves/w1_ds2433.c index 72319a968a9e..6cf378c89ecb 100644 --- a/drivers/w1/slaves/w1_ds2433.c +++ b/drivers/w1/slaves/w1_ds2433.c @@ -305,16 +305,4 @@ static struct w1_family w1_family_23 = { .fid = W1_EEPROM_DS2433, .fops = &w1_f23_fops, }; - -static int __init w1_f23_init(void) -{ - return w1_register_family(&w1_family_23); -} - -static void __exit w1_f23_fini(void) -{ - w1_unregister_family(&w1_family_23); -} - -module_init(w1_f23_init); -module_exit(w1_f23_fini); +module_w1_family(w1_family_23); diff --git a/drivers/w1/slaves/w1_ds2760.c b/drivers/w1/slaves/w1_ds2760.c index d9079d48d112..ffa37f773b3b 100644 --- a/drivers/w1/slaves/w1_ds2760.c +++ b/drivers/w1/slaves/w1_ds2760.c @@ -121,25 +121,14 @@ static const struct attribute_group 
*w1_ds2760_groups[] = { NULL, }; -static DEFINE_IDA(bat_ida); - static int w1_ds2760_add_slave(struct w1_slave *sl) { int ret; - int id; struct platform_device *pdev; - id = ida_simple_get(&bat_ida, 0, 0, GFP_KERNEL); - if (id < 0) { - ret = id; - goto noid; - } - - pdev = platform_device_alloc("ds2760-battery", id); - if (!pdev) { - ret = -ENOMEM; - goto pdev_alloc_failed; - } + pdev = platform_device_alloc("ds2760-battery", PLATFORM_DEVID_AUTO); + if (!pdev) + return -ENOMEM; pdev->dev.parent = &sl->dev; ret = platform_device_add(pdev); @@ -148,24 +137,19 @@ static int w1_ds2760_add_slave(struct w1_slave *sl) dev_set_drvdata(&sl->dev, pdev); - goto success; + return 0; pdev_add_failed: platform_device_put(pdev); -pdev_alloc_failed: - ida_simple_remove(&bat_ida, id); -noid: -success: + return ret; } static void w1_ds2760_remove_slave(struct w1_slave *sl) { struct platform_device *pdev = dev_get_drvdata(&sl->dev); - int id = pdev->id; platform_device_unregister(pdev); - ida_simple_remove(&bat_ida, id); } static struct w1_family_ops w1_ds2760_fops = { @@ -178,28 +162,13 @@ static struct w1_family w1_ds2760_family = { .fid = W1_FAMILY_DS2760, .fops = &w1_ds2760_fops, }; - -static int __init w1_ds2760_init(void) -{ - pr_info("1-Wire driver for the DS2760 battery monitor chip - (c) 2004-2005, Szabolcs Gyurko\n"); - ida_init(&bat_ida); - return w1_register_family(&w1_ds2760_family); -} - -static void __exit w1_ds2760_exit(void) -{ - w1_unregister_family(&w1_ds2760_family); - ida_destroy(&bat_ida); -} +module_w1_family(w1_ds2760_family); EXPORT_SYMBOL(w1_ds2760_read); EXPORT_SYMBOL(w1_ds2760_write); EXPORT_SYMBOL(w1_ds2760_store_eeprom); EXPORT_SYMBOL(w1_ds2760_recall_eeprom); -module_init(w1_ds2760_init); -module_exit(w1_ds2760_exit); - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Szabolcs Gyurko <szabolcs.gyurko@tlt.hu>"); MODULE_DESCRIPTION("1-wire Driver Dallas 2760 battery monitor chip"); diff --git a/drivers/w1/slaves/w1_ds2780.c b/drivers/w1/slaves/w1_ds2780.c index 50e85f7929d4..f5c2aa429a92 100644 --- a/drivers/w1/slaves/w1_ds2780.c +++ b/drivers/w1/slaves/w1_ds2780.c @@ -113,25 +113,14 @@ static const struct attribute_group *w1_ds2780_groups[] = { NULL, }; -static DEFINE_IDA(bat_ida); - static int w1_ds2780_add_slave(struct w1_slave *sl) { int ret; - int id; struct platform_device *pdev; - id = ida_simple_get(&bat_ida, 0, 0, GFP_KERNEL); - if (id < 0) { - ret = id; - goto noid; - } - - pdev = platform_device_alloc("ds2780-battery", id); - if (!pdev) { - ret = -ENOMEM; - goto pdev_alloc_failed; - } + pdev = platform_device_alloc("ds2780-battery", PLATFORM_DEVID_AUTO); + if (!pdev) + return -ENOMEM; pdev->dev.parent = &sl->dev; ret = platform_device_add(pdev); @@ -144,19 +133,15 @@ static int w1_ds2780_add_slave(struct w1_slave *sl) pdev_add_failed: platform_device_put(pdev); -pdev_alloc_failed: - ida_simple_remove(&bat_ida, id); -noid: + return ret; } static void w1_ds2780_remove_slave(struct w1_slave *sl) { struct platform_device *pdev = dev_get_drvdata(&sl->dev); - int id = pdev->id; platform_device_unregister(pdev); - ida_simple_remove(&bat_ida, id); } static struct w1_family_ops w1_ds2780_fops = { @@ -169,21 +154,7 @@ static struct w1_family w1_ds2780_family = { .fid = W1_FAMILY_DS2780, .fops = &w1_ds2780_fops, }; - -static int __init w1_ds2780_init(void) -{ - ida_init(&bat_ida); - return w1_register_family(&w1_ds2780_family); -} - -static void __exit w1_ds2780_exit(void) -{ - w1_unregister_family(&w1_ds2780_family); - ida_destroy(&bat_ida); -} - -module_init(w1_ds2780_init); 
-module_exit(w1_ds2780_exit); +module_w1_family(w1_ds2780_family); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Clifton Barnes <cabarnes@indesign-llc.com>"); diff --git a/drivers/w1/slaves/w1_ds2781.c b/drivers/w1/slaves/w1_ds2781.c index 1eb98fb1688d..9c03e014cf9e 100644 --- a/drivers/w1/slaves/w1_ds2781.c +++ b/drivers/w1/slaves/w1_ds2781.c @@ -17,7 +17,6 @@ #include <linux/types.h> #include <linux/platform_device.h> #include <linux/mutex.h> -#include <linux/idr.h> #include "../w1.h" #include "../w1_int.h" @@ -111,25 +110,14 @@ static const struct attribute_group *w1_ds2781_groups[] = { NULL, }; -static DEFINE_IDA(bat_ida); - static int w1_ds2781_add_slave(struct w1_slave *sl) { int ret; - int id; struct platform_device *pdev; - id = ida_simple_get(&bat_ida, 0, 0, GFP_KERNEL); - if (id < 0) { - ret = id; - goto noid; - } - - pdev = platform_device_alloc("ds2781-battery", id); - if (!pdev) { - ret = -ENOMEM; - goto pdev_alloc_failed; - } + pdev = platform_device_alloc("ds2781-battery", PLATFORM_DEVID_AUTO); + if (!pdev) + return -ENOMEM; pdev->dev.parent = &sl->dev; ret = platform_device_add(pdev); @@ -142,19 +130,15 @@ static int w1_ds2781_add_slave(struct w1_slave *sl) pdev_add_failed: platform_device_put(pdev); -pdev_alloc_failed: - ida_simple_remove(&bat_ida, id); -noid: + return ret; } static void w1_ds2781_remove_slave(struct w1_slave *sl) { struct platform_device *pdev = dev_get_drvdata(&sl->dev); - int id = pdev->id; platform_device_unregister(pdev); - ida_simple_remove(&bat_ida, id); } static struct w1_family_ops w1_ds2781_fops = { @@ -167,21 +151,7 @@ static struct w1_family w1_ds2781_family = { .fid = W1_FAMILY_DS2781, .fops = &w1_ds2781_fops, }; - -static int __init w1_ds2781_init(void) -{ - ida_init(&bat_ida); - return w1_register_family(&w1_ds2781_family); -} - -static void __exit w1_ds2781_exit(void) -{ - w1_unregister_family(&w1_ds2781_family); - ida_destroy(&bat_ida); -} - -module_init(w1_ds2781_init); -module_exit(w1_ds2781_exit); +module_w1_family(w1_ds2781_family); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Renata Sayakhova <renata@oktetlabs.ru>"); diff --git a/drivers/w1/slaves/w1_ds28e04.c b/drivers/w1/slaves/w1_ds28e04.c index 365d6dff21de..5e348d38ec5c 100644 --- a/drivers/w1/slaves/w1_ds28e04.c +++ b/drivers/w1/slaves/w1_ds28e04.c @@ -427,16 +427,4 @@ static struct w1_family w1_family_1C = { .fid = W1_FAMILY_DS28E04, .fops = &w1_f1C_fops, }; - -static int __init w1_f1C_init(void) -{ - return w1_register_family(&w1_family_1C); -} - -static void __exit w1_f1C_fini(void) -{ - w1_unregister_family(&w1_family_1C); -} - -module_init(w1_f1C_init); -module_exit(w1_f1C_fini); +module_w1_family(w1_family_1C); diff --git a/drivers/w1/w1_family.h b/drivers/w1/w1_family.h index ed5dcb80a1f7..10a7a0767187 100644 --- a/drivers/w1/w1_family.h +++ b/drivers/w1/w1_family.h @@ -88,4 +88,16 @@ struct w1_family * w1_family_registered(u8); void w1_unregister_family(struct w1_family *); int w1_register_family(struct w1_family *); +/** + * module_w1_driver() - Helper macro for registering a 1-Wire families + * @__w1_family: w1_family struct + * + * Helper macro for 1-Wire families which do not do anything special in module + * init/exit. This eliminates a lot of boilerplate. 
Each module may only + * use this macro once, and calling it replaces module_init() and module_exit() + */ +#define module_w1_family(__w1_family) \ + module_driver(__w1_family, w1_register_family, \ + w1_unregister_family) + #endif /* __W1_FAMILY_H */ diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 53a085fca00c..66620713242a 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -195,7 +195,7 @@ static void selfballoon_process(struct work_struct *work) MB2PAGES(selfballoon_reserved_mb); #ifdef CONFIG_FRONTSWAP /* allow space for frontswap pages to be repatriated */ - if (frontswap_selfshrinking && frontswap_enabled) + if (frontswap_selfshrinking) goal_pages += frontswap_curr_pages(); #endif if (cur_pages > goal_pages) @@ -230,7 +230,7 @@ static void selfballoon_process(struct work_struct *work) reset_timer = true; } #ifdef CONFIG_FRONTSWAP - if (frontswap_selfshrinking && frontswap_enabled) { + if (frontswap_selfshrinking) { frontswap_selfshrink(); reset_timer = true; } diff --git a/fs/befs/befs.h b/fs/befs/befs.h index e0f59263a96d..c5c6cd13709c 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -140,18 +140,6 @@ befs_iaddrs_per_block(struct super_block *sb) return BEFS_SB(sb)->block_size / sizeof (befs_disk_inode_addr); } -static inline int -befs_iaddr_is_empty(const befs_inode_addr *iaddr) -{ - return (!iaddr->allocation_group) && (!iaddr->start) && (!iaddr->len); -} - -static inline size_t -befs_brun_size(struct super_block *sb, befs_block_run run) -{ - return BEFS_SB(sb)->block_size * run.len; -} - #include "endian.h" #endif /* _LINUX_BEFS_H */ diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c index af1bc19b7c85..26cc417cbdce 100644 --- a/fs/befs/datastream.c +++ b/fs/befs/datastream.c @@ -326,7 +326,7 @@ befs_find_brun_indirect(struct super_block *sb, /* Examine blocks of the indirect run one at a time */ for (i = 0; i < indirect.len; i++) { - indirblock = befs_bread(sb, indirblockno + i); + indirblock = sb_bread(sb, indirblockno + i); if (indirblock == NULL) { befs_debug(sb, "---> %s failed to read " "disk block %lu from the indirect brun", @@ -471,7 +471,7 @@ befs_find_brun_dblindirect(struct super_block *sb, } dbl_indir_block = - befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) + + sb_bread(sb, iaddr2blockno(sb, &data->double_indirect) + dbl_which_block); if (dbl_indir_block == NULL) { befs_error(sb, "%s couldn't read the " @@ -499,7 +499,7 @@ befs_find_brun_dblindirect(struct super_block *sb, } indir_block = - befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block); + sb_bread(sb, iaddr2blockno(sb, &indir_run) + which_block); if (indir_block == NULL) { befs_error(sb, "%s couldn't read the indirect block " "at blockno %lu", __func__, (unsigned long) diff --git a/fs/befs/io.c b/fs/befs/io.c index 523c8af2d770..4223b779fcc4 100644 --- a/fs/befs/io.c +++ b/fs/befs/io.c @@ -59,27 +59,3 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) befs_debug(sb, "<--- %s ERROR", __func__); return NULL; } - -struct buffer_head * -befs_bread(struct super_block *sb, befs_blocknr_t block) -{ - struct buffer_head *bh; - - befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block); - - bh = sb_bread(sb, block); - - if (bh == NULL) { - befs_error(sb, "Failed to read block %lu", - (unsigned long)block); - goto error; - } - - befs_debug(sb, "<--- %s", __func__); - - return bh; - - error: - befs_debug(sb, "<--- %s ERROR", __func__); - return NULL; -} diff --git a/fs/befs/io.h b/fs/befs/io.h index 
9b78266b6aa5..78d7bc6e60de 100644 --- a/fs/befs/io.h +++ b/fs/befs/io.h @@ -5,5 +5,3 @@ struct buffer_head *befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr); -struct buffer_head *befs_bread(struct super_block *sb, befs_blocknr_t block); - diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 7da05b159ade..619b998e9a1e 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -318,7 +318,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) befs_ino->i_inode_num.allocation_group, befs_ino->i_inode_num.start, befs_ino->i_inode_num.len); - bh = befs_bread(sb, inode->i_ino); + bh = sb_bread(sb, inode->i_ino); if (!bh) { befs_error(sb, "unable to read inode block - " "inode = %lu", inode->i_ino); @@ -436,10 +436,9 @@ befs_init_inodecache(void) 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); - if (befs_inode_cachep == NULL) { - pr_err("%s: Couldn't initialize inode slabcache\n", __func__); + if (befs_inode_cachep == NULL) return -ENOMEM; - } + return 0; } @@ -524,8 +523,6 @@ befs_utf2nls(struct super_block *sb, const char *in, *out = result = kmalloc(maxlen, GFP_NOFS); if (!*out) { - befs_error(sb, "%s cannot allocate memory", __func__); - *out_len = 0; return -ENOMEM; } @@ -604,7 +601,6 @@ befs_nls2utf(struct super_block *sb, const char *in, *out = result = kmalloc(maxlen, GFP_NOFS); if (!*out) { - befs_error(sb, "%s cannot allocate memory", __func__); *out_len = 0; return -ENOMEM; } @@ -760,19 +756,19 @@ befs_fill_super(struct super_block *sb, void *data, int silent) long ret = -EINVAL; const unsigned long sb_block = 0; const off_t x86_sb_off = 512; + int blocksize; save_mount_options(sb, data); sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL); - if (sb->s_fs_info == NULL) { - pr_err("(%s): Unable to allocate memory for private " - "portion of superblock. Bailing.\n", sb->s_id); + if (sb->s_fs_info == NULL) goto unacquire_none; - } + befs_sb = BEFS_SB(sb); if (!parse_options((char *) data, &befs_sb->mount_opts)) { - befs_error(sb, "cannot parse mount options"); + if (!silent) + befs_error(sb, "cannot parse mount options"); goto unacquire_priv_sbp; } @@ -793,10 +789,15 @@ befs_fill_super(struct super_block *sb, void *data, int silent) * least 1k to get the second 512 bytes of the volume. 
* -WD 10-26-01 */ - sb_min_blocksize(sb, 1024); + blocksize = sb_min_blocksize(sb, 1024); + if (!blocksize) { + befs_error(sb, "unable to set blocksize"); + goto unacquire_priv_sbp; + } if (!(bh = sb_bread(sb, sb_block))) { - befs_error(sb, "unable to read superblock"); + if (!silent) + befs_error(sb, "unable to read superblock"); goto unacquire_priv_sbp; } @@ -820,9 +821,9 @@ befs_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); if( befs_sb->num_blocks > ~((sector_t)0) ) { - befs_error(sb, "blocks count: %llu " - "is larger than the host can use", - befs_sb->num_blocks); + if (!silent) + befs_error(sb, "blocks count: %llu is larger than the host can use", + befs_sb->num_blocks); goto unacquire_priv_sbp; } @@ -841,7 +842,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent) } sb->s_root = d_make_root(root); if (!sb->s_root) { - befs_error(sb, "get root inode failed"); + if (!silent) + befs_error(sb, "get root inode failed"); goto unacquire_priv_sbp; } @@ -870,9 +872,9 @@ befs_fill_super(struct super_block *sb, void *data, int silent) unacquire_priv_sbp: kfree(befs_sb->mount_opts.iocharset); kfree(sb->s_fs_info); + sb->s_fs_info = NULL; unacquire_none: - sb->s_fs_info = NULL; return ret; } diff --git a/fs/befs/super.c b/fs/befs/super.c index aeafc4d84278..9d1b56cd6f19 100644 --- a/fs/befs/super.c +++ b/fs/befs/super.c @@ -14,7 +14,7 @@ #include "super.h" /** - * load_befs_sb -- Read from disk and properly byteswap all the fields + * befs_load_sb -- Read from disk and properly byteswap all the fields * of the befs superblock * * diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 950334dced04..e3b80b6cd032 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4179,7 +4179,8 @@ int extent_readpages(struct extent_io_tree *tree, prefetchw(&page->flags); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, - page->index, GFP_NOFS)) { + page->index, + readahead_gfp_mask(mapping))) { put_page(page); continue; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index d4890b6dc22d..579e41b350a2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3366,7 +3366,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, struct page *page, *tpage; unsigned int expected_index; int rc; - gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); + gfp_t gfp = readahead_gfp_mask(mapping); INIT_LIST_HEAD(tmplist); diff --git a/fs/compat.c b/fs/compat.c index be6e48b0a46c..e07968974391 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -54,20 +54,6 @@ #include <asm/ioctls.h> #include "internal.h" -int compat_log = 1; - -int compat_printk(const char *fmt, ...) -{ - va_list ap; - int ret; - if (!compat_log) - return 0; - va_start(ap, fmt); - ret = vprintk(fmt, ap); - va_end(ap); - return ret; -} - /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. 
diff --git a/fs/exec.c b/fs/exec.c index ca239fc86d8d..a1789cd684bf 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -866,7 +866,8 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, goto out; } - *buf = vmalloc(i_size); + if (id != READING_FIRMWARE_PREALLOC_BUFFER) + *buf = vmalloc(i_size); if (!*buf) { ret = -ENOMEM; goto out; @@ -897,8 +898,10 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, out_free: if (ret < 0) { - vfree(*buf); - *buf = NULL; + if (id != READING_FIRMWARE_PREALLOC_BUFFER) { + vfree(*buf); + *buf = NULL; + } } out: diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 2ced5a823354..ec9073c81fdf 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -166,7 +166,7 @@ int ext4_mpage_readpages(struct address_space *mapping, page = list_entry(pages->prev, struct page, lru); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, - mapping_gfp_constraint(mapping, GFP_KERNEL))) + readahead_gfp_mask(mapping))) goto next_page; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 74caf14ae410..aced24b986f6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -996,7 +996,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping, page = list_entry(pages->prev, struct page, lru); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, - page->index, GFP_KERNEL)) + page->index, + readahead_gfp_mask(mapping))) goto next_page; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 989a2cef6b76..e21d20bc8a54 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -981,6 +981,42 @@ void inode_io_list_del(struct inode *inode) } /* + * mark an inode as under writeback on the sb + */ +void sb_mark_inode_writeback(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + unsigned long flags; + + if (list_empty(&inode->i_wb_list)) { + spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); + if (list_empty(&inode->i_wb_list)) { + list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb); + trace_sb_mark_inode_writeback(inode); + } + spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); + } +} + +/* + * clear an inode as under writeback on the sb + */ +void sb_clear_inode_writeback(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + unsigned long flags; + + if (!list_empty(&inode->i_wb_list)) { + spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); + if (!list_empty(&inode->i_wb_list)) { + list_del_init(&inode->i_wb_list); + trace_sb_clear_inode_writeback(inode); + } + spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); + } +} + +/* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. * @@ -2154,7 +2190,7 @@ EXPORT_SYMBOL(__mark_inode_dirty); */ static void wait_sb_inodes(struct super_block *sb) { - struct inode *inode, *old_inode = NULL; + LIST_HEAD(sync_list); /* * We need to be protected against the filesystem going from @@ -2163,38 +2199,60 @@ static void wait_sb_inodes(struct super_block *sb) WARN_ON(!rwsem_is_locked(&sb->s_umount)); mutex_lock(&sb->s_sync_lock); - spin_lock(&sb->s_inode_list_lock); /* - * Data integrity sync. Must wait for all pages under writeback, - * because there may have been pages dirtied before our sync - * call, but which had writeout started before we write it out. - * In which case, the inode may not be on the dirty list, but - * we still have to wait for that writeout. 
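/*
 * Editor's sketch: sb_mark_inode_writeback() and sb_clear_inode_writeback()
 * above both test list_empty() once without s_inode_wblist_lock and again
 * with it held. The unlocked test keeps inodes that are not changing state
 * off the lock entirely; the locked recheck closes the race with a
 * concurrent add or remove. The same pattern in isolation (generic names,
 * for illustration only):
 */
#include <linux/list.h>
#include <linux/spinlock.h>

static void list_add_once(struct list_head *entry, struct list_head *head,
			  spinlock_t *lock)
{
	unsigned long flags;

	if (list_empty(entry)) {		/* unlocked fast path */
		spin_lock_irqsave(lock, flags);
		if (list_empty(entry))		/* recheck under the lock */
			list_add_tail(entry, head);
		spin_unlock_irqrestore(lock, flags);
	}
}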
+ * Splice the writeback list onto a temporary list to avoid waiting on + * inodes that have started writeback after this point. + * + * Use rcu_read_lock() to keep the inodes around until we have a + * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as + * the local list because inodes can be dropped from either by writeback + * completion. + */ + rcu_read_lock(); + spin_lock_irq(&sb->s_inode_wblist_lock); + list_splice_init(&sb->s_inodes_wb, &sync_list); + + /* + * Data integrity sync. Must wait for all pages under writeback, because + * there may have been pages dirtied before our sync call, but which had + * writeout started before we write it out. In which case, the inode + * may not be on the dirty list, but we still have to wait for that + * writeout. */ - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + while (!list_empty(&sync_list)) { + struct inode *inode = list_first_entry(&sync_list, struct inode, + i_wb_list); struct address_space *mapping = inode->i_mapping; + /* + * Move each inode back to the wb list before we drop the lock + * to preserve consistency between i_wb_list and the mapping + * writeback tag. Writeback completion is responsible to remove + * the inode from either list once the writeback tag is cleared. + */ + list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb); + + /* + * The mapping can appear untagged while still on-list since we + * do not have the mapping lock. Skip it here, wb completion + * will remove it. + */ + if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) + continue; + + spin_unlock_irq(&sb->s_inode_wblist_lock); + spin_lock(&inode->i_lock); - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || - (mapping->nrpages == 0)) { + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { spin_unlock(&inode->i_lock); + + spin_lock_irq(&sb->s_inode_wblist_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); - - /* - * We hold a reference to 'inode' so it couldn't have been - * removed from s_inodes list while we dropped the - * s_inode_list_lock. We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it under - * s_inode_list_lock. So we keep the reference and iput it - * later. 
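/*
 * Editor's sketch (assumption, the mm/ side is not part of this excerpt):
 * the comments above rely on page writeback keeping inode->i_wb_list in
 * sync with the PAGECACHE_TAG_WRITEBACK tag, roughly as below:
 * sb_mark_inode_writeback() when the first page of an inode goes under
 * writeback, sb_clear_inode_writeback() once the tag is fully cleared.
 * The function names here are invented for illustration.
 */
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>

static void sketch_inode_writeback_started(struct address_space *mapping,
					    bool was_tagged)
{
	/* first page of this inode just got the writeback tag */
	if (!was_tagged && mapping->host)
		sb_mark_inode_writeback(mapping->host);
}

static void sketch_inode_writeback_done(struct address_space *mapping)
{
	/* last page finished; wait_sb_inodes() must no longer see the inode */
	if (mapping->host && !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
		sb_clear_inode_writeback(mapping->host);
}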
- */ - iput(old_inode); - old_inode = inode; + rcu_read_unlock(); /* * We keep the error status of individual mapping so that @@ -2205,10 +2263,13 @@ static void wait_sb_inodes(struct super_block *sb) cond_resched(); - spin_lock(&sb->s_inode_list_lock); + iput(inode); + + rcu_read_lock(); + spin_lock_irq(&sb->s_inode_wblist_lock); } - spin_unlock(&sb->s_inode_list_lock); - iput(old_inode); + spin_unlock_irq(&sb->s_inode_wblist_lock); + rcu_read_unlock(); mutex_unlock(&sb->s_sync_lock); } diff --git a/fs/inode.c b/fs/inode.c index 4ccbc21b30ce..e171f7b5f9e4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -365,6 +365,7 @@ void inode_init_once(struct inode *inode) INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_io_list); + INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); address_space_init_once(&inode->i_data); i_size_ordered_init(inode); @@ -507,6 +508,7 @@ void clear_inode(struct inode *inode) BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); + BUG_ON(!list_empty(&inode->i_wb_list)); /* don't need i_lock here, no concurrent mods to i_state */ inode->i_state = I_FREEING | I_CLEAR; } diff --git a/fs/mpage.c b/fs/mpage.c index 37b28280ad04..2ca1f39c8cba 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -72,6 +72,8 @@ mpage_alloc(struct block_device *bdev, { struct bio *bio; + /* Restrict the given (page cache) mask for slab allocations */ + gfp_flags &= GFP_KERNEL; bio = bio_alloc(gfp_flags, nr_vecs); if (bio == NULL && (current->flags & PF_MEMALLOC)) { @@ -363,7 +365,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, sector_t last_block_in_bio = 0; struct buffer_head map_bh; unsigned long first_logical_block = 0; - gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); + gfp_t gfp = readahead_gfp_mask(mapping); map_bh.b_state = 0; map_bh.b_size = 0; diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 1a85d94f5b25..2c90e285d7c6 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -622,10 +622,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, lock = nilfs_mdt_bgl_lock(inode, group); if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) - nilfs_warning(inode->i_sb, __func__, - "entry number %llu already freed: ino=%lu", - (unsigned long long)req->pr_entry_nr, - (unsigned long)inode->i_ino); + nilfs_msg(inode->i_sb, KERN_WARNING, + "%s (ino=%lu): entry number %llu already freed", + __func__, inode->i_ino, + (unsigned long long)req->pr_entry_nr); else nilfs_palloc_group_desc_add_entries(desc, lock, 1); @@ -663,10 +663,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, lock = nilfs_mdt_bgl_lock(inode, group); if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) - nilfs_warning(inode->i_sb, __func__, - "entry number %llu already freed: ino=%lu", - (unsigned long long)req->pr_entry_nr, - (unsigned long)inode->i_ino); + nilfs_msg(inode->i_sb, KERN_WARNING, + "%s (ino=%lu): entry number %llu already freed", + __func__, inode->i_ino, + (unsigned long long)req->pr_entry_nr); else nilfs_palloc_group_desc_add_entries(desc, lock, 1); @@ -772,10 +772,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) do { if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) { - nilfs_warning(inode->i_sb, __func__, - "entry number %llu already freed: ino=%lu", - (unsigned long long)entry_nrs[j], - (unsigned long)inode->i_ino); + nilfs_msg(inode->i_sb, KERN_WARNING, + "%s (ino=%lu): entry number %llu 
already freed", + __func__, inode->i_ino, + (unsigned long long)entry_nrs[j]); } else { n++; } @@ -816,12 +816,11 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) for (k = 0; k < nempties; k++) { ret = nilfs_palloc_delete_entry_block(inode, last_nrs[k]); - if (ret && ret != -ENOENT) { - nilfs_warning(inode->i_sb, __func__, - "failed to delete block of entry %llu: ino=%lu, err=%d", - (unsigned long long)last_nrs[k], - (unsigned long)inode->i_ino, ret); - } + if (ret && ret != -ENOENT) + nilfs_msg(inode->i_sb, KERN_WARNING, + "error %d deleting block that object (entry=%llu, ino=%lu) belongs to", + ret, (unsigned long long)last_nrs[k], + inode->i_ino); } desc_kaddr = kmap_atomic(desc_bh->b_page); @@ -835,12 +834,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) if (nfree == nilfs_palloc_entries_per_group(inode)) { ret = nilfs_palloc_delete_bitmap_block(inode, group); - if (ret && ret != -ENOENT) { - nilfs_warning(inode->i_sb, __func__, - "failed to delete bitmap block of group %lu: ino=%lu, err=%d", - group, - (unsigned long)inode->i_ino, ret); - } + if (ret && ret != -ENOENT) + nilfs_msg(inode->i_sb, KERN_WARNING, + "error %d deleting bitmap block of group=%lu, ino=%lu", + ret, group, inode->i_ino); } } return 0; diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index f2a7877e0c8c..01fb1831ca25 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -41,8 +41,8 @@ static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap, struct inode *inode = bmap->b_inode; if (err == -EINVAL) { - nilfs_error(inode->i_sb, fname, - "broken bmap (inode number=%lu)", inode->i_ino); + __nilfs_error(inode->i_sb, fname, + "broken bmap (inode number=%lu)", inode->i_ino); err = -EIO; } return err; diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index b6a4c8f93ac8..2b6ffbe5997a 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -22,7 +22,7 @@ #include <linux/types.h> #include <linux/fs.h> #include <linux/buffer_head.h> -#include <linux/nilfs2_fs.h> +#include <linux/nilfs2_ondisk.h> /* nilfs_binfo, nilfs_inode, etc */ #include "alloc.h" #include "dat.h" diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 4cca998ec7a0..d5c23da43513 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -41,7 +41,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) struct inode *inode = NILFS_BTNC_I(btnc); struct buffer_head *bh; - bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); + bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node)); if (unlikely(!bh)) return NULL; @@ -70,7 +70,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, struct page *page; int err; - bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); + bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node)); if (unlikely(!bh)) return -ENOMEM; diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 982d1e3df3a5..2e315f9f2e51 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -339,12 +339,14 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node, * nilfs_btree_node_broken - verify consistency of btree node * @node: btree node block to be examined * @size: node size (in bytes) + * @inode: host inode of btree * @blocknr: block number * * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. 
*/ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, - size_t size, sector_t blocknr) + size_t size, struct inode *inode, + sector_t blocknr) { int level, flags, nchildren; int ret = 0; @@ -358,9 +360,10 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, (flags & NILFS_BTREE_NODE_ROOT) || nchildren < 0 || nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) { - printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): " - "level = %d, flags = 0x%x, nchildren = %d\n", - (unsigned long long)blocknr, level, flags, nchildren); + nilfs_msg(inode->i_sb, KERN_CRIT, + "bad btree node (ino=%lu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d", + inode->i_ino, (unsigned long long)blocknr, level, + flags, nchildren); ret = 1; } return ret; @@ -369,12 +372,12 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, /** * nilfs_btree_root_broken - verify consistency of btree root node * @node: btree root node to be examined - * @ino: inode number + * @inode: host inode of btree * * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. */ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, - unsigned long ino) + struct inode *inode) { int level, flags, nchildren; int ret = 0; @@ -387,8 +390,9 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, level >= NILFS_BTREE_LEVEL_MAX || nchildren < 0 || nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) { - pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n", - ino, level, flags, nchildren); + nilfs_msg(inode->i_sb, KERN_CRIT, + "bad btree root (ino=%lu): level = %d, flags = 0x%x, nchildren = %d", + inode->i_ino, level, flags, nchildren); ret = 1; } return ret; @@ -396,13 +400,15 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, int nilfs_btree_broken_node_block(struct buffer_head *bh) { + struct inode *inode; int ret; if (buffer_nilfs_checked(bh)) return 0; + inode = bh->b_page->mapping->host; ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data, - bh->b_size, bh->b_blocknr); + bh->b_size, inode, bh->b_blocknr); if (likely(!ret)) set_buffer_nilfs_checked(bh); return ret; @@ -448,13 +454,15 @@ nilfs_btree_get_node(const struct nilfs_bmap *btree, return node; } -static int -nilfs_btree_bad_node(struct nilfs_btree_node *node, int level) +static int nilfs_btree_bad_node(const struct nilfs_bmap *btree, + struct nilfs_btree_node *node, int level) { if (unlikely(nilfs_btree_node_get_level(node) != level)) { dump_stack(); - printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n", - nilfs_btree_node_get_level(node), level); + nilfs_msg(btree->b_inode->i_sb, KERN_CRIT, + "btree level mismatch (ino=%lu): %d != %d", + btree->b_inode->i_ino, + nilfs_btree_node_get_level(node), level); return 1; } return 0; @@ -509,6 +517,9 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, out_no_wait: if (!buffer_uptodate(bh)) { + nilfs_msg(btree->b_inode->i_sb, KERN_ERR, + "I/O error reading b-tree node block (ino=%lu, blocknr=%llu)", + btree->b_inode->i_ino, (unsigned long long)ptr); brelse(bh); return -EIO; } @@ -568,7 +579,7 @@ static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree, return ret; node = nilfs_btree_get_nonroot_node(path, level); - if (nilfs_btree_bad_node(node, level)) + if (nilfs_btree_bad_node(btree, node, level)) return -EINVAL; if (!found) found = nilfs_btree_node_lookup(node, key, &index); @@ -616,7 +627,7 @@ static int 
nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree, if (ret < 0) return ret; node = nilfs_btree_get_nonroot_node(path, level); - if (nilfs_btree_bad_node(node, level)) + if (nilfs_btree_bad_node(btree, node, level)) return -EINVAL; index = nilfs_btree_node_get_nchildren(node) - 1; ptr = nilfs_btree_node_get_ptr(node, index, ncmax); @@ -2072,8 +2083,10 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree, ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); if (ret < 0) { if (unlikely(ret == -ENOENT)) - printk(KERN_CRIT "%s: key = %llu, level == %d\n", - __func__, (unsigned long long)key, level); + nilfs_msg(btree->b_inode->i_sb, KERN_CRIT, + "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d", + btree->b_inode->i_ino, + (unsigned long long)key, level); goto out; } @@ -2110,12 +2123,11 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree, if (level < NILFS_BTREE_LEVEL_NODE_MIN || level >= NILFS_BTREE_LEVEL_MAX) { dump_stack(); - printk(KERN_WARNING - "%s: invalid btree level: %d (key=%llu, ino=%lu, " - "blocknr=%llu)\n", - __func__, level, (unsigned long long)key, - NILFS_BMAP_I(btree)->vfs_inode.i_ino, - (unsigned long long)bh->b_blocknr); + nilfs_msg(btree->b_inode->i_sb, KERN_WARNING, + "invalid btree level: %d (key=%llu, ino=%lu, blocknr=%llu)", + level, (unsigned long long)key, + btree->b_inode->i_ino, + (unsigned long long)bh->b_blocknr); return; } @@ -2394,8 +2406,7 @@ int nilfs_btree_init(struct nilfs_bmap *bmap) __nilfs_btree_init(bmap); - if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), - bmap->b_inode->i_ino)) + if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), bmap->b_inode)) ret = -EIO; return ret; } diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index df1a25faa83b..2184e47fa4bf 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -22,7 +22,7 @@ #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/list.h> -#include <linux/nilfs2_fs.h> +#include <linux/nilfs2_ondisk.h> /* nilfs_btree_node */ #include "btnode.h" #include "bmap.h" diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 8a3d3b65af3f..a15a1601e931 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -21,7 +21,6 @@ #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/errno.h> -#include <linux/nilfs2_fs.h> #include "mdt.h" #include "cpfile.h" @@ -332,9 +331,9 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, int ret, ncps, nicps, nss, count, i; if (unlikely(start == 0 || start > end)) { - printk(KERN_ERR "%s: invalid range of checkpoint numbers: " - "[%llu, %llu)\n", __func__, - (unsigned long long)start, (unsigned long long)end); + nilfs_msg(cpfile->i_sb, KERN_ERR, + "cannot delete checkpoints: invalid range [%llu, %llu)", + (unsigned long long)start, (unsigned long long)end); return -EINVAL; } @@ -386,9 +385,9 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, cpfile, cno); if (ret == 0) continue; - printk(KERN_ERR - "%s: cannot delete block\n", - __func__); + nilfs_msg(cpfile->i_sb, KERN_ERR, + "error %d deleting checkpoint block", + ret); break; } } @@ -991,14 +990,12 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, int err; if (cpsize > sb->s_blocksize) { - printk(KERN_ERR - "NILFS: too large checkpoint size: %zu bytes.\n", - cpsize); + nilfs_msg(sb, KERN_ERR, + "too large checkpoint size: %zu bytes", cpsize); return -EINVAL; } else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) { - printk(KERN_ERR - "NILFS: too small checkpoint size: %zu 
bytes.\n", - cpsize); + nilfs_msg(sb, KERN_ERR, + "too small checkpoint size: %zu bytes", cpsize); return -EINVAL; } diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index 0249744ae234..6eca972f9673 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -21,7 +21,8 @@ #include <linux/fs.h> #include <linux/buffer_head.h> -#include <linux/nilfs2_fs.h> +#include <linux/nilfs2_api.h> /* nilfs_cpstat */ +#include <linux/nilfs2_ondisk.h> /* nilfs_inode, nilfs_checkpoint */ int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 7367610ea807..dffedb2f8817 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -349,10 +349,11 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) kaddr = kmap_atomic(entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { - printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__, - (unsigned long long)vblocknr, - (unsigned long long)le64_to_cpu(entry->de_start), - (unsigned long long)le64_to_cpu(entry->de_end)); + nilfs_msg(dat->i_sb, KERN_CRIT, + "%s: invalid vblocknr = %llu, [%llu, %llu)", + __func__, (unsigned long long)vblocknr, + (unsigned long long)le64_to_cpu(entry->de_start), + (unsigned long long)le64_to_cpu(entry->de_end)); kunmap_atomic(kaddr); brelse(entry_bh); return -EINVAL; @@ -479,14 +480,12 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size, int err; if (entry_size > sb->s_blocksize) { - printk(KERN_ERR - "NILFS: too large DAT entry size: %zu bytes.\n", - entry_size); + nilfs_msg(sb, KERN_ERR, "too large DAT entry size: %zu bytes", + entry_size); return -EINVAL; } else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) { - printk(KERN_ERR - "NILFS: too small DAT entry size: %zu bytes.\n", - entry_size); + nilfs_msg(sb, KERN_ERR, "too small DAT entry size: %zu bytes", + entry_size); return -EINVAL; } diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index abbfdabcabea..57dc6cf466d0 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -22,6 +22,7 @@ #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/fs.h> +#include <linux/nilfs2_ondisk.h> /* nilfs_inode, nilfs_checkpoint */ struct nilfs_palloc_req; diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index e506f4f7120a..908ebbf0ac7e 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -42,6 +42,28 @@ #include "nilfs.h" #include "page.h" +static inline unsigned int nilfs_rec_len_from_disk(__le16 dlen) +{ + unsigned int len = le16_to_cpu(dlen); + +#if (PAGE_SIZE >= 65536) + if (len == NILFS_MAX_REC_LEN) + return 1 << 16; +#endif + return len; +} + +static inline __le16 nilfs_rec_len_to_disk(unsigned int len) +{ +#if (PAGE_SIZE >= 65536) + if (len == (1 << 16)) + return cpu_to_le16(NILFS_MAX_REC_LEN); + + BUG_ON(len > (1 << 16)); +#endif + return cpu_to_le16(len); +} + /* * nilfs uses block-sized chunks. 
Arguably, sector-sized ones would be * more robust, but we have what we have @@ -140,10 +162,9 @@ out: /* Too bad, we had an error */ Ebadsize: - nilfs_error(sb, "nilfs_check_page", + nilfs_error(sb, "size of directory #%lu is not a multiple of chunk size", - dir->i_ino - ); + dir->i_ino); goto fail; Eshort: error = "rec_len is smaller than minimal"; @@ -157,19 +178,18 @@ Enamelen: Espan: error = "directory entry across blocks"; bad_entry: - nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - " - "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs, - (unsigned long) le64_to_cpu(p->inode), + nilfs_error(sb, + "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index << PAGE_SHIFT) + offs, + (unsigned long)le64_to_cpu(p->inode), rec_len, p->name_len); goto fail; Eend: p = (struct nilfs_dir_entry *)(kaddr + offs); - nilfs_error(sb, "nilfs_check_page", - "entry in directory #%lu spans the page boundary" - "offset=%lu, inode=%lu", - dir->i_ino, (page->index<<PAGE_SHIFT)+offs, - (unsigned long) le64_to_cpu(p->inode)); + nilfs_error(sb, + "entry in directory #%lu spans the page boundary offset=%lu, inode=%lu", + dir->i_ino, (page->index << PAGE_SHIFT) + offs, + (unsigned long)le64_to_cpu(p->inode)); fail: SetPageError(page); return false; @@ -267,8 +287,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) struct page *page = nilfs_get_page(inode, n); if (IS_ERR(page)) { - nilfs_error(sb, __func__, "bad page in #%lu", - inode->i_ino); + nilfs_error(sb, "bad page in #%lu", inode->i_ino); ctx->pos += PAGE_SIZE - offset; return -EIO; } @@ -278,8 +297,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) NILFS_DIR_REC_LEN(1); for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) { if (de->rec_len == 0) { - nilfs_error(sb, __func__, - "zero-length directory entry"); + nilfs_error(sb, "zero-length directory entry"); nilfs_put_page(page); return -EIO; } @@ -345,7 +363,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, kaddr += nilfs_last_byte(dir, n) - reclen; while ((char *) de <= kaddr) { if (de->rec_len == 0) { - nilfs_error(dir->i_sb, __func__, + nilfs_error(dir->i_sb, "zero-length directory entry"); nilfs_put_page(page); goto out; @@ -360,7 +378,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, n = 0; /* next page is past the blocks we've got */ if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) { - nilfs_error(dir->i_sb, __func__, + nilfs_error(dir->i_sb, "dir %lu size %lld exceeds block count %llu", dir->i_ino, dir->i_size, (unsigned long long)dir->i_blocks); @@ -469,7 +487,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) goto got_it; } if (de->rec_len == 0) { - nilfs_error(dir->i_sb, __func__, + nilfs_error(dir->i_sb, "zero-length directory entry"); err = -EIO; goto out_unlock; @@ -541,7 +559,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) while ((char *)de < (char *)dir) { if (de->rec_len == 0) { - nilfs_error(inode->i_sb, __func__, + nilfs_error(inode->i_sb, "zero-length directory entry"); err = -EIO; goto out; @@ -628,7 +646,7 @@ int nilfs_empty_dir(struct inode *inode) while ((char *)de <= kaddr) { if (de->rec_len == 0) { - nilfs_error(inode->i_sb, __func__, + nilfs_error(inode->i_sb, "zero-length directory entry (kaddr=%p, de=%p)", kaddr, de); goto not_empty; diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 
251a44928405..96e3ed0d9652 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -337,14 +337,16 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap, key = nilfs_bmap_data_get_key(bmap, *bh); if (unlikely(key > NILFS_DIRECT_KEY_MAX)) { - printk(KERN_CRIT "%s: invalid key: %llu\n", __func__, - (unsigned long long)key); + nilfs_msg(bmap->b_inode->i_sb, KERN_CRIT, + "%s (ino=%lu): invalid key: %llu", __func__, + bmap->b_inode->i_ino, (unsigned long long)key); return -EINVAL; } ptr = nilfs_direct_get_ptr(bmap, key); if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) { - printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__, - (unsigned long long)ptr); + nilfs_msg(bmap->b_inode->i_sb, KERN_CRIT, + "%s (ino=%lu): invalid pointer: %llu", __func__, + bmap->b_inode->i_ino, (unsigned long long)ptr); return -EINVAL; } diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h index 3015a6e78724..cfe85e848bba 100644 --- a/fs/nilfs2/direct.h +++ b/fs/nilfs2/direct.h @@ -24,16 +24,6 @@ #include "bmap.h" -/** - * struct nilfs_direct_node - direct node - * @dn_flags: flags - * @dn_pad: padding - */ -struct nilfs_direct_node { - __u8 dn_flags; - __u8 pad[7]; -}; - #define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1) #define NILFS_DIRECT_KEY_MIN 0 #define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index e9148f94d696..853a831dcde0 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -148,8 +148,15 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) { wait_on_buffer(bh); - if (!buffer_uptodate(bh)) + if (!buffer_uptodate(bh)) { + struct inode *inode = bh->b_page->mapping->host; + + nilfs_msg(inode->i_sb, KERN_ERR, + "I/O error reading %s block for GC (ino=%lu, vblocknr=%llu)", + buffer_nilfs_node(bh) ? "node" : "data", + inode->i_ino, (unsigned long long)bh->b_blocknr); return -EIO; + } if (buffer_dirty(bh)) return -EEXIST; diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index 1d2b1805327a..b8fa45c20c63 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -145,15 +145,14 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, int err; if (unlikely(!NILFS_VALID_INODE(sb, ino))) { - nilfs_error(sb, __func__, "bad inode number: %lu", - (unsigned long) ino); + nilfs_error(sb, "bad inode number: %lu", (unsigned long)ino); return -EINVAL; } err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); if (unlikely(err)) - nilfs_warning(sb, __func__, "unable to read inode: %lu", - (unsigned long) ino); + nilfs_msg(sb, KERN_WARNING, "error %d reading inode: ino=%lu", + err, (unsigned long)ino); return err; } diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 23ad2f091e76..188b94fe0ec5 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -23,7 +23,6 @@ #include <linux/fs.h> #include <linux/buffer_head.h> -#include <linux/nilfs2_fs.h> #include "mdt.h" #include "alloc.h" diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index a0ebdb17e912..af04f553d7c9 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -112,13 +112,10 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, * However, the page having this block must * be locked in this case. */ - printk(KERN_WARNING - "nilfs_get_block: a race condition " - "while inserting a data block. 
" - "(inode number=%lu, file block " - "offset=%llu)\n", - inode->i_ino, - (unsigned long long)blkoff); + nilfs_msg(inode->i_sb, KERN_WARNING, + "%s (ino=%lu): a race condition while inserting a data block at offset=%llu", + __func__, inode->i_ino, + (unsigned long long)blkoff); err = 0; } nilfs_transaction_abort(inode->i_sb); @@ -359,7 +356,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) root = NILFS_I(dir)->i_root; ii = NILFS_I(inode); - ii->i_state = 1 << NILFS_I_NEW; + ii->i_state = BIT(NILFS_I_NEW); ii->i_root = root; err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh); @@ -558,7 +555,7 @@ static int nilfs_iget_set(struct inode *inode, void *opaque) inode->i_ino = args->ino; if (args->for_gc) { - NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE; + NILFS_I(inode)->i_state = BIT(NILFS_I_GCINODE); NILFS_I(inode)->i_cno = args->cno; NILFS_I(inode)->i_root = NULL; } else { @@ -726,9 +723,9 @@ repeat: goto repeat; failed: - nilfs_warning(ii->vfs_inode.i_sb, __func__, - "failed to truncate bmap (ino=%lu, err=%d)", - ii->vfs_inode.i_ino, ret); + nilfs_msg(ii->vfs_inode.i_sb, KERN_WARNING, + "error %d truncating bmap (ino=%lu)", ret, + ii->vfs_inode.i_ino); } void nilfs_truncate(struct inode *inode) @@ -939,9 +936,9 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty) * This will happen when somebody is freeing * this inode. */ - nilfs_warning(inode->i_sb, __func__, - "cannot get inode (ino=%lu)", - inode->i_ino); + nilfs_msg(inode->i_sb, KERN_WARNING, + "cannot set file dirty (ino=%lu): the file is being freed", + inode->i_ino); spin_unlock(&nilfs->ns_inode_lock); return -EINVAL; /* * NILFS_I_DIRTY may remain for @@ -962,8 +959,9 @@ int __nilfs_mark_inode_dirty(struct inode *inode, int flags) err = nilfs_load_inode_block(inode, &ibh); if (unlikely(err)) { - nilfs_warning(inode->i_sb, __func__, - "failed to reget inode block."); + nilfs_msg(inode->i_sb, KERN_WARNING, + "cannot mark inode dirty (ino=%lu): error %d loading inode block", + inode->i_ino, err); return err; } nilfs_update_inode(inode, ibh, flags); @@ -989,8 +987,8 @@ void nilfs_dirty_inode(struct inode *inode, int flags) struct nilfs_mdt_info *mdi = NILFS_MDT(inode); if (is_bad_inode(inode)) { - nilfs_warning(inode->i_sb, __func__, - "tried to mark bad_inode dirty. ignored."); + nilfs_msg(inode->i_sb, KERN_WARNING, + "tried to mark bad_inode dirty. ignored."); dump_stack(); return; } diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 358b57e2cdf9..f1d7989459fd 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -25,7 +25,6 @@ #include <linux/compat.h> /* compat_ptr() */ #include <linux/mount.h> /* mnt_want_write_file(), mnt_drop_write_file() */ #include <linux/buffer_head.h> -#include <linux/nilfs2_fs.h> #include "nilfs.h" #include "segment.h" #include "bmap.h" @@ -584,27 +583,25 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode, if (unlikely(ret < 0)) { if (ret == -ENOENT) - printk(KERN_CRIT - "%s: invalid virtual block address (%s): " - "ino=%llu, cno=%llu, offset=%llu, " - "blocknr=%llu, vblocknr=%llu\n", - __func__, vdesc->vd_flags ? "node" : "data", - (unsigned long long)vdesc->vd_ino, - (unsigned long long)vdesc->vd_cno, - (unsigned long long)vdesc->vd_offset, - (unsigned long long)vdesc->vd_blocknr, - (unsigned long long)vdesc->vd_vblocknr); + nilfs_msg(inode->i_sb, KERN_CRIT, + "%s: invalid virtual block address (%s): ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu", + __func__, vdesc->vd_flags ? 
"node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); return ret; } if (unlikely(!list_empty(&bh->b_assoc_buffers))) { - printk(KERN_CRIT "%s: conflicting %s buffer: ino=%llu, " - "cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu\n", - __func__, vdesc->vd_flags ? "node" : "data", - (unsigned long long)vdesc->vd_ino, - (unsigned long long)vdesc->vd_cno, - (unsigned long long)vdesc->vd_offset, - (unsigned long long)vdesc->vd_blocknr, - (unsigned long long)vdesc->vd_vblocknr); + nilfs_msg(inode->i_sb, KERN_CRIT, + "%s: conflicting %s buffer: ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu", + __func__, vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); brelse(bh); return -EEXIST; } @@ -854,8 +851,8 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, return 0; failed: - printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n", - msg, ret); + nilfs_msg(nilfs->ns_sb, KERN_ERR, "error %d preparing GC: %s", ret, + msg); return ret; } @@ -963,10 +960,11 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, } ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]); - if (ret < 0) - printk(KERN_ERR "NILFS: GC failed during preparation: " - "cannot read source blocks: err=%d\n", ret); - else { + if (ret < 0) { + nilfs_msg(inode->i_sb, KERN_ERR, + "error %d preparing GC: cannot read source blocks", + ret); + } else { if (nilfs_sb_need_update(nilfs)) set_nilfs_discontinued(nilfs); ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 0d7b71fbeff8..d56d3a5bea88 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -207,8 +207,12 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, out_no_wait: err = -EIO; - if (!buffer_uptodate(first_bh)) + if (!buffer_uptodate(first_bh)) { + nilfs_msg(inode->i_sb, KERN_ERR, + "I/O error reading meta-data file (ino=%lu, block-offset=%lu)", + inode->i_ino, block); goto failed_bh; + } out: *out_bh = first_bh; return 0; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 1ec8ae5995a5..dbcf1dc93a51 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -283,9 +283,9 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) goto out; if (!inode->i_nlink) { - nilfs_warning(inode->i_sb, __func__, - "deleting nonexistent file (%lu), %d", - inode->i_ino, inode->i_nlink); + nilfs_msg(inode->i_sb, KERN_WARNING, + "deleting nonexistent file (ino=%lu), %d", + inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } err = nilfs_delete_entry(de, page); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index b1d48bc0532d..33f8c8fc96e8 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -23,7 +23,8 @@ #include <linux/buffer_head.h> #include <linux/spinlock.h> #include <linux/blkdev.h> -#include <linux/nilfs2_fs.h> +#include <linux/nilfs2_api.h> +#include <linux/nilfs2_ondisk.h> #include "the_nilfs.h" #include "bmap.h" @@ -119,20 +120,19 @@ enum { /* * Macros to check inode numbers */ -#define NILFS_MDT_INO_BITS \ - ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \ - 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \ - 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO)) 
+#define NILFS_MDT_INO_BITS \ + (BIT(NILFS_DAT_INO) | BIT(NILFS_CPFILE_INO) | \ + BIT(NILFS_SUFILE_INO) | BIT(NILFS_IFILE_INO) | \ + BIT(NILFS_ATIME_INO) | BIT(NILFS_SKETCH_INO)) -#define NILFS_SYS_INO_BITS \ - ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS) +#define NILFS_SYS_INO_BITS (BIT(NILFS_ROOT_INO) | NILFS_MDT_INO_BITS) #define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino) #define NILFS_MDT_INODE(sb, ino) \ - ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino)))) + ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & BIT(ino))) #define NILFS_VALID_INODE(sb, ino) \ - ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino)))) + ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & BIT(ino))) /** * struct nilfs_transaction_info: context information for synchronization @@ -299,10 +299,36 @@ static inline int nilfs_mark_inode_dirty_sync(struct inode *inode) /* super.c */ extern struct inode *nilfs_alloc_inode(struct super_block *); extern void nilfs_destroy_inode(struct inode *); + extern __printf(3, 4) -void nilfs_error(struct super_block *, const char *, const char *, ...); +void __nilfs_msg(struct super_block *sb, const char *level, + const char *fmt, ...); extern __printf(3, 4) -void nilfs_warning(struct super_block *, const char *, const char *, ...); +void __nilfs_error(struct super_block *sb, const char *function, + const char *fmt, ...); + +#ifdef CONFIG_PRINTK + +#define nilfs_msg(sb, level, fmt, ...) \ + __nilfs_msg(sb, level, fmt, ##__VA_ARGS__) +#define nilfs_error(sb, fmt, ...) \ + __nilfs_error(sb, __func__, fmt, ##__VA_ARGS__) + +#else + +#define nilfs_msg(sb, level, fmt, ...) \ + do { \ + no_printk(fmt, ##__VA_ARGS__); \ + (void)(sb); \ + } while (0) +#define nilfs_error(sb, fmt, ...) 
\ + do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __nilfs_error(sb, "", " "); \ + } while (0) + +#endif /* CONFIG_PRINTK */ + extern struct nilfs_super_block * nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); extern int nilfs_store_magic_and_option(struct super_block *, diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index d97ba5f11b77..f11a3ad2df0c 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -30,9 +30,9 @@ #include "mdt.h" -#define NILFS_BUFFER_INHERENT_BITS \ - ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \ - (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Checked)) +#define NILFS_BUFFER_INHERENT_BITS \ + (BIT(BH_Uptodate) | BIT(BH_Mapped) | BIT(BH_NILFS_Node) | \ + BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked)) static struct buffer_head * __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, @@ -85,9 +85,9 @@ void nilfs_forget_buffer(struct buffer_head *bh) { struct page *page = bh->b_page; const unsigned long clear_bits = - (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped | - 1 << BH_Async_Write | 1 << BH_NILFS_Volatile | - 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected); + (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) | + BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) | + BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected)); lock_buffer(bh); set_mask_bits(&bh->b_state, clear_bits, 0); @@ -124,17 +124,17 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) dbh->b_bdev = sbh->b_bdev; bh = dbh; - bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped)); + bits = sbh->b_state & (BIT(BH_Uptodate) | BIT(BH_Mapped)); while ((bh = bh->b_this_page) != dbh) { lock_buffer(bh); bits &= bh->b_state; unlock_buffer(bh); } - if (bits & (1UL << BH_Uptodate)) + if (bits & BIT(BH_Uptodate)) SetPageUptodate(dpage); else ClearPageUptodate(dpage); - if (bits & (1UL << BH_Mapped)) + if (bits & BIT(BH_Mapped)) SetPageMappedToDisk(dpage); else ClearPageMappedToDisk(dpage); @@ -215,7 +215,7 @@ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) create_empty_buffers(dst, sbh->b_size, 0); if (copy_dirty) - mask |= (1UL << BH_Dirty); + mask |= BIT(BH_Dirty); dbh = dbufs = page_buffers(dst); do { @@ -403,11 +403,10 @@ void nilfs_clear_dirty_page(struct page *page, bool silent) BUG_ON(!PageLocked(page)); - if (!silent) { - nilfs_warning(sb, __func__, - "discard page: offset %lld, ino %lu", - page_offset(page), inode->i_ino); - } + if (!silent) + nilfs_msg(sb, KERN_WARNING, + "discard dirty page: offset=%lld, ino=%lu", + page_offset(page), inode->i_ino); ClearPageUptodate(page); ClearPageMappedToDisk(page); @@ -415,18 +414,18 @@ void nilfs_clear_dirty_page(struct page *page, bool silent) if (page_has_buffers(page)) { struct buffer_head *bh, *head; const unsigned long clear_bits = - (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped | - 1 << BH_Async_Write | 1 << BH_NILFS_Volatile | - 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected); + (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) | + BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) | + BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected)); bh = head = page_buffers(page); do { lock_buffer(bh); - if (!silent) { - nilfs_warning(sb, __func__, - "discard block %llu, size %zu", - (u64)bh->b_blocknr, bh->b_size); - } + if (!silent) + nilfs_msg(sb, KERN_WARNING, + "discard dirty block: blocknr=%llu, size=%zu", + (u64)bh->b_blocknr, bh->b_size); + set_mask_bits(&bh->b_state, clear_bits, 0); unlock_buffer(bh); } while (bh = 
bh->b_this_page, bh != head); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index d893dc912b62..5139efed1888 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -54,38 +54,37 @@ struct nilfs_recovery_block { }; -static int nilfs_warn_segment_error(int err) +static int nilfs_warn_segment_error(struct super_block *sb, int err) { + const char *msg = NULL; + switch (err) { case NILFS_SEG_FAIL_IO: - printk(KERN_WARNING - "NILFS warning: I/O error on loading last segment\n"); + nilfs_msg(sb, KERN_ERR, "I/O error reading segment"); return -EIO; case NILFS_SEG_FAIL_MAGIC: - printk(KERN_WARNING - "NILFS warning: Segment magic number invalid\n"); + msg = "Magic number mismatch"; break; case NILFS_SEG_FAIL_SEQ: - printk(KERN_WARNING - "NILFS warning: Sequence number mismatch\n"); + msg = "Sequence number mismatch"; break; case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: - printk(KERN_WARNING - "NILFS warning: Checksum error in super root\n"); + msg = "Checksum error in super root"; break; case NILFS_SEG_FAIL_CHECKSUM_FULL: - printk(KERN_WARNING - "NILFS warning: Checksum error in segment payload\n"); + msg = "Checksum error in segment payload"; break; case NILFS_SEG_FAIL_CONSISTENCY: - printk(KERN_WARNING - "NILFS warning: Inconsistent segment\n"); + msg = "Inconsistency found"; break; case NILFS_SEG_NO_SUPER_ROOT: - printk(KERN_WARNING - "NILFS warning: No super root in the last segment\n"); + msg = "No super root in the last segment"; break; + default: + nilfs_msg(sb, KERN_ERR, "unrecognized segment error %d", err); + return -EINVAL; } + nilfs_msg(sb, KERN_WARNING, "invalid segment: %s", msg); return -EINVAL; } @@ -178,7 +177,7 @@ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, brelse(bh_sr); failed: - return nilfs_warn_segment_error(ret); + return nilfs_warn_segment_error(nilfs->ns_sb, ret); } /** @@ -553,11 +552,10 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, put_page(page); failed_inode: - printk(KERN_WARNING - "NILFS warning: error recovering data block " - "(err=%d, ino=%lu, block-offset=%llu)\n", - err, (unsigned long)rb->ino, - (unsigned long long)rb->blkoff); + nilfs_msg(sb, KERN_WARNING, + "error %d recovering data block (ino=%lu, block-offset=%llu)", + err, (unsigned long)rb->ino, + (unsigned long long)rb->blkoff); if (!err2) err2 = err; next: @@ -680,8 +678,8 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, } if (nsalvaged_blocks) { - printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n", - sb->s_id, nsalvaged_blocks); + nilfs_msg(sb, KERN_INFO, "salvaged %lu blocks", + nsalvaged_blocks); ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; } out: @@ -692,10 +690,9 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, confused: err = -EINVAL; failed: - printk(KERN_ERR - "NILFS (device %s): Error roll-forwarding " - "(err=%d, pseg block=%llu). 
", - sb->s_id, err, (unsigned long long)pseg_start); + nilfs_msg(sb, KERN_ERR, + "error %d roll-forwarding partial segment at blocknr = %llu", + err, (unsigned long long)pseg_start); goto out; } @@ -715,9 +712,8 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, set_buffer_dirty(bh); err = sync_dirty_buffer(bh); if (unlikely(err)) - printk(KERN_WARNING - "NILFS warning: buffer sync write failed during " - "post-cleaning of recovery.\n"); + nilfs_msg(nilfs->ns_sb, KERN_WARNING, + "buffer sync write failed during post-cleaning of recovery."); brelse(bh); } @@ -752,8 +748,8 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root); if (unlikely(err)) { - printk(KERN_ERR - "NILFS: error loading the latest checkpoint.\n"); + nilfs_msg(sb, KERN_ERR, + "error %d loading the latest checkpoint", err); return err; } @@ -764,8 +760,9 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri); if (unlikely(err)) { - printk(KERN_ERR "NILFS: Error preparing segments for " - "recovery.\n"); + nilfs_msg(sb, KERN_ERR, + "error %d preparing segment for recovery", + err); goto failed; } @@ -778,8 +775,9 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, nilfs_detach_log_writer(sb); if (unlikely(err)) { - printk(KERN_ERR "NILFS: Oops! recovery failed. " - "(err=%d)\n", err); + nilfs_msg(sb, KERN_ERR, + "error %d writing segment for recovery", + err); goto failed; } @@ -961,5 +959,5 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, failed: brelse(bh_sum); nilfs_dispose_segment_list(&segments); - return (ret < 0) ? ret : nilfs_warn_segment_error(ret); + return ret < 0 ? ret : nilfs_warn_segment_error(nilfs->ns_sb, ret); } diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index a962d7d83447..6f87b2ac1aeb 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -514,7 +514,11 @@ static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf) } while (--segbuf->sb_nbio > 0); if (unlikely(atomic_read(&segbuf->sb_err) > 0)) { - printk(KERN_ERR "NILFS: IO error writing segment\n"); + nilfs_msg(segbuf->sb_super, KERN_ERR, + "I/O error writing log (start-blocknr=%llu, block-count=%lu) in segment %llu", + (unsigned long long)segbuf->sb_pseg_start, + segbuf->sb_sum.nblocks, + (unsigned long long)segbuf->sb_segnum); err = -EIO; } return err; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index e78b68a81aec..bedcae2c28e6 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -150,7 +150,8 @@ static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int); #define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a) #define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a) -static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) +static int nilfs_prepare_segment_lock(struct super_block *sb, + struct nilfs_transaction_info *ti) { struct nilfs_transaction_info *cur_ti = current->journal_info; void *save = NULL; @@ -164,8 +165,7 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) * it is saved and will be restored on * nilfs_transaction_commit(). 
*/ - printk(KERN_WARNING - "NILFS warning: journal info from a different FS\n"); + nilfs_msg(sb, KERN_WARNING, "journal info from a different FS"); save = current->journal_info; } if (!ti) { @@ -215,7 +215,7 @@ int nilfs_transaction_begin(struct super_block *sb, int vacancy_check) { struct the_nilfs *nilfs; - int ret = nilfs_prepare_segment_lock(ti); + int ret = nilfs_prepare_segment_lock(sb, ti); struct nilfs_transaction_info *trace_ti; if (unlikely(ret < 0)) @@ -373,7 +373,7 @@ static void nilfs_transaction_lock(struct super_block *sb, nilfs_segctor_do_immediate_flush(sci); up_write(&nilfs->ns_segctor_sem); - yield(); + cond_resched(); } if (gcflag) ti->ti_flags |= NILFS_TI_GC; @@ -1858,11 +1858,11 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) */ list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - const unsigned long set_bits = (1 << BH_Uptodate); + const unsigned long set_bits = BIT(BH_Uptodate); const unsigned long clear_bits = - (1 << BH_Dirty | 1 << BH_Async_Write | - 1 << BH_Delay | 1 << BH_NILFS_Volatile | - 1 << BH_NILFS_Redirected); + (BIT(BH_Dirty) | BIT(BH_Async_Write) | + BIT(BH_Delay) | BIT(BH_NILFS_Volatile) | + BIT(BH_NILFS_Redirected)); set_mask_bits(&bh->b_state, clear_bits, set_bits); if (bh == segbuf->sb_super_root) { @@ -1951,8 +1951,9 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, err = nilfs_ifile_get_inode_block( ifile, ii->vfs_inode.i_ino, &ibh); if (unlikely(err)) { - nilfs_warning(sci->sc_super, __func__, - "failed to get inode block."); + nilfs_msg(sci->sc_super, KERN_WARNING, + "log writer: error %d getting inode block (ino=%lu)", + err, ii->vfs_inode.i_ino); return err; } mark_buffer_dirty(ibh); @@ -2131,10 +2132,10 @@ static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn) { spin_lock(&sci->sc_state_lock); - if (!(sci->sc_flush_request & (1 << bn))) { + if (!(sci->sc_flush_request & BIT(bn))) { unsigned long prev_req = sci->sc_flush_request; - sci->sc_flush_request |= (1 << bn); + sci->sc_flush_request |= BIT(bn); if (!prev_req) wake_up(&sci->sc_wait_daemon); } @@ -2318,7 +2319,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, } #define FLUSH_FILE_BIT (0x1) /* data file only */ -#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */ +#define FLUSH_DAT_BIT BIT(NILFS_DAT_INO) /* DAT only */ /** * nilfs_segctor_accept - record accepted sequence count of log-write requests @@ -2458,8 +2459,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, if (likely(!err)) break; - nilfs_warning(sb, __func__, - "segment construction failed. (err=%d)", err); + nilfs_msg(sb, KERN_WARNING, "error %d cleaning segments", err); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(sci->sc_interval); } @@ -2467,9 +2467,9 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs, sci->sc_nfreesegs); if (ret) { - printk(KERN_WARNING - "NILFS warning: error %d on discard request, " - "turning discards off for the device\n", ret); + nilfs_msg(sb, KERN_WARNING, + "error %d on discard request, turning discards off for the device", + ret); nilfs_clear_opt(nilfs, DISCARD); } } @@ -2551,10 +2551,9 @@ static int nilfs_segctor_thread(void *arg) /* start sync. */ sci->sc_task = current; wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */ - printk(KERN_INFO - "segctord starting. 
Construction interval = %lu seconds, " - "CP frequency < %lu seconds\n", - sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ); + nilfs_msg(sci->sc_super, KERN_INFO, + "segctord starting. Construction interval = %lu seconds, CP frequency < %lu seconds", + sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ); spin_lock(&sci->sc_state_lock); loop: @@ -2628,8 +2627,8 @@ static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci) if (IS_ERR(t)) { int err = PTR_ERR(t); - printk(KERN_ERR "NILFS: error %d creating segctord thread\n", - err); + nilfs_msg(sci->sc_super, KERN_ERR, + "error %d creating segctord thread", err); return err; } wait_event(sci->sc_wait_task, sci->sc_task != NULL); @@ -2739,14 +2738,14 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) nilfs_segctor_write_out(sci); if (!list_empty(&sci->sc_dirty_files)) { - nilfs_warning(sci->sc_super, __func__, - "dirty file(s) after the final construction"); + nilfs_msg(sci->sc_super, KERN_WARNING, + "disposed unprocessed dirty file(s) when stopping log writer"); nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1); } if (!list_empty(&sci->sc_iput_queue)) { - nilfs_warning(sci->sc_super, __func__, - "iput queue is not empty"); + nilfs_msg(sci->sc_super, KERN_WARNING, + "disposed unprocessed inode(s) in iput queue when stopping log writer"); nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1); } @@ -2822,8 +2821,8 @@ void nilfs_detach_log_writer(struct super_block *sb) spin_lock(&nilfs->ns_inode_lock); if (!list_empty(&nilfs->ns_dirty_files)) { list_splice_init(&nilfs->ns_dirty_files, &garbage_list); - nilfs_warning(sb, __func__, - "Hit dirty file after stopped log writer"); + nilfs_msg(sb, KERN_WARNING, + "disposed unprocessed dirty file(s) when detaching log writer"); } spin_unlock(&nilfs->ns_inode_lock); up_write(&nilfs->ns_segctor_sem); diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 6565c10b7b76..1060949d7dd2 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -23,7 +23,6 @@ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/workqueue.h> -#include <linux/nilfs2_fs.h> #include "nilfs.h" struct nilfs_root; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1963595a1580..1541a1e9221a 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -22,7 +22,6 @@ #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/errno.h> -#include <linux/nilfs2_fs.h> #include "mdt.h" #include "sufile.h" @@ -181,9 +180,9 @@ int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs, down_write(&NILFS_MDT(sufile)->mi_sem); for (seg = segnumv; seg < segnumv + nsegs; seg++) { if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) { - printk(KERN_WARNING - "%s: invalid segment number: %llu\n", __func__, - (unsigned long long)*seg); + nilfs_msg(sufile->i_sb, KERN_WARNING, + "%s: invalid segment number: %llu", + __func__, (unsigned long long)*seg); nerr++; } } @@ -240,8 +239,9 @@ int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, int ret; if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) { - printk(KERN_WARNING "%s: invalid segment number: %llu\n", - __func__, (unsigned long long)segnum); + nilfs_msg(sufile->i_sb, KERN_WARNING, + "%s: invalid segment number: %llu", + __func__, (unsigned long long)segnum); return -EINVAL; } down_write(&NILFS_MDT(sufile)->mi_sem); @@ -419,8 +419,9 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, kaddr = kmap_atomic(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, 
su_bh, kaddr); if (unlikely(!nilfs_segment_usage_clean(su))) { - printk(KERN_WARNING "%s: segment %llu must be clean\n", - __func__, (unsigned long long)segnum); + nilfs_msg(sufile->i_sb, KERN_WARNING, + "%s: segment %llu must be clean", __func__, + (unsigned long long)segnum); kunmap_atomic(kaddr); return; } @@ -444,7 +445,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, kaddr = kmap_atomic(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); - if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) && + if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) && su->su_nblocks == cpu_to_le32(0)) { kunmap_atomic(kaddr); return; @@ -455,7 +456,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, /* make the segment garbage */ su->su_lastmod = cpu_to_le64(0); su->su_nblocks = cpu_to_le32(0); - su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY); + su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)); kunmap_atomic(kaddr); nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); @@ -476,8 +477,9 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, kaddr = kmap_atomic(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (nilfs_segment_usage_clean(su)) { - printk(KERN_WARNING "%s: segment %llu is already clean\n", - __func__, (unsigned long long)segnum); + nilfs_msg(sufile->i_sb, KERN_WARNING, + "%s: segment %llu is already clean", + __func__, (unsigned long long)segnum); kunmap_atomic(kaddr); return; } @@ -692,7 +694,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, su2 = su; for (j = 0; j < n; j++, su = (void *)su + susz) { if ((le32_to_cpu(su->su_flags) & - ~(1UL << NILFS_SEGMENT_USAGE_ERROR)) || + ~BIT(NILFS_SEGMENT_USAGE_ERROR)) || nilfs_segment_is_active(nilfs, segnum + j)) { ret = -EBUSY; kunmap_atomic(kaddr); @@ -859,10 +861,10 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, si->sui_lastmod = le64_to_cpu(su->su_lastmod); si->sui_nblocks = le32_to_cpu(su->su_nblocks); si->sui_flags = le32_to_cpu(su->su_flags) & - ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); + ~BIT(NILFS_SEGMENT_USAGE_ACTIVE); if (nilfs_segment_is_active(nilfs, segnum + j)) si->sui_flags |= - (1UL << NILFS_SEGMENT_USAGE_ACTIVE); + BIT(NILFS_SEGMENT_USAGE_ACTIVE); } kunmap_atomic(kaddr); brelse(su_bh); @@ -950,7 +952,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, * disk. 
*/ sup->sup_sui.sui_flags &= - ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); + ~BIT(NILFS_SEGMENT_USAGE_ACTIVE); cleansi = nilfs_suinfo_clean(&sup->sup_sui); cleansu = nilfs_segment_usage_clean(su); @@ -1175,14 +1177,12 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize, int err; if (susize > sb->s_blocksize) { - printk(KERN_ERR - "NILFS: too large segment usage size: %zu bytes.\n", - susize); + nilfs_msg(sb, KERN_ERR, + "too large segment usage size: %zu bytes", susize); return -EINVAL; } else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) { - printk(KERN_ERR - "NILFS: too small segment usage size: %zu bytes.\n", - susize); + nilfs_msg(sb, KERN_ERR, + "too small segment usage size: %zu bytes", susize); return -EINVAL; } diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index 46e89872294c..158a9190c8ec 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -21,7 +21,6 @@ #include <linux/fs.h> #include <linux/buffer_head.h> -#include <linux/nilfs2_fs.h> #include "mdt.h" diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 666107a18a22..c95d369e90aa 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -71,6 +71,22 @@ struct kmem_cache *nilfs_btree_path_cache; static int nilfs_setup_super(struct super_block *sb, int is_mount); static int nilfs_remount(struct super_block *sb, int *flags, char *data); +void __nilfs_msg(struct super_block *sb, const char *level, const char *fmt, + ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (sb) + printk("%sNILFS (%s): %pV\n", level, sb->s_id, &vaf); + else + printk("%sNILFS: %pV\n", level, &vaf); + va_end(args); +} + static void nilfs_set_error(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; @@ -91,19 +107,20 @@ static void nilfs_set_error(struct super_block *sb) } /** - * nilfs_error() - report failure condition on a filesystem + * __nilfs_error() - report failure condition on a filesystem + * + * __nilfs_error() sets an ERROR_FS flag on the superblock as well as + * reporting an error message. This function should be called when + * NILFS detects incoherences or defects of meta data on disk. * - * nilfs_error() sets an ERROR_FS flag on the superblock as well as - * reporting an error message. It should be called when NILFS detects - * incoherences or defects of meta data on disk. As for sustainable - * errors such as a single-shot I/O error, nilfs_warning() or the printk() - * function should be used instead. + * This implements the body of nilfs_error() macro. Normally, + * nilfs_error() should be used. As for sustainable errors such as a + * single-shot I/O error, nilfs_msg() should be used instead. * - * The segment constructor must not call this function because it can - * kill itself. + * Callers should not add a trailing newline since this will do it. */ -void nilfs_error(struct super_block *sb, const char *function, - const char *fmt, ...) +void __nilfs_error(struct super_block *sb, const char *function, + const char *fmt, ...) { struct the_nilfs *nilfs = sb->s_fs_info; struct va_format vaf; @@ -133,24 +150,6 @@ void nilfs_error(struct super_block *sb, const char *function, sb->s_id); } -void nilfs_warning(struct super_block *sb, const char *function, - const char *fmt, ...) 
-{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n", - sb->s_id, function, &vaf); - - va_end(args); -} - - struct inode *nilfs_alloc_inode(struct super_block *sb) { struct nilfs_inode_info *ii; @@ -196,8 +195,8 @@ static int nilfs_sync_super(struct super_block *sb, int flag) } if (unlikely(err)) { - printk(KERN_ERR - "NILFS: unable to write superblock (err=%d)\n", err); + nilfs_msg(sb, KERN_ERR, "unable to write superblock: err=%d", + err); if (err == -EIO && nilfs->ns_sbh[1]) { /* * sbp[0] points to newer log than sbp[1], @@ -267,8 +266,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb, sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) { memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); } else { - printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", - sb->s_id); + nilfs_msg(sb, KERN_CRIT, "superblock broke"); return NULL; } } else if (sbp[1] && @@ -378,9 +376,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) offset = sb2off & (nilfs->ns_blocksize - 1); nsbh = sb_getblk(sb, newblocknr); if (!nsbh) { - printk(KERN_WARNING - "NILFS warning: unable to move secondary superblock " - "to block %llu\n", (unsigned long long)newblocknr); + nilfs_msg(sb, KERN_WARNING, + "unable to move secondary superblock to block %llu", + (unsigned long long)newblocknr); ret = -EIO; goto out; } @@ -543,10 +541,9 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, up_read(&nilfs->ns_segctor_sem); if (unlikely(err)) { if (err == -ENOENT || err == -EINVAL) { - printk(KERN_ERR - "NILFS: Invalid checkpoint " - "(checkpoint number=%llu)\n", - (unsigned long long)cno); + nilfs_msg(sb, KERN_ERR, + "Invalid checkpoint (checkpoint number=%llu)", + (unsigned long long)cno); err = -EINVAL; } goto failed; @@ -642,9 +639,8 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) err = nilfs_ifile_count_free_inodes(root->ifile, &nmaxinodes, &nfreeinodes); if (unlikely(err)) { - printk(KERN_WARNING - "NILFS warning: fail to count free inodes: err %d.\n", - err); + nilfs_msg(sb, KERN_WARNING, + "failed to count free inodes: err=%d", err); if (err == -ERANGE) { /* * If nilfs_palloc_count_max_entries() returns @@ -776,9 +772,9 @@ static int parse_options(char *options, struct super_block *sb, int is_remount) break; case Opt_snapshot: if (is_remount) { - printk(KERN_ERR - "NILFS: \"%s\" option is invalid " - "for remount.\n", p); + nilfs_msg(sb, KERN_ERR, + "\"%s\" option is invalid for remount", + p); return 0; } break; @@ -792,8 +788,8 @@ static int parse_options(char *options, struct super_block *sb, int is_remount) nilfs_clear_opt(nilfs, DISCARD); break; default: - printk(KERN_ERR - "NILFS: Unrecognized mount option \"%s\"\n", p); + nilfs_msg(sb, KERN_ERR, + "unrecognized mount option \"%s\"", p); return 0; } } @@ -829,12 +825,10 @@ static int nilfs_setup_super(struct super_block *sb, int is_mount) mnt_count = le16_to_cpu(sbp[0]->s_mnt_count); if (nilfs->ns_mount_state & NILFS_ERROR_FS) { - printk(KERN_WARNING - "NILFS warning: mounting fs with errors\n"); + nilfs_msg(sb, KERN_WARNING, "mounting fs with errors"); #if 0 } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) { - printk(KERN_WARNING - "NILFS warning: maximal mount count reached\n"); + nilfs_msg(sb, KERN_WARNING, "maximal mount count reached"); #endif } if (!max_mnt_count) @@ -897,17 +891,17 @@ int nilfs_check_feature_compatibility(struct super_block *sb, 
features = le64_to_cpu(sbp->s_feature_incompat) & ~NILFS_FEATURE_INCOMPAT_SUPP; if (features) { - printk(KERN_ERR "NILFS: couldn't mount because of unsupported " - "optional features (%llx)\n", - (unsigned long long)features); + nilfs_msg(sb, KERN_ERR, + "couldn't mount because of unsupported optional features (%llx)", + (unsigned long long)features); return -EINVAL; } features = le64_to_cpu(sbp->s_feature_compat_ro) & ~NILFS_FEATURE_COMPAT_RO_SUPP; if (!(sb->s_flags & MS_RDONLY) && features) { - printk(KERN_ERR "NILFS: couldn't mount RDWR because of " - "unsupported optional features (%llx)\n", - (unsigned long long)features); + nilfs_msg(sb, KERN_ERR, + "couldn't mount RDWR because of unsupported optional features (%llx)", + (unsigned long long)features); return -EINVAL; } return 0; @@ -923,13 +917,13 @@ static int nilfs_get_root_dentry(struct super_block *sb, inode = nilfs_iget(sb, root, NILFS_ROOT_INO); if (IS_ERR(inode)) { - printk(KERN_ERR "NILFS: get root inode failed\n"); ret = PTR_ERR(inode); + nilfs_msg(sb, KERN_ERR, "error %d getting root inode", ret); goto out; } if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) { iput(inode); - printk(KERN_ERR "NILFS: corrupt root inode.\n"); + nilfs_msg(sb, KERN_ERR, "corrupt root inode"); ret = -EINVAL; goto out; } @@ -957,7 +951,7 @@ static int nilfs_get_root_dentry(struct super_block *sb, return ret; failed_dentry: - printk(KERN_ERR "NILFS: get root dentry failed\n"); + nilfs_msg(sb, KERN_ERR, "error %d getting root dentry", ret); goto out; } @@ -977,18 +971,18 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, ret = (ret == -ENOENT) ? -EINVAL : ret; goto out; } else if (!ret) { - printk(KERN_ERR "NILFS: The specified checkpoint is " - "not a snapshot (checkpoint number=%llu).\n", - (unsigned long long)cno); + nilfs_msg(s, KERN_ERR, + "The specified checkpoint is not a snapshot (checkpoint number=%llu)", + (unsigned long long)cno); ret = -EINVAL; goto out; } ret = nilfs_attach_checkpoint(s, cno, false, &root); if (ret) { - printk(KERN_ERR "NILFS: error loading snapshot " - "(checkpoint number=%llu).\n", - (unsigned long long)cno); + nilfs_msg(s, KERN_ERR, + "error %d while loading snapshot (checkpoint number=%llu)", + ret, (unsigned long long)cno); goto out; } ret = nilfs_get_root_dentry(s, root, root_dentry); @@ -1058,7 +1052,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) __u64 cno; int err; - nilfs = alloc_nilfs(sb->s_bdev); + nilfs = alloc_nilfs(sb); if (!nilfs) return -ENOMEM; @@ -1083,8 +1077,9 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) cno = nilfs_last_cno(nilfs); err = nilfs_attach_checkpoint(sb, cno, true, &fsroot); if (err) { - printk(KERN_ERR "NILFS: error loading last checkpoint " - "(checkpoint number=%llu).\n", (unsigned long long)cno); + nilfs_msg(sb, KERN_ERR, + "error %d while loading last checkpoint (checkpoint number=%llu)", + err, (unsigned long long)cno); goto failed_unload; } @@ -1144,9 +1139,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; if (!nilfs_valid_fs(nilfs)) { - printk(KERN_WARNING "NILFS (device %s): couldn't " - "remount because the filesystem is in an " - "incomplete recovery state.\n", sb->s_id); + nilfs_msg(sb, KERN_WARNING, + "couldn't remount because the filesystem is in an incomplete recovery state"); goto restore_opts; } @@ -1178,10 +1172,9 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) ~NILFS_FEATURE_COMPAT_RO_SUPP; up_read(&nilfs->ns_sem); if 
(features) { - printk(KERN_WARNING "NILFS (device %s): couldn't " - "remount RDWR because of unsupported optional " - "features (%llx)\n", - sb->s_id, (unsigned long long)features); + nilfs_msg(sb, KERN_WARNING, + "couldn't remount RDWR because of unsupported optional features (%llx)", + (unsigned long long)features); err = -EROFS; goto restore_opts; } @@ -1212,6 +1205,38 @@ struct nilfs_super_data { int flags; }; +static int nilfs_parse_snapshot_option(const char *option, + const substring_t *arg, + struct nilfs_super_data *sd) +{ + unsigned long long val; + const char *msg = NULL; + int err; + + if (!(sd->flags & MS_RDONLY)) { + msg = "read-only option is not specified"; + goto parse_error; + } + + err = kstrtoull(arg->from, 0, &val); + if (err) { + if (err == -ERANGE) + msg = "too large checkpoint number"; + else + msg = "malformed argument"; + goto parse_error; + } else if (val == 0) { + msg = "invalid checkpoint number 0"; + goto parse_error; + } + sd->cno = val; + return 0; + +parse_error: + nilfs_msg(NULL, KERN_ERR, "invalid option \"%s\": %s", option, msg); + return 1; +} + /** * nilfs_identify - pre-read mount options needed to identify mount instance * @data: mount options @@ -1228,24 +1253,9 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd) p = strsep(&options, ","); if (p != NULL && *p) { token = match_token(p, tokens, args); - if (token == Opt_snapshot) { - if (!(sd->flags & MS_RDONLY)) { - ret++; - } else { - sd->cno = simple_strtoull(args[0].from, - NULL, 0); - /* - * No need to see the end pointer; - * match_token() has done syntax - * checking. - */ - if (sd->cno == 0) - ret++; - } - } - if (ret) - printk(KERN_ERR - "NILFS: invalid mount option: %s\n", p); + if (token == Opt_snapshot) + ret = nilfs_parse_snapshot_option(p, &args[0], + sd); } if (!options) break; @@ -1326,10 +1336,10 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } else if (!sd.cno) { if (nilfs_tree_is_busy(s->s_root)) { if ((flags ^ s->s_flags) & MS_RDONLY) { - printk(KERN_ERR "NILFS: the device already " - "has a %s mount.\n", - (s->s_flags & MS_RDONLY) ? - "read-only" : "read/write"); + nilfs_msg(s, KERN_ERR, + "the device already has a %s mount.", + (s->s_flags & MS_RDONLY) ? 
+ "read-only" : "read/write"); err = -EBUSY; goto failed_super; } diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 8ffa42b704d8..490303e3d517 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -272,8 +272,8 @@ nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr, err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); up_read(&nilfs->ns_segctor_sem); if (err < 0) { - printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n", - err); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "unable to get checkpoint stat: err=%d", err); return err; } @@ -295,8 +295,8 @@ nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr, err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); up_read(&nilfs->ns_segctor_sem); if (err < 0) { - printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n", - err); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "unable to get checkpoint stat: err=%d", err); return err; } @@ -326,9 +326,9 @@ nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr, { __u64 cno; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); cno = nilfs->ns_cno; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%llu\n", cno); } @@ -414,8 +414,8 @@ nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr, err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat); up_read(&nilfs->ns_segctor_sem); if (err < 0) { - printk(KERN_ERR "NILFS: unable to get segment stat: err=%d\n", - err); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "unable to get segment stat: err=%d", err); return err; } @@ -511,9 +511,9 @@ nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr, { u64 seg_seq; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); seg_seq = nilfs->ns_seg_seq; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq); } @@ -525,9 +525,9 @@ nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr, { __u64 segnum; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); segnum = nilfs->ns_segnum; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%llu\n", segnum); } @@ -539,9 +539,9 @@ nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr, { __u64 nextnum; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); nextnum = nilfs->ns_nextnum; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum); } @@ -553,9 +553,9 @@ nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr, { unsigned long pseg_offset; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); pseg_offset = nilfs->ns_pseg_offset; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset); } @@ -567,9 +567,9 @@ nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr, { __u64 cno; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); cno = nilfs->ns_cno; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%llu\n", cno); } @@ -581,9 +581,9 @@ nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr, { time_t ctime; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); ctime = nilfs->ns_ctime; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return NILFS_SHOW_TIME(ctime, buf); } @@ -595,9 +595,9 @@ 
nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr, { time_t ctime; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); ctime = nilfs->ns_ctime; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime); } @@ -609,9 +609,9 @@ nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr, { time_t nongc_ctime; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); nongc_ctime = nilfs->ns_nongc_ctime; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return NILFS_SHOW_TIME(nongc_ctime, buf); } @@ -623,9 +623,9 @@ nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr, { time_t nongc_ctime; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); nongc_ctime = nilfs->ns_nongc_ctime; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)nongc_ctime); @@ -638,9 +638,9 @@ nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr, { u32 ndirtyblks; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks); - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks); } @@ -789,14 +789,15 @@ nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr, err = kstrtouint(skip_spaces(buf), 0, &val); if (err) { - printk(KERN_ERR "NILFS: unable to convert string: err=%d\n", - err); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "unable to convert string: err=%d", err); return err; } if (val < NILFS_SB_FREQ) { val = NILFS_SB_FREQ; - printk(KERN_WARNING "NILFS: superblock update frequency cannot be lesser than 10 seconds\n"); + nilfs_msg(nilfs->ns_sb, KERN_WARNING, + "superblock update frequency cannot be lesser than 10 seconds"); } down_write(&nilfs->ns_sem); @@ -999,7 +1000,8 @@ int nilfs_sysfs_create_device_group(struct super_block *sb) nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL); if (unlikely(!nilfs->ns_dev_subgroups)) { err = -ENOMEM; - printk(KERN_ERR "NILFS: unable to allocate memory for device group\n"); + nilfs_msg(sb, KERN_ERR, + "unable to allocate memory for device group"); goto failed_create_device_group; } @@ -1109,15 +1111,15 @@ int __init nilfs_sysfs_init(void) nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj); if (!nilfs_kset) { err = -ENOMEM; - printk(KERN_ERR "NILFS: unable to create sysfs entry: err %d\n", - err); + nilfs_msg(NULL, KERN_ERR, + "unable to create sysfs entry: err=%d", err); goto failed_sysfs_init; } err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group); if (unlikely(err)) { - printk(KERN_ERR "NILFS: unable to create feature group: err %d\n", - err); + nilfs_msg(NULL, KERN_ERR, + "unable to create feature group: err=%d", err); goto cleanup_sysfs_init; } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index e9fd241b9a0a..2dd75bf619ad 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -56,12 +56,12 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs, /** * alloc_nilfs - allocate a nilfs object - * @bdev: block device to which the_nilfs is related + * @sb: super block instance * * Return Value: On success, pointer to the_nilfs is returned. * On error, NULL is returned. 
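The super.c hunk earlier in this series adds __nilfs_msg(), which prefixes every message with the filesystem identity taken from the super block, and the alloc_nilfs() change documented here passes the super block down so lower layers can reach that identity. A minimal user-space sketch of the same centralised-logging pattern, assuming nothing about the kernel implementation (it uses vsnprintf instead of printk's %pV, and a plain string as the "context"), might look like:

    #include <stdarg.h>
    #include <stdio.h>

    /* ctx may be NULL when no device identity is available yet */
    static void fs_msg(const char *ctx, const char *level,
                       const char *fmt, ...)
    {
        char buf[256];
        va_list args;

        va_start(args, fmt);
        vsnprintf(buf, sizeof(buf), fmt, args);
        va_end(args);

        if (ctx)
            fprintf(stderr, "%s MYFS (%s): %s\n", level, ctx, buf);
        else
            fprintf(stderr, "%s MYFS: %s\n", level, buf);
    }

    int main(void)
    {
        fs_msg("sda1", "ERR", "error %d getting root inode", -5);
        fs_msg(NULL, "INFO", "module loaded");
        return 0;
    }

Funnelling every message through one helper is what lets the rest of the series drop the hand-rolled "NILFS warning (device %s)" prefixes from individual call sites.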
*/ -struct the_nilfs *alloc_nilfs(struct block_device *bdev) +struct the_nilfs *alloc_nilfs(struct super_block *sb) { struct the_nilfs *nilfs; @@ -69,7 +69,8 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev) if (!nilfs) return NULL; - nilfs->ns_bdev = bdev; + nilfs->ns_sb = sb; + nilfs->ns_bdev = sb->s_bdev; atomic_set(&nilfs->ns_ndirtyblks, 0); init_rwsem(&nilfs->ns_sem); mutex_init(&nilfs->ns_snapshot_mount_mutex); @@ -191,7 +192,10 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs, nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg); nilfs->ns_cno = nilfs->ns_last_cno + 1; if (nilfs->ns_segnum >= nilfs->ns_nsegments) { - printk(KERN_ERR "NILFS invalid last segment number.\n"); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "pointed segment number is out of range: segnum=%llu, nsegments=%lu", + (unsigned long long)nilfs->ns_segnum, + nilfs->ns_nsegments); ret = -EINVAL; } return ret; @@ -215,12 +219,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) int err; if (!valid_fs) { - printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); + nilfs_msg(sb, KERN_WARNING, "mounting unchecked fs"); if (s_flags & MS_RDONLY) { - printk(KERN_INFO "NILFS: INFO: recovery " - "required for readonly filesystem.\n"); - printk(KERN_INFO "NILFS: write access will " - "be enabled during recovery.\n"); + nilfs_msg(sb, KERN_INFO, + "recovery required for readonly filesystem"); + nilfs_msg(sb, KERN_INFO, + "write access will be enabled during recovery"); } } @@ -235,13 +239,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) goto scan_error; if (!nilfs_valid_sb(sbp[1])) { - printk(KERN_WARNING - "NILFS warning: unable to fall back to spare" - "super block\n"); + nilfs_msg(sb, KERN_WARNING, + "unable to fall back to spare super block"); goto scan_error; } - printk(KERN_INFO - "NILFS: try rollback from an earlier position\n"); + nilfs_msg(sb, KERN_INFO, + "trying rollback from an earlier position"); /* * restore super block with its spare and reconfigure @@ -254,10 +257,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) /* verify consistency between two super blocks */ blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size); if (blocksize != nilfs->ns_blocksize) { - printk(KERN_WARNING - "NILFS warning: blocksize differs between " - "two super blocks (%d != %d)\n", - blocksize, nilfs->ns_blocksize); + nilfs_msg(sb, KERN_WARNING, + "blocksize differs between two super blocks (%d != %d)", + blocksize, nilfs->ns_blocksize); goto scan_error; } @@ -276,7 +278,8 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root); if (unlikely(err)) { - printk(KERN_ERR "NILFS: error loading super root.\n"); + nilfs_msg(sb, KERN_ERR, "error %d while loading super root", + err); goto failed; } @@ -287,30 +290,29 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) __u64 features; if (nilfs_test_opt(nilfs, NORECOVERY)) { - printk(KERN_INFO "NILFS: norecovery option specified. 
" - "skipping roll-forward recovery\n"); + nilfs_msg(sb, KERN_INFO, + "norecovery option specified, skipping roll-forward recovery"); goto skip_recovery; } features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & ~NILFS_FEATURE_COMPAT_RO_SUPP; if (features) { - printk(KERN_ERR "NILFS: couldn't proceed with " - "recovery because of unsupported optional " - "features (%llx)\n", - (unsigned long long)features); + nilfs_msg(sb, KERN_ERR, + "couldn't proceed with recovery because of unsupported optional features (%llx)", + (unsigned long long)features); err = -EROFS; goto failed_unload; } if (really_read_only) { - printk(KERN_ERR "NILFS: write access " - "unavailable, cannot proceed.\n"); + nilfs_msg(sb, KERN_ERR, + "write access unavailable, cannot proceed"); err = -EROFS; goto failed_unload; } sb->s_flags &= ~MS_RDONLY; } else if (nilfs_test_opt(nilfs, NORECOVERY)) { - printk(KERN_ERR "NILFS: recovery cancelled because norecovery " - "option was specified for a read/write mount\n"); + nilfs_msg(sb, KERN_ERR, + "recovery cancelled because norecovery option was specified for a read/write mount"); err = -EINVAL; goto failed_unload; } @@ -325,11 +327,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) up_write(&nilfs->ns_sem); if (err) { - printk(KERN_ERR "NILFS: failed to update super block. " - "recovery unfinished.\n"); + nilfs_msg(sb, KERN_ERR, + "error %d updating super block. recovery unfinished.", + err); goto failed_unload; } - printk(KERN_INFO "NILFS: recovery complete.\n"); + nilfs_msg(sb, KERN_INFO, "recovery complete"); skip_recovery: nilfs_clear_recovery_info(&ri); @@ -337,7 +340,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) return 0; scan_error: - printk(KERN_ERR "NILFS: error searching super root.\n"); + nilfs_msg(sb, KERN_ERR, "error %d while searching super root", err); goto failed; failed_unload: @@ -384,12 +387,11 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, struct nilfs_super_block *sbp) { if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) { - printk(KERN_ERR "NILFS: unsupported revision " - "(superblock rev.=%d.%d, current rev.=%d.%d). " - "Please check the version of mkfs.nilfs.\n", - le32_to_cpu(sbp->s_rev_level), - le16_to_cpu(sbp->s_minor_rev_level), - NILFS_CURRENT_REV, NILFS_MINOR_REV); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "unsupported revision (superblock rev.=%d.%d, current rev.=%d.%d). 
Please check the version of mkfs.nilfs(2).", + le32_to_cpu(sbp->s_rev_level), + le16_to_cpu(sbp->s_minor_rev_level), + NILFS_CURRENT_REV, NILFS_MINOR_REV); return -EINVAL; } nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes); @@ -398,12 +400,14 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); if (nilfs->ns_inode_size > nilfs->ns_blocksize) { - printk(KERN_ERR "NILFS: too large inode size: %d bytes.\n", - nilfs->ns_inode_size); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "too large inode size: %d bytes", + nilfs->ns_inode_size); return -EINVAL; } else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) { - printk(KERN_ERR "NILFS: too small inode size: %d bytes.\n", - nilfs->ns_inode_size); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "too small inode size: %d bytes", + nilfs->ns_inode_size); return -EINVAL; } @@ -411,7 +415,9 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { - printk(KERN_ERR "NILFS: too short segment.\n"); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "too short segment: %lu blocks", + nilfs->ns_blocks_per_segment); return -EINVAL; } @@ -420,7 +426,9 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, le32_to_cpu(sbp->s_r_segments_percentage); if (nilfs->ns_r_segments_percentage < 1 || nilfs->ns_r_segments_percentage > 99) { - printk(KERN_ERR "NILFS: invalid reserved segments percentage.\n"); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "invalid reserved segments percentage: %lu", + nilfs->ns_r_segments_percentage); return -EINVAL; } @@ -504,16 +512,16 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, if (!sbp[0]) { if (!sbp[1]) { - printk(KERN_ERR "NILFS: unable to read superblock\n"); + nilfs_msg(sb, KERN_ERR, "unable to read superblock"); return -EIO; } - printk(KERN_WARNING - "NILFS warning: unable to read primary superblock " - "(blocksize = %d)\n", blocksize); + nilfs_msg(sb, KERN_WARNING, + "unable to read primary superblock (blocksize = %d)", + blocksize); } else if (!sbp[1]) { - printk(KERN_WARNING - "NILFS warning: unable to read secondary superblock " - "(blocksize = %d)\n", blocksize); + nilfs_msg(sb, KERN_WARNING, + "unable to read secondary superblock (blocksize = %d)", + blocksize); } /* @@ -535,14 +543,14 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, } if (!valid[swp]) { nilfs_release_super_block(nilfs); - printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n", - sb->s_id); + nilfs_msg(sb, KERN_ERR, "couldn't find nilfs on the device"); return -EINVAL; } if (!valid[!swp]) - printk(KERN_WARNING "NILFS warning: broken superblock. 
" - "using spare superblock (blocksize = %d).\n", blocksize); + nilfs_msg(sb, KERN_WARNING, + "broken superblock, retrying with spare superblock (blocksize = %d)", + blocksize); if (swp) nilfs_swap_super_block(nilfs); @@ -576,7 +584,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE); if (!blocksize) { - printk(KERN_ERR "NILFS: unable to set blocksize\n"); + nilfs_msg(sb, KERN_ERR, "unable to set blocksize"); err = -EINVAL; goto out; } @@ -595,8 +603,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); if (blocksize < NILFS_MIN_BLOCK_SIZE || blocksize > NILFS_MAX_BLOCK_SIZE) { - printk(KERN_ERR "NILFS: couldn't mount because of unsupported " - "filesystem blocksize %d\n", blocksize); + nilfs_msg(sb, KERN_ERR, + "couldn't mount because of unsupported filesystem blocksize %d", + blocksize); err = -EINVAL; goto failed_sbh; } @@ -604,10 +613,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) int hw_blocksize = bdev_logical_block_size(sb->s_bdev); if (blocksize < hw_blocksize) { - printk(KERN_ERR - "NILFS: blocksize %d too small for device " - "(sector-size = %d).\n", - blocksize, hw_blocksize); + nilfs_msg(sb, KERN_ERR, + "blocksize %d too small for device (sector-size = %d)", + blocksize, hw_blocksize); err = -EINVAL; goto failed_sbh; } diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 79369fd6b13b..b305c6f033e7 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -43,6 +43,7 @@ enum { * struct the_nilfs - struct to supervise multiple nilfs mount points * @ns_flags: flags * @ns_flushed_device: flag indicating if all volatile data was flushed + * @ns_sb: back pointer to super block instance * @ns_bdev: block device * @ns_sem: semaphore for shared states * @ns_snapshot_mount_mutex: mutex to protect snapshot mounts @@ -102,6 +103,7 @@ struct the_nilfs { unsigned long ns_flags; int ns_flushed_device; + struct super_block *ns_sb; struct block_device *ns_bdev; struct rw_semaphore ns_sem; struct mutex ns_snapshot_mount_mutex; @@ -120,11 +122,8 @@ struct the_nilfs { unsigned int ns_sb_update_freq; /* - * Following fields are dedicated to a writable FS-instance. - * Except for the period seeking checkpoint, code outside the segment - * constructor must lock a segment semaphore while accessing these - * fields. - * The writable FS-instance is sole during a lifetime of the_nilfs. + * The following fields are updated by a writable FS-instance. + * These fields are protected by ns_segctor_sem outside load_nilfs(). */ u64 ns_seg_seq; __u64 ns_segnum; @@ -281,7 +280,7 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs) } void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); -struct the_nilfs *alloc_nilfs(struct block_device *bdev); +struct the_nilfs *alloc_nilfs(struct super_block *sb); void destroy_nilfs(struct the_nilfs *nilfs); int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data); int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 460c0cedab3a..7dabbc31060e 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6106,6 +6106,43 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, } } +/* + * Try to flush truncate logs if we can free enough clusters from it. 
+ * As for return value, "< 0" means error, "0" no space and "1" means + * we have freed enough spaces and let the caller try to allocate again. + */ +int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, + unsigned int needed) +{ + tid_t target; + int ret = 0; + unsigned int truncated_clusters; + + inode_lock(osb->osb_tl_inode); + truncated_clusters = osb->truncated_clusters; + inode_unlock(osb->osb_tl_inode); + + /* + * Check whether we can succeed in allocating if we free + * the truncate log. + */ + if (truncated_clusters < needed) + goto out; + + ret = ocfs2_flush_truncate_log(osb); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { + jbd2_log_wait_commit(osb->journal->j_journal, target); + ret = 1; + } +out: + return ret; +} + static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, int slot_num, struct inode **tl_inode, diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index f3dc1b0dfffc..4a5152ec88a3 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -188,6 +188,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, u64 start_blk, unsigned int num_clusters); int __ocfs2_flush_truncate_log(struct ocfs2_super *osb); +int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, + unsigned int needed); /* * Process local structure which describes the block unlinks done diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index af2adfcb0f6f..98d36548153d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1645,43 +1645,6 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, return ret; } -/* - * Try to flush truncate logs if we can free enough clusters from it. - * As for return value, "< 0" means error, "0" no space and "1" means - * we have freed enough spaces and let the caller try to allocate again. - */ -static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, - unsigned int needed) -{ - tid_t target; - int ret = 0; - unsigned int truncated_clusters; - - inode_lock(osb->osb_tl_inode); - truncated_clusters = osb->truncated_clusters; - inode_unlock(osb->osb_tl_inode); - - /* - * Check whether we can succeed in allocating if we free - * the truncate log. 
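The comment above fixes the convention for ocfs2_try_to_free_truncate_log(): a negative value is an error, 0 means nothing could be freed, and 1 means enough space was reclaimed for the caller to retry; a later hunk in fs/ocfs2/suballoc.c retries the cluster reservation exactly once when it gets 1 back. A hedged, self-contained sketch of that free-and-retry shape, with invented helper names rather than the ocfs2 API:

    #include <stdio.h>

    static unsigned int free_units = 4;   /* directly allocatable */
    static unsigned int cached_units = 8; /* reclaimable from a log/cache */

    /* < 0 error, 0 nothing freed, 1 freed enough: same convention as above */
    static int try_to_free_cache(unsigned int needed)
    {
        if (cached_units < needed)
            return 0;
        free_units += cached_units;       /* "flush" the cache */
        cached_units = 0;
        return 1;
    }

    static int reserve_units(unsigned int wanted)
    {
        int retried = 0;

    retry:
        if (free_units >= wanted) {
            free_units -= wanted;
            return 0;
        }
        if (!retried) {                   /* retry at most once */
            retried = 1;
            if (try_to_free_cache(wanted) == 1)
                goto retry;
        }
        return -1;                        /* stands in for -ENOSPC */
    }

    int main(void)
    {
        printf("first reserve:  %d\n", reserve_units(6));  /* retries, succeeds */
        printf("second reserve: %d\n", reserve_units(99)); /* still fails */
        return 0;
    }

Capping the retry at one pass keeps the slow path bounded: if flushing the cache did not produce the space, there is no point looping.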
- */ - if (truncated_clusters < needed) - goto out; - - ret = ocfs2_flush_truncate_log(osb); - if (ret) { - mlog_errno(ret); - goto out; - } - - if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { - jbd2_log_wait_commit(osb->journal->j_journal, target); - ret = 1; - } -out: - return ret; -} - int ocfs2_write_begin_nolock(struct address_space *mapping, loff_t pos, unsigned len, ocfs2_write_type_t type, struct page **pagep, void **fsdata, diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1eaa9100c889..fc5443226675 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1665,10 +1665,8 @@ int ocfs2_create_new_inode_locks(struct inode *inode) } ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0); - if (ret) { + if (ret) mlog_errno(ret); - goto bail; - } bail: return ret; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index e607419cdfa4..bc0e21e8a674 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1159,10 +1159,8 @@ static int ocfs2_force_read_journal(struct inode *inode) int status = 0; int i; u64 v_blkno, p_blkno, p_blocks, num_blocks; -#define CONCURRENT_JOURNAL_FILL 32ULL - struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; - - memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); + struct buffer_head *bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); v_blkno = 0; @@ -1174,29 +1172,34 @@ static int ocfs2_force_read_journal(struct inode *inode) goto bail; } - if (p_blocks > CONCURRENT_JOURNAL_FILL) - p_blocks = CONCURRENT_JOURNAL_FILL; + for (i = 0; i < p_blocks; i++) { + bh = __find_get_block(osb->sb->s_bdev, p_blkno, + osb->sb->s_blocksize); + /* block not cached. */ + if (!bh) { + p_blkno++; + continue; + } - /* We are reading journal data which should not - * be put in the uptodate cache */ - status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb), - p_blkno, p_blocks, bhs); - if (status < 0) { - mlog_errno(status); - goto bail; - } + brelse(bh); + bh = NULL; + /* We are reading journal data which should not + * be put in the uptodate cache. 
+ */ + status = ocfs2_read_blocks_sync(osb, p_blkno, 1, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } - for(i = 0; i < p_blocks; i++) { - brelse(bhs[i]); - bhs[i] = NULL; + brelse(bh); + bh = NULL; } v_blkno += p_blocks; } bail: - for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) - brelse(bhs[i]); return status; } diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index ced70c8139f7..c9e828ec3c8e 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -1007,10 +1007,17 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) lc->oc_type = NO_CONTROLD; rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name, - DLM_LSFL_FS, DLM_LVB_LEN, + DLM_LSFL_FS | DLM_LSFL_NEWEXCL, DLM_LVB_LEN, &ocfs2_ls_ops, conn, &ops_rv, &fsdlm); - if (rc) + if (rc) { + if (rc == -EEXIST || rc == -EPROTO) + printk(KERN_ERR "ocfs2: Unable to create the " + "lockspace %s (%d), because a ocfs2-tools " + "program is running on this file system " + "with the same name lockspace\n", + conn->cc_name, rc); goto out; + } if (ops_rv == -EOPNOTSUPP) { lc->oc_type = WITH_CONTROLD; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 13219ed73e1d..52c07346bea3 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -735,8 +735,6 @@ static void __exit ocfs2_stack_glue_exit(void) { memset(&locking_max_version, 0, sizeof(struct ocfs2_protocol_version)); - locking_max_version.pv_major = 0; - locking_max_version.pv_minor = 0; ocfs2_sysfs_exit(); if (ocfs2_table_header) unregister_sysctl_table(ocfs2_table_header); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 2f19aeec5482..9f7f3b60e667 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1164,7 +1164,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, int flags, struct ocfs2_alloc_context **ac) { - int status; + int status, ret = 0; + int retried = 0; *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { @@ -1189,7 +1190,21 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, } if (status == -ENOSPC) { +retry: status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + /* Retry if there is sufficient space cached in truncate log */ + if (status == -ENOSPC && !retried) { + retried = 1; + ocfs2_inode_unlock((*ac)->ac_inode, 1); + inode_unlock((*ac)->ac_inode); + + ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted); + if (ret == 1) + goto retry; + + if (ret < 0) + mlog_errno(ret); + } if (status < 0) { if (status != -ENOSPC) mlog_errno(status); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 0f586bded7f4..ed3d47d8334c 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -80,7 +80,7 @@ static int orangefs_readpages(struct file *file, if (!add_to_page_cache(page, mapping, page->index, - GFP_KERNEL)) { + readahead_gfp_mask(mapping))) { ret = read_one_page(page); gossip_debug(GOSSIP_INODE_DEBUG, "failure adding page to cache, read_one_page returned: %d\n", diff --git a/fs/pipe.c b/fs/pipe.c index 0d3f5165cb0b..4b32928f5426 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,6 +21,7 @@ #include <linux/audit.h> #include <linux/syscalls.h> #include <linux/fcntl.h> +#include <linux/memcontrol.h> #include <asm/uaccess.h> #include <asm/ioctls.h> @@ -137,6 +138,22 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, put_page(page); } +static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + if (page_count(page) == 1) { + if 
(memcg_kmem_enabled()) { + memcg_kmem_uncharge(page, 0); + __ClearPageKmemcg(page); + } + __SetPageLocked(page); + return 0; + } + return 1; +} + /** * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer * @pipe: the pipe that the buffer belongs to @@ -219,7 +236,7 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, - .steal = generic_pipe_buf_steal, + .steal = anon_pipe_buf_steal, .get = generic_pipe_buf_get, }; @@ -227,7 +244,7 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = { .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, - .steal = generic_pipe_buf_steal, + .steal = anon_pipe_buf_steal, .get = generic_pipe_buf_get, }; @@ -405,7 +422,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) int copied; if (!page) { - page = alloc_page(GFP_HIGHUSER); + page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); if (unlikely(!page)) { ret = ret ? : -ENOMEM; break; @@ -611,7 +628,7 @@ struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; - pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); if (pipe) { unsigned long pipe_bufs = PIPE_DEF_BUFFERS; struct user_struct *user = get_current_user(); @@ -619,7 +636,9 @@ struct pipe_inode_info *alloc_pipe_info(void) if (!too_many_pipe_buffers_hard(user)) { if (too_many_pipe_buffers_soft(user)) pipe_bufs = 1; - pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL); + pipe->bufs = kcalloc(pipe_bufs, + sizeof(struct pipe_buffer), + GFP_KERNEL_ACCOUNT); } if (pipe->bufs) { @@ -1010,7 +1029,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) if (nr_pages < pipe->nrbufs) return -EBUSY; - bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN); + bufs = kcalloc(nr_pages, sizeof(*bufs), + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) return -ENOMEM; diff --git a/fs/proc/base.c b/fs/proc/base.c index a11eb7196ec8..54e270262979 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -579,11 +579,8 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, unsigned long totalpages = totalram_pages + total_swap_pages; unsigned long points = 0; - read_lock(&tasklist_lock); - if (pid_alive(task)) - points = oom_badness(task, NULL, NULL, totalpages) * - 1000 / totalpages; - read_unlock(&tasklist_lock); + points = oom_badness(task, NULL, NULL, totalpages) * + 1000 / totalpages; seq_printf(m, "%lu\n", points); return 0; @@ -1024,23 +1021,107 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, char buffer[PROC_NUMBUF]; int oom_adj = OOM_ADJUST_MIN; size_t len; - unsigned long flags; if (!task) return -ESRCH; - if (lock_task_sighand(task, &flags)) { - if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) - oom_adj = OOM_ADJUST_MAX; - else - oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / - OOM_SCORE_ADJ_MAX; - unlock_task_sighand(task, &flags); - } + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) + oom_adj = OOM_ADJUST_MAX; + else + oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / + OOM_SCORE_ADJ_MAX; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } +static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) +{ + static DEFINE_MUTEX(oom_adj_mutex); + struct 
mm_struct *mm = NULL; + struct task_struct *task; + int err = 0; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + + mutex_lock(&oom_adj_mutex); + if (legacy) { + if (oom_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_unlock; + } + /* + * /proc/pid/oom_adj is provided for legacy purposes, ask users to use + * /proc/pid/oom_score_adj instead. + */ + pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), task_pid_nr(task), + task_pid_nr(task)); + } else { + if ((short)oom_adj < task->signal->oom_score_adj_min && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_unlock; + } + } + + /* + * Make sure we will check other processes sharing the mm if this is + * not vfrok which wants its own oom_score_adj. + * pin the mm so it doesn't go away and get reused after task_unlock + */ + if (!task->vfork_done) { + struct task_struct *p = find_lock_task_mm(task); + + if (p) { + if (atomic_read(&p->mm->mm_users) > 1) { + mm = p->mm; + atomic_inc(&mm->mm_count); + } + task_unlock(p); + } + } + + task->signal->oom_score_adj = oom_adj; + if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = (short)oom_adj; + trace_oom_score_adj_update(task); + + if (mm) { + struct task_struct *p; + + rcu_read_lock(); + for_each_process(p) { + if (same_thread_group(task, p)) + continue; + + /* do not touch kernel threads or the global init */ + if (p->flags & PF_KTHREAD || is_global_init(p)) + continue; + + task_lock(p); + if (!p->vfork_done && process_shares_mm(p, mm)) { + pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n", + task_pid_nr(p), p->comm, + p->signal->oom_score_adj, oom_adj, + task_pid_nr(task), task->comm); + p->signal->oom_score_adj = oom_adj; + if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) + p->signal->oom_score_adj_min = (short)oom_adj; + } + task_unlock(p); + } + rcu_read_unlock(); + mmdrop(mm); + } +err_unlock: + mutex_unlock(&oom_adj_mutex); + put_task_struct(task); + return err; +} + /* * /proc/pid/oom_adj exists solely for backwards compatibility with previous * kernels. The effective policy is defined by oom_score_adj, which has a @@ -1054,10 +1135,8 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, static ssize_t oom_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task; char buffer[PROC_NUMBUF]; int oom_adj; - unsigned long flags; int err; memset(buffer, 0, sizeof(buffer)); @@ -1077,23 +1156,6 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf, goto out; } - task = get_proc_task(file_inode(file)); - if (!task) { - err = -ESRCH; - goto out; - } - - task_lock(task); - if (!task->mm) { - err = -EINVAL; - goto err_task_lock; - } - - if (!lock_task_sighand(task, &flags)) { - err = -ESRCH; - goto err_task_lock; - } - /* * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum * value is always attainable. 
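The comment here refers to the scaling between the legacy /proc/<pid>/oom_adj range and the oom_score_adj range; both directions of that conversion appear in the surrounding hunks (oom_adj_read() and the write path that follows this comment). A small stand-alone rendering of that arithmetic, using the usual values of these constants (OOM_DISABLE is -17, OOM_ADJUST_MAX is 15, OOM_SCORE_ADJ_MAX is 1000):

    #include <stdio.h>

    #define OOM_DISABLE        (-17)
    #define OOM_ADJUST_MAX       15
    #define OOM_SCORE_ADJ_MAX  1000

    /* legacy oom_adj -> oom_score_adj, as in the write path */
    static int adj_to_score_adj(int oom_adj)
    {
        if (oom_adj == OOM_ADJUST_MAX)
            return OOM_SCORE_ADJ_MAX;
        return (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
    }

    /* oom_score_adj -> legacy oom_adj, as in oom_adj_read() */
    static int score_adj_to_adj(int oom_score_adj)
    {
        if (oom_score_adj == OOM_SCORE_ADJ_MAX)
            return OOM_ADJUST_MAX;
        return (oom_score_adj * -OOM_DISABLE) / OOM_SCORE_ADJ_MAX;
    }

    int main(void)
    {
        for (int adj = -17; adj <= 15; adj++) {
            int score = adj_to_score_adj(adj);
            printf("oom_adj %3d -> oom_score_adj %5d -> oom_adj %3d\n",
                   adj, score, score_adj_to_adj(score));
        }
        return 0;
    }

Without the OOM_ADJUST_MAX special case, 15 would map to 882 and read back as 14 because of integer division, which is why the comment insists that the maximum value stays attainable.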
@@ -1103,27 +1165,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf, else oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; - if (oom_adj < task->signal->oom_score_adj && - !capable(CAP_SYS_RESOURCE)) { - err = -EACCES; - goto err_sighand; - } - - /* - * /proc/pid/oom_adj is provided for legacy purposes, ask users to use - * /proc/pid/oom_score_adj instead. - */ - pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", - current->comm, task_pid_nr(current), task_pid_nr(task), - task_pid_nr(task)); - - task->signal->oom_score_adj = oom_adj; - trace_oom_score_adj_update(task); -err_sighand: - unlock_task_sighand(task, &flags); -err_task_lock: - task_unlock(task); - put_task_struct(task); + err = __set_oom_adj(file, oom_adj, true); out: return err < 0 ? err : count; } @@ -1140,15 +1182,11 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; short oom_score_adj = OOM_SCORE_ADJ_MIN; - unsigned long flags; size_t len; if (!task) return -ESRCH; - if (lock_task_sighand(task, &flags)) { - oom_score_adj = task->signal->oom_score_adj; - unlock_task_sighand(task, &flags); - } + oom_score_adj = task->signal->oom_score_adj; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); @@ -1157,9 +1195,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task; char buffer[PROC_NUMBUF]; - unsigned long flags; int oom_score_adj; int err; @@ -1180,39 +1216,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto out; } - task = get_proc_task(file_inode(file)); - if (!task) { - err = -ESRCH; - goto out; - } - - task_lock(task); - if (!task->mm) { - err = -EINVAL; - goto err_task_lock; - } - - if (!lock_task_sighand(task, &flags)) { - err = -ESRCH; - goto err_task_lock; - } - - if ((short)oom_score_adj < task->signal->oom_score_adj_min && - !capable(CAP_SYS_RESOURCE)) { - err = -EACCES; - goto err_sighand; - } - - task->signal->oom_score_adj = (short)oom_score_adj; - if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) - task->signal->oom_score_adj_min = (short)oom_score_adj; - trace_oom_score_adj_update(task); - -err_sighand: - unlock_task_sighand(task, &flags); -err_task_lock: - task_unlock(task); - put_task_struct(task); + err = __set_oom_adj(file, oom_score_adj, false); out: return err < 0 ? 
err : count; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 83720460c5bc..cf301a9ef512 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -105,6 +105,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE "AnonHugePages: %8lu kB\n" + "ShmemHugePages: %8lu kB\n" + "ShmemPmdMapped: %8lu kB\n" #endif #ifdef CONFIG_CMA "CmaTotal: %8lu kB\n" @@ -162,8 +164,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE - , K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * - HPAGE_PMD_NR) + , K(global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR) + , K(global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR) + , K(global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR) #endif #ifdef CONFIG_CMA , K(totalcma_pages) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 510413eb25b8..7907e456ac4f 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -80,19 +80,17 @@ static u64 get_iowait_time(int cpu) static int show_stat(struct seq_file *p, void *v) { int i, j; - unsigned long jif; u64 user, nice, system, idle, iowait, irq, softirq, steal; u64 guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; - struct timespec boottime; + struct timespec64 boottime; user = nice = system = idle = iowait = irq = softirq = steal = 0; guest = guest_nice = 0; - getboottime(&boottime); - jif = boottime.tv_sec; + getboottime64(&boottime); for_each_possible_cpu(i) { user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; @@ -163,12 +161,12 @@ static int show_stat(struct seq_file *p, void *v) seq_printf(p, "\nctxt %llu\n" - "btime %lu\n" + "btime %llu\n" "processes %lu\n" "procs_running %lu\n" "procs_blocked %lu\n", nr_context_switches(), - (unsigned long)jif, + (unsigned long long)boottime.tv_sec, total_forks, nr_running(), nr_iowait()); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4648c7f63ae2..187d84ef9de9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -448,6 +448,7 @@ struct mem_size_stats { unsigned long referenced; unsigned long anonymous; unsigned long anonymous_thp; + unsigned long shmem_thp; unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; @@ -576,7 +577,12 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); if (IS_ERR_OR_NULL(page)) return; - mss->anonymous_thp += HPAGE_PMD_SIZE; + if (PageAnon(page)) + mss->anonymous_thp += HPAGE_PMD_SIZE; + else if (PageSwapBacked(page)) + mss->shmem_thp += HPAGE_PMD_SIZE; + else + VM_BUG_ON_PAGE(1, page); smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); } #else @@ -770,6 +776,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Referenced: %8lu kB\n" "Anonymous: %8lu kB\n" "AnonHugePages: %8lu kB\n" + "ShmemPmdMapped: %8lu kB\n" "Shared_Hugetlb: %8lu kB\n" "Private_Hugetlb: %7lu kB\n" "Swap: %8lu kB\n" @@ -787,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.referenced >> 10, mss.anonymous >> 10, mss.anonymous_thp >> 10, + mss.shmem_thp >> 10, mss.shared_hugetlb >> 10, mss.private_hugetlb >> 10, mss.swap >> 10, diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c index b751eea32e20..5db6f45b3fed 100644 --- a/fs/reiserfs/ibalance.c +++ b/fs/reiserfs/ibalance.c @@ -1153,8 +1153,9 @@ int balance_internal(struct tree_balance *tb, insert_ptr); } - memcpy(new_insert_key_addr, &new_insert_key, 
KEY_SIZE); insert_ptr[0] = new_insert_ptr; + if (new_insert_ptr) + memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE); return order; } diff --git a/fs/super.c b/fs/super.c index 37813bf479cf..c2ff475c1711 100644 --- a/fs/super.c +++ b/fs/super.c @@ -213,6 +213,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); spin_lock_init(&s->s_inode_list_lock); + INIT_LIST_HEAD(&s->s_inodes_wb); + spin_lock_init(&s->s_inode_wblist_lock); if (list_lru_init_memcg(&s->s_dentry_lru)) goto fail; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 2d97952e341a..85959d8324df 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -257,10 +257,9 @@ out: * fatal_signal_pending()s, and the mmap_sem must be released before * returning it. */ -int handle_userfault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags, unsigned long reason) +int handle_userfault(struct fault_env *fe, unsigned long reason) { - struct mm_struct *mm = vma->vm_mm; + struct mm_struct *mm = fe->vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; int ret; @@ -269,7 +268,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); ret = VM_FAULT_SIGBUS; - ctx = vma->vm_userfaultfd_ctx.ctx; + ctx = fe->vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; @@ -302,17 +301,17 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, * without first stopping userland access to the memory. For * VM_UFFD_MISSING userfaults this is enough for now. */ - if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) { + if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { /* * Validate the invariant that nowait must allow retry * to be sure not to return SIGBUS erroneously on * nowait invocations. */ - BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT); + BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); #ifdef CONFIG_DEBUG_VM if (printk_ratelimit()) { printk(KERN_WARNING - "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags); + "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); dump_stack(); } #endif @@ -324,7 +323,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, * and wait. 
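The handle_userfault() hunks in this region replace the separate vma/address/flags parameters with a single fault_env pointer, so every field is read through fe->. As a loose illustration of that "bundle the fault parameters into one struct" refactor in plain C (a reduced struct, not the kernel's actual struct fault_env, which carries more state):

    #include <stdio.h>

    struct vma;                      /* opaque here */

    /* reduced illustration of the bundled fault state */
    struct fault_env {
        struct vma    *vma;
        unsigned long  address;
        unsigned int   flags;
    };

    static int handle_fault(struct fault_env *fe)
    {
        /* callers no longer pass vma/address/flags separately */
        printf("fault at %#lx, flags %#x\n", fe->address, fe->flags);
        return 0;
    }

    int main(void)
    {
        struct fault_env fe = { .vma = NULL, .address = 0x7f0000, .flags = 0x1 };

        return handle_fault(&fe);
    }

Passing one structure keeps the fault-handling call chain stable when new per-fault state is added; only the struct grows, not every prototype along the path.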
*/ ret = VM_FAULT_RETRY; - if (flags & FAULT_FLAG_RETRY_NOWAIT) + if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; /* take the reference before dropping the mmap_sem */ @@ -332,10 +331,11 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(address, flags, reason); + uwq.msg = userfault_msg(fe->address, fe->flags, reason); uwq.ctx = ctx; - return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == + return_to_userland = + (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); spin_lock(&ctx->fault_pending_wqh.lock); @@ -353,7 +353,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, TASK_KILLABLE); spin_unlock(&ctx->fault_pending_wqh.lock); - must_wait = userfaultfd_must_wait(ctx, address, flags, reason); + must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); up_read(&mm->mmap_sem); if (likely(must_wait && !ACCESS_ONCE(ctx->released) && diff --git a/include/acpi/acpi_io.h b/include/acpi/acpi_io.h index dd86c5fc102d..d7d0f495a34e 100644 --- a/include/acpi/acpi_io.h +++ b/include/acpi/acpi_io.h @@ -13,7 +13,7 @@ static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys, } #endif -void __iomem *__init_refok +void __iomem *__ref acpi_os_map_iomem(acpi_physical_address phys, acpi_size size); void __ref acpi_os_unmap_iomem(void __iomem *virt, acpi_size size); void __iomem *acpi_os_get_iomem(acpi_physical_address phys, unsigned int size); diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 9dbb739cafa0..c6d667187608 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -107,6 +107,12 @@ struct mmu_gather { struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; unsigned int batch_count; + /* + * __tlb_adjust_range will track the new addr here, + * that that we can adjust the range after the flush + */ + unsigned long addr; + int page_size; }; #define HAVE_GENERIC_MMU_GATHER @@ -115,23 +121,20 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long void tlb_flush_mmu(struct mmu_gather *tlb); void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end); -int __tlb_remove_page(struct mmu_gather *tlb, struct page *page); - -/* tlb_remove_page - * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when - * required. - */ -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - if (!__tlb_remove_page(tlb, page)) - tlb_flush_mmu(tlb); -} +extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, + int page_size); static inline void __tlb_adjust_range(struct mmu_gather *tlb, unsigned long address) { tlb->start = min(tlb->start, address); tlb->end = max(tlb->end, address + PAGE_SIZE); + /* + * Track the last address with which we adjusted the range. 
This + * will be used later to adjust again after a mmu_flush due to + * failed __tlb_remove_page + */ + tlb->addr = address; } static inline void __tlb_reset_range(struct mmu_gather *tlb) @@ -144,6 +147,40 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb) } } +static inline void tlb_remove_page_size(struct mmu_gather *tlb, + struct page *page, int page_size) +{ + if (__tlb_remove_page_size(tlb, page, page_size)) { + tlb_flush_mmu(tlb); + tlb->page_size = page_size; + __tlb_adjust_range(tlb, tlb->addr); + __tlb_remove_page_size(tlb, page, page_size); + } +} + +static bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ + return __tlb_remove_page_size(tlb, page, PAGE_SIZE); +} + +/* tlb_remove_page + * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when + * required. + */ +static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ + return tlb_remove_page_size(tlb, page, PAGE_SIZE); +} + +static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *page) +{ + /* active->nr should be zero when we call this */ + VM_BUG_ON_PAGE(tlb->active->nr, page); + tlb->page_size = PAGE_SIZE; + __tlb_adjust_range(tlb, tlb->addr); + return __tlb_remove_page(tlb, page); +} + /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. When we're doing a munmap, diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 9b0a15d06a4f..79542b2698ec 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -48,6 +48,7 @@ #include <linux/migrate.h> #include <linux/gfp.h> #include <linux/err.h> +#include <linux/fs.h> /* * Balloon device information descriptor. @@ -62,6 +63,7 @@ struct balloon_dev_info { struct list_head pages; /* Pages enqueued & handled to Host */ int (*migratepage)(struct balloon_dev_info *, struct page *newpage, struct page *page, enum migrate_mode mode); + struct inode *inode; }; extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info); @@ -73,45 +75,19 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) spin_lock_init(&balloon->pages_lock); INIT_LIST_HEAD(&balloon->pages); balloon->migratepage = NULL; + balloon->inode = NULL; } #ifdef CONFIG_BALLOON_COMPACTION -extern bool balloon_page_isolate(struct page *page); +extern const struct address_space_operations balloon_aops; +extern bool balloon_page_isolate(struct page *page, + isolate_mode_t mode); extern void balloon_page_putback(struct page *page); -extern int balloon_page_migrate(struct page *newpage, +extern int balloon_page_migrate(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode); /* - * __is_movable_balloon_page - helper to perform @page PageBalloon tests - */ -static inline bool __is_movable_balloon_page(struct page *page) -{ - return PageBalloon(page); -} - -/* - * balloon_page_movable - test PageBalloon to identify balloon pages - * and PagePrivate to check that the page is not - * isolated and can be moved by compaction/migration. - * - * As we might return false positives in the case of a balloon page being just - * released under us, this need to be re-tested later, under the page lock. - */ -static inline bool balloon_page_movable(struct page *page) -{ - return PageBalloon(page) && PagePrivate(page); -} - -/* - * isolated_balloon_page - identify an isolated balloon page on private - * compaction/migration page lists. 
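/*
 * Editor's sketch, not part of the patch: the asm-generic/tlb.h hunk
 * above makes __tlb_remove_page_size() return true when the gather
 * batch is full and the page could not be queued; the caller must then
 * flush and retry, which tlb_remove_page_size() does by itself.  A
 * huge-page unmap path can now also tell the gather how much address
 * space one queued page covers.  The function name is illustrative.
 */
#include <asm/tlb.h>
#include <linux/huge_mm.h>

static void zap_huge_page_sketch(struct mmu_gather *tlb, struct page *hpage)
{
        /* queues the page; flushes and requeues itself if the batch is full */
        tlb_remove_page_size(tlb, hpage, HPAGE_PMD_SIZE);
}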
- */ -static inline bool isolated_balloon_page(struct page *page) -{ - return PageBalloon(page); -} - -/* * balloon_page_insert - insert a page into the balloon's page list and make * the page->private assignment accordingly. * @balloon : pointer to balloon device @@ -124,7 +100,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { __SetPageBalloon(page); - SetPagePrivate(page); + __SetPageMovable(page, balloon->inode->i_mapping); set_page_private(page, (unsigned long)balloon); list_add(&page->lru, &balloon->pages); } @@ -140,11 +116,14 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, static inline void balloon_page_delete(struct page *page) { __ClearPageBalloon(page); + __ClearPageMovable(page); set_page_private(page, 0); - if (PagePrivate(page)) { - ClearPagePrivate(page); + /* + * No touch page.lru field once @page has been isolated + * because VM is using the field. + */ + if (!PageIsolated(page)) list_del(&page->lru); - } } /* diff --git a/include/linux/compaction.h b/include/linux/compaction.h index a58c852a268f..1a02dab16646 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -212,6 +212,7 @@ static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_i #endif /* CONFIG_COMPACTION */ #if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) +struct node; extern int compaction_register_node(struct node *node); extern void compaction_unregister_node(struct node *node); diff --git a/include/linux/compat.h b/include/linux/compat.h index f964ef79e0ad..63609398ef9f 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -432,7 +432,6 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp); -extern __printf(1, 2) int compat_printk(const char *fmt, ...); extern void sigset_from_compat(sigset_t *set, const compat_sigset_t *compat); extern void sigset_to_compat(compat_sigset_t *compat, const sigset_t *set); diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index e828cf65d7df..da7fbf1cdd56 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -579,7 +579,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, } /** - * cpumask_parse - extract a cpumask from from a string + * cpumask_parse - extract a cpumask from a string * @buf: the buffer to extract from * @dstp: the cpumask to set. * diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h new file mode 100644 index 000000000000..bba7a4d692b3 --- /dev/null +++ b/include/linux/crc64_ecma.h @@ -0,0 +1,56 @@ +/* + * Copyright 2013 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __CRC64_ECMA_H_ +#define __CRC64_ECMA_H_ + +#include <linux/types.h> + + +#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL + + +/* + * crc64_ecma_seed - Initializes the CRC64 ECMA seed. + */ +u64 crc64_ecma_seed(void); + +/* + * crc64_ecma - Computes the 64 bit ECMA CRC. + * + * @pdata: pointer to the data to compute checksum for. + * @nbytes: number of bytes in data buffer. + * @seed: CRC seed. + */ +u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed); + +#endif /* __CRC64_ECMA_H_ */ diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index 46056cb161fc..d82bf1994485 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -38,7 +38,7 @@ struct debug_obj { * @name: name of the object typee * @debug_hint: function returning address, which have associated * kernel symbol, to allow identify the object - * @is_static_object return true if the obj is static, otherwise return false + * @is_static_object: return true if the obj is static, otherwise return false * @fixup_init: fixup function, which is called when the init check * fails. 
All fixup functions must return true if fixup * was successful, otherwise return false diff --git a/include/linux/dma-attrs.h b/include/linux/dma-attrs.h index 5246239a4953..f3c5aeadb100 100644 --- a/include/linux/dma-attrs.h +++ b/include/linux/dma-attrs.h @@ -60,7 +60,8 @@ static inline void dma_set_attr(enum dma_attr attr, struct dma_attrs *attrs) * @attr: attribute to set * @attrs: struct dma_attrs (may be NULL) */ -static inline int dma_get_attr(enum dma_attr attr, struct dma_attrs *attrs) +static inline int dma_get_attr(enum dma_attr attr, + const struct dma_attrs *attrs) { if (attrs == NULL) return 0; diff --git a/include/linux/firmware.h b/include/linux/firmware.h index 5c41c5e75b5c..b1f9f0ccb8ac 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -47,6 +47,8 @@ int request_firmware_nowait( void (*cont)(const struct firmware *fw, void *context)); int request_firmware_direct(const struct firmware **fw, const char *name, struct device *device); +int request_firmware_into_buf(const struct firmware **firmware_p, + const char *name, struct device *device, void *buf, size_t size); void release_firmware(const struct firmware *fw); #else @@ -75,5 +77,11 @@ static inline int request_firmware_direct(const struct firmware **fw, return -EINVAL; } +static inline int request_firmware_into_buf(const struct firmware **firmware_p, + const char *name, struct device *device, void *buf, size_t size) +{ + return -EINVAL; +} + #endif #endif diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index e65ef959546c..c46d2aa16d81 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -4,6 +4,7 @@ #include <linux/swap.h> #include <linux/mm.h> #include <linux/bitops.h> +#include <linux/jump_label.h> struct frontswap_ops { void (*init)(unsigned); /* this swap type was just swapon'ed */ @@ -14,7 +15,6 @@ struct frontswap_ops { struct frontswap_ops *next; /* private pointer to next ops */ }; -extern bool frontswap_enabled; extern void frontswap_register_ops(struct frontswap_ops *ops); extern void frontswap_shrink(unsigned long); extern unsigned long frontswap_curr_pages(void); @@ -30,7 +30,12 @@ extern void __frontswap_invalidate_page(unsigned, pgoff_t); extern void __frontswap_invalidate_area(unsigned); #ifdef CONFIG_FRONTSWAP -#define frontswap_enabled (1) +extern struct static_key_false frontswap_enabled_key; + +static inline bool frontswap_enabled(void) +{ + return static_branch_unlikely(&frontswap_enabled_key); +} static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) { @@ -50,7 +55,10 @@ static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) #else /* all inline routines become no-ops and all externs are ignored */ -#define frontswap_enabled (0) +static inline bool frontswap_enabled(void) +{ + return false; +} static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) { @@ -70,37 +78,35 @@ static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) static inline int frontswap_store(struct page *page) { - int ret = -1; + if (frontswap_enabled()) + return __frontswap_store(page); - if (frontswap_enabled) - ret = __frontswap_store(page); - return ret; + return -1; } static inline int frontswap_load(struct page *page) { - int ret = -1; + if (frontswap_enabled()) + return __frontswap_load(page); - if (frontswap_enabled) - ret = __frontswap_load(page); - return ret; + return -1; } static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset) { - if (frontswap_enabled) 
+ if (frontswap_enabled()) __frontswap_invalidate_page(type, offset); } static inline void frontswap_invalidate_area(unsigned type) { - if (frontswap_enabled) + if (frontswap_enabled()) __frontswap_invalidate_area(type); } static inline void frontswap_init(unsigned type, unsigned long *map) { - if (frontswap_enabled) + if (frontswap_enabled()) __frontswap_init(type, map); } diff --git a/include/linux/fs.h b/include/linux/fs.h index d30306189d4a..d8938809368b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -403,6 +403,8 @@ struct address_space_operations { */ int (*migratepage) (struct address_space *, struct page *, struct page *, enum migrate_mode); + bool (*isolate_page)(struct page *, isolate_mode_t); + void (*putback_page)(struct page *); int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, unsigned long, unsigned long); @@ -666,6 +668,7 @@ struct inode { #endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; + struct list_head i_wb_list; /* backing dev writeback list */ union { struct hlist_head i_dentry; struct rcu_head i_rcu; @@ -1462,6 +1465,9 @@ struct super_block { /* s_inode_list_lock protects s_inodes */ spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; struct list_head s_inodes; /* all inodes */ + + spinlock_t s_inode_wblist_lock; + struct list_head s_inodes_wb; /* writeback inodes */ }; extern struct timespec current_fs_time(struct super_block *sb); @@ -2684,6 +2690,7 @@ extern int do_pipe_flags(int *, int); #define __kernel_read_file_id(id) \ id(UNKNOWN, unknown) \ id(FIRMWARE, firmware) \ + id(FIRMWARE_PREALLOC_BUFFER, firmware) \ id(MODULE, kernel-module) \ id(KEXEC_IMAGE, kexec-image) \ id(KEXEC_INITRAMFS, kexec-initramfs) \ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 570383a41853..c29e9d347bc6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -78,8 +78,7 @@ struct vm_area_struct; * __GFP_THISNODE forces the allocation to be satisified from the requested * node with no fallbacks or placement policy enforcements. * - * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg (only relevant - * to kmem allocations). + * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg. 
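/*
 * Editor's sketch, not part of the patch: the jump-label pattern the
 * frontswap hunk above converts to.  The key starts out false, so each
 * frontswap_enabled() check compiles to a patched no-op until a backend
 * registers; flipping the key with static_branch_inc() on registration
 * is assumed here, since the registration side is not shown in this hunk.
 */
#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(example_enabled_key);

static bool example_enabled(void)
{
        return static_branch_unlikely(&example_enabled_key);
}

static void example_register_backend(void)
{
        static_branch_inc(&example_enabled_key);        /* assumed enable path */
}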
*/ #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) @@ -486,10 +485,6 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) -extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order); -extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, - unsigned int order); - extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); @@ -513,9 +508,6 @@ extern void *__alloc_page_frag(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask); extern void __free_page_frag(void *addr); -extern void __free_kmem_pages(struct page *page, unsigned int order); -extern void free_kmem_pages(unsigned long addr, unsigned int order); - #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 419fb9e03447..eb810816bbc6 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -1,20 +1,12 @@ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H -extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - unsigned int flags); +extern int do_huge_pmd_anonymous_page(struct fault_env *fe); extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma); -extern void huge_pmd_set_accessed(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pmd_t orig_pmd, int dirty); -extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pmd_t orig_pmd); +extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd); +extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd); extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, @@ -49,6 +41,18 @@ enum transparent_hugepage_flag { #endif }; +struct kobject; +struct kobj_attribute; + +extern ssize_t single_hugepage_flag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count, + enum transparent_hugepage_flag flag); +extern ssize_t single_hugepage_flag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf, + enum transparent_hugepage_flag flag); +extern struct kobj_attribute shmem_enabled_attr; + #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) @@ -134,8 +138,7 @@ static inline int hpage_nr_pages(struct page *page) return 1; } -extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, pmd_t *pmdp); +extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd); extern struct page *huge_zero_page; @@ -152,6 +155,8 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) struct page *get_huge_zero_page(void); void put_huge_zero_page(void); +#define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) @@ -161,6 +166,8 @@ void put_huge_zero_page(void); #define transparent_hugepage_enabled(__vma) 0 +static inline void prep_transhuge_page(struct page *page) {} + #define 
transparent_hugepage_flags 0UL static inline int split_huge_page_to_list(struct page *page, struct list_head *list) @@ -196,8 +203,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, return NULL; } -static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, pmd_t *pmdp) +static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd) { return 0; } diff --git a/include/linux/init.h b/include/linux/init.h index 88646cb136e0..1e5c131d5c9a 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -77,12 +77,6 @@ #define __refdata __section(.ref.data) #define __refconst __constsection(.ref.rodata) -/* compatibility defines */ -#define __init_refok __ref -#define __initdata_refok __refdata -#define __exit_refok __ref - - #ifdef MODULE #define __exitused #else diff --git a/include/linux/kexec.h b/include/linux/kexec.h index e8acb2b43dd9..496213aa93b7 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -14,6 +14,8 @@ #if !defined(__ASSEMBLY__) +#include <asm/io.h> + #include <uapi/linux/kexec.h> #ifdef CONFIG_KEXEC_CORE @@ -41,7 +43,7 @@ #endif #ifndef KEXEC_CONTROL_MEMORY_GFP -#define KEXEC_CONTROL_MEMORY_GFP GFP_KERNEL +#define KEXEC_CONTROL_MEMORY_GFP (GFP_KERNEL | __GFP_NORETRY) #endif #ifndef KEXEC_CONTROL_PAGE_SIZE @@ -233,7 +235,7 @@ void crash_save_vmcoreinfo(void); void arch_crash_save_vmcoreinfo(void); __printf(1, 2) void vmcoreinfo_append_str(const char *fmt, ...); -unsigned long paddr_vmcoreinfo_note(void); +phys_addr_t paddr_vmcoreinfo_note(void); #define VMCOREINFO_OSRELEASE(value) \ vmcoreinfo_append_str("OSRELEASE=%s\n", value) @@ -256,6 +258,8 @@ unsigned long paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +#define VMCOREINFO_PHYS_BASE(value) \ + vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value) extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; @@ -318,6 +322,44 @@ int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, void arch_kexec_protect_crashkres(void); void arch_kexec_unprotect_crashkres(void); +#ifndef page_to_boot_pfn +static inline unsigned long page_to_boot_pfn(struct page *page) +{ + return page_to_pfn(page); +} +#endif + +#ifndef boot_pfn_to_page +static inline struct page *boot_pfn_to_page(unsigned long boot_pfn) +{ + return pfn_to_page(boot_pfn); +} +#endif + +#ifndef phys_to_boot_phys +static inline unsigned long phys_to_boot_phys(phys_addr_t phys) +{ + return phys; +} +#endif + +#ifndef boot_phys_to_phys +static inline phys_addr_t boot_phys_to_phys(unsigned long boot_phys) +{ + return boot_phys; +} +#endif + +static inline unsigned long virt_to_boot_phys(void *addr) +{ + return phys_to_boot_phys(__pa((unsigned long)addr)); +} + +static inline void *boot_phys_to_virt(unsigned long entry) +{ + return phys_to_virt(boot_phys_to_phys(entry)); +} + #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; struct task_struct; diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index eeb307985715..1e032a1ddb3e 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -4,6 +4,11 @@ #include <linux/sched.h> /* MMF_VM_HUGEPAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern struct attribute_group khugepaged_attr_group; + +extern int khugepaged_init(void); +extern void khugepaged_destroy(void); +extern int start_stop_khugepaged(void); extern int 
__khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 7ae216a39c9e..481c8c4627ca 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -43,8 +43,7 @@ static inline struct stable_node *page_stable_node(struct page *page) static inline void set_page_stable_node(struct page *page, struct stable_node *stable_node) { - page->mapping = (void *)stable_node + - (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); + page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); } /* diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 3106ac1c895e..6c14b6179727 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -73,8 +73,8 @@ extern bool movable_node_enabled; if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, - phys_addr_t start, phys_addr_t end, - int nid, ulong flags); + phys_addr_t start, phys_addr_t end, + int nid, ulong flags); phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); @@ -110,7 +110,7 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags, phys_addr_t *out_end, int *out_nid); void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, - phys_addr_t *out_end); + phys_addr_t *out_end); /** * for_each_mem_range - iterate through memblock areas from type_a and not @@ -148,7 +148,7 @@ void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, p_start, p_end, p_nid) \ for (i = (u64)ULLONG_MAX, \ __next_mem_range_rev(&i, nid, flags, type_a, type_b,\ - p_start, p_end, p_nid); \ + p_start, p_end, p_nid); \ i != (u64)ULLONG_MAX; \ __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid)) @@ -163,8 +163,7 @@ void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, * is initialized. 
*/ #define for_each_reserved_mem_region(i, p_start, p_end) \ - for (i = 0UL, \ - __next_reserved_mem_region(&i, p_start, p_end); \ + for (i = 0UL, __next_reserved_mem_region(&i, p_start, p_end); \ i != (u64)ULLONG_MAX; \ __next_reserved_mem_region(&i, p_start, p_end)) @@ -403,15 +402,14 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo } #define for_each_memblock(memblock_type, region) \ - for (region = memblock.memblock_type.regions; \ + for (region = memblock.memblock_type.regions; \ region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \ region++) #define for_each_memblock_type(memblock_type, rgn) \ - idx = 0; \ - rgn = &memblock_type->regions[idx]; \ - for (idx = 0; idx < memblock_type->cnt; \ - idx++,rgn = &memblock_type->regions[idx]) + for (idx = 0, rgn = &memblock_type->regions[0]; \ + idx < memblock_type->cnt; \ + idx++, rgn = &memblock_type->regions[idx]) #ifdef CONFIG_MEMTEST extern void early_memtest(phys_addr_t start, phys_addr_t end); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a805474df4ab..104efa6874db 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -29,6 +29,7 @@ #include <linux/mmzone.h> #include <linux/writeback.h> #include <linux/page-flags.h> +#include <linux/mm.h> struct mem_cgroup; struct page; @@ -97,6 +98,11 @@ enum mem_cgroup_events_target { #define MEM_CGROUP_ID_SHIFT 16 #define MEM_CGROUP_ID_MAX USHRT_MAX +struct mem_cgroup_id { + int id; + atomic_t ref; +}; + struct mem_cgroup_stat_cpu { long count[MEMCG_NR_STAT]; unsigned long events[MEMCG_NR_EVENTS]; @@ -172,6 +178,9 @@ enum memcg_kmem_state { struct mem_cgroup { struct cgroup_subsys_state css; + /* Private memcg ID. Used to ID objects that outlive the cgroup */ + struct mem_cgroup_id id; + /* Accounted resources */ struct page_counter memory; struct page_counter swap; @@ -306,7 +315,48 @@ void mem_cgroup_uncharge_list(struct list_head *page_list); void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); -struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); +static inline struct mem_cgroup_per_zone * +mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) +{ + int nid = zone_to_nid(zone); + int zid = zone_idx(zone); + + return &memcg->nodeinfo[nid]->zoneinfo[zid]; +} + +/** + * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg + * @zone: zone of the wanted lruvec + * @memcg: memcg of the wanted lruvec + * + * Returns the lru list vector holding pages for the given @zone and + * @mem. This can be the global zone lruvec, if the memory controller + * is disabled. + */ +static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, + struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_zone *mz; + struct lruvec *lruvec; + + if (mem_cgroup_disabled()) { + lruvec = &zone->lruvec; + goto out; + } + + mz = mem_cgroup_zone_zoneinfo(memcg, zone); + lruvec = &mz->lruvec; +out: + /* + * Since a node can be onlined after the mem_cgroup was created, + * we have to be prepared to initialize lruvec->zone here; + * and if offlined then reonlined, we need to reinitialize it. 
+ */ + if (unlikely(lruvec->zone != zone)) + lruvec->zone = zone; + return lruvec; +} + struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); @@ -330,22 +380,9 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) if (mem_cgroup_disabled()) return 0; - return memcg->css.id; -} - -/** - * mem_cgroup_from_id - look up a memcg from an id - * @id: the id to look up - * - * Caller must hold rcu_read_lock() and use css_tryget() as necessary. - */ -static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - struct cgroup_subsys_state *css; - - css = css_from_id(id, &memory_cgrp_subsys); - return mem_cgroup_from_css(css); + return memcg->id.id; } +struct mem_cgroup *mem_cgroup_from_id(unsigned short id); /** * parent_mem_cgroup - find the accounting parent of a memcg @@ -754,6 +791,13 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) } #endif +struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); +void memcg_kmem_put_cache(struct kmem_cache *cachep); +int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg); +int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); +void memcg_kmem_uncharge(struct page *page, int order); + #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) extern struct static_key_false memcg_kmem_enabled_key; @@ -775,22 +819,6 @@ static inline bool memcg_kmem_enabled(void) } /* - * In general, we'll do everything in our power to not incur in any overhead - * for non-memcg users for the kmem functions. Not even a function call, if we - * can avoid it. - * - * Therefore, we'll inline all those functions so that in the best case, we'll - * see that kmemcg is off for everybody and proceed quickly. If it is on, - * we'll still do most of the flag checking inline. We check a lot of - * conditions, but because they are pretty simple, they are expected to be - * fast. - */ -int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, - struct mem_cgroup *memcg); -int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); -void __memcg_kmem_uncharge(struct page *page, int order); - -/* * helper for accessing a memcg's index. It will be used as an index in the * child cache array in kmem_cache, and also to derive its name. This function * will return -1 when this is not a kmem-limited memcg. @@ -800,67 +828,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); -void __memcg_kmem_put_cache(struct kmem_cache *cachep); - -static inline bool __memcg_kmem_bypass(void) -{ - if (!memcg_kmem_enabled()) - return true; - if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) - return true; - return false; -} - -/** - * memcg_kmem_charge: charge a kmem page - * @page: page to charge - * @gfp: reclaim mode - * @order: allocation order - * - * Returns 0 on success, an error code on failure. 
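/*
 * Editor's sketch, not part of the patch: with the inline
 * memcg_kmem_charge()/memcg_kmem_get_cache() wrappers moved out of line
 * here and alloc_kmem_pages() removed in the gfp.h hunk above, a
 * kmemcg-accounted allocation is requested purely via the gfp mask; the
 * page allocator charges the page and marks it PageKmemcg() (see the
 * page-flags.h hunk below) so the free path knows to uncharge it.
 */
#include <linux/gfp.h>

static struct page *alloc_accounted_page_sketch(void)
{
        return alloc_pages(GFP_KERNEL | __GFP_ACCOUNT, 0);
}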
- */ -static __always_inline int memcg_kmem_charge(struct page *page, - gfp_t gfp, int order) -{ - if (__memcg_kmem_bypass()) - return 0; - if (!(gfp & __GFP_ACCOUNT)) - return 0; - return __memcg_kmem_charge(page, gfp, order); -} - -/** - * memcg_kmem_uncharge: uncharge a kmem page - * @page: page to uncharge - * @order: allocation order - */ -static __always_inline void memcg_kmem_uncharge(struct page *page, int order) -{ - if (memcg_kmem_enabled()) - __memcg_kmem_uncharge(page, order); -} - -/** - * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation - * @cachep: the original global kmem cache - * - * All memory allocated from a per-memcg cache is charged to the owner memcg. - */ -static __always_inline struct kmem_cache * -memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) -{ - if (__memcg_kmem_bypass()) - return cachep; - return __memcg_kmem_get_cache(cachep, gfp); -} - -static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) -{ - if (memcg_kmem_enabled()) - __memcg_kmem_put_cache(cachep); -} - /** * memcg_kmem_update_page_stat - update kmem page state statistics * @page: the page @@ -883,15 +850,6 @@ static inline bool memcg_kmem_enabled(void) return false; } -static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) -{ - return 0; -} - -static inline void memcg_kmem_uncharge(struct page *page, int order) -{ -} - static inline int memcg_cache_id(struct mem_cgroup *memcg) { return -1; @@ -905,16 +863,6 @@ static inline void memcg_put_cache_ids(void) { } -static inline struct kmem_cache * -memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) -{ - return cachep; -} - -static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) -{ -} - static inline void memcg_kmem_update_page_stat(struct page *page, enum mem_cgroup_stat_index idx, int val) { diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 5145620ba48a..01033fadea47 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -284,5 +284,7 @@ extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, unsigned long map_offset); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); +extern int zone_can_shift(unsigned long pfn, unsigned long nr_pages, + enum zone_type target); #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 9b50325e4ddf..ae8d475a9385 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -37,6 +37,8 @@ extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); +extern bool isolate_movable_page(struct page *page, isolate_mode_t mode); +extern void putback_movable_page(struct page *page); extern int migrate_prep(void); extern int migrate_prep_local(void); @@ -69,6 +71,21 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #endif /* CONFIG_MIGRATION */ +#ifdef CONFIG_COMPACTION +extern int PageMovable(struct page *page); +extern void __SetPageMovable(struct page *page, struct address_space *mapping); +extern void __ClearPageMovable(struct page *page); +#else +static inline int PageMovable(struct page *page) { return 0; }; +static inline void __SetPageMovable(struct page *page, + struct address_space *mapping) +{ +} +static inline void 
__ClearPageMovable(struct page *page) +{ +} +#endif + #ifdef CONFIG_NUMA_BALANCING extern bool pmd_trans_migrating(pmd_t pmd); extern int migrate_misplaced_page(struct page *page, diff --git a/include/linux/mm.h b/include/linux/mm.h index ece042dfe23c..b21e5f30378e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -309,10 +309,34 @@ struct vm_fault { * VM_FAULT_DAX_LOCKED and fill in * entry here. */ - /* for ->map_pages() only */ - pgoff_t max_pgoff; /* map pages for offset from pgoff till - * max_pgoff inclusive */ - pte_t *pte; /* pte entry associated with ->pgoff */ +}; + +/* + * Page fault context: passes though page fault handler instead of endless list + * of function arguments. + */ +struct fault_env { + struct vm_area_struct *vma; /* Target VMA */ + unsigned long address; /* Faulting virtual address */ + unsigned int flags; /* FAULT_FLAG_xxx flags */ + pmd_t *pmd; /* Pointer to pmd entry matching + * the 'address' + */ + pte_t *pte; /* Pointer to pte entry matching + * the 'address'. NULL if the page + * table hasn't been allocated. + */ + spinlock_t *ptl; /* Page table lock. + * Protects pte page table if 'pte' + * is not NULL, otherwise pmd. + */ + pgtable_t prealloc_pte; /* Pre-allocated pte page table. + * vm_ops->map_pages() calls + * alloc_set_pte() from atomic context. + * do_fault_around() pre-allocates + * page table to avoid allocation from + * atomic context. + */ }; /* @@ -327,7 +351,8 @@ struct vm_operations_struct { int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); int (*pmd_fault)(struct vm_area_struct *, unsigned long address, pmd_t *, unsigned int flags); - void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); + void (*map_pages)(struct fault_env *fe, + pgoff_t start_pgoff, pgoff_t end_pgoff); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ @@ -537,7 +562,6 @@ void __put_page(struct page *page); void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); -int split_free_page(struct page *page); /* * Compound pages have a destructor function. 
Provide a @@ -601,8 +625,8 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon); +int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, + struct page *page); #endif /* @@ -949,11 +973,19 @@ static inline struct mem_cgroup *page_memcg(struct page *page) { return page->mem_cgroup; } +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + return READ_ONCE(page->mem_cgroup); +} #else static inline struct mem_cgroup *page_memcg(struct page *page) { return NULL; } +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + return NULL; +} #endif /* @@ -1035,6 +1067,7 @@ static inline pgoff_t page_file_index(struct page *page) } bool page_mapped(struct page *page); +struct address_space *page_mapping(struct page *page); /* * Return true only if the page has been allocated with @@ -1215,15 +1248,14 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page); int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU -extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, unsigned int flags); +extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, + unsigned int flags); extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); #else -static inline int handle_mm_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - unsigned int flags) +static inline int handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) { /* should never happen if there's no MMU */ BUG(); @@ -2063,7 +2095,8 @@ extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); -extern void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf); +extern void filemap_map_pages(struct fault_env *fe, + pgoff_t start_pgoff, pgoff_t end_pgoff); extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); /* mm/page-writeback.c */ @@ -2259,6 +2292,8 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) } #endif /* __HAVE_ARCH_GATE_AREA */ +extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); + #ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; int drop_caches_sysctl_handler(struct ctl_table *, int, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ca3e517980a0..e093e1d3285b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -60,51 +60,52 @@ struct page { }; /* Second double word */ - struct { - union { - pgoff_t index; /* Our offset within mapping. */ - void *freelist; /* sl[aou]b first free object */ - /* page_deferred_list().prev -- second tail page */ - }; + union { + pgoff_t index; /* Our offset within mapping. */ + void *freelist; /* sl[aou]b first free object */ + /* page_deferred_list().prev -- second tail page */ + }; - union { + union { #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - /* Used for cmpxchg_double in slub */ - unsigned long counters; + /* Used for cmpxchg_double in slub */ + unsigned long counters; #else - /* - * Keep _refcount separate from slub cmpxchg_double - * data. 
As the rest of the double word is protected by - * slab_lock but _refcount is not. - */ - unsigned counters; + /* + * Keep _refcount separate from slub cmpxchg_double data. + * As the rest of the double word is protected by slab_lock + * but _refcount is not. + */ + unsigned counters; #endif + struct { - struct { - - union { - /* - * Count of ptes mapped in mms, to show - * when page is mapped & limit reverse - * map searches. - */ - atomic_t _mapcount; - - struct { /* SLUB */ - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; - }; - int units; /* SLOB */ - }; + union { /* - * Usage count, *USE WRAPPER FUNCTION* - * when manual accounting. See page_ref.h + * Count of ptes mapped in mms, to show when + * page is mapped & limit reverse map searches. + * + * Extra information about page type may be + * stored here for pages that are never mapped, + * in which case the value MUST BE <= -2. + * See page-flags.h for more details. */ - atomic_t _refcount; + atomic_t _mapcount; + + unsigned int active; /* SLAB */ + struct { /* SLUB */ + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + int units; /* SLOB */ }; - unsigned int active; /* SLAB */ + /* + * Usage count, *USE WRAPPER FUNCTION* when manual + * accounting. See page_ref.h + */ + atomic_t _refcount; }; }; diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index de7be78c6f0e..451a811f48f2 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -39,6 +39,7 @@ void dump_mm(const struct mm_struct *mm); #define VM_WARN_ON(cond) WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) #define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) +#define VM_WARN(cond, format...) WARN(cond, format) #else #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) @@ -47,6 +48,7 @@ void dump_mm(const struct mm_struct *mm); #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN(cond, format...) 
BUILD_BUG_ON_INVALID(cond) #endif #ifdef CONFIG_DEBUG_VIRTUAL diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 02069c23486d..19425e988bdc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -140,6 +140,9 @@ enum zone_stat_item { NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ NR_PAGES_SCANNED, /* pages scanned since last reclaim */ +#if IS_ENABLED(CONFIG_ZSMALLOC) + NR_ZSPAGES, /* allocated in zsmalloc */ +#endif #ifdef CONFIG_NUMA NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ @@ -151,7 +154,9 @@ enum zone_stat_item { WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, WORKINGSET_NODERECLAIM, - NR_ANON_TRANSPARENT_HUGEPAGES, + NR_ANON_THPS, + NR_SHMEM_THPS, + NR_SHMEM_PMDMAPPED, NR_FREE_CMA_PAGES, NR_VM_ZONE_STAT_ITEMS }; @@ -524,7 +529,6 @@ struct zone { enum zone_flags { ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ - ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */ ZONE_CONGESTED, /* zone has many dirty pages backed by * a congested BDI */ diff --git a/include/linux/oom.h b/include/linux/oom.h index 83469522690a..5bc0457ee3a8 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -23,6 +23,9 @@ struct oom_control { /* Used to determine mempolicy */ nodemask_t *nodemask; + /* Memory cgroup in which oom is invoked, or NULL for global oom */ + struct mem_cgroup *memcg; + /* Used to determine cpuset and node locality requirement */ const gfp_t gfp_mask; @@ -70,9 +73,9 @@ static inline bool oom_task_origin(const struct task_struct *p) extern void mark_oom_victim(struct task_struct *tsk); #ifdef CONFIG_MMU -extern void try_oom_reaper(struct task_struct *tsk); +extern void wake_oom_reaper(struct task_struct *tsk); #else -static inline void try_oom_reaper(struct task_struct *tsk) +static inline void wake_oom_reaper(struct task_struct *tsk) { } #endif @@ -83,14 +86,13 @@ extern unsigned long oom_badness(struct task_struct *p, extern void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int points, unsigned long totalpages, - struct mem_cgroup *memcg, const char *message); + const char *message); extern void check_panic_on_oom(struct oom_control *oc, - enum oom_constraint constraint, - struct mem_cgroup *memcg); + enum oom_constraint constraint); extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, - struct task_struct *task, unsigned long totalpages); + struct task_struct *task); extern bool out_of_memory(struct oom_control *oc); @@ -105,27 +107,7 @@ extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); -static inline bool task_will_free_mem(struct task_struct *task) -{ - struct signal_struct *sig = task->signal; - - /* - * A coredumping process may sleep for an extended period in exit_mm(), - * so the oom killer cannot assume that the process will promptly exit - * and release memory. 
- */ - if (sig->flags & SIGNAL_GROUP_COREDUMP) - return false; - - if (!(task->flags & PF_EXITING)) - return false; - - /* Make sure that the whole thread group is going down */ - if (!thread_group_empty(task) && !(sig->flags & SIGNAL_GROUP_EXIT)) - return false; - - return true; -} +bool task_will_free_mem(struct task_struct *task); /* sysctls */ extern int sysctl_oom_dump_tasks; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e5a32445f930..74e4dda91238 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -129,6 +129,9 @@ enum pageflags { /* Compound pages. Stored in first tail page's flags */ PG_double_map = PG_private_2, + + /* non-lru isolated movable page */ + PG_isolated = PG_reclaim, }; #ifndef __GENERATING_BOUNDS_H @@ -292,11 +295,11 @@ PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) */ TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND) TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND) -PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND) +PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ -PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND) - TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND) +PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) + TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) @@ -357,29 +360,37 @@ PAGEFLAG(Idle, idle, PF_ANY) * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. * * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, - * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit; - * and then page->mapping points, not to an anon_vma, but to a private + * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON + * bit; and then page->mapping points, not to an anon_vma, but to a private * structure which KSM associates with that merged page. See ksm.h. * - * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used. + * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable + * page and then page->mapping points a struct address_space. * * Please note that, confusingly, "page_mapping" refers to the inode * address_space which maps the page from disk; whereas "page_mapped" * refers to user virtual address space into which the page is mapped. 
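/*
 * Editor's sketch, not part of the patch: the non-LRU movable page
 * protocol that the PAGE_MAPPING_MOVABLE bits below and the new
 * isolate_page/putback_page address_space operations (fs.h hunk above)
 * support.  All foo_* names are illustrative; a real driver would pin
 * and copy its own state in these hooks.
 */
#include <linux/fs.h>
#include <linux/migrate.h>

static bool foo_isolate_page(struct page *page, isolate_mode_t mode)
{
        return true;                    /* claim the page for migration */
}

static int foo_migratepage(struct address_space *mapping, struct page *newpage,
                           struct page *page, enum migrate_mode mode)
{
        return MIGRATEPAGE_SUCCESS;     /* copy driver data, then succeed */
}

static void foo_putback_page(struct page *page)
{
        /* migration failed or was aborted: give the page back */
}

static const struct address_space_operations foo_movable_aops = {
        .isolate_page   = foo_isolate_page,
        .migratepage    = foo_migratepage,
        .putback_page   = foo_putback_page,
};

static void foo_mark_movable(struct page *page, struct inode *inode)
{
        __SetPageMovable(page, inode->i_mapping);
}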
*/ -#define PAGE_MAPPING_ANON 1 -#define PAGE_MAPPING_KSM 2 -#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) +#define PAGE_MAPPING_ANON 0x1 +#define PAGE_MAPPING_MOVABLE 0x2 +#define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) +#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) -static __always_inline int PageAnonHead(struct page *page) +static __always_inline int PageMappingFlags(struct page *page) { - return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; + return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0; } static __always_inline int PageAnon(struct page *page) { page = compound_head(page); - return PageAnonHead(page); + return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; +} + +static __always_inline int __PageMovable(struct page *page) +{ + return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == + PAGE_MAPPING_MOVABLE; } #ifdef CONFIG_KSM @@ -393,7 +404,7 @@ static __always_inline int PageKsm(struct page *page) { page = compound_head(page); return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == - (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); + PAGE_MAPPING_KSM; } #else TESTPAGEFLAG_FALSE(Ksm) @@ -570,6 +581,17 @@ static inline int PageDoubleMap(struct page *page) return PageHead(page) && test_bit(PG_double_map, &page[1].flags); } +static inline void SetPageDoubleMap(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHead(page), page); + set_bit(PG_double_map, &page[1].flags); +} + +static inline void ClearPageDoubleMap(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHead(page), page); + clear_bit(PG_double_map, &page[1].flags); +} static inline int TestSetPageDoubleMap(struct page *page) { VM_BUG_ON_PAGE(!PageHead(page), page); @@ -587,59 +609,59 @@ TESTPAGEFLAG_FALSE(TransHuge) TESTPAGEFLAG_FALSE(TransCompound) TESTPAGEFLAG_FALSE(TransCompoundMap) TESTPAGEFLAG_FALSE(TransTail) -TESTPAGEFLAG_FALSE(DoubleMap) +PAGEFLAG_FALSE(DoubleMap) TESTSETFLAG_FALSE(DoubleMap) TESTCLEARFLAG_FALSE(DoubleMap) #endif /* + * For pages that are never mapped to userspace, page->mapcount may be + * used for storing extra information about page type. Any value used + * for this purpose must be <= -2, but it's better start not too close + * to -2 so that an underflow of the page_mapcount() won't be mistaken + * for a special page. + */ +#define PAGE_MAPCOUNT_OPS(uname, lname) \ +static __always_inline int Page##uname(struct page *page) \ +{ \ + return atomic_read(&page->_mapcount) == \ + PAGE_##lname##_MAPCOUNT_VALUE; \ +} \ +static __always_inline void __SetPage##uname(struct page *page) \ +{ \ + VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); \ + atomic_set(&page->_mapcount, PAGE_##lname##_MAPCOUNT_VALUE); \ +} \ +static __always_inline void __ClearPage##uname(struct page *page) \ +{ \ + VM_BUG_ON_PAGE(!Page##uname(page), page); \ + atomic_set(&page->_mapcount, -1); \ +} + +/* * PageBuddy() indicate that the page is free and in the buddy system * (see mm/page_alloc.c). - * - * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to - * -2 so that an underflow of the page_mapcount() won't be mistaken - * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very - * efficiently by most CPU architectures. 
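/*
 * For reference (editor's note, not part of the patch): the
 * PAGE_MAPCOUNT_OPS() helper defined above expands, for the
 * PAGE_MAPCOUNT_OPS(Buddy, BUDDY) use just below, to essentially the
 * open-coded helpers it replaces:
 */
static __always_inline int PageBuddy(struct page *page)
{
        return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE;
}

static __always_inline void __SetPageBuddy(struct page *page)
{
        VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
        atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE);
}

static __always_inline void __ClearPageBuddy(struct page *page)
{
        VM_BUG_ON_PAGE(!PageBuddy(page), page);
        atomic_set(&page->_mapcount, -1);
}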
*/ -#define PAGE_BUDDY_MAPCOUNT_VALUE (-128) - -static inline int PageBuddy(struct page *page) -{ - return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE; -} +#define PAGE_BUDDY_MAPCOUNT_VALUE (-128) +PAGE_MAPCOUNT_OPS(Buddy, BUDDY) -static inline void __SetPageBuddy(struct page *page) -{ - VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); - atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE); -} +/* + * PageBalloon() is set on pages that are on the balloon page list + * (see mm/balloon_compaction.c). + */ +#define PAGE_BALLOON_MAPCOUNT_VALUE (-256) +PAGE_MAPCOUNT_OPS(Balloon, BALLOON) -static inline void __ClearPageBuddy(struct page *page) -{ - VM_BUG_ON_PAGE(!PageBuddy(page), page); - atomic_set(&page->_mapcount, -1); -} +/* + * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on + * pages allocated with __GFP_ACCOUNT. It gets cleared on page free. + */ +#define PAGE_KMEMCG_MAPCOUNT_VALUE (-512) +PAGE_MAPCOUNT_OPS(Kmemcg, KMEMCG) extern bool is_free_buddy_page(struct page *page); -#define PAGE_BALLOON_MAPCOUNT_VALUE (-256) - -static inline int PageBalloon(struct page *page) -{ - return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE; -} - -static inline void __SetPageBalloon(struct page *page) -{ - VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); - atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE); -} - -static inline void __ClearPageBalloon(struct page *page) -{ - VM_BUG_ON_PAGE(!PageBalloon(page), page); - atomic_set(&page->_mapcount, -1); -} +__PAGEFLAG(Isolated, isolated, PF_ANY); /* * If network-based swap is enabled, sl*b must keep track of whether pages diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index e1fe7cf5bddf..03f2a3e7d76d 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -3,6 +3,7 @@ #include <linux/types.h> #include <linux/stacktrace.h> +#include <linux/stackdepot.h> struct pglist_data; struct page_ext_operations { @@ -44,9 +45,8 @@ struct page_ext { #ifdef CONFIG_PAGE_OWNER unsigned int order; gfp_t gfp_mask; - unsigned int nr_entries; int last_migrate_reason; - unsigned long trace_entries[8]; + depot_stack_handle_t handle; #endif }; diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 46f1b939948c..30583ab0ffb1 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -10,7 +10,7 @@ extern struct page_ext_operations page_owner_ops; extern void __reset_page_owner(struct page *page, unsigned int order); extern void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask); -extern gfp_t __get_page_owner_gfp(struct page *page); +extern void __split_page_owner(struct page *page, unsigned int order); extern void __copy_page_owner(struct page *oldpage, struct page *newpage); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(struct page *page); @@ -28,12 +28,10 @@ static inline void set_page_owner(struct page *page, __set_page_owner(page, order, gfp_mask); } -static inline gfp_t get_page_owner_gfp(struct page *page) +static inline void split_page_owner(struct page *page, unsigned int order) { if (static_branch_unlikely(&page_owner_inited)) - return __get_page_owner_gfp(page); - else - return 0; + __split_page_owner(page, order); } static inline void copy_page_owner(struct page *oldpage, struct page *newpage) { @@ -58,9 +56,9 @@ static inline void set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { } -static inline gfp_t 
get_page_owner_gfp(struct page *page) +static inline void split_page_owner(struct page *page, + unsigned int order) { - return 0; } static inline void copy_page_owner(struct page *oldpage, struct page *newpage) { diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 97354102794d..81363b834900 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -209,10 +209,10 @@ static inline struct page *page_cache_alloc_cold(struct address_space *x) return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD); } -static inline struct page *page_cache_alloc_readahead(struct address_space *x) +static inline gfp_t readahead_gfp_mask(struct address_space *x) { - return __page_cache_alloc(mapping_gfp_mask(x) | - __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN); + return mapping_gfp_mask(x) | + __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN; } typedef int filler_t(void *, struct page *); diff --git a/include/linux/printk.h b/include/linux/printk.h index f4da695fd615..e6ff22ef9809 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -254,21 +254,39 @@ extern asmlinkage void dump_stack(void) __cold; * and other debug macros are compiled out unless either DEBUG is defined * or CONFIG_DYNAMIC_DEBUG is set. */ -#define pr_emerg(fmt, ...) \ - printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) -#define pr_alert(fmt, ...) \ - printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) -#define pr_crit(fmt, ...) \ - printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) -#define pr_err(fmt, ...) \ - printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) -#define pr_warning(fmt, ...) \ - printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) -#define pr_warn pr_warning -#define pr_notice(fmt, ...) \ - printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) -#define pr_info(fmt, ...) \ - printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) + +#ifdef CONFIG_PRINTK + +asmlinkage __printf(1, 2) __cold void __pr_emerg(const char *fmt, ...); +asmlinkage __printf(1, 2) __cold void __pr_alert(const char *fmt, ...); +asmlinkage __printf(1, 2) __cold void __pr_crit(const char *fmt, ...); +asmlinkage __printf(1, 2) __cold void __pr_err(const char *fmt, ...); +asmlinkage __printf(1, 2) __cold void __pr_warn(const char *fmt, ...); +asmlinkage __printf(1, 2) __cold void __pr_notice(const char *fmt, ...); +asmlinkage __printf(1, 2) __cold void __pr_info(const char *fmt, ...); + +#define pr_emerg(fmt, ...) __pr_emerg(pr_fmt(fmt), ##__VA_ARGS__) +#define pr_alert(fmt, ...) __pr_alert(pr_fmt(fmt), ##__VA_ARGS__) +#define pr_crit(fmt, ...) __pr_crit(pr_fmt(fmt), ##__VA_ARGS__) +#define pr_err(fmt, ...) __pr_err(pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warn(fmt, ...) __pr_warn(pr_fmt(fmt), ##__VA_ARGS__) +#define pr_notice(fmt, ...) __pr_notice(pr_fmt(fmt), ##__VA_ARGS__) +#define pr_info(fmt, ...) __pr_info(pr_fmt(fmt), ##__VA_ARGS__) + +#else + +#define pr_emerg(fmt, ...) printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) +#define pr_alert(fmt, ...) printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_crit(fmt, ...) printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_err(fmt, ...) printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warn(fmt, ...) printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) +#define pr_notice(fmt, ...) printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) +#define pr_info(fmt, ...) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) + +#endif + +#define pr_warning pr_warn + /* * Like KERN_CONT, pr_cont() should only be used when continuing * a line with no newline ('\n') enclosed. 
Otherwise it defaults diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index cb4b7e8cee81..084965cf90fc 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -291,6 +291,7 @@ unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, unsigned long first_index, unsigned int max_items); int radix_tree_preload(gfp_t gfp_mask); int radix_tree_maybe_preload(gfp_t gfp_mask); +int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag); diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 49eb4f8ebac9..5704f101b52e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -165,7 +165,7 @@ void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, int); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, bool); -void page_add_file_rmap(struct page *); +void page_add_file_rmap(struct page *, bool); void page_remove_rmap(struct page *, bool); void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, diff --git a/include/linux/sched.h b/include/linux/sched.h index d99218a1e043..553af2923824 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -523,6 +523,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_REAPED 21 /* mm has been already reaped */ +#define MMF_OOM_NOT_REAPABLE 22 /* mm couldn't be reaped */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) @@ -1949,6 +1950,32 @@ static inline int tsk_nr_cpus_allowed(struct task_struct *p) #define TNF_FAULT_LOCAL 0x08 #define TNF_MIGRATE_FAIL 0x10 +static inline bool in_vfork(struct task_struct *tsk) +{ + bool ret; + + /* + * need RCU to access ->real_parent if CLONE_VM was used along with + * CLONE_PARENT. + * + * We check real_parent->mm == tsk->mm because CLONE_VFORK does not + * imply CLONE_VM + * + * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus + * ->real_parent is not necessarily the task doing vfork(), so in + * theory we can't rely on task_lock() if we want to dereference it. + * + * And in this case we can't trust the real_parent->mm == tsk->mm + * check, it can be false negative. But we do not care, if init or + * another oom-unkillable task does this it should blame itself. + */ + rcu_read_lock(); + ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm; + rcu_read_unlock(); + + return ret; +} + #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); diff --git a/include/linux/sem.h b/include/linux/sem.h index 976ce3a19f1b..6fb3227718c8 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -21,6 +21,7 @@ struct sem_array { struct list_head list_id; /* undo requests on this array */ int sem_nsems; /* no. 
of semaphores in array */ int complex_count; /* pending complex operations */ + int complex_mode; /* >0: no parallel simple ops */ }; #ifdef CONFIG_SYSVIPC diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 4d4780c00d34..ff078e7043b6 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -16,8 +16,9 @@ struct shmem_inode_info { unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ - struct shared_policy policy; /* NUMA memory alloc policy */ + struct list_head shrinklist; /* shrinkable hpage inodes */ struct list_head swaplist; /* chain of maybes on swap */ + struct shared_policy policy; /* NUMA memory alloc policy */ struct simple_xattrs xattrs; /* list of xattrs */ struct inode vfs_inode; }; @@ -28,10 +29,14 @@ struct shmem_sb_info { unsigned long max_inodes; /* How many inodes are allowed */ unsigned long free_inodes; /* How many are left for allocation */ spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ + umode_t mode; /* Mount mode for root directory */ + unsigned char huge; /* Whether to try for hugepages */ kuid_t uid; /* Mount uid for root directory */ kgid_t gid; /* Mount gid for root directory */ - umode_t mode; /* Mount mode for root directory */ struct mempolicy *mpol; /* default memory policy for mappings */ + spinlock_t shrinklist_lock; /* Protects shrinklist */ + struct list_head shrinklist; /* List of shinkable inodes */ + unsigned long shrinklist_len; /* Length of shrinklist */ }; static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) @@ -49,6 +54,8 @@ extern struct file *shmem_file_setup(const char *name, extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); extern int shmem_zero_setup(struct vm_area_struct *); +extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct user_struct *user); extern bool shmem_mapping(struct address_space *mapping); extern void shmem_unlock_mapping(struct address_space *mapping); @@ -61,6 +68,19 @@ extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); +/* Flag allocation requirements to shmem_getpage */ +enum sgp_type { + SGP_READ, /* don't exceed i_size, don't allocate page */ + SGP_CACHE, /* don't exceed i_size, may allocate page */ + SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */ + SGP_HUGE, /* like SGP_CACHE, huge pages preferred */ + SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ + SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ +}; + +extern int shmem_getpage(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp); + static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) { @@ -68,6 +88,18 @@ static inline struct page *shmem_read_mapping_page( mapping_gfp_mask(mapping)); } +static inline bool shmem_file(struct file *file) +{ + if (!IS_ENABLED(CONFIG_SHMEM)) + return false; + if (!file || !file->f_mapping) + return false; + return shmem_mapping(file->f_mapping); +} + +extern bool shmem_charge(struct inode *inode, long pages); +extern void shmem_uncharge(struct inode *inode, long pages); + #ifdef CONFIG_TMPFS extern int shmem_add_seals(struct file *file, unsigned int seals); @@ -83,4 +115,13 @@ 
static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a) #endif +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +extern bool shmem_huge_enabled(struct vm_area_struct *vma); +#else +static inline bool shmem_huge_enabled(struct vm_area_struct *vma) +{ + return false; +} +#endif + #endif diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 8694f7a5d92b..59614a5ef4e4 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -81,21 +81,10 @@ struct kmem_cache { #endif #ifdef CONFIG_SLAB_FREELIST_RANDOM - void *random_seq; + unsigned int *random_seq; #endif struct kmem_cache_node *node[MAX_NUMNODES]; }; -static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, - void *x) { - void *object = x - (x - page->s_mem) % cache->size; - void *last_object = page->s_mem + (cache->num - 1) * cache->size; - - if (unlikely(object > last_object)) - return last_object; - else - return object; -} - #endif /* _LINUX_SLAB_DEF_H */ diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index d1faa019c02a..af9c70fe5533 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -99,6 +99,15 @@ struct kmem_cache { */ int remote_node_defrag_ratio; #endif + +#ifdef CONFIG_SLAB_FREELIST_RANDOM + unsigned int *random_seq; +#endif + +#ifdef CONFIG_KASAN + struct kasan_cache kasan_info; +#endif + struct kmem_cache_node *node[MAX_NUMNODES]; }; @@ -114,15 +123,4 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason); -static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, - void *x) { - void *object = x - (x - page_address(page)) % cache->size; - void *last_object = page_address(page) + - (page->objects - 1) * cache->size; - if (unlikely(object > last_object)) - return last_object; - else - return object; -} - #endif /* _LINUX_SLUB_DEF_H */ diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 816b7543f81b..0d9224cb3e19 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -249,6 +249,7 @@ static inline u64 ktime_get_raw_ns(void) extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); +extern u64 ktime_get_log_ts(u64 *offset_real); /* * Timespec interfaces utilizing the ktime based ones diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 587480ad41b7..dd66a952e8cd 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -27,8 +27,7 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) -extern int handle_userfault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags, unsigned long reason); +extern int handle_userfault(struct fault_env *fe, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len); @@ -56,10 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) #else /* CONFIG_USERFAULTFD */ /* mm helpers */ -static inline int handle_userfault(struct vm_area_struct *vma, - unsigned long address, - unsigned int flags, - unsigned long reason) +static inline int handle_userfault(struct fault_env *fe, unsigned long reason) { return VM_FAULT_SIGBUS; } diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index ec084321fe09..42604173f122 100644 --- a/include/linux/vm_event_item.h +++ 
b/include/linux/vm_event_item.h @@ -70,6 +70,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_FAULT_FALLBACK, THP_COLLAPSE_ALLOC, THP_COLLAPSE_ALLOC_FAILED, + THP_FILE_ALLOC, + THP_FILE_MAPPED, THP_SPLIT_PAGE, THP_SPLIT_PAGE_FAILED, THP_DEFERRED_SPLIT_PAGE, @@ -100,4 +102,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, NR_VM_EVENT_ITEMS }; +#ifndef CONFIG_TRANSPARENT_HUGEPAGE +#define THP_FILE_ALLOC ({ BUILD_BUG(); 0; }) +#define THP_FILE_MAPPED ({ BUILD_BUG(); 0; }) +#endif + #endif /* VM_EVENT_ITEM_H_INCLUDED */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d0b5ca5d4e08..717e6149e753 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -384,4 +384,7 @@ void tag_pages_for_writeback(struct address_space *mapping, void account_page_redirty(struct page *page); +void sb_mark_inode_writeback(struct inode *inode); +void sb_clear_inode_writeback(struct inode *inode); + #endif /* WRITEBACK_H */ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 4089abc6e9c0..0933c7455a30 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -275,7 +275,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #define __net_initconst #else #define __net_init __init -#define __net_exit __exit_refok +#define __net_exit __ref #define __net_initdata __initdata #define __net_initconst __initconst #endif diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 551ba4acde4d..830d47d5ca41 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -28,7 +28,9 @@ EM( SCAN_SWAP_CACHE_PAGE, "page_swap_cache") \ EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\ EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ - EMe( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") + EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \ + EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ + EMe(SCAN_TRUNCATED, "truncated") \ #undef EM #undef EMe @@ -45,9 +47,9 @@ SCAN_STATUS TRACE_EVENT(mm_khugepaged_scan_pmd, TP_PROTO(struct mm_struct *mm, struct page *page, bool writable, - bool referenced, int none_or_zero, int status), + bool referenced, int none_or_zero, int status, int unmapped), - TP_ARGS(mm, page, writable, referenced, none_or_zero, status), + TP_ARGS(mm, page, writable, referenced, none_or_zero, status, unmapped), TP_STRUCT__entry( __field(struct mm_struct *, mm) @@ -56,6 +58,7 @@ TRACE_EVENT(mm_khugepaged_scan_pmd, __field(bool, referenced) __field(int, none_or_zero) __field(int, status) + __field(int, unmapped) ), TP_fast_assign( @@ -65,15 +68,17 @@ TRACE_EVENT(mm_khugepaged_scan_pmd, __entry->referenced = referenced; __entry->none_or_zero = none_or_zero; __entry->status = status; + __entry->unmapped = unmapped; ), - TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s", + TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d", __entry->mm, __entry->pfn, __entry->writable, __entry->referenced, __entry->none_or_zero, - __print_symbolic(__entry->status, SCAN_STATUS)) + __print_symbolic(__entry->status, SCAN_STATUS), + __entry->unmapped) ); TRACE_EVENT(mm_collapse_huge_page, @@ -131,5 +136,29 @@ TRACE_EVENT(mm_collapse_huge_page_isolate, __print_symbolic(__entry->status, SCAN_STATUS)) ); +TRACE_EVENT(mm_collapse_huge_page_swapin, + + TP_PROTO(struct mm_struct *mm, int swapped_in, int ret), + + TP_ARGS(mm, swapped_in, ret), + + 
TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __field(int, swapped_in) + __field(int, ret) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->swapped_in = swapped_in; + __entry->ret = ret; + ), + + TP_printk("mm=%p, swapped_in=%d, ret=%d", + __entry->mm, + __entry->swapped_in, + __entry->ret) +); + #endif /* __HUGE_MEMORY_H */ #include <trace/define_trace.h> diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 73614ce1d204..531f5811ff6b 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -696,7 +696,7 @@ DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, TP_ARGS(inode, wbc, nr_to_write) ); -DECLARE_EVENT_CLASS(writeback_lazytime_template, +DECLARE_EVENT_CLASS(writeback_inode_template, TP_PROTO(struct inode *inode), TP_ARGS(inode), @@ -723,25 +723,39 @@ DECLARE_EVENT_CLASS(writeback_lazytime_template, show_inode_state(__entry->state), __entry->mode) ); -DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime, +DEFINE_EVENT(writeback_inode_template, writeback_lazytime, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); -DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime_iput, +DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); -DEFINE_EVENT(writeback_lazytime_template, writeback_dirty_inode_enqueue, +DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); +/* + * Inode writeback list tracking. + */ + +DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback, + TP_PROTO(struct inode *inode), + TP_ARGS(inode) +); + +DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback, + TP_PROTO(struct inode *inode), + TP_ARGS(inode) +); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ diff --git a/include/trace/events/zsmalloc.h b/include/trace/events/zsmalloc.h new file mode 100644 index 000000000000..3b6f14e041e6 --- /dev/null +++ b/include/trace/events/zsmalloc.h @@ -0,0 +1,56 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zsmalloc + +#if !defined(_TRACE_ZSMALLOC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ZSMALLOC_H + +#include <linux/types.h> +#include <linux/tracepoint.h> + +TRACE_EVENT(zsmalloc_compact_start, + + TP_PROTO(const char *pool_name), + + TP_ARGS(pool_name), + + TP_STRUCT__entry( + __field(const char *, pool_name) + ), + + TP_fast_assign( + __entry->pool_name = pool_name; + ), + + TP_printk("pool %s", + __entry->pool_name) +); + +TRACE_EVENT(zsmalloc_compact_end, + + TP_PROTO(const char *pool_name, unsigned long pages_compacted, + unsigned long pages_total_compacted), + + TP_ARGS(pool_name, pages_compacted, pages_total_compacted), + + TP_STRUCT__entry( + __field(const char *, pool_name) + __field(unsigned long, pages_compacted) + __field(unsigned long, pages_total_compacted) + ), + + TP_fast_assign( + __entry->pool_name = pool_name; + __entry->pages_compacted = pages_compacted; + __entry->pages_total_compacted = pages_total_compacted; + ), + + TP_printk("pool %s: %ld pages compacted(total %ld)", + __entry->pool_name, + __entry->pages_compacted, + __entry->pages_total_compacted) +); + +#endif /* _TRACE_ZSMALLOC_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 546b38886e11..e398beac67b8 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -80,5 +80,7 @@ #define 
BPF_FS_MAGIC 0xcafe4a11 /* Since UDF 2.01 is ISO 13346 based... */ #define UDF_SUPER_MAGIC 0x15013346 +#define BALLOON_KVM_MAGIC 0x13661366 +#define ZSMALLOC_MAGIC 0x58295829 #endif /* __LINUX_MAGIC_H__ */ diff --git a/include/uapi/linux/nilfs2_api.h b/include/uapi/linux/nilfs2_api.h new file mode 100644 index 000000000000..ef4c1de89b11 --- /dev/null +++ b/include/uapi/linux/nilfs2_api.h @@ -0,0 +1,292 @@ +/* + * nilfs2_api.h - NILFS2 user space API + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + */ + +#ifndef _LINUX_NILFS2_API_H +#define _LINUX_NILFS2_API_H + +#include <linux/types.h> +#include <linux/ioctl.h> + +/** + * struct nilfs_cpinfo - checkpoint information + * @ci_flags: flags + * @ci_pad: padding + * @ci_cno: checkpoint number + * @ci_create: creation timestamp + * @ci_nblk_inc: number of blocks incremented by this checkpoint + * @ci_inodes_count: inodes count + * @ci_blocks_count: blocks count + * @ci_next: next checkpoint number in snapshot list + */ +struct nilfs_cpinfo { + __u32 ci_flags; + __u32 ci_pad; + __u64 ci_cno; + __u64 ci_create; + __u64 ci_nblk_inc; + __u64 ci_inodes_count; + __u64 ci_blocks_count; + __u64 ci_next; +}; + +/* checkpoint flags */ +enum { + NILFS_CPINFO_SNAPSHOT, + NILFS_CPINFO_INVALID, + NILFS_CPINFO_SKETCH, + NILFS_CPINFO_MINOR, +}; + +#define NILFS_CPINFO_FNS(flag, name) \ +static inline int \ +nilfs_cpinfo_##name(const struct nilfs_cpinfo *cpinfo) \ +{ \ + return !!(cpinfo->ci_flags & (1UL << NILFS_CPINFO_##flag)); \ +} + +NILFS_CPINFO_FNS(SNAPSHOT, snapshot) +NILFS_CPINFO_FNS(INVALID, invalid) +NILFS_CPINFO_FNS(MINOR, minor) + +/** + * nilfs_suinfo - segment usage information + * @sui_lastmod: timestamp of last modification + * @sui_nblocks: number of written blocks in segment + * @sui_flags: segment usage flags + */ +struct nilfs_suinfo { + __u64 sui_lastmod; + __u32 sui_nblocks; + __u32 sui_flags; +}; + +/* segment usage flags */ +enum { + NILFS_SUINFO_ACTIVE, + NILFS_SUINFO_DIRTY, + NILFS_SUINFO_ERROR, +}; + +#define NILFS_SUINFO_FNS(flag, name) \ +static inline int \ +nilfs_suinfo_##name(const struct nilfs_suinfo *si) \ +{ \ + return si->sui_flags & (1UL << NILFS_SUINFO_##flag); \ +} + +NILFS_SUINFO_FNS(ACTIVE, active) +NILFS_SUINFO_FNS(DIRTY, dirty) +NILFS_SUINFO_FNS(ERROR, error) + +static inline int nilfs_suinfo_clean(const struct nilfs_suinfo *si) +{ + return !si->sui_flags; +} + +/** + * nilfs_suinfo_update - segment usage information update + * @sup_segnum: segment number + * @sup_flags: flags for which fields are active in sup_sui + * @sup_reserved: reserved necessary for alignment + * @sup_sui: segment usage information + */ +struct nilfs_suinfo_update { + __u64 sup_segnum; + __u32 sup_flags; + __u32 sup_reserved; + struct nilfs_suinfo sup_sui; +}; + +enum { + NILFS_SUINFO_UPDATE_LASTMOD, + NILFS_SUINFO_UPDATE_NBLOCKS, + NILFS_SUINFO_UPDATE_FLAGS, + __NR_NILFS_SUINFO_UPDATE_FIELDS, +}; + +#define NILFS_SUINFO_UPDATE_FNS(flag, name) \ +static inline void \ +nilfs_suinfo_update_set_##name(struct nilfs_suinfo_update *sup) \ +{ \ + sup->sup_flags |= 1UL << NILFS_SUINFO_UPDATE_##flag; \ +} \ +static inline void \ +nilfs_suinfo_update_clear_##name(struct nilfs_suinfo_update *sup) \ +{ \ + sup->sup_flags &= ~(1UL << NILFS_SUINFO_UPDATE_##flag); \ 
+} \ +static inline int \ +nilfs_suinfo_update_##name(const struct nilfs_suinfo_update *sup) \ +{ \ + return !!(sup->sup_flags & (1UL << NILFS_SUINFO_UPDATE_##flag));\ +} + +NILFS_SUINFO_UPDATE_FNS(LASTMOD, lastmod) +NILFS_SUINFO_UPDATE_FNS(NBLOCKS, nblocks) +NILFS_SUINFO_UPDATE_FNS(FLAGS, flags) + +enum { + NILFS_CHECKPOINT, + NILFS_SNAPSHOT, +}; + +/** + * struct nilfs_cpmode - change checkpoint mode structure + * @cm_cno: checkpoint number + * @cm_mode: mode of checkpoint + * @cm_pad: padding + */ +struct nilfs_cpmode { + __u64 cm_cno; + __u32 cm_mode; + __u32 cm_pad; +}; + +/** + * struct nilfs_argv - argument vector + * @v_base: pointer on data array from userspace + * @v_nmembs: number of members in data array + * @v_size: size of data array in bytes + * @v_flags: flags + * @v_index: start number of target data items + */ +struct nilfs_argv { + __u64 v_base; + __u32 v_nmembs; /* number of members */ + __u16 v_size; /* size of members */ + __u16 v_flags; + __u64 v_index; +}; + +/** + * struct nilfs_period - period of checkpoint numbers + * @p_start: start checkpoint number (inclusive) + * @p_end: end checkpoint number (exclusive) + */ +struct nilfs_period { + __u64 p_start; + __u64 p_end; +}; + +/** + * struct nilfs_cpstat - checkpoint statistics + * @cs_cno: checkpoint number + * @cs_ncps: number of checkpoints + * @cs_nsss: number of snapshots + */ +struct nilfs_cpstat { + __u64 cs_cno; + __u64 cs_ncps; + __u64 cs_nsss; +}; + +/** + * struct nilfs_sustat - segment usage statistics + * @ss_nsegs: number of segments + * @ss_ncleansegs: number of clean segments + * @ss_ndirtysegs: number of dirty segments + * @ss_ctime: creation time of the last segment + * @ss_nongc_ctime: creation time of the last segment not for GC + * @ss_prot_seq: least sequence number of segments which must not be reclaimed + */ +struct nilfs_sustat { + __u64 ss_nsegs; + __u64 ss_ncleansegs; + __u64 ss_ndirtysegs; + __u64 ss_ctime; + __u64 ss_nongc_ctime; + __u64 ss_prot_seq; +}; + +/** + * struct nilfs_vinfo - virtual block number information + * @vi_vblocknr: virtual block number + * @vi_start: start checkpoint number (inclusive) + * @vi_end: end checkpoint number (exclusive) + * @vi_blocknr: disk block number + */ +struct nilfs_vinfo { + __u64 vi_vblocknr; + __u64 vi_start; + __u64 vi_end; + __u64 vi_blocknr; +}; + +/** + * struct nilfs_vdesc - descriptor of virtual block number + * @vd_ino: inode number + * @vd_cno: checkpoint number + * @vd_vblocknr: virtual block number + * @vd_period: period of checkpoint numbers + * @vd_blocknr: disk block number + * @vd_offset: logical block offset inside a file + * @vd_flags: flags (data or node block) + * @vd_pad: padding + */ +struct nilfs_vdesc { + __u64 vd_ino; + __u64 vd_cno; + __u64 vd_vblocknr; + struct nilfs_period vd_period; + __u64 vd_blocknr; + __u64 vd_offset; + __u32 vd_flags; + __u32 vd_pad; +}; + +/** + * struct nilfs_bdesc - descriptor of disk block number + * @bd_ino: inode number + * @bd_oblocknr: disk block address (for skipping dead blocks) + * @bd_blocknr: disk block address + * @bd_offset: logical block offset inside a file + * @bd_level: level in the b-tree organization + * @bd_pad: padding + */ +struct nilfs_bdesc { + __u64 bd_ino; + __u64 bd_oblocknr; + __u64 bd_blocknr; + __u64 bd_offset; + __u32 bd_level; + __u32 bd_pad; +}; + +#define NILFS_IOCTL_IDENT 'n' + +#define NILFS_IOCTL_CHANGE_CPMODE \ + _IOW(NILFS_IOCTL_IDENT, 0x80, struct nilfs_cpmode) +#define NILFS_IOCTL_DELETE_CHECKPOINT \ + _IOW(NILFS_IOCTL_IDENT, 0x81, __u64) +#define 
NILFS_IOCTL_GET_CPINFO \ + _IOR(NILFS_IOCTL_IDENT, 0x82, struct nilfs_argv) +#define NILFS_IOCTL_GET_CPSTAT \ + _IOR(NILFS_IOCTL_IDENT, 0x83, struct nilfs_cpstat) +#define NILFS_IOCTL_GET_SUINFO \ + _IOR(NILFS_IOCTL_IDENT, 0x84, struct nilfs_argv) +#define NILFS_IOCTL_GET_SUSTAT \ + _IOR(NILFS_IOCTL_IDENT, 0x85, struct nilfs_sustat) +#define NILFS_IOCTL_GET_VINFO \ + _IOWR(NILFS_IOCTL_IDENT, 0x86, struct nilfs_argv) +#define NILFS_IOCTL_GET_BDESCS \ + _IOWR(NILFS_IOCTL_IDENT, 0x87, struct nilfs_argv) +#define NILFS_IOCTL_CLEAN_SEGMENTS \ + _IOW(NILFS_IOCTL_IDENT, 0x88, struct nilfs_argv[5]) +#define NILFS_IOCTL_SYNC \ + _IOR(NILFS_IOCTL_IDENT, 0x8A, __u64) +#define NILFS_IOCTL_RESIZE \ + _IOW(NILFS_IOCTL_IDENT, 0x8B, __u64) +#define NILFS_IOCTL_SET_ALLOC_RANGE \ + _IOW(NILFS_IOCTL_IDENT, 0x8C, __u64[2]) +#define NILFS_IOCTL_SET_SUINFO \ + _IOW(NILFS_IOCTL_IDENT, 0x8D, struct nilfs_argv) + +#endif /* _LINUX_NILFS2_API_H */ diff --git a/include/linux/nilfs2_fs.h b/include/uapi/linux/nilfs2_ondisk.h index 5988dd57ba66..2a8a3addb675 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/uapi/linux/nilfs2_ondisk.h @@ -1,5 +1,5 @@ /* - * nilfs2_fs.h - NILFS2 on-disk structures and common declarations. + * nilfs2_ondisk.h - NILFS2 on-disk structures * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * @@ -7,13 +7,6 @@ * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation; either version 2.1 of the License, or * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * Written by Koji Sato and Ryusuke Konishi. 
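The ioctl numbers above are the userspace-visible half of the header split: a tool built against <linux/nilfs2_api.h> no longer needs any on-disk definitions. A minimal, hypothetical usage sketch follows; the mount-point path is made up, the descriptor is assumed to reference the mounted filesystem, and error handling is reduced to perror().

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nilfs2_api.h>

int main(void)
{
	struct nilfs_cpstat cpstat;
	int fd = open("/mnt/nilfs", O_RDONLY);	/* hypothetical NILFS2 mount point */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* NILFS_IOCTL_GET_CPSTAT fills a struct nilfs_cpstat (defined above) */
	if (ioctl(fd, NILFS_IOCTL_GET_CPSTAT, &cpstat) < 0) {
		perror("NILFS_IOCTL_GET_CPSTAT");
		close(fd);
		return 1;
	}
	printf("cno=%llu checkpoints=%llu snapshots=%llu\n",
	       (unsigned long long)cpstat.cs_cno,
	       (unsigned long long)cpstat.cs_ncps,
	       (unsigned long long)cpstat.cs_nsss);
	close(fd);
	return 0;
}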
*/ /* * linux/include/linux/ext2_fs.h @@ -30,16 +23,15 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ -#ifndef _LINUX_NILFS_FS_H -#define _LINUX_NILFS_FS_H +#ifndef _LINUX_NILFS2_ONDISK_H +#define _LINUX_NILFS2_ONDISK_H #include <linux/types.h> -#include <linux/ioctl.h> #include <linux/magic.h> -#include <linux/bug.h> #define NILFS_INODE_BMAP_SIZE 7 + /** * struct nilfs_inode - structure of an inode on disk * @i_blocks: blocks count @@ -56,7 +48,7 @@ * @i_bmap: block mapping * @i_xattr: extended attributes * @i_generation: file generation (for NFS) - * @i_pad: padding + * @i_pad: padding */ struct nilfs_inode { __le64 i_blocks; @@ -338,29 +330,7 @@ enum { #define NILFS_DIR_ROUND (NILFS_DIR_PAD - 1) #define NILFS_DIR_REC_LEN(name_len) (((name_len) + 12 + NILFS_DIR_ROUND) & \ ~NILFS_DIR_ROUND) -#define NILFS_MAX_REC_LEN ((1<<16)-1) - -static inline unsigned int nilfs_rec_len_from_disk(__le16 dlen) -{ - unsigned int len = le16_to_cpu(dlen); - -#if !defined(__KERNEL__) || (PAGE_SIZE >= 65536) - if (len == NILFS_MAX_REC_LEN) - return 1 << 16; -#endif - return len; -} - -static inline __le16 nilfs_rec_len_to_disk(unsigned int len) -{ -#if !defined(__KERNEL__) || (PAGE_SIZE >= 65536) - if (len == (1 << 16)) - return cpu_to_le16(NILFS_MAX_REC_LEN); - else if (len > (1 << 16)) - BUG(); -#endif - return cpu_to_le16(len); -} +#define NILFS_MAX_REC_LEN ((1 << 16) - 1) /** * struct nilfs_finfo - file information @@ -374,11 +344,10 @@ struct nilfs_finfo { __le64 fi_cno; __le32 fi_nblocks; __le32 fi_ndatablk; - /* array of virtual block numbers */ }; /** - * struct nilfs_binfo_v - information for the block to which a virtual block number is assigned + * struct nilfs_binfo_v - information on a data block (except DAT) * @bi_vblocknr: virtual block number * @bi_blkoff: block offset */ @@ -388,7 +357,7 @@ struct nilfs_binfo_v { }; /** - * struct nilfs_binfo_dat - information for the block which belongs to the DAT file + * struct nilfs_binfo_dat - information on a DAT node block * @bi_blkoff: block offset * @bi_level: level * @bi_pad: padding @@ -454,7 +423,7 @@ struct nilfs_segment_summary { #define NILFS_SS_GC 0x0010 /* segment written for cleaner operation */ /** - * struct nilfs_btree_node - B-tree node + * struct nilfs_btree_node - header of B-tree node block * @bn_flags: flags * @bn_level: level * @bn_nchildren: number of children @@ -476,6 +445,16 @@ struct nilfs_btree_node { #define NILFS_BTREE_LEVEL_MAX 14 /* Max level (exclusive) */ /** + * struct nilfs_direct_node - header of built-in bmap array + * @dn_flags: flags + * @dn_pad: padding + */ +struct nilfs_direct_node { + __u8 dn_flags; + __u8 pad[7]; +}; + +/** * struct nilfs_palloc_group_desc - block group descriptor * @pg_nfrees: number of free entries in block group */ @@ -574,40 +553,6 @@ NILFS_CHECKPOINT_FNS(INVALID, invalid) NILFS_CHECKPOINT_FNS(MINOR, minor) /** - * struct nilfs_cpinfo - checkpoint information - * @ci_flags: flags - * @ci_pad: padding - * @ci_cno: checkpoint number - * @ci_create: creation timestamp - * @ci_nblk_inc: number of blocks incremented by this checkpoint - * @ci_inodes_count: inodes count - * @ci_blocks_count: blocks count - * @ci_next: next checkpoint number in snapshot list - */ -struct nilfs_cpinfo { - __u32 ci_flags; - __u32 ci_pad; - __u64 ci_cno; - __u64 ci_create; - __u64 ci_nblk_inc; - __u64 ci_inodes_count; - __u64 ci_blocks_count; - __u64 ci_next; -}; - -#define NILFS_CPINFO_FNS(flag, name) \ -static inline int \ -nilfs_cpinfo_##name(const struct nilfs_cpinfo *cpinfo) \ -{ \ - return !!(cpinfo->ci_flags 
& (1UL << NILFS_CHECKPOINT_##flag)); \ -} - -NILFS_CPINFO_FNS(SNAPSHOT, snapshot) -NILFS_CPINFO_FNS(INVALID, invalid) -NILFS_CPINFO_FNS(MINOR, minor) - - -/** * struct nilfs_cpfile_header - checkpoint file header * @ch_ncheckpoints: number of checkpoints * @ch_nsnapshots: number of snapshots @@ -619,7 +564,7 @@ struct nilfs_cpfile_header { struct nilfs_snapshot_list ch_snapshot_list; }; -#define NILFS_CPFILE_FIRST_CHECKPOINT_OFFSET \ +#define NILFS_CPFILE_FIRST_CHECKPOINT_OFFSET \ ((sizeof(struct nilfs_cpfile_header) + \ sizeof(struct nilfs_checkpoint) - 1) / \ sizeof(struct nilfs_checkpoint)) @@ -643,8 +588,6 @@ enum { NILFS_SEGMENT_USAGE_ACTIVE, NILFS_SEGMENT_USAGE_DIRTY, NILFS_SEGMENT_USAGE_ERROR, - - /* ... */ }; #define NILFS_SEGMENT_USAGE_FNS(flag, name) \ @@ -699,236 +642,9 @@ struct nilfs_sufile_header { /* ... */ }; -#define NILFS_SUFILE_FIRST_SEGMENT_USAGE_OFFSET \ +#define NILFS_SUFILE_FIRST_SEGMENT_USAGE_OFFSET \ ((sizeof(struct nilfs_sufile_header) + \ sizeof(struct nilfs_segment_usage) - 1) / \ sizeof(struct nilfs_segment_usage)) -/** - * nilfs_suinfo - segment usage information - * @sui_lastmod: timestamp of last modification - * @sui_nblocks: number of written blocks in segment - * @sui_flags: segment usage flags - */ -struct nilfs_suinfo { - __u64 sui_lastmod; - __u32 sui_nblocks; - __u32 sui_flags; -}; - -#define NILFS_SUINFO_FNS(flag, name) \ -static inline int \ -nilfs_suinfo_##name(const struct nilfs_suinfo *si) \ -{ \ - return si->sui_flags & (1UL << NILFS_SEGMENT_USAGE_##flag); \ -} - -NILFS_SUINFO_FNS(ACTIVE, active) -NILFS_SUINFO_FNS(DIRTY, dirty) -NILFS_SUINFO_FNS(ERROR, error) - -static inline int nilfs_suinfo_clean(const struct nilfs_suinfo *si) -{ - return !si->sui_flags; -} - -/* ioctl */ -/** - * nilfs_suinfo_update - segment usage information update - * @sup_segnum: segment number - * @sup_flags: flags for which fields are active in sup_sui - * @sup_reserved: reserved necessary for alignment - * @sup_sui: segment usage information - */ -struct nilfs_suinfo_update { - __u64 sup_segnum; - __u32 sup_flags; - __u32 sup_reserved; - struct nilfs_suinfo sup_sui; -}; - -enum { - NILFS_SUINFO_UPDATE_LASTMOD, - NILFS_SUINFO_UPDATE_NBLOCKS, - NILFS_SUINFO_UPDATE_FLAGS, - __NR_NILFS_SUINFO_UPDATE_FIELDS, -}; - -#define NILFS_SUINFO_UPDATE_FNS(flag, name) \ -static inline void \ -nilfs_suinfo_update_set_##name(struct nilfs_suinfo_update *sup) \ -{ \ - sup->sup_flags |= 1UL << NILFS_SUINFO_UPDATE_##flag; \ -} \ -static inline void \ -nilfs_suinfo_update_clear_##name(struct nilfs_suinfo_update *sup) \ -{ \ - sup->sup_flags &= ~(1UL << NILFS_SUINFO_UPDATE_##flag); \ -} \ -static inline int \ -nilfs_suinfo_update_##name(const struct nilfs_suinfo_update *sup) \ -{ \ - return !!(sup->sup_flags & (1UL << NILFS_SUINFO_UPDATE_##flag));\ -} - -NILFS_SUINFO_UPDATE_FNS(LASTMOD, lastmod) -NILFS_SUINFO_UPDATE_FNS(NBLOCKS, nblocks) -NILFS_SUINFO_UPDATE_FNS(FLAGS, flags) - -enum { - NILFS_CHECKPOINT, - NILFS_SNAPSHOT, -}; - -/** - * struct nilfs_cpmode - change checkpoint mode structure - * @cm_cno: checkpoint number - * @cm_mode: mode of checkpoint - * @cm_pad: padding - */ -struct nilfs_cpmode { - __u64 cm_cno; - __u32 cm_mode; - __u32 cm_pad; -}; - -/** - * struct nilfs_argv - argument vector - * @v_base: pointer on data array from userspace - * @v_nmembs: number of members in data array - * @v_size: size of data array in bytes - * @v_flags: flags - * @v_index: start number of target data items - */ -struct nilfs_argv { - __u64 v_base; - __u32 v_nmembs; /* number of members */ - 
__u16 v_size; /* size of members */ - __u16 v_flags; - __u64 v_index; -}; - -/** - * struct nilfs_period - period of checkpoint numbers - * @p_start: start checkpoint number (inclusive) - * @p_end: end checkpoint number (exclusive) - */ -struct nilfs_period { - __u64 p_start; - __u64 p_end; -}; - -/** - * struct nilfs_cpstat - checkpoint statistics - * @cs_cno: checkpoint number - * @cs_ncps: number of checkpoints - * @cs_nsss: number of snapshots - */ -struct nilfs_cpstat { - __u64 cs_cno; - __u64 cs_ncps; - __u64 cs_nsss; -}; - -/** - * struct nilfs_sustat - segment usage statistics - * @ss_nsegs: number of segments - * @ss_ncleansegs: number of clean segments - * @ss_ndirtysegs: number of dirty segments - * @ss_ctime: creation time of the last segment - * @ss_nongc_ctime: creation time of the last segment not for GC - * @ss_prot_seq: least sequence number of segments which must not be reclaimed - */ -struct nilfs_sustat { - __u64 ss_nsegs; - __u64 ss_ncleansegs; - __u64 ss_ndirtysegs; - __u64 ss_ctime; - __u64 ss_nongc_ctime; - __u64 ss_prot_seq; -}; - -/** - * struct nilfs_vinfo - virtual block number information - * @vi_vblocknr: virtual block number - * @vi_start: start checkpoint number (inclusive) - * @vi_end: end checkpoint number (exclusive) - * @vi_blocknr: disk block number - */ -struct nilfs_vinfo { - __u64 vi_vblocknr; - __u64 vi_start; - __u64 vi_end; - __u64 vi_blocknr; -}; - -/** - * struct nilfs_vdesc - descriptor of virtual block number - * @vd_ino: inode number - * @vd_cno: checkpoint number - * @vd_vblocknr: virtual block number - * @vd_period: period of checkpoint numbers - * @vd_blocknr: disk block number - * @vd_offset: logical block offset inside a file - * @vd_flags: flags (data or node block) - * @vd_pad: padding - */ -struct nilfs_vdesc { - __u64 vd_ino; - __u64 vd_cno; - __u64 vd_vblocknr; - struct nilfs_period vd_period; - __u64 vd_blocknr; - __u64 vd_offset; - __u32 vd_flags; - __u32 vd_pad; -}; - -/** - * struct nilfs_bdesc - descriptor of disk block number - * @bd_ino: inode number - * @bd_oblocknr: disk block address (for skipping dead blocks) - * @bd_blocknr: disk block address - * @bd_offset: logical block offset inside a file - * @bd_level: level in the b-tree organization - * @bd_pad: padding - */ -struct nilfs_bdesc { - __u64 bd_ino; - __u64 bd_oblocknr; - __u64 bd_blocknr; - __u64 bd_offset; - __u32 bd_level; - __u32 bd_pad; -}; - -#define NILFS_IOCTL_IDENT 'n' - -#define NILFS_IOCTL_CHANGE_CPMODE \ - _IOW(NILFS_IOCTL_IDENT, 0x80, struct nilfs_cpmode) -#define NILFS_IOCTL_DELETE_CHECKPOINT \ - _IOW(NILFS_IOCTL_IDENT, 0x81, __u64) -#define NILFS_IOCTL_GET_CPINFO \ - _IOR(NILFS_IOCTL_IDENT, 0x82, struct nilfs_argv) -#define NILFS_IOCTL_GET_CPSTAT \ - _IOR(NILFS_IOCTL_IDENT, 0x83, struct nilfs_cpstat) -#define NILFS_IOCTL_GET_SUINFO \ - _IOR(NILFS_IOCTL_IDENT, 0x84, struct nilfs_argv) -#define NILFS_IOCTL_GET_SUSTAT \ - _IOR(NILFS_IOCTL_IDENT, 0x85, struct nilfs_sustat) -#define NILFS_IOCTL_GET_VINFO \ - _IOWR(NILFS_IOCTL_IDENT, 0x86, struct nilfs_argv) -#define NILFS_IOCTL_GET_BDESCS \ - _IOWR(NILFS_IOCTL_IDENT, 0x87, struct nilfs_argv) -#define NILFS_IOCTL_CLEAN_SEGMENTS \ - _IOW(NILFS_IOCTL_IDENT, 0x88, struct nilfs_argv[5]) -#define NILFS_IOCTL_SYNC \ - _IOR(NILFS_IOCTL_IDENT, 0x8A, __u64) -#define NILFS_IOCTL_RESIZE \ - _IOW(NILFS_IOCTL_IDENT, 0x8B, __u64) -#define NILFS_IOCTL_SET_ALLOC_RANGE \ - _IOW(NILFS_IOCTL_IDENT, 0x8C, __u64[2]) -#define NILFS_IOCTL_SET_SUINFO \ - _IOW(NILFS_IOCTL_IDENT, 0x8D, struct nilfs_argv) - -#endif /* 
_LINUX_NILFS_FS_H */ +#endif /* _LINUX_NILFS2_ONDISK_H */ diff --git a/init/Kconfig b/init/Kconfig index e2d5b8d18242..032c25e67a3f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -55,6 +55,7 @@ config CROSS_COMPILE config COMPILE_TEST bool "Compile also drivers which will not load" + depends on !UML default n help Some drivers can be compiled on a different platform than they are @@ -1785,10 +1786,10 @@ endchoice config SLAB_FREELIST_RANDOM default n - depends on SLAB + depends on SLAB || SLUB bool "SLAB freelist randomization" help - Randomizes the freelist order used on creating new SLABs. This + Randomizes the freelist order used on creating new pages. This security feature reduces the predictability of the kernel slab allocator against heap overflows. diff --git a/init/main.c b/init/main.c index c826ff22cff0..9a9a25570f72 100644 --- a/init/main.c +++ b/init/main.c @@ -380,7 +380,7 @@ static void __init setup_command_line(char *command_line) static __initdata DECLARE_COMPLETION(kthreadd_done); -static noinline void __init_refok rest_init(void) +static noinline void __ref rest_init(void) { int pid; @@ -716,6 +716,12 @@ static bool __init_or_module initcall_blacklisted(initcall_t fn) addr = (unsigned long) dereference_function_descriptor(fn); sprint_symbol_no_offset(fn_name, addr); + /* + * fn will be "function_name [module_name]" where [module_name] is not + * displayed for built-in init functions. Strip off the [module_name]. + */ + strreplace(fn_name, ' ', '\0'); + list_for_each_entry(entry, &blacklisted_initcalls, next) { if (!strcmp(fn_name, entry->buf)) { pr_debug("initcall %s blacklisted\n", fn_name); diff --git a/ipc/msg.c b/ipc/msg.c index 1471db9a7e61..59559a215401 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -37,6 +37,7 @@ #include <linux/rwsem.h> #include <linux/nsproxy.h> #include <linux/ipc_namespace.h> +#include <linux/freezer.h> #include <asm/current.h> #include <linux/uaccess.h> @@ -675,7 +676,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, ipc_unlock_object(&msq->q_perm); rcu_read_unlock(); - schedule(); + freezable_schedule(); rcu_read_lock(); ipc_lock_object(&msq->q_perm); @@ -917,7 +918,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl ipc_unlock_object(&msq->q_perm); rcu_read_unlock(); - schedule(); + freezable_schedule(); /* Lockless receive, part 1: * Disable preemption. We don't hold a reference to the queue diff --git a/ipc/sem.c b/ipc/sem.c index ae72b3cddc8d..1afa695516fd 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -161,15 +161,29 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); #define SEMOPM_FAST 64 /* ~ 372 bytes on stack */ /* + * Switching from the mode suitable for simple ops + * to the mode for complex ops is costly. Therefore: + * use some hysteresis + */ +#define COMPLEX_MODE_ENTER 10 + +/* * Locking: + * a) global sem_lock() for read/write * sem_undo.id_next, * sem_array.complex_count, - * sem_array.pending{_alter,_cont}, - * sem_array.sem_undo: global sem_lock() for read/write - * sem_undo.proc_next: only "current" is allowed to read/write that field. 
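The strreplace() call added to initcall_blacklisted() above works because writing a NUL over the first space truncates the symbol string in place, so "function_name [module_name]" compares equal to a blacklisted "function_name". A small userspace re-creation of the idea; strreplace() itself is the kernel's lib/string.c helper, and the symbol and module names below are made up.

#include <stdio.h>

/* same loop shape as the kernel's strreplace() */
static char *strreplace_demo(char *s, char old, char new)
{
	for (; *s; ++s)
		if (*s == old)
			*s = new;
	return s;
}

int main(void)
{
	/* sprint_symbol_no_offset() style output for a module init function */
	char fn_name[] = "foo_driver_init [foo_driver]";	/* hypothetical */

	strreplace_demo(fn_name, ' ', '\0');
	printf("%s\n", fn_name);	/* -> "foo_driver_init" */
	return 0;
}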
+ * sem_array.complex_mode + * sem_array.pending{_alter,_const}, + * sem_array.sem_undo * + * b) global or semaphore sem_lock() for read/write: * sem_array.sem_base[i].pending_{const,alter}: - * global or semaphore sem_lock() for read/write + * sem_array.complex_mode (for read) + * + * c) special: + * sem_undo_list.list_proc: + * * undo_list->lock for write + * * rcu for read */ #define sc_semmsl sem_ctls[0] @@ -260,23 +274,33 @@ static void sem_rcu_free(struct rcu_head *head) } /* - * Wait until all currently ongoing simple ops have completed. + * Enter the mode suitable for non-simple operations: * Caller must own sem_perm.lock. - * New simple ops cannot start, because simple ops first check - * that sem_perm.lock is free. - * that a) sem_perm.lock is free and b) complex_count is 0. + * Note: + * There is no leave complex mode function. Leaving + * happens in sem_lock, with some hysteresis. */ -static void sem_wait_array(struct sem_array *sma) +static void complexmode_enter(struct sem_array *sma) { int i; struct sem *sem; - if (sma->complex_count) { - /* The thread that increased sma->complex_count waited on - * all sem->lock locks. Thus we don't need to wait again. + if (sma->complex_mode > 0) { + /* + * We are already in complex_mode. + * Nothing to do, just increase + * counter until we return to simple mode */ + WRITE_ONCE(sma->complex_mode, COMPLEX_MODE_ENTER); return; } + WRITE_ONCE(sma->complex_mode, COMPLEX_MODE_ENTER); + + /* We need a full barrier: + * The write to complex_mode must be visible + * before we read the first sem->lock spinlock state. + */ + smp_mb(); for (i = 0; i < sma->sem_nsems; i++) { sem = sma->sem_base + i; @@ -300,56 +324,38 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, /* Complex operation - acquire a full lock */ ipc_lock_object(&sma->sem_perm); - /* And wait until all simple ops that are processed - * right now have dropped their locks. - */ - sem_wait_array(sma); + /* Prevent parallel simple ops */ + complexmode_enter(sma); return -1; } /* * Only one semaphore affected - try to optimize locking. - * The rules are: - * - optimized locking is possible if no complex operation - * is either enqueued or processed right now. - * - The test for enqueued complex ops is simple: - * sma->complex_count != 0 - * - Testing for complex ops that are processed right now is - * a bit more difficult. Complex ops acquire the full lock - * and first wait that the running simple ops have completed. - * (see above) - * Thus: If we own a simple lock and the global lock is free - * and complex_count is now 0, then it will stay 0 and - * thus just locking sem->lock is sufficient. + * Optimized locking is possible if no complex operation + * is either enqueued or processed right now. + * + * Both facts are tracked by complex_mode. */ sem = sma->sem_base + sops->sem_num; - if (sma->complex_count == 0) { + /* + * Initial check for complex_mode. Just an optimization, + * no locking. + */ + if (!READ_ONCE(sma->complex_mode)) { /* * It appears that no complex operation is around. * Acquire the per-semaphore lock. */ spin_lock(&sem->lock); - /* Then check that the global lock is free */ - if (!spin_is_locked(&sma->sem_perm.lock)) { - /* - * We need a memory barrier with acquire semantics, - * otherwise we can race with another thread that does: - * complex_count++; - * spin_unlock(sem_perm.lock); - */ - smp_acquire__after_ctrl_dep(); - - /* - * Now repeat the test of complex_count: - * It can't change anymore until we drop sem->lock. 
- * Thus: if is now 0, then it will stay 0. - */ - if (sma->complex_count == 0) { - /* fast path successful! */ - return sops->sem_num; - } + /* Now repeat the test for complex_mode. + * A memory barrier is provided by the spin_lock() + * above. + */ + if (!READ_ONCE(sma->complex_mode)) { + /* fast path successful! */ + return sops->sem_num; } spin_unlock(&sem->lock); } @@ -358,20 +364,36 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, ipc_lock_object(&sma->sem_perm); if (sma->complex_count == 0) { - /* False alarm: - * There is no complex operation, thus we can switch - * back to the fast path. - */ - spin_lock(&sem->lock); - ipc_unlock_object(&sma->sem_perm); - return sops->sem_num; - } else { - /* Not a false alarm, thus complete the sequence for a - * full lock. + /* + * Check if fast path is possible: + * There is no complex operation, check hysteresis + * If 0, switch back to the fast path. */ - sem_wait_array(sma); - return -1; + if (sma->complex_mode > 0) { + /* Note: + * Immediately after setting complex_mode to 0, + * a simple op could start. + * The data it would access was written by the + * previous owner of sem->sem_perm.lock, i.e + * a release and an acquire memory barrier ago. + * No need for another barrier. + */ + WRITE_ONCE(sma->complex_mode, sma->complex_mode-1); + } + if (sma->complex_mode == 0) { + spin_lock(&sem->lock); + ipc_unlock_object(&sma->sem_perm); + return sops->sem_num; + } } + /* + * Not a false alarm, full lock is required. + * Since we are already in complex_mode (either because of waiting + * complex ops or due to hysteresis), there is not need for a + * complexmode_enter(). + */ + WARN_ON(sma->complex_mode == 0); + return -1; } static inline void sem_unlock(struct sem_array *sma, int locknum) @@ -529,6 +551,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) } sma->complex_count = 0; + WRITE_ONCE(sma->complex_mode, COMPLEX_MODE_ENTER); INIT_LIST_HEAD(&sma->pending_alter); INIT_LIST_HEAD(&sma->pending_const); INIT_LIST_HEAD(&sma->list_id); @@ -2184,10 +2207,10 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) /* * The proc interface isn't aware of sem_lock(), it calls * ipc_lock_object() directly (in sysvipc_find_ipc). - * In order to stay compatible with sem_lock(), we must wait until - * all simple semop() calls have left their critical regions. + * In order to stay compatible with sem_lock(), we must + * enter complex_mode. */ - sem_wait_array(sma); + complexmode_enter(sma); sem_otime = get_semotime(sma); diff --git a/ipc/shm.c b/ipc/shm.c index 13282510bc0d..dbac8860c721 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -476,13 +476,15 @@ static const struct file_operations shm_file_operations = { .mmap = shm_mmap, .fsync = shm_fsync, .release = shm_release, -#ifndef CONFIG_MMU .get_unmapped_area = shm_get_unmapped_area, -#endif .llseek = noop_llseek, .fallocate = shm_fallocate, }; +/* + * shm_file_operations_huge is now identical to shm_file_operations, + * but we keep it distinct for the sake of is_file_shm_hugepages(). 
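The hysteresis in sem_lock() above means that after a complex operation the array stays in complex mode for up to COMPLEX_MODE_ENTER further lock attempts before simple ops regain the per-semaphore fast path. The following is only a single-threaded toy model of that counter behaviour; all locking and memory barriers are deliberately left out, and only the constant matches the patch.

#include <stdio.h>

#define COMPLEX_MODE_ENTER 10

struct toy_sem_array {
	int complex_count;	/* queued complex ops */
	int complex_mode;	/* >0: no parallel simple ops */
};

static void toy_complexmode_enter(struct toy_sem_array *sma)
{
	/* re-arm the hysteresis counter, as complexmode_enter() does */
	sma->complex_mode = COMPLEX_MODE_ENTER;
}

/* returns 1 if a simple op could use the per-semaphore fast path */
static int toy_simple_lock(struct toy_sem_array *sma)
{
	if (sma->complex_mode == 0)
		return 1;			/* fast path */
	if (sma->complex_count == 0)
		sma->complex_mode--;		/* decay back towards simple mode */
	return sma->complex_mode == 0;
}

int main(void)
{
	struct toy_sem_array sma = { 0, 0 };
	int i;

	toy_complexmode_enter(&sma);		/* a complex semop() just ran */
	for (i = 1; i <= 12; i++)
		printf("simple op %2d: fast path %s (complex_mode=%d)\n",
		       i, toy_simple_lock(&sma) ? "yes" : "no ",
		       sma.complex_mode);
	return 0;
}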
+ */ static const struct file_operations shm_file_operations_huge = { .mmap = shm_mmap, .fsync = shm_fsync, @@ -764,10 +766,10 @@ static void shm_add_rss_swap(struct shmid_kernel *shp, } else { #ifdef CONFIG_SHMEM struct shmem_inode_info *info = SHMEM_I(inode); - spin_lock(&info->lock); + spin_lock_irq(&info->lock); *rss_add += inode->i_mapping->nrpages; *swp_add += info->swapped; - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); #else *rss_add += inode->i_mapping->nrpages; #endif diff --git a/kernel/fork.c b/kernel/fork.c index 8ef3acdeffdf..792699369e85 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -162,8 +162,8 @@ void __weak arch_release_thread_stack(unsigned long *stack) static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { - struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, - THREAD_SIZE_ORDER); + struct page *page = alloc_pages_node(node, THREADINFO_GFP, + THREAD_SIZE_ORDER); if (page) memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, @@ -178,7 +178,7 @@ static inline void free_thread_stack(unsigned long *stack) memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, -(1 << THREAD_SIZE_ORDER)); - __free_kmem_pages(page, THREAD_SIZE_ORDER); + __free_pages(page, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_stack_cache; diff --git a/kernel/futex.c b/kernel/futex.c index 33664f70e2d2..b384f312a65a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3158,8 +3158,10 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, int cmd = op & FUTEX_CMD_MASK; unsigned int flags = 0; +#ifdef CONFIG_MMU if (!(op & FUTEX_PRIVATE_FLAG)) flags |= FLAGS_SHARED; +#endif if (op & FUTEX_CLOCK_REALTIME) { flags |= FLAGS_CLOCKRT; diff --git a/kernel/kexec.c b/kernel/kexec.c index 4384672d3245..980936a90ee6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -48,7 +48,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, if (kexec_on_panic) { /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) + if ((entry < phys_to_boot_phys(crashk_res.start)) || + (entry > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 56b3ed0927b0..73d4c5f57dd8 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -147,7 +147,7 @@ static struct page *kimage_alloc_page(struct kimage *image, int sanity_check_segment_list(struct kimage *image) { - int result, i; + int i; unsigned long nr_segments = image->nr_segments; /* @@ -163,16 +163,17 @@ int sanity_check_segment_list(struct kimage *image) * simply because addresses are changed to page size * granularity. */ - result = -EADDRNOTAVAIL; for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; + if (mstart > mend) + return -EADDRNOTAVAIL; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - return result; + return -EADDRNOTAVAIL; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - return result; + return -EADDRNOTAVAIL; } /* Verify our destination addresses do not overlap. @@ -180,7 +181,6 @@ int sanity_check_segment_list(struct kimage *image) * through very weird things can happen with no * easy explanation as one segment stops on another. 
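The phys_to_boot_phys() calls introduced in the kexec entry and crash-range checks let an architecture whose firmware/boot view of memory differs from the running kernel's view supply its own translation; on most architectures the helpers are presumably identity mappings. The sketch below is purely hypothetical, with an invented fixed offset and an invented inverse helper, only to show why every address kexec hands to the boot environment now goes through such a hook.

#include <stdio.h>

#define BOOT_VIEW_OFFSET 0x80000000UL	/* made-up offset between the two views */

static unsigned long toy_phys_to_boot_phys(unsigned long phys)
{
	return phys - BOOT_VIEW_OFFSET;
}

static unsigned long toy_boot_phys_to_phys(unsigned long boot_phys)
{
	return boot_phys + BOOT_VIEW_OFFSET;
}

int main(void)
{
	unsigned long crash_start = 0x9f000000UL;	/* made-up crashk_res.start */

	printf("kernel phys %#lx -> boot phys %#lx -> back %#lx\n",
	       crash_start, toy_phys_to_boot_phys(crash_start),
	       toy_boot_phys_to_phys(toy_phys_to_boot_phys(crash_start)));
	return 0;
}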
*/ - result = -EINVAL; for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; unsigned long j; @@ -194,7 +194,7 @@ int sanity_check_segment_list(struct kimage *image) pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) - return result; + return -EINVAL; } } @@ -203,10 +203,9 @@ int sanity_check_segment_list(struct kimage *image) * and it is easier to check up front than to be surprised * later on. */ - result = -EINVAL; for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) - return result; + return -EINVAL; } /* @@ -220,16 +219,15 @@ int sanity_check_segment_list(struct kimage *image) */ if (image->type == KEXEC_TYPE_CRASH) { - result = -EADDRNOTAVAIL; for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || - (mend > crashk_res.end)) - return result; + if ((mstart < phys_to_boot_phys(crashk_res.start)) || + (mend > phys_to_boot_phys(crashk_res.end))) + return -EADDRNOTAVAIL; } } @@ -352,7 +350,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); if (!pages) break; - pfn = page_to_pfn(pages); + pfn = page_to_boot_pfn(pages); epfn = pfn + count; addr = pfn << PAGE_SHIFT; eaddr = epfn << PAGE_SHIFT; @@ -478,7 +476,7 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) return -ENOMEM; ind_page = page_address(page); - *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + *image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION; image->entry = ind_page; image->last_entry = ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); @@ -533,13 +531,13 @@ void kimage_terminate(struct kimage *image) #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ ptr = (entry & IND_INDIRECTION) ? \ - phys_to_virt((entry & PAGE_MASK)) : ptr + 1) + boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1) static void kimage_free_entry(kimage_entry_t entry) { struct page *page; - page = pfn_to_page(entry >> PAGE_SHIFT); + page = boot_pfn_to_page(entry >> PAGE_SHIFT); kimage_free_pages(page); } @@ -633,7 +631,7 @@ static struct page *kimage_alloc_page(struct kimage *image, * have a match. 
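Taken together, the rewritten sanity_check_segment_list() now returns the error code directly at each rejection point: inverted or unaligned segments, overlapping destinations, and buffers larger than their destination window. A compressed userspace restatement of those checks, with a fixed page size and without the memory-limit and crash-range cases:

#include <errno.h>
#include <stdio.h>

#define TOY_PAGE_SIZE 4096UL

struct toy_segment {
	unsigned long mem;	/* destination physical address */
	unsigned long memsz;	/* size of the destination window */
	unsigned long bufsz;	/* size of the buffer to copy in */
};

static int toy_sanity_check(const struct toy_segment *seg, int nr)
{
	int i, j;

	for (i = 0; i < nr; i++) {
		unsigned long mstart = seg[i].mem;
		unsigned long mend = mstart + seg[i].memsz;

		if (mstart > mend)			/* wrapped/inverted segment */
			return -EADDRNOTAVAIL;
		if ((mstart | mend) & (TOY_PAGE_SIZE - 1))
			return -EADDRNOTAVAIL;		/* not page aligned */
		if (seg[i].bufsz > seg[i].memsz)
			return -EINVAL;			/* buffer overruns window */
		for (j = 0; j < i; j++) {		/* overlap with an earlier segment */
			unsigned long pstart = seg[j].mem;
			unsigned long pend = pstart + seg[j].memsz;

			if (mend > pstart && mstart < pend)
				return -EINVAL;
		}
	}
	return 0;
}

int main(void)
{
	struct toy_segment segs[] = {
		{ 0x100000, 0x4000, 0x3000 },
		{ 0x102000, 0x1000, 0x1000 },	/* overlaps the first segment */
	};

	printf("check: %d\n", toy_sanity_check(segs, 2));	/* -> -EINVAL */
	return 0;
}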
*/ list_for_each_entry(page, &image->dest_pages, lru) { - addr = page_to_pfn(page) << PAGE_SHIFT; + addr = page_to_boot_pfn(page) << PAGE_SHIFT; if (addr == destination) { list_del(&page->lru); return page; @@ -648,12 +646,12 @@ static struct page *kimage_alloc_page(struct kimage *image, if (!page) return NULL; /* If the page cannot be used file it away */ - if (page_to_pfn(page) > + if (page_to_boot_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { list_add(&page->lru, &image->unusable_pages); continue; } - addr = page_to_pfn(page) << PAGE_SHIFT; + addr = page_to_boot_pfn(page) << PAGE_SHIFT; /* If it is the destination page we want use it */ if (addr == destination) @@ -676,7 +674,7 @@ static struct page *kimage_alloc_page(struct kimage *image, struct page *old_page; old_addr = *old & PAGE_MASK; - old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT); copy_highpage(page, old_page); *old = addr | (*old & ~PAGE_MASK); @@ -732,7 +730,7 @@ static int kimage_load_normal_segment(struct kimage *image, result = -ENOMEM; goto out; } - result = kimage_add_page(image, page_to_pfn(page) + result = kimage_add_page(image, page_to_boot_pfn(page) << PAGE_SHIFT); if (result < 0) goto out; @@ -793,7 +791,7 @@ static int kimage_load_crash_segment(struct kimage *image, char *ptr; size_t uchunk, mchunk; - page = pfn_to_page(maddr >> PAGE_SHIFT); + page = boot_pfn_to_page(maddr >> PAGE_SHIFT); if (!page) { result = -ENOMEM; goto out; @@ -921,7 +919,7 @@ void __weak crash_free_reserved_phys_range(unsigned long begin, unsigned long addr; for (addr = begin; addr < end; addr += PAGE_SIZE) - free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); + free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT)); } int crash_shrink_memory(unsigned long new_size) @@ -1374,7 +1372,7 @@ void vmcoreinfo_append_str(const char *fmt, ...) void __weak arch_crash_save_vmcoreinfo(void) {} -unsigned long __weak paddr_vmcoreinfo_note(void) +phys_addr_t __weak paddr_vmcoreinfo_note(void) { return __pa((unsigned long)(char *)&vmcoreinfo_note); } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 152da4a48867..9f1920d2d0c6 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -128,8 +128,8 @@ KERNEL_ATTR_RW(kexec_crash_size); static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lx %x\n", - paddr_vmcoreinfo_note(), + phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); + return sprintf(buf, "%pa %x\n", &vmcore_base, (unsigned int)sizeof(vmcoreinfo_note)); } KERNEL_ATTR_RO(vmcoreinfo); diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 7fd2838fa417..5d4505f30083 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -16,9 +16,11 @@ */ #include <linux/percpu.h> -typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); +typedef __printf(2, 0) int (*printk_func_t)(int level, const char *fmt, + va_list args); -int __printf(1, 0) vprintk_default(const char *fmt, va_list args); +__printf(2, 0) +int vprintk_default(int level, const char *fmt, va_list args); #ifdef CONFIG_PRINTK_NMI @@ -31,9 +33,10 @@ extern raw_spinlock_t logbuf_lock; * via per-CPU variable. 
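With the extra level argument in printk_func_t, every sink reached through the printk_func pointer can decide how to encode the level itself rather than relying on a KERN_<LEVEL> prefix already baked into the format string. A userspace sketch of that dispatch shape; the sink, the level encoding, and the numeric values below are stand-ins, not the kernel's.

#include <stdarg.h>
#include <stdio.h>

#define TOY_LOGLEVEL_DEFAULT (-1)	/* stand-in for LOGLEVEL_DEFAULT */

typedef int (*toy_printk_func_t)(int level, const char *fmt, va_list args);

static int toy_vprintk_default(int level, const char *fmt, va_list args)
{
	if (level != TOY_LOGLEVEL_DEFAULT)
		printf("<%d>", level);		/* the sink encodes the level itself */
	return vprintf(fmt, args);
}

/* what a per-CPU override slot boils down to: one function pointer */
static toy_printk_func_t toy_printk_func = toy_vprintk_default;

static int toy_printk(int level, const char *fmt, ...)
{
	va_list args;
	int r;

	va_start(args, fmt);
	r = toy_printk_func(level, fmt, args);
	va_end(args);
	return r;
}

int main(void)
{
	toy_printk(4, "disk %s is %d%% full\n", "sda", 93);	/* explicit level */
	toy_printk(TOY_LOGLEVEL_DEFAULT, "no explicit level\n");
	return 0;
}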
*/ DECLARE_PER_CPU(printk_func_t, printk_func); -static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +__printf(2, 0) +static inline int vprintk_func(int level, const char *fmt, va_list args) { - return this_cpu_read(printk_func)(fmt, args); + return this_cpu_read(printk_func)(level, fmt, args); } extern atomic_t nmi_message_lost; @@ -44,9 +47,10 @@ static inline int get_nmi_message_lost(void) #else /* CONFIG_PRINTK_NMI */ -static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +__printf(2, 0) +static inline int vprintk_func(int level, const char *fmt, va_list args) { - return vprintk_default(fmt, args); + return vprintk_default(level, fmt, args); } static inline int get_nmi_message_lost(void) diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index b69eb8a2876f..bc3eeb1ae6da 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -58,7 +58,7 @@ static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); * one writer running. But the buffer might get flushed from another * CPU, so we need to be careful. */ -static int vprintk_nmi(const char *fmt, va_list args) +static int vprintk_nmi(int level, const char *fmt, va_list args) { struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); int add = 0; @@ -79,7 +79,16 @@ again: if (!len) smp_rmb(); - add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + if (level != LOGLEVEL_DEFAULT) { + add = snprintf(s->buffer + len, sizeof(s->buffer) - len, + KERN_SOH "%c", '0' + level); + add += vsnprintf(s->buffer + len + add, + sizeof(s->buffer) - len - add, + fmt, args); + } else { + add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, + fmt, args); + } /* * Do it once again if the buffer has been flushed in the meantime. diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 60cdf6386763..1607b69d4faf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -26,7 +26,6 @@ #include <linux/nmi.h> #include <linux/module.h> #include <linux/moduleparam.h> -#include <linux/interrupt.h> /* For in_interrupt() */ #include <linux/delay.h> #include <linux/smp.h> #include <linux/security.h> @@ -425,6 +424,8 @@ static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); } +static u64 printk_get_ts(void); + /* insert record into the buffer, discard old ones, update heads */ static int log_store(int facility, int level, enum log_flags flags, u64 ts_nsec, @@ -473,7 +474,7 @@ static int log_store(int facility, int level, if (ts_nsec > 0) msg->ts_nsec = ts_nsec; else - msg->ts_nsec = local_clock(); + msg->ts_nsec = printk_get_ts(); memset(log_dict(msg) + dict_len, 0, pad_len); msg->len = size; @@ -986,6 +987,11 @@ module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting (prints all kernel messages to the console)"); +static bool should_ignore_loglevel(int level) +{ + return (level >= console_loglevel && !ignore_loglevel); +} + #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ @@ -1015,7 +1021,7 @@ static void boot_delay_msec(int level) unsigned long timeout; if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) - || (level >= console_loglevel && !ignore_loglevel)) { + || should_ignore_loglevel(level)) { return; } @@ -1041,8 +1047,74 @@ static inline void boot_delay_msec(int level) } #endif -static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); -module_param_named(time, printk_time, bool, S_IRUGO | 
S_IWUSR); +static int printk_time = CONFIG_PRINTK_TIME; + +/* + * Real clock & 32-bit systems: Selecting the real clock printk timestamp may + * lead to unlikely situations where a timestamp is wrong because the real time + * offset is read without the protection of a sequence lock in the call to + * ktime_get_log_ts() in printk_get_ts() below. + */ +static int printk_time_param_set(const char *val, + const struct kernel_param *kp) +{ + char *param = strstrip((char *)val); + + if (strlen(param) != 1) + return -EINVAL; + + switch (param[0]) { + /* 0/N/n = disabled */ + case '0': + case 'N': + case 'n': + printk_time = 0; + break; + /* 1/Y/y = local clock */ + case '1': + case 'Y': + case 'y': + printk_time = 1; + break; + /* 2 = monotonic clock */ + case '2': + printk_time = 2; + break; + /* 3 = real clock */ + case '3': + printk_time = 3; + break; + default: + pr_warn("printk: invalid timestamp value\n"); + return -EINVAL; + break; + } + + pr_info("printk: timestamp set to %d.\n", printk_time); + return 0; +} + +static struct kernel_param_ops printk_time_param_ops = { + .set = printk_time_param_set, + .get = param_get_int, +}; + +module_param_cb(time, &printk_time_param_ops, &printk_time, S_IRUGO); + +static u64 printk_get_ts(void) +{ + u64 mono, offset_real; + + if (printk_time <= 1) + return local_clock(); + + mono = ktime_get_log_ts(&offset_real); + + if (printk_time == 2) + return mono; + + return mono + offset_real; +} static size_t print_time(u64 ts, char *buf) { @@ -1439,8 +1511,6 @@ static void call_console_drivers(int level, trace_console(text, len); - if (level >= console_loglevel && !ignore_loglevel) - return; if (!console_drivers) return; @@ -1562,7 +1632,7 @@ static bool cont_add(int facility, int level, const char *text, size_t len) cont.facility = facility; cont.level = level; cont.owner = current; - cont.ts_nsec = local_clock(); + cont.ts_nsec = printk_get_ts(); cont.flags = 0; cont.cons = 0; cont.flushed = false; @@ -1802,7 +1872,28 @@ asmlinkage int printk_emit(int facility, int level, } EXPORT_SYMBOL(printk_emit); -int vprintk_default(const char *fmt, va_list args) +#ifdef CONFIG_PRINTK +#define define_pr_level(func, loglevel) \ +asmlinkage __visible void func(const char *fmt, ...) \ +{ \ + va_list args; \ + \ + va_start(args, fmt); \ + vprintk_default(loglevel, fmt, args); \ + va_end(args); \ +} \ +EXPORT_SYMBOL(func) + +define_pr_level(__pr_emerg, LOGLEVEL_EMERG); +define_pr_level(__pr_alert, LOGLEVEL_ALERT); +define_pr_level(__pr_crit, LOGLEVEL_CRIT); +define_pr_level(__pr_err, LOGLEVEL_ERR); +define_pr_level(__pr_warn, LOGLEVEL_WARNING); +define_pr_level(__pr_notice, LOGLEVEL_NOTICE); +define_pr_level(__pr_info, LOGLEVEL_INFO); +#endif + +int vprintk_default(int level, const char *fmt, va_list args) { int r; @@ -1812,7 +1903,7 @@ int vprintk_default(const char *fmt, va_list args) return r; } #endif - r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + r = vprintk_emit(0, level, NULL, 0, fmt, args); return r; } @@ -1845,7 +1936,7 @@ asmlinkage __visible int printk(const char *fmt, ...) 
int r; va_start(args, fmt); - r = vprintk_func(fmt, args); + r = vprintk_func(LOGLEVEL_DEFAULT, fmt, args); va_end(args); return r; @@ -1888,6 +1979,7 @@ static void call_console_drivers(int level, static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } +static bool should_ignore_loglevel(int level) { return false; } /* Still needs to be defined for users */ DEFINE_PER_CPU(printk_func_t, printk_func); @@ -2167,6 +2259,13 @@ static void console_cont_flush(char *text, size_t size) if (!cont.len) goto out; + if (should_ignore_loglevel(cont.level)) { + cont.cons = cont.len; + if (cont.flushed) + cont.len = 0; + goto out; + } + /* * We still queue earlier records, likely because the console was * busy. The earlier ones need to be printed before this one, we @@ -2270,10 +2369,13 @@ skip: break; msg = log_from_idx(console_idx); - if (msg->flags & LOG_NOCONS) { + level = msg->level; + if ((msg->flags & LOG_NOCONS) || + should_ignore_loglevel(level)) { /* * Skip record we have buffered and already printed - * directly to the console when we received it. + * directly to the console when we received it, and + * record that has level above the console loglevel. */ console_idx = log_next(console_idx); console_seq++; @@ -2287,7 +2389,6 @@ skip: goto skip; } - level = msg->level; len += msg_print_text(msg, console_prev, false, text + len, sizeof(text) - len); if (nr_ext_console_drivers) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 35f0dcb1cb4f..de331c3858e5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit; extern int pid_max; extern int pid_max_min, pid_max_max; extern int percpu_pagelist_fraction; -extern int compat_log; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifndef CONFIG_MMU @@ -1077,15 +1076,6 @@ static struct ctl_table kern_table[] = { .extra1 = &neg_one, }, #endif -#ifdef CONFIG_COMPAT - { - .procname = "compat-log", - .data = &compat_log, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_RT_MUTEXES { .procname = "max_lock_depth", diff --git a/kernel/task_work.c b/kernel/task_work.c index 6ab4842b00e8..d513051fcca2 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -29,7 +29,7 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) struct callback_head *head; do { - head = ACCESS_ONCE(task->task_works); + head = READ_ONCE(task->task_works); if (unlikely(head == &work_exited)) return -ESRCH; work->next = head; @@ -57,6 +57,9 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) struct callback_head **pprev = &task->task_works; struct callback_head *work; unsigned long flags; + + if (likely(!task->task_works)) + return NULL; /* * If cmpxchg() fails we continue without updating pprev. * Either we raced with task_work_add() which added the @@ -64,8 +67,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = ACCESS_ONCE(*pprev))) { - smp_read_barrier_depends(); + while ((work = lockless_dereference(*pprev))) { if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) @@ -95,7 +97,7 @@ void task_work_run(void) * work_exited unless the list is empty. 
*/ do { - work = ACCESS_ONCE(task->task_works); + work = READ_ONCE(task->task_works); head = !work && (task->flags & PF_EXITING) ? &work_exited : NULL; } while (cmpxchg(&task->task_works, work, head) != work); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a196e08324e7..dcd5ce620f36 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -43,6 +43,7 @@ static struct { static DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct timekeeper shadow_timekeeper; +static int timekeeping_active; /** * struct tk_fast - NMI safe timekeeper @@ -419,6 +420,16 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); +u64 ktime_get_log_ts(u64 *offset_real) +{ + *offset_real = ktime_to_ns(tk_core.timekeeper.offs_real); + + if (timekeeping_active) + return ktime_get_mono_fast_ns(); + else + return local_clock(); +} + /* Suspend-time cycles value for halted fast timekeeper. */ static cycle_t cycles_at_suspend; @@ -1505,6 +1516,8 @@ void __init timekeeping_init(void) write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + timekeeping_active = 1; } /* time in seconds when suspend began for persistent clock */ diff --git a/lib/Kconfig b/lib/Kconfig index d79909dc01ec..9d188fbd856a 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -185,6 +185,13 @@ config CRC8 when they need to do cyclic redundancy check according CRC8 algorithm. Module will be called crc8. +config CRC64_ECMA + tristate "CRC64 ECMA function" + help + This option provides CRC64 ECMA function. Drivers may select this + when they need to do cyclic redundancy check according to the CRC64 + ECMA algorithm. + config AUDIT_GENERIC bool depends on AUDIT && !AUDIT_ARCH diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e889b4f5f4ef..0f9981999a27 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1,7 +1,9 @@ menu "printk and dmesg options" config PRINTK_TIME - bool "Show timing information on printks" + int "Show timing information on printks (0-3)" + range 0 3 + default "0" depends on PRINTK help Selecting this option causes time stamps of the printk() @@ -10,7 +12,9 @@ config PRINTK_TIME The timestamp is always recorded internally, and exported to /dev/kmsg. This flag just specifies if the timestamp should - be included, not that the timestamp is recorded. + be included, not that the timestamp is recorded. 0 disables the + timestamp and 1 uses the local clock, 2 uses the monotonic clock, and + 3 uses real clock. The behavior is also controlled by the kernel command line parameter printk.time=1. See Documentation/kernel-parameters.txt @@ -244,6 +248,7 @@ config PAGE_OWNER depends on DEBUG_KERNEL && STACKTRACE_SUPPORT select DEBUG_FS select STACKTRACE + select STACKDEPOT select PAGE_EXTENSION help This keeps track of what call chain is the owner of a page, may @@ -720,6 +725,17 @@ config KCOV For more details, see Documentation/kcov.txt. +config KCOV_INSTRUMENT_ALL + bool "Instrument all code by default" + depends on KCOV + default y if KCOV + help + If you are doing generic system call fuzzing (like e.g. syzkaller), + then you will want to instrument the whole kernel and you should + say y here. If you are doing more targeted fuzzing (like e.g. + filesystem fuzzing with AFL) then you will want to enable coverage + for more specific subsets of files, and should say n here. 
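For targeted fuzzing with KCOV_INSTRUMENT_ALL=n, coverage can be opted back in per directory or per object file through the usual kbuild variables. A minimal sketch, assuming the KCOV_INSTRUMENT hooks provided by the base kcov support (the fs/ext4 path and the inode.o object below are purely illustrative):

    # fs/ext4/Makefile (illustrative): instrument this directory even though
    # KCOV_INSTRUMENT_ALL is disabled kernel-wide
    KCOV_INSTRUMENT := y

    # or instrument a single object file only
    KCOV_INSTRUMENT_inode.o := y

The same variables work in the opposite direction: with KCOV_INSTRUMENT_ALL=y, a Makefile can set KCOV_INSTRUMENT := n to keep especially hot or fragile code out of the coverage stream.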
+ config DEBUG_SHIRQ bool "Debug shared IRQ handlers" depends on DEBUG_KERNEL diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 67d8c6838ba9..bd38aab05929 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -5,9 +5,9 @@ if HAVE_ARCH_KASAN config KASAN bool "KASan: runtime memory debugger" - depends on SLUB_DEBUG || (SLAB && !DEBUG_SLAB) + depends on SLUB || (SLAB && !DEBUG_SLAB) select CONSTRUCTORS - select STACKDEPOT if SLAB + select STACKDEPOT help Enables kernel address sanitizer - runtime memory debugger, designed to find out-of-bounds accesses and use-after-free bugs. diff --git a/lib/Makefile b/lib/Makefile index 07d06a8b9788..f6455db094e3 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -94,6 +94,7 @@ obj-$(CONFIG_CRC32) += crc32.o obj-$(CONFIG_CRC7) += crc7.o obj-$(CONFIG_LIBCRC32C) += libcrc32c.o obj-$(CONFIG_CRC8) += crc8.o +obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o obj-$(CONFIG_842_COMPRESS) += 842/ diff --git a/lib/crc32.c b/lib/crc32.c index 9a907d489d95..7fbd1a112b9d 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -979,7 +979,6 @@ static int __init crc32c_test(void) int i; int errors = 0; int bytes = 0; - struct timespec start, stop; u64 nsec; unsigned long flags; @@ -999,20 +998,17 @@ static int __init crc32c_test(void) local_irq_save(flags); local_irq_disable(); - getnstimeofday(&start); + nsec = ktime_get_ns(); for (i = 0; i < 100; i++) { if (test[i].crc32c_le != __crc32c_le(test[i].crc, test_buf + test[i].start, test[i].length)) errors++; } - getnstimeofday(&stop); + nsec = ktime_get_ns() - nsec; local_irq_restore(flags); local_irq_enable(); - nsec = stop.tv_nsec - start.tv_nsec + - 1000000000 * (stop.tv_sec - start.tv_sec); - pr_info("crc32c: CRC_LE_BITS = %d\n", CRC_LE_BITS); if (errors) @@ -1065,7 +1061,6 @@ static int __init crc32_test(void) int i; int errors = 0; int bytes = 0; - struct timespec start, stop; u64 nsec; unsigned long flags; @@ -1088,7 +1083,7 @@ static int __init crc32_test(void) local_irq_save(flags); local_irq_disable(); - getnstimeofday(&start); + nsec = ktime_get_ns(); for (i = 0; i < 100; i++) { if (test[i].crc_le != crc32_le(test[i].crc, test_buf + test[i].start, test[i].length)) @@ -1098,14 +1093,11 @@ static int __init crc32_test(void) test[i].start, test[i].length)) errors++; } - getnstimeofday(&stop); + nsec = ktime_get_ns() - nsec; local_irq_restore(flags); local_irq_enable(); - nsec = stop.tv_nsec - start.tv_nsec + - 1000000000 * (stop.tv_sec - start.tv_sec); - pr_info("crc32: CRC_LE_BITS = %d, CRC_BE BITS = %d\n", CRC_LE_BITS, CRC_BE_BITS); diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c new file mode 100644 index 000000000000..41629ea5a60c --- /dev/null +++ b/lib/crc64_ecma.c @@ -0,0 +1,341 @@ +/* + * Copyright 2013 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/module.h> +#include <linux/crc64_ecma.h> + + +#define CRC64_BYTE_MASK 0xFF +#define CRC64_TABLE_SIZE 256 + + +struct crc64_table { + u64 seed; + u64 table[CRC64_TABLE_SIZE]; +}; + + +static struct crc64_table CRC64_ECMA_182 = { + CRC64_DEFAULT_INITVAL, + { + 0x0000000000000000ULL, + 0xb32e4cbe03a75f6fULL, + 0xf4843657a840a05bULL, + 0x47aa7ae9abe7ff34ULL, + 0x7bd0c384ff8f5e33ULL, + 0xc8fe8f3afc28015cULL, + 0x8f54f5d357cffe68ULL, + 0x3c7ab96d5468a107ULL, + 0xf7a18709ff1ebc66ULL, + 0x448fcbb7fcb9e309ULL, + 0x0325b15e575e1c3dULL, + 0xb00bfde054f94352ULL, + 0x8c71448d0091e255ULL, + 0x3f5f08330336bd3aULL, + 0x78f572daa8d1420eULL, + 0xcbdb3e64ab761d61ULL, + 0x7d9ba13851336649ULL, + 0xceb5ed8652943926ULL, + 0x891f976ff973c612ULL, + 0x3a31dbd1fad4997dULL, + 0x064b62bcaebc387aULL, + 0xb5652e02ad1b6715ULL, + 0xf2cf54eb06fc9821ULL, + 0x41e11855055bc74eULL, + 0x8a3a2631ae2dda2fULL, + 0x39146a8fad8a8540ULL, + 0x7ebe1066066d7a74ULL, + 0xcd905cd805ca251bULL, + 0xf1eae5b551a2841cULL, + 0x42c4a90b5205db73ULL, + 0x056ed3e2f9e22447ULL, + 0xb6409f5cfa457b28ULL, + 0xfb374270a266cc92ULL, + 0x48190ecea1c193fdULL, + 0x0fb374270a266cc9ULL, + 0xbc9d3899098133a6ULL, + 0x80e781f45de992a1ULL, + 0x33c9cd4a5e4ecdceULL, + 0x7463b7a3f5a932faULL, + 0xc74dfb1df60e6d95ULL, + 0x0c96c5795d7870f4ULL, + 0xbfb889c75edf2f9bULL, + 0xf812f32ef538d0afULL, + 0x4b3cbf90f69f8fc0ULL, + 0x774606fda2f72ec7ULL, + 0xc4684a43a15071a8ULL, + 0x83c230aa0ab78e9cULL, + 0x30ec7c140910d1f3ULL, + 0x86ace348f355aadbULL, + 0x3582aff6f0f2f5b4ULL, + 0x7228d51f5b150a80ULL, + 0xc10699a158b255efULL, + 0xfd7c20cc0cdaf4e8ULL, + 0x4e526c720f7dab87ULL, + 0x09f8169ba49a54b3ULL, + 0xbad65a25a73d0bdcULL, + 0x710d64410c4b16bdULL, + 0xc22328ff0fec49d2ULL, + 0x85895216a40bb6e6ULL, + 0x36a71ea8a7ace989ULL, + 0x0adda7c5f3c4488eULL, + 0xb9f3eb7bf06317e1ULL, + 0xfe5991925b84e8d5ULL, + 0x4d77dd2c5823b7baULL, + 0x64b62bcaebc387a1ULL, + 0xd7986774e864d8ceULL, + 0x90321d9d438327faULL, + 0x231c512340247895ULL, + 0x1f66e84e144cd992ULL, + 0xac48a4f017eb86fdULL, + 0xebe2de19bc0c79c9ULL, + 0x58cc92a7bfab26a6ULL, + 0x9317acc314dd3bc7ULL, + 0x2039e07d177a64a8ULL, + 0x67939a94bc9d9b9cULL, + 0xd4bdd62abf3ac4f3ULL, + 0xe8c76f47eb5265f4ULL, + 0x5be923f9e8f53a9bULL, + 0x1c4359104312c5afULL, + 0xaf6d15ae40b59ac0ULL, + 0x192d8af2baf0e1e8ULL, + 0xaa03c64cb957be87ULL, + 0xeda9bca512b041b3ULL, + 0x5e87f01b11171edcULL, + 0x62fd4976457fbfdbULL, + 0xd1d305c846d8e0b4ULL, + 0x96797f21ed3f1f80ULL, + 0x2557339fee9840efULL, + 0xee8c0dfb45ee5d8eULL, + 0x5da24145464902e1ULL, + 
0x1a083bacedaefdd5ULL, + 0xa9267712ee09a2baULL, + 0x955cce7fba6103bdULL, + 0x267282c1b9c65cd2ULL, + 0x61d8f8281221a3e6ULL, + 0xd2f6b4961186fc89ULL, + 0x9f8169ba49a54b33ULL, + 0x2caf25044a02145cULL, + 0x6b055fede1e5eb68ULL, + 0xd82b1353e242b407ULL, + 0xe451aa3eb62a1500ULL, + 0x577fe680b58d4a6fULL, + 0x10d59c691e6ab55bULL, + 0xa3fbd0d71dcdea34ULL, + 0x6820eeb3b6bbf755ULL, + 0xdb0ea20db51ca83aULL, + 0x9ca4d8e41efb570eULL, + 0x2f8a945a1d5c0861ULL, + 0x13f02d374934a966ULL, + 0xa0de61894a93f609ULL, + 0xe7741b60e174093dULL, + 0x545a57dee2d35652ULL, + 0xe21ac88218962d7aULL, + 0x5134843c1b317215ULL, + 0x169efed5b0d68d21ULL, + 0xa5b0b26bb371d24eULL, + 0x99ca0b06e7197349ULL, + 0x2ae447b8e4be2c26ULL, + 0x6d4e3d514f59d312ULL, + 0xde6071ef4cfe8c7dULL, + 0x15bb4f8be788911cULL, + 0xa6950335e42fce73ULL, + 0xe13f79dc4fc83147ULL, + 0x521135624c6f6e28ULL, + 0x6e6b8c0f1807cf2fULL, + 0xdd45c0b11ba09040ULL, + 0x9aefba58b0476f74ULL, + 0x29c1f6e6b3e0301bULL, + 0xc96c5795d7870f42ULL, + 0x7a421b2bd420502dULL, + 0x3de861c27fc7af19ULL, + 0x8ec62d7c7c60f076ULL, + 0xb2bc941128085171ULL, + 0x0192d8af2baf0e1eULL, + 0x4638a2468048f12aULL, + 0xf516eef883efae45ULL, + 0x3ecdd09c2899b324ULL, + 0x8de39c222b3eec4bULL, + 0xca49e6cb80d9137fULL, + 0x7967aa75837e4c10ULL, + 0x451d1318d716ed17ULL, + 0xf6335fa6d4b1b278ULL, + 0xb199254f7f564d4cULL, + 0x02b769f17cf11223ULL, + 0xb4f7f6ad86b4690bULL, + 0x07d9ba1385133664ULL, + 0x4073c0fa2ef4c950ULL, + 0xf35d8c442d53963fULL, + 0xcf273529793b3738ULL, + 0x7c0979977a9c6857ULL, + 0x3ba3037ed17b9763ULL, + 0x888d4fc0d2dcc80cULL, + 0x435671a479aad56dULL, + 0xf0783d1a7a0d8a02ULL, + 0xb7d247f3d1ea7536ULL, + 0x04fc0b4dd24d2a59ULL, + 0x3886b22086258b5eULL, + 0x8ba8fe9e8582d431ULL, + 0xcc0284772e652b05ULL, + 0x7f2cc8c92dc2746aULL, + 0x325b15e575e1c3d0ULL, + 0x8175595b76469cbfULL, + 0xc6df23b2dda1638bULL, + 0x75f16f0cde063ce4ULL, + 0x498bd6618a6e9de3ULL, + 0xfaa59adf89c9c28cULL, + 0xbd0fe036222e3db8ULL, + 0x0e21ac88218962d7ULL, + 0xc5fa92ec8aff7fb6ULL, + 0x76d4de52895820d9ULL, + 0x317ea4bb22bfdfedULL, + 0x8250e80521188082ULL, + 0xbe2a516875702185ULL, + 0x0d041dd676d77eeaULL, + 0x4aae673fdd3081deULL, + 0xf9802b81de97deb1ULL, + 0x4fc0b4dd24d2a599ULL, + 0xfceef8632775faf6ULL, + 0xbb44828a8c9205c2ULL, + 0x086ace348f355aadULL, + 0x34107759db5dfbaaULL, + 0x873e3be7d8faa4c5ULL, + 0xc094410e731d5bf1ULL, + 0x73ba0db070ba049eULL, + 0xb86133d4dbcc19ffULL, + 0x0b4f7f6ad86b4690ULL, + 0x4ce50583738cb9a4ULL, + 0xffcb493d702be6cbULL, + 0xc3b1f050244347ccULL, + 0x709fbcee27e418a3ULL, + 0x3735c6078c03e797ULL, + 0x841b8ab98fa4b8f8ULL, + 0xadda7c5f3c4488e3ULL, + 0x1ef430e13fe3d78cULL, + 0x595e4a08940428b8ULL, + 0xea7006b697a377d7ULL, + 0xd60abfdbc3cbd6d0ULL, + 0x6524f365c06c89bfULL, + 0x228e898c6b8b768bULL, + 0x91a0c532682c29e4ULL, + 0x5a7bfb56c35a3485ULL, + 0xe955b7e8c0fd6beaULL, + 0xaeffcd016b1a94deULL, + 0x1dd181bf68bdcbb1ULL, + 0x21ab38d23cd56ab6ULL, + 0x9285746c3f7235d9ULL, + 0xd52f0e859495caedULL, + 0x6601423b97329582ULL, + 0xd041dd676d77eeaaULL, + 0x636f91d96ed0b1c5ULL, + 0x24c5eb30c5374ef1ULL, + 0x97eba78ec690119eULL, + 0xab911ee392f8b099ULL, + 0x18bf525d915feff6ULL, + 0x5f1528b43ab810c2ULL, + 0xec3b640a391f4fadULL, + 0x27e05a6e926952ccULL, + 0x94ce16d091ce0da3ULL, + 0xd3646c393a29f297ULL, + 0x604a2087398eadf8ULL, + 0x5c3099ea6de60cffULL, + 0xef1ed5546e415390ULL, + 0xa8b4afbdc5a6aca4ULL, + 0x1b9ae303c601f3cbULL, + 0x56ed3e2f9e224471ULL, + 0xe5c372919d851b1eULL, + 0xa26908783662e42aULL, + 0x114744c635c5bb45ULL, + 0x2d3dfdab61ad1a42ULL, + 0x9e13b115620a452dULL, + 0xd9b9cbfcc9edba19ULL, + 0x6a978742ca4ae576ULL, + 
0xa14cb926613cf817ULL, + 0x1262f598629ba778ULL, + 0x55c88f71c97c584cULL, + 0xe6e6c3cfcadb0723ULL, + 0xda9c7aa29eb3a624ULL, + 0x69b2361c9d14f94bULL, + 0x2e184cf536f3067fULL, + 0x9d36004b35545910ULL, + 0x2b769f17cf112238ULL, + 0x9858d3a9ccb67d57ULL, + 0xdff2a94067518263ULL, + 0x6cdce5fe64f6dd0cULL, + 0x50a65c93309e7c0bULL, + 0xe388102d33392364ULL, + 0xa4226ac498dedc50ULL, + 0x170c267a9b79833fULL, + 0xdcd7181e300f9e5eULL, + 0x6ff954a033a8c131ULL, + 0x28532e49984f3e05ULL, + 0x9b7d62f79be8616aULL, + 0xa707db9acf80c06dULL, + 0x14299724cc279f02ULL, + 0x5383edcd67c06036ULL, + 0xe0ada17364673f59ULL + } +}; + + +/* + * crc64_ecma_seed - Initializes the CRC64 ECMA seed. + */ +u64 crc64_ecma_seed(void) +{ + return CRC64_ECMA_182.seed; +} +EXPORT_SYMBOL(crc64_ecma_seed); + +/* + * crc64_ecma - Computes the 64 bit ECMA CRC. + * + * pdata: pointer to the data to compute checksum for. + * nbytes: number of bytes in data buffer. + * seed: CRC seed. + */ +u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed) +{ + unsigned int i; + u64 crc = seed; + + for (i = 0; i < nbytes; i++) + crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^ + (crc >> 8); + + return crc; +} +EXPORT_SYMBOL(crc64_ecma); + +MODULE_DESCRIPTION("CRC64 ECMA function"); +MODULE_AUTHOR("Freescale Semiconductor Inc."); +MODULE_LICENSE("GPL"); diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c index c27e269210c4..a816f3a80625 100644 --- a/lib/iommu-helper.c +++ b/lib/iommu-helper.c @@ -29,8 +29,7 @@ again: index = bitmap_find_next_zero_area(map, size, start, nr, align_mask); if (index < size) { if (iommu_is_span_boundary(index, nr, shift, boundary_size)) { - /* we could do more effectively */ - start = index + 1; + start = ALIGN(shift + index, boundary_size) - shift; goto again; } bitmap_set(map, index, nr); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8b7d8459bb9d..61b8fb529cef 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -38,6 +38,9 @@ #include <linux/preempt.h> /* in_interrupt() */ +/* Number of nodes in fully populated tree of given height */ +static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly; + /* * Radix tree node cache. */ @@ -342,7 +345,7 @@ radix_tree_node_free(struct radix_tree_node *node) * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ -static int __radix_tree_preload(gfp_t gfp_mask) +static int __radix_tree_preload(gfp_t gfp_mask, int nr) { struct radix_tree_preload *rtp; struct radix_tree_node *node; @@ -350,14 +353,14 @@ static int __radix_tree_preload(gfp_t gfp_mask) preempt_disable(); rtp = this_cpu_ptr(&radix_tree_preloads); - while (rtp->nr < RADIX_TREE_PRELOAD_SIZE) { + while (rtp->nr < nr) { preempt_enable(); node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); if (node == NULL) goto out; preempt_disable(); rtp = this_cpu_ptr(&radix_tree_preloads); - if (rtp->nr < RADIX_TREE_PRELOAD_SIZE) { + if (rtp->nr < nr) { node->private_data = rtp->nodes; rtp->nodes = node; rtp->nr++; @@ -383,7 +386,7 @@ int radix_tree_preload(gfp_t gfp_mask) { /* Warn on non-sensical use... 
*/ WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); - return __radix_tree_preload(gfp_mask); + return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); } EXPORT_SYMBOL(radix_tree_preload); @@ -395,7 +398,7 @@ EXPORT_SYMBOL(radix_tree_preload); int radix_tree_maybe_preload(gfp_t gfp_mask) { if (gfpflags_allow_blocking(gfp_mask)) - return __radix_tree_preload(gfp_mask); + return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); /* Preloading doesn't help anything with this gfp mask, skip it */ preempt_disable(); return 0; @@ -403,6 +406,51 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) EXPORT_SYMBOL(radix_tree_maybe_preload); /* + * The same as function above, but preload number of nodes required to insert + * (1 << order) continuous naturally-aligned elements. + */ +int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order) +{ + unsigned long nr_subtrees; + int nr_nodes, subtree_height; + + /* Preloading doesn't help anything with this gfp mask, skip it */ + if (!gfpflags_allow_blocking(gfp_mask)) { + preempt_disable(); + return 0; + } + + /* + * Calculate number and height of fully populated subtrees it takes to + * store (1 << order) elements. + */ + nr_subtrees = 1 << order; + for (subtree_height = 0; nr_subtrees > RADIX_TREE_MAP_SIZE; + subtree_height++) + nr_subtrees >>= RADIX_TREE_MAP_SHIFT; + + /* + * The worst case is zero height tree with a single item at index 0 and + * then inserting items starting at ULONG_MAX - (1 << order). + * + * This requires RADIX_TREE_MAX_PATH nodes to build branch from root to + * 0-index item. + */ + nr_nodes = RADIX_TREE_MAX_PATH; + + /* Plus branch to fully populated subtrees. */ + nr_nodes += RADIX_TREE_MAX_PATH - subtree_height; + + /* Root node is shared. */ + nr_nodes--; + + /* Plus nodes required to build subtrees. */ + nr_nodes += nr_subtrees * height_to_maxnodes[subtree_height]; + + return __radix_tree_preload(gfp_mask, nr_nodes); +} + +/* * The maximum index which can be stored in a radix tree */ static inline unsigned long shift_maxindex(unsigned int shift) @@ -1571,6 +1619,31 @@ radix_tree_node_ctor(void *arg) INIT_LIST_HEAD(&node->private_list); } +static __init unsigned long __maxindex(unsigned int height) +{ + unsigned int width = height * RADIX_TREE_MAP_SHIFT; + int shift = RADIX_TREE_INDEX_BITS - width; + + if (shift < 0) + return ~0UL; + if (shift >= BITS_PER_LONG) + return 0UL; + return ~0UL >> shift; +} + +static __init void radix_tree_init_maxnodes(void) +{ + unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1]; + unsigned int i, j; + + for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) + height_to_maxindex[i] = __maxindex(i); + for (i = 0; i < ARRAY_SIZE(height_to_maxnodes); i++) { + for (j = i; j > 0; j--) + height_to_maxnodes[i] += height_to_maxindex[j - 1] + 1; + } +} + static int radix_tree_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -1597,5 +1670,6 @@ void __init radix_tree_init(void) sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); + radix_tree_init_maxnodes(); hotcpu_notifier(radix_tree_callback, 0); } diff --git a/mm/Kconfig b/mm/Kconfig index d109a7a0c1c4..c0837845c17c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -440,6 +440,14 @@ choice endchoice # +# We don't deposit page tables on file THP mapping, +# but Power makes use of them to address MMU quirk. 
+# +config TRANSPARENT_HUGE_PAGECACHE + def_bool y + depends on TRANSPARENT_HUGEPAGE && !PPC + +# # UP and nommu archs use km based percpu allocator # config NEED_PER_CPU_KM diff --git a/mm/Makefile b/mm/Makefile index 78c6f7dedb83..fc059666c760 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -74,7 +74,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o -obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 57b3e9bd6bc5..da91df50ba31 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -70,7 +70,7 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) */ if (trylock_page(page)) { #ifdef CONFIG_BALLOON_COMPACTION - if (!PagePrivate(page)) { + if (PageIsolated(page)) { /* raced with isolation */ unlock_page(page); continue; @@ -106,110 +106,50 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue); #ifdef CONFIG_BALLOON_COMPACTION -static inline void __isolate_balloon_page(struct page *page) +bool balloon_page_isolate(struct page *page, isolate_mode_t mode) + { struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; spin_lock_irqsave(&b_dev_info->pages_lock, flags); - ClearPagePrivate(page); list_del(&page->lru); b_dev_info->isolated_pages++; spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + + return true; } -static inline void __putback_balloon_page(struct page *page) +void balloon_page_putback(struct page *page) { struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; spin_lock_irqsave(&b_dev_info->pages_lock, flags); - SetPagePrivate(page); list_add(&page->lru, &b_dev_info->pages); b_dev_info->isolated_pages--; spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); } -/* __isolate_lru_page() counterpart for a ballooned page */ -bool balloon_page_isolate(struct page *page) -{ - /* - * Avoid burning cycles with pages that are yet under __free_pages(), - * or just got freed under us. - * - * In case we 'win' a race for a balloon page being freed under us and - * raise its refcount preventing __free_pages() from doing its job - * the put_page() at the end of this block will take care of - * release this page, thus avoiding a nasty leakage. - */ - if (likely(get_page_unless_zero(page))) { - /* - * As balloon pages are not isolated from LRU lists, concurrent - * compaction threads can race against page migration functions - * as well as race against the balloon driver releasing a page. - * - * In order to avoid having an already isolated balloon page - * being (wrongly) re-isolated while it is under migration, - * or to avoid attempting to isolate pages being released by - * the balloon driver, lets be sure we have the page lock - * before proceeding with the balloon page isolation steps. - */ - if (likely(trylock_page(page))) { - /* - * A ballooned page, by default, has PagePrivate set. - * Prevent concurrent compaction threads from isolating - * an already isolated balloon page by clearing it. 
- */ - if (balloon_page_movable(page)) { - __isolate_balloon_page(page); - unlock_page(page); - return true; - } - unlock_page(page); - } - put_page(page); - } - return false; -} - -/* putback_lru_page() counterpart for a ballooned page */ -void balloon_page_putback(struct page *page) -{ - /* - * 'lock_page()' stabilizes the page and prevents races against - * concurrent isolation threads attempting to re-isolate it. - */ - lock_page(page); - - if (__is_movable_balloon_page(page)) { - __putback_balloon_page(page); - /* drop the extra ref count taken for page isolation */ - put_page(page); - } else { - WARN_ON(1); - dump_page(page, "not movable balloon page"); - } - unlock_page(page); -} /* move_to_new_page() counterpart for a ballooned page */ -int balloon_page_migrate(struct page *newpage, - struct page *page, enum migrate_mode mode) +int balloon_page_migrate(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode) { struct balloon_dev_info *balloon = balloon_page_device(page); - int rc = -EAGAIN; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - if (WARN_ON(!__is_movable_balloon_page(page))) { - dump_page(page, "not movable balloon page"); - return rc; - } + return balloon->migratepage(balloon, newpage, page, mode); +} - if (balloon && balloon->migratepage) - rc = balloon->migratepage(balloon, newpage, page, mode); +const struct address_space_operations balloon_aops = { + .migratepage = balloon_page_migrate, + .isolate_page = balloon_page_isolate, + .putback_page = balloon_page_putback, +}; +EXPORT_SYMBOL_GPL(balloon_aops); - return rc; -} #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/compaction.c b/mm/compaction.c index 79bfe0e06907..0bd53fb05162 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -15,11 +15,11 @@ #include <linux/backing-dev.h> #include <linux/sysctl.h> #include <linux/sysfs.h> -#include <linux/balloon_compaction.h> #include <linux/page-isolation.h> #include <linux/kasan.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/page_owner.h> #include "internal.h" #ifdef CONFIG_COMPACTION @@ -65,13 +65,27 @@ static unsigned long release_freepages(struct list_head *freelist) static void map_pages(struct list_head *list) { - struct page *page; + unsigned int i, order, nr_pages; + struct page *page, *next; + LIST_HEAD(tmp_list); + + list_for_each_entry_safe(page, next, list, lru) { + list_del(&page->lru); - list_for_each_entry(page, list, lru) { - arch_alloc_page(page, 0); - kernel_map_pages(page, 1, 1); - kasan_alloc_pages(page, 0); + order = page_private(page); + nr_pages = 1 << order; + + post_alloc_hook(page, order, __GFP_MOVABLE); + if (order) + split_page(page, order); + + for (i = 0; i < nr_pages; i++) { + list_add(&page->lru, &tmp_list); + page++; + } } + + list_splice(&tmp_list, list); } static inline bool migrate_async_suitable(int migratetype) @@ -81,6 +95,44 @@ static inline bool migrate_async_suitable(int migratetype) #ifdef CONFIG_COMPACTION +int PageMovable(struct page *page) +{ + struct address_space *mapping; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + if (!__PageMovable(page)) + return 0; + + mapping = page_mapping(page); + if (mapping && mapping->a_ops && mapping->a_ops->isolate_page) + return 1; + + return 0; +} +EXPORT_SYMBOL(PageMovable); + +void __SetPageMovable(struct page *page, struct address_space *mapping) +{ + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page); + 
page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE); +} +EXPORT_SYMBOL(__SetPageMovable); + +void __ClearPageMovable(struct page *page) +{ + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageMovable(page), page); + /* + * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE + * flag so that VM can catch up released page by driver after isolation. + * With it, VM migration doesn't try to put it back. + */ + page->mapping = (void *)((unsigned long)page->mapping & + PAGE_MAPPING_MOVABLE); +} +EXPORT_SYMBOL(__ClearPageMovable); + /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -368,12 +420,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, unsigned long flags = 0; bool locked = false; unsigned long blockpfn = *start_pfn; + unsigned int order; cursor = pfn_to_page(blockpfn); /* Isolate free pages. */ for (; blockpfn < end_pfn; blockpfn++, cursor++) { - int isolated, i; + int isolated; struct page *page = cursor; /* @@ -439,17 +492,17 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, goto isolate_fail; } - /* Found a free page, break it into order-0 pages */ - isolated = split_free_page(page); + /* Found a free page, will break it into order-0 pages */ + order = page_order(page); + isolated = __isolate_free_page(page, order); if (!isolated) break; + set_page_private(page, order); total_isolated += isolated; cc->nr_freepages += isolated; - for (i = 0; i < isolated; i++) { - list_add(&page->lru, freelist); - page++; - } + list_add_tail(&page->lru, freelist); + if (!strict && cc->nr_migratepages <= cc->nr_freepages) { blockpfn += isolated; break; @@ -568,7 +621,7 @@ isolate_freepages_range(struct compact_control *cc, */ } - /* split_free_page does not map the pages */ + /* __isolate_free_page() does not map the pages */ map_pages(&freelist); if (pfn < end_pfn) { @@ -670,7 +723,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Time to isolate some pages for migration */ for (; low_pfn < end_pfn; low_pfn++) { - bool is_lru; if (skip_on_failure && low_pfn >= next_skip_pfn) { /* @@ -733,21 +785,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* - * Check may be lockless but that's ok as we recheck later. - * It's possible to migrate LRU pages and balloon pages - * Skip any other type of page - */ - is_lru = PageLRU(page); - if (!is_lru) { - if (unlikely(balloon_page_movable(page))) { - if (balloon_page_isolate(page)) { - /* Successfully isolated */ - goto isolate_success; - } - } - } - - /* * Regardless of being on LRU, compound pages such as THP and * hugetlbfs are not to be compacted. We can potentially save * a lot of iterations if we skip them at once. The check is @@ -763,8 +800,30 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, goto isolate_fail; } - if (!is_lru) + /* + * Check may be lockless but that's ok as we recheck later. + * It's possible to migrate LRU and non-lru movable pages. + * Skip any other type of page + */ + if (!PageLRU(page)) { + /* + * __PageMovable can return false positive so we need + * to verify it under page_lock. 
+ */ + if (unlikely(__PageMovable(page)) && + !PageIsolated(page)) { + if (locked) { + spin_unlock_irqrestore(&zone->lru_lock, + flags); + locked = false; + } + + if (isolate_movable_page(page, isolate_mode)) + goto isolate_success; + } + goto isolate_fail; + } /* * Migration will fail if an anonymous page is pinned in memory, @@ -1067,7 +1126,7 @@ static void isolate_freepages(struct compact_control *cc) } } - /* split_free_page does not map the pages */ + /* __isolate_free_page() does not map the pages */ map_pages(freelist); /* @@ -1630,7 +1689,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, *contended = COMPACT_CONTENDED_NONE; /* Check if the GFP flags allow compaction */ - if (!order || !may_enter_fs || !may_perform_io) + if (!may_enter_fs || !may_perform_io) return COMPACT_SKIPPED; trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); diff --git a/mm/filemap.c b/mm/filemap.c index 20f3b1f33f0e..e90c1543ec2d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -114,14 +114,14 @@ static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { struct radix_tree_node *node; + int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page); - VM_BUG_ON(!PageLocked(page)); - - node = radix_tree_replace_clear_tags(&mapping->page_tree, page->index, - shadow); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(nr != 1 && shadow, page); if (shadow) { - mapping->nrexceptional++; + mapping->nrexceptional += nr; /* * Make sure the nrexceptional update is committed before * the nrpages update so that final truncate racing @@ -130,31 +130,38 @@ static void page_cache_tree_delete(struct address_space *mapping, */ smp_wmb(); } - mapping->nrpages--; + mapping->nrpages -= nr; - if (!node) - return; - - workingset_node_pages_dec(node); - if (shadow) - workingset_node_shadows_inc(node); - else - if (__radix_tree_delete_node(&mapping->page_tree, node)) + for (i = 0; i < nr; i++) { + node = radix_tree_replace_clear_tags(&mapping->page_tree, + page->index + i, shadow); + if (!node) { + VM_BUG_ON_PAGE(nr != 1, page); return; + } - /* - * Track node that only contains shadow entries. DAX mappings contain - * no shadow entries and may contain other exceptional entries so skip - * those. - * - * Avoid acquiring the list_lru lock if already tracked. The - * list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!dax_mapping(mapping) && !workingset_node_pages(node) && - list_empty(&node->private_list)) { - node->private_data = mapping; - list_lru_add(&workingset_shadow_nodes, &node->private_list); + workingset_node_pages_dec(node); + if (shadow) + workingset_node_shadows_inc(node); + else + if (__radix_tree_delete_node(&mapping->page_tree, node)) + continue; + + /* + * Track node that only contains shadow entries. DAX mappings + * contain no shadow entries and may contain other exceptional + * entries so skip those. + * + * Avoid acquiring the list_lru lock if already tracked. + * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. 
+ */ + if (!dax_mapping(mapping) && !workingset_node_pages(node) && + list_empty(&node->private_list)) { + node->private_data = mapping; + list_lru_add(&workingset_shadow_nodes, + &node->private_list); + } } } @@ -166,6 +173,7 @@ static void page_cache_tree_delete(struct address_space *mapping, void __delete_from_page_cache(struct page *page, void *shadow) { struct address_space *mapping = page->mapping; + int nr = hpage_nr_pages(page); trace_mm_filemap_delete_from_page_cache(page); /* @@ -178,6 +186,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) else cleancache_invalidate_page(mapping, page); + VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(page_mapped(page), page); if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { int mapcount; @@ -209,9 +218,14 @@ void __delete_from_page_cache(struct page *page, void *shadow) /* hugetlb pages do not participate in page cache accounting. */ if (!PageHuge(page)) - __dec_zone_page_state(page, NR_FILE_PAGES); - if (PageSwapBacked(page)) - __dec_zone_page_state(page, NR_SHMEM); + __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, -nr); + if (PageSwapBacked(page)) { + __mod_zone_page_state(page_zone(page), NR_SHMEM, -nr); + if (PageTransHuge(page)) + __dec_zone_page_state(page, NR_SHMEM_THPS); + } else { + VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page); + } /* * At this point page must be either written or cleaned by truncate. @@ -235,9 +249,8 @@ void __delete_from_page_cache(struct page *page, void *shadow) */ void delete_from_page_cache(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); unsigned long flags; - void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); @@ -250,7 +263,13 @@ void delete_from_page_cache(struct page *page) if (freepage) freepage(page); - put_page(page); + + if (PageTransHuge(page) && !PageHuge(page)) { + page_ref_sub(page, HPAGE_PMD_NR); + VM_BUG_ON_PAGE(page_count(page) <= 0, page); + } else { + put_page(page); + } } EXPORT_SYMBOL(delete_from_page_cache); @@ -1053,7 +1072,7 @@ EXPORT_SYMBOL(page_cache_prev_hole); struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { void **pagep; - struct page *page; + struct page *head, *page; rcu_read_lock(); repeat: @@ -1073,16 +1092,24 @@ repeat: */ goto out; } - if (!page_cache_get_speculative(page)) + + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto repeat; + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } + /* * Has the page moved? * This is part of the lockless pagecache protocol. See * include/linux/pagemap.h for details. */ if (unlikely(page != *pagep)) { - put_page(page); + put_page(head); goto repeat; } } @@ -1118,12 +1145,12 @@ repeat: if (page && !radix_tree_exception(page)) { lock_page(page); /* Has the page been truncated? 
*/ - if (unlikely(page->mapping != mapping)) { + if (unlikely(page_mapping(page) != mapping)) { unlock_page(page); put_page(page); goto repeat; } - VM_BUG_ON_PAGE(page->index != offset, page); + VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); } return page; } @@ -1255,7 +1282,7 @@ unsigned find_get_entries(struct address_space *mapping, rcu_read_lock(); radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - struct page *page; + struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1272,12 +1299,20 @@ repeat: */ goto export; } - if (!page_cache_get_speculative(page)) + + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto repeat; + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } + /* Has the page moved? */ if (unlikely(page != *slot)) { - put_page(page); + put_page(head); goto repeat; } export: @@ -1318,7 +1353,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, rcu_read_lock(); radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - struct page *page; + struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1337,12 +1372,19 @@ repeat: continue; } - if (!page_cache_get_speculative(page)) + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto repeat; + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } + /* Has the page moved? */ if (unlikely(page != *slot)) { - put_page(page); + put_page(head); goto repeat; } @@ -1379,7 +1421,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, rcu_read_lock(); radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { - struct page *page; + struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); /* The hole, there no reason to continue */ @@ -1399,12 +1441,19 @@ repeat: break; } - if (!page_cache_get_speculative(page)) + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto repeat; + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } + /* Has the page moved? */ if (unlikely(page != *slot)) { - put_page(page); + put_page(head); goto repeat; } @@ -1413,7 +1462,7 @@ repeat: * otherwise we can get both false positives and false * negatives, which is just confusing to the caller. */ - if (page->mapping == NULL || page->index != iter.index) { + if (page->mapping == NULL || page_to_pgoff(page) != iter.index) { put_page(page); break; } @@ -1451,7 +1500,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, rcu_read_lock(); radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, *index, tag) { - struct page *page; + struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1476,12 +1525,19 @@ repeat: continue; } - if (!page_cache_get_speculative(page)) + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto repeat; + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } + /* Has the page moved? 
*/ if (unlikely(page != *slot)) { - put_page(page); + put_page(head); goto repeat; } @@ -1525,7 +1581,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, rcu_read_lock(); radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, tag) { - struct page *page; + struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1543,12 +1599,20 @@ repeat: */ goto export; } - if (!page_cache_get_speculative(page)) + + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto repeat; + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } + /* Has the page moved? */ if (unlikely(page != *slot)) { - put_page(page); + put_page(head); goto repeat; } export: @@ -2128,21 +2192,21 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +void filemap_map_pages(struct fault_env *fe, + pgoff_t start_pgoff, pgoff_t end_pgoff) { struct radix_tree_iter iter; void **slot; - struct file *file = vma->vm_file; + struct file *file = fe->vma->vm_file; struct address_space *mapping = file->f_mapping; + pgoff_t last_pgoff = start_pgoff; loff_t size; - struct page *page; - unsigned long address = (unsigned long) vmf->virtual_address; - unsigned long addr; - pte_t *pte; + struct page *head, *page; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { - if (iter.index > vmf->max_pgoff) + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, + start_pgoff) { + if (iter.index > end_pgoff) break; repeat: page = radix_tree_deref_slot(slot); @@ -2156,12 +2220,19 @@ repeat: goto next; } - if (!page_cache_get_speculative(page)) + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto repeat; + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } + /* Has the page moved? */ if (unlikely(page != *slot)) { - put_page(page); + put_page(head); goto repeat; } @@ -2179,14 +2250,15 @@ repeat: if (page->index >= size >> PAGE_SHIFT) goto unlock; - pte = vmf->pte + page->index - vmf->pgoff; - if (!pte_none(*pte)) - goto unlock; - if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; - do_set_pte(vma, addr, page, pte, false, false); + + fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; + if (fe->pte) + fe->pte += iter.index - last_pgoff; + last_pgoff = iter.index; + if (alloc_set_pte(fe, NULL, page)) + goto unlock; unlock_page(page); goto next; unlock: @@ -2194,7 +2266,10 @@ unlock: skip: put_page(page); next: - if (iter.index == vmf->max_pgoff) + /* Huge page is mapped? No need to proceed. */ + if (pmd_trans_huge(*fe->pmd)) + break; + if (iter.index == end_pgoff) break; } rcu_read_unlock(); diff --git a/mm/frontswap.c b/mm/frontswap.c index 27a9924caf61..fec8b5044040 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -20,6 +20,8 @@ #include <linux/frontswap.h> #include <linux/swapfile.h> +DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key); + /* * frontswap_ops are added by frontswap_register_ops, and provide the * frontswap "backend" implementation functions. 
Multiple implementations @@ -139,6 +141,8 @@ void frontswap_register_ops(struct frontswap_ops *ops) ops->next = frontswap_ops; } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next); + static_branch_inc(&frontswap_enabled_key); + spin_lock(&swap_lock); plist_for_each_entry(si, &swap_active_head, list) { if (si->frontswap_map) @@ -189,7 +193,7 @@ void __frontswap_init(unsigned type, unsigned long *map) struct swap_info_struct *sis = swap_info[type]; struct frontswap_ops *ops; - BUG_ON(sis == NULL); + VM_BUG_ON(sis == NULL); /* * p->frontswap is a bitmap that we MUST have to figure out which page @@ -248,15 +252,9 @@ int __frontswap_store(struct page *page) pgoff_t offset = swp_offset(entry); struct frontswap_ops *ops; - /* - * Return if no backend registed. - * Don't need to inc frontswap_failed_stores here. - */ - if (!frontswap_ops) - return -1; - - BUG_ON(!PageLocked(page)); - BUG_ON(sis == NULL); + VM_BUG_ON(!frontswap_ops); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(sis == NULL); /* * If a dup, we must remove the old page first; we can't leave the @@ -303,11 +301,10 @@ int __frontswap_load(struct page *page) pgoff_t offset = swp_offset(entry); struct frontswap_ops *ops; - if (!frontswap_ops) - return -1; + VM_BUG_ON(!frontswap_ops); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(sis == NULL); - BUG_ON(!PageLocked(page)); - BUG_ON(sis == NULL); if (!__frontswap_test(sis, offset)) return -1; @@ -337,10 +334,9 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset) struct swap_info_struct *sis = swap_info[type]; struct frontswap_ops *ops; - if (!frontswap_ops) - return; + VM_BUG_ON(!frontswap_ops); + VM_BUG_ON(sis == NULL); - BUG_ON(sis == NULL); if (!__frontswap_test(sis, offset)) return; @@ -360,10 +356,9 @@ void __frontswap_invalidate_area(unsigned type) struct swap_info_struct *sis = swap_info[type]; struct frontswap_ops *ops; - if (!frontswap_ops) - return; + VM_BUG_ON(!frontswap_ops); + VM_BUG_ON(sis == NULL); - BUG_ON(sis == NULL); if (sis->frontswap_map == NULL) return; @@ -279,6 +279,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma, spin_unlock(ptl); ret = 0; split_huge_pmd(vma, pmd, address); + if (pmd_trans_unstable(pmd)) + ret = -EBUSY; } else { get_page(page); spin_unlock(ptl); @@ -286,6 +288,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma, ret = split_huge_page(page); unlock_page(page); put_page(page); + if (pmd_none(*pmd)) + return no_page_table(vma, flags); } return ret ? 
ERR_PTR(ret) : @@ -350,7 +354,6 @@ unmap: static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned long address, unsigned int *flags, int *nonblocking) { - struct mm_struct *mm = vma->vm_mm; unsigned int fault_flags = 0; int ret; @@ -375,7 +378,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_TRIED; } - ret = handle_mm_fault(mm, vma, address, fault_flags); + ret = handle_mm_fault(vma, address, fault_flags); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return -ENOMEM; @@ -690,7 +693,7 @@ retry: if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; - ret = handle_mm_fault(mm, vma, address, fault_flags); + ret = handle_mm_fault(vma, address, fault_flags); major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9ed58530f695..848c16caf8f8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -18,7 +18,6 @@ #include <linux/mm_inline.h> #include <linux/swapops.h> #include <linux/dax.h> -#include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> #include <linux/pfn_t.h> @@ -30,39 +29,12 @@ #include <linux/hashtable.h> #include <linux/userfaultfd_k.h> #include <linux/page_idle.h> +#include <linux/shmem_fs.h> #include <asm/tlb.h> #include <asm/pgalloc.h> #include "internal.h" -enum scan_result { - SCAN_FAIL, - SCAN_SUCCEED, - SCAN_PMD_NULL, - SCAN_EXCEED_NONE_PTE, - SCAN_PTE_NON_PRESENT, - SCAN_PAGE_RO, - SCAN_NO_REFERENCED_PAGE, - SCAN_PAGE_NULL, - SCAN_SCAN_ABORT, - SCAN_PAGE_COUNT, - SCAN_PAGE_LRU, - SCAN_PAGE_LOCK, - SCAN_PAGE_ANON, - SCAN_PAGE_COMPOUND, - SCAN_ANY_PROCESS, - SCAN_VMA_NULL, - SCAN_VMA_CHECK, - SCAN_ADDRESS_RANGE, - SCAN_SWAP_CACHE_PAGE, - SCAN_DEL_PAGE_LRU, - SCAN_ALLOC_HUGE_PAGE_FAIL, - SCAN_CGROUP_CHARGE_FAIL -}; - -#define CREATE_TRACE_POINTS -#include <trace/events/huge_memory.h> - /* * By default transparent hugepage support is disabled in order that avoid * to risk increase the memory footprint of applications without a guaranteed @@ -82,127 +54,8 @@ unsigned long transparent_hugepage_flags __read_mostly = (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); -/* default scan 8*512 pte (or vmas) every 30 second */ -static unsigned int khugepaged_pages_to_scan __read_mostly; -static unsigned int khugepaged_pages_collapsed; -static unsigned int khugepaged_full_scans; -static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; -/* during fragmentation poll the hugepage allocator once every minute */ -static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; -static unsigned long khugepaged_sleep_expire; -static struct task_struct *khugepaged_thread __read_mostly; -static DEFINE_MUTEX(khugepaged_mutex); -static DEFINE_SPINLOCK(khugepaged_mm_lock); -static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); -/* - * default collapse hugepages if there is at least one pte mapped like - * it would have happened if the vma was large enough during page - * fault. 
- */ -static unsigned int khugepaged_max_ptes_none __read_mostly; - -static int khugepaged(void *none); -static int khugepaged_slab_init(void); -static void khugepaged_slab_exit(void); - -#define MM_SLOTS_HASH_BITS 10 -static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); - -static struct kmem_cache *mm_slot_cache __read_mostly; - -/** - * struct mm_slot - hash lookup from mm to mm_slot - * @hash: hash collision list - * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head - * @mm: the mm that this information is valid for - */ -struct mm_slot { - struct hlist_node hash; - struct list_head mm_node; - struct mm_struct *mm; -}; - -/** - * struct khugepaged_scan - cursor for scanning - * @mm_head: the head of the mm list to scan - * @mm_slot: the current mm_slot we are scanning - * @address: the next address inside that to be scanned - * - * There is only the one khugepaged_scan instance of this cursor structure. - */ -struct khugepaged_scan { - struct list_head mm_head; - struct mm_slot *mm_slot; - unsigned long address; -}; -static struct khugepaged_scan khugepaged_scan = { - .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), -}; - static struct shrinker deferred_split_shrinker; -static void set_recommended_min_free_kbytes(void) -{ - struct zone *zone; - int nr_zones = 0; - unsigned long recommended_min; - - for_each_populated_zone(zone) - nr_zones++; - - /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ - recommended_min = pageblock_nr_pages * nr_zones * 2; - - /* - * Make sure that on average at least two pageblocks are almost free - * of another type, one for a migratetype to fall back to and a - * second to avoid subsequent fallbacks of other types There are 3 - * MIGRATE_TYPES we care about. - */ - recommended_min += pageblock_nr_pages * nr_zones * - MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; - - /* don't ever allow to reserve more than 5% of the lowmem */ - recommended_min = min(recommended_min, - (unsigned long) nr_free_buffer_pages() / 20); - recommended_min <<= (PAGE_SHIFT-10); - - if (recommended_min > min_free_kbytes) { - if (user_min_free_kbytes >= 0) - pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", - min_free_kbytes, recommended_min); - - min_free_kbytes = recommended_min; - } - setup_per_zone_wmarks(); -} - -static int start_stop_khugepaged(void) -{ - int err = 0; - if (khugepaged_enabled()) { - if (!khugepaged_thread) - khugepaged_thread = kthread_run(khugepaged, NULL, - "khugepaged"); - if (IS_ERR(khugepaged_thread)) { - pr_err("khugepaged: kthread_run(khugepaged) failed\n"); - err = PTR_ERR(khugepaged_thread); - khugepaged_thread = NULL; - goto fail; - } - - if (!list_empty(&khugepaged_scan.mm_head)) - wake_up_interruptible(&khugepaged_wait); - - set_recommended_min_free_kbytes(); - } else if (khugepaged_thread) { - kthread_stop(khugepaged_thread); - khugepaged_thread = NULL; - } -fail: - return err; -} - static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; @@ -328,12 +181,7 @@ static ssize_t enabled_store(struct kobject *kobj, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); if (ret > 0) { - int err; - - mutex_lock(&khugepaged_mutex); - err = start_stop_khugepaged(); - mutex_unlock(&khugepaged_mutex); - + int err = start_stop_khugepaged(); if (err) ret = err; } @@ -343,7 +191,7 @@ static ssize_t enabled_store(struct kobject *kobj, static struct kobj_attribute enabled_attr = __ATTR(enabled, 0644, enabled_show, enabled_store); -static ssize_t single_flag_show(struct 
kobject *kobj, +ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag) { @@ -351,7 +199,7 @@ static ssize_t single_flag_show(struct kobject *kobj, !!test_bit(flag, &transparent_hugepage_flags)); } -static ssize_t single_flag_store(struct kobject *kobj, +ssize_t single_hugepage_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag flag) @@ -406,13 +254,13 @@ static struct kobj_attribute defrag_attr = static ssize_t use_zero_page_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return single_flag_show(kobj, attr, buf, + return single_hugepage_flag_show(kobj, attr, buf, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); } static ssize_t use_zero_page_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - return single_flag_store(kobj, attr, buf, count, + return single_hugepage_flag_store(kobj, attr, buf, count, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); } static struct kobj_attribute use_zero_page_attr = @@ -421,14 +269,14 @@ static struct kobj_attribute use_zero_page_attr = static ssize_t debug_cow_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return single_flag_show(kobj, attr, buf, + return single_hugepage_flag_show(kobj, attr, buf, TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); } static ssize_t debug_cow_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - return single_flag_store(kobj, attr, buf, count, + return single_hugepage_flag_store(kobj, attr, buf, count, TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); } static struct kobj_attribute debug_cow_attr = @@ -439,6 +287,9 @@ static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, &use_zero_page_attr.attr, +#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) + &shmem_enabled_attr.attr, +#endif #ifdef CONFIG_DEBUG_VM &debug_cow_attr.attr, #endif @@ -449,171 +300,6 @@ static struct attribute_group hugepage_attr_group = { .attrs = hugepage_attr, }; -static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); -} - -static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long msecs; - int err; - - err = kstrtoul(buf, 10, &msecs); - if (err || msecs > UINT_MAX) - return -EINVAL; - - khugepaged_scan_sleep_millisecs = msecs; - khugepaged_sleep_expire = 0; - wake_up_interruptible(&khugepaged_wait); - - return count; -} -static struct kobj_attribute scan_sleep_millisecs_attr = - __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, - scan_sleep_millisecs_store); - -static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); -} - -static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long msecs; - int err; - - err = kstrtoul(buf, 10, &msecs); - if (err || msecs > UINT_MAX) - return -EINVAL; - - khugepaged_alloc_sleep_millisecs = msecs; - khugepaged_sleep_expire = 0; - wake_up_interruptible(&khugepaged_wait); - - return count; -} -static struct kobj_attribute alloc_sleep_millisecs_attr = - __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, - 
alloc_sleep_millisecs_store); - -static ssize_t pages_to_scan_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", khugepaged_pages_to_scan); -} -static ssize_t pages_to_scan_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int err; - unsigned long pages; - - err = kstrtoul(buf, 10, &pages); - if (err || !pages || pages > UINT_MAX) - return -EINVAL; - - khugepaged_pages_to_scan = pages; - - return count; -} -static struct kobj_attribute pages_to_scan_attr = - __ATTR(pages_to_scan, 0644, pages_to_scan_show, - pages_to_scan_store); - -static ssize_t pages_collapsed_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", khugepaged_pages_collapsed); -} -static struct kobj_attribute pages_collapsed_attr = - __ATTR_RO(pages_collapsed); - -static ssize_t full_scans_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", khugepaged_full_scans); -} -static struct kobj_attribute full_scans_attr = - __ATTR_RO(full_scans); - -static ssize_t khugepaged_defrag_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return single_flag_show(kobj, attr, buf, - TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); -} -static ssize_t khugepaged_defrag_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - return single_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); -} -static struct kobj_attribute khugepaged_defrag_attr = - __ATTR(defrag, 0644, khugepaged_defrag_show, - khugepaged_defrag_store); - -/* - * max_ptes_none controls if khugepaged should collapse hugepages over - * any unmapped ptes in turn potentially increasing the memory - * footprint of the vmas. When max_ptes_none is 0 khugepaged will not - * reduce the available free memory in the system as it - * runs. Increasing max_ptes_none will instead potentially reduce the - * free memory in the system during the khugepaged scan. 
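
The trade-off described in the max_ptes_none comment above comes down to one counter in the isolation and scan loops further below: every empty or zero-page PTE inside the PMD-sized range bumps none_or_zero, and once it exceeds khugepaged_max_ptes_none the collapse is abandoned. A condensed sketch of that check, reusing the names from those loops:

	int none_or_zero = 0, result;

	for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) {
		pte_t pteval = *_pte;

		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
			if (!userfaultfd_armed(vma) &&
			    ++none_or_zero <= khugepaged_max_ptes_none)
				continue;		/* sparse, but still worth collapsing */
			result = SCAN_EXCEED_NONE_PTE;	/* too many holes: give up */
			goto out;
		}
		/* ... remaining per-pte checks ... */
	}
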
- */ -static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", khugepaged_max_ptes_none); -} -static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int err; - unsigned long max_ptes_none; - - err = kstrtoul(buf, 10, &max_ptes_none); - if (err || max_ptes_none > HPAGE_PMD_NR-1) - return -EINVAL; - - khugepaged_max_ptes_none = max_ptes_none; - - return count; -} -static struct kobj_attribute khugepaged_max_ptes_none_attr = - __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, - khugepaged_max_ptes_none_store); - -static struct attribute *khugepaged_attr[] = { - &khugepaged_defrag_attr.attr, - &khugepaged_max_ptes_none_attr.attr, - &pages_to_scan_attr.attr, - &pages_collapsed_attr.attr, - &full_scans_attr.attr, - &scan_sleep_millisecs_attr.attr, - &alloc_sleep_millisecs_attr.attr, - NULL, -}; - -static struct attribute_group khugepaged_attr_group = { - .attrs = khugepaged_attr, - .name = "khugepaged", -}; - static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) { int err; @@ -672,8 +358,6 @@ static int __init hugepage_init(void) return -EINVAL; } - khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; - khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; /* * hugepages can't be allocated by the buddy allocator */ @@ -688,7 +372,7 @@ static int __init hugepage_init(void) if (err) goto err_sysfs; - err = khugepaged_slab_init(); + err = khugepaged_init(); if (err) goto err_slab; @@ -719,7 +403,7 @@ err_khugepaged: err_split_shrinker: unregister_shrinker(&huge_zero_page_shrinker); err_hzp_shrinker: - khugepaged_slab_exit(); + khugepaged_destroy(); err_slab: hugepage_exit_sysfs(hugepage_kobj); err_sysfs: @@ -765,11 +449,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd; } -static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) -{ - return pmd_mkhuge(mk_pmd(page, prot)); -} - static inline struct list_head *page_deferred_list(struct page *page) { /* @@ -790,26 +469,23 @@ void prep_transhuge_page(struct page *page) set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } -static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - struct page *page, gfp_t gfp, - unsigned int flags) +static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, + gfp_t gfp) { + struct vm_area_struct *vma = fe->vma; struct mem_cgroup *memcg; pgtable_t pgtable; - spinlock_t *ptl; - unsigned long haddr = address & HPAGE_PMD_MASK; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - pgtable = pte_alloc_one(mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) { mem_cgroup_cancel_charge(page, memcg, true); put_page(page); @@ -824,12 +500,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, */ __SetPageUptodate(page); - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_none(*pmd))) { - spin_unlock(ptl); + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + if (unlikely(!pmd_none(*fe->pmd))) { + spin_unlock(fe->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); - pte_free(mm, pgtable); + pte_free(vma->vm_mm, pgtable); } else { pmd_t 
entry; @@ -837,12 +513,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, if (userfaultfd_missing(vma)) { int ret; - spin_unlock(ptl); + spin_unlock(fe->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); - pte_free(mm, pgtable); - ret = handle_userfault(vma, address, flags, - VM_UFFD_MISSING); + pte_free(vma->vm_mm, pgtable); + ret = handle_userfault(fe, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } @@ -852,11 +527,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, page_add_new_anon_rmap(page, vma, haddr, true); mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); - pgtable_trans_huge_deposit(mm, pmd, pgtable); - set_pmd_at(mm, haddr, pmd, entry); - add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&mm->nr_ptes); - spin_unlock(ptl); + pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); + set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + atomic_long_inc(&vma->vm_mm->nr_ptes); + spin_unlock(fe->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -883,12 +558,6 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) return GFP_TRANSHUGE | reclaim_flags; } -/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ -static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) -{ - return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0); -} - /* Caller must hold page table lock. */ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, @@ -906,13 +575,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return true; } -int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - unsigned int flags) +int do_huge_pmd_anonymous_page(struct fault_env *fe) { + struct vm_area_struct *vma = fe->vma; gfp_t gfp; struct page *page; - unsigned long haddr = address & HPAGE_PMD_MASK; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return VM_FAULT_FALLBACK; @@ -920,42 +588,40 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; - if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) && + if (!(fe->flags & FAULT_FLAG_WRITE) && + !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { - spinlock_t *ptl; pgtable_t pgtable; struct page *zero_page; bool set; int ret; - pgtable = pte_alloc_one(mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; zero_page = get_huge_zero_page(); if (unlikely(!zero_page)) { - pte_free(mm, pgtable); + pte_free(vma->vm_mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - ptl = pmd_lock(mm, pmd); + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); ret = 0; set = false; - if (pmd_none(*pmd)) { + if (pmd_none(*fe->pmd)) { if (userfaultfd_missing(vma)) { - spin_unlock(ptl); - ret = handle_userfault(vma, address, flags, - VM_UFFD_MISSING); + spin_unlock(fe->ptl); + ret = handle_userfault(fe, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); } else { - set_huge_zero_page(pgtable, mm, vma, - haddr, pmd, - zero_page); - spin_unlock(ptl); + set_huge_zero_page(pgtable, vma->vm_mm, vma, + haddr, 
fe->pmd, zero_page); + spin_unlock(fe->ptl); set = true; } } else - spin_unlock(ptl); + spin_unlock(fe->ptl); if (!set) { - pte_free(mm, pgtable); + pte_free(vma->vm_mm, pgtable); put_huge_zero_page(); } return ret; @@ -967,8 +633,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; } prep_transhuge_page(page); - return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, - flags); + return __do_huge_pmd_anonymous_page(fe, page, gfp); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -1080,14 +745,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct page *src_page; pmd_t pmd; pgtable_t pgtable = NULL; - int ret; + int ret = -ENOMEM; - if (!vma_is_dax(vma)) { - ret = -ENOMEM; - pgtable = pte_alloc_one(dst_mm, addr); - if (unlikely(!pgtable)) - goto out; - } + /* Skip if can be re-fill on fault */ + if (!vma_is_anonymous(vma)) + return 0; + + pgtable = pte_alloc_one(dst_mm, addr); + if (unlikely(!pgtable)) + goto out; dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); @@ -1095,7 +761,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; - if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) { + if (unlikely(!pmd_trans_huge(pmd))) { pte_free(dst_mm, pgtable); goto out_unlock; } @@ -1118,16 +784,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_unlock; } - if (!vma_is_dax(vma)) { - /* thp accounting separate from pmd_devmap accounting */ - src_page = pmd_page(pmd); - VM_BUG_ON_PAGE(!PageHead(src_page), src_page); - get_page(src_page); - page_dup_rmap(src_page, true); - add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&dst_mm->nr_ptes); - pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); - } + src_page = pmd_page(pmd); + VM_BUG_ON_PAGE(!PageHead(src_page), src_page); + get_page(src_page); + page_dup_rmap(src_page, true); + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + atomic_long_inc(&dst_mm->nr_ptes); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); @@ -1141,38 +804,31 @@ out: return ret; } -void huge_pmd_set_accessed(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmd, pmd_t orig_pmd, - int dirty) +void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) { - spinlock_t *ptl; pmd_t entry; unsigned long haddr; - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, orig_pmd))) + fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); + if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) goto unlock; entry = pmd_mkyoung(orig_pmd); - haddr = address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) - update_mmu_cache_pmd(vma, address, pmd); + haddr = fe->address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, + fe->flags & FAULT_FLAG_WRITE)) + update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); unlock: - spin_unlock(ptl); + spin_unlock(fe->ptl); } -static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmd, pmd_t orig_pmd, - struct page *page, - unsigned long haddr) +static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, + struct page *page) { + struct vm_area_struct *vma = fe->vma; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; struct mem_cgroup 
*memcg; - spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; int ret = 0, i; @@ -1189,11 +845,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, for (i = 0; i < HPAGE_PMD_NR; i++) { pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | - __GFP_OTHER_NODE, - vma, address, page_to_nid(page)); + __GFP_OTHER_NODE, vma, + fe->address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, - &memcg, false))) { + mem_cgroup_try_charge(pages[i], vma->vm_mm, + GFP_KERNEL, &memcg, false))) { if (pages[i]) put_page(pages[i]); while (--i >= 0) { @@ -1219,41 +875,41 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, mmun_start = haddr; mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, orig_pmd))) + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); - pmdp_huge_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm, pmd); - pmd_populate(mm, &_pmd, pgtable); + pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); + pmd_populate(vma->vm_mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t *pte, entry; + pte_t entry; entry = mk_pte(pages[i], vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], vma, haddr, false); + page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); mem_cgroup_commit_charge(pages[i], memcg, false, false); lru_cache_add_active_or_unevictable(pages[i], vma); - pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*pte)); - set_pte_at(mm, haddr, pte, entry); - pte_unmap(pte); + fe->pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*fe->pte)); + set_pte_at(vma->vm_mm, haddr, fe->pte, entry); + pte_unmap(fe->pte); } kfree(pages); smp_wmb(); /* make pte visible before pmd */ - pmd_populate(mm, pmd, pgtable); + pmd_populate(vma->vm_mm, fe->pmd, pgtable); page_remove_rmap(page, true); - spin_unlock(ptl); + spin_unlock(fe->ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); ret |= VM_FAULT_WRITE; put_page(page); @@ -1262,8 +918,8 @@ out: return ret; out_free_pages: - spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + spin_unlock(fe->ptl); + mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); @@ -1274,25 +930,23 @@ out_free_pages: goto out; } -int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, pmd_t orig_pmd) +int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) { - spinlock_t *ptl; - int ret = 0; + struct vm_area_struct *vma = fe->vma; struct page *page = NULL, *new_page; struct mem_cgroup *memcg; - unsigned long haddr; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t huge_gfp; /* for allocation and charge */ + int ret = 0; - ptl = pmd_lockptr(mm, 
pmd); + fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); - haddr = address & HPAGE_PMD_MASK; if (is_huge_zero_pmd(orig_pmd)) goto alloc; - spin_lock(ptl); - if (unlikely(!pmd_same(*pmd, orig_pmd))) + spin_lock(fe->ptl); + if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) goto out_unlock; page = pmd_page(orig_pmd); @@ -1305,13 +959,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) - update_mmu_cache_pmd(vma, address, pmd); + if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) + update_mmu_cache_pmd(vma, fe->address, fe->pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } get_page(page); - spin_unlock(ptl); + spin_unlock(fe->ptl); alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { @@ -1324,13 +978,12 @@ alloc: prep_transhuge_page(new_page); } else { if (!page) { - split_huge_pmd(vma, pmd, address); + split_huge_pmd(vma, fe->pmd, fe->address); ret |= VM_FAULT_FALLBACK; } else { - ret = do_huge_pmd_wp_page_fallback(mm, vma, address, - pmd, orig_pmd, page, haddr); + ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); if (ret & VM_FAULT_OOM) { - split_huge_pmd(vma, pmd, address); + split_huge_pmd(vma, fe->pmd, fe->address); ret |= VM_FAULT_FALLBACK; } put_page(page); @@ -1339,14 +992,12 @@ alloc: goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg, - true))) { + if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, + huge_gfp, &memcg, true))) { put_page(new_page); - if (page) { - split_huge_pmd(vma, pmd, address); + split_huge_pmd(vma, fe->pmd, fe->address); + if (page) put_page(page); - } else - split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; count_vm_event(THP_FAULT_FALLBACK); goto out; @@ -1362,13 +1013,13 @@ alloc: mmun_start = haddr; mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - spin_lock(ptl); + spin_lock(fe->ptl); if (page) put_page(page); - if (unlikely(!pmd_same(*pmd, orig_pmd))) { - spin_unlock(ptl); + if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { + spin_unlock(fe->ptl); mem_cgroup_cancel_charge(new_page, memcg, true); put_page(new_page); goto out_mn; @@ -1376,14 +1027,14 @@ alloc: pmd_t entry; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_huge_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); page_add_new_anon_rmap(new_page, vma, haddr, true); mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); - set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache_pmd(vma, address, pmd); + set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + update_mmu_cache_pmd(vma, fe->address, fe->pmd); if (!page) { - add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); put_huge_zero_page(); } else { VM_BUG_ON_PAGE(!PageHead(page), page); @@ -1392,13 +1043,13 @@ alloc: } ret |= VM_FAULT_WRITE; } - spin_unlock(ptl); + spin_unlock(fe->ptl); out_mn: - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); out: return ret; out_unlock: - spin_unlock(ptl); + spin_unlock(fe->ptl); return ret; } @@ -1432,6 +1083,8 @@ struct 
page *follow_trans_huge_pmd(struct vm_area_struct *vma, * We don't mlock() pte-mapped THPs. This way we can avoid * leaking mlocked pages into non-VM_LOCKED VMAs. * + * For anon THP: + * * In most cases the pmd is the only mapping of the page as we * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for * writable private mappings in populate_vma_page_range(). @@ -1439,15 +1092,26 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, * The only scenario when we have the page shared here is if we * mlocking read-only mapping shared over fork(). We skip * mlocking such pages. + * + * For file THP: + * + * We can expect PageDoubleMap() to be stable under page lock: + * for file pages we set it in page_add_file_rmap(), which + * requires page to be locked. */ - if (compound_mapcount(page) == 1 && !PageDoubleMap(page) && - page->mapping && trylock_page(page)) { - lru_add_drain(); - if (page->mapping) - mlock_vma_page(page); - unlock_page(page); - } + + if (PageAnon(page) && compound_mapcount(page) != 1) + goto skip_mlock; + if (PageDoubleMap(page) || !page->mapping) + goto skip_mlock; + if (!trylock_page(page)) + goto skip_mlock; + lru_add_drain(); + if (page->mapping && !PageDoubleMap(page)) + mlock_vma_page(page); + unlock_page(page); } +skip_mlock: page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page), page); if (flags & FOLL_GET) @@ -1458,13 +1122,12 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, pmd_t *pmdp) +int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) { - spinlock_t *ptl; + struct vm_area_struct *vma = fe->vma; struct anon_vma *anon_vma = NULL; struct page *page; - unsigned long haddr = addr & HPAGE_PMD_MASK; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); int target_nid, last_cpupid = -1; bool page_locked; @@ -1475,8 +1138,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* A PROT_NONE fault should not end up here */ BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); - ptl = pmd_lock(mm, pmdp); - if (unlikely(!pmd_same(pmd, *pmdp))) + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + if (unlikely(!pmd_same(pmd, *fe->pmd))) goto out_unlock; /* @@ -1484,9 +1147,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * without disrupting NUMA hinting information. Do not relock and * check_same as the page may no longer be mapped. 
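
Throughout these hunks the old (mm, vma, address, pmd, flags) parameter lists collapse into a single struct fault_env pointer, with the mm recovered as fe->vma->vm_mm. Going only by the fields dereferenced in this patch (fe->vma, fe->address, fe->flags, fe->pmd, fe->pte, fe->ptl), the structure has roughly this shape; treat it as a sketch, not the exact definition:

	struct fault_env {
		struct vm_area_struct *vma;	/* target VMA; the mm is vma->vm_mm */
		unsigned long address;		/* faulting virtual address */
		unsigned int flags;		/* FAULT_FLAG_xxx */
		pmd_t *pmd;			/* pmd entry covering 'address' */
		pte_t *pte;			/* pte mapped by the handler, if any */
		spinlock_t *ptl;		/* page table lock currently held */
	};
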
*/ - if (unlikely(pmd_trans_migrating(*pmdp))) { - page = pmd_page(*pmdp); - spin_unlock(ptl); + if (unlikely(pmd_trans_migrating(*fe->pmd))) { + page = pmd_page(*fe->pmd); + spin_unlock(fe->ptl); wait_on_page_locked(page); goto out; } @@ -1519,7 +1182,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { - spin_unlock(ptl); + spin_unlock(fe->ptl); wait_on_page_locked(page); page_nid = -1; goto out; @@ -1530,12 +1193,12 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * to serialises splits */ get_page(page); - spin_unlock(ptl); + spin_unlock(fe->ptl); anon_vma = page_lock_anon_vma_read(page); /* Confirm the PMD did not change while page_table_lock was released */ - spin_lock(ptl); - if (unlikely(!pmd_same(pmd, *pmdp))) { + spin_lock(fe->ptl); + if (unlikely(!pmd_same(pmd, *fe->pmd))) { unlock_page(page); put_page(page); page_nid = -1; @@ -1553,9 +1216,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * Migrate the THP to the requested node, returns with page unlocked * and access rights restored. */ - spin_unlock(ptl); - migrated = migrate_misplaced_transhuge_page(mm, vma, - pmdp, pmd, addr, page, target_nid); + spin_unlock(fe->ptl); + migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, + fe->pmd, pmd, fe->address, page, target_nid); if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; @@ -1570,18 +1233,18 @@ clear_pmdnuma: pmd = pmd_mkyoung(pmd); if (was_writable) pmd = pmd_mkwrite(pmd); - set_pmd_at(mm, haddr, pmdp, pmd); - update_mmu_cache_pmd(vma, addr, pmdp); + set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); + update_mmu_cache_pmd(vma, fe->address, fe->pmd); unlock_page(page); out_unlock: - spin_unlock(ptl); + spin_unlock(fe->ptl); out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); return 0; } @@ -1689,12 +1352,18 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page = pmd_page(orig_pmd); page_remove_rmap(page, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); - add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON_PAGE(!PageHead(page), page); - pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); - atomic_long_dec(&tlb->mm->nr_ptes); + if (PageAnon(page)) { + pgtable_t pgtable; + pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); + pte_free(tlb->mm, pgtable); + atomic_long_dec(&tlb->mm->nr_ptes); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + } else { + add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); + } spin_unlock(ptl); - tlb_remove_page(tlb, page); + tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); } return 1; } @@ -1784,7 +1453,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_mkwrite(entry); ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); - BUG_ON(!preserve_write && pmd_write(entry)); + BUG_ON(vma_is_anonymous(vma) && !preserve_write && + pmd_write(entry)); } spin_unlock(ptl); } @@ -1793,10 +1463,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } /* - * Returns true if a given pmd maps a thp, false otherwise. + * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. * - * Note that if it returns true, this routine returns without unlocking page - * table lock. So callers must unlock it. 
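
With the return type switched from a boolean to the page table lock pointer, a caller now tests the pointer and is still responsible for dropping the lock. A hypothetical caller would follow this pattern:

	spinlock_t *ptl;

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;			/* not a huge pmd, nothing to do */
	/* *pmd cannot stop being a huge pmd while ptl is held */
	/* ... operate on the huge pmd ... */
	spin_unlock(ptl);
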
+ * Note that if it returns page table lock pointer, this routine returns without + * unlocking page table lock. So callers must unlock it. */ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { @@ -1808,1040 +1478,6 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) return NULL; } -#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) - -int hugepage_madvise(struct vm_area_struct *vma, - unsigned long *vm_flags, int advice) -{ - switch (advice) { - case MADV_HUGEPAGE: -#ifdef CONFIG_S390 - /* - * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 - * can't handle this properly after s390_enable_sie, so we simply - * ignore the madvise to prevent qemu from causing a SIGSEGV. - */ - if (mm_has_pgste(vma->vm_mm)) - return 0; -#endif - /* - * Be somewhat over-protective like KSM for now! - */ - if (*vm_flags & VM_NO_THP) - return -EINVAL; - *vm_flags &= ~VM_NOHUGEPAGE; - *vm_flags |= VM_HUGEPAGE; - /* - * If the vma become good for khugepaged to scan, - * register it here without waiting a page fault that - * may not happen any time soon. - */ - if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags))) - return -ENOMEM; - break; - case MADV_NOHUGEPAGE: - /* - * Be somewhat over-protective like KSM for now! - */ - if (*vm_flags & VM_NO_THP) - return -EINVAL; - *vm_flags &= ~VM_HUGEPAGE; - *vm_flags |= VM_NOHUGEPAGE; - /* - * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning - * this vma even if we leave the mm registered in khugepaged if - * it got registered before VM_NOHUGEPAGE was set. - */ - break; - } - - return 0; -} - -static int __init khugepaged_slab_init(void) -{ - mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", - sizeof(struct mm_slot), - __alignof__(struct mm_slot), 0, NULL); - if (!mm_slot_cache) - return -ENOMEM; - - return 0; -} - -static void __init khugepaged_slab_exit(void) -{ - kmem_cache_destroy(mm_slot_cache); -} - -static inline struct mm_slot *alloc_mm_slot(void) -{ - if (!mm_slot_cache) /* initialization failed */ - return NULL; - return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); -} - -static inline void free_mm_slot(struct mm_slot *mm_slot) -{ - kmem_cache_free(mm_slot_cache, mm_slot); -} - -static struct mm_slot *get_mm_slot(struct mm_struct *mm) -{ - struct mm_slot *mm_slot; - - hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) - if (mm == mm_slot->mm) - return mm_slot; - - return NULL; -} - -static void insert_to_mm_slots_hash(struct mm_struct *mm, - struct mm_slot *mm_slot) -{ - mm_slot->mm = mm; - hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); -} - -static inline int khugepaged_test_exit(struct mm_struct *mm) -{ - return atomic_read(&mm->mm_users) == 0; -} - -int __khugepaged_enter(struct mm_struct *mm) -{ - struct mm_slot *mm_slot; - int wakeup; - - mm_slot = alloc_mm_slot(); - if (!mm_slot) - return -ENOMEM; - - /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); - if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { - free_mm_slot(mm_slot); - return 0; - } - - spin_lock(&khugepaged_mm_lock); - insert_to_mm_slots_hash(mm, mm_slot); - /* - * Insert just behind the scanning cursor, to let the area settle - * down a little. 
- */ - wakeup = list_empty(&khugepaged_scan.mm_head); - list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); - spin_unlock(&khugepaged_mm_lock); - - atomic_inc(&mm->mm_count); - if (wakeup) - wake_up_interruptible(&khugepaged_wait); - - return 0; -} - -int khugepaged_enter_vma_merge(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - unsigned long hstart, hend; - if (!vma->anon_vma) - /* - * Not yet faulted in so we will register later in the - * page fault if needed. - */ - return 0; - if (vma->vm_ops || (vm_flags & VM_NO_THP)) - /* khugepaged not yet working on file or special mappings */ - return 0; - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = vma->vm_end & HPAGE_PMD_MASK; - if (hstart < hend) - return khugepaged_enter(vma, vm_flags); - return 0; -} - -void __khugepaged_exit(struct mm_struct *mm) -{ - struct mm_slot *mm_slot; - int free = 0; - - spin_lock(&khugepaged_mm_lock); - mm_slot = get_mm_slot(mm); - if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); - free = 1; - } - spin_unlock(&khugepaged_mm_lock); - - if (free) { - clear_bit(MMF_VM_HUGEPAGE, &mm->flags); - free_mm_slot(mm_slot); - mmdrop(mm); - } else if (mm_slot) { - /* - * This is required to serialize against - * khugepaged_test_exit() (which is guaranteed to run - * under mmap sem read mode). Stop here (after we - * return all pagetables will be destroyed) until - * khugepaged has finished working on the pagetables - * under the mmap_sem. - */ - down_write(&mm->mmap_sem); - up_write(&mm->mmap_sem); - } -} - -static void release_pte_page(struct page *page) -{ - /* 0 stands for page_is_file_cache(page) == false */ - dec_zone_page_state(page, NR_ISOLATED_ANON + 0); - unlock_page(page); - putback_lru_page(page); -} - -static void release_pte_pages(pte_t *pte, pte_t *_pte) -{ - while (--_pte >= pte) { - pte_t pteval = *_pte; - if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) - release_pte_page(pte_page(pteval)); - } -} - -static int __collapse_huge_page_isolate(struct vm_area_struct *vma, - unsigned long address, - pte_t *pte) -{ - struct page *page = NULL; - pte_t *_pte; - int none_or_zero = 0, result = 0; - bool referenced = false, writable = false; - - for (_pte = pte; _pte < pte+HPAGE_PMD_NR; - _pte++, address += PAGE_SIZE) { - pte_t pteval = *_pte; - if (pte_none(pteval) || (pte_present(pteval) && - is_zero_pfn(pte_pfn(pteval)))) { - if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { - continue; - } else { - result = SCAN_EXCEED_NONE_PTE; - goto out; - } - } - if (!pte_present(pteval)) { - result = SCAN_PTE_NON_PRESENT; - goto out; - } - page = vm_normal_page(vma, address, pteval); - if (unlikely(!page)) { - result = SCAN_PAGE_NULL; - goto out; - } - - VM_BUG_ON_PAGE(PageCompound(page), page); - VM_BUG_ON_PAGE(!PageAnon(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - - /* - * We can do it before isolate_lru_page because the - * page can't be freed from under us. NOTE: PG_lock - * is needed to serialize against split_huge_page - * when invoked from the VM. - */ - if (!trylock_page(page)) { - result = SCAN_PAGE_LOCK; - goto out; - } - - /* - * cannot use mapcount: can't collapse if there's a gup pin. - * The page must only be referenced by the scanned process - * and page swap cache. 
- */ - if (page_count(page) != 1 + !!PageSwapCache(page)) { - unlock_page(page); - result = SCAN_PAGE_COUNT; - goto out; - } - if (pte_write(pteval)) { - writable = true; - } else { - if (PageSwapCache(page) && - !reuse_swap_page(page, NULL)) { - unlock_page(page); - result = SCAN_SWAP_CACHE_PAGE; - goto out; - } - /* - * Page is not in the swap cache. It can be collapsed - * into a THP. - */ - } - - /* - * Isolate the page to avoid collapsing an hugepage - * currently in use by the VM. - */ - if (isolate_lru_page(page)) { - unlock_page(page); - result = SCAN_DEL_PAGE_LRU; - goto out; - } - /* 0 stands for page_is_file_cache(page) == false */ - inc_zone_page_state(page, NR_ISOLATED_ANON + 0); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageLRU(page), page); - - /* If there is no mapped pte young don't collapse the page */ - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) - referenced = true; - } - if (likely(writable)) { - if (likely(referenced)) { - result = SCAN_SUCCEED; - trace_mm_collapse_huge_page_isolate(page, none_or_zero, - referenced, writable, result); - return 1; - } - } else { - result = SCAN_PAGE_RO; - } - -out: - release_pte_pages(pte, _pte); - trace_mm_collapse_huge_page_isolate(page, none_or_zero, - referenced, writable, result); - return 0; -} - -static void __collapse_huge_page_copy(pte_t *pte, struct page *page, - struct vm_area_struct *vma, - unsigned long address, - spinlock_t *ptl) -{ - pte_t *_pte; - for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { - pte_t pteval = *_pte; - struct page *src_page; - - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - clear_user_highpage(page, address); - add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); - if (is_zero_pfn(pte_pfn(pteval))) { - /* - * ptl mostly unnecessary. - */ - spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); - spin_unlock(ptl); - } - } else { - src_page = pte_page(pteval); - copy_user_highpage(page, src_page, address, vma); - VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); - release_pte_page(src_page); - /* - * ptl mostly unnecessary, but preempt has to - * be disabled to update the per-cpu stats - * inside page_remove_rmap(). - */ - spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page, false); - spin_unlock(ptl); - free_page_and_swap_cache(src_page); - } - - address += PAGE_SIZE; - page++; - } -} - -static void khugepaged_alloc_sleep(void) -{ - DEFINE_WAIT(wait); - - add_wait_queue(&khugepaged_wait, &wait); - freezable_schedule_timeout_interruptible( - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); -} - -static int khugepaged_node_load[MAX_NUMNODES]; - -static bool khugepaged_scan_abort(int nid) -{ - int i; - - /* - * If zone_reclaim_mode is disabled, then no extra effort is made to - * allocate memory locally. 
- */ - if (!zone_reclaim_mode) - return false; - - /* If there is a count for this node already, it must be acceptable */ - if (khugepaged_node_load[nid]) - return false; - - for (i = 0; i < MAX_NUMNODES; i++) { - if (!khugepaged_node_load[i]) - continue; - if (node_distance(nid, i) > RECLAIM_DISTANCE) - return true; - } - return false; -} - -#ifdef CONFIG_NUMA -static int khugepaged_find_target_node(void) -{ - static int last_khugepaged_target_node = NUMA_NO_NODE; - int nid, target_node = 0, max_value = 0; - - /* find first node with max normal pages hit */ - for (nid = 0; nid < MAX_NUMNODES; nid++) - if (khugepaged_node_load[nid] > max_value) { - max_value = khugepaged_node_load[nid]; - target_node = nid; - } - - /* do some balance if several nodes have the same hit record */ - if (target_node <= last_khugepaged_target_node) - for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; - nid++) - if (max_value == khugepaged_node_load[nid]) { - target_node = nid; - break; - } - - last_khugepaged_target_node = target_node; - return target_node; -} - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) -{ - if (IS_ERR(*hpage)) { - if (!*wait) - return false; - - *wait = false; - *hpage = NULL; - khugepaged_alloc_sleep(); - } else if (*hpage) { - put_page(*hpage); - *hpage = NULL; - } - - return true; -} - -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, - unsigned long address, int node) -{ - VM_BUG_ON_PAGE(*hpage, *hpage); - - /* - * Before allocating the hugepage, release the mmap_sem read lock. - * The allocation can take potentially a long time if it involves - * sync compaction, and we do not need to hold the mmap_sem during - * that. We will recheck the vma after taking it again in write mode. 
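
The comment above describes the locking dance that collapse_huge_page() further below relies on: the mmap_sem read lock is dropped before the potentially slow allocation, and the VMA is fully re-validated once the write lock is taken. Condensed to its skeleton, with the same names used below:

	/* khugepaged_alloc_page() releases the mmap_sem read lock itself */
	new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node);
	if (!new_page)
		return;				/* SCAN_ALLOC_HUGE_PAGE_FAIL */

	down_write(&mm->mmap_sem);		/* retake, exclusively, for the collapse */
	vma = find_vma(mm, address);
	if (!vma || !hugepage_vma_check(vma))
		goto out;			/* vma changed while unlocked: give up */
	/* only now are the ptes torn down and the huge pmd installed */
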
- */ - up_read(&mm->mmap_sem); - - *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); - if (unlikely(!*hpage)) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - *hpage = ERR_PTR(-ENOMEM); - return NULL; - } - - prep_transhuge_page(*hpage); - count_vm_event(THP_COLLAPSE_ALLOC); - return *hpage; -} -#else -static int khugepaged_find_target_node(void) -{ - return 0; -} - -static inline struct page *alloc_khugepaged_hugepage(void) -{ - struct page *page; - - page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), - HPAGE_PMD_ORDER); - if (page) - prep_transhuge_page(page); - return page; -} - -static struct page *khugepaged_alloc_hugepage(bool *wait) -{ - struct page *hpage; - - do { - hpage = alloc_khugepaged_hugepage(); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - if (!*wait) - return NULL; - - *wait = false; - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && likely(khugepaged_enabled())); - - return hpage; -} - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) -{ - if (!*hpage) - *hpage = khugepaged_alloc_hugepage(wait); - - if (unlikely(!*hpage)) - return false; - - return true; -} - -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, - unsigned long address, int node) -{ - up_read(&mm->mmap_sem); - VM_BUG_ON(!*hpage); - - return *hpage; -} -#endif - -static bool hugepage_vma_check(struct vm_area_struct *vma) -{ - if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) - return false; - if (!vma->anon_vma || vma->vm_ops) - return false; - if (is_vma_temporary_stack(vma)) - return false; - return !(vma->vm_flags & VM_NO_THP); -} - -static void collapse_huge_page(struct mm_struct *mm, - unsigned long address, - struct page **hpage, - struct vm_area_struct *vma, - int node) -{ - pmd_t *pmd, _pmd; - pte_t *pte; - pgtable_t pgtable; - struct page *new_page; - spinlock_t *pmd_ptl, *pte_ptl; - int isolated = 0, result = 0; - unsigned long hstart, hend; - struct mem_cgroup *memcg; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - gfp_t gfp; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - - /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; - - /* release the mmap_sem read lock. */ - new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); - if (!new_page) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; - goto out_nolock; - } - - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { - result = SCAN_CGROUP_CHARGE_FAIL; - goto out_nolock; - } - - /* - * Prevent all access to pagetables with the exception of - * gup_fast later hanlded by the ptep_clear_flush and the VM - * handled by the anon_vma lock + PG_lock. 
- */ - down_write(&mm->mmap_sem); - if (unlikely(khugepaged_test_exit(mm))) { - result = SCAN_ANY_PROCESS; - goto out; - } - - vma = find_vma(mm, address); - if (!vma) { - result = SCAN_VMA_NULL; - goto out; - } - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = vma->vm_end & HPAGE_PMD_MASK; - if (address < hstart || address + HPAGE_PMD_SIZE > hend) { - result = SCAN_ADDRESS_RANGE; - goto out; - } - if (!hugepage_vma_check(vma)) { - result = SCAN_VMA_CHECK; - goto out; - } - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; - goto out; - } - - anon_vma_lock_write(vma->anon_vma); - - pte = pte_offset_map(pmd, address); - pte_ptl = pte_lockptr(mm, pmd); - - mmun_start = address; - mmun_end = address + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ - /* - * After this gup_fast can't run anymore. This also removes - * any huge TLB entry from the CPU so we won't allow - * huge and small TLB entries for the same virtual address - * to avoid the risk of CPU bugs in that area. - */ - _pmd = pmdp_collapse_flush(vma, address, pmd); - spin_unlock(pmd_ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - - spin_lock(pte_ptl); - isolated = __collapse_huge_page_isolate(vma, address, pte); - spin_unlock(pte_ptl); - - if (unlikely(!isolated)) { - pte_unmap(pte); - spin_lock(pmd_ptl); - BUG_ON(!pmd_none(*pmd)); - /* - * We can only use set_pmd_at when establishing - * hugepmds and never for establishing regular pmds that - * points to regular pagetables. Use pmd_populate for that - */ - pmd_populate(mm, pmd, pmd_pgtable(_pmd)); - spin_unlock(pmd_ptl); - anon_vma_unlock_write(vma->anon_vma); - result = SCAN_FAIL; - goto out; - } - - /* - * All pages are isolated and locked so anon_vma rmap - * can't run anymore. - */ - anon_vma_unlock_write(vma->anon_vma); - - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); - pte_unmap(pte); - __SetPageUptodate(new_page); - pgtable = pmd_pgtable(_pmd); - - _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); - _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - - /* - * spin_lock() below is not the equivalent of smp_wmb(), so - * this is needed to avoid the copy_huge_page writes to become - * visible after the set_pmd_at() write. 
- */ - smp_wmb(); - - spin_lock(pmd_ptl); - BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address, true); - mem_cgroup_commit_charge(new_page, memcg, false, true); - lru_cache_add_active_or_unevictable(new_page, vma); - pgtable_trans_huge_deposit(mm, pmd, pgtable); - set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache_pmd(vma, address, pmd); - spin_unlock(pmd_ptl); - - *hpage = NULL; - - khugepaged_pages_collapsed++; - result = SCAN_SUCCEED; -out_up_write: - up_write(&mm->mmap_sem); - trace_mm_collapse_huge_page(mm, isolated, result); - return; - -out_nolock: - trace_mm_collapse_huge_page(mm, isolated, result); - return; -out: - mem_cgroup_cancel_charge(new_page, memcg, true); - goto out_up_write; -} - -static int khugepaged_scan_pmd(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - struct page **hpage) -{ - pmd_t *pmd; - pte_t *pte, *_pte; - int ret = 0, none_or_zero = 0, result = 0; - struct page *page = NULL; - unsigned long _address; - spinlock_t *ptl; - int node = NUMA_NO_NODE; - bool writable = false, referenced = false; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; - goto out; - } - - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; - _pte++, _address += PAGE_SIZE) { - pte_t pteval = *_pte; - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { - continue; - } else { - result = SCAN_EXCEED_NONE_PTE; - goto out_unmap; - } - } - if (!pte_present(pteval)) { - result = SCAN_PTE_NON_PRESENT; - goto out_unmap; - } - if (pte_write(pteval)) - writable = true; - - page = vm_normal_page(vma, _address, pteval); - if (unlikely(!page)) { - result = SCAN_PAGE_NULL; - goto out_unmap; - } - - /* TODO: teach khugepaged to collapse THP mapped with pte */ - if (PageCompound(page)) { - result = SCAN_PAGE_COMPOUND; - goto out_unmap; - } - - /* - * Record which node the original page is from and save this - * information to khugepaged_node_load[]. - * Khupaged will allocate hugepage from the node has the max - * hit record. - */ - node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { - result = SCAN_SCAN_ABORT; - goto out_unmap; - } - khugepaged_node_load[node]++; - if (!PageLRU(page)) { - result = SCAN_PAGE_LRU; - goto out_unmap; - } - if (PageLocked(page)) { - result = SCAN_PAGE_LOCK; - goto out_unmap; - } - if (!PageAnon(page)) { - result = SCAN_PAGE_ANON; - goto out_unmap; - } - - /* - * cannot use mapcount: can't collapse if there's a gup pin. - * The page must only be referenced by the scanned process - * and page swap cache. 
- */ - if (page_count(page) != 1 + !!PageSwapCache(page)) { - result = SCAN_PAGE_COUNT; - goto out_unmap; - } - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) - referenced = true; - } - if (writable) { - if (referenced) { - result = SCAN_SUCCEED; - ret = 1; - } else { - result = SCAN_NO_REFERENCED_PAGE; - } - } else { - result = SCAN_PAGE_RO; - } -out_unmap: - pte_unmap_unlock(pte, ptl); - if (ret) { - node = khugepaged_find_target_node(); - /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, vma, node); - } -out: - trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, - none_or_zero, result); - return ret; -} - -static void collect_mm_slot(struct mm_slot *mm_slot) -{ - struct mm_struct *mm = mm_slot->mm; - - VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); - - if (khugepaged_test_exit(mm)) { - /* free mm_slot */ - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); - - /* - * Not strictly needed because the mm exited already. - * - * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); - */ - - /* khugepaged_mm_lock actually not necessary for the below */ - free_mm_slot(mm_slot); - mmdrop(mm); - } -} - -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, - struct page **hpage) - __releases(&khugepaged_mm_lock) - __acquires(&khugepaged_mm_lock) -{ - struct mm_slot *mm_slot; - struct mm_struct *mm; - struct vm_area_struct *vma; - int progress = 0; - - VM_BUG_ON(!pages); - VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); - - if (khugepaged_scan.mm_slot) - mm_slot = khugepaged_scan.mm_slot; - else { - mm_slot = list_entry(khugepaged_scan.mm_head.next, - struct mm_slot, mm_node); - khugepaged_scan.address = 0; - khugepaged_scan.mm_slot = mm_slot; - } - spin_unlock(&khugepaged_mm_lock); - - mm = mm_slot->mm; - down_read(&mm->mmap_sem); - if (unlikely(khugepaged_test_exit(mm))) - vma = NULL; - else - vma = find_vma(mm, khugepaged_scan.address); - - progress++; - for (; vma; vma = vma->vm_next) { - unsigned long hstart, hend; - - cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) { - progress++; - break; - } - if (!hugepage_vma_check(vma)) { -skip: - progress++; - continue; - } - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = vma->vm_end & HPAGE_PMD_MASK; - if (hstart >= hend) - goto skip; - if (khugepaged_scan.address > hend) - goto skip; - if (khugepaged_scan.address < hstart) - khugepaged_scan.address = hstart; - VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); - - while (khugepaged_scan.address < hend) { - int ret; - cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) - goto breakouterloop; - - VM_BUG_ON(khugepaged_scan.address < hstart || - khugepaged_scan.address + HPAGE_PMD_SIZE > - hend); - ret = khugepaged_scan_pmd(mm, vma, - khugepaged_scan.address, - hpage); - /* move to next address */ - khugepaged_scan.address += HPAGE_PMD_SIZE; - progress += HPAGE_PMD_NR; - if (ret) - /* we released mmap_sem so break loop */ - goto breakouterloop_mmap_sem; - if (progress >= pages) - goto breakouterloop; - } - } -breakouterloop: - up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ -breakouterloop_mmap_sem: - - spin_lock(&khugepaged_mm_lock); - VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); - /* - * Release the current mm_slot if this mm is about to die, or - * if we scanned all vmas of this mm. 
- */ - if (khugepaged_test_exit(mm) || !vma) { - /* - * Make sure that if mm_users is reaching zero while - * khugepaged runs here, khugepaged_exit will find - * mm_slot not pointing to the exiting mm. - */ - if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { - khugepaged_scan.mm_slot = list_entry( - mm_slot->mm_node.next, - struct mm_slot, mm_node); - khugepaged_scan.address = 0; - } else { - khugepaged_scan.mm_slot = NULL; - khugepaged_full_scans++; - } - - collect_mm_slot(mm_slot); - } - - return progress; -} - -static int khugepaged_has_work(void) -{ - return !list_empty(&khugepaged_scan.mm_head) && - khugepaged_enabled(); -} - -static int khugepaged_wait_event(void) -{ - return !list_empty(&khugepaged_scan.mm_head) || - kthread_should_stop(); -} - -static void khugepaged_do_scan(void) -{ - struct page *hpage = NULL; - unsigned int progress = 0, pass_through_head = 0; - unsigned int pages = khugepaged_pages_to_scan; - bool wait = true; - - barrier(); /* write khugepaged_pages_to_scan to local stack */ - - while (progress < pages) { - if (!khugepaged_prealloc_page(&hpage, &wait)) - break; - - cond_resched(); - - if (unlikely(kthread_should_stop() || try_to_freeze())) - break; - - spin_lock(&khugepaged_mm_lock); - if (!khugepaged_scan.mm_slot) - pass_through_head++; - if (khugepaged_has_work() && - pass_through_head < 2) - progress += khugepaged_scan_mm_slot(pages - progress, - &hpage); - else - progress = pages; - spin_unlock(&khugepaged_mm_lock); - } - - if (!IS_ERR_OR_NULL(hpage)) - put_page(hpage); -} - -static bool khugepaged_should_wakeup(void) -{ - return kthread_should_stop() || - time_after_eq(jiffies, khugepaged_sleep_expire); -} - -static void khugepaged_wait_work(void) -{ - if (khugepaged_has_work()) { - const unsigned long scan_sleep_jiffies = - msecs_to_jiffies(khugepaged_scan_sleep_millisecs); - - if (!scan_sleep_jiffies) - return; - - khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; - wait_event_freezable_timeout(khugepaged_wait, - khugepaged_should_wakeup(), - scan_sleep_jiffies); - return; - } - - if (khugepaged_enabled()) - wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); -} - -static int khugepaged(void *none) -{ - struct mm_slot *mm_slot; - - set_freezable(); - set_user_nice(current, MAX_NICE); - - while (!kthread_should_stop()) { - khugepaged_do_scan(); - khugepaged_wait_work(); - } - - spin_lock(&khugepaged_mm_lock); - mm_slot = khugepaged_scan.mm_slot; - khugepaged_scan.mm_slot = NULL; - if (mm_slot) - collect_mm_slot(mm_slot); - spin_unlock(&khugepaged_mm_lock); - return 0; -} - static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd) { @@ -2888,10 +1524,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, count_vm_event(THP_SPLIT_PMD); - if (vma_is_dax(vma)) { - pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + if (!vma_is_anonymous(vma)) { + _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); if (is_huge_zero_pmd(_pmd)) put_huge_zero_page(); + if (vma_is_dax(vma)) + return; + page = pmd_page(_pmd); + if (!PageReferenced(page) && pmd_young(_pmd)) + SetPageReferenced(page); + page_remove_rmap(page, true); + put_page(page); + add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR); return; } else if (is_huge_zero_pmd(*pmd)) { return __split_huge_zero_page_pmd(vma, haddr, pmd); @@ -2947,7 +1591,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { /* Last compound_mapcount is 
gone. */ - __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __dec_zone_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* No need in mapcount reference anymore */ for (i = 0; i < HPAGE_PMD_NR; i++) @@ -3088,12 +1732,15 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void freeze_page(struct page *page) { - enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | - TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED; + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | + TTU_RMAP_LOCKED; int i, ret; VM_BUG_ON_PAGE(!PageHead(page), page); + if (PageAnon(page)) + ttu_flags |= TTU_MIGRATION; + /* We only need TTU_SPLIT_HUGE_PMD once */ ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD); for (i = 1; !ret && i < HPAGE_PMD_NR; i++) { @@ -3103,7 +1750,7 @@ static void freeze_page(struct page *page) ret = try_to_unmap(page + i, ttu_flags); } - VM_BUG_ON(ret); + VM_BUG_ON_PAGE(ret, page + i - 1); } static void unfreeze_page(struct page *page) @@ -3125,15 +1772,20 @@ static void __split_huge_page_tail(struct page *head, int tail, /* * tail_page->_refcount is zero and not changing from under us. But * get_page_unless_zero() may be running from under us on the - * tail_page. If we used atomic_set() below instead of atomic_inc(), we - * would then run atomic_set() concurrently with + * tail_page. If we used atomic_set() below instead of atomic_inc() or + * atomic_add(), we would then run atomic_set() concurrently with * get_page_unless_zero(), and atomic_set() is implemented in C not * using locked ops. spin_unlock on x86 sometime uses locked ops * because of PPro errata 66, 92, so unless somebody can guarantee * atomic_set() here would be safe on all archs (and not only on x86), - * it's safer to use atomic_inc(). + * it's safer to use atomic_inc()/atomic_add(). 
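
The refcount bookkeeping here and in split_huge_page_to_list() further below follows one rule: every pin that cannot disappear during the split has to be declared up front. For an anonymous THP the only such pin is the caller's, so the head is frozen at 1 + 0 and each tail is re-pinned with page_ref_inc(); for a file-backed THP every subpage is additionally pinned by its radix-tree slot, so extra_pins becomes HPAGE_PMD_NR and each tail gets one extra reference on behalf of that slot, which is what the page_ref_add(page_tail, 2) just below expresses. As far as these hunks show:

	/*
	 * Pins accounted before the head page may be frozen:
	 *   anon THP: 1 (caller)                  -> page_ref_freeze(head, 1 + 0)
	 *   file THP: 1 (caller) + HPAGE_PMD_NR   -> page_ref_freeze(head, 1 + HPAGE_PMD_NR)
	 *             (one per radix-tree slot)
	 */
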
*/ - page_ref_inc(page_tail); + if (PageAnon(head)) { + page_ref_inc(page_tail); + } else { + /* Additional pin to radix tree */ + page_ref_add(page_tail, 2); + } page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & @@ -3169,25 +1821,46 @@ static void __split_huge_page_tail(struct page *head, int tail, lru_add_page_tail(head, page_tail, lruvec, list); } -static void __split_huge_page(struct page *page, struct list_head *list) +static void __split_huge_page(struct page *page, struct list_head *list, + unsigned long flags) { struct page *head = compound_head(page); struct zone *zone = page_zone(head); struct lruvec *lruvec; + pgoff_t end = -1; int i; - /* prevent PageLRU to go away from under us, and freeze lru stats */ - spin_lock_irq(&zone->lru_lock); lruvec = mem_cgroup_page_lruvec(head, zone); /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); - for (i = HPAGE_PMD_NR - 1; i >= 1; i--) + if (!PageAnon(page)) + end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE); + + for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); + /* Some pages can be beyond i_size: drop them from page cache */ + if (head[i].index >= end) { + __ClearPageDirty(head + i); + __delete_from_page_cache(head + i, NULL); + if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) + shmem_uncharge(head->mapping->host, 1); + put_page(head + i); + } + } ClearPageCompound(head); - spin_unlock_irq(&zone->lru_lock); + /* See comment in __split_huge_page_tail() */ + if (PageAnon(head)) { + page_ref_inc(head); + } else { + /* Additional pin to radix tree */ + page_ref_add(head, 2); + spin_unlock(&head->mapping->tree_lock); + } + + spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags); unfreeze_page(head); @@ -3210,18 +1883,22 @@ static void __split_huge_page(struct page *page, struct list_head *list) int total_mapcount(struct page *page) { - int i, ret; + int i, compound, ret; VM_BUG_ON_PAGE(PageTail(page), page); if (likely(!PageCompound(page))) return atomic_read(&page->_mapcount) + 1; - ret = compound_mapcount(page); + compound = compound_mapcount(page); if (PageHuge(page)) - return ret; + return compound; + ret = compound; for (i = 0; i < HPAGE_PMD_NR; i++) ret += atomic_read(&page[i]._mapcount) + 1; + /* File pages has compound_mapcount included in _mapcount */ + if (!PageAnon(page)) + return ret - compound * HPAGE_PMD_NR; if (PageDoubleMap(page)) ret -= HPAGE_PMD_NR; return ret; @@ -3308,36 +1985,54 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) { struct page *head = compound_head(page); struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); - struct anon_vma *anon_vma; - int count, mapcount, ret; + struct anon_vma *anon_vma = NULL; + struct address_space *mapping = NULL; + int count, mapcount, extra_pins, ret; bool mlocked; unsigned long flags; VM_BUG_ON_PAGE(is_huge_zero_page(page), page); - VM_BUG_ON_PAGE(!PageAnon(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); VM_BUG_ON_PAGE(!PageCompound(page), page); - /* - * The caller does not necessarily hold an mmap_sem that would prevent - * the anon_vma disappearing so we first we take a reference to it - * and then lock the anon_vma for write. This is similar to - * page_lock_anon_vma_read except the write lock is taken to serialise - * against parallel split or collapse operations. 
- */ - anon_vma = page_get_anon_vma(head); - if (!anon_vma) { - ret = -EBUSY; - goto out; + if (PageAnon(head)) { + /* + * The caller does not necessarily hold an mmap_sem that would + * prevent the anon_vma disappearing so we first we take a + * reference to it and then lock the anon_vma for write. This + * is similar to page_lock_anon_vma_read except the write lock + * is taken to serialise against parallel split or collapse + * operations. + */ + anon_vma = page_get_anon_vma(head); + if (!anon_vma) { + ret = -EBUSY; + goto out; + } + extra_pins = 0; + mapping = NULL; + anon_vma_lock_write(anon_vma); + } else { + mapping = head->mapping; + + /* Truncated ? */ + if (!mapping) { + ret = -EBUSY; + goto out; + } + + /* Addidional pins from radix tree */ + extra_pins = HPAGE_PMD_NR; + anon_vma = NULL; + i_mmap_lock_read(mapping); } - anon_vma_lock_write(anon_vma); /* * Racy check if we can split the page, before freeze_page() will * split PMDs */ - if (total_mapcount(head) != page_count(head) - 1) { + if (total_mapcount(head) != page_count(head) - extra_pins - 1) { ret = -EBUSY; goto out_unlock; } @@ -3350,35 +2045,62 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mlocked) lru_add_drain(); + /* prevent PageLRU to go away from under us, and freeze lru stats */ + spin_lock_irqsave(&page_zone(head)->lru_lock, flags); + + if (mapping) { + void **pslot; + + spin_lock(&mapping->tree_lock); + pslot = radix_tree_lookup_slot(&mapping->page_tree, + page_index(head)); + /* + * Check if the head page is present in radix tree. + * We assume all tail are present too, if head is there. + */ + if (radix_tree_deref_slot_protected(pslot, + &mapping->tree_lock) != head) + goto fail; + } + /* Prevent deferred_split_scan() touching ->_refcount */ - spin_lock_irqsave(&pgdata->split_queue_lock, flags); + spin_lock(&pgdata->split_queue_lock); count = page_count(head); mapcount = total_mapcount(head); - if (!mapcount && count == 1) { + if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { if (!list_empty(page_deferred_list(head))) { pgdata->split_queue_len--; list_del(page_deferred_list(head)); } - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); - __split_huge_page(page, list); + if (mapping) + __dec_zone_page_state(page, NR_SHMEM_THPS); + spin_unlock(&pgdata->split_queue_lock); + __split_huge_page(page, list, flags); ret = 0; - } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); - pr_alert("total_mapcount: %u, page_count(): %u\n", - mapcount, count); - if (PageTail(page)) - dump_page(head, NULL); - dump_page(page, "total_mapcount(head) > 0"); - BUG(); } else { - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { + pr_alert("total_mapcount: %u, page_count(): %u\n", + mapcount, count); + if (PageTail(page)) + dump_page(head, NULL); + dump_page(page, "total_mapcount(head) > 0"); + BUG(); + } + spin_unlock(&pgdata->split_queue_lock); +fail: if (mapping) + spin_unlock(&mapping->tree_lock); + spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags); unfreeze_page(head); ret = -EBUSY; } out_unlock: - anon_vma_unlock_write(anon_vma); - put_anon_vma(anon_vma); + if (anon_vma) { + anon_vma_unlock_write(anon_vma); + put_anon_vma(anon_vma); + } + if (mapping) + i_mmap_unlock_read(mapping); out: count_vm_event(!ret ? 
THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); return ret; @@ -3501,8 +2223,7 @@ static int split_huge_pages_set(void *data, u64 val) if (zone != page_zone(page)) goto next; - if (!PageHead(page) || !PageAnon(page) || - PageHuge(page)) + if (!PageHead(page) || PageHuge(page) || !PageLRU(page)) goto next; total++; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index df3c17603a0d..056ae82b4dde 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3177,7 +3177,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { - int force_flush = 0; struct mm_struct *mm = vma->vm_mm; unsigned long address; pte_t *ptep; @@ -3196,19 +3195,22 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; -again: for (; address < end; address += sz) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, &address, ptep)) - goto unlock; + if (huge_pmd_unshare(mm, &address, ptep)) { + spin_unlock(ptl); + continue; + } pte = huge_ptep_get(ptep); - if (huge_pte_none(pte)) - goto unlock; + if (huge_pte_none(pte)) { + spin_unlock(ptl); + continue; + } /* * Migrating hugepage or HWPoisoned hugepage is already @@ -3216,7 +3218,8 @@ again: */ if (unlikely(!pte_present(pte))) { huge_pte_clear(mm, address, ptep); - goto unlock; + spin_unlock(ptl); + continue; } page = pte_page(pte); @@ -3226,9 +3229,10 @@ again: * are about to unmap is the actual page of interest. */ if (ref_page) { - if (page != ref_page) - goto unlock; - + if (page != ref_page) { + spin_unlock(ptl); + continue; + } /* * Mark the VMA as having unmapped its page so that * future faults in this VMA will fail rather than @@ -3244,30 +3248,14 @@ again: hugetlb_count_sub(pages_per_huge_page(h), mm); page_remove_rmap(page, true); - force_flush = !__tlb_remove_page(tlb, page); - if (force_flush) { - address += sz; - spin_unlock(ptl); - break; - } - /* Bail out after unmapping reference page if supplied */ - if (ref_page) { - spin_unlock(ptl); - break; - } -unlock: + spin_unlock(ptl); - } - /* - * mmu_gather ran out of room to batch pages, we break out of - * the PTE lock to avoid doing the potential expensive TLB invalidate - * and page-free while holding it. 
- */ - if (force_flush) { - force_flush = 0; - tlb_flush_mmu(tlb); - if (address < end && !ref_page) - goto again; + tlb_remove_page_size(tlb, page, huge_page_size(h)); + /* + * Bail out after unmapping reference page if supplied + */ + if (ref_page) + break; } mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); tlb_end_vma(tlb, vma); diff --git a/mm/internal.h b/mm/internal.h index 2524ec880e24..9b6a6c43ac39 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -36,6 +36,8 @@ /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) +int do_swap_page(struct fault_env *fe, pte_t orig_pte); + void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); @@ -150,6 +152,8 @@ extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); +extern void post_alloc_hook(struct page *page, unsigned int order, + gfp_t gfp_flags); extern int user_min_free_kbytes; #if defined CONFIG_COMPACTION || defined CONFIG_CMA diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 1548749a3d45..2976a9ee104f 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -7,5 +7,4 @@ CFLAGS_REMOVE_kasan.o = -pg # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -obj-y := kasan.o report.o kasan_init.o -obj-$(CONFIG_SLAB) += quarantine.o +obj-y := kasan.o report.o kasan_init.o quarantine.o diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 6845f9294696..5be43cb9c105 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -351,7 +351,6 @@ void kasan_free_pages(struct page *page, unsigned int order) KASAN_FREE_PAGE); } -#ifdef CONFIG_SLAB /* * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. * For larger allocations larger redzones are used. @@ -373,16 +372,12 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags) { int redzone_adjust; - /* Make sure the adjusted size is still less than - * KMALLOC_MAX_CACHE_SIZE. - * TODO: this check is only useful for SLAB, but not SLUB. We'll need - * to skip it for SLUB when it starts using kasan_cache_create(). - */ - if (*size > KMALLOC_MAX_CACHE_SIZE - - sizeof(struct kasan_alloc_meta) - - sizeof(struct kasan_free_meta)) - return; +#ifdef CONFIG_SLAB + int orig_size = *size; +#endif + *flags |= SLAB_KASAN; + /* Add alloc meta. */ cache->kasan_info.alloc_meta_offset = *size; *size += sizeof(struct kasan_alloc_meta); @@ -392,17 +387,35 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, cache->object_size < sizeof(struct kasan_free_meta)) { cache->kasan_info.free_meta_offset = *size; *size += sizeof(struct kasan_free_meta); + } else { + cache->kasan_info.free_meta_offset = 0; } redzone_adjust = optimal_redzone(cache->object_size) - (*size - cache->object_size); + if (redzone_adjust > 0) *size += redzone_adjust; - *size = min(KMALLOC_MAX_CACHE_SIZE, + +#ifdef CONFIG_SLAB + *size = min(KMALLOC_MAX_SIZE, max(*size, cache->object_size + optimal_redzone(cache->object_size))); -} + /* + * If the metadata doesn't fit, disable KASAN at all. 
+ */ + if (*size <= cache->kasan_info.alloc_meta_offset || + *size <= cache->kasan_info.free_meta_offset) { + *flags &= ~SLAB_KASAN; + *size = orig_size; + } +#else + *size = max(*size, + cache->object_size + + optimal_redzone(cache->object_size)); + #endif +} void kasan_cache_shrink(struct kmem_cache *cache) { @@ -431,16 +444,13 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) kasan_poison_shadow(object, round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), KASAN_KMALLOC_REDZONE); -#ifdef CONFIG_SLAB if (cache->flags & SLAB_KASAN) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); alloc_info->state = KASAN_STATE_INIT; } -#endif } -#ifdef CONFIG_SLAB static inline int in_irqentry_text(unsigned long ptr) { return (ptr >= (unsigned long)&__irqentry_text_start && @@ -501,7 +511,6 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache, BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); return (void *)object + cache->kasan_info.free_meta_offset; } -#endif void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) { @@ -522,16 +531,16 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object) bool kasan_slab_free(struct kmem_cache *cache, void *object) { -#ifdef CONFIG_SLAB /* RCU slabs could be legally used after free within the RCU period */ if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) return false; if (likely(cache->flags & SLAB_KASAN)) { - struct kasan_alloc_meta *alloc_info = - get_alloc_info(cache, object); - struct kasan_free_meta *free_info = - get_free_info(cache, object); + struct kasan_alloc_meta *alloc_info; + struct kasan_free_meta *free_info; + + alloc_info = get_alloc_info(cache, object); + free_info = get_free_info(cache, object); switch (alloc_info->state) { case KASAN_STATE_ALLOC: @@ -550,10 +559,6 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) } } return false; -#else - kasan_poison_slab_free(cache, object); - return false; -#endif } void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, @@ -568,6 +573,9 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, if (unlikely(object == NULL)) return; + if (!(cache->flags & SLAB_KASAN)) + return; + redzone_start = round_up((unsigned long)(object + size), KASAN_SHADOW_SCALE_SIZE); redzone_end = round_up((unsigned long)object + cache->object_size, @@ -576,16 +584,13 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, kasan_unpoison_shadow(object, size); kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); -#ifdef CONFIG_SLAB if (cache->flags & SLAB_KASAN) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); - alloc_info->state = KASAN_STATE_ALLOC; alloc_info->alloc_size = size; set_track(&alloc_info->track, flags); } -#endif } EXPORT_SYMBOL(kasan_kmalloc); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index fb87923552ef..8c75953937e7 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -110,7 +110,7 @@ static inline bool kasan_report_enabled(void) void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); -#ifdef CONFIG_SLAB +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); void quarantine_reduce(void); void quarantine_remove_cache(struct kmem_cache *cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index b3c122ddd454..861b9776841a 100644 --- a/mm/kasan/report.c +++ 
b/mm/kasan/report.c @@ -116,7 +116,6 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } -#ifdef CONFIG_SLAB static void print_track(struct kasan_track *track) { pr_err("PID = %u\n", track->pid); @@ -130,8 +129,8 @@ static void print_track(struct kasan_track *track) } } -static void object_err(struct kmem_cache *cache, struct page *page, - void *object, char *unused_reason) +static void kasan_object_err(struct kmem_cache *cache, struct page *page, + void *object, char *unused_reason) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); struct kasan_free_meta *free_info; @@ -162,7 +161,6 @@ static void object_err(struct kmem_cache *cache, struct page *page, break; } } -#endif static void print_address_description(struct kasan_access_info *info) { @@ -177,7 +175,7 @@ static void print_address_description(struct kasan_access_info *info) struct kmem_cache *cache = page->slab_cache; object = nearest_obj(cache, page, (void *)info->access_addr); - object_err(cache, page, object, + kasan_object_err(cache, page, object, "kasan: bad access detected"); return; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c new file mode 100644 index 000000000000..93d5f87c00d5 --- /dev/null +++ b/mm/khugepaged.c @@ -0,0 +1,1913 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/mmu_notifier.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/mm_inline.h> +#include <linux/kthread.h> +#include <linux/khugepaged.h> +#include <linux/freezer.h> +#include <linux/mman.h> +#include <linux/hashtable.h> +#include <linux/userfaultfd_k.h> +#include <linux/page_idle.h> +#include <linux/swapops.h> +#include <linux/shmem_fs.h> + +#include <asm/tlb.h> +#include <asm/pgalloc.h> +#include "internal.h" + +enum scan_result { + SCAN_FAIL, + SCAN_SUCCEED, + SCAN_PMD_NULL, + SCAN_EXCEED_NONE_PTE, + SCAN_PTE_NON_PRESENT, + SCAN_PAGE_RO, + SCAN_NO_REFERENCED_PAGE, + SCAN_PAGE_NULL, + SCAN_SCAN_ABORT, + SCAN_PAGE_COUNT, + SCAN_PAGE_LRU, + SCAN_PAGE_LOCK, + SCAN_PAGE_ANON, + SCAN_PAGE_COMPOUND, + SCAN_ANY_PROCESS, + SCAN_VMA_NULL, + SCAN_VMA_CHECK, + SCAN_ADDRESS_RANGE, + SCAN_SWAP_CACHE_PAGE, + SCAN_DEL_PAGE_LRU, + SCAN_ALLOC_HUGE_PAGE_FAIL, + SCAN_CGROUP_CHARGE_FAIL, + SCAN_EXCEED_SWAP_PTE, + SCAN_TRUNCATED, +}; + +#define CREATE_TRACE_POINTS +#include <trace/events/huge_memory.h> + +/* default scan 8*512 pte (or vmas) every 30 second */ +static unsigned int khugepaged_pages_to_scan __read_mostly; +static unsigned int khugepaged_pages_collapsed; +static unsigned int khugepaged_full_scans; +static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; +/* during fragmentation poll the hugepage allocator once every minute */ +static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; +static unsigned long khugepaged_sleep_expire; +static DEFINE_SPINLOCK(khugepaged_mm_lock); +static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); +/* + * default collapse hugepages if there is at least one pte mapped like + * it would have happened if the vma was large enough during page + * fault. 
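The max_ptes_none and max_ptes_swap limits declared just below, together with pages_to_scan and the two sleep intervals, are exported through the "khugepaged" sysfs attribute group defined later in this file. As a minimal userspace sketch, assuming the group is registered under the usual /sys/kernel/mm/transparent_hugepage/khugepaged/ path (the kobject registration itself is not part of this hunk), the knobs can be inspected like this:

    /* khugepaged_knobs.c - dump the khugepaged tunables exposed via sysfs.
     * The directory below is an assumption about where the "khugepaged"
     * attribute group ends up; adjust if your kernel differs. */
    #include <stdio.h>

    #define KHUGEPAGED_DIR "/sys/kernel/mm/transparent_hugepage/khugepaged/"

    static long read_knob(const char *name)
    {
        char path[256];
        long val = -1;
        FILE *f;

        snprintf(path, sizeof(path), KHUGEPAGED_DIR "%s", name);
        f = fopen(path, "r");
        if (!f)
            return -1;          /* knob missing or no permission */
        if (fscanf(f, "%ld", &val) != 1)
            val = -1;
        fclose(f);
        return val;
    }

    int main(void)
    {
        const char *knobs[] = { "pages_to_scan", "scan_sleep_millisecs",
                                "alloc_sleep_millisecs", "max_ptes_none",
                                "max_ptes_swap" };
        for (unsigned int i = 0; i < sizeof(knobs) / sizeof(knobs[0]); i++)
            printf("%s = %ld\n", knobs[i], read_knob(knobs[i]));
        return 0;
    }

Writing a decimal value back to any of these files is handled by the store callbacks further down, which validate the range, update the variable and, for the sleep intervals, wake khugepaged so the new setting takes effect on the next scan pass.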
+ */ +static unsigned int khugepaged_max_ptes_none __read_mostly; +static unsigned int khugepaged_max_ptes_swap __read_mostly; + +#define MM_SLOTS_HASH_BITS 10 +static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); + +static struct kmem_cache *mm_slot_cache __read_mostly; + +/** + * struct mm_slot - hash lookup from mm to mm_slot + * @hash: hash collision list + * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head + * @mm: the mm that this information is valid for + */ +struct mm_slot { + struct hlist_node hash; + struct list_head mm_node; + struct mm_struct *mm; +}; + +/** + * struct khugepaged_scan - cursor for scanning + * @mm_head: the head of the mm list to scan + * @mm_slot: the current mm_slot we are scanning + * @address: the next address inside that to be scanned + * + * There is only the one khugepaged_scan instance of this cursor structure. + */ +struct khugepaged_scan { + struct list_head mm_head; + struct mm_slot *mm_slot; + unsigned long address; +}; + +static struct khugepaged_scan khugepaged_scan = { + .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), +}; + +static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); +} + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = kstrtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_scan_sleep_millisecs = msecs; + khugepaged_sleep_expire = 0; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute scan_sleep_millisecs_attr = + __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, + scan_sleep_millisecs_store); + +static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); +} + +static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = kstrtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_alloc_sleep_millisecs = msecs; + khugepaged_sleep_expire = 0; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute alloc_sleep_millisecs_attr = + __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, + alloc_sleep_millisecs_store); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_to_scan); +} +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long pages; + + err = kstrtoul(buf, 10, &pages); + if (err || !pages || pages > UINT_MAX) + return -EINVAL; + + khugepaged_pages_to_scan = pages; + + return count; +} +static struct kobj_attribute pages_to_scan_attr = + __ATTR(pages_to_scan, 0644, pages_to_scan_show, + pages_to_scan_store); + +static ssize_t pages_collapsed_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_collapsed); +} +static struct kobj_attribute pages_collapsed_attr = + __ATTR_RO(pages_collapsed); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + 
return sprintf(buf, "%u\n", khugepaged_full_scans); +} +static struct kobj_attribute full_scans_attr = + __ATTR_RO(full_scans); + +static ssize_t khugepaged_defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_hugepage_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static ssize_t khugepaged_defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_hugepage_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static struct kobj_attribute khugepaged_defrag_attr = + __ATTR(defrag, 0644, khugepaged_defrag_show, + khugepaged_defrag_store); + +/* + * max_ptes_none controls if khugepaged should collapse hugepages over + * any unmapped ptes in turn potentially increasing the memory + * footprint of the vmas. When max_ptes_none is 0 khugepaged will not + * reduce the available free memory in the system as it + * runs. Increasing max_ptes_none will instead potentially reduce the + * free memory in the system during the khugepaged scan. + */ +static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_none); +} +static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_none; + + err = kstrtoul(buf, 10, &max_ptes_none); + if (err || max_ptes_none > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_none = max_ptes_none; + + return count; +} +static struct kobj_attribute khugepaged_max_ptes_none_attr = + __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, + khugepaged_max_ptes_none_store); + +static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_swap); +} + +static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_swap; + + err = kstrtoul(buf, 10, &max_ptes_swap); + if (err || max_ptes_swap > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_swap = max_ptes_swap; + + return count; +} + +static struct kobj_attribute khugepaged_max_ptes_swap_attr = + __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show, + khugepaged_max_ptes_swap_store); + +static struct attribute *khugepaged_attr[] = { + &khugepaged_defrag_attr.attr, + &khugepaged_max_ptes_none_attr.attr, + &pages_to_scan_attr.attr, + &pages_collapsed_attr.attr, + &full_scans_attr.attr, + &scan_sleep_millisecs_attr.attr, + &alloc_sleep_millisecs_attr.attr, + &khugepaged_max_ptes_swap_attr.attr, + NULL, +}; + +struct attribute_group khugepaged_attr_group = { + .attrs = khugepaged_attr, + .name = "khugepaged", +}; + +#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) + +int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) +{ + switch (advice) { + case MADV_HUGEPAGE: +#ifdef CONFIG_S390 + /* + * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 + * can't handle this properly after s390_enable_sie, so we simply + * ignore the madvise to prevent qemu from causing a SIGSEGV. 
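hugepage_madvise() here is the kernel-side handler for MADV_HUGEPAGE and MADV_NOHUGEPAGE; userspace issues the advice with madvise(2) on an anonymous mapping, and khugepaged may then collapse any PMD-aligned ranges that fit inside the vma. A small sketch, assuming the common 2MB PMD size of x86-64:

    /* madv_hugepage.c - mark an anonymous region as a collapse candidate. */
    #include <stdio.h>
    #include <sys/mman.h>

    #define LEN (16UL << 20)            /* 16 MB of anonymous memory */

    int main(void)
    {
        void *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        /* Sets VM_HUGEPAGE and registers the mm with khugepaged. */
        if (madvise(p, LEN, MADV_HUGEPAGE)) {
            perror("madvise(MADV_HUGEPAGE)");
            return 1;
        }
        /* Fault the pages in so the scanner has populated ptes to collapse;
         * only 2 MB ranges that are PMD-aligned within the vma qualify. */
        for (size_t i = 0; i < LEN; i += 4096)
            ((volatile char *)p)[i] = 1;
        puts("region advised with MADV_HUGEPAGE");
        return 0;
    }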
+ */ + if (mm_has_pgste(vma->vm_mm)) + return 0; +#endif + *vm_flags &= ~VM_NOHUGEPAGE; + *vm_flags |= VM_HUGEPAGE; + /* + * If the vma become good for khugepaged to scan, + * register it here without waiting a page fault that + * may not happen any time soon. + */ + if (!(*vm_flags & VM_NO_KHUGEPAGED) && + khugepaged_enter_vma_merge(vma, *vm_flags)) + return -ENOMEM; + break; + case MADV_NOHUGEPAGE: + *vm_flags &= ~VM_HUGEPAGE; + *vm_flags |= VM_NOHUGEPAGE; + /* + * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning + * this vma even if we leave the mm registered in khugepaged if + * it got registered before VM_NOHUGEPAGE was set. + */ + break; + } + + return 0; +} + +int __init khugepaged_init(void) +{ + mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", + sizeof(struct mm_slot), + __alignof__(struct mm_slot), 0, NULL); + if (!mm_slot_cache) + return -ENOMEM; + + khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; + khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; + khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; + + return 0; +} + +void __init khugepaged_destroy(void) +{ + kmem_cache_destroy(mm_slot_cache); +} + +static inline struct mm_slot *alloc_mm_slot(void) +{ + if (!mm_slot_cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); +} + +static inline void free_mm_slot(struct mm_slot *mm_slot) +{ + kmem_cache_free(mm_slot_cache, mm_slot); +} + +static struct mm_slot *get_mm_slot(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + + hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) + if (mm == mm_slot->mm) + return mm_slot; + + return NULL; +} + +static void insert_to_mm_slots_hash(struct mm_struct *mm, + struct mm_slot *mm_slot) +{ + mm_slot->mm = mm; + hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); +} + +static inline int khugepaged_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +int __khugepaged_enter(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int wakeup; + + mm_slot = alloc_mm_slot(); + if (!mm_slot) + return -ENOMEM; + + /* __khugepaged_exit() must not run from under us */ + VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); + if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { + free_mm_slot(mm_slot); + return 0; + } + + spin_lock(&khugepaged_mm_lock); + insert_to_mm_slots_hash(mm, mm_slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little. + */ + wakeup = list_empty(&khugepaged_scan.mm_head); + list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + spin_unlock(&khugepaged_mm_lock); + + atomic_inc(&mm->mm_count); + if (wakeup) + wake_up_interruptible(&khugepaged_wait); + + return 0; +} + +int khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags) +{ + unsigned long hstart, hend; + if (!vma->anon_vma) + /* + * Not yet faulted in so we will register later in the + * page fault if needed. 
+ */ + return 0; + if (vma->vm_ops || (vm_flags & VM_NO_KHUGEPAGED)) + /* khugepaged not yet working on file or special mappings */ + return 0; + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart < hend) + return khugepaged_enter(vma, vm_flags); + return 0; +} + +void __khugepaged_exit(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int free = 0; + + spin_lock(&khugepaged_mm_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { + hash_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + free = 1; + } + spin_unlock(&khugepaged_mm_lock); + + if (free) { + clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + free_mm_slot(mm_slot); + mmdrop(mm); + } else if (mm_slot) { + /* + * This is required to serialize against + * khugepaged_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we + * return all pagetables will be destroyed) until + * khugepaged has finished working on the pagetables + * under the mmap_sem. + */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } +} + +static void release_pte_page(struct page *page) +{ + /* 0 stands for page_is_file_cache(page) == false */ + dec_zone_page_state(page, NR_ISOLATED_ANON + 0); + unlock_page(page); + putback_lru_page(page); +} + +static void release_pte_pages(pte_t *pte, pte_t *_pte) +{ + while (--_pte >= pte) { + pte_t pteval = *_pte; + if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) + release_pte_page(pte_page(pteval)); + } +} + +static int __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long address, + pte_t *pte) +{ + struct page *page = NULL; + pte_t *_pte; + int none_or_zero = 0, result = 0; + bool referenced = false, writable = false; + + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval) || (pte_present(pteval) && + is_zero_pfn(pte_pfn(pteval)))) { + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) { + continue; + } else { + result = SCAN_EXCEED_NONE_PTE; + goto out; + } + } + if (!pte_present(pteval)) { + result = SCAN_PTE_NON_PRESENT; + goto out; + } + page = vm_normal_page(vma, address, pteval); + if (unlikely(!page)) { + result = SCAN_PAGE_NULL; + goto out; + } + + VM_BUG_ON_PAGE(PageCompound(page), page); + VM_BUG_ON_PAGE(!PageAnon(page), page); + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + + /* + * We can do it before isolate_lru_page because the + * page can't be freed from under us. NOTE: PG_lock + * is needed to serialize against split_huge_page + * when invoked from the VM. + */ + if (!trylock_page(page)) { + result = SCAN_PAGE_LOCK; + goto out; + } + + /* + * cannot use mapcount: can't collapse if there's a gup pin. + * The page must only be referenced by the scanned process + * and page swap cache. + */ + if (page_count(page) != 1 + !!PageSwapCache(page)) { + unlock_page(page); + result = SCAN_PAGE_COUNT; + goto out; + } + if (pte_write(pteval)) { + writable = true; + } else { + if (PageSwapCache(page) && + !reuse_swap_page(page, NULL)) { + unlock_page(page); + result = SCAN_SWAP_CACHE_PAGE; + goto out; + } + /* + * Page is not in the swap cache. It can be collapsed + * into a THP. + */ + } + + /* + * Isolate the page to avoid collapsing an hugepage + * currently in use by the VM. 
+ */ + if (isolate_lru_page(page)) { + unlock_page(page); + result = SCAN_DEL_PAGE_LRU; + goto out; + } + /* 0 stands for page_is_file_cache(page) == false */ + inc_zone_page_state(page, NR_ISOLATED_ANON + 0); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageLRU(page), page); + + /* If there is no mapped pte young don't collapse the page */ + if (pte_young(pteval) || + page_is_young(page) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) + referenced = true; + } + if (likely(writable)) { + if (likely(referenced)) { + result = SCAN_SUCCEED; + trace_mm_collapse_huge_page_isolate(page, none_or_zero, + referenced, writable, result); + return 1; + } + } else { + result = SCAN_PAGE_RO; + } + +out: + release_pte_pages(pte, _pte); + trace_mm_collapse_huge_page_isolate(page, none_or_zero, + referenced, writable, result); + return 0; +} + +static void __collapse_huge_page_copy(pte_t *pte, struct page *page, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl) +{ + pte_t *_pte; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { + pte_t pteval = *_pte; + struct page *src_page; + + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + clear_user_highpage(page, address); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + if (is_zero_pfn(pte_pfn(pteval))) { + /* + * ptl mostly unnecessary. + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. + */ + pte_clear(vma->vm_mm, address, _pte); + spin_unlock(ptl); + } + } else { + src_page = pte_page(pteval); + copy_user_highpage(page, src_page, address, vma); + VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); + release_pte_page(src_page); + /* + * ptl mostly unnecessary, but preempt has to + * be disabled to update the per-cpu stats + * inside page_remove_rmap(). + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. + */ + pte_clear(vma->vm_mm, address, _pte); + page_remove_rmap(src_page, false); + spin_unlock(ptl); + free_page_and_swap_cache(src_page); + } + + address += PAGE_SIZE; + page++; + } +} + +static void khugepaged_alloc_sleep(void) +{ + DEFINE_WAIT(wait); + + add_wait_queue(&khugepaged_wait, &wait); + freezable_schedule_timeout_interruptible( + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); +} + +static int khugepaged_node_load[MAX_NUMNODES]; + +static bool khugepaged_scan_abort(int nid) +{ + int i; + + /* + * If zone_reclaim_mode is disabled, then no extra effort is made to + * allocate memory locally. + */ + if (!zone_reclaim_mode) + return false; + + /* If there is a count for this node already, it must be acceptable */ + if (khugepaged_node_load[nid]) + return false; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (!khugepaged_node_load[i]) + continue; + if (node_distance(nid, i) > RECLAIM_DISTANCE) + return true; + } + return false; +} + +/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ +static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) +{ + return GFP_TRANSHUGE | (khugepaged_defrag() ? 
__GFP_DIRECT_RECLAIM : 0); +} + +#ifdef CONFIG_NUMA +static int khugepaged_find_target_node(void) +{ + static int last_khugepaged_target_node = NUMA_NO_NODE; + int nid, target_node = 0, max_value = 0; + + /* find first node with max normal pages hit */ + for (nid = 0; nid < MAX_NUMNODES; nid++) + if (khugepaged_node_load[nid] > max_value) { + max_value = khugepaged_node_load[nid]; + target_node = nid; + } + + /* do some balance if several nodes have the same hit record */ + if (target_node <= last_khugepaged_target_node) + for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; + nid++) + if (max_value == khugepaged_node_load[nid]) { + target_node = nid; + break; + } + + last_khugepaged_target_node = target_node; + return target_node; +} + +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (IS_ERR(*hpage)) { + if (!*wait) + return false; + + *wait = false; + *hpage = NULL; + khugepaged_alloc_sleep(); + } else if (*hpage) { + put_page(*hpage); + *hpage = NULL; + } + + return true; +} + +static struct page * +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) +{ + VM_BUG_ON_PAGE(*hpage, *hpage); + + *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); + if (unlikely(!*hpage)) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + *hpage = ERR_PTR(-ENOMEM); + return NULL; + } + + prep_transhuge_page(*hpage); + count_vm_event(THP_COLLAPSE_ALLOC); + return *hpage; +} +#else +static int khugepaged_find_target_node(void) +{ + return 0; +} + +static inline struct page *alloc_khugepaged_hugepage(void) +{ + struct page *page; + + page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), + HPAGE_PMD_ORDER); + if (page) + prep_transhuge_page(page); + return page; +} + +static struct page *khugepaged_alloc_hugepage(bool *wait) +{ + struct page *hpage; + + do { + hpage = alloc_khugepaged_hugepage(); + if (!hpage) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + if (!*wait) + return NULL; + + *wait = false; + khugepaged_alloc_sleep(); + } else + count_vm_event(THP_COLLAPSE_ALLOC); + } while (unlikely(!hpage) && likely(khugepaged_enabled())); + + return hpage; +} + +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (!*hpage) + *hpage = khugepaged_alloc_hugepage(wait); + + if (unlikely(!*hpage)) + return false; + + return true; +} + +static struct page * +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) +{ + VM_BUG_ON(!*hpage); + + return *hpage; +} +#endif + +static bool hugepage_vma_check(struct vm_area_struct *vma) +{ + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) + return false; + if (shmem_file(vma->vm_file)) { + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + return false; + return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, + HPAGE_PMD_NR); + } + if (!vma->anon_vma || vma->vm_ops) + return false; + if (is_vma_temporary_stack(vma)) + return false; + return !(vma->vm_flags & VM_NO_KHUGEPAGED); +} + +/* + * If mmap_sem temporarily dropped, revalidate vma + * before taking mmap_sem. + * Return 0 if succeeds, otherwise return none-zero + * value (scan code). 
+ */ + +static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address) +{ + struct vm_area_struct *vma; + unsigned long hstart, hend; + + if (unlikely(khugepaged_test_exit(mm))) + return SCAN_ANY_PROCESS; + + vma = find_vma(mm, address); + if (!vma) + return SCAN_VMA_NULL; + + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (address < hstart || address + HPAGE_PMD_SIZE > hend) + return SCAN_ADDRESS_RANGE; + if (!hugepage_vma_check(vma)) + return SCAN_VMA_CHECK; + return 0; +} + +/* + * Bring missing pages in from swap, to complete THP collapse. + * Only done if khugepaged_scan_pmd believes it is worthwhile. + * + * Called and returns without pte mapped or spinlocks held, + * but with mmap_sem held to protect against vma changes. + */ + +static bool __collapse_huge_page_swapin(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd) +{ + pte_t pteval; + int swapped_in = 0, ret = 0; + struct fault_env fe = { + .vma = vma, + .address = address, + .flags = FAULT_FLAG_ALLOW_RETRY, + .pmd = pmd, + }; + + fe.pte = pte_offset_map(pmd, address); + for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; + fe.pte++, fe.address += PAGE_SIZE) { + pteval = *fe.pte; + if (!is_swap_pte(pteval)) + continue; + swapped_in++; + ret = do_swap_page(&fe, pteval); + /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ + if (ret & VM_FAULT_RETRY) { + down_read(&mm->mmap_sem); + /* vma is no longer available, don't continue to swapin */ + if (hugepage_vma_revalidate(mm, address)) + return false; + /* check if the pmd is still valid */ + if (mm_find_pmd(mm, address) != pmd) + return false; + } + if (ret & VM_FAULT_ERROR) { + trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0); + return false; + } + /* pte is unmapped now, we need to map it */ + fe.pte = pte_offset_map(pmd, fe.address); + } + fe.pte--; + pte_unmap(fe.pte); + trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1); + return true; +} + +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage, + struct vm_area_struct *vma, + int node) +{ + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *pmd_ptl, *pte_ptl; + int isolated = 0, result = 0; + struct mem_cgroup *memcg; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + gfp_t gfp; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + /* Only allocate from the target node */ + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; + + /* + * Before allocating the hugepage, release the mmap_sem read lock. + * The allocation can take potentially a long time if it involves + * sync compaction, and we do not need to hold the mmap_sem during + * that. We will recheck the vma after taking it again in write mode. 
+ */ + up_read(&mm->mmap_sem); + new_page = khugepaged_alloc_page(hpage, gfp, node); + if (!new_page) { + result = SCAN_ALLOC_HUGE_PAGE_FAIL; + goto out_nolock; + } + + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + result = SCAN_CGROUP_CHARGE_FAIL; + goto out_nolock; + } + + down_read(&mm->mmap_sem); + result = hugepage_vma_revalidate(mm, address); + if (result) { + mem_cgroup_cancel_charge(new_page, memcg, true); + up_read(&mm->mmap_sem); + goto out_nolock; + } + + pmd = mm_find_pmd(mm, address); + if (!pmd) { + result = SCAN_PMD_NULL; + mem_cgroup_cancel_charge(new_page, memcg, true); + up_read(&mm->mmap_sem); + goto out_nolock; + } + + /* + * __collapse_huge_page_swapin always returns with mmap_sem locked. + * If it fails, release mmap_sem and jump directly out. + * Continuing to collapse causes inconsistency. + */ + if (!__collapse_huge_page_swapin(mm, vma, address, pmd)) { + mem_cgroup_cancel_charge(new_page, memcg, true); + up_read(&mm->mmap_sem); + goto out_nolock; + } + + up_read(&mm->mmap_sem); + /* + * Prevent all access to pagetables with the exception of + * gup_fast later handled by the ptep_clear_flush and the VM + * handled by the anon_vma lock + PG_lock. + */ + down_write(&mm->mmap_sem); + result = hugepage_vma_revalidate(mm, address); + if (result) + goto out; + /* check if the pmd is still valid */ + if (mm_find_pmd(mm, address) != pmd) + goto out; + + anon_vma_lock_write(vma->anon_vma); + + pte = pte_offset_map(pmd, address); + pte_ptl = pte_lockptr(mm, pmd); + + mmun_start = address; + mmun_end = address + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ + /* + * After this gup_fast can't run anymore. This also removes + * any huge TLB entry from the CPU so we won't allow + * huge and small TLB entries for the same virtual address + * to avoid the risk of CPU bugs in that area. + */ + _pmd = pmdp_collapse_flush(vma, address, pmd); + spin_unlock(pmd_ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + + spin_lock(pte_ptl); + isolated = __collapse_huge_page_isolate(vma, address, pte); + spin_unlock(pte_ptl); + + if (unlikely(!isolated)) { + pte_unmap(pte); + spin_lock(pmd_ptl); + BUG_ON(!pmd_none(*pmd)); + /* + * We can only use set_pmd_at when establishing + * hugepmds and never for establishing regular pmds that + * points to regular pagetables. Use pmd_populate for that + */ + pmd_populate(mm, pmd, pmd_pgtable(_pmd)); + spin_unlock(pmd_ptl); + anon_vma_unlock_write(vma->anon_vma); + result = SCAN_FAIL; + goto out; + } + + /* + * All pages are isolated and locked so anon_vma rmap + * can't run anymore. + */ + anon_vma_unlock_write(vma->anon_vma); + + __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); + pte_unmap(pte); + __SetPageUptodate(new_page); + pgtable = pmd_pgtable(_pmd); + + _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + + /* + * spin_lock() below is not the equivalent of smp_wmb(), so + * this is needed to avoid the copy_huge_page writes to become + * visible after the set_pmd_at() write. 
+ */ + smp_wmb(); + + spin_lock(pmd_ptl); + BUG_ON(!pmd_none(*pmd)); + page_add_new_anon_rmap(new_page, vma, address, true); + mem_cgroup_commit_charge(new_page, memcg, false, true); + lru_cache_add_active_or_unevictable(new_page, vma); + pgtable_trans_huge_deposit(mm, pmd, pgtable); + set_pmd_at(mm, address, pmd, _pmd); + update_mmu_cache_pmd(vma, address, pmd); + spin_unlock(pmd_ptl); + + *hpage = NULL; + + khugepaged_pages_collapsed++; + result = SCAN_SUCCEED; +out_up_write: + up_write(&mm->mmap_sem); +out_nolock: + trace_mm_collapse_huge_page(mm, isolated, result); + return; +out: + mem_cgroup_cancel_charge(new_page, memcg, true); + goto out_up_write; +} + +static int khugepaged_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + struct page **hpage) +{ + pmd_t *pmd; + pte_t *pte, *_pte; + int ret = 0, none_or_zero = 0, result = 0; + struct page *page = NULL; + unsigned long _address; + spinlock_t *ptl; + int node = NUMA_NO_NODE, unmapped = 0; + bool writable = false, referenced = false; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + pmd = mm_find_pmd(mm, address); + if (!pmd) { + result = SCAN_PMD_NULL; + goto out; + } + + memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (is_swap_pte(pteval)) { + if (++unmapped <= khugepaged_max_ptes_swap) { + continue; + } else { + result = SCAN_EXCEED_SWAP_PTE; + goto out_unmap; + } + } + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) { + continue; + } else { + result = SCAN_EXCEED_NONE_PTE; + goto out_unmap; + } + } + if (!pte_present(pteval)) { + result = SCAN_PTE_NON_PRESENT; + goto out_unmap; + } + if (pte_write(pteval)) + writable = true; + + page = vm_normal_page(vma, _address, pteval); + if (unlikely(!page)) { + result = SCAN_PAGE_NULL; + goto out_unmap; + } + + /* TODO: teach khugepaged to collapse THP mapped with pte */ + if (PageCompound(page)) { + result = SCAN_PAGE_COMPOUND; + goto out_unmap; + } + + /* + * Record which node the original page is from and save this + * information to khugepaged_node_load[]. + * Khupaged will allocate hugepage from the node has the max + * hit record. + */ + node = page_to_nid(page); + if (khugepaged_scan_abort(node)) { + result = SCAN_SCAN_ABORT; + goto out_unmap; + } + khugepaged_node_load[node]++; + if (!PageLRU(page)) { + result = SCAN_PAGE_LRU; + goto out_unmap; + } + if (PageLocked(page)) { + result = SCAN_PAGE_LOCK; + goto out_unmap; + } + if (!PageAnon(page)) { + result = SCAN_PAGE_ANON; + goto out_unmap; + } + + /* + * cannot use mapcount: can't collapse if there's a gup pin. + * The page must only be referenced by the scanned process + * and page swap cache. 
+ */ + if (page_count(page) != 1 + !!PageSwapCache(page)) { + result = SCAN_PAGE_COUNT; + goto out_unmap; + } + if (pte_young(pteval) || + page_is_young(page) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) + referenced = true; + } + if (writable) { + if (referenced) { + result = SCAN_SUCCEED; + ret = 1; + } else { + result = SCAN_NO_REFERENCED_PAGE; + } + } else { + result = SCAN_PAGE_RO; + } +out_unmap: + pte_unmap_unlock(pte, ptl); + if (ret) { + node = khugepaged_find_target_node(); + /* collapse_huge_page will return with the mmap_sem released */ + collapse_huge_page(mm, address, hpage, vma, node); + } +out: + trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, + none_or_zero, result, unmapped); + return ret; +} + +static void collect_mm_slot(struct mm_slot *mm_slot) +{ + struct mm_struct *mm = mm_slot->mm; + + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_test_exit(mm)) { + /* free mm_slot */ + hash_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + + /* + * Not strictly needed because the mm exited already. + * + * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + */ + + /* khugepaged_mm_lock actually not necessary for the below */ + free_mm_slot(mm_slot); + mmdrop(mm); + } +} + +#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) +static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) +{ + struct vm_area_struct *vma; + unsigned long addr; + pmd_t *pmd, _pmd; + + i_mmap_lock_write(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { + /* probably overkill */ + if (vma->anon_vma) + continue; + addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (addr & ~HPAGE_PMD_MASK) + continue; + if (vma->vm_end < addr + HPAGE_PMD_SIZE) + continue; + pmd = mm_find_pmd(vma->vm_mm, addr); + if (!pmd) + continue; + /* + * We need exclusive mmap_sem to retract page table. + * If trylock fails we would end up with pte-mapped THP after + * re-fault. Not ideal, but it's more important to not disturb + * the system too much. + */ + if (down_write_trylock(&vma->vm_mm->mmap_sem)) { + spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); + /* assume page table is clear */ + _pmd = pmdp_collapse_flush(vma, addr, pmd); + spin_unlock(ptl); + up_write(&vma->vm_mm->mmap_sem); + atomic_long_dec(&vma->vm_mm->nr_ptes); + pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + } + } + i_mmap_unlock_write(mapping); +} + +/** + * collapse_shmem - collapse small tmpfs/shmem pages into huge one. 
+ * + * Basic scheme is simple, details are more complex: + * - allocate and freeze a new huge page; + * - scan over radix tree replacing old pages the new one + * + swap in pages if necessary; + * + fill in gaps; + * + keep old pages around in case if rollback is required; + * - if replacing succeed: + * + copy data over; + * + free old pages; + * + unfreeze huge page; + * - if replacing failed; + * + put all pages back and unfreeze them; + * + restore gaps in the radix-tree; + * + free huge page; + */ +static void collapse_shmem(struct mm_struct *mm, + struct address_space *mapping, pgoff_t start, + struct page **hpage, int node) +{ + gfp_t gfp; + struct page *page, *new_page, *tmp; + struct mem_cgroup *memcg; + pgoff_t index, end = start + HPAGE_PMD_NR; + LIST_HEAD(pagelist); + struct radix_tree_iter iter; + void **slot; + int nr_none = 0, result = SCAN_SUCCEED; + + VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); + + /* Only allocate from the target node */ + gfp = alloc_hugepage_khugepaged_gfpmask() | + __GFP_OTHER_NODE | __GFP_THISNODE; + + new_page = khugepaged_alloc_page(hpage, gfp, node); + if (!new_page) { + result = SCAN_ALLOC_HUGE_PAGE_FAIL; + goto out; + } + + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + result = SCAN_CGROUP_CHARGE_FAIL; + goto out; + } + + new_page->index = start; + new_page->mapping = mapping; + __SetPageSwapBacked(new_page); + __SetPageLocked(new_page); + BUG_ON(!page_ref_freeze(new_page, 1)); + + + /* + * At this point the new_page is 'frozen' (page_count() is zero), locked + * and not up-to-date. It's safe to insert it into radix tree, because + * nobody would be able to map it or use it in other way until we + * unfreeze it. + */ + + index = start; + spin_lock_irq(&mapping->tree_lock); + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + int n = min(iter.index, end) - index; + + /* + * Handle holes in the radix tree: charge it from shmem and + * insert relevant subpage of new_page into the radix-tree. + */ + if (n && !shmem_charge(mapping->host, n)) { + result = SCAN_FAIL; + break; + } + nr_none += n; + for (; index < min(iter.index, end); index++) { + radix_tree_insert(&mapping->page_tree, index, + new_page + (index % HPAGE_PMD_NR)); + } + + /* We are done. */ + if (index >= end) + break; + + page = radix_tree_deref_slot_protected(slot, + &mapping->tree_lock); + if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) { + spin_unlock_irq(&mapping->tree_lock); + /* swap in or instantiate fallocated page */ + if (shmem_getpage(mapping->host, index, &page, + SGP_NOHUGE)) { + result = SCAN_FAIL; + goto tree_unlocked; + } + spin_lock_irq(&mapping->tree_lock); + } else if (trylock_page(page)) { + get_page(page); + } else { + result = SCAN_PAGE_LOCK; + break; + } + + /* + * The page must be locked, so we can drop the tree_lock + * without racing with truncate. 
+ */ + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageUptodate(page), page); + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + if (page_mapping(page) != mapping) { + result = SCAN_TRUNCATED; + goto out_unlock; + } + spin_unlock_irq(&mapping->tree_lock); + + if (isolate_lru_page(page)) { + result = SCAN_DEL_PAGE_LRU; + goto out_isolate_failed; + } + + if (page_mapped(page)) + unmap_mapping_range(mapping, index << PAGE_SHIFT, + PAGE_SIZE, 0); + + spin_lock_irq(&mapping->tree_lock); + + VM_BUG_ON_PAGE(page_mapped(page), page); + + /* + * The page is expected to have page_count() == 3: + * - we hold a pin on it; + * - one reference from radix tree; + * - one from isolate_lru_page; + */ + if (!page_ref_freeze(page, 3)) { + result = SCAN_PAGE_COUNT; + goto out_lru; + } + + /* + * Add the page to the list to be able to undo the collapse if + * something go wrong. + */ + list_add_tail(&page->lru, &pagelist); + + /* Finally, replace with the new page. */ + radix_tree_replace_slot(slot, + new_page + (index % HPAGE_PMD_NR)); + + index++; + continue; +out_lru: + spin_unlock_irq(&mapping->tree_lock); + putback_lru_page(page); +out_isolate_failed: + unlock_page(page); + put_page(page); + goto tree_unlocked; +out_unlock: + unlock_page(page); + put_page(page); + break; + } + + /* + * Handle hole in radix tree at the end of the range. + * This code only triggers if there's nothing in radix tree + * beyond 'end'. + */ + if (result == SCAN_SUCCEED && index < end) { + int n = end - index; + + if (!shmem_charge(mapping->host, n)) { + result = SCAN_FAIL; + goto tree_locked; + } + + for (; index < end; index++) { + radix_tree_insert(&mapping->page_tree, index, + new_page + (index % HPAGE_PMD_NR)); + } + nr_none += n; + } + +tree_locked: + spin_unlock_irq(&mapping->tree_lock); +tree_unlocked: + + if (result == SCAN_SUCCEED) { + unsigned long flags; + struct zone *zone = page_zone(new_page); + + /* + * Replacing old pages with new one has succeed, now we need to + * copy the content and free old pages. + */ + list_for_each_entry_safe(page, tmp, &pagelist, lru) { + copy_highpage(new_page + (page->index % HPAGE_PMD_NR), + page); + list_del(&page->lru); + unlock_page(page); + page_ref_unfreeze(page, 1); + page->mapping = NULL; + ClearPageActive(page); + ClearPageUnevictable(page); + put_page(page); + } + + local_irq_save(flags); + __inc_zone_page_state(new_page, NR_SHMEM_THPS); + if (nr_none) { + __mod_zone_page_state(zone, NR_FILE_PAGES, nr_none); + __mod_zone_page_state(zone, NR_SHMEM, nr_none); + } + local_irq_restore(flags); + + /* + * Remove pte page tables, so we can re-faulti + * the page as huge. 
+ */ + retract_page_tables(mapping, start); + + /* Everything is ready, let's unfreeze the new_page */ + set_page_dirty(new_page); + SetPageUptodate(new_page); + page_ref_unfreeze(new_page, HPAGE_PMD_NR); + mem_cgroup_commit_charge(new_page, memcg, false, true); + lru_cache_add_anon(new_page); + unlock_page(new_page); + + *hpage = NULL; + } else { + /* Something went wrong: rollback changes to the radix-tree */ + shmem_uncharge(mapping->host, nr_none); + spin_lock_irq(&mapping->tree_lock); + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, + start) { + if (iter.index >= end) + break; + page = list_first_entry_or_null(&pagelist, + struct page, lru); + if (!page || iter.index < page->index) { + if (!nr_none) + break; + /* Put holes back where they were */ + radix_tree_replace_slot(slot, NULL); + nr_none--; + continue; + } + + VM_BUG_ON_PAGE(page->index != iter.index, page); + + /* Unfreeze the page. */ + list_del(&page->lru); + page_ref_unfreeze(page, 2); + radix_tree_replace_slot(slot, page); + spin_unlock_irq(&mapping->tree_lock); + putback_lru_page(page); + unlock_page(page); + spin_lock_irq(&mapping->tree_lock); + } + VM_BUG_ON(nr_none); + spin_unlock_irq(&mapping->tree_lock); + + /* Unfreeze new_page, caller would take care about freeing it */ + page_ref_unfreeze(new_page, 1); + mem_cgroup_cancel_charge(new_page, memcg, true); + unlock_page(new_page); + new_page->mapping = NULL; + } +out: + VM_BUG_ON(!list_empty(&pagelist)); + /* TODO: tracepoints */ +} + +static void khugepaged_scan_shmem(struct mm_struct *mm, + struct address_space *mapping, + pgoff_t start, struct page **hpage) +{ + struct page *page = NULL; + struct radix_tree_iter iter; + void **slot; + int present, swap; + int node = NUMA_NO_NODE; + int result = SCAN_SUCCEED; + + present = 0; + swap = 0; + memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + rcu_read_lock(); + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + if (iter.index >= start + HPAGE_PMD_NR) + break; + + page = radix_tree_deref_slot(slot); + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + if (radix_tree_exception(page)) { + if (++swap > khugepaged_max_ptes_swap) { + result = SCAN_EXCEED_SWAP_PTE; + break; + } + continue; + } + + if (PageTransCompound(page)) { + result = SCAN_PAGE_COMPOUND; + break; + } + + node = page_to_nid(page); + if (khugepaged_scan_abort(node)) { + result = SCAN_SCAN_ABORT; + break; + } + khugepaged_node_load[node]++; + + if (!PageLRU(page)) { + result = SCAN_PAGE_LRU; + break; + } + + if (page_count(page) != 1 + page_mapcount(page)) { + result = SCAN_PAGE_COUNT; + break; + } + + /* + * We probably should check if the page is referenced here, but + * nobody would transfer pte_young() to PageReferenced() for us. + * And rmap walk here is just too costly... 
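khugepaged_scan_shmem() only sees mappings that shmem_huge_enabled() accepts, i.e. tmpfs/shmem files on a kernel built with CONFIG_TRANSPARENT_HUGE_PAGECACHE. A hedged userspace sketch for creating such a backing mount via mount(2); the "huge=always" option string is an assumption based on the huge tmpfs series this code pairs with, and /mnt/huge-tmpfs is just an example mount point:

    /* huge_tmpfs.c - mount a tmpfs instance whose files are eligible for
     * huge pages, giving khugepaged's shmem scan something to work on.
     * The "huge=always" option is assumed from the related tmpfs series. */
    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
        /* Needs CAP_SYS_ADMIN; the target directory must already exist. */
        if (mount("tmpfs", "/mnt/huge-tmpfs", "tmpfs", 0,
                  "huge=always,size=1G")) {
            perror("mount");
            return 1;
        }
        puts("tmpfs mounted with huge=always");
        return 0;
    }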
+ */ + + present++; + + if (need_resched()) { + cond_resched_rcu(); + slot = radix_tree_iter_next(&iter); + } + } + rcu_read_unlock(); + + if (result == SCAN_SUCCEED) { + if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { + result = SCAN_EXCEED_NONE_PTE; + } else { + node = khugepaged_find_target_node(); + collapse_shmem(mm, mapping, start, hpage, node); + } + } + + /* TODO: tracepoints */ +} +#else +static void khugepaged_scan_shmem(struct mm_struct *mm, + struct address_space *mapping, + pgoff_t start, struct page **hpage) +{ + BUILD_BUG(); +} +#endif + +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, + struct page **hpage) + __releases(&khugepaged_mm_lock) + __acquires(&khugepaged_mm_lock) +{ + struct mm_slot *mm_slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int progress = 0; + + VM_BUG_ON(!pages); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_scan.mm_slot) + mm_slot = khugepaged_scan.mm_slot; + else { + mm_slot = list_entry(khugepaged_scan.mm_head.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + khugepaged_scan.mm_slot = mm_slot; + } + spin_unlock(&khugepaged_mm_lock); + + mm = mm_slot->mm; + down_read(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + vma = NULL; + else + vma = find_vma(mm, khugepaged_scan.address); + + progress++; + for (; vma; vma = vma->vm_next) { + unsigned long hstart, hend; + + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) { + progress++; + break; + } + if (!hugepage_vma_check(vma)) { +skip: + progress++; + continue; + } + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart >= hend) + goto skip; + if (khugepaged_scan.address > hend) + goto skip; + if (khugepaged_scan.address < hstart) + khugepaged_scan.address = hstart; + VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + + while (khugepaged_scan.address < hend) { + int ret; + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) + goto breakouterloop; + + VM_BUG_ON(khugepaged_scan.address < hstart || + khugepaged_scan.address + HPAGE_PMD_SIZE > + hend); + if (shmem_file(vma->vm_file)) { + struct file *file; + pgoff_t pgoff = linear_page_index(vma, + khugepaged_scan.address); + if (!shmem_huge_enabled(vma)) + goto skip; + file = get_file(vma->vm_file); + up_read(&mm->mmap_sem); + ret = 1; + khugepaged_scan_shmem(mm, file->f_mapping, + pgoff, hpage); + fput(file); + } else { + ret = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + hpage); + } + /* move to next address */ + khugepaged_scan.address += HPAGE_PMD_SIZE; + progress += HPAGE_PMD_NR; + if (ret) + /* we released mmap_sem so break loop */ + goto breakouterloop_mmap_sem; + if (progress >= pages) + goto breakouterloop; + } + } +breakouterloop: + up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ +breakouterloop_mmap_sem: + + spin_lock(&khugepaged_mm_lock); + VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); + /* + * Release the current mm_slot if this mm is about to die, or + * if we scanned all vmas of this mm. + */ + if (khugepaged_test_exit(mm) || !vma) { + /* + * Make sure that if mm_users is reaching zero while + * khugepaged runs here, khugepaged_exit will find + * mm_slot not pointing to the exiting mm. 
+ */ + if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { + khugepaged_scan.mm_slot = list_entry( + mm_slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + } else { + khugepaged_scan.mm_slot = NULL; + khugepaged_full_scans++; + } + + collect_mm_slot(mm_slot); + } + + return progress; +} + +static int khugepaged_has_work(void) +{ + return !list_empty(&khugepaged_scan.mm_head) && + khugepaged_enabled(); +} + +static int khugepaged_wait_event(void) +{ + return !list_empty(&khugepaged_scan.mm_head) || + kthread_should_stop(); +} + +static void khugepaged_do_scan(void) +{ + struct page *hpage = NULL; + unsigned int progress = 0, pass_through_head = 0; + unsigned int pages = khugepaged_pages_to_scan; + bool wait = true; + + barrier(); /* write khugepaged_pages_to_scan to local stack */ + + while (progress < pages) { + if (!khugepaged_prealloc_page(&hpage, &wait)) + break; + + cond_resched(); + + if (unlikely(kthread_should_stop() || try_to_freeze())) + break; + + spin_lock(&khugepaged_mm_lock); + if (!khugepaged_scan.mm_slot) + pass_through_head++; + if (khugepaged_has_work() && + pass_through_head < 2) + progress += khugepaged_scan_mm_slot(pages - progress, + &hpage); + else + progress = pages; + spin_unlock(&khugepaged_mm_lock); + } + + if (!IS_ERR_OR_NULL(hpage)) + put_page(hpage); +} + +static bool khugepaged_should_wakeup(void) +{ + return kthread_should_stop() || + time_after_eq(jiffies, khugepaged_sleep_expire); +} + +static void khugepaged_wait_work(void) +{ + if (khugepaged_has_work()) { + const unsigned long scan_sleep_jiffies = + msecs_to_jiffies(khugepaged_scan_sleep_millisecs); + + if (!scan_sleep_jiffies) + return; + + khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; + wait_event_freezable_timeout(khugepaged_wait, + khugepaged_should_wakeup(), + scan_sleep_jiffies); + return; + } + + if (khugepaged_enabled()) + wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); +} + +static int khugepaged(void *none) +{ + struct mm_slot *mm_slot; + + set_freezable(); + set_user_nice(current, MAX_NICE); + + while (!kthread_should_stop()) { + khugepaged_do_scan(); + khugepaged_wait_work(); + } + + spin_lock(&khugepaged_mm_lock); + mm_slot = khugepaged_scan.mm_slot; + khugepaged_scan.mm_slot = NULL; + if (mm_slot) + collect_mm_slot(mm_slot); + spin_unlock(&khugepaged_mm_lock); + return 0; +} + +static void set_recommended_min_free_kbytes(void) +{ + struct zone *zone; + int nr_zones = 0; + unsigned long recommended_min; + + for_each_populated_zone(zone) + nr_zones++; + + /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ + recommended_min = pageblock_nr_pages * nr_zones * 2; + + /* + * Make sure that on average at least two pageblocks are almost free + * of another type, one for a migratetype to fall back to and a + * second to avoid subsequent fallbacks of other types There are 3 + * MIGRATE_TYPES we care about. 
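The watermark reasoning above is completed by the arithmetic in the hunk that follows: two free pageblocks per zone, plus MIGRATE_PCPTYPES * MIGRATE_PCPTYPES nearly-free pageblocks per zone, capped at 5% of lowmem and converted from pages to kilobytes. The stand-alone sketch below walks through that calculation with assumed example values (4 KiB pages, 2 MiB pageblocks, three zones, 4 GiB of lowmem); none of these numbers come from the patch itself.

/*
 * Stand-alone sketch of the min_free_kbytes arithmetic described in the
 * comment above and implemented in the hunk that follows.  The constants
 * below (pageblock size, zone count, lowmem size) are assumed example
 * values, not taken from the patch.
 */
#include <stdio.h>

#define PAGE_SHIFT              12      /* 4 KiB pages (assumed) */
#define PAGEBLOCK_NR_PAGES      512     /* 2 MiB pageblocks (assumed) */
#define MIGRATE_PCPTYPES        3

int main(void)
{
        unsigned long nr_zones = 3;                     /* e.g. DMA, DMA32, Normal */
        unsigned long lowmem_pages = 1UL << 20;         /* pretend 4 GiB of lowmem */
        unsigned long recommended_min;

        /* two free pageblocks per zone ... */
        recommended_min = PAGEBLOCK_NR_PAGES * nr_zones * 2;
        /* ... plus, on average, two nearly-free pageblocks of other types */
        recommended_min += PAGEBLOCK_NR_PAGES * nr_zones *
                           MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;

        /* never reserve more than 5% of lowmem */
        if (recommended_min > lowmem_pages / 20)
                recommended_min = lowmem_pages / 20;

        /* pages -> kilobytes */
        recommended_min <<= (PAGE_SHIFT - 10);

        printf("recommended min_free_kbytes: %lu kB\n", recommended_min);
        return 0;
}

With those inputs the 5% cap does not kick in and the program prints 67584 kB.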
+ */ + recommended_min += pageblock_nr_pages * nr_zones * + MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; + + /* don't ever allow to reserve more than 5% of the lowmem */ + recommended_min = min(recommended_min, + (unsigned long) nr_free_buffer_pages() / 20); + recommended_min <<= (PAGE_SHIFT-10); + + if (recommended_min > min_free_kbytes) { + if (user_min_free_kbytes >= 0) + pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", + min_free_kbytes, recommended_min); + + min_free_kbytes = recommended_min; + } + setup_per_zone_wmarks(); +} + +int start_stop_khugepaged(void) +{ + static struct task_struct *khugepaged_thread __read_mostly; + static DEFINE_MUTEX(khugepaged_mutex); + int err = 0; + + mutex_lock(&khugepaged_mutex); + if (khugepaged_enabled()) { + if (!khugepaged_thread) + khugepaged_thread = kthread_run(khugepaged, NULL, + "khugepaged"); + if (IS_ERR(khugepaged_thread)) { + pr_err("khugepaged: kthread_run(khugepaged) failed\n"); + err = PTR_ERR(khugepaged_thread); + khugepaged_thread = NULL; + goto fail; + } + + if (!list_empty(&khugepaged_scan.mm_head)) + wake_up_interruptible(&khugepaged_wait); + + set_recommended_min_free_kbytes(); + } else if (khugepaged_thread) { + kthread_stop(khugepaged_thread); + khugepaged_thread = NULL; + } +fail: + mutex_unlock(&khugepaged_mutex); + return err; +} @@ -376,9 +376,8 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) - ret = handle_mm_fault(vma->vm_mm, vma, addr, - FAULT_FLAG_WRITE | - FAULT_FLAG_REMOTE); + ret = handle_mm_fault(vma, addr, + FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE); else ret = VM_FAULT_WRITE; put_page(page); @@ -532,8 +531,8 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) void *expected_mapping; unsigned long kpfn; - expected_mapping = (void *)stable_node + - (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); + expected_mapping = (void *)((unsigned long)stable_node | + PAGE_MAPPING_KSM); again: kpfn = READ_ONCE(stable_node->kpfn); page = pfn_to_page(kpfn); @@ -971,11 +970,13 @@ out: * @page: the PageAnon page that we want to replace with kpage * @kpage: the PageKsm page that we want to map instead of page, * or NULL the first time when we want to use page as kpage. + * @anon_vma: output the anon_vma of page used as kpage * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_one_page(struct vm_area_struct *vma, - struct page *page, struct page *kpage) + struct page *page, struct page *kpage, + struct anon_vma **anon_vma) { pte_t orig_pte = __pte(0); int err = -EFAULT; @@ -1015,6 +1016,8 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, * PageAnon+anon_vma to PageKsm+NULL stable_node: * stable_tree_insert() will update stable_node. 
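One of the ksm.c hunks above switches expected_mapping from pointer arithmetic to OR-ing PAGE_MAPPING_KSM into the low bits of the stable_node pointer, i.e. plain low-bit pointer tagging on an aligned address. The sketch below illustrates that general technique in userspace; the struct, the flag value and the helper names are invented for the example and are not the kernel's definitions.

/*
 * Userspace illustration of low-bit pointer tagging, the technique the
 * ksm.c hunk above relies on when it builds expected_mapping by OR-ing a
 * flag into the low bits of a stable_node pointer.  The struct, flag value
 * and helpers below are invented for this example.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define TAG_KSM ((uintptr_t)0x2)        /* example flag value for the illustration */

struct node { int payload; };

static void *tag_pointer(struct node *n, uintptr_t tag)
{
        return (void *)((uintptr_t)n | tag);
}

static struct node *untag_pointer(void *tagged)
{
        return (struct node *)((uintptr_t)tagged & ~(uintptr_t)0x3);
}

int main(void)
{
        /* malloc() returns memory aligned for any standard type, so the
         * low bits of the returned pointer are guaranteed to be zero. */
        struct node *n = malloc(sizeof(*n));

        if (!n)
                return 1;
        n->payload = 42;

        void *expected = tag_pointer(n, TAG_KSM);
        printf("tagged?  %s\n", ((uintptr_t)expected & TAG_KSM) ? "yes" : "no");
        printf("payload: %d\n", untag_pointer(expected)->payload);

        free(n);
        return 0;
}

The tag survives in the low bits only because allocations are at least word aligned, which is the same alignment property the kernel relies on when it stores flags in page->mapping.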
*/ + if (anon_vma != NULL) + *anon_vma = page_anon_vma(page); set_page_stable_node(page, NULL); mark_page_accessed(page); /* @@ -1055,6 +1058,7 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, { struct mm_struct *mm = rmap_item->mm; struct vm_area_struct *vma; + struct anon_vma *anon_vma = NULL; int err = -EFAULT; down_read(&mm->mmap_sem); @@ -1062,7 +1066,7 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, if (!vma) goto out; - err = try_to_merge_one_page(vma, page, kpage); + err = try_to_merge_one_page(vma, page, kpage, &anon_vma); if (err) goto out; @@ -1070,7 +1074,10 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, remove_rmap_item_from_tree(rmap_item); /* Must get reference to anon_vma while still holding mmap_sem */ - rmap_item->anon_vma = vma->anon_vma; + if (anon_vma != NULL) + rmap_item->anon_vma = anon_vma; + else + rmap_item->anon_vma = vma->anon_vma; get_anon_vma(vma->anon_vma); out: up_read(&mm->mmap_sem); @@ -1435,6 +1442,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) remove_rmap_item_from_tree(rmap_item); + if (kpage == page) { + put_page(kpage); + return; + } + if (kpage) { err = try_to_merge_with_ksm_page(rmap_item, page, kpage); if (!err) { diff --git a/mm/memblock.c b/mm/memblock.c index ac1248933b31..ca099159b45a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -584,6 +584,9 @@ repeat: nid, flags); } + if (!nr_new) + return 0; + /* * If this was the first round, resize array and repeat for actual * insertions; otherwise, merge and return. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ac8664db3823..40dfca3ef4bb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -323,15 +323,6 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif /* !CONFIG_SLOB */ -static struct mem_cgroup_per_zone * -mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) -{ - int nid = zone_to_nid(zone); - int zid = zone_idx(zone); - - return &memcg->nodeinfo[nid]->zoneinfo[zid]; -} - /** * mem_cgroup_css_from_page - css of the memcg associated with a page * @page: page of interest @@ -944,39 +935,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) iter = mem_cgroup_iter(NULL, iter, NULL)) /** - * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg - * @zone: zone of the wanted lruvec - * @memcg: memcg of the wanted lruvec - * - * Returns the lru list vector holding pages for the given @zone and - * @mem. This can be the global zone lruvec, if the memory controller - * is disabled. - */ -struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, - struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_zone *mz; - struct lruvec *lruvec; - - if (mem_cgroup_disabled()) { - lruvec = &zone->lruvec; - goto out; - } - - mz = mem_cgroup_zone_zoneinfo(memcg, zone); - lruvec = &mz->lruvec; -out: - /* - * Since a node can be onlined after the mem_cgroup was created, - * we have to be prepared to initialize lruvec->zone here; - * and if offlined then reonlined, we need to reinitialize it. 
- */ - if (unlikely(lruvec->zone != zone)) - lruvec->zone = zone; - return lruvec; -} - -/** * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page * @zone: zone of the page @@ -1259,6 +1217,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, struct oom_control oc = { .zonelist = NULL, .nodemask = NULL, + .memcg = memcg, .gfp_mask = gfp_mask, .order = order, }; @@ -1275,13 +1234,13 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * select it. The goal is to allow it to allocate so that it may * quickly exit and free its memory. */ - if (fatal_signal_pending(current) || task_will_free_mem(current)) { + if (task_will_free_mem(current)) { mark_oom_victim(current); - try_oom_reaper(current); + wake_oom_reaper(current); goto unlock; } - check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg); + check_panic_on_oom(&oc, CONSTRAINT_MEMCG); totalpages = mem_cgroup_get_limit(memcg) ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; @@ -1289,7 +1248,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, css_task_iter_start(&iter->css, &it); while ((task = css_task_iter_next(&it))) { - switch (oom_scan_process_thread(&oc, task, totalpages)) { + switch (oom_scan_process_thread(&oc, task)) { case OOM_SCAN_SELECT: if (chosen) put_task_struct(chosen); @@ -1329,7 +1288,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, if (chosen) { points = chosen_points * 1000 / totalpages; - oom_kill_process(&oc, chosen, points, totalpages, memcg, + oom_kill_process(&oc, chosen, points, totalpages, "Memory cgroup out of memory"); } unlock: @@ -2272,20 +2231,30 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, current->memcg_kmem_skip_account = 0; } -/* +static inline bool memcg_kmem_bypass(void) +{ + if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) + return true; + return false; +} + +/** + * memcg_kmem_get_cache: select the correct per-memcg cache for allocation + * @cachep: the original global kmem cache + * * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. * - * If the cache does not exist yet, if we are the first user of it, - * we either create it immediately, if possible, or create it asynchronously - * in a workqueue. - * In the latter case, we will let the current allocation go through with - * the original cache. + * If the cache does not exist yet, if we are the first user of it, we + * create it asynchronously in a workqueue and let the current allocation + * go through with the original cache. * - * Can't be called in interrupt context or from kernel threads. - * This function needs to be called with rcu_read_lock() held. + * This function takes a reference to the cache it returns to assure it + * won't get destroyed while we are working with it. Once the caller is + * done with it, memcg_kmem_put_cache() must be called to release the + * reference. 
*/ -struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) +struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; @@ -2293,10 +2262,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) VM_BUG_ON(!is_root_cache(cachep)); - if (cachep->flags & SLAB_ACCOUNT) - gfp |= __GFP_ACCOUNT; - - if (!(gfp & __GFP_ACCOUNT)) + if (memcg_kmem_bypass()) return cachep; if (current->memcg_kmem_skip_account) @@ -2329,14 +2295,27 @@ out: return cachep; } -void __memcg_kmem_put_cache(struct kmem_cache *cachep) +/** + * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache + * @cachep: the cache returned by memcg_kmem_get_cache + */ +void memcg_kmem_put_cache(struct kmem_cache *cachep) { if (!is_root_cache(cachep)) css_put(&cachep->memcg_params.memcg->css); } -int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, - struct mem_cgroup *memcg) +/** + * memcg_kmem_charge: charge a kmem page + * @page: page to charge + * @gfp: reclaim mode + * @order: allocation order + * @memcg: memory cgroup to charge + * + * Returns 0 on success, an error code on failure. + */ +int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg) { unsigned int nr_pages = 1 << order; struct page_counter *counter; @@ -2357,19 +2336,34 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, return 0; } -int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +/** + * memcg_kmem_charge: charge a kmem page to the current memory cgroup + * @page: page to charge + * @gfp: reclaim mode + * @order: allocation order + * + * Returns 0 on success, an error code on failure. + */ +int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; int ret = 0; + if (memcg_kmem_bypass()) + return 0; + memcg = get_mem_cgroup_from_mm(current->mm); if (!mem_cgroup_is_root(memcg)) - ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); + ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); css_put(&memcg->css); return ret; } - -void __memcg_kmem_uncharge(struct page *page, int order) +/** + * memcg_kmem_uncharge: uncharge a kmem page + * @page: page to uncharge + * @order: allocation order + */ +void memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; unsigned int nr_pages = 1 << order; @@ -4057,6 +4051,60 @@ static struct cftype mem_cgroup_legacy_files[] = { { }, /* terminate */ }; +/* + * Private memory cgroup IDR + * + * Swap-out records and page cache shadow entries need to store memcg + * references in constrained space, so we maintain an ID space that is + * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of + * memory-controlled cgroups to 64k. + * + * However, there usually are many references to the oflline CSS after + * the cgroup has been destroyed, such as page cache or reclaimable + * slab objects, that don't need to hang on to the ID. We want to keep + * those dead CSS from occupying IDs, or we might quickly exhaust the + * relatively small ID space and prevent the creation of new cgroups + * even when there are much fewer than 64k cgroups - possibly none. + * + * Maintain a private 16-bit ID space for memcg, and allow the ID to + * be freed and recycled when it's no longer needed, which is usually + * when the CSS is offlined. 
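The ID lifetime rule described above (the comment continues in the hunk below) can be seen in miniature: an object receives a small numeric ID, references to the ID are counted, and the slot is returned to the pool as soon as the last reference is dropped. The userspace sketch below is only an illustration of that rule; the table size, the names and the lack of locking are all invented, and it is not the kernel IDR API.

/*
 * Userspace sketch of the refcounted-ID idea described above: an object
 * gets a small numeric ID on creation, references to that ID are counted,
 * and the slot is recycled as soon as the last reference is dropped.
 */
#include <stdio.h>

#define ID_MAX 8        /* tiny table so recycling is easy to see */

struct obj {
        int id;
        int refs;
};

static struct obj *table[ID_MAX];       /* slot 0 left unused, as in the patch */

static int id_alloc(struct obj *o)
{
        for (int id = 1; id < ID_MAX; id++) {
                if (!table[id]) {
                        table[id] = o;
                        o->id = id;
                        o->refs = 1;
                        return id;
                }
        }
        return -1;
}

static void id_get(struct obj *o) { o->refs++; }

static void id_put(struct obj *o)
{
        if (--o->refs == 0) {
                table[o->id] = NULL;    /* ID becomes reusable immediately */
                o->id = 0;
        }
}

int main(void)
{
        struct obj a = {0}, b = {0};

        id_alloc(&a);           /* a gets ID 1 */
        id_get(&a);             /* e.g. a swap-out record pins the ID */
        id_put(&a);             /* the record goes away ... */
        id_put(&a);             /* ... last reference drops, ID 1 is recycled */
        id_alloc(&b);           /* b reuses ID 1 even though 'a' still exists */
        printf("a.id=%d b.id=%d\n", a.id, b.id);
        return 0;
}

Here object a keeps existing after its last ID reference is dropped and object b immediately reuses ID 1, which mirrors what mem_cgroup_id_put() in the hunk below does when the last reference to a memcg ID goes away.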
+ * + * The only exception to that are records of swapped out tmpfs/shmem + * pages that need to be attributed to live ancestors on swapin. But + * those references are manageable from userspace. + */ + +static DEFINE_IDR(mem_cgroup_idr); + +static void mem_cgroup_id_get(struct mem_cgroup *memcg) +{ + atomic_inc(&memcg->id.ref); +} + +static void mem_cgroup_id_put(struct mem_cgroup *memcg) +{ + if (atomic_dec_and_test(&memcg->id.ref)) { + idr_remove(&mem_cgroup_idr, memcg->id.id); + memcg->id.id = 0; + + /* Memcg ID pins CSS */ + css_put(&memcg->css); + } +} + +/** + * mem_cgroup_from_id - look up a memcg from a memcg id + * @id: the memcg id to look up + * + * Caller must hold rcu_read_lock(). + */ +struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return idr_find(&mem_cgroup_idr, id); +} + static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; @@ -4116,6 +4164,12 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (!memcg) return NULL; + memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, + 1, MEM_CGROUP_ID_MAX, + GFP_KERNEL); + if (memcg->id.id < 0) + goto fail; + memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!memcg->stat) goto fail; @@ -4142,8 +4196,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void) #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); #endif + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; fail: + if (memcg->id.id > 0) + idr_remove(&mem_cgroup_idr, memcg->id.id); mem_cgroup_free(memcg); return NULL; } @@ -4206,12 +4263,11 @@ fail: return ERR_PTR(-ENOMEM); } -static int -mem_cgroup_css_online(struct cgroup_subsys_state *css) +static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { - if (css->id > MEM_CGROUP_ID_MAX) - return -ENOSPC; - + /* Online state pins memcg ID, memcg ID pins CSS */ + mem_cgroup_id_get(mem_cgroup_from_css(css)); + css_get(css); return 0; } @@ -4234,6 +4290,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); wb_memcg_offline(memcg); + + mem_cgroup_id_put(memcg); } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) @@ -4345,7 +4403,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, #ifdef CONFIG_SWAP static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent, swp_entry_t *entry) + pte_t ptent, swp_entry_t *entry) { struct page *page = NULL; swp_entry_t ent = pte_to_swp_entry(ptent); @@ -4364,7 +4422,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, } #else static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent, swp_entry_t *entry) + pte_t ptent, swp_entry_t *entry) { return NULL; } @@ -4407,7 +4465,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, /** * mem_cgroup_move_account - move account of the page * @page: the page - * @nr_pages: number of regular pages (>1 for huge pages) + * @compound: charge the page as compound or small page * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. 
* @@ -4529,7 +4587,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (pte_present(ptent)) page = mc_handle_present_pte(vma, addr, ptent); else if (is_swap_pte(ptent)) - page = mc_handle_swap_pte(vma, addr, ptent, &ent); + page = mc_handle_swap_pte(vma, ptent, &ent); else if (pte_none(ptent)) page = mc_handle_file_pte(vma, addr, ptent, &ent); @@ -5269,6 +5327,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) * @mm: mm context of the victim * @gfp_mask: reclaim mode * @memcgp: charged memcg return + * @compound: charge the page as compound or small page * * Try to charge @page to the memcg that @mm belongs to, reclaiming * pages according to @gfp_mask if necessary. @@ -5331,6 +5390,7 @@ out: * @page: page to charge * @memcg: memcg to charge the page to * @lrucare: page might be on LRU already + * @compound: charge the page as compound or small page * * Finalize a charge transaction started by mem_cgroup_try_charge(), * after page->mapping has been set up. This must happen atomically @@ -5382,6 +5442,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, * mem_cgroup_cancel_charge - cancel a page charge * @page: page to charge * @memcg: memcg to charge the page to + * @compound: charge the page as compound or small page * * Cancel a charge transaction started by mem_cgroup_try_charge(). */ @@ -5405,15 +5466,18 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_anon, unsigned long nr_file, - unsigned long nr_huge, struct page *dummy_page) + unsigned long nr_huge, unsigned long nr_kmem, + struct page *dummy_page) { - unsigned long nr_pages = nr_anon + nr_file; + unsigned long nr_pages = nr_anon + nr_file + nr_kmem; unsigned long flags; if (!mem_cgroup_is_root(memcg)) { page_counter_uncharge(&memcg->memory, nr_pages); if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem) + page_counter_uncharge(&memcg->kmem, nr_kmem); memcg_oom_recover(memcg); } @@ -5436,6 +5500,7 @@ static void uncharge_list(struct list_head *page_list) unsigned long nr_anon = 0; unsigned long nr_file = 0; unsigned long nr_huge = 0; + unsigned long nr_kmem = 0; unsigned long pgpgout = 0; struct list_head *next; struct page *page; @@ -5446,8 +5511,6 @@ static void uncharge_list(struct list_head *page_list) */ next = page_list->next; do { - unsigned int nr_pages = 1; - page = list_entry(next, struct page, lru); next = page->lru.next; @@ -5466,31 +5529,34 @@ static void uncharge_list(struct list_head *page_list) if (memcg != page->mem_cgroup) { if (memcg) { uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, page); - pgpgout = nr_anon = nr_file = nr_huge = 0; + nr_huge, nr_kmem, page); + pgpgout = nr_anon = nr_file = + nr_huge = nr_kmem = 0; } memcg = page->mem_cgroup; } - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - nr_huge += nr_pages; - } + if (!PageKmemcg(page)) { + unsigned int nr_pages = 1; - if (PageAnon(page)) - nr_anon += nr_pages; - else - nr_file += nr_pages; + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + nr_huge += nr_pages; + } + if (PageAnon(page)) + nr_anon += nr_pages; + else + nr_file += nr_pages; + pgpgout++; + } else + nr_kmem += 1 << compound_order(page); page->mem_cgroup = NULL; - - pgpgout++; } while (next != page_list); if (memcg) 
uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, page); + nr_huge, nr_kmem, page); } /** @@ -5756,6 +5822,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!memcg) return; + mem_cgroup_id_get(memcg); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); @@ -5774,6 +5841,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON(!irqs_disabled()); mem_cgroup_charge_statistics(memcg, page, false, -1); memcg_check_events(memcg, page); + + if (!mem_cgroup_is_root(memcg)) + css_put(&memcg->css); } /* @@ -5804,11 +5874,11 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) !page_counter_try_charge(&memcg->swap, 1, &counter)) return -ENOMEM; + mem_cgroup_id_get(memcg); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); - css_get(&memcg->css); return 0; } @@ -5837,7 +5907,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) page_counter_uncharge(&memcg->memsw, 1); } mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); + mem_cgroup_id_put(memcg); } rcu_read_unlock(); } diff --git a/mm/memory.c b/mm/memory.c index cd1f29e4897e..0df349c37517 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -233,6 +233,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb->batch = NULL; #endif + tlb->page_size = 0; __tlb_reset_range(tlb); } @@ -292,23 +293,31 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e * handling the additional races in SMP caused by other CPUs caching valid * mappings in their TLBs. Returns the number of free page slots left. * When out of page slots we must call tlb_flush_mmu(). + *returns true if the caller should flush. */ -int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); + if (!tlb->page_size) + tlb->page_size = page_size; + else { + if (page_size != tlb->page_size) + return true; + } + batch = tlb->active; - batch->pages[batch->nr++] = page; if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) - return 0; + return true; batch = tlb->active; } VM_BUG_ON_PAGE(batch->nr > batch->max, page); - return batch->max - batch->nr; + batch->pages[batch->nr++] = page; + return false; } #endif /* HAVE_GENERIC_MMU_GATHER */ @@ -1109,6 +1118,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t *start_pte; pte_t *pte; swp_entry_t entry; + struct page *pending_page = NULL; again: init_rss_vec(rss); @@ -1132,7 +1142,7 @@ again: * unmap shared but keep private pages. 
*/ if (details->check_mapping && - details->check_mapping != page->mapping) + details->check_mapping != page_rmapping(page)) continue; } ptent = ptep_get_and_clear_full(mm, addr, pte, @@ -1160,8 +1170,9 @@ again: page_remove_rmap(page, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); - if (unlikely(!__tlb_remove_page(tlb, page))) { + if (unlikely(__tlb_remove_page(tlb, page))) { force_flush = 1; + pending_page = page; addr += PAGE_SIZE; break; } @@ -1202,7 +1213,11 @@ again: if (force_flush) { force_flush = 0; tlb_flush_mmu_free(tlb); - + if (pending_page) { + /* remove the page with new size */ + __tlb_remove_pte_page(tlb, pending_page); + pending_page = NULL; + } if (addr != end) goto again; } @@ -1479,7 +1494,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); inc_mm_counter_fast(mm, mm_counter_file(page)); - page_add_file_rmap(page); + page_add_file_rmap(page, false); set_pte_at(mm, addr, pte, mk_pte(page, prot)); retval = 0; @@ -2055,13 +2070,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ -static inline int wp_page_reuse(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, - struct page *page, int page_mkwrite, - int dirty_shared) - __releases(ptl) +static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, + struct page *page, int page_mkwrite, int dirty_shared) + __releases(fe->ptl) { + struct vm_area_struct *vma = fe->vma; pte_t entry; /* * Clear the pages cpupid information as the existing @@ -2071,12 +2084,12 @@ static inline int wp_page_reuse(struct mm_struct *mm, if (page) page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); - flush_cache_page(vma, address, pte_pfn(orig_pte)); + flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, address, page_table, entry, 1)) - update_mmu_cache(vma, address, page_table); - pte_unmap_unlock(page_table, ptl); + if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) + update_mmu_cache(vma, fe->address, fe->pte); + pte_unmap_unlock(fe->pte, fe->ptl); if (dirty_shared) { struct address_space *mapping; @@ -2122,30 +2135,31 @@ static inline int wp_page_reuse(struct mm_struct *mm, * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. 
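From this point on, most of the mm/memory.c hunks make the same mechanical change: the long argument lists (mm, vma, address, page_table, pmd, ptl, flags) are folded into a single fault context, struct fault_env, that is passed by pointer and updated in place. A minimal stand-alone sketch of that pattern follows; the struct and its field set only loosely mirror fault_env, and everything else is invented for the illustration.

/*
 * Sketch of the refactoring pattern used throughout the mm/memory.c hunks
 * here: instead of threading half a dozen arguments through every fault
 * helper, the state is bundled into one context struct passed by pointer.
 *
 * before (approximate shape of the old signatures):
 *     int handle(struct vma *vma, unsigned long addr, unsigned int flags,
 *                pmd_t *pmd, pte_t *pte, spinlock_t *ptl);
 */
#include <stdio.h>

struct fault_ctx {              /* loosely modelled on the patch's fault_env */
        unsigned long address;
        unsigned int flags;
        void *pte;              /* opaque stand-ins for the page-table state */
        void *ptl;
};

#define CTX_WRITE 0x1u

static int handle_fault(struct fault_ctx *ctx)
{
        /* helpers read and update the shared state through one pointer */
        if (ctx->flags & CTX_WRITE)
                printf("write fault at %#lx\n", ctx->address);
        else
                printf("read fault at %#lx\n", ctx->address);
        return 0;
}

int main(void)
{
        struct fault_ctx ctx = {
                .address = 0x7f0000001000UL,
                .flags   = CTX_WRITE,
        };
        return handle_fault(&ctx);
}

The benefit visible throughout the diff is that helpers such as wp_page_reuse() and wp_page_copy() shrink to one or two parameters and can hand the same context on to deeper helpers without re-threading every value.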
*/ -static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - pte_t orig_pte, struct page *old_page) +static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, + struct page *old_page) { + struct vm_area_struct *vma = fe->vma; + struct mm_struct *mm = vma->vm_mm; struct page *new_page = NULL; - spinlock_t *ptl = NULL; pte_t entry; int page_copied = 0; - const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */ - const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */ + const unsigned long mmun_start = fe->address & PAGE_MASK; + const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; if (unlikely(anon_vma_prepare(vma))) goto oom; if (is_zero_pfn(pte_pfn(orig_pte))) { - new_page = alloc_zeroed_user_highpage_movable(vma, address); + new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); if (!new_page) goto oom; } else { - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, + fe->address); if (!new_page) goto oom; - cow_user_page(new_page, old_page, address, vma); + cow_user_page(new_page, old_page, fe->address, vma); } if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) @@ -2158,8 +2172,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, /* * Re-check the pte - we dropped the lock */ - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (likely(pte_same(*page_table, orig_pte))) { + fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); + if (likely(pte_same(*fe->pte, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { dec_mm_counter_fast(mm, @@ -2169,7 +2183,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, } else { inc_mm_counter_fast(mm, MM_ANONPAGES); } - flush_cache_page(vma, address, pte_pfn(orig_pte)); + flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -2178,8 +2192,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush_notify(vma, address, page_table); - page_add_new_anon_rmap(new_page, vma, address, false); + ptep_clear_flush_notify(vma, fe->address, fe->pte); + page_add_new_anon_rmap(new_page, vma, fe->address, false); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* @@ -2187,8 +2201,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. 
*/ - set_pte_at_notify(mm, address, page_table, entry); - update_mmu_cache(vma, address, page_table); + set_pte_at_notify(mm, fe->address, fe->pte, entry); + update_mmu_cache(vma, fe->address, fe->pte); if (old_page) { /* * Only after switching the pte to the new page may @@ -2225,7 +2239,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, if (new_page) put_page(new_page); - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* @@ -2253,44 +2267,43 @@ oom: * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ -static int wp_pfn_shared(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, - pmd_t *pmd) +static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) { + struct vm_area_struct *vma = fe->vma; + if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { struct vm_fault vmf = { .page = NULL, - .pgoff = linear_page_index(vma, address), - .virtual_address = (void __user *)(address & PAGE_MASK), + .pgoff = linear_page_index(vma, fe->address), + .virtual_address = + (void __user *)(fe->address & PAGE_MASK), .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, }; int ret; - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); if (ret & VM_FAULT_ERROR) return ret; - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, + &fe->ptl); /* * We might have raced with another page fault while we * released the pte_offset_map_lock. */ - if (!pte_same(*page_table, orig_pte)) { - pte_unmap_unlock(page_table, ptl); + if (!pte_same(*fe->pte, orig_pte)) { + pte_unmap_unlock(fe->pte, fe->ptl); return 0; } } - return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, - NULL, 0, 0); + return wp_page_reuse(fe, orig_pte, NULL, 0, 0); } -static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, - pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, - struct page *old_page) - __releases(ptl) +static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, + struct page *old_page) + __releases(fe->ptl) { + struct vm_area_struct *vma = fe->vma; int page_mkwrite = 0; get_page(old_page); @@ -2298,8 +2311,8 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; - pte_unmap_unlock(page_table, ptl); - tmp = do_page_mkwrite(vma, old_page, address); + pte_unmap_unlock(fe->pte, fe->ptl); + tmp = do_page_mkwrite(vma, old_page, fe->address); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { put_page(old_page); @@ -2311,19 +2324,18 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, * they did, we just return, as we can count on the * MMU to tell us if they didn't also make it writable. 
*/ - page_table = pte_offset_map_lock(mm, pmd, address, - &ptl); - if (!pte_same(*page_table, orig_pte)) { + fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, + &fe->ptl); + if (!pte_same(*fe->pte, orig_pte)) { unlock_page(old_page); - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); put_page(old_page); return 0; } page_mkwrite = 1; } - return wp_page_reuse(mm, vma, address, page_table, ptl, - orig_pte, old_page, page_mkwrite, 1); + return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); } /* @@ -2344,14 +2356,13 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - spinlock_t *ptl, pte_t orig_pte) - __releases(ptl) +static int do_wp_page(struct fault_env *fe, pte_t orig_pte) + __releases(fe->ptl) { + struct vm_area_struct *vma = fe->vma; struct page *old_page; - old_page = vm_normal_page(vma, address, orig_pte); + old_page = vm_normal_page(vma, fe->address, orig_pte); if (!old_page) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a @@ -2362,12 +2373,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) - return wp_pfn_shared(mm, vma, address, page_table, ptl, - orig_pte, pmd); + return wp_pfn_shared(fe, orig_pte); - pte_unmap_unlock(page_table, ptl); - return wp_page_copy(mm, vma, address, page_table, pmd, - orig_pte, old_page); + pte_unmap_unlock(fe->pte, fe->ptl); + return wp_page_copy(fe, orig_pte, old_page); } /* @@ -2378,13 +2387,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, int total_mapcount; if (!trylock_page(old_page)) { get_page(old_page); - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); lock_page(old_page); - page_table = pte_offset_map_lock(mm, pmd, address, - &ptl); - if (!pte_same(*page_table, orig_pte)) { + fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, + fe->address, &fe->ptl); + if (!pte_same(*fe->pte, orig_pte)) { unlock_page(old_page); - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); put_page(old_page); return 0; } @@ -2400,17 +2409,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * the page lock. 
*/ page_move_anon_rmap(compound_head(old_page), - vma, address); + vma, fe->address); } unlock_page(old_page); - return wp_page_reuse(mm, vma, address, page_table, ptl, - orig_pte, old_page, 0, 0); + return wp_page_reuse(fe, orig_pte, old_page, 0, 0); } unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { - return wp_page_shared(mm, vma, address, page_table, pmd, - ptl, orig_pte, old_page); + return wp_page_shared(fe, orig_pte, old_page); } /* @@ -2418,9 +2425,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, */ get_page(old_page); - pte_unmap_unlock(page_table, ptl); - return wp_page_copy(mm, vma, address, page_table, pmd, - orig_pte, old_page); + pte_unmap_unlock(fe->pte, fe->ptl); + return wp_page_copy(fe, orig_pte, old_page); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, @@ -2508,11 +2514,9 @@ EXPORT_SYMBOL(unmap_mapping_range); * We return with the mmap_sem locked or unlocked in the same cases * as does filemap_fault(). */ -static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - unsigned int flags, pte_t orig_pte) +int do_swap_page(struct fault_env *fe, pte_t orig_pte) { - spinlock_t *ptl; + struct vm_area_struct *vma = fe->vma; struct page *page, *swapcache; struct mem_cgroup *memcg; swp_entry_t entry; @@ -2521,17 +2525,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, int exclusive = 0; int ret = 0; - if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) + if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) goto out; entry = pte_to_swp_entry(orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { - migration_entry_wait(mm, pmd, address); + migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { - print_bad_pte(vma, address, orig_pte, NULL); + print_bad_pte(vma, fe->address, orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; @@ -2540,14 +2544,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, page = lookup_swap_cache(entry); if (!page) { page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, address); + GFP_HIGHUSER_MOVABLE, vma, fe->address); if (!page) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. 
*/ - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (likely(pte_same(*page_table, orig_pte))) + fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, + fe->address, &fe->ptl); + if (likely(pte_same(*fe->pte, orig_pte))) ret = VM_FAULT_OOM; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; @@ -2556,7 +2561,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(mm, PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing @@ -2569,7 +2574,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } swapcache = page; - locked = lock_page_or_retry(page, mm, flags); + locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); if (!locked) { @@ -2586,14 +2591,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) goto out_page; - page = ksm_might_need_to_copy(page, vma, address); + page = ksm_might_need_to_copy(page, vma, fe->address); if (unlikely(!page)) { ret = VM_FAULT_OOM; page = swapcache; goto out_page; } - if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, + &memcg, false)) { ret = VM_FAULT_OOM; goto out_page; } @@ -2601,8 +2607,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Back out if somebody else already faulted in this pte. */ - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (unlikely(!pte_same(*page_table, orig_pte))) + fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, + &fe->ptl); + if (unlikely(!pte_same(*fe->pte, orig_pte))) goto out_nomap; if (unlikely(!PageUptodate(page))) { @@ -2620,24 +2627,24 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * must be called after the swap_free(), or it will never succeed. 
*/ - inc_mm_counter_fast(mm, MM_ANONPAGES); - dec_mm_counter_fast(mm, MM_SWAPENTS); + inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); - flags &= ~FAULT_FLAG_WRITE; + fe->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); - set_pte_at(mm, address, page_table, pte); + set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); if (page == swapcache) { - do_page_add_anon_rmap(page, vma, address, exclusive); + do_page_add_anon_rmap(page, vma, fe->address, exclusive); mem_cgroup_commit_charge(page, memcg, true, false); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, address, false); + page_add_new_anon_rmap(page, vma, fe->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } @@ -2660,22 +2667,22 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, put_page(swapcache); } - if (flags & FAULT_FLAG_WRITE) { - ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); + if (fe->flags & FAULT_FLAG_WRITE) { + ret |= do_wp_page(fe, pte); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, page_table); + update_mmu_cache(vma, fe->address, fe->pte); unlock: - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); out: return ret; out_nomap: mem_cgroup_cancel_charge(page, memcg, false); - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); out_page: unlock_page(page); out_release: @@ -2726,37 +2733,51 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - unsigned int flags) +static int do_anonymous_page(struct fault_env *fe) { + struct vm_area_struct *vma = fe->vma; struct mem_cgroup *memcg; struct page *page; - spinlock_t *ptl; pte_t entry; - pte_unmap(page_table); - /* File mapping without ->vm_ops ? */ if (vma->vm_flags & VM_SHARED) return VM_FAULT_SIGBUS; /* Check if we need to add a guard page to the stack */ - if (check_stack_guard_page(vma, address) < 0) + if (check_stack_guard_page(vma, fe->address) < 0) return VM_FAULT_SIGSEGV; + /* + * Use pte_alloc() instead of pte_alloc_map(). We can't run + * pte_offset_map() on pmds where a huge pmd might be created + * from a different thread. + * + * pte_alloc_map() is safe to use under down_write(mmap_sem) or when + * parallel threads are excluded by other means. + * + * Here we only have down_read(mmap_sem). 
+ */ + if (pte_alloc(vma->vm_mm, fe->pmd, fe->address)) + return VM_FAULT_OOM; + + /* See the comment in pte_alloc_one_map() */ + if (unlikely(pmd_trans_unstable(fe->pmd))) + return 0; + /* Use the zero-page for reads */ - if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) { - entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), + if (!(fe->flags & FAULT_FLAG_WRITE) && + !mm_forbids_zeropage(vma->vm_mm)) { + entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), vma->vm_page_prot)); - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!pte_none(*page_table)) + fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, + &fe->ptl); + if (!pte_none(*fe->pte)) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(page_table, ptl); - return handle_userfault(vma, address, flags, - VM_UFFD_MISSING); + pte_unmap_unlock(fe->pte, fe->ptl); + return handle_userfault(fe, VM_UFFD_MISSING); } goto setpte; } @@ -2764,11 +2785,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; - page = alloc_zeroed_user_highpage_movable(vma, address); + page = alloc_zeroed_user_highpage_movable(vma, fe->address); if (!page) goto oom; - if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) goto oom_free_page; /* @@ -2782,30 +2803,30 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!pte_none(*page_table)) + fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, + &fe->ptl); + if (!pte_none(*fe->pte)) goto release; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return handle_userfault(vma, address, flags, - VM_UFFD_MISSING); + return handle_userfault(fe, VM_UFFD_MISSING); } - inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address, false); + inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, fe->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: - set_pte_at(mm, address, page_table, entry); + set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, page_table); + update_mmu_cache(vma, fe->address, fe->pte); unlock: - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); return 0; release: mem_cgroup_cancel_charge(page, memcg, false); @@ -2822,17 +2843,16 @@ oom: * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). 
*/ -static int __do_fault(struct vm_area_struct *vma, unsigned long address, - pgoff_t pgoff, unsigned int flags, - struct page *cow_page, struct page **page, - void **entry) +static int __do_fault(struct fault_env *fe, pgoff_t pgoff, + struct page *cow_page, struct page **page, void **entry) { + struct vm_area_struct *vma = fe->vma; struct vm_fault vmf; int ret; - vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); vmf.pgoff = pgoff; - vmf.flags = flags; + vmf.flags = fe->flags; vmf.page = NULL; vmf.gfp_mask = __get_fault_gfp_mask(vma); vmf.cow_page = cow_page; @@ -2861,41 +2881,168 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, return ret; } +static int pte_alloc_one_map(struct fault_env *fe) +{ + struct vm_area_struct *vma = fe->vma; + + if (!pmd_none(*fe->pmd)) + goto map_pte; + if (fe->prealloc_pte) { + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + if (unlikely(!pmd_none(*fe->pmd))) { + spin_unlock(fe->ptl); + goto map_pte; + } + + atomic_long_inc(&vma->vm_mm->nr_ptes); + pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte); + spin_unlock(fe->ptl); + fe->prealloc_pte = 0; + } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) { + return VM_FAULT_OOM; + } +map_pte: + /* + * If a huge pmd materialized under us just retry later. Use + * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd + * didn't become pmd_trans_huge under us and then back to pmd_none, as + * a result of MADV_DONTNEED running immediately after a huge pmd fault + * in a different thread of this mm, in turn leading to a misleading + * pmd_trans_huge() retval. All we have to ensure is that it is a + * regular pmd that we can walk with pte_offset_map() and we can do that + * through an atomic read in C, which is what pmd_trans_unstable() + * provides. 
+ */ + if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + return VM_FAULT_NOPAGE; + + fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, + &fe->ptl); + return 0; +} + +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE + +#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1) +static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, + unsigned long haddr) +{ + if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) != + (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK)) + return false; + if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + return false; + return true; +} + +static int do_set_pmd(struct fault_env *fe, struct page *page) +{ + struct vm_area_struct *vma = fe->vma; + bool write = fe->flags & FAULT_FLAG_WRITE; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; + pmd_t entry; + int i, ret; + + if (!transhuge_vma_suitable(vma, haddr)) + return VM_FAULT_FALLBACK; + + ret = VM_FAULT_FALLBACK; + page = compound_head(page); + + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + if (unlikely(!pmd_none(*fe->pmd))) + goto out; + + for (i = 0; i < HPAGE_PMD_NR; i++) + flush_icache_page(vma, page + i); + + entry = mk_huge_pmd(page, vma->vm_page_prot); + if (write) + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + + add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR); + page_add_file_rmap(page, true); + + set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + + update_mmu_cache_pmd(vma, haddr, fe->pmd); + + /* fault is handled */ + ret = 0; + count_vm_event(THP_FILE_MAPPED); +out: + spin_unlock(fe->ptl); + return ret; +} +#else +static int do_set_pmd(struct fault_env *fe, struct page *page) +{ + BUILD_BUG(); + return 0; +} +#endif + /** - * do_set_pte - setup new PTE entry for given page and add reverse page mapping. + * alloc_set_pte - setup new PTE entry for given page and add reverse page + * mapping. If needed, the fucntion allocates page table or use pre-allocated. * - * @vma: virtual memory area - * @address: user virtual address + * @fe: fault environment + * @memcg: memcg to charge page (only for private mappings) * @page: page to map - * @pte: pointer to target page table entry - * @write: true, if new entry is writable - * @anon: true, if it's anonymous page * - * Caller must hold page table lock relevant for @pte. + * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return. * * Target users are page handler itself and implementations of * vm_ops->map_pages. */ -void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon) +int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, + struct page *page) { + struct vm_area_struct *vma = fe->vma; + bool write = fe->flags & FAULT_FLAG_WRITE; pte_t entry; + int ret; + + if (pmd_none(*fe->pmd) && PageTransCompound(page) && + IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { + /* THP on COW? 
*/ + VM_BUG_ON_PAGE(memcg, page); + + ret = do_set_pmd(fe, page); + if (ret != VM_FAULT_FALLBACK) + return ret; + } + + if (!fe->pte) { + ret = pte_alloc_one_map(fe); + if (ret) + return ret; + } + + /* Re-check under ptl */ + if (unlikely(!pte_none(*fe->pte))) + return VM_FAULT_NOPAGE; flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (anon) { + /* copy-on-write page */ + if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address, false); + page_add_new_anon_rmap(page, vma, fe->address, false); + mem_cgroup_commit_charge(page, memcg, false, false); + lru_cache_add_active_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); - page_add_file_rmap(page); + page_add_file_rmap(page, false); } - set_pte_at(vma->vm_mm, address, pte, entry); + set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, pte); + update_mmu_cache(vma, fe->address, fe->pte); + + return 0; } static unsigned long fault_around_bytes __read_mostly = @@ -2962,57 +3109,66 @@ late_initcall(fault_around_debugfs); * fault_around_pages() value (and therefore to page order). This way it's * easier to guarantee that we don't cross page table boundaries. */ -static void do_fault_around(struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pgoff_t pgoff, unsigned int flags) +static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) { - unsigned long start_addr, nr_pages, mask; - pgoff_t max_pgoff; - struct vm_fault vmf; - int off; + unsigned long address = fe->address, nr_pages, mask; + pgoff_t end_pgoff; + int off, ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; - start_addr = max(address & mask, vma->vm_start); - off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); - pte -= off; - pgoff -= off; + fe->address = max(address & mask, fe->vma->vm_start); + off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + start_pgoff -= off; /* - * max_pgoff is either end of page table or end of vma - * or fault_around_pages() from pgoff, depending what is nearest. + * end_pgoff is either end of page table or end of vma + * or fault_around_pages() from start_pgoff, depending what is nearest. 
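The reworked do_fault_around() above now operates on pgoff bounds: it rounds the faulting address down to the fault-around window (never below the VMA start), derives start_pgoff from the offset within the page table, and, in the part of the hunk that continues below, clamps end_pgoff to the nearest of the page-table end, the VMA end and fault_around_pages(). The sketch below reruns that arithmetic in userspace with assumed numbers (4 KiB pages, 512 PTEs per page table, the default 64 KiB fault_around_bytes and an invented VMA layout); none of the concrete values come from the patch.

/*
 * Worked example of the fault-around window arithmetic in the hunk above
 * (its clamping step continues in the hunk below).  All concrete numbers
 * here (page size, PTRS_PER_PTE, fault_around_bytes, the VMA layout) are
 * assumptions chosen for the illustration.
 */
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PAGE_MASK       (~(PAGE_SIZE - 1))
#define PTRS_PER_PTE    512UL

static unsigned long min3ul(unsigned long a, unsigned long b, unsigned long c)
{
        unsigned long m = a < b ? a : b;
        return m < c ? m : c;
}

int main(void)
{
        unsigned long fault_around_bytes = 65536;       /* default 64 KiB */
        unsigned long vm_start = 0x404000, vm_end = 0x800000, vm_pgoff = 0;
        unsigned long address = 0x405000;               /* faulting address */
        unsigned long start_pgoff =
                ((address - vm_start) >> PAGE_SHIFT) + vm_pgoff;

        unsigned long nr_pages = fault_around_bytes >> PAGE_SHIFT;
        unsigned long mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

        /* round the start down, but never below the VMA start */
        unsigned long fa_address = address & mask;
        if (fa_address < vm_start)
                fa_address = vm_start;

        unsigned long off = ((address - fa_address) >> PAGE_SHIFT) &
                            (PTRS_PER_PTE - 1);
        start_pgoff -= off;

        /* end of page table, end of VMA, or nr_pages from start: nearest wins */
        unsigned long end_pgoff = start_pgoff -
                ((fa_address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
                PTRS_PER_PTE - 1;
        unsigned long vma_pages = (vm_end - vm_start) >> PAGE_SHIFT;
        end_pgoff = min3ul(end_pgoff, vma_pages + vm_pgoff - 1,
                           start_pgoff + nr_pages - 1);

        printf("map from %#lx, pgoff %lu..%lu (%lu pages)\n",
               fa_address, start_pgoff, end_pgoff, end_pgoff - start_pgoff + 1);
        return 0;
}

With these inputs it prints a window starting at 0x404000 covering pgoff 0..15, i.e. the sixteen pages around the fault that ->map_pages() would be asked to populate.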
*/ - max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + end_pgoff = start_pgoff - + ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; - max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, - pgoff + nr_pages - 1); + end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, + start_pgoff + nr_pages - 1); - /* Check if it makes any sense to call ->map_pages */ - while (!pte_none(*pte)) { - if (++pgoff > max_pgoff) - return; - start_addr += PAGE_SIZE; - if (start_addr >= vma->vm_end) - return; - pte++; + if (pmd_none(*fe->pmd)) { + fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address); + smp_wmb(); /* See comment in __pte_alloc() */ } - vmf.virtual_address = (void __user *) start_addr; - vmf.pte = pte; - vmf.pgoff = pgoff; - vmf.max_pgoff = max_pgoff; - vmf.flags = flags; - vmf.gfp_mask = __get_fault_gfp_mask(vma); - vma->vm_ops->map_pages(vma, &vmf); + fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); + + /* preallocated pagetable is unused: free it */ + if (fe->prealloc_pte) { + pte_free(fe->vma->vm_mm, fe->prealloc_pte); + fe->prealloc_pte = 0; + } + /* Huge page is mapped? Page fault is solved */ + if (pmd_trans_huge(*fe->pmd)) { + ret = VM_FAULT_NOPAGE; + goto out; + } + + /* ->map_pages() haven't done anything useful. Cold page cache? */ + if (!fe->pte) + goto out; + + /* check if the page fault is solved */ + fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); + if (!pte_none(*fe->pte)) + ret = VM_FAULT_NOPAGE; + pte_unmap_unlock(fe->pte, fe->ptl); +out: + fe->address = address; + fe->pte = NULL; + return ret; } -static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) { + struct vm_area_struct *vma = fe->vma; struct page *fault_page; - spinlock_t *ptl; - pte_t *pte; int ret = 0; /* @@ -3021,85 +3177,64 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, * something). 
*/ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - do_fault_around(vma, address, pte, pgoff, flags); - if (!pte_same(*pte, orig_pte)) - goto unlock_out; - pte_unmap_unlock(pte, ptl); + ret = do_fault_around(fe, pgoff); + if (ret) + return ret; } - ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); + ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - if (unlikely(!pte_same(*pte, orig_pte))) { - pte_unmap_unlock(pte, ptl); - unlock_page(fault_page); - put_page(fault_page); - return ret; - } - do_set_pte(vma, address, fault_page, pte, false, false); + ret |= alloc_set_pte(fe, NULL, fault_page); + if (fe->pte) + pte_unmap_unlock(fe->pte, fe->ptl); unlock_page(fault_page); -unlock_out: - pte_unmap_unlock(pte, ptl); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + put_page(fault_page); return ret; } -static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) { + struct vm_area_struct *vma = fe->vma; struct page *fault_page, *new_page; void *fault_entry; struct mem_cgroup *memcg; - spinlock_t *ptl; - pte_t *pte; int ret; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); if (!new_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) { + if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, + &memcg, false)) { put_page(new_page); return VM_FAULT_OOM; } - ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page, - &fault_entry); + ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; if (!(ret & VM_FAULT_DAX_LOCKED)) - copy_user_highpage(new_page, fault_page, address, vma); + copy_user_highpage(new_page, fault_page, fe->address, vma); __SetPageUptodate(new_page); - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - if (unlikely(!pte_same(*pte, orig_pte))) { - pte_unmap_unlock(pte, ptl); - if (!(ret & VM_FAULT_DAX_LOCKED)) { - unlock_page(fault_page); - put_page(fault_page); - } else { - dax_unlock_mapping_entry(vma->vm_file->f_mapping, - pgoff); - } - goto uncharge_out; - } - do_set_pte(vma, address, new_page, pte, true, true); - mem_cgroup_commit_charge(new_page, memcg, false, false); - lru_cache_add_active_or_unevictable(new_page, vma); - pte_unmap_unlock(pte, ptl); + ret |= alloc_set_pte(fe, memcg, new_page); + if (fe->pte) + pte_unmap_unlock(fe->pte, fe->ptl); if (!(ret & VM_FAULT_DAX_LOCKED)) { unlock_page(fault_page); put_page(fault_page); } else { dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); } + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + goto uncharge_out; return ret; uncharge_out: mem_cgroup_cancel_charge(new_page, memcg, false); @@ -3107,18 +3242,15 @@ uncharge_out: return ret; } -static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) { + struct 
vm_area_struct *vma = fe->vma; struct page *fault_page; struct address_space *mapping; - spinlock_t *ptl; - pte_t *pte; int dirtied = 0; int ret, tmp; - ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); + ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3128,7 +3260,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ if (vma->vm_ops->page_mkwrite) { unlock_page(fault_page); - tmp = do_page_mkwrite(vma, fault_page, address); + tmp = do_page_mkwrite(vma, fault_page, fe->address); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { put_page(fault_page); @@ -3136,15 +3268,15 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, } } - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - if (unlikely(!pte_same(*pte, orig_pte))) { - pte_unmap_unlock(pte, ptl); + ret |= alloc_set_pte(fe, NULL, fault_page); + if (fe->pte) + pte_unmap_unlock(fe->pte, fe->ptl); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | + VM_FAULT_RETRY))) { unlock_page(fault_page); put_page(fault_page); return ret; } - do_set_pte(vma, address, fault_page, pte, true, false); - pte_unmap_unlock(pte, ptl); if (set_page_dirty(fault_page)) dirtied = 1; @@ -3176,23 +3308,19 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - unsigned int flags, pte_t orig_pte) +static int do_fault(struct fault_env *fe) { - pgoff_t pgoff = linear_page_index(vma, address); + struct vm_area_struct *vma = fe->vma; + pgoff_t pgoff = linear_page_index(vma, fe->address); - pte_unmap(page_table); /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) return VM_FAULT_SIGBUS; - if (!(flags & FAULT_FLAG_WRITE)) - return do_read_fault(mm, vma, address, pmd, pgoff, flags, - orig_pte); + if (!(fe->flags & FAULT_FLAG_WRITE)) + return do_read_fault(fe, pgoff); if (!(vma->vm_flags & VM_SHARED)) - return do_cow_fault(mm, vma, address, pmd, pgoff, flags, - orig_pte); - return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + return do_cow_fault(fe, pgoff); + return do_shared_fault(fe, pgoff); } static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, @@ -3210,11 +3338,10 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) +static int do_numa_page(struct fault_env *fe, pte_t pte) { + struct vm_area_struct *vma = fe->vma; struct page *page = NULL; - spinlock_t *ptl; int page_nid = -1; int last_cpupid; int target_nid; @@ -3234,10 +3361,10 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * page table entry is not accessible, so there would be no * concurrent hardware modifications to the PTE. 
*/ - ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); - if (unlikely(!pte_same(*ptep, pte))) { - pte_unmap_unlock(ptep, ptl); + fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); + spin_lock(fe->ptl); + if (unlikely(!pte_same(*fe->pte, pte))) { + pte_unmap_unlock(fe->pte, fe->ptl); goto out; } @@ -3246,18 +3373,18 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - set_pte_at(mm, addr, ptep, pte); - update_mmu_cache(vma, addr, ptep); + set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); + update_mmu_cache(vma, fe->address, fe->pte); - page = vm_normal_page(vma, addr, pte); + page = vm_normal_page(vma, fe->address, pte); if (!page) { - pte_unmap_unlock(ptep, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); return 0; } /* TODO: handle PTE-mapped THP */ if (PageCompound(page)) { - pte_unmap_unlock(ptep, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); return 0; } @@ -3281,8 +3408,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); - pte_unmap_unlock(ptep, ptl); + target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, + &flags); + pte_unmap_unlock(fe->pte, fe->ptl); if (target_nid == -1) { put_page(page); goto out; @@ -3302,24 +3430,29 @@ out: return 0; } -static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, unsigned int flags) +static int create_huge_pmd(struct fault_env *fe) { + struct vm_area_struct *vma = fe->vma; if (vma_is_anonymous(vma)) - return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); + return do_huge_pmd_anonymous_page(fe); if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, address, pmd, flags); + return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, + fe->flags); return VM_FAULT_FALLBACK; } -static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, pmd_t orig_pmd, - unsigned int flags) +static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) { - if (vma_is_anonymous(vma)) - return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); - if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, address, pmd, flags); + if (vma_is_anonymous(fe->vma)) + return do_huge_pmd_wp_page(fe, orig_pmd); + if (fe->vma->vm_ops->pmd_fault) + return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, + fe->flags); + + /* COW handled on pte level: split pmd */ + VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); + split_huge_pmd(fe->vma, fe->pmd, fe->address); + return VM_FAULT_FALLBACK; } @@ -3332,59 +3465,79 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with pte unmapped and unlocked. + * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow + * concurrent faults). * - * The mmap_sem may have been released depending on flags and our - * return value. See filemap_fault() and __lock_page_or_retry(). + * The mmap_sem may have been released depending on flags and our return value. + * See filemap_fault() and __lock_page_or_retry(). 
*/ -static int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, unsigned int flags) +static int handle_pte_fault(struct fault_env *fe) { pte_t entry; - spinlock_t *ptl; - /* - * some architectures can have larger ptes than wordsize, - * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y, - * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses. - * The code below just needs a consistent view for the ifs and - * we later double check anyway with the ptl lock held. So here - * a barrier will do. - */ - entry = *pte; - barrier(); - if (!pte_present(entry)) { + if (unlikely(pmd_none(*fe->pmd))) { + /* + * Leave __pte_alloc() until later: because vm_ops->fault may + * want to allocate huge page, and if we expose page table + * for an instant, it will be difficult to retract from + * concurrent faults and from rmap lookups. + */ + fe->pte = NULL; + } else { + /* See comment in pte_alloc_one_map() */ + if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + return 0; + /* + * A regular pmd is established and it can't morph into a huge + * pmd from under us anymore at this point because we hold the + * mmap_sem read mode and khugepaged takes it in write mode. + * So now it's safe to run pte_offset_map(). + */ + fe->pte = pte_offset_map(fe->pmd, fe->address); + + entry = *fe->pte; + + /* + * some architectures can have larger ptes than wordsize, + * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and + * CONFIG_32BIT=y, so READ_ONCE or ACCESS_ONCE cannot guarantee + * atomic accesses. The code below just needs a consistent + * view for the ifs and we later double check anyway with the + * ptl lock held. So here a barrier will do. + */ + barrier(); if (pte_none(entry)) { - if (vma_is_anonymous(vma)) - return do_anonymous_page(mm, vma, address, - pte, pmd, flags); - else - return do_fault(mm, vma, address, pte, pmd, - flags, entry); + pte_unmap(fe->pte); + fe->pte = NULL; } - return do_swap_page(mm, vma, address, - pte, pmd, flags, entry); } + if (!fe->pte) { + if (vma_is_anonymous(fe->vma)) + return do_anonymous_page(fe); + else + return do_fault(fe); + } + + if (!pte_present(entry)) + return do_swap_page(fe, entry); + if (pte_protnone(entry)) - return do_numa_page(mm, vma, address, entry, pte, pmd); + return do_numa_page(fe, entry); - ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); - if (unlikely(!pte_same(*pte, entry))) + fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); + spin_lock(fe->ptl); + if (unlikely(!pte_same(*fe->pte, entry))) goto unlock; - if (flags & FAULT_FLAG_WRITE) { + if (fe->flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) - return do_wp_page(mm, vma, address, - pte, pmd, ptl, entry); + return do_wp_page(fe, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { - update_mmu_cache(vma, address, pte); + if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, + fe->flags & FAULT_FLAG_WRITE)) { + update_mmu_cache(fe->vma, fe->address, fe->pte); } else { /* * This is needed only for protection faults but the arch code @@ -3392,11 +3545,11 @@ static int handle_pte_fault(struct mm_struct *mm, * This still avoids useless tlb flushes for .text page faults * with threads. 
*/ - if (flags & FAULT_FLAG_WRITE) - flush_tlb_fix_spurious_fault(vma, address); + if (fe->flags & FAULT_FLAG_WRITE) + flush_tlb_fix_spurious_fault(fe->vma, fe->address); } unlock: - pte_unmap_unlock(pte, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); return 0; } @@ -3406,87 +3559,51 @@ unlock: * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, unsigned int flags) +static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, + unsigned int flags) { + struct fault_env fe = { + .vma = vma, + .address = address, + .flags = flags, + }; + struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, - flags & FAULT_FLAG_INSTRUCTION, - flags & FAULT_FLAG_REMOTE)) - return VM_FAULT_SIGSEGV; - - if (unlikely(is_vm_hugetlb_page(vma))) - return hugetlb_fault(mm, vma, address, flags); pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); if (!pud) return VM_FAULT_OOM; - pmd = pmd_alloc(mm, pud, address); - if (!pmd) + fe.pmd = pmd_alloc(mm, pud, address); + if (!fe.pmd) return VM_FAULT_OOM; - if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { - int ret = create_huge_pmd(mm, vma, address, pmd, flags); + if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { + int ret = create_huge_pmd(&fe); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - pmd_t orig_pmd = *pmd; + pmd_t orig_pmd = *fe.pmd; int ret; barrier(); if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { - unsigned int dirty = flags & FAULT_FLAG_WRITE; - if (pmd_protnone(orig_pmd)) - return do_huge_pmd_numa_page(mm, vma, address, - orig_pmd, pmd); + return do_huge_pmd_numa_page(&fe, orig_pmd); - if (dirty && !pmd_write(orig_pmd)) { - ret = wp_huge_pmd(mm, vma, address, pmd, - orig_pmd, flags); + if ((fe.flags & FAULT_FLAG_WRITE) && + !pmd_write(orig_pmd)) { + ret = wp_huge_pmd(&fe, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - huge_pmd_set_accessed(mm, vma, address, pmd, - orig_pmd, dirty); + huge_pmd_set_accessed(&fe, orig_pmd); return 0; } } } - /* - * Use pte_alloc() instead of pte_alloc_map, because we can't - * run pte_offset_map on the pmd, if an huge pmd could - * materialize from under us from a different thread. - */ - if (unlikely(pte_alloc(mm, pmd, address))) - return VM_FAULT_OOM; - /* - * If a huge pmd materialized under us just retry later. Use - * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd - * didn't become pmd_trans_huge under us and then back to pmd_none, as - * a result of MADV_DONTNEED running immediately after a huge pmd fault - * in a different thread of this mm, in turn leading to a misleading - * pmd_trans_huge() retval. All we have to ensure is that it is a - * regular pmd that we can walk with pte_offset_map() and we can do that - * through an atomic read in C, which is what pmd_trans_unstable() - * provides. - */ - if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd))) - return 0; - /* - * A regular pmd is established and it can't morph into a huge pmd - * from under us anymore at this point because we hold the mmap_sem - * read mode and khugepaged takes it in write mode. So now it's - * safe to run pte_offset_map(). 
- */ - pte = pte_offset_map(pmd, address); - - return handle_pte_fault(mm, vma, address, pte, pmd, flags); + return handle_pte_fault(&fe); } /* @@ -3495,15 +3612,15 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, unsigned int flags) +int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, + unsigned int flags) { int ret; __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); - mem_cgroup_count_vm_event(mm, PGFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT); /* do counter updates before entering really critical section. */ check_sync_rss_stat(current); @@ -3515,7 +3632,15 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_USER) mem_cgroup_oom_enable(); - ret = __handle_mm_fault(mm, vma, address, flags); + if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, + flags & FAULT_FLAG_INSTRUCTION, + flags & FAULT_FLAG_REMOTE)) + return VM_FAULT_SIGSEGV; + + if (unlikely(is_vm_hugetlb_page(vma))) + ret = hugetlb_fault(vma->vm_mm, vma, address, flags); + else + ret = __handle_mm_fault(vma, address, flags); if (flags & FAULT_FLAG_USER) { mem_cgroup_oom_disable(); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e3cbdcaff2a5..82d0b98d27f8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -449,6 +449,25 @@ out_fail: return -1; } +static struct zone * __meminit move_pfn_range(int zone_shift, + unsigned long start_pfn, unsigned long end_pfn) +{ + struct zone *zone = page_zone(pfn_to_page(start_pfn)); + int ret = 0; + + if (zone_shift < 0) + ret = move_pfn_range_left(zone + zone_shift, zone, + start_pfn, end_pfn); + else if (zone_shift) + ret = move_pfn_range_right(zone, zone + zone_shift, + start_pfn, end_pfn); + + if (ret) + return NULL; + + return zone + zone_shift; +} + static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { @@ -1028,6 +1047,37 @@ static void node_states_set_node(int node, struct memory_notify *arg) node_set_state(node, N_MEMORY); } +int zone_can_shift(unsigned long pfn, unsigned long nr_pages, + enum zone_type target) +{ + struct zone *zone = page_zone(pfn_to_page(pfn)); + enum zone_type idx = zone_idx(zone); + int i; + + if (idx < target) { + /* pages must be at end of current zone */ + if (pfn + nr_pages != zone_end_pfn(zone)) + return 0; + + /* no zones in use between current zone and target */ + for (i = idx + 1; i < target; i++) + if (zone_is_initialized(zone - idx + i)) + return 0; + } + + if (target < idx) { + /* pages must be at beginning of current zone */ + if (pfn != zone->zone_start_pfn) + return 0; + + /* no zones in use between current zone and target */ + for (i = target + 1; i < idx; i++) + if (zone_is_initialized(zone - idx + i)) + return 0; + } + + return target - idx; +} /* Must be protected by mem_hotplug_begin() */ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) @@ -1039,6 +1089,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int nid; int ret; struct memory_notify arg; + int zone_shift = 0; /* * This doesn't need a lock to do pfn_to_page(). 
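The mm/memory.c hunks above stop passing mm/vma/address/pmd/pte/flags around as separate arguments and instead thread a single struct fault_env through handle_pte_fault(), do_fault() and the read/COW/shared fault helpers. The structure definition is not part of these hunks; judging only from the fields they touch (fe->vma, fe->address, fe->flags, fe->pmd, fe->pte, fe->ptl, fe->prealloc_pte), a rough sketch with guessed field types is:

        /* Sketch only; the real definition lives elsewhere in the series and may differ. */
        struct fault_env {
                struct vm_area_struct *vma;     /* faulting VMA */
                unsigned long address;          /* faulting user virtual address */
                unsigned int flags;             /* FAULT_FLAG_xxx */
                pmd_t *pmd;                     /* pmd entry covering address */
                pte_t *pte;                     /* mapped pte for address, or NULL */
                spinlock_t *ptl;                /* page table lock protecting *pte */
                pgtable_t prealloc_pte;         /* table preallocated by do_fault_around() */
        };

Note the new contract spelled out in the alloc_set_pte() comment: when fe->pte is left non-NULL on return, the caller must do pte_unmap_unlock(fe->pte, fe->ptl) itself, which is exactly what the do_read_fault()/do_cow_fault()/do_shared_fault() hunks above do.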
@@ -1052,19 +1103,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ !can_online_high_movable(zone)) return -EINVAL; - if (online_type == MMOP_ONLINE_KERNEL && - zone_idx(zone) == ZONE_MOVABLE) { - if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) - return -EINVAL; - } - if (online_type == MMOP_ONLINE_MOVABLE && - zone_idx(zone) == ZONE_MOVABLE - 1) { - if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) - return -EINVAL; - } + if (online_type == MMOP_ONLINE_KERNEL) + zone_shift = zone_can_shift(pfn, nr_pages, ZONE_NORMAL); + else if (online_type == MMOP_ONLINE_MOVABLE) + zone_shift = zone_can_shift(pfn, nr_pages, ZONE_MOVABLE); - /* Previous code may changed the zone of the pfn range */ - zone = page_zone(pfn_to_page(pfn)); + zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); + if (!zone) + return -EINVAL; arg.start_pfn = pfn; arg.nr_pages = nr_pages; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 297d6854f849..53e40d3f3933 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -512,6 +512,8 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, } } + if (pmd_trans_unstable(pmd)) + return 0; retry: pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { @@ -529,7 +531,7 @@ retry: nid = page_to_nid(page); if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) continue; - if (PageTransCompound(page) && PageAnon(page)) { + if (PageTransCompound(page)) { get_page(page); pte_unmap_unlock(pte, ptl); lock_page(page); diff --git a/mm/migrate.c b/mm/migrate.c index bd3fdc202e8b..2232f6923cc7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -31,6 +31,7 @@ #include <linux/vmalloc.h> #include <linux/security.h> #include <linux/backing-dev.h> +#include <linux/compaction.h> #include <linux/syscalls.h> #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> @@ -73,6 +74,81 @@ int migrate_prep_local(void) return 0; } +bool isolate_movable_page(struct page *page, isolate_mode_t mode) +{ + struct address_space *mapping; + + /* + * Avoid burning cycles with pages that are yet under __free_pages(), + * or just got freed under us. + * + * In case we 'win' a race for a movable page being freed under us and + * raise its refcount preventing __free_pages() from doing its job + * the put_page() at the end of this block will take care of + * release this page, thus avoiding a nasty leakage. + */ + if (unlikely(!get_page_unless_zero(page))) + goto out; + + /* + * Check PageMovable before holding a PG_lock because page's owner + * assumes anybody doesn't touch PG_lock of newly allocated page + * so unconditionally grapping the lock ruins page's owner side. + */ + if (unlikely(!__PageMovable(page))) + goto out_putpage; + /* + * As movable pages are not isolated from LRU lists, concurrent + * compaction threads can race against page migration functions + * as well as race against the releasing a page. + * + * In order to avoid having an already isolated movable page + * being (wrongly) re-isolated while it is under migration, + * or to avoid attempting to isolate pages being released, + * lets be sure we have the page lock + * before proceeding with the movable page isolation steps. 
+ */ + if (unlikely(!trylock_page(page))) + goto out_putpage; + + if (!PageMovable(page) || PageIsolated(page)) + goto out_no_isolated; + + mapping = page_mapping(page); + VM_BUG_ON_PAGE(!mapping, page); + + if (!mapping->a_ops->isolate_page(page, mode)) + goto out_no_isolated; + + /* Driver shouldn't use PG_isolated bit of page->flags */ + WARN_ON_ONCE(PageIsolated(page)); + __SetPageIsolated(page); + unlock_page(page); + + return true; + +out_no_isolated: + unlock_page(page); +out_putpage: + put_page(page); +out: + return false; +} + +/* It should be called on page which is PG_movable */ +void putback_movable_page(struct page *page) +{ + struct address_space *mapping; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + mapping = page_mapping(page); + mapping->a_ops->putback_page(page); + __ClearPageIsolated(page); +} + /* * Put previously isolated pages back onto the appropriate lists * from where they were once taken off for compaction/migration. @@ -94,10 +170,23 @@ void putback_movable_pages(struct list_head *l) list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - if (unlikely(isolated_balloon_page(page))) - balloon_page_putback(page); - else + /* + * We isolated non-lru movable page so here we can use + * __PageMovable because LRU page's mapping cannot have + * PAGE_MAPPING_MOVABLE. + */ + if (unlikely(__PageMovable(page))) { + VM_BUG_ON_PAGE(!PageIsolated(page), page); + lock_page(page); + if (PageMovable(page)) + putback_movable_page(page); + else + __ClearPageIsolated(page); + unlock_page(page); + put_page(page); + } else { putback_lru_page(page); + } } } @@ -170,7 +259,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, } else if (PageAnon(new)) page_add_anon_rmap(new, vma, addr, false); else - page_add_file_rmap(new); + page_add_file_rmap(new, false); if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); @@ -594,7 +683,7 @@ EXPORT_SYMBOL(migrate_page_copy); ***********************************************************/ /* - * Common logic to directly migrate a single page suitable for + * Common logic to directly migrate a single LRU page suitable for * pages that do not use PagePrivate/PagePrivate2. * * Pages are locked upon entry and exit. @@ -757,33 +846,72 @@ static int move_to_new_page(struct page *newpage, struct page *page, enum migrate_mode mode) { struct address_space *mapping; - int rc; + int rc = -EAGAIN; + bool is_lru = !__PageMovable(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); mapping = page_mapping(page); - if (!mapping) - rc = migrate_page(mapping, newpage, page, mode); - else if (mapping->a_ops->migratepage) + + if (likely(is_lru)) { + if (!mapping) + rc = migrate_page(mapping, newpage, page, mode); + else if (mapping->a_ops->migratepage) + /* + * Most pages have a mapping and most filesystems + * provide a migratepage callback. Anonymous pages + * are part of swap space which also has its own + * migratepage callback. This is the most common path + * for page migration. + */ + rc = mapping->a_ops->migratepage(mapping, newpage, + page, mode); + else + rc = fallback_migrate_page(mapping, newpage, + page, mode); + } else { /* - * Most pages have a mapping and most filesystems provide a - * migratepage callback. Anonymous pages are part of swap - * space which also has its own migratepage callback. 
This - * is the most common path for page migration. + * In case of non-lru page, it could be released after + * isolation step. In that case, we shouldn't try migration. */ - rc = mapping->a_ops->migratepage(mapping, newpage, page, mode); - else - rc = fallback_migrate_page(mapping, newpage, page, mode); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + if (!PageMovable(page)) { + rc = MIGRATEPAGE_SUCCESS; + __ClearPageIsolated(page); + goto out; + } + + rc = mapping->a_ops->migratepage(mapping, newpage, + page, mode); + WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && + !PageIsolated(page)); + } /* * When successful, old pagecache page->mapping must be cleared before * page is freed; but stats require that PageAnon be left as PageAnon. */ if (rc == MIGRATEPAGE_SUCCESS) { - if (!PageAnon(page)) + if (__PageMovable(page)) { + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + /* + * We clear PG_movable under page_lock so any compactor + * cannot try to migrate this page. + */ + __ClearPageIsolated(page); + } + + /* + * Anonymous and movable page->mapping will be cleard by + * free_pages_prepare so don't reset it here for keeping + * the type to work PageAnon, for example. + */ + if (!PageMappingFlags(page)) page->mapping = NULL; } +out: return rc; } @@ -793,6 +921,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, int rc = -EAGAIN; int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; + bool is_lru = !__PageMovable(page); if (!trylock_page(page)) { if (!force || mode == MIGRATE_ASYNC) @@ -861,15 +990,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (unlikely(!trylock_page(newpage))) goto out_unlock; - if (unlikely(isolated_balloon_page(page))) { - /* - * A ballooned page does not need any special attention from - * physical to virtual reverse mapping procedures. - * Skip any attempt to unmap PTEs or to remap swap cache, - * in order to avoid burning cycles at rmap level, and perform - * the page migration right away (proteced by page lock). - */ - rc = balloon_page_migrate(newpage, page, mode); + if (unlikely(!is_lru)) { + rc = move_to_new_page(newpage, page, mode); goto out_unlock_both; } @@ -915,6 +1037,19 @@ out_unlock: put_anon_vma(anon_vma); unlock_page(page); out: + /* + * If migration is successful, decrease refcount of the newpage + * which will not free the page because new page owner increased + * refcounter. As well, if it is LRU page, add the page to LRU + * list in here. + */ + if (rc == MIGRATEPAGE_SUCCESS) { + if (unlikely(__PageMovable(newpage))) + put_page(newpage); + else + putback_lru_page(newpage); + } + return rc; } @@ -948,6 +1083,18 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, if (page_count(page) == 1) { /* page was freed from under us. So we are done. 
*/ + ClearPageActive(page); + ClearPageUnevictable(page); + if (unlikely(__PageMovable(page))) { + lock_page(page); + if (!PageMovable(page)) + __ClearPageIsolated(page); + unlock_page(page); + } + if (put_new_page) + put_new_page(newpage, private); + else + put_page(newpage); goto out; } @@ -960,10 +1107,8 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, } rc = __unmap_and_move(page, newpage, force, mode); - if (rc == MIGRATEPAGE_SUCCESS) { - put_new_page = NULL; + if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(newpage, reason); - } out: if (rc != -EAGAIN) { @@ -976,33 +1121,45 @@ out: list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - /* Soft-offlined page shouldn't go through lru cache list */ - if (reason == MR_MEMORY_FAILURE && rc == MIGRATEPAGE_SUCCESS) { + } + + /* + * If migration is successful, releases reference grabbed during + * isolation. Otherwise, restore the page to right list unless + * we want to retry. + */ + if (rc == MIGRATEPAGE_SUCCESS) { + put_page(page); + if (reason == MR_MEMORY_FAILURE) { /* - * With this release, we free successfully migrated - * page and set PG_HWPoison on just freed page - * intentionally. Although it's rather weird, it's how - * HWPoison flag works at the moment. + * Set PG_HWPoison on just freed page + * intentionally. Although it's rather weird, + * it's how HWPoison flag works at the moment. */ - put_page(page); if (!test_set_page_hwpoison(page)) num_poisoned_pages_inc(); - } else - putback_lru_page(page); - } + } + } else { + if (rc != -EAGAIN) { + if (likely(!__PageMovable(page))) { + putback_lru_page(page); + goto put_new; + } - /* - * If migration was not successful and there's a freeing callback, use - * it. Otherwise, putback_lru_page() will drop the reference grabbed - * during isolation. 
- */ - if (put_new_page) - put_new_page(newpage, private); - else if (unlikely(__is_movable_balloon_page(newpage))) { - /* drop our reference, page already in the balloon */ - put_page(newpage); - } else - putback_lru_page(newpage); + lock_page(page); + if (PageMovable(page)) + putback_movable_page(page); + else + __ClearPageIsolated(page); + unlock_page(page); + put_page(page); + } +put_new: + if (put_new_page) + put_new_page(newpage, private); + else + put_page(newpage); + } if (result) { if (rc) @@ -1829,8 +1986,7 @@ fail_putback: } orig_entry = *pmd; - entry = mk_pmd(new_page, vma->vm_page_prot); - entry = pmd_mkhuge(entry); + entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); /* diff --git a/mm/mmap.c b/mm/mmap.c index de2c1769cc68..25c2b4e0fbdc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -25,6 +25,7 @@ #include <linux/personality.h> #include <linux/security.h> #include <linux/hugetlb.h> +#include <linux/shmem_fs.h> #include <linux/profile.h> #include <linux/export.h> #include <linux/mount.h> @@ -675,6 +676,8 @@ again: remove_next = 1 + (end > next->vm_end); } } + vma_adjust_trans_huge(vma, start, end, adjust_next); + if (file) { mapping = file->f_mapping; root = &mapping->i_mmap; @@ -695,8 +698,6 @@ again: remove_next = 1 + (end > next->vm_end); } } - vma_adjust_trans_huge(vma, start, end, adjust_next); - anon_vma = vma->anon_vma; if (!anon_vma && adjust_next) anon_vma = next->anon_vma; @@ -1897,8 +1898,19 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, return -ENOMEM; get_area = current->mm->get_unmapped_area; - if (file && file->f_op->get_unmapped_area) - get_area = file->f_op->get_unmapped_area; + if (file) { + if (file->f_op->get_unmapped_area) + get_area = file->f_op->get_unmapped_area; + } else if (flags & MAP_SHARED) { + /* + * mmap_region() will call shmem_zero_setup() to create a file, + * so use shmem's get_unmapped_area in case it can be huge. + * do_mmap_pgoff() will clear pgoff, so match alignment. + */ + pgoff = 0; + get_area = shmem_get_unmapped_area; + } + addr = get_area(file, addr, len, pgoff, flags); if (IS_ERR_VALUE(addr)) return addr; @@ -2591,6 +2603,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, /* drop PG_Mlocked flag for over-mapped range */ for (tmp = vma; tmp->vm_start >= start + size; tmp = tmp->vm_next) { + /* + * Split pmd and munlock page on the border + * of the range. 
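The mm/migrate.c changes above replace the balloon-page special cases with a generic non-LRU movable page path driven through the owner's address_space_operations: isolate_movable_page() takes the page lock, calls ->isolate_page() and sets PG_isolated; move_to_new_page() hands the page to ->migratepage(); and putback_movable_page() undoes a failed isolation via ->putback_page(). A minimal sketch of a driver plugging into this, with hypothetical mydrv_* names and all driver-specific bookkeeping omitted:

        #include <linux/fs.h>
        #include <linux/migrate.h>

        static bool mydrv_isolate_page(struct page *page, isolate_mode_t mode)
        {
                /* Called with the page locked; return true if migration may proceed. */
                return true;
        }

        static int mydrv_migratepage(struct address_space *mapping,
                                     struct page *newpage, struct page *page,
                                     enum migrate_mode mode)
        {
                /* Copy the driver's payload from page to newpage, then report success. */
                return MIGRATEPAGE_SUCCESS;
        }

        static void mydrv_putback_page(struct page *page)
        {
                /* Called with the page locked after a failed or aborted migration. */
        }

        static const struct address_space_operations mydrv_aops = {
                .isolate_page = mydrv_isolate_page,
                .migratepage  = mydrv_migratepage,
                .putback_page = mydrv_putback_page,
        };

How a page gets marked movable in the first place (the PAGE_MAPPING_MOVABLE bit in page->mapping that __PageMovable() tests above) is set up elsewhere in the series, so a real driver has more to do than this sketch shows.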
+ */ + vma_adjust_trans_huge(tmp, start, start + size, 0); + munlock_vma_pages_range(tmp, max(tmp->vm_start, start), min(tmp->vm_end, start + size)); diff --git a/mm/mprotect.c b/mm/mprotect.c index 5019a1ef2848..a4830f0325fe 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -163,7 +163,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { split_huge_pmd(vma, pmd, addr); - if (pmd_none(*pmd)) + if (pmd_trans_unstable(pmd)) continue; } else { int nr_ptes = change_huge_pmd(vma, pmd, addr, diff --git a/mm/mremap.c b/mm/mremap.c index 1f157adfdaf9..da22ad2a5678 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -210,9 +210,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, } } split_huge_pmd(vma, old_pmd, old_addr); - if (pmd_none(*old_pmd)) + if (pmd_trans_unstable(old_pmd)) continue; - VM_BUG_ON(pmd_trans_huge(*old_pmd)); } if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr)) break; diff --git a/mm/nommu.c b/mm/nommu.c index c2e58880207f..95daf81a4855 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1809,7 +1809,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +void filemap_map_pages(struct fault_env *fe, + pgoff_t start_pgoff, pgoff_t end_pgoff) { BUG(); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ddf74487f848..4c21f744daa6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -176,11 +176,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, /* * Do not even consider tasks which are explicitly marked oom - * unkillable or have been already oom reaped. + * unkillable or have been already oom reaped or the are in + * the middle of vfork */ adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN || - test_bit(MMF_OOM_REAPED, &p->mm->flags)) { + test_bit(MMF_OOM_REAPED, &p->mm->flags) || + in_vfork(p)) { task_unlock(p); return 0; } @@ -274,17 +276,29 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc, #endif enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, - struct task_struct *task, unsigned long totalpages) + struct task_struct *task) { if (oom_unkillable_task(task, NULL, oc->nodemask)) return OOM_SCAN_CONTINUE; /* * This task already has access to memory reserves and is being killed. - * Don't allow any other task to have access to the reserves. + * Don't allow any other task to have access to the reserves unless + * the task has MMF_OOM_REAPED because chances that it would release + * any memory is quite low. 
*/ - if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) - return OOM_SCAN_ABORT; + if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) { + struct task_struct *p = find_lock_task_mm(task); + enum oom_scan_t ret = OOM_SCAN_ABORT; + + if (p) { + if (test_bit(MMF_OOM_REAPED, &p->mm->flags)) + ret = OOM_SCAN_CONTINUE; + task_unlock(p); + } + + return ret; + } /* * If task is allocating a lot of memory and has been marked to be @@ -311,7 +325,7 @@ static struct task_struct *select_bad_process(struct oom_control *oc, for_each_process(p) { unsigned int points; - switch (oom_scan_process_thread(oc, p, totalpages)) { + switch (oom_scan_process_thread(oc, p)) { case OOM_SCAN_SELECT: chosen = p; chosen_points = ULONG_MAX; @@ -383,8 +397,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) rcu_read_unlock(); } -static void dump_header(struct oom_control *oc, struct task_struct *p, - struct mem_cgroup *memcg) +static void dump_header(struct oom_control *oc, struct task_struct *p) { pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, @@ -392,12 +405,12 @@ static void dump_header(struct oom_control *oc, struct task_struct *p, cpuset_print_current_mems_allowed(); dump_stack(); - if (memcg) - mem_cgroup_print_oom_info(memcg, p); + if (oc->memcg) + mem_cgroup_print_oom_info(oc->memcg, p); else show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) - dump_tasks(memcg, oc->nodemask); + dump_tasks(oc->memcg, oc->nodemask); } /* @@ -416,7 +429,7 @@ bool oom_killer_disabled __read_mostly; * task's threads: if one of those is using this mm then this task was also * using it. */ -static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) { struct task_struct *t; @@ -453,7 +466,7 @@ static bool __oom_reap_task(struct task_struct *tsk) * We have to make sure to not race with the victim exit path * and cause premature new oom victim selection: * __oom_reap_task exit_mm - * atomic_inc_not_zero + * mmget_not_zero * mmput * atomic_dec_and_test * exit_oom_victim @@ -475,12 +488,22 @@ static bool __oom_reap_task(struct task_struct *tsk) if (!p) goto unlock_oom; mm = p->mm; - atomic_inc(&mm->mm_users); + atomic_inc(&mm->mm_count); task_unlock(p); if (!down_read_trylock(&mm->mmap_sem)) { ret = false; - goto unlock_oom; + goto mm_drop; + } + + /* + * increase mm_users only after we know we will reap something so + * that the mmput_async is called only when we have reaped something + * and delayed __mmput doesn't matter that much + */ + if (!mmget_not_zero(mm)) { + up_read(&mm->mmap_sem); + goto mm_drop; } tlb_gather_mmu(&tlb, mm, 0, -1); @@ -522,15 +545,16 @@ static bool __oom_reap_task(struct task_struct *tsk) * to release its memory. */ set_bit(MMF_OOM_REAPED, &mm->flags); -unlock_oom: - mutex_unlock(&oom_lock); /* * Drop our reference but make sure the mmput slow path is called from a * different context because we shouldn't risk we get stuck there and * put the oom_reaper out of the way. 
*/ - if (mm) - mmput_async(mm); + mmput_async(mm); +mm_drop: + mmdrop(mm); +unlock_oom: + mutex_unlock(&oom_lock); return ret; } @@ -544,8 +568,27 @@ static void oom_reap_task(struct task_struct *tsk) schedule_timeout_idle(HZ/10); if (attempts > MAX_OOM_REAP_RETRIES) { + struct task_struct *p; + pr_info("oom_reaper: unable to reap pid:%d (%s)\n", task_pid_nr(tsk), tsk->comm); + + /* + * If we've already tried to reap this task in the past and + * failed it probably doesn't make much sense to try yet again + * so hide the mm from the oom killer so that it can move on + * to another task with a different mm struct. + */ + p = find_lock_task_mm(tsk); + if (p) { + if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) { + pr_info("oom_reaper: giving up pid:%d (%s)\n", + task_pid_nr(tsk), tsk->comm); + set_bit(MMF_OOM_REAPED, &p->mm->flags); + } + task_unlock(p); + } + debug_show_all_locks(); } @@ -584,7 +627,7 @@ static int oom_reaper(void *unused) return 0; } -static void wake_oom_reaper(struct task_struct *tsk) +void wake_oom_reaper(struct task_struct *tsk) { if (!oom_reaper_th) return; @@ -602,46 +645,6 @@ static void wake_oom_reaper(struct task_struct *tsk) wake_up(&oom_reaper_wait); } -/* Check if we can reap the given task. This has to be called with stable - * tsk->mm - */ -void try_oom_reaper(struct task_struct *tsk) -{ - struct mm_struct *mm = tsk->mm; - struct task_struct *p; - - if (!mm) - return; - - /* - * There might be other threads/processes which are either not - * dying or even not killable. - */ - if (atomic_read(&mm->mm_users) > 1) { - rcu_read_lock(); - for_each_process(p) { - if (!process_shares_mm(p, mm)) - continue; - if (fatal_signal_pending(p)) - continue; - - /* - * If the task is exiting make sure the whole thread group - * is exiting and cannot acces mm anymore. - */ - if (signal_group_exit(p->signal)) - continue; - - /* Give up */ - rcu_read_unlock(); - return; - } - rcu_read_unlock(); - } - - wake_oom_reaper(tsk); -} - static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); @@ -653,10 +656,6 @@ static int __init oom_init(void) return 0; } subsys_initcall(oom_init) -#else -static void wake_oom_reaper(struct task_struct *tsk) -{ -} #endif /** @@ -733,13 +732,98 @@ void oom_killer_enable(void) oom_killer_disabled = false; } +static inline bool __task_will_free_mem(struct task_struct *task) +{ + struct signal_struct *sig = task->signal; + + /* + * A coredumping process may sleep for an extended period in exit_mm(), + * so the oom killer cannot assume that the process will promptly exit + * and release memory. + */ + if (sig->flags & SIGNAL_GROUP_COREDUMP) + return false; + + if (sig->flags & SIGNAL_GROUP_EXIT) + return true; + + if (thread_group_empty(task) && (task->flags & PF_EXITING)) + return true; + + return false; +} + +/* + * Checks whether the given task is dying or exiting and likely to + * release its address space. This means that all threads and processes + * sharing the same mm have to be killed or exiting. 
+ */ +bool task_will_free_mem(struct task_struct *task) +{ + struct mm_struct *mm; + struct task_struct *p; + bool ret; + + if (!__task_will_free_mem(task)) + return false; + + /* + * If the process has passed exit_mm we have to skip it because + * we have lost a link to other tasks sharing this mm, we do not + * have anything to reap and the task might then get stuck waiting + * for parent as zombie and we do not want it to hold TIF_MEMDIE + */ + p = find_lock_task_mm(task); + if (!p) + return false; + + mm = p->mm; + + /* + * This task has already been drained by the oom reaper so there are + * only small chances it will free some more + */ + if (test_bit(MMF_OOM_REAPED, &mm->flags)) { + task_unlock(p); + return false; + } + + if (atomic_read(&mm->mm_users) <= 1) { + task_unlock(p); + return true; + } + + /* pin the mm to not get freed and reused */ + atomic_inc(&mm->mm_count); + task_unlock(p); + + /* + * This is really pessimistic but we do not have any reliable way + * to check that external processes share with our mm + */ + rcu_read_lock(); + for_each_process(p) { + if (!process_shares_mm(p, mm)) + continue; + if (same_thread_group(task, p)) + continue; + ret = __task_will_free_mem(p); + if (!ret) + break; + } + rcu_read_unlock(); + mmdrop(mm); + + return ret; +} + /* * Must be called while holding a reference to p, which will be released upon * returning. */ void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int points, unsigned long totalpages, - struct mem_cgroup *memcg, const char *message) + const char *message) { struct task_struct *victim = p; struct task_struct *child; @@ -754,18 +838,15 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly */ - task_lock(p); - if (p->mm && task_will_free_mem(p)) { + if (task_will_free_mem(p)) { mark_oom_victim(p); - try_oom_reaper(p); - task_unlock(p); + wake_oom_reaper(p); put_task_struct(p); return; } - task_unlock(p); if (__ratelimit(&oom_rs)) - dump_header(oc, p, memcg); + dump_header(oc, p); pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", message, task_pid_nr(p), p->comm, points); @@ -786,8 +867,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, /* * oom_badness() returns 0 if the thread is unkillable */ - child_points = oom_badness(child, memcg, oc->nodemask, - totalpages); + child_points = oom_badness(child, + oc->memcg, oc->nodemask, totalpages); if (child_points > victim_points) { put_task_struct(victim); victim = child; @@ -840,14 +921,18 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, continue; if (same_thread_group(p, victim)) continue; - if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) || - p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) { /* * We cannot use oom_reaper for the mm shared by this * process because it wouldn't get killed and so the - * memory might be still used. + * memory might be still used. Hide the mm from the oom + * killer to guarantee OOM forward progress. 
*/ can_oom_reap = false; + set_bit(MMF_OOM_REAPED, &mm->flags); + pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", + task_pid_nr(victim), victim->comm, + task_pid_nr(p), p->comm); continue; } do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); @@ -865,8 +950,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ -void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint, - struct mem_cgroup *memcg) +void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint) { if (likely(!sysctl_panic_on_oom)) return; @@ -882,7 +966,7 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint, /* Do not panic for oom kills triggered by sysrq */ if (is_sysrq_oom(oc)) return; - dump_header(oc, NULL, memcg); + dump_header(oc, NULL); panic("Out of memory: %s panic_on_oom is enabled\n", sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); } @@ -934,10 +1018,9 @@ bool out_of_memory(struct oom_control *oc) * But don't select if current has already released its mm and cleared * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur. */ - if (current->mm && - (fatal_signal_pending(current) || task_will_free_mem(current))) { + if (current->mm && task_will_free_mem(current)) { mark_oom_victim(current); - try_oom_reaper(current); + wake_oom_reaper(current); return true; } @@ -957,13 +1040,13 @@ bool out_of_memory(struct oom_control *oc) constraint = constrained_alloc(oc, &totalpages); if (constraint != CONSTRAINT_MEMORY_POLICY) oc->nodemask = NULL; - check_panic_on_oom(oc, constraint, NULL); + check_panic_on_oom(oc, constraint); if (sysctl_oom_kill_allocating_task && current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); - oom_kill_process(oc, current, 0, totalpages, NULL, + oom_kill_process(oc, current, 0, totalpages, "Out of memory (oom_kill_allocating_task)"); return true; } @@ -971,12 +1054,11 @@ bool out_of_memory(struct oom_control *oc) p = select_bad_process(oc, &points, totalpages); /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p && !is_sysrq_oom(oc)) { - dump_header(oc, NULL, NULL); + dump_header(oc, NULL); panic("Out of memory and no killable processes...\n"); } if (p && p != (void *)-1UL) { - oom_kill_process(oc, p, points, totalpages, NULL, - "Out of memory"); + oom_kill_process(oc, p, points, totalpages, "Out of memory"); /* * Give the killed process a good chance to exit before trying * to allocate memory again. @@ -988,14 +1070,15 @@ bool out_of_memory(struct oom_control *oc) /* * The pagefault handler calls here because it is out of memory, so kill a - * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a - * parallel oom killing is already in progress so do nothing. + * memory-hogging task. If oom_lock is held by somebody else, a parallel oom + * killing is already in progress so do nothing. 
*/ void pagefault_out_of_memory(void) { struct oom_control oc = { .zonelist = NULL, .nodemask = NULL, + .memcg = NULL, .gfp_mask = 0, .order = 0, }; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e2481949494c..d578d2a56b19 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2563,6 +2563,7 @@ int set_page_dirty(struct page *page) { struct address_space *mapping = page_mapping(page); + page = compound_head(page); if (likely(mapping)) { int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; /* @@ -2747,6 +2748,11 @@ int test_clear_page_writeback(struct page *page) __wb_writeout_inc(wb); } } + + if (mapping->host && !mapping_tagged(mapping, + PAGECACHE_TAG_WRITEBACK)) + sb_clear_inode_writeback(mapping->host); + spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestClearPageWriteback(page); @@ -2774,11 +2780,24 @@ int __test_set_page_writeback(struct page *page, bool keep_write) spin_lock_irqsave(&mapping->tree_lock, flags); ret = TestSetPageWriteback(page); if (!ret) { + bool on_wblist; + + on_wblist = mapping_tagged(mapping, + PAGECACHE_TAG_WRITEBACK); + radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); + + /* + * We can come through here when swapping anonymous + * pages, so we don't necessarily have an inode to track + * for sync. + */ + if (mapping->host && !on_wblist) + sb_mark_inode_writeback(mapping->host); } if (!PageDirty(page)) radix_tree_tag_clear(&mapping->page_tree, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 02c7a0e6e66a..63801c7f6370 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -64,6 +64,7 @@ #include <linux/page_owner.h> #include <linux/kthread.h> #include <linux/random.h> +#include <linux/memcontrol.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -1005,6 +1006,8 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); + if (compound) + ClearPageDoubleMap(page); for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); @@ -1015,8 +1018,12 @@ static __always_inline bool free_pages_prepare(struct page *page, (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; } } - if (PageAnonHead(page)) + if (PageMappingFlags(page)) page->mapping = NULL; + if (memcg_kmem_enabled() && PageKmemcg(page)) { + memcg_kmem_uncharge(page, order); + __ClearPageKmemcg(page); + } if (check_free) bad += free_pages_check(page); if (bad) @@ -1754,6 +1761,18 @@ static bool check_new_pages(struct page *page, unsigned int order) return false; } +void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) +{ + set_page_private(page, 0); + set_page_refcounted(page); + + arch_alloc_page(page, order); + kernel_map_pages(page, 1 << order, 1); + kernel_poison_pages(page, 1 << order, 1); + kasan_alloc_pages(page, order); + set_page_owner(page, order, gfp_flags); +} + static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags) { @@ -1766,13 +1785,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags poisoned &= page_is_poisoned(p); } - set_page_private(page, 0); - set_page_refcounted(page); - - arch_alloc_page(page, order); - kernel_map_pages(page, 1 << order, 1); - kernel_poison_pages(page, 1 << order, 1); - kasan_alloc_pages(page, order); + post_alloc_hook(page, order, gfp_flags); if (!free_pages_prezeroed(poisoned) && 
(gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) @@ -1781,8 +1794,6 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); - set_page_owner(page, order, gfp_flags); - /* * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to * allocate the page. The expectation is that the caller is taking @@ -2491,7 +2502,6 @@ void free_hot_cold_page_list(struct list_head *list, bool cold) void split_page(struct page *page, unsigned int order) { int i; - gfp_t gfp_mask; VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!page_count(page), page); @@ -2505,12 +2515,9 @@ void split_page(struct page *page, unsigned int order) split_page(virt_to_page(page[0].shadow), order); #endif - gfp_mask = get_page_owner_gfp(page); - set_page_owner(page, 0, gfp_mask); - for (i = 1; i < (1 << order); i++) { + for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - set_page_owner(page + i, 0, gfp_mask); - } + split_page_owner(page, order); } EXPORT_SYMBOL_GPL(split_page); @@ -2539,9 +2546,10 @@ int __isolate_free_page(struct page *page, unsigned int order) zone->free_area[order].nr_free--; rmv_page_order(page); - set_page_owner(page, order, __GFP_MOVABLE); - - /* Set the pageblock if the isolated page is at least a pageblock */ + /* + * Set the pageblock if the isolated page is at least half of a + * pageblock + */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { @@ -2557,33 +2565,6 @@ int __isolate_free_page(struct page *page, unsigned int order) } /* - * Similar to split_page except the page is already free. As this is only - * being used for migration, the migratetype of the block also changes. - * As this is called with interrupts disabled, the caller is responsible - * for calling arch_alloc_page() and kernel_map_page() after interrupts - * are enabled. - * - * Note: this is probably too low level an operation for use in drivers. - * Please consult with lkml before using this in your driver. - */ -int split_free_page(struct page *page) -{ - unsigned int order; - int nr_pages; - - order = page_order(page); - - nr_pages = __isolate_free_page(page, order); - if (!nr_pages) - return 0; - - /* Split into individual pages */ - set_page_refcounted(page); - split_page(page, order); - return nr_pages; -} - -/* * Update NUMA hit/miss statistics * * Must be called with interrupts disabled. @@ -3135,6 +3116,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct oom_control oc = { .zonelist = ac->zonelist, .nodemask = ac->nodemask, + .memcg = NULL, .gfp_mask = gfp_mask, .order = order, }; @@ -3898,6 +3880,14 @@ no_zone: } out: + if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page) { + if (unlikely(memcg_kmem_charge(page, gfp_mask, order))) { + __free_pages(page, order); + page = NULL; + } else + __SetPageKmemcg(page); + } + if (kmemcheck_enabled && page) kmemcheck_pagealloc_alloc(page, order, gfp_mask); @@ -4053,56 +4043,6 @@ void __free_page_frag(void *addr) } EXPORT_SYMBOL(__free_page_frag); -/* - * alloc_kmem_pages charges newly allocated pages to the kmem resource counter - * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is - * equivalent to alloc_pages. - * - * It should be used when the caller would like to use kmalloc, but since the - * allocation is large, it has to fall back to the page allocator. 
- */ -struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) -{ - struct page *page; - - page = alloc_pages(gfp_mask, order); - if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { - __free_pages(page, order); - page = NULL; - } - return page; -} - -struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) -{ - struct page *page; - - page = alloc_pages_node(nid, gfp_mask, order); - if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { - __free_pages(page, order); - page = NULL; - } - return page; -} - -/* - * __free_kmem_pages and free_kmem_pages will free pages allocated with - * alloc_kmem_pages. - */ -void __free_kmem_pages(struct page *page, unsigned int order) -{ - memcg_kmem_uncharge(page, order); - __free_pages(page, order); -} - -void free_kmem_pages(unsigned long addr, unsigned int order) -{ - if (addr != 0) { - VM_BUG_ON(!virt_addr_valid((void *)addr)); - __free_kmem_pages(virt_to_page((void *)addr), order); - } -} - static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) { @@ -4404,6 +4344,9 @@ void show_free_areas(unsigned int filter) " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + " anon_thp: %lu shmem_thp: %lu shmem_pmdmapped: %lu\n" +#endif " free:%lu free_pcp:%lu free_cma:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_INACTIVE_ANON), @@ -4421,6 +4364,11 @@ void show_free_areas(unsigned int filter) global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE), +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR, + global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR, + global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR, +#endif global_page_state(NR_FREE_PAGES), free_pcp, global_page_state(NR_FREE_CMA_PAGES)); @@ -4455,6 +4403,11 @@ void show_free_areas(unsigned int filter) " writeback:%lukB" " mapped:%lukB" " shmem:%lukB" +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + " shmem_thp: %lukB" + " shmem_pmdmapped: %lukB" + " anon_thp: %lukB" +#endif " slab_reclaimable:%lukB" " slab_unreclaimable:%lukB" " kernel_stack:%lukB" @@ -4487,6 +4440,12 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_WRITEBACK)), K(zone_page_state(zone, NR_FILE_MAPPED)), K(zone_page_state(zone, NR_SHMEM)), +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + K(zone_page_state(zone, NR_SHMEM_THPS) * HPAGE_PMD_NR), + K(zone_page_state(zone, NR_SHMEM_PMDMAPPED) + * HPAGE_PMD_NR), + K(zone_page_state(zone, NR_ANON_THPS) * HPAGE_PMD_NR), +#endif K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), zone_page_state(zone, NR_KERNEL_STACK) * @@ -5427,7 +5386,7 @@ void __init setup_per_cpu_pageset(void) setup_zone_pageset(zone); } -static noinline __init_refok +static noinline __ref int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; @@ -6057,7 +6016,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) } } -static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) +static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long __maybe_unused start = 0; unsigned long __maybe_unused offset = 0; @@ -6497,15 +6456,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) sizeof(arch_zone_lowest_possible_pfn)); memset(arch_zone_highest_possible_pfn, 0, 
sizeof(arch_zone_highest_possible_pfn)); - arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); - arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; - for (i = 1; i < MAX_NR_ZONES; i++) { + + start_pfn = find_min_pfn_with_active_regions(); + + for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - arch_zone_lowest_possible_pfn[i] = - arch_zone_highest_possible_pfn[i-1]; - arch_zone_highest_possible_pfn[i] = - max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); + + end_pfn = max(max_zone_pfn[i], start_pfn); + arch_zone_lowest_possible_pfn[i] = start_pfn; + arch_zone_highest_possible_pfn[i] = end_pfn; + + start_pfn = end_pfn; } arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 612122bf6a42..4639163b78f9 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -7,6 +7,7 @@ #include <linux/pageblock-flags.h> #include <linux/memory.h> #include <linux/hugetlb.h> +#include <linux/page_owner.h> #include "internal.h" #define CREATE_TRACE_POINTS @@ -108,8 +109,6 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) if (pfn_valid_within(page_to_pfn(buddy)) && !is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); - kernel_map_pages(page, (1 << order), 1); - set_page_refcounted(page); isolated_page = page; } } @@ -128,8 +127,10 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); - if (isolated_page) + if (isolated_page) { + post_alloc_hook(page, order, __GFP_MOVABLE); __free_pages(isolated_page, order); + } } static inline struct page * diff --git a/mm/page_owner.c b/mm/page_owner.c index fedeba88c9cb..8fa50831089f 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -7,11 +7,18 @@ #include <linux/page_owner.h> #include <linux/jump_label.h> #include <linux/migrate.h> +#include <linux/stackdepot.h> + #include "internal.h" +#define PAGE_OWNER_STACK_DEPTH (16) + static bool page_owner_disabled = true; DEFINE_STATIC_KEY_FALSE(page_owner_inited); +static depot_stack_handle_t dummy_handle; +static depot_stack_handle_t failure_handle; + static void init_early_allocated_pages(void); static int early_page_owner_param(char *buf) @@ -34,11 +41,41 @@ static bool need_page_owner(void) return true; } +static noinline void register_dummy_stack(void) +{ + unsigned long entries[4]; + struct stack_trace dummy; + + dummy.nr_entries = 0; + dummy.max_entries = ARRAY_SIZE(entries); + dummy.entries = &entries[0]; + dummy.skip = 0; + + save_stack_trace(&dummy); + dummy_handle = depot_save_stack(&dummy, GFP_KERNEL); +} + +static noinline void register_failure_stack(void) +{ + unsigned long entries[4]; + struct stack_trace failure; + + failure.nr_entries = 0; + failure.max_entries = ARRAY_SIZE(entries); + failure.entries = &entries[0]; + failure.skip = 0; + + save_stack_trace(&failure); + failure_handle = depot_save_stack(&failure, GFP_KERNEL); +} + static void init_page_owner(void) { if (page_owner_disabled) return; + register_dummy_stack(); + register_failure_stack(); static_branch_enable(&page_owner_inited); init_early_allocated_pages(); } @@ -61,25 +98,66 @@ void __reset_page_owner(struct page *page, unsigned int order) } } -void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) +static inline bool check_recursive_alloc(struct stack_trace *trace, + unsigned long ip) { - struct page_ext *page_ext = 
lookup_page_ext(page); + int i, count; + + if (!trace->nr_entries) + return false; + + for (i = 0, count = 0; i < trace->nr_entries; i++) { + if (trace->entries[i] == ip && ++count == 2) + return true; + } + + return false; +} +static noinline depot_stack_handle_t save_stack(gfp_t flags) +{ + unsigned long entries[PAGE_OWNER_STACK_DEPTH]; struct stack_trace trace = { .nr_entries = 0, - .max_entries = ARRAY_SIZE(page_ext->trace_entries), - .entries = &page_ext->trace_entries[0], - .skip = 3, + .entries = entries, + .max_entries = PAGE_OWNER_STACK_DEPTH, + .skip = 0 }; + depot_stack_handle_t handle; + + save_stack_trace(&trace); + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries-1] == ULONG_MAX) + trace.nr_entries--; + + /* + * We need to check recursion here because our request to stackdepot + * could trigger memory allocation to save new entry. New memory + * allocation would reach here and call depot_save_stack() again + * if we don't catch it. There is still not enough memory in stackdepot + * so it would try to allocate memory again and loop forever. + */ + if (check_recursive_alloc(&trace, _RET_IP_)) + return dummy_handle; + + handle = depot_save_stack(&trace, flags); + if (!handle) + handle = failure_handle; + + return handle; +} + +noinline void __set_page_owner(struct page *page, unsigned int order, + gfp_t gfp_mask) +{ + struct page_ext *page_ext = lookup_page_ext(page); if (unlikely(!page_ext)) return; - save_stack_trace(&trace); - + page_ext->handle = save_stack(gfp_mask); page_ext->order = order; page_ext->gfp_mask = gfp_mask; - page_ext->nr_entries = trace.nr_entries; page_ext->last_migrate_reason = -1; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); @@ -94,34 +172,31 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) page_ext->last_migrate_reason = reason; } -gfp_t __get_page_owner_gfp(struct page *page) +void __split_page_owner(struct page *page, unsigned int order) { + int i; struct page_ext *page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) - /* - * The caller just returns 0 if no valid gfp - * So return 0 here too. 
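The recursion guard above is worth spelling out: depot_save_stack() may itself need to allocate memory, re-enter the allocator and arrive back in save_stack(), so the captured trace is scanned for the current call site appearing twice before stack depot is consulted, and the pre-registered dummy handle is used instead when it does. The same check, lifted into a standalone program with made-up return addresses standing in for _RET_IP_:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/*
 * If "ip" already shows up twice in the captured trace, the depot
 * request has recursed back into the allocator; bail out so the
 * caller can fall back to a pre-registered dummy handle.
 */
static bool check_recursive_alloc(const unsigned long *entries,
                                  size_t nr_entries, unsigned long ip)
{
        size_t i, count = 0;

        for (i = 0; i < nr_entries; i++) {
                if (entries[i] == ip && ++count == 2)
                        return true;
        }
        return false;
}

int main(void)
{
        unsigned long once[]  = { 0x1000, 0x2000, 0x3000 };
        unsigned long twice[] = { 0x1000, 0x2000, 0x1000, 0x3000 };

        printf("%d %d\n",
               check_recursive_alloc(once, 3, 0x1000),
               check_recursive_alloc(twice, 4, 0x1000));
        return 0;
}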
- */ - return 0; + return; - return page_ext->gfp_mask; + page_ext->order = 0; + for (i = 1; i < (1 << order); i++) + __copy_page_owner(page, page + i); } void __copy_page_owner(struct page *oldpage, struct page *newpage) { struct page_ext *old_ext = lookup_page_ext(oldpage); struct page_ext *new_ext = lookup_page_ext(newpage); - int i; if (unlikely(!old_ext || !new_ext)) return; new_ext->order = old_ext->order; new_ext->gfp_mask = old_ext->gfp_mask; - new_ext->nr_entries = old_ext->nr_entries; - - for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++) - new_ext->trace_entries[i] = old_ext->trace_entries[i]; + new_ext->last_migrate_reason = old_ext->last_migrate_reason; + new_ext->handle = old_ext->handle; /* * We don't clear the bit on the oldpage as it's going to be freed @@ -137,14 +212,18 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, - struct page *page, struct page_ext *page_ext) + struct page *page, struct page_ext *page_ext, + depot_stack_handle_t handle) { int ret; int pageblock_mt, page_mt; char *kbuf; + unsigned long entries[PAGE_OWNER_STACK_DEPTH]; struct stack_trace trace = { - .nr_entries = page_ext->nr_entries, - .entries = &page_ext->trace_entries[0], + .nr_entries = 0, + .entries = entries, + .max_entries = PAGE_OWNER_STACK_DEPTH, + .skip = 0 }; kbuf = kmalloc(count, GFP_KERNEL); @@ -173,6 +252,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (ret >= count) goto err; + depot_fetch_stack(handle, &trace); ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0); if (ret >= count) goto err; @@ -203,10 +283,14 @@ err: void __dump_page_owner(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); + unsigned long entries[PAGE_OWNER_STACK_DEPTH]; struct stack_trace trace = { - .nr_entries = page_ext->nr_entries, - .entries = &page_ext->trace_entries[0], + .nr_entries = 0, + .entries = entries, + .max_entries = PAGE_OWNER_STACK_DEPTH, + .skip = 0 }; + depot_stack_handle_t handle; gfp_t gfp_mask; int mt; @@ -222,6 +306,13 @@ void __dump_page_owner(struct page *page) return; } + handle = READ_ONCE(page_ext->handle); + if (!handle) { + pr_alert("page_owner info is not active (free page?)\n"); + return; + } + + depot_fetch_stack(handle, &trace); pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask); print_stack_trace(&trace, 0); @@ -237,6 +328,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) unsigned long pfn; struct page *page; struct page_ext *page_ext; + depot_stack_handle_t handle; if (!static_branch_unlikely(&page_owner_inited)) return -EINVAL; @@ -285,10 +377,19 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) continue; + /* + * Access to page_ext->handle isn't synchronous so we should + * be careful to access it. 
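What lets page_ext shrink from an inline array of trace entries to a single depot_stack_handle_t is that stack depot stores each distinct stack trace once and returns a compact handle for it, so many pages allocated from the same call chain share one stored trace. The toy below models only that deduplication idea, using a fixed-size table and a linear scan; it illustrates the concept, not the real lib/stackdepot.c data structures.

#include <stdio.h>
#include <string.h>

#define MAX_DEPTH   16
#define MAX_TRACES  128

struct trace {
        unsigned long entries[MAX_DEPTH];
        unsigned int nr;
};

static struct trace depot[MAX_TRACES];
static unsigned int depot_used = 1;     /* handle 0 means "no trace" */

/* store a trace once, return its handle; 0 on failure */
static unsigned int depot_save(const unsigned long *entries, unsigned int nr)
{
        unsigned int h;

        for (h = 1; h < depot_used; h++) {
                if (depot[h].nr == nr &&
                    !memcmp(depot[h].entries, entries,
                            nr * sizeof(*entries)))
                        return h;               /* already stored */
        }
        if (depot_used == MAX_TRACES)
                return 0;                       /* depot full */
        depot[depot_used].nr = nr;
        memcpy(depot[depot_used].entries, entries, nr * sizeof(*entries));
        return depot_used++;
}

int main(void)
{
        unsigned long a[] = { 0x10, 0x20, 0x30 };
        unsigned long b[] = { 0x10, 0x20, 0x30 };
        unsigned long c[] = { 0x40, 0x50 };

        printf("%u %u %u\n",
               depot_save(a, 3), depot_save(b, 3), depot_save(c, 2));
        return 0;
}

The first two calls return the same handle because the traces are identical, which is exactly what makes the per-page cost a single small handle.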
+ */ + handle = READ_ONCE(page_ext->handle); + if (!handle) + continue; + /* Record the next PFN to read in the file offset */ *ppos = (pfn - min_low_pfn) + 1; - return print_page_owner(buf, count, pfn, page, page_ext); + return print_page_owner(buf, count, pfn, page, + page_ext, handle); } return 0; diff --git a/mm/readahead.c b/mm/readahead.c index 40be3ae0afe3..65ec288dc057 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -89,7 +89,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, page = lru_to_page(pages); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, - mapping_gfp_constraint(mapping, GFP_KERNEL))) { + readahead_gfp_mask(mapping))) { read_cache_pages_invalidate_page(mapping, page); continue; } @@ -108,7 +108,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, EXPORT_SYMBOL(read_cache_pages); static int read_pages(struct address_space *mapping, struct file *filp, - struct list_head *pages, unsigned nr_pages) + struct list_head *pages, unsigned int nr_pages, gfp_t gfp) { struct blk_plug plug; unsigned page_idx; @@ -126,10 +126,8 @@ static int read_pages(struct address_space *mapping, struct file *filp, for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = lru_to_page(pages); list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, page->index, - mapping_gfp_constraint(mapping, GFP_KERNEL))) { + if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) mapping->a_ops->readpage(filp, page); - } put_page(page); } ret = 0; @@ -159,6 +157,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, int page_idx; int ret = 0; loff_t isize = i_size_read(inode); + gfp_t gfp_mask = readahead_gfp_mask(mapping); if (isize == 0) goto out; @@ -180,7 +179,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, if (page && !radix_tree_exceptional_entry(page)) continue; - page = page_cache_alloc_readahead(mapping); + page = __page_cache_alloc(gfp_mask); if (!page) break; page->index = page_offset; @@ -196,7 +195,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, * will then handle the error. */ if (ret) - read_pages(mapping, filp, &page_pool, ret); + read_pages(mapping, filp, &page_pool, ret, gfp_mask); BUG_ON(!list_empty(&page_pool)); out: return ret; diff --git a/mm/rmap.c b/mm/rmap.c index 0ea5d9071b32..256e585c67ef 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1215,10 +1215,8 @@ void do_page_add_anon_rmap(struct page *page, * pte lock(a spinlock) is held, which implies preemption * disabled. */ - if (compound) { - __inc_zone_page_state(page, - NR_ANON_TRANSPARENT_HUGEPAGES); - } + if (compound) + __inc_zone_page_state(page, NR_ANON_THPS); __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); } if (unlikely(PageKsm(page))) @@ -1256,7 +1254,7 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); - __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __inc_zone_page_state(page, NR_ANON_THPS); } else { /* Anon THP always mapped first with PMD */ VM_BUG_ON_PAGE(PageTransCompound(page), page); @@ -1273,18 +1271,42 @@ void page_add_new_anon_rmap(struct page *page, * * The caller needs to hold the pte lock. 
*/ -void page_add_file_rmap(struct page *page) +void page_add_file_rmap(struct page *page, bool compound) { + int i, nr = 1; + + VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); - if (atomic_inc_and_test(&page->_mapcount)) { - __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + if (compound && PageTransHuge(page)) { + for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { + if (atomic_inc_and_test(&page[i]._mapcount)) + nr++; + } + if (!atomic_inc_and_test(compound_mapcount_ptr(page))) + goto out; + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + __inc_zone_page_state(page, NR_SHMEM_PMDMAPPED); + } else { + if (PageTransCompound(page)) { + VM_BUG_ON_PAGE(!PageLocked(page), page); + SetPageDoubleMap(compound_head(page)); + if (PageMlocked(page)) + clear_page_mlock(compound_head(page)); + } + if (!atomic_inc_and_test(&page->_mapcount)) + goto out; } + __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, nr); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); +out: unlock_page_memcg(page); } -static void page_remove_file_rmap(struct page *page) +static void page_remove_file_rmap(struct page *page, bool compound) { + int i, nr = 1; + + VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); /* Hugepages are not counted in NR_FILE_MAPPED for now. */ @@ -1295,15 +1317,26 @@ static void page_remove_file_rmap(struct page *page) } /* page still mapped by someone else? */ - if (!atomic_add_negative(-1, &page->_mapcount)) - goto out; + if (compound && PageTransHuge(page)) { + for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { + if (atomic_add_negative(-1, &page[i]._mapcount)) + nr++; + } + if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) + goto out; + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + __dec_zone_page_state(page, NR_SHMEM_PMDMAPPED); + } else { + if (!atomic_add_negative(-1, &page->_mapcount)) + goto out; + } /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. 
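One detail of the new compound file-rmap accounting is easy to miss: every subpage _mapcount starts at -1, so atomic_inc_and_test() fires exactly on the first mapping of that subpage, and only those transitions are folded into NR_FILE_MAPPED. The sketch below models just that convention with plain ints; the compound mapcount, NR_SHMEM_PMDMAPPED and all locking are omitted, and HPAGE_NR is an arbitrary stand-in for HPAGE_PMD_NR.

#include <stdio.h>

#define HPAGE_NR 8      /* arbitrary stand-in for HPAGE_PMD_NR */

/* returns 1 on the -1 -> 0 transition, like atomic_inc_and_test() */
static int inc_and_test(int *v)
{
        return ++(*v) == 0;
}

int main(void)
{
        int subpage_mapcount[HPAGE_NR];
        int i, nr = 0;

        for (i = 0; i < HPAGE_NR; i++)
                subpage_mapcount[i] = -1;       /* starts unmapped */

        /* first PMD-level mapping: every new subpage transition counts */
        for (i = 0; i < HPAGE_NR; i++)
                if (inc_and_test(&subpage_mapcount[i]))
                        nr++;

        printf("NR_FILE_MAPPED += %d\n", nr);   /* prints 8 */
        return 0;
}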
*/ - __dec_zone_page_state(page, NR_FILE_MAPPED); + __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, -nr); mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); if (unlikely(PageMlocked(page))) @@ -1326,7 +1359,7 @@ static void page_remove_anon_compound_rmap(struct page *page) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; - __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __dec_zone_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* @@ -1359,11 +1392,8 @@ static void page_remove_anon_compound_rmap(struct page *page) */ void page_remove_rmap(struct page *page, bool compound) { - if (!PageAnon(page)) { - VM_BUG_ON_PAGE(compound && !PageHuge(page), page); - page_remove_file_rmap(page); - return; - } + if (!PageAnon(page)) + return page_remove_file_rmap(page, compound); if (compound) return page_remove_anon_compound_rmap(page); @@ -1438,8 +1468,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ if (!(flags & TTU_IGNORE_MLOCK)) { if (vma->vm_flags & VM_LOCKED) { - /* Holding pte lock, we do *not* need mmap_sem here */ - mlock_vma_page(page); + /* PTE-mapped THP are never mlocked */ + if (!PageTransCompound(page)) { + /* + * Holding pte lock, we do *not* need + * mmap_sem here + */ + mlock_vma_page(page); + } ret = SWAP_MLOCK; goto out_unmap; } diff --git a/mm/shmem.c b/mm/shmem.c index 24463b67b6ef..bfaa007ccb58 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -32,6 +32,7 @@ #include <linux/export.h> #include <linux/swap.h> #include <linux/uio.h> +#include <linux/khugepaged.h> static struct vfsmount *shm_mnt; @@ -97,14 +98,6 @@ struct shmem_falloc { pgoff_t nr_unswapped; /* how often writepage refused to swap out */ }; -/* Flag allocation requirements to shmem_getpage */ -enum sgp_type { - SGP_READ, /* don't exceed i_size, don't allocate page */ - SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ - SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ -}; - #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { @@ -124,7 +117,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, struct mm_struct *fault_mm, int *fault_type); -static inline int shmem_getpage(struct inode *inode, pgoff_t index, +int shmem_getpage(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp) { return shmem_getpage_gfp(inode, index, pagep, sgp, @@ -173,10 +166,13 @@ static inline int shmem_reacct_size(unsigned long flags, * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ -static inline int shmem_acct_block(unsigned long flags) +static inline int shmem_acct_block(unsigned long flags, long pages) { - return (flags & VM_NORESERVE) ? 
- security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_SIZE)) : 0; + if (!(flags & VM_NORESERVE)) + return 0; + + return security_vm_enough_memory_mm(current->mm, + pages * VM_ACCT(PAGE_SIZE)); } static inline void shmem_unacct_blocks(unsigned long flags, long pages) @@ -192,6 +188,7 @@ static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; +static struct file_system_type shmem_fs_type; static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); @@ -249,6 +246,53 @@ static void shmem_recalc_inode(struct inode *inode) } } +bool shmem_charge(struct inode *inode, long pages) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + unsigned long flags; + + if (shmem_acct_block(info->flags, pages)) + return false; + spin_lock_irqsave(&info->lock, flags); + info->alloced += pages; + inode->i_blocks += pages * BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock_irqrestore(&info->lock, flags); + inode->i_mapping->nrpages += pages; + + if (!sbinfo->max_blocks) + return true; + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks - pages) > 0) { + inode->i_mapping->nrpages -= pages; + spin_lock_irqsave(&info->lock, flags); + info->alloced -= pages; + shmem_recalc_inode(inode); + spin_unlock_irqrestore(&info->lock, flags); + + return false; + } + percpu_counter_add(&sbinfo->used_blocks, pages); + return true; +} + +void shmem_uncharge(struct inode *inode, long pages) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + unsigned long flags; + + spin_lock_irqsave(&info->lock, flags); + info->alloced -= pages; + inode->i_blocks -= pages * BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock_irqrestore(&info->lock, flags); + + if (sbinfo->max_blocks) + percpu_counter_sub(&sbinfo->used_blocks, pages); +} + /* * Replace item expected in radix tree by a new item, while holding tree lock. */ @@ -289,36 +333,256 @@ static bool shmem_confirm_swap(struct address_space *mapping, } /* + * Definitions for "huge tmpfs": tmpfs mounted with the huge= option + * + * SHMEM_HUGE_NEVER: + * disables huge pages for the mount; + * SHMEM_HUGE_ALWAYS: + * enables huge pages for the mount; + * SHMEM_HUGE_WITHIN_SIZE: + * only allocate huge pages if the page will be fully within i_size, + * also respect fadvise()/madvise() hints; + * SHMEM_HUGE_ADVISE: + * only allocate huge pages if requested with fadvise()/madvise(); + */ + +#define SHMEM_HUGE_NEVER 0 +#define SHMEM_HUGE_ALWAYS 1 +#define SHMEM_HUGE_WITHIN_SIZE 2 +#define SHMEM_HUGE_ADVISE 3 + +/* + * Special values. 
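The new shmem_charge()/shmem_uncharge() helpers account a batch of pages against an inode optimistically and roll the charge back if it would push the mount past its block limit. A much simplified userspace model of that pattern follows; plain longs stand in for the percpu used_blocks counter and info->lock, and the security_vm_enough_memory_mm() step and nrpages bookkeeping are dropped.

#include <stdbool.h>
#include <stdio.h>

static long max_blocks = 1024;  /* stand-in for sbinfo->max_blocks  */
static long used_blocks;        /* stand-in for the percpu counter  */
static long alloced;            /* stand-in for info->alloced       */

static bool shmem_charge(long pages)
{
        alloced += pages;
        if (max_blocks && used_blocks + pages > max_blocks) {
                alloced -= pages;       /* back out the optimistic add */
                return false;
        }
        used_blocks += pages;
        return true;
}

static void shmem_uncharge(long pages)
{
        alloced -= pages;
        used_blocks -= pages;
}

int main(void)
{
        printf("%d\n", shmem_charge(512));      /* 1: fits              */
        printf("%d\n", shmem_charge(768));      /* 0: would exceed 1024 */
        shmem_uncharge(512);
        printf("used=%ld alloced=%ld\n", used_blocks, alloced);
        return 0;
}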
+ * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: + * + * SHMEM_HUGE_DENY: + * disables huge on shm_mnt and all mounts, for emergency use; + * SHMEM_HUGE_FORCE: + * enables huge on shm_mnt and all mounts, w/o needing option, for testing; + * + */ +#define SHMEM_HUGE_DENY (-1) +#define SHMEM_HUGE_FORCE (-2) + +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +/* ifdef here to avoid bloating shmem.o when not necessary */ + +int shmem_huge __read_mostly; + +static int shmem_parse_huge(const char *str) +{ + if (!strcmp(str, "never")) + return SHMEM_HUGE_NEVER; + if (!strcmp(str, "always")) + return SHMEM_HUGE_ALWAYS; + if (!strcmp(str, "within_size")) + return SHMEM_HUGE_WITHIN_SIZE; + if (!strcmp(str, "advise")) + return SHMEM_HUGE_ADVISE; + if (!strcmp(str, "deny")) + return SHMEM_HUGE_DENY; + if (!strcmp(str, "force")) + return SHMEM_HUGE_FORCE; + return -EINVAL; +} + +static const char *shmem_format_huge(int huge) +{ + switch (huge) { + case SHMEM_HUGE_NEVER: + return "never"; + case SHMEM_HUGE_ALWAYS: + return "always"; + case SHMEM_HUGE_WITHIN_SIZE: + return "within_size"; + case SHMEM_HUGE_ADVISE: + return "advise"; + case SHMEM_HUGE_DENY: + return "deny"; + case SHMEM_HUGE_FORCE: + return "force"; + default: + VM_BUG_ON(1); + return "bad_val"; + } +} + +static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, + struct shrink_control *sc, unsigned long nr_to_split) +{ + LIST_HEAD(list), *pos, *next; + struct inode *inode; + struct shmem_inode_info *info; + struct page *page; + unsigned long batch = sc ? sc->nr_to_scan : 128; + int removed = 0, split = 0; + + if (list_empty(&sbinfo->shrinklist)) + return SHRINK_STOP; + + spin_lock(&sbinfo->shrinklist_lock); + list_for_each_safe(pos, next, &sbinfo->shrinklist) { + info = list_entry(pos, struct shmem_inode_info, shrinklist); + + /* pin the inode */ + inode = igrab(&info->vfs_inode); + + /* inode is about to be evicted */ + if (!inode) { + list_del_init(&info->shrinklist); + removed++; + goto next; + } + + /* Check if there's anything to gain */ + if (round_up(inode->i_size, PAGE_SIZE) == + round_up(inode->i_size, HPAGE_PMD_SIZE)) { + list_del_init(&info->shrinklist); + removed++; + iput(inode); + goto next; + } + + list_move(&info->shrinklist, &list); +next: + if (!--batch) + break; + } + spin_unlock(&sbinfo->shrinklist_lock); + + list_for_each_safe(pos, next, &list) { + int ret; + + info = list_entry(pos, struct shmem_inode_info, shrinklist); + inode = &info->vfs_inode; + + if (nr_to_split && split >= nr_to_split) { + iput(inode); + continue; + } + + page = find_lock_page(inode->i_mapping, + (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); + if (!page) + goto drop; + + if (!PageTransHuge(page)) { + unlock_page(page); + put_page(page); + goto drop; + } + + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + + if (ret) { + /* split failed: leave it on the list */ + iput(inode); + continue; + } + + split++; +drop: + list_del_init(&info->shrinklist); + removed++; + iput(inode); + } + + spin_lock(&sbinfo->shrinklist_lock); + list_splice_tail(&list, &sbinfo->shrinklist); + sbinfo->shrinklist_len -= removed; + spin_unlock(&sbinfo->shrinklist_lock); + + return split; +} + +static long shmem_unused_huge_scan(struct super_block *sb, + struct shrink_control *sc) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + + if (!READ_ONCE(sbinfo->shrinklist_len)) + return SHRINK_STOP; + + return shmem_unused_huge_shrink(sbinfo, sc, 0); +} + +static long shmem_unused_huge_count(struct super_block *sb, + 
struct shrink_control *sc) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + return READ_ONCE(sbinfo->shrinklist_len); +} +#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */ + +#define shmem_huge SHMEM_HUGE_DENY + +static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, + struct shrink_control *sc, unsigned long nr_to_split) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ + +/* * Like add_to_page_cache_locked, but error if expected item has gone. */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t index, void *expected) { - int error; + int error, nr = hpage_nr_pages(page); + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(index != round_down(index, nr), page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + VM_BUG_ON(expected && PageTransHuge(page)); - get_page(page); + page_ref_add(page, nr); page->mapping = mapping; page->index = index; spin_lock_irq(&mapping->tree_lock); - if (!expected) + if (PageTransHuge(page)) { + void __rcu **results; + pgoff_t idx; + int i; + + error = 0; + if (radix_tree_gang_lookup_slot(&mapping->page_tree, + &results, &idx, index, 1) && + idx < index + HPAGE_PMD_NR) { + error = -EEXIST; + } + + if (!error) { + for (i = 0; i < HPAGE_PMD_NR; i++) { + error = radix_tree_insert(&mapping->page_tree, + index + i, page + i); + VM_BUG_ON(error); + } + count_vm_event(THP_FILE_ALLOC); + } + } else if (!expected) { error = radix_tree_insert(&mapping->page_tree, index, page); - else + } else { error = shmem_radix_tree_replace(mapping, index, expected, page); + } + if (!error) { - mapping->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - __inc_zone_page_state(page, NR_SHMEM); + mapping->nrpages += nr; + if (PageTransHuge(page)) + __inc_zone_page_state(page, NR_SHMEM_THPS); + __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, nr); + __mod_zone_page_state(page_zone(page), NR_SHMEM, nr); spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; spin_unlock_irq(&mapping->tree_lock); - put_page(page); + page_ref_sub(page, nr); } return error; } @@ -331,6 +595,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) struct address_space *mapping = page->mapping; int error; + VM_BUG_ON_PAGE(PageCompound(page), page); + spin_lock_irq(&mapping->tree_lock); error = shmem_radix_tree_replace(mapping, page->index, page, radswap); page->mapping = NULL; @@ -510,10 +776,33 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, continue; } + VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page); + if (!trylock_page(page)) continue; + + if (PageTransTail(page)) { + /* Middle of THP: zero out the page */ + clear_highpage(page); + unlock_page(page); + continue; + } else if (PageTransHuge(page)) { + if (index == round_down(end, HPAGE_PMD_NR)) { + /* + * Range ends in the middle of THP: + * zero out the page + */ + clear_highpage(page); + unlock_page(page); + continue; + } + index += HPAGE_PMD_NR - 1; + i += HPAGE_PMD_NR - 1; + } + if (!unfalloc || !PageUptodate(page)) { - if (page->mapping == mapping) { + VM_BUG_ON_PAGE(PageTail(page), page); + if (page_mapping(page) == mapping) { VM_BUG_ON_PAGE(PageWriteback(page), page); truncate_inode_page(mapping, page); } @@ -589,8 +878,36 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } lock_page(page); + + if (PageTransTail(page)) { + /* Middle of THP: zero out the page */ + clear_highpage(page); + unlock_page(page); + /* + * 
Partial thp truncate due 'start' in middle + * of THP: don't need to look on these pages + * again on !pvec.nr restart. + */ + if (index != round_down(end, HPAGE_PMD_NR)) + start++; + continue; + } else if (PageTransHuge(page)) { + if (index == round_down(end, HPAGE_PMD_NR)) { + /* + * Range ends in the middle of THP: + * zero out the page + */ + clear_highpage(page); + unlock_page(page); + continue; + } + index += HPAGE_PMD_NR - 1; + i += HPAGE_PMD_NR - 1; + } + if (!unfalloc || !PageUptodate(page)) { - if (page->mapping == mapping) { + VM_BUG_ON_PAGE(PageTail(page), page); + if (page_mapping(page) == mapping) { VM_BUG_ON_PAGE(PageWriteback(page), page); truncate_inode_page(mapping, page); } else { @@ -607,10 +924,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, index++; } - spin_lock(&info->lock); + spin_lock_irq(&info->lock); info->swapped -= nr_swaps_freed; shmem_recalc_inode(inode); - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); } void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) @@ -627,9 +944,9 @@ static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry, struct shmem_inode_info *info = SHMEM_I(inode); if (info->alloced - info->swapped != inode->i_mapping->nrpages) { - spin_lock(&info->lock); + spin_lock_irq(&info->lock); shmem_recalc_inode(inode); - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); } generic_fillattr(inode, stat); return 0; @@ -639,6 +956,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int error; error = inode_change_ok(inode, attr); @@ -674,6 +992,20 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); + + /* + * Part of the huge page can be beyond i_size: subject + * to shrink under memory pressure. 
+ */ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { + spin_lock(&sbinfo->shrinklist_lock); + if (list_empty(&info->shrinklist)) { + list_add_tail(&info->shrinklist, + &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + } + spin_unlock(&sbinfo->shrinklist_lock); + } } } @@ -686,11 +1018,20 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); if (inode->i_mapping->a_ops == &shmem_aops) { shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; shmem_truncate_range(inode, 0, (loff_t)-1); + if (!list_empty(&info->shrinklist)) { + spin_lock(&sbinfo->shrinklist_lock); + if (!list_empty(&info->shrinklist)) { + list_del_init(&info->shrinklist); + sbinfo->shrinklist_len--; + } + spin_unlock(&sbinfo->shrinklist_lock); + } if (!list_empty(&info->swaplist)) { mutex_lock(&shmem_swaplist_mutex); list_del_init(&info->swaplist); @@ -773,9 +1114,9 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, delete_from_swap_cache(*pagep); set_page_dirty(*pagep); if (!error) { - spin_lock(&info->lock); + spin_lock_irq(&info->lock); info->swapped--; - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); swap_free(swap); } } @@ -848,6 +1189,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) swp_entry_t swap; pgoff_t index; + VM_BUG_ON_PAGE(PageCompound(page), page); BUG_ON(!PageLocked(page)); mapping = page->mapping; index = page->index; @@ -922,10 +1264,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) list_add_tail(&info->swaplist, &shmem_swaplist); if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { - spin_lock(&info->lock); + spin_lock_irq(&info->lock); shmem_recalc_inode(inode); info->swapped++; - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); swap_shmem_alloc(swap); shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); @@ -984,24 +1326,63 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) #define vm_policy vm_private_data #endif +static void shmem_pseudo_vma_init(struct vm_area_struct *vma, + struct shmem_inode_info *info, pgoff_t index) +{ + /* Create a pseudo vma that just contains the policy */ + vma->vm_start = 0; + /* Bias interleave by inode number to distribute better across nodes */ + vma->vm_pgoff = index + info->vfs_inode.i_ino; + vma->vm_ops = NULL; + vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); +} + +static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) +{ + /* Drop reference taken by mpol_shared_policy_lookup() */ + mpol_cond_put(vma->vm_policy); +} + static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; struct page *page; - /* Create a pseudo vma that just contains the policy */ - pvma.vm_start = 0; - /* Bias interleave by inode number to distribute better across nodes */ - pvma.vm_pgoff = index + info->vfs_inode.i_ino; - pvma.vm_ops = NULL; - pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); - + shmem_pseudo_vma_init(&pvma, info, index); page = swapin_readahead(swap, gfp, &pvma, 0); + shmem_pseudo_vma_destroy(&pvma); - /* Drop reference taken by mpol_shared_policy_lookup() */ - mpol_cond_put(pvma.vm_policy); + return page; +} + +static struct page *shmem_alloc_hugepage(gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) +{ + struct 
vm_area_struct pvma; + struct inode *inode = &info->vfs_inode; + struct address_space *mapping = inode->i_mapping; + pgoff_t idx, hindex = round_down(index, HPAGE_PMD_NR); + void __rcu **results; + struct page *page; + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + return NULL; + + rcu_read_lock(); + if (radix_tree_gang_lookup_slot(&mapping->page_tree, &results, &idx, + hindex, 1) && idx < hindex + HPAGE_PMD_NR) { + rcu_read_unlock(); + return NULL; + } + rcu_read_unlock(); + + shmem_pseudo_vma_init(&pvma, info, hindex); + page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, + HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); + shmem_pseudo_vma_destroy(&pvma); + if (page) + prep_transhuge_page(page); return page; } @@ -1011,23 +1392,51 @@ static struct page *shmem_alloc_page(gfp_t gfp, struct vm_area_struct pvma; struct page *page; - /* Create a pseudo vma that just contains the policy */ - pvma.vm_start = 0; - /* Bias interleave by inode number to distribute better across nodes */ - pvma.vm_pgoff = index + info->vfs_inode.i_ino; - pvma.vm_ops = NULL; - pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); + shmem_pseudo_vma_init(&pvma, info, index); + page = alloc_page_vma(gfp, &pvma, 0); + shmem_pseudo_vma_destroy(&pvma); + + return page; +} + +static struct page *shmem_alloc_and_acct_page(gfp_t gfp, + struct shmem_inode_info *info, struct shmem_sb_info *sbinfo, + pgoff_t index, bool huge) +{ + struct page *page; + int nr; + int err = -ENOSPC; + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + huge = false; + nr = huge ? HPAGE_PMD_NR : 1; + + if (shmem_acct_block(info->flags, nr)) + goto failed; + if (sbinfo->max_blocks) { + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks - nr) > 0) + goto unacct; + percpu_counter_add(&sbinfo->used_blocks, nr); + } - page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false); + if (huge) + page = shmem_alloc_hugepage(gfp, info, index); + else + page = shmem_alloc_page(gfp, info, index); if (page) { __SetPageLocked(page); __SetPageSwapBacked(page); + return page; } - /* Drop reference taken by mpol_shared_policy_lookup() */ - mpol_cond_put(pvma.vm_policy); - - return page; + err = -ENOMEM; + if (sbinfo->max_blocks) + percpu_counter_add(&sbinfo->used_blocks, -nr); +unacct: + shmem_unacct_blocks(info->flags, nr); +failed: + return ERR_PTR(err); } /* @@ -1132,12 +1541,16 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct mem_cgroup *memcg; struct page *page; swp_entry_t swap; + enum sgp_type sgp_huge = sgp; + pgoff_t hindex = index; int error; int once = 0; int alloced = 0; if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; + if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) + sgp = SGP_CACHE; repeat: swap.val = 0; page = find_lock_entry(mapping, index); @@ -1240,10 +1653,10 @@ repeat: mem_cgroup_commit_charge(page, memcg, true, false); - spin_lock(&info->lock); + spin_lock_irq(&info->lock); info->swapped--; shmem_recalc_inode(inode); - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); if (sgp == SGP_WRITE) mark_page_accessed(page); @@ -1253,51 +1666,111 @@ repeat: swap_free(swap); } else { - if (shmem_acct_block(info->flags)) { - error = -ENOSPC; - goto failed; + /* shmem_symlink() */ + if (mapping->a_ops != &shmem_aops) + goto alloc_nohuge; + if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) + goto alloc_nohuge; + if (shmem_huge == SHMEM_HUGE_FORCE) + goto alloc_huge; + switch (sbinfo->huge) { + loff_t i_size; + pgoff_t off; + case 
SHMEM_HUGE_NEVER: + goto alloc_nohuge; + case SHMEM_HUGE_WITHIN_SIZE: + off = round_up(index, HPAGE_PMD_NR); + i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >= HPAGE_PMD_SIZE && + i_size >> PAGE_SHIFT >= off) + goto alloc_huge; + /* fallthrough */ + case SHMEM_HUGE_ADVISE: + if (sgp_huge == SGP_HUGE) + goto alloc_huge; + /* TODO: implement fadvise() hints */ + goto alloc_nohuge; } - if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks) >= 0) { - error = -ENOSPC; - goto unacct; + +alloc_huge: + page = shmem_alloc_and_acct_page(gfp, info, sbinfo, + index, true); + if (IS_ERR(page)) { +alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo, + index, false); + } + if (IS_ERR(page)) { + int retry = 5; + error = PTR_ERR(page); + page = NULL; + if (error != -ENOSPC) + goto failed; + /* + * Try to reclaim some spece by splitting a huge page + * beyond i_size on the filesystem. + */ + while (retry--) { + int ret; + ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); + if (ret == SHRINK_STOP) + break; + if (ret) + goto alloc_nohuge; } - percpu_counter_inc(&sbinfo->used_blocks); + goto failed; } - page = shmem_alloc_page(gfp, info, index); - if (!page) { - error = -ENOMEM; - goto decused; - } + if (PageTransHuge(page)) + hindex = round_down(index, HPAGE_PMD_NR); + else + hindex = index; + if (sgp == SGP_WRITE) __SetPageReferenced(page); error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, - false); + PageTransHuge(page)); if (error) - goto decused; - error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); + goto unacct; + error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK, + compound_order(page)); if (!error) { - error = shmem_add_to_page_cache(page, mapping, index, + error = shmem_add_to_page_cache(page, mapping, hindex, NULL); radix_tree_preload_end(); } if (error) { - mem_cgroup_cancel_charge(page, memcg, false); - goto decused; + mem_cgroup_cancel_charge(page, memcg, + PageTransHuge(page)); + goto unacct; } - mem_cgroup_commit_charge(page, memcg, false, false); + mem_cgroup_commit_charge(page, memcg, false, + PageTransHuge(page)); lru_cache_add_anon(page); - spin_lock(&info->lock); - info->alloced++; - inode->i_blocks += BLOCKS_PER_PAGE; + spin_lock_irq(&info->lock); + info->alloced += 1 << compound_order(page); + inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); shmem_recalc_inode(inode); - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); alloced = true; + if (PageTransHuge(page) && + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < + hindex + HPAGE_PMD_NR - 1) { + /* + * Part of the huge page is beyond i_size: subject + * to shrink under memory pressure. + */ + spin_lock(&sbinfo->shrinklist_lock); + if (list_empty(&info->shrinklist)) { + list_add_tail(&info->shrinklist, + &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + } + spin_unlock(&sbinfo->shrinklist_lock); + } + /* * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. */ @@ -1309,10 +1782,15 @@ clear: * but SGP_FALLOC on a page fallocated earlier must initialize * it now, lest undo on failure cancel our earlier guarantee. 
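The SHMEM_HUGE_WITHIN_SIZE case above allocates a huge page only when the whole PMD-sized block around the faulting index still fits under the page-aligned i_size. A worked example, assuming 4K base pages and 2M huge pages (so HPAGE_PMD_NR is 512); round_up_to() is a local helper standing in for the kernel's round_up():

#include <stdio.h>

#define PAGE_SHIFT     12
#define PAGE_SIZE      (1UL << PAGE_SHIFT)
#define HPAGE_PMD_NR   512UL
#define HPAGE_PMD_SIZE (HPAGE_PMD_NR * PAGE_SIZE)

static unsigned long round_up_to(unsigned long x, unsigned long m)
{
        return (x + m - 1) / m * m;
}

/* mirror of the within_size test in shmem_getpage_gfp() */
static int within_size_huge(unsigned long index, unsigned long i_size)
{
        unsigned long off = round_up_to(index, HPAGE_PMD_NR);

        i_size = round_up_to(i_size, PAGE_SIZE);
        return i_size >= HPAGE_PMD_SIZE && (i_size >> PAGE_SHIFT) >= off;
}

int main(void)
{
        /* 3M file, fault at page 100: its 2M block ends inside i_size */
        printf("%d\n", within_size_huge(100, 3UL << 20));       /* 1 */
        /* 3M file, fault at page 600: its 2M block pokes past EOF */
        printf("%d\n", within_size_huge(600, 3UL << 20));       /* 0 */
        return 0;
}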
*/ - if (sgp != SGP_WRITE) { - clear_highpage(page); - flush_dcache_page(page); - SetPageUptodate(page); + if (sgp != SGP_WRITE && !PageUptodate(page)) { + struct page *head = compound_head(page); + int i; + + for (i = 0; i < (1 << compound_order(head)); i++) { + clear_highpage(head + i); + flush_dcache_page(head + i); + } + SetPageUptodate(head); } } @@ -1322,24 +1800,30 @@ clear: if (alloced) { ClearPageDirty(page); delete_from_page_cache(page); - spin_lock(&info->lock); + spin_lock_irq(&info->lock); shmem_recalc_inode(inode); - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); } error = -EINVAL; goto unlock; } - *pagep = page; + *pagep = page + index - hindex; return 0; /* * Error recovery. */ -decused: - if (sbinfo->max_blocks) - percpu_counter_add(&sbinfo->used_blocks, -1); unacct: - shmem_unacct_blocks(info->flags, 1); + if (sbinfo->max_blocks) + percpu_counter_sub(&sbinfo->used_blocks, + 1 << compound_order(page)); + shmem_unacct_blocks(info->flags, 1 << compound_order(page)); + + if (PageTransHuge(page)) { + unlock_page(page); + put_page(page); + goto alloc_nohuge; + } failed: if (swap.val && !shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; @@ -1350,9 +1834,9 @@ unlock: } if (error == -ENOSPC && !once++) { info = SHMEM_I(inode); - spin_lock(&info->lock); + spin_lock_irq(&info->lock); shmem_recalc_inode(inode); - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); goto repeat; } if (error == -EEXIST) /* from above or from radix_tree_insert */ @@ -1364,6 +1848,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); + enum sgp_type sgp; int error; int ret = VM_FAULT_LOCKED; @@ -1425,13 +1910,107 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) spin_unlock(&inode->i_lock); } - error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, + sgp = SGP_CACHE; + if (vma->vm_flags & VM_HUGEPAGE) + sgp = SGP_HUGE; + else if (vma->vm_flags & VM_NOHUGEPAGE) + sgp = SGP_NOHUGE; + + error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, gfp, vma->vm_mm, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); return ret; } +unsigned long shmem_get_unmapped_area(struct file *file, + unsigned long uaddr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + unsigned long (*get_area)(struct file *, + unsigned long, unsigned long, unsigned long, unsigned long); + unsigned long addr; + unsigned long offset; + unsigned long inflated_len; + unsigned long inflated_addr; + unsigned long inflated_offset; + + if (len > TASK_SIZE) + return -ENOMEM; + + get_area = current->mm->get_unmapped_area; + addr = get_area(file, uaddr, len, pgoff, flags); + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + return addr; + if (IS_ERR_VALUE(addr)) + return addr; + if (addr & ~PAGE_MASK) + return addr; + if (addr > TASK_SIZE - len) + return addr; + + if (shmem_huge == SHMEM_HUGE_DENY) + return addr; + if (len < HPAGE_PMD_SIZE) + return addr; + if (flags & MAP_FIXED) + return addr; + /* + * Our priority is to support MAP_SHARED mapped hugely; + * and support MAP_PRIVATE mapped hugely too, until it is COWed. + * But if caller specified an address hint, respect that as before. 
+ */ + if (uaddr) + return addr; + + if (shmem_huge != SHMEM_HUGE_FORCE) { + struct super_block *sb; + + if (file) { + VM_BUG_ON(file->f_op != &shmem_file_operations); + sb = file_inode(file)->i_sb; + } else { + /* + * Called directly from mm/mmap.c, or drivers/char/mem.c + * for "/dev/zero", to create a shared anonymous object. + */ + if (IS_ERR(shm_mnt)) + return addr; + sb = shm_mnt->mnt_sb; + } + if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) + return addr; + } + + offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); + if (offset && offset + len < 2 * HPAGE_PMD_SIZE) + return addr; + if ((addr & (HPAGE_PMD_SIZE-1)) == offset) + return addr; + + inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; + if (inflated_len > TASK_SIZE) + return addr; + if (inflated_len < len) + return addr; + + inflated_addr = get_area(NULL, 0, inflated_len, 0, flags); + if (IS_ERR_VALUE(inflated_addr)) + return addr; + if (inflated_addr & ~PAGE_MASK) + return addr; + + inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); + inflated_addr += offset - inflated_offset; + if (inflated_offset > offset) + inflated_addr += HPAGE_PMD_SIZE; + + if (inflated_addr > TASK_SIZE - len) + return addr; + return inflated_addr; +} + #ifdef CONFIG_NUMA static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { @@ -1456,7 +2035,7 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) struct shmem_inode_info *info = SHMEM_I(inode); int retval = -ENOMEM; - spin_lock(&info->lock); + spin_lock_irq(&info->lock); if (lock && !(info->flags & VM_LOCKED)) { if (!user_shm_lock(inode->i_size, user)) goto out_nomem; @@ -1471,7 +2050,7 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) retval = 0; out_nomem: - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); return retval; } @@ -1479,6 +2058,11 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &shmem_vm_ops; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < + (vma->vm_end & HPAGE_PMD_MASK)) { + khugepaged_enter(vma, vma->vm_flags); + } return 0; } @@ -1504,6 +2088,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode spin_lock_init(&info->lock); info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; + INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); cache_no_acl(inode); @@ -1589,12 +2174,23 @@ shmem_write_end(struct file *file, struct address_space *mapping, i_size_write(inode, pos + copied); if (!PageUptodate(page)) { + struct page *head = compound_head(page); + if (PageTransCompound(page)) { + int i; + + for (i = 0; i < HPAGE_PMD_NR; i++) { + if (head + i == page) + continue; + clear_highpage(head + i); + flush_dcache_page(head + i); + } + } if (copied < PAGE_SIZE) { unsigned from = pos & (PAGE_SIZE - 1); zero_user_segments(page, 0, from, from + copied, PAGE_SIZE); } - SetPageUptodate(page); + SetPageUptodate(head); } set_page_dirty(page); unlock_page(page); @@ -2858,11 +3454,24 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, sbinfo->gid = make_kgid(current_user_ns(), gid); if (!gid_valid(sbinfo->gid)) goto bad_val; +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE + } else if (!strcmp(this_char, "huge")) { + int huge; + huge = shmem_parse_huge(value); + if (huge < 0) + goto bad_val; + if (!has_transparent_hugepage() && + huge != SHMEM_HUGE_NEVER) + goto bad_val; + sbinfo->huge = huge; +#endif 
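shmem_get_unmapped_area() obtains PMD-aligned mappings without an arch-specific hook by requesting HPAGE_PMD_SIZE - PAGE_SIZE of extra address space and then sliding the start forward until the virtual address and the file offset agree modulo the huge-page size. The arithmetic in isolation, with an arbitrary made-up address standing in for what the real get_area() callback would return:

#include <stdio.h>

#define PAGE_SIZE      4096UL
#define HPAGE_PMD_SIZE (512 * PAGE_SIZE)

/* slide the inflated mapping so addr == file offset (mod 2M) */
static unsigned long align_for_thp(unsigned long inflated_addr,
                                   unsigned long pgoff)
{
        unsigned long offset = (pgoff * PAGE_SIZE) & (HPAGE_PMD_SIZE - 1);
        unsigned long inflated_offset = inflated_addr & (HPAGE_PMD_SIZE - 1);

        inflated_addr += offset - inflated_offset;
        if (inflated_offset > offset)
                inflated_addr += HPAGE_PMD_SIZE;
        return inflated_addr;
}

int main(void)
{
        /* an arbitrary, unaligned address a get_area() call might return */
        unsigned long addr = 0x7f1234567000UL;
        unsigned long aligned = align_for_thp(addr, 0);

        printf("%#lx -> %#lx (mod 2M = %lu)\n",
               addr, aligned, aligned & (HPAGE_PMD_SIZE - 1));
        return 0;
}

Because the region was inflated by HPAGE_PMD_SIZE - PAGE_SIZE and the slide is never larger than that, the realigned mapping still fits inside the range the callback handed out.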
+#ifdef CONFIG_NUMA } else if (!strcmp(this_char,"mpol")) { mpol_put(mpol); mpol = NULL; if (mpol_parse_str(value, &mpol)) goto bad_val; +#endif } else { pr_err("tmpfs: Bad mount option %s\n", this_char); goto error; @@ -2908,6 +3517,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) goto out; error = 0; + sbinfo->huge = config.huge; sbinfo->max_blocks = config.max_blocks; sbinfo->max_inodes = config.max_inodes; sbinfo->free_inodes = config.max_inodes - inodes; @@ -2941,6 +3551,11 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE + /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ + if (sbinfo->huge) + seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); +#endif shmem_show_mpol(seq, sbinfo->mpol); return 0; } @@ -3070,6 +3685,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; sbinfo->free_inodes = sbinfo->max_inodes; + spin_lock_init(&sbinfo->shrinklist_lock); + INIT_LIST_HEAD(&sbinfo->shrinklist); sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; @@ -3159,6 +3776,7 @@ static const struct address_space_operations shmem_aops = { static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, + .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, .read_iter = shmem_file_read_iter, @@ -3231,6 +3849,10 @@ static const struct super_operations shmem_ops = { .evict_inode = shmem_evict_inode, .drop_inode = generic_delete_inode, .put_super = shmem_put_super, +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE + .nr_cached_objects = shmem_unused_huge_count, + .free_cached_objects = shmem_unused_huge_scan, +#endif }; static const struct vm_operations_struct shmem_vm_ops = { @@ -3280,6 +3902,13 @@ int __init shmem_init(void) pr_err("Could not kern_mount tmpfs\n"); goto out1; } + +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE + if (has_transparent_hugepage() && shmem_huge < SHMEM_HUGE_DENY) + SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; + else + shmem_huge = 0; /* just in case it was patched */ +#endif return 0; out1: @@ -3291,6 +3920,91 @@ out3: return error; } +#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS) +static ssize_t shmem_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int values[] = { + SHMEM_HUGE_ALWAYS, + SHMEM_HUGE_WITHIN_SIZE, + SHMEM_HUGE_ADVISE, + SHMEM_HUGE_NEVER, + SHMEM_HUGE_DENY, + SHMEM_HUGE_FORCE, + }; + int i, count; + + for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) { + const char *fmt = shmem_huge == values[i] ? 
"[%s] " : "%s "; + + count += sprintf(buf + count, fmt, + shmem_format_huge(values[i])); + } + buf[count - 1] = '\n'; + return count; +} + +static ssize_t shmem_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + char tmp[16]; + int huge; + + if (count + 1 > sizeof(tmp)) + return -EINVAL; + memcpy(tmp, buf, count); + tmp[count] = '\0'; + if (count && tmp[count - 1] == '\n') + tmp[count - 1] = '\0'; + + huge = shmem_parse_huge(tmp); + if (huge == -EINVAL) + return -EINVAL; + if (!has_transparent_hugepage() && + huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) + return -EINVAL; + + shmem_huge = huge; + if (shmem_huge < SHMEM_HUGE_DENY) + SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; + return count; +} + +struct kobj_attribute shmem_enabled_attr = + __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); + +bool shmem_huge_enabled(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + loff_t i_size; + pgoff_t off; + + if (shmem_huge == SHMEM_HUGE_FORCE) + return true; + if (shmem_huge == SHMEM_HUGE_DENY) + return false; + switch (sbinfo->huge) { + case SHMEM_HUGE_NEVER: + return false; + case SHMEM_HUGE_ALWAYS: + return true; + case SHMEM_HUGE_WITHIN_SIZE: + off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); + i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >= HPAGE_PMD_SIZE && + i_size >> PAGE_SHIFT >= off) + return true; + case SHMEM_HUGE_ADVISE: + /* TODO: implement fadvise() hints */ + return (vma->vm_flags & VM_HUGEPAGE); + default: + VM_BUG_ON(1); + return false; + } +} +#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ + #else /* !CONFIG_SHMEM */ /* @@ -3333,6 +4047,15 @@ void shmem_unlock_mapping(struct address_space *mapping) { } +#ifdef CONFIG_MMU +unsigned long shmem_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); +} +#endif + void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { truncate_inode_pages_range(inode->i_mapping, lstart, lend); @@ -3459,6 +4182,13 @@ int shmem_zero_setup(struct vm_area_struct *vma) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < + (vma->vm_end & HPAGE_PMD_MASK)) { + khugepaged_enter(vma, vma->vm_flags); + } + return 0; } diff --git a/mm/slab.c b/mm/slab.c index cc8bbc1e6bc9..582df5e8a588 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1236,61 +1236,6 @@ static void __init set_up_node(struct kmem_cache *cachep, int index) } } -#ifdef CONFIG_SLAB_FREELIST_RANDOM -static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list, - size_t count) -{ - size_t i; - unsigned int rand; - - for (i = 0; i < count; i++) - list[i] = i; - - /* Fisher-Yates shuffle */ - for (i = count - 1; i > 0; i--) { - rand = prandom_u32_state(state); - rand %= (i + 1); - swap(list[i], list[rand]); - } -} - -/* Create a random sequence per cache */ -static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) -{ - unsigned int seed, count = cachep->num; - struct rnd_state state; - - if (count < 2) - return 0; - - /* If it fails, we will just use the global lists */ - cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp); - if (!cachep->random_seq) - return -ENOMEM; - - /* Get 
best entropy at this stage */ - get_random_bytes_arch(&seed, sizeof(seed)); - prandom_seed_state(&state, seed); - - freelist_randomize(&state, cachep->random_seq, count); - return 0; -} - -/* Destroy the per-cache random freelist sequence */ -static void cache_random_seq_destroy(struct kmem_cache *cachep) -{ - kfree(cachep->random_seq); - cachep->random_seq = NULL; -} -#else -static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) -{ - return 0; -} -static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } -#endif /* CONFIG_SLAB_FREELIST_RANDOM */ - - /* * Initialisation. Called after the page allocator have been initialised and * before smp_init(). @@ -1932,7 +1877,7 @@ static struct array_cache __percpu *alloc_kmem_cache_cpus( return cpu_cache; } -static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) +static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) { if (slab_state >= FULL) return enable_cpucache(cachep, gfp); @@ -2535,7 +2480,7 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) union freelist_init_state { struct { unsigned int pos; - freelist_idx_t *list; + unsigned int *list; unsigned int count; unsigned int rand; }; @@ -2554,7 +2499,7 @@ static bool freelist_state_initialize(union freelist_init_state *state, unsigned int rand; /* Use best entropy available to define a random shift */ - get_random_bytes_arch(&rand, sizeof(rand)); + rand = get_random_int(); /* Use a random state if the pre-computed list is not available */ if (!cachep->random_seq) { @@ -2576,13 +2521,20 @@ static freelist_idx_t next_random_slot(union freelist_init_state *state) return (state->list[state->pos++] + state->rand) % state->count; } +/* Swap two freelist entries */ +static void swap_free_obj(struct page *page, unsigned int a, unsigned int b) +{ + swap(((freelist_idx_t *)page->freelist)[a], + ((freelist_idx_t *)page->freelist)[b]); +} + /* * Shuffle the freelist initialization state based on pre-computed lists. * return true if the list was successfully shuffled, false otherwise. */ static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) { - unsigned int objfreelist = 0, i, count = cachep->num; + unsigned int objfreelist = 0, i, rand, count = cachep->num; union freelist_init_state state; bool precomputed; @@ -2607,7 +2559,15 @@ static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) * Later use a pre-computed list for speed. */ if (!precomputed) { - freelist_randomize(&state.rnd_state, page->freelist, count); + for (i = 0; i < count; i++) + set_free_obj(page, i, i); + + /* Fisher-Yates shuffle */ + for (i = count - 1; i > 0; i--) { + rand = prandom_u32_state(&state.rnd_state); + rand %= (i + 1); + swap_free_obj(page, i, rand); + } } else { for (i = 0; i < count; i++) set_free_obj(page, i, next_random_slot(&state)); @@ -2726,8 +2686,11 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, * critical path in kmem_cache_alloc(). */ if (unlikely(flags & GFP_SLAB_BUG_MASK)) { - pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); - BUG(); + gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; + flags &= ~GFP_SLAB_BUG_MASK; + pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). 
Fix your code!\n", + invalid_mask, &invalid_mask, flags, &flags); + dump_stack(); } local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); @@ -3979,7 +3942,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) int shared = 0; int batchcount = 0; - err = cache_random_seq_create(cachep, gfp); + err = cache_random_seq_create(cachep, cachep->num, gfp); if (err) goto end; @@ -4506,3 +4469,14 @@ size_t ksize(const void *objp) return size; } EXPORT_SYMBOL(ksize); + +void *nearest_obj(struct kmem_cache *cache, struct page *page, void *x) +{ + void *object = x - (x - page->s_mem) % cache->size; + void *last_object = page->s_mem + (cache->num - 1) * cache->size; + + if (unlikely(object > last_object)) + return last_object; + else + return object; +} diff --git a/mm/slab.h b/mm/slab.h index dedb1a920fb8..9be75e39449f 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -42,6 +42,7 @@ struct kmem_cache { #include <linux/kmemcheck.h> #include <linux/kasan.h> #include <linux/kmemleak.h> +#include <linux/random.h> /* * State of the slab allocator. @@ -253,8 +254,7 @@ static __always_inline int memcg_charge_slab(struct page *page, if (is_root_cache(s)) return 0; - ret = __memcg_kmem_charge_memcg(page, gfp, order, - s->memcg_params.memcg); + ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); if (ret) return ret; @@ -268,6 +268,9 @@ static __always_inline int memcg_charge_slab(struct page *page, static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct kmem_cache *s) { + if (!memcg_kmem_enabled()) + return; + memcg_kmem_update_page_stat(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, @@ -366,6 +369,8 @@ static inline size_t slab_ksize(const struct kmem_cache *s) if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) return s->object_size; # endif + if (s->flags & SLAB_KASAN) + return s->object_size; /* * If we have the need to store the freelist pointer * back there or track user information then we can @@ -390,7 +395,11 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, if (should_failslab(s, flags)) return NULL; - return memcg_kmem_get_cache(s, flags); + if (memcg_kmem_enabled() && + ((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT))) + return memcg_kmem_get_cache(s); + + return s; } static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, @@ -407,7 +416,9 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, s->flags, flags); kasan_slab_alloc(s, object, flags); } - memcg_kmem_put_cache(s); + + if (memcg_kmem_enabled()) + memcg_kmem_put_cache(s); } #ifndef CONFIG_SLOB @@ -462,6 +473,26 @@ void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); int memcg_slab_show(struct seq_file *m, void *p); +void *nearest_obj(struct kmem_cache *cache, struct page *page, void *x); + void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); +#if defined(CONFIG_SLUB) +void do_slab_free(struct kmem_cache *s, + struct page *page, void *head, void *tail, + int cnt, unsigned long addr); +#endif + +#ifdef CONFIG_SLAB_FREELIST_RANDOM +int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, + gfp_t gfp); +void cache_random_seq_destroy(struct kmem_cache *cachep); +#else +static inline int cache_random_seq_create(struct kmem_cache *cachep, + unsigned int count, gfp_t gfp) +{ + return 0; +} +static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } +#endif /* 
CONFIG_SLAB_FREELIST_RANDOM */ #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index a65dad7fdcd1..71f0b28a1bec 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -526,8 +526,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); - cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - css->id, memcg_name_buf); + cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name, + css->serial_nr, memcg_name_buf); if (!cache_name) goto out_unlock; @@ -1012,7 +1012,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) struct page *page; flags |= __GFP_COMP; - page = alloc_kmem_pages(flags, order); + page = alloc_pages(flags, order); ret = page ? page_address(page) : NULL; kmemleak_alloc(ret, size, 1, flags); kasan_kmalloc_large(ret, size, flags); @@ -1030,6 +1030,53 @@ void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) EXPORT_SYMBOL(kmalloc_order_trace); #endif +#ifdef CONFIG_SLAB_FREELIST_RANDOM +/* Randomize a generic freelist */ +static void freelist_randomize(struct rnd_state *state, unsigned int *list, + size_t count) +{ + size_t i; + unsigned int rand; + + for (i = 0; i < count; i++) + list[i] = i; + + /* Fisher-Yates shuffle */ + for (i = count - 1; i > 0; i--) { + rand = prandom_u32_state(state); + rand %= (i + 1); + swap(list[i], list[rand]); + } +} + +/* Create a random sequence per cache */ +int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, + gfp_t gfp) +{ + struct rnd_state state; + + if (count < 2 || cachep->random_seq) + return 0; + + cachep->random_seq = kcalloc(count, sizeof(unsigned int), gfp); + if (!cachep->random_seq) + return -ENOMEM; + + /* Get best entropy at this stage of boot */ + prandom_seed_state(&state, get_random_long()); + + freelist_randomize(&state, cachep->random_seq, count); + return 0; +} + +/* Destroy the per-cache random freelist sequence */ +void cache_random_seq_destroy(struct kmem_cache *cachep) +{ + kfree(cachep->random_seq); + cachep->random_seq = NULL; +} +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + #ifdef CONFIG_SLABINFO #ifdef CONFIG_SLAB diff --git a/mm/slub.c b/mm/slub.c index 825ff4505336..d4b62ed8208e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -191,7 +191,11 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ /* Internal SLUB flags */ +#ifndef CONFIG_KASAN #define __OBJECT_POISON 0x80000000UL /* Poison object */ +#else +#define __OBJECT_POISON 0x00000000UL /* Disable object poisoning */ +#endif #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ #ifdef CONFIG_SMP @@ -454,8 +458,6 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) */ #if defined(CONFIG_SLUB_DEBUG_ON) static int slub_debug = DEBUG_DEFAULT_FLAGS; -#elif defined(CONFIG_KASAN) -static int slub_debug = SLAB_STORE_USER; #else static int slub_debug; #endif @@ -1322,7 +1324,7 @@ static inline void kfree_hook(const void *x) kasan_kfree_large(x); } -static inline void slab_free_hook(struct kmem_cache *s, void *x) +static inline bool slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); @@ -1344,11 +1346,11 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); - kasan_slab_free(s, x); + return kasan_slab_free(s, x); } static inline void 
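/*
 * slab_free_freelist_hook() below now takes the detached freelist's
 * head/tail/cnt by reference: objects for which slab_free_hook()
 * returns true (typically because kasan_slab_free() has deferred them
 * to its quarantine) are unlinked from the list and *cnt is
 * decremented, so only objects that can really be freed reach
 * do_slab_free().
 */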
slab_free_freelist_hook(struct kmem_cache *s, - void *head, void *tail) + void **head, void **tail, int *cnt) { /* * Compiler cannot detect this function can be removed if slab_free_hook() @@ -1360,13 +1362,27 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s, defined(CONFIG_DEBUG_OBJECTS_FREE) || \ defined(CONFIG_KASAN) - void *object = head; - void *tail_obj = tail ? : head; + void *object = *head, *prev = NULL, *next = NULL; + void *tail_obj = *tail ? : *head; + bool skip = false; do { - slab_free_hook(s, object); - } while ((object != tail_obj) && - (object = get_freepointer(s, object))); + skip = slab_free_hook(s, object); + next = (object != tail_obj) ? + get_freepointer(s, object) : NULL; + if (skip) { + if (!prev) + *head = next; + else + set_freepointer(s, prev, next); + if (object == tail_obj) + *tail = prev; + (*cnt)--; + } else { + prev = object; + } + object = next; + } while (next); #endif } @@ -1405,6 +1421,109 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, return page; } +#ifdef CONFIG_SLAB_FREELIST_RANDOM +/* Pre-initialize the random sequence cache */ +static int init_cache_random_seq(struct kmem_cache *s) +{ + int err; + unsigned long i, count = oo_objects(s->oo); + + err = cache_random_seq_create(s, count, GFP_KERNEL); + if (err) { + pr_err("SLUB: Unable to initialize free list for %s\n", + s->name); + return err; + } + + /* Transform to an offset on the set of pages */ + if (s->random_seq) { + for (i = 0; i < count; i++) + s->random_seq[i] *= s->size; + } + return 0; +} + +/* Initialize each random sequence freelist per cache */ +static void __init init_freelist_randomization(void) +{ + struct kmem_cache *s; + + mutex_lock(&slab_mutex); + + list_for_each_entry(s, &slab_caches, list) + init_cache_random_seq(s); + + mutex_unlock(&slab_mutex); +} + +/* Get the next entry on the pre-computed freelist randomized */ +static void *next_freelist_entry(struct kmem_cache *s, struct page *page, + unsigned long *pos, void *start, + unsigned long page_limit, + unsigned long freelist_count) +{ + unsigned int idx; + + /* + * If the target page allocation failed, the number of objects on the + * page might be smaller than the usual size defined by the cache. 
+ */ + do { + idx = s->random_seq[*pos]; + *pos += 1; + if (*pos >= freelist_count) + *pos = 0; + } while (unlikely(idx >= page_limit)); + + return (char *)start + idx; +} + +/* Shuffle the single linked freelist based on a random pre-computed sequence */ +static bool shuffle_freelist(struct kmem_cache *s, struct page *page) +{ + void *start; + void *cur; + void *next; + unsigned long idx, pos, page_limit, freelist_count; + + if (page->objects < 2 || !s->random_seq) + return false; + + freelist_count = oo_objects(s->oo); + pos = get_random_int() % freelist_count; + + page_limit = page->objects * s->size; + start = fixup_red_left(s, page_address(page)); + + /* First entry is used as the base of the freelist */ + cur = next_freelist_entry(s, page, &pos, start, page_limit, + freelist_count); + page->freelist = cur; + + for (idx = 1; idx < page->objects; idx++) { + setup_object(s, page, cur); + next = next_freelist_entry(s, page, &pos, start, page_limit, + freelist_count); + set_freepointer(s, cur, next); + cur = next; + } + setup_object(s, page, cur); + set_freepointer(s, cur, NULL); + + return true; +} +#else +static inline int init_cache_random_seq(struct kmem_cache *s) +{ + return 0; +} +static inline void init_freelist_randomization(void) { } +static inline bool shuffle_freelist(struct kmem_cache *s, struct page *page) +{ + return false; +} +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; @@ -1412,6 +1531,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) gfp_t alloc_gfp; void *start, *p; int idx, order; + bool shuffle; flags &= gfp_allowed_mask; @@ -1473,15 +1593,19 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) kasan_poison_slab(page); - for_each_object_idx(p, idx, s, start, page->objects) { - setup_object(s, page, p); - if (likely(idx < page->objects)) - set_freepointer(s, p, p + s->size); - else - set_freepointer(s, p, NULL); + shuffle = shuffle_freelist(s, page); + + if (!shuffle) { + for_each_object_idx(p, idx, s, start, page->objects) { + setup_object(s, page, p); + if (likely(idx < page->objects)) + set_freepointer(s, p, p + s->size); + else + set_freepointer(s, p, NULL); + } + page->freelist = fixup_red_left(s, start); } - page->freelist = fixup_red_left(s, start); page->inuse = page->objects; page->frozen = 1; @@ -1504,8 +1628,10 @@ out: static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { if (unlikely(flags & GFP_SLAB_BUG_MASK)) { - pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); - BUG(); + gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; + flags &= ~GFP_SLAB_BUG_MASK; + pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n", + invalid_mask, &invalid_mask, flags, &flags); } return allocate_slab(s, @@ -2772,12 +2898,22 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, void *head, void *tail, int cnt, unsigned long addr) { + void *free_head = head, *free_tail = tail; + + slab_free_freelist_hook(s, &free_head, &free_tail, &cnt); + /* slab_free_freelist_hook() could have emptied the freelist. */ + if (cnt == 0) + return; + do_slab_free(s, page, free_head, free_tail, cnt, addr); +} + +__always_inline void do_slab_free(struct kmem_cache *s, + struct page *page, void *head, void *tail, + int cnt, unsigned long addr) +{ void *tail_obj = tail ? 
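/*
 * do_slab_free() is the lockless fast path proper; slab_free() above
 * only runs the freelist hook first and returns early when every
 * object was captured (cnt == 0).  ___cache_free(), used by the KASAN
 * quarantine, calls straight into do_slab_free() so the hooks are not
 * run a second time when a quarantined object is finally released.
 */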
: head; struct kmem_cache_cpu *c; unsigned long tid; - - slab_free_freelist_hook(s, head, tail); - redo: /* * Determine the currently cpus per cpu slab. @@ -2811,6 +2947,12 @@ redo: } +/* Helper function to be used from qlink_free() in mm/kasan/quarantine.c */ +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) +{ + do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr); +} + void kmem_cache_free(struct kmem_cache *s, void *x) { s = cache_from_obj(s, x); @@ -2867,7 +3009,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kfree_hook(object); - __free_kmem_pages(page, compound_order(page)); + __free_pages(page, compound_order(page)); p[size] = NULL; /* mark object processed */ return size; } @@ -3207,6 +3349,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) void __kmem_cache_release(struct kmem_cache *s) { + cache_random_seq_destroy(s); free_percpu(s->cpu_slab); free_kmem_cache_nodes(s); } @@ -3252,7 +3395,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; - unsigned long size = s->object_size; + size_t size = s->object_size; int order; /* @@ -3311,7 +3454,10 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * the object. */ size += 2 * sizeof(struct track); +#endif + kasan_cache_create(s, &size, &s->flags); +#ifdef CONFIG_SLUB_DEBUG if (flags & SLAB_RED_ZONE) { /* * Add some empty padding so that we can catch @@ -3431,6 +3577,13 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif + + /* Initialize the pre-computed randomized freelist if slab is up */ + if (slab_state >= UP) { + if (init_cache_random_seq(s)) + goto error; + } + if (!init_kmem_cache_nodes(s)) goto error; @@ -3575,7 +3728,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) void *ptr = NULL; flags |= __GFP_COMP | __GFP_NOTRACK; - page = alloc_kmem_pages_node(node, flags, get_order(size)); + page = alloc_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3656,7 +3809,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kfree_hook(x); - __free_kmem_pages(page, compound_order(page)); + __free_pages(page, compound_order(page)); return; } slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); @@ -3947,6 +4100,9 @@ void __init kmem_cache_init(void) setup_kmalloc_cache_index_table(); create_kmalloc_caches(0); + /* Setup random freelists for each cache */ + init_freelist_randomization(); + #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); #endif @@ -5585,3 +5741,16 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, return -EIO; } #endif /* CONFIG_SLABINFO */ + +void *nearest_obj(struct kmem_cache *cache, struct page *page, + void *x) { + void *object = x - (x - page_address(page)) % cache->size; + void *last_object = page_address(page) + + (page->objects - 1) * cache->size; + void *result = (unlikely(object > last_object)) ? 
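/*
 * SLUB counterpart of the nearest_obj() helper added to mm/slab.c
 * above: it rounds an arbitrary address inside a slab page down to the
 * start of the containing object, clamps it to the last object on the
 * page, and, when SLAB_RED_ZONE is active, steps past the left
 * red-zone padding via red_left_pad.
 */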
last_object : object; + + if (cache->flags & SLAB_RED_ZONE) + return (void *)((char *)result + cache->red_left_pad); + else + return result; +} diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 68885dcbaf40..574c67b663fe 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -36,7 +36,7 @@ * Uses the main allocators if they are available, else bootmem. */ -static void * __init_refok __earlyonly_bootmem_alloc(int node, +static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long size, unsigned long align, unsigned long goal) diff --git a/mm/sparse.c b/mm/sparse.c index 5d0cf4540364..d8c8ae410796 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -59,7 +59,7 @@ static inline void set_section_nid(unsigned long section_nr, int nid) #endif #ifdef CONFIG_SPARSEMEM_EXTREME -static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) +static noinline struct mem_section __ref *sparse_index_alloc(int nid) { struct mem_section *section = NULL; unsigned long array_size = SECTIONS_PER_ROOT * diff --git a/mm/swap.c b/mm/swap.c index 90530ff8ed16..616df4ddd870 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -292,6 +292,7 @@ static bool need_activate_page_drain(int cpu) void activate_page(struct page *page) { + page = compound_head(page); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); @@ -316,6 +317,7 @@ void activate_page(struct page *page) { struct zone *zone = page_zone(page); + page = compound_head(page); spin_lock_irq(&zone->lru_lock); __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); spin_unlock_irq(&zone->lru_lock); diff --git a/mm/swapfile.c b/mm/swapfile.c index 031713ab40ce..78cfa292a29a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2493,7 +2493,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } /* frontswap enabled? set up bit-per-page map for frontswap */ - if (frontswap_enabled) + if (IS_ENABLED(CONFIG_FRONTSWAP)) frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { diff --git a/mm/truncate.c b/mm/truncate.c index 4064f8f53daa..a01cce450a26 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -155,10 +155,14 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) int truncate_inode_page(struct address_space *mapping, struct page *page) { + loff_t holelen; + VM_BUG_ON_PAGE(PageTail(page), page); + + holelen = PageTransHuge(page) ? 
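/*
 * When the page being truncated is a transparent huge page, the
 * unmap_mapping_range() call below must punch a PMD-sized hole rather
 * than a single PAGE_SIZE one; holelen is chosen here accordingly.
 */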
HPAGE_PMD_SIZE : PAGE_SIZE; if (page_mapped(page)) { unmap_mapping_range(mapping, (loff_t)page->index << PAGE_SHIFT, - PAGE_SIZE, 0); + holelen, 0); } return truncate_complete_page(mapping, page); } @@ -279,7 +283,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (!trylock_page(page)) continue; - WARN_ON(page->index != index); + WARN_ON(page_to_pgoff(page) != index); if (PageWriteback(page)) { unlock_page(page); continue; @@ -367,7 +371,7 @@ void truncate_inode_pages_range(struct address_space *mapping, } lock_page(page); - WARN_ON(page->index != index); + WARN_ON(page_to_pgoff(page) != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); unlock_page(page); @@ -487,7 +491,21 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (!trylock_page(page)) continue; - WARN_ON(page->index != index); + + WARN_ON(page_to_pgoff(page) != index); + + /* Middle of THP: skip */ + if (PageTransTail(page)) { + unlock_page(page); + continue; + } else if (PageTransHuge(page)) { + index += HPAGE_PMD_NR - 1; + i += HPAGE_PMD_NR - 1; + /* 'end' is in the middle of THP */ + if (index == round_down(end, HPAGE_PMD_NR)) + continue; + } + ret = invalidate_inode_page(page); unlock_page(page); /* @@ -594,7 +612,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } lock_page(page); - WARN_ON(page->index != index); + WARN_ON(page_to_pgoff(page) != index); if (page->mapping != mapping) { unlock_page(page); continue; diff --git a/mm/util.c b/mm/util.c index 917e0e3d0f8e..8d010ef2ce1c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -399,10 +399,12 @@ struct address_space *page_mapping(struct page *page) } mapping = page->mapping; - if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) + if ((unsigned long)mapping & PAGE_MAPPING_ANON) return NULL; - return mapping; + + return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS); } +EXPORT_SYMBOL(page_mapping); /* Slow path of page_mapcount() for compound pages */ int __page_mapcount(struct page *page) @@ -410,6 +412,12 @@ int __page_mapcount(struct page *page) int ret; ret = atomic_read(&page->_mapcount) + 1; + /* + * For file THP page->_mapcount contains total number of mapping + * of the page: no need to look into compound_mapcount. 
+ */ + if (!PageAnon(page) && !PageHuge(page)) + return ret; page = compound_head(page); ret += atomic_read(compound_mapcount_ptr(page)) + 1; if (PageDoubleMap(page)) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e11475cdeb7a..91f44e78c516 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1501,7 +1501,7 @@ static void __vunmap(const void *addr, int deallocate_pages) struct page *page = area->pages[i]; BUG_ON(!page); - __free_kmem_pages(page, 0); + __free_pages(page, 0); } kvfree(area->pages); @@ -1629,9 +1629,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; if (node == NUMA_NO_NODE) - page = alloc_kmem_pages(alloc_mask, order); + page = alloc_pages(alloc_mask, order); else - page = alloc_kmem_pages_node(node, alloc_mask, order); + page = alloc_pages_node(node, alloc_mask, order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ diff --git a/mm/vmscan.c b/mm/vmscan.c index c4a2f4512fca..21d417ccff69 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1055,8 +1055,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Adding to swap updated mapping */ mapping = page_mapping(page); + } else if (unlikely(PageTransHuge(page))) { + /* Split file THP */ + if (split_huge_page_to_list(page, page_list)) + goto keep_locked; } + VM_BUG_ON_PAGE(PageTransHuge(page), page); + /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here. @@ -1254,7 +1260,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, list_for_each_entry_safe(page, next, page_list, lru) { if (page_is_file_cache(page) && !PageDirty(page) && - !isolated_balloon_page(page)) { + !__PageMovable(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); } diff --git a/mm/vmstat.c b/mm/vmstat.c index cb2a67bb4158..7997f52935c9 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -718,7 +718,9 @@ const char * const vmstat_text[] = { "nr_dirtied", "nr_written", "nr_pages_scanned", - +#if IS_ENABLED(CONFIG_ZSMALLOC) + "nr_zspages", +#endif #ifdef CONFIG_NUMA "numa_hit", "numa_miss", @@ -731,6 +733,8 @@ const char * const vmstat_text[] = { "workingset_activate", "workingset_nodereclaim", "nr_anon_transparent_hugepages", + "nr_shmem_hugepages", + "nr_shmem_pmdmapped", "nr_free_cma", /* enum writeback_stat_item counters */ @@ -815,6 +819,8 @@ const char * const vmstat_text[] = { "thp_fault_fallback", "thp_collapse_alloc", "thp_collapse_alloc_failed", + "thp_file_alloc", + "thp_file_mapped", "thp_split_page", "thp_split_page_failed", "thp_deferred_split_page", diff --git a/mm/workingset.c b/mm/workingset.c index 8a75f8d2916a..8252de4566e9 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -305,9 +305,10 @@ bool workingset_refault(void *shadow) */ void workingset_activation(struct page *page) { + struct mem_cgroup *memcg; struct lruvec *lruvec; - lock_page_memcg(page); + rcu_read_lock(); /* * Filter non-memcg pages here, e.g. unmap can call * mark_page_accessed() on VDSO pages. @@ -315,12 +316,13 @@ void workingset_activation(struct page *page) * XXX: See workingset_refault() - this should return * root_mem_cgroup even for !CONFIG_MEMCG. 
*/ - if (!mem_cgroup_disabled() && !page_memcg(page)) + memcg = page_memcg_rcu(page); + if (!mem_cgroup_disabled() && !memcg) goto out; - lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page)); + lruvec = mem_cgroup_zone_lruvec(page_zone(page), memcg); atomic_long_inc(&lruvec->inactive_age); out: - unlock_page_memcg(page); + rcu_read_unlock(); } /* diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b6d4f258cb53..e425de45e326 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -16,37 +16,22 @@ * struct page(s) to form a zspage. * * Usage of struct page fields: - * page->private: points to the first component (0-order) page - * page->index (union with page->freelist): offset of the first object - * starting in this page. For the first page, this is - * always 0, so we use this field (aka freelist) to point - * to the first free object in zspage. - * page->lru: links together all component pages (except the first page) - * of a zspage - * - * For _first_ page only: - * - * page->private: refers to the component page after the first page - * If the page is first_page for huge object, it stores handle. - * Look at size_class->huge. - * page->freelist: points to the first free object in zspage. - * Free objects are linked together using in-place - * metadata. - * page->objects: maximum number of objects we can store in this - * zspage (class->zspage_order * PAGE_SIZE / class->size) - * page->lru: links together first pages of various zspages. - * Basically forming list of zspages in a fullness group. - * page->mapping: class index and fullness group of the zspage - * page->inuse: the number of objects that are used in this zspage + * page->private: points to zspage + * page->freelist(index): links together all component pages of a zspage + * For the huge page, this is always 0, so we use this field + * to store handle. * * Usage of struct page flags: * PG_private: identifies the first component page * PG_private2: identifies the last component page + * PG_owner_priv_1: indentifies the huge component page * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define CREATE_TRACE_POINTS + #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -66,6 +51,12 @@ #include <linux/debugfs.h> #include <linux/zsmalloc.h> #include <linux/zpool.h> +#include <linux/mount.h> +#include <linux/migrate.h> +#include <linux/pagemap.h> +#include <trace/events/zsmalloc.h> + +#define ZSPAGE_MAGIC 0x58 /* * This must be power of 2 and greater than of equal to sizeof(link_free). @@ -88,9 +79,7 @@ * Object location (<PFN>, <obj_idx>) is encoded as * as single (unsigned long) handle value. * - * Note that object index <obj_idx> is relative to system - * page <PFN> it is stored in, so for each sub-page belonging - * to a zspage, obj_idx starts with 0. + * Note that object index <obj_idx> starts from 0. * * This is made more complicated by various memory models and PAE. 
*/ @@ -149,33 +138,29 @@ * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN * (reason above) */ -#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) +#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS) /* * We do not maintain any list for completely empty or full pages */ enum fullness_group { - ZS_ALMOST_FULL, - ZS_ALMOST_EMPTY, - _ZS_NR_FULLNESS_GROUPS, - ZS_EMPTY, - ZS_FULL + ZS_ALMOST_EMPTY, + ZS_ALMOST_FULL, + ZS_FULL, + NR_ZS_FULLNESS, }; enum zs_stat_type { + CLASS_EMPTY, + CLASS_ALMOST_EMPTY, + CLASS_ALMOST_FULL, + CLASS_FULL, OBJ_ALLOCATED, OBJ_USED, - CLASS_ALMOST_FULL, - CLASS_ALMOST_EMPTY, + NR_ZS_STAT_TYPE, }; -#ifdef CONFIG_ZSMALLOC_STAT -#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1) -#else -#define NR_ZS_STAT_TYPE (OBJ_USED + 1) -#endif - struct zs_size_stat { unsigned long objs[NR_ZS_STAT_TYPE]; }; @@ -184,6 +169,10 @@ struct zs_size_stat { static struct dentry *zs_stat_root; #endif +#ifdef CONFIG_COMPACTION +static struct vfsmount *zsmalloc_mnt; +#endif + /* * number of size_classes */ @@ -207,35 +196,49 @@ static const int fullness_threshold_frac = 4; struct size_class { spinlock_t lock; - struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; + struct list_head fullness_list[NR_ZS_FULLNESS]; /* * Size of objects stored in this class. Must be multiple * of ZS_ALIGN. */ int size; - unsigned int index; - - struct zs_size_stat stats; - + int objs_per_zspage; /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ int pages_per_zspage; - /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ - bool huge; + + unsigned int index; + struct zs_size_stat stats; }; +/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ +static void SetPageHugeObject(struct page *page) +{ + SetPageOwnerPriv1(page); +} + +static void ClearPageHugeObject(struct page *page) +{ + ClearPageOwnerPriv1(page); +} + +static int PageHugeObject(struct page *page) +{ + return PageOwnerPriv1(page); +} + /* * Placed within free objects to form a singly linked list. - * For every zspage, first_page->freelist gives head of this list. + * For every zspage, zspage->freeobj gives head of this list. * * This must be power of 2 and less than or equal to ZS_ALIGN */ struct link_free { union { /* - * Position of next free chunk (encodes <PFN, obj_idx>) + * Free object index; * It's valid for non-allocated object */ - void *next; + unsigned long next; /* * Handle of allocated object. 
*/ @@ -248,6 +251,7 @@ struct zs_pool { struct size_class **size_class; struct kmem_cache *handle_cachep; + struct kmem_cache *zspage_cachep; atomic_long_t pages_allocated; @@ -263,16 +267,36 @@ struct zs_pool { #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; #endif +#ifdef CONFIG_COMPACTION + struct inode *inode; + struct work_struct free_work; +#endif }; /* * A zspage's class index and fullness group * are encoded in its (first)page->mapping */ -#define CLASS_IDX_BITS 28 -#define FULLNESS_BITS 4 -#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1) -#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1) +#define FULLNESS_BITS 2 +#define CLASS_BITS 8 +#define ISOLATED_BITS 3 +#define MAGIC_VAL_BITS 8 + +struct zspage { + struct { + unsigned int fullness:FULLNESS_BITS; + unsigned int class:CLASS_BITS; + unsigned int isolated:ISOLATED_BITS; + unsigned int magic:MAGIC_VAL_BITS; + }; + unsigned int inuse; + unsigned int freeobj; + struct page *first_page; + struct list_head list; /* fullness list */ +#ifdef CONFIG_COMPACTION + rwlock_t lock; +#endif +}; struct mapping_area { #ifdef CONFIG_PGTABLE_MAPPING @@ -284,29 +308,74 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; -static int create_handle_cache(struct zs_pool *pool) +#ifdef CONFIG_COMPACTION +static int zs_register_migration(struct zs_pool *pool); +static void zs_unregister_migration(struct zs_pool *pool); +static void migrate_lock_init(struct zspage *zspage); +static void migrate_read_lock(struct zspage *zspage); +static void migrate_read_unlock(struct zspage *zspage); +static void kick_deferred_free(struct zs_pool *pool); +static void init_deferred_free(struct zs_pool *pool); +static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); +#else +static int zsmalloc_mount(void) { return 0; } +static void zsmalloc_unmount(void) {} +static int zs_register_migration(struct zs_pool *pool) { return 0; } +static void zs_unregister_migration(struct zs_pool *pool) {} +static void migrate_lock_init(struct zspage *zspage) {} +static void migrate_read_lock(struct zspage *zspage) {} +static void migrate_read_unlock(struct zspage *zspage) {} +static void kick_deferred_free(struct zs_pool *pool) {} +static void init_deferred_free(struct zs_pool *pool) {} +static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} +#endif + +static int create_cache(struct zs_pool *pool) { pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, 0, 0, NULL); - return pool->handle_cachep ? 
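/*
 * create_handle_cache()/destroy_handle_cache() are folded into
 * create_cache()/destroy_cache() below: besides the handle cache the
 * pool now needs a second kmem_cache for struct zspage, since the
 * per-zspage metadata no longer lives in struct page fields.
 */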
0 : 1; + if (!pool->handle_cachep) + return 1; + + pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage), + 0, 0, NULL); + if (!pool->zspage_cachep) { + kmem_cache_destroy(pool->handle_cachep); + pool->handle_cachep = NULL; + return 1; + } + + return 0; } -static void destroy_handle_cache(struct zs_pool *pool) +static void destroy_cache(struct zs_pool *pool) { kmem_cache_destroy(pool->handle_cachep); + kmem_cache_destroy(pool->zspage_cachep); } -static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp) +static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) { return (unsigned long)kmem_cache_alloc(pool->handle_cachep, - gfp & ~__GFP_HIGHMEM); + gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); } -static void free_handle(struct zs_pool *pool, unsigned long handle) +static void cache_free_handle(struct zs_pool *pool, unsigned long handle) { kmem_cache_free(pool->handle_cachep, (void *)handle); } +static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) +{ + return kmem_cache_alloc(pool->zspage_cachep, + flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); +}; + +static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) +{ + kmem_cache_free(pool->zspage_cachep, zspage); +} + static void record_obj(unsigned long handle, unsigned long obj) { /* @@ -409,38 +478,76 @@ static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); +static bool is_zspage_isolated(struct zspage *zspage) +{ + return zspage->isolated; +} + static int is_first_page(struct page *page) { return PagePrivate(page); } -static int is_last_page(struct page *page) +/* Protected by class->lock */ +static inline int get_zspage_inuse(struct zspage *zspage) +{ + return zspage->inuse; +} + +static inline void set_zspage_inuse(struct zspage *zspage, int val) +{ + zspage->inuse = val; +} + +static inline void mod_zspage_inuse(struct zspage *zspage, int val) { - return PagePrivate2(page); + zspage->inuse += val; } -static void get_zspage_mapping(struct page *first_page, +static inline struct page *get_first_page(struct zspage *zspage) +{ + struct page *first_page = zspage->first_page; + + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); + return first_page; +} + +static inline int get_first_obj_offset(struct page *page) +{ + return page->units; +} + +static inline void set_first_obj_offset(struct page *page, int offset) +{ + page->units = offset; +} + +static inline unsigned int get_freeobj(struct zspage *zspage) +{ + return zspage->freeobj; +} + +static inline void set_freeobj(struct zspage *zspage, unsigned int obj) +{ + zspage->freeobj = obj; +} + +static void get_zspage_mapping(struct zspage *zspage, unsigned int *class_idx, enum fullness_group *fullness) { - unsigned long m; - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); + BUG_ON(zspage->magic != ZSPAGE_MAGIC); - m = (unsigned long)first_page->mapping; - *fullness = m & FULLNESS_MASK; - *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK; + *fullness = zspage->fullness; + *class_idx = zspage->class; } -static void set_zspage_mapping(struct page *first_page, +static void set_zspage_mapping(struct zspage *zspage, unsigned int class_idx, enum fullness_group fullness) { - unsigned long m; - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - - m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | - (fullness & FULLNESS_MASK); - first_page->mapping = (struct 
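/*
 * Previously the class index and fullness group were packed into
 * first_page->mapping using CLASS_IDX_MASK/FULLNESS_MASK; they are now
 * plain bitfields (class, fullness) in struct zspage, so
 * get_zspage_mapping()/set_zspage_mapping() reduce to direct member
 * accesses.
 */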
address_space *)m; + zspage->class = class_idx; + zspage->fullness = fullness; } /* @@ -464,23 +571,19 @@ static int get_size_class_index(int size) static inline void zs_stat_inc(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { - if (type < NR_ZS_STAT_TYPE) - class->stats.objs[type] += cnt; + class->stats.objs[type] += cnt; } static inline void zs_stat_dec(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { - if (type < NR_ZS_STAT_TYPE) - class->stats.objs[type] -= cnt; + class->stats.objs[type] -= cnt; } static inline unsigned long zs_stat_get(struct size_class *class, enum zs_stat_type type) { - if (type < NR_ZS_STAT_TYPE) - return class->stats.objs[type]; - return 0; + return class->stats.objs[type]; } #ifdef CONFIG_ZSMALLOC_STAT @@ -624,6 +727,7 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool) } #endif + /* * For each size class, zspages are divided into different groups * depending on how "full" they are. This was done so that we could @@ -631,21 +735,20 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool) * the pool (not yet implemented). This function returns fullness * status of the given page. */ -static enum fullness_group get_fullness_group(struct page *first_page) +static enum fullness_group get_fullness_group(struct size_class *class, + struct zspage *zspage) { - int inuse, max_objects; + int inuse, objs_per_zspage; enum fullness_group fg; - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - - inuse = first_page->inuse; - max_objects = first_page->objects; + inuse = get_zspage_inuse(zspage); + objs_per_zspage = class->objs_per_zspage; if (inuse == 0) fg = ZS_EMPTY; - else if (inuse == max_objects) + else if (inuse == objs_per_zspage) fg = ZS_FULL; - else if (inuse <= 3 * max_objects / fullness_threshold_frac) + else if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac) fg = ZS_ALMOST_EMPTY; else fg = ZS_ALMOST_FULL; @@ -660,32 +763,25 @@ static enum fullness_group get_fullness_group(struct page *first_page) * identified by <class, fullness_group>. */ static void insert_zspage(struct size_class *class, - enum fullness_group fullness, - struct page *first_page) + struct zspage *zspage, + enum fullness_group fullness) { - struct page **head; - - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - - if (fullness >= _ZS_NR_FULLNESS_GROUPS) - return; - - zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? - CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); - - head = &class->fullness_list[fullness]; - if (!*head) { - *head = first_page; - return; - } + struct zspage *head; + zs_stat_inc(class, fullness, 1); + head = list_first_entry_or_null(&class->fullness_list[fullness], + struct zspage, list); /* - * We want to see more ZS_FULL pages and less almost - * empty/full. Put pages with higher ->inuse first. + * We want to see more ZS_FULL pages and less almost empty/full. + * Put pages with higher ->inuse first. */ - list_add_tail(&first_page->lru, &(*head)->lru); - if (first_page->inuse >= (*head)->inuse) - *head = first_page; + if (head) { + if (get_zspage_inuse(zspage) < get_zspage_inuse(head)) { + list_add(&zspage->list, &head->list); + return; + } + } + list_add(&zspage->list, &class->fullness_list[fullness]); } /* @@ -693,27 +789,14 @@ static void insert_zspage(struct size_class *class, * by <class, fullness_group>. 
*/ static void remove_zspage(struct size_class *class, - enum fullness_group fullness, - struct page *first_page) + struct zspage *zspage, + enum fullness_group fullness) { - struct page **head; - - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - - if (fullness >= _ZS_NR_FULLNESS_GROUPS) - return; - - head = &class->fullness_list[fullness]; - VM_BUG_ON_PAGE(!*head, first_page); - if (list_empty(&(*head)->lru)) - *head = NULL; - else if (*head == first_page) - *head = (struct page *)list_entry((*head)->lru.next, - struct page, lru); + VM_BUG_ON(list_empty(&class->fullness_list[fullness])); + VM_BUG_ON(is_zspage_isolated(zspage)); - list_del_init(&first_page->lru); - zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? - CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); + list_del_init(&zspage->list); + zs_stat_dec(class, fullness, 1); } /* @@ -726,19 +809,22 @@ static void remove_zspage(struct size_class *class, * fullness group. */ static enum fullness_group fix_fullness_group(struct size_class *class, - struct page *first_page) + struct zspage *zspage) { int class_idx; enum fullness_group currfg, newfg; - get_zspage_mapping(first_page, &class_idx, &currfg); - newfg = get_fullness_group(first_page); + get_zspage_mapping(zspage, &class_idx, &currfg); + newfg = get_fullness_group(class, zspage); if (newfg == currfg) goto out; - remove_zspage(class, currfg, first_page); - insert_zspage(class, newfg, first_page); - set_zspage_mapping(first_page, class_idx, newfg); + if (!is_zspage_isolated(zspage)) { + remove_zspage(class, zspage, currfg); + insert_zspage(class, zspage, newfg); + } + + set_zspage_mapping(zspage, class_idx, newfg); out: return newfg; @@ -780,64 +866,49 @@ static int get_pages_per_zspage(int class_size) return max_usedpc_order; } -/* - * A single 'zspage' is composed of many system pages which are - * linked together using fields in struct page. This function finds - * the first/head page, given any component page of a zspage. - */ -static struct page *get_first_page(struct page *page) +static struct zspage *get_zspage(struct page *page) { - if (is_first_page(page)) - return page; - else - return (struct page *)page_private(page); + struct zspage *zspage = (struct zspage *)page->private; + + BUG_ON(zspage->magic != ZSPAGE_MAGIC); + return zspage; } static struct page *get_next_page(struct page *page) { - struct page *next; + if (unlikely(PageHugeObject(page))) + return NULL; - if (is_last_page(page)) - next = NULL; - else if (is_first_page(page)) - next = (struct page *)page_private(page); - else - next = list_entry(page->lru.next, struct page, lru); + return page->freelist; +} - return next; +/** + * obj_to_location - get (<page>, <obj_idx>) from encoded object value + * @page: page object resides in zspage + * @obj_idx: object index + */ +static void obj_to_location(unsigned long obj, struct page **page, + unsigned int *obj_idx) +{ + obj >>= OBJ_TAG_BITS; + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *obj_idx = (obj & OBJ_INDEX_MASK); } -/* - * Encode <page, obj_idx> as a single handle value. - * We use the least bit of handle for tagging. 
+/** + * location_to_obj - get obj value encoded from (<page>, <obj_idx>) + * @page: page object resides in zspage + * @obj_idx: object index */ -static void *location_to_obj(struct page *page, unsigned long obj_idx) +static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) { unsigned long obj; - if (!page) { - VM_BUG_ON(obj_idx); - return NULL; - } - obj = page_to_pfn(page) << OBJ_INDEX_BITS; - obj |= ((obj_idx) & OBJ_INDEX_MASK); + obj |= obj_idx & OBJ_INDEX_MASK; obj <<= OBJ_TAG_BITS; - return (void *)obj; -} - -/* - * Decode <page, obj_idx> pair from the given object handle. We adjust the - * decoded obj_idx back to its original value since it was adjusted in - * location_to_obj(). - */ -static void obj_to_location(unsigned long obj, struct page **page, - unsigned long *obj_idx) -{ - obj >>= OBJ_TAG_BITS; - *page = pfn_to_page(obj >> OBJ_INDEX_BITS); - *obj_idx = (obj & OBJ_INDEX_MASK); + return obj; } static unsigned long handle_to_obj(unsigned long handle) @@ -845,109 +916,147 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static unsigned long obj_to_head(struct size_class *class, struct page *page, - void *obj) +static unsigned long obj_to_head(struct page *page, void *obj) { - if (class->huge) { + if (unlikely(PageHugeObject(page))) { VM_BUG_ON_PAGE(!is_first_page(page), page); - return page_private(page); + return page->index; } else return *(unsigned long *)obj; } -static unsigned long obj_idx_to_offset(struct page *page, - unsigned long obj_idx, int class_size) +static inline int testpin_tag(unsigned long handle) { - unsigned long off = 0; - - if (!is_first_page(page)) - off = page->index; - - return off + obj_idx * class_size; + return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); } static inline int trypin_tag(unsigned long handle) { - unsigned long *ptr = (unsigned long *)handle; - - return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr); + return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); } static void pin_tag(unsigned long handle) { - while (!trypin_tag(handle)); + bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); } static void unpin_tag(unsigned long handle) { - unsigned long *ptr = (unsigned long *)handle; - - clear_bit_unlock(HANDLE_PIN_BIT, ptr); + bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); } static void reset_page(struct page *page) { + __ClearPageMovable(page); clear_bit(PG_private, &page->flags); clear_bit(PG_private_2, &page->flags); set_page_private(page, 0); - page->mapping = NULL; - page->freelist = NULL; page_mapcount_reset(page); + ClearPageHugeObject(page); + page->freelist = NULL; +} + +/* + * To prevent zspage destroy during migration, zspage freeing should + * hold locks of all pages in the zspage. 
+ */ +void lock_zspage(struct zspage *zspage) +{ + struct page *page = get_first_page(zspage); + + do { + lock_page(page); + } while ((page = get_next_page(page)) != NULL); } -static void free_zspage(struct page *first_page) +int trylock_zspage(struct zspage *zspage) { - struct page *nextp, *tmp, *head_extra; + struct page *cursor, *fail; - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - VM_BUG_ON_PAGE(first_page->inuse, first_page); + for (cursor = get_first_page(zspage); cursor != NULL; cursor = + get_next_page(cursor)) { + if (!trylock_page(cursor)) { + fail = cursor; + goto unlock; + } + } - head_extra = (struct page *)page_private(first_page); + return 1; +unlock: + for (cursor = get_first_page(zspage); cursor != fail; cursor = + get_next_page(cursor)) + unlock_page(cursor); - reset_page(first_page); - __free_page(first_page); + return 0; +} - /* zspage with only 1 system page */ - if (!head_extra) - return; +static void __free_zspage(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) +{ + struct page *page, *next; + enum fullness_group fg; + unsigned int class_idx; - list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) { - list_del(&nextp->lru); - reset_page(nextp); - __free_page(nextp); + get_zspage_mapping(zspage, &class_idx, &fg); + + assert_spin_locked(&class->lock); + + VM_BUG_ON(get_zspage_inuse(zspage)); + VM_BUG_ON(fg != ZS_EMPTY); + + next = page = get_first_page(zspage); + do { + VM_BUG_ON_PAGE(!PageLocked(page), page); + next = get_next_page(page); + reset_page(page); + unlock_page(page); + dec_zone_page_state(page, NR_ZSPAGES); + put_page(page); + page = next; + } while (page != NULL); + + cache_free_zspage(pool, zspage); + + zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); +} + +static void free_zspage(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) +{ + VM_BUG_ON(get_zspage_inuse(zspage)); + VM_BUG_ON(list_empty(&zspage->list)); + + if (!trylock_zspage(zspage)) { + kick_deferred_free(pool); + return; } - reset_page(head_extra); - __free_page(head_extra); + + remove_zspage(class, zspage, ZS_EMPTY); + __free_zspage(pool, class, zspage); } /* Initialize a newly allocated zspage */ -static void init_zspage(struct size_class *class, struct page *first_page) +static void init_zspage(struct size_class *class, struct zspage *zspage) { + unsigned int freeobj = 1; unsigned long off = 0; - struct page *page = first_page; - - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); + struct page *page = get_first_page(zspage); while (page) { struct page *next_page; struct link_free *link; - unsigned int i = 1; void *vaddr; - /* - * page->index stores offset of first object starting - * in the page. For the first page, this is always 0, - * so we use first_page->index (aka ->freelist) to store - * head of corresponding zspage's freelist. 
- */ - if (page != first_page) - page->index = off; + set_first_obj_offset(page, off); vaddr = kmap_atomic(page); link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { - link->next = location_to_obj(page, i++); + link->next = freeobj++ << OBJ_TAG_BITS; link += class->size / sizeof(*link); } @@ -957,87 +1066,112 @@ static void init_zspage(struct size_class *class, struct page *first_page) * page (if present) */ next_page = get_next_page(page); - link->next = location_to_obj(next_page, 0); + if (next_page) { + link->next = freeobj++ << OBJ_TAG_BITS; + } else { + /* + * Reset OBJ_TAG_BITS bit to last link to tell + * whether it's allocated object or not. + */ + link->next = -1 << OBJ_TAG_BITS; + } kunmap_atomic(vaddr); page = next_page; off %= PAGE_SIZE; } + + set_freeobj(zspage, 0); } -/* - * Allocate a zspage for the given size class - */ -static struct page *alloc_zspage(struct size_class *class, gfp_t flags) +static void create_page_chain(struct size_class *class, struct zspage *zspage, + struct page *pages[]) { - int i, error; - struct page *first_page = NULL, *uninitialized_var(prev_page); + int i; + struct page *page; + struct page *prev_page = NULL; + int nr_pages = class->pages_per_zspage; /* * Allocate individual pages and link them together as: - * 1. first page->private = first sub-page - * 2. all sub-pages are linked together using page->lru - * 3. each sub-page is linked to the first page using page->private + * 1. all pages are linked together using page->freelist + * 2. each sub-page point to zspage using page->private * - * For each size class, First/Head pages are linked together using - * page->lru. Also, we set PG_private to identify the first page - * (i.e. no other sub-page has this flag set) and PG_private_2 to - * identify the last page. + * we set PG_private to identify the first page (i.e. no other sub-page + * has this flag set) and PG_private_2 to identify the last page. 
*/ - error = -ENOMEM; - for (i = 0; i < class->pages_per_zspage; i++) { - struct page *page; - - page = alloc_page(flags); - if (!page) - goto cleanup; - - INIT_LIST_HEAD(&page->lru); - if (i == 0) { /* first page */ + for (i = 0; i < nr_pages; i++) { + page = pages[i]; + set_page_private(page, (unsigned long)zspage); + page->freelist = NULL; + if (i == 0) { + zspage->first_page = page; SetPagePrivate(page); - set_page_private(page, 0); - first_page = page; - first_page->inuse = 0; + if (unlikely(class->objs_per_zspage == 1 && + class->pages_per_zspage == 1)) + SetPageHugeObject(page); + } else { + prev_page->freelist = page; } - if (i == 1) - set_page_private(first_page, (unsigned long)page); - if (i >= 1) - set_page_private(page, (unsigned long)first_page); - if (i >= 2) - list_add(&page->lru, &prev_page->lru); - if (i == class->pages_per_zspage - 1) /* last page */ + if (i == nr_pages - 1) SetPagePrivate2(page); prev_page = page; } +} - init_zspage(class, first_page); +/* + * Allocate a zspage for the given size class + */ +static struct zspage *alloc_zspage(struct zs_pool *pool, + struct size_class *class, + gfp_t gfp) +{ + int i; + struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE]; + struct zspage *zspage = cache_alloc_zspage(pool, gfp); + + if (!zspage) + return NULL; + + memset(zspage, 0, sizeof(struct zspage)); + zspage->magic = ZSPAGE_MAGIC; + migrate_lock_init(zspage); - first_page->freelist = location_to_obj(first_page, 0); - /* Maximum number of objects we can store in this zspage */ - first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; + for (i = 0; i < class->pages_per_zspage; i++) { + struct page *page; - error = 0; /* Success */ + page = alloc_page(gfp); + if (!page) { + while (--i >= 0) { + dec_zone_page_state(pages[i], NR_ZSPAGES); + __free_page(pages[i]); + } + cache_free_zspage(pool, zspage); + return NULL; + } -cleanup: - if (unlikely(error) && first_page) { - free_zspage(first_page); - first_page = NULL; + inc_zone_page_state(page, NR_ZSPAGES); + pages[i] = page; } - return first_page; + create_page_chain(class, zspage, pages); + init_zspage(class, zspage); + + return zspage; } -static struct page *find_get_zspage(struct size_class *class) +static struct zspage *find_get_zspage(struct size_class *class) { int i; - struct page *page; + struct zspage *zspage; - for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { - page = class->fullness_list[i]; - if (page) + for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) { + zspage = list_first_entry_or_null(&class->fullness_list[i], + struct zspage, list); + if (zspage) break; } - return page; + return zspage; } #ifdef CONFIG_PGTABLE_MAPPING @@ -1242,11 +1376,9 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) return true; } -static bool zspage_full(struct page *first_page) +static bool zspage_full(struct size_class *class, struct zspage *zspage) { - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - - return first_page->inuse == first_page->objects; + return get_zspage_inuse(zspage) == class->objs_per_zspage; } unsigned long zs_get_total_pages(struct zs_pool *pool) @@ -1272,8 +1404,10 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages); void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm) { + struct zspage *zspage; struct page *page; - unsigned long obj, obj_idx, off; + unsigned long obj, off; + unsigned int obj_idx; unsigned int class_idx; enum fullness_group fg; @@ -1294,9 +1428,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, obj = 
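/*
 * A handle decodes as obj = (<pfn> << OBJ_INDEX_BITS | obj_idx)
 * << OBJ_TAG_BITS, so obj_to_location() below recovers the page and
 * the object index, and the in-page offset is simply
 * (class->size * obj_idx) & ~PAGE_MASK now that obj_idx is global to
 * the zspage instead of per sub-page.  The mapping is done under
 * migrate_read_lock() so compaction cannot move the sub-pages while
 * they are kmapped.
 */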
handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); - get_zspage_mapping(get_first_page(page), &class_idx, &fg); + zspage = get_zspage(page); + + /* migration cannot move any subpage in this zspage */ + migrate_read_lock(zspage); + + get_zspage_mapping(zspage, &class_idx, &fg); class = pool->size_class[class_idx]; - off = obj_idx_to_offset(page, obj_idx, class->size); + off = (class->size * obj_idx) & ~PAGE_MASK; area = &get_cpu_var(zs_map_area); area->vm_mm = mm; @@ -1314,7 +1453,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, ret = __zs_map_object(area, pages, off, class->size); out: - if (!class->huge) + if (likely(!PageHugeObject(page))) ret += ZS_HANDLE_SIZE; return ret; @@ -1323,8 +1462,10 @@ EXPORT_SYMBOL_GPL(zs_map_object); void zs_unmap_object(struct zs_pool *pool, unsigned long handle) { + struct zspage *zspage; struct page *page; - unsigned long obj, obj_idx, off; + unsigned long obj, off; + unsigned int obj_idx; unsigned int class_idx; enum fullness_group fg; @@ -1333,9 +1474,10 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); - get_zspage_mapping(get_first_page(page), &class_idx, &fg); + zspage = get_zspage(page); + get_zspage_mapping(zspage, &class_idx, &fg); class = pool->size_class[class_idx]; - off = obj_idx_to_offset(page, obj_idx, class->size); + off = (class->size * obj_idx) & ~PAGE_MASK; area = this_cpu_ptr(&zs_map_area); if (off + class->size <= PAGE_SIZE) @@ -1350,38 +1492,50 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } put_cpu_var(zs_map_area); + + migrate_read_unlock(zspage); unpin_tag(handle); } EXPORT_SYMBOL_GPL(zs_unmap_object); static unsigned long obj_malloc(struct size_class *class, - struct page *first_page, unsigned long handle) + struct zspage *zspage, unsigned long handle) { + int i, nr_page, offset; unsigned long obj; struct link_free *link; struct page *m_page; - unsigned long m_objidx, m_offset; + unsigned long m_offset; void *vaddr; handle |= OBJ_ALLOCATED_TAG; - obj = (unsigned long)first_page->freelist; - obj_to_location(obj, &m_page, &m_objidx); - m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); + obj = get_freeobj(zspage); + + offset = obj * class->size; + nr_page = offset >> PAGE_SHIFT; + m_offset = offset & ~PAGE_MASK; + m_page = get_first_page(zspage); + + for (i = 0; i < nr_page; i++) + m_page = get_next_page(m_page); vaddr = kmap_atomic(m_page); link = (struct link_free *)vaddr + m_offset / sizeof(*link); - first_page->freelist = link->next; - if (!class->huge) + set_freeobj(zspage, link->next >> OBJ_TAG_BITS); + if (likely(!PageHugeObject(m_page))) /* record handle in the header of allocated chunk */ link->handle = handle; else - /* record handle in first_page->private */ - set_page_private(first_page, handle); + /* record handle to page->index */ + zspage->first_page->index = handle; + kunmap_atomic(vaddr); - first_page->inuse++; + mod_zspage_inuse(zspage, 1); zs_stat_inc(class, OBJ_USED, 1); + obj = location_to_obj(m_page, obj); + return obj; } @@ -1399,12 +1553,13 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) { unsigned long handle, obj; struct size_class *class; - struct page *first_page; + enum fullness_group newfg; + struct zspage *zspage; if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) return 0; - handle = alloc_handle(pool, gfp); + handle = cache_alloc_handle(pool, gfp); if (!handle) return 0; @@ -1413,29 
+1568,38 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) class = pool->size_class[get_size_class_index(size)]; spin_lock(&class->lock); - first_page = find_get_zspage(class); - - if (!first_page) { + zspage = find_get_zspage(class); + if (likely(zspage)) { + obj = obj_malloc(class, zspage, handle); + /* Now move the zspage to another fullness group, if required */ + fix_fullness_group(class, zspage); + record_obj(handle, obj); spin_unlock(&class->lock); - first_page = alloc_zspage(class, gfp); - if (unlikely(!first_page)) { - free_handle(pool, handle); - return 0; - } - set_zspage_mapping(first_page, class->index, ZS_EMPTY); - atomic_long_add(class->pages_per_zspage, - &pool->pages_allocated); + return handle; + } - spin_lock(&class->lock); - zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( - class->size, class->pages_per_zspage)); + spin_unlock(&class->lock); + + zspage = alloc_zspage(pool, class, gfp); + if (!zspage) { + cache_free_handle(pool, handle); + return 0; } - obj = obj_malloc(class, first_page, handle); - /* Now move the zspage to another fullness group, if required */ - fix_fullness_group(class, first_page); + spin_lock(&class->lock); + obj = obj_malloc(class, zspage, handle); + newfg = get_fullness_group(class, zspage); + insert_zspage(class, zspage, newfg); + set_zspage_mapping(zspage, class->index, newfg); record_obj(handle, obj); + atomic_long_add(class->pages_per_zspage, + &pool->pages_allocated); + zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); + + /* We completely set up zspage so mark them as movable */ + SetZsPageMovable(pool, zspage); spin_unlock(&class->lock); return handle; @@ -1445,36 +1609,38 @@ EXPORT_SYMBOL_GPL(zs_malloc); static void obj_free(struct size_class *class, unsigned long obj) { struct link_free *link; - struct page *first_page, *f_page; - unsigned long f_objidx, f_offset; + struct zspage *zspage; + struct page *f_page; + unsigned long f_offset; + unsigned int f_objidx; void *vaddr; obj &= ~OBJ_ALLOCATED_TAG; obj_to_location(obj, &f_page, &f_objidx); - first_page = get_first_page(f_page); - - f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); + f_offset = (class->size * f_objidx) & ~PAGE_MASK; + zspage = get_zspage(f_page); vaddr = kmap_atomic(f_page); /* Insert this object in containing zspage's freelist */ link = (struct link_free *)(vaddr + f_offset); - link->next = first_page->freelist; - if (class->huge) - set_page_private(first_page, 0); + link->next = get_freeobj(zspage) << OBJ_TAG_BITS; kunmap_atomic(vaddr); - first_page->freelist = (void *)obj; - first_page->inuse--; + set_freeobj(zspage, f_objidx); + mod_zspage_inuse(zspage, -1); zs_stat_dec(class, OBJ_USED, 1); } void zs_free(struct zs_pool *pool, unsigned long handle) { - struct page *first_page, *f_page; - unsigned long obj, f_objidx; + struct zspage *zspage; + struct page *f_page; + unsigned long obj; + unsigned int f_objidx; int class_idx; struct size_class *class; enum fullness_group fullness; + bool isolated; if (unlikely(!handle)) return; @@ -1482,25 +1648,31 @@ void zs_free(struct zs_pool *pool, unsigned long handle) pin_tag(handle); obj = handle_to_obj(handle); obj_to_location(obj, &f_page, &f_objidx); - first_page = get_first_page(f_page); + zspage = get_zspage(f_page); + + migrate_read_lock(zspage); - get_zspage_mapping(first_page, &class_idx, &fullness); + get_zspage_mapping(zspage, &class_idx, &fullness); class = pool->size_class[class_idx]; spin_lock(&class->lock); obj_free(class, 
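/*
 * After obj_free() the fullness group is re-evaluated: an empty zspage
 * is freed here only when it is not currently isolated by the
 * migration code; otherwise zs_page_putback() is responsible for
 * freeing it, and free_zspage() itself falls back to
 * kick_deferred_free() when it cannot trylock every sub-page.
 */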
obj); - fullness = fix_fullness_group(class, first_page); - if (fullness == ZS_EMPTY) { - zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( - class->size, class->pages_per_zspage)); - atomic_long_sub(class->pages_per_zspage, - &pool->pages_allocated); - free_zspage(first_page); + fullness = fix_fullness_group(class, zspage); + if (fullness != ZS_EMPTY) { + migrate_read_unlock(zspage); + goto out; } + + isolated = is_zspage_isolated(zspage); + migrate_read_unlock(zspage); + /* If zspage is isolated, zs_page_putback will free the zspage */ + if (likely(!isolated)) + free_zspage(pool, class, zspage); +out: + spin_unlock(&class->lock); unpin_tag(handle); - - free_handle(pool, handle); + cache_free_handle(pool, handle); } EXPORT_SYMBOL_GPL(zs_free); @@ -1508,7 +1680,7 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, unsigned long src) { struct page *s_page, *d_page; - unsigned long s_objidx, d_objidx; + unsigned int s_objidx, d_objidx; unsigned long s_off, d_off; void *s_addr, *d_addr; int s_size, d_size, size; @@ -1519,8 +1691,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, obj_to_location(src, &s_page, &s_objidx); obj_to_location(dst, &d_page, &d_objidx); - s_off = obj_idx_to_offset(s_page, s_objidx, class->size); - d_off = obj_idx_to_offset(d_page, d_objidx, class->size); + s_off = (class->size * s_objidx) & ~PAGE_MASK; + d_off = (class->size * d_objidx) & ~PAGE_MASK; if (s_off + class->size > PAGE_SIZE) s_size = PAGE_SIZE - s_off; @@ -1579,12 +1751,11 @@ static unsigned long find_alloced_obj(struct size_class *class, unsigned long handle = 0; void *addr = kmap_atomic(page); - if (!is_first_page(page)) - offset = page->index; + offset = get_first_obj_offset(page); offset += class->size * index; while (offset < PAGE_SIZE) { - head = obj_to_head(class, page, addr + offset); + head = obj_to_head(page, addr + offset); if (head & OBJ_ALLOCATED_TAG) { handle = head & ~OBJ_ALLOCATED_TAG; if (trypin_tag(handle)) @@ -1601,7 +1772,7 @@ static unsigned long find_alloced_obj(struct size_class *class, } struct zs_compact_control { - /* Source page for migration which could be a subpage of zspage. */ + /* Source spage for migration which could be a subpage of zspage */ struct page *s_page; /* Destination page for migration which should be a first page * of zspage. 
*/ @@ -1632,14 +1803,14 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, } /* Stop if there is no more space */ - if (zspage_full(d_page)) { + if (zspage_full(class, get_zspage(d_page))) { unpin_tag(handle); ret = -ENOMEM; break; } used_obj = handle_to_obj(handle); - free_obj = obj_malloc(class, d_page, handle); + free_obj = obj_malloc(class, get_zspage(d_page), handle); zs_object_copy(class, free_obj, used_obj); index++; /* @@ -1661,68 +1832,422 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, return ret; } -static struct page *isolate_target_page(struct size_class *class) +static struct zspage *isolate_zspage(struct size_class *class, bool source) { int i; - struct page *page; + struct zspage *zspage; + enum fullness_group fg[2] = {ZS_ALMOST_EMPTY, ZS_ALMOST_FULL}; - for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { - page = class->fullness_list[i]; - if (page) { - remove_zspage(class, i, page); - break; + if (!source) { + fg[0] = ZS_ALMOST_FULL; + fg[1] = ZS_ALMOST_EMPTY; + } + + for (i = 0; i < 2; i++) { + zspage = list_first_entry_or_null(&class->fullness_list[fg[i]], + struct zspage, list); + if (zspage) { + VM_BUG_ON(is_zspage_isolated(zspage)); + remove_zspage(class, zspage, fg[i]); + return zspage; } } - return page; + return zspage; } /* - * putback_zspage - add @first_page into right class's fullness list - * @pool: target pool + * putback_zspage - add @zspage into right class's fullness list * @class: destination class - * @first_page: target page + * @zspage: target page * - * Return @fist_page's fullness_group + * Return @zspage's fullness_group */ -static enum fullness_group putback_zspage(struct zs_pool *pool, - struct size_class *class, - struct page *first_page) +static enum fullness_group putback_zspage(struct size_class *class, + struct zspage *zspage) { enum fullness_group fullness; - fullness = get_fullness_group(first_page); - insert_zspage(class, fullness, first_page); - set_zspage_mapping(first_page, class->index, fullness); + VM_BUG_ON(is_zspage_isolated(zspage)); - if (fullness == ZS_EMPTY) { - zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( - class->size, class->pages_per_zspage)); - atomic_long_sub(class->pages_per_zspage, - &pool->pages_allocated); + fullness = get_fullness_group(class, zspage); + insert_zspage(class, zspage, fullness); + set_zspage_mapping(zspage, class->index, fullness); + + return fullness; +} + +#ifdef CONFIG_COMPACTION +static struct dentry *zs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + static const struct dentry_operations ops = { + .d_dname = simple_dname, + }; + + return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC); +} + +static struct file_system_type zsmalloc_fs = { + .name = "zsmalloc", + .mount = zs_mount, + .kill_sb = kill_anon_super, +}; + +static int zsmalloc_mount(void) +{ + int ret = 0; + + zsmalloc_mnt = kern_mount(&zsmalloc_fs); + if (IS_ERR(zsmalloc_mnt)) + ret = PTR_ERR(zsmalloc_mnt); + + return ret; +} + +static void zsmalloc_unmount(void) +{ + kern_unmount(zsmalloc_mnt); +} + +static void migrate_lock_init(struct zspage *zspage) +{ + rwlock_init(&zspage->lock); +} - free_zspage(first_page); +static void migrate_read_lock(struct zspage *zspage) +{ + read_lock(&zspage->lock); +} + +static void migrate_read_unlock(struct zspage *zspage) +{ + read_unlock(&zspage->lock); +} + +static void migrate_write_lock(struct zspage *zspage) +{ + write_lock(&zspage->lock); +} + +static void 
migrate_write_unlock(struct zspage *zspage) +{ + write_unlock(&zspage->lock); +} + +/* Number of isolated subpage for *page migration* in this zspage */ +static void inc_zspage_isolation(struct zspage *zspage) +{ + zspage->isolated++; +} + +static void dec_zspage_isolation(struct zspage *zspage) +{ + zspage->isolated--; +} + +static void replace_sub_page(struct size_class *class, struct zspage *zspage, + struct page *newpage, struct page *oldpage) +{ + struct page *page; + struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; + int idx = 0; + + page = get_first_page(zspage); + do { + if (page == oldpage) + pages[idx] = newpage; + else + pages[idx] = page; + idx++; + } while ((page = get_next_page(page)) != NULL); + + create_page_chain(class, zspage, pages); + set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); + if (unlikely(PageHugeObject(oldpage))) + newpage->index = oldpage->index; + __SetPageMovable(newpage, page_mapping(oldpage)); +} + +bool zs_page_isolate(struct page *page, isolate_mode_t mode) +{ + struct zs_pool *pool; + struct size_class *class; + int class_idx; + enum fullness_group fullness; + struct zspage *zspage; + struct address_space *mapping; + + /* + * Page is locked so zspage couldn't be destroyed. For detail, look at + * lock_zspage in free_zspage. + */ + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(PageIsolated(page), page); + + zspage = get_zspage(page); + + /* + * Without class lock, fullness could be stale while class_idx is okay + * because class_idx is constant unless page is freed so we should get + * fullness again under class lock. + */ + get_zspage_mapping(zspage, &class_idx, &fullness); + mapping = page_mapping(page); + pool = mapping->private_data; + class = pool->size_class[class_idx]; + + spin_lock(&class->lock); + if (get_zspage_inuse(zspage) == 0) { + spin_unlock(&class->lock); + return false; } - return fullness; + /* zspage is isolated for object migration */ + if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { + spin_unlock(&class->lock); + return false; + } + + /* + * If this is first time isolation for the zspage, isolate zspage from + * size_class to prevent further object allocation from the zspage. 
+ */ + if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { + get_zspage_mapping(zspage, &class_idx, &fullness); + remove_zspage(class, zspage, fullness); + } + + inc_zspage_isolation(zspage); + spin_unlock(&class->lock); + + return true; } -static struct page *isolate_source_page(struct size_class *class) +int zs_page_migrate(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode) +{ + struct zs_pool *pool; + struct size_class *class; + int class_idx; + enum fullness_group fullness; + struct zspage *zspage; + struct page *dummy; + void *s_addr, *d_addr, *addr; + int offset, pos; + unsigned long handle, head; + unsigned long old_obj, new_obj; + unsigned int obj_idx; + int ret = -EAGAIN; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + zspage = get_zspage(page); + + /* Concurrent compactor cannot migrate any subpage in zspage */ + migrate_write_lock(zspage); + get_zspage_mapping(zspage, &class_idx, &fullness); + pool = mapping->private_data; + class = pool->size_class[class_idx]; + offset = get_first_obj_offset(page); + + spin_lock(&class->lock); + if (!get_zspage_inuse(zspage)) { + ret = -EBUSY; + goto unlock_class; + } + + pos = offset; + s_addr = kmap_atomic(page); + while (pos < PAGE_SIZE) { + head = obj_to_head(page, s_addr + pos); + if (head & OBJ_ALLOCATED_TAG) { + handle = head & ~OBJ_ALLOCATED_TAG; + if (!trypin_tag(handle)) + goto unpin_objects; + } + pos += class->size; + } + + /* + * Here, any user cannot access all objects in the zspage so let's move. + */ + d_addr = kmap_atomic(newpage); + memcpy(d_addr, s_addr, PAGE_SIZE); + kunmap_atomic(d_addr); + + for (addr = s_addr + offset; addr < s_addr + pos; + addr += class->size) { + head = obj_to_head(page, addr); + if (head & OBJ_ALLOCATED_TAG) { + handle = head & ~OBJ_ALLOCATED_TAG; + if (!testpin_tag(handle)) + BUG(); + + old_obj = handle_to_obj(handle); + obj_to_location(old_obj, &dummy, &obj_idx); + new_obj = (unsigned long)location_to_obj(newpage, + obj_idx); + new_obj |= BIT(HANDLE_PIN_BIT); + record_obj(handle, new_obj); + } + } + + replace_sub_page(class, zspage, newpage, page); + get_page(newpage); + + dec_zspage_isolation(zspage); + + /* + * Page migration is done so let's putback isolated zspage to + * the list if @page is final isolated subpage in the zspage. 
+ */ + if (!is_zspage_isolated(zspage)) + putback_zspage(class, zspage); + + reset_page(page); + put_page(page); + page = newpage; + + ret = MIGRATEPAGE_SUCCESS; +unpin_objects: + for (addr = s_addr + offset; addr < s_addr + pos; + addr += class->size) { + head = obj_to_head(page, addr); + if (head & OBJ_ALLOCATED_TAG) { + handle = head & ~OBJ_ALLOCATED_TAG; + if (!testpin_tag(handle)) + BUG(); + unpin_tag(handle); + } + } + kunmap_atomic(s_addr); +unlock_class: + spin_unlock(&class->lock); + migrate_write_unlock(zspage); + + return ret; +} + +void zs_page_putback(struct page *page) +{ + struct zs_pool *pool; + struct size_class *class; + int class_idx; + enum fullness_group fg; + struct address_space *mapping; + struct zspage *zspage; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + zspage = get_zspage(page); + get_zspage_mapping(zspage, &class_idx, &fg); + mapping = page_mapping(page); + pool = mapping->private_data; + class = pool->size_class[class_idx]; + + spin_lock(&class->lock); + dec_zspage_isolation(zspage); + if (!is_zspage_isolated(zspage)) { + fg = putback_zspage(class, zspage); + /* + * Due to page_lock, we cannot free zspage immediately + * so let's defer. + */ + if (fg == ZS_EMPTY) + schedule_work(&pool->free_work); + } + spin_unlock(&class->lock); +} + +const struct address_space_operations zsmalloc_aops = { + .isolate_page = zs_page_isolate, + .migratepage = zs_page_migrate, + .putback_page = zs_page_putback, +}; + +static int zs_register_migration(struct zs_pool *pool) +{ + pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb); + if (IS_ERR(pool->inode)) { + pool->inode = NULL; + return 1; + } + + pool->inode->i_mapping->private_data = pool; + pool->inode->i_mapping->a_ops = &zsmalloc_aops; + return 0; +} + +static void zs_unregister_migration(struct zs_pool *pool) +{ + flush_work(&pool->free_work); + if (pool->inode) + iput(pool->inode); +} + +/* + * Caller should hold page_lock of all pages in the zspage + * In here, we cannot use zspage meta data. 
+ */ +static void async_free_zspage(struct work_struct *work) { int i; - struct page *page = NULL; + struct size_class *class; + unsigned int class_idx; + enum fullness_group fullness; + struct zspage *zspage, *tmp; + LIST_HEAD(free_pages); + struct zs_pool *pool = container_of(work, struct zs_pool, + free_work); - for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) { - page = class->fullness_list[i]; - if (!page) + for (i = 0; i < zs_size_classes; i++) { + class = pool->size_class[i]; + if (class->index != i) continue; - remove_zspage(class, i, page); - break; + spin_lock(&class->lock); + list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages); + spin_unlock(&class->lock); + } + + + list_for_each_entry_safe(zspage, tmp, &free_pages, list) { + list_del(&zspage->list); + lock_zspage(zspage); + + get_zspage_mapping(zspage, &class_idx, &fullness); + VM_BUG_ON(fullness != ZS_EMPTY); + class = pool->size_class[class_idx]; + spin_lock(&class->lock); + __free_zspage(pool, pool->size_class[class_idx], zspage); + spin_unlock(&class->lock); } +}; + +static void kick_deferred_free(struct zs_pool *pool) +{ + schedule_work(&pool->free_work); +} + +static void init_deferred_free(struct zs_pool *pool) +{ + INIT_WORK(&pool->free_work, async_free_zspage); +} + +static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) +{ + struct page *page = get_first_page(zspage); - return page; + do { + WARN_ON(!trylock_page(page)); + __SetPageMovable(page, pool->inode->i_mapping); + unlock_page(page); + } while ((page = get_next_page(page)) != NULL); } +#endif /* * @@ -1748,20 +2273,20 @@ static unsigned long zs_can_compact(struct size_class *class) static void __zs_compact(struct zs_pool *pool, struct size_class *class) { struct zs_compact_control cc; - struct page *src_page; - struct page *dst_page = NULL; + struct zspage *src_zspage; + struct zspage *dst_zspage = NULL; spin_lock(&class->lock); - while ((src_page = isolate_source_page(class))) { + while ((src_zspage = isolate_zspage(class, true))) { if (!zs_can_compact(class)) break; cc.index = 0; - cc.s_page = src_page; + cc.s_page = get_first_page(src_zspage); - while ((dst_page = isolate_target_page(class))) { - cc.d_page = dst_page; + while ((dst_zspage = isolate_zspage(class, false))) { + cc.d_page = get_first_page(dst_zspage); /* * If there is no more space in dst_page, resched * and see if anyone had allocated another zspage. 
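
The deferred-free path above (kick_deferred_free(), async_free_zspage(), and the schedule_work() call in zs_page_putback()) exists because an empty zspage cannot always be released on the spot: the putback path still holds the page lock. A stripped-down sketch of the same defer-and-flush pattern using only the generic workqueue API follows; the demo_* names and the list-splice step are placeholders for illustration, not the zsmalloc code itself.

#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/spinlock.h>

struct demo_pool {
        spinlock_t lock;
        struct list_head deferred;      /* items queued for later freeing */
        struct work_struct free_work;
};

static void demo_free_deferred(struct work_struct *work)
{
        struct demo_pool *pool = container_of(work, struct demo_pool, free_work);
        LIST_HEAD(tofree);

        /* Pull everything off the deferred list under the lock... */
        spin_lock(&pool->lock);
        list_splice_init(&pool->deferred, &tofree);
        spin_unlock(&pool->lock);

        /* ...and release it here, outside the caller's locking context. */
}

static void demo_pool_init(struct demo_pool *pool)
{
        spin_lock_init(&pool->lock);
        INIT_LIST_HEAD(&pool->deferred);
        INIT_WORK(&pool->free_work, demo_free_deferred);
}

/* Called from a context that must not free synchronously. */
static void demo_kick_deferred_free(struct demo_pool *pool)
{
        schedule_work(&pool->free_work);
}

/* Teardown must drain the work before the pool goes away. */
static void demo_pool_destroy(struct demo_pool *pool)
{
        flush_work(&pool->free_work);
}

This is also why zs_unregister_migration() calls flush_work() before the pool is torn down: the deferred worker must not run against freed size-class data.
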
@@ -1769,23 +2294,25 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) if (!migrate_zspage(pool, class, &cc)) break; - putback_zspage(pool, class, dst_page); + putback_zspage(class, dst_zspage); } /* Stop if we couldn't find slot */ - if (dst_page == NULL) + if (dst_zspage == NULL) break; - putback_zspage(pool, class, dst_page); - if (putback_zspage(pool, class, src_page) == ZS_EMPTY) + putback_zspage(class, dst_zspage); + if (putback_zspage(class, src_zspage) == ZS_EMPTY) { + free_zspage(pool, class, src_zspage); pool->stats.pages_compacted += class->pages_per_zspage; + } spin_unlock(&class->lock); cond_resched(); spin_lock(&class->lock); } - if (src_page) - putback_zspage(pool, class, src_page); + if (src_zspage) + putback_zspage(class, src_zspage); spin_unlock(&class->lock); } @@ -1794,6 +2321,9 @@ unsigned long zs_compact(struct zs_pool *pool) { int i; struct size_class *class; + unsigned long pages_compacted_before = pool->stats.pages_compacted; + + trace_zsmalloc_compact_start(pool->name); for (i = zs_size_classes - 1; i >= 0; i--) { class = pool->size_class[i]; @@ -1804,6 +2334,10 @@ unsigned long zs_compact(struct zs_pool *pool) __zs_compact(pool, class); } + trace_zsmalloc_compact_end(pool->name, + pool->stats.pages_compacted - pages_compacted_before, + pool->stats.pages_compacted); + return pool->stats.pages_compacted; } EXPORT_SYMBOL_GPL(zs_compact); @@ -1892,6 +2426,7 @@ struct zs_pool *zs_create_pool(const char *name) if (!pool) return NULL; + init_deferred_free(pool); pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), GFP_KERNEL); if (!pool->size_class) { @@ -1903,7 +2438,7 @@ struct zs_pool *zs_create_pool(const char *name) if (!pool->name) goto err; - if (create_handle_cache(pool)) + if (create_cache(pool)) goto err; /* @@ -1914,6 +2449,7 @@ struct zs_pool *zs_create_pool(const char *name) int size; int pages_per_zspage; struct size_class *class; + int fullness = 0; size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; if (size > ZS_MAX_ALLOC_SIZE) @@ -1943,11 +2479,13 @@ struct zs_pool *zs_create_pool(const char *name) class->size = size; class->index = i; class->pages_per_zspage = pages_per_zspage; - if (pages_per_zspage == 1 && - get_maxobj_per_zspage(size, pages_per_zspage) == 1) - class->huge = true; + class->objs_per_zspage = class->pages_per_zspage * + PAGE_SIZE / class->size; spin_lock_init(&class->lock); pool->size_class[i] = class; + for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS; + fullness++) + INIT_LIST_HEAD(&class->fullness_list[fullness]); prev_class = class; } @@ -1955,6 +2493,9 @@ struct zs_pool *zs_create_pool(const char *name) /* debug only, don't abort if it fails */ zs_pool_stat_create(pool, name); + if (zs_register_migration(pool)) + goto err; + /* * Not critical, we still can use the pool * and user can trigger compaction manually. 
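
The cached class->objs_per_zspage computed in zs_create_pool() above, and the (class->size * obj_idx) & ~PAGE_MASK idiom that replaces obj_idx_to_offset() throughout the patch, are plain integer arithmetic over the zspage layout. A minimal user-space sketch of that arithmetic; PAGE_SIZE/PAGE_SHIFT and the size-class numbers are hypothetical stand-ins, not the kernel definitions.

#include <stdio.h>

#define PAGE_SHIFT 12                   /* assume 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned int class_size = 3072;         /* hypothetical size class */
        unsigned int pages_per_zspage = 3;      /* hypothetical */
        unsigned int objs_per_zspage = pages_per_zspage * PAGE_SIZE / class_size;
        unsigned int obj_idx;

        printf("objs_per_zspage = %u\n", objs_per_zspage);     /* prints 4 */

        /* Same math as obj_malloc()/zs_map_object() use per object. */
        for (obj_idx = 0; obj_idx < objs_per_zspage; obj_idx++) {
                unsigned long offset = (unsigned long)obj_idx * class_size;

                printf("obj %u: subpage %lu, in-page offset %lu\n",
                       obj_idx, offset >> PAGE_SHIFT, offset & ~PAGE_MASK);
        }
        return 0;
}

With these numbers, object 1 starts at offset 3072 and runs past the first 4 KiB boundary; that straddling case is exactly why zs_map_object() falls back to the two-page copy path whenever off + class->size > PAGE_SIZE.
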
@@ -1974,6 +2515,7 @@ void zs_destroy_pool(struct zs_pool *pool) int i; zs_unregister_shrinker(pool); + zs_unregister_migration(pool); zs_pool_stat_destroy(pool); for (i = 0; i < zs_size_classes; i++) { @@ -1986,8 +2528,8 @@ void zs_destroy_pool(struct zs_pool *pool) if (class->index != i) continue; - for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { - if (class->fullness_list[fg]) { + for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) { + if (!list_empty(&class->fullness_list[fg])) { pr_info("Freeing non-empty class with size %db, fullness group %d\n", class->size, fg); } @@ -1995,7 +2537,7 @@ void zs_destroy_pool(struct zs_pool *pool) kfree(class); } - destroy_handle_cache(pool); + destroy_cache(pool); kfree(pool->size_class); kfree(pool->name); kfree(pool); @@ -2004,7 +2546,13 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool); static int __init zs_init(void) { - int ret = zs_register_cpu_notifier(); + int ret; + + ret = zsmalloc_mount(); + if (ret) + goto out; + + ret = zs_register_cpu_notifier(); if (ret) goto notifier_fail; @@ -2021,7 +2569,8 @@ static int __init zs_init(void) notifier_fail: zs_unregister_cpu_notifier(); - + zsmalloc_unmount(); +out: return ret; } @@ -2030,6 +2579,7 @@ static void __exit zs_exit(void) #ifdef CONFIG_ZPOOL zpool_unregister_driver(&zs_zpool_driver); #endif + zsmalloc_unmount(); zs_unregister_cpu_notifier(); zs_stat_exit(); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 735362c26c8e..f1dffe84f0d5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -769,6 +769,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) lockdep_set_class(&sk->sk_receive_queue.lock, &af_unix_sk_receive_queue_lock_key); + sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; sk->sk_destruct = unix_sock_destructor; diff --git a/samples/kprobes/jprobe_example.c b/samples/kprobes/jprobe_example.c index c3108bb15789..e3c0a40909f7 100644 --- a/samples/kprobes/jprobe_example.c +++ b/samples/kprobes/jprobe_example.c @@ -48,10 +48,10 @@ static int __init jprobe_init(void) ret = register_jprobe(&my_jprobe); if (ret < 0) { - printk(KERN_INFO "register_jprobe failed, returned %d\n", ret); + pr_err("register_jprobe failed, returned %d\n", ret); return -1; } - printk(KERN_INFO "Planted jprobe at %p, handler addr %p\n", + pr_info("Planted jprobe at %p, handler addr %p\n", my_jprobe.kp.addr, my_jprobe.entry); return 0; } @@ -59,7 +59,7 @@ static int __init jprobe_init(void) static void __exit jprobe_exit(void) { unregister_jprobe(&my_jprobe); - printk(KERN_INFO "jprobe at %p unregistered\n", my_jprobe.kp.addr); + pr_info("jprobe at %p unregistered\n", my_jprobe.kp.addr); } module_init(jprobe_init) diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c index ed0ca0c07242..e5754448146e 100644 --- a/samples/kprobes/kprobe_example.c +++ b/samples/kprobes/kprobe_example.c @@ -27,23 +27,19 @@ static struct kprobe kp = { static int handler_pre(struct kprobe *p, struct pt_regs *regs) { #ifdef CONFIG_X86 - printk(KERN_INFO "<%s> pre_handler: p->addr = 0x%p, ip = %lx," - " flags = 0x%lx\n", + pr_info("<%s> pre_handler: p->addr = 0x%p, ip = %lx, flags = 0x%lx\n", p->symbol_name, p->addr, regs->ip, regs->flags); #endif #ifdef CONFIG_PPC - printk(KERN_INFO "<%s> pre_handler: p->addr = 0x%p, nip = 0x%lx," - " msr = 0x%lx\n", + pr_info("<%s> pre_handler: p->addr = 0x%p, nip = 0x%lx, msr = 0x%lx\n", p->symbol_name, p->addr, regs->nip, regs->msr); #endif #ifdef 
CONFIG_MIPS - printk(KERN_INFO "<%s> pre_handler: p->addr = 0x%p, epc = 0x%lx," - " status = 0x%lx\n", + pr_info("<%s> pre_handler: p->addr = 0x%p, epc = 0x%lx, status = 0x%lx\n", p->symbol_name, p->addr, regs->cp0_epc, regs->cp0_status); #endif #ifdef CONFIG_TILEGX - printk(KERN_INFO "<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx," - " ex1 = 0x%lx\n", + pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx, ex1 = 0x%lx\n", p->symbol_name, p->addr, regs->pc, regs->ex1); #endif @@ -56,19 +52,19 @@ static void handler_post(struct kprobe *p, struct pt_regs *regs, unsigned long flags) { #ifdef CONFIG_X86 - printk(KERN_INFO "<%s> post_handler: p->addr = 0x%p, flags = 0x%lx\n", + pr_info("<%s> post_handler: p->addr = 0x%p, flags = 0x%lx\n", p->symbol_name, p->addr, regs->flags); #endif #ifdef CONFIG_PPC - printk(KERN_INFO "<%s> post_handler: p->addr = 0x%p, msr = 0x%lx\n", + pr_info("<%s> post_handler: p->addr = 0x%p, msr = 0x%lx\n", p->symbol_name, p->addr, regs->msr); #endif #ifdef CONFIG_MIPS - printk(KERN_INFO "<%s> post_handler: p->addr = 0x%p, status = 0x%lx\n", + pr_info("<%s> post_handler: p->addr = 0x%p, status = 0x%lx\n", p->symbol_name, p->addr, regs->cp0_status); #endif #ifdef CONFIG_TILEGX - printk(KERN_INFO "<%s> post_handler: p->addr = 0x%p, ex1 = 0x%lx\n", + pr_info("<%s> post_handler: p->addr = 0x%p, ex1 = 0x%lx\n", p->symbol_name, p->addr, regs->ex1); #endif } @@ -80,8 +76,7 @@ static void handler_post(struct kprobe *p, struct pt_regs *regs, */ static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr) { - printk(KERN_INFO "fault_handler: p->addr = 0x%p, trap #%dn", - p->addr, trapnr); + pr_info("fault_handler: p->addr = 0x%p, trap #%dn", p->addr, trapnr); /* Return 0 because we don't handle the fault. */ return 0; } @@ -95,17 +90,17 @@ static int __init kprobe_init(void) ret = register_kprobe(&kp); if (ret < 0) { - printk(KERN_INFO "register_kprobe failed, returned %d\n", ret); + pr_err("register_kprobe failed, returned %d\n", ret); return ret; } - printk(KERN_INFO "Planted kprobe at %p\n", kp.addr); + pr_info("Planted kprobe at %p\n", kp.addr); return 0; } static void __exit kprobe_exit(void) { unregister_kprobe(&kp); - printk(KERN_INFO "kprobe at %p unregistered\n", kp.addr); + pr_info("kprobe at %p unregistered\n", kp.addr); } module_init(kprobe_init) diff --git a/samples/kprobes/kretprobe_example.c b/samples/kprobes/kretprobe_example.c index ebb1d1aed547..7f9060f435cd 100644 --- a/samples/kprobes/kretprobe_example.c +++ b/samples/kprobes/kretprobe_example.c @@ -55,14 +55,14 @@ static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) */ static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { - int retval = regs_return_value(regs); + unsigned long retval = regs_return_value(regs); struct my_data *data = (struct my_data *)ri->data; s64 delta; ktime_t now; now = ktime_get(); delta = ktime_to_ns(ktime_sub(now, data->entry_stamp)); - printk(KERN_INFO "%s returned %d and took %lld ns to execute\n", + pr_info("%s returned %lu and took %lld ns to execute\n", func_name, retval, (long long)delta); return 0; } @@ -82,11 +82,10 @@ static int __init kretprobe_init(void) my_kretprobe.kp.symbol_name = func_name; ret = register_kretprobe(&my_kretprobe); if (ret < 0) { - printk(KERN_INFO "register_kretprobe failed, returned %d\n", - ret); + pr_err("register_kretprobe failed, returned %d\n", ret); return -1; } - printk(KERN_INFO "Planted return probe at %s: %p\n", + pr_info("Planted return probe at %s: %p\n", 
my_kretprobe.kp.symbol_name, my_kretprobe.kp.addr); return 0; } @@ -94,11 +93,10 @@ static int __init kretprobe_init(void) static void __exit kretprobe_exit(void) { unregister_kretprobe(&my_kretprobe); - printk(KERN_INFO "kretprobe at %p unregistered\n", - my_kretprobe.kp.addr); + pr_info("kretprobe at %p unregistered\n", my_kretprobe.kp.addr); /* nmissed > 0 suggests that maxactive was set too low. */ - printk(KERN_INFO "Missed probing %d instances of %s\n", + pr_info("Missed probing %d instances of %s\n", my_kretprobe.nmissed, my_kretprobe.kp.symbol_name); } diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index e7df0f5db7ec..76494e15417b 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -138,7 +138,7 @@ endif ifeq ($(CONFIG_KCOV),y) _c_flags += $(if $(patsubst n%,, \ - $(KCOV_INSTRUMENT_$(basetarget).o)$(KCOV_INSTRUMENT)y), \ + $(KCOV_INSTRUMENT_$(basetarget).o)$(KCOV_INSTRUMENT)$(CONFIG_KCOV_INSTRUMENT_ALL)), \ $(CFLAGS_KCOV)) endif diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter index 0254f3ba0dba..19f5adfd877d 100755 --- a/scripts/bloat-o-meter +++ b/scripts/bloat-o-meter @@ -67,5 +67,5 @@ print("%-40s %7s %7s %+7s" % ("function", "old", "new", "delta")) for d, n in delta: if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d)) -print("Total: Before=%d, After=%d, chg %f%%" % \ - (otot, ntot, (ntot - otot)*100/otot)) +print("Total: Before=%d, After=%d, chg %+.2f%%" % \ + (otot, ntot, (ntot - otot)*100.0/otot)) diff --git a/scripts/tags.sh b/scripts/tags.sh index f72f48f638ae..ed7eef24ef89 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -185,6 +185,9 @@ regex_c=( '/\<CLEARPAGEFLAG_NOOP(\([[:alnum:]_]*\).*/ClearPage\1/' '/\<__CLEARPAGEFLAG_NOOP(\([[:alnum:]_]*\).*/__ClearPage\1/' '/\<TESTCLEARFLAG_FALSE(\([[:alnum:]_]*\).*/TestClearPage\1/' + '/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/Page\1/' + '/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/__SetPage\1/' + '/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/__ClearPage\1/' '/^TASK_PFA_TEST([^,]*, *\([[:alnum:]_]*\))/task_\1/' '/^TASK_PFA_SET([^,]*, *\([[:alnum:]_]*\))/task_set_\1/' '/^TASK_PFA_CLEAR([^,]*, *\([[:alnum:]_]*\))/task_clear_\1/' diff --git a/tools/testing/selftests/rcutorture/configs/lock/CFcommon b/tools/testing/selftests/rcutorture/configs/lock/CFcommon index e372dc269254..779f984fb25c 100644 --- a/tools/testing/selftests/rcutorture/configs/lock/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/lock/CFcommon @@ -1,2 +1,2 @@ CONFIG_LOCK_TORTURE_TEST=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon index f824b4c9d9d9..7461db3a2422 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon @@ -1,5 +1,5 @@ CONFIG_RCU_TORTURE_TEST=y -CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_TIME=1 CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 77147b42d598..f1c055f3c243 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -79,12 +79,12 @@ static void add_list(char *buf, int len) } } -#define BUF_SIZE 1024 +#define BUF_SIZE (128 * 1024) int main(int argc, char **argv) { FILE *fin, *fout; - char buf[BUF_SIZE]; + char *buf; int ret, i, count; struct block_list *list2; struct stat st; @@ -107,6 +107,11 @@ int main(int argc, char **argv) max_size = 
st.st_size / 100; /* hack ... */ list = malloc(max_size * sizeof(*list)); + buf = malloc(BUF_SIZE); + if (!list || !buf) { + printf("Out of memory\n"); + exit(1); + } for ( ; ; ) { ret = read_block(buf, BUF_SIZE, fin);
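
The tools/vm/page_owner_sort.c hunk above grows the read buffer to 128 KiB, moves it off the stack, and finally checks both allocations before use. A minimal standalone sketch of the same pattern; the input file name and error handling are illustrative only, not the tool's code.

#include <stdio.h>
#include <stdlib.h>

#define BUF_SIZE (128 * 1024)   /* too large to keep on the stack */

int main(void)
{
        char *buf = malloc(BUF_SIZE);
        FILE *fin = fopen("page_owner_full.txt", "r");  /* example input */

        if (!buf || !fin) {
                fprintf(stderr, "Out of memory or cannot open input\n");
                free(buf);
                if (fin)
                        fclose(fin);
                return 1;
        }

        while (fgets(buf, BUF_SIZE, fin))
                ;       /* a real tool would accumulate lines into blocks here */

        fclose(fin);
        free(buf);
        return 0;
}

Checking the allocation up front is what the added "Out of memory" branch does in the tool, and heap storage avoids blowing a small stack with a 128 KiB array.
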