diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2018-04-19 13:30:57 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2018-04-19 13:30:57 +1000 |
commit | 6c2db30cee55dad6872c51d087872314ea1b6e0a (patch) | |
tree | 29817f8bc5ad3861f8729c80f8b7b213e597fb64 | |
parent | 252c5f39268c0ec2f5ce68f0c0d53e84d4317984 (diff) | |
parent | fdb1a3b69d1b9ef528b8a728682499e3dcd7719d (diff) | |
download | linux-next-6c2db30cee55dad6872c51d087872314ea1b6e0a.tar.gz |
Merge branch 'akpm-current/current'
61 files changed, 1141 insertions, 314 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi index d773d5697cf5..3187a18af6da 100644 --- a/Documentation/ABI/testing/sysfs-class-bdi +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -53,3 +53,11 @@ stable_pages_required (read-only) If set, the backing device requires that all pages comprising a write request must not be changed until writeout is complete. + +strictlimit (read-write) + + Forces per-BDI checks for the share of given device in the write-back + cache even before the global background dirty limit is reached. This + is useful in situations where the global limit is much higher than + affordable for given relatively slow (or untrusted) device. Turning + strictlimit on has no visible effect if max_ratio is equal to 100%. diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 74cdeaed9f7a..f728e55602b2 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -48,6 +48,7 @@ v1 is available under Documentation/cgroup-v1/. 5-2-1. Memory Interface Files 5-2-2. Usage Guidelines 5-2-3. Memory Ownership + 5-2-4. OOM Killer 5-3. IO 5-3-1. IO Interface Files 5-3-2. Writeback @@ -1039,6 +1040,31 @@ PAGE_SIZE multiple when read back. high limit is used and monitored properly, this limit's utility is limited to providing the final safety net. + memory.oom_group + + A read-write single value file which exists on non-root + cgroups. The default is "0". + + If set, OOM killer will consider the memory cgroup as an + indivisible memory consumers and compare it with other memory + consumers by it's memory footprint. + If such memory cgroup is selected as an OOM victim, all + processes belonging to it or it's descendants will be killed. + + This applies to system-wide OOM conditions and reaching + the hard memory limit of the cgroup and their ancestor. + If OOM condition happens in a descendant cgroup with it's own + memory limit, the memory cgroup can't be considered + as an OOM victim, and OOM killer will not kill all belonging + tasks. + + Also, OOM killer respects the /proc/pid/oom_score_adj value -1000, + and will never kill the unkillable task, even if memory.oom_group + is set. + + If cgroup-aware OOM killer is not enabled, ENOTSUPP error + is returned on attempt to access the file. + memory.events A read-only flat-keyed file which exists on non-root cgroups. The following entries are defined. Unless specified @@ -1242,6 +1268,54 @@ to be accessed repeatedly by other cgroups, it may make sense to use POSIX_FADV_DONTNEED to relinquish the ownership of memory areas belonging to the affected files to ensure correct memory ownership. +OOM Killer +~~~~~~~~~~ + +Cgroup v2 memory controller implements a cgroup-aware OOM killer. +It means that it treats cgroups as first class OOM entities. + +Cgroup-aware OOM logic is turned off by default and requires +passing the "groupoom" option on mounting cgroupfs. It can also +by remounting cgroupfs with the following command:: + + # mount -o remount,groupoom $MOUNT_POINT + +Under OOM conditions the memory controller tries to make the best +choice of a victim, looking for a memory cgroup with the largest +memory footprint, considering leaf cgroups and cgroups with the +memory.oom_group option set, which are considered to be an indivisible +memory consumers. + +By default, OOM killer will kill the biggest task in the selected +memory cgroup. A user can change this behavior by enabling +the per-cgroup memory.oom_group option. If set, it causes +the OOM killer to kill all processes attached to the cgroup, +except processes with oom_score_adj set to -1000. + +This affects both system- and cgroup-wide OOMs. For a cgroup-wide OOM +the memory controller considers only cgroups belonging to the sub-tree +of the OOM'ing cgroup. + +Leaf cgroups and cgroups with oom_group option set are compared based +on their cumulative memory usage. The root cgroup is treated as a +leaf memory cgroup as well, so it is compared with other leaf memory +cgroups. Due to internal implementation restrictions the size of +the root cgroup is the cumulative sum of oom_badness of all its tasks +(in other words oom_score_adj of each task is obeyed). Relying on +oom_score_adj (apart from OOM_SCORE_ADJ_MIN) can lead to over- or +underestimation of the root cgroup consumption and it is therefore +discouraged. This might change in the future, however. + +If there are no cgroups with the enabled memory controller, +the OOM killer is using the "traditional" process-based approach. + +Please, note that memory charges are not migrating if tasks +are moved between different memory cgroups. Moving tasks with +significant memory footprint may affect OOM victim selection logic. +If it's a case, please, consider creating a common ancestor for +the source and destination memory cgroups and enabling oom_group +on ancestor layer. + IO -- diff --git a/MAINTAINERS b/MAINTAINERS index 909fb93b28fc..f9bf4651db41 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1373,7 +1373,8 @@ F: arch/arm/mach-ebsa110/ F: drivers/net/ethernet/amd/am79c961a.* ARM/ENERGY MICRO (SILICON LABS) EFM32 SUPPORT -M: Uwe Kleine-König <kernel@pengutronix.de> +M: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> +R: Pengutronix Kernel Team <kernel@pengutronix.de> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained N: efm32 @@ -1401,7 +1402,8 @@ F: arch/arm/mach-footbridge/ ARM/FREESCALE IMX / MXC ARM ARCHITECTURE M: Shawn Guo <shawnguo@kernel.org> -M: Sascha Hauer <kernel@pengutronix.de> +M: Sascha Hauer <s.hauer@pengutronix.de> +R: Pengutronix Kernel Team <kernel@pengutronix.de> R: Fabio Estevam <fabio.estevam@nxp.com> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained @@ -1416,7 +1418,8 @@ F: include/soc/imx/ ARM/FREESCALE VYBRID ARM ARCHITECTURE M: Shawn Guo <shawnguo@kernel.org> -M: Sascha Hauer <kernel@pengutronix.de> +M: Sascha Hauer <s.hauer@pengutronix.de> +R: Pengutronix Kernel Team <kernel@pengutronix.de> R: Stefan Agner <stefan@agner.ch> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained @@ -5652,7 +5655,8 @@ F: drivers/net/ethernet/freescale/fec.h F: Documentation/devicetree/bindings/net/fsl-fec.txt FREESCALE IMX / MXC FRAMEBUFFER DRIVER -M: Sascha Hauer <kernel@pengutronix.de> +M: Sascha Hauer <s.hauer@pengutronix.de> +R: Pengutronix Kernel Team <kernel@pengutronix.de> L: linux-fbdev@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained @@ -12814,7 +12818,8 @@ F: include/linux/siphash.h SIOX M: Gavin Schenk <g.schenk@eckelmann.de> -M: Uwe Kleine-König <kernel@pengutronix.de> +M: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> +R: Pengutronix Kernel Team <kernel@pengutronix.de> S: Supported F: drivers/siox/* F: include/trace/events/siox.h diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index 4355f0ec44d6..f98baaec0a15 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -17,6 +17,8 @@ #ifndef __ASSEMBLY__ +#include <linux/personality.h> /* For READ_IMPLIES_EXEC */ + #ifndef CONFIG_MMU #include <asm/page-nommu.h> diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 9033c8194eda..ccc421503363 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private) LOAD_INT(c), LOAD_FRAC(c), count_active_contexts(), atomic_read(&nr_spu_contexts), - idr_get_cursor(&task_active_pid_ns(current)->idr)); + idr_get_cursor(&task_active_pid_ns(current)->idr) - 1); return 0; } diff --git a/block/genhd.c b/block/genhd.c index dc7e089373b9..10dcc29b5e9d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -991,7 +991,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 9d27016c899e..0434ab7b6497 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -740,10 +740,7 @@ static int do_dma_request(struct mport_dma_req *req, tx->callback = dma_xfer_callback; tx->callback_param = req; - req->dmach = chan; - req->sync = sync; req->status = DMA_IN_PROGRESS; - init_completion(&req->req_comp); kref_get(&req->refcount); cookie = dmaengine_submit(tx); @@ -831,13 +828,20 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, if (!req) return -ENOMEM; - kref_init(&req->refcount); - ret = get_dma_channel(priv); if (ret) { kfree(req); return ret; } + chan = priv->dmach; + + kref_init(&req->refcount); + init_completion(&req->req_comp); + req->dir = dir; + req->filp = filp; + req->priv = priv; + req->dmach = chan; + req->sync = sync; /* * If parameter loc_addr != NULL, we are transferring data from/to @@ -925,11 +929,6 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, xfer->offset, xfer->length); } - req->dir = dir; - req->filp = filp; - req->priv = priv; - chan = priv->dmach; - nents = dma_map_sg(chan->device->dev, req->sgt.sgl, req->sgt.nents, dir); if (nents == 0) { diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 82e8f6edfb48..b12e37f27530 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -749,7 +749,7 @@ static int autofs4_dir_mkdir(struct inode *dir, autofs4_del_active(dentry); - inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555); + inode = autofs4_get_inode(dir->i_sb, S_IFDIR | mode); if (!inode) return -ENOMEM; d_add(dentry, inode); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 41e04183e4ce..64ae6b1ebdd9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1621,8 +1621,8 @@ static int fill_files_note(struct memelfnote *note) if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */ return -EINVAL; size = round_up(size, PAGE_SIZE); - data = vmalloc(size); - if (!data) + data = kvmalloc(size, GFP_KERNEL); + if (ZERO_OR_NULL_PTR(data)) return -ENOMEM; start_end_ofs = data + 2; @@ -1639,7 +1639,7 @@ static int fill_files_note(struct memelfnote *note) filename = file_path(file, name_curpos, remaining); if (IS_ERR(filename)) { if (PTR_ERR(filename) == -ENAMETOOLONG) { - vfree(data); + kvfree(data); size = size * 5 / 4; goto alloc; } @@ -1932,7 +1932,7 @@ static void free_note_info(struct elf_note_info *info) kfree(t); } kfree(info->psinfo.data); - vfree(info->files.data); + kvfree(info->files.data); } #else @@ -2148,7 +2148,7 @@ static void free_note_info(struct elf_note_info *info) /* Free data possibly allocated by fill_files_note(): */ if (info->notes_files) - vfree(info->notes_files->data); + kvfree(info->notes_files->data); kfree(info->prstatus); kfree(info->psinfo); @@ -2294,8 +2294,8 @@ static int elf_core_dump(struct coredump_params *cprm) if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz)) goto end_coredump; - vma_filesz = vmalloc((segs - 1) * sizeof(*vma_filesz)); - if (!vma_filesz) + vma_filesz = kvmalloc((segs - 1) * sizeof(*vma_filesz), GFP_KERNEL); + if (ZERO_OR_NULL_PTR(vma_filesz)) goto end_coredump; for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; @@ -2402,7 +2402,7 @@ end_coredump: cleanup: free_note_info(&info); kfree(shdr4extnum); - vfree(vma_filesz); + kvfree(vma_filesz); kfree(phdr4note); kfree(elf); out: diff --git a/fs/dcache.c b/fs/dcache.c index 86d2de63461e..3507badeb60a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -292,7 +292,8 @@ void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry spin_unlock(&dentry->d_lock); name->name = p->name; } else { - memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN); + memcpy(name->inline_name, dentry->d_iname, + dentry->d_name.len + 1); spin_unlock(&dentry->d_lock); name->name = name->inline_name; } @@ -1488,6 +1489,7 @@ void shrink_dcache_parent(struct dentry *parent) break; shrink_dentry_list(&data.dispose); + cond_resched(); } } EXPORT_SYMBOL(shrink_dcache_parent); diff --git a/fs/exec.c b/fs/exec.c index 183059c427b9..32eea4c65909 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1338,6 +1338,7 @@ void setup_new_exec(struct linux_binprm * bprm) if (bprm->secureexec) { /* Make sure parent cannot signal privileged process. */ current->pdeath_signal = 0; + current->signal->pdeath_signal_proc = 0; /* * For secureexec, reset the stack limit to sane default to diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 3c6a9c156b7a..cfa862ea19d2 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -146,68 +146,82 @@ int _ore_get_io_state(struct ore_layout *layout, struct ore_io_state **pios) { struct ore_io_state *ios; - struct page **pages; - struct osd_sg_entry *sgilist; + size_t size_ios, size_extra, size_total; + void *ios_extra; + + /* + * The desired layout looks like this, with the extra_allocation + * items pointed at from fields within ios or per_dev: + struct __alloc_all_io_state { struct ore_io_state ios; struct ore_per_dev_state per_dev[numdevs]; union { struct osd_sg_entry sglist[sgs_per_dev * numdevs]; struct page *pages[num_par_pages]; - }; - } *_aios; - - if (likely(sizeof(*_aios) <= PAGE_SIZE)) { - _aios = kzalloc(sizeof(*_aios), GFP_KERNEL); - if (unlikely(!_aios)) { - ORE_DBGMSG("Failed kzalloc bytes=%zd\n", - sizeof(*_aios)); + } extra_allocation; + } whole_allocation; + + */ + + /* This should never happen, so abort early if it ever does. */ + if (sgs_per_dev && num_par_pages) { + ORE_DBGMSG("Tried to use both pages and sglist\n"); + *pios = NULL; + return -EINVAL; + } + + if (numdevs > (INT_MAX - sizeof(*ios)) / + sizeof(struct ore_per_dev_state)) + return -ENOMEM; + size_ios = sizeof(*ios) + sizeof(struct ore_per_dev_state) * numdevs; + + if (sgs_per_dev * numdevs > INT_MAX / sizeof(struct osd_sg_entry)) + return -ENOMEM; + if (num_par_pages > INT_MAX / sizeof(struct page *)) + return -ENOMEM; + size_extra = max(sizeof(struct osd_sg_entry) * (sgs_per_dev * numdevs), + sizeof(struct page *) * num_par_pages); + + size_total = size_ios + size_extra; + + if (likely(size_total <= PAGE_SIZE)) { + ios = kzalloc(size_total, GFP_KERNEL); + if (unlikely(!ios)) { + ORE_DBGMSG("Failed kzalloc bytes=%zd\n", size_total); *pios = NULL; return -ENOMEM; } - pages = num_par_pages ? _aios->pages : NULL; - sgilist = sgs_per_dev ? _aios->sglist : NULL; - ios = &_aios->ios; + ios_extra = (char *)ios + size_ios; } else { - struct __alloc_small_io_state { - struct ore_io_state ios; - struct ore_per_dev_state per_dev[numdevs]; - } *_aio_small; - union __extra_part { - struct osd_sg_entry sglist[sgs_per_dev * numdevs]; - struct page *pages[num_par_pages]; - } *extra_part; - - _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); - if (unlikely(!_aio_small)) { + ios = kzalloc(size_ios, GFP_KERNEL); + if (unlikely(!ios)) { ORE_DBGMSG("Failed alloc first part bytes=%zd\n", - sizeof(*_aio_small)); + size_ios); *pios = NULL; return -ENOMEM; } - extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); - if (unlikely(!extra_part)) { + ios_extra = kzalloc(size_extra, GFP_KERNEL); + if (unlikely(!ios_extra)) { ORE_DBGMSG("Failed alloc second part bytes=%zd\n", - sizeof(*extra_part)); - kfree(_aio_small); + size_extra); + kfree(ios); *pios = NULL; return -ENOMEM; } - pages = num_par_pages ? extra_part->pages : NULL; - sgilist = sgs_per_dev ? extra_part->sglist : NULL; /* In this case the per_dev[0].sgilist holds the pointer to * be freed */ - ios = &_aio_small->ios; ios->extra_part_alloc = true; } - if (pages) { - ios->parity_pages = pages; + if (num_par_pages) { + ios->parity_pages = ios_extra; ios->max_par_pages = num_par_pages; } - if (sgilist) { + if (sgs_per_dev) { + struct osd_sg_entry *sgilist = ios_extra; unsigned d; for (d = 0; d < numdevs; ++d) { diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 27cbdb697649..659129d5e9f7 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -71,6 +71,11 @@ static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, { struct __stripe_pages_2d *sp2d; unsigned data_devs = group_width - parity; + + /* + * Desired allocation layout is, though when larger than PAGE_SIZE, + * each struct __alloc_1p_arrays is separately allocated: + struct _alloc_all_bytes { struct __alloc_stripe_pages_2d { struct __stripe_pages_2d sp2d; @@ -82,55 +87,85 @@ static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, char page_is_read[data_devs]; } __a1pa[pages_in_unit]; } *_aab; + struct __alloc_1p_arrays *__a1pa; struct __alloc_1p_arrays *__a1pa_end; - const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); + + */ + + char *__a1pa; + char *__a1pa_end; + + const size_t sizeof_stripe_pages_2d = + sizeof(struct __stripe_pages_2d) + + sizeof(struct __1_page_stripe) * pages_in_unit; + const size_t sizeof__a1pa = + ALIGN(sizeof(struct page *) * (2 * group_width) + data_devs, + sizeof(void *)); + const size_t sizeof__a1pa_arrays = sizeof__a1pa * pages_in_unit; + const size_t alloc_total = sizeof_stripe_pages_2d + + sizeof__a1pa_arrays; + unsigned num_a1pa, alloc_size, i; /* FIXME: check these numbers in ore_verify_layout */ - BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); + BUG_ON(sizeof_stripe_pages_2d > PAGE_SIZE); BUG_ON(sizeof__a1pa > PAGE_SIZE); - if (sizeof(*_aab) > PAGE_SIZE) { - num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; - alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; + /* + * If alloc_total would be larger than PAGE_SIZE, only allocate + * as many a1pa items as would fill the rest of the page, instead + * of the full pages_in_unit count. + */ + if (alloc_total > PAGE_SIZE) { + num_a1pa = (PAGE_SIZE - sizeof_stripe_pages_2d) / sizeof__a1pa; + alloc_size = sizeof_stripe_pages_2d + sizeof__a1pa * num_a1pa; } else { num_a1pa = pages_in_unit; - alloc_size = sizeof(*_aab); + alloc_size = alloc_total; } - _aab = kzalloc(alloc_size, GFP_KERNEL); - if (unlikely(!_aab)) { + *psp2d = sp2d = kzalloc(alloc_size, GFP_KERNEL); + if (unlikely(!sp2d)) { ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); return -ENOMEM; } + /* From here Just call _sp2d_free */ - sp2d = &_aab->__asp2d.sp2d; - *psp2d = sp2d; /* From here Just call _sp2d_free */ - - __a1pa = _aab->__a1pa; - __a1pa_end = __a1pa + num_a1pa; + /* Find start of a1pa area. */ + __a1pa = (char *)sp2d + sizeof_stripe_pages_2d; + /* Find end of the _allocated_ a1pa area. */ + __a1pa_end = __a1pa + alloc_size; + /* Allocate additionally needed a1pa items in PAGE_SIZE chunks. */ for (i = 0; i < pages_in_unit; ++i) { if (unlikely(__a1pa >= __a1pa_end)) { num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, pages_in_unit - i); + alloc_size = sizeof__a1pa * num_a1pa; - __a1pa = kcalloc(num_a1pa, sizeof__a1pa, GFP_KERNEL); + __a1pa = kzalloc(alloc_size, GFP_KERNEL); if (unlikely(!__a1pa)) { ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", num_a1pa); return -ENOMEM; } - __a1pa_end = __a1pa + num_a1pa; + __a1pa_end = __a1pa + alloc_size; /* First *pages is marked for kfree of the buffer */ sp2d->_1p_stripes[i].alloc = true; } - sp2d->_1p_stripes[i].pages = __a1pa->pages; - sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; - sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; - ++__a1pa; + /* + * Attach all _lp_stripes pointers to the allocation for + * it which was either part of the original PAGE_SIZE + * allocation or the subsequent allocation in this loop. + */ + sp2d->_1p_stripes[i].pages = (void *)__a1pa; + sp2d->_1p_stripes[i].scribble = + sp2d->_1p_stripes[i].pages + group_width; + sp2d->_1p_stripes[i].page_is_read = + (char *)(sp2d->_1p_stripes[i].scribble + group_width); + __a1pa += sizeof__a1pa; } sp2d->parity = parity; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 179cd5c2f52a..f3c29e9326f1 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -549,27 +549,26 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, struct exofs_dev **peds) { - struct __alloc_ore_devs_and_exofs_devs { - /* Twice bigger table: See exofs_init_comps() and comment at - * exofs_read_lookup_dev_table() - */ - struct ore_dev *oreds[numdevs * 2 - 1]; - struct exofs_dev eds[numdevs]; - } *aoded; + /* Twice bigger table: See exofs_init_comps() and comment at + * exofs_read_lookup_dev_table() + */ + size_t numores = numdevs * 2 - 1; struct exofs_dev *eds; unsigned i; - aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); - if (unlikely(!aoded)) { + sbi->oc.ods = kzalloc(numores * sizeof(struct ore_dev *) + + numdevs * sizeof(struct exofs_dev), GFP_KERNEL); + if (unlikely(!sbi->oc.ods)) { EXOFS_ERR("ERROR: failed allocating Device array[%d]\n", numdevs); return -ENOMEM; } - sbi->oc.ods = aoded->oreds; - *peds = eds = aoded->eds; + /* Start of allocated struct exofs_dev entries */ + *peds = eds = (void *)sbi->oc.ods[numores]; + /* Initialize pointers into struct exofs_dev */ for (i = 0; i < numdevs; ++i) - aoded->oreds[i] = &eds[i].ored; + sbi->oc.ods[i] = &eds[i].ored; return 0; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4b12ba70a895..47d7c151fcba 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -745,11 +745,12 @@ int inode_congested(struct inode *inode, int cong_bits) */ if (inode && inode_to_wb_is_valid(inode)) { struct bdi_writeback *wb; - bool locked, congested; + struct wb_lock_cookie lock_cookie = {}; + bool congested; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); congested = wb_congested(wb, cong_bits); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &lock_cookie); return congested; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 91a8889abf9b..2809e29d612d 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, struct bio *bio; struct page *page; +#define O2HB_BIO_VECS 16 /* Testing has shown this allocation to take long enough under * GFP_KERNEL that the local node can get fenced. It would be * nicest if we could pre-allocate these bios and avoid this * all together. */ - bio = bio_alloc(GFP_ATOMIC, 16); + bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS); if (!bio) { mlog(ML_ERROR, "Could not alloc slots BIO!\n"); bio = ERR_PTR(-ENOMEM); @@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, current_page, vec_len, vec_start); len = bio_add_page(bio, page, vec_len, vec_start); - if (len != vec_len) { + if (len == 0 && current_page == O2HB_BIO_VECS) { + /* bio is full now. */ + goto bail; + } else if (len != vec_len) { mlog(ML_ERROR, "Adding page[%d] to bio failed, " "page %p, len %d, vec_len %u, vec_start %u, " "bi_sector %llu\n", current_page, page, len, diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 97a972efab83..ce33ac354602 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3534,7 +3534,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, * we can recover correctly from node failure. Otherwise, we may get * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set. */ - if (!ocfs2_is_o2cb_active() && + if (ocfs2_userspace_stack(osb) && lockres->l_ops->flags & LOCK_TYPE_USES_LVB) lvb = 1; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index e5dcea6cee5f..b63c97f4318e 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1378,15 +1378,23 @@ static int __ocfs2_recovery_thread(void *arg) int rm_quota_used = 0, i; struct ocfs2_quota_recovery *qrec; + /* Whether the quota supported. */ + int quota_enabled = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA) + || OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA); + status = ocfs2_wait_on_mount(osb); if (status < 0) { goto bail; } - rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS); - if (!rm_quota) { - status = -ENOMEM; - goto bail; + if (quota_enabled) { + rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS); + if (!rm_quota) { + status = -ENOMEM; + goto bail; + } } restart: status = ocfs2_super_lock(osb, 1); @@ -1422,9 +1430,14 @@ restart: * then quota usage would be out of sync until some node takes * the slot. So we remember which nodes need quota recovery * and when everything else is done, we recover quotas. */ - for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++); - if (i == rm_quota_used) - rm_quota[rm_quota_used++] = slot_num; + if (quota_enabled) { + for (i = 0; i < rm_quota_used + && rm_quota[i] != slot_num; i++) + ; + + if (i == rm_quota_used) + rm_quota[rm_quota_used++] = slot_num; + } status = ocfs2_recover_node(osb, node_num, slot_num); skip_recovery: @@ -1452,16 +1465,19 @@ skip_recovery: /* Now it is right time to recover quotas... We have to do this under * superblock lock so that no one can start using the slot (and crash) * before we recover it */ - for (i = 0; i < rm_quota_used; i++) { - qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); - if (IS_ERR(qrec)) { - status = PTR_ERR(qrec); - mlog_errno(status); - continue; + if (quota_enabled) { + for (i = 0; i < rm_quota_used; i++) { + qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); + if (IS_ERR(qrec)) { + status = PTR_ERR(qrec); + mlog_errno(status); + continue; + } + ocfs2_queue_recovery_completion(osb->journal, + rm_quota[i], + NULL, NULL, qrec, + ORPHAN_NEED_TRUNCATE); } - ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], - NULL, NULL, qrec, - ORPHAN_NEED_TRUNCATE); } ocfs2_super_unlock(osb, 1); @@ -1483,7 +1499,8 @@ bail: mutex_unlock(&osb->recovery_lock); - kfree(rm_quota); + if (quota_enabled) + kfree(rm_quota); /* no one is callint kthread_stop() for us so the kthread() api * requires that we call do_exit(). And it isn't exported, but diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index d6c350ba25b9..c4b029c43464 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -48,12 +48,6 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; */ static struct ocfs2_stack_plugin *active_stack; -inline int ocfs2_is_o2cb_active(void) -{ - return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB); -} -EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active); - static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) { struct ocfs2_stack_plugin *p; diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index e3036e1790e8..f2dce10fae54 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -298,9 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); -/* In ocfs2_downconvert_lock(), we need to know which stack we are using */ -int ocfs2_is_o2cb_active(void); - extern struct kset *ocfs2_kset; #endif /* STACKGLUE_H */ diff --git a/fs/proc/base.c b/fs/proc/base.c index eafa39a3a88c..1b2ede6abcdf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1693,6 +1693,12 @@ void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t uid; kgid_t gid; + if (unlikely(task->flags & PF_KTHREAD)) { + *ruid = GLOBAL_ROOT_UID; + *rgid = GLOBAL_ROOT_GID; + return; + } + /* Default to the tasks effective ownership */ rcu_read_lock(); cred = __task_cred(task); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index a000d7547479..b572cc865b92 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -24,7 +24,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v) LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, - idr_get_cursor(&task_active_pid_ns(current)->idr)); + idr_get_cursor(&task_active_pid_ns(current)->idr) - 1); return 0; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 333cda80c3dd..30d8ee9ed46a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1310,9 +1310,11 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION else if (is_swap_pmd(pmd)) { swp_entry_t entry = pmd_to_swp_entry(pmd); + unsigned long offset = swp_offset(entry); + offset += (addr & ~PMD_MASK) >> PAGE_SHIFT; frame = swp_type(entry) | - (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + (offset << MAX_SWAPFILES_SHIFT); flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; @@ -1332,6 +1334,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, break; if (pm->show_pfn && (flags & PM_PRESENT)) frame++; + else if (flags & PM_SWAP) + frame += (1 << MAX_SWAPFILES_SHIFT); } spin_unlock(ptl); return err; diff --git a/fs/seq_file.c b/fs/seq_file.c index c6c27f1f9c98..8389da9d0eb8 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -709,11 +709,6 @@ void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, if (m->count + width >= m->size) goto overflow; - if (num < 10) { - m->buf[m->count++] = num + '0'; - return; - } - len = num_to_str(m->buf + m->count, m->size - m->count, num, width); if (!len) goto overflow; @@ -800,11 +795,6 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num num = -num; } - if (num < 10) { - m->buf[m->count++] = num + '0'; - return; - } - len = num_to_str(m->buf + m->count, m->size - m->count, num, 0); if (!len) goto overflow; diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index bfe86b54f6c1..0bd432a4d7bd 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -223,6 +223,11 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync) set_wb_congested(bdi->wb.congested, sync); } +struct wb_lock_cookie { + bool locked; + unsigned long flags; +}; + #ifdef CONFIG_CGROUP_WRITEBACK /** diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 84adb5ec4011..d4561cb27d24 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -347,7 +347,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) /** * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction * @inode: target inode - * @lockedp: temp bool output param, to be passed to the end function + * @cookie: output param, to be passed to the end function * * The caller wants to access the wb associated with @inode but isn't * holding inode->i_lock, the i_pages lock or wb->list_lock. This @@ -355,12 +355,12 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) * association doesn't change until the transaction is finished with * unlocked_inode_to_wb_end(). * - * The caller must call unlocked_inode_to_wb_end() with *@lockdep - * afterwards and can't sleep during transaction. IRQ may or may not be - * disabled on return. + * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and + * can't sleep during the transaction. IRQs may or may not be disabled on + * return. */ static inline struct bdi_writeback * -unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { rcu_read_lock(); @@ -368,10 +368,10 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) * Paired with store_release in inode_switch_wb_work_fn() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ - *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; + cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; - if (unlikely(*lockedp)) - xa_lock_irq(&inode->i_mapping->i_pages); + if (unlikely(cookie->locked)) + xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags); /* * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages @@ -383,12 +383,13 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) /** * unlocked_inode_to_wb_end - end inode wb access transaction * @inode: target inode - * @locked: *@lockedp from unlocked_inode_to_wb_begin() + * @cookie: @cookie from unlocked_inode_to_wb_begin() */ -static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +static inline void unlocked_inode_to_wb_end(struct inode *inode, + struct wb_lock_cookie *cookie) { - if (unlikely(locked)) - xa_unlock_irq(&inode->i_mapping->i_pages); + if (unlikely(cookie->locked)) + xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags); rcu_read_unlock(); } @@ -435,12 +436,13 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) } static inline struct bdi_writeback * -unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { return inode_to_wb(inode); } -static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +static inline void unlocked_inode_to_wb_end(struct inode *inode, + struct wb_lock_cookie *cookie) { } diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index dc5b70449dc6..9b2879bba00e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -81,6 +81,11 @@ enum { * Enable cpuset controller in v1 cgroup to use v2 behavior. */ CGRP_ROOT_CPUSET_V2_MODE = (1 << 4), + + /* + * Enable cgroup-aware OOM killer. + */ + CGRP_GROUP_OOM = (1 << 5), }; /* cftype->flags */ diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index ceb96ecab96e..5a1d8580febe 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -25,6 +25,11 @@ #define __SANITIZE_ADDRESS__ #endif +#ifdef CONFIG_KASAN +#undef __no_sanitize_address +#define __no_sanitize_address __attribute__((no_sanitize("address"))) +#endif + /* Clang doesn't have a way to turn it off per-function, yet. */ #ifdef __noretpoline #undef __noretpoline diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d99b71bc2c66..2da009958798 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -35,6 +35,7 @@ struct mem_cgroup; struct page; struct mm_struct; struct kmem_cache; +struct oom_control; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { @@ -204,6 +205,13 @@ struct mem_cgroup { /* OOM-Killer disable */ int oom_kill_disable; + /* + * Treat the sub-tree as an indivisible memory consumer, + * kill all belonging tasks if the memory cgroup selected + * as OOM victim. + */ + bool oom_group; + /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; struct cgroup_file events_file; @@ -348,6 +356,11 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; } +static inline void mem_cgroup_put(struct mem_cgroup *memcg) +{ + css_put(&memcg->css); +} + #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) @@ -486,6 +499,13 @@ static inline bool task_in_memcg_oom(struct task_struct *p) bool mem_cgroup_oom_synchronize(bool wait); +bool mem_cgroup_select_oom_victim(struct oom_control *oc); + +static inline bool mem_cgroup_oom_group(struct mem_cgroup *memcg) +{ + return memcg->oom_group; +} + #ifdef CONFIG_MEMCG_SWAP extern int do_swap_account; #endif @@ -793,6 +813,10 @@ static inline bool task_in_mem_cgroup(struct task_struct *task, return true; } +static inline void mem_cgroup_put(struct mem_cgroup *memcg) +{ +} + static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, @@ -985,6 +1009,16 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } + +static inline bool mem_cgroup_select_oom_victim(struct oom_control *oc) +{ + return false; +} + +static inline bool mem_cgroup_oom_group(struct mem_cgroup *memcg) +{ + return false; +} #endif /* CONFIG_MEMCG */ /* idx can be of type enum memcg_stat_item or node_stat_item */ diff --git a/include/linux/oom.h b/include/linux/oom.h index 5bad038ac012..d4d41c01a74d 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -10,6 +10,13 @@ #include <linux/sched/coredump.h> /* MMF_* */ #include <linux/mm.h> /* VM_FAULT* */ + +/* + * Special value returned by victim selection functions to indicate + * that are inflight OOM victims. + */ +#define INFLIGHT_VICTIM ((void *)-1UL) + struct zonelist; struct notifier_block; struct mem_cgroup; @@ -40,7 +47,8 @@ struct oom_control { /* Used by oom implementation, do not set */ unsigned long totalpages; - struct task_struct *chosen; + struct task_struct *chosen_task; + struct mem_cgroup *chosen_memcg; unsigned long chosen_points; }; @@ -111,6 +119,8 @@ extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); +extern int oom_evaluate_task(struct task_struct *task, void *arg); + /* sysctls */ extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index a7ce74c74e49..a52683a5fe90 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -104,6 +104,9 @@ struct signal_struct { int group_stop_count; unsigned int flags; /* see SIGNAL_* flags below */ + /* The signal sent when the parent dies: */ + int pdeath_signal_proc; + /* * PR_SET_CHILD_SUBREAPER marks a process, like a service * manager, to re-parent orphan (double-forking) child processes diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 388ff2936a87..a3894918a436 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -75,6 +75,6 @@ struct shrinker { #define SHRINKER_NUMA_AWARE (1 << 0) #define SHRINKER_MEMCG_AWARE (1 << 1) -extern int register_shrinker(struct shrinker *); +extern __must_check int register_shrinker(struct shrinker *); extern void unregister_shrinker(struct shrinker *); #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index c063443d8638..f73eafcaf4e9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -172,8 +172,9 @@ enum { SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */ + SWP_VALID = (1 << 12), /* swap is valid to be operated on? */ /* add others here before... */ - SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL @@ -460,7 +461,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); -extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry); +extern int __swap_count(swp_entry_t entry); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); @@ -470,6 +471,12 @@ extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); +extern struct swap_info_struct *get_swap_device(swp_entry_t entry); + +static inline void put_swap_device(struct swap_info_struct *si) +{ + preempt_enable(); +} #else /* CONFIG_SWAP */ @@ -575,7 +582,7 @@ static inline int page_swapcount(struct page *page) return 0; } -static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +static inline int __swap_count(swp_entry_t entry) { return 0; } diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 34f053a150a9..cf2862bd134a 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -43,11 +43,7 @@ enum { #define THREAD_ALIGN THREAD_SIZE #endif -#if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK) -# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) -#else -# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT) -#endif +#define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) /* * flag set/clear/test wrappers diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index af5f8c2df87a..3165863aa187 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -207,4 +207,8 @@ struct prctl_mm_map { # define PR_SVE_VL_LEN_MASK 0xffff # define PR_SVE_VL_INHERIT (1 << 17) /* inherit across exec */ +/* Process-based variant of PDEATHSIG */ +#define PR_SET_PDEATHSIG_PROC 48 +#define PR_GET_PDEATHSIG_PROC 49 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b07335be7164..b3232bda8f19 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1732,6 +1732,9 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) if (!strcmp(token, "nsdelegate")) { *root_flags |= CGRP_ROOT_NS_DELEGATE; continue; + } else if (!strcmp(token, "groupoom")) { + *root_flags |= CGRP_GROUP_OOM; + continue; } pr_err("cgroup2: unknown option \"%s\"\n", token); @@ -1748,6 +1751,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + + if (root_flags & CGRP_GROUP_OOM) + cgrp_dfl_root.flags |= CGRP_GROUP_OOM; + else + cgrp_dfl_root.flags &= ~CGRP_GROUP_OOM; } } @@ -1755,6 +1763,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); + if (cgrp_dfl_root.flags & CGRP_GROUP_OOM) + seq_puts(seq, ",groupoom"); return 0; } @@ -5915,7 +5925,8 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); + return snprintf(buf, PAGE_SIZE, "nsdelegate\n" + "groupoom\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); diff --git a/kernel/cred.c b/kernel/cred.c index ecf03657e71c..0192a94670e1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -448,6 +448,7 @@ int commit_creds(struct cred *new) if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; + task->signal->pdeath_signal_proc = 0; smp_wmb(); } diff --git a/kernel/exit.c b/kernel/exit.c index c3c7ac560114..63f7cea6456e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -635,6 +635,10 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, if (unlikely(p->exit_state == EXIT_DEAD)) return; + if (p->signal->pdeath_signal_proc) + group_send_sig_info(p->signal->pdeath_signal_proc, + SEND_SIG_NOINFO, p); + /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; diff --git a/kernel/fork.c b/kernel/fork.c index 242c8c93d285..3398037de1cb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -216,10 +216,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; -#ifdef CONFIG_DEBUG_KMEMLEAK /* Clear stale pointers from reused stack. */ memset(s->addr, 0, THREAD_SIZE); -#endif + tsk->stack_vm_area = s; return s->addr; } @@ -1471,6 +1470,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) mutex_init(&sig->cred_guard_mutex); + sig->pdeath_signal_proc = current->signal->pdeath_signal_proc; + return 0; } diff --git a/kernel/sys.c b/kernel/sys.c index ad692183dfe9..f1b4c598f810 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2265,6 +2265,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_PDEATHSIG: error = put_user(me->pdeath_signal, (int __user *)arg2); break; + case PR_SET_PDEATHSIG_PROC: + if (!valid_signal(arg2)) { + error = -EINVAL; + break; + } + me->signal->pdeath_signal_proc = arg2; + break; + case PR_GET_PDEATHSIG_PROC: + error = put_user(me->signal->pdeath_signal_proc, + (int __user *)arg2); + break; case PR_GET_DUMPABLE: error = get_dumpable(me->mm); break; diff --git a/lib/reed_solomon/decode_rs.c b/lib/reed_solomon/decode_rs.c index 0ec3f257ffdf..3e3becb836a6 100644 --- a/lib/reed_solomon/decode_rs.c +++ b/lib/reed_solomon/decode_rs.c @@ -31,9 +31,10 @@ * of nroots is 8. So the necessary stack size will be about * 220 bytes max. */ - uint16_t lambda[nroots + 1], syn[nroots]; - uint16_t b[nroots + 1], t[nroots + 1], omega[nroots + 1]; - uint16_t root[nroots], reg[nroots + 1], loc[nroots]; + uint16_t lambda[RS_MAX_ROOTS + 1], syn[RS_MAX_ROOTS]; + uint16_t b[RS_MAX_ROOTS + 1], t[RS_MAX_ROOTS + 1]; + uint16_t omega[RS_MAX_ROOTS + 1], root[RS_MAX_ROOTS]; + uint16_t reg[RS_MAX_ROOTS + 1], loc[RS_MAX_ROOTS]; int count = 0; uint16_t msk = (uint16_t) rs->nn; diff --git a/lib/reed_solomon/reed_solomon.c b/lib/reed_solomon/reed_solomon.c index 06d04cfa9339..3e218e70ac2e 100644 --- a/lib/reed_solomon/reed_solomon.c +++ b/lib/reed_solomon/reed_solomon.c @@ -51,6 +51,9 @@ static LIST_HEAD (rslist); /* Protection for the list */ static DEFINE_MUTEX(rslistlock); +/* Ultimately controls the upper bounds of the on-stack buffers. */ +#define RS_MAX_ROOTS 24 + /** * rs_init - Initialize a Reed-Solomon codec * @symsize: symbol size, bits (1-8) @@ -210,7 +213,7 @@ static struct rs_control *init_rs_internal(int symsize, int gfpoly, return NULL; if (prim <= 0 || prim >= (1<<symsize)) return NULL; - if (nroots < 0 || nroots >= (1<<symsize)) + if (nroots < 0 || nroots >= (1<<symsize) || nroots > RS_MAX_ROOTS) return NULL; mutex_lock(&rslistlock); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 023190c69dce..2fc3f38e4c4f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -220,11 +220,46 @@ static ssize_t stable_pages_required_show(struct device *dev, } static DEVICE_ATTR_RO(stable_pages_required); +static ssize_t strictlimit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int val; + ssize_t ret; + + ret = kstrtouint(buf, 10, &val); + if (ret < 0) + return ret; + + switch (val) { + case 0: + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + break; + case 1: + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + break; + default: + return -EINVAL; + } + + return count; +} +static ssize_t strictlimit_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", + !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); +} +static DEVICE_ATTR_RW(strictlimit); + static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_max_ratio.attr, &dev_attr_stable_pages_required.attr, + &dev_attr_strictlimit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3e8cda7beb7f..323acdd14e6e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2925,7 +2925,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) pmde = maybe_pmd_mkwrite(pmde, vma); flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); - page_add_anon_rmap(new, vma, mmun_start, true); + if (PageAnon(new)) + page_add_anon_rmap(new, vma, mmun_start, true); + else + page_add_file_rmap(new, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); if (vma->vm_flags & VM_LOCKED) mlock_vma_page(new); diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index bc0e68f7dc75..135ce2838c89 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -792,6 +792,41 @@ DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); #ifdef CONFIG_MEMORY_HOTPLUG +static bool shadow_mapped(unsigned long addr) +{ + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (pgd_none(*pgd)) + return false; + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + return false; + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + return false; + + /* + * We can't use pud_large() or pud_huge(), the first one + * is arch-specific, the last one depend on HUGETLB_PAGE. + * So let's abuse pud_bad(), if bud is bad it's has to + * because it's huge. + */ + if (pud_bad(*pud)) + return true; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return false; + + if (pmd_bad(*pmd)) + return true; + pte = pte_offset_kernel(pmd, addr); + return !pte_none(*pte); +} + static int __meminit kasan_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -813,6 +848,14 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb, case MEM_GOING_ONLINE: { void *ret; + /* + * If shadow is mapped already than it must have been mapped + * during the boot. This could happen if we onlining previously + * offlined memory. + */ + if (shadow_mapped(shadow_start)) + return NOTIFY_OK; + ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start, shadow_end, GFP_KERNEL, PAGE_KERNEL, VM_NO_GUARD, @@ -824,8 +867,18 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb, kmemleak_ignore(ret); return NOTIFY_OK; } - case MEM_OFFLINE: - vfree((void *)shadow_start); + case MEM_OFFLINE: { + struct vm_struct *vm; + + /* + * Only hot-added memory have vm_area. Freeing shadow + * mapped during boot would be tricky, so we'll just + * have to keep it. + */ + vm = find_vm_area((void *)shadow_start); + if (vm) + vfree((void *)shadow_start); + } } return NOTIFY_OK; diff --git a/mm/list_lru.c b/mm/list_lru.c index fcfb6c89ed47..d9c84c5bda1d 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/list_lru.h> +#include <linux/prefetch.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/memcontrol.h> @@ -133,6 +134,12 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; + /* + * Prefetch the neighboring list entries to reduce lock hold time. + */ + prefetchw(item->prev); + prefetchw(item->next); + spin_lock(&nlru->lock); if (!list_empty(item)) { l = list_lru_from_kmem(nlru, item); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e074f7c637aa..d455bc08eb55 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -888,7 +888,8 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) * value, the function breaks the iteration loop and returns the value. * Otherwise, it will iterate over all tasks and return 0. * - * This function must not be called for the root memory cgroup. + * If memcg is the root memory cgroup, this function will iterate only + * over tasks belonging directly to the root memory cgroup. */ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*fn)(struct task_struct *, void *), void *arg) @@ -896,8 +897,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct mem_cgroup *iter; int ret = 0; - BUG_ON(memcg == root_mem_cgroup); - for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; struct task_struct *task; @@ -906,7 +905,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, while (!ret && (task = css_task_iter_next(&it))) ret = fn(task, arg); css_task_iter_end(&it); - if (ret) { + if (ret || memcg == root_mem_cgroup) { mem_cgroup_iter_break(memcg, iter); break; } @@ -2592,6 +2591,224 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) return ret; } +static long memcg_oom_badness(struct mem_cgroup *memcg, + const nodemask_t *nodemask, + unsigned long totalpages) +{ + long points = 0; + int nid; + pg_data_t *pgdat; + + for_each_node_state(nid, N_MEMORY) { + if (nodemask && !node_isset(nid, *nodemask)) + continue; + + points += mem_cgroup_node_nr_lru_pages(memcg, nid, + LRU_ALL_ANON | BIT(LRU_UNEVICTABLE)); + + pgdat = NODE_DATA(nid); + points += lruvec_page_state(mem_cgroup_lruvec(pgdat, memcg), + NR_SLAB_UNRECLAIMABLE); + } + + points += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) / + (PAGE_SIZE / 1024); + points += memcg_page_state(memcg, MEMCG_SOCK); + points += memcg_page_state(memcg, MEMCG_SWAP); + + return points; +} + +/* + * Checks if the given memcg is a valid OOM victim and returns a number, + * which means the folowing: + * -1: there are inflight OOM victim tasks, belonging to the memcg + * 0: memcg is not eligible, e.g. all belonging tasks are protected + * by oom_score_adj set to OOM_SCORE_ADJ_MIN + * >0: memcg is eligible, and the returned value is an estimation + * of the memory footprint + */ +static long oom_evaluate_memcg(struct mem_cgroup *memcg, + const nodemask_t *nodemask, + unsigned long totalpages) +{ + struct css_task_iter it; + struct task_struct *task; + int eligible = 0; + + /* + * Root memory cgroup is a special case: + * we don't have necessary stats to evaluate it exactly as + * leaf memory cgroups, so we approximate it's oom_score + * by summing oom_score of all belonging tasks, which are + * owners of their mm structs. + * + * If there are inflight OOM victim tasks inside + * the root memcg, we return -1. + */ + if (memcg == root_mem_cgroup) { + struct css_task_iter it; + struct task_struct *task; + long score = 0; + + css_task_iter_start(&memcg->css, 0, &it); + while ((task = css_task_iter_next(&it))) { + if (tsk_is_oom_victim(task) && + !test_bit(MMF_OOM_SKIP, + &task->signal->oom_mm->flags)) { + score = -1; + break; + } + + task_lock(task); + if (!task->mm || task->mm->owner != task) { + task_unlock(task); + continue; + } + task_unlock(task); + + score += oom_badness(task, memcg, nodemask, + totalpages); + } + css_task_iter_end(&it); + + return score; + } + + /* + * Memcg is OOM eligible if there are OOM killable tasks inside. + * + * We treat tasks with oom_score_adj set to OOM_SCORE_ADJ_MIN + * as unkillable. + * + * If there are inflight OOM victim tasks inside the memcg, + * we return -1. + */ + css_task_iter_start(&memcg->css, 0, &it); + while ((task = css_task_iter_next(&it))) { + if (!eligible && + task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) + eligible = 1; + + if (tsk_is_oom_victim(task) && + !test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) { + eligible = -1; + break; + } + } + css_task_iter_end(&it); + + if (eligible <= 0) + return eligible; + + return memcg_oom_badness(memcg, nodemask, totalpages); +} + +static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc) +{ + struct mem_cgroup *iter, *group = NULL; + long group_score = 0; + + oc->chosen_memcg = NULL; + oc->chosen_points = 0; + + /* + * If OOM is memcg-wide, and the memcg has the oom_group flag set, + * all tasks belonging to the memcg should be killed. + * So, we mark the memcg as a victim. + */ + if (oc->memcg && mem_cgroup_oom_group(oc->memcg)) { + oc->chosen_memcg = oc->memcg; + css_get(&oc->chosen_memcg->css); + return; + } + + /* + * The oom_score is calculated for leaf memory cgroups (including + * the root memcg). + * Non-leaf oom_group cgroups accumulating score of descendant + * leaf memory cgroups. + */ + rcu_read_lock(); + for_each_mem_cgroup_tree(iter, root) { + long score; + + /* + * We don't consider non-leaf non-oom_group memory cgroups + * as OOM victims. + */ + if (memcg_has_children(iter) && iter != root_mem_cgroup && + !mem_cgroup_oom_group(iter)) + continue; + + /* + * If group is not set or we've ran out of the group's sub-tree, + * we should set group and reset group_score. + */ + if (!group || group == root_mem_cgroup || + !mem_cgroup_is_descendant(iter, group)) { + group = iter; + group_score = 0; + } + + if (memcg_has_children(iter) && iter != root_mem_cgroup) + continue; + + score = oom_evaluate_memcg(iter, oc->nodemask, oc->totalpages); + + /* + * Ignore empty and non-eligible memory cgroups. + */ + if (score == 0) + continue; + + /* + * If there are inflight OOM victims, we don't need + * to look further for new victims. + */ + if (score == -1) { + oc->chosen_memcg = INFLIGHT_VICTIM; + mem_cgroup_iter_break(root, iter); + break; + } + + group_score += score; + + if (group_score > oc->chosen_points) { + oc->chosen_points = group_score; + oc->chosen_memcg = group; + } + } + + if (oc->chosen_memcg && oc->chosen_memcg != INFLIGHT_VICTIM) + css_get(&oc->chosen_memcg->css); + + rcu_read_unlock(); +} + +bool mem_cgroup_select_oom_victim(struct oom_control *oc) +{ + struct mem_cgroup *root; + + if (mem_cgroup_disabled()) + return false; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return false; + + if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM)) + return false; + + if (oc->memcg) + root = oc->memcg; + else + root = root_mem_cgroup; + + select_victim_memcg(root, oc); + + return oc->chosen_memcg; +} + /* * Reclaims as many pages from the given memcg as possible. * @@ -4031,6 +4248,14 @@ static struct cftype mem_cgroup_legacy_files[] = { static DEFINE_IDR(mem_cgroup_idr); +static void mem_cgroup_id_remove(struct mem_cgroup *memcg) +{ + if (memcg->id.id > 0) { + idr_remove(&mem_cgroup_idr, memcg->id.id); + memcg->id.id = 0; + } +} + static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); @@ -4041,8 +4266,7 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { VM_BUG_ON(atomic_read(&memcg->id.ref) < n); if (atomic_sub_and_test(n, &memcg->id.ref)) { - idr_remove(&mem_cgroup_idr, memcg->id.id); - memcg->id.id = 0; + mem_cgroup_id_remove(memcg); /* Memcg ID pins CSS */ css_put(&memcg->css); @@ -4179,8 +4403,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; fail: - if (memcg->id.id > 0) - idr_remove(&mem_cgroup_idr, memcg->id.id); + mem_cgroup_id_remove(memcg); __mem_cgroup_free(memcg); return NULL; } @@ -4239,6 +4462,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; fail: + mem_cgroup_id_remove(memcg); mem_cgroup_free(memcg); return ERR_PTR(-ENOMEM); } @@ -5190,6 +5414,39 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, return nbytes; } +static int memory_oom_group_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + bool oom_group = memcg->oom_group; + + if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM)) + return -ENOTSUPP; + + seq_printf(m, "%d\n", oom_group); + + return 0; +} + +static ssize_t memory_oom_group_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int oom_group; + int err; + + if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM)) + return -ENOTSUPP; + + err = kstrtoint(strstrip(buf), 0, &oom_group); + if (err) + return err; + + memcg->oom_group = oom_group; + + return nbytes; +} + static int memory_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); @@ -5314,6 +5571,12 @@ static struct cftype memory_files[] = { .write = memory_max_write, }, { + .name = "oom_group", + .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, + .seq_show = memory_oom_group_show, + .write = memory_oom_group_write, + }, + { .name = "events", .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct mem_cgroup, events_file), diff --git a/mm/memory.c b/mm/memory.c index 01f5464e0fd2..a1f990e33e38 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2925,7 +2925,7 @@ int do_swap_page(struct vm_fault *vmf) struct swap_info_struct *si = swp_swap_info(entry); if (si->flags & SWP_SYNCHRONOUS_IO && - __swap_count(si, entry) == 1) { + __swap_count(entry) == 1) { /* skip swapcache */ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); @@ -3035,7 +3035,6 @@ int do_swap_page(struct vm_fault *vmf) flush_icache_page(vma, page); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte; @@ -3049,6 +3048,7 @@ int do_swap_page(struct vm_fault *vmf) mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); swap_free(entry); if (mem_cgroup_swap_full(page) || diff --git a/mm/migrate.c b/mm/migrate.c index f65dd69e1fd1..bb6367d70a3e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -472,7 +472,7 @@ int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); - expected_count += 1 + page_has_private(page); + expected_count += hpage_nr_pages(page) + page_has_private(page); if (page_count(page) != expected_count || radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { @@ -505,7 +505,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ newpage->index = page->index; newpage->mapping = page->mapping; - get_page(newpage); /* add cache reference */ + page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */ if (PageSwapBacked(page)) { __SetPageSwapBacked(newpage); if (PageSwapCache(page)) { @@ -524,13 +524,26 @@ int migrate_page_move_mapping(struct address_space *mapping, } radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + if (PageTransHuge(page)) { + int i; + int index = page_index(page); + + for (i = 0; i < HPAGE_PMD_NR; i++) { + pslot = radix_tree_lookup_slot(&mapping->i_pages, + index + i); + radix_tree_replace_slot(&mapping->i_pages, pslot, + newpage + i); + } + } else { + radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + } /* * Drop cache reference from old page by unfreezing * to one less reference. * We know this isn't the last reference. */ - page_ref_unfreeze(page, expected_count - 1); + page_ref_unfreeze(page, expected_count - hpage_nr_pages(page)); xa_unlock(&mapping->i_pages); /* Leave irq disabled to prevent preemption while updating stats */ diff --git a/mm/mincore.c b/mm/mincore.c index fc37afe226e6..a66f2052c7b1 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -68,8 +68,16 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) */ if (radix_tree_exceptional_entry(page)) { swp_entry_t swp = radix_to_swp_entry(page); - page = find_get_page(swap_address_space(swp), - swp_offset(swp)); + struct swap_info_struct *si; + + /* Prevent swap device to being swapoff under us */ + si = get_swap_device(swp); + if (si) { + page = find_get_page(swap_address_space(swp), + swp_offset(swp)); + put_swap_device(si); + } else + page = NULL; } } else page = find_get_page(mapping, pgoff); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ff992fa8760a..dfd370526909 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -304,7 +304,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) return CONSTRAINT_NONE; } -static int oom_evaluate_task(struct task_struct *task, void *arg) +int oom_evaluate_task(struct task_struct *task, void *arg) { struct oom_control *oc = arg; unsigned long points; @@ -338,26 +338,26 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) goto next; /* Prefer thread group leaders for display purposes */ - if (points == oc->chosen_points && thread_group_leader(oc->chosen)) + if (points == oc->chosen_points && thread_group_leader(oc->chosen_task)) goto next; select: - if (oc->chosen) - put_task_struct(oc->chosen); + if (oc->chosen_task) + put_task_struct(oc->chosen_task); get_task_struct(task); - oc->chosen = task; + oc->chosen_task = task; oc->chosen_points = points; next: return 0; abort: - if (oc->chosen) - put_task_struct(oc->chosen); - oc->chosen = (void *)-1UL; + if (oc->chosen_task) + put_task_struct(oc->chosen_task); + oc->chosen_task = INFLIGHT_VICTIM; return 1; } /* * Simple selection loop. We choose the process with the highest number of - * 'points'. In case scan was aborted, oc->chosen is set to -1. + * 'points'. In case scan was aborted, oc->chosen_task is set to -1. */ static void select_bad_process(struct oom_control *oc) { @@ -830,67 +830,22 @@ static bool task_will_free_mem(struct task_struct *task) return ret; } -static void oom_kill_process(struct oom_control *oc, const char *message) +static void __oom_kill_process(struct task_struct *victim) { - struct task_struct *p = oc->chosen; - unsigned int points = oc->chosen_points; - struct task_struct *victim = p; - struct task_struct *child; - struct task_struct *t; + struct task_struct *p; struct mm_struct *mm; - unsigned int victim_points = 0; - static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); bool can_oom_reap = true; /* - * If the task is already exiting, don't alarm the sysadmin or kill - * its children or threads, just give it access to memory reserves - * so it can die quickly + * __oom_kill_process() is used to kill all tasks belonging to + * the selected memory cgroup, so we should check that we're not + * trying to kill an unkillable task. */ - task_lock(p); - if (task_will_free_mem(p)) { - mark_oom_victim(p); - wake_oom_reaper(p); - task_unlock(p); - put_task_struct(p); + if (is_global_init(victim) || (victim->flags & PF_KTHREAD) || + victim->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + put_task_struct(victim); return; } - task_unlock(p); - - if (__ratelimit(&oom_rs)) - dump_header(oc, p); - - pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", - message, task_pid_nr(p), p->comm, points); - - /* - * If any of p's children has a different mm and is eligible for kill, - * the one with the highest oom_badness() score is sacrificed for its - * parent. This attempts to lose the minimal amount of work done while - * still freeing memory. - */ - read_lock(&tasklist_lock); - for_each_thread(p, t) { - list_for_each_entry(child, &t->children, sibling) { - unsigned int child_points; - - if (process_shares_mm(child, p->mm)) - continue; - /* - * oom_badness() returns 0 if the thread is unkillable - */ - child_points = oom_badness(child, - oc->memcg, oc->nodemask, oc->totalpages); - if (child_points > victim_points) { - put_task_struct(victim); - victim = child; - victim_points = child_points; - get_task_struct(victim); - } - } - } - read_unlock(&tasklist_lock); p = find_lock_task_mm(victim); if (!p) { @@ -965,6 +920,108 @@ static void oom_kill_process(struct oom_control *oc, const char *message) } #undef K +static void oom_kill_process(struct oom_control *oc, const char *message) +{ + struct task_struct *p = oc->chosen_task; + unsigned int points = oc->chosen_points; + struct task_struct *victim = p; + struct task_struct *child; + struct task_struct *t; + unsigned int victim_points = 0; + static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + /* + * If the task is already exiting, don't alarm the sysadmin or kill + * its children or threads, just give it access to memory reserves + * so it can die quickly + */ + task_lock(p); + if (task_will_free_mem(p)) { + mark_oom_victim(p); + wake_oom_reaper(p); + task_unlock(p); + put_task_struct(p); + return; + } + task_unlock(p); + + if (__ratelimit(&oom_rs)) + dump_header(oc, p); + + pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", + message, task_pid_nr(p), p->comm, points); + + /* + * If any of p's children has a different mm and is eligible for kill, + * the one with the highest oom_badness() score is sacrificed for its + * parent. This attempts to lose the minimal amount of work done while + * still freeing memory. + */ + read_lock(&tasklist_lock); + for_each_thread(p, t) { + list_for_each_entry(child, &t->children, sibling) { + unsigned int child_points; + + if (process_shares_mm(child, p->mm)) + continue; + /* + * oom_badness() returns 0 if the thread is unkillable + */ + child_points = oom_badness(child, + oc->memcg, oc->nodemask, oc->totalpages); + if (child_points > victim_points) { + put_task_struct(victim); + victim = child; + victim_points = child_points; + get_task_struct(victim); + } + } + } + read_unlock(&tasklist_lock); + + __oom_kill_process(victim); +} + +static int oom_kill_memcg_member(struct task_struct *task, void *unused) +{ + get_task_struct(task); + __oom_kill_process(task); + return 0; +} + +static bool oom_kill_memcg_victim(struct oom_control *oc) +{ + if (oc->chosen_memcg == NULL || oc->chosen_memcg == INFLIGHT_VICTIM) + return oc->chosen_memcg; + + /* + * If memory.oom_group is set, kill all tasks belonging to the sub-tree + * of the chosen memory cgroup, otherwise kill the task with the biggest + * memory footprint. + */ + if (mem_cgroup_oom_group(oc->chosen_memcg)) { + mem_cgroup_scan_tasks(oc->chosen_memcg, oom_kill_memcg_member, + NULL); + /* We have one or more terminating processes at this point. */ + oc->chosen_task = INFLIGHT_VICTIM; + } else { + oc->chosen_points = 0; + oc->chosen_task = NULL; + mem_cgroup_scan_tasks(oc->chosen_memcg, oom_evaluate_task, oc); + + if (oc->chosen_task == NULL || + oc->chosen_task == INFLIGHT_VICTIM) + goto out; + + __oom_kill_process(oc->chosen_task); + } + +out: + mem_cgroup_put(oc->chosen_memcg); + return oc->chosen_task; +} + /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ @@ -1017,6 +1074,7 @@ bool out_of_memory(struct oom_control *oc) { unsigned long freed = 0; enum oom_constraint constraint = CONSTRAINT_NONE; + bool delay = false; /* if set, delay next allocation attempt */ if (oom_killer_disabled) return false; @@ -1061,27 +1119,37 @@ bool out_of_memory(struct oom_control *oc) current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); - oc->chosen = current; + oc->chosen_task = current; oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); return true; } + if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc)) { + delay = true; + goto out; + } + select_bad_process(oc); /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { + if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { dump_header(oc, NULL); panic("Out of memory and no killable processes...\n"); } - if (oc->chosen && oc->chosen != (void *)-1UL) { + if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) { oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : "Memory cgroup out of memory"); - /* - * Give the killed process a good chance to exit before trying - * to allocate memory again. - */ - schedule_timeout_killable(1); + delay = true; } - return !!oc->chosen; + +out: + /* + * Give the killed process a good chance to exit before trying + * to allocate memory again. + */ + if (delay) + schedule_timeout_killable(1); + + return !!oc->chosen_task; } /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5c1a3279e63f..337c6afb3345 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2502,13 +2502,13 @@ void account_page_redirty(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); current->nr_dirtied--; dec_node_page_state(page, NR_DIRTIED); dec_wb_stat(wb, WB_DIRTIED); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); } } EXPORT_SYMBOL(account_page_redirty); @@ -2614,15 +2614,15 @@ void __cancel_dirty_page(struct page *page) if (mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; lock_page_memcg(page); - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) account_page_cleaned(page, mapping, wb); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); unlock_page_memcg(page); } else { ClearPageDirty(page); @@ -2654,7 +2654,7 @@ int clear_page_dirty_for_io(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; /* * Yes, Virginia, this is indeed insane. @@ -2691,14 +2691,14 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. */ - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) { dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); return ret; } return TestClearPageDirty(page); diff --git a/mm/page_owner.c b/mm/page_owner.c index 75d21a2259b3..77d9e791ae8a 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -274,7 +274,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } @@ -541,7 +541,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) unsigned long block_end_pfn; if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } diff --git a/mm/rmap.c b/mm/rmap.c index 056213326651..6db729dc4c50 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1374,9 +1374,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (!pvmw.pte && (flags & TTU_MIGRATION)) { VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); - if (!PageAnon(page)) - continue; - set_pmd_migration_entry(&pvmw, page); continue; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index bd0276d5f66b..640e68f8324b 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -303,7 +303,6 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, ms = __nr_to_section(pnum); pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); - ms->section_mem_map = 0; } if (vmemmap_buf_start) { diff --git a/mm/sparse.c b/mm/sparse.c index 62eef264a7bd..4a58f8809542 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -202,6 +202,12 @@ static inline int next_present_section_nr(int section_nr) (section_nr <= __highest_present_section_nr)); \ section_nr = next_present_section_nr(section_nr)) +/* + * Record how many memory sections are marked as present + * during system bootup. + */ +static int __initdata nr_present_sections; + /* Record a memory area against a node. */ void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -231,6 +237,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) ms->section_mem_map = sparse_encode_early_nid(nid) | SECTION_IS_ONLINE; section_mark_present(ms); + nr_present_sections++; } } } @@ -446,7 +453,6 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, ms = __nr_to_section(pnum); pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); - ms->section_mem_map = 0; } } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ @@ -474,7 +480,6 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); - ms->section_mem_map = 0; return NULL; } #endif @@ -486,10 +491,12 @@ void __weak __meminit vmemmap_populate_print_last(void) /** * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap * @map: usemap_map for pageblock flags or mmap_map for vmemmap + * @unit_size: size of map unit */ static void __init alloc_usemap_and_memmap(void (*alloc_func) (void *, unsigned long, unsigned long, - unsigned long, int), void *data) + unsigned long, int), void *data, + int data_unit_size) { unsigned long pnum; unsigned long map_count; @@ -566,7 +573,8 @@ void __init sparse_init(void) if (!usemap_map) panic("can not allocate usemap_map\n"); alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, - (void *)usemap_map); + (void *)usemap_map, + sizeof(usemap_map[0])); #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER size2 = sizeof(struct page *) * NR_MEM_SECTIONS; @@ -574,21 +582,28 @@ void __init sparse_init(void) if (!map_map) panic("can not allocate map_map\n"); alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, - (void *)map_map); + (void *)map_map, + sizeof(map_map[0])); #endif for_each_present_section_nr(0, pnum) { + struct mem_section *ms; + ms = __nr_to_section(pnum); usemap = usemap_map[pnum]; - if (!usemap) + if (!usemap) { + ms->section_mem_map = 0; continue; + } #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER map = map_map[pnum]; #else map = sparse_early_mem_map_alloc(pnum); #endif - if (!map) + if (!map) { + ms->section_mem_map = 0; continue; + } sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap); diff --git a/mm/swap_state.c b/mm/swap_state.c index 07f9aa2340c3..fe079756bb18 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -337,8 +337,13 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { struct page *page; + struct swap_info_struct *si; + si = get_swap_device(entry); + if (!si) + return NULL; page = find_get_page(swap_address_space(entry), swp_offset(entry)); + put_swap_device(si); INC_CACHE_INFO(find_total); if (page) { @@ -381,8 +386,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) { - struct page *found_page, *new_page = NULL; - struct address_space *swapper_space = swap_address_space(entry); + struct page *found_page = NULL, *new_page = NULL; + struct swap_info_struct *si; int err; *new_page_allocated = false; @@ -392,7 +397,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ - found_page = find_get_page(swapper_space, swp_offset(entry)); + si = get_swap_device(entry); + if (!si) + break; + found_page = find_get_page(swap_address_space(entry), + swp_offset(entry)); + put_swap_device(si); if (found_page) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index cc2cf04d9018..5a280972bd87 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -38,6 +38,7 @@ #include <linux/export.h> #include <linux/swap_slots.h> #include <linux/sort.h> +#include <linux/stop_machine.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -1107,6 +1108,64 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, return p; } +/* + * Check whether swap entry is valid in the swap device. If so, + * return pointer to swap_info_struct, and keep the swap entry valid + * via preventing the swap device from being swapoff, until + * put_swap_device() is called. Otherwise return NULL. + * + * Notice that swapoff or swapoff+swapon can still happen before the + * preempt_disable() in get_swap_device() or after the + * preempt_enable() in put_swap_device() if there isn't any other way + * to prevent swapoff, such as page lock, page table lock, etc. The + * caller must be prepared for that. For example, the following + * situation is possible. + * + * CPU1 CPU2 + * do_swap_page() + * ... swapoff+swapon + * __read_swap_cache_async() + * swapcache_prepare() + * __swap_duplicate() + * // check swap_map + * // verify PTE not changed + * + * In __swap_duplicate(), the swap_map need to be checked before + * changing partly because the specified swap entry may be for another + * swap device which has been swapoff. And in do_swap_page(), after + * the page is read from the swap device, the PTE is verified not + * changed with the page table locked to check whether the swap device + * has been swapoff or swapoff+swapon. + */ +struct swap_info_struct *get_swap_device(swp_entry_t entry) +{ + struct swap_info_struct *si; + unsigned long type, offset; + + if (!entry.val) + goto out; + type = swp_type(entry); + if (type >= nr_swapfiles) + goto bad_nofile; + si = swap_info[type]; + + preempt_disable(); + if (!(si->flags & SWP_VALID)) + goto unlock_out; + offset = swp_offset(entry); + if (offset >= si->max) + goto unlock_out; + + return si; +bad_nofile: + pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); +out: + return NULL; +unlock_out: + preempt_enable(); + return NULL; +} + static unsigned char __swap_entry_free(struct swap_info_struct *p, swp_entry_t entry, unsigned char usage) { @@ -1328,11 +1387,18 @@ int page_swapcount(struct page *page) return count; } -int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +int __swap_count(swp_entry_t entry) { + struct swap_info_struct *si; pgoff_t offset = swp_offset(entry); + int count = 0; - return swap_count(si->swap_map[offset]); + si = get_swap_device(entry); + if (si) { + count = swap_count(si->swap_map[offset]); + put_swap_device(si); + } + return count; } static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) @@ -1357,9 +1423,11 @@ int __swp_swapcount(swp_entry_t entry) int count = 0; struct swap_info_struct *si; - si = __swap_info_get(entry); - if (si) + si = get_swap_device(entry); + if (si) { count = swap_swapcount(si, entry); + put_swap_device(si); + } return count; } @@ -1800,8 +1868,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, - pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); mem_cgroup_commit_charge(page, memcg, true, false); @@ -1810,6 +1876,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } + set_pte_at(vma->vm_mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); swap_free(entry); /* * Move the page to the active list so it is not @@ -2451,9 +2519,9 @@ static int swap_node(struct swap_info_struct *p) return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; } -static void _enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map, - struct swap_cluster_info *cluster_info) +static void setup_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) { int i; @@ -2478,7 +2546,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, } p->swap_map = swap_map; p->cluster_info = cluster_info; - p->flags |= SWP_WRITEOK; +} + +static void _enable_swap_info(struct swap_info_struct *p) +{ + p->flags |= SWP_WRITEOK | SWP_VALID; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; @@ -2497,6 +2569,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, add_to_avail_list(p); } +static int swap_onoff_stop(void *arg) +{ + return 0; +} + static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, @@ -2505,7 +2582,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, frontswap_init(p->type, frontswap_map); spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, prio, swap_map, cluster_info); + setup_swap_info(p, prio, swap_map, cluster_info); + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + /* + * Guarantee swap_map, cluster_info, etc. fields are used + * between get/put_swap_device() only if SWP_VALID bit is set + */ + stop_machine(swap_onoff_stop, NULL, cpu_online_mask); + spin_lock(&swap_lock); + spin_lock(&p->lock); + _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -2514,7 +2601,8 @@ static void reinsert_swap_info(struct swap_info_struct *p) { spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); + setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); + _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -2617,6 +2705,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) reenable_swap_slots_cache_unlock(); + spin_lock(&swap_lock); + spin_lock(&p->lock); + p->flags &= ~SWP_VALID; /* mark swap device as invalid */ + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + /* + * wait for swap operations protected by get/put_swap_device() + * to complete + */ + stop_machine(swap_onoff_stop, NULL, cpu_online_mask); + flush_work(&p->discard_work); destroy_swap_extents(p); @@ -3360,22 +3459,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { struct swap_info_struct *p; struct swap_cluster_info *ci; - unsigned long offset, type; + unsigned long offset; unsigned char count; unsigned char has_cache; int err = -EINVAL; - if (non_swap_entry(entry)) + p = get_swap_device(entry); + if (!p) goto out; - type = swp_type(entry); - if (type >= nr_swapfiles) - goto bad_file; - p = swap_info[type]; offset = swp_offset(entry); - if (unlikely(offset >= p->max)) - goto out; - ci = lock_cluster_or_swap_info(p, offset); count = p->swap_map[offset]; @@ -3421,11 +3514,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unlock_out: unlock_cluster_or_swap_info(p, ci); out: + if (p) + put_swap_device(p); return err; - -bad_file: - pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); - goto out; } /* @@ -3517,6 +3608,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) struct page *list_page; pgoff_t offset; unsigned char count; + int ret = 0; /* * When debugging, it's easier to use __GFP_ZERO here; but it's better @@ -3524,15 +3616,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) */ page = alloc_page(gfp_mask | __GFP_HIGHMEM); - si = swap_info_get(entry); + si = get_swap_device(entry); if (!si) { /* * An acceptable race has occurred since the failing - * __swap_duplicate(): the swap entry has been freed, - * perhaps even the whole swap_map cleared for swapoff. + * __swap_duplicate(): the swap device may be swapoff */ goto outer; } + spin_lock(&si->lock); offset = swp_offset(entry); @@ -3550,9 +3642,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } if (!page) { - unlock_cluster(ci); - spin_unlock(&si->lock); - return -ENOMEM; + ret = -ENOMEM; + goto out; } /* @@ -3604,10 +3695,11 @@ out_unlock_cont: out: unlock_cluster(ci); spin_unlock(&si->lock); + put_swap_device(si); outer: if (page) __free_page(page); - return 0; + return ret; } /* diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e16d6713f236..24618dffc5cb 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6257,6 +6257,13 @@ sub process { "Avoid using bool as bitfield. Prefer bool bitfields as unsigned int or u<8|16|32>\n" . $herecurr); } +# check for bool use in .h files + if ($realfile =~ /\.h$/ && + $sline =~ /^.\s+bool\s*$Ident\s*(?::\s*d+\s*)?;/) { + CHK("BOOL_MEMBER", + "Avoid using bool structure members because of possible alignment issues - see: https://lkml.org/lkml/2017/11/21/384\n" . $herecurr); + } + # check for semaphores initialized locked if ($line =~ /^.\s*sema_init.+,\W?0\W?\)/) { WARN("CONSIDER_COMPLETION", diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index ce2b89e9ad94..cf00c8520d30 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -693,6 +693,7 @@ static void apparmor_bprm_committing_creds(struct linux_binprm *bprm) aa_inherit_files(bprm->cred, current->files); current->pdeath_signal = 0; + current->signal->pdeath_signal_proc = 0; /* reset soft limits and set hard limits for the new label */ __aa_transition_rlimits(label, new_label); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 4cafe6a19167..a5acf9dae6b7 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2641,6 +2641,7 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm) /* Always clear parent death signal on SID transitions. */ current->pdeath_signal = 0; + current->signal->pdeath_signal_proc = 0; /* Check whether the new SID can inherit resource limits from the old * SID. If not, reset all soft limits to the lower of the current |