From 8c8c5a4994a306c217fd061cbfc5903399fd4c1c Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Fri, 10 Jan 2020 18:19:33 +0100 Subject: dma-contiguous: CMA: give precedence to cmdline Although the device tree might contain a reserved-memory DT node dedicated as the default CMA pool, users might want to change CMA's parameters using the kernel command line for debugging purposes and whatnot. Honor this by bypassing the reserved memory CMA setup, which will ultimately end up freeing the memblock and allow the command line CMA configuration routine to run. Signed-off-by: Nicolas Saenz Julienne Reviewed-by: Phil Elwell Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index daa4e6eefdde..8bc6f2d670f9 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -302,9 +302,16 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); phys_addr_t mask = align - 1; unsigned long node = rmem->fdt_node; + bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); struct cma *cma; int err; + if (size_cmdline != -1 && default_cma) { + pr_info("Reserved memory: bypass %s node, using cmdline CMA params instead\n", + rmem->name); + return -EBUSY; + } + if (!of_get_flat_dt_prop(node, "reusable", NULL) || of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; @@ -322,7 +329,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) /* Architecture specific contiguous memory fixup. */ dma_contiguous_early_fixup(rmem->base, rmem->size); - if (of_get_flat_dt_prop(node, "linux,cma-default", NULL)) + if (default_cma) dma_contiguous_set_default(cma); rmem->ops = &rmem_cma_ops; -- cgit v1.2.1 From 91ef26f914171cf753330f13724fd9142b5b1640 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 18:11:10 +0100 Subject: dma-direct: relax addressability checks in dma_direct_supported dma_direct_supported tries to find the minimum addressable bitmask based on the end pfn and optional magic that architectures can use to communicate the size of the magic ZONE_DMA that can be used for bounce buffering. But between the DMA offsets that can change per device (or sometimes even region), the fact the ZONE_DMA isn't even guaranteed to be the lowest addresses and failure of having proper interfaces to the MM code this fails at least for one arm subarchitecture. As all the legacy DMA implementations have supported 32-bit DMA masks, and 32-bit masks are guranteed to always work by the API contract (using bounce buffers if needed), we can short cut the complicated check and always return true without breaking existing assumptions. Hopefully we can properly clean up the interaction with the arch defined zones and the bootmem allocator eventually. Fixes: ad3c7b18c5b3 ("arm: use swiotlb for bounce buffering on LPAE configs") Reported-by: Peter Ujfalusi Signed-off-by: Christoph Hellwig Tested-by: Peter Ujfalusi --- kernel/dma/direct.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 6af7ae83c4ad..32ec69cdba54 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -472,28 +472,26 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, } #endif /* CONFIG_MMU */ -/* - * Because 32-bit DMA masks are so common we expect every architecture to be - * able to satisfy them - either by not supporting more physical memory, or by - * providing a ZONE_DMA32. If neither is the case, the architecture needs to - * use an IOMMU instead of the direct mapping. - */ int dma_direct_supported(struct device *dev, u64 mask) { - u64 min_mask; - - if (IS_ENABLED(CONFIG_ZONE_DMA)) - min_mask = DMA_BIT_MASK(zone_dma_bits); - else - min_mask = DMA_BIT_MASK(32); + u64 min_mask = (max_pfn - 1) << PAGE_SHIFT; - min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT); + /* + * Because 32-bit DMA masks are so common we expect every architecture + * to be able to satisfy them - either by not supporting more physical + * memory, or by providing a ZONE_DMA32. If neither is the case, the + * architecture needs to use an IOMMU instead of the direct mapping. + */ + if (mask >= DMA_BIT_MASK(32)) + return 1; /* * This check needs to be against the actual bit mask value, so * use __phys_to_dma() here so that the SME encryption mask isn't * part of the check. */ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits)); return mask >= __phys_to_dma(dev, min_mask); } -- cgit v1.2.1 From 4a47cbae04844f0c5e2365aa6c217b61850bb832 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 14:44:38 +0100 Subject: dma-direct: improve swiotlb error reporting Untangle the way how dma_direct_map_page calls into swiotlb to be able to properly report errors where the swiotlb DMA address overflows the mask separately from overflows in the !swiotlb case. This means that siotlb_map now has to do a little more work that duplicates dma_direct_map_page, but doing so greatly simplifies the calling convention. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk --- kernel/dma/direct.c | 16 +++++++--------- kernel/dma/swiotlb.c | 42 +++++++++++++++++++++++------------------- 2 files changed, 30 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 32ec69cdba54..594bddd04e01 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -357,13 +357,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, EXPORT_SYMBOL(dma_direct_unmap_sg); #endif -static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, - size_t size) -{ - return swiotlb_force != SWIOTLB_FORCE && - dma_capable(dev, dma_addr, size, true); -} - dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -371,8 +364,13 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (unlikely(!dma_direct_possible(dev, dma_addr, size)) && - !swiotlb_map(dev, &phys, &dma_addr, size, dir, attrs)) { + if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + return swiotlb_map(dev, phys, size, dir, attrs); + + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (swiotlb_force != SWIOTLB_NO_FORCE) + return swiotlb_map(dev, phys, size, dir, attrs); + report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9280d6f8271e..c19379fabd20 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -656,35 +657,38 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, } /* - * Create a swiotlb mapping for the buffer at @phys, and in case of DMAing + * Create a swiotlb mapping for the buffer at @paddr, and in case of DMAing * to the device copy the data into it as well. */ -bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) +dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, + enum dma_data_direction dir, unsigned long attrs) { - trace_swiotlb_bounced(dev, *dma_addr, size, swiotlb_force); + phys_addr_t swiotlb_addr; + dma_addr_t dma_addr; - if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) { - dev_warn_ratelimited(dev, - "Cannot do DMA to address %pa\n", phys); - return false; - } + trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, + swiotlb_force); - /* Oh well, have to allocate and map a bounce buffer. */ - *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), - *phys, size, size, dir, attrs); - if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) - return false; + swiotlb_addr = swiotlb_tbl_map_single(dev, + __phys_to_dma(dev, io_tlb_start), + paddr, size, size, dir, attrs); + if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) + return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ - *dma_addr = __phys_to_dma(dev, *phys); - if (unlikely(!dma_capable(dev, *dma_addr, size, true))) { - swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, + dma_addr = __phys_to_dma(dev, swiotlb_addr); + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return false; + dev_WARN_ONCE(dev, 1, + "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; } - return true; + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(swiotlb_addr, size, dir); + return dma_addr; } size_t swiotlb_max_mapping_size(struct device *dev) -- cgit v1.2.1 From 75467ee48a5e04cf3ae3cb39aea6adee73aeff91 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 14:54:50 +0100 Subject: dma-direct: improve DMA mask overflow reporting Remove the unset dma_mask case as that won't get into mapping calls anymore, and also report the other errors unconditonally and with a slightly improved message. Remove the now pointless report_addr helper. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk --- kernel/dma/direct.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 594bddd04e01..ac7956c38f69 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -23,18 +23,6 @@ */ unsigned int zone_dma_bits __ro_after_init = 24; -static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) -{ - if (!dev->dma_mask) { - dev_err_once(dev, "DMA map on device without dma_mask\n"); - } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) { - dev_err_once(dev, - "overflow %pad+%zu of DMA mask %llx bus limit %llx\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - } - WARN_ON_ONCE(1); -} - static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { @@ -371,7 +359,9 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, if (swiotlb_force != SWIOTLB_NO_FORCE) return swiotlb_map(dev, phys, size, dir, attrs); - report_addr(dev, dma_addr, size); + dev_WARN_ONCE(dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); return DMA_MAPPING_ERROR; } @@ -409,7 +399,10 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, dma_addr_t dma_addr = paddr; if (unlikely(!dma_capable(dev, dma_addr, size, false))) { - report_addr(dev, dma_addr, size); + dev_err_once(dev, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + WARN_ON_ONCE(1); return DMA_MAPPING_ERROR; } -- cgit v1.2.1 From 2bf0eb9b3b0d099b20b2c4736436b666d78b94d5 Mon Sep 17 00:00:00 2001 From: Hongbo Yao Date: Mon, 10 Feb 2020 09:14:41 +0800 Subject: bpf: Make btf_check_func_type_match() static Fix the following sparse warning: kernel/bpf/btf.c:4131:5: warning: symbol 'btf_check_func_type_match' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: Hongbo Yao Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200210011441.147102-1-yaohongbo@huawei.com --- kernel/bpf/btf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 805c43b083e9..787140095e58 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4142,9 +4142,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, * EFAULT - verifier bug * 0 - 99% match. The last 1% is validated by the verifier. */ -int btf_check_func_type_match(struct bpf_verifier_log *log, - struct btf *btf1, const struct btf_type *t1, - struct btf *btf2, const struct btf_type *t2) +static int btf_check_func_type_match(struct bpf_verifier_log *log, + struct btf *btf1, const struct btf_type *t1, + struct btf *btf2, const struct btf_type *t2) { const struct btf_param *args1, *args2; const char *fn1, *fn2, *s1, *s2; -- cgit v1.2.1 From cba6437a1854fde5934098ec3bd0ee83af3129f5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Feb 2020 12:19:41 +0100 Subject: genirq/proc: Reject invalid affinity masks (again) Qian Cai reported that the WARN_ON() in the x86/msi affinity setting code, which catches cases where the affinity setting is not done on the CPU which is the current target of the interrupt, triggers during CPU hotplug stress testing. It turns out that the warning which was added with the commit addressing the MSI affinity race unearthed yet another long standing bug. If user space writes a bogus affinity mask, i.e. it contains no online CPUs, then it calls irq_select_affinity_usr(). This was introduced for ALPHA in eee45269b0f5 ("[PATCH] Alpha: convert to generic irq framework (generic part)") and subsequently made available for all architectures in 18404756765c ("genirq: Expose default irq affinity mask (take 3)") which introduced the circumvention of the affinity setting restrictions for interrupt which cannot be moved in process context. The whole exercise is bogus in various aspects: 1) If the interrupt is already started up then there is absolutely no point to honour a bogus interrupt affinity setting from user space. The interrupt is already assigned to an online CPU and it does not make any sense to reassign it to some other randomly chosen online CPU. 2) If the interupt is not yet started up then there is no point either. A subsequent startup of the interrupt will invoke irq_setup_affinity() anyway which will chose a valid target CPU. So the only correct solution is to just return -EINVAL in case user space wrote an affinity mask which does not contain any online CPUs, except for ALPHA which has it's own magic sauce for this. Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Reported-by: Qian Cai Signed-off-by: Thomas Gleixner Tested-by: Qian Cai Link: https://lkml.kernel.org/r/878sl8xdbm.fsf@nanos.tec.linutronix.de --- kernel/irq/internals.h | 2 -- kernel/irq/manage.c | 18 ++---------------- kernel/irq/proc.c | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 3924fbe829d4..c9d8eb7f5c02 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -128,8 +128,6 @@ static inline void unregister_handler_proc(unsigned int irq, extern bool irq_can_set_affinity_usr(unsigned int irq); -extern int irq_select_affinity_usr(unsigned int irq); - extern void irq_set_thread_affinity(struct irq_desc *desc); extern int irq_do_set_affinity(struct irq_data *data, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3089a60ea8f9..7eee98c38f25 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -481,23 +481,9 @@ int irq_setup_affinity(struct irq_desc *desc) { return irq_select_affinity(irq_desc_get_irq(desc)); } -#endif +#endif /* CONFIG_AUTO_IRQ_AFFINITY */ +#endif /* CONFIG_SMP */ -/* - * Called when a bogus affinity is set via /proc/irq - */ -int irq_select_affinity_usr(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&desc->lock, flags); - ret = irq_setup_affinity(desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; -} -#endif /** * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9e5783d98033..32c071d7bc03 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -111,6 +111,28 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v) return show_irq_affinity(AFFINITY_LIST, m); } +#ifndef CONFIG_AUTO_IRQ_AFFINITY +static inline int irq_select_affinity_usr(unsigned int irq) +{ + /* + * If the interrupt is started up already then this fails. The + * interrupt is assigned to an online CPU already. There is no + * point to move it around randomly. Tell user space that the + * selected mask is bogus. + * + * If not then any change to the affinity is pointless because the + * startup code invokes irq_setup_affinity() which will select + * a online CPU anyway. + */ + return -EINVAL; +} +#else +/* ALPHA magic affinity auto selector. Keep it for historical reasons. */ +static inline int irq_select_affinity_usr(unsigned int irq) +{ + return irq_select_affinity(irq); +} +#endif static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) -- cgit v1.2.1 From e20d3a055a457a10a4c748ce5b7c2ed3173a1324 Mon Sep 17 00:00:00 2001 From: Johannes Krude Date: Wed, 12 Feb 2020 20:32:27 +0100 Subject: bpf, offload: Replace bitwise AND by logical AND in bpf_prog_offload_info_fill This if guards whether user-space wants a copy of the offload-jited bytecode and whether this bytecode exists. By erroneously doing a bitwise AND instead of a logical AND on user- and kernel-space buffer-size can lead to no data being copied to user-space especially when user-space size is a power of two and bigger then the kernel-space buffer. Fixes: fcfb126defda ("bpf: add new jited info fields in bpf_dev_offload and bpf_prog_info") Signed-off-by: Johannes Krude Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20200212193227.GA3769@phlox.h.transitiv.net --- kernel/bpf/offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 2c5dc6541ece..bd09290e3648 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -321,7 +321,7 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, ulen = info->jited_prog_len; info->jited_prog_len = aux->offload->jited_len; - if (info->jited_prog_len & ulen) { + if (info->jited_prog_len && ulen) { uinsns = u64_to_user_ptr(info->jited_prog_insns); ulen = min_t(u32, info->jited_prog_len, ulen); if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { -- cgit v1.2.1 From 140588bfed2727e9a4814211617a79401d74922f Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Fri, 14 Feb 2020 18:26:28 +0100 Subject: s390: remove obsolete ieee_emulation_warnings s390 math emulation was removed with commit 5a79859ae0f3 ("s390: remove 31 bit support"), rendering ieee_emulation_warnings useless. The code still built because it was protected by CONFIG_MATHEMU, which was no longer selectable. This patch removes the sysctl_ieee_emulation_warnings declaration and the sysctl entry declaration. Link: https://lkml.kernel.org/r/20200214172628.3598516-1-steve@sk2.org Reviewed-by: Vasily Gorbik Signed-off-by: Stephen Kitt Signed-off-by: Vasily Gorbik --- kernel/sysctl.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d396aaaf19a3..ad5b88a53c5a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -805,15 +805,6 @@ static struct ctl_table kern_table[] = { .extra2 = &maxolduid, }, #ifdef CONFIG_S390 -#ifdef CONFIG_MATHEMU - { - .procname = "ieee_emulation_warnings", - .data = &sysctl_ieee_emulation_warnings, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { .procname = "userprocess_debug", .data = &show_unhandled_signals, -- cgit v1.2.1 From 492e0d0d6f2eb4badfd2868addf9da0f651eba0e Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Tue, 18 Feb 2020 09:25:52 -0800 Subject: bpf: Do not grab the bucket spinlock by default on htab batch ops Grabbing the spinlock for every bucket even if it's empty, was causing significant perfomance cost when traversing htab maps that have only a few entries. This patch addresses the issue by checking first the bucket_cnt, if the bucket has some entries then we go and grab the spinlock and proceed with the batching. Tested with a htab of size 50K and different value of populated entries. Before: Benchmark Time(ns) CPU(ns) --------------------------------------------- BM_DumpHashMap/1 2759655 2752033 BM_DumpHashMap/10 2933722 2930825 BM_DumpHashMap/200 3171680 3170265 BM_DumpHashMap/500 3639607 3635511 BM_DumpHashMap/1000 4369008 4364981 BM_DumpHashMap/5k 11171919 11134028 BM_DumpHashMap/20k 69150080 69033496 BM_DumpHashMap/39k 190501036 190226162 After: Benchmark Time(ns) CPU(ns) --------------------------------------------- BM_DumpHashMap/1 202707 200109 BM_DumpHashMap/10 213441 210569 BM_DumpHashMap/200 478641 472350 BM_DumpHashMap/500 980061 967102 BM_DumpHashMap/1000 1863835 1839575 BM_DumpHashMap/5k 8961836 8902540 BM_DumpHashMap/20k 69761497 69322756 BM_DumpHashMap/39k 187437830 186551111 Fixes: 057996380a42 ("bpf: Add batch ops to all htab bpf map") Signed-off-by: Brian Vazquez Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200218172552.215077-1-brianvv@google.com --- kernel/bpf/hashtab.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2d182c4ee9d9..9194479a2fa7 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1259,7 +1259,8 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; - unsigned long flags; + unsigned long flags = 0; + bool locked = false; struct htab_elem *l; struct bucket *b; int ret = 0; @@ -1319,15 +1320,25 @@ again_nocopy: dst_val = values; b = &htab->buckets[batch]; head = &b->head; - raw_spin_lock_irqsave(&b->lock, flags); + /* do not grab the lock unless need it (bucket_cnt > 0). */ + if (locked) + raw_spin_lock_irqsave(&b->lock, flags); bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) bucket_cnt++; + if (bucket_cnt && !locked) { + locked = true; + goto again_nocopy; + } + if (bucket_cnt > (max_count - total)) { if (total == 0) ret = -ENOSPC; + /* Note that since bucket_cnt > 0 here, it is implicit + * that the locked was grabbed, so release it. + */ raw_spin_unlock_irqrestore(&b->lock, flags); rcu_read_unlock(); this_cpu_dec(bpf_prog_active); @@ -1337,6 +1348,9 @@ again_nocopy: if (bucket_cnt > bucket_size) { bucket_size = bucket_cnt; + /* Note that since bucket_cnt > 0 here, it is implicit + * that the locked was grabbed, so release it. + */ raw_spin_unlock_irqrestore(&b->lock, flags); rcu_read_unlock(); this_cpu_dec(bpf_prog_active); @@ -1346,6 +1360,10 @@ again_nocopy: goto alloc; } + /* Next block is only safe to run if you have grabbed the lock */ + if (!locked) + goto next_batch; + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { memcpy(dst_key, l->key, key_size); @@ -1380,6 +1398,8 @@ again_nocopy: } raw_spin_unlock_irqrestore(&b->lock, flags); + locked = false; +next_batch: /* If we are not copying data, we can go to next bucket and avoid * unlocking the rcu. */ -- cgit v1.2.1 From b9aff38de2cb166476988020428985c5f7412ffc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 19 Feb 2020 15:47:57 -0800 Subject: bpf: Fix a potential deadlock with bpf_map_do_batch Commit 057996380a42 ("bpf: Add batch ops to all htab bpf map") added lookup_and_delete batch operation for hash table. The current implementation has bpf_lru_push_free() inside the bucket lock, which may cause a deadlock. syzbot reports: -> #2 (&htab->buckets[i].lock#2){....}: __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159 htab_lru_map_delete_node+0xce/0x2f0 kernel/bpf/hashtab.c:593 __bpf_lru_list_shrink_inactive kernel/bpf/bpf_lru_list.c:220 [inline] __bpf_lru_list_shrink+0xf9/0x470 kernel/bpf/bpf_lru_list.c:266 bpf_lru_list_pop_free_to_local kernel/bpf/bpf_lru_list.c:340 [inline] bpf_common_lru_pop_free kernel/bpf/bpf_lru_list.c:447 [inline] bpf_lru_pop_free+0x87c/0x1670 kernel/bpf/bpf_lru_list.c:499 prealloc_lru_pop+0x2c/0xa0 kernel/bpf/hashtab.c:132 __htab_lru_percpu_map_update_elem+0x67e/0xa90 kernel/bpf/hashtab.c:1069 bpf_percpu_hash_update+0x16e/0x210 kernel/bpf/hashtab.c:1585 bpf_map_update_value.isra.0+0x2d7/0x8e0 kernel/bpf/syscall.c:181 generic_map_update_batch+0x41f/0x610 kernel/bpf/syscall.c:1319 bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348 __do_sys_bpf+0x9b7/0x41e0 kernel/bpf/syscall.c:3460 __se_sys_bpf kernel/bpf/syscall.c:3355 [inline] __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #0 (&loc_l->lock){....}: check_prev_add kernel/locking/lockdep.c:2475 [inline] check_prevs_add kernel/locking/lockdep.c:2580 [inline] validate_chain kernel/locking/lockdep.c:2970 [inline] __lock_acquire+0x2596/0x4a00 kernel/locking/lockdep.c:3954 lock_acquire+0x190/0x410 kernel/locking/lockdep.c:4484 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159 bpf_common_lru_push_free kernel/bpf/bpf_lru_list.c:516 [inline] bpf_lru_push_free+0x250/0x5b0 kernel/bpf/bpf_lru_list.c:555 __htab_map_lookup_and_delete_batch+0x8d4/0x1540 kernel/bpf/hashtab.c:1374 htab_lru_map_lookup_and_delete_batch+0x34/0x40 kernel/bpf/hashtab.c:1491 bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348 __do_sys_bpf+0x1f7d/0x41e0 kernel/bpf/syscall.c:3456 __se_sys_bpf kernel/bpf/syscall.c:3355 [inline] __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe Possible unsafe locking scenario: CPU0 CPU2 ---- ---- lock(&htab->buckets[i].lock#2); lock(&l->lock); lock(&htab->buckets[i].lock#2); lock(&loc_l->lock); *** DEADLOCK *** To fix the issue, for htab_lru_map_lookup_and_delete_batch() in CPU0, let us do bpf_lru_push_free() out of the htab bucket lock. This can avoid the above deadlock scenario. Fixes: 057996380a42 ("bpf: Add batch ops to all htab bpf map") Reported-by: syzbot+a38ff3d9356388f2fb83@syzkaller.appspotmail.com Reported-by: syzbot+122b5421d14e68f29cd1@syzkaller.appspotmail.com Suggested-by: Hillf Danton Suggested-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Sitnicki Acked-by: Brian Vazquez Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200219234757.3544014-1-yhs@fb.com --- kernel/bpf/hashtab.c | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 9194479a2fa7..a1468e3f5af2 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -56,6 +56,7 @@ struct htab_elem { union { struct bpf_htab *htab; struct pcpu_freelist_node fnode; + struct htab_elem *batch_flink; }; }; }; @@ -126,6 +127,17 @@ free_elems: bpf_map_area_free(htab->elems); } +/* The LRU list has a lock (lru_lock). Each htab bucket has a lock + * (bucket_lock). If both locks need to be acquired together, the lock + * order is always lru_lock -> bucket_lock and this only happens in + * bpf_lru_list.c logic. For example, certain code path of + * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(), + * will acquire lru_lock first followed by acquiring bucket_lock. + * + * In hashtab.c, to avoid deadlock, lock acquisition of + * bucket_lock followed by lru_lock is not allowed. In such cases, + * bucket_lock needs to be released first before acquiring lru_lock. + */ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, u32 hash) { @@ -1256,6 +1268,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size; + struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; @@ -1388,10 +1401,18 @@ again_nocopy: } if (do_delete) { hlist_nulls_del_rcu(&l->hash_node); - if (is_lru_map) - bpf_lru_push_free(&htab->lru, &l->lru_node); - else + + /* bpf_lru_push_free() will acquire lru_lock, which + * may cause deadlock. See comments in function + * prealloc_lru_pop(). Let us do bpf_lru_push_free() + * after releasing the bucket lock. + */ + if (is_lru_map) { + l->batch_flink = node_to_free; + node_to_free = l; + } else { free_htab_elem(htab, l); + } } dst_key += key_size; dst_val += value_size; @@ -1399,6 +1420,13 @@ again_nocopy: raw_spin_unlock_irqrestore(&b->lock, flags); locked = false; + + while (node_to_free) { + l = node_to_free; + node_to_free = node_to_free->batch_flink; + bpf_lru_push_free(&htab->lru, &l->lru_node); + } + next_batch: /* If we are not copying data, we can go to next bucket and avoid * unlocking the rcu. -- cgit v1.2.1 From b0c609ab2057d0953fa05e7566f0c0e8a28fa9e1 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Fri, 14 Feb 2020 15:06:21 +0100 Subject: PM / hibernate: fix typo "reserverd_size" -> "reserved_size" Fix a mistake in a variable name in a comment. Signed-off-by: Alexandre Belloni Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ddade80ad276..d82b7b88d616 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1681,7 +1681,7 @@ static unsigned long minimum_image_size(unsigned long saveable) * hibernation for allocations made while saving the image and for device * drivers, in case they need to allocate memory from their hibernation * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough - * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through + * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through * /sys/power/reserved_size, respectively). To make this happen, we compute the * total number of available page frames and allocate at least * -- cgit v1.2.1 From 279eef0531928a6669230879a6eed081513ad5a3 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 14 Feb 2020 16:56:38 -0600 Subject: tracing: Make sure synth_event_trace() example always uses u64 synth_event_trace() is the varargs version of synth_event_trace_array(), which takes an array of u64, as do synth_event_add_val() et al. To not only be consistent with those, but also to address the fact that synth_event_trace() expects every arg to be of the same type since it doesn't also pass in e.g. a format string, the caller needs to make sure all args are of the same type, u64. u64 is used because it needs to accomodate the largest type available in synthetic events, which is u64. This fixes the bug reported by the kernel test robot/Rong Chen. Link: https://lore.kernel.org/lkml/20200212113444.GS12867@shao2-debian/ Link: http://lkml.kernel.org/r/894c4e955558b521210ee0642ba194a9e603354c.1581720155.git.zanussi@kernel.org Fixes: 9fe41efaca084 ("tracing: Add synth event generation test module") Reported-by: kernel test robot Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/synth_event_gen_test.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 4aefe003cb7c..6866280a9b10 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -111,11 +111,11 @@ static int __init test_gen_synth_cmd(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"hula hoops"; /* next_comm_field */ + vals[1] = (u64)(long)"hula hoops"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed"; /* my_string_field */ + vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 598; /* my_int_field */ /* Now generate a gen_synth_test event */ @@ -218,11 +218,11 @@ static int __init test_empty_synth_event(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"tiddlywinks"; /* next_comm_field */ + vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed_2.0"; /* my_string_field */ + vals[5] = (u64)(long)"thneed_2.0"; /* my_string_field */ vals[6] = 399; /* my_int_field */ /* Now trace an empty_synth_test event */ @@ -290,11 +290,11 @@ static int __init test_create_synth_event(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"tiddlywinks"; /* next_comm_field */ + vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed"; /* my_string_field */ + vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 398; /* my_int_field */ /* Now generate a create_synth_test event */ @@ -330,7 +330,7 @@ static int __init test_add_next_synth_val(void) goto out; /* next_comm_field */ - ret = synth_event_add_next_val((u64)"slinky", &trace_state); + ret = synth_event_add_next_val((u64)(long)"slinky", &trace_state); if (ret) goto out; @@ -350,7 +350,7 @@ static int __init test_add_next_synth_val(void) goto out; /* my_string_field */ - ret = synth_event_add_next_val((u64)"thneed_2.01", &trace_state); + ret = synth_event_add_next_val((u64)(long)"thneed_2.01", &trace_state); if (ret) goto out; @@ -396,12 +396,12 @@ static int __init test_add_synth_val(void) if (ret) goto out; - ret = synth_event_add_val("next_comm_field", (u64)"silly putty", + ret = synth_event_add_val("next_comm_field", (u64)(long)"silly putty", &trace_state); if (ret) goto out; - ret = synth_event_add_val("my_string_field", (u64)"thneed_9", + ret = synth_event_add_val("my_string_field", (u64)(long)"thneed_9", &trace_state); if (ret) goto out; @@ -423,13 +423,13 @@ static int __init test_trace_synth_event(void) /* Trace some bogus values just for testing */ ret = synth_event_trace(create_synth_test, 7, /* number of values */ - 444, /* next_pid_field */ - (u64)"clackers", /* next_comm_field */ - 1000000, /* ts_ns */ - 1000, /* ts_ms */ - smp_processor_id(), /* cpu */ - (u64)"Thneed", /* my_string_field */ - 999); /* my_int_field */ + (u64)444, /* next_pid_field */ + (u64)(long)"clackers", /* next_comm_field */ + (u64)1000000, /* ts_ns */ + (u64)1000, /* ts_ms */ + (u64)smp_processor_id(),/* cpu */ + (u64)(long)"Thneed", /* my_string_field */ + (u64)999); /* my_int_field */ return ret; } -- cgit v1.2.1 From 1d9d4c90194a8c3b2f7da9f4bf3f8ba2ed810656 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 14 Feb 2020 16:56:39 -0600 Subject: tracing: Make synth_event trace functions endian-correct synth_event_trace(), synth_event_trace_array() and __synth_event_add_val() write directly into the trace buffer and need to take endianness into account, like trace_event_raw_event_synth() does. Link: http://lkml.kernel.org/r/2011354355e405af9c9d28abba430d1f5ff7771a.1581720155.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 62 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 65b54d6a1422..6a380fb83864 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1891,7 +1891,25 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - state.entry->fields[n_u64] = val; + struct synth_field *field = state.event->fields[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } n_u64++; } } @@ -1943,7 +1961,26 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - state.entry->fields[n_u64] = vals[i]; + struct synth_field *field = state.event->fields[i]; + u64 val = vals[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } n_u64++; } } @@ -2062,8 +2099,25 @@ static int __synth_event_add_val(const char *field_name, u64 val, str_field = (char *)&entry->fields[field->offset]; strscpy(str_field, str_val, STR_VAR_LEN_MAX); - } else - entry->fields[field->offset] = val; + } else { + switch (field->size) { + case 1: + *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; + break; + + case 2: + *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; + break; + + case 4: + *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; + break; + + default: + trace_state->entry->fields[field->offset] = val; + break; + } + } out: return ret; } -- cgit v1.2.1 From 3843083772dc2afde790a6d7160658b00a808da1 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 14 Feb 2020 16:56:40 -0600 Subject: tracing: Check that number of vals matches number of synth event fields Commit 7276531d4036('tracing: Consolidate trace() functions') inadvertently dropped the synth_event_trace() and synth_event_trace_array() checks that verify the number of values passed in matches the number of fields in the synthetic event being traced, so add them back. Link: http://lkml.kernel.org/r/32819cac708714693669e0dfe10fe9d935e94a16.1581720155.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 6a380fb83864..45622194a34d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1878,6 +1878,11 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) return ret; } + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + va_start(args, n_vals); for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { u64 val; @@ -1914,7 +1919,7 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) } } va_end(args); - +out: __synth_event_trace_end(&state); return ret; @@ -1953,6 +1958,11 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, return ret; } + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)vals[i]; @@ -1984,7 +1994,7 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, n_u64++; } } - +out: __synth_event_trace_end(&state); return ret; -- cgit v1.2.1 From 784bd0847eda032ed2f3522f87250655a18c0190 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 14 Feb 2020 16:56:41 -0600 Subject: tracing: Fix number printing bug in print_synth_event() Fix a varargs-related bug in print_synth_event() which resulted in strange output and oopses on 32-bit x86 systems. The problem is that trace_seq_printf() expects the varargs to match the format string, but print_synth_event() was always passing u64 values regardless. This results in unspecified behavior when unpacking with va_arg() in trace_seq_printf(). Add a function that takes the size into account when calling trace_seq_printf(). Before: modprobe-1731 [003] .... 919.039758: gen_synth_test: next_pid_field=777(null)next_comm_field=hula hoops ts_ns=1000000 ts_ms=1000 cpu=3(null)my_string_field=thneed my_int_field=598(null) After: insmod-1136 [001] .... 36.634590: gen_synth_test: next_pid_field=777 next_comm_field=hula hoops ts_ns=1000000 ts_ms=1000 cpu=1 my_string_field=thneed my_int_field=598 Link: http://lkml.kernel.org/r/a9b59eb515dbbd7d4abe53b347dccf7a8e285657.1581720155.git.zanussi@kernel.org Reported-by: Steven Rostedt (VMware) Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 45622194a34d..f068d55bd37f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -820,6 +820,29 @@ static const char *synth_field_fmt(char *type) return fmt; } +static void print_synth_event_num_val(struct trace_seq *s, + char *print_fmt, char *name, + int size, u64 val, char *space) +{ + switch (size) { + case 1: + trace_seq_printf(s, print_fmt, name, (u8)val, space); + break; + + case 2: + trace_seq_printf(s, print_fmt, name, (u16)val, space); + break; + + case 4: + trace_seq_printf(s, print_fmt, name, (u32)val, space); + break; + + default: + trace_seq_printf(s, print_fmt, name, val, space); + break; + } +} + static enum print_line_t print_synth_event(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -858,10 +881,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, } else { struct trace_print_flags __flags[] = { __def_gfpflag_names, {-1, NULL} }; + char *space = (i == se->n_fields - 1 ? "" : " "); - trace_seq_printf(s, print_fmt, se->fields[i]->name, - entry->fields[n_u64], - i == se->n_fields - 1 ? "" : " "); + print_synth_event_num_val(s, print_fmt, + se->fields[i]->name, + se->fields[i]->size, + entry->fields[n_u64], + space); if (strcmp(se->fields[i]->type, "gfp_t") == 0) { trace_seq_puts(s, " ("); -- cgit v1.2.1 From 3c18a9be7c9d4f53239795282c5d927f73f534b3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Feb 2020 16:29:50 -0500 Subject: tracing: Have synthetic event test use raw_smp_processor_id() The test code that tests synthetic event creation pushes in as one of its test fields the current CPU using "smp_processor_id()". As this is just something to see if the value is correctly passed in, and the actual CPU used does not matter, use raw_smp_processor_id(), otherwise with debug preemption enabled, a warning happens as the smp_processor_id() is called without preemption enabled. Link: http://lkml.kernel.org/r/20200220162950.35162579@gandalf.local.home Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/synth_event_gen_test.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 6866280a9b10..7d56d621ffea 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -114,7 +114,7 @@ static int __init test_gen_synth_cmd(void) vals[1] = (u64)(long)"hula hoops"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ + vals[4] = raw_smp_processor_id(); /* cpu */ vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 598; /* my_int_field */ @@ -221,7 +221,7 @@ static int __init test_empty_synth_event(void) vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ + vals[4] = raw_smp_processor_id(); /* cpu */ vals[5] = (u64)(long)"thneed_2.0"; /* my_string_field */ vals[6] = 399; /* my_int_field */ @@ -293,7 +293,7 @@ static int __init test_create_synth_event(void) vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ + vals[4] = raw_smp_processor_id(); /* cpu */ vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 398; /* my_int_field */ @@ -345,7 +345,7 @@ static int __init test_add_next_synth_val(void) goto out; /* cpu */ - ret = synth_event_add_next_val(smp_processor_id(), &trace_state); + ret = synth_event_add_next_val(raw_smp_processor_id(), &trace_state); if (ret) goto out; @@ -388,7 +388,7 @@ static int __init test_add_synth_val(void) if (ret) goto out; - ret = synth_event_add_val("cpu", smp_processor_id(), &trace_state); + ret = synth_event_add_val("cpu", raw_smp_processor_id(), &trace_state); if (ret) goto out; @@ -427,7 +427,7 @@ static int __init test_trace_synth_event(void) (u64)(long)"clackers", /* next_comm_field */ (u64)1000000, /* ts_ns */ (u64)1000, /* ts_ms */ - (u64)smp_processor_id(),/* cpu */ + (u64)raw_smp_processor_id(), /* cpu */ (u64)(long)"Thneed", /* my_string_field */ (u64)999); /* my_int_field */ return ret; -- cgit v1.2.1 From 78041c0c9e935d9ce4086feeff6c569ed88ddfd4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Feb 2020 15:38:01 -0500 Subject: tracing: Disable trace_printk() on post poned tests The tracing seftests checks various aspects of the tracing infrastructure, and one is filtering. If trace_printk() is active during a self test, it can cause the filtering to fail, which will disable that part of the trace. To keep the selftests from failing because of trace_printk() calls, trace_printk() checks the variable tracing_selftest_running, and if set, it does not write to the tracing buffer. As some tracers were registered earlier in boot, the selftest they triggered would fail because not all the infrastructure was set up for the full selftest. Thus, some of the tests were post poned to when their infrastructure was ready (namely file system code). The postpone code did not set the tracing_seftest_running variable, and could fail if a trace_printk() was added and executed during their run. Cc: stable@vger.kernel.org Fixes: 9afecfbb95198 ("tracing: Postpone tracer start-up tests till the system is more robust") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 183b031a3828..a89c562ffb8f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1837,6 +1837,7 @@ static __init int init_trace_selftests(void) pr_info("Running postponed tracer tests:\n"); + tracing_selftest_running = true; list_for_each_entry_safe(p, n, &postponed_selftests, list) { /* This loop can take minutes when sanitizers are enabled, so * lets make sure we allow RCU processing. @@ -1859,6 +1860,7 @@ static __init int init_trace_selftests(void) list_del(&p->list); kfree(p); } + tracing_selftest_running = false; out: mutex_unlock(&trace_types_lock); -- cgit v1.2.1 From 7ab215f22d04067094de8c81c20ba4c565ff8dd4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 17 Feb 2020 18:52:39 +0900 Subject: tracing: Clear trace_state when starting trace Clear trace_state data structure when starting trace in __synth_event_trace_start() internal function. Currently trace_state is initialized only in the synth_event_trace_start() API, but the trace_state in synth_event_trace() and synth_event_trace_array() are on the stack without initialization. This means those APIs will see wrong parameters and wil skip closing process in __synth_event_trace_end() because trace_state->disabled may be !0. Link: http://lkml.kernel.org/r/158193315899.8868.1781259176894639952.stgit@devnote2 Reviewed-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f068d55bd37f..9d87aa1f0b79 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1824,6 +1824,8 @@ __synth_event_trace_start(struct trace_event_file *file, int entry_size, fields_size = 0; int ret = 0; + memset(trace_state, '\0', sizeof(*trace_state)); + /* * Normal event tracing doesn't get called at all unless the * ENABLED bit is set (which attaches the probe thus allowing @@ -2063,8 +2065,6 @@ int synth_event_trace_start(struct trace_event_file *file, if (!trace_state) return -EINVAL; - memset(trace_state, '\0', sizeof(*trace_state)); - ret = __synth_event_trace_start(file, trace_state); if (ret == -ENOENT) ret = 0; /* just disabled, not really an error */ -- cgit v1.2.1 From d8a953ddde5ec30a36810d0a892c3949b50849e9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Feb 2020 21:18:33 +0900 Subject: bootconfig: Set CONFIG_BOOT_CONFIG=n by default Set CONFIG_BOOT_CONFIG=n by default. This also warns user if CONFIG_BOOT_CONFIG=n but "bootconfig" is given in the kernel command line. Link: http://lkml.kernel.org/r/158220111291.26565.9036889083940367969.stgit@devnote2 Suggested-by: Steven Rostedt Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 91e885194dbc..795c3e02d3f1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -143,7 +143,8 @@ if FTRACE config BOOTTIME_TRACING bool "Boot-time Tracing support" - depends on BOOT_CONFIG && TRACING + depends on TRACING + select BOOT_CONFIG default y help Enable developer to setup ftrace subsystem via supplemental -- cgit v1.2.1 From 412c53a680a97cb1ae2c0ab60230e193bee86387 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Feb 2020 20:03:54 -0800 Subject: y2038: remove unused time32 interfaces No users remain, so kill these off before we grow new ones. Link: http://lkml.kernel.org/r/20200110154232.4104492-3-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Thomas Gleixner Cc: Deepa Dinamani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 64 ------------------------------------------------------ kernel/time/time.c | 43 ------------------------------------ 2 files changed, 107 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 95005f849c68..843dd17e6078 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -26,70 +26,6 @@ #include -static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) -{ - return (!access_ok(ctv, sizeof(*ctv)) || - __get_user(tv->tv_sec, &ctv->tv_sec) || - __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; -} - -static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv) -{ - return (!access_ok(ctv, sizeof(*ctv)) || - __put_user(tv->tv_sec, &ctv->tv_sec) || - __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; -} - -static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts) -{ - return (!access_ok(cts, sizeof(*cts)) || - __get_user(ts->tv_sec, &cts->tv_sec) || - __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; -} - -static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts) -{ - return (!access_ok(cts, sizeof(*cts)) || - __put_user(ts->tv_sec, &cts->tv_sec) || - __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; -} - -int compat_get_timeval(struct timeval *tv, const void __user *utv) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0; - else - return __compat_get_timeval(tv, utv); -} -EXPORT_SYMBOL_GPL(compat_get_timeval); - -int compat_put_timeval(const struct timeval *tv, void __user *utv) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0; - else - return __compat_put_timeval(tv, utv); -} -EXPORT_SYMBOL_GPL(compat_put_timeval); - -int compat_get_timespec(struct timespec *ts, const void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_get_timespec(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_get_timespec); - -int compat_put_timespec(const struct timespec *ts, void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_put_timespec(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_put_timespec); - #ifdef __ARCH_WANT_SYS_SIGPROCMASK /* diff --git a/kernel/time/time.c b/kernel/time/time.c index cdd7386115ff..3985b2b32d08 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -449,49 +449,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); -/** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. - */ -struct timespec ns_to_timespec(const s64 nsec) -{ - struct timespec ts; - s32 rem; - - if (!nsec) - return (struct timespec) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; - } - ts.tv_nsec = rem; - - return ts; -} -EXPORT_SYMBOL(ns_to_timespec); - -/** - * ns_to_timeval - Convert nanoseconds to timeval - * @nsec: the nanoseconds value to be converted - * - * Returns the timeval representation of the nsec parameter. - */ -struct timeval ns_to_timeval(const s64 nsec) -{ - struct timespec ts = ns_to_timespec(nsec); - struct timeval tv; - - tv.tv_sec = ts.tv_sec; - tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; - - return tv; -} -EXPORT_SYMBOL(ns_to_timeval); - struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) { struct timespec64 ts = ns_to_timespec64(nsec); -- cgit v1.2.1 From 2ad3e17ebf94b7b7f3f64c050ff168f9915345eb Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Sat, 22 Feb 2020 20:36:47 -0500 Subject: audit: fix error handling in audit_data_to_entry() Commit 219ca39427bf ("audit: use union for audit_field values since they are mutually exclusive") combined a number of separate fields in the audit_field struct into a single union. Generally this worked just fine because they are generally mutually exclusive. Unfortunately in audit_data_to_entry() the overlap can be a problem when a specific error case is triggered that causes the error path code to attempt to cleanup an audit_field struct and the cleanup involves attempting to free a stored LSM string (the lsm_str field). Currently the code always has a non-NULL value in the audit_field.lsm_str field as the top of the for-loop transfers a value into audit_field.val (both .lsm_str and .val are part of the same union); if audit_data_to_entry() fails and the audit_field struct is specified to contain a LSM string, but the audit_field.lsm_str has not yet been properly set, the error handling code will attempt to free the bogus audit_field.lsm_str value that was set with audit_field.val at the top of the for-loop. This patch corrects this by ensuring that the audit_field.val is only set when needed (it is cleared when the audit_field struct is allocated with kcalloc()). It also corrects a few other issues to ensure that in case of error the proper error code is returned. Cc: stable@vger.kernel.org Fixes: 219ca39427bf ("audit: use union for audit_field values since they are mutually exclusive") Reported-by: syzbot+1f4d90ead370d72e450b@syzkaller.appspotmail.com Signed-off-by: Paul Moore --- kernel/auditfilter.c | 71 +++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b0126e9c0743..026e34da4ace 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -456,6 +456,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, bufp = data->buf; for (i = 0; i < data->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; + u32 f_val; err = -EINVAL; @@ -464,12 +465,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; f->type = data->fields[i]; - f->val = data->values[i]; + f_val = data->values[i]; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { + if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) { f->type = AUDIT_LOGINUID_SET; - f->val = 0; + f_val = 0; entry->rule.pflags |= AUDIT_LOGINUID_LEGACY; } @@ -485,7 +486,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SUID: case AUDIT_FSUID: case AUDIT_OBJ_UID: - f->uid = make_kuid(current_user_ns(), f->val); + f->uid = make_kuid(current_user_ns(), f_val); if (!uid_valid(f->uid)) goto exit_free; break; @@ -494,11 +495,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SGID: case AUDIT_FSGID: case AUDIT_OBJ_GID: - f->gid = make_kgid(current_user_ns(), f->val); + f->gid = make_kgid(current_user_ns(), f_val); if (!gid_valid(f->gid)) goto exit_free; break; case AUDIT_ARCH: + f->val = f_val; entry->rule.arch_f = f; break; case AUDIT_SUBJ_USER: @@ -511,11 +513,13 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - + } + entry->rule.buflen += f_val; + f->lsm_str = str; err = security_audit_rule_init(f->type, f->op, str, (void **)&f->lsm_rule); /* Keep currently invalid fields around in case they @@ -524,68 +528,71 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, pr_warn("audit rule for LSM \'%s\' is invalid\n", str); err = 0; - } - if (err) { - kfree(str); + } else if (err) goto exit_free; - } else - f->lsm_str = str; break; case AUDIT_WATCH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - - err = audit_to_watch(&entry->rule, str, f->val, f->op); + } + err = audit_to_watch(&entry->rule, str, f_val, f->op); if (err) { kfree(str); goto exit_free; } + entry->rule.buflen += f_val; break; case AUDIT_DIR: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - + } err = audit_make_tree(&entry->rule, str, f->op); kfree(str); if (err) goto exit_free; + entry->rule.buflen += f_val; break; case AUDIT_INODE: + f->val = f_val; err = audit_to_inode(&entry->rule, f); if (err) goto exit_free; break; case AUDIT_FILTERKEY: - if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) + if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN) goto exit_free; - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; + } + entry->rule.buflen += f_val; entry->rule.filterkey = str; break; case AUDIT_EXE: - if (entry->rule.exe || f->val > PATH_MAX) + if (entry->rule.exe || f_val > PATH_MAX) goto exit_free; - str = audit_unpack_string(&bufp, &remain, f->val); + str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } - entry->rule.buflen += f->val; - - audit_mark = audit_alloc_mark(&entry->rule, str, f->val); + audit_mark = audit_alloc_mark(&entry->rule, str, f_val); if (IS_ERR(audit_mark)) { kfree(str); err = PTR_ERR(audit_mark); goto exit_free; } + entry->rule.buflen += f_val; entry->rule.exe = audit_mark; break; + default: + f->val = f_val; + break; } } -- cgit v1.2.1 From 756125289285f6e55a03861bf4b6257aa3d19a93 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 24 Feb 2020 16:38:57 -0500 Subject: audit: always check the netlink payload length in audit_receive_msg() This patch ensures that we always check the netlink payload length in audit_receive_msg() before we take any action on the payload itself. Cc: stable@vger.kernel.org Reported-by: syzbot+399c44bf1f43b8747403@syzkaller.appspotmail.com Reported-by: syzbot+e4b12d8d202701f08b6d@syzkaller.appspotmail.com Signed-off-by: Paul Moore --- kernel/audit.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 17b0d523afb3..9ddfe2aa6671 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1101,13 +1101,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature audit_log_end(ab); } -static int audit_set_feature(struct sk_buff *skb) +static int audit_set_feature(struct audit_features *uaf) { - struct audit_features *uaf; int i; BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); - uaf = nlmsg_data(nlmsg_hdr(skb)); /* if there is ever a version 2 we should handle that here */ @@ -1175,6 +1173,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; void *data; + int data_len; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; @@ -1188,6 +1187,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); + data_len = nlmsg_len(nlh); switch (msg_type) { case AUDIT_GET: { @@ -1211,7 +1211,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_status s; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); if (s.mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(s.enabled); if (err < 0) @@ -1315,7 +1315,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; break; case AUDIT_SET_FEATURE: - err = audit_set_feature(skb); + if (data_len < sizeof(struct audit_features)) + return -EINVAL; + err = audit_set_feature(data); if (err) return err; break; @@ -1327,6 +1329,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) err = audit_filter(msg_type, AUDIT_FILTER_USER); if (err == 1) { /* match or error */ + char *str = data; + err = 0; if (msg_type == AUDIT_USER_TTY) { err = tty_audit_push(); @@ -1334,26 +1338,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } audit_log_user_recv_msg(&ab, msg_type); - if (msg_type != AUDIT_USER_TTY) + if (msg_type != AUDIT_USER_TTY) { + /* ensure NULL termination */ + str[data_len - 1] = '\0'; audit_log_format(ab, " msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, - (char *)data); - else { - int size; - + str); + } else { audit_log_format(ab, " data="); - size = nlmsg_len(nlh); - if (size > 0 && - ((unsigned char *)data)[size - 1] == '\0') - size--; - audit_log_n_untrustedstring(ab, data, size); + if (data_len > 0 && str[data_len - 1] == '\0') + data_len--; + audit_log_n_untrustedstring(ab, str, data_len); } audit_log_end(ab); } break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: - if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) + if (data_len < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(audit_context(), &ab, @@ -1365,7 +1367,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_end(ab); return -EPERM; } - err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh)); + err = audit_rule_change(msg_type, seq, data, data_len); break; case AUDIT_LIST_RULES: err = audit_list_rules_send(skb, seq); @@ -1380,7 +1382,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_MAKE_EQUIV: { void *bufp = data; u32 sizes[2]; - size_t msglen = nlmsg_len(nlh); + size_t msglen = data_len; char *old, *new; err = -EINVAL; @@ -1456,7 +1458,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); /* check if new data is valid */ if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) -- cgit v1.2.1 From c780e86dd48ef6467a1146cf7d0fe1e05a635039 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 6 Feb 2020 15:28:12 +0100 Subject: blktrace: Protect q->blk_trace with RCU KASAN is reporting that __blk_add_trace() has a use-after-free issue when accessing q->blk_trace. Indeed the switching of block tracing (and thus eventual freeing of q->blk_trace) is completely unsynchronized with the currently running tracing and thus it can happen that the blk_trace structure is being freed just while __blk_add_trace() works on it. Protect accesses to q->blk_trace by RCU during tracing and make sure we wait for the end of RCU grace period when shutting down tracing. Luckily that is rare enough event that we can afford that. Note that postponing the freeing of blk_trace to an RCU callback should better be avoided as it could have unexpected user visible side-effects as debugfs files would be still existing for a short while block tracing has been shut down. Link: https://bugzilla.kernel.org/show_bug.cgi?id=205711 CC: stable@vger.kernel.org Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Tested-by: Ming Lei Reviewed-by: Bart Van Assche Reported-by: Tristan Madani Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 114 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 0735ae8545d8..4560878f0bac 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -335,6 +335,7 @@ static void put_probe_ref(void) static void blk_trace_cleanup(struct blk_trace *bt) { + synchronize_rcu(); blk_trace_free(bt); put_probe_ref(); } @@ -629,8 +630,10 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, static int __blk_trace_startstop(struct request_queue *q, int start) { int ret; - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (bt == NULL) return -EINVAL; @@ -740,8 +743,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) void blk_trace_shutdown(struct request_queue *q) { mutex_lock(&q->blk_trace_mutex); - - if (q->blk_trace) { + if (rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } @@ -752,8 +755,10 @@ void blk_trace_shutdown(struct request_queue *q) #ifdef CONFIG_BLK_CGROUP static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + /* We don't use the 'bt' value here except as an optimization... */ + bt = rcu_dereference_protected(q->blk_trace, 1); if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return 0; @@ -796,10 +801,14 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) static void blk_add_trace_rq(struct request *rq, int error, unsigned int nr_bytes, u32 what, u64 cgid) { - struct blk_trace *bt = rq->q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(rq->q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } if (blk_rq_is_passthrough(rq)) what |= BLK_TC_ACT(BLK_TC_PC); @@ -808,6 +817,7 @@ static void blk_add_trace_rq(struct request *rq, int error, __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), rq->cmd_flags, what, error, 0, NULL, cgid); + rcu_read_unlock(); } static void blk_add_trace_rq_insert(void *ignore, @@ -853,14 +863,19 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, u32 what, int error) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, what, error, 0, NULL, blk_trace_bio_get_cgid(q, bio)); + rcu_read_unlock(); } static void blk_add_trace_bio_bounce(void *ignore, @@ -905,11 +920,14 @@ static void blk_add_trace_getrq(void *ignore, if (bio) blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); else { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, NULL, 0); + rcu_read_unlock(); } } @@ -921,27 +939,35 @@ static void blk_add_trace_sleeprq(void *ignore, if (bio) blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); else { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, 0, 0, NULL, 0); + rcu_read_unlock(); } } static void blk_add_trace_plug(void *ignore, struct request_queue *q) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); + rcu_read_unlock(); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, unsigned int depth, bool explicit) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); u32 what; @@ -953,14 +979,17 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } + rcu_read_unlock(); } static void blk_add_trace_split(void *ignore, struct request_queue *q, struct bio *bio, unsigned int pdu) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(pdu); @@ -969,6 +998,7 @@ static void blk_add_trace_split(void *ignore, BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), &rpdu, blk_trace_bio_get_cgid(q, bio)); } + rcu_read_unlock(); } /** @@ -988,11 +1018,15 @@ static void blk_add_trace_bio_remap(void *ignore, struct request_queue *q, struct bio *bio, dev_t dev, sector_t from) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct blk_io_trace_remap r; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(bio_dev(bio)); @@ -1001,6 +1035,7 @@ static void blk_add_trace_bio_remap(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); + rcu_read_unlock(); } /** @@ -1021,11 +1056,15 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, sector_t from) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct blk_io_trace_remap r; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); @@ -1034,6 +1073,7 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), 0, BLK_TA_REMAP, 0, sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); + rcu_read_unlock(); } /** @@ -1051,14 +1091,19 @@ void blk_add_driver_data(struct request_queue *q, struct request *rq, void *data, size_t len) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, 0, len, data, blk_trace_request_get_cgid(q, rq)); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1597,6 +1642,7 @@ static int blk_trace_remove_queue(struct request_queue *q) return -EINVAL; put_probe_ref(); + synchronize_rcu(); blk_trace_free(bt); return 0; } @@ -1758,6 +1804,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct hd_struct *p = dev_to_part(dev); struct request_queue *q; struct block_device *bdev; + struct blk_trace *bt; ssize_t ret = -ENXIO; bdev = bdget(part_devt(p)); @@ -1770,21 +1817,23 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, mutex_lock(&q->blk_trace_mutex); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (attr == &dev_attr_enable) { - ret = sprintf(buf, "%u\n", !!q->blk_trace); + ret = sprintf(buf, "%u\n", !!bt); goto out_unlock_bdev; } - if (q->blk_trace == NULL) + if (bt == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) - ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); + ret = blk_trace_mask2str(buf, bt->act_mask); else if (attr == &dev_attr_pid) - ret = sprintf(buf, "%u\n", q->blk_trace->pid); + ret = sprintf(buf, "%u\n", bt->pid); else if (attr == &dev_attr_start_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); + ret = sprintf(buf, "%llu\n", bt->start_lba); else if (attr == &dev_attr_end_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: mutex_unlock(&q->blk_trace_mutex); @@ -1801,6 +1850,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct block_device *bdev; struct request_queue *q; struct hd_struct *p; + struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; @@ -1831,8 +1881,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, mutex_lock(&q->blk_trace_mutex); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (attr == &dev_attr_enable) { - if (!!value == !!q->blk_trace) { + if (!!value == !!bt) { ret = 0; goto out_unlock_bdev; } @@ -1844,18 +1896,18 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } ret = 0; - if (q->blk_trace == NULL) + if (bt == NULL) ret = blk_trace_setup_queue(q, bdev); if (ret == 0) { if (attr == &dev_attr_act_mask) - q->blk_trace->act_mask = value; + bt->act_mask = value; else if (attr == &dev_attr_pid) - q->blk_trace->pid = value; + bt->pid = value; else if (attr == &dev_attr_start_lba) - q->blk_trace->start_lba = value; + bt->start_lba = value; else if (attr == &dev_attr_end_lba) - q->blk_trace->end_lba = value; + bt->end_lba = value; } out_unlock_bdev: -- cgit v1.2.1 From 2910b5aa6f545c044173a5cab3dbb7f43e23916d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 25 Feb 2020 23:36:41 +0900 Subject: bootconfig: Fix CONFIG_BOOTTIME_TRACING dependency issue Since commit d8a953ddde5e ("bootconfig: Set CONFIG_BOOT_CONFIG=n by default") also changed the CONFIG_BOOTTIME_TRACING to select CONFIG_BOOT_CONFIG to show the boot-time tracing on the menu, it introduced wrong dependencies with BLK_DEV_INITRD as below. WARNING: unmet direct dependencies detected for BOOT_CONFIG Depends on [n]: BLK_DEV_INITRD [=n] Selected by [y]: - BOOTTIME_TRACING [=y] && TRACING_SUPPORT [=y] && FTRACE [=y] && TRACING [=y] This makes the CONFIG_BOOT_CONFIG selects CONFIG_BLK_DEV_INITRD to fix this error and make CONFIG_BOOTTIME_TRACING=n by default, so that both boot-time tracing and boot configuration off but those appear on the menu list. Link: http://lkml.kernel.org/r/158264140162.23842.11237423518607465535.stgit@devnote2 Fixes: d8a953ddde5e ("bootconfig: Set CONFIG_BOOT_CONFIG=n by default") Reported-by: Randy Dunlap Compiled-tested-by: Randy Dunlap Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 795c3e02d3f1..402eef84c859 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -145,7 +145,6 @@ config BOOTTIME_TRACING bool "Boot-time Tracing support" depends on TRACING select BOOT_CONFIG - default y help Enable developer to setup ftrace subsystem via supplemental kernel cmdline at boot time for debugging (tracing) driver -- cgit v1.2.1 From fda31c50292a5062332fa0343c084bd9f46604d9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 24 Feb 2020 12:47:14 -0800 Subject: signal: avoid double atomic counter increments for user accounting When queueing a signal, we increment both the users count of pending signals (for RLIMIT_SIGPENDING tracking) and we increment the refcount of the user struct itself (because we keep a reference to the user in the signal structure in order to correctly account for it when freeing). That turns out to be fairly expensive, because both of them are atomic updates, and particularly under extreme signal handling pressure on big machines, you can get a lot of cache contention on the user struct. That can then cause horrid cacheline ping-pong when you do these multiple accesses. So change the reference counting to only pin the user for the _first_ pending signal, and to unpin it when the last pending signal is dequeued. That means that when a user sees a lot of concurrent signal queuing - which is the only situation when this matters - the only atomic access needed is generally the 'sigpending' count update. This was noticed because of a particularly odd timing artifact on a dual-socket 96C/192T Cascade Lake platform: when you get into bad contention, on that machine for some reason seems to be much worse when the contention happens in the upper 32-byte half of the cacheline. As a result, the kernel test robot will-it-scale 'signal1' benchmark had an odd performance regression simply due to random alignment of the 'struct user_struct' (and pointed to a completely unrelated and apparently nonsensical commit for the regression). Avoiding the double increments (and decrements on the dequeueing side, of course) makes for much less contention and hugely improved performance on that will-it-scale microbenchmark. Quoting Feng Tang: "It makes a big difference, that the performance score is tripled! bump from original 17000 to 54000. Also the gap between 5.0-rc6 and 5.0-rc6+Jiri's patch is reduced to around 2%" [ The "2% gap" is the odd cacheline placement difference on that platform: under the extreme contention case, the effect of which half of the cacheline was hot was 5%, so with the reduced contention the odd timing artifact is reduced too ] It does help in the non-contended case too, but is not nearly as noticeable. Reported-and-tested-by: Feng Tang Cc: Eric W. Biederman Cc: Huang, Ying Cc: Philip Li Cc: Andi Kleen Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/signal.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 9ad8dea93dbb..5b2396350dd1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -413,27 +413,32 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi { struct sigqueue *q = NULL; struct user_struct *user; + int sigpending; /* * Protect access to @t credentials. This can go away when all * callers hold rcu read lock. + * + * NOTE! A pending signal will hold on to the user refcount, + * and we get/put the refcount only when the sigpending count + * changes from/to zero. */ rcu_read_lock(); - user = get_uid(__task_cred(t)->user); - atomic_inc(&user->sigpending); + user = __task_cred(t)->user; + sigpending = atomic_inc_return(&user->sigpending); + if (sigpending == 1) + get_uid(user); rcu_read_unlock(); - if (override_rlimit || - atomic_read(&user->sigpending) <= - task_rlimit(t, RLIMIT_SIGPENDING)) { + if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } if (unlikely(q == NULL)) { - atomic_dec(&user->sigpending); - free_uid(user); + if (atomic_dec_and_test(&user->sigpending)) + free_uid(user); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; @@ -447,8 +452,8 @@ static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; - atomic_dec(&q->user->sigpending); - free_uid(q->user); + if (atomic_dec_and_test(&q->user->sigpending)) + free_uid(q->user); kmem_cache_free(sigqueue_cachep, q); } -- cgit v1.2.1 From 289de35984815576793f579ec27248609e75976e Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 18 Feb 2020 15:45:34 +0100 Subject: sched/fair: Fix statistics for find_idlest_group() sgs->group_weight is not set while gathering statistics in update_sg_wakeup_stats(). This means that a group can be classified as fully busy with 0 running tasks if utilization is high enough. This path is mainly used for fork and exec. Fixes: 57abff067a08 ("sched/fair: Rework find_idlest_group()") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20200218144534.4564-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c8a379c357e..c1217bfe5e81 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8337,6 +8337,8 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_capacity = group->sgc->capacity; + sgs->group_weight = group->group_weight; + sgs->group_type = group_classify(sd->imbalance_pct, group, sgs); /* -- cgit v1.2.1 From 0c282b068eb26db0e85e2ab4ec6d1e932acda841 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Mon, 27 Jan 2020 23:28:21 +0530 Subject: fork: Use RCU_INIT_POINTER() instead of rcu_access_pointer() Use RCU_INIT_POINTER() instead of rcu_access_pointer() in copy_sighand(). Suggested-by: Oleg Nesterov Signed-off-by: Madhuparna Bhowmik Acked-by: Oleg Nesterov Acked-by: Christian Brauner [christian.brauner@ubuntu.com: edit commit message] Link: https://lore.kernel.org/r/20200127175821.10833-1-madhuparnabhowmik10@gmail.com Signed-off-by: Christian Brauner --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 60a1295f4384..86425305cd4a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1508,7 +1508,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); - rcu_assign_pointer(tsk->sighand, sig); + RCU_INIT_POINTER(tsk->sighand, sig); if (!sig) return -ENOMEM; -- cgit v1.2.1 From 22a34c6fe0ffc1d92ee26a25913fadf347258fd6 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Thu, 30 Jan 2020 11:50:28 +0530 Subject: exit: Fix Sparse errors and warnings This patch fixes the following sparse error: kernel/exit.c:627:25: error: incompatible types in comparison expression And the following warning: kernel/exit.c:626:40: warning: incorrect type in assignment Signed-off-by: Madhuparna Bhowmik Acked-by: Oleg Nesterov Acked-by: Christian Brauner [christian.brauner@ubuntu.com: edit commit message] Link: https://lore.kernel.org/r/20200130062028.4870-1-madhuparnabhowmik10@gmail.com Signed-off-by: Christian Brauner --- kernel/exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2833ffb0c211..0b81b26a872a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -619,8 +619,8 @@ static void forget_original_parent(struct task_struct *father, reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { - t->real_parent = reaper; - BUG_ON((!t->ptrace) != (t->parent == father)); + RCU_INIT_POINTER(t->real_parent, reaper); + BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) -- cgit v1.2.1 From 153031a301bb07194e9c37466cfce8eacb977621 Mon Sep 17 00:00:00 2001 From: Cengiz Can Date: Wed, 4 Mar 2020 13:58:19 +0300 Subject: blktrace: fix dereference after null check There was a recent change in blktrace.c that added a RCU protection to `q->blk_trace` in order to fix a use-after-free issue during access. However the change missed an edge case that can lead to dereferencing of `bt` pointer even when it's NULL: Coverity static analyzer marked this as a FORWARD_NULL issue with CID 1460458. ``` /kernel/trace/blktrace.c: 1904 in sysfs_blk_trace_attr_store() 1898 ret = 0; 1899 if (bt == NULL) 1900 ret = blk_trace_setup_queue(q, bdev); 1901 1902 if (ret == 0) { 1903 if (attr == &dev_attr_act_mask) >>> CID 1460458: Null pointer dereferences (FORWARD_NULL) >>> Dereferencing null pointer "bt". 1904 bt->act_mask = value; 1905 else if (attr == &dev_attr_pid) 1906 bt->pid = value; 1907 else if (attr == &dev_attr_start_lba) 1908 bt->start_lba = value; 1909 else if (attr == &dev_attr_end_lba) ``` Added a reassignment with RCU annotation to fix the issue. Fixes: c780e86dd48 ("blktrace: Protect q->blk_trace with RCU") Cc: stable@vger.kernel.org Reviewed-by: Ming Lei Reviewed-by: Bob Liu Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Cengiz Can Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4560878f0bac..ca39dc3230cb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1896,8 +1896,11 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } ret = 0; - if (bt == NULL) + if (bt == NULL) { ret = blk_trace_setup_queue(q, bdev); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); + } if (ret == 0) { if (attr == &dev_attr_act_mask) -- cgit v1.2.1