summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2016-02-12 15:55:28 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2016-02-12 15:55:28 +1100
commit92ddf11b0812f11a93c9449a8ff54e5d1637c9ac (patch)
tree152bc58c1bb4e7b7fcecd062c99a3565f179f096
parent7b7cc3897f3ba37e9f9bc1e5af42783b309299b2 (diff)
parent234d02a19863f0e35cbd946bb9371bc826b95435 (diff)
downloadlinux-next-92ddf11b0812f11a93c9449a8ff54e5d1637c9ac.tar.gz
Merge branch 'akpm-current/current'
-rw-r--r--Documentation/blockdev/zram.txt10
-rw-r--r--Documentation/cgroup-v2.txt19
-rw-r--r--Documentation/filesystems/ocfs2-online-filecheck.txt94
-rw-r--r--Documentation/kcov.txt111
-rw-r--r--Documentation/kernel-parameters.txt17
-rw-r--r--Documentation/memory-hotplug.txt23
-rw-r--r--Documentation/printk-formats.txt18
-rw-r--r--Documentation/rapidio/mport_cdev.txt104
-rw-r--r--Documentation/rapidio/tsi721.txt9
-rw-r--r--Documentation/vm/ksm.txt63
-rw-r--r--Documentation/vm/page_owner.txt9
-rw-r--r--Makefile11
-rw-r--r--arch/Kconfig4
-rw-r--r--arch/alpha/include/asm/uaccess.h10
-rw-r--r--arch/alpha/mm/Makefile2
-rw-r--r--arch/alpha/mm/extable.c92
-rw-r--r--arch/arm/Kconfig1
-rw-r--r--arch/arm/include/asm/page.h2
-rw-r--r--arch/arm/kernel/smp.c2
-rw-r--r--arch/arm/mach-lpc32xx/phy3250.c13
-rw-r--r--arch/arm/mach-netx/fb.c14
-rw-r--r--arch/arm/mach-nspire/clcd.c13
-rw-r--r--arch/arm/mm/mmap.c2
-rw-r--r--arch/arm64/include/asm/assembler.h15
-rw-r--r--arch/arm64/include/asm/futex.h12
-rw-r--r--arch/arm64/include/asm/uaccess.h30
-rw-r--r--arch/arm64/include/asm/word-at-a-time.h7
-rw-r--r--arch/arm64/kernel/armv8_deprecated.c7
-rw-r--r--arch/arm64/mm/extable.c2
-rw-r--r--arch/arm64/mm/mmap.c4
-rw-r--r--arch/avr32/Kconfig1
-rw-r--r--arch/blackfin/Kconfig1
-rw-r--r--arch/cris/Kconfig1
-rw-r--r--arch/ia64/include/asm/uaccess.h8
-rw-r--r--arch/ia64/mm/extable.c97
-rw-r--r--arch/m32r/mm/init.c27
-rw-r--r--arch/mips/Kconfig1
-rw-r--r--arch/mips/mm/mmap.c4
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/kernel/process.c4
-rw-r--r--arch/powerpc/kernel/rtasd.c9
-rw-r--r--arch/powerpc/mm/mmap.c4
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-cpu.c10
-rw-r--r--arch/powerpc/sysdev/fsl_rio.c18
-rw-r--r--arch/s390/Kconfig1
-rw-r--r--arch/s390/include/asm/uaccess.h8
-rw-r--r--arch/s390/kernel/dumpstack.c6
-rw-r--r--arch/s390/kernel/time.c8
-rw-r--r--arch/s390/kernel/topology.c7
-rw-r--r--arch/s390/mm/Makefile2
-rw-r--r--arch/s390/mm/extable.c85
-rw-r--r--arch/s390/mm/vmem.c10
-rw-r--r--arch/sh/Kconfig1
-rw-r--r--arch/sparc/Kconfig1
-rw-r--r--arch/sparc/include/asm/compat.h7
-rw-r--r--arch/sparc/include/asm/syscall.h9
-rw-r--r--arch/sparc/kernel/sys_sparc_64.c2
-rw-r--r--arch/tile/Kconfig1
-rw-r--r--arch/tile/mm/init.c11
-rw-r--r--arch/x86/Kconfig8
-rw-r--r--arch/x86/boot/Makefile6
-rw-r--r--arch/x86/boot/compressed/Makefile2
-rw-r--r--arch/x86/entry/vdso/Makefile2
-rw-r--r--arch/x86/include/asm/compat.h3
-rw-r--r--arch/x86/include/asm/ftrace.h2
-rw-r--r--arch/x86/include/asm/uaccess.h5
-rw-r--r--arch/x86/kernel/Makefile5
-rw-r--r--arch/x86/kernel/aperture_64.c12
-rw-r--r--arch/x86/kernel/apic/Makefile4
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile4
-rw-r--r--arch/x86/kernel/dumpstack.c5
-rw-r--r--arch/x86/kernel/machine_kexec_64.c46
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/lib/Makefile3
-rw-r--r--arch/x86/mm/Makefile3
-rw-r--r--arch/x86/mm/extable.c106
-rw-r--r--arch/x86/mm/init.c36
-rw-r--r--arch/x86/mm/mmap.c6
-rw-r--r--arch/x86/mm/pageattr.c14
-rw-r--r--arch/x86/realmode/rm/Makefile2
-rw-r--r--block/genhd.c2
-rw-r--r--block/partition-generic.c11
-rw-r--r--drivers/ata/pata_hpt366.c13
-rw-r--r--drivers/base/memory.c34
-rw-r--r--drivers/base/property.c12
-rw-r--r--drivers/block/zram/zcomp.c24
-rw-r--r--drivers/block/zram/zcomp.h2
-rw-r--r--drivers/block/zram/zram_drv.c7
-rw-r--r--drivers/char/random.c22
-rw-r--r--drivers/dma/iop-adma.c8
-rw-r--r--drivers/dma/mv_xor.c4
-rw-r--r--drivers/dma/qcom_bam_dma.c14
-rw-r--r--drivers/firewire/core-cdev.c4
-rw-r--r--drivers/firmware/efi/efivars.c2
-rw-r--r--drivers/firmware/efi/libstub/Makefile2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_chardev.c2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_process.c2
-rw-r--r--drivers/gpu/drm/drm_edid_load.c17
-rw-r--r--drivers/gpu/drm/drm_gem_cma_helper.c13
-rw-r--r--drivers/gpu/drm/etnaviv/etnaviv_gpu.c8
-rw-r--r--drivers/gpu/drm/omapdrm/omap_dmm_tiler.c13
-rw-r--r--drivers/gpu/drm/omapdrm/omap_gem.c8
-rw-r--r--drivers/gpu/drm/sti/sti_cursor.c20
-rw-r--r--drivers/gpu/drm/sti/sti_gdp.c3
-rw-r--r--drivers/gpu/drm/sti/sti_hqvdp.c6
-rw-r--r--drivers/gpu/drm/tegra/gem.c11
-rw-r--r--drivers/gpu/drm/vc4/vc4_bo.c5
-rw-r--r--drivers/gpu/host1x/cdma.c8
-rw-r--r--drivers/gpu/host1x/job.c10
-rw-r--r--drivers/hid/uhid.c2
-rw-r--r--drivers/ide/hpt366.c9
-rw-r--r--drivers/input/input-compat.h12
-rw-r--r--drivers/media/platform/coda/coda-bit.c10
-rw-r--r--drivers/net/rionet.c277
-rw-r--r--drivers/net/wireless/marvell/mwifiex/debugfs.c10
-rw-r--r--drivers/pinctrl/pinmux.c13
-rw-r--r--drivers/power/ab8500_btemp.c15
-rw-r--r--drivers/power/ab8500_charger.c16
-rw-r--r--drivers/power/ab8500_fg.c15
-rw-r--r--drivers/power/abx500_chargalg.c14
-rw-r--r--drivers/power/charger-manager.c27
-rw-r--r--drivers/rapidio/Kconfig8
-rw-r--r--drivers/rapidio/devices/Makefile1
-rw-r--r--drivers/rapidio/devices/rio_mport_cdev.c2711
-rw-r--r--drivers/rapidio/devices/tsi721.c1034
-rw-r--r--drivers/rapidio/devices/tsi721.h87
-rw-r--r--drivers/rapidio/devices/tsi721_dma.c397
-rw-r--r--drivers/rapidio/rio-driver.c12
-rw-r--r--drivers/rapidio/rio-scan.c135
-rw-r--r--drivers/rapidio/rio.c433
-rw-r--r--drivers/rapidio/rio.h5
-rw-r--r--drivers/staging/lustre/lustre/llite/llite_internal.h2
-rw-r--r--drivers/usb/common/common.c22
-rw-r--r--drivers/video/fbdev/acornfb.c4
-rw-r--r--drivers/video/fbdev/amba-clcd-versatile.c14
-rw-r--r--drivers/video/fbdev/amba-clcd.c4
-rw-r--r--drivers/video/fbdev/atmel_lcdfb.c9
-rw-r--r--drivers/video/fbdev/ep93xx-fb.c8
-rw-r--r--drivers/video/fbdev/gbefb.c8
-rw-r--r--drivers/video/fbdev/imxfb.c12
-rw-r--r--drivers/video/fbdev/mx3fb.c9
-rw-r--r--drivers/video/fbdev/nuc900fb.c8
-rw-r--r--drivers/video/fbdev/omap/lcdc.c16
-rw-r--r--drivers/video/fbdev/pxa168fb.c8
-rw-r--r--drivers/video/fbdev/pxafb.c4
-rw-r--r--drivers/video/fbdev/s3c-fb.c7
-rw-r--r--drivers/video/fbdev/s3c2410fb.c8
-rw-r--r--drivers/video/fbdev/sa1100fb.c8
-rw-r--r--drivers/xen/Kconfig23
-rw-r--r--drivers/xen/balloon.c11
-rw-r--r--fs/autofs4/autofs_i.h72
-rw-r--r--fs/autofs4/dev-ioctl.c57
-rw-r--r--fs/autofs4/expire.c84
-rw-r--r--fs/autofs4/init.c10
-rw-r--r--fs/autofs4/inode.c52
-rw-r--r--fs/autofs4/root.c163
-rw-r--r--fs/autofs4/symlink.c11
-rw-r--r--fs/autofs4/waitq.c78
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/btrfs/tests/btrfs-tests.c3
-rw-r--r--fs/buffer.c24
-rw-r--r--fs/cifs/cifs_debug.c56
-rw-r--r--fs/cifs/cifs_debug.h2
-rw-r--r--fs/cifs/cifsfs.c6
-rw-r--r--fs/cifs/cifsglob.h4
-rw-r--r--fs/ext4/dir.c2
-rw-r--r--fs/ext4/fsync.c5
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/ocfs2/Makefile3
-rw-r--r--fs/ocfs2/alloc.c105
-rw-r--r--fs/ocfs2/aops.c1146
-rw-r--r--fs/ocfs2/aops.h19
-rw-r--r--fs/ocfs2/cluster/heartbeat.c191
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h21
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c30
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c11
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c124
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c33
-rw-r--r--fs/ocfs2/dlm/dlmthread.c7
-rw-r--r--fs/ocfs2/file.c165
-rw-r--r--fs/ocfs2/filecheck.c605
-rw-r--r--fs/ocfs2/filecheck.h48
-rw-r--r--fs/ocfs2/inode.c203
-rw-r--r--fs/ocfs2/inode.h9
-rw-r--r--fs/ocfs2/journal.c8
-rw-r--r--fs/ocfs2/localalloc.c4
-rw-r--r--fs/ocfs2/mmap.c4
-rw-r--r--fs/ocfs2/ocfs2.h8
-rw-r--r--fs/ocfs2/ocfs2_trace.h18
-rw-r--r--fs/ocfs2/quota_global.c2
-rw-r--r--fs/ocfs2/resize.c2
-rw-r--r--fs/ocfs2/stackglue.c3
-rw-r--r--fs/ocfs2/stackglue.h2
-rw-r--r--fs/ocfs2/super.c49
-rw-r--r--fs/ocfs2/super.h2
-rw-r--r--fs/proc/page.c8
-rw-r--r--fs/xfs/xfs_aops.c11
-rw-r--r--include/asm-generic/atomic-long.h6
-rw-r--r--include/linux/auto_dev-ioctl.h6
-rw-r--r--include/linux/auto_fs.h10
-rw-r--r--include/linux/buffer_head.h10
-rw-r--r--include/linux/compaction.h16
-rw-r--r--include/linux/compat.h15
-rw-r--r--include/linux/crc64_ecma.h56
-rw-r--r--include/linux/dma-mapping.h25
-rw-r--r--include/linux/err.h4
-rw-r--r--include/linux/fault-inject.h5
-rw-r--r--include/linux/gfp.h45
-rw-r--r--include/linux/hardirq.h2
-rw-r--r--include/linux/kcov.h27
-rw-r--r--include/linux/kernel.h2
-rw-r--r--include/linux/kexec.h4
-rw-r--r--include/linux/list_bl.h4
-rw-r--r--include/linux/memcontrol.h147
-rw-r--r--include/linux/memory.h3
-rw-r--r--include/linux/memory_hotplug.h7
-rw-r--r--include/linux/migrate.h6
-rw-r--r--include/linux/mm.h41
-rw-r--r--include/linux/mmdebug.h3
-rw-r--r--include/linux/mmzone.h24
-rw-r--r--include/linux/oom.h2
-rw-r--r--include/linux/page-flags-layout.h2
-rw-r--r--include/linux/page-flags.h32
-rw-r--r--include/linux/page_ext.h1
-rw-r--r--include/linux/page_owner.h50
-rw-r--r--include/linux/pagemap.h3
-rw-r--r--include/linux/percpu.h3
-rw-r--r--include/linux/poison.h4
-rw-r--r--include/linux/printk.h12
-rw-r--r--include/linux/radix-tree.h27
-rw-r--r--include/linux/random.h1
-rw-r--r--include/linux/rio.h97
-rw-r--r--include/linux/rio_drv.h15
-rw-r--r--include/linux/rio_mport_cdev.h271
-rw-r--r--include/linux/rio_regs.h3
-rw-r--r--include/linux/sched.h15
-rw-r--r--include/linux/sem.h1
-rw-r--r--include/linux/slab.h11
-rw-r--r--include/linux/slab_def.h3
-rw-r--r--include/linux/slub_def.h1
-rw-r--r--include/linux/string.h9
-rw-r--r--include/linux/swap.h1
-rw-r--r--include/linux/tick.h2
-rw-r--r--include/linux/trace_events.h10
-rw-r--r--include/linux/tracepoint-defs.h14
-rw-r--r--include/linux/unaligned/access_ok.h24
-rw-r--r--include/linux/vm_event_item.h2
-rw-r--r--include/trace/events/btrfs.h2
-rw-r--r--include/trace/events/compaction.h57
-rw-r--r--include/trace/events/gfpflags.h43
-rw-r--r--include/trace/events/huge_memory.h40
-rw-r--r--include/trace/events/kmem.h2
-rw-r--r--include/trace/events/mmflags.h165
-rw-r--r--include/trace/events/vmscan.h2
-rw-r--r--include/uapi/linux/Kbuild1
-rw-r--r--include/uapi/linux/auto_fs.h21
-rw-r--r--include/uapi/linux/auto_fs4.h17
-rw-r--r--include/uapi/linux/byteorder/big_endian.h24
-rw-r--r--include/uapi/linux/byteorder/little_endian.h24
-rw-r--r--include/uapi/linux/kcov.h10
-rw-r--r--include/uapi/linux/swab.h10
-rw-r--r--init/Kconfig49
-rw-r--r--init/main.c5
-rw-r--r--ipc/msg.c5
-rw-r--r--ipc/sem.c124
-rw-r--r--ipc/shm.c53
-rw-r--r--kernel/Makefile12
-rw-r--r--kernel/auditsc.c4
-rw-r--r--kernel/exit.c4
-rw-r--r--kernel/fork.c13
-rw-r--r--kernel/hung_task.c21
-rw-r--r--kernel/kallsyms.c42
-rw-r--r--kernel/kcov.c261
-rw-r--r--kernel/kexec.c9
-rw-r--r--kernel/kexec_core.c6
-rw-r--r--kernel/kexec_file.c8
-rw-r--r--kernel/locking/Makefile3
-rw-r--r--kernel/memremap.c4
-rw-r--r--kernel/printk/Makefile1
-rw-r--r--kernel/printk/internal.h55
-rw-r--r--kernel/printk/nmi.c222
-rw-r--r--kernel/printk/printk.c29
-rw-r--r--kernel/profile.c4
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/rcu/Makefile4
-rw-r--r--kernel/sched/Makefile4
-rw-r--r--kernel/seccomp.c4
-rw-r--r--kernel/signal.c4
-rw-r--r--kernel/time/hrtimer.c10
-rw-r--r--kernel/time/tick-sched.c10
-rw-r--r--kernel/time/timer.c11
-rw-r--r--lib/Kconfig7
-rw-r--r--lib/Kconfig.debug21
-rw-r--r--lib/Kconfig.ubsan9
-rw-r--r--lib/Makefile13
-rw-r--r--lib/bug.c11
-rw-r--r--lib/crc64_ecma.c341
-rw-r--r--lib/extable.c50
-rw-r--r--lib/kstrtox.c64
-rw-r--r--lib/nmi_backtrace.c89
-rw-r--r--lib/radix-tree.c182
-rw-r--r--lib/string.c45
-rw-r--r--lib/test_printf.c53
-rw-r--r--lib/vsprintf.c101
-rw-r--r--mm/Kconfig7
-rw-r--r--mm/Kconfig.debug53
-rw-r--r--mm/Makefile20
-rw-r--r--mm/backing-dev.c22
-rw-r--r--mm/compaction.c324
-rw-r--r--mm/debug-pagealloc.c127
-rw-r--r--mm/debug.c165
-rw-r--r--mm/failslab.c12
-rw-r--r--mm/filemap.c169
-rw-r--r--mm/huge_memory.c109
-rw-r--r--mm/internal.h31
-rw-r--r--mm/kasan/Makefile1
-rw-r--r--mm/kmemcheck.c3
-rw-r--r--mm/ksm.c733
-rw-r--r--mm/madvise.c19
-rw-r--r--mm/memblock.c11
-rw-r--r--mm/memcontrol.c223
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c26
-rw-r--r--mm/memory_hotplug.c41
-rw-r--r--mm/mempolicy.c4
-rw-r--r--mm/migrate.c23
-rw-r--r--mm/mprotect.c24
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/oom_kill.c208
-rw-r--r--mm/page-writeback.c62
-rw-r--r--mm/page_alloc.c426
-rw-r--r--mm/page_owner.c100
-rw-r--r--mm/page_poison.c158
-rw-r--r--mm/rmap.c16
-rw-r--r--mm/shmem.c37
-rw-r--r--mm/slab.c1043
-rw-r--r--mm/slab.h95
-rw-r--r--mm/slab_common.c11
-rw-r--r--mm/slob.c4
-rw-r--r--mm/slub.c298
-rw-r--r--mm/truncate.c6
-rw-r--r--mm/util.c30
-rw-r--r--mm/vmalloc.c25
-rw-r--r--mm/vmscan.c234
-rw-r--r--mm/vmstat.c17
-rw-r--r--mm/workingset.c170
-rw-r--r--net/sctp/socket.c2
-rw-r--r--net/xfrm/xfrm_user.c2
-rw-r--r--scripts/Makefile.lib6
-rw-r--r--scripts/Makefile.ubsan4
-rw-r--r--scripts/gdb/linux/proc.py41
-rw-r--r--scripts/gdb/vmlinux-gdb.py1
-rw-r--r--scripts/kallsyms.c93
-rwxr-xr-xscripts/link-vmlinux.sh6
-rwxr-xr-xscripts/namespace.pl2
-rw-r--r--scripts/sortextable.c2
-rw-r--r--sound/arm/pxa2xx-pcm-lib.c12
-rw-r--r--sound/drivers/pcsp/pcsp.c9
-rw-r--r--sound/soc/fsl/imx-pcm-fiq.c10
-rw-r--r--sound/soc/nuc900/nuc900-pcm.c6
-rw-r--r--sound/soc/omap/omap-pcm.c12
-rw-r--r--tools/perf/builtin-kmem.c49
-rw-r--r--tools/testing/radix-tree/.gitignore2
-rw-r--r--tools/testing/radix-tree/Makefile20
-rw-r--r--tools/testing/radix-tree/find_next_bit.c57
-rw-r--r--tools/testing/radix-tree/linux.c60
-rw-r--r--tools/testing/radix-tree/linux/bitops.h150
-rw-r--r--tools/testing/radix-tree/linux/bitops/__ffs.h43
-rw-r--r--tools/testing/radix-tree/linux/bitops/ffs.h41
-rw-r--r--tools/testing/radix-tree/linux/bitops/ffz.h12
-rw-r--r--tools/testing/radix-tree/linux/bitops/find.h13
-rw-r--r--tools/testing/radix-tree/linux/bitops/fls.h41
-rw-r--r--tools/testing/radix-tree/linux/bitops/fls64.h14
-rw-r--r--tools/testing/radix-tree/linux/bitops/hweight.h11
-rw-r--r--tools/testing/radix-tree/linux/bitops/le.h53
-rw-r--r--tools/testing/radix-tree/linux/bitops/non-atomic.h111
-rw-r--r--tools/testing/radix-tree/linux/bug.h1
-rw-r--r--tools/testing/radix-tree/linux/cpu.h35
-rw-r--r--tools/testing/radix-tree/linux/export.h2
-rw-r--r--tools/testing/radix-tree/linux/gfp.h8
-rw-r--r--tools/testing/radix-tree/linux/kernel.h35
-rw-r--r--tools/testing/radix-tree/linux/kmemleak.h1
-rw-r--r--tools/testing/radix-tree/linux/mempool.h17
-rw-r--r--tools/testing/radix-tree/linux/notifier.h8
-rw-r--r--tools/testing/radix-tree/linux/percpu.h7
-rw-r--r--tools/testing/radix-tree/linux/preempt.h5
-rw-r--r--tools/testing/radix-tree/linux/radix-tree.h1
-rw-r--r--tools/testing/radix-tree/linux/rcupdate.h9
-rw-r--r--tools/testing/radix-tree/linux/slab.h28
-rw-r--r--tools/testing/radix-tree/linux/types.h28
-rw-r--r--tools/testing/radix-tree/main.c272
-rw-r--r--tools/testing/radix-tree/rcupdate.c86
-rw-r--r--tools/testing/radix-tree/regression.h8
-rw-r--r--tools/testing/radix-tree/regression1.c221
-rw-r--r--tools/testing/radix-tree/regression2.c126
-rw-r--r--tools/testing/radix-tree/regression3.c86
-rw-r--r--tools/testing/radix-tree/tag_check.c332
-rw-r--r--tools/testing/radix-tree/test.c219
-rw-r--r--tools/testing/radix-tree/test.h40
-rw-r--r--tools/vm/page-types.c125
401 files changed, 16605 insertions, 5074 deletions
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 5bda5031c83d..f525b1663efa 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -227,6 +227,16 @@ line of text and contains the following stats separated by whitespace:
mem_used_max
zero_pages
num_migrated
+ avail_streams
+
+`avail_streams' column shows the current number of available compression
+streams, which is not necessarily equal to the max number of compression
+streams. The max number of compression streams can be set too high and
+can be unreachable (depending on the load and the usage pattern, of
+course). `avail_streams' permits finding out the real 'level of
+concurrency' that a particular zram device saw and to calculate the real
+memory consumption by allocated compression streams, not the theoretical
+maximum value.
9) Deactivate:
swapoff /dev/zram0
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index ff49cf901148..e2f4e7948a66 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -843,6 +843,15 @@ PAGE_SIZE multiple when read back.
Amount of memory used to cache filesystem data,
including tmpfs and shared memory.
+ kernel_stack
+
+ Amount of memory allocated to kernel stacks.
+
+ slab
+
+ Amount of memory used for storing in-kernel data
+ structures.
+
sock
Amount of memory used in network transmission buffers
@@ -871,6 +880,16 @@ PAGE_SIZE multiple when read back.
on the internal memory management lists used by the
page reclaim algorithm
+ slab_reclaimable
+
+ Part of "slab" that might be reclaimed, such as
+ dentries and inodes.
+
+ slab_unreclaimable
+
+ Part of "slab" that cannot be reclaimed on memory
+ pressure.
+
pgfault
Total number of page faults incurred
diff --git a/Documentation/filesystems/ocfs2-online-filecheck.txt b/Documentation/filesystems/ocfs2-online-filecheck.txt
new file mode 100644
index 000000000000..1ab07860430d
--- /dev/null
+++ b/Documentation/filesystems/ocfs2-online-filecheck.txt
@@ -0,0 +1,94 @@
+ OCFS2 online file check
+ -----------------------
+
+This document will describe OCFS2 online file check feature.
+
+Introduction
+============
+OCFS2 is often used in high-availaibility systems. However, OCFS2 usually
+converts the filesystem to read-only when encounters an error. This may not be
+necessary, since turning the filesystem read-only would affect other running
+processes as well, decreasing availability.
+Then, a mount option (errors=continue) is introduced, which would return the
+-EIO errno to the calling process and terminate furhter processing so that the
+filesystem is not corrupted further. The filesystem is not converted to
+read-only, and the problematic file's inode number is reported in the kernel
+log. The user can try to check/fix this file via online filecheck feature.
+
+Scope
+=====
+This effort is to check/fix small issues which may hinder day-to-day operations
+of a cluster filesystem by turning the filesystem read-only. The scope of
+checking/fixing is at the file level, initially for regular files and eventually
+to all files (including system files) of the filesystem.
+
+In case of directory to file links is incorrect, the directory inode is
+reported as erroneous.
+
+This feature is not suited for extravagant checks which involve dependency of
+other components of the filesystem, such as but not limited to, checking if the
+bits for file blocks in the allocation has been set. In case of such an error,
+the offline fsck should/would be recommended.
+
+Finally, such an operation/feature should not be automated lest the filesystem
+may end up with more damage than before the repair attempt. So, this has to
+be performed using user interaction and consent.
+
+User interface
+==============
+When there are errors in the OCFS2 filesystem, they are usually accompanied
+by the inode number which caused the error. This inode number would be the
+input to check/fix the file.
+
+There is a sysfs directory for each OCFS2 file system mounting:
+
+ /sys/fs/ocfs2/<devname>/filecheck
+
+Here, <devname> indicates the name of OCFS2 volumn device which has been already
+mounted. The file above would accept inode numbers. This could be used to
+communicate with kernel space, tell which file(inode number) will be checked or
+fixed. Currently, three operations are supported, which includes checking
+inode, fixing inode and setting the size of result record history.
+
+1. If you want to know what error exactly happened to <inode> before fixing, do
+
+ # echo "<inode>" > /sys/fs/ocfs2/<devname>/filecheck/check
+ # cat /sys/fs/ocfs2/<devname>/filecheck/check
+
+The output is like this:
+ INO DONE ERROR
+39502 1 GENERATION
+
+<INO> lists the inode numbers.
+<DONE> indicates whether the operation has been finished.
+<ERROR> says what kind of errors was found. For the detailed error numbers,
+please refer to the file linux/fs/ocfs2/filecheck.h.
+
+2. If you determine to fix this inode, do
+
+ # echo "<inode>" > /sys/fs/ocfs2/<devname>/filecheck/fix
+ # cat /sys/fs/ocfs2/<devname>/filecheck/fix
+
+The output is like this:
+ INO DONE ERROR
+39502 1 SUCCESS
+
+This time, the <ERROR> column indicates whether this fix is successful or not.
+
+3. The record cache is used to store the history of check/fix results. It's
+defalut size is 10, and can be adjust between the range of 10 ~ 100. You can
+adjust the size like this:
+
+ # echo "<size>" > /sys/fs/ocfs2/<devname>/filecheck/set
+
+Fixing stuff
+============
+On receivng the inode, the filesystem would read the inode and the
+file metadata. In case of errors, the filesystem would fix the errors
+and report the problems it fixed in the kernel log. As a precautionary measure,
+the inode must first be checked for errors before performing a final fix.
+
+The inode and the result history will be maintained temporarily in a
+small linked list buffer which would contain the last (N) inodes
+fixed/checked, the detailed errors which were fixed/checked are printed in the
+kernel log.
diff --git a/Documentation/kcov.txt b/Documentation/kcov.txt
new file mode 100644
index 000000000000..779ff4ab1c1d
--- /dev/null
+++ b/Documentation/kcov.txt
@@ -0,0 +1,111 @@
+kcov: code coverage for fuzzing
+===============================
+
+kcov exposes kernel code coverage information in a form suitable for coverage-
+guided fuzzing (randomized testing). Coverage data of a running kernel is
+exported via the "kcov" debugfs file. Coverage collection is enabled on a task
+basis, and thus it can capture precise coverage of a single system call.
+
+Note that kcov does not aim to collect as much coverage as possible. It aims
+to collect more or less stable coverage that is function of syscall inputs.
+To achieve this goal it does not collect coverage in soft/hard interrupts
+and instrumentation of some inherently non-deterministic parts of kernel is
+disbled (e.g. scheduler, locking).
+
+Usage:
+======
+
+Configure kernel with:
+
+ CONFIG_KCOV=y
+
+CONFIG_KCOV requires gcc built on revision 231296 or later.
+Profiling data will only become accessible once debugfs has been mounted:
+
+ mount -t debugfs none /sys/kernel/debug
+
+The following program demonstrates kcov usage from within a test program:
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long)
+#define KCOV_ENABLE _IO('c', 100)
+#define KCOV_DISABLE _IO('c', 101)
+#define COVER_SIZE (64<<10)
+
+int main(int argc, char **argv)
+{
+ int fd;
+ unsigned long *cover, n, i;
+
+ /* A single fd descriptor allows coverage collection on a single
+ * thread.
+ */
+ fd = open("/sys/kernel/debug/kcov", O_RDWR);
+ if (fd == -1)
+ perror("open"), exit(1);
+ /* Setup trace mode and trace size. */
+ if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
+ perror("ioctl"), exit(1);
+ /* Mmap buffer shared between kernel- and user-space. */
+ cover = (unsigned long*)mmap(NULL, COVER_SIZE * sizeof(unsigned long),
+ PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ if ((void*)cover == MAP_FAILED)
+ perror("mmap"), exit(1);
+ /* Enable coverage collection on the current thread. */
+ if (ioctl(fd, KCOV_ENABLE, 0))
+ perror("ioctl"), exit(1);
+ /* Reset coverage from the tail of the ioctl() call. */
+ __atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);
+ /* That's the target syscal call. */
+ read(-1, NULL, 0);
+ /* Read number of PCs collected. */
+ n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
+ for (i = 0; i < n; i++)
+ printf("0x%lx\n", cover[i + 1]);
+ /* Disable coverage collection for the current thread. After this call
+ * coverage can be enabled for a different thread.
+ */
+ if (ioctl(fd, KCOV_DISABLE, 0))
+ perror("ioctl"), exit(1);
+ /* Free resources. */
+ if (munmap(cover, COVER_SIZE * sizeof(unsigned long)))
+ perror("munmap"), exit(1);
+ if (close(fd))
+ perror("close"), exit(1);
+ return 0;
+}
+
+After piping through addr2line output of the program looks as follows:
+
+SyS_read
+fs/read_write.c:562
+__fdget_pos
+fs/file.c:774
+__fget_light
+fs/file.c:746
+__fget_light
+fs/file.c:750
+__fget_light
+fs/file.c:760
+__fdget_pos
+fs/file.c:784
+SyS_read
+fs/read_write.c:562
+
+If a program needs to collect coverage from several threads (independently),
+it needs to open /sys/kernel/debug/kcov in each thread separately.
+
+The interface is fine-grained to allow efficient forking of test processes.
+That is, a parent process opens /sys/kernel/debug/kcov, enables trace mode,
+mmaps coverage buffer and then forks child processes in a loop. Child processes
+only need to enable coverage (disable happens automatically on thread end).
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 5688b69c26bb..52a8471fba9c 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1759,7 +1759,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
keepinitrd [HW,ARM]
- kernelcore=nn[KMG] [KNL,X86,IA-64,PPC] This parameter
+ kernelcore= [KNL,X86,IA-64,PPC]
+ Format: nn[KMGTPE] | "mirror"
+ This parameter
specifies the amount of memory usable by the kernel
for non-movable allocations. The requested amount is
spread evenly throughout all nodes in the system. The
@@ -1775,6 +1777,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
use the HighMem zone if it exists, and the Normal
zone if it does not.
+ Instead of specifying the amount of memory (nn[KMGTPE]),
+ you can specify "mirror" option. In case "mirror"
+ option is specified, mirrored (reliable) memory is used
+ for non-movable allocations and remaining memory is used
+ for Movable pages. nn[KMGTPE] and "mirror" are exclusive,
+ so you can NOT specify nn[KMGTPE] and "mirror" at the same
+ time.
+
kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port.
Format: <Controller#>[,poll interval]
The controller # is the number of the ehci usb debug
@@ -2732,6 +2742,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
we can turn it on.
on: enable the feature
+ page_poison= [KNL] Boot-time parameter changing the state of
+ poisoning on the buddy allocator.
+ off: turn off poisoning
+ on: turn on poisoning
+
panic= [KNL] Kernel behaviour on panic: delay <timeout>
timeout > 0: seconds before rebooting
timeout = 0: wait forever
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index ce2cfcf35c27..443f4b44ad97 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -256,10 +256,27 @@ If the memory block is offline, you'll read "offline".
5.2. How to online memory
------------
-Even if the memory is hot-added, it is not at ready-to-use state.
-For using newly added memory, you have to "online" the memory block.
+When the memory is hot-added, the kernel decides whether or not to "online"
+it according to the policy which can be read from "auto_online_blocks" file:
-For onlining, you have to write "online" to the memory block's state file as:
+% cat /sys/devices/system/memory/auto_online_blocks
+
+The default is "offline" which means the newly added memory is not in a
+ready-to-use state and you have to "online" the newly added memory blocks
+manually. Automatic onlining can be requested by writing "online" to
+"auto_online_blocks" file:
+
+% echo online > /sys/devices/system/memory/auto_online_blocks
+
+This sets a global policy and impacts all memory blocks that will subsequently
+be hotplugged. Currently offline blocks keep their state. It is possible, under
+certain circumstances, that some memory blocks will be added but will fail to
+online. User space tools can check their "state" files
+(/sys/devices/system/memory/memoryXXX/state) and try to online them manually.
+
+If the automatic onlining wasn't requested, failed, or some memory block was
+offlined it is possible to change the individual block's state by writing to the
+"state" file:
% echo online > /sys/devices/system/memory/memoryXXX/state
diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
index 5d1128bf0282..5962949944fd 100644
--- a/Documentation/printk-formats.txt
+++ b/Documentation/printk-formats.txt
@@ -298,6 +298,24 @@ bitmap and its derivatives such as cpumask and nodemask:
Passed by reference.
+Flags bitfields such as page flags, gfp_flags:
+
+ %pGp referenced|uptodate|lru|active|private
+ %pGg GFP_USER|GFP_DMA32|GFP_NOWARN
+ %pGv read|exec|mayread|maywrite|mayexec|denywrite
+
+ For printing flags bitfields as a collection of symbolic constants that
+ would construct the value. The type of flags is given by the third
+ character. Currently supported are [p]age flags, [v]ma_flags (both
+ expect unsigned long *) and [g]fp_flags (expects gfp_t *). The flag
+ names and print order depends on the particular type.
+
+ Note that this format should not be used directly in TP_printk() part
+ of a tracepoint. Instead, use the show_*_flags() functions from
+ <trace/events/mmflags.h>.
+
+ Passed by reference.
+
Network device features:
%pNF 0x000000000000c000
diff --git a/Documentation/rapidio/mport_cdev.txt b/Documentation/rapidio/mport_cdev.txt
new file mode 100644
index 000000000000..20c120d4b3b8
--- /dev/null
+++ b/Documentation/rapidio/mport_cdev.txt
@@ -0,0 +1,104 @@
+RapidIO subsystem mport character device driver (rio_mport_cdev.c)
+==================================================================
+
+Version History:
+----------------
+ 1.0.0 - Initial driver release.
+
+==================================================================
+
+I. Overview
+
+This device driver is the result of collaboration within the RapidIO.org
+Software Task Group (STG) between Texas Instruments, Freescale,
+Prodrive Technologies, Nokia Networks, BAE and IDT. Additional input was
+received from other members of RapidIO.org. The objective was to create a
+character mode driver interface which exposes the capabilities of RapidIO
+devices directly to applications, in a manner that allows the numerous and
+varied RapidIO implementations to interoperate.
+
+This driver (MPORT_CDEV) provides access to basic RapidIO subsystem operations
+for user-space applications. Most of RapidIO operations are supported through
+'ioctl' system calls.
+
+When loaded this device driver creates filesystem nodes named rio_mportX in /dev
+directory for each registered RapidIO mport device. 'X' in the node name matches
+to unique port ID assigned to each local mport device.
+
+Using available set of ioctl commands user-space applications can perform
+following RapidIO bus and subsystem operations:
+
+- Reads and writes from/to configuration registers of mport devices
+ (RIO_MPORT_MAINT_READ_LOCAL/RIO_MPORT_MAINT_WRITE_LOCAL)
+- Reads and writes from/to configuration registers of remote RapidIO devices.
+ This operations are defined as RapidIO Maintenance reads/writes in RIO spec.
+ (RIO_MPORT_MAINT_READ_REMOTE/RIO_MPORT_MAINT_WRITE_REMOTE)
+- Set RapidIO Destination ID for mport devices (RIO_MPORT_MAINT_HDID_SET)
+- Set RapidIO Component Tag for mport devices (RIO_MPORT_MAINT_COMPTAG_SET)
+- Query logical index of mport devices (RIO_MPORT_MAINT_PORT_IDX_GET)
+- Query capabilities and RapidIO link configuration of mport devices
+ (RIO_MPORT_GET_PROPERTIES)
+- Enable/Disable reporting of RapidIO doorbell events to user-space applications
+ (RIO_ENABLE_DOORBELL_RANGE/RIO_DISABLE_DOORBELL_RANGE)
+- Enable/Disable reporting of RIO port-write events to user-space applications
+ (RIO_ENABLE_PORTWRITE_RANGE/RIO_DISABLE_PORTWRITE_RANGE)
+- Query/Control type of events reported through this driver: doorbells,
+ port-writes or both (RIO_SET_EVENT_MASK/RIO_GET_EVENT_MASK)
+- Configure/Map mport's outbound requests window(s) for specific size,
+ RapidIO destination ID, hopcount and request type
+ (RIO_MAP_OUTBOUND/RIO_UNMAP_OUTBOUND)
+- Configure/Map mport's inbound requests window(s) for specific size,
+ RapidIO base address and local memory base address
+ (RIO_MAP_INBOUND/RIO_UNMAP_INBOUND)
+- Allocate/Free contiguous DMA coherent memory buffer for DMA data transfers
+ to/from remote RapidIO devices (RIO_ALLOC_DMA/RIO_FREE_DMA)
+- Initiate DMA data transfers to/from remote RapidIO devices (RIO_TRANSFER).
+ Supports blocking, asynchronous and posted (a.k.a 'fire-and-forget') data
+ transfer modes.
+- Check/Wait for completion of asynchronous DMA data transfer
+ (RIO_WAIT_FOR_ASYNC)
+- Manage device objects supported by RapidIO subsystem (RIO_DEV_ADD/RIO_DEV_DEL).
+ This allows implementation of various RapidIO fabric enumeration algorithms
+ as user-space applications while using remaining functionality provided by
+ kernel RapidIO subsystem.
+
+II. Hardware Compatibility
+
+This device driver uses standard interfaces defined by kernel RapidIO subsystem
+and therefore it can be used with any mport device driver registered by RapidIO
+subsystem with limitations set by available mport implementation.
+
+At this moment the most common limitation is availability of RapidIO-specific
+DMA engine framework for specific mport device. Users should verify available
+functionality of their platform when planning to use this driver:
+
+- IDT Tsi721 PCIe-to-RapidIO bridge device and its mport device driver are fully
+ compatible with this driver.
+- Freescale SoCs 'fsl_rio' mport driver does not have implementation for RapidIO
+ specific DMA engine support and therefore DMA data transfers mport_cdev driver
+ are not available.
+
+III. Module parameters
+
+- 'dbg_level' - This parameter allows to control amount of debug information
+ generated by this device driver. This parameter is formed by set of
+ This parameter can be changed bit masks that correspond to the specific
+ functional block.
+ For mask definitions see 'drivers/rapidio/devices/rio_mport_cdev.c'
+ This parameter can be changed dynamically.
+ Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level.
+
+IV. Known problems
+
+ None.
+
+V. User-space Applications and API
+
+API library and applications that use this device driver are available from
+RapidIO.org.
+
+VI. TODO List
+
+- Add support for sending/receiving "raw" RapidIO messaging packets.
+- Add memory mapped DMA data transfers as an option when RapidIO-specific DMA
+ is not available.
diff --git a/Documentation/rapidio/tsi721.txt b/Documentation/rapidio/tsi721.txt
index 626052f403bb..7c1c7bf48ec0 100644
--- a/Documentation/rapidio/tsi721.txt
+++ b/Documentation/rapidio/tsi721.txt
@@ -16,6 +16,15 @@ For inbound messages this driver uses destination ID matching to forward message
into the corresponding message queue. Messaging callbacks are implemented to be
fully compatible with RIONET driver (Ethernet over RapidIO messaging services).
+1. Module parameters:
+- 'dbg_level' - This parameter allows to control amount of debug information
+ generated by this device driver. This parameter is formed by set of
+ This parameter can be changed bit masks that correspond to the specific
+ functional block.
+ For mask definitions see 'drivers/rapidio/devices/tsi721.h'
+ This parameter can be changed dynamically.
+ Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level.
+
II. Known problems
None.
diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt
index f34a8ee6f860..a7ef7595edab 100644
--- a/Documentation/vm/ksm.txt
+++ b/Documentation/vm/ksm.txt
@@ -80,6 +80,50 @@ run - set 0 to stop ksmd from running but keep merged pages,
Default: 0 (must be changed to 1 to activate KSM,
except if CONFIG_SYSFS is disabled)
+max_page_sharing - Maximum sharing allowed for each KSM page. This
+ enforces a deduplication limit to avoid the virtual
+ memory rmap lists to grow too large. The minimum
+ value is 2 as a newly created KSM page will have at
+ least two sharers. The rmap walk has O(N)
+ complexity where N is the number of rmap_items
+ (i.e. virtual mappings) that are sharing the page,
+ which is in turn capped by max_page_sharing. So
+ this effectively spread the the linear O(N)
+ computational complexity from rmap walk context
+ over different KSM pages. The ksmd walk over the
+ stable_node "chains" is also O(N), but N is the
+ number of stable_node "dups", not the number of
+ rmap_items, so it has not a significant impact on
+ ksmd performance. In practice the best stable_node
+ "dup" candidate will be kept and found at the head
+ of the "dups" list. The higher this value the
+ faster KSM will merge the memory (because there
+ will be fewer stable_node dups queued into the
+ stable_node chain->hlist to check for pruning) and
+ the higher the deduplication factor will be, but
+ the slowest the worst case rmap walk could be for
+ any given KSM page. Slowing down the rmap_walk
+ means there will be higher latency for certain
+ virtual memory operations happening during
+ swapping, compaction, NUMA balancing and page
+ migration, in turn decreasing responsiveness for
+ the caller of those virtual memory operations. The
+ scheduler latency of other tasks not involved with
+ the VM operations doing the rmap walk is not
+ affected by this parameter as the rmap walks are
+ always schedule friendly themselves.
+
+stable_node_chains_prune_millisecs - How frequently to walk the whole
+ list of stable_node "dups" linked in the
+ stable_node "chains" in order to prune stale
+ stable_nodes. Smaller milllisecs values will free
+ up the KSM metadata with lower latency, but they
+ will make ksmd use more CPU during the scan. This
+ only applies to the stable_node chains so it's a
+ noop if not a single KSM page hit the
+ max_page_sharing yet (there would be no stable_node
+ chains in such case).
+
The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/:
pages_shared - how many shared pages are being used
@@ -88,10 +132,29 @@ pages_unshared - how many pages unique but repeatedly checked for merging
pages_volatile - how many pages changing too fast to be placed in a tree
full_scans - how many times all mergeable areas have been scanned
+stable_node_chains - number of stable node chains allocated, this is
+ effectively the number of KSM pages that hit the
+ max_page_sharing limit
+stable_node_dups - number of stable node dups queued into the
+ stable_node chains
+
A high ratio of pages_sharing to pages_shared indicates good sharing, but
a high ratio of pages_unshared to pages_sharing indicates wasted effort.
pages_volatile embraces several different kinds of activity, but a high
proportion there would also indicate poor use of madvise MADV_MERGEABLE.
+The maximum possible page_sharing/page_shared ratio is limited by the
+max_page_sharing tunable. To increase the ratio max_page_sharing must
+be increased accordingly.
+
+The stable_node_dups/stable_node_chains ratio is also affected by the
+max_page_sharing tunable, and an high ratio may indicate fragmentation
+in the stable_node dups, which could be solved by introducing
+fragmentation algorithms in ksmd which would refile rmap_items from
+one stable_node dup to another stable_node dup, in order to freeup
+stable_node "dups" with few rmap_items in them, but that may increase
+the ksmd CPU usage and possibly slowdown the readonly computations on
+the KSM pages of the applications.
+
Izik Eidus,
Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/vm/page_owner.txt b/Documentation/vm/page_owner.txt
index 8f3ce9b3aa11..ffff1439076a 100644
--- a/Documentation/vm/page_owner.txt
+++ b/Documentation/vm/page_owner.txt
@@ -28,10 +28,11 @@ with page owner and page owner is disabled in runtime due to no enabling
boot option, runtime overhead is marginal. If disabled in runtime, it
doesn't require memory to store owner information, so there is no runtime
memory overhead. And, page owner inserts just two unlikely branches into
-the page allocator hotpath and if it returns false then allocation is
-done like as the kernel without page owner. These two unlikely branches
-would not affect to allocation performance. Following is the kernel's
-code size change due to this facility.
+the page allocator hotpath and if not enabled, then allocation is done
+like as the kernel without page owner. These two unlikely branches should
+not affect to allocation performance, especially if the static keys jump
+label patching functionality is available. Following is the kernel's code
+size change due to this facility.
- Without page owner
text data bss dec hex filename
diff --git a/Makefile b/Makefile
index 682840850f58..9092c16fa6cf 100644
--- a/Makefile
+++ b/Makefile
@@ -365,6 +365,7 @@ LDFLAGS_MODULE =
CFLAGS_KERNEL =
AFLAGS_KERNEL =
CFLAGS_GCOV = -fprofile-arcs -ftest-coverage
+CFLAGS_KCOV = -fsanitize-coverage=trace-pc
# Use USERINCLUDE when you must reference the UAPI directories only.
@@ -411,7 +412,7 @@ export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE
export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
-export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KASAN CFLAGS_UBSAN
+export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KCOV CFLAGS_KASAN CFLAGS_UBSAN
export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
@@ -673,6 +674,14 @@ endif
endif
KBUILD_CFLAGS += $(stackp-flag)
+ifdef CONFIG_KCOV
+ ifeq ($(call cc-option, $(CFLAGS_KCOV)),)
+ $(warning Cannot use CONFIG_KCOV: \
+ -fsanitize-coverage=trace-pc is not supported by compiler)
+ CFLAGS_KCOV =
+ endif
+endif
+
ifeq ($(cc-name),clang)
KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,)
diff --git a/arch/Kconfig b/arch/Kconfig
index f6b649d88ec8..155e6cdbff74 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -187,7 +187,11 @@ config HAVE_OPTPROBES
config HAVE_KPROBES_ON_FTRACE
bool
+config HAVE_NMI
+ bool
+
config HAVE_NMI_WATCHDOG
+ depends on HAVE_NMI
bool
#
# An arch should select this if it provides all these things:
diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h
index 9b0d40093c9a..c419b43c461d 100644
--- a/arch/alpha/include/asm/uaccess.h
+++ b/arch/alpha/include/asm/uaccess.h
@@ -483,7 +483,13 @@ struct exception_table_entry
(pc) + (_fixup)->fixup.bits.nextinsn; \
})
-#define ARCH_HAS_SORT_EXTABLE
-#define ARCH_HAS_SEARCH_EXTABLE
+#define ARCH_HAS_RELATIVE_EXTABLE
+
+#define swap_ex_entry_fixup(a, b, tmp, delta) \
+ do { \
+ (a)->fixup.unit = (b)->fixup.unit; \
+ (b)->fixup.unit = (tmp).fixup.unit; \
+ } while (0)
+
#endif /* __ALPHA_UACCESS_H */
diff --git a/arch/alpha/mm/Makefile b/arch/alpha/mm/Makefile
index c993d3f93cf6..5a9807936411 100644
--- a/arch/alpha/mm/Makefile
+++ b/arch/alpha/mm/Makefile
@@ -4,6 +4,6 @@
ccflags-y := -Werror
-obj-y := init.o fault.o extable.o
+obj-y := init.o fault.o
obj-$(CONFIG_DISCONTIGMEM) += numa.o
diff --git a/arch/alpha/mm/extable.c b/arch/alpha/mm/extable.c
deleted file mode 100644
index 813c9b63c0e1..000000000000
--- a/arch/alpha/mm/extable.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * linux/arch/alpha/mm/extable.c
- */
-
-#include <linux/module.h>
-#include <linux/sort.h>
-#include <asm/uaccess.h>
-
-static inline unsigned long ex_to_addr(const struct exception_table_entry *x)
-{
- return (unsigned long)&x->insn + x->insn;
-}
-
-static void swap_ex(void *a, void *b, int size)
-{
- struct exception_table_entry *ex_a = a, *ex_b = b;
- unsigned long addr_a = ex_to_addr(ex_a), addr_b = ex_to_addr(ex_b);
- unsigned int t = ex_a->fixup.unit;
-
- ex_a->fixup.unit = ex_b->fixup.unit;
- ex_b->fixup.unit = t;
- ex_a->insn = (int)(addr_b - (unsigned long)&ex_a->insn);
- ex_b->insn = (int)(addr_a - (unsigned long)&ex_b->insn);
-}
-
-/*
- * The exception table needs to be sorted so that the binary
- * search that we use to find entries in it works properly.
- * This is used both for the kernel exception table and for
- * the exception tables of modules that get loaded.
- */
-static int cmp_ex(const void *a, const void *b)
-{
- const struct exception_table_entry *x = a, *y = b;
-
- /* avoid overflow */
- if (ex_to_addr(x) > ex_to_addr(y))
- return 1;
- if (ex_to_addr(x) < ex_to_addr(y))
- return -1;
- return 0;
-}
-
-void sort_extable(struct exception_table_entry *start,
- struct exception_table_entry *finish)
-{
- sort(start, finish - start, sizeof(struct exception_table_entry),
- cmp_ex, swap_ex);
-}
-
-#ifdef CONFIG_MODULES
-/*
- * Any entry referring to the module init will be at the beginning or
- * the end.
- */
-void trim_init_extable(struct module *m)
-{
- /*trim the beginning*/
- while (m->num_exentries &&
- within_module_init(ex_to_addr(&m->extable[0]), m)) {
- m->extable++;
- m->num_exentries--;
- }
- /*trim the end*/
- while (m->num_exentries &&
- within_module_init(ex_to_addr(&m->extable[m->num_exentries-1]),
- m))
- m->num_exentries--;
-}
-#endif /* CONFIG_MODULES */
-
-const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
- const struct exception_table_entry *last,
- unsigned long value)
-{
- while (first <= last) {
- const struct exception_table_entry *mid;
- unsigned long mid_value;
-
- mid = (last - first) / 2 + first;
- mid_value = ex_to_addr(mid);
- if (mid_value == value)
- return mid;
- else if (mid_value < value)
- first = mid+1;
- else
- last = mid-1;
- }
-
- return NULL;
-}
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index cc95ff8f07cb..da226ea98e80 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -66,6 +66,7 @@ config ARM
select HAVE_KRETPROBES if (HAVE_KPROBES)
select HAVE_MEMBLOCK
select HAVE_MOD_ARCH_SPECIFIC
+ select HAVE_NMI
select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
select HAVE_OPTPROBES if !THUMB2_KERNEL
select HAVE_PERF_EVENTS
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index 4355f0ec44d6..f98baaec0a15 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -17,6 +17,8 @@
#ifndef __ASSEMBLY__
+#include <linux/personality.h> /* For READ_IMPLIES_EXEC */
+
#ifndef CONFIG_MMU
#include <asm/page-nommu.h>
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 37312f6749f3..b4048e370730 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -645,7 +645,9 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
case IPI_CPU_BACKTRACE:
irq_enter();
+ printk_nmi_enter();
nmi_cpu_backtrace(regs);
+ printk_nmi_exit();
irq_exit();
break;
diff --git a/arch/arm/mach-lpc32xx/phy3250.c b/arch/arm/mach-lpc32xx/phy3250.c
index 77d6b1bab278..ee06fabdf60e 100644
--- a/arch/arm/mach-lpc32xx/phy3250.c
+++ b/arch/arm/mach-lpc32xx/phy3250.c
@@ -86,8 +86,8 @@ static int lpc32xx_clcd_setup(struct clcd_fb *fb)
{
dma_addr_t dma;
- fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev,
- PANEL_SIZE, &dma, GFP_KERNEL);
+ fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, PANEL_SIZE, &dma,
+ GFP_KERNEL);
if (!fb->fb.screen_base) {
printk(KERN_ERR "CLCD: unable to map framebuffer\n");
return -ENOMEM;
@@ -116,15 +116,14 @@ static int lpc32xx_clcd_setup(struct clcd_fb *fb)
static int lpc32xx_clcd_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
{
- return dma_mmap_writecombine(&fb->dev->dev, vma,
- fb->fb.screen_base, fb->fb.fix.smem_start,
- fb->fb.fix.smem_len);
+ return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+ fb->fb.fix.smem_start, fb->fb.fix.smem_len);
}
static void lpc32xx_clcd_remove(struct clcd_fb *fb)
{
- dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len,
- fb->fb.screen_base, fb->fb.fix.smem_start);
+ dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
+ fb->fb.fix.smem_start);
}
/*
diff --git a/arch/arm/mach-netx/fb.c b/arch/arm/mach-netx/fb.c
index d122ee6ab991..8814ee5e98fd 100644
--- a/arch/arm/mach-netx/fb.c
+++ b/arch/arm/mach-netx/fb.c
@@ -42,8 +42,8 @@ int netx_clcd_setup(struct clcd_fb *fb)
fb->panel = netx_panel;
- fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev, 1024*1024,
- &dma, GFP_KERNEL);
+ fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, 1024 * 1024, &dma,
+ GFP_KERNEL);
if (!fb->fb.screen_base) {
printk(KERN_ERR "CLCD: unable to map framebuffer\n");
return -ENOMEM;
@@ -57,16 +57,14 @@ int netx_clcd_setup(struct clcd_fb *fb)
int netx_clcd_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
{
- return dma_mmap_writecombine(&fb->dev->dev, vma,
- fb->fb.screen_base,
- fb->fb.fix.smem_start,
- fb->fb.fix.smem_len);
+ return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+ fb->fb.fix.smem_start, fb->fb.fix.smem_len);
}
void netx_clcd_remove(struct clcd_fb *fb)
{
- dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len,
- fb->fb.screen_base, fb->fb.fix.smem_start);
+ dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
+ fb->fb.fix.smem_start);
}
static AMBA_AHB_DEVICE(fb, "fb", 0, 0x00104000, { NETX_IRQ_LCD }, NULL);
diff --git a/arch/arm/mach-nspire/clcd.c b/arch/arm/mach-nspire/clcd.c
index abea12617b17..ea0e5b2ca1cd 100644
--- a/arch/arm/mach-nspire/clcd.c
+++ b/arch/arm/mach-nspire/clcd.c
@@ -90,8 +90,8 @@ int nspire_clcd_setup(struct clcd_fb *fb)
panel_size = ((panel->mode.xres * panel->mode.yres) * panel->bpp) / 8;
panel_size = ALIGN(panel_size, PAGE_SIZE);
- fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev,
- panel_size, &dma, GFP_KERNEL);
+ fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, panel_size, &dma,
+ GFP_KERNEL);
if (!fb->fb.screen_base) {
pr_err("CLCD: unable to map framebuffer\n");
@@ -107,13 +107,12 @@ int nspire_clcd_setup(struct clcd_fb *fb)
int nspire_clcd_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
{
- return dma_mmap_writecombine(&fb->dev->dev, vma,
- fb->fb.screen_base, fb->fb.fix.smem_start,
- fb->fb.fix.smem_len);
+ return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+ fb->fb.fix.smem_start, fb->fb.fix.smem_len);
}
void nspire_clcd_remove(struct clcd_fb *fb)
{
- dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len,
- fb->fb.screen_base, fb->fb.fix.smem_start);
+ dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
+ fb->fb.fix.smem_start);
}
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 4b4058db0781..66353caa35b9 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -173,7 +173,7 @@ unsigned long arch_mmap_rnd(void)
{
unsigned long rnd;
- rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
+ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
return rnd << PAGE_SHIFT;
}
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index bb7b72734c24..d8bfcc1ce923 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -94,12 +94,19 @@
dmb \opt
.endm
+/*
+ * Emit an entry into the exception table
+ */
+ .macro _asm_extable, from, to
+ .pushsection __ex_table, "a"
+ .align 3
+ .long (\from - .), (\to - .)
+ .popsection
+ .endm
+
#define USER(l, x...) \
9999: x; \
- .section __ex_table,"a"; \
- .align 3; \
- .quad 9999b,l; \
- .previous
+ _asm_extable 9999b, l
/*
* Register aliases.
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index 5f3ab8c1db55..f2585cdd32c2 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -42,10 +42,8 @@
"4: mov %w0, %w5\n" \
" b 3b\n" \
" .popsection\n" \
-" .pushsection __ex_table,\"a\"\n" \
-" .align 3\n" \
-" .quad 1b, 4b, 2b, 4b\n" \
-" .popsection\n" \
+ _ASM_EXTABLE(1b, 4b) \
+ _ASM_EXTABLE(2b, 4b) \
ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \
CONFIG_ARM64_PAN) \
: "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \
@@ -134,10 +132,8 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
"4: mov %w0, %w6\n"
" b 3b\n"
" .popsection\n"
-" .pushsection __ex_table,\"a\"\n"
-" .align 3\n"
-" .quad 1b, 4b, 2b, 4b\n"
-" .popsection\n"
+ _ASM_EXTABLE(1b, 4b)
+ _ASM_EXTABLE(2b, 4b)
ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
: "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp)
: "r" (oldval), "r" (newval), "Ir" (-EFAULT)
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index b2ede967fe7d..dc11577fab7e 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -36,11 +36,11 @@
#define VERIFY_WRITE 1
/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue. No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
+ * The exception table consists of pairs of relative offsets: the first
+ * is the relative offset to an instruction that is allowed to fault,
+ * and the second is the relative offset at which the program should
+ * continue. No registers are modified, so it is entirely up to the
+ * continuation code to figure out what to do.
*
* All the routines below use bits of fixup code that are out of line
* with the main instruction path. This means when everything is well,
@@ -50,9 +50,11 @@
struct exception_table_entry
{
- unsigned long insn, fixup;
+ int insn, fixup;
};
+#define ARCH_HAS_RELATIVE_EXTABLE
+
extern int fixup_exception(struct pt_regs *regs);
#define KERNEL_DS (-1UL)
@@ -105,6 +107,12 @@ static inline void set_fs(mm_segment_t fs)
#define access_ok(type, addr, size) __range_ok(addr, size)
#define user_addr_max get_fs
+#define _ASM_EXTABLE(from, to) \
+ " .pushsection __ex_table, \"a\"\n" \
+ " .align 3\n" \
+ " .long (" #from " - .), (" #to " - .)\n" \
+ " .popsection\n"
+
/*
* The "__xxx" versions of the user access functions do not verify the address
* space - it must have been done previously with a separate "access_ok()"
@@ -123,10 +131,7 @@ static inline void set_fs(mm_segment_t fs)
" mov %1, #0\n" \
" b 2b\n" \
" .previous\n" \
- " .section __ex_table,\"a\"\n" \
- " .align 3\n" \
- " .quad 1b, 3b\n" \
- " .previous" \
+ _ASM_EXTABLE(1b, 3b) \
: "+r" (err), "=&r" (x) \
: "r" (addr), "i" (-EFAULT))
@@ -190,10 +195,7 @@ do { \
"3: mov %w0, %3\n" \
" b 2b\n" \
" .previous\n" \
- " .section __ex_table,\"a\"\n" \
- " .align 3\n" \
- " .quad 1b, 3b\n" \
- " .previous" \
+ _ASM_EXTABLE(1b, 3b) \
: "+r" (err) \
: "r" (x), "r" (addr), "i" (-EFAULT))
diff --git a/arch/arm64/include/asm/word-at-a-time.h b/arch/arm64/include/asm/word-at-a-time.h
index aab5bf09e9d9..2b79b8a89457 100644
--- a/arch/arm64/include/asm/word-at-a-time.h
+++ b/arch/arm64/include/asm/word-at-a-time.h
@@ -16,6 +16,8 @@
#ifndef __ASM_WORD_AT_A_TIME_H
#define __ASM_WORD_AT_A_TIME_H
+#include <asm/uaccess.h>
+
#ifndef __AARCH64EB__
#include <linux/kernel.h>
@@ -81,10 +83,7 @@ static inline unsigned long load_unaligned_zeropad(const void *addr)
#endif
" b 2b\n"
" .popsection\n"
- " .pushsection __ex_table,\"a\"\n"
- " .align 3\n"
- " .quad 1b, 3b\n"
- " .popsection"
+ _ASM_EXTABLE(1b, 3b)
: "=&r" (ret), "=&r" (offset)
: "r" (addr), "Q" (*(unsigned long *)addr));
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index 3e01207917b1..c37202c0c838 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -297,11 +297,8 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table)
"4: mov %w0, %w5\n" \
" b 3b\n" \
" .popsection" \
- " .pushsection __ex_table,\"a\"\n" \
- " .align 3\n" \
- " .quad 0b, 4b\n" \
- " .quad 1b, 4b\n" \
- " .popsection\n" \
+ _ASM_EXTABLE(0b, 4b) \
+ _ASM_EXTABLE(1b, 4b) \
ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \
CONFIG_ARM64_PAN) \
: "=&r" (res), "+r" (data), "=&r" (temp) \
diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c
index 79444279ba8c..81acd4706878 100644
--- a/arch/arm64/mm/extable.c
+++ b/arch/arm64/mm/extable.c
@@ -11,7 +11,7 @@ int fixup_exception(struct pt_regs *regs)
fixup = search_exception_tables(instruction_pointer(regs));
if (fixup)
- regs->pc = fixup->fixup;
+ regs->pc = (unsigned long)&fixup->fixup + fixup->fixup;
return fixup != NULL;
}
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 4c893b5189dd..232f787a088a 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -53,10 +53,10 @@ unsigned long arch_mmap_rnd(void)
#ifdef CONFIG_COMPAT
if (test_thread_flag(TIF_32BIT))
- rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1);
+ rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
else
#endif
- rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
+ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
return rnd << PAGE_SHIFT;
}
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index b6878eb64884..9dc3e2b1180b 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -17,6 +17,7 @@ config AVR32
select GENERIC_CLOCKEVENTS
select HAVE_MOD_ARCH_SPECIFIC
select MODULES_USE_ELF_RELA
+ select HAVE_NMI
help
AVR32 is a high-performance 32-bit RISC microprocessor core,
designed for cost-sensitive embedded applications, with particular
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index af76634f8d98..47c0a55acafd 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -40,6 +40,7 @@ config BLACKFIN
select HAVE_MOD_ARCH_SPECIFIC
select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW
+ select HAVE_NMI
config GENERIC_CSUM
def_bool y
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index e086f9e93728..62148eaf0189 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -69,6 +69,7 @@ config CRIS
select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32
select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32
select HAVE_DEBUG_BUGVERBOSE if ETRAX_ARCH_V32
+ select HAVE_NMI
config HZ
int
diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h
index 4f3fb6ccbf21..2189d5ddc1ee 100644
--- a/arch/ia64/include/asm/uaccess.h
+++ b/arch/ia64/include/asm/uaccess.h
@@ -341,13 +341,11 @@ extern unsigned long __strnlen_user (const char __user *, long);
__su_ret; \
})
-/* Generic code can't deal with the location-relative format that we use for compactness. */
-#define ARCH_HAS_SORT_EXTABLE
-#define ARCH_HAS_SEARCH_EXTABLE
+#define ARCH_HAS_RELATIVE_EXTABLE
struct exception_table_entry {
- int addr; /* location-relative address of insn this fixup is for */
- int cont; /* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */
+ int insn; /* location-relative address of insn this fixup is for */
+ int fixup; /* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */
};
extern void ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e);
diff --git a/arch/ia64/mm/extable.c b/arch/ia64/mm/extable.c
index c99a41e29fe8..8f70bb2d0c37 100644
--- a/arch/ia64/mm/extable.c
+++ b/arch/ia64/mm/extable.c
@@ -5,107 +5,12 @@
* David Mosberger-Tang <davidm@hpl.hp.com>
*/
-#include <linux/sort.h>
-
#include <asm/uaccess.h>
-#include <linux/module.h>
-
-static int cmp_ex(const void *a, const void *b)
-{
- const struct exception_table_entry *l = a, *r = b;
- u64 lip = (u64) &l->addr + l->addr;
- u64 rip = (u64) &r->addr + r->addr;
-
- /* avoid overflow */
- if (lip > rip)
- return 1;
- if (lip < rip)
- return -1;
- return 0;
-}
-
-static void swap_ex(void *a, void *b, int size)
-{
- struct exception_table_entry *l = a, *r = b, tmp;
- u64 delta = (u64) r - (u64) l;
-
- tmp = *l;
- l->addr = r->addr + delta;
- l->cont = r->cont + delta;
- r->addr = tmp.addr - delta;
- r->cont = tmp.cont - delta;
-}
-
-/*
- * Sort the exception table. It's usually already sorted, but there
- * may be unordered entries due to multiple text sections (such as the
- * .init text section). Note that the exception-table-entries contain
- * location-relative addresses, which requires a bit of care during
- * sorting to avoid overflows in the offset members (e.g., it would
- * not be safe to make a temporary copy of an exception-table entry on
- * the stack, because the stack may be more than 2GB away from the
- * exception-table).
- */
-void sort_extable (struct exception_table_entry *start,
- struct exception_table_entry *finish)
-{
- sort(start, finish - start, sizeof(struct exception_table_entry),
- cmp_ex, swap_ex);
-}
-
-static inline unsigned long ex_to_addr(const struct exception_table_entry *x)
-{
- return (unsigned long)&x->addr + x->addr;
-}
-
-#ifdef CONFIG_MODULES
-/*
- * Any entry referring to the module init will be at the beginning or
- * the end.
- */
-void trim_init_extable(struct module *m)
-{
- /*trim the beginning*/
- while (m->num_exentries &&
- within_module_init(ex_to_addr(&m->extable[0]), m)) {
- m->extable++;
- m->num_exentries--;
- }
- /*trim the end*/
- while (m->num_exentries &&
- within_module_init(ex_to_addr(&m->extable[m->num_exentries-1]),
- m))
- m->num_exentries--;
-}
-#endif /* CONFIG_MODULES */
-
-const struct exception_table_entry *
-search_extable (const struct exception_table_entry *first,
- const struct exception_table_entry *last,
- unsigned long ip)
-{
- const struct exception_table_entry *mid;
- unsigned long mid_ip;
- long diff;
-
- while (first <= last) {
- mid = &first[(last - first)/2];
- mid_ip = (u64) &mid->addr + mid->addr;
- diff = mid_ip - ip;
- if (diff == 0)
- return mid;
- else if (diff < 0)
- first = mid + 1;
- else
- last = mid - 1;
- }
- return NULL;
-}
void
ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e)
{
- long fix = (u64) &e->cont + e->cont;
+ long fix = (u64) &e->fixup + e->fixup;
regs->r8 = -EFAULT;
if (fix & 4)
diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c
index 0d4146f644dc..11fa717d93b1 100644
--- a/arch/m32r/mm/init.c
+++ b/arch/m32r/mm/init.c
@@ -59,21 +59,24 @@ void free_initrd_mem(unsigned long, unsigned long);
void __init zone_sizes_init(void)
{
unsigned long zones_size[MAX_NR_ZONES] = {0, };
- unsigned long max_dma;
- unsigned long low;
unsigned long start_pfn;
#ifdef CONFIG_MMU
- start_pfn = START_PFN(0);
- max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- low = MAX_LOW_PFN(0);
-
- if (low < max_dma){
- zones_size[ZONE_DMA] = low - start_pfn;
- zones_size[ZONE_NORMAL] = 0;
- } else {
- zones_size[ZONE_DMA] = low - start_pfn;
- zones_size[ZONE_NORMAL] = low - max_dma;
+ {
+ unsigned long low;
+ unsigned long max_dma;
+
+ start_pfn = START_PFN(0);
+ max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+ low = MAX_LOW_PFN(0);
+
+ if (low < max_dma) {
+ zones_size[ZONE_DMA] = low - start_pfn;
+ zones_size[ZONE_NORMAL] = 0;
+ } else {
+ zones_size[ZONE_DMA] = low - start_pfn;
+ zones_size[ZONE_NORMAL] = low - max_dma;
+ }
}
#else
zones_size[ZONE_DMA] = 0 >> PAGE_SHIFT;
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index cc172671797d..06a74aefe6fc 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -62,6 +62,7 @@ config MIPS
select HAVE_IRQ_TIME_ACCOUNTING
select GENERIC_TIME_VSYSCALL
select ARCH_CLOCKSOURCE_DATA
+ select HAVE_NMI
menu "Machine selection"
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index 5c81fdd032c3..353037699512 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -146,7 +146,7 @@ unsigned long arch_mmap_rnd(void)
{
unsigned long rnd;
- rnd = (unsigned long)get_random_int();
+ rnd = get_random_long();
rnd <<= PAGE_SHIFT;
if (TASK_IS_32BIT_ADDR)
rnd &= 0xfffffful;
@@ -174,7 +174,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
static inline unsigned long brk_rnd(void)
{
- unsigned long rnd = get_random_int();
+ unsigned long rnd = get_random_long();
rnd = rnd << PAGE_SHIFT;
/* 8MB for 32bit, 256MB for 64bit */
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 9faa18c4f3f7..3a557beccab5 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -152,6 +152,7 @@ config PPC
select NO_BOOTMEM
select HAVE_GENERIC_RCU_GUP
select HAVE_PERF_EVENTS_NMI if PPC64
+ select HAVE_NMI if PERF_EVENTS
select EDAC_SUPPORT
select EDAC_ATOMIC_SCRUB
select ARCH_HAS_DMA_SET_COHERENT_MASK
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index dccc87e8fee5..3c5736e52a14 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1768,9 +1768,9 @@ static inline unsigned long brk_rnd(void)
/* 8MB for 32bit, 1GB for 64bit */
if (is_32bit_task())
- rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT)));
+ rnd = (get_random_long() % (1UL<<(23-PAGE_SHIFT)));
else
- rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT)));
+ rnd = (get_random_long() % (1UL<<(30-PAGE_SHIFT)));
return rnd << PAGE_SHIFT;
}
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index 5a2c049c1c61..aa610ce8742f 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -49,7 +49,7 @@ static unsigned int rtas_error_log_buffer_max;
static unsigned int event_scan;
static unsigned int rtas_event_scan_rate;
-static int full_rtas_msgs = 0;
+static bool full_rtas_msgs;
/* Stop logging to nvram after first fatal error */
static int logging_enabled; /* Until we initialize everything,
@@ -592,11 +592,6 @@ __setup("surveillance=", surveillance_setup);
static int __init rtasmsgs_setup(char *str)
{
- if (strcmp(str, "on") == 0)
- full_rtas_msgs = 1;
- else if (strcmp(str, "off") == 0)
- full_rtas_msgs = 0;
-
- return 1;
+ return (kstrtobool(str, &full_rtas_msgs) == 0);
}
__setup("rtasmsgs=", rtasmsgs_setup);
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 0f0502e12f6c..4087705ba90f 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -59,9 +59,9 @@ unsigned long arch_mmap_rnd(void)
/* 8MB for 32bit, 1GB for 64bit */
if (is_32bit_task())
- rnd = (unsigned long)get_random_int() % (1<<(23-PAGE_SHIFT));
+ rnd = get_random_long() % (1<<(23-PAGE_SHIFT));
else
- rnd = (unsigned long)get_random_int() % (1<<(30-PAGE_SHIFT));
+ rnd = get_random_long() % (1UL<<(30-PAGE_SHIFT));
return rnd << PAGE_SHIFT;
}
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 32274f72fe3f..282837a1d74b 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -47,20 +47,14 @@ static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE;
static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE;
-static int cede_offline_enabled __read_mostly = 1;
+static bool cede_offline_enabled __read_mostly = true;
/*
* Enable/disable cede_offline when available.
*/
static int __init setup_cede_offline(char *str)
{
- if (!strcmp(str, "off"))
- cede_offline_enabled = 0;
- else if (!strcmp(str, "on"))
- cede_offline_enabled = 1;
- else
- return 0;
- return 1;
+ return (kstrtobool(str, &cede_offline_enabled) == 0);
}
__setup("cede_offline=", setup_cede_offline);
diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c
index c1cd3698f534..385371acc0d0 100644
--- a/arch/powerpc/sysdev/fsl_rio.c
+++ b/arch/powerpc/sysdev/fsl_rio.c
@@ -606,6 +606,12 @@ int fsl_rio_setup(struct platform_device *dev)
if (!port)
continue;
+ rc = rio_mport_initialize(port);
+ if (rc) {
+ kfree(port);
+ continue;
+ }
+
i = *port_index - 1;
port->index = (unsigned char)i;
@@ -682,12 +688,6 @@ int fsl_rio_setup(struct platform_device *dev)
dev_info(&dev->dev, "RapidIO Common Transport System size: %d\n",
port->sys_size ? 65536 : 256);
- if (rio_register_mport(port)) {
- release_resource(&port->iores);
- kfree(priv);
- kfree(port);
- continue;
- }
if (port->host_deviceid >= 0)
out_be32(priv->regs_win + RIO_GCCSR, RIO_PORT_GEN_HOST |
RIO_PORT_GEN_MASTER | RIO_PORT_GEN_DISCOVERED);
@@ -727,6 +727,12 @@ int fsl_rio_setup(struct platform_device *dev)
dbell->mport[i] = port;
+ if (rio_register_mport(port)) {
+ release_resource(&port->iores);
+ kfree(priv);
+ kfree(port);
+ continue;
+ }
active_ports++;
}
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 3be9c832dec1..adb8ecfb4e3c 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -157,6 +157,7 @@ config S390
select TTY
select VIRT_CPU_ACCOUNTING
select VIRT_TO_BUS
+ select HAVE_NMI
config SCHED_OMIT_FRAME_POINTER
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index 9dd4cc47ddc7..e0900ddf91dd 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -79,18 +79,12 @@ struct exception_table_entry
int insn, fixup;
};
-static inline unsigned long extable_insn(const struct exception_table_entry *x)
-{
- return (unsigned long)&x->insn + x->insn;
-}
-
static inline unsigned long extable_fixup(const struct exception_table_entry *x)
{
return (unsigned long)&x->fixup + x->fixup;
}
-#define ARCH_HAS_SORT_EXTABLE
-#define ARCH_HAS_SEARCH_EXTABLE
+#define ARCH_HAS_RELATIVE_EXTABLE
/**
* __copy_from_user: - Copy a block of data from user space, with less checking.
diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c
index 02bd02ff648b..14c1ed3617c9 100644
--- a/arch/s390/kernel/dumpstack.c
+++ b/arch/s390/kernel/dumpstack.c
@@ -11,6 +11,7 @@
#include <linux/export.h>
#include <linux/kdebug.h>
#include <linux/ptrace.h>
+#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <asm/processor.h>
@@ -184,9 +185,8 @@ void die(struct pt_regs *regs, const char *str)
#ifdef CONFIG_SMP
printk("SMP ");
#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC");
-#endif
+ if (debug_pagealloc_enabled())
+ printk("DEBUG_PAGEALLOC");
printk("\n");
notify_die(DIE_OOPS, str, regs, 0, regs->int_code & 0xffff, SIGSEGV);
print_modules();
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 99f84ac31307..580bc7299ec3 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -1433,7 +1433,7 @@ device_initcall(etr_init_sysfs);
/*
* Server Time Protocol (STP) code.
*/
-static int stp_online;
+static bool stp_online;
static struct stp_sstpi stp_info;
static void *stp_page;
@@ -1444,11 +1444,7 @@ static struct timer_list stp_timer;
static int __init early_parse_stp(char *p)
{
- if (strncmp(p, "off", 3) == 0)
- stp_online = 0;
- else if (strncmp(p, "on", 2) == 0)
- stp_online = 1;
- return 0;
+ return kstrtobool(p, &stp_online);
}
early_param("stp", early_parse_stp);
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 40b8102fdadb..64298a867589 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -37,7 +37,7 @@ static void set_topology_timer(void);
static void topology_work_fn(struct work_struct *work);
static struct sysinfo_15_1_x *tl_info;
-static int topology_enabled = 1;
+static bool topology_enabled = true;
static DECLARE_WORK(topology_work, topology_work_fn);
/*
@@ -444,10 +444,7 @@ static const struct cpumask *cpu_book_mask(int cpu)
static int __init early_parse_topology(char *p)
{
- if (strncmp(p, "off", 3))
- return 0;
- topology_enabled = 0;
- return 0;
+ return kstrtobool(p, &topology_enabled);
}
early_param("topology", early_parse_topology);
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index 839592ca265c..479550dae80e 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -3,7 +3,7 @@
#
obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o
-obj-y += page-states.o gup.o extable.o pageattr.o mem_detect.o
+obj-y += page-states.o gup.o pageattr.o mem_detect.o
obj-$(CONFIG_CMM) += cmm.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c
deleted file mode 100644
index 18c8b819b0aa..000000000000
--- a/arch/s390/mm/extable.c
+++ /dev/null
@@ -1,85 +0,0 @@
-#include <linux/module.h>
-#include <linux/sort.h>
-#include <asm/uaccess.h>
-
-/*
- * Search one exception table for an entry corresponding to the
- * given instruction address, and return the address of the entry,
- * or NULL if none is found.
- * We use a binary search, and thus we assume that the table is
- * already sorted.
- */
-const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
- const struct exception_table_entry *last,
- unsigned long value)
-{
- const struct exception_table_entry *mid;
- unsigned long addr;
-
- while (first <= last) {
- mid = ((last - first) >> 1) + first;
- addr = extable_insn(mid);
- if (addr < value)
- first = mid + 1;
- else if (addr > value)
- last = mid - 1;
- else
- return mid;
- }
- return NULL;
-}
-
-/*
- * The exception table needs to be sorted so that the binary
- * search that we use to find entries in it works properly.
- * This is used both for the kernel exception table and for
- * the exception tables of modules that get loaded.
- *
- */
-static int cmp_ex(const void *a, const void *b)
-{
- const struct exception_table_entry *x = a, *y = b;
-
- /* This compare is only valid after normalization. */
- return x->insn - y->insn;
-}
-
-void sort_extable(struct exception_table_entry *start,
- struct exception_table_entry *finish)
-{
- struct exception_table_entry *p;
- int i;
-
- /* Normalize entries to being relative to the start of the section */
- for (p = start, i = 0; p < finish; p++, i += 8) {
- p->insn += i;
- p->fixup += i + 4;
- }
- sort(start, finish - start, sizeof(*start), cmp_ex, NULL);
- /* Denormalize all entries */
- for (p = start, i = 0; p < finish; p++, i += 8) {
- p->insn -= i;
- p->fixup -= i + 4;
- }
-}
-
-#ifdef CONFIG_MODULES
-/*
- * If the exception table is sorted, any referring to the module init
- * will be at the beginning or the end.
- */
-void trim_init_extable(struct module *m)
-{
- /* Trim the beginning */
- while (m->num_exentries &&
- within_module_init(extable_insn(&m->extable[0]), m)) {
- m->extable++;
- m->num_exentries--;
- }
- /* Trim the end */
- while (m->num_exentries &&
- within_module_init(extable_insn(&m->extable[m->num_exentries-1]), m))
- m->num_exentries--;
-}
-#endif /* CONFIG_MODULES */
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index ef7d6c8fea66..d27fccbad7c1 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -94,16 +94,15 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
pgd_populate(&init_mm, pg_dir, pu_dir);
}
pu_dir = pud_offset(pg_dir, address);
-#ifndef CONFIG_DEBUG_PAGEALLOC
if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address &&
- !(address & ~PUD_MASK) && (address + PUD_SIZE <= end)) {
+ !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) &&
+ !debug_pagealloc_enabled()) {
pud_val(*pu_dir) = __pa(address) |
_REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE |
(ro ? _REGION_ENTRY_PROTECT : 0);
address += PUD_SIZE;
continue;
}
-#endif
if (pud_none(*pu_dir)) {
pm_dir = vmem_pmd_alloc();
if (!pm_dir)
@@ -111,9 +110,9 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
pud_populate(&init_mm, pu_dir, pm_dir);
}
pm_dir = pmd_offset(pu_dir, address);
-#ifndef CONFIG_DEBUG_PAGEALLOC
if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address &&
- !(address & ~PMD_MASK) && (address + PMD_SIZE <= end)) {
+ !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) &&
+ !debug_pagealloc_enabled()) {
pmd_val(*pm_dir) = __pa(address) |
_SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE |
_SEGMENT_ENTRY_YOUNG |
@@ -121,7 +120,6 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
address += PMD_SIZE;
continue;
}
-#endif
if (pmd_none(*pm_dir)) {
pt_dir = vmem_pte_alloc(address);
if (!pt_dir)
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index e13da05505dc..be0c6de84fba 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -43,6 +43,7 @@ config SUPERH
select OLD_SIGSUSPEND
select OLD_SIGACTION
select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_NMI
help
The SuperH is a RISC processor targeted for use in embedded systems
and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 57ffaf285c2f..a900e7f35f6a 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -79,6 +79,7 @@ config SPARC64
select NO_BOOTMEM
select HAVE_ARCH_AUDITSYSCALL
select ARCH_SUPPORTS_ATOMIC_RMW
+ select HAVE_NMI
config ARCH_DEFCONFIG
string
diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h
index 830502fe62b4..6f251c4d613e 100644
--- a/arch/sparc/include/asm/compat.h
+++ b/arch/sparc/include/asm/compat.h
@@ -307,4 +307,11 @@ static inline int is_compat_task(void)
return test_thread_flag(TIF_32BIT);
}
+static inline bool in_compat_syscall(void)
+{
+ /* Vector 0x110 is LINUX_32BIT_SYSCALL_TRAP */
+ return pt_regs_trap_type(current_pt_regs()) == 0x110;
+}
+#define in_compat_syscall in_compat_syscall
+
#endif /* _ASM_SPARC64_COMPAT_H */
diff --git a/arch/sparc/include/asm/syscall.h b/arch/sparc/include/asm/syscall.h
index 49f71fd5b56e..1757cd6c521b 100644
--- a/arch/sparc/include/asm/syscall.h
+++ b/arch/sparc/include/asm/syscall.h
@@ -3,6 +3,7 @@
#include <uapi/linux/audit.h>
#include <linux/kernel.h>
+#include <linux/compat.h>
#include <linux/sched.h>
#include <asm/ptrace.h>
#include <asm/thread_info.h>
@@ -128,7 +129,13 @@ static inline void syscall_set_arguments(struct task_struct *task,
static inline int syscall_get_arch(void)
{
- return is_32bit_task() ? AUDIT_ARCH_SPARC : AUDIT_ARCH_SPARC64;
+#if defined(CONFIG_SPARC64) && defined(CONFIG_COMPAT)
+ return in_compat_syscall() ? AUDIT_ARCH_SPARC : AUDIT_ARCH_SPARC64;
+#elif defined(CONFIG_SPARC64)
+ return AUDIT_ARCH_SPARC64;
+#else
+ return AUDIT_ARCH_SPARC;
+#endif
}
#endif /* __ASM_SPARC_SYSCALL_H */
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index c690c8e16a96..b489e9759518 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -264,7 +264,7 @@ static unsigned long mmap_rnd(void)
unsigned long rnd = 0UL;
if (current->flags & PF_RANDOMIZE) {
- unsigned long val = get_random_int();
+ unsigned long val = get_random_long();
if (test_thread_flag(TIF_32BIT))
rnd = (val % (1UL << (23UL-PAGE_SHIFT)));
else
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index de4a4fff9323..adccc8625019 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -29,6 +29,7 @@ config TILE
select HAVE_DEBUG_STACKOVERFLOW
select ARCH_WANT_FRAME_POINTERS
select HAVE_CONTEXT_TRACKING
+ select HAVE_NMI if USE_PMC
select EDAC_SUPPORT
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index d4e1fc41d06d..a0582b7f41d3 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -896,17 +896,15 @@ void __init pgtable_cache_init(void)
panic("pgtable_cache_init(): Cannot create pgd cache");
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
-static long __write_once initfree;
-#else
static long __write_once initfree = 1;
-#endif
+static bool __write_once set_initfree_done;
/* Select whether to free (1) or mark unusable (0) the __init pages. */
static int __init set_initfree(char *str)
{
long val;
if (kstrtol(str, 0, &val) == 0) {
+ set_initfree_done = true;
initfree = val;
pr_info("initfree: %s free init pages\n",
initfree ? "will" : "won't");
@@ -919,6 +917,11 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
unsigned long addr = (unsigned long) begin;
+ /* Prefer user request first */
+ if (!set_initfree_done) {
+ if (debug_pagealloc_enabled())
+ initfree = 0;
+ }
if (kdata_huge && !initfree) {
pr_warn("Warning: ignoring initfree=0: incompatible with kdata=huge\n");
initfree = 1;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 405b1858134b..248faa89a630 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,6 +28,7 @@ config X86
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_MMIO_FLUSH
select ARCH_HAS_SG_CHAIN
@@ -129,6 +130,7 @@ config X86
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MIXED_BREAKPOINTS_REGS
+ select HAVE_NMI
select HAVE_OPROFILE
select HAVE_OPTPROBES
select HAVE_PCSPKR_PLATFORM
@@ -1409,8 +1411,10 @@ config NUMA_EMU
config NODES_SHIFT
int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
- range 1 10
- default "10" if MAXSMP
+ range 1 10 if !NR_ZONES_EXTENDED
+ range 1 9 if NR_ZONES_EXTENDED
+ default "10" if MAXSMP && !NR_ZONES_EXTENDED
+ default "9" if MAXSMP && NR_ZONES_EXTENDED
default "6" if X86_64
default "3"
depends on NEED_MULTIPLE_NODES
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index bbe1a62efc02..5f93ca072b21 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -10,6 +10,12 @@
#
KASAN_SANITIZE := n
+# Kernel does not boot with kcov instrumentation here.
+# One of the problems observed was insertion of __sanitizer_cov_trace_pc()
+# callback into middle of per-cpu data enabling code. Thus the callback observed
+# inconsistent state and crashed. We are interested mostly in syscall coverage,
+# so boot code is not interesting anyway.
+KCOV_INSTRUMENT := n
# If you want to preset the SVGA mode, uncomment the next line and
# set SVGA_MODE to whatever number you want.
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f9ce75d80101..ad9e9fa5bb11 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -17,6 +17,8 @@
# compressed vmlinux.bin.all + u32 size of vmlinux.bin.all
KASAN_SANITIZE := n
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index c854541d93ff..5a1993905ace 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -5,6 +5,8 @@
KBUILD_CFLAGS += $(DISABLE_LTO)
KASAN_SANITIZE := n
UBSAN_SANITIZE := n
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
VDSO64-$(CONFIG_X86_64) := y
VDSOX32-$(CONFIG_X86_X32_ABI) := y
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index acdee09228b3..ebb102e1bbc7 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -316,9 +316,10 @@ static inline bool is_x32_task(void)
return false;
}
-static inline bool is_compat_task(void)
+static inline bool in_compat_syscall(void)
{
return is_ia32_task() || is_x32_task();
}
+#define in_compat_syscall in_compat_syscall /* override the generic impl */
#endif /* _ASM_X86_COMPAT_H */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 24938852db30..21b66dbf3601 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -58,7 +58,7 @@ int ftrace_int3_handler(struct pt_regs *regs);
#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1
static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
{
- if (is_compat_task())
+ if (in_compat_syscall())
return true;
return false;
}
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a4a30e4b2d34..5664bf927264 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -106,9 +106,8 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
struct exception_table_entry {
int insn, fixup;
};
-/* This is not the generic standard exception_table_entry format */
-#define ARCH_HAS_SORT_EXTABLE
-#define ARCH_HAS_SEARCH_EXTABLE
+
+#define ARCH_HAS_RELATIVE_EXTABLE
extern int fixup_exception(struct pt_regs *regs);
extern int early_fixup_exception(unsigned long *ip);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b1b78ffe01d0..4648960d1c4c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -19,6 +19,11 @@ endif
KASAN_SANITIZE_head$(BITS).o := n
KASAN_SANITIZE_dumpstack.o := n
KASAN_SANITIZE_dumpstack_$(BITS).o := n
+# If instrumentation of this dir is enabled, boot hangs during first second.
+# Probably could be more selective here, but note that files related to irqs,
+# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to
+# non-deterministic coverage.
+KCOV_INSTRUMENT := n
CFLAGS_irq.o := -I$(src)/../include/asm/trace
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 6e85f713641d..0a2bb1f62e72 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -227,19 +227,11 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
return 0;
}
-static int gart_fix_e820 __initdata = 1;
+static bool gart_fix_e820 __initdata = true;
static int __init parse_gart_mem(char *p)
{
- if (!p)
- return -EINVAL;
-
- if (!strncmp(p, "off", 3))
- gart_fix_e820 = 0;
- else if (!strncmp(p, "on", 2))
- gart_fix_e820 = 1;
-
- return 0;
+ return kstrtobool(p, &gart_fix_e820);
}
early_param("gart_fix_e820", parse_gart_mem);
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 8bb12ddc5db8..8f2a3d75c48c 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,6 +2,10 @@
# Makefile for local APIC drivers and for the IO-APIC code
#
+# Leads to non-deterministic coverage that is not a function of syscall inputs.
+# In particualr, smp_apic_timer_interrupt() is called in random places.
+KCOV_INSTRUMENT := n
+
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o vector.o
obj-y += hw_nmi.o
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 045e424fb368..7788ce643bf4 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -18,7 +18,6 @@
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/delay.h>
-#include <linux/seq_buf.h>
#ifdef CONFIG_HARDLOCKUP_DETECTOR
u64 hw_nmi_get_sample_period(int watchdog_thresh)
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index d8cf3338a035..c9d2a0115702 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -8,6 +8,10 @@ CFLAGS_REMOVE_common.o = -pg
CFLAGS_REMOVE_perf_event.o = -pg
endif
+# If these files are instrumented, boot hangs during the first second.
+KCOV_INSTRUMENT_common.o := n
+KCOV_INSTRUMENT_perf_event.o := n
+
# Make sure load_percpu_segment has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_common.o := $(nostackp)
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 9c30acfadae2..32e5699eadfe 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -265,9 +265,8 @@ int __die(const char *str, struct pt_regs *regs, long err)
#ifdef CONFIG_SMP
printk("SMP ");
#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC ");
-#endif
+ if (debug_pagealloc_enabled())
+ printk("DEBUG_PAGEALLOC ");
#ifdef CONFIG_KASAN
printk("KASAN");
#endif
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index ba7fbba9831b..fc3389fc47a2 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -337,6 +337,7 @@ void arch_crash_save_vmcoreinfo(void)
#endif
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
kaslr_offset());
+ VMCOREINFO_PHYS_BASE(phys_base);
}
/* arch-dependent functionality related to kexec file-based syscall */
@@ -538,3 +539,48 @@ overflow:
return -ENOEXEC;
}
#endif /* CONFIG_KEXEC_FILE */
+
+static int
+kexec_mark_range(unsigned long start, unsigned long end, bool protect)
+{
+ struct page *page;
+ unsigned int nr_pages;
+
+ /*
+ * For physical range: [start, end]. We must skip the unassigned
+ * crashk resource with zero-valued "end" member.
+ */
+ if (!end || start > end)
+ return 0;
+
+ page = pfn_to_page(start >> PAGE_SHIFT);
+ nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+ if (protect)
+ return set_pages_ro(page, nr_pages);
+ else
+ return set_pages_rw(page, nr_pages);
+}
+
+static void kexec_mark_crashkres(bool protect)
+{
+ unsigned long control;
+
+ kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);
+
+ /* Don't touch the control code page used in crash_kexec().*/
+ control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
+ /* Control code page is located in the 2nd page. */
+ kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
+ control += KEXEC_CONTROL_PAGE_SIZE;
+ kexec_mark_range(control, crashk_res.end, protect);
+}
+
+void arch_kexec_protect_crashkres(void)
+{
+ kexec_mark_crashkres(true);
+}
+
+void arch_kexec_unprotect_crashkres(void)
+{
+ kexec_mark_crashkres(false);
+}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b9d99e0f82c4..71a18a22ae2f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -476,7 +476,7 @@ void set_personality_ia32(bool x32)
if (current->mm)
current->mm->context.ia32_compat = TIF_X32;
current->personality &= ~READ_IMPLIES_EXEC;
- /* is_compat_task() uses the presence of the x32
+ /* in_compat_syscall() uses the presence of the x32
syscall bit flag to determine compat status */
current_thread_info()->status &= ~TS_COMPAT;
} else {
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index a501fa25da41..fefca949c092 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,6 +2,9 @@
# Makefile for x86 specific library files.
#
+# Produces uninteresting flaky coverage.
+KCOV_INSTRUMENT_delay.o := n
+
inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
quiet_cmd_inat_tables = GEN $@
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index f9d38a48e3c8..147def619675 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,3 +1,6 @@
+# Kernel does not boot with instrumentation of tlb.c.
+KCOV_INSTRUMENT_tlb.o := n
+
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o gup.o setup_nx.o
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 903ec1e9c326..0b172b656f43 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -1,14 +1,9 @@
+
#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/sort.h>
+
#include <asm/uaccess.h>
static inline unsigned long
-ex_insn_addr(const struct exception_table_entry *x)
-{
- return (unsigned long)&x->insn + x->insn;
-}
-static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
return (unsigned long)&x->fixup + x->fixup;
@@ -70,100 +65,3 @@ int __init early_fixup_exception(unsigned long *ip)
return 0;
}
-
-/*
- * Search one exception table for an entry corresponding to the
- * given instruction address, and return the address of the entry,
- * or NULL if none is found.
- * We use a binary search, and thus we assume that the table is
- * already sorted.
- */
-const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
- const struct exception_table_entry *last,
- unsigned long value)
-{
- while (first <= last) {
- const struct exception_table_entry *mid;
- unsigned long addr;
-
- mid = ((last - first) >> 1) + first;
- addr = ex_insn_addr(mid);
- if (addr < value)
- first = mid + 1;
- else if (addr > value)
- last = mid - 1;
- else
- return mid;
- }
- return NULL;
-}
-
-/*
- * The exception table needs to be sorted so that the binary
- * search that we use to find entries in it works properly.
- * This is used both for the kernel exception table and for
- * the exception tables of modules that get loaded.
- *
- */
-static int cmp_ex(const void *a, const void *b)
-{
- const struct exception_table_entry *x = a, *y = b;
-
- /*
- * This value will always end up fittin in an int, because on
- * both i386 and x86-64 the kernel symbol-reachable address
- * space is < 2 GiB.
- *
- * This compare is only valid after normalization.
- */
- return x->insn - y->insn;
-}
-
-void sort_extable(struct exception_table_entry *start,
- struct exception_table_entry *finish)
-{
- struct exception_table_entry *p;
- int i;
-
- /* Convert all entries to being relative to the start of the section */
- i = 0;
- for (p = start; p < finish; p++) {
- p->insn += i;
- i += 4;
- p->fixup += i;
- i += 4;
- }
-
- sort(start, finish - start, sizeof(struct exception_table_entry),
- cmp_ex, NULL);
-
- /* Denormalize all entries */
- i = 0;
- for (p = start; p < finish; p++) {
- p->insn -= i;
- i += 4;
- p->fixup -= i;
- i += 4;
- }
-}
-
-#ifdef CONFIG_MODULES
-/*
- * If the exception table is sorted, any referring to the module init
- * will be at the beginning or the end.
- */
-void trim_init_extable(struct module *m)
-{
- /*trim the beginning*/
- while (m->num_exentries &&
- within_module_init(ex_insn_addr(&m->extable[0]), m)) {
- m->extable++;
- m->num_exentries--;
- }
- /*trim the end*/
- while (m->num_exentries &&
- within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m))
- m->num_exentries--;
-}
-#endif /* CONFIG_MODULES */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 493f54172b4a..9d56f271d519 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -150,13 +150,14 @@ static int page_size_mask;
static void __init probe_page_size_mask(void)
{
-#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
+#if !defined(CONFIG_KMEMCHECK)
/*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
+ * use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
* large pages into small in interrupt context, etc.
*/
- if (cpu_has_pse)
+ if (cpu_has_pse && !debug_pagealloc_enabled())
page_size_mask |= 1 << PG_LEVEL_2M;
#endif
@@ -666,21 +667,22 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
* mark them not present - any buggy init-section access will
* create a kernel page fault:
*/
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n",
- begin, end - 1);
- set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
-#else
- /*
- * We just marked the kernel text read only above, now that
- * we are going to free part of that, we need to make that
- * writeable and non-executable first.
- */
- set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
- set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
+ if (debug_pagealloc_enabled()) {
+ pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n",
+ begin, end - 1);
+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+ } else {
+ /*
+ * We just marked the kernel text read only above, now that
+ * we are going to free part of that, we need to make that
+ * writeable and non-executable first.
+ */
+ set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
+ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
- free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what);
-#endif
+ free_reserved_area((void *)begin, (void *)end,
+ POISON_FREE_INITMEM, what);
+ }
}
void free_initmem(void)
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 96bd1e2bffaf..72bb52f93c3d 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -71,12 +71,12 @@ unsigned long arch_mmap_rnd(void)
if (mmap_is_ia32())
#ifdef CONFIG_COMPAT
- rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1);
+ rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
#else
- rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
+ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
#endif
else
- rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
+ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
return rnd << PAGE_SHIFT;
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 632d34d20237..f6b206fb6403 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -106,12 +106,6 @@ static inline unsigned long highmap_end_pfn(void)
#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-# define debug_pagealloc 1
-#else
-# define debug_pagealloc 0
-#endif
-
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
@@ -708,10 +702,10 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
{
struct page *base;
- if (!debug_pagealloc)
+ if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
- if (!debug_pagealloc)
+ if (!debug_pagealloc_enabled())
spin_lock(&cpa_lock);
if (!base)
return -ENOMEM;
@@ -1326,10 +1320,10 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
cpa->numpages = 1;
- if (!debug_pagealloc)
+ if (!debug_pagealloc_enabled())
spin_lock(&cpa_lock);
ret = __change_page_attr(cpa, checkalias);
- if (!debug_pagealloc)
+ if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
if (ret)
return ret;
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 3e75fcf6b836..35129dcdeb71 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -7,6 +7,8 @@
#
#
KASAN_SANITIZE := n
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
always := realmode.bin realmode.relocs
diff --git a/block/genhd.c b/block/genhd.c
index 9f42526b4d62..eb05dfc1dffe 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -877,7 +877,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/block/partition-generic.c b/block/partition-generic.c
index fefd01b496a0..5d8701941054 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -217,10 +217,21 @@ static void part_release(struct device *dev)
kfree(p);
}
+static int part_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ struct hd_struct *part = dev_to_part(dev);
+
+ add_uevent_var(env, "PARTN=%u", part->partno);
+ if (part->info && part->info->volname[0])
+ add_uevent_var(env, "PARTNAME=%s", part->info->volname);
+ return 0;
+}
+
struct device_type part_type = {
.name = "partition",
.groups = part_attr_groups,
.release = part_release,
+ .uevent = part_uevent,
};
static void delete_partition_rcu_cb(struct rcu_head *head)
diff --git a/drivers/ata/pata_hpt366.c b/drivers/ata/pata_hpt366.c
index 0038dc4c06c7..e5fb7525a5df 100644
--- a/drivers/ata/pata_hpt366.c
+++ b/drivers/ata/pata_hpt366.c
@@ -176,17 +176,14 @@ static int hpt_dma_blacklisted(const struct ata_device *dev, char *modestr,
const char * const list[])
{
unsigned char model_num[ATA_ID_PROD_LEN + 1];
- int i = 0;
+ int i;
ata_id_c_string(dev->id, model_num, ATA_ID_PROD, sizeof(model_num));
- while (list[i] != NULL) {
- if (!strcmp(list[i], model_num)) {
- pr_warn("%s is not supported for %s\n",
- modestr, list[i]);
- return 1;
- }
- i++;
+ i = match_string(list, -1, model_num);
+ if (i >= 0) {
+ pr_warn("%s is not supported for %s\n", modestr, list[i]);
+ return 1;
}
return 0;
}
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 213456c2b123..f46dba8b7092 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -251,7 +251,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t
return ret;
}
-static int memory_block_change_state(struct memory_block *mem,
+int memory_block_change_state(struct memory_block *mem,
unsigned long to_state, unsigned long from_state_req)
{
int ret = 0;
@@ -439,6 +439,37 @@ print_block_size(struct device *dev, struct device_attribute *attr,
static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);
/*
+ * Memory auto online policy.
+ */
+
+static ssize_t
+show_auto_online_blocks(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ if (memhp_auto_online)
+ return sprintf(buf, "online\n");
+ else
+ return sprintf(buf, "offline\n");
+}
+
+static ssize_t
+store_auto_online_blocks(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (sysfs_streq(buf, "online"))
+ memhp_auto_online = true;
+ else if (sysfs_streq(buf, "offline"))
+ memhp_auto_online = false;
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static DEVICE_ATTR(auto_online_blocks, 0644, show_auto_online_blocks,
+ store_auto_online_blocks);
+
+/*
* Some architectures will have custom drivers to do this, and
* will not need to do it from userspace. The fake hot-add code
* as well as ppc64 will do all of their discovery in userspace
@@ -746,6 +777,7 @@ static struct attribute *memory_root_attrs[] = {
#endif
&dev_attr_block_size_bytes.attr,
+ &dev_attr_auto_online_blocks.attr,
NULL
};
diff --git a/drivers/base/property.c b/drivers/base/property.c
index c359351d50f1..167f0a91258d 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -647,7 +647,7 @@ int fwnode_property_match_string(struct fwnode_handle *fwnode,
const char *propname, const char *string)
{
const char **values;
- int nval, ret, i;
+ int nval, ret;
nval = fwnode_property_read_string_array(fwnode, propname, NULL, 0);
if (nval < 0)
@@ -664,13 +664,9 @@ int fwnode_property_match_string(struct fwnode_handle *fwnode,
if (ret < 0)
goto out;
- ret = -ENODATA;
- for (i = 0; i < nval; i++) {
- if (!strcmp(values[i], string)) {
- ret = i;
- break;
- }
- }
+ ret = match_string(values, nval, string);
+ if (ret < 0)
+ ret = -ENODATA;
out:
kfree(values);
return ret;
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index 3ef42e563bb5..83ee2a4bac1e 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -183,6 +183,18 @@ static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm)
return true;
}
+static int zcomp_strm_multi_num_avail_streams(struct zcomp *comp)
+{
+ int avail;
+ struct zcomp_strm_multi *zs = comp->stream;
+
+ spin_lock(&zs->strm_lock);
+ avail = zs->avail_strm;
+ spin_unlock(&zs->strm_lock);
+
+ return avail;
+}
+
static void zcomp_strm_multi_destroy(struct zcomp *comp)
{
struct zcomp_strm_multi *zs = comp->stream;
@@ -206,6 +218,7 @@ static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm)
comp->strm_find = zcomp_strm_multi_find;
comp->strm_release = zcomp_strm_multi_release;
comp->set_max_streams = zcomp_strm_multi_set_max_streams;
+ comp->num_avail_streams = zcomp_strm_multi_num_avail_streams;
zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL);
if (!zs)
return -ENOMEM;
@@ -246,6 +259,11 @@ static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm)
return false;
}
+static int zcomp_strm_single_num_avail_streams(struct zcomp *comp)
+{
+ return 1;
+}
+
static void zcomp_strm_single_destroy(struct zcomp *comp)
{
struct zcomp_strm_single *zs = comp->stream;
@@ -261,6 +279,7 @@ static int zcomp_strm_single_create(struct zcomp *comp)
comp->strm_find = zcomp_strm_single_find;
comp->strm_release = zcomp_strm_single_release;
comp->set_max_streams = zcomp_strm_single_set_max_streams;
+ comp->num_avail_streams = zcomp_strm_single_num_avail_streams;
zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL);
if (!zs)
return -ENOMEM;
@@ -304,6 +323,11 @@ bool zcomp_set_max_streams(struct zcomp *comp, int num_strm)
return comp->set_max_streams(comp, num_strm);
}
+int zcomp_num_avail_streams(struct zcomp *comp)
+{
+ return comp->num_avail_streams(comp);
+}
+
struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
{
return comp->strm_find(comp);
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
index b7d2a4bcae54..a69a3ca8f6a2 100644
--- a/drivers/block/zram/zcomp.h
+++ b/drivers/block/zram/zcomp.h
@@ -47,6 +47,7 @@ struct zcomp {
struct zcomp_strm *(*strm_find)(struct zcomp *comp);
void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm);
bool (*set_max_streams)(struct zcomp *comp, int num_strm);
+ int (*num_avail_streams)(struct zcomp *comp);
void (*destroy)(struct zcomp *comp);
};
@@ -66,4 +67,5 @@ int zcomp_decompress(struct zcomp *comp, const unsigned char *src,
size_t src_len, unsigned char *dst);
bool zcomp_set_max_streams(struct zcomp *comp, int num_strm);
+int zcomp_num_avail_streams(struct zcomp *comp);
#endif /* _ZCOMP_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 370c2f76016d..46055dbc4095 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -429,6 +429,7 @@ static ssize_t mm_stat_show(struct device *dev,
struct zs_pool_stats pool_stats;
u64 orig_size, mem_used = 0;
long max_used;
+ int avail_streams = 0;
ssize_t ret;
memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
@@ -437,20 +438,22 @@ static ssize_t mm_stat_show(struct device *dev,
if (init_done(zram)) {
mem_used = zs_get_total_pages(zram->meta->mem_pool);
zs_pool_stats(zram->meta->mem_pool, &pool_stats);
+ avail_streams = zcomp_num_avail_streams(zram->comp);
}
orig_size = atomic64_read(&zram->stats.pages_stored);
max_used = atomic_long_read(&zram->stats.max_used_pages);
ret = scnprintf(buf, PAGE_SIZE,
- "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8d\n",
orig_size << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.compr_data_size),
mem_used << PAGE_SHIFT,
zram->limit_pages << PAGE_SHIFT,
max_used << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.zero_pages),
- pool_stats.pages_compacted);
+ pool_stats.pages_compacted,
+ avail_streams);
up_read(&zram->init_lock);
return ret;
diff --git a/drivers/char/random.c b/drivers/char/random.c
index d0da5d852d41..b583e5336630 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1819,6 +1819,28 @@ unsigned int get_random_int(void)
EXPORT_SYMBOL(get_random_int);
/*
+ * Same as get_random_int(), but returns unsigned long.
+ */
+unsigned long get_random_long(void)
+{
+ __u32 *hash;
+ unsigned long ret;
+
+ if (arch_get_random_long(&ret))
+ return ret;
+
+ hash = get_cpu_var(get_random_int_hash);
+
+ hash[0] += current->pid + jiffies + random_get_entropy();
+ md5_transform(hash, random_int_secret);
+ ret = *(unsigned long *)hash;
+ put_cpu_var(get_random_int_hash);
+
+ return ret;
+}
+EXPORT_SYMBOL(get_random_long);
+
+/*
* randomize_range() returns a start address such that
*
* [...... <range> .....]
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
index e4f43125e0fb..f039cfadf17b 100644
--- a/drivers/dma/iop-adma.c
+++ b/drivers/dma/iop-adma.c
@@ -1300,10 +1300,10 @@ static int iop_adma_probe(struct platform_device *pdev)
* note: writecombine gives slightly better performance, but
* requires that we explicitly flush the writes
*/
- adev->dma_desc_pool_virt = dma_alloc_writecombine(&pdev->dev,
- plat_data->pool_size,
- &adev->dma_desc_pool,
- GFP_KERNEL);
+ adev->dma_desc_pool_virt = dma_alloc_wc(&pdev->dev,
+ plat_data->pool_size,
+ &adev->dma_desc_pool,
+ GFP_KERNEL);
if (!adev->dma_desc_pool_virt) {
ret = -ENOMEM;
goto err_free_adev;
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c
index 14091f878f80..3922a5d56806 100644
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -964,8 +964,8 @@ mv_xor_channel_add(struct mv_xor_device *xordev,
* requires that we explicitly flush the writes
*/
mv_chan->dma_desc_pool_virt =
- dma_alloc_writecombine(&pdev->dev, MV_XOR_POOL_SIZE,
- &mv_chan->dma_desc_pool, GFP_KERNEL);
+ dma_alloc_wc(&pdev->dev, MV_XOR_POOL_SIZE, &mv_chan->dma_desc_pool,
+ GFP_KERNEL);
if (!mv_chan->dma_desc_pool_virt)
return ERR_PTR(-ENOMEM);
diff --git a/drivers/dma/qcom_bam_dma.c b/drivers/dma/qcom_bam_dma.c
index 5a250cdc8376..d34aef7a101b 100644
--- a/drivers/dma/qcom_bam_dma.c
+++ b/drivers/dma/qcom_bam_dma.c
@@ -502,8 +502,8 @@ static int bam_alloc_chan(struct dma_chan *chan)
return 0;
/* allocate FIFO descriptor space, but only if necessary */
- bchan->fifo_virt = dma_alloc_writecombine(bdev->dev, BAM_DESC_FIFO_SIZE,
- &bchan->fifo_phys, GFP_KERNEL);
+ bchan->fifo_virt = dma_alloc_wc(bdev->dev, BAM_DESC_FIFO_SIZE,
+ &bchan->fifo_phys, GFP_KERNEL);
if (!bchan->fifo_virt) {
dev_err(bdev->dev, "Failed to allocate desc fifo\n");
@@ -538,8 +538,8 @@ static void bam_free_chan(struct dma_chan *chan)
bam_reset_channel(bchan);
spin_unlock_irqrestore(&bchan->vc.lock, flags);
- dma_free_writecombine(bdev->dev, BAM_DESC_FIFO_SIZE, bchan->fifo_virt,
- bchan->fifo_phys);
+ dma_free_wc(bdev->dev, BAM_DESC_FIFO_SIZE, bchan->fifo_virt,
+ bchan->fifo_phys);
bchan->fifo_virt = NULL;
/* mask irq for pipe/channel */
@@ -1231,9 +1231,9 @@ static int bam_dma_remove(struct platform_device *pdev)
bam_dma_terminate_all(&bdev->channels[i].vc.chan);
tasklet_kill(&bdev->channels[i].vc.task);
- dma_free_writecombine(bdev->dev, BAM_DESC_FIFO_SIZE,
- bdev->channels[i].fifo_virt,
- bdev->channels[i].fifo_phys);
+ dma_free_wc(bdev->dev, BAM_DESC_FIFO_SIZE,
+ bdev->channels[i].fifo_virt,
+ bdev->channels[i].fifo_phys);
}
tasklet_kill(&bdev->task);
diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index 36a7c2d89a01..aee149bdf4c0 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -221,7 +221,7 @@ struct inbound_phy_packet_event {
#ifdef CONFIG_COMPAT
static void __user *u64_to_uptr(u64 value)
{
- if (is_compat_task())
+ if (in_compat_syscall())
return compat_ptr(value);
else
return (void __user *)(unsigned long)value;
@@ -229,7 +229,7 @@ static void __user *u64_to_uptr(u64 value)
static u64 uptr_to_u64(void __user *ptr)
{
- if (is_compat_task())
+ if (in_compat_syscall())
return ptr_to_compat(ptr);
else
return (u64)(unsigned long)ptr;
diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c
index c5b0d2bc1111..87d48d4e327d 100644
--- a/drivers/firmware/efi/efivars.c
+++ b/drivers/firmware/efi/efivars.c
@@ -231,7 +231,7 @@ sanity_check(struct efi_variable *var, efi_char16_t *name, efi_guid_t vendor,
static inline bool is_compat(void)
{
- if (IS_ENABLED(CONFIG_COMPAT) && is_compat_task())
+ if (IS_ENABLED(CONFIG_COMPAT) && in_compat_syscall())
return true;
return false;
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index aaf9c0bab42e..37cc9e395edb 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -23,6 +23,8 @@ KBUILD_CFLAGS := $(cflags-y) -DDISABLE_BRANCH_PROFILING \
GCOV_PROFILE := n
KASAN_SANITIZE := n
UBSAN_SANITIZE := n
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
lib-y := efi-stub-helper.o
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index d2b49c026cf6..07ac724e3ec9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -107,7 +107,7 @@ static int kfd_open(struct inode *inode, struct file *filep)
if (iminor(inode) != 0)
return -ENODEV;
- is_32bit_user_mode = is_compat_task();
+ is_32bit_user_mode = in_compat_syscall();
if (is_32bit_user_mode == true) {
dev_warn(kfd_device,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a902ae037398..ac005796b71c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -311,7 +311,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
goto err_process_pqm_init;
/* init process apertures*/
- process->is_32bit_user_mode = is_compat_task();
+ process->is_32bit_user_mode = in_compat_syscall();
if (kfd_init_apertures(process) != 0)
goto err_init_apretures;
diff --git a/drivers/gpu/drm/drm_edid_load.c b/drivers/gpu/drm/drm_edid_load.c
index 698b8c3b09d9..9a401aed98e0 100644
--- a/drivers/gpu/drm/drm_edid_load.c
+++ b/drivers/gpu/drm/drm_edid_load.c
@@ -170,16 +170,11 @@ static void *edid_load(struct drm_connector *connector, const char *name,
int i, valid_extensions = 0;
bool print_bad_edid = !connector->bad_edid_counter || (drm_debug & DRM_UT_KMS);
- builtin = 0;
- for (i = 0; i < GENERIC_EDIDS; i++) {
- if (strcmp(name, generic_edid_name[i]) == 0) {
- fwdata = generic_edid[i];
- fwsize = sizeof(generic_edid[i]);
- builtin = 1;
- break;
- }
- }
- if (!builtin) {
+ builtin = match_string(generic_edid_name, GENERIC_EDIDS, name);
+ if (builtin >= 0) {
+ fwdata = generic_edid[builtin];
+ fwsize = sizeof(generic_edid[builtin]);
+ } else {
struct platform_device *pdev;
int err;
@@ -252,7 +247,7 @@ static void *edid_load(struct drm_connector *connector, const char *name,
}
DRM_INFO("Got %s EDID base block and %d extension%s from "
- "\"%s\" for connector \"%s\"\n", builtin ? "built-in" :
+ "\"%s\" for connector \"%s\"\n", (builtin >= 0) ? "built-in" :
"external", valid_extensions, valid_extensions == 1 ? "" : "s",
name, connector_name);
diff --git a/drivers/gpu/drm/drm_gem_cma_helper.c b/drivers/gpu/drm/drm_gem_cma_helper.c
index e5df53b6e229..1f500a1b9969 100644
--- a/drivers/gpu/drm/drm_gem_cma_helper.c
+++ b/drivers/gpu/drm/drm_gem_cma_helper.c
@@ -109,8 +109,8 @@ struct drm_gem_cma_object *drm_gem_cma_create(struct drm_device *drm,
if (IS_ERR(cma_obj))
return cma_obj;
- cma_obj->vaddr = dma_alloc_writecombine(drm->dev, size,
- &cma_obj->paddr, GFP_KERNEL | __GFP_NOWARN);
+ cma_obj->vaddr = dma_alloc_wc(drm->dev, size, &cma_obj->paddr,
+ GFP_KERNEL | __GFP_NOWARN);
if (!cma_obj->vaddr) {
dev_err(drm->dev, "failed to allocate buffer with size %zu\n",
size);
@@ -192,8 +192,8 @@ void drm_gem_cma_free_object(struct drm_gem_object *gem_obj)
cma_obj = to_drm_gem_cma_obj(gem_obj);
if (cma_obj->vaddr) {
- dma_free_writecombine(gem_obj->dev->dev, cma_obj->base.size,
- cma_obj->vaddr, cma_obj->paddr);
+ dma_free_wc(gem_obj->dev->dev, cma_obj->base.size,
+ cma_obj->vaddr, cma_obj->paddr);
} else if (gem_obj->import_attach) {
drm_prime_gem_destroy(gem_obj, cma_obj->sgt);
}
@@ -324,9 +324,8 @@ static int drm_gem_cma_mmap_obj(struct drm_gem_cma_object *cma_obj,
vma->vm_flags &= ~VM_PFNMAP;
vma->vm_pgoff = 0;
- ret = dma_mmap_writecombine(cma_obj->base.dev->dev, vma,
- cma_obj->vaddr, cma_obj->paddr,
- vma->vm_end - vma->vm_start);
+ ret = dma_mmap_wc(cma_obj->base.dev->dev, vma, cma_obj->vaddr,
+ cma_obj->paddr, vma->vm_end - vma->vm_start);
if (ret)
drm_gem_vm_close(vma);
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
index a33162cf4f4c..3c1ce44483d9 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
@@ -1113,8 +1113,8 @@ struct etnaviv_cmdbuf *etnaviv_gpu_cmdbuf_new(struct etnaviv_gpu *gpu, u32 size,
if (!cmdbuf)
return NULL;
- cmdbuf->vaddr = dma_alloc_writecombine(gpu->dev, size, &cmdbuf->paddr,
- GFP_KERNEL);
+ cmdbuf->vaddr = dma_alloc_wc(gpu->dev, size, &cmdbuf->paddr,
+ GFP_KERNEL);
if (!cmdbuf->vaddr) {
kfree(cmdbuf);
return NULL;
@@ -1128,8 +1128,8 @@ struct etnaviv_cmdbuf *etnaviv_gpu_cmdbuf_new(struct etnaviv_gpu *gpu, u32 size,
void etnaviv_gpu_cmdbuf_free(struct etnaviv_cmdbuf *cmdbuf)
{
- dma_free_writecombine(cmdbuf->gpu->dev, cmdbuf->size,
- cmdbuf->vaddr, cmdbuf->paddr);
+ dma_free_wc(cmdbuf->gpu->dev, cmdbuf->size, cmdbuf->vaddr,
+ cmdbuf->paddr);
kfree(cmdbuf);
}
diff --git a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
index dfebdc4aa0f2..85dfe3674b41 100644
--- a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
+++ b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
@@ -573,10 +573,9 @@ static int omap_dmm_remove(struct platform_device *dev)
kfree(omap_dmm->engines);
if (omap_dmm->refill_va)
- dma_free_writecombine(omap_dmm->dev,
- REFILL_BUFFER_SIZE * omap_dmm->num_engines,
- omap_dmm->refill_va,
- omap_dmm->refill_pa);
+ dma_free_wc(omap_dmm->dev,
+ REFILL_BUFFER_SIZE * omap_dmm->num_engines,
+ omap_dmm->refill_va, omap_dmm->refill_pa);
if (omap_dmm->dummy_page)
__free_page(omap_dmm->dummy_page);
@@ -701,9 +700,9 @@ static int omap_dmm_probe(struct platform_device *dev)
omap_dmm->dummy_pa = page_to_phys(omap_dmm->dummy_page);
/* alloc refill memory */
- omap_dmm->refill_va = dma_alloc_writecombine(&dev->dev,
- REFILL_BUFFER_SIZE * omap_dmm->num_engines,
- &omap_dmm->refill_pa, GFP_KERNEL);
+ omap_dmm->refill_va = dma_alloc_wc(&dev->dev,
+ REFILL_BUFFER_SIZE * omap_dmm->num_engines,
+ &omap_dmm->refill_pa, GFP_KERNEL);
if (!omap_dmm->refill_va) {
dev_err(&dev->dev, "could not allocate refill memory\n");
goto fail;
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index 8495a1a4b617..359b0d7e8ef7 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -1330,8 +1330,8 @@ void omap_gem_free_object(struct drm_gem_object *obj)
omap_gem_detach_pages(obj);
if (!is_shmem(obj)) {
- dma_free_writecombine(dev->dev, obj->size,
- omap_obj->vaddr, omap_obj->paddr);
+ dma_free_wc(dev->dev, obj->size, omap_obj->vaddr,
+ omap_obj->paddr);
} else if (omap_obj->vaddr) {
vunmap(omap_obj->vaddr);
}
@@ -1395,8 +1395,8 @@ struct drm_gem_object *omap_gem_new(struct drm_device *dev,
/* attempt to allocate contiguous memory if we don't
* have DMM for remappign discontiguous buffers
*/
- omap_obj->vaddr = dma_alloc_writecombine(dev->dev, size,
- &omap_obj->paddr, GFP_KERNEL);
+ omap_obj->vaddr = dma_alloc_wc(dev->dev, size,
+ &omap_obj->paddr, GFP_KERNEL);
if (!omap_obj->vaddr) {
kfree(omap_obj);
diff --git a/drivers/gpu/drm/sti/sti_cursor.c b/drivers/gpu/drm/sti/sti_cursor.c
index 807863106b8d..bd736ace3f81 100644
--- a/drivers/gpu/drm/sti/sti_cursor.c
+++ b/drivers/gpu/drm/sti/sti_cursor.c
@@ -157,17 +157,15 @@ static void sti_cursor_atomic_update(struct drm_plane *drm_plane,
cursor->height = src_h;
if (cursor->pixmap.base)
- dma_free_writecombine(cursor->dev,
- cursor->pixmap.size,
- cursor->pixmap.base,
- cursor->pixmap.paddr);
+ dma_free_wc(cursor->dev, cursor->pixmap.size,
+ cursor->pixmap.base, cursor->pixmap.paddr);
cursor->pixmap.size = cursor->width * cursor->height;
- cursor->pixmap.base = dma_alloc_writecombine(cursor->dev,
- cursor->pixmap.size,
- &cursor->pixmap.paddr,
- GFP_KERNEL | GFP_DMA);
+ cursor->pixmap.base = dma_alloc_wc(cursor->dev,
+ cursor->pixmap.size,
+ &cursor->pixmap.paddr,
+ GFP_KERNEL | GFP_DMA);
if (!cursor->pixmap.base) {
DRM_ERROR("Failed to allocate memory for pixmap\n");
return;
@@ -252,8 +250,8 @@ struct drm_plane *sti_cursor_create(struct drm_device *drm_dev,
/* Allocate clut buffer */
size = 0x100 * sizeof(unsigned short);
- cursor->clut = dma_alloc_writecombine(dev, size, &cursor->clut_paddr,
- GFP_KERNEL | GFP_DMA);
+ cursor->clut = dma_alloc_wc(dev, size, &cursor->clut_paddr,
+ GFP_KERNEL | GFP_DMA);
if (!cursor->clut) {
DRM_ERROR("Failed to allocate memory for cursor clut\n");
@@ -286,7 +284,7 @@ struct drm_plane *sti_cursor_create(struct drm_device *drm_dev,
return &cursor->plane.drm_plane;
err_plane:
- dma_free_writecombine(dev, size, cursor->clut, cursor->clut_paddr);
+ dma_free_wc(dev, size, cursor->clut, cursor->clut_paddr);
err_clut:
devm_kfree(dev, cursor);
return NULL;
diff --git a/drivers/gpu/drm/sti/sti_gdp.c b/drivers/gpu/drm/sti/sti_gdp.c
index f9a1d92c9d95..514551c857bb 100644
--- a/drivers/gpu/drm/sti/sti_gdp.c
+++ b/drivers/gpu/drm/sti/sti_gdp.c
@@ -312,8 +312,7 @@ static void sti_gdp_init(struct sti_gdp *gdp)
/* Allocate all the nodes within a single memory page */
size = sizeof(struct sti_gdp_node) *
GDP_NODE_PER_FIELD * GDP_NODE_NB_BANK;
- base = dma_alloc_writecombine(gdp->dev,
- size, &dma_addr, GFP_KERNEL | GFP_DMA);
+ base = dma_alloc_wc(gdp->dev, size, &dma_addr, GFP_KERNEL | GFP_DMA);
if (!base) {
DRM_ERROR("Failed to allocate memory for GDP node\n");
diff --git a/drivers/gpu/drm/sti/sti_hqvdp.c b/drivers/gpu/drm/sti/sti_hqvdp.c
index 43861b52261d..1d3c3d029603 100644
--- a/drivers/gpu/drm/sti/sti_hqvdp.c
+++ b/drivers/gpu/drm/sti/sti_hqvdp.c
@@ -617,9 +617,9 @@ static void sti_hqvdp_init(struct sti_hqvdp *hqvdp)
/* Allocate memory for the VDP commands */
size = NB_VDP_CMD * sizeof(struct sti_hqvdp_cmd);
- hqvdp->hqvdp_cmd = dma_alloc_writecombine(hqvdp->dev, size,
- &hqvdp->hqvdp_cmd_paddr,
- GFP_KERNEL | GFP_DMA);
+ hqvdp->hqvdp_cmd = dma_alloc_wc(hqvdp->dev, size,
+ &hqvdp->hqvdp_cmd_paddr,
+ GFP_KERNEL | GFP_DMA);
if (!hqvdp->hqvdp_cmd) {
DRM_ERROR("Failed to allocate memory for VDP cmd\n");
return;
diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
index 33add93b4ed9..3b0d8c392b70 100644
--- a/drivers/gpu/drm/tegra/gem.c
+++ b/drivers/gpu/drm/tegra/gem.c
@@ -175,8 +175,7 @@ static void tegra_bo_free(struct drm_device *drm, struct tegra_bo *bo)
sg_free_table(bo->sgt);
kfree(bo->sgt);
} else if (bo->vaddr) {
- dma_free_writecombine(drm->dev, bo->gem.size, bo->vaddr,
- bo->paddr);
+ dma_free_wc(drm->dev, bo->gem.size, bo->vaddr, bo->paddr);
}
}
@@ -233,8 +232,8 @@ static int tegra_bo_alloc(struct drm_device *drm, struct tegra_bo *bo)
} else {
size_t size = bo->gem.size;
- bo->vaddr = dma_alloc_writecombine(drm->dev, size, &bo->paddr,
- GFP_KERNEL | __GFP_NOWARN);
+ bo->vaddr = dma_alloc_wc(drm->dev, size, &bo->paddr,
+ GFP_KERNEL | __GFP_NOWARN);
if (!bo->vaddr) {
dev_err(drm->dev,
"failed to allocate buffer of size %zu\n",
@@ -472,8 +471,8 @@ int tegra_drm_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_flags &= ~VM_PFNMAP;
vma->vm_pgoff = 0;
- ret = dma_mmap_writecombine(gem->dev->dev, vma, bo->vaddr,
- bo->paddr, gem->size);
+ ret = dma_mmap_wc(gem->dev->dev, vma, bo->vaddr, bo->paddr,
+ gem->size);
if (ret) {
drm_gem_vm_close(vma);
return ret;
diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c
index 18dfe3ec9a62..85639ab53e94 100644
--- a/drivers/gpu/drm/vc4/vc4_bo.c
+++ b/drivers/gpu/drm/vc4/vc4_bo.c
@@ -398,9 +398,8 @@ int vc4_mmap(struct file *filp, struct vm_area_struct *vma)
vma->vm_flags &= ~VM_PFNMAP;
vma->vm_pgoff = 0;
- ret = dma_mmap_writecombine(bo->base.base.dev->dev, vma,
- bo->base.vaddr, bo->base.paddr,
- vma->vm_end - vma->vm_start);
+ ret = dma_mmap_wc(bo->base.base.dev->dev, vma, bo->base.vaddr,
+ bo->base.paddr, vma->vm_end - vma->vm_start);
if (ret)
drm_gem_vm_close(vma);
diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
index 5a8c8d55317a..a18db4d5347c 100644
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -52,8 +52,8 @@ static void host1x_pushbuffer_destroy(struct push_buffer *pb)
struct host1x *host1x = cdma_to_host1x(cdma);
if (pb->phys != 0)
- dma_free_writecombine(host1x->dev, pb->size_bytes + 4,
- pb->mapped, pb->phys);
+ dma_free_wc(host1x->dev, pb->size_bytes + 4, pb->mapped,
+ pb->phys);
pb->mapped = NULL;
pb->phys = 0;
@@ -76,8 +76,8 @@ static int host1x_pushbuffer_init(struct push_buffer *pb)
pb->pos = 0;
/* allocate and map pushbuffer memory */
- pb->mapped = dma_alloc_writecombine(host1x->dev, pb->size_bytes + 4,
- &pb->phys, GFP_KERNEL);
+ pb->mapped = dma_alloc_wc(host1x->dev, pb->size_bytes + 4, &pb->phys,
+ GFP_KERNEL);
if (!pb->mapped)
goto fail;
diff --git a/drivers/gpu/host1x/job.c b/drivers/gpu/host1x/job.c
index 1919aab88c3f..a0a1c8d50105 100644
--- a/drivers/gpu/host1x/job.c
+++ b/drivers/gpu/host1x/job.c
@@ -467,9 +467,8 @@ static inline int copy_gathers(struct host1x_job *job, struct device *dev)
size += g->words * sizeof(u32);
}
- job->gather_copy_mapped = dma_alloc_writecombine(dev, size,
- &job->gather_copy,
- GFP_KERNEL);
+ job->gather_copy_mapped = dma_alloc_wc(dev, size, &job->gather_copy,
+ GFP_KERNEL);
if (!job->gather_copy_mapped) {
job->gather_copy_mapped = NULL;
return -ENOMEM;
@@ -578,9 +577,8 @@ void host1x_job_unpin(struct host1x_job *job)
job->num_unpins = 0;
if (job->gather_copy_size)
- dma_free_writecombine(job->channel->dev, job->gather_copy_size,
- job->gather_copy_mapped,
- job->gather_copy);
+ dma_free_wc(job->channel->dev, job->gather_copy_size,
+ job->gather_copy_mapped, job->gather_copy);
}
EXPORT_SYMBOL(host1x_job_unpin);
diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c
index e094c572b86e..16b6f11a0700 100644
--- a/drivers/hid/uhid.c
+++ b/drivers/hid/uhid.c
@@ -384,7 +384,7 @@ struct uhid_create_req_compat {
static int uhid_event_from_user(const char __user *buffer, size_t len,
struct uhid_event *event)
{
- if (is_compat_task()) {
+ if (in_compat_syscall()) {
u32 type;
if (get_user(type, buffer))
diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
index 696b6c1ec940..f94baadbf424 100644
--- a/drivers/ide/hpt366.c
+++ b/drivers/ide/hpt366.c
@@ -531,14 +531,9 @@ static const struct hpt_info hpt371n = {
.timings = &hpt37x_timings
};
-static int check_in_drive_list(ide_drive_t *drive, const char **list)
+static bool check_in_drive_list(ide_drive_t *drive, const char **list)
{
- char *m = (char *)&drive->id[ATA_ID_PROD];
-
- while (*list)
- if (!strcmp(*list++, m))
- return 1;
- return 0;
+ return match_string(list, -1, (char *)&drive->id[ATA_ID_PROD]) >= 0;
}
static struct hpt_info *hpt3xx_get_info(struct device *dev)
diff --git a/drivers/input/input-compat.h b/drivers/input/input-compat.h
index 148f66fe3205..0f25878d5fa2 100644
--- a/drivers/input/input-compat.h
+++ b/drivers/input/input-compat.h
@@ -17,17 +17,7 @@
#ifdef CONFIG_COMPAT
-/* Note to the author of this code: did it ever occur to
- you why the ifdefs are needed? Think about it again. -AK */
-#if defined(CONFIG_X86_64) || defined(CONFIG_TILE)
-# define INPUT_COMPAT_TEST is_compat_task()
-#elif defined(CONFIG_S390)
-# define INPUT_COMPAT_TEST test_thread_flag(TIF_31BIT)
-#elif defined(CONFIG_MIPS)
-# define INPUT_COMPAT_TEST test_thread_flag(TIF_32BIT_ADDR)
-#else
-# define INPUT_COMPAT_TEST test_thread_flag(TIF_32BIT)
-#endif
+#define INPUT_COMPAT_TEST in_compat_syscall()
struct input_event_compat {
struct compat_timeval time;
diff --git a/drivers/media/platform/coda/coda-bit.c b/drivers/media/platform/coda/coda-bit.c
index 6efe9d002961..b6625047250d 100644
--- a/drivers/media/platform/coda/coda-bit.c
+++ b/drivers/media/platform/coda/coda-bit.c
@@ -1455,9 +1455,9 @@ static int coda_alloc_bitstream_buffer(struct coda_ctx *ctx,
return 0;
ctx->bitstream.size = roundup_pow_of_two(q_data->sizeimage * 2);
- ctx->bitstream.vaddr = dma_alloc_writecombine(
- &ctx->dev->plat_dev->dev, ctx->bitstream.size,
- &ctx->bitstream.paddr, GFP_KERNEL);
+ ctx->bitstream.vaddr = dma_alloc_wc(&ctx->dev->plat_dev->dev,
+ ctx->bitstream.size,
+ &ctx->bitstream.paddr, GFP_KERNEL);
if (!ctx->bitstream.vaddr) {
v4l2_err(&ctx->dev->v4l2_dev,
"failed to allocate bitstream ringbuffer");
@@ -1474,8 +1474,8 @@ static void coda_free_bitstream_buffer(struct coda_ctx *ctx)
if (ctx->bitstream.vaddr == NULL)
return;
- dma_free_writecombine(&ctx->dev->plat_dev->dev, ctx->bitstream.size,
- ctx->bitstream.vaddr, ctx->bitstream.paddr);
+ dma_free_wc(&ctx->dev->plat_dev->dev, ctx->bitstream.size,
+ ctx->bitstream.vaddr, ctx->bitstream.paddr);
ctx->bitstream.vaddr = NULL;
kfifo_init(&ctx->bitstream_fifo, NULL, 0);
}
diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c
index 01f08a7751f7..9cfe6aeac84e 100644
--- a/drivers/net/rionet.c
+++ b/drivers/net/rionet.c
@@ -24,6 +24,7 @@
#include <linux/skbuff.h>
#include <linux/crc32.h>
#include <linux/ethtool.h>
+#include <linux/reboot.h>
#define DRV_NAME "rionet"
#define DRV_VERSION "0.3"
@@ -48,6 +49,8 @@ MODULE_LICENSE("GPL");
#define RIONET_TX_RING_SIZE CONFIG_RIONET_TX_SIZE
#define RIONET_RX_RING_SIZE CONFIG_RIONET_RX_SIZE
#define RIONET_MAX_NETS 8
+#define RIONET_MSG_SIZE RIO_MAX_MSG_SIZE
+#define RIONET_MAX_MTU (RIONET_MSG_SIZE - ETH_HLEN)
struct rionet_private {
struct rio_mport *mport;
@@ -60,6 +63,7 @@ struct rionet_private {
spinlock_t lock;
spinlock_t tx_lock;
u32 msg_enable;
+ bool open;
};
struct rionet_peer {
@@ -71,6 +75,7 @@ struct rionet_peer {
struct rionet_net {
struct net_device *ndev;
struct list_head peers;
+ spinlock_t lock; /* net info access lock */
struct rio_dev **active;
int nact; /* number of active peers */
};
@@ -232,26 +237,32 @@ static void rionet_dbell_event(struct rio_mport *mport, void *dev_id, u16 sid, u
struct net_device *ndev = dev_id;
struct rionet_private *rnet = netdev_priv(ndev);
struct rionet_peer *peer;
+ unsigned char netid = rnet->mport->id;
if (netif_msg_intr(rnet))
printk(KERN_INFO "%s: doorbell sid %4.4x tid %4.4x info %4.4x",
DRV_NAME, sid, tid, info);
if (info == RIONET_DOORBELL_JOIN) {
- if (!nets[rnet->mport->id].active[sid]) {
- list_for_each_entry(peer,
- &nets[rnet->mport->id].peers, node) {
+ if (!nets[netid].active[sid]) {
+ spin_lock(&nets[netid].lock);
+ list_for_each_entry(peer, &nets[netid].peers, node) {
if (peer->rdev->destid == sid) {
- nets[rnet->mport->id].active[sid] =
- peer->rdev;
- nets[rnet->mport->id].nact++;
+ nets[netid].active[sid] = peer->rdev;
+ nets[netid].nact++;
}
}
+ spin_unlock(&nets[netid].lock);
+
rio_mport_send_doorbell(mport, sid,
RIONET_DOORBELL_JOIN);
}
} else if (info == RIONET_DOORBELL_LEAVE) {
- nets[rnet->mport->id].active[sid] = NULL;
- nets[rnet->mport->id].nact--;
+ spin_lock(&nets[netid].lock);
+ if (nets[netid].active[sid]) {
+ nets[netid].active[sid] = NULL;
+ nets[netid].nact--;
+ }
+ spin_unlock(&nets[netid].lock);
} else {
if (netif_msg_intr(rnet))
printk(KERN_WARNING "%s: unhandled doorbell\n",
@@ -280,7 +291,7 @@ static void rionet_outb_msg_event(struct rio_mport *mport, void *dev_id, int mbo
struct net_device *ndev = dev_id;
struct rionet_private *rnet = netdev_priv(ndev);
- spin_lock(&rnet->lock);
+ spin_lock(&rnet->tx_lock);
if (netif_msg_intr(rnet))
printk(KERN_INFO
@@ -299,14 +310,16 @@ static void rionet_outb_msg_event(struct rio_mport *mport, void *dev_id, int mbo
if (rnet->tx_cnt < RIONET_TX_RING_SIZE)
netif_wake_queue(ndev);
- spin_unlock(&rnet->lock);
+ spin_unlock(&rnet->tx_lock);
}
static int rionet_open(struct net_device *ndev)
{
int i, rc = 0;
- struct rionet_peer *peer, *tmp;
+ struct rionet_peer *peer;
struct rionet_private *rnet = netdev_priv(ndev);
+ unsigned char netid = rnet->mport->id;
+ unsigned long flags;
if (netif_msg_ifup(rnet))
printk(KERN_INFO "%s: open\n", DRV_NAME);
@@ -345,20 +358,13 @@ static int rionet_open(struct net_device *ndev)
netif_carrier_on(ndev);
netif_start_queue(ndev);
- list_for_each_entry_safe(peer, tmp,
- &nets[rnet->mport->id].peers, node) {
- if (!(peer->res = rio_request_outb_dbell(peer->rdev,
- RIONET_DOORBELL_JOIN,
- RIONET_DOORBELL_LEAVE)))
- {
- printk(KERN_ERR "%s: error requesting doorbells\n",
- DRV_NAME);
- continue;
- }
-
+ spin_lock_irqsave(&nets[netid].lock, flags);
+ list_for_each_entry(peer, &nets[netid].peers, node) {
/* Send a join message */
rio_send_doorbell(peer->rdev, RIONET_DOORBELL_JOIN);
}
+ spin_unlock_irqrestore(&nets[netid].lock, flags);
+ rnet->open = true;
out:
return rc;
@@ -367,7 +373,9 @@ static int rionet_open(struct net_device *ndev)
static int rionet_close(struct net_device *ndev)
{
struct rionet_private *rnet = netdev_priv(ndev);
- struct rionet_peer *peer, *tmp;
+ struct rionet_peer *peer;
+ unsigned char netid = rnet->mport->id;
+ unsigned long flags;
int i;
if (netif_msg_ifup(rnet))
@@ -375,18 +383,21 @@ static int rionet_close(struct net_device *ndev)
netif_stop_queue(ndev);
netif_carrier_off(ndev);
+ rnet->open = false;
for (i = 0; i < RIONET_RX_RING_SIZE; i++)
kfree_skb(rnet->rx_skb[i]);
- list_for_each_entry_safe(peer, tmp,
- &nets[rnet->mport->id].peers, node) {
- if (nets[rnet->mport->id].active[peer->rdev->destid]) {
+ spin_lock_irqsave(&nets[netid].lock, flags);
+ list_for_each_entry(peer, &nets[netid].peers, node) {
+ if (nets[netid].active[peer->rdev->destid]) {
rio_send_doorbell(peer->rdev, RIONET_DOORBELL_LEAVE);
- nets[rnet->mport->id].active[peer->rdev->destid] = NULL;
+ nets[netid].active[peer->rdev->destid] = NULL;
}
- rio_release_outb_dbell(peer->rdev, peer->res);
+ if (peer->res)
+ rio_release_outb_dbell(peer->rdev, peer->res);
}
+ spin_unlock_irqrestore(&nets[netid].lock, flags);
rio_release_inb_dbell(rnet->mport, RIONET_DOORBELL_JOIN,
RIONET_DOORBELL_LEAVE);
@@ -400,22 +411,38 @@ static void rionet_remove_dev(struct device *dev, struct subsys_interface *sif)
{
struct rio_dev *rdev = to_rio_dev(dev);
unsigned char netid = rdev->net->hport->id;
- struct rionet_peer *peer, *tmp;
+ struct rionet_peer *peer;
+ int state, found = 0;
+ unsigned long flags;
- if (dev_rionet_capable(rdev)) {
- list_for_each_entry_safe(peer, tmp, &nets[netid].peers, node) {
- if (peer->rdev == rdev) {
- if (nets[netid].active[rdev->destid]) {
- nets[netid].active[rdev->destid] = NULL;
- nets[netid].nact--;
+ if (!dev_rionet_capable(rdev))
+ return;
+
+ spin_lock_irqsave(&nets[netid].lock, flags);
+ list_for_each_entry(peer, &nets[netid].peers, node) {
+ if (peer->rdev == rdev) {
+ list_del(&peer->node);
+ if (nets[netid].active[rdev->destid]) {
+ state = atomic_read(&rdev->state);
+ if (state != RIO_DEVICE_GONE &&
+ state != RIO_DEVICE_INITIALIZING) {
+ rio_send_doorbell(rdev,
+ RIONET_DOORBELL_LEAVE);
}
-
- list_del(&peer->node);
- kfree(peer);
- break;
+ nets[netid].active[rdev->destid] = NULL;
+ nets[netid].nact--;
}
+ found = 1;
+ break;
}
}
+ spin_unlock_irqrestore(&nets[netid].lock, flags);
+
+ if (found) {
+ if (peer->res)
+ rio_release_outb_dbell(rdev, peer->res);
+ kfree(peer);
+ }
}
static void rionet_get_drvinfo(struct net_device *ndev,
@@ -443,6 +470,17 @@ static void rionet_set_msglevel(struct net_device *ndev, u32 value)
rnet->msg_enable = value;
}
+static int rionet_change_mtu(struct net_device *ndev, int new_mtu)
+{
+ if ((new_mtu < 68) || (new_mtu > RIONET_MAX_MTU)) {
+ printk(KERN_ERR "%s: Invalid MTU size %d\n",
+ ndev->name, new_mtu);
+ return -EINVAL;
+ }
+ ndev->mtu = new_mtu;
+ return 0;
+}
+
static const struct ethtool_ops rionet_ethtool_ops = {
.get_drvinfo = rionet_get_drvinfo,
.get_msglevel = rionet_get_msglevel,
@@ -454,7 +492,7 @@ static const struct net_device_ops rionet_netdev_ops = {
.ndo_open = rionet_open,
.ndo_stop = rionet_close,
.ndo_start_xmit = rionet_start_xmit,
- .ndo_change_mtu = eth_change_mtu,
+ .ndo_change_mtu = rionet_change_mtu,
.ndo_validate_addr = eth_validate_addr,
.ndo_set_mac_address = eth_mac_addr,
};
@@ -478,6 +516,7 @@ static int rionet_setup_netdev(struct rio_mport *mport, struct net_device *ndev)
/* Set up private area */
rnet = netdev_priv(ndev);
rnet->mport = mport;
+ rnet->open = false;
/* Set the default MAC address */
device_id = rio_local_get_device_id(mport);
@@ -489,7 +528,7 @@ static int rionet_setup_netdev(struct rio_mport *mport, struct net_device *ndev)
ndev->dev_addr[5] = device_id & 0xff;
ndev->netdev_ops = &rionet_netdev_ops;
- ndev->mtu = RIO_MAX_MSG_SIZE - 14;
+ ndev->mtu = RIONET_MAX_MTU;
ndev->features = NETIF_F_LLTX;
SET_NETDEV_DEV(ndev, &mport->dev);
ndev->ethtool_ops = &rionet_ethtool_ops;
@@ -500,8 +539,11 @@ static int rionet_setup_netdev(struct rio_mport *mport, struct net_device *ndev)
rnet->msg_enable = RIONET_DEFAULT_MSGLEVEL;
rc = register_netdev(ndev);
- if (rc != 0)
+ if (rc != 0) {
+ free_pages((unsigned long)nets[mport->id].active,
+ get_order(rionet_active_bytes));
goto out;
+ }
printk(KERN_INFO "%s: %s %s Version %s, MAC %pM, %s\n",
ndev->name,
@@ -515,8 +557,6 @@ static int rionet_setup_netdev(struct rio_mport *mport, struct net_device *ndev)
return rc;
}
-static unsigned long net_table[RIONET_MAX_NETS/sizeof(unsigned long) + 1];
-
static int rionet_add_dev(struct device *dev, struct subsys_interface *sif)
{
int rc = -ENODEV;
@@ -525,19 +565,16 @@ static int rionet_add_dev(struct device *dev, struct subsys_interface *sif)
struct net_device *ndev = NULL;
struct rio_dev *rdev = to_rio_dev(dev);
unsigned char netid = rdev->net->hport->id;
- int oldnet;
if (netid >= RIONET_MAX_NETS)
return rc;
- oldnet = test_and_set_bit(netid, net_table);
-
/*
* If first time through this net, make sure local device is rionet
* capable and setup netdev (this step will be skipped in later probes
* on the same net).
*/
- if (!oldnet) {
+ if (!nets[netid].ndev) {
rio_local_read_config_32(rdev->net->hport, RIO_SRC_OPS_CAR,
&lsrc_ops);
rio_local_read_config_32(rdev->net->hport, RIO_DST_OPS_CAR,
@@ -555,30 +592,56 @@ static int rionet_add_dev(struct device *dev, struct subsys_interface *sif)
rc = -ENOMEM;
goto out;
}
- nets[netid].ndev = ndev;
+
rc = rionet_setup_netdev(rdev->net->hport, ndev);
if (rc) {
printk(KERN_ERR "%s: failed to setup netdev (rc=%d)\n",
DRV_NAME, rc);
+ free_netdev(ndev);
goto out;
}
INIT_LIST_HEAD(&nets[netid].peers);
+ spin_lock_init(&nets[netid].lock);
nets[netid].nact = 0;
- } else if (nets[netid].ndev == NULL)
- goto out;
+ nets[netid].ndev = ndev;
+ }
/*
* If the remote device has mailbox/doorbell capabilities,
* add it to the peer list.
*/
if (dev_rionet_capable(rdev)) {
- if (!(peer = kmalloc(sizeof(struct rionet_peer), GFP_KERNEL))) {
+ struct rionet_private *rnet;
+ unsigned long flags;
+
+ rnet = netdev_priv(nets[netid].ndev);
+
+ peer = kzalloc(sizeof(*peer), GFP_KERNEL);
+ if (!peer) {
rc = -ENOMEM;
goto out;
}
peer->rdev = rdev;
+ peer->res = rio_request_outb_dbell(peer->rdev,
+ RIONET_DOORBELL_JOIN,
+ RIONET_DOORBELL_LEAVE);
+ if (!peer->res) {
+ pr_err("%s: error requesting doorbells\n", DRV_NAME);
+ kfree(peer);
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ spin_lock_irqsave(&nets[netid].lock, flags);
list_add_tail(&peer->node, &nets[netid].peers);
+ spin_unlock_irqrestore(&nets[netid].lock, flags);
+ pr_debug("%s: %s add peer %s\n",
+ DRV_NAME, __func__, rio_name(rdev));
+
+ /* If netdev is already opened, send join request to new peer */
+ if (rnet->open)
+ rio_send_doorbell(peer->rdev, RIONET_DOORBELL_JOIN);
}
return 0;
@@ -586,6 +649,61 @@ out:
return rc;
}
+static int rionet_shutdown(struct notifier_block *nb, unsigned long code,
+ void *unused)
+{
+ struct rionet_peer *peer;
+ unsigned long flags;
+ int i;
+
+ pr_debug("%s: %s\n", DRV_NAME, __func__);
+
+ for (i = 0; i < RIONET_MAX_NETS; i++) {
+ if (!nets[i].ndev)
+ continue;
+
+ spin_lock_irqsave(&nets[i].lock, flags);
+ list_for_each_entry(peer, &nets[i].peers, node) {
+ if (nets[i].active[peer->rdev->destid]) {
+ rio_send_doorbell(peer->rdev,
+ RIONET_DOORBELL_LEAVE);
+ nets[i].active[peer->rdev->destid] = NULL;
+ }
+ }
+ spin_unlock_irqrestore(&nets[i].lock, flags);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static void rionet_remove_mport(struct device *dev,
+ struct class_interface *class_intf)
+{
+ struct rio_mport *mport = to_rio_mport(dev);
+ struct net_device *ndev;
+ int id = mport->id;
+
+ pr_debug("%s %s\n", __func__, mport->name);
+
+ WARN(nets[id].nact, "%s called when connected to %d peers\n",
+ __func__, nets[id].nact);
+ WARN(!nets[id].ndev, "%s called for mport without NDEV\n",
+ __func__);
+
+ if (nets[id].ndev) {
+ ndev = nets[id].ndev;
+ netif_stop_queue(ndev);
+ unregister_netdev(ndev);
+
+ free_pages((unsigned long)nets[id].active,
+ get_order(sizeof(void *) *
+ RIO_MAX_ROUTE_ENTRIES(mport->sys_size)));
+ nets[id].active = NULL;
+ free_netdev(ndev);
+ nets[id].ndev = NULL;
+ }
+}
+
#ifdef MODULE
static struct rio_device_id rionet_id_table[] = {
{RIO_DEVICE(RIO_ANY_ID, RIO_ANY_ID)},
@@ -602,40 +720,43 @@ static struct subsys_interface rionet_interface = {
.remove_dev = rionet_remove_dev,
};
+static struct notifier_block rionet_notifier = {
+ .notifier_call = rionet_shutdown,
+};
+
+/* the rio_mport_interface is used to handle local mport devices */
+static struct class_interface rio_mport_interface __refdata = {
+ .class = &rio_mport_class,
+ .add_dev = NULL,
+ .remove_dev = rionet_remove_mport,
+};
+
static int __init rionet_init(void)
{
+ int ret;
+
+ ret = register_reboot_notifier(&rionet_notifier);
+ if (ret) {
+ pr_err("%s: failed to register reboot notifier (err=%d)\n",
+ DRV_NAME, ret);
+ return ret;
+ }
+
+ ret = class_interface_register(&rio_mport_interface);
+ if (ret) {
+ pr_err("%s: class_interface_register error: %d\n",
+ DRV_NAME, ret);
+ return ret;
+ }
+
return subsys_interface_register(&rionet_interface);
}
static void __exit rionet_exit(void)
{
- struct rionet_private *rnet;
- struct net_device *ndev;
- struct rionet_peer *peer, *tmp;
- int i;
-
- for (i = 0; i < RIONET_MAX_NETS; i++) {
- if (nets[i].ndev != NULL) {
- ndev = nets[i].ndev;
- rnet = netdev_priv(ndev);
- unregister_netdev(ndev);
-
- list_for_each_entry_safe(peer,
- tmp, &nets[i].peers, node) {
- list_del(&peer->node);
- kfree(peer);
- }
-
- free_pages((unsigned long)nets[i].active,
- get_order(sizeof(void *) *
- RIO_MAX_ROUTE_ENTRIES(rnet->mport->sys_size)));
- nets[i].active = NULL;
-
- free_netdev(ndev);
- }
- }
-
+ unregister_reboot_notifier(&rionet_notifier);
subsys_interface_unregister(&rionet_interface);
+ class_interface_unregister(&rio_mport_interface);
}
late_initcall(rionet_init);
diff --git a/drivers/net/wireless/marvell/mwifiex/debugfs.c b/drivers/net/wireless/marvell/mwifiex/debugfs.c
index df28836a1d11..bccf17ad588e 100644
--- a/drivers/net/wireless/marvell/mwifiex/debugfs.c
+++ b/drivers/net/wireless/marvell/mwifiex/debugfs.c
@@ -925,14 +925,12 @@ mwifiex_reset_write(struct file *file,
{
struct mwifiex_private *priv = file->private_data;
struct mwifiex_adapter *adapter = priv->adapter;
- char cmd;
bool result;
+ int rc;
- if (copy_from_user(&cmd, ubuf, sizeof(cmd)))
- return -EFAULT;
-
- if (strtobool(&cmd, &result))
- return -EINVAL;
+ rc = kstrtobool_from_user(ubuf, count, &result);
+ if (rc)
+ return rc;
if (!result)
return -EINVAL;
diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c
index 29984b36926a..c223a9ef1fe1 100644
--- a/drivers/pinctrl/pinmux.c
+++ b/drivers/pinctrl/pinmux.c
@@ -334,7 +334,6 @@ int pinmux_map_to_setting(struct pinctrl_map const *map,
unsigned num_groups;
int ret;
const char *group;
- int i;
if (!pmxops) {
dev_err(pctldev->dev, "does not support mux function\n");
@@ -363,19 +362,13 @@ int pinmux_map_to_setting(struct pinctrl_map const *map,
return -EINVAL;
}
if (map->data.mux.group) {
- bool found = false;
group = map->data.mux.group;
- for (i = 0; i < num_groups; i++) {
- if (!strcmp(group, groups[i])) {
- found = true;
- break;
- }
- }
- if (!found) {
+ ret = match_string(groups, num_groups, group);
+ if (ret < 0) {
dev_err(pctldev->dev,
"invalid group \"%s\" for function \"%s\"\n",
group, map->data.mux.function);
- return -EINVAL;
+ return ret;
}
} else {
group = groups[0];
diff --git a/drivers/power/ab8500_btemp.c b/drivers/power/ab8500_btemp.c
index 8f8044e1acf3..bf2e5dd301e7 100644
--- a/drivers/power/ab8500_btemp.c
+++ b/drivers/power/ab8500_btemp.c
@@ -906,26 +906,21 @@ static int ab8500_btemp_get_property(struct power_supply *psy,
static int ab8500_btemp_get_ext_psy_data(struct device *dev, void *data)
{
struct power_supply *psy;
- struct power_supply *ext;
+ struct power_supply *ext = dev_get_drvdata(dev);
+ const char **supplicants = (const char **)ext->supplied_to;
struct ab8500_btemp *di;
union power_supply_propval ret;
- int i, j;
- bool psy_found = false;
+ int j;
psy = (struct power_supply *)data;
- ext = dev_get_drvdata(dev);
di = power_supply_get_drvdata(psy);
/*
* For all psy where the name of your driver
* appears in any supplied_to
*/
- for (i = 0; i < ext->num_supplicants; i++) {
- if (!strcmp(ext->supplied_to[i], psy->desc->name))
- psy_found = true;
- }
-
- if (!psy_found)
+ j = match_string(supplicants, ext->num_supplicants, psy->desc->name);
+ if (j < 0)
return 0;
/* Go through all properties for the psy */
diff --git a/drivers/power/ab8500_charger.c b/drivers/power/ab8500_charger.c
index e388171f4e58..30de5d42b26a 100644
--- a/drivers/power/ab8500_charger.c
+++ b/drivers/power/ab8500_charger.c
@@ -1929,11 +1929,11 @@ static int ab8540_charger_usb_pre_chg_enable(struct ux500_charger *charger,
static int ab8500_charger_get_ext_psy_data(struct device *dev, void *data)
{
struct power_supply *psy;
- struct power_supply *ext;
+ struct power_supply *ext = dev_get_drvdata(dev);
+ const char **supplicants = (const char **)ext->supplied_to;
struct ab8500_charger *di;
union power_supply_propval ret;
- int i, j;
- bool psy_found = false;
+ int j;
struct ux500_charger *usb_chg;
usb_chg = (struct ux500_charger *)data;
@@ -1941,15 +1941,9 @@ static int ab8500_charger_get_ext_psy_data(struct device *dev, void *data)
di = to_ab8500_charger_usb_device_info(usb_chg);
- ext = dev_get_drvdata(dev);
-
/* For all psy where the driver name appears in any supplied_to */
- for (i = 0; i < ext->num_supplicants; i++) {
- if (!strcmp(ext->supplied_to[i], psy->desc->name))
- psy_found = true;
- }
-
- if (!psy_found)
+ j = match_string(supplicants, ext->num_supplicants, psy->desc->name);
+ if (j < 0)
return 0;
/* Go through all properties for the psy */
diff --git a/drivers/power/ab8500_fg.c b/drivers/power/ab8500_fg.c
index 3830dade5d69..5a36cf88578a 100644
--- a/drivers/power/ab8500_fg.c
+++ b/drivers/power/ab8500_fg.c
@@ -2168,26 +2168,21 @@ static int ab8500_fg_get_property(struct power_supply *psy,
static int ab8500_fg_get_ext_psy_data(struct device *dev, void *data)
{
struct power_supply *psy;
- struct power_supply *ext;
+ struct power_supply *ext = dev_get_drvdata(dev);
+ const char **supplicants = (const char **)ext->supplied_to;
struct ab8500_fg *di;
union power_supply_propval ret;
- int i, j;
- bool psy_found = false;
+ int j;
psy = (struct power_supply *)data;
- ext = dev_get_drvdata(dev);
di = power_supply_get_drvdata(psy);
/*
* For all psy where the name of your driver
* appears in any supplied_to
*/
- for (i = 0; i < ext->num_supplicants; i++) {
- if (!strcmp(ext->supplied_to[i], psy->desc->name))
- psy_found = true;
- }
-
- if (!psy_found)
+ j = match_string(supplicants, ext->num_supplicants, psy->desc->name);
+ if (j < 0)
return 0;
/* Go through all properties for the psy */
diff --git a/drivers/power/abx500_chargalg.c b/drivers/power/abx500_chargalg.c
index 541f702e0451..d9104b1ab7cf 100644
--- a/drivers/power/abx500_chargalg.c
+++ b/drivers/power/abx500_chargalg.c
@@ -975,22 +975,18 @@ static void handle_maxim_chg_curr(struct abx500_chargalg *di)
static int abx500_chargalg_get_ext_psy_data(struct device *dev, void *data)
{
struct power_supply *psy;
- struct power_supply *ext;
+ struct power_supply *ext = dev_get_drvdata(dev);
+ const char **supplicants = (const char **)ext->supplied_to;
struct abx500_chargalg *di;
union power_supply_propval ret;
- int i, j;
- bool psy_found = false;
+ int j;
bool capacity_updated = false;
psy = (struct power_supply *)data;
- ext = dev_get_drvdata(dev);
di = power_supply_get_drvdata(psy);
/* For all psy where the driver name appears in any supplied_to */
- for (i = 0; i < ext->num_supplicants; i++) {
- if (!strcmp(ext->supplied_to[i], psy->desc->name))
- psy_found = true;
- }
- if (!psy_found)
+ j = match_string(supplicants, ext->num_supplicants, psy->desc->name);
+ if (j < 0)
return 0;
/*
diff --git a/drivers/power/charger-manager.c b/drivers/power/charger-manager.c
index 1ea5d1aa268b..e664ca7c0afd 100644
--- a/drivers/power/charger-manager.c
+++ b/drivers/power/charger-manager.c
@@ -2020,27 +2020,6 @@ static void __exit charger_manager_cleanup(void)
module_exit(charger_manager_cleanup);
/**
- * find_power_supply - find the associated power_supply of charger
- * @cm: the Charger Manager representing the battery
- * @psy: pointer to instance of charger's power_supply
- */
-static bool find_power_supply(struct charger_manager *cm,
- struct power_supply *psy)
-{
- int i;
- bool found = false;
-
- for (i = 0; cm->desc->psy_charger_stat[i]; i++) {
- if (!strcmp(psy->desc->name, cm->desc->psy_charger_stat[i])) {
- found = true;
- break;
- }
- }
-
- return found;
-}
-
-/**
* cm_notify_event - charger driver notify Charger Manager of charger event
* @psy: pointer to instance of charger's power_supply
* @type: type of charger event
@@ -2057,9 +2036,11 @@ void cm_notify_event(struct power_supply *psy, enum cm_event_types type,
mutex_lock(&cm_list_mtx);
list_for_each_entry(cm, &cm_list, entry) {
- found_power_supply = find_power_supply(cm, psy);
- if (found_power_supply)
+ if (match_string(cm->desc->psy_charger_stat, -1,
+ psy->desc->name) >= 0) {
+ found_power_supply = true;
break;
+ }
}
mutex_unlock(&cm_list_mtx);
diff --git a/drivers/rapidio/Kconfig b/drivers/rapidio/Kconfig
index 3e3be57e9a1a..b5a10d3c92c7 100644
--- a/drivers/rapidio/Kconfig
+++ b/drivers/rapidio/Kconfig
@@ -67,6 +67,14 @@ config RAPIDIO_ENUM_BASIC
endchoice
+config RAPIDIO_MPORT_CDEV
+ tristate "RapidIO /dev mport device driver"
+ depends on RAPIDIO
+ help
+ This option includes generic RapidIO mport device driver which
+ allows to user space applications to perform RapidIO-specific
+ operations through selected RapidIO mport.
+
menu "RapidIO Switch drivers"
depends on RAPIDIO
diff --git a/drivers/rapidio/devices/Makefile b/drivers/rapidio/devices/Makefile
index 9432c494cf57..927dbf89592b 100644
--- a/drivers/rapidio/devices/Makefile
+++ b/drivers/rapidio/devices/Makefile
@@ -5,3 +5,4 @@
obj-$(CONFIG_RAPIDIO_TSI721) += tsi721_mport.o
tsi721_mport-y := tsi721.o
tsi721_mport-$(CONFIG_RAPIDIO_DMA_ENGINE) += tsi721_dma.o
+obj-$(CONFIG_RAPIDIO_MPORT_CDEV) += rio_mport_cdev.o
diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c
new file mode 100644
index 000000000000..6d56725c937b
--- /dev/null
+++ b/drivers/rapidio/devices/rio_mport_cdev.c
@@ -0,0 +1,2711 @@
+/*
+ * RapidIO mport character device
+ *
+ * Copyright 2014-2015 Integrated Device Technology, Inc.
+ * Alexandre Bounine <alexandre.bounine@idt.com>
+ * Copyright 2014-2015 Prodrive Technologies
+ * Andre van Herk <andre.van.herk@prodrive-technologies.com>
+ * Jerry Jacobs <jerry.jacobs@prodrive-technologies.com>
+ * Copyright (C) 2014 Texas Instruments Incorporated
+ * Aurelien Jacquiot <a-jacquiot@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/cdev.h>
+#include <linux/ioctl.h>
+#include <linux/uaccess.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+#include <linux/err.h>
+#include <linux/net.h>
+#include <linux/poll.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/kfifo.h>
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mman.h>
+
+#include <linux/dma-mapping.h>
+#ifdef CONFIG_RAPIDIO_DMA_ENGINE
+#include <linux/dmaengine.h>
+#endif
+
+#include <linux/rio.h>
+#include <linux/rio_ids.h>
+#include <linux/rio_drv.h>
+#include <linux/rio_mport_cdev.h>
+
+#include "../rio.h"
+
+#define DRV_NAME "rio_mport"
+#define DRV_PREFIX DRV_NAME ": "
+#define DEV_NAME "rio_mport"
+#define DRV_VERSION "1.0.0"
+
+/* Debug output filtering masks */
+enum {
+ DBG_NONE = 0,
+ DBG_INIT = BIT(0), /* driver init */
+ DBG_EXIT = BIT(1), /* driver exit */
+ DBG_MPORT = BIT(2), /* mport add/remove */
+ DBG_RDEV = BIT(3), /* RapidIO device add/remove */
+ DBG_DMA = BIT(4), /* DMA transfer messages */
+ DBG_MMAP = BIT(5), /* mapping messages */
+ DBG_IBW = BIT(6), /* inbound window */
+ DBG_EVENT = BIT(7), /* event handling messages */
+ DBG_OBW = BIT(8), /* outbound window messages */
+ DBG_DBELL = BIT(9), /* doorbell messages */
+ DBG_ALL = ~0,
+};
+
+#ifdef DEBUG
+#define rmcd_debug(level, fmt, arg...) \
+ do { \
+ if (DBG_##level & dbg_level) \
+ pr_debug(DRV_PREFIX "%s: " fmt "\n", __func__, ##arg); \
+ } while (0)
+#else
+#define rmcd_debug(level, fmt, arg...) \
+ no_printk(KERN_DEBUG pr_fmt(DRV_PREFIX fmt "\n"), ##arg)
+#endif
+
+#define rmcd_warn(fmt, arg...) \
+ pr_warn(DRV_PREFIX "%s WARNING " fmt "\n", __func__, ##arg)
+
+#define rmcd_error(fmt, arg...) \
+ pr_err(DRV_PREFIX "%s ERROR " fmt "\n", __func__, ##arg)
+
+MODULE_AUTHOR("Jerry Jacobs <jerry.jacobs@prodrive-technologies.com>");
+MODULE_AUTHOR("Aurelien Jacquiot <a-jacquiot@ti.com>");
+MODULE_AUTHOR("Alexandre Bounine <alexandre.bounine@idt.com>");
+MODULE_AUTHOR("Andre van Herk <andre.van.herk@prodrive-technologies.com>");
+MODULE_DESCRIPTION("RapidIO mport character device driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_VERSION);
+
+static int dma_timeout = 3000; /* DMA transfer timeout in msec */
+module_param(dma_timeout, int, S_IRUGO);
+MODULE_PARM_DESC(dma_timeout, "DMA Transfer Timeout in msec (default: 3000)");
+
+#ifdef DEBUG
+static u32 dbg_level = DBG_NONE;
+module_param(dbg_level, uint, S_IWUSR | S_IWGRP | S_IRUGO);
+MODULE_PARM_DESC(dbg_level, "Debugging output level (default 0 = none)");
+#endif
+
+/*
+ * An internal DMA coherent buffer
+ */
+struct mport_dma_buf {
+ void *ib_base;
+ dma_addr_t ib_phys;
+ u32 ib_size;
+ u64 ib_rio_base;
+ bool ib_map;
+ struct file *filp;
+};
+
+/*
+ * Internal memory mapping structure
+ */
+enum rio_mport_map_dir {
+ MAP_INBOUND,
+ MAP_OUTBOUND,
+ MAP_DMA,
+};
+
+struct rio_mport_mapping {
+ struct list_head node;
+ struct mport_dev *md;
+ enum rio_mport_map_dir dir;
+ u32 rioid;
+ u64 rio_addr;
+ dma_addr_t phys_addr; /* for mmap */
+ void *virt_addr; /* kernel address, for dma_free_coherent */
+ u64 size;
+ struct kref ref; /* refcount of vmas sharing the mapping */
+ struct file *filp;
+};
+
+struct rio_mport_dma_map {
+ int valid;
+ uint64_t length;
+ void *vaddr;
+ dma_addr_t paddr;
+};
+
+#define MPORT_MAX_DMA_BUFS 16
+#define MPORT_EVENT_DEPTH 10
+
+/*
+ * mport_dev driver-specific structure that represents mport device
+ * @active mport device status flag
+ * @node list node to maintain list of registered mports
+ * @cdev character device
+ * @dev associated device object
+ * @mport associated subsystem's master port device object
+ * @buf_mutex lock for buffer handling
+ * @file_mutex - lock for open files list
+ * @file_list - list of open files on given mport
+ * @properties properties of this mport
+ * @portwrites queue of inbound portwrites
+ * @pw_lock lock for port write queue
+ * @mappings queue for memory mappings
+ * @dma_chan DMA channels associated with this device
+ * @dma_ref:
+ * @comp:
+ */
+struct mport_dev {
+ atomic_t active;
+ struct list_head node;
+ struct cdev cdev;
+ struct device dev;
+ struct rio_mport *mport;
+ struct mutex buf_mutex;
+ struct mutex file_mutex;
+ struct list_head file_list;
+ struct rio_mport_properties properties;
+ struct list_head doorbells;
+ spinlock_t db_lock;
+ struct list_head portwrites;
+ spinlock_t pw_lock;
+ struct list_head mappings;
+#ifdef CONFIG_DMA_ENGINE
+ struct dma_chan *dma_chan;
+ struct kref dma_ref;
+ struct completion comp;
+#endif
+};
+
+/*
+ * mport_cdev_priv - data structure specific to individual file object
+ * associated with an open device
+ * @md master port character device object
+ * @async_queue - asynchronous notification queue
+ * @list - file objects tracking list
+ * @db_filters inbound doorbell filters for this descriptor
+ * @pw_filters portwrite filters for this descriptor
+ * @event_fifo event fifo for this descriptor
+ * @event_rx_wait wait queue for this descriptor
+ * @fifo_lock lock for event_fifo
+ * @event_mask event mask for this descriptor
+ * @dmach DMA engine channel allocated for specific file object
+ */
+struct mport_cdev_priv {
+ struct mport_dev *md;
+ struct fasync_struct *async_queue;
+ struct list_head list;
+ struct list_head db_filters;
+ struct list_head pw_filters;
+ struct kfifo event_fifo;
+ wait_queue_head_t event_rx_wait;
+ spinlock_t fifo_lock;
+ unsigned int event_mask; /* RIO_DOORBELL, RIO_PORTWRITE */
+#ifdef CONFIG_DMA_ENGINE
+ struct dma_chan *dmach;
+ struct list_head async_list;
+ struct list_head pend_list;
+ spinlock_t req_lock;
+ struct mutex dma_lock;
+ struct kref dma_ref;
+ struct completion comp;
+#endif
+};
+
+/*
+ * rio_mport_pw_filter - structure to describe a portwrite filter
+ * md_node node in mport device's list
+ * priv_node node in private file object's list
+ * priv reference to private data
+ * filter actual portwrite filter
+ */
+struct rio_mport_pw_filter {
+ struct list_head md_node;
+ struct list_head priv_node;
+ struct mport_cdev_priv *priv;
+ struct rio_pw_filter filter;
+};
+
+/*
+ * rio_mport_db_filter - structure to describe a doorbell filter
+ * @data_node reference to device node
+ * @priv_node node in private data
+ * @priv reference to private data
+ * @filter actual doorbell filter
+ */
+struct rio_mport_db_filter {
+ struct list_head data_node;
+ struct list_head priv_node;
+ struct mport_cdev_priv *priv;
+ struct rio_doorbell_filter filter;
+};
+
+static LIST_HEAD(mport_devs);
+static DEFINE_MUTEX(mport_devs_lock);
+
+#if (0) /* used by commented out portion of poll function : FIXME */
+static DECLARE_WAIT_QUEUE_HEAD(mport_cdev_wait);
+#endif
+
+static struct class *dev_class;
+static dev_t dev_number;
+
+static struct workqueue_struct *dma_wq;
+
+static void mport_release_mapping(struct kref *ref);
+
+static int rio_mport_maint_rd(struct mport_cdev_priv *priv, void __user *arg,
+ int local)
+{
+ struct rio_mport *mport = priv->md->mport;
+ struct rio_mport_maint_io maint_io;
+ u32 *buffer;
+ u32 offset;
+ size_t length;
+ int ret, i;
+
+ if (unlikely(copy_from_user(&maint_io, arg, sizeof(maint_io))))
+ return -EFAULT;
+
+ if ((maint_io.offset % 4) ||
+ (maint_io.length == 0) || (maint_io.length % 4))
+ return -EINVAL;
+
+ buffer = vmalloc(maint_io.length);
+ if (buffer == NULL)
+ return -ENOMEM;
+ length = maint_io.length/sizeof(u32);
+ offset = maint_io.offset;
+
+ for (i = 0; i < length; i++) {
+ if (local)
+ ret = __rio_local_read_config_32(mport,
+ offset, &buffer[i]);
+ else
+ ret = rio_mport_read_config_32(mport, maint_io.rioid,
+ maint_io.hopcount, offset, &buffer[i]);
+ if (ret)
+ goto out;
+
+ offset += 4;
+ }
+
+ if (unlikely(copy_to_user(maint_io.buffer, buffer, maint_io.length)))
+ ret = -EFAULT;
+out:
+ vfree(buffer);
+ return ret;
+}
+
+static int rio_mport_maint_wr(struct mport_cdev_priv *priv, void __user *arg,
+ int local)
+{
+ struct rio_mport *mport = priv->md->mport;
+ struct rio_mport_maint_io maint_io;
+ u32 *buffer;
+ u32 offset;
+ size_t length;
+ int ret = -EINVAL, i;
+
+ if (unlikely(copy_from_user(&maint_io, arg, sizeof(maint_io))))
+ return -EFAULT;
+
+ if ((maint_io.offset % 4) ||
+ (maint_io.length == 0) || (maint_io.length % 4))
+ return -EINVAL;
+
+ buffer = vmalloc(maint_io.length);
+ if (buffer == NULL)
+ return -ENOMEM;
+ length = maint_io.length;
+
+ if (unlikely(copy_from_user(buffer, maint_io.buffer, length))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ offset = maint_io.offset;
+ length /= sizeof(u32);
+
+ for (i = 0; i < length; i++) {
+ if (local)
+ ret = __rio_local_write_config_32(mport,
+ offset, buffer[i]);
+ else
+ ret = rio_mport_write_config_32(mport, maint_io.rioid,
+ maint_io.hopcount,
+ offset, buffer[i]);
+ if (ret)
+ goto out;
+
+ offset += 4;
+ }
+
+out:
+ vfree(buffer);
+ return ret;
+}
+
+
+/*
+ * Inbound/outbound memory mapping functions
+ */
+static int
+rio_mport_create_outbound_mapping(struct mport_dev *md, struct file *filp,
+ u32 rioid, u64 raddr, u32 size,
+ dma_addr_t *paddr)
+{
+ struct rio_mport *mport = md->mport;
+ struct rio_mport_mapping *map;
+ int ret;
+
+ rmcd_debug(OBW, "did=%d ra=0x%llx sz=0x%x", rioid, raddr, size);
+
+ map = kzalloc(sizeof(struct rio_mport_mapping), GFP_KERNEL);
+ if (map == NULL)
+ return -ENOMEM;
+
+ ret = rio_map_outb_region(mport, rioid, raddr, size, 0, paddr);
+ if (ret < 0)
+ goto err_map_outb;
+
+ map->dir = MAP_OUTBOUND;
+ map->rioid = rioid;
+ map->rio_addr = raddr;
+ map->size = size;
+ map->phys_addr = *paddr;
+ map->filp = filp;
+ map->md = md;
+ kref_init(&map->ref);
+ list_add_tail(&map->node, &md->mappings);
+ return 0;
+err_map_outb:
+ kfree(map);
+ return ret;
+}
+
+static int
+rio_mport_get_outbound_mapping(struct mport_dev *md, struct file *filp,
+ u32 rioid, u64 raddr, u32 size,
+ dma_addr_t *paddr)
+{
+ struct rio_mport_mapping *map;
+ int err = -ENOMEM;
+
+ mutex_lock(&md->buf_mutex);
+ list_for_each_entry(map, &md->mappings, node) {
+ if (map->dir != MAP_OUTBOUND)
+ continue;
+ if (rioid == map->rioid &&
+ raddr == map->rio_addr && size == map->size) {
+ *paddr = map->phys_addr;
+ err = 0;
+ break;
+ } else if (rioid == map->rioid &&
+ raddr < (map->rio_addr + map->size - 1) &&
+ (raddr + size) > map->rio_addr) {
+ err = -EBUSY;
+ break;
+ }
+ }
+
+ /* If not found, create new */
+ if (err == -ENOMEM)
+ err = rio_mport_create_outbound_mapping(md, filp, rioid, raddr,
+ size, paddr);
+ mutex_unlock(&md->buf_mutex);
+ return err;
+}
+
+static int rio_mport_obw_map(struct file *filp, void __user *arg)
+{
+ struct mport_cdev_priv *priv = filp->private_data;
+ struct mport_dev *data = priv->md;
+ struct rio_mmap map;
+ dma_addr_t paddr;
+ int ret;
+
+ if (unlikely(copy_from_user(&map, arg, sizeof(struct rio_mmap))))
+ return -EFAULT;
+
+ rmcd_debug(OBW, "did=%d ra=0x%llx sz=0x%llx",
+ map.rioid, map.rio_addr, map.length);
+
+ ret = rio_mport_get_outbound_mapping(data, filp, map.rioid,
+ map.rio_addr, map.length, &paddr);
+ if (ret < 0) {
+ rmcd_error("Failed to set OBW err= %d", ret);
+ return ret;
+ }
+
+ map.handle = paddr;
+
+ if (unlikely(copy_to_user(arg, &map, sizeof(struct rio_mmap))))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ * rio_mport_obw_free() - unmap an OutBound Window from RapidIO address space
+ *
+ * @priv: driver private data
+ * @arg: buffer handle returned by allocation routine
+ */
+static int rio_mport_obw_free(struct file *filp, void __user *arg)
+{
+ struct mport_cdev_priv *priv = filp->private_data;
+ struct mport_dev *md = priv->md;
+ u64 handle;
+ struct rio_mport_mapping *map, *_map;
+
+ if (!md->mport->ops->unmap_outb)
+ return -EPROTONOSUPPORT;
+
+ if (copy_from_user(&handle, arg, sizeof(u64)))
+ return -EFAULT;
+
+ rmcd_debug(OBW, "h=0x%llx", handle);
+
+ mutex_lock(&md->buf_mutex);
+ list_for_each_entry_safe(map, _map, &md->mappings, node) {
+ if (map->dir == MAP_OUTBOUND && map->phys_addr == handle) {
+ if (map->filp == filp) {
+ rmcd_debug(OBW, "kref_put h=0x%llx", handle);
+ map->filp = NULL;
+ kref_put(&map->ref, mport_release_mapping);
+ }
+ break;
+ }
+ }
+ mutex_unlock(&md->buf_mutex);
+
+ return 0;
+}
+
+/*
+ * maint_hdid_set() - Set the host Device ID
+ * @priv: driver private data
+ * @arg: Device Id
+ */
+static int maint_hdid_set(struct mport_cdev_priv *priv, void __user *arg)
+{
+ struct mport_dev *md = priv->md;
+ uint16_t hdid;
+
+ if (copy_from_user(&hdid, arg, sizeof(uint16_t)))
+ return -EFAULT;
+
+ md->mport->host_deviceid = hdid;
+ md->properties.hdid = hdid;
+ rio_local_set_device_id(md->mport, hdid);
+
+ rmcd_debug(MPORT, "Set host device Id to %d", hdid);
+
+ return 0;
+}
+
+/*
+ * maint_comptag_set() - Set the host Component Tag
+ * @priv: driver private data
+ * @arg: Component Tag
+ */
+static int maint_comptag_set(struct mport_cdev_priv *priv, void __user *arg)
+{
+ struct mport_dev *md = priv->md;
+ uint32_t comptag;
+
+ if (copy_from_user(&comptag, arg, sizeof(uint32_t)))
+ return -EFAULT;
+
+ rio_local_write_config_32(md->mport, RIO_COMPONENT_TAG_CSR, comptag);
+
+ rmcd_debug(MPORT, "Set host Component Tag to %d", comptag);
+
+ return 0;
+}
+
+#ifdef CONFIG_DMA_ENGINE
+
+struct mport_dma_req {
+ struct list_head node;
+ struct file *filp;
+ struct mport_cdev_priv *priv;
+ enum rio_transfer_sync sync;
+ struct sg_table sgt;
+ struct page **page_list;
+ unsigned int nr_pages;
+ struct rio_mport_mapping *map;
+ struct dma_chan *dmach;
+ enum dma_data_direction dir;
+ dma_cookie_t cookie;
+ enum dma_status status;
+ struct completion req_comp;
+};
+
+struct mport_faf_work {
+ struct work_struct work;
+ struct mport_dma_req *req;
+};
+
+static void mport_release_def_dma(struct kref *dma_ref)
+{
+ struct mport_dev *md =
+ container_of(dma_ref, struct mport_dev, dma_ref);
+
+ rmcd_debug(EXIT, "DMA_%d", md->dma_chan->chan_id);
+ rio_release_dma(md->dma_chan);
+ md->dma_chan = NULL;
+}
+
+static void mport_release_dma(struct kref *dma_ref)
+{
+ struct mport_cdev_priv *priv =
+ container_of(dma_ref, struct mport_cdev_priv, dma_ref);
+
+ rmcd_debug(EXIT, "DMA_%d", priv->dmach->chan_id);
+ complete(&priv->comp);
+}
+
+static void dma_req_free(struct mport_dma_req *req)
+{
+ struct mport_cdev_priv *priv = req->priv;
+ unsigned int i;
+
+ dma_unmap_sg(req->dmach->device->dev,
+ req->sgt.sgl, req->sgt.nents, req->dir);
+ sg_free_table(&req->sgt);
+ if (req->page_list) {
+ for (i = 0; i < req->nr_pages; i++)
+ put_page(req->page_list[i]);
+ kfree(req->page_list);
+ }
+
+ if (req->map) {
+ mutex_lock(&req->map->md->buf_mutex);
+ kref_put(&req->map->ref, mport_release_mapping);
+ mutex_unlock(&req->map->md->buf_mutex);
+ }
+
+ kref_put(&priv->dma_ref, mport_release_dma);
+
+ kfree(req);
+}
+
+static void dma_xfer_callback(void *param)
+{
+ struct mport_dma_req *req = (struct mport_dma_req *)param;
+ struct mport_cdev_priv *priv = req->priv;
+
+ req->status = dma_async_is_tx_complete(priv->dmach, req->cookie,
+ NULL, NULL);
+ complete(&req->req_comp);
+}
+
+static void dma_faf_cleanup(struct work_struct *_work)
+{
+ struct mport_faf_work *work = container_of(_work,
+ struct mport_faf_work, work);
+ struct mport_dma_req *req = work->req;
+
+ dma_req_free(req);
+ kfree(work);
+}
+
+static void dma_faf_callback(void *param)
+{
+ struct mport_dma_req *req = (struct mport_dma_req *)param;
+ struct mport_faf_work *work;
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (!work)
+ return;
+
+ INIT_WORK(&work->work, dma_faf_cleanup);
+ work->req = req;
+ queue_work(dma_wq, &work->work);
+}
+
+/*
+ * prep_dma_xfer() - Configure and send request to DMAengine to prepare DMA
+ * transfer object.
+ * Returns pointer to DMA transaction descriptor allocated by DMA driver on
+ * success or ERR_PTR (and/or NULL) if failed. Caller must check returned
+ * non-NULL pointer using IS_ERR macro.
+ */
+static struct dma_async_tx_descriptor
+*prep_dma_xfer(struct dma_chan *chan, struct rio_transfer_io *transfer,
+ struct sg_table *sgt, int nents, enum dma_transfer_direction dir,
+ enum dma_ctrl_flags flags)
+{
+ struct rio_dma_data tx_data;
+
+ tx_data.sg = sgt->sgl;
+ tx_data.sg_len = nents;
+ tx_data.rio_addr_u = 0;
+ tx_data.rio_addr = transfer->rio_addr;
+ if (dir == DMA_MEM_TO_DEV) {
+ switch (transfer->method) {
+ case RIO_EXCHANGE_NWRITE:
+ tx_data.wr_type = RDW_ALL_NWRITE;
+ break;
+ case RIO_EXCHANGE_NWRITE_R_ALL:
+ tx_data.wr_type = RDW_ALL_NWRITE_R;
+ break;
+ case RIO_EXCHANGE_NWRITE_R:
+ tx_data.wr_type = RDW_LAST_NWRITE_R;
+ break;
+ case RIO_EXCHANGE_DEFAULT:
+ tx_data.wr_type = RDW_DEFAULT;
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ return rio_dma_prep_xfer(chan, transfer->rioid, &tx_data, dir, flags);
+}
+
+/* Request DMA channel associated with this mport device.
+ * Try to request DMA channel for every new process that opened given
+ * mport. If a new DMA channel is not available use default channel
+ * which is the first DMA channel opened on mport device.
+ */
+static int get_dma_channel(struct mport_cdev_priv *priv)
+{
+ mutex_lock(&priv->dma_lock);
+ if (!priv->dmach) {
+ priv->dmach = rio_request_mport_dma(priv->md->mport);
+ if (!priv->dmach) {
+ /* Use default DMA channel if available */
+ if (priv->md->dma_chan) {
+ priv->dmach = priv->md->dma_chan;
+ kref_get(&priv->md->dma_ref);
+ } else {
+ rmcd_error("Failed to get DMA channel");
+ mutex_unlock(&priv->dma_lock);
+ return -ENODEV;
+ }
+ } else if (!priv->md->dma_chan) {
+ /* Register default DMA channel if we do not have one */
+ priv->md->dma_chan = priv->dmach;
+ kref_init(&priv->md->dma_ref);
+ rmcd_debug(DMA, "Register DMA_chan %d as default",
+ priv->dmach->chan_id);
+ }
+
+ kref_init(&priv->dma_ref);
+ init_completion(&priv->comp);
+ }
+
+ kref_get(&priv->dma_ref);
+ mutex_unlock(&priv->dma_lock);
+ return 0;
+}
+
+static void put_dma_channel(struct mport_cdev_priv *priv)
+{
+ kref_put(&priv->dma_ref, mport_release_dma);
+}
+
+/*
+ * DMA transfer functions
+ */
+static int do_dma_request(struct mport_dma_req *req,
+ struct rio_transfer_io *xfer,
+ enum rio_transfer_sync sync, int nents)
+{
+ struct mport_cdev_priv *priv;
+ struct sg_table *sgt;
+ struct dma_chan *chan;
+ struct dma_async_tx_descriptor *tx;
+ dma_cookie_t cookie;
+ unsigned long tmo = msecs_to_jiffies(dma_timeout);
+ enum dma_transfer_direction dir;
+ long wret;
+ int ret = 0;
+
+ priv = req->priv;
+ sgt = &req->sgt;
+
+ chan = priv->dmach;
+ dir = (req->dir == DMA_FROM_DEVICE) ? DMA_DEV_TO_MEM : DMA_MEM_TO_DEV;
+
+ rmcd_debug(DMA, "%s(%d) uses %s for DMA_%s",
+ current->comm, task_pid_nr(current),
+ dev_name(&chan->dev->device),
+ (dir == DMA_DEV_TO_MEM)?"READ":"WRITE");
+
+ /* Initialize DMA transaction request */
+ tx = prep_dma_xfer(chan, xfer, sgt, nents, dir,
+ DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
+
+ if (!tx) {
+ rmcd_debug(DMA, "prep error for %s A:0x%llx L:0x%llx",
+ (dir == DMA_DEV_TO_MEM)?"READ":"WRITE",
+ xfer->rio_addr, xfer->length);
+ ret = -EIO;
+ goto err_out;
+ } else if (IS_ERR(tx)) {
+ ret = PTR_ERR(tx);
+ rmcd_debug(DMA, "prep error %d for %s A:0x%llx L:0x%llx", ret,
+ (dir == DMA_DEV_TO_MEM)?"READ":"WRITE",
+ xfer->rio_addr, xfer->length);
+ goto err_out;
+ }
+
+ if (sync == RIO_TRANSFER_FAF)
+ tx->callback = dma_faf_callback;
+ else
+ tx->callback = dma_xfer_callback;
+ tx->callback_param = req;
+
+ req->dmach = chan;
+ req->sync = sync;
+ req->status = DMA_IN_PROGRESS;
+ init_completion(&req->req_comp);
+
+ cookie = dmaengine_submit(tx);
+ req->cookie = cookie;
+
+ rmcd_debug(DMA, "pid=%d DMA_%s tx_cookie = %d", task_pid_nr(current),
+ (dir == DMA_DEV_TO_MEM)?"READ":"WRITE", cookie);
+
+ if (dma_submit_error(cookie)) {
+ rmcd_error("submit err=%d (addr:0x%llx len:0x%llx)",
+ cookie, xfer->rio_addr, xfer->length);
+ ret = -EIO;
+ goto err_out;
+ }
+
+ dma_async_issue_pending(chan);
+
+ if (sync == RIO_TRANSFER_ASYNC) {
+ spin_lock(&priv->req_lock);
+ list_add_tail(&req->node, &priv->async_list);
+ spin_unlock(&priv->req_lock);
+ return cookie;
+ } else if (sync == RIO_TRANSFER_FAF)
+ return 0;
+
+ wret = wait_for_completion_interruptible_timeout(&req->req_comp, tmo);
+
+ if (wret == 0) {
+ /* Timeout on wait occurred */
+ rmcd_error("%s(%d) timed out waiting for DMA_%s %d",
+ current->comm, task_pid_nr(current),
+ (dir == DMA_DEV_TO_MEM)?"READ":"WRITE", cookie);
+ return -ETIMEDOUT;
+ } else if (wret == -ERESTARTSYS) {
+ /* Wait_for_completion was interrupted by a signal but DMA may
+ * be in progress
+ */
+ rmcd_error("%s(%d) wait for DMA_%s %d was interrupted",
+ current->comm, task_pid_nr(current),
+ (dir == DMA_DEV_TO_MEM)?"READ":"WRITE", cookie);
+ return -EINTR;
+ }
+
+ if (req->status != DMA_COMPLETE) {
+ /* DMA transaction completion was signaled with error */
+ rmcd_error("%s(%d) DMA_%s %d completed with status %d (ret=%d)",
+ current->comm, task_pid_nr(current),
+ (dir == DMA_DEV_TO_MEM)?"READ":"WRITE",
+ cookie, req->status, ret);
+ ret = -EIO;
+ }
+
+err_out:
+ return ret;
+}
+
+/*
+ * rio_dma_transfer() - Perform RapidIO DMA data transfer to/from
+ * the remote RapidIO device
+ * @filp: file pointer associated with the call
+ * @transfer_mode: DMA transfer mode
+ * @sync: synchronization mode
+ * @dir: DMA transfer direction (DMA_MEM_TO_DEV = write OR
+ * DMA_DEV_TO_MEM = read)
+ * @xfer: data transfer descriptor structure
+ */
+static int
+rio_dma_transfer(struct file *filp, uint32_t transfer_mode,
+ enum rio_transfer_sync sync, enum dma_data_direction dir,
+ struct rio_transfer_io *xfer)
+{
+ struct mport_cdev_priv *priv = filp->private_data;
+ unsigned long nr_pages = 0;
+ struct page **page_list = NULL;
+ struct mport_dma_req *req;
+ struct mport_dev *md = priv->md;
+ struct dma_chan *chan;
+ int i, ret;
+ int nents;
+
+ if (xfer->length == 0)
+ return -EINVAL;
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ ret = get_dma_channel(priv);
+ if (ret) {
+ kfree(req);
+ return ret;
+ }
+
+ /*
+ * If parameter loc_addr != NULL, we are transferring data from/to
+ * data buffer allocated in user-space: lock in memory user-space
+ * buffer pages and build an SG table for DMA transfer request
+ *
+ * Otherwise (loc_addr == NULL) contiguous kernel-space buffer is
+ * used for DMA data transfers: build single entry SG table using
+ * offset within the internal buffer specified by handle parameter.
+ */
+ if (xfer->loc_addr) {
+ unsigned long offset;
+ long pinned;
+
+ offset = (unsigned long)xfer->loc_addr & ~PAGE_MASK;
+ nr_pages = PAGE_ALIGN(xfer->length + offset) >> PAGE_SHIFT;
+
+ page_list = kmalloc_array(nr_pages,
+ sizeof(*page_list), GFP_KERNEL);
+ if (page_list == NULL) {
+ ret = -ENOMEM;
+ goto err_req;
+ }
+
+ down_read(&current->mm->mmap_sem);
+ pinned = get_user_pages(current, current->mm,
+ (unsigned long)xfer->loc_addr & PAGE_MASK,
+ nr_pages, dir == DMA_FROM_DEVICE, 0,
+ page_list, NULL);
+ up_read(&current->mm->mmap_sem);
+
+ if (pinned != nr_pages) {
+ if (pinned < 0) {
+ rmcd_error("get_user_pages err=%ld", pinned);
+ nr_pages = 0;
+ } else
+ rmcd_error("pinned %ld out of %ld pages",
+ pinned, nr_pages);
+ ret = -EFAULT;
+ goto err_pg;
+ }
+
+ ret = sg_alloc_table_from_pages(&req->sgt, page_list, nr_pages,
+ offset, xfer->length, GFP_KERNEL);
+ if (ret) {
+ rmcd_error("sg_alloc_table failed with err=%d", ret);
+ goto err_pg;
+ }
+
+ req->page_list = page_list;
+ req->nr_pages = nr_pages;
+ } else {
+ dma_addr_t baddr;
+ struct rio_mport_mapping *map;
+
+ baddr = (dma_addr_t)xfer->handle;
+
+ mutex_lock(&md->buf_mutex);
+ list_for_each_entry(map, &md->mappings, node) {
+ if (baddr >= map->phys_addr &&
+ baddr < (map->phys_addr + map->size)) {
+ kref_get(&map->ref);
+ req->map = map;
+ break;
+ }
+ }
+ mutex_unlock(&md->buf_mutex);
+
+ if (req->map == NULL) {
+ ret = -ENOMEM;
+ goto err_req;
+ }
+
+ if (xfer->length + xfer->offset > map->size) {
+ ret = -EINVAL;
+ goto err_req;
+ }
+
+ ret = sg_alloc_table(&req->sgt, 1, GFP_KERNEL);
+ if (unlikely(ret)) {
+ rmcd_error("sg_alloc_table failed for internal buf");
+ goto err_req;
+ }
+
+ sg_set_buf(req->sgt.sgl,
+ map->virt_addr + (baddr - map->phys_addr) +
+ xfer->offset, xfer->length);
+ }
+
+ req->dir = dir;
+ req->filp = filp;
+ req->priv = priv;
+ chan = priv->dmach;
+
+ nents = dma_map_sg(chan->device->dev,
+ req->sgt.sgl, req->sgt.nents, dir);
+ if (nents == -EFAULT) {
+ rmcd_error("Failed to map SG list");
+ return -EFAULT;
+ }
+
+ ret = do_dma_request(req, xfer, sync, nents);
+
+ if (ret >= 0) {
+ if (sync == RIO_TRANSFER_SYNC)
+ goto sync_out;
+ return ret; /* return ASYNC cookie */
+ }
+
+ if (ret == -ETIMEDOUT || ret == -EINTR) {
+ /*
+ * This can happen only in case of SYNC transfer.
+ * Do not free unfinished request structure immediately.
+ * Place it into pending list and deal with it later
+ */
+ spin_lock(&priv->req_lock);
+ list_add_tail(&req->node, &priv->pend_list);
+ spin_unlock(&priv->req_lock);
+ return ret;
+ }
+
+
+ rmcd_debug(DMA, "do_dma_request failed with err=%d", ret);
+sync_out:
+ dma_unmap_sg(chan->device->dev, req->sgt.sgl, req->sgt.nents, dir);
+ sg_free_table(&req->sgt);
+err_pg:
+ if (page_list) {
+ for (i = 0; i < nr_pages; i++)
+ put_page(page_list[i]);
+ kfree(page_list);
+ }
+err_req:
+ if (req->map) {
+ mutex_lock(&md->buf_mutex);
+ kref_put(&req->map->ref, mport_release_mapping);
+ mutex_unlock(&md->buf_mutex);
+ }
+ put_dma_channel(priv);
+ kfree(req);
+ return ret;
+}
+
+static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg)
+{
+ struct mport_cdev_priv *priv = filp->private_data;
+ struct rio_transaction transaction;
+ struct rio_transfer_io *transfer;
+ enum dma_data_direction dir;
+ int i, ret = 0;
+
+ if (unlikely(copy_from_user(&transaction, arg, sizeof(transaction))))
+ return -EFAULT;
+
+ if (transaction.count != 1)
+ return -EINVAL;
+
+ if ((transaction.transfer_mode &
+ priv->md->properties.transfer_mode) == 0)
+ return -ENODEV;
+
+ transfer = vmalloc(transaction.count * sizeof(struct rio_transfer_io));
+ if (!transfer)
+ return -ENOMEM;
+
+ if (unlikely(copy_from_user(transfer, transaction.block,
+ transaction.count * sizeof(struct rio_transfer_io)))) {
+ ret = -EFAULT;
+ goto out_free;
+ }
+
+ dir = (transaction.dir == RIO_TRANSFER_DIR_READ) ?
+ DMA_FROM_DEVICE : DMA_TO_DEVICE;
+ for (i = 0; i < transaction.count && ret == 0; i++)
+ ret = rio_dma_transfer(filp, transaction.transfer_mode,
+ transaction.sync, dir, &transfer[i]);
+
+ if (unlikely(copy_to_user(transaction.block, transfer,
+ transaction.count * sizeof(struct rio_transfer_io))))
+ ret = -EFAULT;
+
+out_free:
+ vfree(transfer);
+
+ return ret;
+}
+
+static int rio_mport_wait_for_async_dma(struct file *filp, void __user *arg)
+{
+ struct mport_cdev_priv *priv;
+ struct mport_dev *md;
+ struct rio_async_tx_wait w_param;
+ struct mport_dma_req *req;
+ dma_cookie_t cookie;
+ unsigned long tmo;
+ long wret;
+ int found = 0;
+ int ret;
+
+ priv = (struct mport_cdev_priv *)filp->private_data;
+ md = priv->md;
+
+ if (unlikely(copy_from_user(&w_param, arg, sizeof(w_param))))
+ return -EFAULT;
+
+ cookie = w_param.token;
+ if (w_param.timeout)
+ tmo = msecs_to_jiffies(w_param.timeout);
+ else /* Use default DMA timeout */
+ tmo = msecs_to_jiffies(dma_timeout);
+
+ spin_lock(&priv->req_lock);
+ list_for_each_entry(req, &priv->async_list, node) {
+ if (req->cookie == cookie) {
+ list_del(&req->node);
+ found = 1;
+ break;
+ }
+ }
+ spin_unlock(&priv->req_lock);
+
+ if (!found)
+ return -EAGAIN;
+
+ wret = wait_for_completion_interruptible_timeout(&req->req_comp, tmo);
+
+ if (wret == 0) {
+ /* Timeout on wait occurred */
+ rmcd_error("%s(%d) timed out waiting for ASYNC DMA_%s",
+ current->comm, task_pid_nr(current),
+ (req->dir == DMA_FROM_DEVICE)?"READ":"WRITE");
+ ret = -ETIMEDOUT;
+ goto err_tmo;
+ } else if (wret == -ERESTARTSYS) {
+ /* Wait_for_completion was interrupted by a signal but DMA may
+ * be still in progress
+ */
+ rmcd_error("%s(%d) wait for ASYNC DMA_%s was interrupted",
+ current->comm, task_pid_nr(current),
+ (req->dir == DMA_FROM_DEVICE)?"READ":"WRITE");
+ ret = -EINTR;
+ goto err_tmo;
+ }
+
+ if (req->status != DMA_COMPLETE) {
+ /* DMA transaction completion signaled with transfer error */
+ rmcd_error("%s(%d) ASYNC DMA_%s completion with status %d",
+ current->comm, task_pid_nr(current),
+ (req->dir == DMA_FROM_DEVICE)?"READ":"WRITE",
+ req->status);
+ ret = -EIO;
+ } else
+ ret = 0;
+
+ if (req->status != DMA_IN_PROGRESS && req->status != DMA_PAUSED)
+ dma_req_free(req);
+
+ return ret;
+
+err_tmo:
+ /* Return request back into async queue */
+ spin_lock(&priv->req_lock);
+ list_add_tail(&req->node, &priv->async_list);
+ spin_unlock(&priv->req_lock);
+ return ret;
+}
+#else /* CONFIG_DMA_ENGINE */
+static int rio_mport_transfer_ioctl(struct file *filp, void *arg)
+{
+ return -ENODEV;
+}
+
+static int rio_mport_wait_for_async_dma(struct file *filp, int32_t arg)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_DMA_ENGINE */
+
+#ifdef CONFIG_RAPIDIO_DMA_ENGINE
+
+static int rio_mport_create_dma_mapping(struct mport_dev *md, struct file *filp,
+ uint64_t size, struct rio_mport_mapping **mapping)
+{
+ struct rio_mport_mapping *map;
+
+ map = kzalloc(sizeof(struct rio_mport_mapping), GFP_KERNEL);
+ if (map == NULL)
+ return -ENOMEM;
+
+ map->virt_addr = dma_alloc_coherent(md->mport->dev.parent, size,
+ &map->phys_addr, GFP_KERNEL);
+ if (map->virt_addr == NULL) {
+ kfree(map);
+ return -ENOMEM;
+ }
+
+ map->dir = MAP_DMA;
+ map->size = size;
+ map->filp = filp;
+ map->md = md;
+ kref_init(&map->ref);
+ mutex_lock(&md->buf_mutex);
+ list_add_tail(&map->node, &md->mappings);
+ mutex_unlock(&md->buf_mutex);
+ *mapping = map;
+
+ return 0;
+}
+
+static int rio_mport_alloc_dma(struct file *filp, void __user *arg)
+{
+ struct mport_cdev_priv *priv = filp->private_data;
+ struct mport_dev *md = priv->md;
+ struct rio_dma_mem map;
+ struct rio_mport_mapping *mapping = NULL;
+ int ret;
+
+ if (unlikely(copy_from_user(&map, arg, sizeof(struct rio_dma_mem))))
+ return -EFAULT;
+
+ ret = rio_mport_create_dma_mapping(md, filp, map.length, &mapping);
+ if (ret)
+ return ret;
+
+ map.dma_handle = mapping->phys_addr;
+
+ if (unlikely(copy_to_user(arg, &map, sizeof(struct rio_dma_mem)))) {
+ mutex_lock(&md->buf_mutex);
+ kref_put(&mapping->ref, mport_release_mapping);
+ mutex_unlock(&md->buf_mutex);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int rio_mport_free_dma(struct file *filp, void __user *arg)
+{
+ struct mport_cdev_priv *priv = filp->private_data;
+ struct mport_dev *md = priv->md;
+ u64 handle;
+ int ret = -EFAULT;
+ struct rio_mport_mapping *map, *_map;
+
+ if (copy_from_user(&handle, arg, sizeof(u64)))
+ return -EFAULT;
+ rmcd_debug(EXIT, "filp=%p", filp);
+
+ mutex_lock(&md->buf_mutex);
+ list_for_each_entry_safe(map, _map, &md->mappings, node) {
+ if (map->dir == MAP_DMA && map->phys_addr == handle &&
+ map->filp == filp) {
+ kref_put(&map->ref, mport_release_mapping);
+ ret = 0;
+ break;
+ }
+ }
+ mutex_unlock(&md->buf_mutex);
+
+ if (ret == -EFAULT) {
+ rmcd_debug(DMA, "ERR no matching mapping");
+ return ret;
+ }
+
+ return 0;
+}
+
+#endif /* CONFIG_RAPIDIO_DMA_ENGINE */
+
+/*
+ * Inbound/outbound memory mapping functions
+ */
+
+static int
+rio_mport_create_inbound_mapping(struct mport_dev *md, struct file *filp,
+ u64 raddr, u32 size,
+ struct rio_mport_mapping **mapping)
+{
+ struct rio_mport *mport = md->mport;
+ struct rio_mport_mapping *map;
+ int ret;
+
+ map = kzalloc(sizeof(struct rio_mport_mapping), GFP_KERNEL);
+ if (map == NULL)
+ return -ENOMEM;
+
+ map->virt_addr = dma_alloc_coherent(mport->dev.parent, size,
+ &map->phys_addr, GFP_KERNEL);
+ if (map->virt_addr == NULL) {
+ ret = -ENOMEM;
+ goto err_dma_alloc;
+ }
+
+ if (raddr == RIO_MAP_ANY_ADDR)
+ raddr = map->phys_addr;
+ ret = rio_map_inb_region(mport, map->phys_addr, raddr, size, 0);
+ if (ret < 0)
+ goto err_map_inb;
+
+ map->dir = MAP_INBOUND;
+ map->rio_addr = raddr;
+ map->size = size;
+ map->filp = filp;
+ map->md = md;
+ kref_init(&map->ref);
+ mutex_lock(&md->buf_mutex);
+ list_add_tail(&map->node, &md->mappings);
+ mutex_unlock(&md->buf_mutex);
+ *mapping = map;
+ return 0;
+
+err_map_inb:
+ dma_free_coherent(mport->dev.parent, size,
+ map->virt_addr, map->phys_addr);
+err_dma_alloc:
+ kfree(map);
+ return ret;
+}
+
+static int
+rio_mport_get_inbound_mapping(struct mport_dev *md, struct file *filp,
+ u64 raddr, u32 size,
+ struct rio_mport_mapping **mapping)
+{
+ struct rio_mport_mapping *map;
+ int err = -ENOMEM;
+
+ if (raddr == RIO_MAP_ANY_ADDR)
+ goto get_new;
+
+ mutex_lock(&md->buf_mutex);
+ list_for_each_entry(map, &md->mappings, node) {
+ if (map->dir != MAP_INBOUND)
+ continue;
+ if (raddr == map->rio_addr && size == map->size) {
+ /* allow exact match only */
+ *mapping = map;
+ err = 0;
+ break;
+ } else if (raddr < (map->rio_addr + map->size - 1) &&
+ (raddr + size) > map->rio_addr) {
+ err = -EBUSY;
+ break;
+ }
+ }
+ mutex_unlock(&md->buf_mutex);
+
+ if (err != -ENOMEM)
+ return err;
+get_new:
+ /* not found, create new */
+ return rio_mport_create_inbound_mapping(md, filp, raddr, size, mapping);
+}
+
+static int rio_mport_map_inbound(struct file *filp, void __user *arg)
+{
+ struct mport_cdev_priv *priv = filp->private_data;
+ struct mport_dev *md = priv->md;
+ struct rio_mmap map;
+ struct rio_mport_mapping *mapping = NULL;
+ int ret;
+
+ if (!md->mport->ops->map_inb)
+ return -EPROTONOSUPPORT;
+ if (unlikely(copy_from_user(&map, arg, sizeof(struct rio_mmap))))
+ return -EFAULT;
+
+ rmcd_debug(IBW, "%s filp=%p", dev_name(&priv->md->dev), filp);
+
+ ret = rio_mport_get_inbound_mapping(md, filp, map.rio_addr,
+ map.length, &mapping);
+ if (ret)
+ return ret;
+
+ map.handle = mapping->phys_addr;
+ map.rio_addr = mapping->rio_addr;
+
+ if (unlikely(copy_to_user(arg, &map, sizeof(struct rio_mmap)))) {
+ /* Delete mapping if it was created by this request */
+ if (ret == 0 && mapping->filp == filp) {
+ mutex_lock(&md->buf_mutex);
+ kref_put(&mapping->ref, mport_release_mapping);
+ mutex_unlock(&md->buf_mutex);
+ }
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * rio_mport_inbound_free() - unmap from RapidIO address space and free
+ * previously allocated inbound DMA coherent buffer
+ * @priv: driver private data
+ * @arg: buffer handle returned by allocation routine
+ */
+static int rio_mport_inbound_free(struct file *filp, void __user *arg)
+{
+ struct mport_cdev_priv *priv = filp->private_data;
+ struct mport_dev *md = priv->md;
+ u64 handle;
+ struct rio_mport_mapping *map, *_map;
+
+ rmcd_debug(IBW, "%s filp=%p", dev_name(&priv->md->dev), filp);
+
+ if (!md->mport->ops->unmap_inb)
+ return -EPROTONOSUPPORT;
+
+ if (copy_from_user(&handle, arg, sizeof(u64)))
+ return -EFAULT;
+
+ mutex_lock(&md->buf_mutex);
+ list_for_each_entry_safe(map, _map, &md->mappings, node) {
+ if (map->dir == MAP_INBOUND && map->phys_addr == handle) {
+ if (map->filp == filp) {
+ map->filp = NULL;
+ kref_put(&map->ref, mport_release_mapping);
+ }
+ break;
+ }
+ }
+ mutex_unlock(&md->buf_mutex);
+
+ return 0;
+}
+
+/*
+ * maint_port_idx_get() - Get the port index of the mport instance
+ * @priv: driver private data
+ * @arg: port index
+ */
+static int maint_port_idx_get(struct mport_cdev_priv *priv, void __user *arg)
+{
+ struct mport_dev *md = priv->md;
+ uint32_t port_idx = md->mport->index;
+
+ rmcd_debug(MPORT, "port_index=%d", port_idx);
+
+ if (copy_to_user(arg, &port_idx, sizeof(port_idx)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int rio_mport_add_event(struct mport_cdev_priv *priv,
+ struct rio_event *event)
+{
+ int overflow;
+
+ if (!(priv->event_mask & event->header))
+ return -EACCES;
+
+ spin_lock(&priv->fifo_lock);
+ overflow = kfifo_avail(&priv->event_fifo) < sizeof(*event)
+ || kfifo_in(&priv->event_fifo, (unsigned char *)event,
+ sizeof(*event)) != sizeof(*event);
+ spin_unlock(&priv->fifo_lock);
+
+ wake_up_interruptible(&priv->event_rx_wait);
+
+ if (overflow) {
+ dev_warn(&priv->md->dev, DRV_NAME ": event fifo overflow\n");
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static void rio_mport_doorbell_handler(struct rio_mport *mport, void *dev_id,
+ u16 src, u16 dst, u16 info)
+{
+ struct mport_dev *data = dev_id;
+ struct mport_cdev_priv *priv;
+ struct rio_mport_db_filter *db_filter;
+ struct rio_event event;
+ int handled;
+
+ event.header = RIO_DOORBELL;
+ event.u.doorbell.rioid = src;
+ event.u.doorbell.payload = info;
+
+ handled = 0;
+ spin_lock(&data->db_lock);
+ list_for_each_entry(db_filter, &data->doorbells, data_node) {
+ if (((db_filter->filter.rioid == 0xffffffff ||
+ db_filter->filter.rioid == src)) &&
+ info >= db_filter->filter.low &&
+ info <= db_filter->filter.high) {
+ priv = db_filter->priv;
+ rio_mport_add_event(priv, &event);
+ handled = 1;
+ }
+ }
+ spin_unlock(&data->db_lock);
+
+ if (!handled)
+ dev_warn(&data->dev,
+ "%s: spurious DB received from 0x%x, info=0x%04x\n",
+ __func__, src, info);
+}
+
+static int rio_mport_add_db_filter(struct mport_cdev_priv *priv,
+ void __user *arg)
+{
+ struct mport_dev *md = priv->md;
+ struct rio_mport_db_filter *db_filter;
+ struct rio_doorbell_filter filter;
+ unsigned long flags;
+ int ret;
+
+ if (copy_from_user(&filter, arg, sizeof(filter)))
+ return -EFAULT;
+
+ if (filter.low > filter.high)
+ return -EINVAL;
+
+ ret = rio_request_inb_dbell(md->mport, md, filter.low, filter.high,
+ rio_mport_doorbell_handler);
+ if (ret) {
+ rmcd_error("%s failed to register IBDB, err=%d",
+ dev_name(&md->dev), ret);
+ return ret;
+ }
+
+ db_filter = kzalloc(sizeof(*db_filter), GFP_KERNEL);
+ if (db_filter == NULL) {
+ rio_release_inb_dbell(md->mport, filter.low, filter.high);
+ return -ENOMEM;
+ }
+
+ db_filter->filter = filter;
+ db_filter->priv = priv;
+ spin_lock_irqsave(&md->db_lock, flags);
+ list_add_tail(&db_filter->priv_node, &priv->db_filters);
+ list_add_tail(&db_filter->data_node, &md->doorbells);
+ spin_unlock_irqrestore(&md->db_lock, flags);
+
+ return 0;
+}
+
+static void rio_mport_delete_db_filter(struct rio_mport_db_filter *db_filter)
+{
+ list_del(&db_filter->data_node);
+ list_del(&db_filter->priv_node);
+ kfree(db_filter);
+}
+
+static int rio_mport_remove_db_filter(struct mport_cdev_priv *priv,
+ void __user *arg)
+{
+ struct rio_mport_db_filter *db_filter;
+ struct rio_doorbell_filter filter;
+ unsigned long flags;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&filter, arg, sizeof(filter)))
+ return -EFAULT;
+
+ spin_lock_irqsave(&priv->md->db_lock, flags);
+ list_for_each_entry(db_filter, &priv->db_filters, priv_node) {
+ if (db_filter->filter.rioid == filter.rioid &&
+ db_filter->filter.low == filter.low &&
+ db_filter->filter.high == filter.high) {
+ rio_mport_delete_db_filter(db_filter);
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&priv->md->db_lock, flags);
+
+ if (!ret)
+ rio_release_inb_dbell(priv->md->mport, filter.low, filter.high);
+
+ return ret;
+}
+
+static int rio_mport_match_pw(union rio_pw_msg *msg,
+ struct rio_pw_filter *filter)
+{
+ if ((msg->em.comptag & filter->mask) < filter->low ||
+ (msg->em.comptag & filter->mask) > filter->high)
+ return 0;
+ return 1;
+}
+
+static int rio_mport_pw_handler(struct rio_mport *mport, void *context,
+ union rio_pw_msg *msg, int step)
+{
+ struct mport_dev *md = context;
+ struct mport_cdev_priv *priv;
+ struct rio_mport_pw_filter *pw_filter;
+ struct rio_event event;
+ int handled;
+
+ event.header = RIO_PORTWRITE;
+ memcpy(event.u.portwrite.payload, msg->raw, RIO_PW_MSG_SIZE);
+
+ handled = 0;
+ spin_lock(&md->pw_lock);
+ list_for_each_entry(pw_filter, &md->portwrites, md_node) {
+ if (rio_mport_match_pw(msg, &pw_filter->filter)) {
+ priv = pw_filter->priv;
+ rio_mport_add_event(priv, &event);
+ handled = 1;
+ }
+ }
+ spin_unlock(&md->pw_lock);
+
+ if (!handled) {
+ printk_ratelimited(KERN_WARNING DRV_NAME
+ ": mport%d received spurious PW from 0x%08x\n",
+ mport->id, msg->em.comptag);
+ }
+
+ return 0;
+}
+
+static int rio_mport_add_pw_filter(struct mport_cdev_priv *priv,
+ void __user *arg)
+{
+ struct mport_dev *md = priv->md;
+ struct rio_mport_pw_filter *pw_filter;
+ struct rio_pw_filter filter;
+ unsigned long flags;
+ int hadd = 0;
+
+ if (copy_from_user(&filter, arg, sizeof(filter)))
+ return -EFAULT;
+
+ pw_filter = kzalloc(sizeof(*pw_filter), GFP_KERNEL);
+ if (pw_filter == NULL)
+ return -ENOMEM;
+
+ pw_filter->filter = filter;
+ pw_filter->priv = priv;
+ spin_lock_irqsave(&md->pw_lock, flags);
+ if (list_empty(&md->portwrites))
+ hadd = 1;
+ list_add_tail(&pw_filter->priv_node, &priv->pw_filters);
+ list_add_tail(&pw_filter->md_node, &md->portwrites);
+ spin_unlock_irqrestore(&md->pw_lock, flags);
+
+ if (hadd) {
+ int ret;
+
+ ret = rio_add_mport_pw_handler(md->mport, md,
+ rio_mport_pw_handler);
+ if (ret) {
+ dev_err(&md->dev,
+ "%s: failed to add IB_PW handler, err=%d\n",
+ __func__, ret);
+ return ret;
+ }
+ rio_pw_enable(md->mport, 1);
+ }
+
+ return 0;
+}
+
+static void rio_mport_delete_pw_filter(struct rio_mport_pw_filter *pw_filter)
+{
+ list_del(&pw_filter->md_node);
+ list_del(&pw_filter->priv_node);
+ kfree(pw_filter);
+}
+
+static int rio_mport_match_pw_filter(struct rio_pw_filter *a,
+ struct rio_pw_filter *b)
+{
+ if ((a->mask == b->mask) && (a->low == b->low) && (a->high == b->high))
+ return 1;
+ return 0;
+}
+
+static int rio_mport_remove_pw_filter(struct mport_cdev_priv *priv,
+ void __user *arg)
+{
+ struct mport_dev *md = priv->md;
+ struct rio_mport_pw_filter *pw_filter;
+ struct rio_pw_filter filter;
+ unsigned long flags;
+ int ret = -EINVAL;
+ int hdel = 0;
+
+ if (copy_from_user(&filter, arg, sizeof(filter)))
+ return -EFAULT;
+
+ spin_lock_irqsave(&md->pw_lock, flags);
+ list_for_each_entry(pw_filter, &priv->pw_filters, priv_node) {
+ if (rio_mport_match_pw_filter(&pw_filter->filter, &filter)) {
+ rio_mport_delete_pw_filter(pw_filter);
+ ret = 0;
+ break;
+ }
+ }
+
+ if (list_empty(&md->portwrites))
+ hdel = 1;
+ spin_unlock_irqrestore(&md->pw_lock, flags);
+
+ if (hdel) {
+ rio_del_mport_pw_handler(md->mport, priv->md,
+ rio_mport_pw_handler);
+ rio_pw_enable(md->mport, 0);
+ }
+
+ return ret;
+}
+
+/*
+ * rio_release_dev - release routine for kernel RIO device object
+ * @dev: kernel device object associated with a RIO device structure
+ *
+ * Frees a RIO device struct associated a RIO device struct.
+ * The RIO device struct is freed.
+ */
+static void rio_release_dev(struct device *dev)
+{
+ struct rio_dev *rdev;
+
+ rdev = to_rio_dev(dev);
+ pr_info(DRV_PREFIX "%s: %s\n", __func__, rio_name(rdev));
+ kfree(rdev);
+}
+
+
+static void rio_release_net(struct device *dev)
+{
+ struct rio_net *net;
+
+ net = to_rio_net(dev);
+ rmcd_debug(RDEV, "net_%d", net->id);
+ kfree(net);
+}
+
+
+/*
+ * rio_mport_add_riodev - creates a kernel RIO device object
+ *
+ * Allocates a RIO device data structure and initializes required fields based
+ * on device's configuration space contents.
+ * If the device has switch capabilities, then a switch specific portion is
+ * allocated and configured.
+ */
+static int rio_mport_add_riodev(struct mport_cdev_priv *priv,
+ void __user *arg)
+{
+ struct mport_dev *md = priv->md;
+ struct rio_rdev_info dev_info;
+ struct rio_dev *rdev;
+ struct rio_switch *rswitch = NULL;
+ struct rio_mport *mport;
+ size_t size;
+ u32 rval;
+ u32 swpinfo = 0;
+ u16 destid;
+ u8 hopcount;
+ int err;
+
+ if (copy_from_user(&dev_info, arg, sizeof(dev_info)))
+ return -EFAULT;
+
+ rmcd_debug(RDEV, "name:%s ct:0x%x did:0x%x hc:0x%x", dev_info.name,
+ dev_info.comptag, dev_info.destid, dev_info.hopcount);
+
+ if (bus_find_device_by_name(&rio_bus_type, NULL, dev_info.name)) {
+ rmcd_debug(RDEV, "device %s already exists", dev_info.name);
+ return -EEXIST;
+ }
+
+ size = sizeof(struct rio_dev);
+ mport = md->mport;
+ destid = (u16)dev_info.destid;
+ hopcount = (u8)dev_info.hopcount;
+
+ if (rio_mport_read_config_32(mport, destid, hopcount,
+ RIO_PEF_CAR, &rval))
+ return -EIO;
+
+ if (rval & RIO_PEF_SWITCH) {
+ rio_mport_read_config_32(mport, destid, hopcount,
+ RIO_SWP_INFO_CAR, &swpinfo);
+ size += (RIO_GET_TOTAL_PORTS(swpinfo) *
+ sizeof(rswitch->nextdev[0])) + sizeof(*rswitch);
+ }
+
+ rdev = kzalloc(size, GFP_KERNEL);
+ if (rdev == NULL)
+ return -ENOMEM;
+
+ if (mport->net == NULL) {
+ struct rio_net *net;
+
+ net = rio_alloc_net(mport);
+ if (!net) {
+ err = -ENOMEM;
+ rmcd_debug(RDEV, "failed to allocate net object");
+ goto cleanup;
+ }
+
+ net->id = mport->id;
+ net->hport = mport;
+ dev_set_name(&net->dev, "rnet_%d", net->id);
+ net->dev.parent = &mport->dev;
+ net->dev.release = rio_release_net;
+ err = rio_add_net(net);
+ if (err) {
+ rmcd_debug(RDEV, "failed to register net, err=%d", err);
+ kfree(net);
+ goto cleanup;
+ }
+ }
+
+ rdev->net = mport->net;
+ rdev->pef = rval;
+ rdev->swpinfo = swpinfo;
+ rio_mport_read_config_32(mport, destid, hopcount,
+ RIO_DEV_ID_CAR, &rval);
+ rdev->did = rval >> 16;
+ rdev->vid = rval & 0xffff;
+ rio_mport_read_config_32(mport, destid, hopcount, RIO_DEV_INFO_CAR,
+ &rdev->device_rev);
+ rio_mport_read_config_32(mport, destid, hopcount, RIO_ASM_ID_CAR,
+ &rval);
+ rdev->asm_did = rval >> 16;
+ rdev->asm_vid = rval & 0xffff;
+ rio_mport_read_config_32(mport, destid, hopcount, RIO_ASM_INFO_CAR,
+ &rval);
+ rdev->asm_rev = rval >> 16;
+
+ if (rdev->pef & RIO_PEF_EXT_FEATURES) {
+ rdev->efptr = rval & 0xffff;
+ rdev->phys_efptr = rio_mport_get_physefb(mport, 0, destid,
+ hopcount);
+
+ rdev->em_efptr = rio_mport_get_feature(mport, 0, destid,
+ hopcount, RIO_EFB_ERR_MGMNT);
+ }
+
+ rio_mport_read_config_32(mport, destid, hopcount, RIO_SRC_OPS_CAR,
+ &rdev->src_ops);
+ rio_mport_read_config_32(mport, destid, hopcount, RIO_DST_OPS_CAR,
+ &rdev->dst_ops);
+
+ rdev->comp_tag = dev_info.comptag;
+ rdev->destid = destid;
+ /* hopcount is stored as specified by a caller, regardles of EP or SW */
+ rdev->hopcount = hopcount;
+
+ if (rdev->pef & RIO_PEF_SWITCH) {
+ rswitch = rdev->rswitch;
+ rswitch->route_table = NULL;
+ }
+
+ if (strlen(dev_info.name))
+ dev_set_name(&rdev->dev, "%s", dev_info.name);
+ else if (rdev->pef & RIO_PEF_SWITCH)
+ dev_set_name(&rdev->dev, "%02x:s:%04x", mport->id,
+ rdev->comp_tag & RIO_CTAG_UDEVID);
+ else
+ dev_set_name(&rdev->dev, "%02x:e:%04x", mport->id,
+ rdev->comp_tag & RIO_CTAG_UDEVID);
+
+ INIT_LIST_HEAD(&rdev->net_list);
+ rdev->dev.parent = &mport->net->dev;
+ rio_attach_device(rdev);
+ rdev->dev.release = rio_release_dev;
+
+ if (rdev->dst_ops & RIO_DST_OPS_DOORBELL)
+ rio_init_dbell_res(&rdev->riores[RIO_DOORBELL_RESOURCE],
+ 0, 0xffff);
+ err = rio_add_device(rdev);
+ if (err)
+ goto cleanup;
+ rio_dev_get(rdev);
+
+ return 0;
+cleanup:
+ kfree(rdev);
+ return err;
+}
+
+static int rio_mport_del_riodev(struct mport_cdev_priv *priv, void __user *arg)
+{
+ struct rio_rdev_info dev_info;
+ struct rio_dev *rdev = NULL;
+ struct device *dev;
+ struct rio_mport *mport;
+ struct rio_net *net;
+
+ if (copy_from_user(&dev_info, arg, sizeof(dev_info)))
+ return -EFAULT;
+
+ mport = priv->md->mport;
+
+ /* If device name is specified, removal by name has priority */
+ if (strlen(dev_info.name)) {
+ dev = bus_find_device_by_name(&rio_bus_type, NULL,
+ dev_info.name);
+ if (dev)
+ rdev = to_rio_dev(dev);
+ } else {
+ do {
+ rdev = rio_get_comptag(dev_info.comptag, rdev);
+ if (rdev && rdev->dev.parent == &mport->net->dev &&
+ rdev->destid == (u16)dev_info.destid &&
+ rdev->hopcount == (u8)dev_info.hopcount)
+ break;
+ } while (rdev);
+ }
+
+ if (!rdev) {
+ rmcd_debug(RDEV,
+ "device name:%s ct:0x%x did:0x%x hc:0x%x not found",
+ dev_info.name, dev_info.comptag, dev_info.destid,
+ dev_info.hopcount);
+ return -ENODEV;
+ }
+
+ net = rdev->net;
+ rio_dev_put(rdev);
+ rio_del_device(rdev, RIO_DEVICE_SHUTDOWN);
+
+ if (list_empty(&net->devices)) {
+ rio_free_net(net);
+ mport->net = NULL;
+ }
+
+ return 0;
+}
+
+/*
+ * Mport cdev management
+ */
+
+/*
+ * mport_cdev_open() - Open character device (mport)
+ */
+static int mport_cdev_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+ int minor = iminor(inode);
+ struct mport_dev *chdev;
+ struct mport_cdev_priv *priv;
+
+ /* Test for valid device */
+ if (minor >= RIO_MAX_MPORTS) {
+ rmcd_error("Invalid minor device number");
+ return -EINVAL;
+ }
+
+ chdev = container_of(inode->i_cdev, struct mport_dev, cdev);
+
+ rmcd_debug(INIT, "%s filp=%p", dev_name(&chdev->dev), filp);
+
+ if (atomic_read(&chdev->active) == 0)
+ return -ENODEV;
+
+ get_device(&chdev->dev);
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv) {
+ put_device(&chdev->dev);
+ return -ENOMEM;
+ }
+
+ priv->md = chdev;
+
+ mutex_lock(&chdev->file_mutex);
+ list_add_tail(&priv->list, &chdev->file_list);
+ mutex_unlock(&chdev->file_mutex);
+
+ INIT_LIST_HEAD(&priv->db_filters);
+ INIT_LIST_HEAD(&priv->pw_filters);
+ INIT_LIST_HEAD(&priv->async_list);
+ INIT_LIST_HEAD(&priv->pend_list);
+ spin_lock_init(&priv->fifo_lock);
+ spin_lock_init(&priv->req_lock);
+ init_waitqueue_head(&priv->event_rx_wait);
+ ret = kfifo_alloc(&priv->event_fifo,
+ sizeof(struct rio_event) * MPORT_EVENT_DEPTH,
+ GFP_KERNEL);
+ if (ret < 0) {
+ dev_err(&chdev->dev, DRV_NAME ": kfifo_alloc failed\n");
+ ret = -ENOMEM;
+ goto err_fifo;
+ }
+
+#ifdef CONFIG_DMA_ENGINE
+ mutex_init(&priv->dma_lock);
+#endif
+
+ filp->private_data = priv;
+ goto out;
+err_fifo:
+ kfree(priv);
+out:
+ return ret;
+}
+
+static int mport_cdev_fasync(int fd, struct file *filp, int mode)
+{
+ struct mport_cdev_priv *priv = filp->private_data;
+
+ return fasync_helper(fd, filp, mode, &priv->async_queue);
+}
+
+#ifdef CONFIG_RAPIDIO_DMA_ENGINE
+static void mport_cdev_release_dma(struct file *filp)
+{
+ struct mport_cdev_priv *priv = filp->private_data;
+ struct mport_dev *md;
+ struct mport_dma_req *req, *req_next;
+ unsigned long tmo = msecs_to_jiffies(dma_timeout);
+ long wret;
+ LIST_HEAD(list);
+
+ rmcd_debug(EXIT, "from filp=%p %s(%d)",
+ filp, current->comm, task_pid_nr(current));
+
+ if (!priv->dmach) {
+ rmcd_debug(EXIT, "No DMA channel for filp=%p", filp);
+ return;
+ }
+
+ md = priv->md;
+
+ flush_workqueue(dma_wq);
+
+ spin_lock(&priv->req_lock);
+ if (!list_empty(&priv->async_list)) {
+ rmcd_debug(EXIT, "async list not empty filp=%p %s(%d)",
+ filp, current->comm, task_pid_nr(current));
+ list_splice_init(&priv->async_list, &list);
+ }
+ spin_unlock(&priv->req_lock);
+
+ if (!list_empty(&list)) {
+ rmcd_debug(EXIT, "temp list not empty");
+ list_for_each_entry_safe(req, req_next, &list, node) {
+ rmcd_debug(EXIT, "free req->filp=%p cookie=%d compl=%s",
+ req->filp, req->cookie,
+ completion_done(&req->req_comp)?"yes":"no");
+ list_del(&req->node);
+ dma_req_free(req);
+ }
+ }
+
+ if (!list_empty(&priv->pend_list)) {
+ rmcd_debug(EXIT, "Free pending DMA requests for filp=%p %s(%d)",
+ filp, current->comm, task_pid_nr(current));
+ list_for_each_entry_safe(req,
+ req_next, &priv->pend_list, node) {
+ rmcd_debug(EXIT, "free req->filp=%p cookie=%d compl=%s",
+ req->filp, req->cookie,
+ completion_done(&req->req_comp)?"yes":"no");
+ list_del(&req->node);
+ dma_req_free(req);
+ }
+ }
+
+ put_dma_channel(priv);
+ wret = wait_for_completion_interruptible_timeout(&priv->comp, tmo);
+
+ if (wret <= 0) {
+ rmcd_error("%s(%d) failed waiting for DMA release err=%ld",
+ current->comm, task_pid_nr(current), wret);
+ }
+
+ spin_lock(&priv->req_lock);
+
+ if (!list_empty(&priv->pend_list)) {
+ rmcd_debug(EXIT, "ATTN: pending DMA requests, filp=%p %s(%d)",
+ filp, current->comm, task_pid_nr(current));
+ }
+
+ spin_unlock(&priv->req_lock);
+
+ if (priv->dmach != priv->md->dma_chan) {
+ rmcd_debug(EXIT, "Release DMA channel for filp=%p %s(%d)",
+ filp, current->comm, task_pid_nr(current));
+ rio_release_dma(priv->dmach);
+ } else {
+ rmcd_debug(EXIT, "Adjust default DMA channel refcount");
+ kref_put(&md->dma_ref, mport_release_def_dma);
+ }
+
+ priv->dmach = NULL;
+}
+#else
+#define mport_cdev_release_dma(priv) do {} while (0)
+#endif
+
+/*
+ * mport_cdev_release() - Release character device
+ */
+static int mport_cdev_release(struct inode *inode, struct file *filp)
+{
+ struct mport_cdev_priv *priv = filp->private_data;
+ struct mport_dev *chdev;
+ struct rio_mport_pw_filter *pw_filter, *pw_filter_next;
+ struct rio_mport_db_filter *db_filter, *db_filter_next;
+ struct rio_mport_mapping *map, *_map;
+ unsigned long flags;
+
+ rmcd_debug(EXIT, "%s filp=%p", dev_name(&priv->md->dev), filp);
+
+ chdev = priv->md;
+ mport_cdev_release_dma(filp);
+
+ priv->event_mask = 0;
+
+ spin_lock_irqsave(&chdev->pw_lock, flags);
+ if (!list_empty(&priv->pw_filters)) {
+ list_for_each_entry_safe(pw_filter, pw_filter_next,
+ &priv->pw_filters, priv_node)
+ rio_mport_delete_pw_filter(pw_filter);
+ }
+ spin_unlock_irqrestore(&chdev->pw_lock, flags);
+
+ spin_lock_irqsave(&chdev->db_lock, flags);
+ list_for_each_entry_safe(db_filter, db_filter_next,
+ &priv->db_filters, priv_node) {
+ rio_mport_delete_db_filter(db_filter);
+ }
+ spin_unlock_irqrestore(&chdev->db_lock, flags);
+
+ kfifo_free(&priv->event_fifo);
+
+ mutex_lock(&chdev->buf_mutex);
+ list_for_each_entry_safe(map, _map, &chdev->mappings, node) {
+ if (map->filp == filp) {
+ rmcd_debug(EXIT, "release mapping %p filp=%p",
+ map->virt_addr, filp);
+ kref_put(&map->ref, mport_release_mapping);
+ }
+ }
+ mutex_unlock(&chdev->buf_mutex);
+
+ mport_cdev_fasync(-1, filp, 0);
+ filp->private_data = NULL;
+ mutex_lock(&chdev->file_mutex);
+ list_del(&priv->list);
+ mutex_unlock(&chdev->file_mutex);
+ put_device(&chdev->dev);
+ kfree(priv);
+ return 0;
+}
+
+/*
+ * mport_cdev_ioctl() - IOCTLs for character device
+ */
+static long mport_cdev_ioctl(struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ int err = -EINVAL;
+ struct mport_cdev_priv *data = filp->private_data;
+ struct mport_dev *md = data->md;
+
+ if (atomic_read(&md->active) == 0)
+ return -ENODEV;
+
+ switch (cmd) {
+ case RIO_MPORT_MAINT_READ_LOCAL:
+ return rio_mport_maint_rd(data, (void __user *)arg, 1);
+ case RIO_MPORT_MAINT_WRITE_LOCAL:
+ return rio_mport_maint_wr(data, (void __user *)arg, 1);
+ case RIO_MPORT_MAINT_READ_REMOTE:
+ return rio_mport_maint_rd(data, (void __user *)arg, 0);
+ case RIO_MPORT_MAINT_WRITE_REMOTE:
+ return rio_mport_maint_wr(data, (void __user *)arg, 0);
+ case RIO_MPORT_MAINT_HDID_SET:
+ return maint_hdid_set(data, (void __user *)arg);
+ case RIO_MPORT_MAINT_COMPTAG_SET:
+ return maint_comptag_set(data, (void __user *)arg);
+ case RIO_MPORT_MAINT_PORT_IDX_GET:
+ return maint_port_idx_get(data, (void __user *)arg);
+ case RIO_MPORT_GET_PROPERTIES:
+ md->properties.hdid = md->mport->host_deviceid;
+ err = copy_to_user((void __user *)arg, &(data->md->properties),
+ sizeof(data->md->properties));
+ break;
+ case RIO_ENABLE_DOORBELL_RANGE:
+ return rio_mport_add_db_filter(data, (void __user *)arg);
+ case RIO_DISABLE_DOORBELL_RANGE:
+ return rio_mport_remove_db_filter(data, (void __user *)arg);
+ case RIO_ENABLE_PORTWRITE_RANGE:
+ return rio_mport_add_pw_filter(data, (void __user *)arg);
+ case RIO_DISABLE_PORTWRITE_RANGE:
+ return rio_mport_remove_pw_filter(data, (void __user *)arg);
+ case RIO_SET_EVENT_MASK:
+ data->event_mask = arg;
+ return 0;
+ case RIO_GET_EVENT_MASK:
+ return copy_to_user((void __user *)arg, &data->event_mask,
+ sizeof(data->event_mask));
+ case RIO_MAP_OUTBOUND:
+ return rio_mport_obw_map(filp, (void __user *)arg);
+ case RIO_MAP_INBOUND:
+ return rio_mport_map_inbound(filp, (void __user *)arg);
+ case RIO_UNMAP_OUTBOUND:
+ return rio_mport_obw_free(filp, (void __user *)arg);
+ case RIO_UNMAP_INBOUND:
+ return rio_mport_inbound_free(filp, (void __user *)arg);
+ case RIO_ALLOC_DMA:
+ return rio_mport_alloc_dma(filp, (void __user *)arg);
+ case RIO_FREE_DMA:
+ return rio_mport_free_dma(filp, (void __user *)arg);
+ case RIO_WAIT_FOR_ASYNC:
+ return rio_mport_wait_for_async_dma(filp, (void __user *)arg);
+ case RIO_TRANSFER:
+ return rio_mport_transfer_ioctl(filp, (void __user *)arg);
+ case RIO_DEV_ADD:
+ return rio_mport_add_riodev(data, (void __user *)arg);
+ case RIO_DEV_DEL:
+ return rio_mport_del_riodev(data, (void __user *)arg);
+ default:
+ break;
+ }
+
+ return err;
+}
+
+/*
+ * mport_release_mapping - free mapping resources and info structure
+ * @ref: a pointer to the kref within struct rio_mport_mapping
+ *
+ * NOTE: Shall be called while holding buf_mutex.
+ */
+static void mport_release_mapping(struct kref *ref)
+{
+ struct rio_mport_mapping *map =
+ container_of(ref, struct rio_mport_mapping, ref);
+ struct rio_mport *mport = map->md->mport;
+
+ rmcd_debug(MMAP, "type %d mapping @ %p (phys = %pad) for %s",
+ map->dir, map->virt_addr,
+ &map->phys_addr, mport->name);
+
+ list_del(&map->node);
+
+ switch (map->dir) {
+ case MAP_INBOUND:
+ rio_unmap_inb_region(mport, map->phys_addr);
+ case MAP_DMA:
+ dma_free_coherent(mport->dev.parent, map->size,
+ map->virt_addr, map->phys_addr);
+ break;
+ case MAP_OUTBOUND:
+ rio_unmap_outb_region(mport, map->rioid, map->rio_addr);
+ break;
+ }
+ kfree(map);
+}
+
+static void mport_mm_open(struct vm_area_struct *vma)
+{
+ struct rio_mport_mapping *map = vma->vm_private_data;
+
+rmcd_debug(MMAP, "0x%pad", &map->phys_addr);
+ kref_get(&map->ref);
+}
+
+static void mport_mm_close(struct vm_area_struct *vma)
+{
+ struct rio_mport_mapping *map = vma->vm_private_data;
+
+rmcd_debug(MMAP, "0x%pad", &map->phys_addr);
+ mutex_lock(&map->md->buf_mutex);
+ kref_put(&map->ref, mport_release_mapping);
+ mutex_unlock(&map->md->buf_mutex);
+}
+
+static const struct vm_operations_struct vm_ops = {
+ .open = mport_mm_open,
+ .close = mport_mm_close,
+};
+
+static int mport_cdev_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct mport_cdev_priv *priv = filp->private_data;
+ struct mport_dev *md;
+ size_t size = vma->vm_end - vma->vm_start;
+ dma_addr_t baddr;
+ unsigned long offset;
+ int found = 0, ret;
+ struct rio_mport_mapping *map;
+
+ rmcd_debug(MMAP, "0x%x bytes at offset 0x%lx",
+ (unsigned int)size, vma->vm_pgoff);
+
+ md = priv->md;
+ baddr = ((dma_addr_t)vma->vm_pgoff << PAGE_SHIFT);
+
+ mutex_lock(&md->buf_mutex);
+ list_for_each_entry(map, &md->mappings, node) {
+ if (baddr >= map->phys_addr &&
+ baddr < (map->phys_addr + map->size)) {
+ found = 1;
+ break;
+ }
+ }
+ mutex_unlock(&md->buf_mutex);
+
+ if (!found)
+ return -ENOMEM;
+
+ offset = baddr - map->phys_addr;
+
+ if (size + offset > map->size)
+ return -EINVAL;
+
+ vma->vm_pgoff = offset >> PAGE_SHIFT;
+ rmcd_debug(MMAP, "MMAP adjusted offset = 0x%lx", vma->vm_pgoff);
+
+ if (map->dir == MAP_INBOUND || map->dir == MAP_DMA)
+ ret = dma_mmap_coherent(md->mport->dev.parent, vma,
+ map->virt_addr, map->phys_addr, map->size);
+ else if (map->dir == MAP_OUTBOUND) {
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ ret = vm_iomap_memory(vma, map->phys_addr, map->size);
+ } else {
+ rmcd_error("Attempt to mmap unsupported mapping type");
+ ret = -EIO;
+ }
+
+ if (!ret) {
+ vma->vm_private_data = map;
+ vma->vm_ops = &vm_ops;
+ mport_mm_open(vma);
+ } else {
+ rmcd_error("MMAP exit with err=%d", ret);
+ }
+
+ return ret;
+}
+
+static unsigned int mport_cdev_poll(struct file *filp, poll_table *wait)
+{
+ struct mport_cdev_priv *priv = filp->private_data;
+
+ poll_wait(filp, &priv->event_rx_wait, wait);
+ if (kfifo_len(&priv->event_fifo))
+ return POLLIN | POLLRDNORM;
+
+ return 0;
+}
+
+static ssize_t mport_read(struct file *filp, char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct mport_cdev_priv *priv = filp->private_data;
+ int copied;
+ ssize_t ret;
+
+ if (!count)
+ return 0;
+
+ if (kfifo_is_empty(&priv->event_fifo) &&
+ (filp->f_flags & O_NONBLOCK))
+ return -EAGAIN;
+
+ if (count % sizeof(struct rio_event))
+ return -EINVAL;
+
+ ret = wait_event_interruptible(priv->event_rx_wait,
+ kfifo_len(&priv->event_fifo) != 0);
+ if (ret)
+ return ret;
+
+ while (ret < count) {
+ if (kfifo_to_user(&priv->event_fifo, buf,
+ sizeof(struct rio_event), &copied))
+ return -EFAULT;
+ ret += copied;
+ buf += copied;
+ }
+
+ return ret;
+}
+
+static ssize_t mport_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct mport_cdev_priv *priv = filp->private_data;
+ struct rio_mport *mport = priv->md->mport;
+ struct rio_event event;
+ int len, ret;
+
+ if (!count)
+ return 0;
+
+ if (count % sizeof(event))
+ return -EINVAL;
+
+ len = 0;
+ while ((count - len) >= (int)sizeof(event)) {
+ if (copy_from_user(&event, buf, sizeof(event)))
+ return -EFAULT;
+
+ if (event.header != RIO_DOORBELL)
+ return -EINVAL;
+
+ ret = rio_mport_send_doorbell(mport,
+ (u16)event.u.doorbell.rioid,
+ event.u.doorbell.payload);
+ if (ret < 0)
+ return ret;
+
+ len += sizeof(event);
+ buf += sizeof(event);
+ }
+
+ return len;
+}
+
+static const struct file_operations mport_fops = {
+ .owner = THIS_MODULE,
+ .open = mport_cdev_open,
+ .release = mport_cdev_release,
+ .poll = mport_cdev_poll,
+ .read = mport_read,
+ .write = mport_write,
+ .mmap = mport_cdev_mmap,
+ .fasync = mport_cdev_fasync,
+ .unlocked_ioctl = mport_cdev_ioctl
+};
+
+/*
+ * Character device management
+ */
+
+static void mport_device_release(struct device *dev)
+{
+ struct mport_dev *md;
+
+ rmcd_debug(EXIT, "%s", dev_name(dev));
+ md = container_of(dev, struct mport_dev, dev);
+ kfree(md);
+}
+
+/*
+ * mport_cdev_add() - Create mport_dev from rio_mport
+ * @mport: RapidIO master port
+ */
+static struct mport_dev *mport_cdev_add(struct rio_mport *mport)
+{
+ int ret = 0;
+ struct mport_dev *md;
+ struct rio_mport_attr attr;
+
+ md = kzalloc(sizeof(struct mport_dev), GFP_KERNEL);
+ if (!md) {
+ rmcd_error("Unable allocate a device object");
+ return NULL;
+ }
+
+ md->mport = mport;
+ mutex_init(&md->buf_mutex);
+ mutex_init(&md->file_mutex);
+ INIT_LIST_HEAD(&md->file_list);
+ cdev_init(&md->cdev, &mport_fops);
+ md->cdev.owner = THIS_MODULE;
+ ret = cdev_add(&md->cdev, MKDEV(MAJOR(dev_number), mport->id), 1);
+ if (ret < 0) {
+ kfree(md);
+ rmcd_error("Unable to register a device, err=%d", ret);
+ return NULL;
+ }
+
+ md->dev.devt = md->cdev.dev;
+ md->dev.class = dev_class;
+ md->dev.parent = &mport->dev;
+ md->dev.release = mport_device_release;
+ dev_set_name(&md->dev, DEV_NAME "%d", mport->id);
+ atomic_set(&md->active, 1);
+
+ ret = device_register(&md->dev);
+ if (ret) {
+ rmcd_error("Failed to register mport %d (err=%d)",
+ mport->id, ret);
+ goto err_cdev;
+ }
+
+ get_device(&md->dev);
+
+ INIT_LIST_HEAD(&md->doorbells);
+ spin_lock_init(&md->db_lock);
+ INIT_LIST_HEAD(&md->portwrites);
+ spin_lock_init(&md->pw_lock);
+ INIT_LIST_HEAD(&md->mappings);
+
+ md->properties.id = mport->id;
+ md->properties.sys_size = mport->sys_size;
+ md->properties.hdid = mport->host_deviceid;
+ md->properties.index = mport->index;
+
+ /* The transfer_mode property will be returned through mport query
+ * interface
+ */
+#ifdef CONFIG_PPC /* for now: only on Freescale's SoCs */
+ md->properties.transfer_mode |= RIO_TRANSFER_MODE_MAPPED;
+#else
+ md->properties.transfer_mode |= RIO_TRANSFER_MODE_TRANSFER;
+#endif
+ ret = rio_query_mport(mport, &attr);
+ if (!ret) {
+ md->properties.flags = attr.flags;
+ md->properties.link_speed = attr.link_speed;
+ md->properties.link_width = attr.link_width;
+ md->properties.dma_max_sge = attr.dma_max_sge;
+ md->properties.dma_max_size = attr.dma_max_size;
+ md->properties.dma_align = attr.dma_align;
+ md->properties.cap_sys_size = 0;
+ md->properties.cap_transfer_mode = 0;
+ md->properties.cap_addr_size = 0;
+ } else
+ pr_info(DRV_PREFIX "Failed to obtain info for %s cdev(%d:%d)\n",
+ mport->name, MAJOR(dev_number), mport->id);
+
+ mutex_lock(&mport_devs_lock);
+ list_add_tail(&md->node, &mport_devs);
+ mutex_unlock(&mport_devs_lock);
+
+ pr_info(DRV_PREFIX "Added %s cdev(%d:%d)\n",
+ mport->name, MAJOR(dev_number), mport->id);
+
+ return md;
+
+err_cdev:
+ cdev_del(&md->cdev);
+ kfree(md);
+ return NULL;
+}
+
+/*
+ * mport_cdev_terminate_dma() - Stop all active DMA data transfers and release
+ * associated DMA channels.
+ */
+static void mport_cdev_terminate_dma(struct mport_dev *md)
+{
+#ifdef CONFIG_RAPIDIO_DMA_ENGINE
+ struct mport_cdev_priv *client;
+
+ rmcd_debug(DMA, "%s", dev_name(&md->dev));
+
+ mutex_lock(&md->file_mutex);
+ list_for_each_entry(client, &md->file_list, list) {
+ if (client->dmach) {
+ dmaengine_terminate_all(client->dmach);
+ rio_release_dma(client->dmach);
+ }
+ }
+ mutex_unlock(&md->file_mutex);
+
+ if (md->dma_chan) {
+ dmaengine_terminate_all(md->dma_chan);
+ rio_release_dma(md->dma_chan);
+ md->dma_chan = NULL;
+ }
+#endif
+}
+
+
+/*
+ * mport_cdev_kill_fasync() - Send SIGIO signal to all processes with open
+ * mport_cdev files.
+ */
+static int mport_cdev_kill_fasync(struct mport_dev *md)
+{
+ unsigned int files = 0;
+ struct mport_cdev_priv *client;
+
+ mutex_lock(&md->file_mutex);
+ list_for_each_entry(client, &md->file_list, list) {
+ if (client->async_queue)
+ kill_fasync(&client->async_queue, SIGIO, POLL_HUP);
+ files++;
+ }
+ mutex_unlock(&md->file_mutex);
+ return files;
+}
+
+/*
+ * mport_cdev_remove() - Remove mport character device
+ * @dev: Mport device to remove
+ */
+static void mport_cdev_remove(struct mport_dev *md)
+{
+ struct rio_mport_mapping *map, *_map;
+
+ rmcd_debug(EXIT, "Remove %s cdev", md->mport->name);
+ atomic_set(&md->active, 0);
+ mport_cdev_terminate_dma(md);
+ rio_del_mport_pw_handler(md->mport, md, rio_mport_pw_handler);
+ cdev_del(&(md->cdev));
+ mport_cdev_kill_fasync(md);
+
+ flush_workqueue(dma_wq);
+
+ /* TODO: do we need to give clients some time to close file
+ * descriptors? Simple wait for XX, or kref?
+ */
+
+ /*
+ * Release DMA buffers allocated for the mport device.
+ * Disable associated inbound Rapidio requests mapping if applicable.
+ */
+ mutex_lock(&md->buf_mutex);
+ list_for_each_entry_safe(map, _map, &md->mappings, node) {
+ kref_put(&map->ref, mport_release_mapping);
+ }
+ mutex_unlock(&md->buf_mutex);
+
+ if (!list_empty(&md->mappings))
+ rmcd_warn("WARNING: %s pending mappings on removal",
+ md->mport->name);
+
+ rio_release_inb_dbell(md->mport, 0, 0x0fff);
+
+ device_unregister(&md->dev);
+ put_device(&md->dev);
+}
+
+/*
+ * RIO rio_mport_interface driver
+ */
+
+/*
+ * mport_add_mport() - Add rio_mport from LDM device struct
+ * @dev: Linux device model struct
+ * @class_intf: Linux class_interface
+ */
+static int mport_add_mport(struct device *dev,
+ struct class_interface *class_intf)
+{
+ struct rio_mport *mport = NULL;
+ struct mport_dev *chdev = NULL;
+
+ mport = to_rio_mport(dev);
+ if (!mport)
+ return -ENODEV;
+
+ chdev = mport_cdev_add(mport);
+ if (!chdev)
+ return -ENODEV;
+
+ return 0;
+}
+
+/*
+ * mport_remove_mport() - Remove rio_mport from global list
+ * TODO remove device from global mport_dev list
+ */
+static void mport_remove_mport(struct device *dev,
+ struct class_interface *class_intf)
+{
+ struct rio_mport *mport = NULL;
+ struct mport_dev *chdev;
+ int found = 0;
+
+ mport = to_rio_mport(dev);
+ rmcd_debug(EXIT, "Remove %s", mport->name);
+
+ mutex_lock(&mport_devs_lock);
+ list_for_each_entry(chdev, &mport_devs, node) {
+ if (chdev->mport->id == mport->id) {
+ atomic_set(&chdev->active, 0);
+ list_del(&chdev->node);
+ found = 1;
+ break;
+ }
+ }
+ mutex_unlock(&mport_devs_lock);
+
+ if (found)
+ mport_cdev_remove(chdev);
+}
+
+/* the rio_mport_interface is used to handle local mport devices */
+static struct class_interface rio_mport_interface __refdata = {
+ .class = &rio_mport_class,
+ .add_dev = mport_add_mport,
+ .remove_dev = mport_remove_mport,
+};
+
+/*
+ * Linux kernel module
+ */
+
+/*
+ * mport_init - Driver module loading
+ */
+static int __init mport_init(void)
+{
+ int ret;
+
+ /* Create device class needed by udev */
+ dev_class = class_create(THIS_MODULE, DRV_NAME);
+ if (!dev_class) {
+ rmcd_error("Unable to create " DRV_NAME " class");
+ return -EINVAL;
+ }
+
+ ret = alloc_chrdev_region(&dev_number, 0, RIO_MAX_MPORTS, DRV_NAME);
+ if (ret < 0)
+ goto err_chr;
+
+ rmcd_debug(INIT, "Registered class with major=%d", MAJOR(dev_number));
+
+ /* Register to rio_mport_interface */
+ ret = class_interface_register(&rio_mport_interface);
+ if (ret) {
+ rmcd_error("class_interface_register() failed, err=%d", ret);
+ goto err_cli;
+ }
+
+ dma_wq = create_singlethread_workqueue("dma_wq");
+ if (!dma_wq) {
+ rmcd_error("failed to create DMA work queue");
+ ret = -ENOMEM;
+ goto err_wq;
+ }
+
+ return 0;
+
+err_wq:
+ class_interface_unregister(&rio_mport_interface);
+err_cli:
+ unregister_chrdev_region(dev_number, RIO_MAX_MPORTS);
+err_chr:
+ class_destroy(dev_class);
+ return ret;
+}
+
+/**
+ * mport_exit - Driver module unloading
+ */
+static void __exit mport_exit(void)
+{
+ class_interface_unregister(&rio_mport_interface);
+ class_destroy(dev_class);
+ unregister_chrdev_region(dev_number, RIO_MAX_MPORTS);
+ destroy_workqueue(dma_wq);
+}
+
+module_init(mport_init);
+module_exit(mport_exit);
diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c
index eeca70ddbf61..b5b455614f8a 100644
--- a/drivers/rapidio/devices/tsi721.c
+++ b/drivers/rapidio/devices/tsi721.c
@@ -36,7 +36,11 @@
#include "tsi721.h"
-#define DEBUG_PW /* Inbound Port-Write debugging */
+#ifdef DEBUG
+u32 dbg_level = DBG_INIT | DBG_EXIT;
+module_param(dbg_level, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(dbg_level, "Debugging output level (default 0 = none)");
+#endif
static void tsi721_omsg_handler(struct tsi721_device *priv, int ch);
static void tsi721_imsg_handler(struct tsi721_device *priv, int ch);
@@ -143,9 +147,9 @@ static int tsi721_maint_dma(struct tsi721_device *priv, u32 sys_size,
& TSI721_DMAC_STS_RUN) {
udelay(1);
if (++i >= 5000000) {
- dev_dbg(&priv->pdev->dev,
- "%s : DMA[%d] read timeout ch_status=%x\n",
- __func__, priv->mdma.ch_id, ch_stat);
+ tsi_debug(MAINT, &priv->pdev->dev,
+ "DMA[%d] read timeout ch_status=%x",
+ priv->mdma.ch_id, ch_stat);
if (!do_wr)
*data = 0xffffffff;
err = -EIO;
@@ -157,10 +161,12 @@ static int tsi721_maint_dma(struct tsi721_device *priv, u32 sys_size,
/* If DMA operation aborted due to error,
* reinitialize DMA channel
*/
- dev_dbg(&priv->pdev->dev, "%s : DMA ABORT ch_stat=%x\n",
- __func__, ch_stat);
- dev_dbg(&priv->pdev->dev, "OP=%d : destid=%x hc=%x off=%x\n",
- do_wr ? MAINT_WR : MAINT_RD, destid, hopcount, offset);
+ tsi_debug(MAINT, &priv->pdev->dev, "DMA ABORT ch_stat=%x",
+ ch_stat);
+ tsi_debug(MAINT, &priv->pdev->dev,
+ "OP=%d : destid=%x hc=%x off=%x",
+ do_wr ? MAINT_WR : MAINT_RD,
+ destid, hopcount, offset);
iowrite32(TSI721_DMAC_INT_ALL, regs + TSI721_DMAC_INT);
iowrite32(TSI721_DMAC_CTL_INIT, regs + TSI721_DMAC_CTL);
udelay(10);
@@ -236,16 +242,15 @@ static int tsi721_cwrite_dma(struct rio_mport *mport, int index, u16 destid,
/**
* tsi721_pw_handler - Tsi721 inbound port-write interrupt handler
- * @mport: RapidIO master port structure
+ * @priv: tsi721 device private structure
*
* Handles inbound port-write interrupts. Copies PW message from an internal
* buffer into PW message FIFO and schedules deferred routine to process
* queued messages.
*/
static int
-tsi721_pw_handler(struct rio_mport *mport)
+tsi721_pw_handler(struct tsi721_device *priv)
{
- struct tsi721_device *priv = mport->priv;
u32 pw_stat;
u32 pw_buf[TSI721_RIO_PW_MSG_SIZE/sizeof(u32)];
@@ -283,30 +288,15 @@ static void tsi721_pw_dpc(struct work_struct *work)
{
struct tsi721_device *priv = container_of(work, struct tsi721_device,
pw_work);
- u32 msg_buffer[RIO_PW_MSG_SIZE/sizeof(u32)]; /* Use full size PW message
- buffer for RIO layer */
+ union rio_pw_msg pwmsg;
/*
* Process port-write messages
*/
- while (kfifo_out_spinlocked(&priv->pw_fifo, (unsigned char *)msg_buffer,
+ while (kfifo_out_spinlocked(&priv->pw_fifo, (unsigned char *)&pwmsg,
TSI721_RIO_PW_MSG_SIZE, &priv->pw_fifo_lock)) {
- /* Process one message */
-#ifdef DEBUG_PW
- {
- u32 i;
- pr_debug("%s : Port-Write Message:", __func__);
- for (i = 0; i < RIO_PW_MSG_SIZE/sizeof(u32); ) {
- pr_debug("0x%02x: %08x %08x %08x %08x", i*4,
- msg_buffer[i], msg_buffer[i + 1],
- msg_buffer[i + 2], msg_buffer[i + 3]);
- i += 4;
- }
- pr_debug("\n");
- }
-#endif
/* Pass the port-write message to RIO core for processing */
- rio_inb_pwrite_handler((union rio_pw_msg *)msg_buffer);
+ rio_inb_pwrite_handler(&priv->mport, &pwmsg);
}
}
@@ -354,8 +344,8 @@ static int tsi721_dsend(struct rio_mport *mport, int index,
offset = (((mport->sys_size) ? RIO_TT_CODE_16 : RIO_TT_CODE_8) << 18) |
(destid << 2);
- dev_dbg(&priv->pdev->dev,
- "Send Doorbell 0x%04x to destID 0x%x\n", data, destid);
+ tsi_debug(DBELL, &priv->pdev->dev,
+ "Send Doorbell 0x%04x to destID 0x%x", data, destid);
iowrite16be(data, priv->odb_base + offset);
return 0;
@@ -363,16 +353,15 @@ static int tsi721_dsend(struct rio_mport *mport, int index,
/**
* tsi721_dbell_handler - Tsi721 doorbell interrupt handler
- * @mport: RapidIO master port structure
+ * @priv: tsi721 device-specific data structure
*
* Handles inbound doorbell interrupts. Copies doorbell entry from an internal
* buffer into DB message FIFO and schedules deferred routine to process
* queued DBs.
*/
static int
-tsi721_dbell_handler(struct rio_mport *mport)
+tsi721_dbell_handler(struct tsi721_device *priv)
{
- struct tsi721_device *priv = mport->priv;
u32 regval;
/* Disable IDB interrupts */
@@ -404,7 +393,7 @@ static void tsi721_db_dpc(struct work_struct *work)
/*
* Process queued inbound doorbells
*/
- mport = priv->mport;
+ mport = &priv->mport;
wr_ptr = ioread32(priv->regs + TSI721_IDQ_WP(IDB_QUEUE)) % IDB_QSIZE;
rd_ptr = ioread32(priv->regs + TSI721_IDQ_RP(IDB_QUEUE)) % IDB_QSIZE;
@@ -430,10 +419,10 @@ static void tsi721_db_dpc(struct work_struct *work)
dbell->dinb(mport, dbell->dev_id, DBELL_SID(idb.bytes),
DBELL_TID(idb.bytes), DBELL_INF(idb.bytes));
} else {
- dev_dbg(&priv->pdev->dev,
- "spurious inb doorbell, sid %2.2x tid %2.2x"
- " info %4.4x\n", DBELL_SID(idb.bytes),
- DBELL_TID(idb.bytes), DBELL_INF(idb.bytes));
+ tsi_debug(DBELL, &priv->pdev->dev,
+ "spurious IDB sid %2.2x tid %2.2x info %4.4x",
+ DBELL_SID(idb.bytes), DBELL_TID(idb.bytes),
+ DBELL_INF(idb.bytes));
}
wr_ptr = ioread32(priv->regs +
@@ -457,15 +446,14 @@ static void tsi721_db_dpc(struct work_struct *work)
/**
* tsi721_irqhandler - Tsi721 interrupt handler
* @irq: Linux interrupt number
- * @ptr: Pointer to interrupt-specific data (mport structure)
+ * @ptr: Pointer to interrupt-specific data (tsi721_device structure)
*
* Handles Tsi721 interrupts signaled using MSI and INTA. Checks reported
* interrupt events and calls an event-specific handler(s).
*/
static irqreturn_t tsi721_irqhandler(int irq, void *ptr)
{
- struct rio_mport *mport = (struct rio_mport *)ptr;
- struct tsi721_device *priv = mport->priv;
+ struct tsi721_device *priv = (struct tsi721_device *)ptr;
u32 dev_int;
u32 dev_ch_int;
u32 intval;
@@ -488,10 +476,10 @@ static irqreturn_t tsi721_irqhandler(int irq, void *ptr)
intval = ioread32(priv->regs +
TSI721_SR_CHINT(IDB_QUEUE));
if (intval & TSI721_SR_CHINT_IDBQRCV)
- tsi721_dbell_handler(mport);
+ tsi721_dbell_handler(priv);
else
- dev_info(&priv->pdev->dev,
- "Unsupported SR_CH_INT %x\n", intval);
+ tsi_info(&priv->pdev->dev,
+ "Unsupported SR_CH_INT %x", intval);
/* Clear interrupts */
iowrite32(intval,
@@ -545,7 +533,7 @@ static irqreturn_t tsi721_irqhandler(int irq, void *ptr)
/* Service SRIO MAC interrupts */
intval = ioread32(priv->regs + TSI721_RIO_EM_INT_STAT);
if (intval & TSI721_RIO_EM_INT_STAT_PW_RX)
- tsi721_pw_handler(mport);
+ tsi721_pw_handler(priv);
}
#ifdef CONFIG_RAPIDIO_DMA_ENGINE
@@ -553,8 +541,8 @@ static irqreturn_t tsi721_irqhandler(int irq, void *ptr)
int ch;
if (dev_ch_int & TSI721_INT_BDMA_CHAN_M) {
- dev_dbg(&priv->pdev->dev,
- "IRQ from DMA channel 0x%08x\n", dev_ch_int);
+ tsi_debug(DMA, &priv->pdev->dev,
+ "IRQ from DMA channel 0x%08x", dev_ch_int);
for (ch = 0; ch < TSI721_DMA_MAXCH; ch++) {
if (!(dev_ch_int & TSI721_INT_BDMA_CHAN(ch)))
@@ -613,13 +601,13 @@ static void tsi721_interrupts_init(struct tsi721_device *priv)
/**
* tsi721_omsg_msix - MSI-X interrupt handler for outbound messaging
* @irq: Linux interrupt number
- * @ptr: Pointer to interrupt-specific data (mport structure)
+ * @ptr: Pointer to interrupt-specific data (tsi721_device structure)
*
* Handles outbound messaging interrupts signaled using MSI-X.
*/
static irqreturn_t tsi721_omsg_msix(int irq, void *ptr)
{
- struct tsi721_device *priv = ((struct rio_mport *)ptr)->priv;
+ struct tsi721_device *priv = (struct tsi721_device *)ptr;
int mbox;
mbox = (irq - priv->msix[TSI721_VECT_OMB0_DONE].vector) % RIO_MAX_MBOX;
@@ -630,13 +618,13 @@ static irqreturn_t tsi721_omsg_msix(int irq, void *ptr)
/**
* tsi721_imsg_msix - MSI-X interrupt handler for inbound messaging
* @irq: Linux interrupt number
- * @ptr: Pointer to interrupt-specific data (mport structure)
+ * @ptr: Pointer to interrupt-specific data (tsi721_device structure)
*
* Handles inbound messaging interrupts signaled using MSI-X.
*/
static irqreturn_t tsi721_imsg_msix(int irq, void *ptr)
{
- struct tsi721_device *priv = ((struct rio_mport *)ptr)->priv;
+ struct tsi721_device *priv = (struct tsi721_device *)ptr;
int mbox;
mbox = (irq - priv->msix[TSI721_VECT_IMB0_RCV].vector) % RIO_MAX_MBOX;
@@ -647,19 +635,19 @@ static irqreturn_t tsi721_imsg_msix(int irq, void *ptr)
/**
* tsi721_srio_msix - Tsi721 MSI-X SRIO MAC interrupt handler
* @irq: Linux interrupt number
- * @ptr: Pointer to interrupt-specific data (mport structure)
+ * @ptr: Pointer to interrupt-specific data (tsi721_device structure)
*
* Handles Tsi721 interrupts from SRIO MAC.
*/
static irqreturn_t tsi721_srio_msix(int irq, void *ptr)
{
- struct tsi721_device *priv = ((struct rio_mport *)ptr)->priv;
+ struct tsi721_device *priv = (struct tsi721_device *)ptr;
u32 srio_int;
/* Service SRIO MAC interrupts */
srio_int = ioread32(priv->regs + TSI721_RIO_EM_INT_STAT);
if (srio_int & TSI721_RIO_EM_INT_STAT_PW_RX)
- tsi721_pw_handler((struct rio_mport *)ptr);
+ tsi721_pw_handler(priv);
return IRQ_HANDLED;
}
@@ -667,7 +655,7 @@ static irqreturn_t tsi721_srio_msix(int irq, void *ptr)
/**
* tsi721_sr2pc_ch_msix - Tsi721 MSI-X SR2PC Channel interrupt handler
* @irq: Linux interrupt number
- * @ptr: Pointer to interrupt-specific data (mport structure)
+ * @ptr: Pointer to interrupt-specific data (tsi721_device structure)
*
* Handles Tsi721 interrupts from SR2PC Channel.
* NOTE: At this moment services only one SR2PC channel associated with inbound
@@ -675,13 +663,13 @@ static irqreturn_t tsi721_srio_msix(int irq, void *ptr)
*/
static irqreturn_t tsi721_sr2pc_ch_msix(int irq, void *ptr)
{
- struct tsi721_device *priv = ((struct rio_mport *)ptr)->priv;
+ struct tsi721_device *priv = (struct tsi721_device *)ptr;
u32 sr_ch_int;
/* Service Inbound DB interrupt from SR2PC channel */
sr_ch_int = ioread32(priv->regs + TSI721_SR_CHINT(IDB_QUEUE));
if (sr_ch_int & TSI721_SR_CHINT_IDBQRCV)
- tsi721_dbell_handler((struct rio_mport *)ptr);
+ tsi721_dbell_handler(priv);
/* Clear interrupts */
iowrite32(sr_ch_int, priv->regs + TSI721_SR_CHINT(IDB_QUEUE));
@@ -693,32 +681,31 @@ static irqreturn_t tsi721_sr2pc_ch_msix(int irq, void *ptr)
/**
* tsi721_request_msix - register interrupt service for MSI-X mode.
- * @mport: RapidIO master port structure
+ * @priv: tsi721 device-specific data structure
*
* Registers MSI-X interrupt service routines for interrupts that are active
* immediately after mport initialization. Messaging interrupt service routines
* should be registered during corresponding open requests.
*/
-static int tsi721_request_msix(struct rio_mport *mport)
+static int tsi721_request_msix(struct tsi721_device *priv)
{
- struct tsi721_device *priv = mport->priv;
int err = 0;
err = request_irq(priv->msix[TSI721_VECT_IDB].vector,
tsi721_sr2pc_ch_msix, 0,
- priv->msix[TSI721_VECT_IDB].irq_name, (void *)mport);
+ priv->msix[TSI721_VECT_IDB].irq_name, (void *)priv);
if (err)
- goto out;
+ return err;
err = request_irq(priv->msix[TSI721_VECT_PWRX].vector,
tsi721_srio_msix, 0,
- priv->msix[TSI721_VECT_PWRX].irq_name, (void *)mport);
- if (err)
- free_irq(
- priv->msix[TSI721_VECT_IDB].vector,
- (void *)mport);
-out:
- return err;
+ priv->msix[TSI721_VECT_PWRX].irq_name, (void *)priv);
+ if (err) {
+ free_irq(priv->msix[TSI721_VECT_IDB].vector, (void *)priv);
+ return err;
+ }
+
+ return 0;
}
/**
@@ -770,8 +757,8 @@ static int tsi721_enable_msix(struct tsi721_device *priv)
err = pci_enable_msix_exact(priv->pdev, entries, ARRAY_SIZE(entries));
if (err) {
- dev_err(&priv->pdev->dev,
- "Failed to enable MSI-X (err=%d)\n", err);
+ tsi_err(&priv->pdev->dev,
+ "Failed to enable MSI-X (err=%d)", err);
return err;
}
@@ -831,27 +818,209 @@ static int tsi721_enable_msix(struct tsi721_device *priv)
}
#endif /* CONFIG_PCI_MSI */
-static int tsi721_request_irq(struct rio_mport *mport)
+static int tsi721_request_irq(struct tsi721_device *priv)
{
- struct tsi721_device *priv = mport->priv;
int err;
#ifdef CONFIG_PCI_MSI
if (priv->flags & TSI721_USING_MSIX)
- err = tsi721_request_msix(mport);
+ err = tsi721_request_msix(priv);
else
#endif
err = request_irq(priv->pdev->irq, tsi721_irqhandler,
(priv->flags & TSI721_USING_MSI) ? 0 : IRQF_SHARED,
- DRV_NAME, (void *)mport);
+ DRV_NAME, (void *)priv);
if (err)
- dev_err(&priv->pdev->dev,
- "Unable to allocate interrupt, Error: %d\n", err);
+ tsi_err(&priv->pdev->dev,
+ "Unable to allocate interrupt, err=%d", err);
return err;
}
+static void tsi721_free_irq(struct tsi721_device *priv)
+{
+#ifdef CONFIG_PCI_MSI
+ if (priv->flags & TSI721_USING_MSIX) {
+ free_irq(priv->msix[TSI721_VECT_IDB].vector, (void *)priv);
+ free_irq(priv->msix[TSI721_VECT_PWRX].vector, (void *)priv);
+ } else
+#endif
+ free_irq(priv->pdev->irq, (void *)priv);
+}
+
+static int
+tsi721_obw_alloc(struct tsi721_device *priv, struct tsi721_obw_bar *pbar,
+ u32 size, int *win_id)
+{
+ u64 win_base;
+ u64 bar_base;
+ u64 bar_end;
+ u32 align;
+ struct tsi721_ob_win *win;
+ struct tsi721_ob_win *new_win = NULL;
+ int new_win_idx = -1;
+ int i = 0;
+
+ bar_base = pbar->base;
+ bar_end = bar_base + pbar->size;
+ win_base = bar_base;
+ align = size/TSI721_PC2SR_ZONES;
+
+ while (i < TSI721_IBWIN_NUM) {
+ for (i = 0; i < TSI721_IBWIN_NUM; i++) {
+ if (!priv->ob_win[i].active) {
+ if (new_win == NULL) {
+ new_win = &priv->ob_win[i];
+ new_win_idx = i;
+ }
+ continue;
+ }
+
+ /*
+ * If this window belongs to the current BAR check it
+ * for overlap
+ */
+ win = &priv->ob_win[i];
+
+ if (win->base >= bar_base && win->base < bar_end) {
+ if (win_base < (win->base + win->size) &&
+ (win_base + size) > win->base) {
+ /* Overlap detected */
+ win_base = win->base + win->size;
+ win_base = ALIGN(win_base, align);
+ break;
+ }
+ }
+ }
+ }
+
+ if (win_base + size > bar_end)
+ return -ENOMEM;
+
+ if (!new_win) {
+ tsi_err(&priv->pdev->dev, "OBW count tracking failed");
+ return -EIO;
+ }
+
+ new_win->active = true;
+ new_win->base = win_base;
+ new_win->size = size;
+ new_win->pbar = pbar;
+ priv->obwin_cnt--;
+ pbar->free -= size;
+ *win_id = new_win_idx;
+ return 0;
+}
+
+static int tsi721_map_outb_win(struct rio_mport *mport, u16 destid, u64 rstart,
+ u32 size, u32 flags, dma_addr_t *laddr)
+{
+ struct tsi721_device *priv = mport->priv;
+ int i;
+ struct tsi721_obw_bar *pbar;
+ struct tsi721_ob_win *ob_win;
+ int obw = -1;
+ u32 rval;
+ u64 rio_addr;
+ u32 zsize;
+ int ret = -ENOMEM;
+
+ tsi_debug(OBW, &priv->pdev->dev,
+ "did=%d ra=0x%llx sz=0x%x", destid, rstart, size);
+
+ if (!is_power_of_2(size) || (size < 0x8000) || (rstart & (size - 1)))
+ return -EINVAL;
+
+ if (priv->obwin_cnt == 0)
+ return -EBUSY;
+
+ for (i = 0; i < 2; i++) {
+ if (priv->p2r_bar[i].free >= size) {
+ pbar = &priv->p2r_bar[i];
+ ret = tsi721_obw_alloc(priv, pbar, size, &obw);
+ if (!ret)
+ break;
+ }
+ }
+
+ if (ret)
+ return ret;
+
+ WARN_ON(obw == -1);
+ ob_win = &priv->ob_win[obw];
+ ob_win->destid = destid;
+ ob_win->rstart = rstart;
+ tsi_debug(OBW, &priv->pdev->dev,
+ "allocated OBW%d @%llx", obw, ob_win->base);
+
+ /*
+ * Configure Outbound Window
+ */
+
+ zsize = size/TSI721_PC2SR_ZONES;
+ rio_addr = rstart;
+
+ /*
+ * Program Address Translation Zones:
+ * This implementation uses all 8 zones associated wit window.
+ */
+ for (i = 0; i < TSI721_PC2SR_ZONES; i++) {
+
+ while (ioread32(priv->regs + TSI721_ZONE_SEL) &
+ TSI721_ZONE_SEL_GO) {
+ udelay(1);
+ }
+
+ rval = (u32)(rio_addr & TSI721_LUT_DATA0_ADD) |
+ TSI721_LUT_DATA0_NREAD | TSI721_LUT_DATA0_NWR;
+ iowrite32(rval, priv->regs + TSI721_LUT_DATA0);
+ rval = (u32)(rio_addr >> 32);
+ iowrite32(rval, priv->regs + TSI721_LUT_DATA1);
+ rval = destid;
+ iowrite32(rval, priv->regs + TSI721_LUT_DATA2);
+
+ rval = TSI721_ZONE_SEL_GO | (obw << 3) | i;
+ iowrite32(rval, priv->regs + TSI721_ZONE_SEL);
+
+ rio_addr += zsize;
+ }
+
+ iowrite32(TSI721_OBWIN_SIZE(size) << 8,
+ priv->regs + TSI721_OBWINSZ(obw));
+ iowrite32((u32)(ob_win->base >> 32), priv->regs + TSI721_OBWINUB(obw));
+ iowrite32((u32)(ob_win->base & TSI721_OBWINLB_BA) | TSI721_OBWINLB_WEN,
+ priv->regs + TSI721_OBWINLB(obw));
+
+ *laddr = ob_win->base;
+ return 0;
+}
+
+static void tsi721_unmap_outb_win(struct rio_mport *mport,
+ u16 destid, u64 rstart)
+{
+ struct tsi721_device *priv = mport->priv;
+ struct tsi721_ob_win *ob_win;
+ int i;
+
+ tsi_debug(OBW, &priv->pdev->dev, "did=%d ra=0x%llx", destid, rstart);
+
+ for (i = 0; i < TSI721_OBWIN_NUM; i++) {
+ ob_win = &priv->ob_win[i];
+
+ if (ob_win->active &&
+ ob_win->destid == destid && ob_win->rstart == rstart) {
+ tsi_debug(OBW, &priv->pdev->dev,
+ "free OBW%d @%llx", i, ob_win->base);
+ ob_win->active = false;
+ iowrite32(0, priv->regs + TSI721_OBWINLB(i));
+ ob_win->pbar->free += ob_win->size;
+ priv->obwin_cnt++;
+ break;
+ }
+ }
+}
+
/**
* tsi721_init_pc2sr_mapping - initializes outbound (PCIe->SRIO)
* translation regions.
@@ -861,11 +1030,41 @@ static int tsi721_request_irq(struct rio_mport *mport)
*/
static void tsi721_init_pc2sr_mapping(struct tsi721_device *priv)
{
- int i;
+ int i, z;
+ u32 rval;
/* Disable all PC2SR translation windows */
for (i = 0; i < TSI721_OBWIN_NUM; i++)
iowrite32(0, priv->regs + TSI721_OBWINLB(i));
+
+ /* Initialize zone lookup tables to avoid ECC errors on reads */
+ iowrite32(0, priv->regs + TSI721_LUT_DATA0);
+ iowrite32(0, priv->regs + TSI721_LUT_DATA1);
+ iowrite32(0, priv->regs + TSI721_LUT_DATA2);
+
+ for (i = 0; i < TSI721_OBWIN_NUM; i++) {
+ for (z = 0; z < TSI721_PC2SR_ZONES; z++) {
+ while (ioread32(priv->regs + TSI721_ZONE_SEL) &
+ TSI721_ZONE_SEL_GO) {
+ udelay(1);
+ }
+ rval = TSI721_ZONE_SEL_GO | (i << 3) | z;
+ iowrite32(rval, priv->regs + TSI721_ZONE_SEL);
+ }
+ }
+
+ if (priv->p2r_bar[0].size == 0 && priv->p2r_bar[1].size == 0) {
+ priv->obwin_cnt = 0;
+ return;
+ }
+
+ priv->p2r_bar[0].free = priv->p2r_bar[0].size;
+ priv->p2r_bar[1].free = priv->p2r_bar[1].size;
+
+ for (i = 0; i < TSI721_OBWIN_NUM; i++)
+ priv->ob_win[i].active = false;
+
+ priv->obwin_cnt = TSI721_OBWIN_NUM;
}
/**
@@ -885,45 +1084,148 @@ static int tsi721_rio_map_inb_mem(struct rio_mport *mport, dma_addr_t lstart,
u64 rstart, u32 size, u32 flags)
{
struct tsi721_device *priv = mport->priv;
- int i;
+ int i, avail = -1;
u32 regval;
+ struct tsi721_ib_win *ib_win;
+ bool direct = (lstart == rstart);
+ u64 ibw_size;
+ dma_addr_t loc_start;
+ u64 ibw_start;
+ struct tsi721_ib_win_mapping *map = NULL;
+ int ret = -EBUSY;
+
+ if (direct) {
+ /* Calculate minimal acceptable window size and base address */
+
+ ibw_size = roundup_pow_of_two(size);
+ ibw_start = lstart & ~(ibw_size - 1);
+
+ tsi_debug(IBW, &priv->pdev->dev,
+ "Direct (RIO_0x%llx -> PCIe_0x%pad), size=0x%x, ibw_start = 0x%llx",
+ rstart, &lstart, size, ibw_start);
+
+ while ((lstart + size) > (ibw_start + ibw_size)) {
+ ibw_size *= 2;
+ ibw_start = lstart & ~(ibw_size - 1);
+ if (ibw_size > 0x80000000) { /* Limit max size to 2GB */
+ return -EBUSY;
+ }
+ }
- if (!is_power_of_2(size) || size < 0x1000 ||
- ((u64)lstart & (size - 1)) || (rstart & (size - 1)))
- return -EINVAL;
+ loc_start = ibw_start;
+
+ map = kzalloc(sizeof(struct tsi721_ib_win_mapping), GFP_ATOMIC);
+ if (map == NULL)
+ return -ENOMEM;
+
+ } else {
+ tsi_debug(IBW, &priv->pdev->dev,
+ "Translated (RIO_0x%llx -> PCIe_0x%pad), size=0x%x",
+ rstart, &lstart, size);
+
+ if (!is_power_of_2(size) || size < 0x1000 ||
+ ((u64)lstart & (size - 1)) || (rstart & (size - 1)))
+ return -EINVAL;
+ if (priv->ibwin_cnt == 0)
+ return -EBUSY;
+ ibw_start = rstart;
+ ibw_size = size;
+ loc_start = lstart;
+ }
- /* Search for free inbound translation window */
+ /*
+ * Scan for overlapping with active regions and mark the first available
+ * IB window at the same time.
+ */
for (i = 0; i < TSI721_IBWIN_NUM; i++) {
- regval = ioread32(priv->regs + TSI721_IBWIN_LB(i));
- if (!(regval & TSI721_IBWIN_LB_WEN))
+ ib_win = &priv->ib_win[i];
+
+ if (!ib_win->active) {
+ if (avail == -1) {
+ avail = i;
+ ret = 0;
+ }
+ } else if (ibw_start < (ib_win->rstart + ib_win->size) &&
+ (ibw_start + ibw_size) > ib_win->rstart) {
+ /* Return error if address translation involved */
+ if (direct && ib_win->xlat) {
+ ret = -EFAULT;
+ break;
+ }
+
+ /*
+ * Direct mappings usually are larger than originally
+ * requested fragments - check if this new request fits
+ * into it.
+ */
+ if (rstart >= ib_win->rstart &&
+ (rstart + size) <= (ib_win->rstart +
+ ib_win->size)) {
+ /* We are in - no further mapping required */
+ map->lstart = lstart;
+ list_add_tail(&map->node, &ib_win->mappings);
+ return 0;
+ }
+
+ ret = -EFAULT;
break;
+ }
}
- if (i >= TSI721_IBWIN_NUM) {
- dev_err(&priv->pdev->dev,
- "Unable to find free inbound window\n");
- return -EBUSY;
+ if (ret)
+ goto out;
+ i = avail;
+
+ /* Sanity check: available IB window must be disabled at this point */
+ regval = ioread32(priv->regs + TSI721_IBWIN_LB(i));
+ if (WARN_ON(regval & TSI721_IBWIN_LB_WEN)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ ib_win = &priv->ib_win[i];
+ ib_win->active = true;
+ ib_win->rstart = ibw_start;
+ ib_win->lstart = loc_start;
+ ib_win->size = ibw_size;
+ ib_win->xlat = (lstart != rstart);
+ INIT_LIST_HEAD(&ib_win->mappings);
+
+ /*
+ * When using direct IBW mapping and have larger than requested IBW size
+ * we can have multiple local memory blocks mapped through the same IBW
+ * To handle this situation we maintain list of "clients" for such IBWs.
+ */
+ if (direct) {
+ map->lstart = lstart;
+ list_add_tail(&map->node, &ib_win->mappings);
}
- iowrite32(TSI721_IBWIN_SIZE(size) << 8,
+ iowrite32(TSI721_IBWIN_SIZE(ibw_size) << 8,
priv->regs + TSI721_IBWIN_SZ(i));
- iowrite32(((u64)lstart >> 32), priv->regs + TSI721_IBWIN_TUA(i));
- iowrite32(((u64)lstart & TSI721_IBWIN_TLA_ADD),
+ iowrite32(((u64)loc_start >> 32), priv->regs + TSI721_IBWIN_TUA(i));
+ iowrite32(((u64)loc_start & TSI721_IBWIN_TLA_ADD),
priv->regs + TSI721_IBWIN_TLA(i));
- iowrite32(rstart >> 32, priv->regs + TSI721_IBWIN_UB(i));
- iowrite32((rstart & TSI721_IBWIN_LB_BA) | TSI721_IBWIN_LB_WEN,
+ iowrite32(ibw_start >> 32, priv->regs + TSI721_IBWIN_UB(i));
+ iowrite32((ibw_start & TSI721_IBWIN_LB_BA) | TSI721_IBWIN_LB_WEN,
priv->regs + TSI721_IBWIN_LB(i));
- dev_dbg(&priv->pdev->dev,
- "Configured IBWIN%d mapping (RIO_0x%llx -> PCIe_0x%llx)\n",
- i, rstart, (unsigned long long)lstart);
+
+ priv->ibwin_cnt--;
+
+ tsi_debug(IBW, &priv->pdev->dev,
+ "Configured IBWIN%d (RIO_0x%llx -> PCIe_0x%pad), size=0x%llx",
+ i, ibw_start, &loc_start, ibw_size);
return 0;
+out:
+ kfree(map);
+ return ret;
}
/**
- * fsl_rio_unmap_inb_mem -- Unmapping inbound memory region.
+ * tsi721_rio_unmap_inb_mem -- Unmapping inbound memory region.
* @mport: RapidIO master port
* @lstart: Local memory space start address.
*/
@@ -931,25 +1233,56 @@ static void tsi721_rio_unmap_inb_mem(struct rio_mport *mport,
dma_addr_t lstart)
{
struct tsi721_device *priv = mport->priv;
+ struct tsi721_ib_win *ib_win;
int i;
- u64 addr;
- u32 regval;
+
+ tsi_debug(IBW, &priv->pdev->dev,
+ "Unmap IBW mapped to PCIe_0x%pad", &lstart);
/* Search for matching active inbound translation window */
for (i = 0; i < TSI721_IBWIN_NUM; i++) {
- regval = ioread32(priv->regs + TSI721_IBWIN_LB(i));
- if (regval & TSI721_IBWIN_LB_WEN) {
- regval = ioread32(priv->regs + TSI721_IBWIN_TUA(i));
- addr = (u64)regval << 32;
- regval = ioread32(priv->regs + TSI721_IBWIN_TLA(i));
- addr |= regval & TSI721_IBWIN_TLA_ADD;
-
- if (addr == (u64)lstart) {
- iowrite32(0, priv->regs + TSI721_IBWIN_LB(i));
- break;
+ ib_win = &priv->ib_win[i];
+
+ /* Address translating IBWs must to be an exact march */
+ if (!ib_win->active ||
+ (ib_win->xlat && lstart != ib_win->lstart))
+ continue;
+
+ if (lstart >= ib_win->lstart &&
+ lstart < (ib_win->lstart + ib_win->size)) {
+
+ if (!ib_win->xlat) {
+ struct tsi721_ib_win_mapping *map;
+ int found = 0;
+
+ list_for_each_entry(map,
+ &ib_win->mappings, node) {
+ if (map->lstart == lstart) {
+ list_del(&map->node);
+ kfree(map);
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found)
+ continue;
+
+ if (!list_empty(&ib_win->mappings))
+ break;
}
+
+ tsi_debug(IBW, &priv->pdev->dev, "Disable IBWIN_%d", i);
+ iowrite32(0, priv->regs + TSI721_IBWIN_LB(i));
+ ib_win->active = false;
+ priv->ibwin_cnt++;
+ break;
}
}
+
+ if (i == TSI721_IBWIN_NUM)
+ tsi_debug(IBW, &priv->pdev->dev,
+ "IB window mapped to %pad not found", &lstart);
}
/**
@@ -966,6 +1299,27 @@ static void tsi721_init_sr2pc_mapping(struct tsi721_device *priv)
/* Disable all SR2PC inbound windows */
for (i = 0; i < TSI721_IBWIN_NUM; i++)
iowrite32(0, priv->regs + TSI721_IBWIN_LB(i));
+ priv->ibwin_cnt = TSI721_IBWIN_NUM;
+}
+
+/*
+ * tsi721_close_sr2pc_mapping - closes all active inbound (SRIO->PCIe)
+ * translation regions.
+ * @priv: pointer to tsi721 device private data
+ */
+static void tsi721_close_sr2pc_mapping(struct tsi721_device *priv)
+{
+ struct tsi721_ib_win *ib_win;
+ int i;
+
+ /* Disable all active SR2PC inbound windows */
+ for (i = 0; i < TSI721_IBWIN_NUM; i++) {
+ ib_win = &priv->ib_win[i];
+ if (ib_win->active) {
+ iowrite32(0, priv->regs + TSI721_IBWIN_LB(i));
+ ib_win->active = false;
+ }
+ }
}
/**
@@ -982,7 +1336,7 @@ static int tsi721_port_write_init(struct tsi721_device *priv)
spin_lock_init(&priv->pw_fifo_lock);
if (kfifo_alloc(&priv->pw_fifo,
TSI721_RIO_PW_MSG_SIZE * 32, GFP_KERNEL)) {
- dev_err(&priv->pdev->dev, "PW FIFO allocation failed\n");
+ tsi_err(&priv->pdev->dev, "PW FIFO allocation failed");
return -ENOMEM;
}
@@ -991,6 +1345,11 @@ static int tsi721_port_write_init(struct tsi721_device *priv)
return 0;
}
+static void tsi721_port_write_free(struct tsi721_device *priv)
+{
+ kfifo_free(&priv->pw_fifo);
+}
+
static int tsi721_doorbell_init(struct tsi721_device *priv)
{
/* Outbound Doorbells do not require any setup.
@@ -1009,8 +1368,9 @@ static int tsi721_doorbell_init(struct tsi721_device *priv)
if (!priv->idb_base)
return -ENOMEM;
- dev_dbg(&priv->pdev->dev, "Allocated IDB buffer @ %p (phys = %llx)\n",
- priv->idb_base, (unsigned long long)priv->idb_dma);
+ tsi_debug(DBELL, &priv->pdev->dev,
+ "Allocated IDB buffer @ %p (phys = %pad)",
+ priv->idb_base, &priv->idb_dma);
iowrite32(TSI721_IDQ_SIZE_VAL(IDB_QSIZE),
priv->regs + TSI721_IDQ_SIZE(IDB_QUEUE));
@@ -1056,9 +1416,8 @@ static int tsi721_bdma_maint_init(struct tsi721_device *priv)
int bd_num = 2;
void __iomem *regs;
- dev_dbg(&priv->pdev->dev,
- "Init Block DMA Engine for Maintenance requests, CH%d\n",
- TSI721_DMACH_MAINT);
+ tsi_debug(MAINT, &priv->pdev->dev,
+ "Init BDMA_%d Maintenance requests", TSI721_DMACH_MAINT);
/*
* Initialize DMA channel for maintenance requests
@@ -1078,8 +1437,8 @@ static int tsi721_bdma_maint_init(struct tsi721_device *priv)
priv->mdma.bd_phys = bd_phys;
priv->mdma.bd_base = bd_ptr;
- dev_dbg(&priv->pdev->dev, "DMA descriptors @ %p (phys = %llx)\n",
- bd_ptr, (unsigned long long)bd_phys);
+ tsi_debug(MAINT, &priv->pdev->dev, "DMA descriptors @ %p (phys = %pad)",
+ bd_ptr, &bd_phys);
/* Allocate space for descriptor status FIFO */
sts_size = (bd_num >= TSI721_DMA_MINSTSSZ) ?
@@ -1101,9 +1460,9 @@ static int tsi721_bdma_maint_init(struct tsi721_device *priv)
priv->mdma.sts_base = sts_ptr;
priv->mdma.sts_size = sts_size;
- dev_dbg(&priv->pdev->dev,
- "desc status FIFO @ %p (phys = %llx) size=0x%x\n",
- sts_ptr, (unsigned long long)sts_phys, sts_size);
+ tsi_debug(MAINT, &priv->pdev->dev,
+ "desc status FIFO @ %p (phys = %pad) size=0x%x",
+ sts_ptr, &sts_phys, sts_size);
/* Initialize DMA descriptors ring */
bd_ptr[bd_num - 1].type_id = cpu_to_le32(DTYPE3 << 29);
@@ -1304,11 +1663,14 @@ tsi721_add_outb_message(struct rio_mport *mport, struct rio_dev *rdev, int mbox,
struct tsi721_device *priv = mport->priv;
struct tsi721_omsg_desc *desc;
u32 tx_slot;
+ unsigned long flags;
if (!priv->omsg_init[mbox] ||
len > TSI721_MSG_MAX_SIZE || len < 8)
return -EINVAL;
+ spin_lock_irqsave(&priv->omsg_ring[mbox].lock, flags);
+
tx_slot = priv->omsg_ring[mbox].tx_slot;
/* Copy copy message into transfer buffer */
@@ -1320,9 +1682,11 @@ tsi721_add_outb_message(struct rio_mport *mport, struct rio_dev *rdev, int mbox,
/* Build descriptor associated with buffer */
desc = priv->omsg_ring[mbox].omd_base;
desc[tx_slot].type_id = cpu_to_le32((DTYPE4 << 29) | rdev->destid);
+#ifdef TSI721_OMSG_DESC_INT
+ /* Request IOF_DONE interrupt generation for each N-th frame in queue */
if (tx_slot % 4 == 0)
desc[tx_slot].type_id |= cpu_to_le32(TSI721_OMD_IOF);
-
+#endif
desc[tx_slot].msg_info =
cpu_to_le32((mport->sys_size << 26) | (mbox << 22) |
(0xe << 12) | (len & 0xff8));
@@ -1348,6 +1712,8 @@ tsi721_add_outb_message(struct rio_mport *mport, struct rio_dev *rdev, int mbox,
priv->regs + TSI721_OBDMAC_DWRCNT(mbox));
ioread32(priv->regs + TSI721_OBDMAC_DWRCNT(mbox));
+ spin_unlock_irqrestore(&priv->omsg_ring[mbox].lock, flags);
+
return 0;
}
@@ -1361,20 +1727,23 @@ tsi721_add_outb_message(struct rio_mport *mport, struct rio_dev *rdev, int mbox,
static void tsi721_omsg_handler(struct tsi721_device *priv, int ch)
{
u32 omsg_int;
+ struct rio_mport *mport = &priv->mport;
+ void *dev_id = NULL;
+ u32 tx_slot = 0xffffffff;
+ int do_callback = 0;
spin_lock(&priv->omsg_ring[ch].lock);
omsg_int = ioread32(priv->regs + TSI721_OBDMAC_INT(ch));
if (omsg_int & TSI721_OBDMAC_INT_ST_FULL)
- dev_info(&priv->pdev->dev,
- "OB MBOX%d: Status FIFO is full\n", ch);
+ tsi_info(&priv->pdev->dev,
+ "OB MBOX%d: Status FIFO is full", ch);
if (omsg_int & (TSI721_OBDMAC_INT_DONE | TSI721_OBDMAC_INT_IOF_DONE)) {
u32 srd_ptr;
u64 *sts_ptr, last_ptr = 0, prev_ptr = 0;
int i, j;
- u32 tx_slot;
/*
* Find last successfully processed descriptor
@@ -1402,7 +1771,7 @@ static void tsi721_omsg_handler(struct tsi721_device *priv, int ch)
priv->omsg_ring[ch].sts_rdptr = srd_ptr;
iowrite32(srd_ptr, priv->regs + TSI721_OBDMAC_DSRP(ch));
- if (!priv->mport->outb_msg[ch].mcback)
+ if (!mport->outb_msg[ch].mcback)
goto no_sts_update;
/* Inform upper layer about transfer completion */
@@ -1424,14 +1793,19 @@ static void tsi721_omsg_handler(struct tsi721_device *priv, int ch)
goto no_sts_update;
}
+ if (tx_slot >= priv->omsg_ring[ch].size)
+ tsi_debug(OMSG, &priv->pdev->dev,
+ "OB_MSG tx_slot=%x > size=%x",
+ tx_slot, priv->omsg_ring[ch].size);
+ WARN_ON(tx_slot >= priv->omsg_ring[ch].size);
+
/* Move slot index to the next message to be sent */
++tx_slot;
if (tx_slot == priv->omsg_ring[ch].size)
tx_slot = 0;
- BUG_ON(tx_slot >= priv->omsg_ring[ch].size);
- priv->mport->outb_msg[ch].mcback(priv->mport,
- priv->omsg_ring[ch].dev_id, ch,
- tx_slot);
+
+ dev_id = priv->omsg_ring[ch].dev_id;
+ do_callback = 1;
}
no_sts_update:
@@ -1442,20 +1816,20 @@ no_sts_update:
* reinitialize OB MSG channel
*/
- dev_dbg(&priv->pdev->dev, "OB MSG ABORT ch_stat=%x\n",
- ioread32(priv->regs + TSI721_OBDMAC_STS(ch)));
+ tsi_debug(OMSG, &priv->pdev->dev, "OB MSG ABORT ch_stat=%x",
+ ioread32(priv->regs + TSI721_OBDMAC_STS(ch)));
iowrite32(TSI721_OBDMAC_INT_ERROR,
priv->regs + TSI721_OBDMAC_INT(ch));
- iowrite32(TSI721_OBDMAC_CTL_INIT,
+ iowrite32(TSI721_OBDMAC_CTL_RETRY_THR | TSI721_OBDMAC_CTL_INIT,
priv->regs + TSI721_OBDMAC_CTL(ch));
ioread32(priv->regs + TSI721_OBDMAC_CTL(ch));
/* Inform upper level to clear all pending tx slots */
- if (priv->mport->outb_msg[ch].mcback)
- priv->mport->outb_msg[ch].mcback(priv->mport,
- priv->omsg_ring[ch].dev_id, ch,
- priv->omsg_ring[ch].tx_slot);
+ dev_id = priv->omsg_ring[ch].dev_id;
+ tx_slot = priv->omsg_ring[ch].tx_slot;
+ do_callback = 1;
+
/* Synch tx_slot tracking */
iowrite32(priv->omsg_ring[ch].tx_slot,
priv->regs + TSI721_OBDMAC_DRDCNT(ch));
@@ -1477,6 +1851,9 @@ no_sts_update:
}
spin_unlock(&priv->omsg_ring[ch].lock);
+
+ if (mport->outb_msg[ch].mcback && do_callback)
+ mport->outb_msg[ch].mcback(mport, dev_id, ch, tx_slot);
}
/**
@@ -1514,9 +1891,8 @@ static int tsi721_open_outb_mbox(struct rio_mport *mport, void *dev_id,
&priv->omsg_ring[mbox].omq_phys[i],
GFP_KERNEL);
if (priv->omsg_ring[mbox].omq_base[i] == NULL) {
- dev_dbg(&priv->pdev->dev,
- "Unable to allocate OB MSG data buffer for"
- " MBOX%d\n", mbox);
+ tsi_debug(OMSG, &priv->pdev->dev,
+ "ENOMEM for OB_MSG_%d data buffer", mbox);
rc = -ENOMEM;
goto out_buf;
}
@@ -1528,9 +1904,8 @@ static int tsi721_open_outb_mbox(struct rio_mport *mport, void *dev_id,
(entries + 1) * sizeof(struct tsi721_omsg_desc),
&priv->omsg_ring[mbox].omd_phys, GFP_KERNEL);
if (priv->omsg_ring[mbox].omd_base == NULL) {
- dev_dbg(&priv->pdev->dev,
- "Unable to allocate OB MSG descriptor memory "
- "for MBOX%d\n", mbox);
+ tsi_debug(OMSG, &priv->pdev->dev,
+ "ENOMEM for OB_MSG_%d descriptor memory", mbox);
rc = -ENOMEM;
goto out_buf;
}
@@ -1544,9 +1919,8 @@ static int tsi721_open_outb_mbox(struct rio_mport *mport, void *dev_id,
sizeof(struct tsi721_dma_sts),
&priv->omsg_ring[mbox].sts_phys, GFP_KERNEL);
if (priv->omsg_ring[mbox].sts_base == NULL) {
- dev_dbg(&priv->pdev->dev,
- "Unable to allocate OB MSG descriptor status FIFO "
- "for MBOX%d\n", mbox);
+ tsi_debug(OMSG, &priv->pdev->dev,
+ "ENOMEM for OB_MSG_%d status FIFO", mbox);
rc = -ENOMEM;
goto out_desc;
}
@@ -1575,32 +1949,28 @@ static int tsi721_open_outb_mbox(struct rio_mport *mport, void *dev_id,
#ifdef CONFIG_PCI_MSI
if (priv->flags & TSI721_USING_MSIX) {
+ int idx = TSI721_VECT_OMB0_DONE + mbox;
+
/* Request interrupt service if we are in MSI-X mode */
- rc = request_irq(
- priv->msix[TSI721_VECT_OMB0_DONE + mbox].vector,
- tsi721_omsg_msix, 0,
- priv->msix[TSI721_VECT_OMB0_DONE + mbox].irq_name,
- (void *)mport);
+ rc = request_irq(priv->msix[idx].vector, tsi721_omsg_msix, 0,
+ priv->msix[idx].irq_name, (void *)priv);
if (rc) {
- dev_dbg(&priv->pdev->dev,
- "Unable to allocate MSI-X interrupt for "
- "OBOX%d-DONE\n", mbox);
+ tsi_debug(OMSG, &priv->pdev->dev,
+ "Unable to get MSI-X IRQ for OBOX%d-DONE",
+ mbox);
goto out_stat;
}
- rc = request_irq(priv->msix[TSI721_VECT_OMB0_INT + mbox].vector,
- tsi721_omsg_msix, 0,
- priv->msix[TSI721_VECT_OMB0_INT + mbox].irq_name,
- (void *)mport);
+ idx = TSI721_VECT_OMB0_INT + mbox;
+ rc = request_irq(priv->msix[idx].vector, tsi721_omsg_msix, 0,
+ priv->msix[idx].irq_name, (void *)priv);
if (rc) {
- dev_dbg(&priv->pdev->dev,
- "Unable to allocate MSI-X interrupt for "
- "MBOX%d-INT\n", mbox);
- free_irq(
- priv->msix[TSI721_VECT_OMB0_DONE + mbox].vector,
- (void *)mport);
+ tsi_debug(OMSG, &priv->pdev->dev,
+ "Unable to get MSI-X IRQ for MBOX%d-INT", mbox);
+ idx = TSI721_VECT_OMB0_DONE + mbox;
+ free_irq(priv->msix[idx].vector, (void *)priv);
goto out_stat;
}
}
@@ -1621,7 +1991,8 @@ static int tsi721_open_outb_mbox(struct rio_mport *mport, void *dev_id,
mb();
/* Initialize Outbound Message engine */
- iowrite32(TSI721_OBDMAC_CTL_INIT, priv->regs + TSI721_OBDMAC_CTL(mbox));
+ iowrite32(TSI721_OBDMAC_CTL_RETRY_THR | TSI721_OBDMAC_CTL_INIT,
+ priv->regs + TSI721_OBDMAC_CTL(mbox));
ioread32(priv->regs + TSI721_OBDMAC_DWRCNT(mbox));
udelay(10);
@@ -1684,9 +2055,9 @@ static void tsi721_close_outb_mbox(struct rio_mport *mport, int mbox)
#ifdef CONFIG_PCI_MSI
if (priv->flags & TSI721_USING_MSIX) {
free_irq(priv->msix[TSI721_VECT_OMB0_DONE + mbox].vector,
- (void *)mport);
+ (void *)priv);
free_irq(priv->msix[TSI721_VECT_OMB0_INT + mbox].vector,
- (void *)mport);
+ (void *)priv);
}
#endif /* CONFIG_PCI_MSI */
@@ -1731,30 +2102,28 @@ static void tsi721_imsg_handler(struct tsi721_device *priv, int ch)
{
u32 mbox = ch - 4;
u32 imsg_int;
+ struct rio_mport *mport = &priv->mport;
spin_lock(&priv->imsg_ring[mbox].lock);
imsg_int = ioread32(priv->regs + TSI721_IBDMAC_INT(ch));
if (imsg_int & TSI721_IBDMAC_INT_SRTO)
- dev_info(&priv->pdev->dev, "IB MBOX%d SRIO timeout\n",
- mbox);
+ tsi_info(&priv->pdev->dev, "IB MBOX%d SRIO timeout", mbox);
if (imsg_int & TSI721_IBDMAC_INT_PC_ERROR)
- dev_info(&priv->pdev->dev, "IB MBOX%d PCIe error\n",
- mbox);
+ tsi_info(&priv->pdev->dev, "IB MBOX%d PCIe error", mbox);
if (imsg_int & TSI721_IBDMAC_INT_FQ_LOW)
- dev_info(&priv->pdev->dev,
- "IB MBOX%d IB free queue low\n", mbox);
+ tsi_info(&priv->pdev->dev, "IB MBOX%d IB free queue low", mbox);
/* Clear IB channel interrupts */
iowrite32(imsg_int, priv->regs + TSI721_IBDMAC_INT(ch));
/* If an IB Msg is received notify the upper layer */
if (imsg_int & TSI721_IBDMAC_INT_DQ_RCV &&
- priv->mport->inb_msg[mbox].mcback)
- priv->mport->inb_msg[mbox].mcback(priv->mport,
+ mport->inb_msg[mbox].mcback)
+ mport->inb_msg[mbox].mcback(mport,
priv->imsg_ring[mbox].dev_id, mbox, -1);
if (!(priv->flags & TSI721_USING_MSIX)) {
@@ -1810,8 +2179,8 @@ static int tsi721_open_inb_mbox(struct rio_mport *mport, void *dev_id,
GFP_KERNEL);
if (priv->imsg_ring[mbox].buf_base == NULL) {
- dev_err(&priv->pdev->dev,
- "Failed to allocate buffers for IB MBOX%d\n", mbox);
+ tsi_err(&priv->pdev->dev,
+ "Failed to allocate buffers for IB MBOX%d", mbox);
rc = -ENOMEM;
goto out;
}
@@ -1824,8 +2193,8 @@ static int tsi721_open_inb_mbox(struct rio_mport *mport, void *dev_id,
GFP_KERNEL);
if (priv->imsg_ring[mbox].imfq_base == NULL) {
- dev_err(&priv->pdev->dev,
- "Failed to allocate free queue for IB MBOX%d\n", mbox);
+ tsi_err(&priv->pdev->dev,
+ "Failed to allocate free queue for IB MBOX%d", mbox);
rc = -ENOMEM;
goto out_buf;
}
@@ -1837,8 +2206,8 @@ static int tsi721_open_inb_mbox(struct rio_mport *mport, void *dev_id,
&priv->imsg_ring[mbox].imd_phys, GFP_KERNEL);
if (priv->imsg_ring[mbox].imd_base == NULL) {
- dev_err(&priv->pdev->dev,
- "Failed to allocate descriptor memory for IB MBOX%d\n",
+ tsi_err(&priv->pdev->dev,
+ "Failed to allocate descriptor memory for IB MBOX%d",
mbox);
rc = -ENOMEM;
goto out_dma;
@@ -1859,7 +2228,7 @@ static int tsi721_open_inb_mbox(struct rio_mport *mport, void *dev_id,
* once when first inbound mailbox is requested.
*/
if (!(priv->flags & TSI721_IMSGID_SET)) {
- iowrite32((u32)priv->mport->host_deviceid,
+ iowrite32((u32)priv->mport.host_deviceid,
priv->regs + TSI721_IB_DEVID);
priv->flags |= TSI721_IMSGID_SET;
}
@@ -1890,31 +2259,29 @@ static int tsi721_open_inb_mbox(struct rio_mport *mport, void *dev_id,
#ifdef CONFIG_PCI_MSI
if (priv->flags & TSI721_USING_MSIX) {
+ int idx = TSI721_VECT_IMB0_RCV + mbox;
+
/* Request interrupt service if we are in MSI-X mode */
- rc = request_irq(priv->msix[TSI721_VECT_IMB0_RCV + mbox].vector,
- tsi721_imsg_msix, 0,
- priv->msix[TSI721_VECT_IMB0_RCV + mbox].irq_name,
- (void *)mport);
+ rc = request_irq(priv->msix[idx].vector, tsi721_imsg_msix, 0,
+ priv->msix[idx].irq_name, (void *)priv);
if (rc) {
- dev_dbg(&priv->pdev->dev,
- "Unable to allocate MSI-X interrupt for "
- "IBOX%d-DONE\n", mbox);
+ tsi_debug(IMSG, &priv->pdev->dev,
+ "Unable to get MSI-X IRQ for IBOX%d-DONE",
+ mbox);
goto out_desc;
}
- rc = request_irq(priv->msix[TSI721_VECT_IMB0_INT + mbox].vector,
- tsi721_imsg_msix, 0,
- priv->msix[TSI721_VECT_IMB0_INT + mbox].irq_name,
- (void *)mport);
+ idx = TSI721_VECT_IMB0_INT + mbox;
+ rc = request_irq(priv->msix[idx].vector, tsi721_imsg_msix, 0,
+ priv->msix[idx].irq_name, (void *)priv);
if (rc) {
- dev_dbg(&priv->pdev->dev,
- "Unable to allocate MSI-X interrupt for "
- "IBOX%d-INT\n", mbox);
+ tsi_debug(IMSG, &priv->pdev->dev,
+ "Unable to get MSI-X IRQ for IBOX%d-INT", mbox);
free_irq(
priv->msix[TSI721_VECT_IMB0_RCV + mbox].vector,
- (void *)mport);
+ (void *)priv);
goto out_desc;
}
}
@@ -1985,9 +2352,9 @@ static void tsi721_close_inb_mbox(struct rio_mport *mport, int mbox)
#ifdef CONFIG_PCI_MSI
if (priv->flags & TSI721_USING_MSIX) {
free_irq(priv->msix[TSI721_VECT_IMB0_RCV + mbox].vector,
- (void *)mport);
+ (void *)priv);
free_irq(priv->msix[TSI721_VECT_IMB0_INT + mbox].vector,
- (void *)mport);
+ (void *)priv);
}
#endif /* CONFIG_PCI_MSI */
@@ -2034,8 +2401,8 @@ static int tsi721_add_inb_buffer(struct rio_mport *mport, int mbox, void *buf)
rx_slot = priv->imsg_ring[mbox].rx_slot;
if (priv->imsg_ring[mbox].imq_base[rx_slot]) {
- dev_err(&priv->pdev->dev,
- "Error adding inbound buffer %d, buffer exists\n",
+ tsi_err(&priv->pdev->dev,
+ "Error adding inbound buffer %d, buffer exists",
rx_slot);
rc = -EINVAL;
goto out;
@@ -2153,6 +2520,39 @@ static int tsi721_messages_init(struct tsi721_device *priv)
}
/**
+ * tsi721_query_mport - Fetch inbound message from the Tsi721 MSG Queue
+ * @mport: Master port implementing the Inbound Messaging Engine
+ * @mbox: Inbound mailbox number
+ *
+ * Returns pointer to the message on success or NULL on failure.
+ */
+static int tsi721_query_mport(struct rio_mport *mport,
+ struct rio_mport_attr *attr)
+{
+ struct tsi721_device *priv = mport->priv;
+ u32 rval;
+
+ rval = ioread32(priv->regs + (0x100 + RIO_PORT_N_ERR_STS_CSR(0)));
+ if (rval & RIO_PORT_N_ERR_STS_PORT_OK) {
+ rval = ioread32(priv->regs + (0x100 + RIO_PORT_N_CTL2_CSR(0)));
+ attr->link_speed = (rval & RIO_PORT_N_CTL2_SEL_BAUD) >> 28;
+ rval = ioread32(priv->regs + (0x100 + RIO_PORT_N_CTL_CSR(0)));
+ attr->link_width = (rval & RIO_PORT_N_CTL_IPW) >> 27;
+ } else
+ attr->link_speed = RIO_LINK_DOWN;
+
+#ifdef CONFIG_RAPIDIO_DMA_ENGINE
+ attr->flags = RIO_MPORT_DMA | RIO_MPORT_DMA_SG;
+ attr->dma_max_sge = 0;
+ attr->dma_max_size = TSI721_BDMA_MAX_BCOUNT;
+ attr->dma_align = 0;
+#else
+ attr->flags = 0;
+#endif
+ return 0;
+}
+
+/**
* tsi721_disable_ints - disables all device interrupts
* @priv: pointer to tsi721 private data
*/
@@ -2203,6 +2603,34 @@ static void tsi721_disable_ints(struct tsi721_device *priv)
iowrite32(0, priv->regs + TSI721_RIO_EM_DEV_INT_EN);
}
+static struct rio_ops tsi721_rio_ops = {
+ .lcread = tsi721_lcread,
+ .lcwrite = tsi721_lcwrite,
+ .cread = tsi721_cread_dma,
+ .cwrite = tsi721_cwrite_dma,
+ .dsend = tsi721_dsend,
+ .open_inb_mbox = tsi721_open_inb_mbox,
+ .close_inb_mbox = tsi721_close_inb_mbox,
+ .open_outb_mbox = tsi721_open_outb_mbox,
+ .close_outb_mbox = tsi721_close_outb_mbox,
+ .add_outb_message = tsi721_add_outb_message,
+ .add_inb_buffer = tsi721_add_inb_buffer,
+ .get_inb_message = tsi721_get_inb_message,
+ .map_inb = tsi721_rio_map_inb_mem,
+ .unmap_inb = tsi721_rio_unmap_inb_mem,
+ .pwenable = tsi721_pw_enable,
+ .query_mport = tsi721_query_mport,
+ .map_outb = tsi721_map_outb_win,
+ .unmap_outb = tsi721_unmap_outb_win,
+};
+
+static void tsi721_mport_release(struct device *dev)
+{
+ struct rio_mport *mport = to_rio_mport(dev);
+
+ tsi_debug(EXIT, dev, "%s id=%d", mport->name, mport->id);
+}
+
/**
* tsi721_setup_mport - Setup Tsi721 as RapidIO subsystem master port
* @priv: pointer to tsi721 private data
@@ -2213,46 +2641,20 @@ static int tsi721_setup_mport(struct tsi721_device *priv)
{
struct pci_dev *pdev = priv->pdev;
int err = 0;
- struct rio_ops *ops;
-
- struct rio_mport *mport;
+ struct rio_mport *mport = &priv->mport;
- ops = kzalloc(sizeof(struct rio_ops), GFP_KERNEL);
- if (!ops) {
- dev_dbg(&pdev->dev, "Unable to allocate memory for rio_ops\n");
- return -ENOMEM;
- }
-
- ops->lcread = tsi721_lcread;
- ops->lcwrite = tsi721_lcwrite;
- ops->cread = tsi721_cread_dma;
- ops->cwrite = tsi721_cwrite_dma;
- ops->dsend = tsi721_dsend;
- ops->open_inb_mbox = tsi721_open_inb_mbox;
- ops->close_inb_mbox = tsi721_close_inb_mbox;
- ops->open_outb_mbox = tsi721_open_outb_mbox;
- ops->close_outb_mbox = tsi721_close_outb_mbox;
- ops->add_outb_message = tsi721_add_outb_message;
- ops->add_inb_buffer = tsi721_add_inb_buffer;
- ops->get_inb_message = tsi721_get_inb_message;
- ops->map_inb = tsi721_rio_map_inb_mem;
- ops->unmap_inb = tsi721_rio_unmap_inb_mem;
-
- mport = kzalloc(sizeof(struct rio_mport), GFP_KERNEL);
- if (!mport) {
- kfree(ops);
- dev_dbg(&pdev->dev, "Unable to allocate memory for mport\n");
- return -ENOMEM;
- }
+ err = rio_mport_initialize(mport);
+ if (err)
+ return err;
- mport->ops = ops;
+ mport->ops = &tsi721_rio_ops;
mport->index = 0;
mport->sys_size = 0; /* small system */
mport->phy_type = RIO_PHY_SERIAL;
mport->priv = (void *)priv;
mport->phys_efptr = 0x100;
mport->dev.parent = &pdev->dev;
- priv->mport = mport;
+ mport->dev.release = tsi721_mport_release;
INIT_LIST_HEAD(&mport->dbells);
@@ -2270,31 +2672,28 @@ static int tsi721_setup_mport(struct tsi721_device *priv)
else if (!pci_enable_msi(pdev))
priv->flags |= TSI721_USING_MSI;
else
- dev_info(&pdev->dev,
- "MSI/MSI-X is not available. Using legacy INTx.\n");
+ tsi_debug(MPORT, &pdev->dev,
+ "MSI/MSI-X is not available. Using legacy INTx.");
#endif /* CONFIG_PCI_MSI */
- err = tsi721_request_irq(mport);
+ err = tsi721_request_irq(priv);
- if (!err) {
- tsi721_interrupts_init(priv);
- ops->pwenable = tsi721_pw_enable;
- } else {
- dev_err(&pdev->dev, "Unable to get assigned PCI IRQ "
- "vector %02X err=0x%x\n", pdev->irq, err);
- goto err_exit;
+ if (err) {
+ tsi_err(&pdev->dev, "Unable to get PCI IRQ %02X (err=0x%x)",
+ pdev->irq, err);
+ return err;
}
#ifdef CONFIG_RAPIDIO_DMA_ENGINE
- tsi721_register_dma(priv);
+ err = tsi721_register_dma(priv);
+ if (err)
+ goto err_exit;
#endif
/* Enable SRIO link */
iowrite32(ioread32(priv->regs + TSI721_DEVCTL) |
TSI721_DEVCTL_SRBOOT_CMPL,
priv->regs + TSI721_DEVCTL);
- rio_register_mport(mport);
-
if (mport->host_deviceid >= 0)
iowrite32(RIO_PORT_GEN_HOST | RIO_PORT_GEN_MASTER |
RIO_PORT_GEN_DISCOVERED,
@@ -2302,11 +2701,16 @@ static int tsi721_setup_mport(struct tsi721_device *priv)
else
iowrite32(0, priv->regs + (0x100 + RIO_PORT_GEN_CTL_CSR));
+ err = rio_register_mport(mport);
+ if (err) {
+ tsi721_unregister_dma(priv);
+ goto err_exit;
+ }
+
return 0;
err_exit:
- kfree(mport);
- kfree(ops);
+ tsi721_free_irq(priv);
return err;
}
@@ -2317,15 +2721,14 @@ static int tsi721_probe(struct pci_dev *pdev,
int err;
priv = kzalloc(sizeof(struct tsi721_device), GFP_KERNEL);
- if (priv == NULL) {
- dev_err(&pdev->dev, "Failed to allocate memory for device\n");
+ if (!priv) {
err = -ENOMEM;
goto err_exit;
}
err = pci_enable_device(pdev);
if (err) {
- dev_err(&pdev->dev, "Failed to enable PCI device\n");
+ tsi_err(&pdev->dev, "Failed to enable PCI device");
goto err_clean;
}
@@ -2333,13 +2736,12 @@ static int tsi721_probe(struct pci_dev *pdev,
#ifdef DEBUG
{
- int i;
- for (i = 0; i <= PCI_STD_RESOURCE_END; i++) {
- dev_dbg(&pdev->dev, "res[%d] @ 0x%llx (0x%lx, 0x%lx)\n",
- i, (unsigned long long)pci_resource_start(pdev, i),
- (unsigned long)pci_resource_len(pdev, i),
- pci_resource_flags(pdev, i));
- }
+ int i;
+
+ for (i = 0; i <= PCI_STD_RESOURCE_END; i++) {
+ tsi_debug(INIT, &pdev->dev, "res%d %pR",
+ i, &pdev->resource[i]);
+ }
}
#endif
/*
@@ -2350,8 +2752,7 @@ static int tsi721_probe(struct pci_dev *pdev,
if (!(pci_resource_flags(pdev, BAR_0) & IORESOURCE_MEM) ||
pci_resource_flags(pdev, BAR_0) & IORESOURCE_MEM_64 ||
pci_resource_len(pdev, BAR_0) < TSI721_REG_SPACE_SIZE) {
- dev_err(&pdev->dev,
- "Missing or misconfigured CSR BAR0, aborting.\n");
+ tsi_err(&pdev->dev, "Missing or misconfigured CSR BAR0");
err = -ENODEV;
goto err_disable_pdev;
}
@@ -2360,8 +2761,7 @@ static int tsi721_probe(struct pci_dev *pdev,
if (!(pci_resource_flags(pdev, BAR_1) & IORESOURCE_MEM) ||
pci_resource_flags(pdev, BAR_1) & IORESOURCE_MEM_64 ||
pci_resource_len(pdev, BAR_1) < TSI721_DB_WIN_SIZE) {
- dev_err(&pdev->dev,
- "Missing or misconfigured Doorbell BAR1, aborting.\n");
+ tsi_err(&pdev->dev, "Missing or misconfigured Doorbell BAR1");
err = -ENODEV;
goto err_disable_pdev;
}
@@ -2373,20 +2773,32 @@ static int tsi721_probe(struct pci_dev *pdev,
* It may be a good idea to keep them disabled using HW configuration
* to save PCI memory space.
*/
- if ((pci_resource_flags(pdev, BAR_2) & IORESOURCE_MEM) &&
- (pci_resource_flags(pdev, BAR_2) & IORESOURCE_MEM_64)) {
- dev_info(&pdev->dev, "Outbound BAR2 is not used but enabled.\n");
+
+ priv->p2r_bar[0].size = priv->p2r_bar[1].size = 0;
+
+ if (pci_resource_flags(pdev, BAR_2) & IORESOURCE_MEM_64) {
+ if (pci_resource_flags(pdev, BAR_2) & IORESOURCE_PREFETCH)
+ tsi_debug(INIT, &pdev->dev,
+ "Prefetchable OBW BAR2 will not be used");
+ else {
+ priv->p2r_bar[0].base = pci_resource_start(pdev, BAR_2);
+ priv->p2r_bar[0].size = pci_resource_len(pdev, BAR_2);
+ }
}
- if ((pci_resource_flags(pdev, BAR_4) & IORESOURCE_MEM) &&
- (pci_resource_flags(pdev, BAR_4) & IORESOURCE_MEM_64)) {
- dev_info(&pdev->dev, "Outbound BAR4 is not used but enabled.\n");
+ if (pci_resource_flags(pdev, BAR_4) & IORESOURCE_MEM_64) {
+ if (pci_resource_flags(pdev, BAR_4) & IORESOURCE_PREFETCH)
+ tsi_debug(INIT, &pdev->dev,
+ "Prefetchable OBW BAR4 will not be used");
+ else {
+ priv->p2r_bar[1].base = pci_resource_start(pdev, BAR_4);
+ priv->p2r_bar[1].size = pci_resource_len(pdev, BAR_4);
+ }
}
err = pci_request_regions(pdev, DRV_NAME);
if (err) {
- dev_err(&pdev->dev, "Cannot obtain PCI resources, "
- "aborting.\n");
+ tsi_err(&pdev->dev, "Unable to obtain PCI resources");
goto err_disable_pdev;
}
@@ -2394,16 +2806,14 @@ static int tsi721_probe(struct pci_dev *pdev,
priv->regs = pci_ioremap_bar(pdev, BAR_0);
if (!priv->regs) {
- dev_err(&pdev->dev,
- "Unable to map device registers space, aborting\n");
+ tsi_err(&pdev->dev, "Unable to map device registers space");
err = -ENOMEM;
goto err_free_res;
}
priv->odb_base = pci_ioremap_bar(pdev, BAR_1);
if (!priv->odb_base) {
- dev_err(&pdev->dev,
- "Unable to map outbound doorbells space, aborting\n");
+ tsi_err(&pdev->dev, "Unable to map outbound doorbells space");
err = -ENOMEM;
goto err_unmap_bars;
}
@@ -2412,25 +2822,23 @@ static int tsi721_probe(struct pci_dev *pdev,
if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
if (err) {
- dev_info(&pdev->dev, "Unable to set DMA mask\n");
+ tsi_err(&pdev->dev, "Unable to set DMA mask");
goto err_unmap_bars;
}
if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)))
- dev_info(&pdev->dev, "Unable to set consistent DMA mask\n");
+ tsi_info(&pdev->dev, "Unable to set consistent DMA mask");
} else {
err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
if (err)
- dev_info(&pdev->dev, "Unable to set consistent DMA mask\n");
+ tsi_info(&pdev->dev, "Unable to set consistent DMA mask");
}
BUG_ON(!pci_is_pcie(pdev));
- /* Clear "no snoop" and "relaxed ordering" bits, use default MRRS. */
+ /* Clear "no snoop" and "relaxed ordering" bits. */
pcie_capability_clear_and_set_word(pdev, PCI_EXP_DEVCTL,
- PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_RELAX_EN |
- PCI_EXP_DEVCTL_NOSNOOP_EN,
- PCI_EXP_DEVCTL_READRQ_512B);
+ PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN, 0);
/* Adjust PCIe completion timeout. */
pcie_capability_clear_and_set_word(pdev, PCI_EXP_DEVCTL2, 0xf, 0x2);
@@ -2452,7 +2860,7 @@ static int tsi721_probe(struct pci_dev *pdev,
tsi721_init_sr2pc_mapping(priv);
if (tsi721_bdma_maint_init(priv)) {
- dev_err(&pdev->dev, "BDMA initialization failed, aborting\n");
+ tsi_err(&pdev->dev, "BDMA initialization failed");
err = -ENOMEM;
goto err_unmap_bars;
}
@@ -2471,9 +2879,13 @@ static int tsi721_probe(struct pci_dev *pdev,
if (err)
goto err_free_consistent;
+ pci_set_drvdata(pdev, priv);
+ tsi721_interrupts_init(priv);
+
return 0;
err_free_consistent:
+ tsi721_port_write_free(priv);
tsi721_doorbell_free(priv);
err_free_bdma:
tsi721_bdma_maint_free(priv);
@@ -2493,6 +2905,53 @@ err_exit:
return err;
}
+static void tsi721_remove(struct pci_dev *pdev)
+{
+ struct tsi721_device *priv = pci_get_drvdata(pdev);
+
+ tsi_debug(EXIT, &pdev->dev, "enter");
+
+ tsi721_disable_ints(priv);
+ tsi721_free_irq(priv);
+ flush_scheduled_work();
+ rio_unregister_mport(&priv->mport);
+
+ tsi721_unregister_dma(priv);
+ tsi721_bdma_maint_free(priv);
+ tsi721_doorbell_free(priv);
+ tsi721_port_write_free(priv);
+ tsi721_close_sr2pc_mapping(priv);
+
+ if (priv->regs)
+ iounmap(priv->regs);
+ if (priv->odb_base)
+ iounmap(priv->odb_base);
+#ifdef CONFIG_PCI_MSI
+ if (priv->flags & TSI721_USING_MSIX)
+ pci_disable_msix(priv->pdev);
+ else if (priv->flags & TSI721_USING_MSI)
+ pci_disable_msi(priv->pdev);
+#endif
+ pci_release_regions(pdev);
+ pci_clear_master(pdev);
+ pci_disable_device(pdev);
+ pci_set_drvdata(pdev, NULL);
+ kfree(priv);
+ tsi_debug(EXIT, &pdev->dev, "exit");
+}
+
+static void tsi721_shutdown(struct pci_dev *pdev)
+{
+ struct tsi721_device *priv = pci_get_drvdata(pdev);
+
+ tsi_debug(EXIT, &pdev->dev, "enter");
+
+ tsi721_disable_ints(priv);
+ tsi721_dma_stop_all(priv);
+ pci_clear_master(pdev);
+ pci_disable_device(pdev);
+}
+
static const struct pci_device_id tsi721_pci_tbl[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_IDT, PCI_DEVICE_ID_TSI721) },
{ 0, } /* terminate list */
@@ -2504,14 +2963,11 @@ static struct pci_driver tsi721_driver = {
.name = "tsi721",
.id_table = tsi721_pci_tbl,
.probe = tsi721_probe,
+ .remove = tsi721_remove,
+ .shutdown = tsi721_shutdown,
};
-static int __init tsi721_init(void)
-{
- return pci_register_driver(&tsi721_driver);
-}
-
-device_initcall(tsi721_init);
+module_pci_driver(tsi721_driver);
MODULE_DESCRIPTION("IDT Tsi721 PCIExpress-to-SRIO bridge driver");
MODULE_AUTHOR("Integrated Device Technology, Inc.");
diff --git a/drivers/rapidio/devices/tsi721.h b/drivers/rapidio/devices/tsi721.h
index 9d2502543ef6..5456dbddc929 100644
--- a/drivers/rapidio/devices/tsi721.h
+++ b/drivers/rapidio/devices/tsi721.h
@@ -21,6 +21,46 @@
#ifndef __TSI721_H
#define __TSI721_H
+/* Debug output filtering masks */
+enum {
+ DBG_NONE = 0,
+ DBG_INIT = BIT(0), /* driver init */
+ DBG_EXIT = BIT(1), /* driver exit */
+ DBG_MPORT = BIT(2), /* mport add/remove */
+ DBG_MAINT = BIT(3), /* maintenance ops messages */
+ DBG_DMA = BIT(4), /* DMA transfer messages */
+ DBG_DMAV = BIT(5), /* verbose DMA transfer messages */
+ DBG_IBW = BIT(6), /* inbound window */
+ DBG_EVENT = BIT(7), /* event handling messages */
+ DBG_OBW = BIT(8), /* outbound window messages */
+ DBG_DBELL = BIT(9), /* doorbell messages */
+ DBG_OMSG = BIT(10), /* doorbell messages */
+ DBG_IMSG = BIT(11), /* doorbell messages */
+ DBG_ALL = ~0,
+};
+
+#ifdef DEBUG
+extern u32 dbg_level;
+
+#define tsi_debug(level, dev, fmt, arg...) \
+ do { \
+ if (DBG_##level & dbg_level) \
+ dev_dbg(dev, "%s: " fmt "\n", __func__, ##arg); \
+ } while (0)
+#else
+#define tsi_debug(level, dev, fmt, arg...) \
+ no_printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##arg)
+#endif
+
+#define tsi_info(dev, fmt, arg...) \
+ dev_info(dev, "%s: " fmt "\n", __func__, ##arg)
+
+#define tsi_warn(dev, fmt, arg...) \
+ dev_warn(dev, "%s: WARNING " fmt "\n", __func__, ##arg)
+
+#define tsi_err(dev, fmt, arg...) \
+ dev_err(dev, "%s: ERROR " fmt "\n", __func__, ##arg)
+
#define DRV_NAME "tsi721"
#define DEFAULT_HOPCOUNT 0xff
@@ -674,7 +714,7 @@ struct tsi721_bdma_chan {
struct dma_chan dchan;
struct tsi721_tx_desc *tx_desc;
spinlock_t lock;
- struct list_head active_list;
+ struct tsi721_tx_desc *active_tx;
struct list_head queue;
struct list_head free_list;
struct tasklet_struct tasklet;
@@ -808,9 +848,38 @@ struct msix_irq {
};
#endif /* CONFIG_PCI_MSI */
+struct tsi721_ib_win_mapping {
+ struct list_head node;
+ dma_addr_t lstart;
+};
+
+struct tsi721_ib_win {
+ u64 rstart;
+ u32 size;
+ dma_addr_t lstart;
+ bool active;
+ bool xlat;
+ struct list_head mappings;
+};
+
+struct tsi721_obw_bar {
+ u64 base;
+ u64 size;
+ u64 free;
+};
+
+struct tsi721_ob_win {
+ u64 base;
+ u32 size;
+ u16 destid;
+ u64 rstart;
+ bool active;
+ struct tsi721_obw_bar *pbar;
+};
+
struct tsi721_device {
struct pci_dev *pdev;
- struct rio_mport *mport;
+ struct rio_mport mport;
u32 flags;
void __iomem *regs;
#ifdef CONFIG_PCI_MSI
@@ -843,11 +912,25 @@ struct tsi721_device {
/* Outbound Messaging */
int omsg_init[TSI721_OMSG_CHNUM];
struct tsi721_omsg_ring omsg_ring[TSI721_OMSG_CHNUM];
+
+ /* Inbound Mapping Windows */
+ struct tsi721_ib_win ib_win[TSI721_IBWIN_NUM];
+ int ibwin_cnt;
+
+ /* Outbound Mapping Windows */
+ struct tsi721_obw_bar p2r_bar[2];
+ struct tsi721_ob_win ob_win[TSI721_OBWIN_NUM];
+ int obwin_cnt;
};
#ifdef CONFIG_RAPIDIO_DMA_ENGINE
extern void tsi721_bdma_handler(struct tsi721_bdma_chan *bdma_chan);
extern int tsi721_register_dma(struct tsi721_device *priv);
+extern void tsi721_unregister_dma(struct tsi721_device *priv);
+extern void tsi721_dma_stop_all(struct tsi721_device *priv);
+#else
+#define tsi721_dma_stop_all(priv) do {} while (0)
+#define tsi721_unregister_dma(priv) do {} while (0)
#endif
#endif
diff --git a/drivers/rapidio/devices/tsi721_dma.c b/drivers/rapidio/devices/tsi721_dma.c
index 47295940a868..155cae1e62de 100644
--- a/drivers/rapidio/devices/tsi721_dma.c
+++ b/drivers/rapidio/devices/tsi721_dma.c
@@ -30,6 +30,7 @@
#include <linux/dma-mapping.h>
#include <linux/interrupt.h>
#include <linux/kfifo.h>
+#include <linux/sched.h>
#include <linux/delay.h>
#include "../../dma/dmaengine.h"
@@ -63,14 +64,6 @@ struct tsi721_tx_desc *to_tsi721_desc(struct dma_async_tx_descriptor *txd)
return container_of(txd, struct tsi721_tx_desc, txd);
}
-static inline
-struct tsi721_tx_desc *tsi721_dma_first_active(
- struct tsi721_bdma_chan *bdma_chan)
-{
- return list_first_entry(&bdma_chan->active_list,
- struct tsi721_tx_desc, desc_node);
-}
-
static int tsi721_bdma_ch_init(struct tsi721_bdma_chan *bdma_chan, int bd_num)
{
struct tsi721_dma_desc *bd_ptr;
@@ -83,7 +76,7 @@ static int tsi721_bdma_ch_init(struct tsi721_bdma_chan *bdma_chan, int bd_num)
struct tsi721_device *priv = to_tsi721(bdma_chan->dchan.device);
#endif
- dev_dbg(dev, "Init Block DMA Engine, CH%d\n", bdma_chan->id);
+ tsi_debug(DMA, &bdma_chan->dchan.dev->device, "DMAC%d", bdma_chan->id);
/*
* Allocate space for DMA descriptors
@@ -91,7 +84,7 @@ static int tsi721_bdma_ch_init(struct tsi721_bdma_chan *bdma_chan, int bd_num)
*/
bd_ptr = dma_zalloc_coherent(dev,
(bd_num + 1) * sizeof(struct tsi721_dma_desc),
- &bd_phys, GFP_KERNEL);
+ &bd_phys, GFP_ATOMIC);
if (!bd_ptr)
return -ENOMEM;
@@ -99,8 +92,9 @@ static int tsi721_bdma_ch_init(struct tsi721_bdma_chan *bdma_chan, int bd_num)
bdma_chan->bd_phys = bd_phys;
bdma_chan->bd_base = bd_ptr;
- dev_dbg(dev, "DMA descriptors @ %p (phys = %llx)\n",
- bd_ptr, (unsigned long long)bd_phys);
+ tsi_debug(DMA, &bdma_chan->dchan.dev->device,
+ "DMAC%d descriptors @ %p (phys = %pad)",
+ bdma_chan->id, bd_ptr, &bd_phys);
/* Allocate space for descriptor status FIFO */
sts_size = ((bd_num + 1) >= TSI721_DMA_MINSTSSZ) ?
@@ -108,7 +102,7 @@ static int tsi721_bdma_ch_init(struct tsi721_bdma_chan *bdma_chan, int bd_num)
sts_size = roundup_pow_of_two(sts_size);
sts_ptr = dma_zalloc_coherent(dev,
sts_size * sizeof(struct tsi721_dma_sts),
- &sts_phys, GFP_KERNEL);
+ &sts_phys, GFP_ATOMIC);
if (!sts_ptr) {
/* Free space allocated for DMA descriptors */
dma_free_coherent(dev,
@@ -122,9 +116,9 @@ static int tsi721_bdma_ch_init(struct tsi721_bdma_chan *bdma_chan, int bd_num)
bdma_chan->sts_base = sts_ptr;
bdma_chan->sts_size = sts_size;
- dev_dbg(dev,
- "desc status FIFO @ %p (phys = %llx) size=0x%x\n",
- sts_ptr, (unsigned long long)sts_phys, sts_size);
+ tsi_debug(DMA, &bdma_chan->dchan.dev->device,
+ "DMAC%d desc status FIFO @ %p (phys = %pad) size=0x%x",
+ bdma_chan->id, sts_ptr, &sts_phys, sts_size);
/* Initialize DMA descriptors ring using added link descriptor */
bd_ptr[bd_num].type_id = cpu_to_le32(DTYPE3 << 29);
@@ -163,8 +157,9 @@ static int tsi721_bdma_ch_init(struct tsi721_bdma_chan *bdma_chan, int bd_num)
priv->msix[idx].irq_name, (void *)bdma_chan);
if (rc) {
- dev_dbg(dev, "Unable to get MSI-X for BDMA%d-DONE\n",
- bdma_chan->id);
+ tsi_debug(DMA, &bdma_chan->dchan.dev->device,
+ "Unable to get MSI-X for DMAC%d-DONE",
+ bdma_chan->id);
goto err_out;
}
@@ -174,8 +169,9 @@ static int tsi721_bdma_ch_init(struct tsi721_bdma_chan *bdma_chan, int bd_num)
priv->msix[idx].irq_name, (void *)bdma_chan);
if (rc) {
- dev_dbg(dev, "Unable to get MSI-X for BDMA%d-INT\n",
- bdma_chan->id);
+ tsi_debug(DMA, &bdma_chan->dchan.dev->device,
+ "Unable to get MSI-X for DMAC%d-INT",
+ bdma_chan->id);
free_irq(
priv->msix[TSI721_VECT_DMA0_DONE +
bdma_chan->id].vector,
@@ -286,7 +282,7 @@ void tsi721_bdma_handler(struct tsi721_bdma_chan *bdma_chan)
/* Disable BDMA channel interrupts */
iowrite32(0, bdma_chan->regs + TSI721_DMAC_INTE);
if (bdma_chan->active)
- tasklet_schedule(&bdma_chan->tasklet);
+ tasklet_hi_schedule(&bdma_chan->tasklet);
}
#ifdef CONFIG_PCI_MSI
@@ -301,7 +297,8 @@ static irqreturn_t tsi721_bdma_msix(int irq, void *ptr)
{
struct tsi721_bdma_chan *bdma_chan = ptr;
- tsi721_bdma_handler(bdma_chan);
+ if (bdma_chan->active)
+ tasklet_hi_schedule(&bdma_chan->tasklet);
return IRQ_HANDLED;
}
#endif /* CONFIG_PCI_MSI */
@@ -310,20 +307,22 @@ static irqreturn_t tsi721_bdma_msix(int irq, void *ptr)
static void tsi721_start_dma(struct tsi721_bdma_chan *bdma_chan)
{
if (!tsi721_dma_is_idle(bdma_chan)) {
- dev_err(bdma_chan->dchan.device->dev,
- "BUG: Attempt to start non-idle channel\n");
+ tsi_err(&bdma_chan->dchan.dev->device,
+ "DMAC%d Attempt to start non-idle channel",
+ bdma_chan->id);
return;
}
if (bdma_chan->wr_count == bdma_chan->wr_count_next) {
- dev_err(bdma_chan->dchan.device->dev,
- "BUG: Attempt to start DMA with no BDs ready\n");
+ tsi_err(&bdma_chan->dchan.dev->device,
+ "DMAC%d Attempt to start DMA with no BDs ready %d",
+ bdma_chan->id, task_pid_nr(current));
return;
}
- dev_dbg(bdma_chan->dchan.device->dev,
- "%s: chan_%d (wrc=%d)\n", __func__, bdma_chan->id,
- bdma_chan->wr_count_next);
+ tsi_debug(DMA, &bdma_chan->dchan.dev->device, "DMAC%d (wrc=%d) %d",
+ bdma_chan->id, bdma_chan->wr_count_next,
+ task_pid_nr(current));
iowrite32(bdma_chan->wr_count_next,
bdma_chan->regs + TSI721_DMAC_DWRCNT);
@@ -425,10 +424,11 @@ static int tsi721_submit_sg(struct tsi721_tx_desc *desc)
struct tsi721_dma_desc *bd_ptr = NULL;
u32 idx, rd_idx;
u32 add_count = 0;
+ struct device *ch_dev = &dchan->dev->device;
if (!tsi721_dma_is_idle(bdma_chan)) {
- dev_err(bdma_chan->dchan.device->dev,
- "BUG: Attempt to use non-idle channel\n");
+ tsi_err(ch_dev, "DMAC%d ERR: Attempt to use non-idle channel",
+ bdma_chan->id);
return -EIO;
}
@@ -439,7 +439,7 @@ static int tsi721_submit_sg(struct tsi721_tx_desc *desc)
rio_addr = desc->rio_addr;
next_addr = -1;
bcount = 0;
- sys_size = dma_to_mport(bdma_chan->dchan.device)->sys_size;
+ sys_size = dma_to_mport(dchan->device)->sys_size;
rd_idx = ioread32(bdma_chan->regs + TSI721_DMAC_DRDCNT);
rd_idx %= (bdma_chan->bd_num + 1);
@@ -451,18 +451,18 @@ static int tsi721_submit_sg(struct tsi721_tx_desc *desc)
add_count++;
}
- dev_dbg(dchan->device->dev, "%s: BD ring status: rdi=%d wri=%d\n",
- __func__, rd_idx, idx);
+ tsi_debug(DMA, ch_dev, "DMAC%d BD ring status: rdi=%d wri=%d",
+ bdma_chan->id, rd_idx, idx);
for_each_sg(desc->sg, sg, desc->sg_len, i) {
- dev_dbg(dchan->device->dev, "sg%d/%d addr: 0x%llx len: %d\n",
- i, desc->sg_len,
+ tsi_debug(DMAV, ch_dev, "DMAC%d sg%d/%d addr: 0x%llx len: %d",
+ bdma_chan->id, i, desc->sg_len,
(unsigned long long)sg_dma_address(sg), sg_dma_len(sg));
if (sg_dma_len(sg) > TSI721_BDMA_MAX_BCOUNT) {
- dev_err(dchan->device->dev,
- "%s: SG entry %d is too large\n", __func__, i);
+ tsi_err(ch_dev, "DMAC%d SG entry %d is too large",
+ bdma_chan->id, i);
err = -EINVAL;
break;
}
@@ -479,17 +479,16 @@ static int tsi721_submit_sg(struct tsi721_tx_desc *desc)
} else if (next_addr != -1) {
/* Finalize descriptor using total byte count value */
tsi721_desc_fill_end(bd_ptr, bcount, 0);
- dev_dbg(dchan->device->dev,
- "%s: prev desc final len: %d\n",
- __func__, bcount);
+ tsi_debug(DMAV, ch_dev, "DMAC%d prev desc final len: %d",
+ bdma_chan->id, bcount);
}
desc->rio_addr = rio_addr;
if (i && idx == rd_idx) {
- dev_dbg(dchan->device->dev,
- "%s: HW descriptor ring is full @ %d\n",
- __func__, i);
+ tsi_debug(DMAV, ch_dev,
+ "DMAC%d HW descriptor ring is full @ %d",
+ bdma_chan->id, i);
desc->sg = sg;
desc->sg_len -= i;
break;
@@ -498,13 +497,12 @@ static int tsi721_submit_sg(struct tsi721_tx_desc *desc)
bd_ptr = &((struct tsi721_dma_desc *)bdma_chan->bd_base)[idx];
err = tsi721_desc_fill_init(desc, bd_ptr, sg, sys_size);
if (err) {
- dev_err(dchan->device->dev,
- "Failed to build desc: err=%d\n", err);
+ tsi_err(ch_dev, "Failed to build desc: err=%d", err);
break;
}
- dev_dbg(dchan->device->dev, "bd_ptr = %p did=%d raddr=0x%llx\n",
- bd_ptr, desc->destid, desc->rio_addr);
+ tsi_debug(DMAV, ch_dev, "DMAC%d bd_ptr = %p did=%d raddr=0x%llx",
+ bdma_chan->id, bd_ptr, desc->destid, desc->rio_addr);
next_addr = sg_dma_address(sg);
bcount = sg_dma_len(sg);
@@ -519,8 +517,9 @@ static int tsi721_submit_sg(struct tsi721_tx_desc *desc)
entry_done:
if (sg_is_last(sg)) {
tsi721_desc_fill_end(bd_ptr, bcount, 0);
- dev_dbg(dchan->device->dev, "%s: last desc final len: %d\n",
- __func__, bcount);
+ tsi_debug(DMAV, ch_dev,
+ "DMAC%d last desc final len: %d",
+ bdma_chan->id, bcount);
desc->sg_len = 0;
} else {
rio_addr += sg_dma_len(sg);
@@ -534,35 +533,43 @@ entry_done:
return err;
}
-static void tsi721_advance_work(struct tsi721_bdma_chan *bdma_chan)
+static void tsi721_advance_work(struct tsi721_bdma_chan *bdma_chan,
+ struct tsi721_tx_desc *desc)
{
- struct tsi721_tx_desc *desc;
int err;
- dev_dbg(bdma_chan->dchan.device->dev, "%s: Enter\n", __func__);
+ tsi_debug(DMA, &bdma_chan->dchan.dev->device, "DMAC%d", bdma_chan->id);
+
+ if (!tsi721_dma_is_idle(bdma_chan))
+ return;
/*
- * If there are any new transactions in the queue add them
- * into the processing list
- */
- if (!list_empty(&bdma_chan->queue))
- list_splice_init(&bdma_chan->queue, &bdma_chan->active_list);
+ * If there is no data transfer in progress, fetch new descriptor from
+ * the pending queue.
+ */
+
+ if (desc == NULL && bdma_chan->active_tx == NULL &&
+ !list_empty(&bdma_chan->queue)) {
+ desc = list_first_entry(&bdma_chan->queue,
+ struct tsi721_tx_desc, desc_node);
+ list_del_init((&desc->desc_node));
+ bdma_chan->active_tx = desc;
+ }
- /* Start new transaction (if available) */
- if (!list_empty(&bdma_chan->active_list)) {
- desc = tsi721_dma_first_active(bdma_chan);
+ if (desc) {
err = tsi721_submit_sg(desc);
if (!err)
tsi721_start_dma(bdma_chan);
else {
tsi721_dma_tx_err(bdma_chan, desc);
- dev_dbg(bdma_chan->dchan.device->dev,
- "ERR: tsi721_submit_sg failed with err=%d\n",
- err);
+ tsi_debug(DMA, &bdma_chan->dchan.dev->device,
+ "DMAC%d ERR: tsi721_submit_sg failed with err=%d",
+ bdma_chan->id, err);
}
}
- dev_dbg(bdma_chan->dchan.device->dev, "%s: Exit\n", __func__);
+ tsi_debug(DMA, &bdma_chan->dchan.dev->device, "DMAC%d Exit",
+ bdma_chan->id);
}
static void tsi721_dma_tasklet(unsigned long data)
@@ -571,22 +578,84 @@ static void tsi721_dma_tasklet(unsigned long data)
u32 dmac_int, dmac_sts;
dmac_int = ioread32(bdma_chan->regs + TSI721_DMAC_INT);
- dev_dbg(bdma_chan->dchan.device->dev, "%s: DMAC%d_INT = 0x%x\n",
- __func__, bdma_chan->id, dmac_int);
+ tsi_debug(DMA, &bdma_chan->dchan.dev->device, "DMAC%d_INT = 0x%x",
+ bdma_chan->id, dmac_int);
/* Clear channel interrupts */
iowrite32(dmac_int, bdma_chan->regs + TSI721_DMAC_INT);
if (dmac_int & TSI721_DMAC_INT_ERR) {
+ int i = 10000;
+ struct tsi721_tx_desc *desc;
+
+ desc = bdma_chan->active_tx;
dmac_sts = ioread32(bdma_chan->regs + TSI721_DMAC_STS);
- dev_err(bdma_chan->dchan.device->dev,
- "%s: DMA ERROR - DMAC%d_STS = 0x%x\n",
- __func__, bdma_chan->id, dmac_sts);
+ tsi_err(&bdma_chan->dchan.dev->device,
+ "DMAC%d_STS = 0x%x did=%d raddr=0x%llx",
+ bdma_chan->id, dmac_sts, desc->destid, desc->rio_addr);
+
+ /* Re-initialize DMA channel if possible */
+
+ if ((dmac_sts & TSI721_DMAC_STS_ABORT) == 0)
+ goto err_out;
+
+ tsi721_clr_stat(bdma_chan);
+
+ spin_lock(&bdma_chan->lock);
+
+ /* Put DMA channel into init state */
+ iowrite32(TSI721_DMAC_CTL_INIT,
+ bdma_chan->regs + TSI721_DMAC_CTL);
+ do {
+ udelay(1);
+ dmac_sts = ioread32(bdma_chan->regs + TSI721_DMAC_STS);
+ i--;
+ } while ((dmac_sts & TSI721_DMAC_STS_ABORT) && i);
+
+ if (dmac_sts & TSI721_DMAC_STS_ABORT) {
+ tsi_err(&bdma_chan->dchan.dev->device,
+ "Failed to re-initiate DMAC%d", bdma_chan->id);
+ spin_unlock(&bdma_chan->lock);
+ goto err_out;
+ }
+
+ /* Setup DMA descriptor pointers */
+ iowrite32(((u64)bdma_chan->bd_phys >> 32),
+ bdma_chan->regs + TSI721_DMAC_DPTRH);
+ iowrite32(((u64)bdma_chan->bd_phys & TSI721_DMAC_DPTRL_MASK),
+ bdma_chan->regs + TSI721_DMAC_DPTRL);
+
+ /* Setup descriptor status FIFO */
+ iowrite32(((u64)bdma_chan->sts_phys >> 32),
+ bdma_chan->regs + TSI721_DMAC_DSBH);
+ iowrite32(((u64)bdma_chan->sts_phys & TSI721_DMAC_DSBL_MASK),
+ bdma_chan->regs + TSI721_DMAC_DSBL);
+ iowrite32(TSI721_DMAC_DSSZ_SIZE(bdma_chan->sts_size),
+ bdma_chan->regs + TSI721_DMAC_DSSZ);
+
+ /* Clear interrupt bits */
+ iowrite32(TSI721_DMAC_INT_ALL,
+ bdma_chan->regs + TSI721_DMAC_INT);
+
+ ioread32(bdma_chan->regs + TSI721_DMAC_INT);
+
+ bdma_chan->wr_count = bdma_chan->wr_count_next = 0;
+ bdma_chan->sts_rdptr = 0;
+ udelay(10);
+
+ desc = bdma_chan->active_tx;
+ desc->status = DMA_ERROR;
+ dma_cookie_complete(&desc->txd);
+ list_add(&desc->desc_node, &bdma_chan->free_list);
+ bdma_chan->active_tx = NULL;
+ if (bdma_chan->active)
+ tsi721_advance_work(bdma_chan, NULL);
+ spin_unlock(&bdma_chan->lock);
}
if (dmac_int & TSI721_DMAC_INT_STFULL) {
- dev_err(bdma_chan->dchan.device->dev,
- "%s: DMAC%d descriptor status FIFO is full\n",
- __func__, bdma_chan->id);
+ tsi_err(&bdma_chan->dchan.dev->device,
+ "DMAC%d descriptor status FIFO is full",
+ bdma_chan->id);
}
if (dmac_int & (TSI721_DMAC_INT_DONE | TSI721_DMAC_INT_IOFDONE)) {
@@ -594,7 +663,7 @@ static void tsi721_dma_tasklet(unsigned long data)
tsi721_clr_stat(bdma_chan);
spin_lock(&bdma_chan->lock);
- desc = tsi721_dma_first_active(bdma_chan);
+ desc = bdma_chan->active_tx;
if (desc->sg_len == 0) {
dma_async_tx_callback callback = NULL;
@@ -606,17 +675,21 @@ static void tsi721_dma_tasklet(unsigned long data)
callback = desc->txd.callback;
param = desc->txd.callback_param;
}
- list_move(&desc->desc_node, &bdma_chan->free_list);
+ list_add(&desc->desc_node, &bdma_chan->free_list);
+ bdma_chan->active_tx = NULL;
+ if (bdma_chan->active)
+ tsi721_advance_work(bdma_chan, NULL);
spin_unlock(&bdma_chan->lock);
if (callback)
callback(param);
- spin_lock(&bdma_chan->lock);
+ } else {
+ if (bdma_chan->active)
+ tsi721_advance_work(bdma_chan,
+ bdma_chan->active_tx);
+ spin_unlock(&bdma_chan->lock);
}
-
- tsi721_advance_work(bdma_chan);
- spin_unlock(&bdma_chan->lock);
}
-
+err_out:
/* Re-Enable BDMA channel interrupts */
iowrite32(TSI721_DMAC_INT_ALL, bdma_chan->regs + TSI721_DMAC_INTE);
}
@@ -629,8 +702,9 @@ static dma_cookie_t tsi721_tx_submit(struct dma_async_tx_descriptor *txd)
/* Check if the descriptor is detached from any lists */
if (!list_empty(&desc->desc_node)) {
- dev_err(bdma_chan->dchan.device->dev,
- "%s: wrong state of descriptor %p\n", __func__, txd);
+ tsi_err(&bdma_chan->dchan.dev->device,
+ "DMAC%d wrong state of descriptor %p",
+ bdma_chan->id, txd);
return -EIO;
}
@@ -655,25 +729,25 @@ static int tsi721_alloc_chan_resources(struct dma_chan *dchan)
struct tsi721_tx_desc *desc = NULL;
int i;
- dev_dbg(dchan->device->dev, "%s: for channel %d\n",
- __func__, bdma_chan->id);
+ tsi_debug(DMA, &dchan->dev->device, "DMAC%d", bdma_chan->id);
if (bdma_chan->bd_base)
return TSI721_DMA_TX_QUEUE_SZ;
/* Initialize BDMA channel */
if (tsi721_bdma_ch_init(bdma_chan, dma_desc_per_channel)) {
- dev_err(dchan->device->dev, "Unable to initialize data DMA"
- " channel %d, aborting\n", bdma_chan->id);
+ tsi_err(&dchan->dev->device, "Unable to initialize DMAC%d",
+ bdma_chan->id);
return -ENODEV;
}
/* Allocate queue of transaction descriptors */
desc = kcalloc(TSI721_DMA_TX_QUEUE_SZ, sizeof(struct tsi721_tx_desc),
- GFP_KERNEL);
+ GFP_ATOMIC);
if (!desc) {
- dev_err(dchan->device->dev,
- "Failed to allocate logical descriptors\n");
+ tsi_err(&dchan->dev->device,
+ "DMAC%d Failed to allocate logical descriptors",
+ bdma_chan->id);
tsi721_bdma_ch_free(bdma_chan);
return -ENOMEM;
}
@@ -714,15 +788,11 @@ static void tsi721_free_chan_resources(struct dma_chan *dchan)
{
struct tsi721_bdma_chan *bdma_chan = to_tsi721_chan(dchan);
- dev_dbg(dchan->device->dev, "%s: for channel %d\n",
- __func__, bdma_chan->id);
+ tsi_debug(DMA, &dchan->dev->device, "DMAC%d", bdma_chan->id);
if (bdma_chan->bd_base == NULL)
return;
- BUG_ON(!list_empty(&bdma_chan->active_list));
- BUG_ON(!list_empty(&bdma_chan->queue));
-
tsi721_bdma_interrupt_enable(bdma_chan, 0);
bdma_chan->active = false;
tsi721_sync_dma_irq(bdma_chan);
@@ -736,20 +806,26 @@ static
enum dma_status tsi721_tx_status(struct dma_chan *dchan, dma_cookie_t cookie,
struct dma_tx_state *txstate)
{
- return dma_cookie_status(dchan, cookie, txstate);
+ struct tsi721_bdma_chan *bdma_chan = to_tsi721_chan(dchan);
+ enum dma_status status;
+
+ spin_lock_bh(&bdma_chan->lock);
+ status = dma_cookie_status(dchan, cookie, txstate);
+ spin_unlock_bh(&bdma_chan->lock);
+ return status;
}
static void tsi721_issue_pending(struct dma_chan *dchan)
{
struct tsi721_bdma_chan *bdma_chan = to_tsi721_chan(dchan);
- dev_dbg(dchan->device->dev, "%s: Enter\n", __func__);
+ tsi_debug(DMA, &dchan->dev->device, "DMAC%d", bdma_chan->id);
+ spin_lock_bh(&bdma_chan->lock);
if (tsi721_dma_is_idle(bdma_chan) && bdma_chan->active) {
- spin_lock_bh(&bdma_chan->lock);
- tsi721_advance_work(bdma_chan);
- spin_unlock_bh(&bdma_chan->lock);
+ tsi721_advance_work(bdma_chan, NULL);
}
+ spin_unlock_bh(&bdma_chan->lock);
}
static
@@ -759,18 +835,19 @@ struct dma_async_tx_descriptor *tsi721_prep_rio_sg(struct dma_chan *dchan,
void *tinfo)
{
struct tsi721_bdma_chan *bdma_chan = to_tsi721_chan(dchan);
- struct tsi721_tx_desc *desc, *_d;
+ struct tsi721_tx_desc *desc;
struct rio_dma_ext *rext = tinfo;
enum dma_rtype rtype;
struct dma_async_tx_descriptor *txd = NULL;
if (!sgl || !sg_len) {
- dev_err(dchan->device->dev, "%s: No SG list\n", __func__);
- return NULL;
+ tsi_err(&dchan->dev->device, "DMAC%d No SG list",
+ bdma_chan->id);
+ return ERR_PTR(-EINVAL);
}
- dev_dbg(dchan->device->dev, "%s: %s\n", __func__,
- (dir == DMA_DEV_TO_MEM)?"READ":"WRITE");
+ tsi_debug(DMA, &dchan->dev->device, "DMAC%d %s", bdma_chan->id,
+ (dir == DMA_DEV_TO_MEM)?"READ":"WRITE");
if (dir == DMA_DEV_TO_MEM)
rtype = NREAD;
@@ -788,30 +865,36 @@ struct dma_async_tx_descriptor *tsi721_prep_rio_sg(struct dma_chan *dchan,
break;
}
} else {
- dev_err(dchan->device->dev,
- "%s: Unsupported DMA direction option\n", __func__);
- return NULL;
+ tsi_err(&dchan->dev->device,
+ "DMAC%d Unsupported DMA direction option",
+ bdma_chan->id);
+ return ERR_PTR(-EINVAL);
}
spin_lock_bh(&bdma_chan->lock);
- list_for_each_entry_safe(desc, _d, &bdma_chan->free_list, desc_node) {
- if (async_tx_test_ack(&desc->txd)) {
- list_del_init(&desc->desc_node);
- desc->destid = rext->destid;
- desc->rio_addr = rext->rio_addr;
- desc->rio_addr_u = 0;
- desc->rtype = rtype;
- desc->sg_len = sg_len;
- desc->sg = sgl;
- txd = &desc->txd;
- txd->flags = flags;
- break;
- }
+ if (!list_empty(&bdma_chan->free_list)) {
+ desc = list_first_entry(&bdma_chan->free_list,
+ struct tsi721_tx_desc, desc_node);
+ list_del_init(&desc->desc_node);
+ desc->destid = rext->destid;
+ desc->rio_addr = rext->rio_addr;
+ desc->rio_addr_u = 0;
+ desc->rtype = rtype;
+ desc->sg_len = sg_len;
+ desc->sg = sgl;
+ txd = &desc->txd;
+ txd->flags = flags;
}
spin_unlock_bh(&bdma_chan->lock);
+ if (!txd) {
+ tsi_debug(DMA, &dchan->dev->device,
+ "DMAC%d free TXD is not available", bdma_chan->id);
+ return ERR_PTR(-EBUSY);
+ }
+
return txd;
}
@@ -819,16 +902,18 @@ static int tsi721_terminate_all(struct dma_chan *dchan)
{
struct tsi721_bdma_chan *bdma_chan = to_tsi721_chan(dchan);
struct tsi721_tx_desc *desc, *_d;
- u32 dmac_int;
LIST_HEAD(list);
- dev_dbg(dchan->device->dev, "%s: Entry\n", __func__);
+ tsi_debug(DMA, &dchan->dev->device, "DMAC%d", bdma_chan->id);
spin_lock_bh(&bdma_chan->lock);
bdma_chan->active = false;
- if (!tsi721_dma_is_idle(bdma_chan)) {
+ while (!tsi721_dma_is_idle(bdma_chan)) {
+
+ udelay(5);
+#if (0)
/* make sure to stop the transfer */
iowrite32(TSI721_DMAC_CTL_SUSP,
bdma_chan->regs + TSI721_DMAC_CTL);
@@ -837,9 +922,11 @@ static int tsi721_terminate_all(struct dma_chan *dchan)
do {
dmac_int = ioread32(bdma_chan->regs + TSI721_DMAC_INT);
} while ((dmac_int & TSI721_DMAC_INT_SUSP) == 0);
+#endif
}
- list_splice_init(&bdma_chan->active_list, &list);
+ if (bdma_chan->active_tx)
+ list_add(&bdma_chan->active_tx->desc_node, &list);
list_splice_init(&bdma_chan->queue, &list);
list_for_each_entry_safe(desc, _d, &list, desc_node)
@@ -850,12 +937,42 @@ static int tsi721_terminate_all(struct dma_chan *dchan)
return 0;
}
+static void tsi721_dma_stop(struct tsi721_bdma_chan *bdma_chan)
+{
+ if (!bdma_chan->active)
+ return;
+ spin_lock_bh(&bdma_chan->lock);
+ if (!tsi721_dma_is_idle(bdma_chan)) {
+ int timeout = 100000;
+
+ /* stop the transfer in progress */
+ iowrite32(TSI721_DMAC_CTL_SUSP,
+ bdma_chan->regs + TSI721_DMAC_CTL);
+
+ /* Wait until DMA channel stops */
+ while (!tsi721_dma_is_idle(bdma_chan) && --timeout)
+ udelay(1);
+ }
+
+ spin_unlock_bh(&bdma_chan->lock);
+}
+
+void tsi721_dma_stop_all(struct tsi721_device *priv)
+{
+ int i;
+
+ for (i = 0; i < TSI721_DMA_MAXCH; i++) {
+ if (i != TSI721_DMACH_MAINT)
+ tsi721_dma_stop(&priv->bdma[i]);
+ }
+}
+
int tsi721_register_dma(struct tsi721_device *priv)
{
int i;
int nr_channels = 0;
int err;
- struct rio_mport *mport = priv->mport;
+ struct rio_mport *mport = &priv->mport;
INIT_LIST_HEAD(&mport->dma.channels);
@@ -875,7 +992,7 @@ int tsi721_register_dma(struct tsi721_device *priv)
spin_lock_init(&bdma_chan->lock);
- INIT_LIST_HEAD(&bdma_chan->active_list);
+ bdma_chan->active_tx = NULL;
INIT_LIST_HEAD(&bdma_chan->queue);
INIT_LIST_HEAD(&bdma_chan->free_list);
@@ -901,7 +1018,33 @@ int tsi721_register_dma(struct tsi721_device *priv)
err = dma_async_device_register(&mport->dma);
if (err)
- dev_err(&priv->pdev->dev, "Failed to register DMA device\n");
+ tsi_err(&priv->pdev->dev, "Failed to register DMA device");
return err;
}
+
+void tsi721_unregister_dma(struct tsi721_device *priv)
+{
+ struct rio_mport *mport = &priv->mport;
+ struct dma_chan *chan, *_c;
+ struct tsi721_bdma_chan *bdma_chan;
+
+ tsi721_dma_stop_all(priv);
+ dma_async_device_unregister(&mport->dma);
+
+ list_for_each_entry_safe(chan, _c, &mport->dma.channels,
+ device_node) {
+ bdma_chan = to_tsi721_chan(chan);
+ if (bdma_chan->active) {
+ tsi721_bdma_interrupt_enable(bdma_chan, 0);
+ bdma_chan->active = false;
+ tsi721_sync_dma_irq(bdma_chan);
+ tasklet_kill(&bdma_chan->tasklet);
+ INIT_LIST_HEAD(&bdma_chan->free_list);
+ kfree(bdma_chan->tx_desc);
+ tsi721_bdma_ch_free(bdma_chan);
+ }
+
+ list_del(&chan->device_node);
+ }
+}
diff --git a/drivers/rapidio/rio-driver.c b/drivers/rapidio/rio-driver.c
index f301f059bb85..128350f4d17a 100644
--- a/drivers/rapidio/rio-driver.c
+++ b/drivers/rapidio/rio-driver.c
@@ -131,6 +131,17 @@ static int rio_device_remove(struct device *dev)
return 0;
}
+static void rio_device_shutdown(struct device *dev)
+{
+ struct rio_dev *rdev = to_rio_dev(dev);
+ struct rio_driver *rdrv = rdev->driver;
+
+ dev_dbg(dev, "RIO: %s\n", __func__);
+
+ if (rdrv && rdrv->shutdown)
+ rdrv->shutdown(rdev);
+}
+
/**
* rio_register_driver - register a new RIO driver
* @rdrv: the RIO driver structure to register
@@ -229,6 +240,7 @@ struct bus_type rio_bus_type = {
.bus_groups = rio_bus_groups,
.probe = rio_device_probe,
.remove = rio_device_remove,
+ .shutdown = rio_device_shutdown,
.uevent = rio_uevent,
};
diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c
index d6a126c17c03..a63a380809d1 100644
--- a/drivers/rapidio/rio-scan.c
+++ b/drivers/rapidio/rio-scan.c
@@ -39,6 +39,13 @@
static void rio_init_em(struct rio_dev *rdev);
+struct rio_id_table {
+ u16 start; /* logical minimal id */
+ u32 max; /* max number of IDs in table */
+ spinlock_t lock;
+ unsigned long table[0];
+};
+
static int next_destid = 0;
static int next_comptag = 1;
@@ -62,7 +69,7 @@ static int rio_mport_phys_table[] = {
static u16 rio_destid_alloc(struct rio_net *net)
{
int destid;
- struct rio_id_table *idtab = &net->destid_table;
+ struct rio_id_table *idtab = (struct rio_id_table *)net->enum_data;
spin_lock(&idtab->lock);
destid = find_first_zero_bit(idtab->table, idtab->max);
@@ -88,7 +95,7 @@ static u16 rio_destid_alloc(struct rio_net *net)
static int rio_destid_reserve(struct rio_net *net, u16 destid)
{
int oldbit;
- struct rio_id_table *idtab = &net->destid_table;
+ struct rio_id_table *idtab = (struct rio_id_table *)net->enum_data;
destid -= idtab->start;
spin_lock(&idtab->lock);
@@ -106,7 +113,7 @@ static int rio_destid_reserve(struct rio_net *net, u16 destid)
*/
static void rio_destid_free(struct rio_net *net, u16 destid)
{
- struct rio_id_table *idtab = &net->destid_table;
+ struct rio_id_table *idtab = (struct rio_id_table *)net->enum_data;
destid -= idtab->start;
spin_lock(&idtab->lock);
@@ -121,7 +128,7 @@ static void rio_destid_free(struct rio_net *net, u16 destid)
static u16 rio_destid_first(struct rio_net *net)
{
int destid;
- struct rio_id_table *idtab = &net->destid_table;
+ struct rio_id_table *idtab = (struct rio_id_table *)net->enum_data;
spin_lock(&idtab->lock);
destid = find_first_bit(idtab->table, idtab->max);
@@ -141,7 +148,7 @@ static u16 rio_destid_first(struct rio_net *net)
static u16 rio_destid_next(struct rio_net *net, u16 from)
{
int destid;
- struct rio_id_table *idtab = &net->destid_table;
+ struct rio_id_table *idtab = (struct rio_id_table *)net->enum_data;
spin_lock(&idtab->lock);
destid = find_next_bit(idtab->table, idtab->max, from);
@@ -187,19 +194,6 @@ static void rio_set_device_id(struct rio_mport *port, u16 destid, u8 hopcount, u
}
/**
- * rio_local_set_device_id - Set the base/extended device id for a port
- * @port: RIO master port
- * @did: Device ID value to be written
- *
- * Writes the base/extended device id from a device.
- */
-static void rio_local_set_device_id(struct rio_mport *port, u16 did)
-{
- rio_local_write_config_32(port, RIO_DID_CSR, RIO_SET_DID(port->sys_size,
- did));
-}
-
-/**
* rio_clear_locks- Release all host locks and signal enumeration complete
* @net: RIO network to run on
*
@@ -449,9 +443,6 @@ static struct rio_dev *rio_setup_device(struct rio_net *net,
if (do_enum)
rio_route_clr_table(rdev, RIO_GLOBAL_TABLE, 0);
-
- list_add_tail(&rswitch->node, &net->switches);
-
} else {
if (do_enum)
/*Enable Input Output Port (transmitter reviever)*/
@@ -461,13 +452,9 @@ static struct rio_dev *rio_setup_device(struct rio_net *net,
rdev->comp_tag & RIO_CTAG_UDEVID);
}
- rdev->dev.parent = &port->dev;
+ rdev->dev.parent = &net->dev;
rio_attach_device(rdev);
-
- device_initialize(&rdev->dev);
rdev->dev.release = rio_release_dev;
- rio_dev_get(rdev);
-
rdev->dma_mask = DMA_BIT_MASK(32);
rdev->dev.dma_mask = &rdev->dma_mask;
rdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
@@ -480,6 +467,8 @@ static struct rio_dev *rio_setup_device(struct rio_net *net,
if (ret)
goto cleanup;
+ rio_dev_get(rdev);
+
return rdev;
cleanup:
@@ -621,8 +610,6 @@ static int rio_enum_peer(struct rio_net *net, struct rio_mport *port,
rdev = rio_setup_device(net, port, RIO_ANY_DESTID(port->sys_size),
hopcount, 1);
if (rdev) {
- /* Add device to the global and bus/net specific list. */
- list_add_tail(&rdev->net_list, &net->devices);
rdev->prev = prev;
if (prev && rio_is_switch(prev))
prev->rswitch->nextdev[prev_port] = rdev;
@@ -778,8 +765,6 @@ rio_disc_peer(struct rio_net *net, struct rio_mport *port, u16 destid,
/* Setup new RIO device */
if ((rdev = rio_setup_device(net, port, destid, hopcount, 0))) {
- /* Add device to the global and bus/net specific list. */
- list_add_tail(&rdev->net_list, &net->devices);
rdev->prev = prev;
if (prev && rio_is_switch(prev))
prev->rswitch->nextdev[prev_port] = rdev;
@@ -864,50 +849,71 @@ static int rio_mport_is_active(struct rio_mport *port)
return result & RIO_PORT_N_ERR_STS_PORT_OK;
}
-/**
- * rio_alloc_net- Allocate and configure a new RIO network
- * @port: Master port associated with the RIO network
+static void rio_scan_release_net(struct rio_net *net)
+{
+ pr_debug("RIO-SCAN: %s: net_%d\n", __func__, net->id);
+ kfree(net->enum_data);
+}
+
+static void rio_scan_release_dev(struct device *dev)
+{
+ struct rio_net *net;
+
+ net = to_rio_net(dev);
+ pr_debug("RIO-SCAN: %s: net_%d\n", __func__, net->id);
+ kfree(net);
+}
+
+/*
+ * rio_scan_alloc_net - Allocate and configure a new RIO network
+ * @mport: Master port associated with the RIO network
* @do_enum: Enumeration/Discovery mode flag
* @start: logical minimal start id for new net
*
- * Allocates a RIO network structure, initializes per-network
- * list heads, and adds the associated master port to the
- * network list of associated master ports. Returns a
- * RIO network pointer on success or %NULL on failure.
+ * Allocates a new RIO network structure and initializes enumerator-specific
+ * part of it (if required).
+ * Returns a RIO network pointer on success or %NULL on failure.
*/
-static struct rio_net *rio_alloc_net(struct rio_mport *port,
- int do_enum, u16 start)
+static struct rio_net *rio_scan_alloc_net(struct rio_mport *mport,
+ int do_enum, u16 start)
{
struct rio_net *net;
- net = kzalloc(sizeof(struct rio_net), GFP_KERNEL);
+ net = rio_alloc_net(mport);
+
if (net && do_enum) {
- net->destid_table.table = kcalloc(
- BITS_TO_LONGS(RIO_MAX_ROUTE_ENTRIES(port->sys_size)),
- sizeof(long),
- GFP_KERNEL);
+ struct rio_id_table *idtab;
+ size_t size;
+
+ size = sizeof(struct rio_id_table) +
+ BITS_TO_LONGS(
+ RIO_MAX_ROUTE_ENTRIES(mport->sys_size)
+ ) * sizeof(long);
+
+ idtab = kzalloc(size, GFP_KERNEL);
- if (net->destid_table.table == NULL) {
+ if (idtab == NULL) {
pr_err("RIO: failed to allocate destID table\n");
- kfree(net);
+ rio_free_net(net);
net = NULL;
} else {
- net->destid_table.start = start;
- net->destid_table.max =
- RIO_MAX_ROUTE_ENTRIES(port->sys_size);
- spin_lock_init(&net->destid_table.lock);
+ net->enum_data = idtab;
+ net->release = rio_scan_release_net;
+ idtab->start = start;
+ idtab->max = RIO_MAX_ROUTE_ENTRIES(mport->sys_size);
+ spin_lock_init(&idtab->lock);
}
}
if (net) {
- INIT_LIST_HEAD(&net->node);
- INIT_LIST_HEAD(&net->devices);
- INIT_LIST_HEAD(&net->switches);
- INIT_LIST_HEAD(&net->mports);
- list_add_tail(&port->nnode, &net->mports);
- net->hport = port;
- net->id = port->id;
+ net->id = mport->id;
+ net->hport = mport;
+ dev_set_name(&net->dev, "rnet_%d", net->id);
+ net->dev.parent = &mport->dev;
+ net->dev.release = rio_scan_release_dev;
+ rio_add_net(net);
}
+
return net;
}
@@ -968,17 +974,6 @@ static void rio_init_em(struct rio_dev *rdev)
}
/**
- * rio_pw_enable - Enables/disables port-write handling by a master port
- * @port: Master port associated with port-write handling
- * @enable: 1=enable, 0=disable
- */
-static void rio_pw_enable(struct rio_mport *port, int enable)
-{
- if (port->ops->pwenable)
- port->ops->pwenable(port, enable);
-}
-
-/**
* rio_enum_mport- Start enumeration through a master port
* @mport: Master port to send transactions
* @flags: Enumeration control flags
@@ -1016,7 +1011,7 @@ static int rio_enum_mport(struct rio_mport *mport, u32 flags)
/* If master port has an active link, allocate net and enum peers */
if (rio_mport_is_active(mport)) {
- net = rio_alloc_net(mport, 1, 0);
+ net = rio_scan_alloc_net(mport, 1, 0);
if (!net) {
printk(KERN_ERR "RIO: failed to allocate new net\n");
rc = -ENOMEM;
@@ -1133,7 +1128,7 @@ static int rio_disc_mport(struct rio_mport *mport, u32 flags)
enum_done:
pr_debug("RIO: ... enumeration done\n");
- net = rio_alloc_net(mport, 0, 0);
+ net = rio_scan_alloc_net(mport, 0, 0);
if (!net) {
printk(KERN_ERR "RIO: Failed to allocate new net\n");
goto bail;
diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c
index e220edc85c68..74aee4f5eb73 100644
--- a/drivers/rapidio/rio.c
+++ b/drivers/rapidio/rio.c
@@ -30,6 +30,20 @@
#include "rio.h"
+/*
+ * struct rio_pwrite - RIO portwrite event
+ * @node: Node in list of doorbell events
+ * @pwcback: Doorbell event callback
+ * @context: Handler specific context to pass on event
+ */
+struct rio_pwrite {
+ struct list_head node;
+
+ int (*pwcback)(struct rio_mport *mport, void *context,
+ union rio_pw_msg *msg, int step);
+ void *context;
+};
+
MODULE_DESCRIPTION("RapidIO Subsystem Core");
MODULE_AUTHOR("Matt Porter <mporter@kernel.crashing.org>");
MODULE_AUTHOR("Alexandre Bounine <alexandre.bounine@idt.com>");
@@ -42,6 +56,7 @@ MODULE_PARM_DESC(hdid,
"Destination ID assignment to local RapidIO controllers");
static LIST_HEAD(rio_devices);
+static LIST_HEAD(rio_nets);
static DEFINE_SPINLOCK(rio_global_list_lock);
static LIST_HEAD(rio_mports);
@@ -68,6 +83,89 @@ u16 rio_local_get_device_id(struct rio_mport *port)
}
/**
+ * rio_query_mport - Query mport device attributes
+ * @port: mport device to query
+ * @mport_attr: mport attributes data structure
+ *
+ * Returns attributes of specified mport through the
+ * pointer to attributes data structure.
+ */
+int rio_query_mport(struct rio_mport *port,
+ struct rio_mport_attr *mport_attr)
+{
+ if (!port->ops->query_mport)
+ return -ENODATA;
+ return port->ops->query_mport(port, mport_attr);
+}
+EXPORT_SYMBOL(rio_query_mport);
+
+/**
+ * rio_alloc_net- Allocate and initialize a new RIO network data structure
+ * @mport: Master port associated with the RIO network
+ *
+ * Allocates a RIO network structure, initializes per-network
+ * list heads, and adds the associated master port to the
+ * network list of associated master ports. Returns a
+ * RIO network pointer on success or %NULL on failure.
+ */
+struct rio_net *rio_alloc_net(struct rio_mport *mport)
+{
+ struct rio_net *net;
+
+ net = kzalloc(sizeof(struct rio_net), GFP_KERNEL);
+ if (net) {
+ INIT_LIST_HEAD(&net->node);
+ INIT_LIST_HEAD(&net->devices);
+ INIT_LIST_HEAD(&net->switches);
+ INIT_LIST_HEAD(&net->mports);
+ mport->net = net;
+ }
+ return net;
+}
+EXPORT_SYMBOL_GPL(rio_alloc_net);
+
+int rio_add_net(struct rio_net *net)
+{
+ int err;
+
+ err = device_register(&net->dev);
+ if (err)
+ return err;
+ spin_lock(&rio_global_list_lock);
+ list_add_tail(&net->node, &rio_nets);
+ spin_unlock(&rio_global_list_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rio_add_net);
+
+void rio_free_net(struct rio_net *net)
+{
+ spin_lock(&rio_global_list_lock);
+ if (!list_empty(&net->node))
+ list_del(&net->node);
+ spin_unlock(&rio_global_list_lock);
+ if (net->release)
+ net->release(net);
+ device_unregister(&net->dev);
+}
+EXPORT_SYMBOL_GPL(rio_free_net);
+
+/**
+ * rio_local_set_device_id - Set the base/extended device id for a port
+ * @port: RIO master port
+ * @did: Device ID value to be written
+ *
+ * Writes the base/extended device id from a device.
+ */
+void rio_local_set_device_id(struct rio_mport *port, u16 did)
+{
+ rio_local_write_config_32(port, RIO_DID_CSR,
+ RIO_SET_DID(port->sys_size, did));
+}
+EXPORT_SYMBOL_GPL(rio_local_set_device_id);
+
+/**
* rio_add_device- Adds a RIO device to the device model
* @rdev: RIO device
*
@@ -79,12 +177,19 @@ int rio_add_device(struct rio_dev *rdev)
{
int err;
- err = device_add(&rdev->dev);
+ atomic_set(&rdev->state, RIO_DEVICE_RUNNING);
+ err = device_register(&rdev->dev);
if (err)
return err;
spin_lock(&rio_global_list_lock);
list_add_tail(&rdev->global_list, &rio_devices);
+ if (rdev->net) {
+ list_add_tail(&rdev->net_list, &rdev->net->devices);
+ if (rdev->pef & RIO_PEF_SWITCH)
+ list_add_tail(&rdev->rswitch->node,
+ &rdev->net->switches);
+ }
spin_unlock(&rio_global_list_lock);
rio_create_sysfs_dev_files(rdev);
@@ -93,6 +198,33 @@ int rio_add_device(struct rio_dev *rdev)
}
EXPORT_SYMBOL_GPL(rio_add_device);
+/*
+ * rio_del_device - removes a RIO device from the device model
+ * @rdev: RIO device
+ * @state: device state to set during removal process
+ *
+ * Removes the RIO device to the kernel device list and subsystem's device list.
+ * Clears sysfs entries for the removed device.
+ */
+void rio_del_device(struct rio_dev *rdev, enum rio_device_state state)
+{
+ pr_debug("RIO: %s: removing %s\n", __func__, rio_name(rdev));
+ atomic_set(&rdev->state, state);
+ spin_lock(&rio_global_list_lock);
+ list_del(&rdev->global_list);
+ if (rdev->net) {
+ list_del(&rdev->net_list);
+ if (rdev->pef & RIO_PEF_SWITCH) {
+ list_del(&rdev->rswitch->node);
+ kfree(rdev->rswitch->route_table);
+ }
+ }
+ spin_unlock(&rio_global_list_lock);
+ rio_remove_sysfs_dev_files(rdev);
+ device_unregister(&rdev->dev);
+}
+EXPORT_SYMBOL_GPL(rio_del_device);
+
/**
* rio_request_inb_mbox - request inbound mailbox service
* @mport: RIO master port from which to allocate the mailbox resource
@@ -258,7 +390,9 @@ rio_setup_inb_dbell(struct rio_mport *mport, void *dev_id, struct resource *res,
dbell->dinb = dinb;
dbell->dev_id = dev_id;
+ mutex_lock(&mport->lock);
list_add_tail(&dbell->node, &mport->dbells);
+ mutex_unlock(&mport->lock);
out:
return rc;
@@ -322,12 +456,15 @@ int rio_release_inb_dbell(struct rio_mport *mport, u16 start, u16 end)
int rc = 0, found = 0;
struct rio_dbell *dbell;
+ mutex_lock(&mport->lock);
list_for_each_entry(dbell, &mport->dbells, node) {
if ((dbell->res->start == start) && (dbell->res->end == end)) {
+ list_del(&dbell->node);
found = 1;
break;
}
}
+ mutex_unlock(&mport->lock);
/* If we can't find an exact match, fail */
if (!found) {
@@ -335,9 +472,6 @@ int rio_release_inb_dbell(struct rio_mport *mport, u16 start, u16 end)
goto out;
}
- /* Delete from list */
- list_del(&dbell->node);
-
/* Release the doorbell resource */
rc = release_resource(dbell->res);
@@ -394,7 +528,71 @@ int rio_release_outb_dbell(struct rio_dev *rdev, struct resource *res)
}
/**
- * rio_request_inb_pwrite - request inbound port-write message service
+ * rio_add_mport_pw_handler - add port-write message handler into the list
+ * of mport specific pw handlers
+ * @mport: RIO master port to bind the portwrite callback
+ * @context: Handler specific context to pass on event
+ * @pwcback: Callback to execute when portwrite is received
+ *
+ * Returns 0 if the request has been satisfied.
+ */
+int rio_add_mport_pw_handler(struct rio_mport *mport, void *context,
+ int (*pwcback)(struct rio_mport *mport,
+ void *context, union rio_pw_msg *msg, int step))
+{
+ int rc = 0;
+ struct rio_pwrite *pwrite;
+
+ pwrite = kzalloc(sizeof(struct rio_pwrite), GFP_KERNEL);
+ if (!pwrite) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ pwrite->pwcback = pwcback;
+ pwrite->context = context;
+ mutex_lock(&mport->lock);
+ list_add_tail(&pwrite->node, &mport->pwrites);
+ mutex_unlock(&mport->lock);
+out:
+ return rc;
+}
+EXPORT_SYMBOL_GPL(rio_add_mport_pw_handler);
+
+/**
+ * rio_del_mport_pw_handler - remove port-write message handler from the list
+ * of mport specific pw handlers
+ * @mport: RIO master port to bind the portwrite callback
+ * @context: Registered handler specific context to pass on event
+ * @pwcback: Registered callback function
+ *
+ * Returns 0 if the request has been satisfied.
+ */
+int rio_del_mport_pw_handler(struct rio_mport *mport, void *context,
+ int (*pwcback)(struct rio_mport *mport,
+ void *context, union rio_pw_msg *msg, int step))
+{
+ int rc = -EINVAL;
+ struct rio_pwrite *pwrite;
+
+ mutex_lock(&mport->lock);
+ list_for_each_entry(pwrite, &mport->pwrites, node) {
+ if (pwrite->pwcback == pwcback && pwrite->context == context) {
+ list_del(&pwrite->node);
+ kfree(pwrite);
+ rc = 0;
+ break;
+ }
+ }
+ mutex_unlock(&mport->lock);
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(rio_del_mport_pw_handler);
+
+/**
+ * rio_request_inb_pwrite - request inbound port-write message service for
+ * specific RapidIO device
* @rdev: RIO device to which register inbound port-write callback routine
* @pwcback: Callback routine to execute when port-write is received
*
@@ -419,6 +617,7 @@ EXPORT_SYMBOL_GPL(rio_request_inb_pwrite);
/**
* rio_release_inb_pwrite - release inbound port-write message service
+ * associated with specific RapidIO device
* @rdev: RIO device which registered for inbound port-write callback
*
* Removes callback from the rio_dev structure. Returns 0 if the request
@@ -440,6 +639,24 @@ int rio_release_inb_pwrite(struct rio_dev *rdev)
EXPORT_SYMBOL_GPL(rio_release_inb_pwrite);
/**
+ * rio_pw_enable - Enables/disables port-write handling by a master port
+ * @port: Master port associated with port-write handling
+ * @enable: 1=enable, 0=disable
+ */
+void rio_pw_enable(struct rio_mport *mport, int enable)
+{
+ if (mport->ops->pwenable) {
+ mutex_lock(&mport->lock);
+
+ if ((enable && ++mport->pwe_refcnt == 1) ||
+ (!enable && mport->pwe_refcnt && --mport->pwe_refcnt == 0))
+ mport->ops->pwenable(mport, enable);
+ mutex_unlock(&mport->lock);
+ }
+}
+EXPORT_SYMBOL_GPL(rio_pw_enable);
+
+/**
* rio_map_inb_region -- Map inbound memory region.
* @mport: Master port.
* @local: physical address of memory region to be mapped
@@ -483,6 +700,56 @@ void rio_unmap_inb_region(struct rio_mport *mport, dma_addr_t lstart)
EXPORT_SYMBOL_GPL(rio_unmap_inb_region);
/**
+ * rio_map_outb_region -- Map outbound memory region.
+ * @mport: Master port.
+ * @destid: destination id window points to
+ * @rbase: RIO base address window translates to
+ * @size: Size of the memory region
+ * @rflags: Flags for mapping.
+ * @local: physical address of memory region mapped
+ *
+ * Return: 0 -- Success.
+ *
+ * This function will create the mapping from RIO space to local memory.
+ */
+int rio_map_outb_region(struct rio_mport *mport, u16 destid, u64 rbase,
+ u32 size, u32 rflags, dma_addr_t *local)
+{
+ int rc = 0;
+ unsigned long flags;
+
+ if (!mport->ops->map_outb)
+ return -ENODEV;
+
+ spin_lock_irqsave(&rio_mmap_lock, flags);
+ rc = mport->ops->map_outb(mport, destid, rbase, size,
+ rflags, local);
+ spin_unlock_irqrestore(&rio_mmap_lock, flags);
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(rio_map_outb_region);
+
+/**
+ * rio_unmap_inb_region -- Unmap the inbound memory region
+ * @mport: Master port
+ * @destid: destination id mapping points to
+ * @rstart: RIO base address window translates to
+ */
+void rio_unmap_outb_region(struct rio_mport *mport, u16 destid, u64 rstart)
+{
+ unsigned long flags;
+
+ if (!mport->ops->unmap_outb)
+ return;
+
+ spin_lock_irqsave(&rio_mmap_lock, flags);
+ mport->ops->unmap_outb(mport, destid, rstart);
+ spin_unlock_irqrestore(&rio_mmap_lock, flags);
+}
+EXPORT_SYMBOL_GPL(rio_unmap_outb_region);
+
+/**
* rio_mport_get_physefb - Helper function that returns register offset
* for Physical Layer Extended Features Block.
* @port: Master port to issue transaction
@@ -864,52 +1131,66 @@ rd_err:
}
/**
- * rio_inb_pwrite_handler - process inbound port-write message
+ * rio_inb_pwrite_handler - inbound port-write message handler
+ * @mport: mport device associated with port-write
* @pw_msg: pointer to inbound port-write message
*
* Processes an inbound port-write message. Returns 0 if the request
* has been satisfied.
*/
-int rio_inb_pwrite_handler(union rio_pw_msg *pw_msg)
+int rio_inb_pwrite_handler(struct rio_mport *mport, union rio_pw_msg *pw_msg)
{
struct rio_dev *rdev;
u32 err_status, em_perrdet, em_ltlerrdet;
int rc, portnum;
-
- rdev = rio_get_comptag((pw_msg->em.comptag & RIO_CTAG_UDEVID), NULL);
- if (rdev == NULL) {
- /* Device removed or enumeration error */
- pr_debug("RIO: %s No matching device for CTag 0x%08x\n",
- __func__, pw_msg->em.comptag);
- return -EIO;
- }
-
- pr_debug("RIO: Port-Write message from %s\n", rio_name(rdev));
+ struct rio_pwrite *pwrite;
#ifdef DEBUG_PW
{
- u32 i;
- for (i = 0; i < RIO_PW_MSG_SIZE/sizeof(u32);) {
+ u32 i;
+
+ pr_debug("%s: PW to mport_%d:\n", __func__, mport->id);
+ for (i = 0; i < RIO_PW_MSG_SIZE / sizeof(u32); i = i + 4) {
pr_debug("0x%02x: %08x %08x %08x %08x\n",
- i*4, pw_msg->raw[i], pw_msg->raw[i + 1],
- pw_msg->raw[i + 2], pw_msg->raw[i + 3]);
- i += 4;
- }
+ i * 4, pw_msg->raw[i], pw_msg->raw[i + 1],
+ pw_msg->raw[i + 2], pw_msg->raw[i + 3]);
+ }
}
#endif
- /* Call an external service function (if such is registered
- * for this device). This may be the service for endpoints that send
- * device-specific port-write messages. End-point messages expected
- * to be handled completely by EP specific device driver.
+ rdev = rio_get_comptag((pw_msg->em.comptag & RIO_CTAG_UDEVID), NULL);
+ if (rdev) {
+ pr_debug("RIO: Port-Write message from %s\n", rio_name(rdev));
+ } else {
+ pr_debug("RIO: %s No matching device for CTag 0x%08x\n",
+ __func__, pw_msg->em.comptag);
+ }
+
+ /* Call a device-specific handler (if it is registered for the device).
+ * This may be the service for endpoints that send device-specific
+ * port-write messages. End-point messages expected to be handled
+ * completely by EP specific device driver.
* For switches rc==0 signals that no standard processing required.
*/
- if (rdev->pwcback != NULL) {
+ if (rdev && rdev->pwcback) {
rc = rdev->pwcback(rdev, pw_msg, 0);
if (rc == 0)
return 0;
}
+ mutex_lock(&mport->lock);
+ list_for_each_entry(pwrite, &mport->pwrites, node)
+ pwrite->pwcback(mport, pwrite->context, pw_msg, 0);
+ mutex_unlock(&mport->lock);
+
+ if (!rdev)
+ return 0;
+
+ /*
+ * FIXME: The code below stays as it was before for now until we decide
+ * how to do default PW handling in combination with per-mport callbacks
+ */
+
portnum = pw_msg->em.is_port & 0xFF;
/* Check if device and route to it are functional:
@@ -1909,32 +2190,31 @@ static int rio_get_hdid(int index)
return hdid[index];
}
-int rio_register_mport(struct rio_mport *port)
+int rio_mport_initialize(struct rio_mport *mport)
{
- struct rio_scan_node *scan = NULL;
- int res = 0;
-
if (next_portid >= RIO_MAX_MPORTS) {
pr_err("RIO: reached specified max number of mports\n");
- return 1;
+ return -ENODEV;
}
- port->id = next_portid++;
- port->host_deviceid = rio_get_hdid(port->id);
- port->nscan = NULL;
+ atomic_set(&mport->state, RIO_DEVICE_INITIALIZING);
+ mport->id = next_portid++;
+ mport->host_deviceid = rio_get_hdid(mport->id);
+ mport->nscan = NULL;
+ mutex_init(&mport->lock);
+ mport->pwe_refcnt = 0;
+ INIT_LIST_HEAD(&mport->pwrites);
- dev_set_name(&port->dev, "rapidio%d", port->id);
- port->dev.class = &rio_mport_class;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rio_mport_initialize);
- res = device_register(&port->dev);
- if (res)
- dev_err(&port->dev, "RIO: mport%d registration failed ERR=%d\n",
- port->id, res);
- else
- dev_dbg(&port->dev, "RIO: mport%d registered\n", port->id);
+int rio_register_mport(struct rio_mport *port)
+{
+ struct rio_scan_node *scan = NULL;
+ int res = 0;
mutex_lock(&rio_mport_list_lock);
- list_add_tail(&port->node, &rio_mports);
/*
* Check if there are any registered enumeration/discovery operations
@@ -1948,12 +2228,73 @@ int rio_register_mport(struct rio_mport *port)
break;
}
}
+
+ list_add_tail(&port->node, &rio_mports);
mutex_unlock(&rio_mport_list_lock);
+ dev_set_name(&port->dev, "rapidio%d", port->id);
+ port->dev.class = &rio_mport_class;
+ atomic_set(&port->state, RIO_DEVICE_RUNNING);
+
+ res = device_register(&port->dev);
+ if (res)
+ dev_err(&port->dev, "RIO: mport%d registration failed ERR=%d\n",
+ port->id, res);
+ else
+ dev_dbg(&port->dev, "RIO: registered mport%d\n", port->id);
+
+ return res;
+}
+EXPORT_SYMBOL_GPL(rio_register_mport);
+
+static int rio_mport_cleanup_callback(struct device *dev, void *data)
+{
+ struct rio_dev *rdev = to_rio_dev(dev);
+
+ if (dev->bus == &rio_bus_type)
+ rio_del_device(rdev, RIO_DEVICE_SHUTDOWN);
+ return 0;
+}
+
+static int rio_net_remove_children(struct rio_net *net)
+{
+ /*
+ * Unregister all RapidIO devices residing on this net (this will
+ * invoke notification of registered subsystem interfaces as well).
+ */
+ device_for_each_child(&net->dev, NULL, rio_mport_cleanup_callback);
+ return 0;
+}
+
+int rio_unregister_mport(struct rio_mport *port)
+{
pr_debug("RIO: %s %s id=%d\n", __func__, port->name, port->id);
+
+ /* Transition mport to the SHUTDOWN state */
+ if (atomic_cmpxchg(&port->state,
+ RIO_DEVICE_RUNNING,
+ RIO_DEVICE_SHUTDOWN) != RIO_DEVICE_RUNNING) {
+ pr_err("RIO: %s unexpected state transition for mport %s\n",
+ __func__, port->name);
+ }
+
+ if (port->net && port->net->hport == port) {
+ rio_net_remove_children(port->net);
+ rio_free_net(port->net);
+ }
+
+ /*
+ * Unregister all RapidIO devices attached to this mport (this will
+ * invoke notification of registered subsystem interfaces as well).
+ */
+ mutex_lock(&rio_mport_list_lock);
+ list_del(&port->node);
+ mutex_unlock(&rio_mport_list_lock);
+ device_unregister(&port->dev);
+
return 0;
}
-EXPORT_SYMBOL_GPL(rio_register_mport);
+EXPORT_SYMBOL_GPL(rio_unregister_mport);
EXPORT_SYMBOL_GPL(rio_local_get_device_id);
EXPORT_SYMBOL_GPL(rio_get_device);
diff --git a/drivers/rapidio/rio.h b/drivers/rapidio/rio.h
index 2d0550e08ea2..625d09add001 100644
--- a/drivers/rapidio/rio.h
+++ b/drivers/rapidio/rio.h
@@ -28,6 +28,7 @@ extern u32 rio_mport_get_efb(struct rio_mport *port, int local, u16 destid,
extern int rio_mport_chk_dev_access(struct rio_mport *mport, u16 destid,
u8 hopcount);
extern int rio_create_sysfs_dev_files(struct rio_dev *rdev);
+extern void rio_remove_sysfs_dev_files(struct rio_dev *rdev);
extern int rio_lock_device(struct rio_mport *port, u16 destid,
u8 hopcount, int wait_ms);
extern int rio_unlock_device(struct rio_mport *port, u16 destid, u8 hopcount);
@@ -38,7 +39,11 @@ extern int rio_route_get_entry(struct rio_dev *rdev, u16 table,
extern int rio_route_clr_table(struct rio_dev *rdev, u16 table, int lock);
extern int rio_set_port_lockout(struct rio_dev *rdev, u32 pnum, int lock);
extern struct rio_dev *rio_get_comptag(u32 comp_tag, struct rio_dev *from);
+extern struct rio_net *rio_alloc_net(struct rio_mport *mport);
+extern int rio_add_net(struct rio_net *net);
+extern void rio_free_net(struct rio_net *net);
extern int rio_add_device(struct rio_dev *rdev);
+extern void rio_del_device(struct rio_dev *rdev, enum rio_device_state state);
extern int rio_enable_rx_tx_port(struct rio_mport *port, int local, u16 destid,
u8 hopcount, u8 port_num);
extern int rio_register_scan(int mport_id, struct rio_scan *scan_ops);
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
index 29c325d5c4e6..8fdd7931fbdb 100644
--- a/drivers/staging/lustre/lustre/llite/llite_internal.h
+++ b/drivers/staging/lustre/lustre/llite/llite_internal.h
@@ -647,7 +647,7 @@ static inline int ll_need_32bit_api(struct ll_sb_info *sbi)
#if BITS_PER_LONG == 32
return 1;
#elif defined(CONFIG_COMPAT)
- return unlikely(is_compat_task() || (sbi->ll_flags & LL_SBI_32BIT_API));
+ return unlikely(in_compat_syscall() || (sbi->ll_flags & LL_SBI_32BIT_API));
#else
return unlikely(sbi->ll_flags & LL_SBI_32BIT_API);
#endif
diff --git a/drivers/usb/common/common.c b/drivers/usb/common/common.c
index 49fbfe8b0f24..e3d01619d6b3 100644
--- a/drivers/usb/common/common.c
+++ b/drivers/usb/common/common.c
@@ -65,18 +65,15 @@ EXPORT_SYMBOL_GPL(usb_speed_string);
enum usb_device_speed usb_get_maximum_speed(struct device *dev)
{
const char *maximum_speed;
- int err;
- int i;
+ int ret;
- err = device_property_read_string(dev, "maximum-speed", &maximum_speed);
- if (err < 0)
+ ret = device_property_read_string(dev, "maximum-speed", &maximum_speed);
+ if (ret < 0)
return USB_SPEED_UNKNOWN;
- for (i = 0; i < ARRAY_SIZE(speed_names); i++)
- if (strcmp(maximum_speed, speed_names[i]) == 0)
- return i;
+ ret = match_string(speed_names, ARRAY_SIZE(speed_names), maximum_speed);
- return USB_SPEED_UNKNOWN;
+ return (ret < 0) ? USB_SPEED_UNKNOWN : ret;
}
EXPORT_SYMBOL_GPL(usb_get_maximum_speed);
@@ -110,13 +107,10 @@ static const char *const usb_dr_modes[] = {
static enum usb_dr_mode usb_get_dr_mode_from_string(const char *str)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(usb_dr_modes); i++)
- if (!strcmp(usb_dr_modes[i], str))
- return i;
+ int ret;
- return USB_DR_MODE_UNKNOWN;
+ ret = match_string(usb_dr_modes, ARRAY_SIZE(usb_dr_modes), str);
+ return (ret < 0) ? USB_DR_MODE_UNKNOWN : ret;
}
enum usb_dr_mode usb_get_dr_mode(struct device *dev)
diff --git a/drivers/video/fbdev/acornfb.c b/drivers/video/fbdev/acornfb.c
index a305caea58ee..fb75b7e5a19a 100644
--- a/drivers/video/fbdev/acornfb.c
+++ b/drivers/video/fbdev/acornfb.c
@@ -1040,8 +1040,8 @@ static int acornfb_probe(struct platform_device *dev)
* for the framebuffer if we are not using
* VRAM.
*/
- base = dma_alloc_writecombine(current_par.dev, size, &handle,
- GFP_KERNEL);
+ base = dma_alloc_wc(current_par.dev, size, &handle,
+ GFP_KERNEL);
if (base == NULL) {
printk(KERN_ERR "acornfb: unable to allocate screen "
"memory\n");
diff --git a/drivers/video/fbdev/amba-clcd-versatile.c b/drivers/video/fbdev/amba-clcd-versatile.c
index 7a8afcd4573e..a8a22daa3f9d 100644
--- a/drivers/video/fbdev/amba-clcd-versatile.c
+++ b/drivers/video/fbdev/amba-clcd-versatile.c
@@ -154,8 +154,8 @@ int versatile_clcd_setup_dma(struct clcd_fb *fb, unsigned long framesize)
{
dma_addr_t dma;
- fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev, framesize,
- &dma, GFP_KERNEL);
+ fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, framesize, &dma,
+ GFP_KERNEL);
if (!fb->fb.screen_base) {
pr_err("CLCD: unable to map framebuffer\n");
return -ENOMEM;
@@ -169,14 +169,12 @@ int versatile_clcd_setup_dma(struct clcd_fb *fb, unsigned long framesize)
int versatile_clcd_mmap_dma(struct clcd_fb *fb, struct vm_area_struct *vma)
{
- return dma_mmap_writecombine(&fb->dev->dev, vma,
- fb->fb.screen_base,
- fb->fb.fix.smem_start,
- fb->fb.fix.smem_len);
+ return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+ fb->fb.fix.smem_start, fb->fb.fix.smem_len);
}
void versatile_clcd_remove_dma(struct clcd_fb *fb)
{
- dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len,
- fb->fb.screen_base, fb->fb.fix.smem_start);
+ dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
+ fb->fb.fix.smem_start);
}
diff --git a/drivers/video/fbdev/amba-clcd.c b/drivers/video/fbdev/amba-clcd.c
index 9362424c2340..fe274b5851c7 100644
--- a/drivers/video/fbdev/amba-clcd.c
+++ b/drivers/video/fbdev/amba-clcd.c
@@ -774,8 +774,8 @@ static int clcdfb_of_dma_setup(struct clcd_fb *fb)
static int clcdfb_of_dma_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
{
- return dma_mmap_writecombine(&fb->dev->dev, vma, fb->fb.screen_base,
- fb->fb.fix.smem_start, fb->fb.fix.smem_len);
+ return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+ fb->fb.fix.smem_start, fb->fb.fix.smem_len);
}
static void clcdfb_of_dma_remove(struct clcd_fb *fb)
diff --git a/drivers/video/fbdev/atmel_lcdfb.c b/drivers/video/fbdev/atmel_lcdfb.c
index 19eb42b57d87..56c60e67316a 100644
--- a/drivers/video/fbdev/atmel_lcdfb.c
+++ b/drivers/video/fbdev/atmel_lcdfb.c
@@ -414,8 +414,8 @@ static inline void atmel_lcdfb_free_video_memory(struct atmel_lcdfb_info *sinfo)
{
struct fb_info *info = sinfo->info;
- dma_free_writecombine(info->device, info->fix.smem_len,
- info->screen_base, info->fix.smem_start);
+ dma_free_wc(info->device, info->fix.smem_len, info->screen_base,
+ info->fix.smem_start);
}
/**
@@ -435,8 +435,9 @@ static int atmel_lcdfb_alloc_video_memory(struct atmel_lcdfb_info *sinfo)
* ((var->bits_per_pixel + 7) / 8));
info->fix.smem_len = max(smem_len, sinfo->smem_len);
- info->screen_base = dma_alloc_writecombine(info->device, info->fix.smem_len,
- (dma_addr_t *)&info->fix.smem_start, GFP_KERNEL);
+ info->screen_base = dma_alloc_wc(info->device, info->fix.smem_len,
+ (dma_addr_t *)&info->fix.smem_start,
+ GFP_KERNEL);
if (!info->screen_base) {
return -ENOMEM;
diff --git a/drivers/video/fbdev/ep93xx-fb.c b/drivers/video/fbdev/ep93xx-fb.c
index 5b1081030cbb..75f0db25d19f 100644
--- a/drivers/video/fbdev/ep93xx-fb.c
+++ b/drivers/video/fbdev/ep93xx-fb.c
@@ -316,9 +316,8 @@ static int ep93xxfb_mmap(struct fb_info *info, struct vm_area_struct *vma)
unsigned int offset = vma->vm_pgoff << PAGE_SHIFT;
if (offset < info->fix.smem_len) {
- return dma_mmap_writecombine(info->dev, vma, info->screen_base,
- info->fix.smem_start,
- info->fix.smem_len);
+ return dma_mmap_wc(info->dev, vma, info->screen_base,
+ info->fix.smem_start, info->fix.smem_len);
}
return -EINVAL;
@@ -428,8 +427,7 @@ static int ep93xxfb_alloc_videomem(struct fb_info *info)
/* Maximum 16bpp -> used memory is maximum x*y*2 bytes */
fb_size = EP93XXFB_MAX_XRES * EP93XXFB_MAX_YRES * 2;
- virt_addr = dma_alloc_writecombine(info->dev, fb_size,
- &phys_addr, GFP_KERNEL);
+ virt_addr = dma_alloc_wc(info->dev, fb_size, &phys_addr, GFP_KERNEL);
if (!virt_addr)
return -ENOMEM;
diff --git a/drivers/video/fbdev/gbefb.c b/drivers/video/fbdev/gbefb.c
index b63d55f481fa..1a242b1338e9 100644
--- a/drivers/video/fbdev/gbefb.c
+++ b/drivers/video/fbdev/gbefb.c
@@ -1185,8 +1185,8 @@ static int gbefb_probe(struct platform_device *p_dev)
} else {
/* try to allocate memory with the classical allocator
* this has high chance to fail on low memory machines */
- gbe_mem = dma_alloc_writecombine(NULL, gbe_mem_size,
- &gbe_dma_addr, GFP_KERNEL);
+ gbe_mem = dma_alloc_wc(NULL, gbe_mem_size, &gbe_dma_addr,
+ GFP_KERNEL);
if (!gbe_mem) {
printk(KERN_ERR "gbefb: couldn't allocate framebuffer memory\n");
ret = -ENOMEM;
@@ -1238,7 +1238,7 @@ static int gbefb_probe(struct platform_device *p_dev)
out_gbe_unmap:
arch_phys_wc_del(par->wc_cookie);
if (gbe_dma_addr)
- dma_free_writecombine(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys);
+ dma_free_wc(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys);
out_tiles_free:
dma_free_coherent(NULL, GBE_TLB_SIZE * sizeof(uint16_t),
(void *)gbe_tiles.cpu, gbe_tiles.dma);
@@ -1259,7 +1259,7 @@ static int gbefb_remove(struct platform_device* p_dev)
gbe_turn_off();
arch_phys_wc_del(par->wc_cookie);
if (gbe_dma_addr)
- dma_free_writecombine(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys);
+ dma_free_wc(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys);
dma_free_coherent(NULL, GBE_TLB_SIZE * sizeof(uint16_t),
(void *)gbe_tiles.cpu, gbe_tiles.dma);
release_mem_region(GBE_BASE, sizeof(struct sgi_gbe));
diff --git a/drivers/video/fbdev/imxfb.c b/drivers/video/fbdev/imxfb.c
index cee88603efc9..465b3387c549 100644
--- a/drivers/video/fbdev/imxfb.c
+++ b/drivers/video/fbdev/imxfb.c
@@ -922,8 +922,8 @@ static int imxfb_probe(struct platform_device *pdev)
}
fbi->map_size = PAGE_ALIGN(info->fix.smem_len);
- info->screen_base = dma_alloc_writecombine(&pdev->dev, fbi->map_size,
- &fbi->map_dma, GFP_KERNEL);
+ info->screen_base = dma_alloc_wc(&pdev->dev, fbi->map_size,
+ &fbi->map_dma, GFP_KERNEL);
if (!info->screen_base) {
dev_err(&pdev->dev, "Failed to allocate video RAM: %d\n", ret);
@@ -990,8 +990,8 @@ failed_cmap:
if (pdata && pdata->exit)
pdata->exit(fbi->pdev);
failed_platform_init:
- dma_free_writecombine(&pdev->dev, fbi->map_size, info->screen_base,
- fbi->map_dma);
+ dma_free_wc(&pdev->dev, fbi->map_size, info->screen_base,
+ fbi->map_dma);
failed_map:
iounmap(fbi->regs);
failed_ioremap:
@@ -1026,8 +1026,8 @@ static int imxfb_remove(struct platform_device *pdev)
kfree(info->pseudo_palette);
framebuffer_release(info);
- dma_free_writecombine(&pdev->dev, fbi->map_size, info->screen_base,
- fbi->map_dma);
+ dma_free_wc(&pdev->dev, fbi->map_size, info->screen_base,
+ fbi->map_dma);
iounmap(fbi->regs);
release_mem_region(res->start, resource_size(res));
diff --git a/drivers/video/fbdev/mx3fb.c b/drivers/video/fbdev/mx3fb.c
index 7947634ee6b0..f91b1db262b0 100644
--- a/drivers/video/fbdev/mx3fb.c
+++ b/drivers/video/fbdev/mx3fb.c
@@ -1336,9 +1336,8 @@ static int mx3fb_map_video_memory(struct fb_info *fbi, unsigned int mem_len,
int retval = 0;
dma_addr_t addr;
- fbi->screen_base = dma_alloc_writecombine(fbi->device,
- mem_len,
- &addr, GFP_DMA | GFP_KERNEL);
+ fbi->screen_base = dma_alloc_wc(fbi->device, mem_len, &addr,
+ GFP_DMA | GFP_KERNEL);
if (!fbi->screen_base) {
dev_err(fbi->device, "Cannot allocate %u bytes framebuffer memory\n",
@@ -1378,8 +1377,8 @@ err0:
*/
static int mx3fb_unmap_video_memory(struct fb_info *fbi)
{
- dma_free_writecombine(fbi->device, fbi->fix.smem_len,
- fbi->screen_base, fbi->fix.smem_start);
+ dma_free_wc(fbi->device, fbi->fix.smem_len, fbi->screen_base,
+ fbi->fix.smem_start);
fbi->screen_base = NULL;
mutex_lock(&fbi->mm_lock);
diff --git a/drivers/video/fbdev/nuc900fb.c b/drivers/video/fbdev/nuc900fb.c
index 389fa2cbb713..6680edae4696 100644
--- a/drivers/video/fbdev/nuc900fb.c
+++ b/drivers/video/fbdev/nuc900fb.c
@@ -396,8 +396,8 @@ static int nuc900fb_map_video_memory(struct fb_info *info)
dev_dbg(fbi->dev, "nuc900fb_map_video_memory(fbi=%p) map_size %lu\n",
fbi, map_size);
- info->screen_base = dma_alloc_writecombine(fbi->dev, map_size,
- &map_dma, GFP_KERNEL);
+ info->screen_base = dma_alloc_wc(fbi->dev, map_size, &map_dma,
+ GFP_KERNEL);
if (!info->screen_base)
return -ENOMEM;
@@ -411,8 +411,8 @@ static int nuc900fb_map_video_memory(struct fb_info *info)
static inline void nuc900fb_unmap_video_memory(struct fb_info *info)
{
struct nuc900fb_info *fbi = info->par;
- dma_free_writecombine(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
- info->screen_base, info->fix.smem_start);
+ dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
+ info->screen_base, info->fix.smem_start);
}
static irqreturn_t nuc900fb_irqhandler(int irq, void *dev_id)
diff --git a/drivers/video/fbdev/omap/lcdc.c b/drivers/video/fbdev/omap/lcdc.c
index 6efa2591eaa8..e3d9b9ea5498 100644
--- a/drivers/video/fbdev/omap/lcdc.c
+++ b/drivers/video/fbdev/omap/lcdc.c
@@ -612,8 +612,8 @@ static void lcdc_dma_handler(u16 status, void *data)
static int alloc_palette_ram(void)
{
- lcdc.palette_virt = dma_alloc_writecombine(lcdc.fbdev->dev,
- MAX_PALETTE_SIZE, &lcdc.palette_phys, GFP_KERNEL);
+ lcdc.palette_virt = dma_alloc_wc(lcdc.fbdev->dev, MAX_PALETTE_SIZE,
+ &lcdc.palette_phys, GFP_KERNEL);
if (lcdc.palette_virt == NULL) {
dev_err(lcdc.fbdev->dev, "failed to alloc palette memory\n");
return -ENOMEM;
@@ -625,8 +625,8 @@ static int alloc_palette_ram(void)
static void free_palette_ram(void)
{
- dma_free_writecombine(lcdc.fbdev->dev, MAX_PALETTE_SIZE,
- lcdc.palette_virt, lcdc.palette_phys);
+ dma_free_wc(lcdc.fbdev->dev, MAX_PALETTE_SIZE, lcdc.palette_virt,
+ lcdc.palette_phys);
}
static int alloc_fbmem(struct omapfb_mem_region *region)
@@ -642,8 +642,8 @@ static int alloc_fbmem(struct omapfb_mem_region *region)
if (region->size > frame_size)
frame_size = region->size;
lcdc.vram_size = frame_size;
- lcdc.vram_virt = dma_alloc_writecombine(lcdc.fbdev->dev,
- lcdc.vram_size, &lcdc.vram_phys, GFP_KERNEL);
+ lcdc.vram_virt = dma_alloc_wc(lcdc.fbdev->dev, lcdc.vram_size,
+ &lcdc.vram_phys, GFP_KERNEL);
if (lcdc.vram_virt == NULL) {
dev_err(lcdc.fbdev->dev, "unable to allocate FB DMA memory\n");
return -ENOMEM;
@@ -660,8 +660,8 @@ static int alloc_fbmem(struct omapfb_mem_region *region)
static void free_fbmem(void)
{
- dma_free_writecombine(lcdc.fbdev->dev, lcdc.vram_size,
- lcdc.vram_virt, lcdc.vram_phys);
+ dma_free_wc(lcdc.fbdev->dev, lcdc.vram_size, lcdc.vram_virt,
+ lcdc.vram_phys);
}
static int setup_fbmem(struct omapfb_mem_desc *req_md)
diff --git a/drivers/video/fbdev/pxa168fb.c b/drivers/video/fbdev/pxa168fb.c
index efb57c059997..def3a501acd6 100644
--- a/drivers/video/fbdev/pxa168fb.c
+++ b/drivers/video/fbdev/pxa168fb.c
@@ -680,8 +680,8 @@ static int pxa168fb_probe(struct platform_device *pdev)
*/
info->fix.smem_len = PAGE_ALIGN(DEFAULT_FB_SIZE);
- info->screen_base = dma_alloc_writecombine(fbi->dev, info->fix.smem_len,
- &fbi->fb_start_dma, GFP_KERNEL);
+ info->screen_base = dma_alloc_wc(fbi->dev, info->fix.smem_len,
+ &fbi->fb_start_dma, GFP_KERNEL);
if (info->screen_base == NULL) {
ret = -ENOMEM;
goto failed_free_info;
@@ -804,8 +804,8 @@ static int pxa168fb_remove(struct platform_device *pdev)
irq = platform_get_irq(pdev, 0);
- dma_free_writecombine(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
- info->screen_base, info->fix.smem_start);
+ dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
+ info->screen_base, info->fix.smem_start);
clk_disable(fbi->clk);
diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c
index 33b2bb315a2a..2c0487f4f805 100644
--- a/drivers/video/fbdev/pxafb.c
+++ b/drivers/video/fbdev/pxafb.c
@@ -2446,8 +2446,8 @@ static int pxafb_remove(struct platform_device *dev)
free_pages_exact(fbi->video_mem, fbi->video_mem_size);
- dma_free_writecombine(&dev->dev, fbi->dma_buff_size,
- fbi->dma_buff, fbi->dma_buff_phys);
+ dma_free_wc(&dev->dev, fbi->dma_buff_size, fbi->dma_buff,
+ fbi->dma_buff_phys);
iounmap(fbi->mmio_base);
diff --git a/drivers/video/fbdev/s3c-fb.c b/drivers/video/fbdev/s3c-fb.c
index f72dd12456f9..7e8e0ae98fdf 100644
--- a/drivers/video/fbdev/s3c-fb.c
+++ b/drivers/video/fbdev/s3c-fb.c
@@ -1105,8 +1105,7 @@ static int s3c_fb_alloc_memory(struct s3c_fb *sfb, struct s3c_fb_win *win)
dev_dbg(sfb->dev, "want %u bytes for window\n", size);
- fbi->screen_base = dma_alloc_writecombine(sfb->dev, size,
- &map_dma, GFP_KERNEL);
+ fbi->screen_base = dma_alloc_wc(sfb->dev, size, &map_dma, GFP_KERNEL);
if (!fbi->screen_base)
return -ENOMEM;
@@ -1131,8 +1130,8 @@ static void s3c_fb_free_memory(struct s3c_fb *sfb, struct s3c_fb_win *win)
struct fb_info *fbi = win->fbinfo;
if (fbi->screen_base)
- dma_free_writecombine(sfb->dev, PAGE_ALIGN(fbi->fix.smem_len),
- fbi->screen_base, fbi->fix.smem_start);
+ dma_free_wc(sfb->dev, PAGE_ALIGN(fbi->fix.smem_len),
+ fbi->screen_base, fbi->fix.smem_start);
}
/**
diff --git a/drivers/video/fbdev/s3c2410fb.c b/drivers/video/fbdev/s3c2410fb.c
index d6704add1601..0dd86be36afb 100644
--- a/drivers/video/fbdev/s3c2410fb.c
+++ b/drivers/video/fbdev/s3c2410fb.c
@@ -645,8 +645,8 @@ static int s3c2410fb_map_video_memory(struct fb_info *info)
dprintk("map_video_memory(fbi=%p) map_size %u\n", fbi, map_size);
- info->screen_base = dma_alloc_writecombine(fbi->dev, map_size,
- &map_dma, GFP_KERNEL);
+ info->screen_base = dma_alloc_wc(fbi->dev, map_size, &map_dma,
+ GFP_KERNEL);
if (info->screen_base) {
/* prevent initial garbage on screen */
@@ -667,8 +667,8 @@ static inline void s3c2410fb_unmap_video_memory(struct fb_info *info)
{
struct s3c2410fb_info *fbi = info->par;
- dma_free_writecombine(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
- info->screen_base, info->fix.smem_start);
+ dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
+ info->screen_base, info->fix.smem_start);
}
static inline void modify_gpio(void __iomem *reg,
diff --git a/drivers/video/fbdev/sa1100fb.c b/drivers/video/fbdev/sa1100fb.c
index dcf774c15889..fc2aaa5aca23 100644
--- a/drivers/video/fbdev/sa1100fb.c
+++ b/drivers/video/fbdev/sa1100fb.c
@@ -567,8 +567,8 @@ static int sa1100fb_mmap(struct fb_info *info,
if (off < info->fix.smem_len) {
vma->vm_pgoff += 1; /* skip over the palette */
- return dma_mmap_writecombine(fbi->dev, vma, fbi->map_cpu,
- fbi->map_dma, fbi->map_size);
+ return dma_mmap_wc(fbi->dev, vma, fbi->map_cpu, fbi->map_dma,
+ fbi->map_size);
}
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
@@ -1099,8 +1099,8 @@ static int sa1100fb_map_video_memory(struct sa1100fb_info *fbi)
* of the framebuffer.
*/
fbi->map_size = PAGE_ALIGN(fbi->fb.fix.smem_len + PAGE_SIZE);
- fbi->map_cpu = dma_alloc_writecombine(fbi->dev, fbi->map_size,
- &fbi->map_dma, GFP_KERNEL);
+ fbi->map_cpu = dma_alloc_wc(fbi->dev, fbi->map_size, &fbi->map_dma,
+ GFP_KERNEL);
if (fbi->map_cpu) {
fbi->fb.screen_base = fbi->map_cpu + PAGE_SIZE;
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 73708acce3ca..979a8317204f 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -37,23 +37,30 @@ config XEN_BALLOON_MEMORY_HOTPLUG
Memory could be hotplugged in following steps:
- 1) dom0: xl mem-max <domU> <maxmem>
+ 1) target domain: ensure that memory auto online policy is in
+ effect by checking /sys/devices/system/memory/auto_online_blocks
+ file (should be 'online').
+
+ 2) control domain: xl mem-max <target-domain> <maxmem>
where <maxmem> is >= requested memory size,
- 2) dom0: xl mem-set <domU> <memory>
+ 3) control domain: xl mem-set <target-domain> <memory>
where <memory> is requested memory size; alternatively memory
could be added by writing proper value to
/sys/devices/system/xen_memory/xen_memory0/target or
- /sys/devices/system/xen_memory/xen_memory0/target_kb on dumU,
+ /sys/devices/system/xen_memory/xen_memory0/target_kb on the
+ target domain.
- 3) domU: for i in /sys/devices/system/memory/memory*/state; do \
- [ "`cat "$i"`" = offline ] && echo online > "$i"; done
+ Alternatively, if memory auto onlining was not requested at step 1
+ the newly added memory can be manually onlined in the target domain
+ by doing the following:
- Memory could be onlined automatically on domU by adding following line to udev rules:
+ for i in /sys/devices/system/memory/memory*/state; do \
+ [ "`cat "$i"`" = offline ] && echo online > "$i"; done
- SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'"
+ or by adding the following line to udev rules:
- In that case step 3 should be omitted.
+ SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'"
config XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
int "Hotplugged memory limit (in GiB) for a PV guest"
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index dc4305b407bf..7c8a2cf16f58 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -338,7 +338,16 @@ static enum bp_state reserve_additional_memory(void)
}
#endif
- rc = add_memory_resource(nid, resource);
+ /*
+ * add_memory_resource() will call online_pages() which in its turn
+ * will call xen_online_page() callback causing deadlock if we don't
+ * release balloon_mutex here. Unlocking here is safe because the
+ * callers drop the mutex before trying again.
+ */
+ mutex_unlock(&balloon_mutex);
+ rc = add_memory_resource(nid, resource, memhp_auto_online);
+ mutex_lock(&balloon_mutex);
+
if (rc) {
pr_warn("Cannot add additional memory (%i)\n", rc);
goto err;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index c37149b929be..f0d268b97d19 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -1,15 +1,11 @@
-/* -*- c -*- ------------------------------------------------------------- *
- *
- * linux/fs/autofs/autofs_i.h
- *
- * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
- * Copyright 2005-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
+ * Copyright 2005-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
+ */
/* Internal header file for autofs */
@@ -35,28 +31,23 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <asm/current.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/* #define DEBUG */
-#define DPRINTK(fmt, ...) \
- pr_debug("pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
-
-#define AUTOFS_WARN(fmt, ...) \
- printk(KERN_WARNING "pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
-
-#define AUTOFS_ERROR(fmt, ...) \
- printk(KERN_ERR "pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
-
-/* Unified info structure. This is pointed to by both the dentry and
- inode structures. Each file in the filesystem has an instance of this
- structure. It holds a reference to the dentry, so dentries are never
- flushed while the file exists. All name lookups are dealt with at the
- dentry level, although the filesystem can interfere in the validation
- process. Readdir is implemented by traversing the dentry lists. */
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__
+
+/*
+ * Unified info structure. This is pointed to by both the dentry and
+ * inode structures. Each file in the filesystem has an instance of this
+ * structure. It holds a reference to the dentry, so dentries are never
+ * flushed while the file exists. All name lookups are dealt with at the
+ * dentry level, although the filesystem can interfere in the validation
+ * process. Readdir is implemented by traversing the dentry lists.
+ */
struct autofs_info {
struct dentry *dentry;
struct inode *inode;
@@ -78,7 +69,7 @@ struct autofs_info {
kgid_t gid;
};
-#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
+#define AUTOFS_INF_EXPIRING (1<<0) /* dentry in the process of expiring */
#define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered
* for expiry, so RCU_walk is
* not permitted
@@ -140,10 +131,11 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
}
/* autofs4_oz_mode(): do we see the man behind the curtain? (The
- processes which do manipulations for us in user space sees the raw
- filesystem without "magic".) */
-
-static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
+ * processes which do manipulations for us in user space sees the raw
+ * filesystem without "magic".)
+ */
+static inline int autofs4_oz_mode(struct autofs_sb_info *sbi)
+{
return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
}
@@ -154,12 +146,12 @@ void autofs4_free_ino(struct autofs_info *);
int is_autofs4_dentry(struct dentry *);
int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
int autofs4_expire_run(struct super_block *, struct vfsmount *,
- struct autofs_sb_info *,
- struct autofs_packet_expire __user *);
+ struct autofs_sb_info *,
+ struct autofs_packet_expire __user *);
int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
struct autofs_sb_info *sbi, int when);
int autofs4_expire_multi(struct super_block *, struct vfsmount *,
- struct autofs_sb_info *, int __user *);
+ struct autofs_sb_info *, int __user *);
struct dentry *autofs4_expire_direct(struct super_block *sb,
struct vfsmount *mnt,
struct autofs_sb_info *sbi, int how);
@@ -224,8 +216,8 @@ static inline int autofs_prepare_pipe(struct file *pipe)
/* Queue management functions */
-int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
-int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
+int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify);
+int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int);
void autofs4_catatonic_mode(struct autofs_sb_info *);
static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
@@ -242,37 +234,37 @@ static inline void __autofs4_add_expiring(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
if (ino) {
if (list_empty(&ino->expiring))
list_add(&ino->expiring, &sbi->expiring_list);
}
- return;
}
static inline void autofs4_add_expiring(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
if (ino) {
spin_lock(&sbi->lookup_lock);
if (list_empty(&ino->expiring))
list_add(&ino->expiring, &sbi->expiring_list);
spin_unlock(&sbi->lookup_lock);
}
- return;
}
static inline void autofs4_del_expiring(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
if (ino) {
spin_lock(&sbi->lookup_lock);
if (!list_empty(&ino->expiring))
list_del_init(&ino->expiring);
spin_unlock(&sbi->lookup_lock);
}
- return;
}
extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index ac7d921ed984..c7fcc7438843 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -72,13 +72,13 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
{
int err = 0;
- if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) ||
- (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) {
- AUTOFS_WARN("ioctl control interface version mismatch: "
- "kernel(%u.%u), user(%u.%u), cmd(%d)",
- AUTOFS_DEV_IOCTL_VERSION_MAJOR,
- AUTOFS_DEV_IOCTL_VERSION_MINOR,
- param->ver_major, param->ver_minor, cmd);
+ if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) ||
+ (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) {
+ pr_warn("ioctl control interface version mismatch: "
+ "kernel(%u.%u), user(%u.%u), cmd(%d)\n",
+ AUTOFS_DEV_IOCTL_VERSION_MAJOR,
+ AUTOFS_DEV_IOCTL_VERSION_MINOR,
+ param->ver_major, param->ver_minor, cmd);
err = -EINVAL;
}
@@ -93,7 +93,8 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
* Copy parameter control struct, including a possible path allocated
* at the end of the struct.
*/
-static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
+static struct autofs_dev_ioctl *
+ copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
{
struct autofs_dev_ioctl tmp, *res;
@@ -116,7 +117,6 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
{
kfree(param);
- return;
}
/*
@@ -129,24 +129,24 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
err = check_dev_ioctl_version(cmd, param);
if (err) {
- AUTOFS_WARN("invalid device control module version "
- "supplied for cmd(0x%08x)", cmd);
+ pr_warn("invalid device control module version "
+ "supplied for cmd(0x%08x)\n", cmd);
goto out;
}
if (param->size > sizeof(*param)) {
err = invalid_str(param->path, param->size - sizeof(*param));
if (err) {
- AUTOFS_WARN(
- "path string terminator missing for cmd(0x%08x)",
+ pr_warn(
+ "path string terminator missing for cmd(0x%08x)\n",
cmd);
goto out;
}
err = check_name(param->path);
if (err) {
- AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
- cmd);
+ pr_warn("invalid path supplied for cmd(0x%08x)\n",
+ cmd);
goto out;
}
}
@@ -197,7 +197,9 @@ static int find_autofs_mount(const char *pathname,
void *data)
{
struct path path;
- int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
+ int err;
+
+ err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
if (err)
return err;
err = -ENOENT;
@@ -225,6 +227,7 @@ static int test_by_dev(struct path *path, void *p)
static int test_by_type(struct path *path, void *p)
{
struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
+
return ino && ino->sbi->type & *(unsigned *)p;
}
@@ -370,7 +373,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
new_pid = get_task_pid(current, PIDTYPE_PGID);
if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) {
- AUTOFS_WARN("Not allowed to change PID namespace");
+ pr_warn("not allowed to change PID namespace\n");
err = -EINVAL;
goto out;
}
@@ -456,8 +459,10 @@ static int autofs_dev_ioctl_requester(struct file *fp,
err = 0;
autofs4_expire_wait(path.dentry, 0);
spin_lock(&sbi->fs_lock);
- param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
- param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
+ param->requester.uid =
+ from_kuid_munged(current_user_ns(), ino->uid);
+ param->requester.gid =
+ from_kgid_munged(current_user_ns(), ino->gid);
spin_unlock(&sbi->fs_lock);
}
path_put(&path);
@@ -619,7 +624,8 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
}
/* ioctl dispatcher */
-static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user)
+static int _autofs_dev_ioctl(unsigned int command,
+ struct autofs_dev_ioctl __user *user)
{
struct autofs_dev_ioctl *param;
struct file *fp;
@@ -655,7 +661,7 @@ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __use
fn = lookup_dev_ioctl(cmd);
if (!fn) {
- AUTOFS_WARN("unknown command 0x%08x", command);
+ pr_warn("unknown command 0x%08x\n", command);
return -ENOTTY;
}
@@ -711,6 +717,7 @@ out:
static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
{
int err;
+
err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u);
return (long) err;
}
@@ -733,8 +740,8 @@ static const struct file_operations _dev_ioctl_fops = {
static struct miscdevice _autofs_dev_ioctl_misc = {
.minor = AUTOFS_MINOR,
- .name = AUTOFS_DEVICE_NAME,
- .fops = &_dev_ioctl_fops
+ .name = AUTOFS_DEVICE_NAME,
+ .fops = &_dev_ioctl_fops
};
MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
@@ -747,7 +754,7 @@ int __init autofs_dev_ioctl_init(void)
r = misc_register(&_autofs_dev_ioctl_misc);
if (r) {
- AUTOFS_ERROR("misc_register failed for control device");
+ pr_err("misc_register failed for control device\n");
return r;
}
@@ -757,6 +764,4 @@ int __init autofs_dev_ioctl_init(void)
void autofs_dev_ioctl_exit(void)
{
misc_deregister(&_autofs_dev_ioctl_misc);
- return;
}
-
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 1cebc3c52fa5..9510d8d2e9cd 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -1,16 +1,12 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/expire.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include "autofs_i.h"
@@ -18,7 +14,7 @@ static unsigned long now;
/* Check if a dentry can be expired */
static inline int autofs4_can_expire(struct dentry *dentry,
- unsigned long timeout, int do_now)
+ unsigned long timeout, int do_now)
{
struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -41,7 +37,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
struct path path = {.mnt = mnt, .dentry = dentry};
int status = 1;
- DPRINTK("dentry %p %pd", dentry, dentry);
+ pr_debug("dentry %p %pd\n", dentry, dentry);
path_get(&path);
@@ -58,14 +54,16 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
/* Update the expiry counter if fs is busy */
if (!may_umount_tree(path.mnt)) {
- struct autofs_info *ino = autofs4_dentry_ino(top);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(top);
ino->last_used = jiffies;
goto done;
}
status = 0;
done:
- DPRINTK("returning = %d", status);
+ pr_debug("returning = %d\n", status);
path_put(&path);
return status;
}
@@ -74,7 +72,7 @@ done:
* Calculate and dget next entry in the subdirs list under root.
*/
static struct dentry *get_next_positive_subdir(struct dentry *prev,
- struct dentry *root)
+ struct dentry *root)
{
struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
struct list_head *next;
@@ -121,7 +119,7 @@ cont:
* Calculate and dget next entry in top down tree traversal.
*/
static struct dentry *get_next_positive_dentry(struct dentry *prev,
- struct dentry *root)
+ struct dentry *root)
{
struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
struct list_head *next;
@@ -187,15 +185,17 @@ again:
* autofs submounts.
*/
static int autofs4_direct_busy(struct vfsmount *mnt,
- struct dentry *top,
- unsigned long timeout,
- int do_now)
+ struct dentry *top,
+ unsigned long timeout,
+ int do_now)
{
- DPRINTK("top %p %pd", top, top);
+ pr_debug("top %p %pd\n", top, top);
/* If it's busy update the expiry counters */
if (!may_umount_tree(mnt)) {
- struct autofs_info *ino = autofs4_dentry_ino(top);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(top);
if (ino)
ino->last_used = jiffies;
return 1;
@@ -208,7 +208,8 @@ static int autofs4_direct_busy(struct vfsmount *mnt,
return 0;
}
-/* Check a directory tree of mount points for busyness
+/*
+ * Check a directory tree of mount points for busyness
* The tree is not busy iff no mountpoints are busy
*/
static int autofs4_tree_busy(struct vfsmount *mnt,
@@ -219,7 +220,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
struct autofs_info *top_ino = autofs4_dentry_ino(top);
struct dentry *p;
- DPRINTK("top %p %pd", top, top);
+ pr_debug("top %p %pd\n", top, top);
/* Negative dentry - give up */
if (!simple_positive(top))
@@ -227,7 +228,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
p = NULL;
while ((p = get_next_positive_dentry(p, top))) {
- DPRINTK("dentry %p %pd", p, p);
+ pr_debug("dentry %p %pd\n", p, p);
/*
* Is someone visiting anywhere in the subtree ?
@@ -273,11 +274,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
{
struct dentry *p;
- DPRINTK("parent %p %pd", parent, parent);
+ pr_debug("parent %p %pd\n", parent, parent);
p = NULL;
while ((p = get_next_positive_dentry(p, parent))) {
- DPRINTK("dentry %p %pd", p, p);
+ pr_debug("dentry %p %pd\n", p, p);
if (d_mountpoint(p)) {
/* Can we umount this guy */
@@ -362,7 +363,7 @@ static struct dentry *should_expire(struct dentry *dentry,
* offset (autofs-5.0+).
*/
if (d_mountpoint(dentry)) {
- DPRINTK("checking mountpoint %p %pd", dentry, dentry);
+ pr_debug("checking mountpoint %p %pd\n", dentry, dentry);
/* Can we umount this guy */
if (autofs4_mount_busy(mnt, dentry))
@@ -375,7 +376,7 @@ static struct dentry *should_expire(struct dentry *dentry,
}
if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
- DPRINTK("checking symlink %p %pd", dentry, dentry);
+ pr_debug("checking symlink %p %pd\n", dentry, dentry);
/*
* A symlink can't be "busy" in the usual sense so
* just check last used for expire timeout.
@@ -404,6 +405,7 @@ static struct dentry *should_expire(struct dentry *dentry,
} else {
/* Path walk currently on this dentry? */
struct dentry *expired;
+
ino_count = atomic_read(&ino->count) + 1;
if (d_count(dentry) > ino_count)
return NULL;
@@ -471,7 +473,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
return NULL;
found:
- DPRINTK("returning %p %pd", expired, expired);
+ pr_debug("returning %p %pd\n", expired, expired);
ino->flags |= AUTOFS_INF_EXPIRING;
smp_mb();
ino->flags &= ~AUTOFS_INF_NO_RCU;
@@ -503,12 +505,12 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
if (ino->flags & AUTOFS_INF_EXPIRING) {
spin_unlock(&sbi->fs_lock);
- DPRINTK("waiting for expire %p name=%pd", dentry, dentry);
+ pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
status = autofs4_wait(sbi, dentry, NFY_NONE);
wait_for_completion(&ino->expire_complete);
- DPRINTK("expire done status=%d", status);
+ pr_debug("expire done status=%d\n", status);
if (d_unhashed(dentry))
return -EAGAIN;
@@ -522,21 +524,22 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
/* Perform an expiry operation */
int autofs4_expire_run(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi,
- struct autofs_packet_expire __user *pkt_p)
+ struct vfsmount *mnt,
+ struct autofs_sb_info *sbi,
+ struct autofs_packet_expire __user *pkt_p)
{
struct autofs_packet_expire pkt;
struct autofs_info *ino;
struct dentry *dentry;
int ret = 0;
- memset(&pkt,0,sizeof pkt);
+ memset(&pkt, 0, sizeof(pkt));
pkt.hdr.proto_version = sbi->version;
pkt.hdr.type = autofs_ptype_expire;
- if ((dentry = autofs4_expire_indirect(sb, mnt, sbi, 0)) == NULL)
+ dentry = autofs4_expire_indirect(sb, mnt, sbi, 0);
+ if (!dentry)
return -EAGAIN;
pkt.len = dentry->d_name.len;
@@ -544,7 +547,7 @@ int autofs4_expire_run(struct super_block *sb,
pkt.name[pkt.len] = '\0';
dput(dentry);
- if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) )
+ if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
ret = -EFAULT;
spin_lock(&sbi->fs_lock);
@@ -573,7 +576,8 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
struct autofs_info *ino = autofs4_dentry_ino(dentry);
/* This is synchronous because it makes the daemon a
- little easier */
+ * little easier
+ */
ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
spin_lock(&sbi->fs_lock);
@@ -588,8 +592,10 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
return ret;
}
-/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
- more to be done */
+/*
+ * Call repeatedly until it returns -EAGAIN, meaning there's nothing
+ * more to be done.
+ */
int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
struct autofs_sb_info *sbi, int __user *arg)
{
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index b3db517e89ec..8cf0e63389ae 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -1,14 +1,10 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/init.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/module.h>
#include <linux/init.h>
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index a3ae0b2aeb5a..61b21051bd5a 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -1,15 +1,11 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/inode.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 2005-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 2005-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -24,7 +20,9 @@
struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
{
- struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
+ struct autofs_info *ino;
+
+ ino = kzalloc(sizeof(*ino), GFP_KERNEL);
if (ino) {
INIT_LIST_HEAD(&ino->active);
INIT_LIST_HEAD(&ino->expiring);
@@ -62,7 +60,7 @@ void autofs4_kill_sb(struct super_block *sb)
put_pid(sbi->oz_pgrp);
}
- DPRINTK("shutting down");
+ pr_debug("shutting down\n");
kill_litter_super(sb);
if (sbi)
kfree_rcu(sbi, rcu);
@@ -94,7 +92,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",direct");
else
seq_printf(m, ",indirect");
-
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ if (sbi->pipe)
+ seq_printf(m, ",pipe_ino=%ld", sbi->pipe->f_inode->i_ino);
+ else
+ seq_printf(m, ",pipe_ino=-1");
+#endif
return 0;
}
@@ -147,6 +150,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
while ((p = strsep(&options, ",")) != NULL) {
int token;
+
if (!*p)
continue;
@@ -204,9 +208,9 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
int autofs4_fill_super(struct super_block *s, void *data, int silent)
{
- struct inode * root_inode;
- struct dentry * root;
- struct file * pipe;
+ struct inode *root_inode;
+ struct dentry *root;
+ struct file *pipe;
int pipefd;
struct autofs_sb_info *sbi;
struct autofs_info *ino;
@@ -217,7 +221,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
- DPRINTK("starting up, sbi = %p",sbi);
+ pr_debug("starting up, sbi = %p\n", sbi);
s->s_fs_info = sbi;
sbi->magic = AUTOFS_SBI_MAGIC;
@@ -266,14 +270,14 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
&pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
&sbi->max_proto)) {
- printk("autofs: called with bogus options\n");
+ pr_err("called with bogus options\n");
goto fail_dput;
}
if (pgrp_set) {
sbi->oz_pgrp = find_get_pid(pgrp);
if (!sbi->oz_pgrp) {
- pr_warn("autofs: could not find process group %d\n",
+ pr_err("could not find process group %d\n",
pgrp);
goto fail_dput;
}
@@ -290,10 +294,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
/* Couldn't this be tested earlier? */
if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
- printk("autofs: kernel does not match daemon version "
+ pr_err("kernel does not match daemon version "
"daemon (%d, %d) kernel (%d, %d)\n",
- sbi->min_proto, sbi->max_proto,
- AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
+ sbi->min_proto, sbi->max_proto,
+ AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
goto fail_dput;
}
@@ -304,11 +308,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->version = sbi->max_proto;
sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
- DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp));
+ pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp));
pipe = fget(pipefd);
if (!pipe) {
- printk("autofs: could not open pipe file descriptor\n");
+ pr_err("could not open pipe file descriptor\n");
goto fail_dput;
}
ret = autofs_prepare_pipe(pipe);
@@ -323,12 +327,12 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
*/
s->s_root = root;
return 0;
-
+
/*
* Failure ... clean up.
*/
fail_fput:
- printk("autofs: pipe file descriptor does not contain proper ops\n");
+ pr_err("pipe file descriptor does not contain proper ops\n");
fput(pipe);
/* fall through */
fail_dput:
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index c6d7d3dbd52a..9328b5861c7a 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -1,16 +1,12 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/root.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/capability.h>
#include <linux/errno.h>
@@ -23,16 +19,18 @@
#include "autofs_i.h"
-static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
-static int autofs4_dir_unlink(struct inode *,struct dentry *);
-static int autofs4_dir_rmdir(struct inode *,struct dentry *);
-static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t);
-static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
+static int autofs4_dir_symlink(struct inode *, struct dentry *, const char *);
+static int autofs4_dir_unlink(struct inode *, struct dentry *);
+static int autofs4_dir_rmdir(struct inode *, struct dentry *);
+static int autofs4_dir_mkdir(struct inode *, struct dentry *, umode_t);
+static long autofs4_root_ioctl(struct file *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
-static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+static long autofs4_root_compat_ioctl(struct file *,
+ unsigned int, unsigned long);
#endif
static int autofs4_dir_open(struct inode *inode, struct file *file);
-static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int);
+static struct dentry *autofs4_lookup(struct inode *,
+ struct dentry *, unsigned int);
static struct vfsmount *autofs4_d_automount(struct path *);
static int autofs4_d_manage(struct dentry *, bool);
static void autofs4_dentry_release(struct dentry *);
@@ -74,7 +72,9 @@ const struct dentry_operations autofs4_dentry_operations = {
static void autofs4_add_active(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(dentry);
if (ino) {
spin_lock(&sbi->lookup_lock);
if (!ino->active_count) {
@@ -84,13 +84,14 @@ static void autofs4_add_active(struct dentry *dentry)
ino->active_count++;
spin_unlock(&sbi->lookup_lock);
}
- return;
}
static void autofs4_del_active(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(dentry);
if (ino) {
spin_lock(&sbi->lookup_lock);
ino->active_count--;
@@ -100,7 +101,6 @@ static void autofs4_del_active(struct dentry *dentry)
}
spin_unlock(&sbi->lookup_lock);
}
- return;
}
static int autofs4_dir_open(struct inode *inode, struct file *file)
@@ -108,7 +108,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
struct dentry *dentry = file->f_path.dentry;
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- DPRINTK("file=%p dentry=%p %pd", file, dentry, dentry);
+ pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry);
if (autofs4_oz_mode(sbi))
goto out;
@@ -138,7 +138,7 @@ static void autofs4_dentry_release(struct dentry *de)
struct autofs_info *ino = autofs4_dentry_ino(de);
struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
- DPRINTK("releasing %p", de);
+ pr_debug("releasing %p\n", de);
if (!ino)
return;
@@ -278,9 +278,9 @@ static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk)
if (ino->flags & AUTOFS_INF_PENDING) {
if (rcu_walk)
return -ECHILD;
- DPRINTK("waiting for mount name=%pd", dentry);
+ pr_debug("waiting for mount name=%pd\n", dentry);
status = autofs4_wait(sbi, dentry, NFY_MOUNT);
- DPRINTK("mount wait done status=%d", status);
+ pr_debug("mount wait done status=%d\n", status);
}
ino->last_used = jiffies;
return status;
@@ -320,7 +320,9 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
struct dentry *parent = dentry->d_parent;
struct autofs_info *ino;
- struct dentry *new = d_lookup(parent, &dentry->d_name);
+ struct dentry *new;
+
+ new = d_lookup(parent, &dentry->d_name);
if (!new)
return NULL;
ino = autofs4_dentry_ino(new);
@@ -338,7 +340,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
- DPRINTK("dentry=%p %pd", dentry, dentry);
+ pr_debug("dentry=%p %pd\n", dentry, dentry);
/* The daemon never triggers a mount. */
if (autofs4_oz_mode(sbi))
@@ -425,7 +427,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
- DPRINTK("dentry=%p %pd", dentry, dentry);
+ pr_debug("dentry=%p %pd\n", dentry, dentry);
/* The daemon never waits. */
if (autofs4_oz_mode(sbi)) {
@@ -455,6 +457,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
* a mount-trap.
*/
struct inode *inode;
+
if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
return 0;
if (d_mountpoint(dentry))
@@ -494,13 +497,14 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
}
/* Lookups in the root directory */
-static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+static struct dentry *autofs4_lookup(struct inode *dir,
+ struct dentry *dentry, unsigned int flags)
{
struct autofs_sb_info *sbi;
struct autofs_info *ino;
struct dentry *active;
- DPRINTK("name = %pd", dentry);
+ pr_debug("name = %pd\n", dentry);
/* File name too long to exist */
if (dentry->d_name.len > NAME_MAX)
@@ -508,14 +512,14 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u
sbi = autofs4_sbi(dir->i_sb);
- DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
- current->pid, task_pgrp_nr(current), sbi->catatonic,
- autofs4_oz_mode(sbi));
+ pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n",
+ current->pid, task_pgrp_nr(current), sbi->catatonic,
+ autofs4_oz_mode(sbi));
active = autofs4_lookup_active(dentry);
- if (active) {
+ if (active)
return active;
- } else {
+ else {
/*
* A dentry that is not within the root can never trigger a
* mount operation, unless the directory already exists, so we
@@ -526,7 +530,8 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u
return ERR_PTR(-ENOENT);
/* Mark entries in the root as mount triggers */
- if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
+ if (IS_ROOT(dentry->d_parent) &&
+ autofs_type_indirect(sbi->type))
__managed_dentry_set_managed(dentry);
ino = autofs4_new_ino(sbi);
@@ -554,7 +559,7 @@ static int autofs4_dir_symlink(struct inode *dir,
size_t size = strlen(symname);
char *cp;
- DPRINTK("%s <- %pd", symname, dentry);
+ pr_debug("%s <- %pd\n", symname, dentry);
if (!autofs4_oz_mode(sbi))
return -EACCES;
@@ -613,7 +618,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
struct autofs_info *p_ino;
-
+
/* This allows root to remove symlinks */
if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -664,7 +669,6 @@ static void autofs_set_leaf_automount_flags(struct dentry *dentry)
if (IS_ROOT(parent->d_parent))
return;
managed_dentry_clear_managed(parent);
- return;
}
static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
@@ -687,7 +691,6 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
if (d_child->next == &parent->d_subdirs &&
d_child->prev == &parent->d_subdirs)
managed_dentry_set_managed(parent);
- return;
}
static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
@@ -695,8 +698,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
struct autofs_info *p_ino;
-
- DPRINTK("dentry %p, removing %pd", dentry, dentry);
+
+ pr_debug("dentry %p, removing %pd\n", dentry, dentry);
if (!autofs4_oz_mode(sbi))
return -EACCES;
@@ -728,7 +731,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
return 0;
}
-static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int autofs4_dir_mkdir(struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -738,7 +742,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
if (!autofs4_oz_mode(sbi))
return -EACCES;
- DPRINTK("dentry %p, creating %pd", dentry, dentry);
+ pr_debug("dentry %p, creating %pd\n", dentry, dentry);
BUG_ON(!ino);
@@ -768,14 +772,18 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
/* Get/set timeout ioctl() operation */
#ifdef CONFIG_COMPAT
static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
- compat_ulong_t __user *p)
+ compat_ulong_t __user *p)
{
- int rv;
unsigned long ntimeout;
+ int rv;
- if ((rv = get_user(ntimeout, p)) ||
- (rv = put_user(sbi->exp_timeout/HZ, p)))
- return rv;
+ rv = get_user(ntimeout, p);
+ if (rv)
+ goto error;
+
+ rv = put_user(sbi->exp_timeout/HZ, p);
+ if (rv)
+ goto error;
if (ntimeout > UINT_MAX/HZ)
sbi->exp_timeout = 0;
@@ -783,18 +791,24 @@ static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
sbi->exp_timeout = ntimeout * HZ;
return 0;
+error:
+ return rv;
}
#endif
static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
- unsigned long __user *p)
+ unsigned long __user *p)
{
- int rv;
unsigned long ntimeout;
+ int rv;
- if ((rv = get_user(ntimeout, p)) ||
- (rv = put_user(sbi->exp_timeout/HZ, p)))
- return rv;
+ rv = get_user(ntimeout, p);
+ if (rv)
+ goto error;
+
+ rv = put_user(sbi->exp_timeout/HZ, p);
+ if (rv)
+ goto error;
if (ntimeout > ULONG_MAX/HZ)
sbi->exp_timeout = 0;
@@ -802,16 +816,20 @@ static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
sbi->exp_timeout = ntimeout * HZ;
return 0;
+error:
+ return rv;
}
/* Return protocol version */
-static inline int autofs4_get_protover(struct autofs_sb_info *sbi, int __user *p)
+static inline int autofs4_get_protover(struct autofs_sb_info *sbi,
+ int __user *p)
{
return put_user(sbi->version, p);
}
/* Return protocol sub version */
-static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user *p)
+static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi,
+ int __user *p)
{
return put_user(sbi->sub_version, p);
}
@@ -826,7 +844,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
if (may_umount(mnt))
status = 1;
- DPRINTK("returning %d", status);
+ pr_debug("returning %d\n", status);
status = put_user(status, p);
@@ -834,9 +852,9 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
}
/* Identify autofs4_dentries - this is so we can tell if there's
- an extra dentry refcount or not. We only hold a refcount on the
- dentry if its non-negative (ie, d_inode != NULL)
-*/
+ * an extra dentry refcount or not. We only hold a refcount on the
+ * dentry if its non-negative (ie, d_inode != NULL)
+ */
int is_autofs4_dentry(struct dentry *dentry)
{
return dentry && d_really_is_positive(dentry) &&
@@ -854,21 +872,21 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
void __user *p = (void __user *)arg;
- DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u",
- cmd,arg,sbi,task_pgrp_nr(current));
+ pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",
+ cmd, arg, sbi, task_pgrp_nr(current));
if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
_IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
return -ENOTTY;
-
+
if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
return -EPERM;
-
- switch(cmd) {
+
+ switch (cmd) {
case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */
- return autofs4_wait_release(sbi,(autofs_wqt_t)arg,0);
+ return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0);
case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */
- return autofs4_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT);
+ return autofs4_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT);
case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
autofs4_catatonic_mode(sbi);
return 0;
@@ -888,13 +906,15 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
/* return a single thing to expire */
case AUTOFS_IOC_EXPIRE:
- return autofs4_expire_run(inode->i_sb,filp->f_path.mnt,sbi, p);
+ return autofs4_expire_run(inode->i_sb,
+ filp->f_path.mnt, sbi, p);
/* same as above, but can send multiple expires through pipe */
case AUTOFS_IOC_EXPIRE_MULTI:
- return autofs4_expire_multi(inode->i_sb,filp->f_path.mnt,sbi, p);
+ return autofs4_expire_multi(inode->i_sb,
+ filp->f_path.mnt, sbi, p);
default:
- return -ENOSYS;
+ return -EINVAL;
}
}
@@ -902,12 +922,13 @@ static long autofs4_root_ioctl(struct file *filp,
unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
+
return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
}
#ifdef CONFIG_COMPAT
static long autofs4_root_compat_ioctl(struct file *filp,
- unsigned int cmd, unsigned long arg)
+ unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
int ret;
@@ -916,7 +937,7 @@ static long autofs4_root_compat_ioctl(struct file *filp,
ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
else
ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
- (unsigned long)compat_ptr(arg));
+ (unsigned long) compat_ptr(arg));
return ret;
}
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index 84e037d1d129..99aab00dc217 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -1,14 +1,10 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/symlink.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include "autofs_i.h"
@@ -18,6 +14,7 @@ static const char *autofs4_get_link(struct dentry *dentry,
{
struct autofs_sb_info *sbi;
struct autofs_info *ino;
+
if (!dentry)
return ERR_PTR(-ECHILD);
sbi = autofs4_sbi(dentry->d_sb);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 35b755e79c2d..0146d911f468 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -1,15 +1,11 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/waitq.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/slab.h>
#include <linux/time.h>
@@ -18,7 +14,8 @@
#include "autofs_i.h"
/* We make this a static variable rather than a part of the superblock; it
- is better if we don't reassign numbers easily even across filesystems */
+ * is better if we don't reassign numbers easily even across filesystems
+ */
static autofs_wqt_t autofs4_next_wait_queue = 1;
/* These are the signals we allow interrupting a pending mount */
@@ -34,7 +31,7 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
return;
}
- DPRINTK("entering catatonic mode");
+ pr_debug("entering catatonic mode\n");
sbi->catatonic = 1;
wq = sbi->queues;
@@ -69,17 +66,19 @@ static int autofs4_write(struct autofs_sb_info *sbi,
set_fs(KERNEL_DS);
mutex_lock(&sbi->pipe_mutex);
- while (bytes &&
- (wr = __vfs_write(file,data,bytes,&file->f_pos)) > 0) {
+ wr = __vfs_write(file, data, bytes, &file->f_pos);
+ while (bytes && wr) {
data += wr;
bytes -= wr;
+ wr = __vfs_write(file, data, bytes, &file->f_pos);
}
mutex_unlock(&sbi->pipe_mutex);
set_fs(fs);
/* Keep the currently executing process from receiving a
- SIGPIPE unless it was already supposed to get one */
+ * SIGPIPE unless it was already supposed to get one
+ */
if (wr == -EPIPE && !sigpipe) {
spin_lock_irqsave(&current->sighand->siglock, flags);
sigdelset(&current->pending.signal, SIGPIPE);
@@ -89,7 +88,7 @@ static int autofs4_write(struct autofs_sb_info *sbi,
return (bytes > 0);
}
-
+
static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
struct autofs_wait_queue *wq,
int type)
@@ -102,10 +101,11 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
struct file *pipe = NULL;
size_t pktsz;
- DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
- (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type);
+ pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n",
+ (unsigned long) wq->wait_queue_token,
+ wq->name.len, wq->name.name, type);
- memset(&pkt,0,sizeof pkt); /* For security reasons */
+ memset(&pkt, 0, sizeof(pkt)); /* For security reasons */
pkt.hdr.proto_version = sbi->version;
pkt.hdr.type = type;
@@ -126,7 +126,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
}
case autofs_ptype_expire_multi:
{
- struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi;
+ struct autofs_packet_expire_multi *ep =
+ &pkt.v4_pkt.expire_multi;
pktsz = sizeof(*ep);
@@ -163,7 +164,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
break;
}
default:
- printk("autofs4_notify_daemon: bad type %d!\n", type);
+ pr_warn("bad type %d!\n", type);
mutex_unlock(&sbi->wq_mutex);
return;
}
@@ -231,7 +232,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
if (wq->name.hash == qstr->hash &&
wq->name.len == qstr->len &&
wq->name.name &&
- !memcmp(wq->name.name, qstr->name, qstr->len))
+ !memcmp(wq->name.name, qstr->name, qstr->len))
break;
}
return wq;
@@ -248,7 +249,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
static int validate_request(struct autofs_wait_queue **wait,
struct autofs_sb_info *sbi,
struct qstr *qstr,
- struct dentry*dentry, enum autofs_notify notify)
+ struct dentry *dentry, enum autofs_notify notify)
{
struct autofs_wait_queue *wq;
struct autofs_info *ino;
@@ -322,8 +323,10 @@ static int validate_request(struct autofs_wait_queue **wait,
* continue on and create a new request.
*/
if (!IS_ROOT(dentry)) {
- if (d_really_is_positive(dentry) && d_unhashed(dentry)) {
+ if (d_unhashed(dentry) &&
+ d_really_is_positive(dentry)) {
struct dentry *parent = dentry->d_parent;
+
new = d_lookup(parent, &dentry->d_name);
if (new)
dentry = new;
@@ -340,8 +343,8 @@ static int validate_request(struct autofs_wait_queue **wait,
return 1;
}
-int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
- enum autofs_notify notify)
+int autofs4_wait(struct autofs_sb_info *sbi,
+ struct dentry *dentry, enum autofs_notify notify)
{
struct autofs_wait_queue *wq;
struct qstr qstr;
@@ -411,7 +414,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
if (!wq) {
/* Create a new wait queue */
- wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
+ wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL);
if (!wq) {
kfree(qstr.name);
mutex_unlock(&sbi->wq_mutex);
@@ -450,17 +453,19 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
autofs_ptype_expire_indirect;
}
- DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
- (unsigned long) wq->wait_queue_token, wq->name.len,
- wq->name.name, notify);
+ pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
+ (unsigned long) wq->wait_queue_token, wq->name.len,
+ wq->name.name, notify);
- /* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */
+ /*
+ * autofs4_notify_daemon() may block; it will unlock ->wq_mutex
+ */
autofs4_notify_daemon(sbi, wq, type);
} else {
wq->wait_ctr++;
- DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d",
- (unsigned long) wq->wait_queue_token, wq->name.len,
- wq->name.name, notify);
+ pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n",
+ (unsigned long) wq->wait_queue_token, wq->name.len,
+ wq->name.name, notify);
mutex_unlock(&sbi->wq_mutex);
kfree(qstr.name);
}
@@ -471,12 +476,14 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
*/
if (wq->name.name) {
/* Block all but "shutdown" signals while waiting */
- sigset_t oldset;
+ unsigned long shutdown_sigs_mask;
unsigned long irqflags;
+ sigset_t oldset;
spin_lock_irqsave(&current->sighand->siglock, irqflags);
oldset = current->blocked;
- siginitsetinv(&current->blocked, SHUTDOWN_SIGS & ~oldset.sig[0]);
+ shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0];
+ siginitsetinv(&current->blocked, shutdown_sigs_mask);
recalc_sigpending();
spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
@@ -487,7 +494,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
recalc_sigpending();
spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
} else {
- DPRINTK("skipped sleeping");
+ pr_debug("skipped sleeping\n");
}
status = wq->status;
@@ -562,4 +569,3 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok
return 0;
}
-
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 051ea4809c14..7d914c67a9d0 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -653,7 +653,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
if ((current->flags & PF_RANDOMIZE) &&
!(current->personality & ADDR_NO_RANDOMIZE)) {
- random_variable = (unsigned long) get_random_int();
+ random_variable = get_random_long();
random_variable &= STACK_RND_MASK;
random_variable <<= PAGE_SHIFT;
}
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 0e1e61a7ec23..1c76d73e06dc 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -137,7 +137,6 @@ static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
void **slot;
spin_lock(&fs_info->buffer_lock);
-restart:
radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
struct extent_buffer *eb;
@@ -147,7 +146,7 @@ restart:
/* Shouldn't happen but that kind of thinking creates CVE's */
if (radix_tree_exception(eb)) {
if (radix_tree_deref_retry(eb))
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
continue;
}
spin_unlock(&fs_info->buffer_lock);
diff --git a/fs/buffer.c b/fs/buffer.c
index e1632abb4ca9..33be29675358 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -621,17 +621,17 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
* If warn is true, then emit a warning if the page is not uptodate and has
* not been truncated.
*
- * The caller must hold mem_cgroup_begin_page_stat() lock.
+ * The caller must hold lock_page_memcg().
*/
static void __set_page_dirty(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg, int warn)
+ int warn)
{
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
- account_page_dirtied(page, mapping, memcg);
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
@@ -666,7 +666,6 @@ static void __set_page_dirty(struct page *page, struct address_space *mapping,
int __set_page_dirty_buffers(struct page *page)
{
int newly_dirty;
- struct mem_cgroup *memcg;
struct address_space *mapping = page_mapping(page);
if (unlikely(!mapping))
@@ -683,17 +682,17 @@ int __set_page_dirty_buffers(struct page *page)
} while (bh != head);
}
/*
- * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
- * per-memcg dirty page counters.
+ * Lock out page->mem_cgroup migration to keep PageDirty
+ * synchronized with per-memcg dirty page counters.
*/
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
if (newly_dirty)
- __set_page_dirty(page, mapping, memcg, 1);
+ __set_page_dirty(page, mapping, 1);
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1167,15 +1166,14 @@ void mark_buffer_dirty(struct buffer_head *bh)
if (!test_set_buffer_dirty(bh)) {
struct page *page = bh->b_page;
struct address_space *mapping = NULL;
- struct mem_cgroup *memcg;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (!TestSetPageDirty(page)) {
mapping = page_mapping(page);
if (mapping)
- __set_page_dirty(page, mapping, memcg, 0);
+ __set_page_dirty(page, mapping, 0);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (mapping)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 50b268483302..788e19195991 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -255,7 +255,6 @@ static const struct file_operations cifs_debug_data_proc_fops = {
static ssize_t cifs_stats_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
- char c;
bool bv;
int rc;
struct list_head *tmp1, *tmp2, *tmp3;
@@ -263,11 +262,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
struct cifs_ses *ses;
struct cifs_tcon *tcon;
- rc = get_user(c, buffer);
- if (rc)
- return rc;
-
- if (strtobool(&c, &bv) == 0) {
+ rc = kstrtobool_from_user(buffer, count, &bv);
+ if (rc == 0) {
#ifdef CONFIG_CIFS_STATS2
atomic_set(&totBufAllocCount, 0);
atomic_set(&totSmBufAllocCount, 0);
@@ -290,6 +286,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
}
}
spin_unlock(&cifs_tcp_ses_lock);
+ } else {
+ return rc;
}
return count;
@@ -433,17 +431,17 @@ static int cifsFYI_proc_open(struct inode *inode, struct file *file)
static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- char c;
+ char c[2] = { '\0' };
bool bv;
int rc;
- rc = get_user(c, buffer);
+ rc = get_user(c[0], buffer);
if (rc)
return rc;
- if (strtobool(&c, &bv) == 0)
+ if (strtobool(c, &bv) == 0)
cifsFYI = bv;
- else if ((c > '1') && (c <= '9'))
- cifsFYI = (int) (c - '0'); /* see cifs_debug.h for meanings */
+ else if ((c[0] > '1') && (c[0] <= '9'))
+ cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */
return count;
}
@@ -471,20 +469,12 @@ static int cifs_linux_ext_proc_open(struct inode *inode, struct file *file)
static ssize_t cifs_linux_ext_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
- char c;
- bool bv;
int rc;
- rc = get_user(c, buffer);
+ rc = kstrtobool_from_user(buffer, count, &linuxExtEnabled);
if (rc)
return rc;
- rc = strtobool(&c, &bv);
- if (rc)
- return rc;
-
- linuxExtEnabled = bv;
-
return count;
}
@@ -511,20 +501,12 @@ static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file)
static ssize_t cifs_lookup_cache_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
- char c;
- bool bv;
int rc;
- rc = get_user(c, buffer);
+ rc = kstrtobool_from_user(buffer, count, &lookupCacheEnabled);
if (rc)
return rc;
- rc = strtobool(&c, &bv);
- if (rc)
- return rc;
-
- lookupCacheEnabled = bv;
-
return count;
}
@@ -551,20 +533,12 @@ static int traceSMB_proc_open(struct inode *inode, struct file *file)
static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- char c;
- bool bv;
int rc;
- rc = get_user(c, buffer);
+ rc = kstrtobool_from_user(buffer, count, &traceSMB);
if (rc)
return rc;
- rc = strtobool(&c, &bv);
- if (rc)
- return rc;
-
- traceSMB = bv;
-
return count;
}
@@ -622,7 +596,6 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
int rc;
unsigned int flags;
char flags_string[12];
- char c;
bool bv;
if ((count < 1) || (count > 11))
@@ -635,11 +608,10 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
if (count < 3) {
/* single char or single char followed by null */
- c = flags_string[0];
- if (strtobool(&c, &bv) == 0) {
+ if (strtobool(flags_string, &bv) == 0) {
global_secflags = bv ? CIFSSEC_MAX : CIFSSEC_DEF;
return count;
- } else if (!isdigit(c)) {
+ } else if (!isdigit(flags_string[0])) {
cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
flags_string);
return -EINVAL;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 66cf0f9fff89..c611ca2339d7 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -25,7 +25,7 @@
void cifs_dump_mem(char *label, void *data, int length);
void cifs_dump_detail(void *);
void cifs_dump_mids(struct TCP_Server_Info *);
-extern int traceSMB; /* flag which enables the function below */
+extern bool traceSMB; /* flag which enables the function below */
void dump_smb(void *, int);
#define CIFS_INFO 0x01
#define CIFS_RC 0x02
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c48ca13673e3..931b446f2a44 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -54,10 +54,10 @@
#endif
int cifsFYI = 0;
-int traceSMB = 0;
+bool traceSMB;
bool enable_oplocks = true;
-unsigned int linuxExtEnabled = 1;
-unsigned int lookupCacheEnabled = 1;
+bool linuxExtEnabled = true;
+bool lookupCacheEnabled = true;
unsigned int global_secflags = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
unsigned int sign_CIFS_PDUs = 1;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a25b2513f146..d21da9f05bae 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1596,11 +1596,11 @@ GLOBAL_EXTERN atomic_t midCount;
/* Misc globals */
GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */
-GLOBAL_EXTERN unsigned int lookupCacheEnabled;
+GLOBAL_EXTERN bool lookupCacheEnabled;
GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
with more secure ntlmssp2 challenge/resp */
GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
-GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
+GLOBAL_EXTERN bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */
GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 6d17f31a31d7..b7bcc15f90a5 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -282,7 +282,7 @@ errout:
static inline int is_32bit_api(void)
{
#ifdef CONFIG_COMPAT
- return is_compat_task();
+ return in_compat_syscall();
#else
return (BITS_PER_LONG == 32);
#endif
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 8850254136ae..7002467bfbac 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -106,7 +106,10 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
if (!journal) {
- ret = generic_file_fsync(file, start, end, datasync);
+ if (test_opt(inode->i_sb, BARRIER))
+ ret = generic_file_fsync(file, start, end, datasync);
+ else
+ ret = __generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
diff --git a/fs/mpage.c b/fs/mpage.c
index 1480d3a18037..6bd9fd90964e 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -24,6 +24,7 @@
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
+#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
@@ -366,7 +367,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
map_bh.b_state = 0;
map_bh.b_size = 0;
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = list_entry(pages->prev, struct page, lru);
+ struct page *page = lru_to_page(pages);
prefetchw(&page->flags);
list_del(&page->lru);
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index ce210d4951a1..e27e6527912b 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -41,7 +41,8 @@ ocfs2-objs := \
quota_local.o \
quota_global.o \
xattr.o \
- acl.o
+ acl.o \
+ filecheck.o
ocfs2_stackglue-objs := stackglue.o
ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d002579c6f2b..70907d638b60 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2516,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
struct ocfs2_extent_block *eb;
u32 range;
- /*
- * In normal tree rotation process, we will never touch the
- * tree branch above subtree_index and ocfs2_extend_rotate_transaction
- * doesn't reserve the credits for them either.
- *
- * But we do have a special case here which will update the rightmost
- * records for all the bh in the path.
- * So we have to allocate extra credits and access them.
- */
- ret = ocfs2_extend_trans(handle, subtree_index);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
mlog_errno(ret);
@@ -2956,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
right_path->p_node[subtree_root].bh->b_blocknr,
right_path->p_tree_depth);
- ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
orig_credits, left_path);
if (ret) {
mlog_errno(ret);
@@ -3029,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
-
ret = ocfs2_et_sanity_check(et);
if (ret)
goto out;
- /*
- * There's two ways we handle this depending on
- * whether path is the only existing one.
- */
- ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
- path);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
@@ -3641,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
*/
if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
le16_to_cpu(el->l_next_free_rec) == 1) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
ret = ocfs2_remove_rightmost_path(handle, et,
right_path,
@@ -3679,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
/*
* The merge code will need to create an empty
* extent to take the place of the newly
@@ -3727,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
*/
BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
/* The merge left us with an empty extent, remove it. */
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
@@ -3748,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
goto out;
}
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
/*
* Error from this last rotate is not critical, so
@@ -3783,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
}
if (ctxt->c_split_covers_rec) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ ret = 0;
+ goto out;
+ }
+
/*
* The merge may have left an empty extent in
* our leaf. Try to rotate it away.
@@ -5342,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle,
struct ocfs2_extent_block *eb;
if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
@@ -5928,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, tl_bh);
- /* TODO: Perhaps we can calculate the bulk of the
- * credits up front rather than extending like
- * this. */
- status = ocfs2_extend_trans(handle,
- OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
rec = tl->tl_recs[i];
start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
le32_to_cpu(rec.t_start));
@@ -5958,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
goto bail;
}
}
+
+ status = ocfs2_extend_trans(handle,
+ OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
i--;
}
@@ -6016,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out_mutex;
}
- handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+ handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
@@ -6079,7 +6102,7 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
if (cancel)
cancel_delayed_work(&osb->osb_truncate_log_wq);
- queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+ queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
}
}
@@ -6253,7 +6276,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
if (tl_inode) {
cancel_delayed_work(&osb->osb_truncate_log_wq);
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
status = ocfs2_flush_truncate_log(osb);
if (status < 0)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 5dcc5f5a842e..56ce3704d66f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -499,158 +499,6 @@ bail:
return status;
}
-/*
- * TODO: Make this into a generic get_blocks function.
- *
- * From do_direct_io in direct-io.c:
- * "So what we do is to permit the ->get_blocks function to populate
- * bh.b_size with the size of IO which is permitted at this offset and
- * this i_blkbits."
- *
- * This function is called directly from get_more_blocks in direct-io.c.
- *
- * called like this: dio->get_blocks(dio->inode, fs_startblk,
- * fs_count, map_bh, dio->rw == WRITE);
- */
-static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int ret;
- u32 cpos = 0;
- int alloc_locked = 0;
- u64 p_blkno, inode_blocks, contig_blocks;
- unsigned int ext_flags;
- unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
- unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
- unsigned long len = bh_result->b_size;
- unsigned int clusters_to_alloc = 0, contig_clusters = 0;
-
- cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
-
- /* This function won't even be called if the request isn't all
- * nicely aligned and of the right size, so there's no need
- * for us to check any of that. */
-
- inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- /* This figures out the size of the next contiguous block, and
- * our logical offset */
- ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
- &contig_blocks, &ext_flags);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- if (ret) {
- mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
- (unsigned long long)iblock);
- ret = -EIO;
- goto bail;
- }
-
- /* We should already CoW the refcounted extent in case of create. */
- BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
-
- /* allocate blocks if no p_blkno is found, and create == 1 */
- if (!p_blkno && create) {
- ret = ocfs2_inode_lock(inode, NULL, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto bail;
- }
-
- alloc_locked = 1;
-
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
-
- /* fill hole, allocate blocks can't be larger than the size
- * of the hole */
- clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
- contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
- contig_blocks);
- if (clusters_to_alloc > contig_clusters)
- clusters_to_alloc = contig_clusters;
-
- /* allocate extent and insert them into the extent tree */
- ret = ocfs2_extend_allocation(inode, cpos,
- clusters_to_alloc, 0);
- if (ret < 0) {
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- mlog_errno(ret);
- goto bail;
- }
-
- ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
- &contig_blocks, &ext_flags);
- if (ret < 0) {
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
- (unsigned long long)iblock);
- ret = -EIO;
- goto bail;
- }
- set_buffer_new(bh_result);
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- }
-
- /*
- * get_more_blocks() expects us to describe a hole by clearing
- * the mapped bit on bh_result().
- *
- * Consider an unwritten extent as a hole.
- */
- if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
- map_bh(bh_result, inode->i_sb, p_blkno);
- else
- clear_buffer_mapped(bh_result);
-
- /* make sure we don't map more than max_blocks blocks here as
- that's all the kernel will handle at this point. */
- if (max_blocks < contig_blocks)
- contig_blocks = max_blocks;
- bh_result->b_size = contig_blocks << blocksize_bits;
-bail:
- if (alloc_locked)
- ocfs2_inode_unlock(inode, 1);
- return ret;
-}
-
-/*
- * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
- * particularly interested in the aio/dio case. We use the rw_lock DLM lock
- * to protect io on one node from truncation on another.
- */
-static int ocfs2_dio_end_io(struct kiocb *iocb,
- loff_t offset,
- ssize_t bytes,
- void *private)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- int level;
-
- if (bytes <= 0)
- return 0;
-
- /* this io's submitter should not have unlocked this before we could */
- BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
-
- if (ocfs2_iocb_is_unaligned_aio(iocb)) {
- ocfs2_iocb_clear_unaligned_aio(iocb);
-
- mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
- }
-
- /* Let rw unlock to be done later to protect append direct io write */
- if (offset + bytes <= i_size_read(inode)) {
- ocfs2_iocb_clear_rw_locked(iocb);
-
- level = ocfs2_iocb_rw_locked_level(iocb);
- ocfs2_rw_unlock(inode, level);
- }
-
- return 0;
-}
-
static int ocfs2_releasepage(struct page *page, gfp_t wait)
{
if (!page_has_buffers(page))
@@ -658,362 +506,6 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
-static int ocfs2_is_overwrite(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset)
-{
- int ret = 0;
- u32 v_cpos = 0;
- u32 p_cpos = 0;
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
-
- v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
- &num_clusters, &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
- return 1;
-
- return 0;
-}
-
-static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset,
- u64 zero_len, int cluster_align)
-{
- u32 p_cpos = 0;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
- int ret = 0;
-
- if (offset <= i_size_read(inode) || cluster_align)
- return 0;
-
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
- &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
- u64 s = i_size_read(inode);
- sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) +
- (do_div(s, osb->s_clustersize) >> 9);
-
- ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
- zero_len >> 9, GFP_NOFS, false);
- if (ret < 0)
- mlog_errno(ret);
- }
-
- return ret;
-}
-
-static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset)
-{
- u64 zero_start, zero_len, total_zero_len;
- u32 p_cpos = 0, clusters_to_add;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
- u32 size_div, offset_div;
- int ret = 0;
-
- {
- u64 o = offset;
- u64 s = i_size_read(inode);
-
- offset_div = do_div(o, osb->s_clustersize);
- size_div = do_div(s, osb->s_clustersize);
- }
-
- if (offset <= i_size_read(inode))
- return 0;
-
- clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
- ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
- total_zero_len = offset - i_size_read(inode);
- if (clusters_to_add)
- total_zero_len -= offset_div;
-
- /* Allocate clusters to fill out holes, and this is only needed
- * when we add more than one clusters. Otherwise the cluster will
- * be allocated during direct IO */
- if (clusters_to_add > 1) {
- ret = ocfs2_extend_allocation(inode,
- OCFS2_I(inode)->ip_clusters,
- clusters_to_add - 1, 0);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- while (total_zero_len) {
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
- &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
- size_div;
- zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
- size_div;
- zero_len = min(total_zero_len, zero_len);
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = blkdev_issue_zeroout(osb->sb->s_bdev,
- zero_start >> 9, zero_len >> 9,
- GFP_NOFS, false);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- total_zero_len -= zero_len;
- v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
-
- /* Only at first iteration can be cluster not aligned.
- * So set size_div to 0 for the rest */
- size_div = 0;
- }
-
-out:
- return ret;
-}
-
-static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
- struct iov_iter *iter,
- loff_t offset)
-{
- ssize_t ret = 0;
- ssize_t written = 0;
- bool orphaned = false;
- int is_overwrite = 0;
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file)->i_mapping->host;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct buffer_head *di_bh = NULL;
- size_t count = iter->count;
- journal_t *journal = osb->journal->j_journal;
- u64 zero_len_head, zero_len_tail;
- int cluster_align_head, cluster_align_tail;
- loff_t final_size = offset + count;
- int append_write = offset >= i_size_read(inode) ? 1 : 0;
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
-
- {
- u64 o = offset;
- u64 s = i_size_read(inode);
-
- zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
- cluster_align_head = !zero_len_head;
-
- zero_len_tail = osb->s_clustersize -
- do_div(s, osb->s_clustersize);
- if ((offset - i_size_read(inode)) < zero_len_tail)
- zero_len_tail = offset - i_size_read(inode);
- cluster_align_tail = !zero_len_tail;
- }
-
- /*
- * when final_size > inode->i_size, inode->i_size will be
- * updated after direct write, so add the inode to orphan
- * dir first.
- */
- if (final_size > i_size_read(inode)) {
- ret = ocfs2_add_inode_to_orphan(osb, inode);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- orphaned = true;
- }
-
- if (append_write) {
- ret = ocfs2_inode_lock(inode, NULL, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- /* zeroing out the previously allocated cluster tail
- * that but not zeroed */
- if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
- zero_len_tail, cluster_align_tail);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
- } else {
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
- offset);
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- }
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_inode_unlock(inode, 1);
- goto clean_orphan;
- }
-
- is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
- if (is_overwrite < 0) {
- mlog_errno(is_overwrite);
- ret = is_overwrite;
- ocfs2_inode_unlock(inode, 1);
- goto clean_orphan;
- }
-
- ocfs2_inode_unlock(inode, 1);
- }
-
- written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
- offset, ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io, NULL, 0);
- /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
- if ((written < 0) && (written != -EIOCBQUEUED)) {
- loff_t i_size = i_size_read(inode);
-
- if (offset + count > i_size) {
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- if (i_size == i_size_read(inode)) {
- ret = ocfs2_truncate_file(inode, di_bh,
- i_size);
- if (ret < 0) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
- goto clean_orphan;
- }
- }
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
-
- ret = jbd2_journal_force_commit(journal);
- if (ret < 0)
- mlog_errno(ret);
- }
- } else if (written > 0 && append_write && !is_overwrite &&
- !cluster_align_head) {
- /* zeroing out the allocated cluster head */
- u32 p_cpos = 0;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
-
- ret = ocfs2_inode_lock(inode, NULL, 0);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
- &num_clusters, &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_inode_unlock(inode, 0);
- goto clean_orphan;
- }
-
- BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
-
- ret = blkdev_issue_zeroout(osb->sb->s_bdev,
- (u64)p_cpos << (osb->s_clustersize_bits - 9),
- zero_len_head >> 9, GFP_NOFS, false);
- if (ret < 0)
- mlog_errno(ret);
-
- ocfs2_inode_unlock(inode, 0);
- }
-
-clean_orphan:
- if (orphaned) {
- int tmp_ret;
- int update_isize = written > 0 ? 1 : 0;
- loff_t end = update_isize ? offset + written : 0;
-
- tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(ret);
- goto out;
- }
-
- tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
- update_isize, end);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(ret);
- brelse(di_bh);
- goto out;
- }
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
-
- tmp_ret = jbd2_journal_force_commit(journal);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(tmp_ret);
- }
- }
-
-out:
- if (ret >= 0)
- ret = written;
- return ret;
-}
-
-static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file)->i_mapping->host;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- int full_coherency = !(osb->s_mount_opt &
- OCFS2_MOUNT_COHERENCY_BUFFERED);
-
- /*
- * Fallback to buffered I/O if we see an inode without
- * extents.
- */
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- return 0;
-
- /* Fallback to buffered I/O if we are appending and
- * concurrent O_DIRECT writes are allowed.
- */
- if (i_size_read(inode) <= offset && !full_coherency)
- return 0;
-
- if (iov_iter_rw(iter) == READ)
- return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
- iter, offset,
- ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io, NULL, 0);
- else
- return ocfs2_direct_IO_write(iocb, iter, offset);
-}
-
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
u32 cpos,
unsigned int *start,
@@ -1200,6 +692,13 @@ next_bh:
#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+struct ocfs2_unwritten_extent {
+ struct list_head ue_node;
+ struct list_head ue_ip_node;
+ u32 ue_cpos;
+ u32 ue_phys;
+};
+
/*
* Describe the state of a single cluster to be written to.
*/
@@ -1211,7 +710,7 @@ struct ocfs2_write_cluster_desc {
* filled.
*/
unsigned c_new;
- unsigned c_unwritten;
+ unsigned c_clear_unwritten;
unsigned c_needs_zero;
};
@@ -1223,6 +722,9 @@ struct ocfs2_write_ctxt {
/* First cluster allocated in a nonsparse extend */
u32 w_first_new_cpos;
+ /* Type of caller. Must be one of buffer, mmap, direct. */
+ ocfs2_write_type_t w_type;
+
struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
/*
@@ -1271,6 +773,8 @@ struct ocfs2_write_ctxt {
struct buffer_head *w_di_bh;
struct ocfs2_cached_dealloc_ctxt w_dealloc;
+
+ struct list_head w_unwritten_list;
};
void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1309,8 +813,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
}
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+static void ocfs2_free_unwritten_list(struct inode *inode,
+ struct list_head *head)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
+
+ list_for_each_entry_safe(ue, tmp, head, ue_node) {
+ list_del(&ue->ue_node);
+ spin_lock(&oi->ip_lock);
+ list_del(&ue->ue_ip_node);
+ spin_unlock(&oi->ip_lock);
+ kfree(ue);
+ }
+}
+
+static void ocfs2_free_write_ctxt(struct inode *inode,
+ struct ocfs2_write_ctxt *wc)
{
+ ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
ocfs2_unlock_pages(wc);
brelse(wc->w_di_bh);
kfree(wc);
@@ -1318,7 +839,8 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
struct ocfs2_super *osb, loff_t pos,
- unsigned len, struct buffer_head *di_bh)
+ unsigned len, ocfs2_write_type_t type,
+ struct buffer_head *di_bh)
{
u32 cend;
struct ocfs2_write_ctxt *wc;
@@ -1333,6 +855,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
wc->w_clen = cend - wc->w_cpos + 1;
get_bh(di_bh);
wc->w_di_bh = di_bh;
+ wc->w_type = type;
if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
wc->w_large_pages = 1;
@@ -1340,6 +863,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
wc->w_large_pages = 0;
ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+ INIT_LIST_HEAD(&wc->w_unwritten_list);
*wcp = wc;
@@ -1400,12 +924,13 @@ static void ocfs2_write_failure(struct inode *inode,
to = user_pos + user_len;
struct page *tmppage;
- ocfs2_zero_new_buffers(wc->w_target_page, from, to);
+ if (wc->w_target_page)
+ ocfs2_zero_new_buffers(wc->w_target_page, from, to);
for(i = 0; i < wc->w_num_pages; i++) {
tmppage = wc->w_pages[i];
- if (page_has_buffers(tmppage)) {
+ if (tmppage && page_has_buffers(tmppage)) {
if (ocfs2_should_order_data(inode))
ocfs2_jbd2_file_inode(wc->w_handle, inode);
@@ -1535,11 +1060,13 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
wc->w_num_pages = 1;
start = target_index;
}
+ end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT;
for(i = 0; i < wc->w_num_pages; i++) {
index = start + i;
- if (index == target_index && mmap_page) {
+ if (index >= target_index && index <= end_index &&
+ wc->w_type == OCFS2_WRITE_MMAP) {
/*
* ocfs2_pagemkwrite() is a little different
* and wants us to directly use the page
@@ -1558,6 +1085,11 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
page_cache_get(mmap_page);
wc->w_pages[i] = mmap_page;
wc->w_target_locked = true;
+ } else if (index >= target_index && index <= end_index &&
+ wc->w_type == OCFS2_WRITE_DIRECT) {
+ /* Direct write has no mapping page. */
+ wc->w_pages[i] = NULL;
+ continue;
} else {
wc->w_pages[i] = find_or_create_page(mapping, index,
GFP_NOFS);
@@ -1582,19 +1114,20 @@ out:
* Prepare a single cluster for write one cluster into the file.
*/
static int ocfs2_write_cluster(struct address_space *mapping,
- u32 phys, unsigned int unwritten,
+ u32 *phys, unsigned int new,
+ unsigned int clear_unwritten,
unsigned int should_zero,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_write_ctxt *wc, u32 cpos,
loff_t user_pos, unsigned user_len)
{
- int ret, i, new;
- u64 v_blkno, p_blkno;
+ int ret, i;
+ u64 p_blkno;
struct inode *inode = mapping->host;
struct ocfs2_extent_tree et;
+ int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
- new = phys == 0 ? 1 : 0;
if (new) {
u32 tmp_pos;
@@ -1604,9 +1137,9 @@ static int ocfs2_write_cluster(struct address_space *mapping,
*/
tmp_pos = cpos;
ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
- &tmp_pos, 1, 0, wc->w_di_bh,
- wc->w_handle, data_ac,
- meta_ac, NULL);
+ &tmp_pos, 1, !clear_unwritten,
+ wc->w_di_bh, wc->w_handle,
+ data_ac, meta_ac, NULL);
/*
* This shouldn't happen because we must have already
* calculated the correct meta data allocation required. The
@@ -1623,11 +1156,11 @@ static int ocfs2_write_cluster(struct address_space *mapping,
mlog_errno(ret);
goto out;
}
- } else if (unwritten) {
+ } else if (clear_unwritten) {
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
wc->w_di_bh);
ret = ocfs2_mark_extent_written(inode, &et,
- wc->w_handle, cpos, 1, phys,
+ wc->w_handle, cpos, 1, *phys,
meta_ac, &wc->w_dealloc);
if (ret < 0) {
mlog_errno(ret);
@@ -1635,30 +1168,33 @@ static int ocfs2_write_cluster(struct address_space *mapping,
}
}
- if (should_zero)
- v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
- else
- v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
-
/*
* The only reason this should fail is due to an inability to
* find the extent added.
*/
- ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
- NULL);
+ ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
if (ret < 0) {
mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
- "at logical block %llu",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)v_blkno);
+ "at logical cluster %u",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
goto out;
}
- BUG_ON(p_blkno == 0);
+ BUG_ON(*phys == 0);
+
+ p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
+ if (!should_zero)
+ p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
for(i = 0; i < wc->w_num_pages; i++) {
int tmpret;
+ /* This is the direct io target page. */
+ if (wc->w_pages[i] == NULL) {
+ p_blkno++;
+ continue;
+ }
+
tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
wc->w_pages[i], cpos,
user_pos, user_len,
@@ -1705,8 +1241,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
if ((cluster_off + local_len) > osb->s_clustersize)
local_len = osb->s_clustersize - cluster_off;
- ret = ocfs2_write_cluster(mapping, desc->c_phys,
- desc->c_unwritten,
+ ret = ocfs2_write_cluster(mapping, &desc->c_phys,
+ desc->c_new,
+ desc->c_clear_unwritten,
desc->c_needs_zero,
data_ac, meta_ac,
wc, desc->c_cpos, pos, local_len);
@@ -1777,6 +1314,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
}
/*
+ * Check if this extent is marked UNWRITTEN by direct io. If so, we need not to
+ * do the zero work. And should not to clear UNWRITTEN since it will be cleared
+ * by the direct io procedure.
+ * If this is a new extent that allocated by direct io, we should mark it in
+ * the ip_unwritten_list.
+ */
+static int ocfs2_unwritten_check(struct inode *inode,
+ struct ocfs2_write_ctxt *wc,
+ struct ocfs2_write_cluster_desc *desc)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
+ int ret = 0;
+
+ if (!desc->c_needs_zero)
+ return 0;
+
+retry:
+ spin_lock(&oi->ip_lock);
+ /* Needs not to zero no metter buffer or direct. The one who is zero
+ * the cluster is doing zero. And he will clear unwritten after all
+ * cluster io finished. */
+ list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
+ if (desc->c_cpos == ue->ue_cpos) {
+ BUG_ON(desc->c_new);
+ desc->c_needs_zero = 0;
+ desc->c_clear_unwritten = 0;
+ goto unlock;
+ }
+ }
+
+ if (wc->w_type != OCFS2_WRITE_DIRECT)
+ goto unlock;
+
+ if (new == NULL) {
+ spin_unlock(&oi->ip_lock);
+ new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
+ GFP_NOFS);
+ if (new == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ goto retry;
+ }
+ /* This direct write will doing zero. */
+ new->ue_cpos = desc->c_cpos;
+ new->ue_phys = desc->c_phys;
+ desc->c_clear_unwritten = 0;
+ list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
+ list_add_tail(&new->ue_node, &wc->w_unwritten_list);
+ new = NULL;
+unlock:
+ spin_unlock(&oi->ip_lock);
+out:
+ if (new)
+ kfree(new);
+ return ret;
+}
+
+/*
* Populate each single-cluster write descriptor in the write context
* with information about the i/o to be done.
*
@@ -1851,14 +1448,21 @@ static int ocfs2_populate_write_desc(struct inode *inode,
if (phys == 0) {
desc->c_new = 1;
desc->c_needs_zero = 1;
+ desc->c_clear_unwritten = 1;
*clusters_to_alloc = *clusters_to_alloc + 1;
}
if (ext_flags & OCFS2_EXT_UNWRITTEN) {
- desc->c_unwritten = 1;
+ desc->c_clear_unwritten = 1;
desc->c_needs_zero = 1;
}
+ ret = ocfs2_unwritten_check(inode, wc, desc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
num_clusters--;
}
@@ -2021,8 +1625,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode,
if (ret)
mlog_errno(ret);
- wc->w_first_new_cpos =
- ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+ /* There is no wc if this is call from direct. */
+ if (wc)
+ wc->w_first_new_cpos =
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
return ret;
}
@@ -2076,9 +1682,8 @@ out:
return ret;
}
-int ocfs2_write_begin_nolock(struct file *filp,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, ocfs2_write_type_t type,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page)
{
@@ -2095,7 +1700,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
int try_free = 1, ret1;
try_again:
- ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
+ ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
if (ret) {
mlog_errno(ret);
return ret;
@@ -2114,14 +1719,17 @@ try_again:
}
}
- if (ocfs2_sparse_alloc(osb))
- ret = ocfs2_zero_tail(inode, di_bh, pos);
- else
- ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
- wc);
- if (ret) {
- mlog_errno(ret);
- goto out;
+ /* Direct io change i_size late, should not zero tail here. */
+ if (type != OCFS2_WRITE_DIRECT) {
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+ len, wc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
}
ret = ocfs2_check_range_for_refcount(inode, pos, len);
@@ -2152,7 +1760,7 @@ try_again:
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(long long)i_size_read(inode),
le32_to_cpu(di->i_clusters),
- pos, len, flags, mmap_page,
+ pos, len, type, mmap_page,
clusters_to_alloc, extents_to_split);
/*
@@ -2182,17 +1790,17 @@ try_again:
credits = ocfs2_calc_extend_credits(inode->i_sb,
&di->id2.i_list);
-
- }
+ } else if (type == OCFS2_WRITE_DIRECT)
+ /* direct write needs not to start trans if no extents alloc. */
+ goto success;
/*
* We have to zero sparse allocated clusters, unwritten extent clusters,
* and non-sparse clusters we just extended. For non-sparse writes,
* we know zeros will only be needed in the first and/or last cluster.
*/
- if (clusters_to_alloc || extents_to_split ||
- (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
- wc->w_desc[wc->w_clen - 1].c_needs_zero)))
+ if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+ wc->w_desc[wc->w_clen - 1].c_needs_zero))
cluster_of_pages = 1;
else
cluster_of_pages = 0;
@@ -2259,7 +1867,8 @@ try_again:
ocfs2_free_alloc_context(meta_ac);
success:
- *pagep = wc->w_target_page;
+ if (pagep)
+ *pagep = wc->w_target_page;
*fsdata = wc;
return 0;
out_quota:
@@ -2270,7 +1879,7 @@ out_commit:
ocfs2_commit_trans(osb, handle);
out:
- ocfs2_free_write_ctxt(wc);
+ ocfs2_free_write_ctxt(inode, wc);
if (data_ac) {
ocfs2_free_alloc_context(data_ac);
@@ -2322,8 +1931,8 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
- fsdata, di_bh, NULL);
+ ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
+ pagep, fsdata, di_bh, NULL);
if (ret) {
mlog_errno(ret);
goto out_fail;
@@ -2380,12 +1989,16 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
handle_t *handle = wc->w_handle;
struct page *tmppage;
- ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- copied = ret;
- mlog_errno(ret);
- goto out;
+ BUG_ON(!list_empty(&wc->w_unwritten_list));
+
+ if (handle) {
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ copied = ret;
+ mlog_errno(ret);
+ goto out;
+ }
}
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
@@ -2393,18 +2006,23 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
goto out_write_size;
}
- if (unlikely(copied < len)) {
+ if (unlikely(copied < len) && wc->w_target_page) {
if (!PageUptodate(wc->w_target_page))
copied = 0;
ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
start+len);
}
- flush_dcache_page(wc->w_target_page);
+ if (wc->w_target_page)
+ flush_dcache_page(wc->w_target_page);
for(i = 0; i < wc->w_num_pages; i++) {
tmppage = wc->w_pages[i];
+ /* This is the direct io target page. */
+ if (tmppage == NULL)
+ continue;
+
if (tmppage == wc->w_target_page) {
from = wc->w_target_from;
to = wc->w_target_to;
@@ -2423,25 +2041,29 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
}
if (page_has_buffers(tmppage)) {
- if (ocfs2_should_order_data(inode))
- ocfs2_jbd2_file_inode(wc->w_handle, inode);
+ if (handle && ocfs2_should_order_data(inode))
+ ocfs2_jbd2_file_inode(handle, inode);
block_commit_write(tmppage, from, to);
}
}
out_write_size:
- pos += copied;
- if (pos > i_size_read(inode)) {
- i_size_write(inode, pos);
- mark_inode_dirty(inode);
- }
- inode->i_blocks = ocfs2_inode_sector_count(inode);
- di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
- di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- ocfs2_update_inode_fsync_trans(handle, inode, 1);
- ocfs2_journal_dirty(handle, wc->w_di_bh);
+ /* Direct io do not update i_size here. */
+ if (wc->w_type != OCFS2_WRITE_DIRECT) {
+ pos += copied;
+ if (pos > i_size_read(inode)) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ di->i_size = cpu_to_le64((u64)i_size_read(inode));
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+ di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ }
+ if (handle)
+ ocfs2_journal_dirty(handle, wc->w_di_bh);
out:
/* unlock pages before dealloc since it needs acquiring j_trans_barrier
@@ -2451,7 +2073,8 @@ out:
*/
ocfs2_unlock_pages(wc);
- ocfs2_commit_trans(osb, handle);
+ if (handle)
+ ocfs2_commit_trans(osb, handle);
ocfs2_run_deallocs(osb, &wc->w_dealloc);
@@ -2476,6 +2099,361 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
return ret;
}
+struct ocfs2_dio_write_ctxt {
+ struct list_head dw_zero_list;
+ unsigned dw_zero_count;
+ int dw_orphaned;
+ pid_t dw_writer_pid;
+};
+
+static struct ocfs2_dio_write_ctxt *
+ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
+{
+ struct ocfs2_dio_write_ctxt *dwc = NULL;
+
+ if (bh->b_private)
+ return bh->b_private;
+
+ dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
+ if (dwc == NULL)
+ return NULL;
+ INIT_LIST_HEAD(&dwc->dw_zero_list);
+ dwc->dw_zero_count = 0;
+ dwc->dw_orphaned = 0;
+ dwc->dw_writer_pid = task_pid_nr(current);
+ bh->b_private = dwc;
+ *alloc = 1;
+
+ return dwc;
+}
+
+static void ocfs2_dio_free_write_ctx(struct inode *inode,
+ struct ocfs2_dio_write_ctxt *dwc)
+{
+ ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
+ kfree(dwc);
+}
+
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ * "So what we do is to permit the ->get_blocks function to populate
+ * bh.b_size with the size of IO which is permitted at this offset and
+ * this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ * fs_count, map_bh, dio->rw == WRITE);
+ */
+static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_write_ctxt *wc;
+ struct ocfs2_write_cluster_desc *desc = NULL;
+ struct ocfs2_dio_write_ctxt *dwc = NULL;
+ struct buffer_head *di_bh = NULL;
+ u64 p_blkno;
+ loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+ unsigned len, total_len = bh_result->b_size;
+ int ret = 0, first_get_block = 0;
+
+ len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
+ len = min(total_len, len);
+
+ mlog(0, "get block of %lu at %llu:%u req %u\n",
+ inode->i_ino, pos, len, total_len);
+
+ /*
+ * Because we need to change file size in ocfs2_dio_end_io_write(), or
+ * we may need to add it to orphan dir. So can not fall to fast path
+ * while file size will be changed.
+ */
+ if (pos + total_len <= i_size_read(inode)) {
+ down_read(&oi->ip_alloc_sem);
+ /* This is the fast path for re-write. */
+ ret = ocfs2_get_block(inode, iblock, bh_result, create);
+
+ up_read(&oi->ip_alloc_sem);
+
+ if (buffer_mapped(bh_result) &&
+ !buffer_new(bh_result) &&
+ ret == 0)
+ goto out;
+
+ /* Clear state set by ocfs2_get_block. */
+ bh_result->b_state = 0;
+ }
+
+ dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
+ if (unlikely(dwc == NULL)) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
+ !dwc->dw_orphaned) {
+ /*
+ * when we are going to alloc extents beyond file size, add the
+ * inode to orphan dir, so we can recall those spaces when
+ * system crashed during write.
+ */
+ ret = ocfs2_add_inode_to_orphan(osb, inode);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ dwc->dw_orphaned = 1;
+ }
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ down_write(&oi->ip_alloc_sem);
+
+ if (first_get_block) {
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+ total_len, NULL);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ }
+
+ ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
+ OCFS2_WRITE_DIRECT, NULL,
+ (void **)&wc, di_bh, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+
+ desc = &wc->w_desc[0];
+
+ p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
+ BUG_ON(p_blkno == 0);
+ p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
+
+ map_bh(bh_result, inode->i_sb, p_blkno);
+ bh_result->b_size = len;
+ if (desc->c_needs_zero)
+ set_buffer_new(bh_result);
+
+ /* May sleep in end_io. It should not happen in a irq context. So defer
+ * it to dio work queue. */
+ set_buffer_defer_completion(bh_result);
+
+ if (!list_empty(&wc->w_unwritten_list)) {
+ struct ocfs2_unwritten_extent *ue = NULL;
+
+ ue = list_first_entry(&wc->w_unwritten_list,
+ struct ocfs2_unwritten_extent,
+ ue_node);
+ BUG_ON(ue->ue_cpos != desc->c_cpos);
+ /* The physical address may be 0, fill it. */
+ ue->ue_phys = desc->c_phys;
+
+ list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
+ dwc->dw_zero_count++;
+ }
+
+ ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
+ BUG_ON(ret != len);
+ ret = 0;
+unlock:
+ up_write(&oi->ip_alloc_sem);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+out:
+ if (ret < 0)
+ ret = -EIO;
+ return ret;
+}
+
+static void ocfs2_dio_end_io_write(struct inode *inode,
+ struct ocfs2_dio_write_ctxt *dwc,
+ loff_t offset,
+ ssize_t bytes)
+{
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ handle_t *handle = NULL;
+ loff_t end = offset + bytes;
+ int ret = 0, credits = 0, locked = 0;
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ /* We do clear unwritten, delete orphan, change i_size here. If neither
+ * of these happen, we can skip all this. */
+ if (list_empty(&dwc->dw_zero_list) &&
+ end <= i_size_read(inode) &&
+ !dwc->dw_orphaned)
+ goto out;
+
+ /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we
+ * are in that context. */
+ if (dwc->dw_writer_pid != task_pid_nr(current)) {
+ mutex_lock(&inode->i_mutex);
+ locked = 1;
+ }
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ down_write(&oi->ip_alloc_sem);
+
+ /* Delete orphan before acquire i_mutex. */
+ if (dwc->dw_orphaned) {
+ BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
+
+ end = end > i_size_read(inode) ? end : 0;
+
+ ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
+ !!end, end);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+
+ di = (struct ocfs2_dinode *)di_bh;
+
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+
+ ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
+ &data_ac, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+
+ credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto unlock;
+ }
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto commit;
+ }
+
+ list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+ ret = ocfs2_mark_extent_written(inode, &et, handle,
+ ue->ue_cpos, 1,
+ ue->ue_phys,
+ meta_ac, &dealloc);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ if (end > i_size_read(inode)) {
+ ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+commit:
+ ocfs2_commit_trans(osb, handle);
+unlock:
+ up_write(&oi->ip_alloc_sem);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+out:
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ ocfs2_run_deallocs(osb, &dealloc);
+ if (locked)
+ mutex_unlock(&inode->i_mutex);
+ ocfs2_dio_free_write_ctx(inode, dwc);
+}
+
+/*
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
+ * particularly interested in the aio/dio case. We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
+ */
+static int ocfs2_dio_end_io(struct kiocb *iocb,
+ loff_t offset,
+ ssize_t bytes,
+ void *private)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ int level;
+
+ if (bytes <= 0)
+ return 0;
+
+ /* this io's submitter should not have unlocked this before we could */
+ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+
+ if (private)
+ ocfs2_dio_end_io_write(inode, private, offset, bytes);
+
+ ocfs2_iocb_clear_rw_locked(iocb);
+
+ level = ocfs2_iocb_rw_locked_level(iocb);
+ ocfs2_rw_unlock(inode, level);
+
+ return 0;
+}
+
+static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file)->i_mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ loff_t end = offset + iter->count;
+ get_block_t *get_block;
+
+ /*
+ * Fallback to buffered I/O if we see an inode without
+ * extents.
+ */
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return 0;
+
+ /* Fallback to buffered I/O if we do not support append dio. */
+ if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
+ return 0;
+
+ if (iov_iter_rw(iter) == READ)
+ get_block = ocfs2_get_block;
+ else
+ get_block = ocfs2_dio_get_block;
+
+ return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+ iter, offset, get_block,
+ ocfs2_dio_end_io, NULL, 0);
+}
+
const struct address_space_operations ocfs2_aops = {
.readpage = ocfs2_readpage,
.readpages = ocfs2_readpages,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 24e496d6bdcd..b1c9f28a57b1 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -47,9 +47,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
-int ocfs2_write_begin_nolock(struct file *filp,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+typedef enum {
+ OCFS2_WRITE_BUFFER = 0,
+ OCFS2_WRITE_DIRECT,
+ OCFS2_WRITE_MMAP,
+} ocfs2_write_type_t;
+
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, ocfs2_write_type_t type,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page);
@@ -79,7 +84,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
enum ocfs2_iocb_lock_bits {
OCFS2_IOCB_RW_LOCK = 0,
OCFS2_IOCB_RW_LOCK_LEVEL,
- OCFS2_IOCB_UNALIGNED_IO,
OCFS2_IOCB_NUM_LOCKS
};
@@ -88,11 +92,4 @@ enum ocfs2_iocb_lock_bits {
#define ocfs2_iocb_rw_locked_level(iocb) \
test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_set_unaligned_aio(iocb) \
- set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_clear_unaligned_aio(iocb) \
- clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_is_unaligned_aio(iocb) \
- test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
-
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a76b9ea7722e..1d3bf9e2c86f 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -272,10 +272,21 @@ struct o2hb_region {
struct delayed_work hr_write_timeout_work;
unsigned long hr_last_timeout_start;
+ /* negotiate timer, used to negotiate extending hb timeout. */
+ struct delayed_work hr_nego_timeout_work;
+ unsigned long hr_nego_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
/* Used during o2hb_check_slot to hold a copy of the block
* being checked because we temporarily have to zero out the
* crc field. */
struct o2hb_disk_heartbeat_block *hr_tmp_block;
+
+ /* Message key for negotiate timeout message. */
+ unsigned int hr_key;
+ struct list_head hr_handler_list;
+
+ /* last hb status, 0 for success, other value for error. */
+ int hr_last_hb_status;
};
struct o2hb_bio_wait_ctxt {
@@ -284,10 +295,20 @@ struct o2hb_bio_wait_ctxt {
int wc_error;
};
+#define O2HB_NEGO_TIMEOUT_MS (O2HB_MAX_WRITE_TIMEOUT_MS/2)
+
+enum {
+ O2HB_NEGO_TIMEOUT_MSG = 1,
+ O2HB_NEGO_APPROVE_MSG = 2,
+};
+
+struct o2hb_nego_msg {
+ u8 node_num;
+};
+
static void o2hb_write_timeout(struct work_struct *work)
{
int failed, quorum;
- unsigned long flags;
struct o2hb_region *reg =
container_of(work, struct o2hb_region,
hr_write_timeout_work.work);
@@ -297,14 +318,14 @@ static void o2hb_write_timeout(struct work_struct *work)
jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
if (o2hb_global_heartbeat_active()) {
- spin_lock_irqsave(&o2hb_live_lock, flags);
+ spin_lock(&o2hb_live_lock);
if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
failed = bitmap_weight(o2hb_failed_region_bitmap,
O2NM_MAX_REGIONS);
quorum = bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS);
- spin_unlock_irqrestore(&o2hb_live_lock, flags);
+ spin_unlock(&o2hb_live_lock);
mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
quorum, failed);
@@ -320,7 +341,7 @@ static void o2hb_write_timeout(struct work_struct *work)
o2quo_disk_timeout();
}
-static void o2hb_arm_write_timeout(struct o2hb_region *reg)
+static void o2hb_arm_timeout(struct o2hb_region *reg)
{
/* Arm writeout only after thread reaches steady state */
if (atomic_read(&reg->hr_steady_iterations) != 0)
@@ -335,14 +356,133 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
spin_unlock(&o2hb_live_lock);
}
cancel_delayed_work(&reg->hr_write_timeout_work);
- reg->hr_last_timeout_start = jiffies;
schedule_delayed_work(&reg->hr_write_timeout_work,
msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
+
+ cancel_delayed_work(&reg->hr_nego_timeout_work);
+ /* negotiate timeout must be less than write timeout. */
+ schedule_delayed_work(&reg->hr_nego_timeout_work,
+ msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS));
+ memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap));
}
-static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
+static void o2hb_disarm_timeout(struct o2hb_region *reg)
{
cancel_delayed_work_sync(&reg->hr_write_timeout_work);
+ cancel_delayed_work_sync(&reg->hr_nego_timeout_work);
+}
+
+static int o2hb_send_nego_msg(int key, int type, u8 target)
+{
+ struct o2hb_nego_msg msg;
+ int status, ret;
+
+ msg.node_num = o2nm_this_node();
+again:
+ ret = o2net_send_message(type, key, &msg, sizeof(msg),
+ target, &status);
+
+ if (ret == -EAGAIN || ret == -ENOMEM) {
+ msleep(100);
+ goto again;
+ }
+
+ return ret;
+}
+
+static void o2hb_nego_timeout(struct work_struct *work)
+{
+ struct o2hb_region *reg =
+ container_of(work, struct o2hb_region,
+ hr_nego_timeout_work.work);
+ unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ int master_node, i, ret;
+
+ /* don't negotiate timeout if last hb failed since it is very
+ * possible io failed. Should let write timeout fence self.
+ */
+ if (reg->hr_last_hb_status)
+ return;
+
+ o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ /* lowest node as master node to make negotiate decision. */
+ master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0);
+
+ if (master_node == o2nm_this_node()) {
+ if (!test_bit(master_node, reg->hr_nego_node_bitmap)) {
+ printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s).\n",
+ o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000,
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+ set_bit(master_node, reg->hr_nego_node_bitmap);
+ }
+ if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap,
+ sizeof(reg->hr_nego_node_bitmap))) {
+ /* check negotiate bitmap every second to do timeout
+ * approve decision.
+ */
+ schedule_delayed_work(&reg->hr_nego_timeout_work,
+ msecs_to_jiffies(1000));
+
+ return;
+ }
+
+ printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%s) is down.\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+ /* approve negotiate timeout request. */
+ o2hb_arm_timeout(reg);
+
+ i = -1;
+ while ((i = find_next_bit(live_node_bitmap,
+ O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+ if (i == master_node)
+ continue;
+
+ mlog(ML_HEARTBEAT, "send NEGO_APPROVE msg to node %d\n", i);
+ ret = o2hb_send_nego_msg(reg->hr_key,
+ O2HB_NEGO_APPROVE_MSG, i);
+ if (ret)
+ mlog(ML_ERROR, "send NEGO_APPROVE msg to node %d fail %d\n",
+ i, ret);
+ }
+ } else {
+ /* negotiate timeout with master node. */
+ printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s), negotiate timeout with node %d.\n",
+ o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(&reg->hr_item),
+ reg->hr_dev_name, master_node);
+ ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
+ master_node);
+ if (ret)
+ mlog(ML_ERROR, "send NEGO_TIMEOUT msg to node %d fail %d\n",
+ master_node, ret);
+ }
+}
+
+static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
+{
+ struct o2hb_region *reg = (struct o2hb_region *)data;
+ struct o2hb_nego_msg *nego_msg;
+
+ nego_msg = (struct o2hb_nego_msg *)msg->buf;
+ printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%s).\n",
+ nego_msg->node_num, config_item_name(&reg->hr_item), reg->hr_dev_name);
+ if (nego_msg->node_num < O2NM_MAX_NODES)
+ set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap);
+ else
+ mlog(ML_ERROR, "got nego timeout message from bad node.\n");
+
+ return 0;
+}
+
+static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
+{
+ struct o2hb_region *reg = (struct o2hb_region *)data;
+
+ printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%s).\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+ o2hb_arm_timeout(reg);
+ return 0;
}
static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -1033,7 +1173,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
/* Skip disarming the timeout if own slot has stale/bad data */
if (own_slot_ok) {
o2hb_set_quorum_device(reg);
- o2hb_arm_write_timeout(reg);
+ o2hb_arm_timeout(reg);
+ reg->hr_last_timeout_start = jiffies;
}
bail:
@@ -1097,6 +1238,7 @@ static int o2hb_thread(void *data)
before_hb = ktime_get_real();
ret = o2hb_do_disk_heartbeat(reg);
+ reg->hr_last_hb_status = ret;
after_hb = ktime_get_real();
@@ -1115,7 +1257,7 @@ static int o2hb_thread(void *data)
}
}
- o2hb_disarm_write_timeout(reg);
+ o2hb_disarm_timeout(reg);
/* unclean stop is only used in very bad situation */
for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
@@ -1452,6 +1594,7 @@ static void o2hb_region_release(struct config_item *item)
list_del(&reg->hr_all_item);
spin_unlock(&o2hb_live_lock);
+ o2net_unregister_handler_list(&reg->hr_handler_list);
kfree(reg);
}
@@ -1764,6 +1907,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
}
INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout);
+ INIT_DELAYED_WORK(&reg->hr_nego_timeout_work, o2hb_nego_timeout);
/*
* A node is considered live after it has beat LIVE_THRESHOLD
@@ -1997,13 +2141,37 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+ /* this is the same way to generate msg key as dlm, for local heartbeat,
+ * name is also the same, so make initial crc value different to avoid
+ * message key conflict.
+ */
+ reg->hr_key = crc32_le(reg->hr_region_num + O2NM_MAX_REGIONS,
+ name, strlen(name));
+ INIT_LIST_HEAD(&reg->hr_handler_list);
+ ret = o2net_register_handler(O2HB_NEGO_TIMEOUT_MSG, reg->hr_key,
+ sizeof(struct o2hb_nego_msg),
+ o2hb_nego_timeout_handler,
+ reg, NULL, &reg->hr_handler_list);
+ if (ret)
+ goto free;
+
+ ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key,
+ sizeof(struct o2hb_nego_msg),
+ o2hb_nego_approve_handler,
+ reg, NULL, &reg->hr_handler_list);
+ if (ret)
+ goto free_handler;
+
ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
if (ret) {
config_item_put(&reg->hr_item);
- goto free;
+ goto free_handler;
}
return &reg->hr_item;
+
+free_handler:
+ o2net_unregister_handler_list(&reg->hr_handler_list);
free:
kfree(reg);
return ERR_PTR(ret);
@@ -2425,11 +2593,10 @@ EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
int o2hb_check_node_heartbeating_no_sem(u8 node_num)
{
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- unsigned long flags;
- spin_lock_irqsave(&o2hb_live_lock, flags);
+ spin_lock(&o2hb_live_lock);
o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
- spin_unlock_irqrestore(&o2hb_live_lock, flags);
+ spin_unlock(&o2hb_live_lock);
if (!test_bit(node_num, testing_map)) {
mlog(ML_HEARTBEAT,
"node (%u) does not have heartbeating enabled.\n",
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 68c607e63ff6..3b77862fc85d 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -451,6 +451,7 @@ enum {
DLM_QUERY_REGION = 519,
DLM_QUERY_NODEINFO = 520,
DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
+ DLM_DEREF_LOCKRES_DONE = 522,
};
struct dlm_reco_node_data
@@ -545,7 +546,7 @@ struct dlm_master_requery
* };
*
* from ../cluster/tcp.h
- * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
+ * O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
* (roughly 4080 bytes)
* and sizeof(dlm_migratable_lockres) = 112 bytes
* and sizeof(dlm_migratable_lock) = 16 bytes
@@ -586,7 +587,7 @@ struct dlm_migratable_lockres
/* from above, 128 bytes
* for some undetermined future use */
-#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \
+#define DLM_MIG_LOCKRES_RESERVED (O2NET_MAX_PAYLOAD_BYTES - \
DLM_MIG_LOCKRES_MAX_LEN)
struct dlm_create_lock
@@ -782,6 +783,20 @@ struct dlm_deref_lockres
u8 name[O2NM_MAX_NAME_LEN];
};
+enum {
+ DLM_DEREF_RESPONSE_DONE = 0,
+ DLM_DEREF_RESPONSE_INPROG = 1,
+};
+
+struct dlm_deref_lockres_done {
+ u32 pad1;
+ u16 pad2;
+ u8 node_idx;
+ u8 namelen;
+
+ u8 name[O2NM_MAX_NAME_LEN];
+};
+
static inline enum dlm_status
__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
{
@@ -968,6 +983,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
void dlm_assert_master_post_handler(int status, void *data, void *ret_data);
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
+int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index e36d63ff1783..cdeafb4e7ed6 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -212,6 +212,12 @@ grant:
if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
+ /*
+ * Move the lock to the tail because it may be the only lock which has
+ * an invalid lvb.
+ */
+ list_move_tail(&lock->list, &res->granted);
+
status = DLM_NORMAL;
*call_ast = 1;
goto unlock_exit;
@@ -262,6 +268,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
struct dlm_lock *lock, int flags, int type)
{
enum dlm_status status;
+ u8 old_owner = res->owner;
mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
@@ -287,6 +294,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
status = DLM_DENIED;
goto bail;
}
+
+ if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) {
+ mlog(0, "last convert request returned DLM_RECOVERING, but "
+ "owner has already queued and sent ast to me. res %.*s, "
+ "(cookie=%u:%llu, type=%d, conv=%d)\n",
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ lock->ml.type, lock->ml.convert_type);
+ status = DLM_NORMAL;
+ goto bail;
+ }
+
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* move lock to local convert queue */
/* do not alter lock refcount. switching lists. */
@@ -316,11 +336,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
lock->convert_pending = 0;
- /* if it failed, move it back to granted queue */
+ /* if it failed, move it back to granted queue.
+ * if master returns DLM_NORMAL and then down before sending ast,
+ * it may have already been moved to granted queue, reset to
+ * DLM_RECOVERING and retry convert */
if (status != DLM_NORMAL) {
if (status != DLM_NOTQUEUED)
dlm_error(status);
dlm_revert_pending_convert(res, lock);
+ } else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
+ (old_owner != res->owner)) {
+ mlog(0, "res %.*s is in recovering or has been recovered.\n",
+ res->lockname.len, res->lockname.name);
+ status = DLM_RECOVERING;
}
bail:
spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 2ee7fe747cea..c73c68efdf67 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -132,10 +132,13 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
* - Message DLM_QUERY_NODEINFO added to allow online node removes
* New in version 1.2:
* - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
+ * New in version 1.3:
+ * - Message DLM_DEREF_LOCKRES_DONE added to inform non-master that the
+ * refmap is cleared
*/
static const struct dlm_protocol_version dlm_protocol = {
.pv_major = 1,
- .pv_minor = 2,
+ .pv_minor = 3,
};
#define DLM_DOMAIN_BACKOFF_MS 200
@@ -1853,7 +1856,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
sizeof(struct dlm_exit_domain),
dlm_begin_exit_domain_handler,
dlm, NULL, &dlm->dlm_domain_handlers);
+ if (status)
+ goto bail;
+ status = o2net_register_handler(DLM_DEREF_LOCKRES_DONE, dlm->key,
+ sizeof(struct dlm_deref_lockres_done),
+ dlm_deref_lockres_done_handler,
+ dlm, NULL, &dlm->dlm_domain_handlers);
bail:
if (status)
dlm_unregister_domain_handlers(dlm);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9477d6e1de37..87e22541850e 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2278,7 +2278,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
dlm_print_one_lock_resource(res);
BUG();
}
- return ret;
+ return ret ? ret : r;
}
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -2345,7 +2345,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
res->lockname.len, res->lockname.name, node);
dlm_print_one_lock_resource(res);
}
- ret = 0;
+ ret = DLM_DEREF_RESPONSE_DONE;
goto done;
}
@@ -2365,7 +2365,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
spin_unlock(&dlm->work_lock);
queue_work(dlm->dlm_worker, &dlm->dispatched_work);
- return 0;
+ return DLM_DEREF_RESPONSE_INPROG;
done:
if (res)
@@ -2375,6 +2375,122 @@ done:
return ret;
}
+int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
+{
+ struct dlm_ctxt *dlm = data;
+ struct dlm_deref_lockres_done *deref
+ = (struct dlm_deref_lockres_done *)msg->buf;
+ struct dlm_lock_resource *res = NULL;
+ char *name;
+ unsigned int namelen;
+ int ret = -EINVAL;
+ u8 node;
+ unsigned int hash;
+
+ if (!dlm_grab(dlm))
+ return 0;
+
+ name = deref->name;
+ namelen = deref->namelen;
+ node = deref->node_idx;
+
+ if (namelen > DLM_LOCKID_NAME_MAX) {
+ mlog(ML_ERROR, "Invalid name length!");
+ goto done;
+ }
+ if (deref->node_idx >= O2NM_MAX_NODES) {
+ mlog(ML_ERROR, "Invalid node number: %u\n", node);
+ goto done;
+ }
+
+ hash = dlm_lockid_hash(name, namelen);
+
+ spin_lock(&dlm->spinlock);
+ res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
+ if (!res) {
+ spin_unlock(&dlm->spinlock);
+ mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
+ dlm->name, namelen, name);
+ goto done;
+ }
+
+ spin_lock(&res->spinlock);
+ BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF));
+ if (!list_empty(&res->purge)) {
+ mlog(0, "%s: Removing res %.*s from purgelist\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ list_del_init(&res->purge);
+ dlm_lockres_put(res);
+ dlm->purge_count--;
+ }
+
+ if (!__dlm_lockres_unused(res)) {
+ mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ BUG();
+ }
+
+ __dlm_unhash_lockres(dlm, res);
+
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
+ /* lockres is not in the hash now. drop the flag and wake up
+ * any processes waiting in dlm_get_lock_resource.
+ */
+ res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+
+ dlm_lockres_put(res);
+
+ spin_unlock(&dlm->spinlock);
+
+done:
+ dlm_put(dlm);
+ return ret;
+}
+
+static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, u8 node)
+{
+ struct dlm_deref_lockres_done deref;
+ int ret = 0, r;
+ const char *lockname;
+ unsigned int namelen;
+
+ lockname = res->lockname.name;
+ namelen = res->lockname.len;
+ BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+
+ memset(&deref, 0, sizeof(deref));
+ deref.node_idx = dlm->node_num;
+ deref.namelen = namelen;
+ memcpy(deref.name, lockname, namelen);
+
+ ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key,
+ &deref, sizeof(deref), node, &r);
+ if (ret < 0) {
+ mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE "
+ " to node %u\n", dlm->name, namelen,
+ lockname, ret, node);
+ } else if (r < 0) {
+ /* ignore the error */
+ mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
+ dlm->name, namelen, lockname, node, r);
+ dlm_print_one_lock_resource(res);
+ }
+}
+
static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
{
struct dlm_ctxt *dlm;
@@ -2395,6 +2511,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
}
spin_unlock(&res->spinlock);
+ dlm_drop_lockres_ref_done(dlm, res, node);
+
if (cleared) {
mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
dlm->name, res->lockname.len, res->lockname.name, node);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b94a425f0175..48ddf4c1a78c 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1403,12 +1403,24 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
* and RECOVERY flag changed when it completes. */
hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
spin_lock(&dlm->spinlock);
- res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len,
+ res = __dlm_lookup_lockres_full(dlm, mres->lockname, mres->lockname_len,
hash);
if (res) {
/* this will get a ref on res */
/* mark it as recovering/migrating and hash it */
spin_lock(&res->spinlock);
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ mlog(0, "%s: node is attempting to migrate "
+ "lockres %.*s, but marked as dropping "
+ " ref!\n", dlm->name,
+ mres->lockname_len, mres->lockname);
+ ret = -EINVAL;
+ spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
+ dlm_lockres_put(res);
+ goto leave;
+ }
+
if (mres->flags & DLM_MRES_RECOVERY) {
res->state |= DLM_LOCK_RES_RECOVERING;
} else {
@@ -2071,7 +2083,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
dlm_lock_get(lock);
if (lock->convert_pending) {
/* move converting lock back to granted */
- BUG_ON(i != DLM_CONVERTING_LIST);
mlog(0, "node died with convert pending "
"on %.*s. move back to granted list.\n",
res->lockname.len, res->lockname.name);
@@ -2377,14 +2388,16 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
dlm_revalidate_lvb(dlm, res, dead_node);
if (res->owner == dead_node) {
if (res->state & DLM_LOCK_RES_DROPPING_REF) {
- mlog(ML_NOTICE, "%s: res %.*s, Skip "
- "recovery as it is being freed\n",
- dlm->name, res->lockname.len,
- res->lockname.name);
- } else
- dlm_move_lockres_to_recovery_list(dlm,
- res);
-
+ mlog(0, "%s:%.*s: owned by "
+ "dead node %u, this node was "
+ "dropping its ref when it died. "
+ "continue, dropping the flag.\n",
+ dlm->name, res->lockname.len,
+ res->lockname.name, dead_node);
+ }
+ res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ dlm_move_lockres_to_recovery_list(dlm,
+ res);
} else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index c5f6c241ecd7..22e6eb8b8d22 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -202,6 +202,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
dlm->purge_count--;
}
+ if (!master && ret != 0) {
+ mlog(0, "%s: deref %.*s in progress or master goes down\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ spin_unlock(&res->spinlock);
+ return;
+ }
+
if (!__dlm_lockres_unused(res)) {
mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
dlm->name, res->lockname.len, res->lockname.name);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7cb38fdca229..c18ab45f8d21 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1381,44 +1381,6 @@ out:
return ret;
}
-/*
- * Will look for holes and unwritten extents in the range starting at
- * pos for count bytes (inclusive).
- */
-static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
- size_t count)
-{
- int ret = 0;
- unsigned int extent_flags;
- u32 cpos, clusters, extent_len, phys_cpos;
- struct super_block *sb = inode->i_sb;
-
- cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
- clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
-
- while (clusters) {
- ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
- &extent_flags);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = 1;
- break;
- }
-
- if (extent_len > clusters)
- extent_len = clusters;
-
- clusters -= extent_len;
- cpos += extent_len;
- }
-out:
- return ret;
-}
-
static int ocfs2_write_remove_suid(struct inode *inode)
{
int ret;
@@ -2129,18 +2091,12 @@ out:
static int ocfs2_prepare_inode_for_write(struct file *file,
loff_t pos,
- size_t count,
- int appending,
- int *direct_io,
- int *has_refcount)
+ size_t count)
{
int ret = 0, meta_level = 0;
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = d_inode(dentry);
loff_t end;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- int full_coherency = !(osb->s_mount_opt &
- OCFS2_MOUNT_COHERENCY_BUFFERED);
/*
* We start with a read level meta lock and only jump to an ex
@@ -2189,10 +2145,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
pos,
count,
&meta_level);
- if (has_refcount)
- *has_refcount = 1;
- if (direct_io)
- *direct_io = 0;
}
if (ret < 0) {
@@ -2200,67 +2152,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
goto out_unlock;
}
- /*
- * Skip the O_DIRECT checks if we don't need
- * them.
- */
- if (!direct_io || !(*direct_io))
- break;
-
- /*
- * There's no sane way to do direct writes to an inode
- * with inline data.
- */
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
- *direct_io = 0;
- break;
- }
-
- /*
- * Allowing concurrent direct writes means
- * i_size changes wouldn't be synchronized, so
- * one node could wind up truncating another
- * nodes writes.
- */
- if (end > i_size_read(inode) && !full_coherency) {
- *direct_io = 0;
- break;
- }
-
- /*
- * Fallback to old way if the feature bit is not set.
- */
- if (end > i_size_read(inode) &&
- !ocfs2_supports_append_dio(osb)) {
- *direct_io = 0;
- break;
- }
-
- /*
- * We don't fill holes during direct io, so
- * check for them here. If any are found, the
- * caller will have to retake some cluster
- * locks and initiate the io as buffered.
- */
- ret = ocfs2_check_range_for_holes(inode, pos, count);
- if (ret == 1) {
- /*
- * Fallback to old way if the feature bit is not set.
- * Otherwise try dio first and then complete the rest
- * request through buffer io.
- */
- if (!ocfs2_supports_append_dio(osb))
- *direct_io = 0;
- ret = 0;
- } else if (ret < 0)
- mlog_errno(ret);
break;
}
out_unlock:
trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
- pos, appending, count,
- direct_io, has_refcount);
+ pos, count);
if (meta_level >= 0)
ocfs2_inode_unlock(inode, meta_level);
@@ -2272,18 +2169,16 @@ out:
static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
- int direct_io, appending, rw_level;
- int can_do_direct, has_refcount = 0;
+ int direct_io, rw_level;
ssize_t written = 0;
ssize_t ret;
- size_t count = iov_iter_count(from), orig_count;
+ size_t count = iov_iter_count(from);
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
- int unaligned_dio = 0;
- int dropped_dio = 0;
+ void *saved_ki_complete = NULL;
int append_write = ((iocb->ki_pos + count) >=
i_size_read(inode) ? 1 : 0);
@@ -2296,12 +2191,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
if (count == 0)
return 0;
- appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
inode_lock(inode);
-relock:
/*
* Concurrent O_DIRECT writes are allowed with
* mount_option "coherency=buffered".
@@ -2334,7 +2227,6 @@ relock:
ocfs2_inode_unlock(inode, 1);
}
- orig_count = iov_iter_count(from);
ret = generic_write_checks(iocb, from);
if (ret <= 0) {
if (ret)
@@ -2343,41 +2235,18 @@ relock:
}
count = ret;
- can_do_direct = direct_io;
- ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending,
- &can_do_direct, &has_refcount);
+ ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- if (direct_io && !is_sync_kiocb(iocb))
- unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos);
-
- /*
- * We can't complete the direct I/O as requested, fall back to
- * buffered I/O.
- */
- if (direct_io && !can_do_direct) {
- ocfs2_rw_unlock(inode, rw_level);
-
- rw_level = -1;
-
- direct_io = 0;
- iocb->ki_flags &= ~IOCB_DIRECT;
- iov_iter_reexpand(from, orig_count);
- dropped_dio = 1;
- goto relock;
- }
-
- if (unaligned_dio) {
+ if (direct_io && !is_sync_kiocb(iocb) &&
+ ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
/*
- * Wait on previous unaligned aio to complete before
- * proceeding.
+ * Make it a sync io if it's an unaligned aio.
*/
- mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio);
- /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
- ocfs2_iocb_set_unaligned_aio(iocb);
+ saved_ki_complete = xchg(&iocb->ki_complete, NULL);
}
/* communicate with ocfs2_dio_end_io */
@@ -2398,14 +2267,13 @@ relock:
*/
if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
rw_level = -1;
- unaligned_dio = 0;
}
if (unlikely(written <= 0))
- goto no_sync;
+ goto out;
if (((file->f_flags & O_DSYNC) && !direct_io) ||
- IS_SYNC(inode) || dropped_dio) {
+ IS_SYNC(inode)) {
ret = filemap_fdatawrite_range(file->f_mapping,
iocb->ki_pos - written,
iocb->ki_pos - 1);
@@ -2424,13 +2292,10 @@ relock:
iocb->ki_pos - 1);
}
-no_sync:
- if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) {
- ocfs2_iocb_clear_unaligned_aio(iocb);
- mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
- }
-
out:
+ if (saved_ki_complete)
+ xchg(&iocb->ki_complete, saved_ki_complete);
+
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
new file mode 100644
index 000000000000..a83e4ba1fe43
--- /dev/null
+++ b/fs/ocfs2/filecheck.c
@@ -0,0 +1,605 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * filecheck.c
+ *
+ * Code which implements online file check.
+ *
+ * Copyright (C) 2015 Novell. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/sysctl.h>
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_fs.h"
+#include "stackglue.h"
+#include "inode.h"
+
+#include "filecheck.h"
+
+
+/* File check error strings,
+ * must correspond with error number in header file.
+ */
+static const char * const ocfs2_filecheck_errs[] = {
+ "SUCCESS",
+ "FAILED",
+ "INPROGRESS",
+ "READONLY",
+ "INVALIDINO",
+ "BLOCKECC",
+ "BLOCKNO",
+ "VALIDFLAG",
+ "GENERATION",
+ "UNSUPPORTED"
+};
+
+static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock);
+static LIST_HEAD(ocfs2_filecheck_sysfs_list);
+
+struct ocfs2_filecheck {
+ struct list_head fc_head; /* File check entry list head */
+ spinlock_t fc_lock;
+ unsigned int fc_max; /* Maximum number of entry in list */
+ unsigned int fc_size; /* Current entry count in list */
+ unsigned int fc_done; /* Finished entry count in list */
+};
+
+struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */
+ struct list_head fs_list;
+ atomic_t fs_count;
+ struct super_block *fs_sb;
+ struct kset *fs_devicekset;
+ struct kset *fs_fcheckkset;
+ struct ocfs2_filecheck *fs_fcheck;
+};
+
+#define OCFS2_FILECHECK_MAXSIZE 100
+#define OCFS2_FILECHECK_MINSIZE 10
+
+/* File check operation type */
+enum {
+ OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */
+ OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */
+ OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */
+};
+
+struct ocfs2_filecheck_entry {
+ struct list_head fe_list;
+ unsigned long fe_ino;
+ unsigned int fe_type;
+ unsigned short fe_done:1;
+ unsigned short fe_status:15;
+};
+
+struct ocfs2_filecheck_args {
+ unsigned int fa_type;
+ union {
+ unsigned long fa_ino;
+ unsigned int fa_len;
+ };
+};
+
+static const char *
+ocfs2_filecheck_error(int errno)
+{
+ if (!errno)
+ return ocfs2_filecheck_errs[errno];
+
+ BUG_ON(errno < OCFS2_FILECHECK_ERR_START ||
+ errno > OCFS2_FILECHECK_ERR_END);
+ return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1];
+}
+
+static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf);
+static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count);
+static struct kobj_attribute ocfs2_attr_filecheck_chk =
+ __ATTR(check, S_IRUSR | S_IWUSR,
+ ocfs2_filecheck_show,
+ ocfs2_filecheck_store);
+static struct kobj_attribute ocfs2_attr_filecheck_fix =
+ __ATTR(fix, S_IRUSR | S_IWUSR,
+ ocfs2_filecheck_show,
+ ocfs2_filecheck_store);
+static struct kobj_attribute ocfs2_attr_filecheck_set =
+ __ATTR(set, S_IRUSR | S_IWUSR,
+ ocfs2_filecheck_show,
+ ocfs2_filecheck_store);
+
+static int ocfs2_filecheck_sysfs_wait(atomic_t *p)
+{
+ schedule();
+ return 0;
+}
+
+static void
+ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+ struct ocfs2_filecheck_entry *p;
+
+ if (!atomic_dec_and_test(&entry->fs_count))
+ wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait,
+ TASK_UNINTERRUPTIBLE);
+
+ spin_lock(&entry->fs_fcheck->fc_lock);
+ while (!list_empty(&entry->fs_fcheck->fc_head)) {
+ p = list_first_entry(&entry->fs_fcheck->fc_head,
+ struct ocfs2_filecheck_entry, fe_list);
+ list_del(&p->fe_list);
+ BUG_ON(!p->fe_done); /* To free a undone file check entry */
+ kfree(p);
+ }
+ spin_unlock(&entry->fs_fcheck->fc_lock);
+
+ kset_unregister(entry->fs_fcheckkset);
+ kset_unregister(entry->fs_devicekset);
+ kfree(entry->fs_fcheck);
+ kfree(entry);
+}
+
+static void
+ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+ spin_lock(&ocfs2_filecheck_sysfs_lock);
+ list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list);
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+}
+
+static int ocfs2_filecheck_sysfs_del(const char *devname)
+{
+ struct ocfs2_filecheck_sysfs_entry *p;
+
+ spin_lock(&ocfs2_filecheck_sysfs_lock);
+ list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
+ if (!strcmp(p->fs_sb->s_id, devname)) {
+ list_del(&p->fs_list);
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+ ocfs2_filecheck_sysfs_free(p);
+ return 0;
+ }
+ }
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+ return 1;
+}
+
+static void
+ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+ if (atomic_dec_and_test(&entry->fs_count))
+ wake_up_atomic_t(&entry->fs_count);
+}
+
+static struct ocfs2_filecheck_sysfs_entry *
+ocfs2_filecheck_sysfs_get(const char *devname)
+{
+ struct ocfs2_filecheck_sysfs_entry *p = NULL;
+
+ spin_lock(&ocfs2_filecheck_sysfs_lock);
+ list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
+ if (!strcmp(p->fs_sb->s_id, devname)) {
+ atomic_inc(&p->fs_count);
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+ return p;
+ }
+ }
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+ return NULL;
+}
+
+int ocfs2_filecheck_create_sysfs(struct super_block *sb)
+{
+ int ret = 0;
+ struct kset *device_kset = NULL;
+ struct kset *fcheck_kset = NULL;
+ struct ocfs2_filecheck *fcheck = NULL;
+ struct ocfs2_filecheck_sysfs_entry *entry = NULL;
+ struct attribute **attrs = NULL;
+ struct attribute_group attrgp;
+
+ if (!ocfs2_kset)
+ return -ENOMEM;
+
+ attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS);
+ if (!attrs) {
+ ret = -ENOMEM;
+ goto error;
+ } else {
+ attrs[0] = &ocfs2_attr_filecheck_chk.attr;
+ attrs[1] = &ocfs2_attr_filecheck_fix.attr;
+ attrs[2] = &ocfs2_attr_filecheck_set.attr;
+ attrs[3] = NULL;
+ memset(&attrgp, 0, sizeof(attrgp));
+ attrgp.attrs = attrs;
+ }
+
+ fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS);
+ if (!fcheck) {
+ ret = -ENOMEM;
+ goto error;
+ } else {
+ INIT_LIST_HEAD(&fcheck->fc_head);
+ spin_lock_init(&fcheck->fc_lock);
+ fcheck->fc_max = OCFS2_FILECHECK_MINSIZE;
+ fcheck->fc_size = 0;
+ fcheck->fc_done = 0;
+ }
+
+ if (strlen(sb->s_id) <= 0) {
+ mlog(ML_ERROR,
+ "Cannot get device basename when create filecheck sysfs\n");
+ ret = -ENODEV;
+ goto error;
+ }
+
+ device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj);
+ if (!device_kset) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ fcheck_kset = kset_create_and_add("filecheck", NULL,
+ &device_kset->kobj);
+ if (!fcheck_kset) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp);
+ if (ret)
+ goto error;
+
+ entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS);
+ if (!entry) {
+ ret = -ENOMEM;
+ goto error;
+ } else {
+ atomic_set(&entry->fs_count, 1);
+ entry->fs_sb = sb;
+ entry->fs_devicekset = device_kset;
+ entry->fs_fcheckkset = fcheck_kset;
+ entry->fs_fcheck = fcheck;
+ ocfs2_filecheck_sysfs_add(entry);
+ }
+
+ kfree(attrs);
+ return 0;
+
+error:
+ kfree(attrs);
+ kfree(entry);
+ kfree(fcheck);
+ kset_unregister(fcheck_kset);
+ kset_unregister(device_kset);
+ return ret;
+}
+
+int ocfs2_filecheck_remove_sysfs(struct super_block *sb)
+{
+ return ocfs2_filecheck_sysfs_del(sb->s_id);
+}
+
+static int
+ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned int count);
+static int
+ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned int len)
+{
+ int ret;
+
+ if ((len < OCFS2_FILECHECK_MINSIZE) || (len > OCFS2_FILECHECK_MAXSIZE))
+ return -EINVAL;
+
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) {
+ mlog(ML_ERROR,
+ "Cannot set online file check maximum entry number "
+ "to %u due to too many pending entries(%u)\n",
+ len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done);
+ ret = -EBUSY;
+ } else {
+ if (len < ent->fs_fcheck->fc_size)
+ BUG_ON(!ocfs2_filecheck_erase_entries(ent,
+ ent->fs_fcheck->fc_size - len));
+
+ ent->fs_fcheck->fc_max = len;
+ ret = 0;
+ }
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+
+ return ret;
+}
+
+#define OCFS2_FILECHECK_ARGS_LEN 24
+static int
+ocfs2_filecheck_args_get_long(const char *buf, size_t count,
+ unsigned long *val)
+{
+ char buffer[OCFS2_FILECHECK_ARGS_LEN];
+
+ memcpy(buffer, buf, count);
+ buffer[count] = '\0';
+
+ if (kstrtoul(buffer, 0, val))
+ return 1;
+
+ return 0;
+}
+
+static int
+ocfs2_filecheck_type_parse(const char *name, unsigned int *type)
+{
+ if (!strncmp(name, "fix", 4))
+ *type = OCFS2_FILECHECK_TYPE_FIX;
+ else if (!strncmp(name, "check", 6))
+ *type = OCFS2_FILECHECK_TYPE_CHK;
+ else if (!strncmp(name, "set", 4))
+ *type = OCFS2_FILECHECK_TYPE_SET;
+ else
+ return 1;
+
+ return 0;
+}
+
+static int
+ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count,
+ struct ocfs2_filecheck_args *args)
+{
+ unsigned long val = 0;
+ unsigned int type;
+
+ /* too short/long args length */
+ if ((count < 1) || (count >= OCFS2_FILECHECK_ARGS_LEN))
+ return 1;
+
+ if (ocfs2_filecheck_type_parse(name, &type))
+ return 1;
+ if (ocfs2_filecheck_args_get_long(buf, count, &val))
+ return 1;
+
+ if (val <= 0)
+ return 1;
+
+ args->fa_type = type;
+ if (type == OCFS2_FILECHECK_TYPE_SET)
+ args->fa_len = (unsigned int)val;
+ else
+ args->fa_ino = val;
+
+ return 0;
+}
+
+static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+
+ ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
+ unsigned int type;
+ struct ocfs2_filecheck_entry *p;
+ struct ocfs2_filecheck_sysfs_entry *ent;
+
+ if (ocfs2_filecheck_type_parse(attr->attr.name, &type))
+ return -EINVAL;
+
+ ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
+ if (!ent) {
+ mlog(ML_ERROR,
+ "Cannot get the corresponding entry via device basename %s\n",
+ kobj->name);
+ return -ENODEV;
+ }
+
+ if (type == OCFS2_FILECHECK_TYPE_SET) {
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max);
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+ goto exit;
+ }
+
+ ret = snprintf(buf, remain, "INO\t\tDONE\tERROR\n");
+ total += ret;
+ remain -= ret;
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
+ if (p->fe_type != type)
+ continue;
+
+ ret = snprintf(buf + total, remain, "%lu\t\t%u\t%s\n",
+ p->fe_ino, p->fe_done,
+ ocfs2_filecheck_error(p->fe_status));
+ if (ret < 0) {
+ total = ret;
+ break;
+ }
+ if (ret == remain) {
+ /* snprintf() didn't fit */
+ total = -E2BIG;
+ break;
+ }
+ total += ret;
+ remain -= ret;
+ }
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+
+exit:
+ ocfs2_filecheck_sysfs_put(ent);
+ return total;
+}
+
+static int
+ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent)
+{
+ struct ocfs2_filecheck_entry *p;
+
+ list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
+ if (p->fe_done) {
+ list_del(&p->fe_list);
+ kfree(p);
+ ent->fs_fcheck->fc_size--;
+ ent->fs_fcheck->fc_done--;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned int count)
+{
+ unsigned int i = 0;
+ unsigned int ret = 0;
+
+ while (i++ < count) {
+ if (ocfs2_filecheck_erase_entry(ent))
+ ret++;
+ else
+ break;
+ }
+
+ return (ret == count ? 1 : 0);
+}
+
+static void
+ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+ struct ocfs2_filecheck_entry *entry)
+{
+ entry->fe_done = 1;
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ ent->fs_fcheck->fc_done++;
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+}
+
+static unsigned short
+ocfs2_filecheck_handle(struct super_block *sb,
+ unsigned long ino, unsigned int flags)
+{
+ unsigned short ret = OCFS2_FILECHECK_ERR_SUCCESS;
+ struct inode *inode = NULL;
+ int rc;
+
+ inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0);
+ if (IS_ERR(inode)) {
+ rc = (int)(-(long)inode);
+ if (rc >= OCFS2_FILECHECK_ERR_START &&
+ rc < OCFS2_FILECHECK_ERR_END)
+ ret = rc;
+ else
+ ret = OCFS2_FILECHECK_ERR_FAILED;
+ } else
+ iput(inode);
+
+ return ret;
+}
+
+static void
+ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+ struct ocfs2_filecheck_entry *entry)
+{
+ if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK)
+ entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
+ entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK);
+ else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX)
+ entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
+ entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX);
+ else
+ entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED;
+
+ ocfs2_filecheck_done_entry(ent, entry);
+}
+
+static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ocfs2_filecheck_args args;
+ struct ocfs2_filecheck_entry *entry;
+ struct ocfs2_filecheck_sysfs_entry *ent;
+ ssize_t ret = 0;
+
+ if (count == 0)
+ return count;
+
+ if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) {
+ mlog(ML_ERROR, "Invalid arguments for online file check\n");
+ return -EINVAL;
+ }
+
+ ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
+ if (!ent) {
+ mlog(ML_ERROR,
+ "Cannot get the corresponding entry via device basename %s\n",
+ kobj->parent->name);
+ return -ENODEV;
+ }
+
+ if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) {
+ ret = ocfs2_filecheck_adjust_max(ent, args.fa_len);
+ goto exit;
+ }
+
+ entry = kmalloc(sizeof(struct ocfs2_filecheck_entry), GFP_NOFS);
+ if (!entry) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
+ (ent->fs_fcheck->fc_done == 0)) {
+ mlog(ML_ERROR,
+ "Cannot do more file check "
+ "since file check queue(%u) is full now\n",
+ ent->fs_fcheck->fc_max);
+ ret = -EBUSY;
+ kfree(entry);
+ } else {
+ if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
+ (ent->fs_fcheck->fc_done > 0)) {
+ /* Delete the oldest entry which was done,
+ * make sure the entry size in list does
+ * not exceed maximum value
+ */
+ BUG_ON(!ocfs2_filecheck_erase_entry(ent));
+ }
+
+ entry->fe_ino = args.fa_ino;
+ entry->fe_type = args.fa_type;
+ entry->fe_done = 0;
+ entry->fe_status = OCFS2_FILECHECK_ERR_INPROGRESS;
+ list_add_tail(&entry->fe_list, &ent->fs_fcheck->fc_head);
+ ent->fs_fcheck->fc_size++;
+ }
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+
+ if (!ret)
+ ocfs2_filecheck_handle_entry(ent, entry);
+
+exit:
+ ocfs2_filecheck_sysfs_put(ent);
+ return (!ret ? count : ret);
+}
diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h
new file mode 100644
index 000000000000..5ec331b655e2
--- /dev/null
+++ b/fs/ocfs2/filecheck.h
@@ -0,0 +1,48 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * filecheck.h
+ *
+ * Online file check.
+ *
+ * Copyright (C) 2015 Novell. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+
+#ifndef FILECHECK_H
+#define FILECHECK_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+
+/* File check errno */
+enum {
+ OCFS2_FILECHECK_ERR_SUCCESS = 0, /* Success */
+ OCFS2_FILECHECK_ERR_FAILED = 1000, /* Other failure */
+ OCFS2_FILECHECK_ERR_INPROGRESS, /* In progress */
+ OCFS2_FILECHECK_ERR_READONLY, /* Read only */
+ OCFS2_FILECHECK_ERR_INVALIDINO, /* Invalid ino */
+ OCFS2_FILECHECK_ERR_BLOCKECC, /* Block ecc */
+ OCFS2_FILECHECK_ERR_BLOCKNO, /* Block number */
+ OCFS2_FILECHECK_ERR_VALIDFLAG, /* Inode valid flag */
+ OCFS2_FILECHECK_ERR_GENERATION, /* Inode generation */
+ OCFS2_FILECHECK_ERR_UNSUPPORTED /* Unsupported */
+};
+
+#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED
+#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED
+
+int ocfs2_filecheck_create_sysfs(struct super_block *sb);
+int ocfs2_filecheck_remove_sysfs(struct super_block *sb);
+
+#endif /* FILECHECK_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 36294446d960..ee491d1a14fb 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -53,6 +53,7 @@
#include "xattr.h"
#include "refcounttree.h"
#include "ocfs2_trace.h"
+#include "filecheck.h"
#include "buffer_head_io.h"
@@ -74,6 +75,14 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh);
+static int ocfs2_filecheck_read_inode_block_full(struct inode *inode,
+ struct buffer_head **bh,
+ int flags, int type);
+static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
+ struct buffer_head *bh);
+static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
+ struct buffer_head *bh);
+
void ocfs2_set_inode_flags(struct inode *inode)
{
unsigned int flags = OCFS2_I(inode)->ip_attr;
@@ -127,6 +136,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
int sysfile_type)
{
+ int rc = 0;
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
struct ocfs2_find_inode_args args;
@@ -161,12 +171,17 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
}
trace_ocfs2_iget5_locked(inode->i_state);
if (inode->i_state & I_NEW) {
- ocfs2_read_locked_inode(inode, &args);
+ rc = ocfs2_read_locked_inode(inode, &args);
unlock_new_inode(inode);
}
if (is_bad_inode(inode)) {
iput(inode);
- inode = ERR_PTR(-ESTALE);
+ if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) ||
+ (flags & OCFS2_FI_FLAG_FILECHECK_FIX))
+ /* Return OCFS2_FILECHECK_ERR_XXX related errno */
+ inode = ERR_PTR(rc);
+ else
+ inode = ERR_PTR(-ESTALE);
goto bail;
}
@@ -495,16 +510,32 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
if (can_lock) {
- status = ocfs2_read_inode_block_full(inode, &bh,
- OCFS2_BH_IGNORE_CACHE);
+ if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
+ status = ocfs2_filecheck_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE, 0);
+ else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
+ status = ocfs2_filecheck_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE, 1);
+ else
+ status = ocfs2_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE);
} else {
status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
/*
* If buffer is in jbd, then its checksum may not have been
* computed as yet.
*/
- if (!status && !buffer_jbd(bh))
- status = ocfs2_validate_inode_block(osb->sb, bh);
+ if (!status && !buffer_jbd(bh)) {
+ if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
+ status = ocfs2_filecheck_validate_inode_block(
+ osb->sb, bh);
+ else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
+ status = ocfs2_filecheck_repair_inode_block(
+ osb->sb, bh);
+ else
+ status = ocfs2_validate_inode_block(
+ osb->sb, bh);
+ }
}
if (status < 0) {
mlog_errno(status);
@@ -532,6 +563,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
+ if (buffer_dirty(bh)) {
+ status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
status = 0;
bail:
@@ -1126,6 +1165,9 @@ static void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
"Clear inode of %llu, inode has io markers\n",
(unsigned long long)oi->ip_blkno);
+ mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list),
+ "Clear inode of %llu, inode has unwritten extents\n",
+ (unsigned long long)oi->ip_blkno);
ocfs2_extent_map_trunc(inode, 0);
@@ -1397,6 +1439,155 @@ bail:
return rc;
}
+static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc = 0;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+ trace_ocfs2_filecheck_validate_inode_block(
+ (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
+ if (rc) {
+ mlog(ML_ERROR,
+ "Filecheck: checksum failed for dinode %llu\n",
+ (unsigned long long)bh->b_blocknr);
+ rc = -OCFS2_FILECHECK_ERR_BLOCKECC;
+ goto bail;
+ }
+
+ if (!OCFS2_IS_VALID_DINODE(di)) {
+ mlog(ML_ERROR,
+ "Filecheck: invalid dinode #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7, di->i_signature);
+ rc = -OCFS2_FILECHECK_ERR_INVALIDINO;
+ goto bail;
+ }
+
+ if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+ mlog(ML_ERROR,
+ "Filecheck: invalid dinode #%llu: i_blkno is %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
+ rc = -OCFS2_FILECHECK_ERR_BLOCKNO;
+ goto bail;
+ }
+
+ if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+ mlog(ML_ERROR,
+ "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL "
+ "not set\n",
+ (unsigned long long)bh->b_blocknr);
+ rc = -OCFS2_FILECHECK_ERR_VALIDFLAG;
+ goto bail;
+ }
+
+ if (le32_to_cpu(di->i_fs_generation) !=
+ OCFS2_SB(sb)->fs_generation) {
+ mlog(ML_ERROR,
+ "Filecheck: invalid dinode #%llu: fs_generation is %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
+ rc = -OCFS2_FILECHECK_ERR_GENERATION;
+ goto bail;
+ }
+
+bail:
+ return rc;
+}
+
+static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int changed = 0;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+ if (!ocfs2_filecheck_validate_inode_block(sb, bh))
+ return 0;
+
+ trace_ocfs2_filecheck_repair_inode_block(
+ (unsigned long long)bh->b_blocknr);
+
+ if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) ||
+ ocfs2_is_soft_readonly(OCFS2_SB(sb))) {
+ mlog(ML_ERROR,
+ "Filecheck: cannot repair dinode #%llu "
+ "on readonly filesystem\n",
+ (unsigned long long)bh->b_blocknr);
+ return -OCFS2_FILECHECK_ERR_READONLY;
+ }
+
+ if (!OCFS2_IS_VALID_DINODE(di)) {
+ /* Cannot fix invalid inode block */
+ return -OCFS2_FILECHECK_ERR_INVALIDINO;
+ }
+
+ if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+ /* Cannot just add VALID_FL flag back as a fix,
+ * need more things to check here.
+ */
+ return -OCFS2_FILECHECK_ERR_VALIDFLAG;
+ }
+
+ if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+ di->i_blkno = cpu_to_le64(bh->b_blocknr);
+ changed = 1;
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: i_blkno to %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
+ }
+
+ if (le32_to_cpu(di->i_fs_generation) !=
+ OCFS2_SB(sb)->fs_generation) {
+ di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+ changed = 1;
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: fs_generation to %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
+ }
+
+ if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) {
+ ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check);
+ mark_buffer_dirty(bh);
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: compute meta ecc\n",
+ (unsigned long long)bh->b_blocknr);
+ }
+
+ return 0;
+}
+
+static int
+ocfs2_filecheck_read_inode_block_full(struct inode *inode,
+ struct buffer_head **bh,
+ int flags, int type)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ if (!type) /* Check inode block */
+ rc = ocfs2_read_blocks(INODE_CACHE(inode),
+ OCFS2_I(inode)->ip_blkno,
+ 1, &tmp, flags,
+ ocfs2_filecheck_validate_inode_block);
+ else /* Repair inode block */
+ rc = ocfs2_read_blocks(INODE_CACHE(inode),
+ OCFS2_I(inode)->ip_blkno,
+ 1, &tmp, flags,
+ ocfs2_filecheck_repair_inode_block);
+
+ /* If ocfs2_read_blocks() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
int flags)
{
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index aac8b86f312e..d8f3fc8d2551 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,9 +43,6 @@ struct ocfs2_inode_info
/* protects extended attribute changes on this inode */
struct rw_semaphore ip_xattr_sem;
- /* Number of outstanding AIO's which are not page aligned */
- struct mutex ip_unaligned_aio;
-
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
@@ -57,6 +54,9 @@ struct ocfs2_inode_info
u32 ip_flags; /* see below */
u32 ip_attr; /* inode attributes */
+ /* Record unwritten extents during direct io. */
+ struct list_head ip_unwritten_list;
+
/* protected by recovery_lock. */
struct inode *ip_next_orphan;
@@ -139,6 +139,9 @@ int ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_SYSFILE 0x1
#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
+#define OCFS2_FI_FLAG_FILECHECK_CHK 0x4
+#define OCFS2_FI_FLAG_FILECHECK_FIX 0x8
+
struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
int sysfile_type);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 61b833b721d8..e607419cdfa4 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -231,7 +231,7 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb)
/* At this point, we know that no more recovery threads can be
* launched, so wait for any recovery completion work to
* complete. */
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
/*
* Now that recovery is shut down, and the osb is about to be
@@ -1326,7 +1326,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
spin_lock(&journal->j_lock);
list_add_tail(&item->lri_list, &journal->j_la_cleanups);
- queue_work(ocfs2_wq, &journal->j_recovery_work);
+ queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work);
spin_unlock(&journal->j_lock);
}
@@ -1968,7 +1968,7 @@ static void ocfs2_orphan_scan_work(struct work_struct *work)
mutex_lock(&os->os_lock);
ocfs2_queue_orphan_scan(osb);
if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
- queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
mutex_unlock(&os->os_lock);
}
@@ -2008,7 +2008,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
else {
atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
- queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
}
}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 7d62c43a2c3e..fe0d1f9571bb 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -386,7 +386,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
struct ocfs2_dinode *alloc = NULL;
cancel_delayed_work(&osb->la_enable_wq);
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
if (osb->local_alloc_state == OCFS2_LA_UNUSED)
goto out;
@@ -1085,7 +1085,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
} else {
osb->local_alloc_state = OCFS2_LA_DISABLED;
}
- queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
+ queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq,
OCFS2_LA_ENABLE_INTERVAL);
goto out_unlock;
}
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 9581d190f6e1..a88707a0f4da 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -104,8 +104,8 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
if (page->index == last_index)
len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
- ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
- &fsdata, di_bh, page);
+ ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
+ &locked_page, &fsdata, di_bh, page);
if (ret) {
if (ret != -ENOSPC)
mlog_errno(ret);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7a0126267847..6cf6538a0651 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -464,6 +464,14 @@ struct ocfs2_super
struct ocfs2_refcount_tree *osb_ref_tree_lru;
struct mutex system_file_mutex;
+
+ /*
+ * OCFS2 needs to schedule several different types of work which
+ * require cluster locking, disk I/O, recovery waits, etc. Since these
+ * types of work tend to be heavy we avoid using the kernel events
+ * workqueue and schedule on our own.
+ */
+ struct workqueue_struct *ocfs2_wq;
};
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index a52a2dbc064e..f8f5fc5e6c05 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1450,28 +1450,20 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
TRACE_EVENT(ocfs2_prepare_inode_for_write,
TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
- int appending, unsigned long count,
- int *direct_io, int *has_refcount),
- TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
+ unsigned long count),
+ TP_ARGS(ino, saved_pos, count),
TP_STRUCT__entry(
__field(unsigned long long, ino)
__field(unsigned long long, saved_pos)
- __field(int, appending)
__field(unsigned long, count)
- __field(int, direct_io)
- __field(int, has_refcount)
),
TP_fast_assign(
__entry->ino = ino;
__entry->saved_pos = saved_pos;
- __entry->appending = appending;
__entry->count = count;
- __entry->direct_io = direct_io ? *direct_io : -1;
- __entry->has_refcount = has_refcount ? *has_refcount : -1;
),
- TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
- __entry->saved_pos, __entry->appending, __entry->count,
- __entry->direct_io, __entry->has_refcount)
+ TP_printk("%llu %llu %lu", __entry->ino,
+ __entry->saved_pos, __entry->count)
);
DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
@@ -1540,6 +1532,8 @@ DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
+DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_validate_inode_block);
+DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_repair_inode_block);
TRACE_EVENT(ocfs2_inode_is_valid_to_delete,
TP_PROTO(void *task, void *dc_task, unsigned long long ino,
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 91bc674203ed..3892f3c079ca 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -726,7 +726,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
dqgrab(dquot);
/* First entry on list -> queue work */
if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
- queue_work(ocfs2_wq, &osb->dquot_drop_work);
+ queue_work(osb->ocfs2_wq, &osb->dquot_drop_work);
goto out;
}
status = ocfs2_lock_global_qf(oinfo, 1);
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 576b9a04873f..18451e0fab81 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -196,7 +196,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
- if (cluster > clusters)
+ if (cluster >= clusters)
break;
ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 5d965e83bd43..13219ed73e1d 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -629,7 +629,8 @@ static struct attribute_group ocfs2_attr_group = {
.attrs = ocfs2_attrs,
};
-static struct kset *ocfs2_kset;
+struct kset *ocfs2_kset;
+EXPORT_SYMBOL_GPL(ocfs2_kset);
static void ocfs2_sysfs_exit(void)
{
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 66334a30cea8..f2dce10fae54 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,4 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
+extern struct kset *ocfs2_kset;
+
#endif /* STACKGLUE_H */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index faa1365097bc..7db631e1c8b0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -74,17 +74,12 @@
#include "suballoc.h"
#include "buffer_head_io.h"
+#include "filecheck.h"
static struct kmem_cache *ocfs2_inode_cachep;
struct kmem_cache *ocfs2_dquot_cachep;
struct kmem_cache *ocfs2_qf_chunk_cachep;
-/* OCFS2 needs to schedule several different types of work which
- * require cluster locking, disk I/O, recovery waits, etc. Since these
- * types of work tend to be heavy we avoid using the kernel events
- * workqueue and schedule on our own. */
-struct workqueue_struct *ocfs2_wq = NULL;
-
static struct dentry *ocfs2_debugfs_root;
MODULE_AUTHOR("Oracle");
@@ -236,6 +231,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
struct ocfs2_recovery_map *rm = osb->recovery_map;
struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
int i, out = 0;
+ unsigned long flags;
out += snprintf(buf + out, len - out,
"%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
@@ -271,14 +267,14 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
cconn->cc_version.pv_minor);
}
- spin_lock(&osb->dc_task_lock);
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
out += snprintf(buf + out, len - out,
"%10s => Pid: %d Count: %lu WakeSeq: %lu "
"WorkSeq: %lu\n", "DownCnvt",
(osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
osb->blocked_lock_count, osb->dc_wake_sequence,
osb->dc_work_sequence);
- spin_unlock(&osb->dc_task_lock);
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
spin_lock(&osb->osb_lock);
out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
@@ -1204,6 +1200,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
/* Start this when the mount is almost sure of being successful */
ocfs2_orphan_scan_start(osb);
+ /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */
+ ocfs2_filecheck_create_sysfs(sb);
+
return status;
read_super_error:
@@ -1608,33 +1607,25 @@ static int __init ocfs2_init(void)
if (status < 0)
goto out2;
- ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
- if (!ocfs2_wq) {
- status = -ENOMEM;
- goto out3;
- }
-
ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
if (!ocfs2_debugfs_root) {
status = -ENOMEM;
mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
- goto out4;
+ goto out3;
}
ocfs2_set_locking_protocol();
status = register_quota_format(&ocfs2_quota_format);
if (status < 0)
- goto out4;
+ goto out3;
status = register_filesystem(&ocfs2_fs_type);
if (!status)
return 0;
unregister_quota_format(&ocfs2_quota_format);
-out4:
- destroy_workqueue(ocfs2_wq);
- debugfs_remove(ocfs2_debugfs_root);
out3:
+ debugfs_remove(ocfs2_debugfs_root);
ocfs2_free_mem_caches();
out2:
exit_ocfs2_uptodate_cache();
@@ -1645,11 +1636,6 @@ out1:
static void __exit ocfs2_exit(void)
{
- if (ocfs2_wq) {
- flush_workqueue(ocfs2_wq);
- destroy_workqueue(ocfs2_wq);
- }
-
unregister_quota_format(&ocfs2_quota_format);
debugfs_remove(ocfs2_debugfs_root);
@@ -1667,6 +1653,7 @@ static void ocfs2_put_super(struct super_block *sb)
ocfs2_sync_blockdev(sb);
ocfs2_dismount_volume(sb, 0);
+ ocfs2_filecheck_remove_sysfs(sb);
}
static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1739,8 +1726,8 @@ static void ocfs2_inode_init_once(void *data)
spin_lock_init(&oi->ip_lock);
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers);
+ INIT_LIST_HEAD(&oi->ip_unwritten_list);
oi->ip_dir_start_lookup = 0;
- mutex_init(&oi->ip_unaligned_aio);
init_rwsem(&oi->ip_alloc_sem);
init_rwsem(&oi->ip_xattr_sem);
mutex_init(&oi->ip_io_mutex);
@@ -2343,6 +2330,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
cleancache_init_shared_fs(sb);
+ osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
+ if (!osb->ocfs2_wq) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ }
+
bail:
return status;
}
@@ -2530,6 +2523,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
{
/* This function assumes that the caller has the main osb resource */
+ /* ocfs2_initializer_super have already created this workqueue */
+ if (osb->ocfs2_wq) {
+ flush_workqueue(osb->ocfs2_wq);
+ destroy_workqueue(osb->ocfs2_wq);
+ }
+
ocfs2_free_slot_info(osb);
kfree(osb->osb_orphan_wipes);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index b477d0b1c7b6..b023e4f3d740 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -26,8 +26,6 @@
#ifndef OCFS2_SUPER_H
#define OCFS2_SUPER_H
-extern struct workqueue_struct *ocfs2_wq;
-
int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index b2855eea5405..712f1b9992cc 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page)
* pseudo flags for the well known (anonymous) memory mapped pages
*
* Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
- * simple test in page_mapcount() is not enough.
+ * simple test in page_mapped() is not enough.
*/
- if (!PageSlab(page) && page_mapcount(page))
+ if (!PageSlab(page) && page_mapped(page))
u |= 1 << KPF_MMAP;
if (PageAnon(page))
u |= 1 << KPF_ANON;
@@ -148,6 +148,8 @@ u64 stable_page_flags(struct page *page)
*/
if (PageBuddy(page))
u |= 1 << KPF_BUDDY;
+ else if (page_count(page) == 0 && is_free_buddy_page(page))
+ u |= 1 << KPF_BUDDY;
if (PageBalloon(page))
u |= 1 << KPF_BALLOON;
@@ -158,6 +160,8 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
+ if (PageTail(page) && PageSlab(compound_head(page)))
+ u |= 1 << KPF_SLAB;
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 14ac9822b303..8c1853dfe63b 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1883,7 +1883,6 @@ xfs_vm_set_page_dirty(
loff_t end_offset;
loff_t offset;
int newly_dirty;
- struct mem_cgroup *memcg;
if (unlikely(!mapping))
return !TestSetPageDirty(page);
@@ -1904,10 +1903,10 @@ xfs_vm_set_page_dirty(
} while (bh != head);
}
/*
- * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
- * per-memcg dirty page counters.
+ * Lock out page->mem_cgroup migration to keep PageDirty
+ * synchronized with per-memcg dirty page counters.
*/
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
@@ -1918,13 +1917,13 @@ xfs_vm_set_page_dirty(
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(!PageUptodate(page));
- account_page_dirtied(page, mapping, memcg);
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return newly_dirty;
diff --git a/include/asm-generic/atomic-long.h b/include/asm-generic/atomic-long.h
index eb1973bad80b..5e1f345b58dd 100644
--- a/include/asm-generic/atomic-long.h
+++ b/include/asm-generic/atomic-long.h
@@ -98,14 +98,14 @@ ATOMIC_LONG_ADD_SUB_OP(sub, _release)
#define atomic_long_xchg(v, new) \
(ATOMIC_LONG_PFX(_xchg)((ATOMIC_LONG_PFX(_t) *)(v), (new)))
-static inline void atomic_long_inc(atomic_long_t *l)
+static __always_inline void atomic_long_inc(atomic_long_t *l)
{
ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l;
ATOMIC_LONG_PFX(_inc)(v);
}
-static inline void atomic_long_dec(atomic_long_t *l)
+static __always_inline void atomic_long_dec(atomic_long_t *l)
{
ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l;
@@ -113,7 +113,7 @@ static inline void atomic_long_dec(atomic_long_t *l)
}
#define ATOMIC_LONG_OP(op) \
-static inline void \
+static __always_inline void \
atomic_long_##op(long i, atomic_long_t *l) \
{ \
ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l; \
diff --git a/include/linux/auto_dev-ioctl.h b/include/linux/auto_dev-ioctl.h
index 850f39b33e74..7caaf298f539 100644
--- a/include/linux/auto_dev-ioctl.h
+++ b/include/linux/auto_dev-ioctl.h
@@ -11,12 +11,7 @@
#define _LINUX_AUTO_DEV_IOCTL_H
#include <linux/auto_fs.h>
-
-#ifdef __KERNEL__
#include <linux/string.h>
-#else
-#include <string.h>
-#endif /* __KERNEL__ */
#define AUTOFS_DEVICE_NAME "autofs"
@@ -125,7 +120,6 @@ static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in)
in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
in->size = sizeof(struct autofs_dev_ioctl);
in->ioctlfd = -1;
- return;
}
/*
diff --git a/include/linux/auto_fs.h b/include/linux/auto_fs.h
index fcd704d354c4..b4066bb89083 100644
--- a/include/linux/auto_fs.h
+++ b/include/linux/auto_fs.h
@@ -1,14 +1,10 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *
- * linux/include/linux/auto_fs.h
- *
- * Copyright 1997 Transmeta Corporation - All Rights Reserved
+/*
+ * Copyright 1997 Transmeta Corporation - All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
+ */
#ifndef _LINUX_AUTO_FS_H
#define _LINUX_AUTO_FS_H
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 89d9aa9e79bf..c67f052cc5e5 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -82,15 +82,15 @@ struct buffer_head {
* and buffer_foo() functions.
*/
#define BUFFER_FNS(bit, name) \
-static inline void set_buffer_##name(struct buffer_head *bh) \
+static __always_inline void set_buffer_##name(struct buffer_head *bh) \
{ \
set_bit(BH_##bit, &(bh)->b_state); \
} \
-static inline void clear_buffer_##name(struct buffer_head *bh) \
+static __always_inline void clear_buffer_##name(struct buffer_head *bh) \
{ \
clear_bit(BH_##bit, &(bh)->b_state); \
} \
-static inline int buffer_##name(const struct buffer_head *bh) \
+static __always_inline int buffer_##name(const struct buffer_head *bh) \
{ \
return test_bit(BH_##bit, &(bh)->b_state); \
}
@@ -99,11 +99,11 @@ static inline int buffer_##name(const struct buffer_head *bh) \
* test_set_buffer_foo() and test_clear_buffer_foo()
*/
#define TAS_BUFFER_FNS(bit, name) \
-static inline int test_set_buffer_##name(struct buffer_head *bh) \
+static __always_inline int test_set_buffer_##name(struct buffer_head *bh) \
{ \
return test_and_set_bit(BH_##bit, &(bh)->b_state); \
} \
-static inline int test_clear_buffer_##name(struct buffer_head *bh) \
+static __always_inline int test_clear_buffer_##name(struct buffer_head *bh) \
{ \
return test_and_clear_bit(BH_##bit, &(bh)->b_state); \
} \
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 4cd4ddf64cc7..d7c8de583a23 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -52,6 +52,10 @@ extern void compaction_defer_reset(struct zone *zone, int order,
bool alloc_success);
extern bool compaction_restarting(struct zone *zone, int order);
+extern int kcompactd_run(int nid);
+extern void kcompactd_stop(int nid);
+extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
+
#else
static inline unsigned long try_to_compact_pages(gfp_t gfp_mask,
unsigned int order, int alloc_flags,
@@ -84,6 +88,18 @@ static inline bool compaction_deferred(struct zone *zone, int order)
return true;
}
+static inline int kcompactd_run(int nid)
+{
+ return 0;
+}
+static inline void kcompactd_stop(int nid)
+{
+}
+
+static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+{
+}
+
#endif /* CONFIG_COMPACTION */
#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
diff --git a/include/linux/compat.h b/include/linux/compat.h
index a76c9172b2eb..f911bcec618f 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -5,6 +5,8 @@
* syscall compatibility layer.
*/
+#include <linux/types.h>
+
#ifdef CONFIG_COMPAT
#include <linux/stat.h>
@@ -713,9 +715,22 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32,
int, const char __user *);
+
+/*
+ * For most but not all architectures, "am I in a compat syscall?" and
+ * "am I a compat task?" are the same question. For architectures on which
+ * they aren't the same question, arch code can override in_compat_syscall.
+ */
+
+#ifndef in_compat_syscall
+static inline bool in_compat_syscall(void) { return is_compat_task(); }
+#endif
+
#else
#define is_compat_task() (0)
+static inline bool in_compat_syscall(void) { return false; }
#endif /* CONFIG_COMPAT */
+
#endif /* _LINUX_COMPAT_H */
diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h
new file mode 100644
index 000000000000..bba7a4d692b3
--- /dev/null
+++ b/include/linux/crc64_ecma.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CRC64_ECMA_H_
+#define __CRC64_ECMA_H_
+
+#include <linux/types.h>
+
+
+#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * @pdata: pointer to the data to compute checksum for.
+ * @nbytes: number of bytes in data buffer.
+ * @seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed);
+
+#endif /* __CRC64_ECMA_H_ */
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 75857cda38e9..471e064af29f 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -641,31 +641,40 @@ static inline void dmam_release_declared_memory(struct device *dev)
}
#endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */
-static inline void *dma_alloc_writecombine(struct device *dev, size_t size,
- dma_addr_t *dma_addr, gfp_t gfp)
+static inline void *dma_alloc_wc(struct device *dev, size_t size,
+ dma_addr_t *dma_addr, gfp_t gfp)
{
DEFINE_DMA_ATTRS(attrs);
dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs);
return dma_alloc_attrs(dev, size, dma_addr, gfp, &attrs);
}
+#ifndef dma_alloc_writecombine
+#define dma_alloc_writecombine dma_alloc_wc
+#endif
-static inline void dma_free_writecombine(struct device *dev, size_t size,
- void *cpu_addr, dma_addr_t dma_addr)
+static inline void dma_free_wc(struct device *dev, size_t size,
+ void *cpu_addr, dma_addr_t dma_addr)
{
DEFINE_DMA_ATTRS(attrs);
dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs);
return dma_free_attrs(dev, size, cpu_addr, dma_addr, &attrs);
}
+#ifndef dma_free_writecombine
+#define dma_free_writecombine dma_free_wc
+#endif
-static inline int dma_mmap_writecombine(struct device *dev,
- struct vm_area_struct *vma,
- void *cpu_addr, dma_addr_t dma_addr,
- size_t size)
+static inline int dma_mmap_wc(struct device *dev,
+ struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr,
+ size_t size)
{
DEFINE_DMA_ATTRS(attrs);
dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs);
return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, &attrs);
}
+#ifndef dma_mmap_writecombine
+#define dma_mmap_writecombine dma_mmap_wc
+#endif
#ifdef CONFIG_NEED_DMA_MAP_STATE
#define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME
diff --git a/include/linux/err.h b/include/linux/err.h
index 56762ab41713..b7d4a9ff6342 100644
--- a/include/linux/err.h
+++ b/include/linux/err.h
@@ -18,7 +18,9 @@
#ifndef __ASSEMBLY__
-#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)
+#define IS_ERR_VALUE(x) ((typeof(x))(-1) <= 0 \
+ ? unlikely((x) <= -1) \
+ : unlikely((x) >= (typeof(x))-MAX_ERRNO))
static inline void * __must_check ERR_PTR(long error)
{
diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index 3159a7dba034..9f4956d8601c 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -62,10 +62,9 @@ static inline struct dentry *fault_create_debugfs_attr(const char *name,
#endif /* CONFIG_FAULT_INJECTION */
#ifdef CONFIG_FAILSLAB
-extern bool should_failslab(size_t size, gfp_t gfpflags, unsigned long flags);
+extern bool should_failslab(struct kmem_cache *s, gfp_t gfpflags);
#else
-static inline bool should_failslab(size_t size, gfp_t gfpflags,
- unsigned long flags)
+static inline bool should_failslab(struct kmem_cache *s, gfp_t gfpflags)
{
return false;
}
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index af1f2b24bbe4..36e0c5e14c44 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -9,6 +9,11 @@
struct vm_area_struct;
+/*
+ * In case of changes, please don't forget to update
+ * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c
+ */
+
/* Plain integer GFP bitmasks. Do not use this directly. */
#define ___GFP_DMA 0x01u
#define ___GFP_HIGHMEM 0x02u
@@ -48,7 +53,6 @@ struct vm_area_struct;
#define __GFP_DMA ((__force gfp_t)___GFP_DMA)
#define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM)
#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32)
-#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* Page is movable */
#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */
#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
@@ -329,22 +333,29 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
* 0xe => BAD (MOVABLE+DMA32+HIGHMEM)
* 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
*
- * ZONES_SHIFT must be <= 2 on 32 bit platforms.
+ * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms.
*/
-#if 16 * ZONES_SHIFT > BITS_PER_LONG
-#error ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
+#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4
+/* ZONE_DEVICE is not a valid GFP zone specifier */
+#define GFP_ZONES_SHIFT 2
+#else
+#define GFP_ZONES_SHIFT ZONES_SHIFT
+#endif
+
+#if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG
+#error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
#endif
#define GFP_ZONE_TABLE ( \
- (ZONE_NORMAL << 0 * ZONES_SHIFT) \
- | (OPT_ZONE_DMA << ___GFP_DMA * ZONES_SHIFT) \
- | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * ZONES_SHIFT) \
- | (OPT_ZONE_DMA32 << ___GFP_DMA32 * ZONES_SHIFT) \
- | (ZONE_NORMAL << ___GFP_MOVABLE * ZONES_SHIFT) \
- | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * ZONES_SHIFT) \
- | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * ZONES_SHIFT) \
- | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * ZONES_SHIFT) \
+ (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \
+ | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \
+ | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \
+ | (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT) \
+ | (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT) \
+ | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \
+ | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\
+ | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\
)
/*
@@ -369,8 +380,8 @@ static inline enum zone_type gfp_zone(gfp_t flags)
enum zone_type z;
int bit = (__force int) (flags & GFP_ZONEMASK);
- z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) &
- ((1 << ZONES_SHIFT) - 1);
+ z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) &
+ ((1 << GFP_ZONES_SHIFT) - 1);
VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
return z;
}
@@ -515,13 +526,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
void page_alloc_init_late(void);
-#else
-static inline void page_alloc_init_late(void)
-{
-}
-#endif
/*
* gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index dfd59d6bc6f0..a477e0766d2e 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -67,10 +67,12 @@ extern void irq_exit(void);
preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
rcu_nmi_enter(); \
trace_hardirq_enter(); \
+ printk_nmi_enter(); \
} while (0)
#define nmi_exit() \
do { \
+ printk_nmi_exit(); \
trace_hardirq_exit(); \
rcu_nmi_exit(); \
BUG_ON(!in_nmi()); \
diff --git a/include/linux/kcov.h b/include/linux/kcov.h
new file mode 100644
index 000000000000..01b4845a4086
--- /dev/null
+++ b/include/linux/kcov.h
@@ -0,0 +1,27 @@
+#ifndef _LINUX_KCOV_H
+#define _LINUX_KCOV_H
+
+#include <uapi/linux/kcov.h>
+
+struct task_struct;
+
+#ifdef CONFIG_KCOV
+
+void kcov_task_init(struct task_struct *t);
+void kcov_task_exit(struct task_struct *t);
+
+enum kcov_mode {
+ /*
+ * Tracing coverage collection mode.
+ * Covered PCs are collected in a per-task buffer.
+ */
+ KCOV_MODE_TRACE = 1,
+};
+
+#else
+
+static inline void kcov_task_init(struct task_struct *t) {}
+static inline void kcov_task_exit(struct task_struct *t) {}
+
+#endif /* CONFIG_KCOV */
+#endif /* _LINUX_KCOV_H */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f31638c6e873..f4fa2b29c38c 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -357,6 +357,7 @@ int __must_check kstrtou16(const char *s, unsigned int base, u16 *res);
int __must_check kstrtos16(const char *s, unsigned int base, s16 *res);
int __must_check kstrtou8(const char *s, unsigned int base, u8 *res);
int __must_check kstrtos8(const char *s, unsigned int base, s8 *res);
+int __must_check kstrtobool(const char *s, bool *res);
int __must_check kstrtoull_from_user(const char __user *s, size_t count, unsigned int base, unsigned long long *res);
int __must_check kstrtoll_from_user(const char __user *s, size_t count, unsigned int base, long long *res);
@@ -368,6 +369,7 @@ int __must_check kstrtou16_from_user(const char __user *s, size_t count, unsigne
int __must_check kstrtos16_from_user(const char __user *s, size_t count, unsigned int base, s16 *res);
int __must_check kstrtou8_from_user(const char __user *s, size_t count, unsigned int base, u8 *res);
int __must_check kstrtos8_from_user(const char __user *s, size_t count, unsigned int base, s8 *res);
+int __must_check kstrtobool_from_user(const char __user *s, size_t count, bool *res);
static inline int __must_check kstrtou64_from_user(const char __user *s, size_t count, unsigned int base, u64 *res)
{
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 2cc643c6e870..f82d6a280e49 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -258,6 +258,8 @@ unsigned long paddr_vmcoreinfo_note(void);
vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
#define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+#define VMCOREINFO_PHYS_BASE(value) \
+ vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value)
extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
@@ -317,6 +319,8 @@ int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr,
Elf_Shdr *sechdrs, unsigned int relsec);
int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
unsigned int relsec);
+void arch_kexec_protect_crashkres(void);
+void arch_kexec_unprotect_crashkres(void);
#else /* !CONFIG_KEXEC_CORE */
struct pt_regs;
diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
index ee7229a6c06a..cb483305e1f5 100644
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -48,7 +48,7 @@ static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
#define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member)
-static inline int hlist_bl_unhashed(const struct hlist_bl_node *h)
+static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h)
{
return !h->pprev;
}
@@ -68,7 +68,7 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h,
h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK);
}
-static inline int hlist_bl_empty(const struct hlist_bl_head *h)
+static inline bool hlist_bl_empty(const struct hlist_bl_head *h)
{
return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK);
}
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 792c8981e633..1191d79aa495 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -28,6 +28,7 @@
#include <linux/eventfd.h>
#include <linux/mmzone.h>
#include <linux/writeback.h>
+#include <linux/page-flags.h>
struct mem_cgroup;
struct page;
@@ -51,7 +52,10 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
MEM_CGROUP_STAT_NSTATS,
/* default hierarchy stats */
- MEMCG_SOCK = MEM_CGROUP_STAT_NSTATS,
+ MEMCG_KERNEL_STACK = MEM_CGROUP_STAT_NSTATS,
+ MEMCG_SLAB_RECLAIMABLE,
+ MEMCG_SLAB_UNRECLAIMABLE,
+ MEMCG_SOCK,
MEMCG_NR_STAT,
};
@@ -89,6 +93,10 @@ enum mem_cgroup_events_target {
};
#ifdef CONFIG_MEMCG
+
+#define MEM_CGROUP_ID_SHIFT 16
+#define MEM_CGROUP_ID_MAX USHRT_MAX
+
struct mem_cgroup_stat_cpu {
long count[MEMCG_NR_STAT];
unsigned long events[MEMCG_NR_EVENTS];
@@ -265,6 +273,11 @@ struct mem_cgroup {
extern struct mem_cgroup *root_mem_cgroup;
+static inline bool mem_cgroup_disabled(void)
+{
+ return !cgroup_subsys_enabled(memory_cgrp_subsys);
+}
+
/**
* mem_cgroup_events - count memory events against a cgroup
* @memcg: the memory cgroup
@@ -291,7 +304,7 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
void mem_cgroup_uncharge(struct page *page);
void mem_cgroup_uncharge_list(struct list_head *page_list);
-void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage);
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
@@ -312,6 +325,28 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
struct mem_cgroup_reclaim_cookie *);
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+{
+ if (mem_cgroup_disabled())
+ return 0;
+
+ return memcg->css.id;
+}
+
+/**
+ * mem_cgroup_from_id - look up a memcg from an id
+ * @id: the id to look up
+ *
+ * Caller must hold rcu_read_lock() and use css_tryget() as necessary.
+ */
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+ struct cgroup_subsys_state *css;
+
+ css = css_from_id(id, &memory_cgrp_subsys);
+ return mem_cgroup_from_css(css);
+}
+
/**
* parent_mem_cgroup - find the accounting parent of a memcg
* @memcg: memcg whose parent to find
@@ -353,11 +388,6 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
ino_t page_cgroup_ino(struct page *page);
-static inline bool mem_cgroup_disabled(void)
-{
- return !cgroup_subsys_enabled(memory_cgrp_subsys);
-}
-
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
{
if (mem_cgroup_disabled())
@@ -373,6 +403,9 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int nr_pages);
+unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask);
+
static inline
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
@@ -429,36 +462,43 @@ bool mem_cgroup_oom_synchronize(bool wait);
extern int do_swap_account;
#endif
-struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page);
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg);
+void lock_page_memcg(struct page *page);
+void unlock_page_memcg(struct page *page);
/**
* mem_cgroup_update_page_stat - update page state statistics
- * @memcg: memcg to account against
+ * @page: the page
* @idx: page state item to account
* @val: number of pages (positive or negative)
*
- * See mem_cgroup_begin_page_stat() for locking requirements.
+ * The @page must be locked or the caller must use lock_page_memcg()
+ * to prevent double accounting when the page is concurrently being
+ * moved to another memcg:
+ *
+ * lock_page(page) or lock_page_memcg(page)
+ * if (TestClearPageState(page))
+ * mem_cgroup_update_page_stat(page, state, -1);
+ * unlock_page(page) or unlock_page_memcg(page)
*/
-static inline void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
+static inline void mem_cgroup_update_page_stat(struct page *page,
enum mem_cgroup_stat_index idx, int val)
{
- VM_BUG_ON(!rcu_read_lock_held());
+ VM_BUG_ON(!(rcu_read_lock_held() || PageLocked(page)));
- if (memcg)
- this_cpu_add(memcg->stat->count[idx], val);
+ if (page->mem_cgroup)
+ this_cpu_add(page->mem_cgroup->stat->count[idx], val);
}
-static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
+static inline void mem_cgroup_inc_page_stat(struct page *page,
enum mem_cgroup_stat_index idx)
{
- mem_cgroup_update_page_stat(memcg, idx, 1);
+ mem_cgroup_update_page_stat(page, idx, 1);
}
-static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg,
+static inline void mem_cgroup_dec_page_stat(struct page *page,
enum mem_cgroup_stat_index idx)
{
- mem_cgroup_update_page_stat(memcg, idx, -1);
+ mem_cgroup_update_page_stat(page, idx, -1);
}
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
@@ -496,8 +536,17 @@ void mem_cgroup_split_huge_fixup(struct page *head);
#endif
#else /* CONFIG_MEMCG */
+
+#define MEM_CGROUP_ID_SHIFT 0
+#define MEM_CGROUP_ID_MAX 0
+
struct mem_cgroup;
+static inline bool mem_cgroup_disabled(void)
+{
+ return true;
+}
+
static inline void mem_cgroup_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx,
unsigned int nr)
@@ -539,7 +588,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
{
}
-static inline void mem_cgroup_replace_page(struct page *old, struct page *new)
+static inline void mem_cgroup_migrate(struct page *old, struct page *new)
{
}
@@ -580,9 +629,16 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
{
}
-static inline bool mem_cgroup_disabled(void)
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
- return true;
+ return 0;
+}
+
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+ WARN_ON_ONCE(id);
+ /* XXX: This should always return root_mem_cgroup */
+ return NULL;
}
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
@@ -608,17 +664,23 @@ mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
{
}
+static inline unsigned long
+mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask)
+{
+ return 0;
+}
+
static inline void
mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
}
-static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
+static inline void lock_page_memcg(struct page *page)
{
- return NULL;
}
-static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
+static inline void unlock_page_memcg(struct page *page)
{
}
@@ -644,12 +706,12 @@ static inline bool mem_cgroup_oom_synchronize(bool wait)
return false;
}
-static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
+static inline void mem_cgroup_inc_page_stat(struct page *page,
enum mem_cgroup_stat_index idx)
{
}
-static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg,
+static inline void mem_cgroup_dec_page_stat(struct page *page,
enum mem_cgroup_stat_index idx)
{
}
@@ -743,11 +805,6 @@ static inline bool memcg_kmem_enabled(void)
return static_branch_unlikely(&memcg_kmem_enabled_key);
}
-static inline bool memcg_kmem_online(struct mem_cgroup *memcg)
-{
- return memcg->kmem_state == KMEM_ONLINE;
-}
-
/*
* In general, we'll do everything in our power to not incur in any overhead
* for non-memcg users for the kmem functions. Not even a function call, if we
@@ -765,7 +822,7 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
void __memcg_kmem_uncharge(struct page *page, int order);
/*
- * helper for acessing a memcg's index. It will be used as an index in the
+ * helper for accessing a memcg's index. It will be used as an index in the
* child cache array in kmem_cache, and also to derive its name. This function
* will return -1 when this is not a kmem-limited memcg.
*/
@@ -834,6 +891,20 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
if (memcg_kmem_enabled())
__memcg_kmem_put_cache(cachep);
}
+
+/**
+ * memcg_kmem_update_page_stat - update kmem page state statistics
+ * @page: the page
+ * @idx: page state item to account
+ * @val: number of pages (positive or negative)
+ */
+static inline void memcg_kmem_update_page_stat(struct page *page,
+ enum mem_cgroup_stat_index idx, int val)
+{
+ if (memcg_kmem_enabled() && page->mem_cgroup)
+ this_cpu_add(page->mem_cgroup->stat->count[idx], val);
+}
+
#else
#define for_each_memcg_cache_index(_idx) \
for (; NULL; )
@@ -843,11 +914,6 @@ static inline bool memcg_kmem_enabled(void)
return false;
}
-static inline bool memcg_kmem_online(struct mem_cgroup *memcg)
-{
- return false;
-}
-
static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
return 0;
@@ -879,6 +945,11 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
static inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
{
}
+
+static inline void memcg_kmem_update_page_stat(struct page *page,
+ enum mem_cgroup_stat_index idx, int val)
+{
+}
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
#endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 8b8d8d12348e..82730adba950 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -109,6 +109,9 @@ extern void unregister_memory_notifier(struct notifier_block *nb);
extern int register_memory_isolate_notifier(struct notifier_block *nb);
extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
extern int register_new_memory(int, struct mem_section *);
+extern int memory_block_change_state(struct memory_block *mem,
+ unsigned long to_state,
+ unsigned long from_state_req);
#ifdef CONFIG_MEMORY_HOTREMOVE
extern int unregister_memory_section(struct mem_section *);
#endif
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 43405992d027..adbef586e696 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -99,6 +99,8 @@ extern void __online_page_free(struct page *page);
extern int try_online_node(int nid);
+extern bool memhp_auto_online;
+
#ifdef CONFIG_MEMORY_HOTREMOVE
extern bool is_pageblock_removable_nolock(struct page *page);
extern int arch_remove_memory(u64 start, u64 size);
@@ -196,6 +198,9 @@ void put_online_mems(void);
void mem_hotplug_begin(void);
void mem_hotplug_done(void);
+extern void set_zone_contiguous(struct zone *zone);
+extern void clear_zone_contiguous(struct zone *zone);
+
#else /* ! CONFIG_MEMORY_HOTPLUG */
/*
* Stub functions for when hotplug is off
@@ -267,7 +272,7 @@ static inline void remove_memory(int nid, u64 start, u64 size) {}
extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
void *arg, int (*func)(struct memory_block *, void *));
extern int add_memory(int nid, u64 start, u64 size);
-extern int add_memory_resource(int nid, struct resource *resource);
+extern int add_memory_resource(int nid, struct resource *resource, bool online);
extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
bool for_device);
extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index cac1c0904d5f..9b50325e4ddf 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -23,9 +23,13 @@ enum migrate_reason {
MR_SYSCALL, /* also applies to cpusets */
MR_MEMPOLICY_MBIND,
MR_NUMA_MISPLACED,
- MR_CMA
+ MR_CMA,
+ MR_TYPES
};
+/* In mm/debug.c; also keep sync with include/trace/events/migrate.h */
+extern char *migrate_reason_names[MR_TYPES];
+
#ifdef CONFIG_MIGRATION
extern void putback_movable_pages(struct list_head *l);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3579d1e2fe3a..a0ad7af5a1a2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -905,20 +905,11 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
{
return page->mem_cgroup;
}
-
-static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg)
-{
- page->mem_cgroup = memcg;
-}
#else
static inline struct mem_cgroup *page_memcg(struct page *page)
{
return NULL;
}
-
-static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg)
-{
-}
#endif
/*
@@ -1114,6 +1105,8 @@ struct zap_details {
struct address_space *check_mapping; /* Check page->mapping if set */
pgoff_t first_index; /* Lowest page->index to unmap */
pgoff_t last_index; /* Highest page->index to unmap */
+ bool ignore_dirty; /* Ignore dirty pages */
+ bool check_swap_entries; /* Check also swap entries */
};
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
@@ -1300,10 +1293,9 @@ int __set_page_dirty_nobuffers(struct page *page);
int __set_page_dirty_no_writeback(struct page *page);
int redirty_page_for_writepage(struct writeback_control *wbc,
struct page *page);
-void account_page_dirtied(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg);
+void account_page_dirtied(struct page *page, struct address_space *mapping);
void account_page_cleaned(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg, struct bdi_writeback *wb);
+ struct bdi_writeback *wb);
int set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);
void cancel_dirty_page(struct page *page);
@@ -2178,6 +2170,19 @@ extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
unsigned long size, pte_fn_t fn, void *data);
+#ifdef CONFIG_PAGE_POISONING
+extern void poison_pages(struct page *page, int n);
+extern void unpoison_pages(struct page *page, int n);
+extern bool page_poisoning_enabled(void);
+extern void kernel_poison_pages(struct page *page, int numpages, int enable);
+#else
+static inline void poison_pages(struct page *page, int n) { }
+static inline void unpoison_pages(struct page *page, int n) { }
+static inline bool page_poisoning_enabled(void) { return false; }
+static inline void kernel_poison_pages(struct page *page, int numpages,
+ int enable) { }
+#endif
+
#ifdef CONFIG_DEBUG_PAGEALLOC
extern bool _debug_pagealloc_enabled;
extern void __kernel_map_pages(struct page *page, int numpages, int enable);
@@ -2197,14 +2202,18 @@ kernel_map_pages(struct page *page, int numpages, int enable)
}
#ifdef CONFIG_HIBERNATION
extern bool kernel_page_present(struct page *page);
-#endif /* CONFIG_HIBERNATION */
-#else
+#endif /* CONFIG_HIBERNATION */
+#else /* CONFIG_DEBUG_PAGEALLOC */
static inline void
kernel_map_pages(struct page *page, int numpages, int enable) {}
#ifdef CONFIG_HIBERNATION
static inline bool kernel_page_present(struct page *page) { return true; }
-#endif /* CONFIG_HIBERNATION */
-#endif
+#endif /* CONFIG_HIBERNATION */
+static inline bool debug_pagealloc_enabled(void)
+{
+ return false;
+}
+#endif /* CONFIG_DEBUG_PAGEALLOC */
#ifdef __HAVE_ARCH_GATE_AREA
extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 053824b0a412..de7be78c6f0e 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -9,8 +9,7 @@ struct vm_area_struct;
struct mm_struct;
extern void dump_page(struct page *page, const char *reason);
-extern void dump_page_badflags(struct page *page, const char *reason,
- unsigned long badflags);
+extern void __dump_page(struct page *page, const char *reason);
void dump_vma(const struct vm_area_struct *vma);
void dump_mm(const struct mm_struct *mm);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7b6c2cfee390..bdd9a270a813 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -63,6 +63,9 @@ enum {
MIGRATE_TYPES
};
+/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
+extern char * const migratetype_names[MIGRATE_TYPES];
+
#ifdef CONFIG_CMA
# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
#else
@@ -209,10 +212,12 @@ struct zone_reclaim_stat {
};
struct lruvec {
- struct list_head lists[NR_LRU_LISTS];
- struct zone_reclaim_stat reclaim_stat;
+ struct list_head lists[NR_LRU_LISTS];
+ struct zone_reclaim_stat reclaim_stat;
+ /* Evictions & activations on the inactive file list */
+ atomic_long_t inactive_age;
#ifdef CONFIG_MEMCG
- struct zone *zone;
+ struct zone *zone;
#endif
};
@@ -487,9 +492,6 @@ struct zone {
spinlock_t lru_lock;
struct lruvec lruvec;
- /* Evictions & activations on the inactive file list */
- atomic_long_t inactive_age;
-
/*
* When free pages are below this point, additional steps are taken
* when reading the number of free pages to avoid per-cpu counter
@@ -520,6 +522,8 @@ struct zone {
bool compact_blockskip_flush;
#endif
+ bool contiguous;
+
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
@@ -664,6 +668,12 @@ typedef struct pglist_data {
mem_hotplug_begin/end() */
int kswapd_max_order;
enum zone_type classzone_idx;
+#ifdef CONFIG_COMPACTION
+ int kcompactd_max_order;
+ enum zone_type kcompactd_classzone_idx;
+ wait_queue_head_t kcompactd_wait;
+ struct task_struct *kcompactd;
+#endif
#ifdef CONFIG_NUMA_BALANCING
/* Lock serializing the migrate rate limiting window */
spinlock_t numabalancing_migrate_lock;
@@ -758,6 +768,8 @@ static inline struct zone *lruvec_zone(struct lruvec *lruvec)
#endif
}
+extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
+
#ifdef CONFIG_HAVE_MEMORY_PRESENT
void memory_present(int nid, unsigned long start, unsigned long end);
#else
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 03e6257321f0..45993b840ed6 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -91,7 +91,7 @@ extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
extern bool out_of_memory(struct oom_control *oc);
-extern void exit_oom_victim(void);
+extern void exit_oom_victim(struct task_struct *tsk);
extern int register_oom_notifier(struct notifier_block *nb);
extern int unregister_oom_notifier(struct notifier_block *nb);
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index da523661500a..77b078c103b2 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -17,6 +17,8 @@
#define ZONES_SHIFT 1
#elif MAX_NR_ZONES <= 4
#define ZONES_SHIFT 2
+#elif MAX_NR_ZONES <= 8
+#define ZONES_SHIFT 3
#else
#error ZONES_SHIFT -- too many zones configured adjust calculation
#endif
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 19724e6ebd26..f4ed4f1b0c77 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -144,12 +144,12 @@ static inline struct page *compound_head(struct page *page)
return page;
}
-static inline int PageTail(struct page *page)
+static __always_inline int PageTail(struct page *page)
{
return READ_ONCE(page->compound_head) & 1;
}
-static inline int PageCompound(struct page *page)
+static __always_inline int PageCompound(struct page *page)
{
return test_bit(PG_head, &page->flags) || PageTail(page);
}
@@ -184,31 +184,31 @@ static inline int PageCompound(struct page *page)
* Macros to create function definitions for page flags
*/
#define TESTPAGEFLAG(uname, lname, policy) \
-static inline int Page##uname(struct page *page) \
+static __always_inline int Page##uname(struct page *page) \
{ return test_bit(PG_##lname, &policy(page, 0)->flags); }
#define SETPAGEFLAG(uname, lname, policy) \
-static inline void SetPage##uname(struct page *page) \
+static __always_inline void SetPage##uname(struct page *page) \
{ set_bit(PG_##lname, &policy(page, 1)->flags); }
#define CLEARPAGEFLAG(uname, lname, policy) \
-static inline void ClearPage##uname(struct page *page) \
+static __always_inline void ClearPage##uname(struct page *page) \
{ clear_bit(PG_##lname, &policy(page, 1)->flags); }
#define __SETPAGEFLAG(uname, lname, policy) \
-static inline void __SetPage##uname(struct page *page) \
+static __always_inline void __SetPage##uname(struct page *page) \
{ __set_bit(PG_##lname, &policy(page, 1)->flags); }
#define __CLEARPAGEFLAG(uname, lname, policy) \
-static inline void __ClearPage##uname(struct page *page) \
+static __always_inline void __ClearPage##uname(struct page *page) \
{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }
#define TESTSETFLAG(uname, lname, policy) \
-static inline int TestSetPage##uname(struct page *page) \
+static __always_inline int TestSetPage##uname(struct page *page) \
{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
#define TESTCLEARFLAG(uname, lname, policy) \
-static inline int TestClearPage##uname(struct page *page) \
+static __always_inline int TestClearPage##uname(struct page *page) \
{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
#define PAGEFLAG(uname, lname, policy) \
@@ -371,7 +371,7 @@ PAGEFLAG(Idle, idle, PF_ANY)
#define PAGE_MAPPING_KSM 2
#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
-static inline int PageAnon(struct page *page)
+static __always_inline int PageAnon(struct page *page)
{
page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
@@ -384,7 +384,7 @@ static inline int PageAnon(struct page *page)
* is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any
* anon_vma, but to that page's node of the stable tree.
*/
-static inline int PageKsm(struct page *page)
+static __always_inline int PageKsm(struct page *page)
{
page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
@@ -415,14 +415,14 @@ static inline int PageUptodate(struct page *page)
return ret;
}
-static inline void __SetPageUptodate(struct page *page)
+static __always_inline void __SetPageUptodate(struct page *page)
{
VM_BUG_ON_PAGE(PageTail(page), page);
smp_wmb();
__set_bit(PG_uptodate, &page->flags);
}
-static inline void SetPageUptodate(struct page *page)
+static __always_inline void SetPageUptodate(struct page *page)
{
VM_BUG_ON_PAGE(PageTail(page), page);
/*
@@ -456,12 +456,12 @@ static inline void set_page_writeback_keepwrite(struct page *page)
__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
-static inline void set_compound_head(struct page *page, struct page *head)
+static __always_inline void set_compound_head(struct page *page, struct page *head)
{
WRITE_ONCE(page->compound_head, (unsigned long)head + 1);
}
-static inline void clear_compound_head(struct page *page)
+static __always_inline void clear_compound_head(struct page *page)
{
WRITE_ONCE(page->compound_head, 0);
}
@@ -593,6 +593,8 @@ static inline void __ClearPageBuddy(struct page *page)
atomic_set(&page->_mapcount, -1);
}
+extern bool is_free_buddy_page(struct page *page);
+
#define PAGE_BALLOON_MAPCOUNT_VALUE (-256)
static inline int PageBalloon(struct page *page)
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 17f118a82854..e1fe7cf5bddf 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -45,6 +45,7 @@ struct page_ext {
unsigned int order;
gfp_t gfp_mask;
unsigned int nr_entries;
+ int last_migrate_reason;
unsigned long trace_entries[8];
#endif
};
diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index cacaabea8a09..46f1b939948c 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -1,38 +1,54 @@
#ifndef __LINUX_PAGE_OWNER_H
#define __LINUX_PAGE_OWNER_H
+#include <linux/jump_label.h>
+
#ifdef CONFIG_PAGE_OWNER
-extern bool page_owner_inited;
+extern struct static_key_false page_owner_inited;
extern struct page_ext_operations page_owner_ops;
extern void __reset_page_owner(struct page *page, unsigned int order);
extern void __set_page_owner(struct page *page,
unsigned int order, gfp_t gfp_mask);
extern gfp_t __get_page_owner_gfp(struct page *page);
+extern void __copy_page_owner(struct page *oldpage, struct page *newpage);
+extern void __set_page_owner_migrate_reason(struct page *page, int reason);
+extern void __dump_page_owner(struct page *page);
static inline void reset_page_owner(struct page *page, unsigned int order)
{
- if (likely(!page_owner_inited))
- return;
-
- __reset_page_owner(page, order);
+ if (static_branch_unlikely(&page_owner_inited))
+ __reset_page_owner(page, order);
}
static inline void set_page_owner(struct page *page,
unsigned int order, gfp_t gfp_mask)
{
- if (likely(!page_owner_inited))
- return;
-
- __set_page_owner(page, order, gfp_mask);
+ if (static_branch_unlikely(&page_owner_inited))
+ __set_page_owner(page, order, gfp_mask);
}
static inline gfp_t get_page_owner_gfp(struct page *page)
{
- if (likely(!page_owner_inited))
+ if (static_branch_unlikely(&page_owner_inited))
+ return __get_page_owner_gfp(page);
+ else
return 0;
-
- return __get_page_owner_gfp(page);
+}
+static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
+{
+ if (static_branch_unlikely(&page_owner_inited))
+ __copy_page_owner(oldpage, newpage);
+}
+static inline void set_page_owner_migrate_reason(struct page *page, int reason)
+{
+ if (static_branch_unlikely(&page_owner_inited))
+ __set_page_owner_migrate_reason(page, reason);
+}
+static inline void dump_page_owner(struct page *page)
+{
+ if (static_branch_unlikely(&page_owner_inited))
+ __dump_page_owner(page);
}
#else
static inline void reset_page_owner(struct page *page, unsigned int order)
@@ -46,6 +62,14 @@ static inline gfp_t get_page_owner_gfp(struct page *page)
{
return 0;
}
-
+static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
+{
+}
+static inline void set_page_owner_migrate_reason(struct page *page, int reason)
+{
+}
+static inline void dump_page_owner(struct page *page)
+{
+}
#endif /* CONFIG_PAGE_OWNER */
#endif /* __LINUX_PAGE_OWNER_H */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 92395a0a7dc5..183b15ea052b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -663,8 +663,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
extern void delete_from_page_cache(struct page *page);
-extern void __delete_from_page_cache(struct page *page, void *shadow,
- struct mem_cgroup *memcg);
+extern void __delete_from_page_cache(struct page *page, void *shadow);
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
/*
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 4bc6dafb703e..56939d3f6e53 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -129,7 +129,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
(typeof(type) __percpu *)__alloc_percpu(sizeof(type), \
__alignof__(type))
-/* To avoid include hell, as printk can not declare this, we declare it here */
-DECLARE_PER_CPU(printk_func_t, printk_func);
-
#endif /* __LINUX_PERCPU_H */
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 4a27153574e2..51334edec506 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -30,7 +30,11 @@
#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
/********** mm/debug-pagealloc.c **********/
+#ifdef CONFIG_PAGE_POISONING_ZERO
+#define PAGE_POISON 0x00
+#else
#define PAGE_POISON 0xaa
+#endif
/********** mm/page_alloc.c ************/
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9ccbdf2c1453..51dd6b824fe2 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -122,7 +122,17 @@ static inline __printf(1, 2) __cold
void early_printk(const char *s, ...) { }
#endif
-typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
+#ifdef CONFIG_PRINTK_NMI
+extern void printk_nmi_init(void);
+extern void printk_nmi_enter(void);
+extern void printk_nmi_exit(void);
+extern void printk_nmi_flush(void);
+#else
+static inline void printk_nmi_init(void) { }
+static inline void printk_nmi_enter(void) { }
+static inline void printk_nmi_exit(void) { }
+static inline void printk_nmi_flush(void) { }
+#endif /* PRINTK_NMI */
#ifdef CONFIG_PRINTK
asmlinkage __printf(5, 0)
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index f54be7082207..a59f940a54f5 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -21,6 +21,7 @@
#ifndef _LINUX_RADIX_TREE_H
#define _LINUX_RADIX_TREE_H
+#include <linux/bitops.h>
#include <linux/preempt.h>
#include <linux/types.h>
#include <linux/bug.h>
@@ -270,8 +271,15 @@ static inline void radix_tree_replace_slot(void **pslot, void *item)
}
int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
- struct radix_tree_node **nodep, void ***slotp);
-int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+ unsigned order, struct radix_tree_node **nodep,
+ void ***slotp);
+int __radix_tree_insert(struct radix_tree_root *, unsigned long index,
+ unsigned order, void *);
+static inline int radix_tree_insert(struct radix_tree_root *root,
+ unsigned long index, void *entry)
+{
+ return __radix_tree_insert(root, index, 0, entry);
+}
void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
struct radix_tree_node **nodep, void ***slotp);
void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
@@ -395,6 +403,21 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter)
}
/**
+ * radix_tree_iter_next - resume iterating when the chunk may be invalid
+ * @iter: iterator state
+ *
+ * If the iterator needs to release then reacquire a lock, the chunk may
+ * have been invalidated by an insertion or deletion. Call this function
+ * to continue the iteration from the next index.
+ */
+static inline __must_check
+void **radix_tree_iter_next(struct radix_tree_iter *iter)
+{
+ iter->next_index = iter->index + 1;
+ return NULL;
+}
+
+/**
* radix_tree_chunk_size - get current chunk size
*
* @iter: pointer to radix tree iterator
diff --git a/include/linux/random.h b/include/linux/random.h
index a75840c1aa71..9c29122037f9 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -34,6 +34,7 @@ extern const struct file_operations random_fops, urandom_fops;
#endif
unsigned int get_random_int(void);
+unsigned long get_random_long(void);
unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len);
u32 prandom_u32(void);
diff --git a/include/linux/rio.h b/include/linux/rio.h
index cde976e86b48..eca09bfe7f60 100644
--- a/include/linux/rio.h
+++ b/include/linux/rio.h
@@ -137,6 +137,13 @@ struct rio_switch_ops {
int (*em_handle) (struct rio_dev *dev, u8 swport);
};
+enum rio_device_state {
+ RIO_DEVICE_INITIALIZING,
+ RIO_DEVICE_RUNNING,
+ RIO_DEVICE_GONE,
+ RIO_DEVICE_SHUTDOWN,
+};
+
/**
* struct rio_dev - RIO device info
* @global_list: Node in list of all RIO devices
@@ -165,6 +172,7 @@ struct rio_switch_ops {
* @destid: Network destination ID (or associated destid for switch)
* @hopcount: Hopcount to this device
* @prev: Previous RIO device connected to the current one
+ * @state: device state
* @rswitch: struct rio_switch (if valid for this device)
*/
struct rio_dev {
@@ -194,6 +202,7 @@ struct rio_dev {
u16 destid;
u8 hopcount;
struct rio_dev *prev;
+ atomic_t state;
struct rio_switch rswitch[0]; /* RIO switch info */
};
@@ -202,6 +211,7 @@ struct rio_dev {
#define to_rio_dev(n) container_of(n, struct rio_dev, dev)
#define sw_to_rio_dev(n) container_of(n, struct rio_dev, rswitch[0])
#define to_rio_mport(n) container_of(n, struct rio_mport, dev)
+#define to_rio_net(n) container_of(n, struct rio_net, dev)
/**
* struct rio_msg - RIO message event
@@ -235,8 +245,11 @@ enum rio_phy_type {
/**
* struct rio_mport - RIO master port info
* @dbells: List of doorbell events
+ * @pwrites: List of portwrite events
* @node: Node in global list of master ports
* @nnode: Node in network list of master ports
+ * @net: RIO net this mport is attached to
+ * @mutex lock: lock to synchronize lists manipulations
* @iores: I/O mem resource that this master port interface owns
* @riores: RIO resources that this master port interfaces owns
* @inb_msg: RIO inbound message event descriptors
@@ -253,11 +266,16 @@ enum rio_phy_type {
* @priv: Master port private data
* @dma: DMA device associated with mport
* @nscan: RapidIO network enumeration/discovery operations
+ * @state: mport device state
+ * @pwe_refcnt: port-write enable ref counter to track enable/disable requests
*/
struct rio_mport {
struct list_head dbells; /* list of doorbell events */
+ struct list_head pwrites; /* list of portwrite events */
struct list_head node; /* node in global list of ports */
struct list_head nnode; /* node in net list of ports */
+ struct rio_net *net; /* RIO net this mport is attached to */
+ struct mutex lock; /* lock to synchronize lists manipulations */
struct resource iores;
struct resource riores[RIO_MAX_MPORT_RESOURCES];
struct rio_msg inb_msg[RIO_MAX_MBOX];
@@ -280,20 +298,20 @@ struct rio_mport {
struct dma_device dma;
#endif
struct rio_scan *nscan;
+ atomic_t state;
+ unsigned int pwe_refcnt;
};
+static inline int rio_mport_is_running(struct rio_mport *mport)
+{
+ return atomic_read(&mport->state) == RIO_DEVICE_RUNNING;
+}
+
/*
* Enumeration/discovery control flags
*/
#define RIO_SCAN_ENUM_NO_WAIT 0x00000001 /* Do not wait for enum completed */
-struct rio_id_table {
- u16 start; /* logical minimal id */
- u32 max; /* max number of IDs in table */
- spinlock_t lock;
- unsigned long *table;
-};
-
/**
* struct rio_net - RIO network info
* @node: Node in global list of RIO networks
@@ -302,7 +320,8 @@ struct rio_id_table {
* @mports: List of master ports accessing this network
* @hport: Default port for accessing this network
* @id: RIO network ID
- * @destid_table: destID allocation table
+ * @enum_data: private data specific to a network enumerator
+ * @release: enumerator-specific release callback
*/
struct rio_net {
struct list_head node; /* node in list of networks */
@@ -311,7 +330,53 @@ struct rio_net {
struct list_head mports; /* list of ports accessing net */
struct rio_mport *hport; /* primary port for accessing net */
unsigned char id; /* RIO network ID */
- struct rio_id_table destid_table; /* destID allocation table */
+ struct device dev;
+ void *enum_data; /* private data for enumerator of the network */
+ void (*release)(struct rio_net *net);
+};
+
+enum rio_link_speed {
+ RIO_LINK_DOWN = 0, /* SRIO Link not initialized */
+ RIO_LINK_125 = 1, /* 1.25 GBaud */
+ RIO_LINK_250 = 2, /* 2.5 GBaud */
+ RIO_LINK_312 = 3, /* 3.125 GBaud */
+ RIO_LINK_500 = 4, /* 5.0 GBaud */
+ RIO_LINK_625 = 5 /* 6.25 GBaud */
+};
+
+enum rio_link_width {
+ RIO_LINK_1X = 0,
+ RIO_LINK_1XR = 1,
+ RIO_LINK_2X = 3,
+ RIO_LINK_4X = 2,
+ RIO_LINK_8X = 4,
+ RIO_LINK_16X = 5
+};
+
+enum rio_mport_flags {
+ RIO_MPORT_DMA = (1 << 0), /* supports DMA data transfers */
+ RIO_MPORT_DMA_SG = (1 << 1), /* DMA supports HW SG mode */
+ RIO_MPORT_IBSG = (1 << 2), /* inbound mapping supports SG */
+};
+
+/**
+ * struct rio_mport_attr - RIO mport device attributes
+ * @flags: mport device capability flags
+ * @link_speed: SRIO link speed value (as defined by RapidIO specification)
+ * @link_width: SRIO link width value (as defined by RapidIO specification)
+ * @dma_max_sge: number of SG list entries that can be handled by DMA channel(s)
+ * @dma_max_size: max number of bytes in single DMA transfer (SG entry)
+ * @dma_align: alignment shift for DMA operations (as for other DMA operations)
+ */
+struct rio_mport_attr {
+ int flags;
+ int link_speed;
+ int link_width;
+
+ /* DMA capability info: valid only if RIO_MPORT_DMA flag is set */
+ int dma_max_sge;
+ int dma_max_size;
+ int dma_align;
};
/* Low-level architecture-dependent routines */
@@ -333,6 +398,9 @@ struct rio_net {
* @get_inb_message: Callback to get a message from an inbound mailbox queue.
* @map_inb: Callback to map RapidIO address region into local memory space.
* @unmap_inb: Callback to unmap RapidIO address region mapped with map_inb().
+ * @query_mport: Callback to query mport device attributes.
+ * @map_outb: Callback to map outbound address region into local memory space.
+ * @unmap_outb: Callback to unmap outbound RapidIO address region.
*/
struct rio_ops {
int (*lcread) (struct rio_mport *mport, int index, u32 offset, int len,
@@ -358,6 +426,11 @@ struct rio_ops {
int (*map_inb)(struct rio_mport *mport, dma_addr_t lstart,
u64 rstart, u32 size, u32 flags);
void (*unmap_inb)(struct rio_mport *mport, dma_addr_t lstart);
+ int (*query_mport)(struct rio_mport *mport,
+ struct rio_mport_attr *attr);
+ int (*map_outb)(struct rio_mport *mport, u16 destid, u64 rstart,
+ u32 size, u32 flags, dma_addr_t *laddr);
+ void (*unmap_outb)(struct rio_mport *mport, u16 destid, u64 rstart);
};
#define RIO_RESOURCE_MEM 0x00000100
@@ -376,6 +449,7 @@ struct rio_ops {
* @id_table: RIO device ids to be associated with this driver
* @probe: RIO device inserted
* @remove: RIO device removed
+ * @shutdown: shutdown notification callback
* @suspend: RIO device suspended
* @resume: RIO device awakened
* @enable_wake: RIO device enable wake event
@@ -390,6 +464,7 @@ struct rio_driver {
const struct rio_device_id *id_table;
int (*probe) (struct rio_dev * dev, const struct rio_device_id * id);
void (*remove) (struct rio_dev * dev);
+ void (*shutdown)(struct rio_dev *dev);
int (*suspend) (struct rio_dev * dev, u32 state);
int (*resume) (struct rio_dev * dev);
int (*enable_wake) (struct rio_dev * dev, u32 state, int enable);
@@ -476,10 +551,14 @@ struct rio_scan_node {
};
/* Architecture and hardware-specific functions */
+extern int rio_mport_initialize(struct rio_mport *);
extern int rio_register_mport(struct rio_mport *);
+extern int rio_unregister_mport(struct rio_mport *);
extern int rio_open_inb_mbox(struct rio_mport *, void *, int, int);
extern void rio_close_inb_mbox(struct rio_mport *, int);
extern int rio_open_outb_mbox(struct rio_mport *, void *, int, int);
extern void rio_close_outb_mbox(struct rio_mport *, int);
+extern int rio_query_mport(struct rio_mport *port,
+ struct rio_mport_attr *mport_attr);
#endif /* LINUX_RIO_H */
diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h
index 9fc2f213e74f..0834264fb7f2 100644
--- a/include/linux/rio_drv.h
+++ b/include/linux/rio_drv.h
@@ -369,12 +369,24 @@ void rio_release_region(struct rio_dev *, int);
extern int rio_map_inb_region(struct rio_mport *mport, dma_addr_t local,
u64 rbase, u32 size, u32 rflags);
extern void rio_unmap_inb_region(struct rio_mport *mport, dma_addr_t lstart);
+extern int rio_map_outb_region(struct rio_mport *mport, u16 destid, u64 rbase,
+ u32 size, u32 rflags, dma_addr_t *local);
+extern void rio_unmap_outb_region(struct rio_mport *mport,
+ u16 destid, u64 rstart);
/* Port-Write management */
extern int rio_request_inb_pwrite(struct rio_dev *,
int (*)(struct rio_dev *, union rio_pw_msg*, int));
extern int rio_release_inb_pwrite(struct rio_dev *);
-extern int rio_inb_pwrite_handler(union rio_pw_msg *pw_msg);
+extern int rio_add_mport_pw_handler(struct rio_mport *mport, void *dev_id,
+ int (*pwcback)(struct rio_mport *mport, void *dev_id,
+ union rio_pw_msg *msg, int step));
+extern int rio_del_mport_pw_handler(struct rio_mport *mport, void *dev_id,
+ int (*pwcback)(struct rio_mport *mport, void *dev_id,
+ union rio_pw_msg *msg, int step));
+extern int rio_inb_pwrite_handler(struct rio_mport *mport,
+ union rio_pw_msg *pw_msg);
+extern void rio_pw_enable(struct rio_mport *mport, int enable);
/* LDM support */
int rio_register_driver(struct rio_driver *);
@@ -435,6 +447,7 @@ static inline void rio_set_drvdata(struct rio_dev *rdev, void *data)
/* Misc driver helpers */
extern u16 rio_local_get_device_id(struct rio_mport *port);
+extern void rio_local_set_device_id(struct rio_mport *port, u16 did);
extern struct rio_dev *rio_get_device(u16 vid, u16 did, struct rio_dev *from);
extern struct rio_dev *rio_get_asm(u16 vid, u16 did, u16 asm_vid, u16 asm_did,
struct rio_dev *from);
diff --git a/include/linux/rio_mport_cdev.h b/include/linux/rio_mport_cdev.h
new file mode 100644
index 000000000000..b65d19df76d2
--- /dev/null
+++ b/include/linux/rio_mport_cdev.h
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2015-2016, Integrated Device Technology Inc.
+ * Copyright (c) 2015, Prodrive Technologies
+ * Copyright (c) 2015, Texas Instruments Incorporated
+ * Copyright (c) 2015, RapidIO Trade Association
+ * All rights reserved.
+ *
+ * This software is available to you under a choice of one of two licenses.
+ * You may choose to be licensed under the terms of the GNU General Public
+ * License(GPL) Version 2, or the BSD-3 Clause license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RIO_MPORT_CDEV_H_
+#define _RIO_MPORT_CDEV_H_
+
+#ifndef __user
+#define __user
+#endif
+
+struct rio_mport_maint_io {
+ uint32_t rioid; /* destID of remote device */
+ uint32_t hopcount; /* hopcount to remote device */
+ uint32_t offset; /* offset in register space */
+ size_t length; /* length in bytes */
+ void __user *buffer; /* data buffer */
+};
+
+/*
+ * Definitions for RapidIO data transfers:
+ * - memory mapped (MAPPED)
+ * - packet generation from memory (TRANSFER)
+ */
+#define RIO_TRANSFER_MODE_MAPPED (1 << 0)
+#define RIO_TRANSFER_MODE_TRANSFER (1 << 1)
+#define RIO_CAP_DBL_SEND (1 << 2)
+#define RIO_CAP_DBL_RECV (1 << 3)
+#define RIO_CAP_PW_SEND (1 << 4)
+#define RIO_CAP_PW_RECV (1 << 5)
+#define RIO_CAP_MAP_OUTB (1 << 6)
+#define RIO_CAP_MAP_INB (1 << 7)
+
+struct rio_mport_properties {
+ uint16_t hdid;
+ uint8_t id; /* Physical port ID */
+ uint8_t index;
+ uint32_t flags;
+ uint32_t sys_size; /* Default addressing size */
+ uint8_t port_ok;
+ uint8_t link_speed;
+ uint8_t link_width;
+ uint32_t dma_max_sge;
+ uint32_t dma_max_size;
+ uint32_t dma_align;
+ uint32_t transfer_mode; /* Default transfer mode */
+ uint32_t cap_sys_size; /* Capable system sizes */
+ uint32_t cap_addr_size; /* Capable addressing sizes */
+ uint32_t cap_transfer_mode; /* Capable transfer modes */
+ uint32_t cap_mport; /* Mport capabilities */
+};
+
+/*
+ * Definitions for RapidIO events;
+ * - incoming port-writes
+ * - incoming doorbells
+ */
+#define RIO_DOORBELL (1 << 0)
+#define RIO_PORTWRITE (1 << 1)
+
+struct rio_doorbell {
+ uint32_t rioid;
+ uint16_t payload;
+};
+
+struct rio_doorbell_filter {
+ uint32_t rioid; /* 0xffffffff to match all ids */
+ uint16_t low;
+ uint16_t high;
+};
+
+
+struct rio_portwrite {
+ uint32_t payload[16];
+};
+
+struct rio_pw_filter {
+ uint32_t mask;
+ uint32_t low;
+ uint32_t high;
+};
+
+/* RapidIO base address for inbound requests set to value defined below
+ * indicates that no specific RIO-to-local address translation is requested
+ * and driver should use direct (one-to-one) address mapping.
+*/
+#define RIO_MAP_ANY_ADDR (uint64_t)(~((uint64_t) 0))
+
+struct rio_mmap {
+ uint32_t rioid;
+ uint64_t rio_addr;
+ uint64_t length;
+ uint64_t handle;
+ void *address;
+};
+
+struct rio_dma_mem {
+ uint64_t length; /* length of DMA memory */
+ uint64_t dma_handle; /* handle associated with this memory */
+ void *buffer; /* pointer to this memory */
+};
+
+
+struct rio_event {
+ unsigned int header; /* event type RIO_DOORBELL or RIO_PORTWRITE */
+ union {
+ struct rio_doorbell doorbell; /* header for RIO_DOORBELL */
+ struct rio_portwrite portwrite; /* header for RIO_PORTWRITE */
+ } u;
+};
+
+enum rio_transfer_sync {
+ RIO_TRANSFER_SYNC, /* synchronous transfer */
+ RIO_TRANSFER_ASYNC, /* asynchronous transfer */
+ RIO_TRANSFER_FAF, /* fire-and-forget transfer */
+};
+
+enum rio_transfer_dir {
+ RIO_TRANSFER_DIR_READ, /* Read operation */
+ RIO_TRANSFER_DIR_WRITE, /* Write operation */
+};
+
+/*
+ * RapidIO data exchange transactions are lists of individual transfers. Each
+ * transfer exchanges data between two RapidIO devices by remote direct memory
+ * access and has its own completion code.
+ *
+ * The RapidIO specification defines four types of data exchange requests:
+ * NREAD, NWRITE, SWRITE and NWRITE_R. The RapidIO DMA channel interface allows
+ * to specify the required type of write operation or combination of them when
+ * only the last data packet requires response.
+ *
+ * NREAD: read up to 256 bytes from remote device memory into local memory
+ * NWRITE: write up to 256 bytes from local memory to remote device memory
+ * without confirmation
+ * SWRITE: as NWRITE, but all addresses and payloads must be 64-bit aligned
+ * NWRITE_R: as NWRITE, but expect acknowledgment from remote device.
+ *
+ * The default exchange is chosen from NREAD and any of the WRITE modes as the
+ * driver sees fit. For write requests the user can explicitly choose between
+ * any of the write modes for each transaction.
+ */
+enum rio_exchange {
+ RIO_EXCHANGE_DEFAULT, /* Default method */
+ RIO_EXCHANGE_NWRITE, /* All packets using NWRITE */
+ RIO_EXCHANGE_SWRITE, /* All packets using SWRITE */
+ RIO_EXCHANGE_NWRITE_R, /* Last packet NWRITE_R, others NWRITE */
+ RIO_EXCHANGE_SWRITE_R, /* Last packet NWRITE_R, others SWRITE */
+ RIO_EXCHANGE_NWRITE_R_ALL, /* All packets using NWRITE_R */
+};
+
+struct rio_transfer_io {
+ uint32_t rioid; /* Target destID */
+ uint64_t rio_addr; /* Address in target's RIO mem space */
+ enum rio_exchange method; /* Data exchange method */
+ void __user *loc_addr;
+ uint64_t handle;
+ uint64_t offset; /* Offset in buffer */
+ uint64_t length; /* Length in bytes */
+ uint32_t completion_code; /* Completion code for this transfer */
+};
+
+struct rio_transaction {
+ uint32_t transfer_mode; /* Data transfer mode */
+ enum rio_transfer_sync sync; /* Synchronization method */
+ enum rio_transfer_dir dir; /* Transfer direction */
+ size_t count; /* Number of transfers */
+ struct rio_transfer_io __user *block; /* Array of <count> transfers */
+};
+
+struct rio_async_tx_wait {
+ uint32_t token; /* DMA transaction ID token */
+ uint32_t timeout; /* Wait timeout in msec, if 0 use default TO */
+};
+
+#define RIO_MAX_DEVNAME_SZ 20
+
+struct rio_rdev_info {
+ uint32_t destid;
+ uint8_t hopcount;
+ uint32_t comptag;
+ char name[RIO_MAX_DEVNAME_SZ + 1];
+};
+
+/* Driver IOCTL codes */
+#define RIO_MPORT_DRV_MAGIC 'm'
+
+#define RIO_MPORT_MAINT_HDID_SET \
+ _IOW(RIO_MPORT_DRV_MAGIC, 1, uint16_t)
+#define RIO_MPORT_MAINT_COMPTAG_SET \
+ _IOW(RIO_MPORT_DRV_MAGIC, 2, uint32_t)
+#define RIO_MPORT_MAINT_PORT_IDX_GET \
+ _IOR(RIO_MPORT_DRV_MAGIC, 3, uint32_t)
+#define RIO_MPORT_GET_PROPERTIES \
+ _IOR(RIO_MPORT_DRV_MAGIC, 4, struct rio_mport_properties)
+#define RIO_MPORT_MAINT_READ_LOCAL \
+ _IOR(RIO_MPORT_DRV_MAGIC, 5, struct rio_mport_maint_io)
+#define RIO_MPORT_MAINT_WRITE_LOCAL \
+ _IOW(RIO_MPORT_DRV_MAGIC, 6, struct rio_mport_maint_io)
+#define RIO_MPORT_MAINT_READ_REMOTE \
+ _IOR(RIO_MPORT_DRV_MAGIC, 7, struct rio_mport_maint_io)
+#define RIO_MPORT_MAINT_WRITE_REMOTE \
+ _IOW(RIO_MPORT_DRV_MAGIC, 8, struct rio_mport_maint_io)
+#define RIO_ENABLE_DOORBELL_RANGE \
+ _IOW(RIO_MPORT_DRV_MAGIC, 9, struct rio_doorbell_filter)
+#define RIO_DISABLE_DOORBELL_RANGE \
+ _IOW(RIO_MPORT_DRV_MAGIC, 10, struct rio_doorbell_filter)
+#define RIO_ENABLE_PORTWRITE_RANGE \
+ _IOW(RIO_MPORT_DRV_MAGIC, 11, struct rio_pw_filter)
+#define RIO_DISABLE_PORTWRITE_RANGE \
+ _IOW(RIO_MPORT_DRV_MAGIC, 12, struct rio_pw_filter)
+#define RIO_SET_EVENT_MASK \
+ _IOW(RIO_MPORT_DRV_MAGIC, 13, unsigned int)
+#define RIO_GET_EVENT_MASK \
+ _IOR(RIO_MPORT_DRV_MAGIC, 14, unsigned int)
+#define RIO_MAP_OUTBOUND \
+ _IOWR(RIO_MPORT_DRV_MAGIC, 15, struct rio_mmap)
+#define RIO_UNMAP_OUTBOUND \
+ _IOW(RIO_MPORT_DRV_MAGIC, 16, struct rio_mmap)
+#define RIO_MAP_INBOUND \
+ _IOWR(RIO_MPORT_DRV_MAGIC, 17, struct rio_mmap)
+#define RIO_UNMAP_INBOUND \
+ _IOW(RIO_MPORT_DRV_MAGIC, 18, uint64_t)
+#define RIO_ALLOC_DMA \
+ _IOWR(RIO_MPORT_DRV_MAGIC, 19, struct rio_dma_mem)
+#define RIO_FREE_DMA \
+ _IOW(RIO_MPORT_DRV_MAGIC, 20, uint64_t)
+#define RIO_TRANSFER \
+ _IOWR(RIO_MPORT_DRV_MAGIC, 21, struct rio_transaction)
+#define RIO_WAIT_FOR_ASYNC \
+ _IOW(RIO_MPORT_DRV_MAGIC, 22, struct rio_async_tx_wait)
+#define RIO_DEV_ADD \
+ _IOW(RIO_MPORT_DRV_MAGIC, 23, struct rio_rdev_info)
+#define RIO_DEV_DEL \
+ _IOW(RIO_MPORT_DRV_MAGIC, 24, struct rio_rdev_info)
+
+#endif /* _RIO_MPORT_CDEV_H_ */
diff --git a/include/linux/rio_regs.h b/include/linux/rio_regs.h
index 218168a2b5e9..1063ae382bc2 100644
--- a/include/linux/rio_regs.h
+++ b/include/linux/rio_regs.h
@@ -238,6 +238,8 @@
#define RIO_PORT_N_ACK_INBOUND 0x3f000000
#define RIO_PORT_N_ACK_OUTSTAND 0x00003f00
#define RIO_PORT_N_ACK_OUTBOUND 0x0000003f
+#define RIO_PORT_N_CTL2_CSR(x) (0x0054 + x*0x20)
+#define RIO_PORT_N_CTL2_SEL_BAUD 0xf0000000
#define RIO_PORT_N_ERR_STS_CSR(x) (0x0058 + x*0x20)
#define RIO_PORT_N_ERR_STS_PW_OUT_ES 0x00010000 /* Output Error-stopped */
#define RIO_PORT_N_ERR_STS_PW_INP_ES 0x00000100 /* Input Error-stopped */
@@ -249,6 +251,7 @@
#define RIO_PORT_N_CTL_PWIDTH 0xc0000000
#define RIO_PORT_N_CTL_PWIDTH_1 0x00000000
#define RIO_PORT_N_CTL_PWIDTH_4 0x40000000
+#define RIO_PORT_N_CTL_IPW 0x38000000 /* Initialized Port Width */
#define RIO_PORT_N_CTL_P_TYP_SER 0x00000001
#define RIO_PORT_N_CTL_LOCKOUT 0x00000002
#define RIO_PORT_N_CTL_EN_RX_SER 0x00200000
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 34a4f0728794..dc805d61ed87 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -51,6 +51,7 @@ struct sched_param {
#include <linux/resource.h>
#include <linux/timer.h>
#include <linux/hrtimer.h>
+#include <linux/kcov.h>
#include <linux/task_io_accounting.h>
#include <linux/latencytop.h>
#include <linux/cred.h>
@@ -427,6 +428,7 @@ extern signed long schedule_timeout(signed long timeout);
extern signed long schedule_timeout_interruptible(signed long timeout);
extern signed long schedule_timeout_killable(signed long timeout);
extern signed long schedule_timeout_uninterruptible(signed long timeout);
+extern signed long schedule_timeout_idle(signed long timeout);
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
@@ -1819,6 +1821,16 @@ struct task_struct {
/* bitmask and counter of trace recursion */
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
+#ifdef CONFIG_KCOV
+ /* Coverage collection mode enabled for this task (0 if disabled). */
+ enum kcov_mode kcov_mode;
+ /* Size of the kcov_area. */
+ unsigned kcov_size;
+ /* Buffer for coverage collection. */
+ void *kcov_area;
+ /* kcov desciptor wired with this task or NULL. */
+ struct kcov *kcov;
+#endif
#ifdef CONFIG_MEMCG
struct mem_cgroup *memcg_in_oom;
gfp_t memcg_oom_gfp_mask;
@@ -1838,6 +1850,9 @@ struct task_struct {
unsigned long task_state_change;
#endif
int pagefault_disabled;
+#ifdef CONFIG_MMU
+ struct list_head oom_reaper_list;
+#endif
/* CPU-specific state of this task */
struct thread_struct thread;
/*
diff --git a/include/linux/sem.h b/include/linux/sem.h
index 976ce3a19f1b..d0efd6e6c20a 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -21,6 +21,7 @@ struct sem_array {
struct list_head list_id; /* undo requests on this array */
int sem_nsems; /* no. of semaphores in array */
int complex_count; /* pending complex operations */
+ bool complex_mode; /* no parallel simple ops */
};
#ifdef CONFIG_SYSVIPC
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 3627d5c1bc47..5d49f0c60dcb 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -314,7 +314,7 @@ void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment
void kmem_cache_free(struct kmem_cache *, void *);
/*
- * Bulk allocation and freeing operations. These are accellerated in an
+ * Bulk allocation and freeing operations. These are accelerated in an
* allocator specific way to avoid taking locks repeatedly or building
* metadata structures unnecessarily.
*
@@ -323,6 +323,15 @@ void kmem_cache_free(struct kmem_cache *, void *);
void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+/*
+ * Caller must not use kfree_bulk() on memory not originally allocated
+ * by kmalloc(), because the SLOB allocator cannot handle this.
+ */
+static __always_inline void kfree_bulk(size_t size, void **p)
+{
+ kmem_cache_free_bulk(NULL, size, p);
+}
+
#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment;
void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment;
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index cf139d3fa513..e878ba35ae91 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -60,6 +60,9 @@ struct kmem_cache {
atomic_t allocmiss;
atomic_t freehit;
atomic_t freemiss;
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+ atomic_t store_user_clean;
+#endif
/*
* If debugging is enabled, then the allocator can add additional
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index b7e57927f521..a33869b961fc 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -77,6 +77,7 @@ struct kmem_cache {
int refcount; /* Refcount for slab cache destroy */
void (*ctor)(void *);
int inuse; /* Offset to metadata */
+ int red_left_pad; /* Left redzone padding size */
int align; /* Alignment */
int reserved; /* Reserved bytes at the end of slabs */
const char *name; /* Name (only for display!) */
diff --git a/include/linux/string.h b/include/linux/string.h
index 9eebc66d957a..d501477c14cb 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -122,13 +122,20 @@ extern void kfree_const(const void *x);
extern char *kstrdup(const char *s, gfp_t gfp);
extern const char *kstrdup_const(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
+extern char *kstrimdup(const char *s, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
extern void argv_free(char **argv);
extern bool sysfs_streq(const char *s1, const char *s2);
-extern int strtobool(const char *s, bool *res);
+extern int kstrtobool(const char *s, bool *res);
+static inline int strtobool(const char *s, bool *res)
+{
+ return kstrtobool(s, res);
+}
+
+int match_string(const char * const *array, size_t n, const char *string);
#ifdef CONFIG_BINARY_PRINTF
int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d18b65c53dbb..b14a2bb33514 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -316,6 +316,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
struct vm_area_struct *vma);
/* linux/mm/vmscan.c */
+extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 97fd4e543846..0ecdf0e248f4 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -98,7 +98,7 @@ static inline void tick_broadcast_exit(void)
}
#ifdef CONFIG_NO_HZ_COMMON
-extern int tick_nohz_enabled;
+extern bool tick_nohz_enabled;
extern int tick_nohz_tick_stopped(void);
extern void tick_nohz_idle_enter(void);
extern void tick_nohz_idle_exit(void);
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 429fdfc3baf5..d91404f89ff2 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -15,16 +15,6 @@ struct tracer;
struct dentry;
struct bpf_prog;
-struct trace_print_flags {
- unsigned long mask;
- const char *name;
-};
-
-struct trace_print_flags_u64 {
- unsigned long long mask;
- const char *name;
-};
-
const char *trace_print_flags_seq(struct trace_seq *p, const char *delim,
unsigned long flags,
const struct trace_print_flags *flag_array);
diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
index e1ee97c713bf..4ac89acb6136 100644
--- a/include/linux/tracepoint-defs.h
+++ b/include/linux/tracepoint-defs.h
@@ -3,13 +3,23 @@
/*
* File can be included directly by headers who only want to access
- * tracepoint->key to guard out of line trace calls. Otherwise
- * linux/tracepoint.h should be used.
+ * tracepoint->key to guard out of line trace calls, or the definition of
+ * trace_print_flags{_u64}. Otherwise linux/tracepoint.h should be used.
*/
#include <linux/atomic.h>
#include <linux/static_key.h>
+struct trace_print_flags {
+ unsigned long mask;
+ const char *name;
+};
+
+struct trace_print_flags_u64 {
+ unsigned long long mask;
+ const char *name;
+};
+
struct tracepoint_func {
void *func;
void *data;
diff --git a/include/linux/unaligned/access_ok.h b/include/linux/unaligned/access_ok.h
index 99c1b4d20b0f..33383ca23837 100644
--- a/include/linux/unaligned/access_ok.h
+++ b/include/linux/unaligned/access_ok.h
@@ -4,62 +4,62 @@
#include <linux/kernel.h>
#include <asm/byteorder.h>
-static inline u16 get_unaligned_le16(const void *p)
+static __always_inline u16 get_unaligned_le16(const void *p)
{
return le16_to_cpup((__le16 *)p);
}
-static inline u32 get_unaligned_le32(const void *p)
+static __always_inline u32 get_unaligned_le32(const void *p)
{
return le32_to_cpup((__le32 *)p);
}
-static inline u64 get_unaligned_le64(const void *p)
+static __always_inline u64 get_unaligned_le64(const void *p)
{
return le64_to_cpup((__le64 *)p);
}
-static inline u16 get_unaligned_be16(const void *p)
+static __always_inline u16 get_unaligned_be16(const void *p)
{
return be16_to_cpup((__be16 *)p);
}
-static inline u32 get_unaligned_be32(const void *p)
+static __always_inline u32 get_unaligned_be32(const void *p)
{
return be32_to_cpup((__be32 *)p);
}
-static inline u64 get_unaligned_be64(const void *p)
+static __always_inline u64 get_unaligned_be64(const void *p)
{
return be64_to_cpup((__be64 *)p);
}
-static inline void put_unaligned_le16(u16 val, void *p)
+static __always_inline void put_unaligned_le16(u16 val, void *p)
{
*((__le16 *)p) = cpu_to_le16(val);
}
-static inline void put_unaligned_le32(u32 val, void *p)
+static __always_inline void put_unaligned_le32(u32 val, void *p)
{
*((__le32 *)p) = cpu_to_le32(val);
}
-static inline void put_unaligned_le64(u64 val, void *p)
+static __always_inline void put_unaligned_le64(u64 val, void *p)
{
*((__le64 *)p) = cpu_to_le64(val);
}
-static inline void put_unaligned_be16(u16 val, void *p)
+static __always_inline void put_unaligned_be16(u16 val, void *p)
{
*((__be16 *)p) = cpu_to_be16(val);
}
-static inline void put_unaligned_be32(u32 val, void *p)
+static __always_inline void put_unaligned_be32(u32 val, void *p)
{
*((__be32 *)p) = cpu_to_be32(val);
}
-static inline void put_unaligned_be64(u64 val, void *p)
+static __always_inline void put_unaligned_be64(u64 val, void *p)
{
*((__be64 *)p) = cpu_to_be64(val);
}
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 67c1dbd19c6d..ec084321fe09 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -53,6 +53,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED,
COMPACTISOLATED,
COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS,
+ KCOMPACTD_WAKE,
#endif
#ifdef CONFIG_HUGETLB_PAGE
HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
@@ -71,6 +72,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_COLLAPSE_ALLOC_FAILED,
THP_SPLIT_PAGE,
THP_SPLIT_PAGE_FAILED,
+ THP_DEFERRED_SPLIT_PAGE,
THP_SPLIT_PMD,
THP_ZERO_PAGE_ALLOC,
THP_ZERO_PAGE_ALLOC_FAILED,
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index d866f21efbbf..677807f29a1c 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -6,7 +6,7 @@
#include <linux/writeback.h>
#include <linux/tracepoint.h>
-#include <trace/events/gfpflags.h>
+#include <trace/events/mmflags.h>
struct btrfs_root;
struct btrfs_fs_info;
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index c92d1e1cbad9..e215bf68f521 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -7,7 +7,7 @@
#include <linux/types.h>
#include <linux/list.h>
#include <linux/tracepoint.h>
-#include <trace/events/gfpflags.h>
+#include <trace/events/mmflags.h>
#define COMPACTION_STATUS \
EM( COMPACT_DEFERRED, "deferred") \
@@ -350,6 +350,61 @@ DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset,
);
#endif
+TRACE_EVENT(mm_compaction_kcompactd_sleep,
+
+ TP_PROTO(int nid),
+
+ TP_ARGS(nid),
+
+ TP_STRUCT__entry(
+ __field(int, nid)
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ ),
+
+ TP_printk("nid=%d", __entry->nid)
+);
+
+DECLARE_EVENT_CLASS(kcompactd_wake_template,
+
+ TP_PROTO(int nid, int order, enum zone_type classzone_idx),
+
+ TP_ARGS(nid, order, classzone_idx),
+
+ TP_STRUCT__entry(
+ __field(int, nid)
+ __field(int, order)
+ __field(enum zone_type, classzone_idx)
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ __entry->order = order;
+ __entry->classzone_idx = classzone_idx;
+ ),
+
+ TP_printk("nid=%d order=%d classzone_idx=%-8s",
+ __entry->nid,
+ __entry->order,
+ __print_symbolic(__entry->classzone_idx, ZONE_TYPE))
+);
+
+DEFINE_EVENT(kcompactd_wake_template, mm_compaction_wakeup_kcompactd,
+
+ TP_PROTO(int nid, int order, enum zone_type classzone_idx),
+
+ TP_ARGS(nid, order, classzone_idx)
+);
+
+DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake,
+
+ TP_PROTO(int nid, int order, enum zone_type classzone_idx),
+
+ TP_ARGS(nid, order, classzone_idx)
+);
+
#endif /* _TRACE_COMPACTION_H */
/* This part must be outside protection */
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
deleted file mode 100644
index dde6bf092c8a..000000000000
--- a/include/trace/events/gfpflags.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * The order of these masks is important. Matching masks will be seen
- * first and the left over flags will end up showing by themselves.
- *
- * For example, if we have GFP_KERNEL before GFP_USER we wil get:
- *
- * GFP_KERNEL|GFP_HARDWALL
- *
- * Thus most bits set go first.
- */
-#define show_gfp_flags(flags) \
- (flags) ? __print_flags(flags, "|", \
- {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \
- {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"}, \
- {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \
- {(unsigned long)GFP_USER, "GFP_USER"}, \
- {(unsigned long)GFP_TEMPORARY, "GFP_TEMPORARY"}, \
- {(unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \
- {(unsigned long)GFP_NOFS, "GFP_NOFS"}, \
- {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \
- {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \
- {(unsigned long)__GFP_HIGH, "GFP_HIGH"}, \
- {(unsigned long)__GFP_ATOMIC, "GFP_ATOMIC"}, \
- {(unsigned long)__GFP_IO, "GFP_IO"}, \
- {(unsigned long)__GFP_COLD, "GFP_COLD"}, \
- {(unsigned long)__GFP_NOWARN, "GFP_NOWARN"}, \
- {(unsigned long)__GFP_REPEAT, "GFP_REPEAT"}, \
- {(unsigned long)__GFP_NOFAIL, "GFP_NOFAIL"}, \
- {(unsigned long)__GFP_NORETRY, "GFP_NORETRY"}, \
- {(unsigned long)__GFP_COMP, "GFP_COMP"}, \
- {(unsigned long)__GFP_ZERO, "GFP_ZERO"}, \
- {(unsigned long)__GFP_NOMEMALLOC, "GFP_NOMEMALLOC"}, \
- {(unsigned long)__GFP_MEMALLOC, "GFP_MEMALLOC"}, \
- {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \
- {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \
- {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \
- {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \
- {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \
- {(unsigned long)__GFP_DIRECT_RECLAIM, "GFP_DIRECT_RECLAIM"}, \
- {(unsigned long)__GFP_KSWAPD_RECLAIM, "GFP_KSWAPD_RECLAIM"}, \
- {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \
- ) : "GFP_NOWAIT"
-
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 47c6212d8f3c..bda21183eb05 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -6,8 +6,6 @@
#include <linux/tracepoint.h>
-#include <trace/events/gfpflags.h>
-
#define SCAN_STATUS \
EM( SCAN_FAIL, "failed") \
EM( SCAN_SUCCEED, "succeeded") \
@@ -30,7 +28,8 @@
EM( SCAN_SWAP_CACHE_PAGE, "page_swap_cache") \
EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\
EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \
- EMe( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed")
+ EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \
+ EMe( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte")
#undef EM
#undef EMe
@@ -47,9 +46,9 @@ SCAN_STATUS
TRACE_EVENT(mm_khugepaged_scan_pmd,
TP_PROTO(struct mm_struct *mm, struct page *page, bool writable,
- bool referenced, int none_or_zero, int status),
+ bool referenced, int none_or_zero, int status, int unmapped),
- TP_ARGS(mm, page, writable, referenced, none_or_zero, status),
+ TP_ARGS(mm, page, writable, referenced, none_or_zero, status, unmapped),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
@@ -58,6 +57,7 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
__field(bool, referenced)
__field(int, none_or_zero)
__field(int, status)
+ __field(int, unmapped)
),
TP_fast_assign(
@@ -67,15 +67,17 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
__entry->referenced = referenced;
__entry->none_or_zero = none_or_zero;
__entry->status = status;
+ __entry->unmapped = unmapped;
),
- TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s",
+ TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d",
__entry->mm,
__entry->pfn,
__entry->writable,
__entry->referenced,
__entry->none_or_zero,
- __print_symbolic(__entry->status, SCAN_STATUS))
+ __print_symbolic(__entry->status, SCAN_STATUS),
+ __entry->unmapped)
);
TRACE_EVENT(mm_collapse_huge_page,
@@ -133,5 +135,29 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
__print_symbolic(__entry->status, SCAN_STATUS))
);
+TRACE_EVENT(mm_collapse_huge_page_swapin,
+
+ TP_PROTO(struct mm_struct *mm, int swapped_in, int ret),
+
+ TP_ARGS(mm, swapped_in, ret),
+
+ TP_STRUCT__entry(
+ __field(struct mm_struct *, mm)
+ __field(int, swapped_in)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __entry->swapped_in = swapped_in;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("mm=%p, swapped_in=%d, ret=%d",
+ __entry->mm,
+ __entry->swapped_in,
+ __entry->ret)
+);
+
#endif /* __HUGE_MEMORY_H */
#include <trace/define_trace.h>
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index f7554fd7fc62..ca7217389067 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -6,7 +6,7 @@
#include <linux/types.h>
#include <linux/tracepoint.h>
-#include <trace/events/gfpflags.h>
+#include <trace/events/mmflags.h>
DECLARE_EVENT_CLASS(kmem_alloc,
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
new file mode 100644
index 000000000000..0837ef95880b
--- /dev/null
+++ b/include/trace/events/mmflags.h
@@ -0,0 +1,165 @@
+/*
+ * The order of these masks is important. Matching masks will be seen
+ * first and the left over flags will end up showing by themselves.
+ *
+ * For example, if we have GFP_KERNEL before GFP_USER we wil get:
+ *
+ * GFP_KERNEL|GFP_HARDWALL
+ *
+ * Thus most bits set go first.
+ */
+
+#define __def_gfpflag_names \
+ {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \
+ {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\
+ {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \
+ {(unsigned long)GFP_USER, "GFP_USER"}, \
+ {(unsigned long)GFP_TEMPORARY, "GFP_TEMPORARY"}, \
+ {(unsigned long)GFP_KERNEL_ACCOUNT, "GFP_KERNEL_ACCOUNT"}, \
+ {(unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \
+ {(unsigned long)GFP_NOFS, "GFP_NOFS"}, \
+ {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \
+ {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \
+ {(unsigned long)GFP_NOWAIT, "GFP_NOWAIT"}, \
+ {(unsigned long)GFP_DMA, "GFP_DMA"}, \
+ {(unsigned long)__GFP_HIGHMEM, "__GFP_HIGHMEM"}, \
+ {(unsigned long)GFP_DMA32, "GFP_DMA32"}, \
+ {(unsigned long)__GFP_HIGH, "__GFP_HIGH"}, \
+ {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \
+ {(unsigned long)__GFP_IO, "__GFP_IO"}, \
+ {(unsigned long)__GFP_FS, "__GFP_FS"}, \
+ {(unsigned long)__GFP_COLD, "__GFP_COLD"}, \
+ {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \
+ {(unsigned long)__GFP_REPEAT, "__GFP_REPEAT"}, \
+ {(unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \
+ {(unsigned long)__GFP_NORETRY, "__GFP_NORETRY"}, \
+ {(unsigned long)__GFP_COMP, "__GFP_COMP"}, \
+ {(unsigned long)__GFP_ZERO, "__GFP_ZERO"}, \
+ {(unsigned long)__GFP_NOMEMALLOC, "__GFP_NOMEMALLOC"}, \
+ {(unsigned long)__GFP_MEMALLOC, "__GFP_MEMALLOC"}, \
+ {(unsigned long)__GFP_HARDWALL, "__GFP_HARDWALL"}, \
+ {(unsigned long)__GFP_THISNODE, "__GFP_THISNODE"}, \
+ {(unsigned long)__GFP_RECLAIMABLE, "__GFP_RECLAIMABLE"}, \
+ {(unsigned long)__GFP_MOVABLE, "__GFP_MOVABLE"}, \
+ {(unsigned long)__GFP_ACCOUNT, "__GFP_ACCOUNT"}, \
+ {(unsigned long)__GFP_NOTRACK, "__GFP_NOTRACK"}, \
+ {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \
+ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \
+ {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\
+ {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\
+ {(unsigned long)__GFP_OTHER_NODE, "__GFP_OTHER_NODE"} \
+
+#define show_gfp_flags(flags) \
+ (flags) ? __print_flags(flags, "|", \
+ __def_gfpflag_names \
+ ) : "none"
+
+#ifdef CONFIG_MMU
+#define IF_HAVE_PG_MLOCK(flag,string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_MLOCK(flag,string)
+#endif
+
+#ifdef CONFIG_ARCH_USES_PG_UNCACHED
+#define IF_HAVE_PG_UNCACHED(flag,string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_UNCACHED(flag,string)
+#endif
+
+#ifdef CONFIG_MEMORY_FAILURE
+#define IF_HAVE_PG_HWPOISON(flag,string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_HWPOISON(flag,string)
+#endif
+
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+#define IF_HAVE_PG_IDLE(flag,string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_IDLE(flag,string)
+#endif
+
+#define __def_pageflag_names \
+ {1UL << PG_locked, "locked" }, \
+ {1UL << PG_error, "error" }, \
+ {1UL << PG_referenced, "referenced" }, \
+ {1UL << PG_uptodate, "uptodate" }, \
+ {1UL << PG_dirty, "dirty" }, \
+ {1UL << PG_lru, "lru" }, \
+ {1UL << PG_active, "active" }, \
+ {1UL << PG_slab, "slab" }, \
+ {1UL << PG_owner_priv_1, "owner_priv_1" }, \
+ {1UL << PG_arch_1, "arch_1" }, \
+ {1UL << PG_reserved, "reserved" }, \
+ {1UL << PG_private, "private" }, \
+ {1UL << PG_private_2, "private_2" }, \
+ {1UL << PG_writeback, "writeback" }, \
+ {1UL << PG_head, "head" }, \
+ {1UL << PG_swapcache, "swapcache" }, \
+ {1UL << PG_mappedtodisk, "mappedtodisk" }, \
+ {1UL << PG_reclaim, "reclaim" }, \
+ {1UL << PG_swapbacked, "swapbacked" }, \
+ {1UL << PG_unevictable, "unevictable" } \
+IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \
+IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \
+IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \
+IF_HAVE_PG_IDLE(PG_young, "young" ) \
+IF_HAVE_PG_IDLE(PG_idle, "idle" )
+
+#define show_page_flags(flags) \
+ (flags) ? __print_flags(flags, "|", \
+ __def_pageflag_names \
+ ) : "none"
+
+#if defined(CONFIG_X86)
+#define __VM_ARCH_SPECIFIC {VM_PAT, "pat" }
+#elif defined(CONFIG_PPC)
+#define __VM_ARCH_SPECIFIC {VM_SAO, "sao" }
+#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
+#define __VM_ARCH_SPECIFIC {VM_GROWSUP, "growsup" }
+#elif !defined(CONFIG_MMU)
+#define __VM_ARCH_SPECIFIC {VM_MAPPED_COPY,"mappedcopy" }
+#else
+#define __VM_ARCH_SPECIFIC {VM_ARCH_1, "arch_1" }
+#endif
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name },
+#else
+#define IF_HAVE_VM_SOFTDIRTY(flag,name)
+#endif
+
+#define __def_vmaflag_names \
+ {VM_READ, "read" }, \
+ {VM_WRITE, "write" }, \
+ {VM_EXEC, "exec" }, \
+ {VM_SHARED, "shared" }, \
+ {VM_MAYREAD, "mayread" }, \
+ {VM_MAYWRITE, "maywrite" }, \
+ {VM_MAYEXEC, "mayexec" }, \
+ {VM_MAYSHARE, "mayshare" }, \
+ {VM_GROWSDOWN, "growsdown" }, \
+ {VM_PFNMAP, "pfnmap" }, \
+ {VM_DENYWRITE, "denywrite" }, \
+ {VM_LOCKONFAULT, "lockonfault" }, \
+ {VM_LOCKED, "locked" }, \
+ {VM_IO, "io" }, \
+ {VM_SEQ_READ, "seqread" }, \
+ {VM_RAND_READ, "randread" }, \
+ {VM_DONTCOPY, "dontcopy" }, \
+ {VM_DONTEXPAND, "dontexpand" }, \
+ {VM_ACCOUNT, "account" }, \
+ {VM_NORESERVE, "noreserve" }, \
+ {VM_HUGETLB, "hugetlb" }, \
+ __VM_ARCH_SPECIFIC , \
+ {VM_DONTDUMP, "dontdump" }, \
+IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
+ {VM_MIXEDMAP, "mixedmap" }, \
+ {VM_HUGEPAGE, "hugepage" }, \
+ {VM_NOHUGEPAGE, "nohugepage" }, \
+ {VM_MERGEABLE, "mergeable" } \
+
+#define show_vma_flags(flags) \
+ (flags) ? __print_flags(flags, "|", \
+ __def_vmaflag_names \
+ ) : "none"
+
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 31763dd8db1c..0101ef37f1ee 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -8,7 +8,7 @@
#include <linux/tracepoint.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
-#include <trace/events/gfpflags.h>
+#include <trace/events/mmflags.h>
#define RECLAIM_WB_ANON 0x0001u
#define RECLAIM_WB_FILE 0x0002u
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index ebd10e624598..d3dc8cadeb00 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -352,6 +352,7 @@ header-y += reiserfs_fs.h
header-y += reiserfs_xattr.h
header-y += resource.h
header-y += rfkill.h
+header-y += rio_mport_cdev.h
header-y += romfs_fs.h
header-y += rose.h
header-y += route.h
diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h
index bb991dfe134f..9175a1b4dc69 100644
--- a/include/uapi/linux/auto_fs.h
+++ b/include/uapi/linux/auto_fs.h
@@ -1,7 +1,4 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *
- * linux/include/linux/auto_fs.h
- *
+/*
* Copyright 1997 Transmeta Corporation - All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
@@ -51,7 +48,7 @@ struct autofs_packet_hdr {
struct autofs_packet_missing {
struct autofs_packet_hdr hdr;
- autofs_wqt_t wait_queue_token;
+ autofs_wqt_t wait_queue_token;
int len;
char name[NAME_MAX+1];
};
@@ -63,12 +60,12 @@ struct autofs_packet_expire {
char name[NAME_MAX+1];
};
-#define AUTOFS_IOC_READY _IO(0x93,0x60)
-#define AUTOFS_IOC_FAIL _IO(0x93,0x61)
-#define AUTOFS_IOC_CATATONIC _IO(0x93,0x62)
-#define AUTOFS_IOC_PROTOVER _IOR(0x93,0x63,int)
-#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,compat_ulong_t)
-#define AUTOFS_IOC_SETTIMEOUT _IOWR(0x93,0x64,unsigned long)
-#define AUTOFS_IOC_EXPIRE _IOR(0x93,0x65,struct autofs_packet_expire)
+#define AUTOFS_IOC_READY _IO(0x93, 0x60)
+#define AUTOFS_IOC_FAIL _IO(0x93, 0x61)
+#define AUTOFS_IOC_CATATONIC _IO(0x93, 0x62)
+#define AUTOFS_IOC_PROTOVER _IOR(0x93, 0x63, int)
+#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93, 0x64, compat_ulong_t)
+#define AUTOFS_IOC_SETTIMEOUT _IOWR(0x93, 0x64, unsigned long)
+#define AUTOFS_IOC_EXPIRE _IOR(0x93, 0x65, struct autofs_packet_expire)
#endif /* _UAPI_LINUX_AUTO_FS_H */
diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h
index e02982fa2953..8f8f1bdcca8c 100644
--- a/include/uapi/linux/auto_fs4.h
+++ b/include/uapi/linux/auto_fs4.h
@@ -1,6 +1,4 @@
-/* -*- c -*-
- * linux/include/linux/auto_fs4.h
- *
+/*
* Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
*
* This file is part of the Linux kernel and is made available under
@@ -38,7 +36,6 @@
static inline void set_autofs_type_indirect(unsigned int *type)
{
*type = AUTOFS_TYPE_INDIRECT;
- return;
}
static inline unsigned int autofs_type_indirect(unsigned int type)
@@ -49,7 +46,6 @@ static inline unsigned int autofs_type_indirect(unsigned int type)
static inline void set_autofs_type_direct(unsigned int *type)
{
*type = AUTOFS_TYPE_DIRECT;
- return;
}
static inline unsigned int autofs_type_direct(unsigned int type)
@@ -60,7 +56,6 @@ static inline unsigned int autofs_type_direct(unsigned int type)
static inline void set_autofs_type_offset(unsigned int *type)
{
*type = AUTOFS_TYPE_OFFSET;
- return;
}
static inline unsigned int autofs_type_offset(unsigned int type)
@@ -81,7 +76,6 @@ static inline unsigned int autofs_type_trigger(unsigned int type)
static inline void set_autofs_type_any(unsigned int *type)
{
*type = AUTOFS_TYPE_ANY;
- return;
}
static inline unsigned int autofs_type_any(unsigned int type)
@@ -114,7 +108,7 @@ enum autofs_notify {
/* v4 multi expire (via pipe) */
struct autofs_packet_expire_multi {
struct autofs_packet_hdr hdr;
- autofs_wqt_t wait_queue_token;
+ autofs_wqt_t wait_queue_token;
int len;
char name[NAME_MAX+1];
};
@@ -154,11 +148,10 @@ union autofs_v5_packet_union {
autofs_packet_expire_direct_t expire_direct;
};
-#define AUTOFS_IOC_EXPIRE_MULTI _IOW(0x93,0x66,int)
+#define AUTOFS_IOC_EXPIRE_MULTI _IOW(0x93, 0x66, int)
#define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI
#define AUTOFS_IOC_EXPIRE_DIRECT AUTOFS_IOC_EXPIRE_MULTI
-#define AUTOFS_IOC_PROTOSUBVER _IOR(0x93,0x67,int)
-#define AUTOFS_IOC_ASKUMOUNT _IOR(0x93,0x70,int)
-
+#define AUTOFS_IOC_PROTOSUBVER _IOR(0x93, 0x67, int)
+#define AUTOFS_IOC_ASKUMOUNT _IOR(0x93, 0x70, int)
#endif /* _LINUX_AUTO_FS4_H */
diff --git a/include/uapi/linux/byteorder/big_endian.h b/include/uapi/linux/byteorder/big_endian.h
index 672374450095..cdab17ab907c 100644
--- a/include/uapi/linux/byteorder/big_endian.h
+++ b/include/uapi/linux/byteorder/big_endian.h
@@ -40,51 +40,51 @@
#define __cpu_to_be16(x) ((__force __be16)(__u16)(x))
#define __be16_to_cpu(x) ((__force __u16)(__be16)(x))
-static inline __le64 __cpu_to_le64p(const __u64 *p)
+static __always_inline __le64 __cpu_to_le64p(const __u64 *p)
{
return (__force __le64)__swab64p(p);
}
-static inline __u64 __le64_to_cpup(const __le64 *p)
+static __always_inline __u64 __le64_to_cpup(const __le64 *p)
{
return __swab64p((__u64 *)p);
}
-static inline __le32 __cpu_to_le32p(const __u32 *p)
+static __always_inline __le32 __cpu_to_le32p(const __u32 *p)
{
return (__force __le32)__swab32p(p);
}
-static inline __u32 __le32_to_cpup(const __le32 *p)
+static __always_inline __u32 __le32_to_cpup(const __le32 *p)
{
return __swab32p((__u32 *)p);
}
-static inline __le16 __cpu_to_le16p(const __u16 *p)
+static __always_inline __le16 __cpu_to_le16p(const __u16 *p)
{
return (__force __le16)__swab16p(p);
}
-static inline __u16 __le16_to_cpup(const __le16 *p)
+static __always_inline __u16 __le16_to_cpup(const __le16 *p)
{
return __swab16p((__u16 *)p);
}
-static inline __be64 __cpu_to_be64p(const __u64 *p)
+static __always_inline __be64 __cpu_to_be64p(const __u64 *p)
{
return (__force __be64)*p;
}
-static inline __u64 __be64_to_cpup(const __be64 *p)
+static __always_inline __u64 __be64_to_cpup(const __be64 *p)
{
return (__force __u64)*p;
}
-static inline __be32 __cpu_to_be32p(const __u32 *p)
+static __always_inline __be32 __cpu_to_be32p(const __u32 *p)
{
return (__force __be32)*p;
}
-static inline __u32 __be32_to_cpup(const __be32 *p)
+static __always_inline __u32 __be32_to_cpup(const __be32 *p)
{
return (__force __u32)*p;
}
-static inline __be16 __cpu_to_be16p(const __u16 *p)
+static __always_inline __be16 __cpu_to_be16p(const __u16 *p)
{
return (__force __be16)*p;
}
-static inline __u16 __be16_to_cpup(const __be16 *p)
+static __always_inline __u16 __be16_to_cpup(const __be16 *p)
{
return (__force __u16)*p;
}
diff --git a/include/uapi/linux/byteorder/little_endian.h b/include/uapi/linux/byteorder/little_endian.h
index d876736a0017..4b93f2b260dd 100644
--- a/include/uapi/linux/byteorder/little_endian.h
+++ b/include/uapi/linux/byteorder/little_endian.h
@@ -40,51 +40,51 @@
#define __cpu_to_be16(x) ((__force __be16)__swab16((x)))
#define __be16_to_cpu(x) __swab16((__force __u16)(__be16)(x))
-static inline __le64 __cpu_to_le64p(const __u64 *p)
+static __always_inline __le64 __cpu_to_le64p(const __u64 *p)
{
return (__force __le64)*p;
}
-static inline __u64 __le64_to_cpup(const __le64 *p)
+static __always_inline __u64 __le64_to_cpup(const __le64 *p)
{
return (__force __u64)*p;
}
-static inline __le32 __cpu_to_le32p(const __u32 *p)
+static __always_inline __le32 __cpu_to_le32p(const __u32 *p)
{
return (__force __le32)*p;
}
-static inline __u32 __le32_to_cpup(const __le32 *p)
+static __always_inline __u32 __le32_to_cpup(const __le32 *p)
{
return (__force __u32)*p;
}
-static inline __le16 __cpu_to_le16p(const __u16 *p)
+static __always_inline __le16 __cpu_to_le16p(const __u16 *p)
{
return (__force __le16)*p;
}
-static inline __u16 __le16_to_cpup(const __le16 *p)
+static __always_inline __u16 __le16_to_cpup(const __le16 *p)
{
return (__force __u16)*p;
}
-static inline __be64 __cpu_to_be64p(const __u64 *p)
+static __always_inline __be64 __cpu_to_be64p(const __u64 *p)
{
return (__force __be64)__swab64p(p);
}
-static inline __u64 __be64_to_cpup(const __be64 *p)
+static __always_inline __u64 __be64_to_cpup(const __be64 *p)
{
return __swab64p((__u64 *)p);
}
-static inline __be32 __cpu_to_be32p(const __u32 *p)
+static __always_inline __be32 __cpu_to_be32p(const __u32 *p)
{
return (__force __be32)__swab32p(p);
}
-static inline __u32 __be32_to_cpup(const __be32 *p)
+static __always_inline __u32 __be32_to_cpup(const __be32 *p)
{
return __swab32p((__u32 *)p);
}
-static inline __be16 __cpu_to_be16p(const __u16 *p)
+static __always_inline __be16 __cpu_to_be16p(const __u16 *p)
{
return (__force __be16)__swab16p(p);
}
-static inline __u16 __be16_to_cpup(const __be16 *p)
+static __always_inline __u16 __be16_to_cpup(const __be16 *p)
{
return __swab16p((__u16 *)p);
}
diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h
new file mode 100644
index 000000000000..574e22ec640d
--- /dev/null
+++ b/include/uapi/linux/kcov.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_KCOV_IOCTLS_H
+#define _LINUX_KCOV_IOCTLS_H
+
+#include <linux/types.h>
+
+#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long)
+#define KCOV_ENABLE _IO('c', 100)
+#define KCOV_DISABLE _IO('c', 101)
+
+#endif /* _LINUX_KCOV_IOCTLS_H */
diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h
index 0e011eb91b5d..3f10e5317b46 100644
--- a/include/uapi/linux/swab.h
+++ b/include/uapi/linux/swab.h
@@ -151,7 +151,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val)
* __swab16p - return a byteswapped 16-bit value from a pointer
* @p: pointer to a naturally-aligned 16-bit value
*/
-static inline __u16 __swab16p(const __u16 *p)
+static __always_inline __u16 __swab16p(const __u16 *p)
{
#ifdef __arch_swab16p
return __arch_swab16p(p);
@@ -164,7 +164,7 @@ static inline __u16 __swab16p(const __u16 *p)
* __swab32p - return a byteswapped 32-bit value from a pointer
* @p: pointer to a naturally-aligned 32-bit value
*/
-static inline __u32 __swab32p(const __u32 *p)
+static __always_inline __u32 __swab32p(const __u32 *p)
{
#ifdef __arch_swab32p
return __arch_swab32p(p);
@@ -177,7 +177,7 @@ static inline __u32 __swab32p(const __u32 *p)
* __swab64p - return a byteswapped 64-bit value from a pointer
* @p: pointer to a naturally-aligned 64-bit value
*/
-static inline __u64 __swab64p(const __u64 *p)
+static __always_inline __u64 __swab64p(const __u64 *p)
{
#ifdef __arch_swab64p
return __arch_swab64p(p);
@@ -232,7 +232,7 @@ static inline void __swab16s(__u16 *p)
* __swab32s - byteswap a 32-bit value in-place
* @p: pointer to a naturally-aligned 32-bit value
*/
-static inline void __swab32s(__u32 *p)
+static __always_inline void __swab32s(__u32 *p)
{
#ifdef __arch_swab32s
__arch_swab32s(p);
@@ -245,7 +245,7 @@ static inline void __swab32s(__u32 *p)
* __swab64s - byteswap a 64-bit value in-place
* @p: pointer to a naturally-aligned 64-bit value
*/
-static inline void __swab64s(__u64 *p)
+static __always_inline void __swab64s(__u64 *p)
{
#ifdef __arch_swab64s
__arch_swab64s(p);
diff --git a/init/Kconfig b/init/Kconfig
index 76b72124f619..651ec15ecddb 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -861,6 +861,28 @@ config LOG_CPU_MAX_BUF_SHIFT
13 => 8 KB for each CPU
12 => 4 KB for each CPU
+config NMI_LOG_BUF_SHIFT
+ int "Temporary per-CPU NMI log buffer size (12 => 4KB, 13 => 8KB)"
+ range 10 21
+ default 13
+ depends on PRINTK_NMI
+ help
+ Select the size of a per-CPU buffer where NMI messages are temporary
+ stored. They are copied to the main log buffer in a safe context
+ to avoid a deadlock. The value defines the size as a power of 2.
+
+ NMI messages are rare and limited. The largest one is when
+ a backtrace is printed. It usually fits into 4KB. Select
+ 8KB if you want to be on the safe side.
+
+ Examples:
+ 17 => 128 KB for each CPU
+ 16 => 64 KB for each CPU
+ 15 => 32 KB for each CPU
+ 14 => 16 KB for each CPU
+ 13 => 8 KB for each CPU
+ 12 => 4 KB for each CPU
+
#
# Architectures with an unreliable sched_clock() should select this:
#
@@ -1420,6 +1442,28 @@ config KALLSYMS_ALL
Say N unless you really need all symbols.
+config KALLSYMS_ABSOLUTE_PERCPU
+ bool
+ default X86_64 && SMP
+
+config KALLSYMS_BASE_RELATIVE
+ bool
+ depends on KALLSYMS
+ default !IA64 && !(TILE && 64BIT)
+ help
+ Instead of emitting them as absolute values in the native word size,
+ emit the symbol references in the kallsyms table as 32-bit entries,
+ each containing a relative value in the range [base, base + U32_MAX]
+ or, when KALLSYMS_ABSOLUTE_PERCPU is in effect, each containing either
+ an absolute value in the range [0, S32_MAX] or a relative value in the
+ range [base, base + S32_MAX], where base is the lowest relative symbol
+ address encountered in the image.
+
+ On 64-bit builds, this reduces the size of the address table by 50%,
+ but more importantly, it results in entries whose values are build
+ time constants, and no relocation pass is required at runtime to fix
+ up the entries based on the runtime load address of the kernel.
+
config PRINTK
default y
bool "Enable support for printk" if EXPERT
@@ -1431,6 +1475,11 @@ config PRINTK
very difficult to diagnose system problems, saying N here is
strongly discouraged.
+config PRINTK_NMI
+ def_bool y
+ depends on PRINTK
+ depends on HAVE_NMI
+
config BUG
bool "BUG() support" if EXPERT
default y
diff --git a/init/main.c b/init/main.c
index b3008bcfb1dc..5db3a3ab493f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -586,6 +586,7 @@ asmlinkage __visible void __init start_kernel(void)
timekeeping_init();
time_init();
sched_clock_postinit();
+ printk_nmi_init();
perf_event_init();
profile_init();
call_function_init();
@@ -722,7 +723,6 @@ static int __init initcall_blacklist(char *str)
static bool __init_or_module initcall_blacklisted(initcall_t fn)
{
- struct list_head *tmp;
struct blacklist_entry *entry;
char *fn_name;
@@ -730,8 +730,7 @@ static bool __init_or_module initcall_blacklisted(initcall_t fn)
if (!fn_name)
return false;
- list_for_each(tmp, &blacklisted_initcalls) {
- entry = list_entry(tmp, struct blacklist_entry, next);
+ list_for_each_entry(entry, &blacklisted_initcalls, next) {
if (!strcmp(fn_name, entry->buf)) {
pr_debug("initcall %s blacklisted\n", fn_name);
kfree(fn_name);
diff --git a/ipc/msg.c b/ipc/msg.c
index 1471db9a7e61..59559a215401 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -37,6 +37,7 @@
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
+#include <linux/freezer.h>
#include <asm/current.h>
#include <linux/uaccess.h>
@@ -675,7 +676,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
- schedule();
+ freezable_schedule();
rcu_read_lock();
ipc_lock_object(&msq->q_perm);
@@ -917,7 +918,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
- schedule();
+ freezable_schedule();
/* Lockless receive, part 1:
* Disable preemption. We don't hold a reference to the queue
diff --git a/ipc/sem.c b/ipc/sem.c
index cddd5b5fde51..e62696584305 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -155,14 +155,21 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
/*
* Locking:
+ * a) global sem_lock() for read/write
* sem_undo.id_next,
* sem_array.complex_count,
- * sem_array.pending{_alter,_cont},
- * sem_array.sem_undo: global sem_lock() for read/write
- * sem_undo.proc_next: only "current" is allowed to read/write that field.
+ * sem_array.complex_mode
+ * sem_array.pending{_alter,_const},
+ * sem_array.sem_undo
*
+ * b) global or semaphore sem_lock() for read/write:
* sem_array.sem_base[i].pending_{const,alter}:
- * global or semaphore sem_lock() for read/write
+ * sem_array.complex_mode (for read)
+ *
+ * c) special:
+ * sem_undo_list.list_proc:
+ * * undo_list->lock for write
+ * * rcu for read
*/
#define sc_semmsl sem_ctls[0]
@@ -263,23 +270,25 @@ static void sem_rcu_free(struct rcu_head *head)
#define ipc_smp_acquire__after_spin_is_unlocked() smp_rmb()
/*
- * Wait until all currently ongoing simple ops have completed.
+ * Enter the mode suitable for non-simple operations:
* Caller must own sem_perm.lock.
- * New simple ops cannot start, because simple ops first check
- * that sem_perm.lock is free.
- * that a) sem_perm.lock is free and b) complex_count is 0.
*/
-static void sem_wait_array(struct sem_array *sma)
+static void complexmode_enter(struct sem_array *sma)
{
int i;
struct sem *sem;
- if (sma->complex_count) {
- /* The thread that increased sma->complex_count waited on
- * all sem->lock locks. Thus we don't need to wait again.
- */
+ if (sma->complex_mode) {
+ /* We are already in complex_mode. Nothing to do */
return;
}
+ WRITE_ONCE(sma->complex_mode, true);
+
+ /* We need a full barrier:
+ * The write to complex_mode must be visible
+ * before we read the first sem->lock spinlock state.
+ */
+ smp_mb();
for (i = 0; i < sma->sem_nsems; i++) {
sem = sma->sem_base + i;
@@ -289,6 +298,29 @@ static void sem_wait_array(struct sem_array *sma)
}
/*
+ * Try to leave the mode that disallows simple operations:
+ * Caller must own sem_perm.lock.
+ */
+static void complexmode_tryleave(struct sem_array *sma)
+{
+ if (sma->complex_count) {
+ /* Complex ops are sleeping.
+ * We must stay in complex mode
+ */
+ return;
+ }
+ /*
+ * Immediately after setting complex_mode to false,
+ * a simple op can start. Thus: all memory writes
+ * performed by the current operation must be visible
+ * before we set complex_mode to false.
+ */
+ smp_wmb();
+
+ WRITE_ONCE(sma->complex_mode, false);
+}
+
+/*
* If the request contains only one semaphore operation, and there are
* no complex transactions pending, lock only the semaphore involved.
* Otherwise, lock the entire semaphore array, since we either have
@@ -304,56 +336,38 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
/* Complex operation - acquire a full lock */
ipc_lock_object(&sma->sem_perm);
- /* And wait until all simple ops that are processed
- * right now have dropped their locks.
- */
- sem_wait_array(sma);
+ /* Prevent parallel simple ops */
+ complexmode_enter(sma);
return -1;
}
/*
* Only one semaphore affected - try to optimize locking.
- * The rules are:
- * - optimized locking is possible if no complex operation
- * is either enqueued or processed right now.
- * - The test for enqueued complex ops is simple:
- * sma->complex_count != 0
- * - Testing for complex ops that are processed right now is
- * a bit more difficult. Complex ops acquire the full lock
- * and first wait that the running simple ops have completed.
- * (see above)
- * Thus: If we own a simple lock and the global lock is free
- * and complex_count is now 0, then it will stay 0 and
- * thus just locking sem->lock is sufficient.
+ * Optimized locking is possible if no complex operation
+ * is either enqueued or processed right now.
+ *
+ * Both facts are tracked by complex_mode.
*/
sem = sma->sem_base + sops->sem_num;
- if (sma->complex_count == 0) {
+ /*
+ * Initial check for complex_mode. Just an optimization,
+ * no locking.
+ */
+ if (!READ_ONCE(sma->complex_mode)) {
/*
* It appears that no complex operation is around.
* Acquire the per-semaphore lock.
*/
spin_lock(&sem->lock);
- /* Then check that the global lock is free */
- if (!spin_is_locked(&sma->sem_perm.lock)) {
- /*
- * We need a memory barrier with acquire semantics,
- * otherwise we can race with another thread that does:
- * complex_count++;
- * spin_unlock(sem_perm.lock);
- */
- ipc_smp_acquire__after_spin_is_unlocked();
-
- /*
- * Now repeat the test of complex_count:
- * It can't change anymore until we drop sem->lock.
- * Thus: if is now 0, then it will stay 0.
- */
- if (sma->complex_count == 0) {
- /* fast path successful! */
- return sops->sem_num;
- }
+ /* Now repeat the test for complex_mode.
+ * A memory barrier is provided by the spin_lock()
+ * above.
+ */
+ if (!READ_ONCE(sma->complex_mode)) {
+ /* fast path successful! */
+ return sops->sem_num;
}
spin_unlock(&sem->lock);
}
@@ -373,7 +387,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
/* Not a false alarm, thus complete the sequence for a
* full lock.
*/
- sem_wait_array(sma);
+ complexmode_enter(sma);
return -1;
}
}
@@ -382,6 +396,7 @@ static inline void sem_unlock(struct sem_array *sma, int locknum)
{
if (locknum == -1) {
unmerge_queues(sma);
+ complexmode_tryleave(sma);
ipc_unlock_object(&sma->sem_perm);
} else {
struct sem *sem = sma->sem_base + locknum;
@@ -533,6 +548,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
}
sma->complex_count = 0;
+ sma->complex_mode = true; /* dropped by sem_unlock below */
INIT_LIST_HEAD(&sma->pending_alter);
INIT_LIST_HEAD(&sma->pending_const);
INIT_LIST_HEAD(&sma->list_id);
@@ -2186,10 +2202,10 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
/*
* The proc interface isn't aware of sem_lock(), it calls
* ipc_lock_object() directly (in sysvipc_find_ipc).
- * In order to stay compatible with sem_lock(), we must wait until
- * all simple semop() calls have left their critical regions.
+ * In order to stay compatible with sem_lock(), we must
+ * enter / leave complex_mode.
*/
- sem_wait_array(sma);
+ complexmode_enter(sma);
sem_otime = get_semotime(sma);
@@ -2206,6 +2222,8 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
sem_otime,
sma->sem_ctime);
+ complexmode_tryleave(sma);
+
return 0;
}
#endif
diff --git a/ipc/shm.c b/ipc/shm.c
index ed3027d0f277..331fc1b0b3c7 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -156,11 +156,12 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
/*
- * We raced in the idr lookup or with shm_destroy(). Either way, the
- * ID is busted.
+ * Callers of shm_lock() must validate the status of the returned ipc
+ * object pointer (as returned by ipc_lock()), and error out as
+ * appropriate.
*/
- WARN_ON(IS_ERR(ipcp));
-
+ if (IS_ERR(ipcp))
+ return (void *)ipcp;
return container_of(ipcp, struct shmid_kernel, shm_perm);
}
@@ -186,18 +187,33 @@ static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
}
-/* This is called by fork, once for every shm attach. */
-static void shm_open(struct vm_area_struct *vma)
+static int __shm_open(struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
struct shm_file_data *sfd = shm_file_data(file);
struct shmid_kernel *shp;
shp = shm_lock(sfd->ns, sfd->id);
+
+ if (IS_ERR(shp))
+ return PTR_ERR(shp);
+
shp->shm_atim = get_seconds();
shp->shm_lprid = task_tgid_vnr(current);
shp->shm_nattch++;
shm_unlock(shp);
+ return 0;
+}
+
+/* This is called by fork, once for every shm attach. */
+static void shm_open(struct vm_area_struct *vma)
+{
+ int err = __shm_open(vma);
+ /*
+ * We raced in the idr lookup or with shm_destroy().
+ * Either way, the ID is busted.
+ */
+ WARN_ON_ONCE(err);
}
/*
@@ -260,6 +276,14 @@ static void shm_close(struct vm_area_struct *vma)
down_write(&shm_ids(ns).rwsem);
/* remove from the list of attaches of the shm segment */
shp = shm_lock(ns, sfd->id);
+
+ /*
+ * We raced in the idr lookup or with shm_destroy().
+ * Either way, the ID is busted.
+ */
+ if (WARN_ON_ONCE(IS_ERR(shp)))
+ goto done; /* no-op */
+
shp->shm_lprid = task_tgid_vnr(current);
shp->shm_dtim = get_seconds();
shp->shm_nattch--;
@@ -267,6 +291,7 @@ static void shm_close(struct vm_area_struct *vma)
shm_destroy(ns, shp);
else
shm_unlock(shp);
+done:
up_write(&shm_ids(ns).rwsem);
}
@@ -388,17 +413,25 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma)
struct shm_file_data *sfd = shm_file_data(file);
int ret;
+ /*
+ * In case of remap_file_pages() emulation, the file can represent
+ * removed IPC ID: propogate shm_lock() error to caller.
+ */
+ ret =__shm_open(vma);
+ if (ret)
+ return ret;
+
ret = sfd->file->f_op->mmap(sfd->file, vma);
- if (ret != 0)
+ if (ret) {
+ shm_close(vma);
return ret;
+ }
sfd->vm_ops = vma->vm_ops;
#ifdef CONFIG_MMU
WARN_ON(!sfd->vm_ops->fault);
#endif
vma->vm_ops = &shm_vm_ops;
- shm_open(vma);
-
- return ret;
+ return 0;
}
static int shm_release(struct inode *ino, struct file *file)
diff --git a/kernel/Makefile b/kernel/Makefile
index 53abf008ecb3..2dea801370f2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -19,6 +19,17 @@ CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
endif
+# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
+# in coverage traces.
+KCOV_INSTRUMENT_softirq.o := n
+# These are called from save_stack_trace() on slub debug path,
+# and produce insane amounts of uninteresting coverage.
+KCOV_INSTRUMENT_module.o := n
+KCOV_INSTRUMENT_extable.o := n
+# Don't self-instrument.
+KCOV_INSTRUMENT_kcov.o := n
+KASAN_SANITIZE_kcov.o := n
+
# cond_syscall is currently not LTO compatible
CFLAGS_sys_ni.o = $(DISABLE_LTO)
@@ -69,6 +80,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
+obj-$(CONFIG_KCOV) += kcov.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d7acaed84a0d..51d76ba1db35 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2412,8 +2412,8 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
return;
audit_log_task(ab);
audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x",
- signr, syscall_get_arch(), syscall, is_compat_task(),
- KSTK_EIP(current), code);
+ signr, syscall_get_arch(), syscall,
+ in_compat_syscall(), KSTK_EIP(current), code);
audit_log_end(ab);
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 10e088237fed..fd90195667e1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -53,6 +53,7 @@
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
+#include <linux/kcov.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -434,7 +435,7 @@ static void exit_mm(struct task_struct *tsk)
mm_update_next_owner(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
- exit_oom_victim();
+ exit_oom_victim(tsk);
}
static struct task_struct *find_alive_thread(struct task_struct *p)
@@ -655,6 +656,7 @@ void do_exit(long code)
TASKS_RCU(int tasks_rcu_i);
profile_task_exit(tsk);
+ kcov_task_exit(tsk);
WARN_ON(blk_needs_flush_plug(tsk));
diff --git a/kernel/fork.c b/kernel/fork.c
index 2e391c754ae7..b617af6ce31b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -75,6 +75,7 @@
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
+#include <linux/kcov.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -164,12 +165,20 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
+ if (page)
+ memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
+ 1 << THREAD_SIZE_ORDER);
+
return page ? page_address(page) : NULL;
}
static inline void free_thread_info(struct thread_info *ti)
{
- free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ struct page *page = virt_to_page(ti);
+
+ memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
+ -(1 << THREAD_SIZE_ORDER));
+ __free_kmem_pages(page, THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_info_cache;
@@ -384,6 +393,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
account_kernel_stack(ti, 1);
+ kcov_task_init(tsk);
+
return tsk;
free_ti:
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index e0f90c2b57aa..d234022805dc 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -185,10 +185,12 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
rcu_read_unlock();
}
-static unsigned long timeout_jiffies(unsigned long timeout)
+static long hung_timeout_jiffies(unsigned long last_checked,
+ unsigned long timeout)
{
/* timeout of 0 will disable the watchdog */
- return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT;
+ return timeout ? last_checked - jiffies + timeout * HZ :
+ MAX_SCHEDULE_TIMEOUT;
}
/*
@@ -224,18 +226,21 @@ EXPORT_SYMBOL_GPL(reset_hung_task_detector);
*/
static int watchdog(void *dummy)
{
+ unsigned long hung_last_checked = jiffies;
+
set_user_nice(current, 0);
for ( ; ; ) {
unsigned long timeout = sysctl_hung_task_timeout_secs;
+ long t = hung_timeout_jiffies(hung_last_checked, timeout);
- while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
- timeout = sysctl_hung_task_timeout_secs;
-
- if (atomic_xchg(&reset_hung_task, 0))
+ if (t <= 0) {
+ if (!atomic_xchg(&reset_hung_task, 0))
+ check_hung_uninterruptible_tasks(timeout);
+ hung_last_checked = jiffies;
continue;
-
- check_hung_uninterruptible_tasks(timeout);
+ }
+ schedule_timeout_interruptible(t);
}
return 0;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 5c5987f10819..fafd1a3ef0da 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -38,6 +38,7 @@
* during the second link stage.
*/
extern const unsigned long kallsyms_addresses[] __weak;
+extern const int kallsyms_offsets[] __weak;
extern const u8 kallsyms_names[] __weak;
/*
@@ -47,6 +48,9 @@ extern const u8 kallsyms_names[] __weak;
extern const unsigned long kallsyms_num_syms
__attribute__((weak, section(".rodata")));
+extern const unsigned long kallsyms_relative_base
+__attribute__((weak, section(".rodata")));
+
extern const u8 kallsyms_token_table[] __weak;
extern const u16 kallsyms_token_index[] __weak;
@@ -176,6 +180,23 @@ static unsigned int get_symbol_offset(unsigned long pos)
return name - kallsyms_names;
}
+static unsigned long kallsyms_sym_address(int idx)
+{
+ if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
+ return kallsyms_addresses[idx];
+
+ /* values are unsigned offsets if --absolute-percpu is not in effect */
+ if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU))
+ return kallsyms_relative_base + (u32)kallsyms_offsets[idx];
+
+ /* ...otherwise, positive offsets are absolute values */
+ if (kallsyms_offsets[idx] >= 0)
+ return kallsyms_offsets[idx];
+
+ /* ...and negative offsets are relative to kallsyms_relative_base - 1 */
+ return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
+}
+
/* Lookup the address for this symbol. Returns 0 if not found. */
unsigned long kallsyms_lookup_name(const char *name)
{
@@ -187,7 +208,7 @@ unsigned long kallsyms_lookup_name(const char *name)
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
if (strcmp(namebuf, name) == 0)
- return kallsyms_addresses[i];
+ return kallsyms_sym_address(i);
}
return module_kallsyms_lookup_name(name);
}
@@ -204,7 +225,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
- ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
+ ret = fn(data, namebuf, NULL, kallsyms_sym_address(i));
if (ret != 0)
return ret;
}
@@ -220,7 +241,10 @@ static unsigned long get_symbol_pos(unsigned long addr,
unsigned long i, low, high, mid;
/* This kernel should never had been booted. */
- BUG_ON(!kallsyms_addresses);
+ if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
+ BUG_ON(!kallsyms_addresses);
+ else
+ BUG_ON(!kallsyms_offsets);
/* Do a binary search on the sorted kallsyms_addresses array. */
low = 0;
@@ -228,7 +252,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
while (high - low > 1) {
mid = low + (high - low) / 2;
- if (kallsyms_addresses[mid] <= addr)
+ if (kallsyms_sym_address(mid) <= addr)
low = mid;
else
high = mid;
@@ -238,15 +262,15 @@ static unsigned long get_symbol_pos(unsigned long addr,
* Search for the first aliased symbol. Aliased
* symbols are symbols with the same address.
*/
- while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
+ while (low && kallsyms_sym_address(low-1) == kallsyms_sym_address(low))
--low;
- symbol_start = kallsyms_addresses[low];
+ symbol_start = kallsyms_sym_address(low);
/* Search for next non-aliased symbol. */
for (i = low + 1; i < kallsyms_num_syms; i++) {
- if (kallsyms_addresses[i] > symbol_start) {
- symbol_end = kallsyms_addresses[i];
+ if (kallsyms_sym_address(i) > symbol_start) {
+ symbol_end = kallsyms_sym_address(i);
break;
}
}
@@ -470,7 +494,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
unsigned off = iter->nameoff;
iter->module_name[0] = '\0';
- iter->value = kallsyms_addresses[iter->pos];
+ iter->value = kallsyms_sym_address(iter->pos);
iter->type = kallsyms_get_symbol_type(off);
diff --git a/kernel/kcov.c b/kernel/kcov.c
new file mode 100644
index 000000000000..230279c2454e
--- /dev/null
+++ b/kernel/kcov.c
@@ -0,0 +1,261 @@
+#define pr_fmt(fmt) "kcov: " fmt
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/kcov.h>
+
+/*
+ * kcov descriptor (one per opened debugfs file).
+ * State transitions of the descriptor:
+ * - initial state after open()
+ * - then there must be a single ioctl(KCOV_INIT_TRACE) call
+ * - then, mmap() call (several calls are allowed but not useful)
+ * - then, repeated enable/disable for a task (only one task a time allowed)
+ */
+struct kcov {
+ /*
+ * Reference counter. We keep one for:
+ * - opened file descriptor
+ * - task with enabled coverage (we can't unwire it from another task)
+ */
+ atomic_t rc;
+ /* The lock protects mode, size, area and t. */
+ spinlock_t lock;
+ enum kcov_mode mode;
+ unsigned size;
+ void *area;
+ struct task_struct *t;
+};
+
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ */
+void __sanitizer_cov_trace_pc(void)
+{
+ struct task_struct *t;
+ enum kcov_mode mode;
+
+ t = current;
+ /*
+ * We are interested in code coverage as a function of a syscall inputs,
+ * so we ignore code executed in interrupts.
+ */
+ if (!t || in_interrupt())
+ return;
+ mode = READ_ONCE(t->kcov_mode);
+ if (mode == KCOV_MODE_TRACE) {
+ unsigned long *area;
+ unsigned long pos;
+
+ /*
+ * There is some code that runs in interrupts but for which
+ * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+ * READ_ONCE()/barrier() effectively provides load-acquire wrt
+ * interrupts, there are paired barrier()/WRITE_ONCE() in
+ * kcov_ioctl_locked().
+ */
+ barrier();
+ area = t->kcov_area;
+ /* The first word is number of subsequent PCs. */
+ pos = READ_ONCE(area[0]) + 1;
+ if (likely(pos < t->kcov_size)) {
+ area[pos] = _RET_IP_;
+ WRITE_ONCE(area[0], pos);
+ }
+ }
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
+
+static void kcov_get(struct kcov *kcov)
+{
+ atomic_inc(&kcov->rc);
+}
+
+static void kcov_put(struct kcov *kcov)
+{
+ if (atomic_dec_and_test(&kcov->rc)) {
+ vfree(kcov->area);
+ kfree(kcov);
+ }
+}
+
+void kcov_task_init(struct task_struct *t)
+{
+ t->kcov_mode = 0;
+ t->kcov_size = 0;
+ t->kcov_area = NULL;
+ t->kcov = NULL;
+}
+
+void kcov_task_exit(struct task_struct *t)
+{
+ struct kcov *kcov;
+
+ kcov = t->kcov;
+ if (kcov == NULL)
+ return;
+ spin_lock(&kcov->lock);
+ if (WARN_ON(kcov->t != t)) {
+ spin_unlock(&kcov->lock);
+ return;
+ }
+ /* Just to not leave dangling references behind. */
+ kcov_task_init(t);
+ kcov->t = NULL;
+ spin_unlock(&kcov->lock);
+ kcov_put(kcov);
+}
+
+static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+ int res = 0;
+ void *area;
+ struct kcov *kcov = vma->vm_file->private_data;
+ unsigned long size, off;
+ struct page *page;
+
+ area = vmalloc_user(vma->vm_end - vma->vm_start);
+ if (!area)
+ return -ENOMEM;
+
+ spin_lock(&kcov->lock);
+ size = kcov->size * sizeof(unsigned long);
+ if (kcov->mode == 0 || vma->vm_pgoff != 0 ||
+ vma->vm_end - vma->vm_start != size) {
+ res = -EINVAL;
+ goto exit;
+ }
+ if (!kcov->area) {
+ kcov->area = area;
+ vma->vm_flags |= VM_DONTEXPAND;
+ spin_unlock(&kcov->lock);
+ for (off = 0; off < size; off += PAGE_SIZE) {
+ page = vmalloc_to_page(kcov->area + off);
+ if (vm_insert_page(vma, vma->vm_start + off, page))
+ WARN_ONCE(1, "vm_insert_page() failed");
+ }
+ return 0;
+ }
+exit:
+ spin_unlock(&kcov->lock);
+ vfree(area);
+ return res;
+}
+
+static int kcov_open(struct inode *inode, struct file *filep)
+{
+ struct kcov *kcov;
+
+ kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
+ if (!kcov)
+ return -ENOMEM;
+ atomic_set(&kcov->rc, 1);
+ spin_lock_init(&kcov->lock);
+ filep->private_data = kcov;
+ return nonseekable_open(inode, filep);
+}
+
+static int kcov_close(struct inode *inode, struct file *filep)
+{
+ kcov_put(filep->private_data);
+ return 0;
+}
+
+static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
+ unsigned long arg)
+{
+ struct task_struct *t;
+
+ switch (cmd) {
+ case KCOV_INIT_TRACE:
+ /*
+ * Enable kcov in trace mode and setup buffer size.
+ * Must happen before anything else.
+ * Size must be at least 2 to hold current position and one PC.
+ */
+ if (arg < 2 || arg > INT_MAX)
+ return -EINVAL;
+ if (kcov->mode != 0)
+ return -EBUSY;
+ kcov->mode = KCOV_MODE_TRACE;
+ kcov->size = arg;
+ return 0;
+ case KCOV_ENABLE:
+ /*
+ * Enable coverage for the current task.
+ * At this point user must have been enabled trace mode,
+ * and mmapped the file. Coverage collection is disabled only
+ * at task exit or voluntary by KCOV_DISABLE. After that it can
+ * be enabled for another task.
+ */
+ if (arg != 0 || kcov->mode == 0 || kcov->area == NULL)
+ return -EINVAL;
+ if (kcov->t != NULL)
+ return -EBUSY;
+ t = current;
+ /* Cache in task struct for performance. */
+ t->kcov_size = kcov->size;
+ t->kcov_area = kcov->area;
+ /* See comment in __sanitizer_cov_trace_pc(). */
+ barrier();
+ WRITE_ONCE(t->kcov_mode, kcov->mode);
+ t->kcov = kcov;
+ kcov->t = t;
+ /* This is put either in kcov_task_exit() or in KCOV_DISABLE. */
+ kcov_get(kcov);
+ return 0;
+ case KCOV_DISABLE:
+ /* Disable coverage for the current task. */
+ if (arg != 0 || current->kcov != kcov)
+ return -EINVAL;
+ t = current;
+ if (WARN_ON(kcov->t != t))
+ return -EINVAL;
+ kcov_task_init(t);
+ kcov->t = NULL;
+ kcov_put(kcov);
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+ struct kcov *kcov;
+ int res;
+
+ kcov = filep->private_data;
+ spin_lock(&kcov->lock);
+ res = kcov_ioctl_locked(kcov, cmd, arg);
+ spin_unlock(&kcov->lock);
+ return res;
+}
+
+static const struct file_operations kcov_fops = {
+ .open = kcov_open,
+ .unlocked_ioctl = kcov_ioctl,
+ .mmap = kcov_mmap,
+ .release = kcov_close,
+};
+
+static int __init kcov_init(void)
+{
+ if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) {
+ pr_err("failed to create kcov in debugfs\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+device_initcall(kcov_init);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ee70aef5cd81..b44cb3f5a15c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -167,8 +167,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
return -EBUSY;
dest_image = &kexec_image;
- if (flags & KEXEC_ON_CRASH)
+ if (flags & KEXEC_ON_CRASH) {
dest_image = &kexec_crash_image;
+ if (kexec_crash_image)
+ arch_kexec_unprotect_crashkres();
+ }
+
if (nr_segments > 0) {
unsigned long i;
@@ -211,6 +215,9 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
image = xchg(dest_image, image);
out:
+ if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
+ arch_kexec_protect_crashkres();
+
mutex_unlock(&kexec_mutex);
kimage_free(image);
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 8d34308ea449..f826e117ec09 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1559,3 +1559,9 @@ void __weak crash_map_reserved_pages(void)
void __weak crash_unmap_reserved_pages(void)
{}
+
+void __weak arch_kexec_protect_crashkres(void)
+{}
+
+void __weak arch_kexec_unprotect_crashkres(void)
+{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 56b18eb1f001..01e9eaa23d9a 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -329,8 +329,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
return -EBUSY;
dest_image = &kexec_image;
- if (flags & KEXEC_FILE_ON_CRASH)
+ if (flags & KEXEC_FILE_ON_CRASH) {
dest_image = &kexec_crash_image;
+ if (kexec_crash_image)
+ arch_kexec_unprotect_crashkres();
+ }
if (flags & KEXEC_FILE_UNLOAD)
goto exchange;
@@ -379,6 +382,9 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
exchange:
image = xchg(dest_image, image);
out:
+ if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
+ arch_kexec_protect_crashkres();
+
mutex_unlock(&kexec_mutex);
kimage_free(image);
return ret;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 8e96f6cc2a4a..f816de9d566b 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,3 +1,6 @@
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
diff --git a/kernel/memremap.c b/kernel/memremap.c
index f80167d6a66d..49c82d4301cb 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -383,7 +383,7 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
/*
* 'memmap_start' is the virtual address for the first "struct
* page" in this range of the vmemmap array. In the case of
- * CONFIG_SPARSE_VMEMMAP a page_to_pfn conversion is simple
+ * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple
* pointer arithmetic, so we can perform this to_vmem_altmap()
* conversion without concern for the initialization state of
* the struct page fields.
@@ -392,7 +392,7 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
struct dev_pagemap *pgmap;
/*
- * Uncoditionally retrieve a dev_pagemap associated with the
+ * Unconditionally retrieve a dev_pagemap associated with the
* given physical address, this is only for use in the
* arch_{add|remove}_memory() for setting up and tearing down
* the memmap.
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 85405bdcf2b3..abb0042a427b 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,2 +1,3 @@
obj-y = printk.o
+obj-$(CONFIG_PRINTK_NMI) += nmi.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
new file mode 100644
index 000000000000..341bedccc065
--- /dev/null
+++ b/kernel/printk/internal.h
@@ -0,0 +1,55 @@
+/*
+ * internal.h - printk internal definitions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/percpu.h>
+
+typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
+
+int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
+
+#ifdef CONFIG_PRINTK_NMI
+
+/*
+ * printk() could not take logbuf_lock in NMI context. Instead,
+ * it temporary stores the strings into a per-CPU buffer.
+ * The alternative implementation is chosen transparently
+ * via per-CPU variable.
+ */
+DECLARE_PER_CPU(printk_func_t, printk_func);
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+ return this_cpu_read(printk_func)(fmt, args);
+}
+
+extern atomic_t nmi_message_lost;
+static inline int get_nmi_message_lost(void)
+{
+ return atomic_xchg(&nmi_message_lost, 0);
+}
+
+#else /* CONFIG_PRINTK_NMI */
+
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+ return vprintk_default(fmt, args);
+}
+
+static inline int get_nmi_message_lost(void)
+{
+ return 0;
+}
+
+#endif /* CONFIG_PRINTK_NMI */
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
new file mode 100644
index 000000000000..98e9e8043426
--- /dev/null
+++ b/kernel/printk/nmi.c
@@ -0,0 +1,222 @@
+/*
+ * nmi.c - Safe printk in NMI context
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/irq_work.h>
+#include <linux/printk.h>
+
+#include "internal.h"
+
+/*
+ * printk() could not take logbuf_lock in NMI context. Instead,
+ * it uses an alternative implementation that temporary stores
+ * the strings into a per-CPU buffer. The content of the buffer
+ * is later flushed into the main ring buffer via IRQ work.
+ *
+ * The alternative implementation is chosen transparently
+ * via @printk_func per-CPU variable.
+ *
+ * The implementation allows to flush the strings also from another CPU.
+ * There are situations when we want to make sure that all buffers
+ * were handled or when IRQs are blocked.
+ */
+DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+static int printk_nmi_irq_ready;
+atomic_t nmi_message_lost;
+
+#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) - \
+ sizeof(atomic_t) - sizeof(struct irq_work))
+
+struct nmi_seq_buf {
+ atomic_t len; /* length of written data */
+ struct irq_work work; /* IRQ work that flushes the buffer */
+ unsigned char buffer[NMI_LOG_BUF_LEN];
+};
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+/*
+ * Safe printk() for NMI context. It uses a per-CPU buffer to
+ * store the message. NMIs are not nested, so there is always only
+ * one writer running. But the buffer might get flushed from another
+ * CPU, so we need to be careful.
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+ struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+ int add = 0;
+ size_t len;
+
+again:
+ len = atomic_read(&s->len);
+
+ if (len >= sizeof(s->buffer)) {
+ atomic_inc(&nmi_message_lost);
+ return 0;
+ }
+
+ /*
+ * Make sure that all old data have been read before the buffer was
+ * reseted. This is not needed when we just append data.
+ */
+ if (!len)
+ smp_rmb();
+
+ add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+
+ /*
+ * Do it once again if the buffer has been flushed in the meantime.
+ * Note that atomic_cmpxchg() is an implicit memory barrier that
+ * makes sure that the data were written before updating s->len.
+ */
+ if (atomic_cmpxchg(&s->len, len, len + add) != len)
+ goto again;
+
+ /* Get flushed in a more safe context. */
+ if (add && printk_nmi_irq_ready) {
+ /* Make sure that IRQ work is really initialized. */
+ smp_rmb();
+ irq_work_queue(&s->work);
+ }
+
+ return add;
+}
+
+/*
+ * printk one line from the temporary buffer from @start index until
+ * and including the @end index.
+ */
+static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end)
+{
+ const char *buf = s->buffer + start;
+
+ printk("%.*s", (end - start) + 1, buf);
+}
+
+/*
+ * Flush data from the associated per_CPU buffer. The function
+ * can be called either via IRQ work or independently.
+ */
+static void __printk_nmi_flush(struct irq_work *work)
+{
+ static raw_spinlock_t read_lock =
+ __RAW_SPIN_LOCK_INITIALIZER(read_lock);
+ struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
+ size_t len, size;
+ int i, last_i;
+
+ /*
+ * The lock has two functions. First, one reader has to flush all
+ * available message to make the lockless synchronization with
+ * writers easier. Second, we do not want to mix messages from
+ * different CPUs. This is especially important when printing
+ * a backtrace.
+ */
+ raw_spin_lock(&read_lock);
+
+ i = 0;
+more:
+ len = atomic_read(&s->len);
+
+ /*
+ * This is just a paranoid check that nobody has manipulated
+ * the buffer an unexpected way. If we printed something then
+ * @len must only increase.
+ */
+ if (i && i >= len)
+ pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n",
+ i, len);
+
+ if (!len)
+ goto out; /* Someone else has already flushed the buffer. */
+
+ /* Make sure that data has been written up to the @len */
+ smp_rmb();
+
+ size = min(len, sizeof(s->buffer));
+ last_i = i;
+
+ /* Print line by line. */
+ for (; i < size; i++) {
+ if (s->buffer[i] == '\n') {
+ print_nmi_seq_line(s, last_i, i);
+ last_i = i + 1;
+ }
+ }
+ /* Check if there was a partial line. */
+ if (last_i < size) {
+ print_nmi_seq_line(s, last_i, size - 1);
+ pr_cont("\n");
+ }
+
+ /*
+ * Check that nothing has got added in the meantime and truncate
+ * the buffer. Note that atomic_cmpxchg() is an implicit memory
+ * barrier that makes sure that the data were copied before
+ * updating s->len.
+ */
+ if (atomic_cmpxchg(&s->len, len, 0) != len)
+ goto more;
+
+out:
+ raw_spin_unlock(&read_lock);
+}
+
+/**
+ * printk_nmi_flush - flush all per-cpu nmi buffers.
+ *
+ * The buffers are flushed automatically via IRQ work. This function
+ * is useful only when someone wants to be sure that all buffers have
+ * been flushed at some point.
+ */
+void printk_nmi_flush(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
+}
+
+void __init printk_nmi_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+
+ init_irq_work(&s->work, __printk_nmi_flush);
+ }
+
+ /* Make sure that IRQ works are initialized before enabling. */
+ smp_wmb();
+ printk_nmi_irq_ready = 1;
+
+ /* Flush pending messages that did not have scheduled IRQ works. */
+ printk_nmi_flush();
+}
+
+void printk_nmi_enter(void)
+{
+ this_cpu_write(printk_func, vprintk_nmi);
+}
+
+void printk_nmi_exit(void)
+{
+ this_cpu_write(printk_func, vprintk_default);
+}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c963ba534a78..7ebcfea2490e 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -55,6 +55,7 @@
#include "console_cmdline.h"
#include "braille.h"
+#include "internal.h"
int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
@@ -1669,6 +1670,7 @@ asmlinkage int vprintk_emit(int facility, int level,
unsigned long flags;
int this_cpu;
int printed_len = 0;
+ int nmi_message_lost;
bool in_sched = false;
/* cpu currently holding logbuf_lock in this function */
static unsigned int logbuf_cpu = UINT_MAX;
@@ -1719,6 +1721,15 @@ asmlinkage int vprintk_emit(int facility, int level,
strlen(recursion_msg));
}
+ nmi_message_lost = get_nmi_message_lost();
+ if (unlikely(nmi_message_lost)) {
+ text_len = scnprintf(textbuf, sizeof(textbuf),
+ "BAD LUCK: lost %d message(s) from NMI context!",
+ nmi_message_lost);
+ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
+ NULL, 0, textbuf, text_len);
+ }
+
/*
* The printf needs to come first; we need the syslog
* prefix which might be passed-in as a parameter.
@@ -1868,14 +1879,6 @@ int vprintk_default(const char *fmt, va_list args)
}
EXPORT_SYMBOL_GPL(vprintk_default);
-/*
- * This allows printk to be diverted to another function per cpu.
- * This is useful for calling printk functions from within NMI
- * without worrying about race conditions that can lock up the
- * box.
- */
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
-
/**
* printk - print a kernel message
* @fmt: format string
@@ -1899,21 +1902,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
*/
asmlinkage __visible int printk(const char *fmt, ...)
{
- printk_func_t vprintk_func;
va_list args;
int r;
va_start(args, fmt);
-
- /*
- * If a caller overrides the per_cpu printk_func, then it needs
- * to disable preemption when calling printk(). Otherwise
- * the printk_func should be set to the default. No need to
- * disable preemption here.
- */
- vprintk_func = this_cpu_read(printk_func);
r = vprintk_func(fmt, args);
-
va_end(args);
return r;
diff --git a/kernel/profile.c b/kernel/profile.c
index 51369697466e..c2199e9901c9 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -44,7 +44,7 @@ int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);
static cpumask_var_t prof_cpu_mask;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
static DEFINE_PER_CPU(int, cpu_profile_flip);
static DEFINE_MUTEX(profile_flip_mutex);
@@ -202,7 +202,7 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n)
}
EXPORT_SYMBOL_GPL(profile_event_unregister);
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
/*
* Each cpu has a pair of open-addressed hashtables for pending
* profile hits. read_profile() IPI's all cpus to request them
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2341efe7fe02..c79b91d09e35 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -681,7 +681,7 @@ static int ptrace_peek_siginfo(struct task_struct *child,
break;
#ifdef CONFIG_COMPAT
- if (unlikely(is_compat_task())) {
+ if (unlikely(in_compat_syscall())) {
compat_siginfo_t __user *uinfo = compat_ptr(data);
if (copy_siginfo_to_user32(uinfo, &info) ||
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index ea25ceb6220c..18dfc485225c 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,3 +1,7 @@
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT := n
+
obj-y += update.o sync.o
obj-$(CONFIG_SRCU) += srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 67687973ce80..f0a9265f43db 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
endif
+# These files are disabled because they produce non-interesting flaky coverage
+# that is not a function of syscall inputs. E.g. involuntary context switches.
+KCOV_INSTRUMENT := n
+
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
# needed for x86 only. Why this used to be enabled for all architectures is beyond
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 15a1795bbba1..e1e5a354854e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -395,7 +395,7 @@ seccomp_prepare_user_filter(const char __user *user_filter)
struct seccomp_filter *filter = ERR_PTR(-EFAULT);
#ifdef CONFIG_COMPAT
- if (is_compat_task()) {
+ if (in_compat_syscall()) {
struct compat_sock_fprog fprog32;
if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
goto out;
@@ -529,7 +529,7 @@ static void __secure_computing_strict(int this_syscall)
{
int *syscall_whitelist = mode1_syscalls;
#ifdef CONFIG_COMPAT
- if (is_compat_task())
+ if (in_compat_syscall())
syscall_whitelist = mode1_syscalls_32;
#endif
do {
diff --git a/kernel/signal.c b/kernel/signal.c
index 3256c7e1c43a..d17c5385c38c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3601,6 +3601,10 @@ __weak const char *arch_vma_name(struct vm_area_struct *vma)
void __init signals_init(void)
{
+ /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */
+ BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE
+ != offsetof(struct siginfo, _sifields._pad));
+
sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 9eb2eef7a4ef..99954ac75708 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -526,7 +526,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
/*
* High resolution timer enabled ?
*/
-static int hrtimer_hres_enabled __read_mostly = 1;
+static bool hrtimer_hres_enabled __read_mostly = true;
unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
EXPORT_SYMBOL_GPL(hrtimer_resolution);
@@ -535,13 +535,7 @@ EXPORT_SYMBOL_GPL(hrtimer_resolution);
*/
static int __init setup_hrtimer_hres(char *str)
{
- if (!strcmp(str, "off"))
- hrtimer_hres_enabled = 0;
- else if (!strcmp(str, "on"))
- hrtimer_hres_enabled = 1;
- else
- return 0;
- return 1;
+ return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
}
__setup("highres=", setup_hrtimer_hres);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 0b17424349eb..c4646cd619a8 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -389,20 +389,14 @@ void __init tick_nohz_init(void)
/*
* NO HZ enabled ?
*/
-int tick_nohz_enabled __read_mostly = 1;
+bool tick_nohz_enabled __read_mostly = true;
unsigned long tick_nohz_active __read_mostly;
/*
* Enable / Disable tickless mode
*/
static int __init setup_tick_nohz(char *str)
{
- if (!strcmp(str, "off"))
- tick_nohz_enabled = 0;
- else if (!strcmp(str, "on"))
- tick_nohz_enabled = 1;
- else
- return 0;
- return 1;
+ return (kstrtobool(str, &tick_nohz_enabled) == 0);
}
__setup("nohz=", setup_tick_nohz);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index bbc5d1114583..2c4034761c7f 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1566,6 +1566,17 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);
+/*
+ * Like schedule_timeout_uninterruptible(), except this task will not contribute
+ * to load average.
+ */
+signed long __sched schedule_timeout_idle(signed long timeout)
+{
+ __set_current_state(TASK_IDLE);
+ return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_idle);
+
#ifdef CONFIG_HOTPLUG_CPU
static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
{
diff --git a/lib/Kconfig b/lib/Kconfig
index 133ebc0c1773..ee38a3fd2814 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -185,6 +185,13 @@ config CRC8
when they need to do cyclic redundancy check according CRC8
algorithm. Module will be called crc8.
+config CRC64_ECMA
+ tristate "CRC64 ECMA function"
+ help
+ This option provides CRC64 ECMA function. Drivers may select this
+ when they need to do cyclic redundancy check according to the CRC64
+ ECMA algorithm.
+
config AUDIT_GENERIC
bool
depends on AUDIT && !AUDIT_ARCH
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index fe802ae067c6..283323881397 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -684,6 +684,27 @@ source "lib/Kconfig.kasan"
endmenu # "Memory Debugging"
+config ARCH_HAS_KCOV
+ bool
+ help
+ KCOV does not have any arch-specific code, but currently it is enabled
+ only for x86_64. KCOV requires testing on other archs, and most likely
+ disabling of instrumentation for some early boot code.
+
+config KCOV
+ bool "Code coverage for fuzzing"
+ depends on ARCH_HAS_KCOV
+ select DEBUG_FS
+ help
+ KCOV exposes kernel code coverage information in a form suitable
+ for coverage-guided fuzzing (randomized testing).
+
+ If RANDOMIZE_BASE is enabled, PC values will not be stable across
+ different machines and across reboots. If you need stable PC values,
+ disable RANDOMIZE_BASE.
+
+ For more details, see Documentation/kcov.txt.
+
config DEBUG_SHIRQ
bool "Debug shared IRQ handlers"
depends on DEBUG_KERNEL
diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan
index 49518fb48cab..39494af9a84a 100644
--- a/lib/Kconfig.ubsan
+++ b/lib/Kconfig.ubsan
@@ -13,11 +13,18 @@ config UBSAN_SANITIZE_ALL
bool "Enable instrumentation for the entire kernel"
depends on UBSAN
depends on ARCH_HAS_UBSAN_SANITIZE_ALL
+
+ # We build with -Wno-maybe-uninitilzed, but we still want to
+ # use -Wmaybe-uninitilized in allmodconfig builds.
+ # So dependsy bellow used to disable this option in allmodconfig
+ depends on !COMPILE_TEST
default y
help
This option activates instrumentation for the entire kernel.
If you don't enable this option, you have to explicitly specify
UBSAN_SANITIZE := y for the files/directories you want to check for UB.
+ Enabling this option will get kernel image size increased
+ significantly.
config UBSAN_ALIGNMENT
bool "Enable checking of pointers alignment"
@@ -25,5 +32,5 @@ config UBSAN_ALIGNMENT
default y if !HAVE_EFFICIENT_UNALIGNED_ACCESS
help
This option enables detection of unaligned memory accesses.
- Enabling this option on architectures that support unalligned
+ Enabling this option on architectures that support unaligned
accesses may produce a lot of false positives.
diff --git a/lib/Makefile b/lib/Makefile
index a7c26a41a738..440369b7ba22 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -7,6 +7,18 @@ ORIG_CFLAGS := $(KBUILD_CFLAGS)
KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
endif
+# These files are disabled because they produce lots of non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. For example,
+# rbtree can be global and individual rotations don't correlate with inputs.
+KCOV_INSTRUMENT_string.o := n
+KCOV_INSTRUMENT_rbtree.o := n
+KCOV_INSTRUMENT_list_debug.o := n
+KCOV_INSTRUMENT_debugobjects.o := n
+KCOV_INSTRUMENT_dynamic_debug.o := n
+# Kernel does not boot if we instrument this file as it uses custom calling
+# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
+KCOV_INSTRUMENT_hweight.o := n
+
lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o timerqueue.o\
idr.o int_sqrt.o extable.o \
@@ -83,6 +95,7 @@ obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_CRC7) += crc7.o
obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_842_COMPRESS) += 842/
diff --git a/lib/bug.c b/lib/bug.c
index cff145f032a5..6cde380f09de 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -175,6 +175,17 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
pr_warn("WARNING: at %p [verbose debug info unavailable]\n",
(void *)bugaddr);
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from
+ * panicking the system on this thread. Other threads
+ * are blocked by the panic_mutex in panic().
+ */
+ panic_on_warn = 0;
+ panic("panic_on_warn set ...\n");
+ }
+
print_modules();
show_regs(regs);
print_oops_end_marker();
diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c
new file mode 100644
index 000000000000..41629ea5a60c
--- /dev/null
+++ b/lib/crc64_ecma.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/crc64_ecma.h>
+
+
+#define CRC64_BYTE_MASK 0xFF
+#define CRC64_TABLE_SIZE 256
+
+
+struct crc64_table {
+ u64 seed;
+ u64 table[CRC64_TABLE_SIZE];
+};
+
+
+static struct crc64_table CRC64_ECMA_182 = {
+ CRC64_DEFAULT_INITVAL,
+ {
+ 0x0000000000000000ULL,
+ 0xb32e4cbe03a75f6fULL,
+ 0xf4843657a840a05bULL,
+ 0x47aa7ae9abe7ff34ULL,
+ 0x7bd0c384ff8f5e33ULL,
+ 0xc8fe8f3afc28015cULL,
+ 0x8f54f5d357cffe68ULL,
+ 0x3c7ab96d5468a107ULL,
+ 0xf7a18709ff1ebc66ULL,
+ 0x448fcbb7fcb9e309ULL,
+ 0x0325b15e575e1c3dULL,
+ 0xb00bfde054f94352ULL,
+ 0x8c71448d0091e255ULL,
+ 0x3f5f08330336bd3aULL,
+ 0x78f572daa8d1420eULL,
+ 0xcbdb3e64ab761d61ULL,
+ 0x7d9ba13851336649ULL,
+ 0xceb5ed8652943926ULL,
+ 0x891f976ff973c612ULL,
+ 0x3a31dbd1fad4997dULL,
+ 0x064b62bcaebc387aULL,
+ 0xb5652e02ad1b6715ULL,
+ 0xf2cf54eb06fc9821ULL,
+ 0x41e11855055bc74eULL,
+ 0x8a3a2631ae2dda2fULL,
+ 0x39146a8fad8a8540ULL,
+ 0x7ebe1066066d7a74ULL,
+ 0xcd905cd805ca251bULL,
+ 0xf1eae5b551a2841cULL,
+ 0x42c4a90b5205db73ULL,
+ 0x056ed3e2f9e22447ULL,
+ 0xb6409f5cfa457b28ULL,
+ 0xfb374270a266cc92ULL,
+ 0x48190ecea1c193fdULL,
+ 0x0fb374270a266cc9ULL,
+ 0xbc9d3899098133a6ULL,
+ 0x80e781f45de992a1ULL,
+ 0x33c9cd4a5e4ecdceULL,
+ 0x7463b7a3f5a932faULL,
+ 0xc74dfb1df60e6d95ULL,
+ 0x0c96c5795d7870f4ULL,
+ 0xbfb889c75edf2f9bULL,
+ 0xf812f32ef538d0afULL,
+ 0x4b3cbf90f69f8fc0ULL,
+ 0x774606fda2f72ec7ULL,
+ 0xc4684a43a15071a8ULL,
+ 0x83c230aa0ab78e9cULL,
+ 0x30ec7c140910d1f3ULL,
+ 0x86ace348f355aadbULL,
+ 0x3582aff6f0f2f5b4ULL,
+ 0x7228d51f5b150a80ULL,
+ 0xc10699a158b255efULL,
+ 0xfd7c20cc0cdaf4e8ULL,
+ 0x4e526c720f7dab87ULL,
+ 0x09f8169ba49a54b3ULL,
+ 0xbad65a25a73d0bdcULL,
+ 0x710d64410c4b16bdULL,
+ 0xc22328ff0fec49d2ULL,
+ 0x85895216a40bb6e6ULL,
+ 0x36a71ea8a7ace989ULL,
+ 0x0adda7c5f3c4488eULL,
+ 0xb9f3eb7bf06317e1ULL,
+ 0xfe5991925b84e8d5ULL,
+ 0x4d77dd2c5823b7baULL,
+ 0x64b62bcaebc387a1ULL,
+ 0xd7986774e864d8ceULL,
+ 0x90321d9d438327faULL,
+ 0x231c512340247895ULL,
+ 0x1f66e84e144cd992ULL,
+ 0xac48a4f017eb86fdULL,
+ 0xebe2de19bc0c79c9ULL,
+ 0x58cc92a7bfab26a6ULL,
+ 0x9317acc314dd3bc7ULL,
+ 0x2039e07d177a64a8ULL,
+ 0x67939a94bc9d9b9cULL,
+ 0xd4bdd62abf3ac4f3ULL,
+ 0xe8c76f47eb5265f4ULL,
+ 0x5be923f9e8f53a9bULL,
+ 0x1c4359104312c5afULL,
+ 0xaf6d15ae40b59ac0ULL,
+ 0x192d8af2baf0e1e8ULL,
+ 0xaa03c64cb957be87ULL,
+ 0xeda9bca512b041b3ULL,
+ 0x5e87f01b11171edcULL,
+ 0x62fd4976457fbfdbULL,
+ 0xd1d305c846d8e0b4ULL,
+ 0x96797f21ed3f1f80ULL,
+ 0x2557339fee9840efULL,
+ 0xee8c0dfb45ee5d8eULL,
+ 0x5da24145464902e1ULL,
+ 0x1a083bacedaefdd5ULL,
+ 0xa9267712ee09a2baULL,
+ 0x955cce7fba6103bdULL,
+ 0x267282c1b9c65cd2ULL,
+ 0x61d8f8281221a3e6ULL,
+ 0xd2f6b4961186fc89ULL,
+ 0x9f8169ba49a54b33ULL,
+ 0x2caf25044a02145cULL,
+ 0x6b055fede1e5eb68ULL,
+ 0xd82b1353e242b407ULL,
+ 0xe451aa3eb62a1500ULL,
+ 0x577fe680b58d4a6fULL,
+ 0x10d59c691e6ab55bULL,
+ 0xa3fbd0d71dcdea34ULL,
+ 0x6820eeb3b6bbf755ULL,
+ 0xdb0ea20db51ca83aULL,
+ 0x9ca4d8e41efb570eULL,
+ 0x2f8a945a1d5c0861ULL,
+ 0x13f02d374934a966ULL,
+ 0xa0de61894a93f609ULL,
+ 0xe7741b60e174093dULL,
+ 0x545a57dee2d35652ULL,
+ 0xe21ac88218962d7aULL,
+ 0x5134843c1b317215ULL,
+ 0x169efed5b0d68d21ULL,
+ 0xa5b0b26bb371d24eULL,
+ 0x99ca0b06e7197349ULL,
+ 0x2ae447b8e4be2c26ULL,
+ 0x6d4e3d514f59d312ULL,
+ 0xde6071ef4cfe8c7dULL,
+ 0x15bb4f8be788911cULL,
+ 0xa6950335e42fce73ULL,
+ 0xe13f79dc4fc83147ULL,
+ 0x521135624c6f6e28ULL,
+ 0x6e6b8c0f1807cf2fULL,
+ 0xdd45c0b11ba09040ULL,
+ 0x9aefba58b0476f74ULL,
+ 0x29c1f6e6b3e0301bULL,
+ 0xc96c5795d7870f42ULL,
+ 0x7a421b2bd420502dULL,
+ 0x3de861c27fc7af19ULL,
+ 0x8ec62d7c7c60f076ULL,
+ 0xb2bc941128085171ULL,
+ 0x0192d8af2baf0e1eULL,
+ 0x4638a2468048f12aULL,
+ 0xf516eef883efae45ULL,
+ 0x3ecdd09c2899b324ULL,
+ 0x8de39c222b3eec4bULL,
+ 0xca49e6cb80d9137fULL,
+ 0x7967aa75837e4c10ULL,
+ 0x451d1318d716ed17ULL,
+ 0xf6335fa6d4b1b278ULL,
+ 0xb199254f7f564d4cULL,
+ 0x02b769f17cf11223ULL,
+ 0xb4f7f6ad86b4690bULL,
+ 0x07d9ba1385133664ULL,
+ 0x4073c0fa2ef4c950ULL,
+ 0xf35d8c442d53963fULL,
+ 0xcf273529793b3738ULL,
+ 0x7c0979977a9c6857ULL,
+ 0x3ba3037ed17b9763ULL,
+ 0x888d4fc0d2dcc80cULL,
+ 0x435671a479aad56dULL,
+ 0xf0783d1a7a0d8a02ULL,
+ 0xb7d247f3d1ea7536ULL,
+ 0x04fc0b4dd24d2a59ULL,
+ 0x3886b22086258b5eULL,
+ 0x8ba8fe9e8582d431ULL,
+ 0xcc0284772e652b05ULL,
+ 0x7f2cc8c92dc2746aULL,
+ 0x325b15e575e1c3d0ULL,
+ 0x8175595b76469cbfULL,
+ 0xc6df23b2dda1638bULL,
+ 0x75f16f0cde063ce4ULL,
+ 0x498bd6618a6e9de3ULL,
+ 0xfaa59adf89c9c28cULL,
+ 0xbd0fe036222e3db8ULL,
+ 0x0e21ac88218962d7ULL,
+ 0xc5fa92ec8aff7fb6ULL,
+ 0x76d4de52895820d9ULL,
+ 0x317ea4bb22bfdfedULL,
+ 0x8250e80521188082ULL,
+ 0xbe2a516875702185ULL,
+ 0x0d041dd676d77eeaULL,
+ 0x4aae673fdd3081deULL,
+ 0xf9802b81de97deb1ULL,
+ 0x4fc0b4dd24d2a599ULL,
+ 0xfceef8632775faf6ULL,
+ 0xbb44828a8c9205c2ULL,
+ 0x086ace348f355aadULL,
+ 0x34107759db5dfbaaULL,
+ 0x873e3be7d8faa4c5ULL,
+ 0xc094410e731d5bf1ULL,
+ 0x73ba0db070ba049eULL,
+ 0xb86133d4dbcc19ffULL,
+ 0x0b4f7f6ad86b4690ULL,
+ 0x4ce50583738cb9a4ULL,
+ 0xffcb493d702be6cbULL,
+ 0xc3b1f050244347ccULL,
+ 0x709fbcee27e418a3ULL,
+ 0x3735c6078c03e797ULL,
+ 0x841b8ab98fa4b8f8ULL,
+ 0xadda7c5f3c4488e3ULL,
+ 0x1ef430e13fe3d78cULL,
+ 0x595e4a08940428b8ULL,
+ 0xea7006b697a377d7ULL,
+ 0xd60abfdbc3cbd6d0ULL,
+ 0x6524f365c06c89bfULL,
+ 0x228e898c6b8b768bULL,
+ 0x91a0c532682c29e4ULL,
+ 0x5a7bfb56c35a3485ULL,
+ 0xe955b7e8c0fd6beaULL,
+ 0xaeffcd016b1a94deULL,
+ 0x1dd181bf68bdcbb1ULL,
+ 0x21ab38d23cd56ab6ULL,
+ 0x9285746c3f7235d9ULL,
+ 0xd52f0e859495caedULL,
+ 0x6601423b97329582ULL,
+ 0xd041dd676d77eeaaULL,
+ 0x636f91d96ed0b1c5ULL,
+ 0x24c5eb30c5374ef1ULL,
+ 0x97eba78ec690119eULL,
+ 0xab911ee392f8b099ULL,
+ 0x18bf525d915feff6ULL,
+ 0x5f1528b43ab810c2ULL,
+ 0xec3b640a391f4fadULL,
+ 0x27e05a6e926952ccULL,
+ 0x94ce16d091ce0da3ULL,
+ 0xd3646c393a29f297ULL,
+ 0x604a2087398eadf8ULL,
+ 0x5c3099ea6de60cffULL,
+ 0xef1ed5546e415390ULL,
+ 0xa8b4afbdc5a6aca4ULL,
+ 0x1b9ae303c601f3cbULL,
+ 0x56ed3e2f9e224471ULL,
+ 0xe5c372919d851b1eULL,
+ 0xa26908783662e42aULL,
+ 0x114744c635c5bb45ULL,
+ 0x2d3dfdab61ad1a42ULL,
+ 0x9e13b115620a452dULL,
+ 0xd9b9cbfcc9edba19ULL,
+ 0x6a978742ca4ae576ULL,
+ 0xa14cb926613cf817ULL,
+ 0x1262f598629ba778ULL,
+ 0x55c88f71c97c584cULL,
+ 0xe6e6c3cfcadb0723ULL,
+ 0xda9c7aa29eb3a624ULL,
+ 0x69b2361c9d14f94bULL,
+ 0x2e184cf536f3067fULL,
+ 0x9d36004b35545910ULL,
+ 0x2b769f17cf112238ULL,
+ 0x9858d3a9ccb67d57ULL,
+ 0xdff2a94067518263ULL,
+ 0x6cdce5fe64f6dd0cULL,
+ 0x50a65c93309e7c0bULL,
+ 0xe388102d33392364ULL,
+ 0xa4226ac498dedc50ULL,
+ 0x170c267a9b79833fULL,
+ 0xdcd7181e300f9e5eULL,
+ 0x6ff954a033a8c131ULL,
+ 0x28532e49984f3e05ULL,
+ 0x9b7d62f79be8616aULL,
+ 0xa707db9acf80c06dULL,
+ 0x14299724cc279f02ULL,
+ 0x5383edcd67c06036ULL,
+ 0xe0ada17364673f59ULL
+ }
+};
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void)
+{
+ return CRC64_ECMA_182.seed;
+}
+EXPORT_SYMBOL(crc64_ecma_seed);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * pdata: pointer to the data to compute checksum for.
+ * nbytes: number of bytes in data buffer.
+ * seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed)
+{
+ unsigned int i;
+ u64 crc = seed;
+
+ for (i = 0; i < nbytes; i++)
+ crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^
+ (crc >> 8);
+
+ return crc;
+}
+EXPORT_SYMBOL(crc64_ecma);
+
+MODULE_DESCRIPTION("CRC64 ECMA function");
+MODULE_AUTHOR("Freescale Semiconductor Inc.");
+MODULE_LICENSE("GPL");
diff --git a/lib/extable.c b/lib/extable.c
index 4cac81ec225e..0be02ad561e9 100644
--- a/lib/extable.c
+++ b/lib/extable.c
@@ -14,7 +14,37 @@
#include <linux/sort.h>
#include <asm/uaccess.h>
+#ifndef ARCH_HAS_RELATIVE_EXTABLE
+#define ex_to_insn(x) ((x)->insn)
+#else
+static inline unsigned long ex_to_insn(const struct exception_table_entry *x)
+{
+ return (unsigned long)&x->insn + x->insn;
+}
+#endif
+
#ifndef ARCH_HAS_SORT_EXTABLE
+#ifndef ARCH_HAS_RELATIVE_EXTABLE
+#define swap_ex NULL
+#else
+static void swap_ex(void *a, void *b, int size)
+{
+ struct exception_table_entry *x = a, *y = b, tmp;
+ int delta = b - a;
+
+ tmp = *x;
+ x->insn = y->insn + delta;
+ y->insn = tmp.insn - delta;
+
+#ifdef swap_ex_entry_fixup
+ swap_ex_entry_fixup(x, y, tmp, delta);
+#else
+ x->fixup = y->fixup + delta;
+ y->fixup = tmp.fixup - delta;
+#endif
+}
+#endif /* ARCH_HAS_RELATIVE_EXTABLE */
+
/*
* The exception table needs to be sorted so that the binary
* search that we use to find entries in it works properly.
@@ -26,9 +56,9 @@ static int cmp_ex(const void *a, const void *b)
const struct exception_table_entry *x = a, *y = b;
/* avoid overflow */
- if (x->insn > y->insn)
+ if (ex_to_insn(x) > ex_to_insn(y))
return 1;
- if (x->insn < y->insn)
+ if (ex_to_insn(x) < ex_to_insn(y))
return -1;
return 0;
}
@@ -37,7 +67,7 @@ void sort_extable(struct exception_table_entry *start,
struct exception_table_entry *finish)
{
sort(start, finish - start, sizeof(struct exception_table_entry),
- cmp_ex, NULL);
+ cmp_ex, swap_ex);
}
#ifdef CONFIG_MODULES
@@ -48,13 +78,15 @@ void sort_extable(struct exception_table_entry *start,
void trim_init_extable(struct module *m)
{
/*trim the beginning*/
- while (m->num_exentries && within_module_init(m->extable[0].insn, m)) {
+ while (m->num_exentries &&
+ within_module_init(ex_to_insn(&m->extable[0]), m)) {
m->extable++;
m->num_exentries--;
}
/*trim the end*/
while (m->num_exentries &&
- within_module_init(m->extable[m->num_exentries-1].insn, m))
+ within_module_init(ex_to_insn(&m->extable[m->num_exentries - 1]),
+ m))
m->num_exentries--;
}
#endif /* CONFIG_MODULES */
@@ -81,13 +113,13 @@ search_extable(const struct exception_table_entry *first,
* careful, the distance between value and insn
* can be larger than MAX_LONG:
*/
- if (mid->insn < value)
+ if (ex_to_insn(mid) < value)
first = mid + 1;
- else if (mid->insn > value)
+ else if (ex_to_insn(mid) > value)
last = mid - 1;
else
return mid;
- }
- return NULL;
+ }
+ return NULL;
}
#endif
diff --git a/lib/kstrtox.c b/lib/kstrtox.c
index 94be244e8441..d8a5cf66c316 100644
--- a/lib/kstrtox.c
+++ b/lib/kstrtox.c
@@ -321,6 +321,70 @@ int kstrtos8(const char *s, unsigned int base, s8 *res)
}
EXPORT_SYMBOL(kstrtos8);
+/**
+ * kstrtobool - convert common user inputs into boolean values
+ * @s: input string
+ * @res: result
+ *
+ * This routine returns 0 iff the first character is one of 'Yy1Nn0', or
+ * [oO][NnFf] for "on" and "off". Otherwise it will return -EINVAL. Value
+ * pointed to by res is updated upon finding a match.
+ */
+int kstrtobool(const char *s, bool *res)
+{
+ if (!s)
+ return -EINVAL;
+
+ switch (s[0]) {
+ case 'y':
+ case 'Y':
+ case '1':
+ *res = true;
+ return 0;
+ case 'n':
+ case 'N':
+ case '0':
+ *res = false;
+ return 0;
+ case 'o':
+ case 'O':
+ switch (s[1]) {
+ case 'n':
+ case 'N':
+ *res = true;
+ return 0;
+ case 'f':
+ case 'F':
+ *res = false;
+ return 0;
+ default:
+ break;
+ }
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL(kstrtobool);
+
+/*
+ * Since "base" would be a nonsense argument, this open-codes the
+ * _from_user helper instead of using the helper macro below.
+ */
+int kstrtobool_from_user(const char __user *s, size_t count, bool *res)
+{
+ /* Longest string needed to differentiate, newline, terminator */
+ char buf[4];
+
+ count = min(count, sizeof(buf) - 1);
+ if (copy_from_user(buf, s, count))
+ return -EFAULT;
+ buf[count] = '\0';
+ return kstrtobool(buf, res);
+}
+EXPORT_SYMBOL(kstrtobool_from_user);
+
#define kstrto_from_user(f, g, type) \
int f(const char __user *s, size_t count, unsigned int base, type *res) \
{ \
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
index 6019c53c669e..26caf51cc238 100644
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -16,33 +16,14 @@
#include <linux/delay.h>
#include <linux/kprobes.h>
#include <linux/nmi.h>
-#include <linux/seq_buf.h>
#ifdef arch_trigger_all_cpu_backtrace
/* For reliability, we're prepared to waste bits here. */
static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE 4096
-
-struct nmi_seq_buf {
- unsigned char buffer[NMI_BUF_SIZE];
- struct seq_buf seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;
-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
-{
- const char *buf = s->buffer + start;
-
- printk("%.*s", (end - start) + 1, buf);
-}
-
/*
* When raise() is called it will be is passed a pointer to the
* backtrace_mask. Architectures that call nmi_cpu_backtrace()
@@ -52,8 +33,7 @@ static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
void nmi_trigger_all_cpu_backtrace(bool include_self,
void (*raise)(cpumask_t *mask))
{
- struct nmi_seq_buf *s;
- int i, cpu, this_cpu = get_cpu();
+ int i, this_cpu = get_cpu();
if (test_and_set_bit(0, &backtrace_flag)) {
/*
@@ -68,17 +48,6 @@ void nmi_trigger_all_cpu_backtrace(bool include_self,
if (!include_self)
cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
- cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
-
- /*
- * Set up per_cpu seq_buf buffers that the NMIs running on the other
- * CPUs will write to.
- */
- for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
- s = &per_cpu(nmi_print_seq, cpu);
- seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
- }
-
if (!cpumask_empty(to_cpumask(backtrace_mask))) {
pr_info("Sending NMI to %s CPUs:\n",
(include_self ? "all" : "other"));
@@ -94,73 +63,25 @@ void nmi_trigger_all_cpu_backtrace(bool include_self,
}
/*
- * Now that all the NMIs have triggered, we can dump out their
- * back traces safely to the console.
+ * Force flush any remote buffers that might be stuck in IRQ context
+ * and therefore could not run their irq_work.
*/
- for_each_cpu(cpu, &printtrace_mask) {
- int len, last_i = 0;
+ printk_nmi_flush();
- s = &per_cpu(nmi_print_seq, cpu);
- len = seq_buf_used(&s->seq);
- if (!len)
- continue;
-
- /* Print line by line. */
- for (i = 0; i < len; i++) {
- if (s->buffer[i] == '\n') {
- print_seq_line(s, last_i, i);
- last_i = i + 1;
- }
- }
- /* Check if there was a partial line. */
- if (last_i < len) {
- print_seq_line(s, last_i, len - 1);
- pr_cont("\n");
- }
- }
-
- clear_bit(0, &backtrace_flag);
- smp_mb__after_atomic();
+ clear_bit_unlock(0, &backtrace_flag);
put_cpu();
}
-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
- struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
- unsigned int len = seq_buf_used(&s->seq);
-
- seq_buf_vprintf(&s->seq, fmt, args);
- return seq_buf_used(&s->seq) - len;
-}
-
bool nmi_cpu_backtrace(struct pt_regs *regs)
{
int cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- printk_func_t printk_func_save = this_cpu_read(printk_func);
-
- /* Replace printk to write into the NMI seq */
- this_cpu_write(printk_func, nmi_vprintk);
pr_warn("NMI backtrace for cpu %d\n", cpu);
if (regs)
show_regs(regs);
else
dump_stack();
- this_cpu_write(printk_func, printk_func_save);
-
cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
return true;
}
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 6b79e9026e24..1624c4117961 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -173,6 +173,41 @@ radix_tree_find_next_bit(const unsigned long *addr,
return size;
}
+#if 0
+static void dump_node(void *slot, int height, int offset)
+{
+ struct radix_tree_node *node;
+ int i;
+
+ if (!slot)
+ return;
+
+ if (height == 0) {
+ pr_debug("radix entry %p offset %d\n", slot, offset);
+ return;
+ }
+
+ node = indirect_to_ptr(slot);
+ pr_debug("radix node: %p offset %d tags %lx %lx %lx path %x count %d parent %p\n",
+ slot, offset, node->tags[0][0], node->tags[1][0],
+ node->tags[2][0], node->path, node->count, node->parent);
+
+ for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
+ dump_node(node->slots[i], height - 1, i);
+}
+
+/* For debug */
+static void radix_tree_dump(struct radix_tree_root *root)
+{
+ pr_debug("radix root: %p height %d rnode %p tags %x\n",
+ root, root->height, root->rnode,
+ root->gfp_mask >> __GFP_BITS_SHIFT);
+ if (!radix_tree_is_indirect_ptr(root->rnode))
+ return;
+ dump_node(root->rnode, root->height, 0);
+}
+#endif
+
/*
* This assumes that the caller has performed appropriate preallocation, and
* that the caller has pinned this thread of control to the current CPU.
@@ -192,6 +227,15 @@ radix_tree_node_alloc(struct radix_tree_root *root)
struct radix_tree_preload *rtp;
/*
+ * Even if the caller has preloaded, try to allocate from the
+ * cache first for the new node to get accounted.
+ */
+ ret = kmem_cache_alloc(radix_tree_node_cachep,
+ gfp_mask | __GFP_ACCOUNT | __GFP_NOWARN);
+ if (ret)
+ goto out;
+
+ /*
* Provided the caller has preloaded here, we will always
* succeed in getting a node here (and never reach
* kmem_cache_alloc)
@@ -208,10 +252,11 @@ radix_tree_node_alloc(struct radix_tree_root *root)
* for debugging.
*/
kmemleak_update_trace(ret);
+ goto out;
}
- if (ret == NULL)
- ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
-
+ ret = kmem_cache_alloc(radix_tree_node_cachep,
+ gfp_mask | __GFP_ACCOUNT);
+out:
BUG_ON(radix_tree_is_indirect_ptr(ret));
return ret;
}
@@ -323,7 +368,8 @@ static inline unsigned long radix_tree_maxindex(unsigned int height)
/*
* Extend a radix tree so it can store key @index.
*/
-static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
+static int radix_tree_extend(struct radix_tree_root *root,
+ unsigned long index, unsigned order)
{
struct radix_tree_node *node;
struct radix_tree_node *slot;
@@ -335,7 +381,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
while (index > radix_tree_maxindex(height))
height++;
- if (root->rnode == NULL) {
+ if ((root->rnode == NULL) && (order == 0)) {
root->height = height;
goto out;
}
@@ -358,9 +404,10 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
node->count = 1;
node->parent = NULL;
slot = root->rnode;
- if (newheight > 1) {
+ if (radix_tree_is_indirect_ptr(slot) && newheight > 1) {
slot = indirect_to_ptr(slot);
slot->parent = node;
+ slot = ptr_to_indirect(slot);
}
node->slots[0] = slot;
node = ptr_to_indirect(node);
@@ -375,6 +422,7 @@ out:
* __radix_tree_create - create a slot in a radix tree
* @root: radix tree root
* @index: index key
+ * @order: index occupies 2^order aligned slots
* @nodep: returns node
* @slotp: returns slot
*
@@ -388,26 +436,29 @@ out:
* Returns -ENOMEM, or 0 for success.
*/
int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
- struct radix_tree_node **nodep, void ***slotp)
+ unsigned order, struct radix_tree_node **nodep,
+ void ***slotp)
{
struct radix_tree_node *node = NULL, *slot;
unsigned int height, shift, offset;
int error;
+ BUG_ON((0 < order) && (order < RADIX_TREE_MAP_SHIFT));
+
/* Make sure the tree is high enough. */
if (index > radix_tree_maxindex(root->height)) {
- error = radix_tree_extend(root, index);
+ error = radix_tree_extend(root, index, order);
if (error)
return error;
}
- slot = indirect_to_ptr(root->rnode);
+ slot = root->rnode;
height = root->height;
- shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+ shift = height * RADIX_TREE_MAP_SHIFT;
offset = 0; /* uninitialised var warning */
- while (height > 0) {
+ while (shift > order) {
if (slot == NULL) {
/* Have to add a child node. */
if (!(slot = radix_tree_node_alloc(root)))
@@ -415,19 +466,38 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
slot->path = height;
slot->parent = node;
if (node) {
- rcu_assign_pointer(node->slots[offset], slot);
+ rcu_assign_pointer(node->slots[offset],
+ ptr_to_indirect(slot));
node->count++;
slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT;
} else
- rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
- }
+ rcu_assign_pointer(root->rnode,
+ ptr_to_indirect(slot));
+ } else if (!radix_tree_is_indirect_ptr(slot))
+ break;
/* Go a level down */
+ height--;
+ shift -= RADIX_TREE_MAP_SHIFT;
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
- node = slot;
+ node = indirect_to_ptr(slot);
slot = node->slots[offset];
- shift -= RADIX_TREE_MAP_SHIFT;
- height--;
+ }
+
+ /* Insert pointers to the canonical entry */
+ if ((shift - order) > 0) {
+ int i, n = 1 << (shift - order);
+ offset = offset & ~(n - 1);
+ slot = ptr_to_indirect(&node->slots[offset]);
+ for (i = 0; i < n; i++) {
+ if (node->slots[offset + i])
+ return -EEXIST;
+ }
+
+ for (i = 1; i < n; i++) {
+ rcu_assign_pointer(node->slots[offset + i], slot);
+ node->count++;
+ }
}
if (nodep)
@@ -438,15 +508,16 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
}
/**
- * radix_tree_insert - insert into a radix tree
+ * __radix_tree_insert - insert into a radix tree
* @root: radix tree root
* @index: index key
+ * @order: key covers the 2^order indices around index
* @item: item to insert
*
* Insert an item into the radix tree at position @index.
*/
-int radix_tree_insert(struct radix_tree_root *root,
- unsigned long index, void *item)
+int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
+ unsigned order, void *item)
{
struct radix_tree_node *node;
void **slot;
@@ -454,7 +525,7 @@ int radix_tree_insert(struct radix_tree_root *root,
BUG_ON(radix_tree_is_indirect_ptr(item));
- error = __radix_tree_create(root, index, &node, &slot);
+ error = __radix_tree_create(root, index, order, &node, &slot);
if (error)
return error;
if (*slot != NULL)
@@ -472,7 +543,7 @@ int radix_tree_insert(struct radix_tree_root *root,
return 0;
}
-EXPORT_SYMBOL(radix_tree_insert);
+EXPORT_SYMBOL(__radix_tree_insert);
/**
* __radix_tree_lookup - lookup an item in a radix tree
@@ -523,6 +594,9 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
node = rcu_dereference_raw(*slot);
if (node == NULL)
return NULL;
+ if (!radix_tree_is_indirect_ptr(node))
+ break;
+ node = indirect_to_ptr(node);
shift -= RADIX_TREE_MAP_SHIFT;
height--;
@@ -609,6 +683,9 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
tag_set(slot, tag, offset);
slot = slot->slots[offset];
BUG_ON(slot == NULL);
+ if (!radix_tree_is_indirect_ptr(slot))
+ break;
+ slot = indirect_to_ptr(slot);
shift -= RADIX_TREE_MAP_SHIFT;
height--;
}
@@ -648,11 +725,14 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
goto out;
shift = height * RADIX_TREE_MAP_SHIFT;
- slot = indirect_to_ptr(root->rnode);
+ slot = root->rnode;
while (shift) {
if (slot == NULL)
goto out;
+ if (!radix_tree_is_indirect_ptr(slot))
+ break;
+ slot = indirect_to_ptr(slot);
shift -= RADIX_TREE_MAP_SHIFT;
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
@@ -728,6 +808,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
if (node == NULL)
return 0;
+ node = indirect_to_ptr(node);
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
if (!tag_get(node, tag, offset))
@@ -735,6 +816,8 @@ int radix_tree_tag_get(struct radix_tree_root *root,
if (height == 1)
return 1;
node = rcu_dereference_raw(node->slots[offset]);
+ if (!radix_tree_is_indirect_ptr(node))
+ return 1;
shift -= RADIX_TREE_MAP_SHIFT;
height--;
}
@@ -795,6 +878,7 @@ restart:
node = rnode;
while (1) {
+ struct radix_tree_node *slot;
if ((flags & RADIX_TREE_ITER_TAGGED) ?
!test_bit(offset, node->tags[tag]) :
!node->slots[offset]) {
@@ -825,9 +909,12 @@ restart:
if (!shift)
break;
- node = rcu_dereference_raw(node->slots[offset]);
- if (node == NULL)
+ slot = rcu_dereference_raw(node->slots[offset]);
+ if (slot == NULL)
goto restart;
+ if (!radix_tree_is_indirect_ptr(slot))
+ break;
+ node = indirect_to_ptr(slot);
shift -= RADIX_TREE_MAP_SHIFT;
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
}
@@ -925,15 +1012,20 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
if (!tag_get(slot, iftag, offset))
goto next;
if (shift) {
- /* Go down one level */
- shift -= RADIX_TREE_MAP_SHIFT;
node = slot;
slot = slot->slots[offset];
- continue;
+ if (radix_tree_is_indirect_ptr(slot)) {
+ slot = indirect_to_ptr(slot);
+ shift -= RADIX_TREE_MAP_SHIFT;
+ continue;
+ } else {
+ slot = node;
+ node = node->parent;
+ }
}
/* tag the leaf */
- tagged++;
+ tagged += 1 << shift;
tag_set(slot, settag, offset);
/* walk back up the path tagging interior nodes */
@@ -1181,10 +1273,20 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item,
goto out;
}
- shift -= RADIX_TREE_MAP_SHIFT;
slot = rcu_dereference_raw(slot->slots[i]);
if (slot == NULL)
goto out;
+ if (!radix_tree_is_indirect_ptr(slot)) {
+ if (slot == item) {
+ *found_index = index + i;
+ index = 0;
+ } else {
+ index += shift;
+ }
+ goto out;
+ }
+ slot = indirect_to_ptr(slot);
+ shift -= RADIX_TREE_MAP_SHIFT;
}
/* Bottom level: check items */
@@ -1264,11 +1366,13 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
/*
* The candidate node has more than one child, or its child
- * is not at the leftmost slot, we cannot shrink.
+ * is not at the leftmost slot, or it is a multiorder entry,
+ * we cannot shrink.
*/
if (to_free->count != 1)
break;
- if (!to_free->slots[0])
+ slot = to_free->slots[0];
+ if (!slot)
break;
/*
@@ -1278,8 +1382,11 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
* (to_free->slots[0]), it will be safe to dereference the new
* one (root->rnode) as far as dependent read barriers go.
*/
- slot = to_free->slots[0];
if (root->height > 1) {
+ if (!radix_tree_is_indirect_ptr(slot))
+ break;
+
+ slot = indirect_to_ptr(slot);
slot->parent = NULL;
slot = ptr_to_indirect(slot);
}
@@ -1377,7 +1484,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
unsigned long index, void *item)
{
struct radix_tree_node *node;
- unsigned int offset;
+ unsigned int offset, i;
void **slot;
void *entry;
int tag;
@@ -1406,6 +1513,13 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
radix_tree_tag_clear(root, index, tag);
}
+ /* Delete any sibling slots pointing to this slot */
+ for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
+ if (node->slots[offset + i] != ptr_to_indirect(slot))
+ break;
+ node->slots[offset + i] = NULL;
+ node->count--;
+ }
node->slots[offset] = NULL;
node->count--;
diff --git a/lib/string.c b/lib/string.c
index 0323c0d5629a..ed83562a53ae 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -631,33 +631,30 @@ bool sysfs_streq(const char *s1, const char *s2)
EXPORT_SYMBOL(sysfs_streq);
/**
- * strtobool - convert common user inputs into boolean values
- * @s: input string
- * @res: result
+ * match_string - matches given string in an array
+ * @array: array of strings
+ * @n: number of strings in the array or -1 for NULL terminated arrays
+ * @string: string to match with
*
- * This routine returns 0 iff the first character is one of 'Yy1Nn0'.
- * Otherwise it will return -EINVAL. Value pointed to by res is
- * updated upon finding a match.
- */
-int strtobool(const char *s, bool *res)
-{
- switch (s[0]) {
- case 'y':
- case 'Y':
- case '1':
- *res = true;
- break;
- case 'n':
- case 'N':
- case '0':
- *res = false;
- break;
- default:
- return -EINVAL;
+ * Return:
+ * index of a @string in the @array if matches, or %-EINVAL otherwise.
+ */
+int match_string(const char * const *array, size_t n, const char *string)
+{
+ int index;
+ const char *item;
+
+ for (index = 0; index < n; index++) {
+ item = array[index];
+ if (!item)
+ break;
+ if (!strcmp(item, string))
+ return index;
}
- return 0;
+
+ return -EINVAL;
}
-EXPORT_SYMBOL(strtobool);
+EXPORT_SYMBOL(match_string);
#ifndef __HAVE_ARCH_MEMSET
/**
diff --git a/lib/test_printf.c b/lib/test_printf.c
index 4f6ae60433bc..563f10e6876a 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -17,6 +17,9 @@
#include <linux/socket.h>
#include <linux/in.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+
#define BUF_SIZE 256
#define PAD_SIZE 16
#define FILL_CHAR '$'
@@ -411,6 +414,55 @@ netdev_features(void)
}
static void __init
+flags(void)
+{
+ unsigned long flags;
+ gfp_t gfp;
+ char *cmp_buffer;
+
+ flags = 0;
+ test("", "%pGp", &flags);
+
+ /* Page flags should filter the zone id */
+ flags = 1UL << NR_PAGEFLAGS;
+ test("", "%pGp", &flags);
+
+ flags |= 1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru
+ | 1UL << PG_active | 1UL << PG_swapbacked;
+ test("uptodate|dirty|lru|active|swapbacked", "%pGp", &flags);
+
+
+ flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC
+ | VM_DENYWRITE;
+ test("read|exec|mayread|maywrite|mayexec|denywrite", "%pGv", &flags);
+
+ gfp = GFP_TRANSHUGE;
+ test("GFP_TRANSHUGE", "%pGg", &gfp);
+
+ gfp = GFP_ATOMIC|__GFP_DMA;
+ test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp);
+
+ gfp = __GFP_ATOMIC;
+ test("__GFP_ATOMIC", "%pGg", &gfp);
+
+ cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL);
+ if (!cmp_buffer)
+ return;
+
+ /* Any flags not translated by the table should remain numeric */
+ gfp = ~__GFP_BITS_MASK;
+ snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp);
+ test(cmp_buffer, "%pGg", &gfp);
+
+ snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx",
+ (unsigned long) gfp);
+ gfp |= __GFP_ATOMIC;
+ test(cmp_buffer, "%pGg", &gfp);
+
+ kfree(cmp_buffer);
+}
+
+static void __init
test_pointer(void)
{
plain();
@@ -428,6 +480,7 @@ test_pointer(void)
struct_clk();
bitmap();
netdev_features();
+ flags();
}
static int __init
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 48ff9c36644d..525c8e19bda2 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -35,6 +35,8 @@
#include <linux/blkdev.h>
#endif
+#include "../mm/internal.h" /* For the trace_print_flags arrays */
+
#include <asm/page.h> /* for PAGE_SIZE */
#include <asm/sections.h> /* for dereference_function_descriptor() */
#include <asm/byteorder.h> /* cpu_to_le16 */
@@ -1407,6 +1409,72 @@ char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec,
}
}
+static
+char *format_flags(char *buf, char *end, unsigned long flags,
+ const struct trace_print_flags *names)
+{
+ unsigned long mask;
+ const struct printf_spec strspec = {
+ .field_width = -1,
+ .precision = -1,
+ };
+ const struct printf_spec numspec = {
+ .flags = SPECIAL|SMALL,
+ .field_width = -1,
+ .precision = -1,
+ .base = 16,
+ };
+
+ for ( ; flags && names->name; names++) {
+ mask = names->mask;
+ if ((flags & mask) != mask)
+ continue;
+
+ buf = string(buf, end, names->name, strspec);
+
+ flags &= ~mask;
+ if (flags) {
+ if (buf < end)
+ *buf = '|';
+ buf++;
+ }
+ }
+
+ if (flags)
+ buf = number(buf, end, flags, numspec);
+
+ return buf;
+}
+
+static noinline_for_stack
+char *flags_string(char *buf, char *end, void *flags_ptr, const char *fmt)
+{
+ unsigned long flags;
+ const struct trace_print_flags *names;
+
+ switch (fmt[1]) {
+ case 'p':
+ flags = *(unsigned long *)flags_ptr;
+ /* Remove zone id */
+ flags &= (1UL << NR_PAGEFLAGS) - 1;
+ names = pageflag_names;
+ break;
+ case 'v':
+ flags = *(unsigned long *)flags_ptr;
+ names = vmaflag_names;
+ break;
+ case 'g':
+ flags = *(gfp_t *)flags_ptr;
+ names = gfpflag_names;
+ break;
+ default:
+ WARN_ONCE(1, "Unsupported flags modifier: %c\n", fmt[1]);
+ return buf;
+ }
+
+ return format_flags(buf, end, flags, names);
+}
+
int kptr_restrict __read_mostly;
/*
@@ -1495,6 +1563,11 @@ int kptr_restrict __read_mostly;
* - 'Cn' For a clock, it prints the name (Common Clock Framework) or address
* (legacy clock framework) of the clock
* - 'Cr' For a clock, it prints the current rate of the clock
+ * - 'G' For flags to be printed as a collection of symbolic strings that would
+ * construct the specific value. Supported flags given by option:
+ * p page flags (see struct page) given as pointer to unsigned long
+ * g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t
+ * v vma flags (VM_*) given as pointer to unsigned long
*
* ** Please update also Documentation/printk-formats.txt when making changes **
*
@@ -1590,22 +1663,23 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
return buf;
}
case 'K':
- /*
- * %pK cannot be used in IRQ context because its test
- * for CAP_SYSLOG would be meaningless.
- */
- if (kptr_restrict && (in_irq() || in_serving_softirq() ||
- in_nmi())) {
- if (spec.field_width == -1)
- spec.field_width = default_width;
- return string(buf, end, "pK-error", spec);
- }
-
switch (kptr_restrict) {
case 0:
/* Always print %pK values */
break;
case 1: {
+ const struct cred *cred;
+
+ /*
+ * kptr_restrict==1 cannot be used in IRQ context
+ * because its test for CAP_SYSLOG would be meaningless.
+ */
+ if (in_irq() || in_serving_softirq() || in_nmi()) {
+ if (spec.field_width == -1)
+ spec.field_width = default_width;
+ return string(buf, end, "pK-error", spec);
+ }
+
/*
* Only print the real pointer value if the current
* process has CAP_SYSLOG and is running with the
@@ -1615,8 +1689,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
* leak pointer values if a binary opens a file using
* %pK and then elevates privileges before reading it.
*/
- const struct cred *cred = current_cred();
-
+ cred = current_cred();
if (!has_capability_noaudit(current, CAP_SYSLOG) ||
!uid_eq(cred->euid, cred->uid) ||
!gid_eq(cred->egid, cred->gid))
@@ -1648,6 +1721,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
return bdev_name(buf, end, ptr, spec, fmt);
#endif
+ case 'G':
+ return flags_string(buf, end, ptr, fmt);
}
spec.flags |= SMALL;
if (spec.field_width == -1) {
diff --git a/mm/Kconfig b/mm/Kconfig
index 03cbfa072f42..363ee0ca189f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -652,8 +652,6 @@ config IDLE_PAGE_TRACKING
config ZONE_DEVICE
bool "Device memory (pmem, etc...) hotplug support" if EXPERT
- default !ZONE_DMA
- depends on !ZONE_DMA
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on X86_64 #arch_add_memory() comprehends device memory
@@ -667,5 +665,10 @@ config ZONE_DEVICE
If FS_DAX is enabled, then say Y.
+config NR_ZONES_EXTENDED
+ bool
+ default n if !64BIT
+ default y if ZONE_DEVICE && ZONE_DMA && ZONE_DMA32
+
config FRAME_VECTOR
bool
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 957d3da53ddd..ee57dea429b9 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -16,8 +16,8 @@ config DEBUG_PAGEALLOC
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
- This results in a large slowdown, but helps to find certain types
- of memory corruption.
+ Depending on runtime enablement, this results in a small or large
+ slowdown, but helps to find certain types of memory corruption.
For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
fill the pages with poison patterns after free_pages() and verify
@@ -26,5 +26,52 @@ config DEBUG_PAGEALLOC
that would result in incorrect warnings of memory corruption after
a resume because free pages are not saved to the suspend image.
+ By default this option will have a small overhead, e.g. by not
+ allowing the kernel mapping to be backed by large pages on some
+ architectures. Even bigger overhead comes when the debugging is
+ enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc
+ command line parameter.
+
+config DEBUG_PAGEALLOC_ENABLE_DEFAULT
+ bool "Enable debug page memory allocations by default?"
+ default n
+ depends on DEBUG_PAGEALLOC
+ ---help---
+ Enable debug page memory allocations by default? This value
+ can be overridden by debug_pagealloc=off|on.
+
config PAGE_POISONING
- bool
+ bool "Poison pages after freeing"
+ select PAGE_EXTENSION
+ select PAGE_POISONING_NO_SANITY if HIBERNATION
+ ---help---
+ Fill the pages with poison patterns after free_pages() and verify
+ the patterns before alloc_pages. The filling of the memory helps
+ reduce the risk of information leaks from freed data. This does
+ have a potential performance impact.
+
+ If unsure, say N
+
+config PAGE_POISONING_NO_SANITY
+ depends on PAGE_POISONING
+ bool "Only poison, don't sanity check"
+ ---help---
+ Skip the sanity checking on alloc, only fill the pages with
+ poison on free. This reduces some of the overhead of the
+ poisoning feature.
+
+ If you are only interested in sanitization, say Y. Otherwise
+ say N.
+
+config PAGE_POISONING_ZERO
+ bool "Use zero for poisoning instead of random data"
+ depends on !HIBERNATION
+ depends on PAGE_POISONING
+ ---help---
+ Instead of using the existing poison value, fill the pages with
+ zeros. This makes it harder to detect when errors are occurring
+ due to sanitization but the zeroing at free means that it is
+ no longer necessary to write zeros when GFP_ZERO is used on
+ allocation.
+
+ If unsure, say N
diff --git a/mm/Makefile b/mm/Makefile
index 2ed43191fc3b..fb1a7948c107 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,6 +5,21 @@
KASAN_SANITIZE_slab_common.o := n
KASAN_SANITIZE_slub.o := n
+# These files are disabled because they produce non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
+# free pages, or a task is migrated between nodes.
+KCOV_INSTRUMENT_slab_common.o := n
+KCOV_INSTRUMENT_slob.o := n
+KCOV_INSTRUMENT_slab.o := n
+KCOV_INSTRUMENT_slub.o := n
+KCOV_INSTRUMENT_page_alloc.o := n
+KCOV_INSTRUMENT_debug-pagealloc.o := n
+KCOV_INSTRUMENT_kmemleak.o := n
+KCOV_INSTRUMENT_kmemcheck.o := n
+KCOV_INSTRUMENT_memcontrol.o := n
+KCOV_INSTRUMENT_mmzone.o := n
+KCOV_INSTRUMENT_vmstat.o := n
+
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
@@ -48,7 +63,10 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_KSM) += ksm.o
-obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
+ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ obj-$(CONFIG_DEBUG_PAGEALLOC) += debug-pagealloc.o
+endif
+obj-$(CONFIG_PAGE_POISONING) += page_poison.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 926c76d56388..46d19a6a06ab 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -328,7 +328,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
return 0;
out_destroy_stat:
- while (--i)
+ while (i--)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
out_put_cong:
@@ -957,9 +957,8 @@ EXPORT_SYMBOL(congestion_wait);
* jiffies for either a BDI to exit congestion of the given @sync queue
* or a write to complete.
*
- * In the absence of zone congestion, a short sleep or a cond_resched is
- * performed to yield the processor and to allow other subsystems to make
- * a forward progress.
+ * In the absence of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
*
* The return value is 0 if the sleep is for the full timeout. Otherwise,
* it is the number of jiffies that were still remaining when the function
@@ -979,20 +978,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
*/
if (atomic_read(&nr_wb_congested[sync]) == 0 ||
!test_bit(ZONE_CONGESTED, &zone->flags)) {
-
- /*
- * Memory allocation/reclaim might be called from a WQ
- * context and the current implementation of the WQ
- * concurrency control doesn't recognize that a particular
- * WQ is congested if the worker thread is looping without
- * ever sleeping. Therefore we have to do a short sleep
- * here rather than calling cond_resched().
- */
- if (current->flags & PF_WQ_WORKER)
- schedule_timeout_uninterruptible(1);
- else
- cond_resched();
-
+ cond_resched();
/* In case we scheduled, work out time remaining */
ret = timeout - (jiffies - start);
if (ret < 0)
diff --git a/mm/compaction.c b/mm/compaction.c
index 585de54dbe8c..4cb1c2ef5abb 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -7,6 +7,7 @@
*
* Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
*/
+#include <linux/cpu.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
@@ -17,6 +18,9 @@
#include <linux/balloon_compaction.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/module.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
@@ -71,49 +75,6 @@ static inline bool migrate_async_suitable(int migratetype)
return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}
-/*
- * Check that the whole (or subset of) a pageblock given by the interval of
- * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
- * with the migration of free compaction scanner. The scanners then need to
- * use only pfn_valid_within() check for arches that allow holes within
- * pageblocks.
- *
- * Return struct page pointer of start_pfn, or NULL if checks were not passed.
- *
- * It's possible on some configurations to have a setup like node0 node1 node0
- * i.e. it's possible that all pages within a zones range of pages do not
- * belong to a single zone. We assume that a border between node0 and node1
- * can occur within a single pageblock, but not a node0 node1 node0
- * interleaving within a single pageblock. It is therefore sufficient to check
- * the first and last page of a pageblock and avoid checking each individual
- * page in a pageblock.
- */
-static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
- unsigned long end_pfn, struct zone *zone)
-{
- struct page *start_page;
- struct page *end_page;
-
- /* end_pfn is one past the range we are checking */
- end_pfn--;
-
- if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
- return NULL;
-
- start_page = pfn_to_page(start_pfn);
-
- if (page_zone(start_page) != zone)
- return NULL;
-
- end_page = pfn_to_page(end_pfn);
-
- /* This gives a shorter code than deriving page_zone(end_page) */
- if (page_zone_id(start_page) != page_zone_id(end_page))
- return NULL;
-
- return start_page;
-}
-
#ifdef CONFIG_COMPACTION
/* Do not skip compaction more than 64 times */
@@ -200,7 +161,8 @@ static void reset_cached_positions(struct zone *zone)
{
zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
- zone->compact_cached_free_pfn = zone_end_pfn(zone);
+ zone->compact_cached_free_pfn =
+ round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
}
/*
@@ -554,13 +516,17 @@ unsigned long
isolate_freepages_range(struct compact_control *cc,
unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long isolated, pfn, block_end_pfn;
+ unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
LIST_HEAD(freelist);
pfn = start_pfn;
+ block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
for (; pfn < end_pfn; pfn += isolated,
+ block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
/* Protect pfn from changing by isolate_freepages_block */
unsigned long isolate_start_pfn = pfn;
@@ -573,11 +539,13 @@ isolate_freepages_range(struct compact_control *cc,
* scanning range to right one.
*/
if (pfn >= block_end_pfn) {
+ block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);
}
- if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+ if (!pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone))
break;
isolated = isolate_freepages_block(cc, &isolate_start_pfn,
@@ -863,18 +831,23 @@ unsigned long
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long pfn, block_end_pfn;
+ unsigned long pfn, block_start_pfn, block_end_pfn;
/* Scan block by block. First and last block may be incomplete */
pfn = start_pfn;
+ block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
for (; pfn < end_pfn; pfn = block_end_pfn,
+ block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
block_end_pfn = min(block_end_pfn, end_pfn);
- if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+ if (!pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone))
continue;
pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
@@ -1103,7 +1076,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1;
static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
- unsigned long low_pfn, end_pfn;
+ unsigned long block_start_pfn;
+ unsigned long block_end_pfn;
+ unsigned long low_pfn;
unsigned long isolate_start_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
@@ -1115,16 +1090,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
* initialized by compact_zone()
*/
low_pfn = cc->migrate_pfn;
+ block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1);
+ if (block_start_pfn < zone->zone_start_pfn)
+ block_start_pfn = zone->zone_start_pfn;
/* Only scan within a pageblock boundary */
- end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
+ block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
/*
* Iterate over whole pageblocks until we find the first suitable.
* Do not cross the free scanner.
*/
- for (; end_pfn <= cc->free_pfn;
- low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
+ for (; block_end_pfn <= cc->free_pfn;
+ low_pfn = block_end_pfn,
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
/*
* This can potentially iterate a massively long zone with
@@ -1135,7 +1115,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
&& compact_should_abort(cc))
break;
- page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
+ page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
+ zone);
if (!page)
continue;
@@ -1154,8 +1135,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
/* Perform the isolation */
isolate_start_pfn = low_pfn;
- low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
- isolate_mode);
+ low_pfn = isolate_migratepages_block(cc, low_pfn,
+ block_end_pfn, isolate_mode);
if (!low_pfn || cc->contended) {
acct_isolated(zone, cc);
@@ -1211,11 +1192,11 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
/*
* Mark that the PG_migrate_skip information should be cleared
- * by kswapd when it goes to sleep. kswapd does not set the
+ * by kswapd when it goes to sleep. kcompactd does not set the
* flag itself as the decision to be clear should be directly
* based on an allocation request.
*/
- if (!current_is_kswapd())
+ if (cc->direct_compaction)
zone->compact_blockskip_flush = true;
return COMPACT_COMPLETE;
@@ -1358,10 +1339,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
/*
* Clear pageblock skip if there were failures recently and compaction
- * is about to be retried after being deferred. kswapd does not do
- * this reset as it'll reset the cached information when going to sleep.
+ * is about to be retried after being deferred.
*/
- if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+ if (compaction_restarting(zone, cc->order))
__reset_isolation_suitable(zone);
/*
@@ -1371,11 +1351,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
*/
cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
cc->free_pfn = zone->compact_cached_free_pfn;
- if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
- cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+ if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
+ cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages);
zone->compact_cached_free_pfn = cc->free_pfn;
}
- if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+ if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
cc->migrate_pfn = start_pfn;
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
@@ -1497,6 +1477,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
.mode = mode,
.alloc_flags = alloc_flags,
.classzone_idx = classzone_idx,
+ .direct_compaction = true,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -1759,4 +1740,221 @@ void compaction_unregister_node(struct node *node)
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
+static inline bool kcompactd_work_requested(pg_data_t *pgdat)
+{
+ return pgdat->kcompactd_max_order > 0;
+}
+
+static bool kcompactd_node_suitable(pg_data_t *pgdat)
+{
+ int zoneid;
+ struct zone *zone;
+ enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
+
+ for (zoneid = 0; zoneid < classzone_idx; zoneid++) {
+ zone = &pgdat->node_zones[zoneid];
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
+ classzone_idx) == COMPACT_CONTINUE)
+ return true;
+ }
+
+ return false;
+}
+
+static void kcompactd_do_work(pg_data_t *pgdat)
+{
+ /*
+ * With no special task, compact all zones so that a page of requested
+ * order is allocatable.
+ */
+ int zoneid;
+ struct zone *zone;
+ struct compact_control cc = {
+ .order = pgdat->kcompactd_max_order,
+ .classzone_idx = pgdat->kcompactd_classzone_idx,
+ .mode = MIGRATE_SYNC_LIGHT,
+ .ignore_skip_hint = true,
+
+ };
+ bool success = false;
+
+ trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
+ cc.classzone_idx);
+ count_vm_event(KCOMPACTD_WAKE);
+
+ for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) {
+ int status;
+
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ if (compaction_deferred(zone, cc.order))
+ continue;
+
+ if (compaction_suitable(zone, cc.order, 0, zoneid) !=
+ COMPACT_CONTINUE)
+ continue;
+
+ cc.nr_freepages = 0;
+ cc.nr_migratepages = 0;
+ cc.zone = zone;
+ INIT_LIST_HEAD(&cc.freepages);
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ status = compact_zone(zone, &cc);
+
+ if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
+ cc.classzone_idx, 0)) {
+ success = true;
+ compaction_defer_reset(zone, cc.order, false);
+ } else if (cc.mode != MIGRATE_ASYNC &&
+ status == COMPACT_COMPLETE) {
+ defer_compaction(zone, cc.order);
+ }
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+ }
+
+ /*
+ * Regardless of success, we are done until woken up next. But remember
+ * the requested order/classzone_idx in case it was higher/tighter than
+ * our current ones
+ */
+ if (pgdat->kcompactd_max_order <= cc.order)
+ pgdat->kcompactd_max_order = 0;
+ if (pgdat->classzone_idx >= cc.classzone_idx)
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
+}
+
+void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+{
+ if (!order)
+ return;
+
+ if (pgdat->kcompactd_max_order < order)
+ pgdat->kcompactd_max_order = order;
+
+ if (pgdat->kcompactd_classzone_idx > classzone_idx)
+ pgdat->kcompactd_classzone_idx = classzone_idx;
+
+ if (!waitqueue_active(&pgdat->kcompactd_wait))
+ return;
+
+ if (!kcompactd_node_suitable(pgdat))
+ return;
+
+ trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
+ classzone_idx);
+ wake_up_interruptible(&pgdat->kcompactd_wait);
+}
+
+/*
+ * The background compaction daemon, started as a kernel thread
+ * from the init process.
+ */
+static int kcompactd(void *p)
+{
+ pg_data_t *pgdat = (pg_data_t*)p;
+ struct task_struct *tsk = current;
+
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(tsk, cpumask);
+
+ set_freezable();
+
+ pgdat->kcompactd_max_order = 0;
+ pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+
+ while (!kthread_should_stop()) {
+ trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
+ wait_event_freezable(pgdat->kcompactd_wait,
+ kcompactd_work_requested(pgdat));
+
+ kcompactd_do_work(pgdat);
+ }
+
+ return 0;
+}
+
+/*
+ * This kcompactd start function will be called by init and node-hot-add.
+ * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added.
+ */
+int kcompactd_run(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int ret = 0;
+
+ if (pgdat->kcompactd)
+ return 0;
+
+ pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
+ if (IS_ERR(pgdat->kcompactd)) {
+ pr_err("Failed to start kcompactd on node %d\n", nid);
+ ret = PTR_ERR(pgdat->kcompactd);
+ pgdat->kcompactd = NULL;
+ }
+ return ret;
+}
+
+/*
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold mem_hotplug_begin/end().
+ */
+void kcompactd_stop(int nid)
+{
+ struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
+
+ if (kcompactd) {
+ kthread_stop(kcompactd);
+ NODE_DATA(nid)->kcompactd = NULL;
+ }
+}
+
+/*
+ * It's optimal to keep kcompactd on the same CPUs as their memory, but
+ * not required for correctness. So if the last cpu in a node goes
+ * away, we get changed to run anywhere: as the first one comes back,
+ * restore their cpu bindings.
+ */
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ int nid;
+
+ if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
+ for_each_node_state(nid, N_MEMORY) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ const struct cpumask *mask;
+
+ mask = cpumask_of_node(pgdat->node_id);
+
+ if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+ /* One of our CPUs online: restore mask */
+ set_cpus_allowed_ptr(pgdat->kcompactd, mask);
+ }
+ }
+ return NOTIFY_OK;
+}
+
+static int __init kcompactd_init(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ kcompactd_run(nid);
+ hotcpu_notifier(cpu_callback, 0);
+ return 0;
+}
+
+module_init(kcompactd_init)
+
#endif /* CONFIG_COMPACTION */
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
index 5bf5906ce13b..0928d1390a6d 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/debug-pagealloc.c
@@ -6,132 +6,7 @@
#include <linux/poison.h>
#include <linux/ratelimit.h>
-static bool page_poisoning_enabled __read_mostly;
-
-static bool need_page_poisoning(void)
-{
- if (!debug_pagealloc_enabled())
- return false;
-
- return true;
-}
-
-static void init_page_poisoning(void)
-{
- if (!debug_pagealloc_enabled())
- return;
-
- page_poisoning_enabled = true;
-}
-
-struct page_ext_operations page_poisoning_ops = {
- .need = need_page_poisoning,
- .init = init_page_poisoning,
-};
-
-static inline void set_page_poison(struct page *page)
-{
- struct page_ext *page_ext;
-
- page_ext = lookup_page_ext(page);
- __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-static inline void clear_page_poison(struct page *page)
-{
- struct page_ext *page_ext;
-
- page_ext = lookup_page_ext(page);
- __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-static inline bool page_poison(struct page *page)
-{
- struct page_ext *page_ext;
-
- page_ext = lookup_page_ext(page);
- return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-static void poison_page(struct page *page)
-{
- void *addr = kmap_atomic(page);
-
- set_page_poison(page);
- memset(addr, PAGE_POISON, PAGE_SIZE);
- kunmap_atomic(addr);
-}
-
-static void poison_pages(struct page *page, int n)
-{
- int i;
-
- for (i = 0; i < n; i++)
- poison_page(page + i);
-}
-
-static bool single_bit_flip(unsigned char a, unsigned char b)
-{
- unsigned char error = a ^ b;
-
- return error && !(error & (error - 1));
-}
-
-static void check_poison_mem(unsigned char *mem, size_t bytes)
-{
- static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
- unsigned char *start;
- unsigned char *end;
-
- start = memchr_inv(mem, PAGE_POISON, bytes);
- if (!start)
- return;
-
- for (end = mem + bytes - 1; end > start; end--) {
- if (*end != PAGE_POISON)
- break;
- }
-
- if (!__ratelimit(&ratelimit))
- return;
- else if (start == end && single_bit_flip(*start, PAGE_POISON))
- printk(KERN_ERR "pagealloc: single bit error\n");
- else
- printk(KERN_ERR "pagealloc: memory corruption\n");
-
- print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
- end - start + 1, 1);
- dump_stack();
-}
-
-static void unpoison_page(struct page *page)
-{
- void *addr;
-
- if (!page_poison(page))
- return;
-
- addr = kmap_atomic(page);
- check_poison_mem(addr, PAGE_SIZE);
- clear_page_poison(page);
- kunmap_atomic(addr);
-}
-
-static void unpoison_pages(struct page *page, int n)
-{
- int i;
-
- for (i = 0; i < n; i++)
- unpoison_page(page + i);
-}
-
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
- if (!page_poisoning_enabled)
- return;
-
- if (enable)
- unpoison_pages(page, numpages);
- else
- poison_pages(page, numpages);
+ kernel_poison_pages(page, numpages, enable);
}
diff --git a/mm/debug.c b/mm/debug.c
index f05b2d5d6481..df7247b0b532 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -9,75 +9,38 @@
#include <linux/mm.h>
#include <linux/trace_events.h>
#include <linux/memcontrol.h>
-
-static const struct trace_print_flags pageflag_names[] = {
- {1UL << PG_locked, "locked" },
- {1UL << PG_error, "error" },
- {1UL << PG_referenced, "referenced" },
- {1UL << PG_uptodate, "uptodate" },
- {1UL << PG_dirty, "dirty" },
- {1UL << PG_lru, "lru" },
- {1UL << PG_active, "active" },
- {1UL << PG_slab, "slab" },
- {1UL << PG_owner_priv_1, "owner_priv_1" },
- {1UL << PG_arch_1, "arch_1" },
- {1UL << PG_reserved, "reserved" },
- {1UL << PG_private, "private" },
- {1UL << PG_private_2, "private_2" },
- {1UL << PG_writeback, "writeback" },
- {1UL << PG_head, "head" },
- {1UL << PG_swapcache, "swapcache" },
- {1UL << PG_mappedtodisk, "mappedtodisk" },
- {1UL << PG_reclaim, "reclaim" },
- {1UL << PG_swapbacked, "swapbacked" },
- {1UL << PG_unevictable, "unevictable" },
-#ifdef CONFIG_MMU
- {1UL << PG_mlocked, "mlocked" },
-#endif
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
- {1UL << PG_uncached, "uncached" },
-#endif
-#ifdef CONFIG_MEMORY_FAILURE
- {1UL << PG_hwpoison, "hwpoison" },
-#endif
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
- {1UL << PG_young, "young" },
- {1UL << PG_idle, "idle" },
-#endif
+#include <trace/events/mmflags.h>
+#include <linux/migrate.h>
+#include <linux/page_owner.h>
+
+#include "internal.h"
+
+char *migrate_reason_names[MR_TYPES] = {
+ "compaction",
+ "memory_failure",
+ "memory_hotplug",
+ "syscall_or_cpuset",
+ "mempolicy_mbind",
+ "numa_misplaced",
+ "cma",
};
-static void dump_flags(unsigned long flags,
- const struct trace_print_flags *names, int count)
-{
- const char *delim = "";
- unsigned long mask;
- int i;
-
- pr_emerg("flags: %#lx(", flags);
-
- /* remove zone id */
- flags &= (1UL << NR_PAGEFLAGS) - 1;
-
- for (i = 0; i < count && flags; i++) {
-
- mask = names[i].mask;
- if ((flags & mask) != mask)
- continue;
-
- flags &= ~mask;
- pr_cont("%s%s", delim, names[i].name);
- delim = "|";
- }
+const struct trace_print_flags pageflag_names[] = {
+ __def_pageflag_names,
+ {0, NULL}
+};
- /* check for left over flags */
- if (flags)
- pr_cont("%s%#lx", delim, flags);
+const struct trace_print_flags gfpflag_names[] = {
+ __def_gfpflag_names,
+ {0, NULL}
+};
- pr_cont(")\n");
-}
+const struct trace_print_flags vmaflag_names[] = {
+ __def_vmaflag_names,
+ {0, NULL}
+};
-void dump_page_badflags(struct page *page, const char *reason,
- unsigned long badflags)
+void __dump_page(struct page *page, const char *reason)
{
pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
page, atomic_read(&page->_count), page_mapcount(page),
@@ -85,15 +48,13 @@ void dump_page_badflags(struct page *page, const char *reason,
if (PageCompound(page))
pr_cont(" compound_mapcount: %d", compound_mapcount(page));
pr_cont("\n");
- BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
- dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
+ BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
+
+ pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
+
if (reason)
pr_alert("page dumped because: %s\n", reason);
- if (page->flags & badflags) {
- pr_alert("bad because of flags:\n");
- dump_flags(page->flags & badflags,
- pageflag_names, ARRAY_SIZE(pageflag_names));
- }
+
#ifdef CONFIG_MEMCG
if (page->mem_cgroup)
pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
@@ -102,67 +63,26 @@ void dump_page_badflags(struct page *page, const char *reason,
void dump_page(struct page *page, const char *reason)
{
- dump_page_badflags(page, reason, 0);
+ __dump_page(page, reason);
+ dump_page_owner(page);
}
EXPORT_SYMBOL(dump_page);
#ifdef CONFIG_DEBUG_VM
-static const struct trace_print_flags vmaflags_names[] = {
- {VM_READ, "read" },
- {VM_WRITE, "write" },
- {VM_EXEC, "exec" },
- {VM_SHARED, "shared" },
- {VM_MAYREAD, "mayread" },
- {VM_MAYWRITE, "maywrite" },
- {VM_MAYEXEC, "mayexec" },
- {VM_MAYSHARE, "mayshare" },
- {VM_GROWSDOWN, "growsdown" },
- {VM_PFNMAP, "pfnmap" },
- {VM_DENYWRITE, "denywrite" },
- {VM_LOCKONFAULT, "lockonfault" },
- {VM_LOCKED, "locked" },
- {VM_IO, "io" },
- {VM_SEQ_READ, "seqread" },
- {VM_RAND_READ, "randread" },
- {VM_DONTCOPY, "dontcopy" },
- {VM_DONTEXPAND, "dontexpand" },
- {VM_ACCOUNT, "account" },
- {VM_NORESERVE, "noreserve" },
- {VM_HUGETLB, "hugetlb" },
-#if defined(CONFIG_X86)
- {VM_PAT, "pat" },
-#elif defined(CONFIG_PPC)
- {VM_SAO, "sao" },
-#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
- {VM_GROWSUP, "growsup" },
-#elif !defined(CONFIG_MMU)
- {VM_MAPPED_COPY, "mappedcopy" },
-#else
- {VM_ARCH_1, "arch_1" },
-#endif
- {VM_DONTDUMP, "dontdump" },
-#ifdef CONFIG_MEM_SOFT_DIRTY
- {VM_SOFTDIRTY, "softdirty" },
-#endif
- {VM_MIXEDMAP, "mixedmap" },
- {VM_HUGEPAGE, "hugepage" },
- {VM_NOHUGEPAGE, "nohugepage" },
- {VM_MERGEABLE, "mergeable" },
-};
-
void dump_vma(const struct vm_area_struct *vma)
{
pr_emerg("vma %p start %p end %p\n"
"next %p prev %p mm %p\n"
"prot %lx anon_vma %p vm_ops %p\n"
- "pgoff %lx file %p private_data %p\n",
+ "pgoff %lx file %p private_data %p\n"
+ "flags: %#lx(%pGv)\n",
vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
vma->vm_prev, vma->vm_mm,
(unsigned long)pgprot_val(vma->vm_page_prot),
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
- vma->vm_file, vma->vm_private_data);
- dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
+ vma->vm_file, vma->vm_private_data,
+ vma->vm_flags, &vma->vm_flags);
}
EXPORT_SYMBOL(dump_vma);
@@ -196,7 +116,7 @@ void dump_mm(const struct mm_struct *mm)
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
"tlb_flush_pending %d\n"
#endif
- "%s", /* This is here to hold the comma */
+ "def_flags: %#lx(%pGv)\n",
mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
#ifdef CONFIG_MMU
@@ -230,11 +150,8 @@ void dump_mm(const struct mm_struct *mm)
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
mm->tlb_flush_pending,
#endif
- "" /* This is here to not have a comma! */
- );
-
- dump_flags(mm->def_flags, vmaflags_names,
- ARRAY_SIZE(vmaflags_names));
+ mm->def_flags, &mm->def_flags
+ );
}
#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/failslab.c b/mm/failslab.c
index 79171b4a5826..b0fac98cd938 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,5 +1,7 @@
#include <linux/fault-inject.h>
#include <linux/slab.h>
+#include <linux/mm.h>
+#include "slab.h"
static struct {
struct fault_attr attr;
@@ -11,18 +13,22 @@ static struct {
.cache_filter = false,
};
-bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
+bool should_failslab(struct kmem_cache *s, gfp_t gfpflags)
{
+ /* No fault-injection for bootstrap cache */
+ if (unlikely(s == kmem_cache))
+ return false;
+
if (gfpflags & __GFP_NOFAIL)
return false;
if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM))
return false;
- if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
+ if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))
return false;
- return should_fail(&failslab.attr, size);
+ return should_fail(&failslab.attr, s->object_size);
}
static int __init setup_failslab(char *str)
diff --git a/mm/filemap.c b/mm/filemap.c
index 0720c9d19365..6414803972f7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,7 +101,7 @@
* ->tree_lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat)
+ * ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
@@ -176,11 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
/*
* Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold the mapping's tree_lock and
- * mem_cgroup_begin_page_stat().
+ * is safe. The caller must hold the mapping's tree_lock.
*/
-void __delete_from_page_cache(struct page *page, void *shadow,
- struct mem_cgroup *memcg)
+void __delete_from_page_cache(struct page *page, void *shadow)
{
struct address_space *mapping = page->mapping;
@@ -216,8 +214,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
* anyway will be cleared before returning page into buddy allocator.
*/
if (WARN_ON_ONCE(PageDirty(page)))
- account_page_cleaned(page, mapping, memcg,
- inode_to_wb(mapping->host));
+ account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
}
/**
@@ -231,7 +228,6 @@ void __delete_from_page_cache(struct page *page, void *shadow,
void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
- struct mem_cgroup *memcg;
unsigned long flags;
void (*freepage)(struct page *);
@@ -240,11 +236,9 @@ void delete_from_page_cache(struct page *page)
freepage = mapping->a_ops->freepage;
- memcg = mem_cgroup_begin_page_stat(page);
spin_lock_irqsave(&mapping->tree_lock, flags);
- __delete_from_page_cache(page, NULL, memcg);
+ __delete_from_page_cache(page, NULL);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
if (freepage)
freepage(page);
@@ -532,7 +526,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
if (!error) {
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *);
- struct mem_cgroup *memcg;
unsigned long flags;
pgoff_t offset = old->index;
@@ -542,9 +535,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
new->mapping = mapping;
new->index = offset;
- memcg = mem_cgroup_begin_page_stat(old);
spin_lock_irqsave(&mapping->tree_lock, flags);
- __delete_from_page_cache(old, NULL, memcg);
+ __delete_from_page_cache(old, NULL);
error = radix_tree_insert(&mapping->page_tree, offset, new);
BUG_ON(error);
mapping->nrpages++;
@@ -557,8 +549,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
if (PageSwapBacked(new))
__inc_zone_page_state(new, NR_SHMEM);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
- mem_cgroup_replace_page(old, new);
+ mem_cgroup_migrate(old, new);
radix_tree_preload_end();
if (freepage)
freepage(old);
@@ -576,7 +567,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
void **slot;
int error;
- error = __radix_tree_create(&mapping->page_tree, page->index,
+ error = __radix_tree_create(&mapping->page_tree, page->index, 0,
&node, &slot);
if (error)
return error;
@@ -1245,7 +1236,6 @@ unsigned find_get_entries(struct address_space *mapping,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
struct page *page;
repeat:
@@ -1253,8 +1243,10 @@ repeat:
if (unlikely(!page))
continue;
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
/*
* A shadow entry of a recently evicted page, a swap
* entry from shmem/tmpfs or a DAX entry. Return it
@@ -1307,7 +1299,6 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
struct page *page;
repeat:
@@ -1317,13 +1308,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- WARN_ON(iter.index);
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page,
@@ -1374,7 +1360,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
struct page *page;
repeat:
@@ -1385,12 +1370,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page,
@@ -1450,7 +1431,6 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, *index, tag) {
struct page *page;
@@ -1461,12 +1441,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page.
@@ -1529,7 +1505,6 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, start, tag) {
struct page *page;
@@ -1539,12 +1514,8 @@ repeat:
continue;
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
@@ -1649,6 +1620,15 @@ find_page:
index, last_index - index);
}
if (!PageUptodate(page)) {
+ /*
+ * See comment in do_read_cache_page on why
+ * wait_on_page_locked is used to avoid unnecessarily
+ * serialisations and why it's safe.
+ */
+ wait_on_page_locked_killable(page);
+ if (PageUptodate(page))
+ goto page_ok;
+
if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
!mapping->a_ops->is_partially_uptodate)
goto page_not_up_to_date;
@@ -1890,6 +1870,7 @@ EXPORT_SYMBOL(generic_file_read_iter);
* page_cache_read - adds requested page to the page cache if not already there
* @file: file to read
* @offset: page index
+ * @gfp_mask: memory allocation flags
*
* This adds the requested page to the page cache if it isn't already there,
* and schedules an I/O to read in its contents from disk.
@@ -2151,10 +2132,11 @@ repeat:
if (unlikely(!page))
goto next;
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- break;
- else
- goto next;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ goto next;
}
if (!page_cache_get_speculative(page))
@@ -2283,7 +2265,7 @@ static struct page *wait_on_page_read(struct page *page)
return page;
}
-static struct page *__read_cache_page(struct address_space *mapping,
+static struct page *do_read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *, struct page *),
void *data,
@@ -2305,53 +2287,74 @@ repeat:
/* Presumably ENOMEM for radix tree node */
return ERR_PTR(err);
}
+
+filler:
err = filler(data, page);
if (err < 0) {
page_cache_release(page);
- page = ERR_PTR(err);
- } else {
- page = wait_on_page_read(page);
+ return ERR_PTR(err);
}
- }
- return page;
-}
-static struct page *do_read_cache_page(struct address_space *mapping,
- pgoff_t index,
- int (*filler)(void *, struct page *),
- void *data,
- gfp_t gfp)
-
-{
- struct page *page;
- int err;
+ page = wait_on_page_read(page);
+ if (IS_ERR(page))
+ return page;
+ goto out;
+ }
+ if (PageUptodate(page))
+ goto out;
-retry:
- page = __read_cache_page(mapping, index, filler, data, gfp);
- if (IS_ERR(page))
- return page;
+ /*
+ * Page is not up to date and may be locked due one of the following
+ * case a: Page is being filled and the page lock is held
+ * case b: Read/write error clearing the page uptodate status
+ * case c: Truncation in progress (page locked)
+ * case d: Reclaim in progress
+ *
+ * Case a, the page will be up to date when the page is unlocked.
+ * There is no need to serialise on the page lock here as the page
+ * is pinned so the lock gives no additional protection. Even if the
+ * the page is truncated, the data is still valid if PageUptodate as
+ * it's a race vs truncate race.
+ * Case b, the page will not be up to date
+ * Case c, the page may be truncated but in itself, the data may still
+ * be valid after IO completes as it's a read vs truncate race. The
+ * operation must restart if the page is not uptodate on unlock but
+ * otherwise serialising on page lock to stabilise the mapping gives
+ * no additional guarantees to the caller as the page lock is
+ * released before return.
+ * Case d, similar to truncation. If reclaim holds the page lock, it
+ * will be a race with remove_mapping that determines if the mapping
+ * is valid on unlock but otherwise the data is valid and there is
+ * no need to serialise with page lock.
+ *
+ * As the page lock gives no additional guarantee, we optimistically
+ * wait on the page to be unlocked and check if it's up to date and
+ * use the page if it is. Otherwise, the page lock is required to
+ * distinguish between the different cases. The motivation is that we
+ * avoid spurious serialisations and wakeups when multiple processes
+ * wait on the same page for IO to complete.
+ */
+ wait_on_page_locked(page);
if (PageUptodate(page))
goto out;
+ /* Distinguish between all the cases under the safety of the lock */
lock_page(page);
+
+ /* Case c or d, restart the operation */
if (!page->mapping) {
unlock_page(page);
page_cache_release(page);
- goto retry;
+ goto repeat;
}
+
+ /* Someone else locked and filled the page in a very small window */
if (PageUptodate(page)) {
unlock_page(page);
goto out;
}
- err = filler(data, page);
- if (err < 0) {
- page_cache_release(page);
- return ERR_PTR(err);
- } else {
- page = wait_on_page_read(page);
- if (IS_ERR(page))
- return page;
- }
+ goto filler;
+
out:
mark_page_accessed(page);
return page;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index aea8f7a42df9..3cda32c5bdb3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -30,6 +30,7 @@
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
+#include <linux/swapops.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -57,7 +58,8 @@ enum scan_result {
SCAN_SWAP_CACHE_PAGE,
SCAN_DEL_PAGE_LRU,
SCAN_ALLOC_HUGE_PAGE_FAIL,
- SCAN_CGROUP_CHARGE_FAIL
+ SCAN_CGROUP_CHARGE_FAIL,
+ SCAN_EXCEED_SWAP_PTE
};
#define CREATE_TRACE_POINTS
@@ -99,6 +101,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
* fault.
*/
static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+static unsigned int khugepaged_max_ptes_swap __read_mostly = HPAGE_PMD_NR/8;
static int khugepaged(void *none);
static int khugepaged_slab_init(void);
@@ -586,6 +589,33 @@ static struct kobj_attribute khugepaged_max_ptes_none_attr =
__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
khugepaged_max_ptes_none_store);
+static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
+}
+
+static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_swap;
+
+ err = kstrtoul(buf, 10, &max_ptes_swap);
+ if (err || max_ptes_swap > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_swap = max_ptes_swap;
+
+ return count;
+}
+
+static struct kobj_attribute khugepaged_max_ptes_swap_attr =
+ __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
+ khugepaged_max_ptes_swap_store);
+
static struct attribute *khugepaged_attr[] = {
&khugepaged_defrag_attr.attr,
&khugepaged_max_ptes_none_attr.attr,
@@ -594,6 +624,7 @@ static struct attribute *khugepaged_attr[] = {
&full_scans_attr.attr,
&scan_sleep_millisecs_attr.attr,
&alloc_sleep_millisecs_attr.attr,
+ &khugepaged_max_ptes_swap_attr.attr,
NULL,
};
@@ -2313,6 +2344,44 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
return true;
}
+/*
+ * Bring missing pages in from swap, to complete THP collapse.
+ * Only done if khugepaged_scan_pmd believes it is worthwhile.
+ *
+ * Called and returns without pte mapped or spinlocks held,
+ * but with mmap_sem held to protect against vma changes.
+ */
+
+static void __collapse_huge_page_swapin(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd)
+{
+ unsigned long _address;
+ pte_t *pte, pteval;
+ int swapped_in = 0, ret = 0;
+
+ pte = pte_offset_map(pmd, address);
+ for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE;
+ pte++, _address += PAGE_SIZE) {
+ pteval = *pte;
+ if (!is_swap_pte(pteval))
+ continue;
+ swapped_in++;
+ ret = do_swap_page(mm, vma, _address, pte, pmd,
+ FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
+ pteval);
+ if (ret & VM_FAULT_ERROR) {
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
+ return;
+ }
+ /* pte is unmapped now, we need to map it */
+ pte = pte_offset_map(pmd, _address);
+ }
+ pte--;
+ pte_unmap(pte);
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
+}
+
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
struct page **hpage,
@@ -2381,6 +2450,8 @@ static void collapse_huge_page(struct mm_struct *mm,
goto out;
}
+ __collapse_huge_page_swapin(mm, vma, address, pmd);
+
anon_vma_lock_write(vma->anon_vma);
pte = pte_offset_map(pmd, address);
@@ -2457,9 +2528,6 @@ static void collapse_huge_page(struct mm_struct *mm,
result = SCAN_SUCCEED;
out_up_write:
up_write(&mm->mmap_sem);
- trace_mm_collapse_huge_page(mm, isolated, result);
- return;
-
out_nolock:
trace_mm_collapse_huge_page(mm, isolated, result);
return;
@@ -2479,7 +2547,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
struct page *page = NULL;
unsigned long _address;
spinlock_t *ptl;
- int node = NUMA_NO_NODE;
+ int node = NUMA_NO_NODE, unmapped = 0;
bool writable = false, referenced = false;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
@@ -2495,6 +2563,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
+ if (is_swap_pte(pteval)) {
+ if (++unmapped <= khugepaged_max_ptes_swap) {
+ continue;
+ } else {
+ result = SCAN_EXCEED_SWAP_PTE;
+ goto out_unmap;
+ }
+ }
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
if (!userfaultfd_armed(vma) &&
++none_or_zero <= khugepaged_max_ptes_none) {
@@ -2581,7 +2657,7 @@ out_unmap:
}
out:
trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
- none_or_zero, result);
+ none_or_zero, result, unmapped);
return ret;
}
@@ -3218,28 +3294,26 @@ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
}
}
-static int __split_huge_page_tail(struct page *head, int tail,
+static void __split_huge_page_tail(struct page *head, int tail,
struct lruvec *lruvec, struct list_head *list)
{
- int mapcount;
struct page *page_tail = head + tail;
- mapcount = atomic_read(&page_tail->_mapcount) + 1;
+ VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
/*
* tail_page->_count is zero and not changing from under us. But
* get_page_unless_zero() may be running from under us on the
- * tail_page. If we used atomic_set() below instead of atomic_add(), we
+ * tail_page. If we used atomic_set() below instead of atomic_inc(), we
* would then run atomic_set() concurrently with
* get_page_unless_zero(), and atomic_set() is implemented in C not
* using locked ops. spin_unlock on x86 sometime uses locked ops
* because of PPro errata 66, 92, so unless somebody can guarantee
* atomic_set() here would be safe on all archs (and not only on x86),
- * it's safer to use atomic_add().
+ * it's safer to use atomic_inc().
*/
- atomic_add(mapcount + 1, &page_tail->_count);
-
+ atomic_inc(&page_tail->_count);
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (head->flags &
@@ -3273,8 +3347,6 @@ static int __split_huge_page_tail(struct page *head, int tail,
page_tail->index = head->index + tail;
page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
lru_add_page_tail(head, page_tail, lruvec, list);
-
- return mapcount;
}
static void __split_huge_page(struct page *page, struct list_head *list)
@@ -3282,7 +3354,7 @@ static void __split_huge_page(struct page *page, struct list_head *list)
struct page *head = compound_head(page);
struct zone *zone = page_zone(head);
struct lruvec *lruvec;
- int i, tail_mapcount;
+ int i;
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irq(&zone->lru_lock);
@@ -3291,10 +3363,8 @@ static void __split_huge_page(struct page *page, struct list_head *list)
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
- tail_mapcount = 0;
for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
- tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
- atomic_sub(tail_mapcount, &head->_count);
+ __split_huge_page_tail(head, i, lruvec, list);
ClearPageCompound(head);
spin_unlock_irq(&zone->lru_lock);
@@ -3459,6 +3529,7 @@ void deferred_split_huge_page(struct page *page)
spin_lock_irqsave(&pgdata->split_queue_lock, flags);
if (list_empty(page_deferred_list(page))) {
+ count_vm_event(THP_DEFERRED_SPLIT_PAGE);
list_add_tail(page_deferred_list(page), &pgdata->split_queue);
pgdata->split_queue_len++;
}
diff --git a/mm/internal.h b/mm/internal.h
index a38a21ebddb4..f9153e580d81 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -14,6 +14,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
+#include <linux/tracepoint-defs.h>
/*
* The set of flags that only affect watermark checking and reclaim
@@ -34,9 +35,18 @@
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
+extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ unsigned int flags, pte_t orig_pte);
+
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
+void unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details);
+
static inline void set_page_count(struct page *page, int v)
{
atomic_set(&page->_count, v);
@@ -131,13 +141,22 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
return page_idx ^ (1 << order);
}
+extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone);
+
+static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone)
+{
+ if (zone->contiguous)
+ return pfn_to_page(start_pfn);
+
+ return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
+}
+
extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
unsigned int order);
extern void prep_compound_page(struct page *page, unsigned int order);
-#ifdef CONFIG_MEMORY_FAILURE
-extern bool is_free_buddy_page(struct page *page);
-#endif
extern int user_min_free_kbytes;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -162,6 +181,7 @@ struct compact_control {
unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
+ bool direct_compaction; /* False from kcompactd or /proc/... */
int order; /* order a direct compactor needs */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
const int alloc_flags; /* alloc flags of a direct compactor */
@@ -466,4 +486,9 @@ static inline void try_to_unmap_flush_dirty(void)
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
+extern const struct trace_print_flags pageflag_names[];
+extern const struct trace_print_flags vmaflag_names[];
+extern const struct trace_print_flags gfpflag_names[];
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index a61460d9f5b0..131daadf40e4 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,5 +1,6 @@
KASAN_SANITIZE := n
UBSAN_SANITIZE_kasan.o := n
+KCOV_INSTRUMENT := n
CFLAGS_REMOVE_kasan.o = -pg
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index cab58bb592d8..6f4f424037c0 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -60,6 +60,9 @@ void kmemcheck_free_shadow(struct page *page, int order)
void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
size_t size)
{
+ if (unlikely(!object)) /* Skip object if allocation failed */
+ return;
+
/*
* Has already been memset(), which initializes the shadow for us
* as well.
diff --git a/mm/ksm.c b/mm/ksm.c
index ca6d2a06a615..823d78b2a055 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -126,9 +126,12 @@ struct ksm_scan {
* struct stable_node - node of the stable rbtree
* @node: rb node of this ksm page in the stable tree
* @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
+ * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
* @list: linked into migrate_nodes, pending placement in the proper node tree
* @hlist: hlist head of rmap_items using this ksm page
* @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
+ * @chain_prune_time: time of the last full garbage collection
+ * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
* @nid: NUMA node id of stable tree in which linked (may not match kpfn)
*/
struct stable_node {
@@ -136,11 +139,24 @@ struct stable_node {
struct rb_node node; /* when node of stable tree */
struct { /* when listed for migration */
struct list_head *head;
- struct list_head list;
+ struct {
+ struct hlist_node hlist_dup;
+ struct list_head list;
+ };
};
};
struct hlist_head hlist;
- unsigned long kpfn;
+ union {
+ unsigned long kpfn;
+ unsigned long chain_prune_time;
+ };
+ /*
+ * STABLE_NODE_CHAIN can be any negative number in
+ * rmap_hlist_len negative range, but better not -1 to be able
+ * to reliably detect underflows.
+ */
+#define STABLE_NODE_CHAIN -1024
+ int rmap_hlist_len;
#ifdef CONFIG_NUMA
int nid;
#endif
@@ -190,6 +206,7 @@ static struct rb_root *root_unstable_tree = one_unstable_tree;
/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
+#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -217,6 +234,18 @@ static unsigned long ksm_pages_unshared;
/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;
+/* The number of stable_node chains */
+static unsigned long ksm_stable_node_chains;
+
+/* The number of stable_node dups linked to the stable_node chains */
+static unsigned long ksm_stable_node_dups;
+
+/* Delay in pruning stale stable_node_dups in the stable_node_chains */
+static int ksm_stable_node_chains_prune_millisecs = 2000;
+
+/* Maximum number of page slots sharing a stable node */
+static int ksm_max_page_sharing = 256;
+
/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;
@@ -279,6 +308,44 @@ static void __init ksm_slab_free(void)
mm_slot_cache = NULL;
}
+static __always_inline bool is_stable_node_chain(struct stable_node *chain)
+{
+ return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
+}
+
+static __always_inline bool is_stable_node_dup(struct stable_node *dup)
+{
+ return dup->head == STABLE_NODE_DUP_HEAD;
+}
+
+static inline void stable_node_chain_add_dup(struct stable_node *dup,
+ struct stable_node *chain)
+{
+ VM_BUG_ON(is_stable_node_dup(dup));
+ dup->head = STABLE_NODE_DUP_HEAD;
+ VM_BUG_ON(!is_stable_node_chain(chain));
+ hlist_add_head(&dup->hlist_dup, &chain->hlist);
+ ksm_stable_node_dups++;
+}
+
+static inline void __stable_node_dup_del(struct stable_node *dup)
+{
+ hlist_del(&dup->hlist_dup);
+ ksm_stable_node_dups--;
+}
+
+static inline void stable_node_dup_del(struct stable_node *dup)
+{
+ VM_BUG_ON(is_stable_node_chain(dup));
+ if (is_stable_node_dup(dup))
+ __stable_node_dup_del(dup);
+ else
+ rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
+#ifdef CONFIG_DEBUG_VM
+ dup->head = NULL;
+#endif
+}
+
static inline struct rmap_item *alloc_rmap_item(void)
{
struct rmap_item *rmap_item;
@@ -303,6 +370,8 @@ static inline struct stable_node *alloc_stable_node(void)
static inline void free_stable_node(struct stable_node *stable_node)
{
+ VM_BUG_ON(stable_node->rmap_hlist_len &&
+ !is_stable_node_chain(stable_node));
kmem_cache_free(stable_node_cache, stable_node);
}
@@ -479,25 +548,82 @@ static inline int get_kpfn_nid(unsigned long kpfn)
return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}
+static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
+ struct rb_root *root)
+{
+ struct stable_node *chain = alloc_stable_node();
+ VM_BUG_ON(is_stable_node_chain(dup));
+ if (likely(chain)) {
+ INIT_HLIST_HEAD(&chain->hlist);
+ chain->chain_prune_time = jiffies;
+ chain->rmap_hlist_len = STABLE_NODE_CHAIN;
+#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
+ chain->nid = -1; /* debug */
+#endif
+ ksm_stable_node_chains++;
+
+ /*
+ * Put the stable node chain in the first dimension of
+ * the stable tree and at the same time remove the old
+ * stable node.
+ */
+ rb_replace_node(&dup->node, &chain->node, root);
+
+ /*
+ * Move the old stable node to the second dimension
+ * queued in the hlist_dup. The invariant is that all
+ * dup stable_nodes in the chain->hlist point to pages
+ * that are wrprotected and have the exact same
+ * content.
+ */
+ stable_node_chain_add_dup(dup, chain);
+ }
+ return chain;
+}
+
+static inline void free_stable_node_chain(struct stable_node *chain,
+ struct rb_root *root)
+{
+ rb_erase(&chain->node, root);
+ free_stable_node(chain);
+ ksm_stable_node_chains--;
+}
+
static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
struct rmap_item *rmap_item;
+ /* check it's not STABLE_NODE_CHAIN or negative */
+ BUG_ON(stable_node->rmap_hlist_len < 0);
+
hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
if (rmap_item->hlist.next)
ksm_pages_sharing--;
else
ksm_pages_shared--;
+ VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
+ stable_node->rmap_hlist_len--;
put_anon_vma(rmap_item->anon_vma);
rmap_item->address &= PAGE_MASK;
cond_resched();
}
+ /*
+ * We need the second aligned pointer of the migrate_nodes
+ * list_head to stay clear from the rb_parent_color union
+ * (aligned and different than any node) and also different
+ * from &migrate_nodes. This will verify that future list.h changes
+ * don't break STABLE_NODE_DUP_HEAD.
+ */
+#if GCC_VERSION >= 40903 /* only recent gcc can handle it */
+ BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
+ BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
+#endif
+
if (stable_node->head == &migrate_nodes)
list_del(&stable_node->list);
else
- rb_erase(&stable_node->node,
- root_stable_tree + NUMA(stable_node->nid));
+ stable_node_dup_del(stable_node);
free_stable_node(stable_node);
}
@@ -616,6 +742,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
ksm_pages_sharing--;
else
ksm_pages_shared--;
+ VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
+ stable_node->rmap_hlist_len--;
put_anon_vma(rmap_item->anon_vma);
rmap_item->address &= PAGE_MASK;
@@ -724,6 +852,31 @@ static int remove_stable_node(struct stable_node *stable_node)
return err;
}
+static int remove_stable_node_chain(struct stable_node *stable_node,
+ struct rb_root *root)
+{
+ struct stable_node *dup;
+ struct hlist_node *hlist_safe;
+
+ if (!is_stable_node_chain(stable_node)) {
+ VM_BUG_ON(is_stable_node_dup(stable_node));
+ if (remove_stable_node(stable_node))
+ return true;
+ else
+ return false;
+ }
+
+ hlist_for_each_entry_safe(dup, hlist_safe,
+ &stable_node->hlist, hlist_dup) {
+ VM_BUG_ON(!is_stable_node_dup(dup));
+ if (remove_stable_node(dup))
+ return true;
+ }
+ BUG_ON(!hlist_empty(&stable_node->hlist));
+ free_stable_node_chain(stable_node, root);
+ return false;
+}
+
static int remove_all_stable_nodes(void)
{
struct stable_node *stable_node, *next;
@@ -734,7 +887,8 @@ static int remove_all_stable_nodes(void)
while (root_stable_tree[nid].rb_node) {
stable_node = rb_entry(root_stable_tree[nid].rb_node,
struct stable_node, node);
- if (remove_stable_node(stable_node)) {
+ if (remove_stable_node_chain(stable_node,
+ root_stable_tree + nid)) {
err = -EBUSY;
break; /* proceed to next nid */
}
@@ -1104,6 +1258,163 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
return err ? NULL : page;
}
+static __always_inline
+bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
+{
+ VM_BUG_ON(stable_node->rmap_hlist_len < 0);
+ /*
+ * Check that at least one mapping still exists, otherwise
+ * there's no much point to merge and share with this
+ * stable_node, as the underlying tree_page of the other
+ * sharer is going to be freed soon.
+ */
+ return stable_node->rmap_hlist_len &&
+ stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
+}
+
+static __always_inline
+bool is_page_sharing_candidate(struct stable_node *stable_node)
+{
+ return __is_page_sharing_candidate(stable_node, 0);
+}
+
+static struct stable_node *stable_node_dup(struct stable_node *stable_node,
+ struct page **tree_page,
+ struct rb_root *root,
+ bool prune_stale_stable_nodes)
+{
+ struct stable_node *dup, *found = NULL;
+ struct hlist_node *hlist_safe;
+ struct page *_tree_page;
+ int nr = 0;
+ int found_rmap_hlist_len;
+
+ if (!prune_stale_stable_nodes ||
+ time_before(jiffies, stable_node->chain_prune_time +
+ msecs_to_jiffies(
+ ksm_stable_node_chains_prune_millisecs)))
+ prune_stale_stable_nodes = false;
+ else
+ stable_node->chain_prune_time = jiffies;
+
+ hlist_for_each_entry_safe(dup, hlist_safe,
+ &stable_node->hlist, hlist_dup) {
+ cond_resched();
+ /*
+ * We must walk all stable_node_dup to prune the stale
+ * stable nodes during lookup.
+ *
+ * get_ksm_page can drop the nodes from the
+ * stable_node->hlist if they point to freed pages
+ * (that's why we do a _safe walk). The "dup"
+ * stable_node parameter itself will be freed from
+ * under us if it returns NULL.
+ */
+ _tree_page = get_ksm_page(dup, false);
+ if (!_tree_page)
+ continue;
+ nr += 1;
+ if (is_page_sharing_candidate(dup)) {
+ if (!found ||
+ dup->rmap_hlist_len > found_rmap_hlist_len) {
+ if (found)
+ put_page(*tree_page);
+ found = dup;
+ found_rmap_hlist_len = found->rmap_hlist_len;
+ *tree_page = _tree_page;
+
+ if (!prune_stale_stable_nodes)
+ break;
+ /* skip put_page */
+ continue;
+ }
+ }
+ put_page(_tree_page);
+ }
+
+ /*
+ * nr is relevant only if prune_stale_stable_nodes is true,
+ * otherwise we may break the loop at nr == 1 even if there
+ * are multiple entries.
+ */
+ if (prune_stale_stable_nodes && found) {
+ if (nr == 1) {
+ /*
+ * If there's not just one entry it would
+ * corrupt memory, better BUG_ON. In KSM
+ * context with no lock held it's not even
+ * fatal.
+ */
+ BUG_ON(stable_node->hlist.first->next);
+
+ /*
+ * There's just one entry and it is below the
+ * deduplication limit so drop the chain.
+ */
+ rb_replace_node(&stable_node->node, &found->node,
+ root);
+ free_stable_node(stable_node);
+ ksm_stable_node_chains--;
+ ksm_stable_node_dups--;
+ } else if (__is_page_sharing_candidate(found, 1)) {
+ /*
+ * Refile our candidate at the head
+ * after the prune if our candidate
+ * can accept one more future sharing
+ * in addition to the one underway.
+ */
+ hlist_del(&found->hlist_dup);
+ hlist_add_head(&found->hlist_dup,
+ &stable_node->hlist);
+ }
+ }
+
+ return found;
+}
+
+static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
+ struct rb_root *root)
+{
+ if (!is_stable_node_chain(stable_node))
+ return stable_node;
+ if (hlist_empty(&stable_node->hlist)) {
+ free_stable_node_chain(stable_node, root);
+ return NULL;
+ }
+ return hlist_entry(stable_node->hlist.first,
+ typeof(*stable_node), hlist_dup);
+}
+
+static struct stable_node *__stable_node_chain(struct stable_node *stable_node,
+ struct page **tree_page,
+ struct rb_root *root,
+ bool prune_stale_stable_nodes)
+{
+ if (!is_stable_node_chain(stable_node)) {
+ if (is_page_sharing_candidate(stable_node)) {
+ *tree_page = get_ksm_page(stable_node, false);
+ return stable_node;
+ }
+ return NULL;
+ }
+ return stable_node_dup(stable_node, tree_page, root,
+ prune_stale_stable_nodes);
+}
+
+static __always_inline struct stable_node *chain_prune(struct stable_node *s_n,
+ struct page **t_p,
+ struct rb_root *root)
+{
+ return __stable_node_chain(s_n, t_p, root, true);
+}
+
+static __always_inline struct stable_node *chain(struct stable_node *s_n,
+ struct page **t_p,
+ struct rb_root *root)
+{
+ return __stable_node_chain(s_n, t_p, root, false);
+}
+
/*
* stable_tree_search - search for page inside the stable tree
*
@@ -1119,7 +1430,7 @@ static struct page *stable_tree_search(struct page *page)
struct rb_root *root;
struct rb_node **new;
struct rb_node *parent;
- struct stable_node *stable_node;
+ struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
struct stable_node *page_node;
page_node = page_stable_node(page);
@@ -1141,7 +1452,32 @@ again:
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
- tree_page = get_ksm_page(stable_node, false);
+ stable_node_any = NULL;
+ stable_node_dup = chain_prune(stable_node, &tree_page, root);
+ if (!stable_node_dup) {
+ /*
+ * Either all stable_node dups were full in
+ * this stable_node chain, or this chain was
+ * empty and should be rb_erased.
+ */
+ stable_node_any = stable_node_dup_any(stable_node,
+ root);
+ if (!stable_node_any) {
+ /* rb_erase just run */
+ goto again;
+ }
+ /*
+ * Take any of the stable_node dups page of
+ * this stable_node chain to let the tree walk
+ * continue. All KSM pages belonging to the
+ * stable_node dups in a stable_node chain
+ * have the same content and they're
+ * wrprotected at all times. Any will work
+ * fine to continue the walk.
+ */
+ tree_page = get_ksm_page(stable_node_any, false);
+ }
+ VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
if (!tree_page) {
/*
* If we walked over a stale stable_node,
@@ -1164,6 +1500,34 @@ again:
else if (ret > 0)
new = &parent->rb_right;
else {
+ if (page_node) {
+ VM_BUG_ON(page_node->head != &migrate_nodes);
+ /*
+ * Test if the migrated page should be merged
+ * into a stable node dup. If the mapcount is
+ * 1 we can migrate it with another KSM page
+ * without adding it to the chain.
+ */
+ if (page_mapcount(page) > 1)
+ goto chain_append;
+ }
+
+ if (!stable_node_dup) {
+ /*
+ * If the stable_node is a chain and
+ * we got a payload match in memcmp
+ * but we cannot merge the scanned
+ * page in any of the existing
+ * stable_node dups because they're
+ * all full, we need to wait the
+ * scanned page to find itself a match
+ * in the unstable tree to create a
+ * brand new KSM page to add later to
+ * the dups of this stable_node.
+ */
+ return NULL;
+ }
+
/*
* Lock and unlock the stable_node's page (which
* might already have been migrated) so that page
@@ -1171,23 +1535,21 @@ again:
* It would be more elegant to return stable_node
* than kpage, but that involves more changes.
*/
- tree_page = get_ksm_page(stable_node, true);
- if (tree_page) {
- unlock_page(tree_page);
- if (get_kpfn_nid(stable_node->kpfn) !=
- NUMA(stable_node->nid)) {
- put_page(tree_page);
- goto replace;
- }
- return tree_page;
- }
- /*
- * There is now a place for page_node, but the tree may
- * have been rebalanced, so re-evaluate parent and new.
- */
- if (page_node)
+ tree_page = get_ksm_page(stable_node_dup, true);
+ if (unlikely(!tree_page))
+ /*
+ * The tree may have been rebalanced,
+ * so re-evaluate parent and new.
+ */
goto again;
- return NULL;
+ unlock_page(tree_page);
+
+ if (get_kpfn_nid(stable_node_dup->kpfn) !=
+ NUMA(stable_node_dup->nid)) {
+ put_page(tree_page);
+ goto replace;
+ }
+ return tree_page;
}
}
@@ -1198,22 +1560,72 @@ again:
DO_NUMA(page_node->nid = nid);
rb_link_node(&page_node->node, parent, new);
rb_insert_color(&page_node->node, root);
- get_page(page);
- return page;
+out:
+ if (is_page_sharing_candidate(page_node)) {
+ get_page(page);
+ return page;
+ } else
+ return NULL;
replace:
- if (page_node) {
- list_del(&page_node->list);
- DO_NUMA(page_node->nid = nid);
- rb_replace_node(&stable_node->node, &page_node->node, root);
- get_page(page);
+ if (stable_node_dup == stable_node) {
+ /* there is no chain */
+ if (page_node) {
+ VM_BUG_ON(page_node->head != &migrate_nodes);
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ rb_replace_node(&stable_node->node, &page_node->node,
+ root);
+ if (is_page_sharing_candidate(page_node))
+ get_page(page);
+ else
+ page = NULL;
+ } else {
+ rb_erase(&stable_node->node, root);
+ page = NULL;
+ }
} else {
- rb_erase(&stable_node->node, root);
- page = NULL;
+ VM_BUG_ON(!is_stable_node_chain(stable_node));
+ __stable_node_dup_del(stable_node_dup);
+ if (page_node) {
+ VM_BUG_ON(page_node->head != &migrate_nodes);
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ stable_node_chain_add_dup(page_node, stable_node);
+ if (is_page_sharing_candidate(page_node))
+ get_page(page);
+ else
+ page = NULL;
+ } else {
+ page = NULL;
+ }
}
- stable_node->head = &migrate_nodes;
- list_add(&stable_node->list, stable_node->head);
+ stable_node_dup->head = &migrate_nodes;
+ list_add(&stable_node_dup->list, stable_node_dup->head);
return page;
+
+chain_append:
+ /* stable_node_dup could be null if it reached the limit */
+ if (!stable_node_dup)
+ stable_node_dup = stable_node_any;
+ if (stable_node_dup == stable_node) {
+ /* chain is missing so create it */
+ stable_node = alloc_stable_node_chain(stable_node_dup,
+ root);
+ if (!stable_node)
+ return NULL;
+ }
+ /*
+ * Add this stable_node dup that was
+ * migrated to the stable_node chain
+ * of the current nid for this page
+ * content.
+ */
+ VM_BUG_ON(page_node->head != &migrate_nodes);
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ stable_node_chain_add_dup(page_node, stable_node);
+ goto out;
}
/*
@@ -1230,7 +1642,8 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
struct rb_root *root;
struct rb_node **new;
struct rb_node *parent;
- struct stable_node *stable_node;
+ struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
+ bool need_chain = false;
kpfn = page_to_pfn(kpage);
nid = get_kpfn_nid(kpfn);
@@ -1245,7 +1658,32 @@ again:
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
- tree_page = get_ksm_page(stable_node, false);
+ stable_node_any = NULL;
+ stable_node_dup = chain(stable_node, &tree_page, root);
+ if (!stable_node_dup) {
+ /*
+ * Either all stable_node dups were full in
+ * this stable_node chain, or this chain was
+ * empty and should be rb_erased.
+ */
+ stable_node_any = stable_node_dup_any(stable_node,
+ root);
+ if (!stable_node_any) {
+ /* rb_erase just run */
+ goto again;
+ }
+ /*
+ * Take any of the stable_node dups page of
+ * this stable_node chain to let the tree walk
+ * continue. All KSM pages belonging to the
+ * stable_node dups in a stable_node chain
+ * have the same content and they're
+ * wrprotected at all times. Any will work
+ * fine to continue the walk.
+ */
+ tree_page = get_ksm_page(stable_node_any, false);
+ }
+ VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
if (!tree_page) {
/*
* If we walked over a stale stable_node,
@@ -1268,27 +1706,37 @@ again:
else if (ret > 0)
new = &parent->rb_right;
else {
- /*
- * It is not a bug that stable_tree_search() didn't
- * find this node: because at that time our page was
- * not yet write-protected, so may have changed since.
- */
- return NULL;
+ need_chain = true;
+ break;
}
}
- stable_node = alloc_stable_node();
- if (!stable_node)
+ stable_node_dup = alloc_stable_node();
+ if (!stable_node_dup)
return NULL;
- INIT_HLIST_HEAD(&stable_node->hlist);
- stable_node->kpfn = kpfn;
- set_page_stable_node(kpage, stable_node);
- DO_NUMA(stable_node->nid = nid);
- rb_link_node(&stable_node->node, parent, new);
- rb_insert_color(&stable_node->node, root);
+ INIT_HLIST_HEAD(&stable_node_dup->hlist);
+ stable_node_dup->kpfn = kpfn;
+ set_page_stable_node(kpage, stable_node_dup);
+ stable_node_dup->rmap_hlist_len = 0;
+ DO_NUMA(stable_node_dup->nid = nid);
+ if (!need_chain) {
+ rb_link_node(&stable_node_dup->node, parent, new);
+ rb_insert_color(&stable_node_dup->node, root);
+ } else {
+ if (!is_stable_node_chain(stable_node)) {
+ struct stable_node *orig = stable_node;
+ /* chain is missing so create it */
+ stable_node = alloc_stable_node_chain(orig, root);
+ if (!stable_node) {
+ free_stable_node(stable_node_dup);
+ return NULL;
+ }
+ }
+ stable_node_chain_add_dup(stable_node_dup, stable_node);
+ }
- return stable_node;
+ return stable_node_dup;
}
/*
@@ -1378,8 +1826,27 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
* the same ksm page.
*/
static void stable_tree_append(struct rmap_item *rmap_item,
- struct stable_node *stable_node)
+ struct stable_node *stable_node,
+ bool max_page_sharing_bypass)
{
+ /*
+ * rmap won't find this mapping if we don't insert the
+ * rmap_item in the right stable_node
+ * duplicate. page_migration could break later if rmap breaks,
+ * so we can as well crash here. We really need to check for
+ * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
+ * for other negative values as an undeflow if detected here
+ * for the first time (and not when decreasing rmap_hlist_len)
+ * would be sign of memory corruption in the stable_node.
+ */
+ BUG_ON(stable_node->rmap_hlist_len < 0);
+
+ stable_node->rmap_hlist_len++;
+ if (!max_page_sharing_bypass)
+ /* possibly non fatal but unexpected overflow, only warn */
+ WARN_ON_ONCE(stable_node->rmap_hlist_len >
+ ksm_max_page_sharing);
+
rmap_item->head = stable_node;
rmap_item->address |= STABLE_FLAG;
hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
@@ -1407,19 +1874,26 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
struct page *kpage;
unsigned int checksum;
int err;
+ bool max_page_sharing_bypass = false;
stable_node = page_stable_node(page);
if (stable_node) {
if (stable_node->head != &migrate_nodes &&
- get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
- rb_erase(&stable_node->node,
- root_stable_tree + NUMA(stable_node->nid));
+ get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
+ NUMA(stable_node->nid)) {
+ stable_node_dup_del(stable_node);
stable_node->head = &migrate_nodes;
list_add(&stable_node->list, stable_node->head);
}
if (stable_node->head != &migrate_nodes &&
rmap_item->head == stable_node)
return;
+ /*
+ * If it's a KSM fork, allow it to go over the sharing limit
+ * without warnings.
+ */
+ if (!is_page_sharing_candidate(stable_node))
+ max_page_sharing_bypass = true;
}
/* We first start with searching the page inside the stable tree */
@@ -1439,7 +1913,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
* add its rmap_item to the stable tree.
*/
lock_page(kpage);
- stable_tree_append(rmap_item, page_stable_node(kpage));
+ stable_tree_append(rmap_item, page_stable_node(kpage),
+ max_page_sharing_bypass);
unlock_page(kpage);
}
put_page(kpage);
@@ -1472,8 +1947,10 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
lock_page(kpage);
stable_node = stable_tree_insert(kpage);
if (stable_node) {
- stable_tree_append(tree_rmap_item, stable_node);
- stable_tree_append(rmap_item, stable_node);
+ stable_tree_append(tree_rmap_item, stable_node,
+ false);
+ stable_tree_append(rmap_item, stable_node,
+ false);
}
unlock_page(kpage);
@@ -1974,6 +2451,48 @@ static void wait_while_offlining(void)
}
}
+static bool stable_node_dup_remove_range(struct stable_node *stable_node,
+ unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ if (stable_node->kpfn >= start_pfn &&
+ stable_node->kpfn < end_pfn) {
+ /*
+ * Don't get_ksm_page, page has already gone:
+ * which is why we keep kpfn instead of page*
+ */
+ remove_node_from_stable_tree(stable_node);
+ return true;
+ }
+ return false;
+}
+
+static bool stable_node_chain_remove_range(struct stable_node *stable_node,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ struct rb_root *root)
+{
+ struct stable_node *dup;
+ struct hlist_node *hlist_safe;
+
+ if (!is_stable_node_chain(stable_node)) {
+ VM_BUG_ON(is_stable_node_dup(stable_node));
+ return stable_node_dup_remove_range(stable_node, start_pfn,
+ end_pfn);
+ }
+
+ hlist_for_each_entry_safe(dup, hlist_safe,
+ &stable_node->hlist, hlist_dup) {
+ VM_BUG_ON(!is_stable_node_dup(dup));
+ stable_node_dup_remove_range(dup, start_pfn, end_pfn);
+ }
+ if (hlist_empty(&stable_node->hlist)) {
+ free_stable_node_chain(stable_node, root);
+ return true; /* notify caller that tree was rebalanced */
+ } else
+ return false;
+}
+
static void ksm_check_stable_tree(unsigned long start_pfn,
unsigned long end_pfn)
{
@@ -1985,15 +2504,12 @@ static void ksm_check_stable_tree(unsigned long start_pfn,
node = rb_first(root_stable_tree + nid);
while (node) {
stable_node = rb_entry(node, struct stable_node, node);
- if (stable_node->kpfn >= start_pfn &&
- stable_node->kpfn < end_pfn) {
- /*
- * Don't get_ksm_page, page has already gone:
- * which is why we keep kpfn instead of page*
- */
- remove_node_from_stable_tree(stable_node);
+ if (stable_node_chain_remove_range(stable_node,
+ start_pfn, end_pfn,
+ root_stable_tree +
+ nid))
node = rb_first(root_stable_tree + nid);
- } else
+ else
node = rb_next(node);
cond_resched();
}
@@ -2217,6 +2733,47 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj,
KSM_ATTR(merge_across_nodes);
#endif
+static ssize_t max_page_sharing_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_max_page_sharing);
+}
+
+static ssize_t max_page_sharing_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ int knob;
+
+ err = kstrtoint(buf, 10, &knob);
+ if (err)
+ return err;
+ /*
+ * When a KSM page is created it is shared by 2 mappings. This
+ * being a signed comparison, it implicitly verifies it's not
+ * negative.
+ */
+ if (knob < 2)
+ return -EINVAL;
+
+ if (READ_ONCE(ksm_max_page_sharing) == knob)
+ return count;
+
+ mutex_lock(&ksm_thread_mutex);
+ wait_while_offlining();
+ if (ksm_max_page_sharing != knob) {
+ if (ksm_pages_shared || remove_all_stable_nodes())
+ err = -EBUSY;
+ else
+ ksm_max_page_sharing = knob;
+ }
+ mutex_unlock(&ksm_thread_mutex);
+
+ return err ? err : count;
+}
+KSM_ATTR(max_page_sharing);
+
static ssize_t pages_shared_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -2255,6 +2812,46 @@ static ssize_t pages_volatile_show(struct kobject *kobj,
}
KSM_ATTR_RO(pages_volatile);
+static ssize_t stable_node_dups_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_stable_node_dups);
+}
+KSM_ATTR_RO(stable_node_dups);
+
+static ssize_t stable_node_chains_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_stable_node_chains);
+}
+KSM_ATTR_RO(stable_node_chains);
+
+static ssize_t
+stable_node_chains_prune_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
+}
+
+static ssize_t
+stable_node_chains_prune_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = kstrtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ ksm_stable_node_chains_prune_millisecs = msecs;
+
+ return count;
+}
+KSM_ATTR(stable_node_chains_prune_millisecs);
+
static ssize_t full_scans_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -2274,6 +2871,10 @@ static struct attribute *ksm_attrs[] = {
#ifdef CONFIG_NUMA
&merge_across_nodes_attr.attr,
#endif
+ &max_page_sharing_attr.attr,
+ &stable_node_chains_attr.attr,
+ &stable_node_dups_attr.attr,
+ &stable_node_chains_prune_millisecs_attr.attr,
NULL,
};
diff --git a/mm/madvise.c b/mm/madvise.c
index f56825b6d2e1..a01147359f3b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -555,8 +555,9 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
}
pr_info("Injecting memory failure for page %#lx at %#lx\n",
page_to_pfn(p), start);
- /* Ignore return value for now */
- memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+ ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+ if (ret)
+ return ret;
}
return 0;
}
@@ -638,14 +639,28 @@ madvise_behavior_valid(int behavior)
* some pages ahead.
* MADV_DONTNEED - the application is finished with the given range,
* so the kernel can free resources associated with it.
+ * MADV_FREE - the application marks pages in the given range as lazy free,
+ * where actual purges are postponed until memory pressure happens.
* MADV_REMOVE - the application wants to free up the given range of
* pages and associated backing store.
* MADV_DONTFORK - omit this area from child's address space when forking:
* typically, to avoid COWing pages pinned by get_user_pages().
* MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
+ * MADV_HWPOISON - trigger memory error handler as if the given memory range
+ * were corrupted by unrecoverable hardware memory failure.
+ * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
* MADV_MERGEABLE - the application recommends that KSM try to merge pages in
* this area with pages of identical content from other such areas.
* MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
+ * MADV_HUGEPAGE - the application wants to back the given range by transparent
+ * huge pages in the future. Existing pages might be coalesced and
+ * new pages might be allocated as THP.
+ * MADV_NOHUGEPAGE - mark the given range as not worth being backed by
+ * transparent huge pages so the existing pages will not be
+ * coalesced into THP and new pages will not be allocated as THP.
+ * MADV_DONTDUMP - the application wants to prevent pages in the given range
+ * from being included in its core dump.
+ * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
*
* return values:
* zero - success
diff --git a/mm/memblock.c b/mm/memblock.c
index dd7989929f13..dfff740b856a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -585,6 +585,9 @@ repeat:
nid, flags);
}
+ if (!nr_new)
+ return 0;
+
/*
* If this was the first round, resize array and repeat for actual
* insertions; otherwise, merge and return.
@@ -612,14 +615,12 @@ static int __init_memblock memblock_add_region(phys_addr_t base,
int nid,
unsigned long flags)
{
- struct memblock_type *type = &memblock.memory;
-
memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(type, base, size, nid, flags);
+ return memblock_add_range(&memblock.memory, base, size, nid, flags);
}
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
@@ -740,14 +741,12 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base,
int nid,
unsigned long flags)
{
- struct memblock_type *type = &memblock.reserved;
-
memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(type, base, size, nid, flags);
+ return memblock_add_range(&memblock.reserved, base, size, nid, flags);
}
int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d06cae2de783..ae8b81c55685 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
return (memcg == root_mem_cgroup);
}
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
- return memcg->css.id;
-}
-
-/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
- struct cgroup_subsys_state *css;
-
- css = css_from_id(id, &memory_cgrp_subsys);
- return mem_cgroup_from_css(css);
-}
-
#ifndef CONFIG_SLOB
/*
* This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
@@ -663,9 +638,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}
-static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid,
- unsigned int lru_mask)
+unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask)
{
unsigned long nr = 0;
int zid;
@@ -1709,19 +1683,13 @@ cleanup:
}
/**
- * mem_cgroup_begin_page_stat - begin a page state statistics transaction
- * @page: page that is going to change accounted state
- *
- * This function must mark the beginning of an accounted page state
- * change to prevent double accounting when the page is concurrently
- * being moved to another memcg:
+ * lock_page_memcg - lock a page->mem_cgroup binding
+ * @page: the page
*
- * memcg = mem_cgroup_begin_page_stat(page);
- * if (TestClearPageState(page))
- * mem_cgroup_update_page_stat(memcg, state, -1);
- * mem_cgroup_end_page_stat(memcg);
+ * This function protects unlocked LRU pages from being moved to
+ * another cgroup and stabilizes their page->mem_cgroup binding.
*/
-struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
+void lock_page_memcg(struct page *page)
{
struct mem_cgroup *memcg;
unsigned long flags;
@@ -1730,25 +1698,18 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
* The RCU lock is held throughout the transaction. The fast
* path can get away without acquiring the memcg->move_lock
* because page moving starts with an RCU grace period.
- *
- * The RCU lock also protects the memcg from being freed when
- * the page state that is going to change is the only thing
- * preventing the page from being uncharged.
- * E.g. end-writeback clearing PageWriteback(), which allows
- * migration to go ahead and uncharge the page before the
- * account transaction might be complete.
*/
rcu_read_lock();
if (mem_cgroup_disabled())
- return NULL;
+ return;
again:
memcg = page->mem_cgroup;
if (unlikely(!memcg))
- return NULL;
+ return;
if (atomic_read(&memcg->moving_account) <= 0)
- return memcg;
+ return;
spin_lock_irqsave(&memcg->move_lock, flags);
if (memcg != page->mem_cgroup) {
@@ -1759,21 +1720,23 @@ again:
/*
* When charge migration first begins, we can have locked and
* unlocked page stat updates happening concurrently. Track
- * the task who has the lock for mem_cgroup_end_page_stat().
+ * the task who has the lock for unlock_page_memcg().
*/
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
- return memcg;
+ return;
}
-EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
+EXPORT_SYMBOL(lock_page_memcg);
/**
- * mem_cgroup_end_page_stat - finish a page state statistics transaction
- * @memcg: the memcg that was accounted against
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
*/
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
+void unlock_page_memcg(struct page *page)
{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+
if (memcg && memcg->move_lock_task == current) {
unsigned long flags = memcg->move_lock_flags;
@@ -1785,7 +1748,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
rcu_read_unlock();
}
-EXPORT_SYMBOL(mem_cgroup_end_page_stat);
+EXPORT_SYMBOL(unlock_page_memcg);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -2361,9 +2324,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct page_counter *counter;
int ret;
- if (!memcg_kmem_online(memcg))
- return 0;
-
ret = try_charge(memcg, gfp, nr_pages);
if (ret)
return ret;
@@ -2382,10 +2342,11 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
struct mem_cgroup *memcg;
- int ret;
+ int ret = 0;
memcg = get_mem_cgroup_from_mm(current->mm);
- ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ if (!mem_cgroup_is_root(memcg))
+ ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
css_put(&memcg->css);
return ret;
}
@@ -2755,39 +2716,48 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
return retval;
}
-static unsigned long tree_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx)
+static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
{
struct mem_cgroup *iter;
- unsigned long val = 0;
+ int i;
- for_each_mem_cgroup_tree(iter, memcg)
- val += mem_cgroup_read_stat(iter, idx);
+ memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
- return val;
+ for_each_mem_cgroup_tree(iter, memcg) {
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ stat[i] += mem_cgroup_read_stat(iter, i);
+ }
}
-static unsigned long tree_events(struct mem_cgroup *memcg,
- enum mem_cgroup_events_index idx)
+static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
{
struct mem_cgroup *iter;
- unsigned long val = 0;
+ int i;
- for_each_mem_cgroup_tree(iter, memcg)
- val += mem_cgroup_read_events(iter, idx);
+ memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS);
- return val;
+ for_each_mem_cgroup_tree(iter, memcg) {
+ for (i = 0; i < MEMCG_NR_EVENTS; i++)
+ events[i] += mem_cgroup_read_events(iter, i);
+ }
}
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
- unsigned long val;
+ unsigned long val = 0;
if (mem_cgroup_is_root(memcg)) {
- val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
- val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
- if (swap)
- val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, memcg) {
+ val += mem_cgroup_read_stat(iter,
+ MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_read_stat(iter,
+ MEM_CGROUP_STAT_RSS);
+ if (swap)
+ val += mem_cgroup_read_stat(iter,
+ MEM_CGROUP_STAT_SWAP);
+ }
} else {
if (!swap)
val = page_counter_read(&memcg->memory);
@@ -2853,6 +2823,9 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
{
int memcg_id;
+ if (cgroup_memory_nokmem)
+ return 0;
+
BUG_ON(memcg->kmemcg_id >= 0);
BUG_ON(memcg->kmem_state);
@@ -2873,24 +2846,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
return 0;
}
-static int memcg_propagate_kmem(struct mem_cgroup *parent,
- struct mem_cgroup *memcg)
-{
- int ret = 0;
-
- mutex_lock(&memcg_limit_mutex);
- /*
- * If the parent cgroup is not kmem-online now, it cannot be
- * onlined after this point, because it has at least one child
- * already.
- */
- if (memcg_kmem_online(parent) ||
- (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem))
- ret = memcg_online_kmem(memcg);
- mutex_unlock(&memcg_limit_mutex);
- return ret;
-}
-
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
struct cgroup_subsys_state *css;
@@ -2949,10 +2904,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
}
}
#else
-static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg)
-{
- return 0;
-}
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
return 0;
@@ -2968,22 +2919,10 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
unsigned long limit)
{
- int ret = 0;
+ int ret;
mutex_lock(&memcg_limit_mutex);
- /* Top-level cgroup doesn't propagate from root */
- if (!memcg_kmem_online(memcg)) {
- if (cgroup_is_populated(memcg->css.cgroup) ||
- (memcg->use_hierarchy && memcg_has_children(memcg)))
- ret = -EBUSY;
- if (ret)
- goto out;
- ret = memcg_online_kmem(memcg);
- if (ret)
- goto out;
- }
ret = page_counter_limit(&memcg->kmem, limit);
-out:
mutex_unlock(&memcg_limit_mutex);
return ret;
}
@@ -4234,7 +4173,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &memcg->css;
}
- error = memcg_propagate_kmem(parent, memcg);
+ error = memcg_online_kmem(memcg);
if (error)
goto fail;
@@ -4488,7 +4427,7 @@ static int mem_cgroup_move_account(struct page *page,
VM_BUG_ON(compound && !PageTransHuge(page));
/*
- * Prevent mem_cgroup_replace_page() from looking at
+ * Prevent mem_cgroup_migrate() from looking at
* page->mem_cgroup of its source page while we change it.
*/
ret = -EBUSY;
@@ -4923,9 +4862,9 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
lru_add_drain_all();
/*
- * Signal mem_cgroup_begin_page_stat() to take the memcg's
- * move_lock while we're moving its pages to another memcg.
- * Then wait for already started RCU-only updates to finish.
+ * Signal lock_page_memcg() to take the memcg's move_lock
+ * while we're moving its pages to another memcg. Then wait
+ * for already started RCU-only updates to finish.
*/
atomic_inc(&mc.from->moving_account);
synchronize_rcu();
@@ -5113,6 +5052,8 @@ static int memory_events_show(struct seq_file *m, void *v)
static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long stat[MEMCG_NR_STAT];
+ unsigned long events[MEMCG_NR_EVENTS];
int i;
/*
@@ -5126,22 +5067,27 @@ static int memory_stat_show(struct seq_file *m, void *v)
* Current memory state:
*/
+ tree_stat(memcg, stat);
+ tree_events(memcg, events);
+
seq_printf(m, "anon %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
seq_printf(m, "file %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
+ seq_printf(m, "kernel_stack %llu\n",
+ (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE);
+ seq_printf(m, "slab %llu\n",
+ (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
+ stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
seq_printf(m, "sock %llu\n",
- (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE);
+ (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
seq_printf(m, "file_mapped %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) *
- PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
seq_printf(m, "file_dirty %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) *
- PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE);
seq_printf(m, "file_writeback %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) *
- PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE);
for (i = 0; i < NR_LRU_LISTS; i++) {
struct mem_cgroup *mi;
@@ -5153,12 +5099,17 @@ static int memory_stat_show(struct seq_file *m, void *v)
mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
}
+ seq_printf(m, "slab_reclaimable %llu\n",
+ (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE);
+ seq_printf(m, "slab_unreclaimable %llu\n",
+ (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+
/* Accumulated memory events */
seq_printf(m, "pgfault %lu\n",
- tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT));
+ events[MEM_CGROUP_EVENTS_PGFAULT]);
seq_printf(m, "pgmajfault %lu\n",
- tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT));
+ events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
return 0;
}
@@ -5517,16 +5468,16 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
}
/**
- * mem_cgroup_replace_page - migrate a charge to another page
- * @oldpage: currently charged page
- * @newpage: page to transfer the charge to
+ * mem_cgroup_migrate - charge a page's replacement
+ * @oldpage: currently circulating page
+ * @newpage: replacement page
*
- * Migrate the charge from @oldpage to @newpage.
+ * Charge @newpage as a replacement page for @oldpage. @oldpage will
+ * be uncharged upon free.
*
* Both pages must be locked, @newpage->mapping must be set up.
- * Either or both pages might be on the LRU already.
*/
-void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
{
struct mem_cgroup *memcg;
unsigned int nr_pages;
@@ -5559,7 +5510,7 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
page_counter_charge(&memcg->memsw, nr_pages);
css_get_many(&memcg->css, nr_pages);
- commit_charge(newpage, memcg, true);
+ commit_charge(newpage, memcg, false);
local_irq_disable();
mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ac595e7a3a95..67c30eb993f0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -826,8 +826,6 @@ static struct page_state {
#undef lru
#undef swapbacked
#undef head
-#undef tail
-#undef compound
#undef slab
#undef reserved
diff --git a/mm/memory.c b/mm/memory.c
index 38090ca37a08..d279350424fa 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1105,6 +1105,12 @@ again:
if (!PageAnon(page)) {
if (pte_dirty(ptent)) {
+ /*
+ * oom_reaper cannot tear down dirty
+ * pages
+ */
+ if (unlikely(details && details->ignore_dirty))
+ continue;
force_flush = 1;
set_page_dirty(page);
}
@@ -1123,8 +1129,8 @@ again:
}
continue;
}
- /* If details->check_mapping, we leave swap entries. */
- if (unlikely(details))
+ /* only check swap_entries if explicitly asked for in details */
+ if (unlikely(details && !details->check_swap_entries))
continue;
entry = pte_to_swp_entry(ptent);
@@ -1229,7 +1235,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
return addr;
}
-static void unmap_page_range(struct mmu_gather *tlb,
+void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details)
@@ -1237,9 +1243,6 @@ static void unmap_page_range(struct mmu_gather *tlb,
pgd_t *pgd;
unsigned long next;
- if (details && !details->check_mapping)
- details = NULL;
-
BUG_ON(addr >= end);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
@@ -1897,7 +1900,9 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
unsigned long end = addr + size;
int err;
- BUG_ON(addr >= end);
+ if (WARN_ON(addr >= end))
+ return -EINVAL;
+
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -2433,7 +2438,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows)
{
- struct zap_details details;
+ struct zap_details details = { };
pgoff_t hba = holebegin >> PAGE_SHIFT;
pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -2468,7 +2473,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
* We return with the mmap_sem locked or unlocked in the same cases
* as does filemap_fault().
*/
-static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
@@ -3143,8 +3148,7 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
- pgoff_t pgoff = (((address & PAGE_MASK)
- - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ pgoff_t pgoff = linear_page_index(vma, address);
pte_unmap(page_table);
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 979b18cbd343..92402f8d17e8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -33,6 +33,7 @@
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
+#include <linux/compaction.h>
#include <asm/tlbflush.h>
@@ -77,6 +78,9 @@ static struct {
#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
+bool memhp_auto_online;
+EXPORT_SYMBOL_GPL(memhp_auto_online);
+
void get_online_mems(void)
{
might_sleep();
@@ -509,6 +513,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
int start_sec, end_sec;
struct vmem_altmap *altmap;
+ clear_zone_contiguous(zone);
+
/* during initialize mem_map, align hot-added range to section */
start_sec = pfn_to_section_nr(phys_start_pfn);
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
@@ -540,6 +546,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
}
vmemmap_populate_print_last();
+ set_zone_contiguous(zone);
+
return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
@@ -811,6 +819,8 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
}
}
+ clear_zone_contiguous(zone);
+
/*
* We can only remove entire sections
*/
@@ -826,6 +836,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
if (ret)
break;
}
+
+ set_zone_contiguous(zone);
+
return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
@@ -1042,7 +1055,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
arg.nr_pages = nr_pages;
node_states_check_changes_online(nr_pages, zone, &arg);
- nid = pfn_to_nid(pfn);
+ nid = zone_to_nid(zone);
ret = memory_notify(MEM_GOING_ONLINE, &arg);
ret = notifier_to_errno(ret);
@@ -1082,7 +1095,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
pgdat_resize_unlock(zone->zone_pgdat, &flags);
if (onlined_pages) {
- node_states_set_node(zone_to_nid(zone), &arg);
+ node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
build_all_zonelists(NULL, NULL);
else
@@ -1093,8 +1106,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
init_per_zone_wmark_min();
- if (onlined_pages)
- kswapd_run(zone_to_nid(zone));
+ if (onlined_pages) {
+ kswapd_run(nid);
+ kcompactd_run(nid);
+ }
vm_total_pages = nr_free_pagecache_pages();
@@ -1261,8 +1276,13 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
return zone_default;
}
+static int online_memory_block(struct memory_block *mem, void *arg)
+{
+ return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
+}
+
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-int __ref add_memory_resource(int nid, struct resource *res)
+int __ref add_memory_resource(int nid, struct resource *res, bool online)
{
u64 start, size;
pg_data_t *pgdat = NULL;
@@ -1322,6 +1342,11 @@ int __ref add_memory_resource(int nid, struct resource *res)
/* create new memmap entry */
firmware_map_add_hotplug(start, start + size, "System RAM");
+ /* online pages if requested */
+ if (online)
+ walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
+ NULL, online_memory_block);
+
goto out;
error:
@@ -1345,7 +1370,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
if (IS_ERR(res))
return PTR_ERR(res);
- ret = add_memory_resource(nid, res);
+ ret = add_memory_resource(nid, res, memhp_auto_online);
if (ret < 0)
release_memory_resource(res);
return ret;
@@ -1858,8 +1883,10 @@ repeat:
zone_pcp_update(zone);
node_states_clear_node(node, &arg);
- if (arg.status_change_nid >= 0)
+ if (arg.status_change_nid >= 0) {
kswapd_stop(node);
+ kcompactd_stop(node);
+ }
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4c4187c0e1de..8c5fd08c253c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -643,7 +643,9 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
- if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+ if (!is_vm_hugetlb_page(vma) &&
+ (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
+ !(vma->vm_flags & VM_MIXEDMAP))
change_prot_numa(vma, start, endvma);
return 1;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index b1034f9c77e7..90cbf7c65cac 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -38,6 +38,7 @@
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
+#include <linux/page_owner.h>
#include <asm/tlbflush.h>
@@ -325,7 +326,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
return -EAGAIN;
/* No turning back from here */
- set_page_memcg(newpage, page_memcg(page));
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
@@ -372,7 +372,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
* Now we know that no one else is looking at the page:
* no turning back from here.
*/
- set_page_memcg(newpage, page_memcg(page));
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
@@ -457,9 +456,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
return -EAGAIN;
}
- set_page_memcg(newpage, page_memcg(page));
newpage->index = page->index;
newpage->mapping = page->mapping;
+
get_page(newpage);
radix_tree_replace_slot(pslot, newpage);
@@ -467,6 +466,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
page_unfreeze_refs(page, expected_count - 1);
spin_unlock_irq(&mapping->tree_lock);
+
return MIGRATEPAGE_SUCCESS;
}
@@ -578,6 +578,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
*/
if (PageWriteback(newpage))
end_page_writeback(newpage);
+
+ copy_page_owner(page, newpage);
+
+ mem_cgroup_migrate(page, newpage);
}
/************************************************************
@@ -772,7 +776,6 @@ static int move_to_new_page(struct page *newpage, struct page *page,
* page is freed; but stats require that PageAnon be left as PageAnon.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- set_page_memcg(page, NULL);
if (!PageAnon(page))
page->mapping = NULL;
}
@@ -952,8 +955,10 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
}
rc = __unmap_and_move(page, newpage, force, mode);
- if (rc == MIGRATEPAGE_SUCCESS)
+ if (rc == MIGRATEPAGE_SUCCESS) {
put_new_page = NULL;
+ set_page_owner_migrate_reason(newpage, reason);
+ }
out:
if (rc != -EAGAIN) {
@@ -1018,7 +1023,7 @@ out:
static int unmap_and_move_huge_page(new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
struct page *hpage, int force,
- enum migrate_mode mode)
+ enum migrate_mode mode, int reason)
{
int rc = -EAGAIN;
int *result = NULL;
@@ -1076,6 +1081,7 @@ put_anon:
if (rc == MIGRATEPAGE_SUCCESS) {
hugetlb_cgroup_migrate(hpage, new_hpage);
put_new_page = NULL;
+ set_page_owner_migrate_reason(new_hpage, reason);
}
unlock_page(hpage);
@@ -1148,7 +1154,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
if (PageHuge(page))
rc = unmap_and_move_huge_page(get_new_page,
put_new_page, private, page,
- pass > 2, mode);
+ pass > 2, mode, reason);
else
rc = unmap_and_move(get_new_page, put_new_page,
private, page, pass > 2, mode,
@@ -1836,9 +1842,8 @@ fail_putback:
}
mlock_migrate_page(new_page, page);
- set_page_memcg(new_page, page_memcg(page));
- set_page_memcg(page, NULL);
page_remove_rmap(page, true);
+ set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8eb7bb40dc40..6ff5dfa65b33 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -160,9 +160,11 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
}
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
- if (next - addr != HPAGE_PMD_SIZE)
+ if (next - addr != HPAGE_PMD_SIZE) {
split_huge_pmd(vma, pmd, addr);
- else {
+ if (pmd_none(*pmd))
+ continue;
+ } else {
int nr_ptes = change_huge_pmd(vma, pmd, addr,
newprot, prot_numa);
@@ -352,10 +354,12 @@ fail:
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
unsigned long, prot)
{
- unsigned long vm_flags, nstart, end, tmp, reqprot;
+ unsigned long nstart, end, tmp, reqprot;
struct vm_area_struct *vma, *prev;
int error = -EINVAL;
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
+ const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
+ (prot & PROT_READ);
prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
return -EINVAL;
@@ -372,13 +376,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
return -EINVAL;
reqprot = prot;
- /*
- * Does the application expect PROT_READ to imply PROT_EXEC:
- */
- if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
- prot |= PROT_EXEC;
-
- vm_flags = calc_vm_prot_bits(prot);
down_write(&current->mm->mmap_sem);
@@ -412,7 +409,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
- newflags = vm_flags;
+ /* Does the application expect PROT_READ to imply PROT_EXEC */
+ if (rier && (vma->vm_flags & VM_MAYEXEC))
+ prot |= PROT_EXEC;
+
+ newflags = calc_vm_prot_bits(prot);
newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
/* newflags >> 4 shift VM_MAY% in place of VM_% */
@@ -443,6 +444,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
error = -ENOMEM;
goto out;
}
+ prot = reqprot;
}
out:
up_write(&current->mm->mmap_sem);
diff --git a/mm/mremap.c b/mm/mremap.c
index d77946a997f7..8eeba02fc991 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -210,6 +210,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
}
}
split_huge_pmd(vma, old_pmd, old_addr);
+ if (pmd_none(*old_pmd))
+ continue;
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dc490c06941b..7653055d9f09 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -35,6 +35,11 @@
#include <linux/freezer.h>
#include <linux/ftrace.h>
#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/init.h>
+
+#include <asm/tlb.h>
+#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>
@@ -386,10 +391,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
static void dump_header(struct oom_control *oc, struct task_struct *p,
struct mem_cgroup *memcg)
{
- pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
- "oom_score_adj=%hd\n",
- current->comm, oc->gfp_mask, oc->order,
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, "
+ "oom_score_adj=%hd\n",
+ current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
+
cpuset_print_current_mems_allowed();
dump_stack();
if (memcg)
@@ -408,6 +414,167 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
bool oom_killer_disabled __read_mostly;
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
+#ifdef CONFIG_MMU
+/*
+ * OOM Reaper kernel thread which tries to reap the memory used by the OOM
+ * victim (if that is possible) to help the OOM killer to move on.
+ */
+static struct task_struct *oom_reaper_th;
+static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
+static LIST_HEAD(oom_reaper_list);
+static DEFINE_SPINLOCK(oom_reaper_lock);
+
+
+static bool __oom_reap_task(struct task_struct *tsk)
+{
+ struct mmu_gather tlb;
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ struct task_struct *p;
+ struct zap_details details = {.check_swap_entries = true,
+ .ignore_dirty = true};
+ bool ret = true;
+
+ /*
+ * Make sure we find the associated mm_struct even when the particular
+ * thread has already terminated and cleared its mm.
+ * We might have race with exit path so consider our work done if there
+ * is no mm.
+ */
+ p = find_lock_task_mm(tsk);
+ if (!p)
+ return true;
+
+ mm = p->mm;
+ if (!atomic_inc_not_zero(&mm->mm_users)) {
+ task_unlock(p);
+ return true;
+ }
+
+ task_unlock(p);
+
+ if (!down_read_trylock(&mm->mmap_sem)) {
+ ret = false;
+ goto out;
+ }
+
+ tlb_gather_mmu(&tlb, mm, 0, -1);
+ for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+ if (is_vm_hugetlb_page(vma))
+ continue;
+
+ /*
+ * Only anonymous pages have a good chance to be dropped
+ * without additional steps which we cannot afford as we
+ * are OOM already.
+ *
+ * We do not even care about fs backed pages because all
+ * which are reclaimable have already been reclaimed and
+ * we do not want to block exit_mmap by keeping mm ref
+ * count elevated without a good reason.
+ */
+ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
+ if (vma->vm_flags & VM_LOCKED)
+ munlock_vma_pages_all(vma);
+ unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
+ &details);
+ }
+ }
+ tlb_finish_mmu(&tlb, 0, -1);
+ pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lulB\n",
+ task_pid_nr(tsk), tsk->comm,
+ K(get_mm_counter(mm, MM_ANONPAGES)),
+ K(get_mm_counter(mm, MM_FILEPAGES)),
+ K(get_mm_counter(mm, MM_SHMEMPAGES)));
+ up_read(&mm->mmap_sem);
+
+ /*
+ * Clear TIF_MEMDIE because the task shouldn't be sitting on a
+ * reasonably reclaimable memory anymore. OOM killer can continue
+ * by selecting other victim if unmapping hasn't led to any
+ * improvements. This also means that selecting this task doesn't
+ * make any sense.
+ */
+ tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
+ exit_oom_victim(tsk);
+out:
+ mmput(mm);
+ return ret;
+}
+
+#define MAX_OOM_REAP_RETRIES 10
+static void oom_reap_task(struct task_struct *tsk)
+{
+ int attempts = 0;
+
+ /* Retry the down_read_trylock(mmap_sem) a few times */
+ while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk))
+ schedule_timeout_idle(HZ/10);
+
+ if (attempts > MAX_OOM_REAP_RETRIES) {
+ pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
+ task_pid_nr(tsk), tsk->comm);
+ debug_show_all_locks();
+ }
+
+ /* Drop a reference taken by wake_oom_reaper */
+ put_task_struct(tsk);
+}
+
+static int oom_reaper(void *unused)
+{
+ while (true) {
+ struct task_struct *tsk = NULL;
+
+ wait_event_freezable(oom_reaper_wait,
+ (!list_empty(&oom_reaper_list)));
+ spin_lock(&oom_reaper_lock);
+ if (!list_empty(&oom_reaper_list)) {
+ tsk = list_first_entry(&oom_reaper_list,
+ struct task_struct, oom_reaper_list);
+ list_del(&tsk->oom_reaper_list);
+ }
+ spin_unlock(&oom_reaper_lock);
+
+ if (tsk)
+ oom_reap_task(tsk);
+ }
+
+ return 0;
+}
+
+static void wake_oom_reaper(struct task_struct *tsk)
+{
+ if (!oom_reaper_th)
+ return;
+
+ get_task_struct(tsk);
+
+ spin_lock(&oom_reaper_lock);
+ list_add(&tsk->oom_reaper_list, &oom_reaper_list);
+ spin_unlock(&oom_reaper_lock);
+ wake_up(&oom_reaper_wait);
+}
+
+static int __init oom_init(void)
+{
+ oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
+ if (IS_ERR(oom_reaper_th)) {
+ pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
+ PTR_ERR(oom_reaper_th));
+ oom_reaper_th = NULL;
+ }
+ return 0;
+}
+subsys_initcall(oom_init)
+#else
+static void wake_oom_reaper(struct task_struct *mm)
+{
+}
+#endif
+
/**
* mark_oom_victim - mark the given task as OOM victim
* @tsk: task to mark
@@ -434,9 +601,10 @@ void mark_oom_victim(struct task_struct *tsk)
/**
* exit_oom_victim - note the exit of an OOM victim
*/
-void exit_oom_victim(void)
+void exit_oom_victim(struct task_struct *tsk)
{
- clear_thread_flag(TIF_MEMDIE);
+ if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
+ return;
if (!atomic_dec_return(&oom_victims))
wake_up_all(&oom_victims_wait);
@@ -458,15 +626,11 @@ void exit_oom_victim(void)
bool oom_killer_disable(void)
{
/*
- * Make sure to not race with an ongoing OOM killer
- * and that the current is not the victim.
+ * Make sure to not race with an ongoing OOM killer. Check that the
+ * current is not killed (possibly due to sharing the victim's memory).
*/
- mutex_lock(&oom_lock);
- if (test_thread_flag(TIF_MEMDIE)) {
- mutex_unlock(&oom_lock);
+ if (mutex_lock_killable(&oom_lock))
return false;
- }
-
oom_killer_disabled = true;
mutex_unlock(&oom_lock);
@@ -501,7 +665,6 @@ static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
return false;
}
-#define K(x) ((x) << (PAGE_SHIFT-10))
/*
* Must be called while holding a reference to p, which will be released upon
* returning.
@@ -517,6 +680,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
+ bool can_oom_reap = true;
/*
* If the task is already exiting, don't alarm the sysadmin or kill
@@ -607,17 +771,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
continue;
if (same_thread_group(p, victim))
continue;
- if (unlikely(p->flags & PF_KTHREAD))
- continue;
- if (is_global_init(p))
- continue;
- if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
+ p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ /*
+ * We cannot use oom_reaper for the mm shared by this
+ * process because it wouldn't get killed and so the
+ * memory might be still used.
+ */
+ can_oom_reap = false;
continue;
-
+ }
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
}
rcu_read_unlock();
+ if (can_oom_reap)
+ wake_oom_reaper(victim);
+
mmdrop(mm);
put_task_struct(victim);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6fe7d15bd1f7..11ff8f758631 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1169,6 +1169,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
unsigned long balanced_dirty_ratelimit;
unsigned long step;
unsigned long x;
+ unsigned long shift;
/*
* The dirty rate will match the writeout rate in long term, except
@@ -1293,11 +1294,11 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
* rate itself is constantly fluctuating. So decrease the track speed
* when it gets close to the target. Helps eliminate pointless tremors.
*/
- step >>= dirty_ratelimit / (2 * step + 1);
- /*
- * Limit the tracking speed to avoid overshooting.
- */
- step = (step + 7) / 8;
+ shift = dirty_ratelimit / (2 * step + 1);
+ if (shift < BITS_PER_LONG)
+ step = DIV_ROUND_UP(step >> shift, 8);
+ else
+ step = 0;
if (dirty_ratelimit < balanced_dirty_ratelimit)
dirty_ratelimit += step;
@@ -2409,12 +2410,11 @@ int __set_page_dirty_no_writeback(struct page *page)
/*
* Helper function for set_page_dirty family.
*
- * Caller must hold mem_cgroup_begin_page_stat().
+ * Caller must hold lock_page_memcg().
*
* NOTE: This relies on being atomic wrt interrupts.
*/
-void account_page_dirtied(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg)
+void account_page_dirtied(struct page *page, struct address_space *mapping)
{
struct inode *inode = mapping->host;
@@ -2426,7 +2426,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping,
inode_attach_wb(inode, page);
wb = inode_to_wb(inode);
- mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED);
__inc_wb_stat(wb, WB_RECLAIMABLE);
@@ -2441,13 +2441,13 @@ EXPORT_SYMBOL(account_page_dirtied);
/*
* Helper function for deaccounting dirty page without writeback.
*
- * Caller must hold mem_cgroup_begin_page_stat().
+ * Caller must hold lock_page_memcg().
*/
void account_page_cleaned(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg, struct bdi_writeback *wb)
+ struct bdi_writeback *wb)
{
if (mapping_cap_account_dirty(mapping)) {
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_wb_stat(wb, WB_RECLAIMABLE);
task_io_account_cancelled_write(PAGE_CACHE_SIZE);
@@ -2468,26 +2468,24 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
*/
int __set_page_dirty_nobuffers(struct page *page)
{
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
unsigned long flags;
if (!mapping) {
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return 1;
}
spin_lock_irqsave(&mapping->tree_lock, flags);
BUG_ON(page_mapping(page) != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
- account_page_dirtied(page, mapping, memcg);
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree, page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (mapping->host) {
/* !PageAnon && !swapper_space */
@@ -2495,7 +2493,7 @@ int __set_page_dirty_nobuffers(struct page *page)
}
return 1;
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
@@ -2625,17 +2623,16 @@ void cancel_dirty_page(struct page *page)
if (mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- struct mem_cgroup *memcg;
bool locked;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page))
- account_page_cleaned(page, mapping, memcg, wb);
+ account_page_cleaned(page, mapping, wb);
unlocked_inode_to_wb_end(inode, locked);
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
} else {
ClearPageDirty(page);
}
@@ -2666,7 +2663,6 @@ int clear_page_dirty_for_io(struct page *page)
if (mapping && mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- struct mem_cgroup *memcg;
bool locked;
/*
@@ -2704,16 +2700,14 @@ int clear_page_dirty_for_io(struct page *page)
* always locked coming in here, so we get the desired
* exclusion.
*/
- memcg = mem_cgroup_begin_page_stat(page);
wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page)) {
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_wb_stat(wb, WB_RECLAIMABLE);
ret = 1;
}
unlocked_inode_to_wb_end(inode, locked);
- mem_cgroup_end_page_stat(memcg);
return ret;
}
return TestClearPageDirty(page);
@@ -2723,10 +2717,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- struct mem_cgroup *memcg;
int ret;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (mapping) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2750,21 +2743,20 @@ int test_clear_page_writeback(struct page *page)
ret = TestClearPageWriteback(page);
}
if (ret) {
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return ret;
}
int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
- struct mem_cgroup *memcg;
int ret;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (mapping) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2792,10 +2784,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
ret = TestSetPageWriteback(page);
}
if (!ret) {
- mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
inc_zone_page_state(page, NR_WRITEBACK);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 838ca8bb64f7..00118fee5340 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -223,6 +223,19 @@ static char * const zone_names[MAX_NR_ZONES] = {
#endif
};
+char * const migratetype_names[MIGRATE_TYPES] = {
+ "Unmovable",
+ "Movable",
+ "Reclaimable",
+ "HighAtomic",
+#ifdef CONFIG_CMA
+ "CMA",
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ "Isolate",
+#endif
+};
+
compound_page_dtor * const compound_page_dtors[] = {
NULL,
free_compound_page,
@@ -247,6 +260,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
static unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool mirrored_kernelcore;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -416,7 +430,7 @@ static void bad_page(struct page *page, const char *reason,
goto out;
}
if (nr_unshown) {
- printk(KERN_ALERT
+ pr_alert(
"BUG: Bad page state: %lu messages suppressed\n",
nr_unshown);
nr_unshown = 0;
@@ -426,9 +440,14 @@ static void bad_page(struct page *page, const char *reason,
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
- printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
+ pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
- dump_page_badflags(page, reason, bad_flags);
+ __dump_page(page, reason);
+ bad_flags &= page->flags;
+ if (bad_flags)
+ pr_alert("bad because of flags: %#lx(%pGp)\n",
+ bad_flags, &bad_flags);
+ dump_page_owner(page);
print_modules();
dump_stack();
@@ -477,7 +496,9 @@ void prep_compound_page(struct page *page, unsigned int order)
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
-bool _debug_pagealloc_enabled __read_mostly;
+bool _debug_pagealloc_enabled __read_mostly
+ = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+EXPORT_SYMBOL(_debug_pagealloc_enabled);
bool _debug_guardpage_enabled __read_mostly;
static int __init early_debug_pagealloc(char *buf)
@@ -488,6 +509,9 @@ static int __init early_debug_pagealloc(char *buf)
if (strcmp(buf, "on") == 0)
_debug_pagealloc_enabled = true;
+ if (strcmp(buf, "off") == 0)
+ _debug_pagealloc_enabled = false;
+
return 0;
}
early_param("debug_pagealloc", early_debug_pagealloc);
@@ -1002,6 +1026,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
PAGE_SIZE << order);
}
arch_free_page(page, order);
+ kernel_poison_pages(page, 1 << order, 0);
kernel_map_pages(page, 1 << order, 0);
return true;
@@ -1104,6 +1129,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
return __free_pages_boot_core(page, pfn, order);
}
+/*
+ * Check that the whole (or subset of) a pageblock given by the interval of
+ * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
+ * with the migration of free compaction scanner. The scanners then need to
+ * use only pfn_valid_within() check for arches that allow holes within
+ * pageblocks.
+ *
+ * Return struct page pointer of start_pfn, or NULL if checks were not passed.
+ *
+ * It's possible on some configurations to have a setup like node0 node1 node0
+ * i.e. it's possible that all pages within a zones range of pages do not
+ * belong to a single zone. We assume that a border between node0 and node1
+ * can occur within a single pageblock, but not a node0 node1 node0
+ * interleaving within a single pageblock. It is therefore sufficient to check
+ * the first and last page of a pageblock and avoid checking each individual
+ * page in a pageblock.
+ */
+struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone)
+{
+ struct page *start_page;
+ struct page *end_page;
+
+ /* end_pfn is one past the range we are checking */
+ end_pfn--;
+
+ if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+ return NULL;
+
+ start_page = pfn_to_page(start_pfn);
+
+ if (page_zone(start_page) != zone)
+ return NULL;
+
+ end_page = pfn_to_page(end_pfn);
+
+ /* This gives a shorter code than deriving page_zone(end_page) */
+ if (page_zone_id(start_page) != page_zone_id(end_page))
+ return NULL;
+
+ return start_page;
+}
+
+void set_zone_contiguous(struct zone *zone)
+{
+ unsigned long block_start_pfn = zone->zone_start_pfn;
+ unsigned long block_end_pfn;
+
+ block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
+ for (; block_start_pfn < zone_end_pfn(zone);
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+
+ block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
+
+ if (!__pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, zone))
+ return;
+ }
+
+ /* We confirm that there is no hole */
+ zone->contiguous = true;
+}
+
+void clear_zone_contiguous(struct zone *zone)
+{
+ zone->contiguous = false;
+}
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __init deferred_free_range(struct page *page,
unsigned long pfn, int nr_pages)
@@ -1254,9 +1348,13 @@ free_range:
pgdat_init_report_one_done();
return 0;
}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
void __init page_alloc_init_late(void)
{
+ struct zone *zone;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
int nid;
/* There will be num_node_state(N_MEMORY) threads */
@@ -1270,8 +1368,11 @@ void __init page_alloc_init_late(void)
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
+#endif
+
+ for_each_populated_zone(zone)
+ set_zone_contiguous(zone);
}
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
@@ -1381,6 +1482,12 @@ static inline int check_new_page(struct page *page)
return 0;
}
+static inline bool should_zero(void)
+{
+ return !IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) ||
+ !page_poisoning_enabled();
+}
+
static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
int alloc_flags)
{
@@ -1396,10 +1503,11 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
set_page_refcounted(page);
arch_alloc_page(page, order);
+ kernel_poison_pages(page, 1 << order, 1);
kernel_map_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
- if (gfp_flags & __GFP_ZERO)
+ if (should_zero() && gfp_flags & __GFP_ZERO)
for (i = 0; i < (1 << order); i++)
clear_highpage(page + i);
@@ -2690,9 +2798,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
va_end(args);
}
- pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n",
- current->comm, order, gfp_mask);
-
+ pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
+ current->comm, order, gfp_mask, &gfp_mask);
dump_stack();
if (!should_suppress_show_mem())
show_mem(filter);
@@ -2748,8 +2855,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
* XXX: Page reclaim didn't yield anything,
* and the OOM killer can't be invoked, but
* keep looping as per tradition.
+ *
+ * But do not keep looping if oom_killer_disable()
+ * was already called, for the system is trying to
+ * enter a quiescent state during suspend.
*/
- *did_some_progress = 1;
+ *did_some_progress = !oom_killer_disabled;
goto out;
}
if (pm_suspended_storage())
@@ -2976,6 +3087,103 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
}
+/*
+ * Maximum number of reclaim retries without any progress before OOM killer
+ * is consider as the only way to move forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
+/*
+ * Checks whether it makes sense to retry the reclaim to make a forward progress
+ * for the given allocation request.
+ * The reclaim feedback represented by did_some_progress (any progress during
+ * the last reclaim round) and no_progress_loops (number of reclaim rounds
+ * without any progress in a row) is considered as well as the reclaimable pages
+ * on the applicable zone list (with a backoff mechanism which is a function of
+ * no_progress_loops).
+ *
+ * Returns true if a retry is viable or false to enter the oom path.
+ */
+static inline bool
+should_reclaim_retry(gfp_t gfp_mask, unsigned order,
+ struct alloc_context *ac, int alloc_flags,
+ bool did_some_progress,
+ int no_progress_loops)
+{
+ struct zone *zone;
+ struct zoneref *z;
+
+ /*
+ * Make sure we converge to OOM if we cannot make any progress
+ * several times in the row.
+ */
+ if (no_progress_loops > MAX_RECLAIM_RETRIES)
+ return false;
+
+ /* Do not retry high order allocations unless they are __GFP_REPEAT */
+ if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
+ return false;
+
+ /*
+ * Keep reclaiming pages while there is a chance this will lead
+ * somewhere. If none of the target zones can satisfy our allocation
+ * request even if all reclaimable pages are considered then we are
+ * screwed and have to go OOM.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->high_zoneidx, ac->nodemask) {
+ unsigned long available;
+ unsigned long reclaimable;
+
+ available = reclaimable = zone_reclaimable_pages(zone);
+ available -= DIV_ROUND_UP(no_progress_loops * available,
+ MAX_RECLAIM_RETRIES);
+ available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+ /*
+ * Would the allocation succeed if we reclaimed the whole
+ * available?
+ */
+ if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
+ ac->high_zoneidx, alloc_flags, available)) {
+ unsigned long writeback;
+ unsigned long dirty;
+
+ writeback = zone_page_state_snapshot(zone, NR_WRITEBACK);
+ dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY);
+
+ /*
+ * If we didn't make any progress and have a lot of
+ * dirty + writeback pages then we should wait for
+ * an IO to complete to slow down the reclaim and
+ * prevent from pre mature OOM
+ */
+ if (!did_some_progress && 2*(writeback + dirty) > reclaimable) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ return true;
+ }
+
+ /*
+ * Memory allocation/reclaim might be called from a WQ
+ * context and the current implementation of the WQ
+ * concurrency control doesn't recognize that
+ * a particular WQ is congested if the worker thread is
+ * looping without ever sleeping. Therefore we have to
+ * do a short sleep here rather than calling
+ * cond_resched().
+ */
+ if (current->flags & PF_WQ_WORKER)
+ schedule_timeout(1);
+ else
+ cond_resched();
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
@@ -2983,11 +3191,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
struct page *page = NULL;
int alloc_flags;
- unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
enum migrate_mode migration_mode = MIGRATE_ASYNC;
bool deferred_compaction = false;
int contended_compaction = COMPACT_CONTENDED_NONE;
+ int no_progress_loops = 0;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -3147,14 +3355,19 @@ retry:
if (gfp_mask & __GFP_NORETRY)
goto noretry;
- /* Keep reclaiming pages as long as there is reasonable progress */
- pages_reclaimed += did_some_progress;
- if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
- ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
- /* Wait for some write requests to complete then retry */
- wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
+ /*
+ * Costly allocations might have made a progress but this doesn't mean
+ * their order will become available due to high fragmentation so do
+ * not reset the no progress counter for them
+ */
+ if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
+ no_progress_loops = 0;
+ else
+ no_progress_loops++;
+
+ if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
+ did_some_progress > 0, no_progress_loops))
goto retry;
- }
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
@@ -3162,8 +3375,10 @@ retry:
goto got_pg;
/* Retry as long as the OOM killer is making progress */
- if (did_some_progress)
+ if (did_some_progress) {
+ no_progress_loops = 0;
goto retry;
+ }
noretry:
/*
@@ -4491,6 +4706,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long pfn;
unsigned long nr_initialised = 0;
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ struct memblock_region *r = NULL, *tmp;
+#endif
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
@@ -4504,20 +4722,51 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
- * There can be holes in boot-time mem_map[]s
- * handed to this function. They do not
- * exist on hotplugged memory.
+ * There can be holes in boot-time mem_map[]s handed to this
+ * function. They do not exist on hotplugged memory.
*/
- if (context == MEMMAP_EARLY) {
- if (!early_pfn_valid(pfn))
+ if (context != MEMMAP_EARLY)
+ goto not_early;
+
+ if (!early_pfn_valid(pfn))
+ continue;
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
+ break;
+
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ /*
+ * If not mirrored_kernelcore and ZONE_MOVABLE exists, range
+ * from zone_movable_pfn[nid] to end of each node should be
+ * ZONE_MOVABLE not ZONE_NORMAL. skip it.
+ */
+ if (!mirrored_kernelcore && zone_movable_pfn[nid])
+ if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
continue;
- if (!early_pfn_in_nid(pfn, nid))
+
+ /*
+ * Check given memblock attribute by firmware which can affect
+ * kernel memory layout. If zone==ZONE_MOVABLE but memory is
+ * mirrored, it's an overlapped memmap init. skip it.
+ */
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+ if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
+ for_each_memblock(memory, tmp)
+ if (pfn < memblock_region_memory_end_pfn(tmp))
+ break;
+ r = tmp;
+ }
+ if (pfn >= memblock_region_memory_base_pfn(r) &&
+ memblock_is_mirror(r)) {
+ /* already initialized as NORMAL */
+ pfn = memblock_region_memory_end_pfn(r);
continue;
- if (!update_defer_init(pgdat, pfn, end_pfn,
- &nr_initialised))
- break;
+ }
}
+#endif
+not_early:
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
@@ -4934,11 +5183,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
*zone_end_pfn = min(node_end_pfn,
arch_zone_highest_possible_pfn[movable_zone]);
- /* Adjust for ZONE_MOVABLE starting within this range */
- } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
- *zone_end_pfn > zone_movable_pfn[nid]) {
- *zone_end_pfn = zone_movable_pfn[nid];
-
/* Check if this whole range is within ZONE_MOVABLE */
} else if (*zone_start_pfn >= zone_movable_pfn[nid])
*zone_start_pfn = *zone_end_pfn;
@@ -4953,31 +5197,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn,
unsigned long *ignored)
{
- unsigned long zone_start_pfn, zone_end_pfn;
-
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
/* Get the start and end of the zone */
- zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
- zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+ *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
- &zone_start_pfn, &zone_end_pfn);
+ zone_start_pfn, zone_end_pfn);
/* Check that this node has pages within the zone's required range */
- if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+ if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
return 0;
/* Move the zone boundaries inside the node if necessary */
- zone_end_pfn = min(zone_end_pfn, node_end_pfn);
- zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+ *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+ *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
/* Return the spanned pages */
- return zone_end_pfn - zone_start_pfn;
+ return *zone_end_pfn - *zone_start_pfn;
}
/*
@@ -5023,6 +5267,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
unsigned long zone_start_pfn, zone_end_pfn;
+ unsigned long nr_absent;
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
@@ -5034,7 +5279,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
&zone_start_pfn, &zone_end_pfn);
- return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+ nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+
+ /*
+ * ZONE_MOVABLE handling.
+ * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
+ * and vice versa.
+ */
+ if (zone_movable_pfn[nid]) {
+ if (mirrored_kernelcore) {
+ unsigned long start_pfn, end_pfn;
+ struct memblock_region *r;
+
+ for_each_memblock(memory, r) {
+ start_pfn = clamp(memblock_region_memory_base_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(memblock_region_memory_end_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+
+ if (zone_type == ZONE_MOVABLE &&
+ memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+
+ if (zone_type == ZONE_NORMAL &&
+ !memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+ }
+ } else {
+ if (zone_type == ZONE_NORMAL)
+ nr_absent += node_end_pfn - zone_movable_pfn[nid];
+ }
+ }
+
+ return nr_absent;
}
#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
@@ -5042,8 +5319,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn,
unsigned long *zones_size)
{
+ unsigned int zone;
+
+ *zone_start_pfn = node_start_pfn;
+ for (zone = 0; zone < zone_type; zone++)
+ *zone_start_pfn += zones_size[zone];
+
+ *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
+
return zones_size[zone_type];
}
@@ -5072,15 +5359,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
+ unsigned long zone_start_pfn, zone_end_pfn;
unsigned long size, real_size;
size = zone_spanned_pages_in_node(pgdat->node_id, i,
node_start_pfn,
node_end_pfn,
+ &zone_start_pfn,
+ &zone_end_pfn,
zones_size);
real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
node_start_pfn, node_end_pfn,
zholes_size);
+ if (size)
+ zone->zone_start_pfn = zone_start_pfn;
+ else
+ zone->zone_start_pfn = 0;
zone->spanned_pages = size;
zone->present_pages = real_size;
@@ -5201,7 +5495,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
- unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
pgdat_resize_init(pgdat);
@@ -5217,11 +5510,15 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
+#ifdef CONFIG_COMPACTION
+ init_waitqueue_head(&pgdat->kcompactd_wait);
+#endif
pgdat_page_ext_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
size = zone->spanned_pages;
realsize = freesize = zone->present_pages;
@@ -5290,7 +5587,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
ret = init_currently_empty_zone(zone, zone_start_pfn, size);
BUG_ON(ret);
memmap_init(size, nid, j, zone_start_pfn);
- zone_start_pfn += size;
}
}
@@ -5358,6 +5654,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+#else
+ start_pfn = node_start_pfn;
#endif
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
zones_size, zholes_size);
@@ -5529,6 +5827,36 @@ static void __init find_zone_movable_pfns_for_nodes(void)
}
/*
+ * If kernelcore=mirror is specified, ignore movablecore option
+ */
+ if (mirrored_kernelcore) {
+ bool mem_below_4gb_not_mirrored = false;
+
+ for_each_memblock(memory, r) {
+ if (memblock_is_mirror(r))
+ continue;
+
+ nid = r->nid;
+
+ usable_startpfn = memblock_region_memory_base_pfn(r);
+
+ if (usable_startpfn < 0x100000) {
+ mem_below_4gb_not_mirrored = true;
+ continue;
+ }
+
+ zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+ min(usable_startpfn, zone_movable_pfn[nid]) :
+ usable_startpfn;
+ }
+
+ if (mem_below_4gb_not_mirrored)
+ pr_warn("This configuration results in unmirrored kernel memory.");
+
+ goto out2;
+ }
+
+ /*
* If movablecore=nn[KMG] was specified, calculate what size of
* kernelcore that corresponds so that memory usable for
* any allocation type is evenly spread. If both kernelcore
@@ -5788,6 +6116,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core)
*/
static int __init cmdline_parse_kernelcore(char *p)
{
+ /* parse kernelcore=mirror */
+ if (parse_option_str(p, "mirror")) {
+ mirrored_kernelcore = true;
+ return 0;
+ }
+
return cmdline_parse_core(p, &required_kernelcore);
}
@@ -6927,7 +7261,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
}
#endif
-#ifdef CONFIG_MEMORY_FAILURE
bool is_free_buddy_page(struct page *page)
{
struct zone *zone = page_zone(page);
@@ -6946,4 +7279,3 @@ bool is_free_buddy_page(struct page *page)
return order < MAX_ORDER;
}
-#endif
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 983c3a10fa07..44ad1f00c4e1 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -5,10 +5,12 @@
#include <linux/bootmem.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
+#include <linux/jump_label.h>
+#include <linux/migrate.h>
#include "internal.h"
static bool page_owner_disabled = true;
-bool page_owner_inited __read_mostly;
+DEFINE_STATIC_KEY_FALSE(page_owner_inited);
static void init_early_allocated_pages(void);
@@ -37,7 +39,7 @@ static void init_page_owner(void)
if (page_owner_disabled)
return;
- page_owner_inited = true;
+ static_branch_enable(&page_owner_inited);
init_early_allocated_pages();
}
@@ -72,10 +74,18 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
page_ext->order = order;
page_ext->gfp_mask = gfp_mask;
page_ext->nr_entries = trace.nr_entries;
+ page_ext->last_migrate_reason = -1;
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
+void __set_page_owner_migrate_reason(struct page *page, int reason)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+
+ page_ext->last_migrate_reason = reason;
+}
+
gfp_t __get_page_owner_gfp(struct page *page)
{
struct page_ext *page_ext = lookup_page_ext(page);
@@ -83,6 +93,31 @@ gfp_t __get_page_owner_gfp(struct page *page)
return page_ext->gfp_mask;
}
+void __copy_page_owner(struct page *oldpage, struct page *newpage)
+{
+ struct page_ext *old_ext = lookup_page_ext(oldpage);
+ struct page_ext *new_ext = lookup_page_ext(newpage);
+ int i;
+
+ new_ext->order = old_ext->order;
+ new_ext->gfp_mask = old_ext->gfp_mask;
+ new_ext->nr_entries = old_ext->nr_entries;
+
+ for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++)
+ new_ext->trace_entries[i] = old_ext->trace_entries[i];
+
+ /*
+ * We don't clear the bit on the oldpage as it's going to be freed
+ * after migration. Until then, the info can be useful in case of
+ * a bug, and the overal stats will be off a bit only temporarily.
+ * Also, migrate_misplaced_transhuge_page() can still fail the
+ * migration and then we want the oldpage to retain the info. But
+ * in that case we also don't need to explicitly clear the info from
+ * the new page, which will be freed.
+ */
+ __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
+}
+
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
struct page *page, struct page_ext *page_ext)
@@ -100,8 +135,9 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
return -ENOMEM;
ret = snprintf(kbuf, count,
- "Page allocated via order %u, mask 0x%x\n",
- page_ext->order, page_ext->gfp_mask);
+ "Page allocated via order %u, mask %#x(%pGg)\n",
+ page_ext->order, page_ext->gfp_mask,
+ &page_ext->gfp_mask);
if (ret >= count)
goto err;
@@ -110,23 +146,12 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
pageblock_mt = get_pfnblock_migratetype(page, pfn);
page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
- "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n",
+ "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
pfn,
+ migratetype_names[page_mt],
pfn >> pageblock_order,
- pageblock_mt,
- pageblock_mt != page_mt ? "Fallback" : " ",
- PageLocked(page) ? "K" : " ",
- PageError(page) ? "E" : " ",
- PageReferenced(page) ? "R" : " ",
- PageUptodate(page) ? "U" : " ",
- PageDirty(page) ? "D" : " ",
- PageLRU(page) ? "L" : " ",
- PageActive(page) ? "A" : " ",
- PageSlab(page) ? "S" : " ",
- PageWriteback(page) ? "W" : " ",
- PageCompound(page) ? "C" : " ",
- PageSwapCache(page) ? "B" : " ",
- PageMappedToDisk(page) ? "M" : " ");
+ migratetype_names[pageblock_mt],
+ page->flags, &page->flags);
if (ret >= count)
goto err;
@@ -135,6 +160,14 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
if (ret >= count)
goto err;
+ if (page_ext->last_migrate_reason != -1) {
+ ret += snprintf(kbuf + ret, count - ret,
+ "Page has been migrated, last migrate reason: %s\n",
+ migrate_reason_names[page_ext->last_migrate_reason]);
+ if (ret >= count)
+ goto err;
+ }
+
ret += snprintf(kbuf + ret, count - ret, "\n");
if (ret >= count)
goto err;
@@ -150,6 +183,31 @@ err:
return -ENOMEM;
}
+void __dump_page_owner(struct page *page)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+ struct stack_trace trace = {
+ .nr_entries = page_ext->nr_entries,
+ .entries = &page_ext->trace_entries[0],
+ };
+ gfp_t gfp_mask = page_ext->gfp_mask;
+ int mt = gfpflags_to_migratetype(gfp_mask);
+
+ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
+ pr_alert("page_owner info is not active (free page?)\n");
+ return;
+ }
+
+ pr_alert("page allocated via order %u, migratetype %s, "
+ "gfp_mask %#x(%pGg)\n", page_ext->order,
+ migratetype_names[mt], gfp_mask, &gfp_mask);
+ print_stack_trace(&trace, 0);
+
+ if (page_ext->last_migrate_reason != -1)
+ pr_alert("page has been migrated, last migrate reason: %s\n",
+ migrate_reason_names[page_ext->last_migrate_reason]);
+}
+
static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
@@ -157,7 +215,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
struct page *page;
struct page_ext *page_ext;
- if (!page_owner_inited)
+ if (!static_branch_unlikely(&page_owner_inited))
return -EINVAL;
page = NULL;
@@ -305,7 +363,7 @@ static int __init pageowner_init(void)
{
struct dentry *dentry;
- if (!page_owner_inited) {
+ if (!static_branch_unlikely(&page_owner_inited)) {
pr_info("page_owner is disabled\n");
return 0;
}
diff --git a/mm/page_poison.c b/mm/page_poison.c
new file mode 100644
index 000000000000..312b131d5875
--- /dev/null
+++ b/mm/page_poison.c
@@ -0,0 +1,158 @@
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/page_ext.h>
+#include <linux/poison.h>
+#include <linux/ratelimit.h>
+
+static bool __page_poisoning_enabled __read_mostly;
+static bool want_page_poisoning __read_mostly =
+ !IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC);
+
+static int early_page_poison_param(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ if (strcmp(buf, "on") == 0)
+ want_page_poisoning = true;
+ else if (strcmp(buf, "off") == 0)
+ want_page_poisoning = false;
+
+ return 0;
+}
+early_param("page_poison", early_page_poison_param);
+
+bool page_poisoning_enabled(void)
+{
+ return __page_poisoning_enabled;
+}
+
+static bool need_page_poisoning(void)
+{
+ return want_page_poisoning;
+}
+
+static void init_page_poisoning(void)
+{
+ if (!want_page_poisoning)
+ return;
+
+ __page_poisoning_enabled = true;
+}
+
+struct page_ext_operations page_poisoning_ops = {
+ .need = need_page_poisoning,
+ .init = init_page_poisoning,
+};
+
+static inline void set_page_poison(struct page *page)
+{
+ struct page_ext *page_ext;
+
+ page_ext = lookup_page_ext(page);
+ __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
+}
+
+static inline void clear_page_poison(struct page *page)
+{
+ struct page_ext *page_ext;
+
+ page_ext = lookup_page_ext(page);
+ __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
+}
+
+static inline bool page_poison(struct page *page)
+{
+ struct page_ext *page_ext;
+
+ page_ext = lookup_page_ext(page);
+ return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
+}
+
+static void poison_page(struct page *page)
+{
+ void *addr = kmap_atomic(page);
+
+ set_page_poison(page);
+ memset(addr, PAGE_POISON, PAGE_SIZE);
+ kunmap_atomic(addr);
+}
+
+void poison_pages(struct page *page, int n)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ poison_page(page + i);
+}
+
+static bool single_bit_flip(unsigned char a, unsigned char b)
+{
+ unsigned char error = a ^ b;
+
+ return error && !(error & (error - 1));
+}
+
+static void check_poison_mem(unsigned char *mem, size_t bytes)
+{
+ static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
+ unsigned char *start;
+ unsigned char *end;
+
+ if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
+ return;
+
+ start = memchr_inv(mem, PAGE_POISON, bytes);
+ if (!start)
+ return;
+
+ for (end = mem + bytes - 1; end > start; end--) {
+ if (*end != PAGE_POISON)
+ break;
+ }
+
+ if (!__ratelimit(&ratelimit))
+ return;
+ else if (start == end && single_bit_flip(*start, PAGE_POISON))
+ printk(KERN_ERR "pagealloc: single bit error\n");
+ else
+ printk(KERN_ERR "pagealloc: memory corruption\n");
+
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
+ end - start + 1, 1);
+ dump_stack();
+}
+
+static void unpoison_page(struct page *page)
+{
+ void *addr;
+
+ if (!page_poison(page))
+ return;
+
+ addr = kmap_atomic(page);
+ check_poison_mem(addr, PAGE_SIZE);
+ clear_page_poison(page);
+ kunmap_atomic(addr);
+}
+
+void unpoison_pages(struct page *page, int n)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ unpoison_page(page + i);
+}
+
+void kernel_poison_pages(struct page *page, int numpages, int enable)
+{
+ if (!page_poisoning_enabled())
+ return;
+
+ if (enable)
+ unpoison_pages(page, numpages);
+ else
+ poison_pages(page, numpages);
+}
diff --git a/mm/rmap.c b/mm/rmap.c
index 79f3bf047f38..02f0bfc3c80a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1287,21 +1287,17 @@ void page_add_new_anon_rmap(struct page *page,
*/
void page_add_file_rmap(struct page *page)
{
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
}
static void page_remove_file_rmap(struct page *page)
{
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
/* Hugepages are not counted in NR_FILE_MAPPED for now. */
if (unlikely(PageHuge(page))) {
@@ -1320,12 +1316,12 @@ static void page_remove_file_rmap(struct page *page)
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
__dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
out:
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
}
static void page_remove_anon_compound_rmap(struct page *page)
diff --git a/mm/shmem.c b/mm/shmem.c
index 440e2a7e6c1c..b8e8369676be 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -376,28 +376,23 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
if (iter.index >= end)
break;
page = radix_tree_deref_slot(slot);
- /*
- * This should only be possible to happen at index 0, so we
- * don't need to reset the counter, nor do we risk infinite
- * restarts.
- */
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
if (radix_tree_exceptional_entry(page))
swapped++;
if (need_resched()) {
cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
+ slot = radix_tree_iter_next(&iter);
}
}
@@ -1116,7 +1111,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*/
oldpage = newpage;
} else {
- mem_cgroup_replace_page(oldpage, newpage);
+ mem_cgroup_migrate(oldpage, newpage);
lru_cache_add_anon(newpage);
*pagep = newpage;
}
@@ -1947,12 +1942,13 @@ static void shmem_tag_pins(struct address_space *mapping)
start = 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
page = radix_tree_deref_slot(slot);
if (!page || radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
} else if (page_count(page) - page_mapcount(page) > 1) {
spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_set(&mapping->page_tree, iter.index,
@@ -1962,8 +1958,7 @@ restart:
if (need_resched()) {
cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
+ slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
@@ -2000,14 +1995,15 @@ static int shmem_wait_for_pins(struct address_space *mapping)
start = 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
start, SHMEM_TAG_PINNED) {
page = radix_tree_deref_slot(slot);
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
page = NULL;
}
@@ -2032,8 +2028,7 @@ restart:
continue_resched:
if (need_resched()) {
cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
+ slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
diff --git a/mm/slab.c b/mm/slab.c
index 6ecc697a8bc4..f652b967e1da 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -169,12 +169,6 @@ typedef unsigned short freelist_idx_t;
#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
/*
- * true if a page was allocated from pfmemalloc reserves for network-based
- * swap
- */
-static bool pfmemalloc_active __read_mostly;
-
-/*
* struct array_cache
*
* Purpose:
@@ -195,10 +189,6 @@ struct array_cache {
* Must have this definition in here for the proper
* alignment of array_cache. Also simplifies accessing
* the entries.
- *
- * Entries should not be directly dereferenced as
- * entries belonging to slabs marked pfmemalloc will
- * have the lower bits set SLAB_OBJ_PFMEMALLOC
*/
};
@@ -207,33 +197,6 @@ struct alien_cache {
struct array_cache ac;
};
-#define SLAB_OBJ_PFMEMALLOC 1
-static inline bool is_obj_pfmemalloc(void *objp)
-{
- return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
-}
-
-static inline void set_obj_pfmemalloc(void **objp)
-{
- *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
- return;
-}
-
-static inline void clear_obj_pfmemalloc(void **objp)
-{
- *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
-}
-
-/*
- * bootstrap: The caches do not work without cpuarrays anymore, but the
- * cpuarrays are allocated from the generic caches...
- */
-#define BOOT_CPUCACHE_ENTRIES 1
-struct arraycache_init {
- struct array_cache cache;
- void *entries[BOOT_CPUCACHE_ENTRIES];
-};
-
/*
* Need this for bootstrapping a per node allocator.
*/
@@ -280,9 +243,10 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
} while (0)
+#define CFLGS_OBJFREELIST_SLAB (0x40000000UL)
#define CFLGS_OFF_SLAB (0x80000000UL)
+#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB)
#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
-#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1))
#define BATCHREFILL_LIMIT 16
/*
@@ -390,36 +354,26 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
#endif
-#define OBJECT_FREE (0)
-#define OBJECT_ACTIVE (1)
-
#ifdef CONFIG_DEBUG_SLAB_LEAK
-static void set_obj_status(struct page *page, int idx, int val)
+static inline bool is_store_user_clean(struct kmem_cache *cachep)
{
- int freelist_size;
- char *status;
- struct kmem_cache *cachep = page->slab_cache;
-
- freelist_size = cachep->num * sizeof(freelist_idx_t);
- status = (char *)page->freelist + freelist_size;
- status[idx] = val;
+ return atomic_read(&cachep->store_user_clean) == 1;
}
-static inline unsigned int get_obj_status(struct page *page, int idx)
+static inline void set_store_user_clean(struct kmem_cache *cachep)
{
- int freelist_size;
- char *status;
- struct kmem_cache *cachep = page->slab_cache;
-
- freelist_size = cachep->num * sizeof(freelist_idx_t);
- status = (char *)page->freelist + freelist_size;
+ atomic_set(&cachep->store_user_clean, 1);
+}
- return status[idx];
+static inline void set_store_user_dirty(struct kmem_cache *cachep)
+{
+ if (is_store_user_clean(cachep))
+ atomic_set(&cachep->store_user_clean, 0);
}
#else
-static inline void set_obj_status(struct page *page, int idx, int val) {}
+static inline void set_store_user_dirty(struct kmem_cache *cachep) {}
#endif
@@ -457,6 +411,7 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache,
return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
+#define BOOT_CPUCACHE_ENTRIES 1
/* internal cache of cache description objs */
static struct kmem_cache kmem_cache_boot = {
.batchcount = 1,
@@ -475,61 +430,12 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
return this_cpu_ptr(cachep->cpu_cache);
}
-static size_t calculate_freelist_size(int nr_objs, size_t align)
-{
- size_t freelist_size;
-
- freelist_size = nr_objs * sizeof(freelist_idx_t);
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- freelist_size += nr_objs * sizeof(char);
-
- if (align)
- freelist_size = ALIGN(freelist_size, align);
-
- return freelist_size;
-}
-
-static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
- size_t idx_size, size_t align)
-{
- int nr_objs;
- size_t remained_size;
- size_t freelist_size;
- int extra_space = 0;
-
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- extra_space = sizeof(char);
- /*
- * Ignore padding for the initial guess. The padding
- * is at most @align-1 bytes, and @buffer_size is at
- * least @align. In the worst case, this result will
- * be one greater than the number of objects that fit
- * into the memory allocation when taking the padding
- * into account.
- */
- nr_objs = slab_size / (buffer_size + idx_size + extra_space);
-
- /*
- * This calculated number will be either the right
- * amount, or one greater than what we want.
- */
- remained_size = slab_size - nr_objs * buffer_size;
- freelist_size = calculate_freelist_size(nr_objs, align);
- if (remained_size < freelist_size)
- nr_objs--;
-
- return nr_objs;
-}
-
/*
* Calculate the number of objects and left-over bytes for a given buffer size.
*/
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
- size_t align, int flags, size_t *left_over,
- unsigned int *num)
+ unsigned long flags, size_t *left_over, unsigned int *num)
{
- int nr_objs;
- size_t mgmt_size;
size_t slab_size = PAGE_SIZE << gfporder;
/*
@@ -537,26 +443,26 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
* on it. For the latter case, the memory allocated for a
* slab is used for:
*
- * - One unsigned int for each object
- * - Padding to respect alignment of @align
* - @buffer_size bytes for each object
+ * - One freelist_idx_t for each object
+ *
+ * We don't need to consider alignment of freelist because
+ * freelist will be at the end of slab page. The objects will be
+ * at the correct alignment.
*
* If the slab management structure is off the slab, then the
* alignment will already be calculated into the size. Because
* the slabs are all pages aligned, the objects will be at the
* correct alignment when allocated.
*/
- if (flags & CFLGS_OFF_SLAB) {
- mgmt_size = 0;
- nr_objs = slab_size / buffer_size;
-
+ if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) {
+ *num = slab_size / buffer_size;
+ *left_over = slab_size % buffer_size;
} else {
- nr_objs = calculate_nr_objs(slab_size, buffer_size,
- sizeof(freelist_idx_t), align);
- mgmt_size = calculate_freelist_size(nr_objs, align);
+ *num = slab_size / (buffer_size + sizeof(freelist_idx_t));
+ *left_over = slab_size %
+ (buffer_size + sizeof(freelist_idx_t));
}
- *num = nr_objs;
- *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}
#if DEBUG
@@ -687,120 +593,24 @@ static struct array_cache *alloc_arraycache(int node, int entries,
return ac;
}
-static inline bool is_slab_pfmemalloc(struct page *page)
+static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
+ void *objp)
{
- return PageSlabPfmemalloc(page);
-}
-
-/* Clears pfmemalloc_active if no slabs have pfmalloc set */
-static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
- struct array_cache *ac)
-{
- struct kmem_cache_node *n = get_node(cachep, numa_mem_id());
- struct page *page;
- unsigned long flags;
-
- if (!pfmemalloc_active)
- return;
-
- spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->slabs_full, lru)
- if (is_slab_pfmemalloc(page))
- goto out;
-
- list_for_each_entry(page, &n->slabs_partial, lru)
- if (is_slab_pfmemalloc(page))
- goto out;
-
- list_for_each_entry(page, &n->slabs_free, lru)
- if (is_slab_pfmemalloc(page))
- goto out;
-
- pfmemalloc_active = false;
-out:
- spin_unlock_irqrestore(&n->list_lock, flags);
-}
-
-static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
- gfp_t flags, bool force_refill)
-{
- int i;
- void *objp = ac->entry[--ac->avail];
-
- /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
- if (unlikely(is_obj_pfmemalloc(objp))) {
- struct kmem_cache_node *n;
-
- if (gfp_pfmemalloc_allowed(flags)) {
- clear_obj_pfmemalloc(&objp);
- return objp;
- }
-
- /* The caller cannot use PFMEMALLOC objects, find another one */
- for (i = 0; i < ac->avail; i++) {
- /* If a !PFMEMALLOC object is found, swap them */
- if (!is_obj_pfmemalloc(ac->entry[i])) {
- objp = ac->entry[i];
- ac->entry[i] = ac->entry[ac->avail];
- ac->entry[ac->avail] = objp;
- return objp;
- }
- }
-
- /*
- * If there are empty slabs on the slabs_free list and we are
- * being forced to refill the cache, mark this one !pfmemalloc.
- */
- n = get_node(cachep, numa_mem_id());
- if (!list_empty(&n->slabs_free) && force_refill) {
- struct page *page = virt_to_head_page(objp);
- ClearPageSlabPfmemalloc(page);
- clear_obj_pfmemalloc(&objp);
- recheck_pfmemalloc_active(cachep, ac);
- return objp;
- }
-
- /* No !PFMEMALLOC objects available */
- ac->avail++;
- objp = NULL;
- }
-
- return objp;
-}
-
-static inline void *ac_get_obj(struct kmem_cache *cachep,
- struct array_cache *ac, gfp_t flags, bool force_refill)
-{
- void *objp;
+ struct page *page = virt_to_head_page(objp);
+ struct kmem_cache_node *n;
+ int page_node;
+ LIST_HEAD(list);
- if (unlikely(sk_memalloc_socks()))
- objp = __ac_get_obj(cachep, ac, flags, force_refill);
- else
- objp = ac->entry[--ac->avail];
+ if (unlikely(PageSlabPfmemalloc(page))) {
+ page_node = page_to_nid(page);
+ n = get_node(cachep, page_node);
- return objp;
-}
+ spin_lock(&n->list_lock);
+ free_block(cachep, &objp, 1, page_node, &list);
+ spin_unlock(&n->list_lock);
-static noinline void *__ac_put_obj(struct kmem_cache *cachep,
- struct array_cache *ac, void *objp)
-{
- if (unlikely(pfmemalloc_active)) {
- /* Some pfmemalloc slabs exist, check if this is one */
- struct page *page = virt_to_head_page(objp);
- if (PageSlabPfmemalloc(page))
- set_obj_pfmemalloc(&objp);
+ slabs_destroy(cachep, &list);
}
-
- return objp;
-}
-
-static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
- void *objp)
-{
- if (unlikely(sk_memalloc_socks()))
- objp = __ac_put_obj(cachep, ac, objp);
-
- ac->entry[ac->avail++] = objp;
}
/*
@@ -1003,7 +813,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
STATS_INC_ACOVERFLOW(cachep);
__drain_alien_cache(cachep, ac, page_node, &list);
}
- ac_put_obj(cachep, ac, objp);
+ ac->entry[ac->avail++] = objp;
spin_unlock(&alien->lock);
slabs_destroy(cachep, &list);
} else {
@@ -1604,10 +1414,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
return NULL;
}
- /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
- if (page_is_pfmemalloc(page))
- pfmemalloc_active = true;
-
nr_pages = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
add_zone_page_state(page_zone(page),
@@ -1615,8 +1421,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
else
add_zone_page_state(page_zone(page),
NR_SLAB_UNRECLAIMABLE, nr_pages);
+
__SetPageSlab(page);
- if (page_is_pfmemalloc(page))
+ /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
+ if (sk_memalloc_socks() && page_is_pfmemalloc(page))
SetPageSlabPfmemalloc(page);
if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
@@ -1636,9 +1444,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
*/
static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
{
- const unsigned long nr_freed = (1 << cachep->gfporder);
+ int order = cachep->gfporder;
+ unsigned long nr_freed = (1 << order);
- kmemcheck_free_shadow(page, cachep->gfporder);
+ kmemcheck_free_shadow(page, order);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
sub_zone_page_state(page_zone(page),
@@ -1655,7 +1464,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
- __free_kmem_pages(page, cachep->gfporder);
+ memcg_uncharge_slab(page, order, cachep);
+ __free_pages(page, order);
}
static void kmem_rcu_free(struct rcu_head *head)
@@ -1670,6 +1480,14 @@ static void kmem_rcu_free(struct rcu_head *head)
}
#if DEBUG
+static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
+{
+ if (debug_pagealloc_enabled() && OFF_SLAB(cachep) &&
+ (cachep->size % PAGE_SIZE) == 0)
+ return true;
+
+ return false;
+}
#ifdef CONFIG_DEBUG_PAGEALLOC
static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
@@ -1703,6 +1521,23 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
}
*addr++ = 0x87654321;
}
+
+static void slab_kernel_map(struct kmem_cache *cachep, void *objp,
+ int map, unsigned long caller)
+{
+ if (!is_debug_pagealloc_cache(cachep))
+ return;
+
+ if (caller)
+ store_stackinfo(cachep, objp, caller);
+
+ kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
+}
+
+#else
+static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
+ int map, unsigned long caller) {}
+
#endif
static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
@@ -1781,6 +1616,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
int size, i;
int lines = 0;
+ if (is_debug_pagealloc_cache(cachep))
+ return;
+
realobj = (char *)objp + obj_offset(cachep);
size = cachep->object_size;
@@ -1842,20 +1680,18 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
struct page *page)
{
int i;
+
+ if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) {
+ poison_obj(cachep, page->freelist - obj_offset(cachep),
+ POISON_FREE);
+ }
+
for (i = 0; i < cachep->num; i++) {
void *objp = index_to_obj(cachep, page, i);
if (cachep->flags & SLAB_POISON) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (cachep->size % PAGE_SIZE == 0 &&
- OFF_SLAB(cachep))
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 1);
- else
- check_poison_obj(cachep, objp);
-#else
check_poison_obj(cachep, objp);
-#endif
+ slab_kernel_map(cachep, objp, 1, 0);
}
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
@@ -1916,7 +1752,6 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
* calculate_slab_order - calculate size (page order) of slabs
* @cachep: pointer to the cache that is being created
* @size: size of objects to be created in this cache.
- * @align: required alignment for the objects.
* @flags: slab allocation flags
*
* Also calculates the number of objects per slab.
@@ -1926,9 +1761,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
* towards high-order requests, this should be changed.
*/
static size_t calculate_slab_order(struct kmem_cache *cachep,
- size_t size, size_t align, unsigned long flags)
+ size_t size, unsigned long flags)
{
- unsigned long offslab_limit;
size_t left_over = 0;
int gfporder;
@@ -1936,7 +1770,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
unsigned int num;
size_t remainder;
- cache_estimate(gfporder, size, align, flags, &remainder, &num);
+ cache_estimate(gfporder, size, flags, &remainder, &num);
if (!num)
continue;
@@ -1945,19 +1779,24 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
break;
if (flags & CFLGS_OFF_SLAB) {
- size_t freelist_size_per_obj = sizeof(freelist_idx_t);
+ struct kmem_cache *freelist_cache;
+ size_t freelist_size;
+
+ freelist_size = num * sizeof(freelist_idx_t);
+ freelist_cache = kmalloc_slab(freelist_size, 0u);
+ if (!freelist_cache)
+ continue;
+
/*
- * Max number of objs-per-slab for caches which
- * use off-slab slabs. Needed to avoid a possible
- * looping condition in cache_grow().
+ * Needed to avoid possible looping condition
+ * in cache_grow()
*/
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- freelist_size_per_obj += sizeof(char);
- offslab_limit = size;
- offslab_limit /= freelist_size_per_obj;
+ if (OFF_SLAB(freelist_cache))
+ continue;
- if (num > offslab_limit)
- break;
+ /* check if off slab has enough benefit */
+ if (freelist_cache->size > cachep->size / 2)
+ continue;
}
/* Found something acceptable - save it away */
@@ -2075,6 +1914,79 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
return cachep;
}
+static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
+ size_t size, unsigned long flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU)
+ return false;
+
+ left = calculate_slab_order(cachep, size,
+ flags | CFLGS_OBJFREELIST_SLAB);
+ if (!cachep->num)
+ return false;
+
+ if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size)
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
+static bool set_off_slab_cache(struct kmem_cache *cachep,
+ size_t size, unsigned long flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ /*
+ * Always use on-slab management when SLAB_NOLEAKTRACE
+ * to avoid recursive calls into kmemleak.
+ */
+ if (flags & SLAB_NOLEAKTRACE)
+ return false;
+
+ /*
+ * Size is large, assume best to place the slab management obj
+ * off-slab (should allow better packing of objs).
+ */
+ left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB);
+ if (!cachep->num)
+ return false;
+
+ /*
+ * If the slab has been placed off-slab, and we have enough space then
+ * move it on-slab. This is at the expense of any extra colouring.
+ */
+ if (left >= cachep->num * sizeof(freelist_idx_t))
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
+static bool set_on_slab_cache(struct kmem_cache *cachep,
+ size_t size, unsigned long flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ left = calculate_slab_order(cachep, size, flags);
+ if (!cachep->num)
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
/**
* __kmem_cache_create - Create a cache.
* @cachep: cache management descriptor
@@ -2099,7 +2011,6 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
- size_t left_over, freelist_size;
size_t ralign = BYTES_PER_WORD;
gfp_t gfp;
int err;
@@ -2119,8 +2030,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (!(flags & SLAB_DESTROY_BY_RCU))
flags |= SLAB_POISON;
#endif
- if (flags & SLAB_DESTROY_BY_RCU)
- BUG_ON(flags & SLAB_POISON);
#endif
/*
@@ -2152,6 +2061,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
* 4) Store it.
*/
cachep->align = ralign;
+ cachep->colour_off = cache_line_size();
+ /* Offset must be a multiple of the alignment. */
+ if (cachep->colour_off < cachep->align)
+ cachep->colour_off = cachep->align;
if (slab_is_available())
gfp = GFP_KERNEL;
@@ -2179,37 +2092,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
else
size += BYTES_PER_WORD;
}
-#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
- /*
- * To activate debug pagealloc, off-slab management is necessary
- * requirement. In early phase of initialization, small sized slab
- * doesn't get initialized so it would not be possible. So, we need
- * to check size >= 256. It guarantees that all necessary small
- * sized slab is initialized in current slab initialization sequence.
- */
- if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) &&
- size >= 256 && cachep->object_size > cache_line_size() &&
- ALIGN(size, cachep->align) < PAGE_SIZE) {
- cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
- size = PAGE_SIZE;
- }
-#endif
#endif
- /*
- * Determine if the slab management is 'on' or 'off' slab.
- * (bootstrapping cannot cope with offslab caches so don't do
- * it too early on. Always use on-slab management when
- * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
- */
- if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init &&
- !(flags & SLAB_NOLEAKTRACE))
- /*
- * Size is large, assume best to place the slab management obj
- * off-slab (should allow better packing of objs).
- */
- flags |= CFLGS_OFF_SLAB;
-
size = ALIGN(size, cachep->align);
/*
* We should restrict the number of objects in a slab to implement
@@ -2218,42 +2102,46 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
- left_over = calculate_slab_order(cachep, size, cachep->align, flags);
-
- if (!cachep->num)
- return -E2BIG;
-
- freelist_size = calculate_freelist_size(cachep->num, cachep->align);
-
+#if DEBUG
/*
- * If the slab has been placed off-slab, and we have enough space then
- * move it on-slab. This is at the expense of any extra colouring.
+ * To activate debug pagealloc, off-slab management is necessary
+ * requirement. In early phase of initialization, small sized slab
+ * doesn't get initialized so it would not be possible. So, we need
+ * to check size >= 256. It guarantees that all necessary small
+ * sized slab is initialized in current slab initialization sequence.
*/
- if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
- flags &= ~CFLGS_OFF_SLAB;
- left_over -= freelist_size;
+ if (debug_pagealloc_enabled() && (flags & SLAB_POISON) &&
+ size >= 256 && cachep->object_size > cache_line_size()) {
+ if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
+ size_t tmp_size = ALIGN(size, PAGE_SIZE);
+
+ if (set_off_slab_cache(cachep, tmp_size, flags)) {
+ flags |= CFLGS_OFF_SLAB;
+ cachep->obj_offset += tmp_size - size;
+ size = tmp_size;
+ goto done;
+ }
+ }
}
+#endif
- if (flags & CFLGS_OFF_SLAB) {
- /* really off slab. No need for manual alignment */
- freelist_size = calculate_freelist_size(cachep->num, 0);
+ if (set_objfreelist_slab_cache(cachep, size, flags)) {
+ flags |= CFLGS_OBJFREELIST_SLAB;
+ goto done;
+ }
-#ifdef CONFIG_PAGE_POISONING
- /* If we're going to use the generic kernel_map_pages()
- * poisoning, then it's going to smash the contents of
- * the redzone and userword anyhow, so switch them off.
- */
- if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
- flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
-#endif
+ if (set_off_slab_cache(cachep, size, flags)) {
+ flags |= CFLGS_OFF_SLAB;
+ goto done;
}
- cachep->colour_off = cache_line_size();
- /* Offset must be a multiple of the alignment. */
- if (cachep->colour_off < cachep->align)
- cachep->colour_off = cachep->align;
- cachep->colour = left_over / cachep->colour_off;
- cachep->freelist_size = freelist_size;
+ if (set_on_slab_cache(cachep, size, flags))
+ goto done;
+
+ return -E2BIG;
+
+done:
+ cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
cachep->flags = flags;
cachep->allocflags = __GFP_COMP;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
@@ -2261,21 +2149,26 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
- if (flags & CFLGS_OFF_SLAB) {
- cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
- /*
- * This is a possibility for one of the kmalloc_{dma,}_caches.
- * But since we go off slab only for object size greater than
- * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created
- * in ascending order,this should not happen at all.
- * But leave a BUG_ON for some lucky dude.
- */
- BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
+#if DEBUG
+ /*
+ * If we're going to use the generic kernel_map_pages()
+ * poisoning, then it's going to smash the contents of
+ * the redzone and userword anyhow, so switch them off.
+ */
+ if (IS_ENABLED(CONFIG_PAGE_POISONING) &&
+ (cachep->flags & SLAB_POISON) &&
+ is_debug_pagealloc_cache(cachep))
+ cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
+#endif
+
+ if (OFF_SLAB(cachep)) {
+ cachep->freelist_cache =
+ kmalloc_slab(cachep->freelist_size, 0u);
}
err = setup_cpu_cache(cachep, gfp);
if (err) {
- __kmem_cache_shutdown(cachep);
+ __kmem_cache_release(cachep);
return err;
}
@@ -2377,9 +2270,6 @@ static int drain_freelist(struct kmem_cache *cache,
}
page = list_entry(p, struct page, lru);
-#if DEBUG
- BUG_ON(page->active);
-#endif
list_del(&page->lru);
/*
* Safe to drop the lock. The slab is no longer linked
@@ -2414,12 +2304,13 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
+ return __kmem_cache_shrink(cachep, false);
+}
+
+void __kmem_cache_release(struct kmem_cache *cachep)
+{
int i;
struct kmem_cache_node *n;
- int rc = __kmem_cache_shrink(cachep, false);
-
- if (rc)
- return rc;
free_percpu(cachep->cpu_cache);
@@ -2430,7 +2321,6 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
kfree(n);
cachep->node[i] = NULL;
}
- return 0;
}
/*
@@ -2454,18 +2344,23 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
void *freelist;
void *addr = page_address(page);
- if (OFF_SLAB(cachep)) {
+ page->s_mem = addr + colour_off;
+ page->active = 0;
+
+ if (OBJFREELIST_SLAB(cachep))
+ freelist = NULL;
+ else if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
freelist = kmem_cache_alloc_node(cachep->freelist_cache,
local_flags, nodeid);
if (!freelist)
return NULL;
} else {
- freelist = addr + colour_off;
- colour_off += cachep->freelist_size;
+ /* We will use last bytes at the slab for freelist */
+ freelist = addr + (PAGE_SIZE << cachep->gfporder) -
+ cachep->freelist_size;
}
- page->active = 0;
- page->s_mem = addr + colour_off;
+
return freelist;
}
@@ -2480,17 +2375,14 @@ static inline void set_free_obj(struct page *page,
((freelist_idx_t *)(page->freelist))[idx] = val;
}
-static void cache_init_objs(struct kmem_cache *cachep,
- struct page *page)
+static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
{
+#if DEBUG
int i;
for (i = 0; i < cachep->num; i++) {
void *objp = index_to_obj(cachep, page, i);
-#if DEBUG
- /* need to poison the objs? */
- if (cachep->flags & SLAB_POISON)
- poison_obj(cachep, objp, POISON_FREE);
+
if (cachep->flags & SLAB_STORE_USER)
*dbg_userword(cachep, objp) = NULL;
@@ -2514,15 +2406,32 @@ static void cache_init_objs(struct kmem_cache *cachep,
slab_error(cachep, "constructor overwrote the"
" start of an object");
}
- if ((cachep->size % PAGE_SIZE) == 0 &&
- OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 0);
-#else
- if (cachep->ctor)
- cachep->ctor(objp);
+ /* need to poison the objs? */
+ if (cachep->flags & SLAB_POISON) {
+ poison_obj(cachep, objp, POISON_FREE);
+ slab_kernel_map(cachep, objp, 0, 0);
+ }
+ }
#endif
- set_obj_status(page, i, OBJECT_FREE);
+}
+
+static void cache_init_objs(struct kmem_cache *cachep,
+ struct page *page)
+{
+ int i;
+
+ cache_init_objs_debug(cachep, page);
+
+ if (OBJFREELIST_SLAB(cachep)) {
+ page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
+ obj_offset(cachep);
+ }
+
+ for (i = 0; i < cachep->num; i++) {
+ /* constructor could break poison info */
+ if (DEBUG == 0 && cachep->ctor)
+ cachep->ctor(index_to_obj(cachep, page, i));
+
set_free_obj(page, i, i);
}
}
@@ -2537,30 +2446,28 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
}
}
-static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,
- int nodeid)
+static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
{
void *objp;
objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
page->active++;
+
#if DEBUG
- WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
+ if (cachep->flags & SLAB_STORE_USER)
+ set_store_user_dirty(cachep);
#endif
return objp;
}
-static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
- void *objp, int nodeid)
+static void slab_put_obj(struct kmem_cache *cachep,
+ struct page *page, void *objp)
{
unsigned int objnr = obj_to_index(cachep, page, objp);
#if DEBUG
unsigned int i;
- /* Verify that the slab belongs to the intended node */
- WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
-
/* Verify double free bug */
for (i = page->active; i < cachep->num; i++) {
if (get_free_obj(page, i) == objnr) {
@@ -2571,6 +2478,9 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
}
#endif
page->active--;
+ if (!page->freelist)
+ page->freelist = objp + obj_offset(cachep);
+
set_free_obj(page, page->active, objnr);
}
@@ -2645,7 +2555,7 @@ static int cache_grow(struct kmem_cache *cachep,
/* Get slab management. */
freelist = alloc_slabmgmt(cachep, page, offset,
local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
- if (!freelist)
+ if (OFF_SLAB(cachep) && !freelist)
goto opps1;
slab_map_pages(cachep, page, freelist);
@@ -2726,27 +2636,19 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
*dbg_redzone1(cachep, objp) = RED_INACTIVE;
*dbg_redzone2(cachep, objp) = RED_INACTIVE;
}
- if (cachep->flags & SLAB_STORE_USER)
+ if (cachep->flags & SLAB_STORE_USER) {
+ set_store_user_dirty(cachep);
*dbg_userword(cachep, objp) = (void *)caller;
+ }
objnr = obj_to_index(cachep, page, objp);
BUG_ON(objnr >= cachep->num);
BUG_ON(objp != index_to_obj(cachep, page, objnr));
- set_obj_status(page, objnr, OBJECT_FREE);
if (cachep->flags & SLAB_POISON) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
- store_stackinfo(cachep, objp, caller);
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 0);
- } else {
- poison_obj(cachep, objp, POISON_FREE);
- }
-#else
poison_obj(cachep, objp, POISON_FREE);
-#endif
+ slab_kernel_map(cachep, objp, 0, caller);
}
return objp;
}
@@ -2756,7 +2658,85 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
#define cache_free_debugcheck(x,objp,z) (objp)
#endif
-static struct page *get_first_slab(struct kmem_cache_node *n)
+static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
+ void **list)
+{
+#if DEBUG
+ void *next = *list;
+ void *objp;
+
+ while (next) {
+ objp = next - obj_offset(cachep);
+ next = *(void **)next;
+ poison_obj(cachep, objp, POISON_FREE);
+ }
+#endif
+}
+
+static inline void fixup_slab_list(struct kmem_cache *cachep,
+ struct kmem_cache_node *n, struct page *page,
+ void **list)
+{
+ /* move slabp to correct slabp list: */
+ list_del(&page->lru);
+ if (page->active == cachep->num) {
+ list_add(&page->lru, &n->slabs_full);
+ if (OBJFREELIST_SLAB(cachep)) {
+#if DEBUG
+ /* Poisoning will be done without holding the lock */
+ if (cachep->flags & SLAB_POISON) {
+ void **objp = page->freelist;
+
+ *objp = *list;
+ *list = objp;
+ }
+#endif
+ page->freelist = NULL;
+ }
+ } else
+ list_add(&page->lru, &n->slabs_partial);
+}
+
+/* Try to find non-pfmemalloc slab if needed */
+static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
+ struct page *page, bool pfmemalloc)
+{
+ if (!page)
+ return NULL;
+
+ if (pfmemalloc)
+ return page;
+
+ if (!PageSlabPfmemalloc(page))
+ return page;
+
+ /* No need to keep pfmemalloc slab if we have enough free objects */
+ if (n->free_objects > n->free_limit) {
+ ClearPageSlabPfmemalloc(page);
+ return page;
+ }
+
+ /* Move pfmemalloc slab to the end of list to speed up next search */
+ list_del(&page->lru);
+ if (!page->active)
+ list_add_tail(&page->lru, &n->slabs_free);
+ else
+ list_add_tail(&page->lru, &n->slabs_partial);
+
+ list_for_each_entry(page, &n->slabs_partial, lru) {
+ if (!PageSlabPfmemalloc(page))
+ return page;
+ }
+
+ list_for_each_entry(page, &n->slabs_free, lru) {
+ if (!PageSlabPfmemalloc(page))
+ return page;
+ }
+
+ return NULL;
+}
+
+static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
struct page *page;
@@ -2768,21 +2748,55 @@ static struct page *get_first_slab(struct kmem_cache_node *n)
struct page, lru);
}
+ if (sk_memalloc_socks())
+ return get_valid_first_slab(n, page, pfmemalloc);
+
return page;
}
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
- bool force_refill)
+static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
+ struct kmem_cache_node *n, gfp_t flags)
+{
+ struct page *page;
+ void *obj;
+ void *list = NULL;
+
+ if (!gfp_pfmemalloc_allowed(flags))
+ return NULL;
+
+ /* Racy check if there is free objects */
+ if (!n->free_objects)
+ return NULL;
+
+ spin_lock(&n->list_lock);
+ page = get_first_slab(n, true);
+ if (!page) {
+ spin_unlock(&n->list_lock);
+ return NULL;
+ }
+
+ obj = slab_get_obj(cachep, page);
+ n->free_objects--;
+
+ fixup_slab_list(cachep, n, page, &list);
+
+ spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
+
+ return obj;
+}
+
+static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
int batchcount;
struct kmem_cache_node *n;
struct array_cache *ac;
int node;
+ void *list = NULL;
check_irq_off();
node = numa_mem_id();
- if (unlikely(force_refill))
- goto force_grow;
+
retry:
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
@@ -2808,7 +2822,7 @@ retry:
while (batchcount > 0) {
struct page *page;
/* Get slab alloc is to come from. */
- page = get_first_slab(n);
+ page = get_first_slab(n, false);
if (!page)
goto must_grow;
@@ -2826,26 +2840,29 @@ retry:
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
- node));
+ ac->entry[ac->avail++] = slab_get_obj(cachep, page);
}
- /* move slabp to correct slabp list: */
- list_del(&page->lru);
- if (page->active == cachep->num)
- list_add(&page->lru, &n->slabs_full);
- else
- list_add(&page->lru, &n->slabs_partial);
+ fixup_slab_list(cachep, n, page, &list);
}
must_grow:
n->free_objects -= ac->avail;
alloc_done:
spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
if (unlikely(!ac->avail)) {
int x;
-force_grow:
+
+ /* Check if we can use obj in pfmemalloc slab */
+ if (sk_memalloc_socks()) {
+ void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
+
+ if (obj)
+ return obj;
+ }
+
x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
@@ -2853,7 +2870,7 @@ force_grow:
node = numa_mem_id();
/* no objects in sight? abort */
- if (!x && (ac->avail == 0 || force_refill))
+ if (!x && ac->avail == 0)
return NULL;
if (!ac->avail) /* objects refilled by interrupt? */
@@ -2861,7 +2878,7 @@ force_grow:
}
ac->touched = 1;
- return ac_get_obj(cachep, ac, flags, force_refill);
+ return ac->entry[--ac->avail];
}
static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -2877,20 +2894,11 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
{
- struct page *page;
-
if (!objp)
return objp;
if (cachep->flags & SLAB_POISON) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 1);
- else
- check_poison_obj(cachep, objp);
-#else
check_poison_obj(cachep, objp);
-#endif
+ slab_kernel_map(cachep, objp, 1, 0);
poison_obj(cachep, objp, POISON_INUSE);
}
if (cachep->flags & SLAB_STORE_USER)
@@ -2910,8 +2918,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
*dbg_redzone2(cachep, objp) = RED_ACTIVE;
}
- page = virt_to_head_page(objp);
- set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
objp += obj_offset(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON)
cachep->ctor(objp);
@@ -2926,40 +2932,24 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
#endif
-static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
-{
- if (unlikely(cachep == kmem_cache))
- return false;
-
- return should_failslab(cachep->object_size, flags, cachep->flags);
-}
-
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
- bool force_refill = false;
check_irq_off();
ac = cpu_cache_get(cachep);
if (likely(ac->avail)) {
ac->touched = 1;
- objp = ac_get_obj(cachep, ac, flags, false);
+ objp = ac->entry[--ac->avail];
- /*
- * Allow for the possibility all avail objects are not allowed
- * by the current flags
- */
- if (objp) {
- STATS_INC_ALLOCHIT(cachep);
- goto out;
- }
- force_refill = true;
+ STATS_INC_ALLOCHIT(cachep);
+ goto out;
}
STATS_INC_ALLOCMISS(cachep);
- objp = cache_alloc_refill(cachep, flags, force_refill);
+ objp = cache_alloc_refill(cachep, flags);
/*
* the 'ac' may be updated by cache_alloc_refill(),
* and kmemleak_erase() requires its correct value.
@@ -3097,6 +3087,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
struct page *page;
struct kmem_cache_node *n;
void *obj;
+ void *list = NULL;
int x;
VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
@@ -3106,7 +3097,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
retry:
check_irq_off();
spin_lock(&n->list_lock);
- page = get_first_slab(n);
+ page = get_first_slab(n, false);
if (!page)
goto must_grow;
@@ -3118,17 +3109,13 @@ retry:
BUG_ON(page->active == cachep->num);
- obj = slab_get_obj(cachep, page, nodeid);
+ obj = slab_get_obj(cachep, page);
n->free_objects--;
- /* move slabp to correct slabp list: */
- list_del(&page->lru);
- if (page->active == cachep->num)
- list_add(&page->lru, &n->slabs_full);
- else
- list_add(&page->lru, &n->slabs_partial);
+ fixup_slab_list(cachep, n, page, &list);
spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
goto done;
must_grow:
@@ -3152,14 +3139,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
int slab_node = numa_mem_id();
flags &= gfp_allowed_mask;
-
- lockdep_trace_alloc(flags);
-
- if (slab_should_failslab(cachep, flags))
+ cachep = slab_pre_alloc_hook(cachep, flags);
+ if (unlikely(!cachep))
return NULL;
- cachep = memcg_kmem_get_cache(cachep, flags);
-
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
@@ -3188,16 +3171,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
out:
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
- kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
- flags);
- if (likely(ptr)) {
- kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
- if (unlikely(flags & __GFP_ZERO))
- memset(ptr, 0, cachep->object_size);
- }
+ if (unlikely(flags & __GFP_ZERO) && ptr)
+ memset(ptr, 0, cachep->object_size);
- memcg_kmem_put_cache(cachep);
+ slab_post_alloc_hook(cachep, flags, 1, &ptr);
return ptr;
}
@@ -3240,30 +3218,21 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
void *objp;
flags &= gfp_allowed_mask;
-
- lockdep_trace_alloc(flags);
-
- if (slab_should_failslab(cachep, flags))
+ cachep = slab_pre_alloc_hook(cachep, flags);
+ if (unlikely(!cachep))
return NULL;
- cachep = memcg_kmem_get_cache(cachep, flags);
-
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
- kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
- flags);
prefetchw(objp);
- if (likely(objp)) {
- kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
- if (unlikely(flags & __GFP_ZERO))
- memset(objp, 0, cachep->object_size);
- }
+ if (unlikely(flags & __GFP_ZERO) && objp)
+ memset(objp, 0, cachep->object_size);
- memcg_kmem_put_cache(cachep);
+ slab_post_alloc_hook(cachep, flags, 1, &objp);
return objp;
}
@@ -3281,13 +3250,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
void *objp;
struct page *page;
- clear_obj_pfmemalloc(&objpp[i]);
objp = objpp[i];
page = virt_to_head_page(objp);
list_del(&page->lru);
check_spinlock_acquired_node(cachep, node);
- slab_put_obj(cachep, page, objp, node);
+ slab_put_obj(cachep, page, objp);
STATS_DEC_ACTIVE(cachep);
n->free_objects++;
@@ -3317,9 +3285,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
LIST_HEAD(list);
batchcount = ac->batchcount;
-#if DEBUG
- BUG_ON(!batchcount || batchcount > ac->avail);
-#endif
+
check_irq_off();
n = get_node(cachep, node);
spin_lock(&n->list_lock);
@@ -3389,7 +3355,12 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
cache_flusharray(cachep, ac);
}
- ac_put_obj(cachep, ac, objp);
+ if (sk_memalloc_socks()) {
+ cache_free_pfmemalloc(cachep, objp);
+ return;
+ }
+
+ ac->entry[ac->avail++] = objp;
}
/**
@@ -3411,16 +3382,53 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
}
EXPORT_SYMBOL(kmem_cache_alloc);
-void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+static __always_inline void
+cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p, unsigned long caller)
{
- __kmem_cache_free_bulk(s, size, p);
+ size_t i;
+
+ for (i = 0; i < size; i++)
+ p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller);
}
-EXPORT_SYMBOL(kmem_cache_free_bulk);
int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- void **p)
+ void **p)
{
- return __kmem_cache_alloc_bulk(s, flags, size, p);
+ size_t i;
+
+ s = slab_pre_alloc_hook(s, flags);
+ if (!s)
+ return 0;
+
+ cache_alloc_debugcheck_before(s, flags);
+
+ local_irq_disable();
+ for (i = 0; i < size; i++) {
+ void *objp = __do_cache_alloc(s, flags);
+
+ if (unlikely(!objp))
+ goto error;
+ p[i] = objp;
+ }
+ local_irq_enable();
+
+ cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
+
+ /* Clear memory outside IRQ disabled section */
+ if (unlikely(flags & __GFP_ZERO))
+ for (i = 0; i < size; i++)
+ memset(p[i], 0, s->object_size);
+
+ slab_post_alloc_hook(s, flags, size, p);
+ /* FIXME: Trace call missing. Christoph would like a bulk variant */
+ return size;
+error:
+ local_irq_enable();
+ cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
+ slab_post_alloc_hook(s, flags, i, p);
+ __kmem_cache_free_bulk(s, i, p);
+ return 0;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
@@ -3567,6 +3575,32 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
}
EXPORT_SYMBOL(kmem_cache_free);
+void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
+{
+ struct kmem_cache *s;
+ size_t i;
+
+ local_irq_disable();
+ for (i = 0; i < size; i++) {
+ void *objp = p[i];
+
+ if (!orig_s) /* called via kfree_bulk */
+ s = virt_to_cache(objp);
+ else
+ s = cache_from_obj(orig_s, objp);
+
+ debug_check_no_locks_freed(objp, s->object_size);
+ if (!(s->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(objp, s->object_size);
+
+ __cache_free(s, objp, _RET_IP_);
+ }
+ local_irq_enable();
+
+ /* FIXME: add tracing */
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
/**
* kfree - free previously allocated memory
* @objp: pointer returned by kmalloc.
@@ -4102,15 +4136,34 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c,
struct page *page)
{
void *p;
- int i;
+ int i, j;
+ unsigned long v;
if (n[0] == n[1])
return;
for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
- if (get_obj_status(page, i) != OBJECT_ACTIVE)
+ bool active = true;
+
+ for (j = page->active; j < c->num; j++) {
+ if (get_free_obj(page, j) == i) {
+ active = false;
+ break;
+ }
+ }
+
+ if (!active)
+ continue;
+
+ /*
+ * probe_kernel_read() is used for DEBUG_PAGEALLOC. page table
+ * mapping is established when actual object allocation and
+ * we could mistakenly access the unmapped object in the cpu
+ * cache.
+ */
+ if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v)))
continue;
- if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
+ if (!add_caller(n, v))
return;
}
}
@@ -4146,21 +4199,31 @@ static int leaks_show(struct seq_file *m, void *p)
if (!(cachep->flags & SLAB_RED_ZONE))
return 0;
- /* OK, we can do it */
+ /*
+ * Set store_user_clean and start to grab stored user information
+ * for all objects on this cache. If some alloc/free requests comes
+ * during the processing, information would be wrong so restart
+ * whole processing.
+ */
+ do {
+ set_store_user_clean(cachep);
+ drain_cpu_caches(cachep);
- x[1] = 0;
+ x[1] = 0;
- for_each_kmem_cache_node(cachep, node, n) {
+ for_each_kmem_cache_node(cachep, node, n) {
- check_irq_on();
- spin_lock_irq(&n->list_lock);
+ check_irq_on();
+ spin_lock_irq(&n->list_lock);
+
+ list_for_each_entry(page, &n->slabs_full, lru)
+ handle_slab(x, cachep, page);
+ list_for_each_entry(page, &n->slabs_partial, lru)
+ handle_slab(x, cachep, page);
+ spin_unlock_irq(&n->list_lock);
+ }
+ } while (!is_store_user_clean(cachep));
- list_for_each_entry(page, &n->slabs_full, lru)
- handle_slab(x, cachep, page);
- list_for_each_entry(page, &n->slabs_partial, lru)
- handle_slab(x, cachep, page);
- spin_unlock_irq(&n->list_lock);
- }
name = cachep->name;
if (x[0] == x[1]) {
/* Increase the buffer size */
diff --git a/mm/slab.h b/mm/slab.h
index 834ad240c0bb..95208bcdbea0 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -38,6 +38,10 @@ struct kmem_cache {
#endif
#include <linux/memcontrol.h>
+#include <linux/fault-inject.h>
+#include <linux/kmemcheck.h>
+#include <linux/kasan.h>
+#include <linux/kmemleak.h>
/*
* State of the slab allocator.
@@ -140,6 +144,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
int __kmem_cache_shutdown(struct kmem_cache *);
+void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *, bool);
void slab_kmem_cache_release(struct kmem_cache *);
@@ -167,7 +172,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
/*
* Generic implementation of bulk operations
* These are useful for situations in which the allocator cannot
- * perform optimizations. In that case segments of the objecct listed
+ * perform optimizations. In that case segments of the object listed
* may be allocated or freed using these operations.
*/
void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
@@ -241,12 +246,33 @@ static __always_inline int memcg_charge_slab(struct page *page,
gfp_t gfp, int order,
struct kmem_cache *s)
{
+ int ret;
+
if (!memcg_kmem_enabled())
return 0;
if (is_root_cache(s))
return 0;
- return __memcg_kmem_charge_memcg(page, gfp, order,
- s->memcg_params.memcg);
+
+ ret = __memcg_kmem_charge_memcg(page, gfp, order,
+ s->memcg_params.memcg);
+ if (ret)
+ return ret;
+
+ memcg_kmem_update_page_stat(page,
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE,
+ 1 << order);
+ return 0;
+}
+
+static __always_inline void memcg_uncharge_slab(struct page *page, int order,
+ struct kmem_cache *s)
+{
+ memcg_kmem_update_page_stat(page,
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ memcg_kmem_uncharge(page, order);
}
extern void slab_init_memcg_params(struct kmem_cache *);
@@ -289,6 +315,11 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
return 0;
}
+static inline void memcg_uncharge_slab(struct page *page, int order,
+ struct kmem_cache *s)
+{
+}
+
static inline void slab_init_memcg_params(struct kmem_cache *s)
{
}
@@ -320,6 +351,64 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
return s;
}
+static inline size_t slab_ksize(const struct kmem_cache *s)
+{
+#ifndef CONFIG_SLUB
+ return s->object_size;
+
+#else /* CONFIG_SLUB */
+# ifdef CONFIG_SLUB_DEBUG
+ /*
+ * Debugging requires use of the padding between object
+ * and whatever may come after it.
+ */
+ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
+ return s->object_size;
+# endif
+ /*
+ * If we have the need to store the freelist pointer
+ * back there or track user information then we can
+ * only use the space before that information.
+ */
+ if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
+ return s->inuse;
+ /*
+ * Else we can use all the padding etc for the allocation
+ */
+ return s->size;
+#endif
+}
+
+static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+ gfp_t flags)
+{
+ flags &= gfp_allowed_mask;
+ lockdep_trace_alloc(flags);
+ might_sleep_if(gfpflags_allow_blocking(flags));
+
+ if (should_failslab(s, flags))
+ return NULL;
+
+ return memcg_kmem_get_cache(s, flags);
+}
+
+static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p)
+{
+ size_t i;
+
+ flags &= gfp_allowed_mask;
+ for (i = 0; i < size; i++) {
+ void *object = p[i];
+
+ kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
+ kmemleak_alloc_recursive(object, s->object_size, 1,
+ s->flags, flags);
+ kasan_slab_alloc(s, object);
+ }
+ memcg_kmem_put_cache(s);
+}
+
#ifndef CONFIG_SLOB
/*
* The slab lists for all objects.
diff --git a/mm/slab_common.c b/mm/slab_common.c
index b50aef01ccf7..8addc3c4df37 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -109,8 +109,12 @@ void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
{
size_t i;
- for (i = 0; i < nr; i++)
- kmem_cache_free(s, p[i]);
+ for (i = 0; i < nr; i++) {
+ if (s)
+ kmem_cache_free(s, p[i]);
+ else
+ kfree(p[i]);
+ }
}
int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
@@ -506,7 +510,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
* The memory cgroup could have been offlined while the cache
* creation work was pending.
*/
- if (!memcg_kmem_online(memcg))
+ if (memcg->kmem_state != KMEM_ONLINE)
goto out_unlock;
idx = memcg_cache_id(memcg);
@@ -693,6 +697,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s,
void slab_kmem_cache_release(struct kmem_cache *s)
{
+ __kmem_cache_release(s);
destroy_memcg_params(s);
kfree_const(s->name);
kmem_cache_free(kmem_cache, s);
diff --git a/mm/slob.c b/mm/slob.c
index 17e8f8cc7c53..5ec158054ffe 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -630,6 +630,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
return 0;
}
+void __kmem_cache_release(struct kmem_cache *c)
+{
+}
+
int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
{
return 0;
diff --git a/mm/slub.c b/mm/slub.c
index 2e1355ac056b..0f7e9e94e57c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -39,6 +39,10 @@
#include "internal.h"
+#ifdef CONFIG_KASAN
+#include "kasan/kasan.h"
+#endif
+
/*
* Lock order:
* 1. slab_mutex (Global Mutex)
@@ -124,6 +128,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
#endif
}
+static inline void *fixup_red_left(struct kmem_cache *s, void *p)
+{
+ if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
+ p += s->red_left_pad;
+
+ return p;
+}
+
static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
@@ -224,24 +236,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
* Core slab cache functions
*******************************************************************/
-/* Verify that a pointer has an address that is valid within a slab page */
-static inline int check_valid_pointer(struct kmem_cache *s,
- struct page *page, const void *object)
-{
- void *base;
-
- if (!object)
- return 1;
-
- base = page_address(page);
- if (object < base || object >= base + page->objects * s->size ||
- (object - base) % s->size) {
- return 0;
- }
-
- return 1;
-}
-
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
return *(void **)(object + s->offset);
@@ -256,11 +250,9 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
void *p;
-#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (!debug_pagealloc_enabled())
+ return get_freepointer(s, object);
probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
-#else
- p = get_freepointer(s, object);
-#endif
return p;
}
@@ -284,30 +276,6 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
return (p - addr) / s->size;
}
-static inline size_t slab_ksize(const struct kmem_cache *s)
-{
-#ifdef CONFIG_SLUB_DEBUG
- /*
- * Debugging requires use of the padding between object
- * and whatever may come after it.
- */
- if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
- return s->object_size;
-
-#endif
- /*
- * If we have the need to store the freelist pointer
- * back there or track user information then we can
- * only use the space before that information.
- */
- if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
- return s->inuse;
- /*
- * Else we can use all the padding etc for the allocation
- */
- return s->size;
-}
-
static inline int order_objects(int order, unsigned long size, int reserved)
{
return ((PAGE_SIZE << order) - reserved) / size;
@@ -458,6 +426,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
set_bit(slab_index(p, s, addr), map);
}
+static inline int size_from_object(struct kmem_cache *s)
+{
+ if (s->flags & SLAB_RED_ZONE)
+ return s->size - s->red_left_pad;
+
+ return s->size;
+}
+
+static inline void *restore_red_left(struct kmem_cache *s, void *p)
+{
+ if (s->flags & SLAB_RED_ZONE)
+ p -= s->red_left_pad;
+
+ return p;
+}
+
/*
* Debug settings:
*/
@@ -491,6 +475,26 @@ static inline void metadata_access_disable(void)
/*
* Object debugging
*/
+
+/* Verify that a pointer has an address that is valid within a slab page */
+static inline int check_valid_pointer(struct kmem_cache *s,
+ struct page *page, void *object)
+{
+ void *base;
+
+ if (!object)
+ return 1;
+
+ base = page_address(page);
+ object = restore_red_left(s, object);
+ if (object < base || object >= base + page->objects * s->size ||
+ (object - base) % s->size) {
+ return 0;
+ }
+
+ return 1;
+}
+
static void print_section(char *text, u8 *addr, unsigned int length)
{
metadata_access_enable();
@@ -630,7 +634,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
p, p - addr, get_freepointer(s, p));
- if (p > addr + 16)
+ if (s->flags & SLAB_RED_ZONE)
+ print_section("Redzone ", p - s->red_left_pad, s->red_left_pad);
+ else if (p > addr + 16)
print_section("Bytes b4 ", p - 16, 16);
print_section("Object ", p, min_t(unsigned long, s->object_size,
@@ -647,9 +653,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
- if (off != s->size)
+ if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
- print_section("Padding ", p + off, s->size - off);
+ print_section("Padding ", p + off, size_from_object(s) - off);
dump_stack();
}
@@ -679,6 +685,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
{
u8 *p = object;
+ if (s->flags & SLAB_RED_ZONE)
+ memset(p - s->red_left_pad, val, s->red_left_pad);
+
if (s->flags & __OBJECT_POISON) {
memset(p, POISON_FREE, s->object_size - 1);
p[s->object_size - 1] = POISON_END;
@@ -771,11 +780,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
/* We also have user information there */
off += 2 * sizeof(struct track);
- if (s->size == off)
+ if (size_from_object(s) == off)
return 1;
return check_bytes_and_report(s, page, p, "Object padding",
- p + off, POISON_INUSE, s->size - off);
+ p + off, POISON_INUSE, size_from_object(s) - off);
}
/* Check the pad bytes at the end of a slab page */
@@ -820,6 +829,10 @@ static int check_object(struct kmem_cache *s, struct page *page,
if (s->flags & SLAB_RED_ZONE) {
if (!check_bytes_and_report(s, page, object, "Redzone",
+ object - s->red_left_pad, val, s->red_left_pad))
+ return 0;
+
+ if (!check_bytes_and_report(s, page, object, "Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
} else {
@@ -1021,14 +1034,17 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
}
/* Object debug checks for alloc/free paths */
-static void setup_object_debug(struct kmem_cache *s, struct page *page,
+static void *setup_object_debug(struct kmem_cache *s, struct page *page,
void *object)
{
if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
- return;
+ return object;
+ object = fixup_red_left(s, object);
init_object(s, object, SLUB_RED_INACTIVE);
init_tracking(s, object);
+
+ return object;
}
static noinline int alloc_debug_processing(struct kmem_cache *s,
@@ -1225,8 +1241,8 @@ unsigned long kmem_cache_flags(unsigned long object_size,
return flags;
}
#else /* !CONFIG_SLUB_DEBUG */
-static inline void setup_object_debug(struct kmem_cache *s,
- struct page *page, void *object) {}
+static inline void *setup_object_debug(struct kmem_cache *s,
+ struct page *page, void *object) { return object; }
static inline int alloc_debug_processing(struct kmem_cache *s,
struct page *page, void *object, unsigned long addr) { return 0; }
@@ -1281,36 +1297,6 @@ static inline void kfree_hook(const void *x)
kasan_kfree_large(x);
}
-static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
- gfp_t flags)
-{
- flags &= gfp_allowed_mask;
- lockdep_trace_alloc(flags);
- might_sleep_if(gfpflags_allow_blocking(flags));
-
- if (should_failslab(s->object_size, flags, s->flags))
- return NULL;
-
- return memcg_kmem_get_cache(s, flags);
-}
-
-static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
- size_t size, void **p)
-{
- size_t i;
-
- flags &= gfp_allowed_mask;
- for (i = 0; i < size; i++) {
- void *object = p[i];
-
- kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
- kmemleak_alloc_recursive(object, s->object_size, 1,
- s->flags, flags);
- kasan_slab_alloc(s, object);
- }
- memcg_kmem_put_cache(s);
-}
-
static inline void slab_free_hook(struct kmem_cache *s, void *x)
{
kmemleak_free_recursive(x, s->flags);
@@ -1359,15 +1345,17 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s,
#endif
}
-static void setup_object(struct kmem_cache *s, struct page *page,
+static void *setup_object(struct kmem_cache *s, struct page *page,
void *object)
{
- setup_object_debug(s, page, object);
+ object = setup_object_debug(s, page, object);
if (unlikely(s->ctor)) {
kasan_unpoison_object_data(s, object);
s->ctor(object);
kasan_poison_object_data(s, object);
}
+
+ return object;
}
/*
@@ -1463,14 +1451,16 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
kasan_poison_slab(page);
for_each_object_idx(p, idx, s, start, page->objects) {
- setup_object(s, page, p);
- if (likely(idx < page->objects))
- set_freepointer(s, p, p + s->size);
- else
- set_freepointer(s, p, NULL);
+ void *object = setup_object(s, page, p);
+
+ if (likely(idx < page->objects)) {
+ set_freepointer(s, object,
+ fixup_red_left(s, p + s->size));
+ } else
+ set_freepointer(s, object, NULL);
}
- page->freelist = start;
+ page->freelist = fixup_red_left(s, start);
page->inuse = page->objects;
page->frozen = 1;
@@ -1511,8 +1501,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
slab_pad_check(s, page);
for_each_object(p, s, page_address(page),
- page->objects)
- check_object(s, page, p, SLUB_RED_INACTIVE);
+ page->objects) {
+ void *object = fixup_red_left(s, p);
+
+ check_object(s, page, object, SLUB_RED_INACTIVE);
+ }
}
kmemcheck_free_shadow(page, compound_order(page));
@@ -1528,7 +1521,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
page_mapcount_reset(page);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- __free_kmem_pages(page, order);
+ memcg_uncharge_slab(page, order, s);
+ __free_pages(page, order);
}
#define need_reserve_slab_rcu \
@@ -1592,18 +1586,12 @@ static inline void add_partial(struct kmem_cache_node *n,
__add_partial(n, page, tail);
}
-static inline void
-__remove_partial(struct kmem_cache_node *n, struct page *page)
-{
- list_del(&page->lru);
- n->nr_partial--;
-}
-
static inline void remove_partial(struct kmem_cache_node *n,
struct page *page)
{
lockdep_assert_held(&n->list_lock);
- __remove_partial(n, page);
+ list_del(&page->lru);
+ n->nr_partial--;
}
/*
@@ -2821,6 +2809,7 @@ struct detached_freelist {
void *tail;
void *freelist;
int cnt;
+ struct kmem_cache *s;
};
/*
@@ -2835,26 +2824,45 @@ struct detached_freelist {
* synchronization primitive. Look ahead in the array is limited due
* to performance reasons.
*/
-static int build_detached_freelist(struct kmem_cache *s, size_t size,
- void **p, struct detached_freelist *df)
+static inline
+int build_detached_freelist(struct kmem_cache *s, size_t size,
+ void **p, struct detached_freelist *df)
{
size_t first_skipped_index = 0;
int lookahead = 3;
void *object;
+ struct page *page;
/* Always re-init detached_freelist */
df->page = NULL;
do {
object = p[--size];
+ /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
} while (!object && size);
if (!object)
return 0;
+ page = virt_to_head_page(object);
+ if (!s) {
+ /* Handle kalloc'ed objects */
+ if (unlikely(!PageSlab(page))) {
+ BUG_ON(!PageCompound(page));
+ kfree_hook(object);
+ __free_kmem_pages(page, compound_order(page));
+ p[size] = NULL; /* mark object processed */
+ return size;
+ }
+ /* Derive kmem_cache from object */
+ df->s = page->slab_cache;
+ } else {
+ df->s = cache_from_obj(s, object); /* Support for memcg */
+ }
+
/* Start new detached freelist */
- set_freepointer(s, object, NULL);
- df->page = virt_to_head_page(object);
+ df->page = page;
+ set_freepointer(df->s, object, NULL);
df->tail = object;
df->freelist = object;
p[size] = NULL; /* mark object processed */
@@ -2868,7 +2876,7 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size,
/* df->page is always set at this point */
if (df->page == virt_to_head_page(object)) {
/* Opportunity build freelist */
- set_freepointer(s, object, df->freelist);
+ set_freepointer(df->s, object, df->freelist);
df->freelist = object;
df->cnt++;
p[size] = NULL; /* mark object processed */
@@ -2887,25 +2895,20 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size,
return first_skipped_index;
}
-
/* Note that interrupts must be enabled when calling this function. */
-void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
{
if (WARN_ON(!size))
return;
do {
struct detached_freelist df;
- struct kmem_cache *s;
-
- /* Support for memcg */
- s = cache_from_obj(orig_s, p[size - 1]);
size = build_detached_freelist(s, size, p, &df);
if (unlikely(!df.page))
continue;
- slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_);
+ slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
} while (likely(size));
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
@@ -3184,6 +3187,12 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
}
}
+void __kmem_cache_release(struct kmem_cache *s)
+{
+ free_percpu(s->cpu_slab);
+ free_kmem_cache_nodes(s);
+}
+
static int init_kmem_cache_nodes(struct kmem_cache *s)
{
int node;
@@ -3294,6 +3303,16 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* of the object.
*/
size += sizeof(void *);
+
+ if (flags & SLAB_RED_ZONE) {
+ s->red_left_pad = sizeof(void *);
+#ifdef CONFIG_KASAN
+ s->red_left_pad = min_t(int, s->red_left_pad,
+ KASAN_SHADOW_SCALE_SIZE);
+#endif
+ s->red_left_pad = ALIGN(s->red_left_pad, s->align);
+ size += s->red_left_pad;
+ }
#endif
/*
@@ -3430,10 +3449,12 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
get_map(s, page, map);
for_each_object(p, s, addr, page->objects) {
+ void *object = fixup_red_left(s, p);
if (!test_bit(slab_index(p, s, addr), map)) {
- pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
- print_tracking(s, p);
+ pr_err("INFO: Object 0x%p @offset=%tu\n",
+ object, object - addr);
+ print_tracking(s, object);
}
}
slab_unlock(page);
@@ -3443,28 +3464,31 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
/*
* Attempt to free all partial slabs on a node.
- * This is called from kmem_cache_close(). We must be the last thread
- * using the cache and therefore we do not need to lock anymore.
+ * This is called from __kmem_cache_shutdown(). We must take list_lock
+ * because sysfs file might still access partial list after the shutdowning.
*/
static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
struct page *page, *h;
+ BUG_ON(irqs_disabled());
+ spin_lock_irq(&n->list_lock);
list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
- __remove_partial(n, page);
+ remove_partial(n, page);
discard_slab(s, page);
} else {
list_slab_objects(s, page,
- "Objects remaining in %s on kmem_cache_close()");
+ "Objects remaining in %s on __kmem_cache_shutdown()");
}
}
+ spin_unlock_irq(&n->list_lock);
}
/*
* Release all resources used by a slab cache.
*/
-static inline int kmem_cache_close(struct kmem_cache *s)
+int __kmem_cache_shutdown(struct kmem_cache *s)
{
int node;
struct kmem_cache_node *n;
@@ -3476,16 +3500,9 @@ static inline int kmem_cache_close(struct kmem_cache *s)
if (n->nr_partial || slabs_node(s, node))
return 1;
}
- free_percpu(s->cpu_slab);
- free_kmem_cache_nodes(s);
return 0;
}
-int __kmem_cache_shutdown(struct kmem_cache *s)
-{
- return kmem_cache_close(s);
-}
-
/********************************************************************
* Kmalloc subsystem
*******************************************************************/
@@ -3980,7 +3997,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
memcg_propagate_slab_attrs(s);
err = sysfs_slab_add(s);
if (err)
- kmem_cache_close(s);
+ __kmem_cache_release(s);
return err;
}
@@ -4102,15 +4119,21 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
get_map(s, page, map);
for_each_object(p, s, addr, page->objects) {
+ void *object = fixup_red_left(s, p);
+
if (test_bit(slab_index(p, s, addr), map))
- if (!check_object(s, page, p, SLUB_RED_INACTIVE))
+ if (!check_object(s, page, object, SLUB_RED_INACTIVE))
return 0;
}
- for_each_object(p, s, addr, page->objects)
+ for_each_object(p, s, addr, page->objects) {
+ void *object = fixup_red_left(s, p);
+
if (!test_bit(slab_index(p, s, addr), map))
- if (!check_object(s, page, p, SLUB_RED_ACTIVE))
+ if (!check_object(s, page, object, SLUB_RED_ACTIVE))
return 0;
+ }
+
return 1;
}
@@ -4308,9 +4331,12 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
bitmap_zero(map, page->objects);
get_map(s, page, map);
- for_each_object(p, s, addr, page->objects)
+ for_each_object(p, s, addr, page->objects) {
+ void *object = fixup_red_left(s, p);
+
if (!test_bit(slab_index(p, s, addr), map))
- add_location(t, s, get_track(s, p, alloc));
+ add_location(t, s, get_track(s, object, alloc));
+ }
}
static int list_locations(struct kmem_cache *s, char *buf,
diff --git a/mm/truncate.c b/mm/truncate.c
index e3ee0e27cd17..7598b552ae03 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -519,7 +519,6 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
- struct mem_cgroup *memcg;
unsigned long flags;
if (page->mapping != mapping)
@@ -528,15 +527,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
- memcg = mem_cgroup_begin_page_stat(page);
spin_lock_irqsave(&mapping->tree_lock, flags);
if (PageDirty(page))
goto failed;
BUG_ON(page_has_private(page));
- __delete_from_page_cache(page, NULL, memcg);
+ __delete_from_page_cache(page, NULL);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
@@ -545,7 +542,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
return 1;
failed:
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
return 0;
}
diff --git a/mm/util.c b/mm/util.c
index 4fb14ca5a419..4f841e51182e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -3,6 +3,7 @@
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
+#include <linux/ctype.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/security.h>
@@ -100,6 +101,35 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp)
EXPORT_SYMBOL(kstrndup);
/**
+ * kstrimdup - Trim and copy a %NUL terminated string.
+ * @s: the string to trim and duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Returns an address, which the caller must kfree, containing
+ * a duplicate of the passed string with leading and/or trailing
+ * whitespace (as defined by isspace) removed.
+ */
+char *kstrimdup(const char *s, gfp_t gfp)
+{
+ char *buf;
+ char *begin = skip_spaces(s);
+ size_t len = strlen(begin);
+
+ while (len && isspace(begin[len - 1]))
+ len--;
+
+ buf = kmalloc_track_caller(len + 1, gfp);
+ if (!buf)
+ return NULL;
+
+ memcpy(buf, begin, len);
+ buf[len] = '\0';
+
+ return buf;
+}
+EXPORT_SYMBOL(kstrimdup);
+
+/**
* kmemdup - duplicate region of memory
*
* @src: memory region to duplicate
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fb42a5bffe47..d4b2e34adae0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -531,22 +531,21 @@ static void unmap_vmap_area(struct vmap_area *va)
static void vmap_debug_free_range(unsigned long start, unsigned long end)
{
/*
- * Unmap page tables and force a TLB flush immediately if
- * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free
- * bugs similarly to those in linear kernel virtual address
- * space after a page has been freed.
+ * Unmap page tables and force a TLB flush immediately if pagealloc
+ * debugging is enabled. This catches use after free bugs similarly to
+ * those in linear kernel virtual address space after a page has been
+ * freed.
*
- * All the lazy freeing logic is still retained, in order to
- * minimise intrusiveness of this debugging feature.
+ * All the lazy freeing logic is still retained, in order to minimise
+ * intrusiveness of this debugging feature.
*
- * This is going to be *slow* (linear kernel virtual address
- * debugging doesn't do a broadcast TLB flush so it is a lot
- * faster).
+ * This is going to be *slow* (linear kernel virtual address debugging
+ * doesn't do a broadcast TLB flush so it is a lot faster).
*/
-#ifdef CONFIG_DEBUG_PAGEALLOC
- vunmap_page_range(start, end);
- flush_tlb_kernel_range(start, end);
-#endif
+ if (debug_pagealloc_enabled()) {
+ vunmap_page_range(start, end);
+ flush_tlb_kernel_range(start, end);
+ }
}
/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 71b1c29948db..86eb21491867 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -191,29 +191,29 @@ static bool sane_reclaim(struct scan_control *sc)
}
#endif
-static unsigned long zone_reclaimable_pages(struct zone *zone)
+unsigned long zone_reclaimable_pages(struct zone *zone)
{
unsigned long nr;
- nr = zone_page_state(zone, NR_ACTIVE_FILE) +
- zone_page_state(zone, NR_INACTIVE_FILE) +
- zone_page_state(zone, NR_ISOLATED_FILE);
+ nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) +
+ zone_page_state_snapshot(zone, NR_INACTIVE_FILE) +
+ zone_page_state_snapshot(zone, NR_ISOLATED_FILE);
if (get_nr_swap_pages() > 0)
- nr += zone_page_state(zone, NR_ACTIVE_ANON) +
- zone_page_state(zone, NR_INACTIVE_ANON) +
- zone_page_state(zone, NR_ISOLATED_ANON);
+ nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) +
+ zone_page_state_snapshot(zone, NR_INACTIVE_ANON) +
+ zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
return nr;
}
bool zone_reclaimable(struct zone *zone)
{
- return zone_page_state(zone, NR_PAGES_SCANNED) <
+ return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) <
zone_reclaimable_pages(zone) * 6;
}
-static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
if (!mem_cgroup_disabled())
return mem_cgroup_get_lru_size(lruvec, lru);
@@ -228,14 +228,6 @@ int register_shrinker(struct shrinker *shrinker)
{
size_t size = sizeof(*shrinker->nr_deferred);
- /*
- * If we only have one possible node in the system anyway, save
- * ourselves the trouble and disable NUMA aware behavior. This way we
- * will save memory and some small loop time later.
- */
- if (nr_node_ids == 1)
- shrinker->flags &= ~SHRINKER_NUMA_AWARE;
-
if (shrinker->flags & SHRINKER_NUMA_AWARE)
size *= nr_node_ids;
@@ -390,9 +382,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
*
* @memcg specifies the memory cgroup to target. If it is not NULL,
* only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
- * objects from the memory cgroup specified. Otherwise all shrinkers
- * are called, and memcg aware shrinkers are supposed to scan the
- * global list then.
+ * objects from the memory cgroup specified. Otherwise, only unaware
+ * shrinkers are called.
*
* @nr_scanned and @nr_eligible form a ratio that indicate how much of
* the available objects should be scanned. Page reclaim for example
@@ -412,7 +403,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct shrinker *shrinker;
unsigned long freed = 0;
- if (memcg && !memcg_kmem_online(memcg))
+ if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
return 0;
if (nr_scanned == 0)
@@ -436,7 +427,13 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
.memcg = memcg,
};
- if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
+ /*
+ * If kernel memory accounting is disabled, we ignore
+ * SHRINKER_MEMCG_AWARE flag and call all shrinkers
+ * passing NULL for memcg.
+ */
+ if (memcg_kmem_enabled() &&
+ !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE))
continue;
if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
@@ -611,12 +608,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
bool reclaimed)
{
unsigned long flags;
- struct mem_cgroup *memcg;
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- memcg = mem_cgroup_begin_page_stat(page);
spin_lock_irqsave(&mapping->tree_lock, flags);
/*
* The non racy check for a busy page.
@@ -656,7 +651,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
swapcache_free(swap);
} else {
void (*freepage)(struct page *);
@@ -682,9 +676,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(mapping, page);
- __delete_from_page_cache(page, shadow, memcg);
+ __delete_from_page_cache(page, shadow);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
if (freepage != NULL)
freepage(page);
@@ -694,7 +687,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
cannot_free:
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
return 0;
}
@@ -1931,8 +1923,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec)
unsigned long inactive;
unsigned long active;
- inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
- active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
+ inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+ active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
return active > inactive;
}
@@ -2071,7 +2063,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* system is under heavy pressure.
*/
if (!inactive_file_is_low(lruvec) &&
- get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2097,10 +2089,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* anon in [0], file in [1]
*/
- anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
- get_lru_size(lruvec, LRU_INACTIVE_ANON);
- file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
- get_lru_size(lruvec, LRU_INACTIVE_FILE);
+ anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
+ file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2138,7 +2130,7 @@ out:
unsigned long size;
unsigned long scan;
- size = get_lru_size(lruvec, lru);
+ size = lruvec_lru_size(lruvec, lru);
scan = size >> sc->priority;
if (!scan && pass && force_scan)
@@ -2538,10 +2530,8 @@ static inline bool compaction_ready(struct zone *zone, int order)
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
- *
- * Returns true if a zone was reclaimable.
*/
-static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
+static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
@@ -2549,7 +2539,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
unsigned long nr_soft_scanned;
gfp_t orig_mask;
enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
- bool reclaimable = false;
/*
* If the number of buffer_heads in the machine exceeds the maximum
@@ -2614,17 +2603,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
&nr_soft_scanned);
sc->nr_reclaimed += nr_soft_reclaimed;
sc->nr_scanned += nr_soft_scanned;
- if (nr_soft_reclaimed)
- reclaimable = true;
/* need some check for avoid more shrink_zone() */
}
- if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx))
- reclaimable = true;
-
- if (global_reclaim(sc) &&
- !reclaimable && zone_reclaimable(zone))
- reclaimable = true;
+ shrink_zone(zone, sc, zone_idx(zone));
}
/*
@@ -2632,8 +2614,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
* promoted it to __GFP_HIGHMEM.
*/
sc->gfp_mask = orig_mask;
-
- return reclaimable;
}
/*
@@ -2658,7 +2638,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
int initial_priority = sc->priority;
unsigned long total_scanned = 0;
unsigned long writeback_threshold;
- bool zones_reclaimable;
retry:
delayacct_freepages_start();
@@ -2669,7 +2648,7 @@ retry:
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);
sc->nr_scanned = 0;
- zones_reclaimable = shrink_zones(zonelist, sc);
+ shrink_zones(zonelist, sc);
total_scanned += sc->nr_scanned;
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
@@ -2716,10 +2695,6 @@ retry:
goto retry;
}
- /* Any of the zones still reclaimable? Don't OOM. */
- if (zones_reclaimable)
- return 1;
-
return 0;
}
@@ -2981,18 +2956,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
} while (memcg);
}
-static bool zone_balanced(struct zone *zone, int order,
- unsigned long balance_gap, int classzone_idx)
+static bool zone_balanced(struct zone *zone, int order, bool highorder,
+ unsigned long balance_gap, int classzone_idx)
{
- if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
- balance_gap, classzone_idx))
- return false;
+ unsigned long mark = high_wmark_pages(zone) + balance_gap;
- if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
- order, 0, classzone_idx) == COMPACT_SKIPPED)
- return false;
+ /*
+ * When checking from pgdat_balanced(), kswapd should stop and sleep
+ * when it reaches the high order-0 watermark and let kcompactd take
+ * over. Other callers such as wakeup_kswapd() want to determine the
+ * true high-order watermark.
+ */
+ if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
+ mark += (1UL << order);
+ order = 0;
+ }
- return true;
+ return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
}
/*
@@ -3042,7 +3022,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
continue;
}
- if (zone_balanced(zone, order, 0, i))
+ if (zone_balanced(zone, order, false, 0, i))
balanced_pages += zone->managed_pages;
else if (!order)
return false;
@@ -3096,8 +3076,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
*/
static bool kswapd_shrink_zone(struct zone *zone,
int classzone_idx,
- struct scan_control *sc,
- unsigned long *nr_attempted)
+ struct scan_control *sc)
{
int testorder = sc->order;
unsigned long balance_gap;
@@ -3107,17 +3086,6 @@ static bool kswapd_shrink_zone(struct zone *zone,
sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
/*
- * Kswapd reclaims only single pages with compaction enabled. Trying
- * too hard to reclaim until contiguous free pages have become
- * available can hurt performance by evicting too much useful data
- * from memory. Do not reclaim more than needed for compaction.
- */
- if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
- compaction_suitable(zone, sc->order, 0, classzone_idx)
- != COMPACT_SKIPPED)
- testorder = 0;
-
- /*
* We put equal pressure on every zone, unless one zone has way too
* many pages free already. The "too many pages" is defined as the
* high wmark plus a "gap" where the gap is either the low
@@ -3131,15 +3099,12 @@ static bool kswapd_shrink_zone(struct zone *zone,
* reclaim is necessary
*/
lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
- if (!lowmem_pressure && zone_balanced(zone, testorder,
+ if (!lowmem_pressure && zone_balanced(zone, testorder, false,
balance_gap, classzone_idx))
return true;
shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
- /* Account for the number of pages attempted to reclaim */
- *nr_attempted += sc->nr_to_reclaim;
-
clear_bit(ZONE_WRITEBACK, &zone->flags);
/*
@@ -3149,7 +3114,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* waits.
*/
if (zone_reclaimable(zone) &&
- zone_balanced(zone, testorder, 0, classzone_idx)) {
+ zone_balanced(zone, testorder, false, 0, classzone_idx)) {
clear_bit(ZONE_CONGESTED, &zone->flags);
clear_bit(ZONE_DIRTY, &zone->flags);
}
@@ -3161,7 +3126,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
- * Returns the final order kswapd was reclaiming at
+ * Returns the highest zone idx kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
@@ -3178,8 +3143,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* interoperates with the page allocator fallback scheme to ensure that aging
* of pages is balanced across the zones.
*/
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
- int *classzone_idx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
{
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
@@ -3196,9 +3160,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
count_vm_event(PAGEOUTRUN);
do {
- unsigned long nr_attempted = 0;
bool raise_priority = true;
- bool pgdat_needs_compaction = (order > 0);
sc.nr_reclaimed = 0;
@@ -3233,7 +3195,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
break;
}
- if (!zone_balanced(zone, order, 0, 0)) {
+ if (!zone_balanced(zone, order, true, 0, 0)) {
end_zone = i;
break;
} else {
@@ -3249,24 +3211,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
if (i < 0)
goto out;
- for (i = 0; i <= end_zone; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!populated_zone(zone))
- continue;
-
- /*
- * If any zone is currently balanced then kswapd will
- * not call compaction as it is expected that the
- * necessary pages are already available.
- */
- if (pgdat_needs_compaction &&
- zone_watermark_ok(zone, order,
- low_wmark_pages(zone),
- *classzone_idx, 0))
- pgdat_needs_compaction = false;
- }
-
/*
* If we're getting trouble reclaiming, start doing writepage
* even in laptop mode.
@@ -3310,8 +3254,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
* that that high watermark would be met at 100%
* efficiency.
*/
- if (kswapd_shrink_zone(zone, end_zone,
- &sc, &nr_attempted))
+ if (kswapd_shrink_zone(zone, end_zone, &sc))
raise_priority = false;
}
@@ -3324,49 +3267,29 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
pfmemalloc_watermark_ok(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);
- /*
- * Fragmentation may mean that the system cannot be rebalanced
- * for high-order allocations in all zones. If twice the
- * allocation size has been reclaimed and the zones are still
- * not balanced then recheck the watermarks at order-0 to
- * prevent kswapd reclaiming excessively. Assume that a
- * process requested a high-order can direct reclaim/compact.
- */
- if (order && sc.nr_reclaimed >= 2UL << order)
- order = sc.order = 0;
-
/* Check if kswapd should be suspending */
if (try_to_freeze() || kthread_should_stop())
break;
/*
- * Compact if necessary and kswapd is reclaiming at least the
- * high watermark number of pages as requsted
- */
- if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
- compact_pgdat(pgdat, order);
-
- /*
* Raise priority if scanning rate is too low or there was no
* progress in reclaiming pages
*/
if (raise_priority || !sc.nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1 &&
- !pgdat_balanced(pgdat, order, *classzone_idx));
+ !pgdat_balanced(pgdat, order, classzone_idx));
out:
/*
- * Return the order we were reclaiming at so prepare_kswapd_sleep()
- * makes a decision on the order we were last reclaiming at. However,
- * if another caller entered the allocator slow path while kswapd
- * was awake, order will remain at the higher level
+ * Return the highest zone idx we were reclaiming at so
+ * prepare_kswapd_sleep() makes the same decisions as here.
*/
- *classzone_idx = end_zone;
- return order;
+ return end_zone;
}
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
+ int classzone_idx, int balanced_classzone_idx)
{
long remaining = 0;
DEFINE_WAIT(wait);
@@ -3377,7 +3300,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
/* Try to sleep for a short interval */
- if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, order, remaining,
+ balanced_classzone_idx)) {
remaining = schedule_timeout(HZ/10);
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -3387,7 +3311,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/
- if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, order, remaining,
+ balanced_classzone_idx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
@@ -3408,6 +3333,12 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
*/
reset_isolation_suitable(pgdat);
+ /*
+ * We have freed the memory, now we should compact it to make
+ * allocation of the requested order possible.
+ */
+ wakeup_kcompactd(pgdat, order, classzone_idx);
+
if (!kthread_should_stop())
schedule();
@@ -3437,7 +3368,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
static int kswapd(void *p)
{
unsigned long order, new_order;
- unsigned balanced_order;
int classzone_idx, new_classzone_idx;
int balanced_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
@@ -3470,24 +3400,19 @@ static int kswapd(void *p)
set_freezable();
order = new_order = 0;
- balanced_order = 0;
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
bool ret;
/*
- * If the last balance_pgdat was unsuccessful it's unlikely a
- * new request of a similar or harder type will succeed soon
- * so consider going to sleep on the basis we reclaimed at
+ * While we were reclaiming, there might have been another
+ * wakeup, so check the values.
*/
- if (balanced_classzone_idx >= new_classzone_idx &&
- balanced_order == new_order) {
- new_order = pgdat->kswapd_max_order;
- new_classzone_idx = pgdat->classzone_idx;
- pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = pgdat->nr_zones - 1;
- }
+ new_order = pgdat->kswapd_max_order;
+ new_classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
if (order < new_order || classzone_idx > new_classzone_idx) {
/*
@@ -3497,7 +3422,7 @@ static int kswapd(void *p)
order = new_order;
classzone_idx = new_classzone_idx;
} else {
- kswapd_try_to_sleep(pgdat, balanced_order,
+ kswapd_try_to_sleep(pgdat, order, classzone_idx,
balanced_classzone_idx);
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
@@ -3517,9 +3442,8 @@ static int kswapd(void *p)
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- balanced_classzone_idx = classzone_idx;
- balanced_order = balance_pgdat(pgdat, order,
- &balanced_classzone_idx);
+ balanced_classzone_idx = balance_pgdat(pgdat, order,
+ classzone_idx);
}
}
@@ -3549,7 +3473,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
}
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
- if (zone_balanced(zone, order, 0, 0))
+ if (zone_balanced(zone, order, true, 0, 0))
return;
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 084c6725b373..74f8c918ac4b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -826,6 +826,7 @@ const char * const vmstat_text[] = {
"compact_stall",
"compact_fail",
"compact_success",
+ "compact_kcompatd_wake",
#endif
#ifdef CONFIG_HUGETLB_PAGE
@@ -847,6 +848,7 @@ const char * const vmstat_text[] = {
"thp_collapse_alloc_failed",
"thp_split_page",
"thp_split_page_failed",
+ "thp_deferred_split_page",
"thp_split_pmd",
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
@@ -924,19 +926,6 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
#endif
#ifdef CONFIG_PROC_FS
-static char * const migratetype_names[MIGRATE_TYPES] = {
- "Unmovable",
- "Movable",
- "Reclaimable",
- "HighAtomic",
-#ifdef CONFIG_CMA
- "CMA",
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- "Isolate",
-#endif
-};
-
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
{
@@ -1133,7 +1122,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
#ifdef CONFIG_PAGE_OWNER
int mtype;
- if (!page_owner_inited)
+ if (!static_branch_unlikely(&page_owner_inited))
return;
drain_all_pages(NULL);
diff --git a/mm/workingset.c b/mm/workingset.c
index 61ead9e5549d..8a75f8d2916a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -152,8 +152,25 @@
* refault distance will immediately activate the refaulting page.
*/
-static void *pack_shadow(unsigned long eviction, struct zone *zone)
+#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
+ ZONES_SHIFT + NODES_SHIFT + \
+ MEM_CGROUP_ID_SHIFT)
+#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
+
+/*
+ * Eviction timestamps need to be able to cover the full range of
+ * actionable refaults. However, bits are tight in the radix tree
+ * entry, and after storing the identifier for the lruvec there might
+ * not be enough left to represent every single actionable refault. In
+ * that case, we have to sacrifice granularity for distance, and group
+ * evictions into coarser buckets by shaving off lower timestamp bits.
+ */
+static unsigned int bucket_order __read_mostly;
+
+static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction)
{
+ eviction >>= bucket_order;
+ eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -161,45 +178,23 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone)
return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
}
-static void unpack_shadow(void *shadow,
- struct zone **zone,
- unsigned long *distance)
+static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
+ unsigned long *evictionp)
{
unsigned long entry = (unsigned long)shadow;
- unsigned long eviction;
- unsigned long refault;
- unsigned long mask;
- int zid, nid;
+ int memcgid, nid, zid;
entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
zid = entry & ((1UL << ZONES_SHIFT) - 1);
entry >>= ZONES_SHIFT;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
- eviction = entry;
+ memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
+ entry >>= MEM_CGROUP_ID_SHIFT;
- *zone = NODE_DATA(nid)->node_zones + zid;
-
- refault = atomic_long_read(&(*zone)->inactive_age);
- mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT +
- RADIX_TREE_EXCEPTIONAL_SHIFT);
- /*
- * The unsigned subtraction here gives an accurate distance
- * across inactive_age overflows in most cases.
- *
- * There is a special case: usually, shadow entries have a
- * short lifetime and are either refaulted or reclaimed along
- * with the inode before they get too old. But it is not
- * impossible for the inactive_age to lap a shadow entry in
- * the field, which can then can result in a false small
- * refault distance, leading to a false activation should this
- * old entry actually refault again. However, earlier kernels
- * used to deactivate unconditionally with *every* reclaim
- * invocation for the longest time, so the occasional
- * inappropriate activation leading to pressure on the active
- * list is not a problem.
- */
- *distance = (refault - eviction) & mask;
+ *memcgidp = memcgid;
+ *zonep = NODE_DATA(nid)->node_zones + zid;
+ *evictionp = entry << bucket_order;
}
/**
@@ -212,11 +207,20 @@ static void unpack_shadow(void *shadow,
*/
void *workingset_eviction(struct address_space *mapping, struct page *page)
{
+ struct mem_cgroup *memcg = page_memcg(page);
struct zone *zone = page_zone(page);
+ int memcgid = mem_cgroup_id(memcg);
unsigned long eviction;
+ struct lruvec *lruvec;
- eviction = atomic_long_inc_return(&zone->inactive_age);
- return pack_shadow(eviction, zone);
+ /* Page is fully exclusive and pins page->mem_cgroup */
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ eviction = atomic_long_inc_return(&lruvec->inactive_age);
+ return pack_shadow(memcgid, zone, eviction);
}
/**
@@ -231,12 +235,64 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
bool workingset_refault(void *shadow)
{
unsigned long refault_distance;
+ unsigned long active_file;
+ struct mem_cgroup *memcg;
+ unsigned long eviction;
+ struct lruvec *lruvec;
+ unsigned long refault;
struct zone *zone;
+ int memcgid;
+
+ unpack_shadow(shadow, &memcgid, &zone, &eviction);
+
+ rcu_read_lock();
+ /*
+ * Look up the memcg associated with the stored ID. It might
+ * have been deleted since the page's eviction.
+ *
+ * Note that in rare events the ID could have been recycled
+ * for a new cgroup that refaults a shared page. This is
+ * impossible to tell from the available data. However, this
+ * should be a rare and limited disturbance, and activations
+ * are always speculative anyway. Ultimately, it's the aging
+ * algorithm's job to shake out the minimum access frequency
+ * for the active cache.
+ *
+ * XXX: On !CONFIG_MEMCG, this will always return NULL; it
+ * would be better if the root_mem_cgroup existed in all
+ * configurations instead.
+ */
+ memcg = mem_cgroup_from_id(memcgid);
+ if (!mem_cgroup_disabled() && !memcg) {
+ rcu_read_unlock();
+ return false;
+ }
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ refault = atomic_long_read(&lruvec->inactive_age);
+ active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+ rcu_read_unlock();
+
+ /*
+ * The unsigned subtraction here gives an accurate distance
+ * across inactive_age overflows in most cases.
+ *
+ * There is a special case: usually, shadow entries have a
+ * short lifetime and are either refaulted or reclaimed along
+ * with the inode before they get too old. But it is not
+ * impossible for the inactive_age to lap a shadow entry in
+ * the field, which can then can result in a false small
+ * refault distance, leading to a false activation should this
+ * old entry actually refault again. However, earlier kernels
+ * used to deactivate unconditionally with *every* reclaim
+ * invocation for the longest time, so the occasional
+ * inappropriate activation leading to pressure on the active
+ * list is not a problem.
+ */
+ refault_distance = (refault - eviction) & EVICTION_MASK;
- unpack_shadow(shadow, &zone, &refault_distance);
inc_zone_state(zone, WORKINGSET_REFAULT);
- if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
+ if (refault_distance <= active_file) {
inc_zone_state(zone, WORKINGSET_ACTIVATE);
return true;
}
@@ -249,7 +305,22 @@ bool workingset_refault(void *shadow)
*/
void workingset_activation(struct page *page)
{
- atomic_long_inc(&page_zone(page)->inactive_age);
+ struct lruvec *lruvec;
+
+ lock_page_memcg(page);
+ /*
+ * Filter non-memcg pages here, e.g. unmap can call
+ * mark_page_accessed() on VDSO pages.
+ *
+ * XXX: See workingset_refault() - this should return
+ * root_mem_cgroup even for !CONFIG_MEMCG.
+ */
+ if (!mem_cgroup_disabled() && !page_memcg(page))
+ goto out;
+ lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
+ atomic_long_inc(&lruvec->inactive_age);
+out:
+ unlock_page_memcg(page);
}
/*
@@ -278,7 +349,13 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
local_irq_enable();
- pages = node_present_pages(sc->nid);
+ if (memcg_kmem_enabled())
+ pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
+ LRU_ALL_FILE);
+ else
+ pages = node_page_state(sc->nid, NR_ACTIVE_FILE) +
+ node_page_state(sc->nid, NR_INACTIVE_FILE);
+
/*
* Active cache pages are limited to 50% of memory, and shadow
* entries that represent a refault distance bigger than that
@@ -387,7 +464,7 @@ static struct shrinker workingset_shadow_shrinker = {
.count_objects = count_shadow_nodes,
.scan_objects = scan_shadow_nodes,
.seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_NUMA_AWARE,
+ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};
/*
@@ -398,8 +475,25 @@ static struct lock_class_key shadow_nodes_key;
static int __init workingset_init(void)
{
+ unsigned int timestamp_bits;
+ unsigned int max_order;
int ret;
+ BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
+ /*
+ * Calculate the eviction bucket size to cover the longest
+ * actionable refault distance, which is currently half of
+ * memory (totalram_pages/2). However, memory hotplug may add
+ * some more pages at runtime, so keep working with up to
+ * double the initial memory by using totalram_pages as-is.
+ */
+ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+ max_order = fls_long(totalram_pages - 1);
+ if (max_order > timestamp_bits)
+ bucket_order = max_order - timestamp_bits;
+ printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
+ timestamp_bits, max_order, bucket_order);
+
ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
if (ret)
goto err;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index e44d2ac91dd3..580a414d020e 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1389,7 +1389,7 @@ static int sctp_getsockopt_connectx3(struct sock *sk, int len,
int err = 0;
#ifdef CONFIG_COMPAT
- if (is_compat_task()) {
+ if (in_compat_syscall()) {
struct compat_sctp_getaddrs_old param32;
if (len < sizeof(param32))
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 805681a7d356..2cc7af858c6f 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2449,7 +2449,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
int type, err;
#ifdef CONFIG_COMPAT
- if (is_compat_task())
+ if (in_compat_syscall())
return -ENOTSUPP;
#endif
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 2edbcadb3d7f..31d7a73c35ce 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -136,6 +136,12 @@ _c_flags += $(if $(patsubst n%,, \
$(CFLAGS_UBSAN))
endif
+ifeq ($(CONFIG_KCOV),y)
+_c_flags += $(if $(patsubst n%,, \
+ $(KCOV_INSTRUMENT_$(basetarget).o)$(KCOV_INSTRUMENT)y), \
+ $(CFLAGS_KCOV))
+endif
+
# If building the kernel in a separate objtree expand all occurrences
# of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/').
diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan
index 8ab68679cfb5..77ce538268b5 100644
--- a/scripts/Makefile.ubsan
+++ b/scripts/Makefile.ubsan
@@ -14,4 +14,8 @@ ifdef CONFIG_UBSAN
ifdef CONFIG_UBSAN_ALIGNMENT
CFLAGS_UBSAN += $(call cc-option, -fsanitize=alignment)
endif
+
+ # -fsanitize=* options makes GCC less smart than usual and
+ # increase number of 'maybe-uninitialized false-positives
+ CFLAGS_UBSAN += $(call cc-option, -Wno-maybe-uninitialized)
endif
diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py
new file mode 100644
index 000000000000..6e6709c1830c
--- /dev/null
+++ b/scripts/gdb/linux/proc.py
@@ -0,0 +1,41 @@
+#
+# gdb helper commands and functions for Linux kernel debugging
+#
+# Kernel proc information reader
+#
+# Copyright (c) 2016 Linaro Ltd
+#
+# Authors:
+# Kieran Bingham <kieran.bingham@linaro.org>
+#
+# This work is licensed under the terms of the GNU GPL version 2.
+#
+
+import gdb
+
+
+class LxCmdLine(gdb.Command):
+ """ Report the Linux Commandline used in the current kernel.
+ Equivalent to cat /proc/cmdline on a running target"""
+
+ def __init__(self):
+ super(LxCmdLine, self).__init__("lx-cmdline", gdb.COMMAND_DATA)
+
+ def invoke(self, arg, from_tty):
+ gdb.write(gdb.parse_and_eval("saved_command_line").string() + "\n")
+
+LxCmdLine()
+
+
+class LxVersion(gdb.Command):
+ """ Report the Linux Version of the current kernel.
+ Equivalent to cat /proc/version on a running target"""
+
+ def __init__(self):
+ super(LxVersion, self).__init__("lx-version", gdb.COMMAND_DATA)
+
+ def invoke(self, arg, from_tty):
+ # linux_banner should contain a newline
+ gdb.write(gdb.parse_and_eval("linux_banner").string())
+
+LxVersion()
diff --git a/scripts/gdb/vmlinux-gdb.py b/scripts/gdb/vmlinux-gdb.py
index ce82bf5c3943..d5943eca19cd 100644
--- a/scripts/gdb/vmlinux-gdb.py
+++ b/scripts/gdb/vmlinux-gdb.py
@@ -29,3 +29,4 @@ else:
import linux.tasks
import linux.cpus
import linux.lists
+ import linux.proc
diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index 8fa81e84e295..638b143ee60f 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -22,6 +22,7 @@
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
+#include <limits.h>
#ifndef ARRAY_SIZE
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
@@ -34,6 +35,7 @@ struct sym_entry {
unsigned int len;
unsigned int start_pos;
unsigned char *sym;
+ unsigned int percpu_absolute;
};
struct addr_range {
@@ -42,6 +44,7 @@ struct addr_range {
};
static unsigned long long _text;
+static unsigned long long relative_base;
static struct addr_range text_ranges[] = {
{ "_stext", "_etext" },
{ "_sinittext", "_einittext" },
@@ -61,6 +64,7 @@ static int all_symbols = 0;
static int absolute_percpu = 0;
static char symbol_prefix_char = '\0';
static unsigned long long kernel_start_addr = 0;
+static int base_relative = 0;
int token_profit[0x10000];
@@ -74,7 +78,7 @@ static void usage(void)
fprintf(stderr, "Usage: kallsyms [--all-symbols] "
"[--symbol-prefix=<prefix char>] "
"[--page-offset=<CONFIG_PAGE_OFFSET>] "
- "< in.map > out.S\n");
+ "[--base-relative] < in.map > out.S\n");
exit(1);
}
@@ -171,6 +175,8 @@ static int read_symbol(FILE *in, struct sym_entry *s)
strcpy((char *)s->sym + 1, str);
s->sym[0] = stype;
+ s->percpu_absolute = 0;
+
/* Record if we've found __per_cpu_start/end. */
check_symbol_range(sym, s->addr, &percpu_range, 1);
@@ -202,6 +208,8 @@ static int symbol_valid(struct sym_entry *s)
*/
static char *special_symbols[] = {
"kallsyms_addresses",
+ "kallsyms_offsets",
+ "kallsyms_relative_base",
"kallsyms_num_syms",
"kallsyms_names",
"kallsyms_markers",
@@ -325,7 +333,7 @@ static int expand_symbol(unsigned char *data, int len, char *result)
static int symbol_absolute(struct sym_entry *s)
{
- return toupper(s->sym[0]) == 'A';
+ return s->percpu_absolute;
}
static void write_src(void)
@@ -346,16 +354,48 @@ static void write_src(void)
printf("\t.section .rodata, \"a\"\n");
- /* Provide proper symbols relocatability by their '_text'
- * relativeness. The symbol names cannot be used to construct
- * normal symbol references as the list of symbols contains
- * symbols that are declared static and are private to their
- * .o files. This prevents .tmp_kallsyms.o or any other
- * object from referencing them.
+ /* Provide proper symbols relocatability by their relativeness
+ * to a fixed anchor point in the runtime image, either '_text'
+ * for absolute address tables, in which case the linker will
+ * emit the final addresses at build time. Otherwise, use the
+ * offset relative to the lowest value encountered of all relative
+ * symbols, and emit non-relocatable fixed offsets that will be fixed
+ * up at runtime.
+ *
+ * The symbol names cannot be used to construct normal symbol
+ * references as the list of symbols contains symbols that are
+ * declared static and are private to their .o files. This prevents
+ * .tmp_kallsyms.o or any other object from referencing them.
*/
- output_label("kallsyms_addresses");
+ if (!base_relative)
+ output_label("kallsyms_addresses");
+ else
+ output_label("kallsyms_offsets");
+
for (i = 0; i < table_cnt; i++) {
- if (!symbol_absolute(&table[i])) {
+ if (base_relative) {
+ long long offset;
+ int overflow;
+
+ if (!absolute_percpu) {
+ offset = table[i].addr - relative_base;
+ overflow = (offset < 0 || offset > UINT_MAX);
+ } else if (symbol_absolute(&table[i])) {
+ offset = table[i].addr;
+ overflow = (offset < 0 || offset > INT_MAX);
+ } else {
+ offset = relative_base - table[i].addr - 1;
+ overflow = (offset < INT_MIN || offset >= 0);
+ }
+ if (overflow) {
+ fprintf(stderr, "kallsyms failure: "
+ "%s symbol value %#llx out of range in relative mode\n",
+ symbol_absolute(&table[i]) ? "absolute" : "relative",
+ table[i].addr);
+ exit(EXIT_FAILURE);
+ }
+ printf("\t.long\t%#x\n", (int)offset);
+ } else if (!symbol_absolute(&table[i])) {
if (_text <= table[i].addr)
printf("\tPTR\t_text + %#llx\n",
table[i].addr - _text);
@@ -368,6 +408,12 @@ static void write_src(void)
}
printf("\n");
+ if (base_relative) {
+ output_label("kallsyms_relative_base");
+ printf("\tPTR\t_text - %#llx\n", _text - relative_base);
+ printf("\n");
+ }
+
output_label("kallsyms_num_syms");
printf("\tPTR\t%d\n", table_cnt);
printf("\n");
@@ -681,8 +727,27 @@ static void make_percpus_absolute(void)
unsigned int i;
for (i = 0; i < table_cnt; i++)
- if (symbol_in_range(&table[i], &percpu_range, 1))
+ if (symbol_in_range(&table[i], &percpu_range, 1)) {
+ /*
+ * Keep the 'A' override for percpu symbols to
+ * ensure consistent behavior compared to older
+ * versions of this tool.
+ */
table[i].sym[0] = 'A';
+ table[i].percpu_absolute = 1;
+ }
+}
+
+/* find the minimum non-absolute symbol address */
+static void record_relative_base(void)
+{
+ unsigned int i;
+
+ relative_base = -1ULL;
+ for (i = 0; i < table_cnt; i++)
+ if (!symbol_absolute(&table[i]) &&
+ table[i].addr < relative_base)
+ relative_base = table[i].addr;
}
int main(int argc, char **argv)
@@ -703,7 +768,9 @@ int main(int argc, char **argv)
} else if (strncmp(argv[i], "--page-offset=", 14) == 0) {
const char *p = &argv[i][14];
kernel_start_addr = strtoull(p, NULL, 16);
- } else
+ } else if (strcmp(argv[i], "--base-relative") == 0)
+ base_relative = 1;
+ else
usage();
}
} else if (argc != 1)
@@ -712,6 +779,8 @@ int main(int argc, char **argv)
read_map(stdin);
if (absolute_percpu)
make_percpus_absolute();
+ if (base_relative)
+ record_relative_base();
sort_symbols();
optimize_token_table();
write_src();
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index 8f22654c71b4..49d61ade9425 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -86,10 +86,14 @@ kallsyms()
kallsymopt="${kallsymopt} --page-offset=$CONFIG_PAGE_OFFSET"
fi
- if [ -n "${CONFIG_X86_64}" ]; then
+ if [ -n "${CONFIG_KALLSYMS_ABSOLUTE_PERCPU}" ]; then
kallsymopt="${kallsymopt} --absolute-percpu"
fi
+ if [ -n "${CONFIG_KALLSYMS_BASE_RELATIVE}" ]; then
+ kallsymopt="${kallsymopt} --base-relative"
+ fi
+
local aflags="${KBUILD_AFLAGS} ${KBUILD_AFLAGS_KERNEL} \
${NOSTDINC_FLAGS} ${LINUXINCLUDE} ${KBUILD_CPPFLAGS}"
diff --git a/scripts/namespace.pl b/scripts/namespace.pl
index a71be6b7cdec..9f3c9d47a4a5 100755
--- a/scripts/namespace.pl
+++ b/scripts/namespace.pl
@@ -117,6 +117,8 @@ my %nameexception = (
'kallsyms_names' => 1,
'kallsyms_num_syms' => 1,
'kallsyms_addresses'=> 1,
+ 'kallsyms_offsets' => 1,
+ 'kallsyms_relative_base'=> 1,
'__this_module' => 1,
'_etext' => 1,
'_edata' => 1,
diff --git a/scripts/sortextable.c b/scripts/sortextable.c
index c2423d913b46..af247c70fb66 100644
--- a/scripts/sortextable.c
+++ b/scripts/sortextable.c
@@ -282,12 +282,12 @@ do_file(char const *const fname)
case EM_386:
case EM_X86_64:
case EM_S390:
+ case EM_AARCH64:
custom_sort = sort_relative_table;
break;
case EM_ARCOMPACT:
case EM_ARCV2:
case EM_ARM:
- case EM_AARCH64:
case EM_MICROBLAZE:
case EM_MIPS:
case EM_XTENSA:
diff --git a/sound/arm/pxa2xx-pcm-lib.c b/sound/arm/pxa2xx-pcm-lib.c
index e9b98af6b52c..e8da3b8ee721 100644
--- a/sound/arm/pxa2xx-pcm-lib.c
+++ b/sound/arm/pxa2xx-pcm-lib.c
@@ -141,10 +141,8 @@ int pxa2xx_pcm_mmap(struct snd_pcm_substream *substream,
struct vm_area_struct *vma)
{
struct snd_pcm_runtime *runtime = substream->runtime;
- return dma_mmap_writecombine(substream->pcm->card->dev, vma,
- runtime->dma_area,
- runtime->dma_addr,
- runtime->dma_bytes);
+ return dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area,
+ runtime->dma_addr, runtime->dma_bytes);
}
EXPORT_SYMBOL(pxa2xx_pcm_mmap);
@@ -156,8 +154,7 @@ int pxa2xx_pcm_preallocate_dma_buffer(struct snd_pcm *pcm, int stream)
buf->dev.type = SNDRV_DMA_TYPE_DEV;
buf->dev.dev = pcm->card->dev;
buf->private_data = NULL;
- buf->area = dma_alloc_writecombine(pcm->card->dev, size,
- &buf->addr, GFP_KERNEL);
+ buf->area = dma_alloc_wc(pcm->card->dev, size, &buf->addr, GFP_KERNEL);
if (!buf->area)
return -ENOMEM;
buf->bytes = size;
@@ -178,8 +175,7 @@ void pxa2xx_pcm_free_dma_buffers(struct snd_pcm *pcm)
buf = &substream->dma_buffer;
if (!buf->area)
continue;
- dma_free_writecombine(pcm->card->dev, buf->bytes,
- buf->area, buf->addr);
+ dma_free_wc(pcm->card->dev, buf->bytes, buf->area, buf->addr);
buf->area = NULL;
}
}
diff --git a/sound/drivers/pcsp/pcsp.c b/sound/drivers/pcsp/pcsp.c
index 27e25bb78c97..72e2d0012084 100644
--- a/sound/drivers/pcsp/pcsp.c
+++ b/sound/drivers/pcsp/pcsp.c
@@ -14,6 +14,7 @@
#include <linux/input.h>
#include <linux/delay.h>
#include <linux/bitops.h>
+#include <linux/mm.h>
#include "pcsp_input.h"
#include "pcsp.h"
@@ -148,11 +149,11 @@ static int alsa_card_pcsp_init(struct device *dev)
return err;
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
/* Well, CONFIG_DEBUG_PAGEALLOC makes the sound horrible. Lets alert */
- printk(KERN_WARNING "PCSP: CONFIG_DEBUG_PAGEALLOC is enabled, "
- "which may make the sound noisy.\n");
-#endif
+ if (debug_pagealloc_enabled()) {
+ printk(KERN_WARNING "PCSP: CONFIG_DEBUG_PAGEALLOC is enabled, "
+ "which may make the sound noisy.\n");
+ }
return 0;
}
diff --git a/sound/soc/fsl/imx-pcm-fiq.c b/sound/soc/fsl/imx-pcm-fiq.c
index 49d7513f429e..e63cd5ecfd8f 100644
--- a/sound/soc/fsl/imx-pcm-fiq.c
+++ b/sound/soc/fsl/imx-pcm-fiq.c
@@ -217,8 +217,8 @@ static int snd_imx_pcm_mmap(struct snd_pcm_substream *substream,
struct snd_pcm_runtime *runtime = substream->runtime;
int ret;
- ret = dma_mmap_writecombine(substream->pcm->card->dev, vma,
- runtime->dma_area, runtime->dma_addr, runtime->dma_bytes);
+ ret = dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area,
+ runtime->dma_addr, runtime->dma_bytes);
pr_debug("%s: ret: %d %p %pad 0x%08x\n", __func__, ret,
runtime->dma_area,
@@ -247,8 +247,7 @@ static int imx_pcm_preallocate_dma_buffer(struct snd_pcm *pcm, int stream)
buf->dev.type = SNDRV_DMA_TYPE_DEV;
buf->dev.dev = pcm->card->dev;
buf->private_data = NULL;
- buf->area = dma_alloc_writecombine(pcm->card->dev, size,
- &buf->addr, GFP_KERNEL);
+ buf->area = dma_alloc_wc(pcm->card->dev, size, &buf->addr, GFP_KERNEL);
if (!buf->area)
return -ENOMEM;
buf->bytes = size;
@@ -330,8 +329,7 @@ static void imx_pcm_free(struct snd_pcm *pcm)
if (!buf->area)
continue;
- dma_free_writecombine(pcm->card->dev, buf->bytes,
- buf->area, buf->addr);
+ dma_free_wc(pcm->card->dev, buf->bytes, buf->area, buf->addr);
buf->area = NULL;
}
}
diff --git a/sound/soc/nuc900/nuc900-pcm.c b/sound/soc/nuc900/nuc900-pcm.c
index e09326158bc2..2cca055fd806 100644
--- a/sound/soc/nuc900/nuc900-pcm.c
+++ b/sound/soc/nuc900/nuc900-pcm.c
@@ -267,10 +267,8 @@ static int nuc900_dma_mmap(struct snd_pcm_substream *substream,
{
struct snd_pcm_runtime *runtime = substream->runtime;
- return dma_mmap_writecombine(substream->pcm->card->dev, vma,
- runtime->dma_area,
- runtime->dma_addr,
- runtime->dma_bytes);
+ return dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area,
+ runtime->dma_addr, runtime->dma_bytes);
}
static struct snd_pcm_ops nuc900_dma_ops = {
diff --git a/sound/soc/omap/omap-pcm.c b/sound/soc/omap/omap-pcm.c
index 6bb623a2a4df..99381a27295b 100644
--- a/sound/soc/omap/omap-pcm.c
+++ b/sound/soc/omap/omap-pcm.c
@@ -156,10 +156,8 @@ static int omap_pcm_mmap(struct snd_pcm_substream *substream,
{
struct snd_pcm_runtime *runtime = substream->runtime;
- return dma_mmap_writecombine(substream->pcm->card->dev, vma,
- runtime->dma_area,
- runtime->dma_addr,
- runtime->dma_bytes);
+ return dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area,
+ runtime->dma_addr, runtime->dma_bytes);
}
static struct snd_pcm_ops omap_pcm_ops = {
@@ -183,8 +181,7 @@ static int omap_pcm_preallocate_dma_buffer(struct snd_pcm *pcm,
buf->dev.type = SNDRV_DMA_TYPE_DEV;
buf->dev.dev = pcm->card->dev;
buf->private_data = NULL;
- buf->area = dma_alloc_writecombine(pcm->card->dev, size,
- &buf->addr, GFP_KERNEL);
+ buf->area = dma_alloc_wc(pcm->card->dev, size, &buf->addr, GFP_KERNEL);
if (!buf->area)
return -ENOMEM;
@@ -207,8 +204,7 @@ static void omap_pcm_free_dma_buffers(struct snd_pcm *pcm)
if (!buf->area)
continue;
- dma_free_writecombine(pcm->card->dev, buf->bytes,
- buf->area, buf->addr);
+ dma_free_wc(pcm->card->dev, buf->bytes, buf->area, buf->addr);
buf->area = NULL;
}
}
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 118010553d0c..5da5a9511cef 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -602,7 +602,7 @@ static int gfpcmp(const void *a, const void *b)
return fa->flags - fb->flags;
}
-/* see include/trace/events/gfpflags.h */
+/* see include/trace/events/mmflags.h */
static const struct {
const char *original;
const char *compact;
@@ -612,30 +612,39 @@ static const struct {
{ "GFP_HIGHUSER", "HU" },
{ "GFP_USER", "U" },
{ "GFP_TEMPORARY", "TMP" },
+ { "GFP_KERNEL_ACCOUNT", "KAC" },
{ "GFP_KERNEL", "K" },
{ "GFP_NOFS", "NF" },
{ "GFP_ATOMIC", "A" },
{ "GFP_NOIO", "NI" },
- { "GFP_HIGH", "H" },
- { "GFP_WAIT", "W" },
- { "GFP_IO", "I" },
- { "GFP_COLD", "CO" },
- { "GFP_NOWARN", "NWR" },
- { "GFP_REPEAT", "R" },
- { "GFP_NOFAIL", "NF" },
- { "GFP_NORETRY", "NR" },
- { "GFP_COMP", "C" },
- { "GFP_ZERO", "Z" },
- { "GFP_NOMEMALLOC", "NMA" },
- { "GFP_MEMALLOC", "MA" },
- { "GFP_HARDWALL", "HW" },
- { "GFP_THISNODE", "TN" },
- { "GFP_RECLAIMABLE", "RC" },
- { "GFP_MOVABLE", "M" },
- { "GFP_NOTRACK", "NT" },
- { "GFP_NO_KSWAPD", "NK" },
- { "GFP_OTHER_NODE", "ON" },
{ "GFP_NOWAIT", "NW" },
+ { "GFP_DMA", "D" },
+ { "__GFP_HIGHMEM", "HM" },
+ { "GFP_DMA32", "D32" },
+ { "__GFP_HIGH", "H" },
+ { "__GFP_ATOMIC", "_A" },
+ { "__GFP_IO", "I" },
+ { "__GFP_FS", "F" },
+ { "__GFP_COLD", "CO" },
+ { "__GFP_NOWARN", "NWR" },
+ { "__GFP_REPEAT", "R" },
+ { "__GFP_NOFAIL", "NF" },
+ { "__GFP_NORETRY", "NR" },
+ { "__GFP_COMP", "C" },
+ { "__GFP_ZERO", "Z" },
+ { "__GFP_NOMEMALLOC", "NMA" },
+ { "__GFP_MEMALLOC", "MA" },
+ { "__GFP_HARDWALL", "HW" },
+ { "__GFP_THISNODE", "TN" },
+ { "__GFP_RECLAIMABLE", "RC" },
+ { "__GFP_MOVABLE", "M" },
+ { "__GFP_ACCOUNT", "AC" },
+ { "__GFP_NOTRACK", "NT" },
+ { "__GFP_WRITE", "WR" },
+ { "__GFP_RECLAIM", "R" },
+ { "__GFP_DIRECT_RECLAIM", "DR" },
+ { "__GFP_KSWAPD_RECLAIM", "KR" },
+ { "__GFP_OTHER_NODE", "ON" },
};
static size_t max_gfp_len;
diff --git a/tools/testing/radix-tree/.gitignore b/tools/testing/radix-tree/.gitignore
new file mode 100644
index 000000000000..11d888ca6a92
--- /dev/null
+++ b/tools/testing/radix-tree/.gitignore
@@ -0,0 +1,2 @@
+main
+radix-tree.c
diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile
new file mode 100644
index 000000000000..3698a1a12ece
--- /dev/null
+++ b/tools/testing/radix-tree/Makefile
@@ -0,0 +1,20 @@
+
+CFLAGS += -I. -g -Wall -D_LGPL_SOURCE
+LDFLAGS += -lpthread -lurcu
+TARGETS = main
+OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_next_bit.o \
+ regression1.o regression2.o regression3.o
+
+targets: $(TARGETS)
+
+main: $(OFILES)
+ $(CC) $(CFLAGS) $(LDFLAGS) $(OFILES) -o main
+
+clean:
+ $(RM) -f $(TARGETS) *.o radix-tree.c
+
+$(OFILES): *.h */*.h
+
+radix-tree.c: ../../../lib/radix-tree.c
+ sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@
+
diff --git a/tools/testing/radix-tree/find_next_bit.c b/tools/testing/radix-tree/find_next_bit.c
new file mode 100644
index 000000000000..d1c2178bb2d4
--- /dev/null
+++ b/tools/testing/radix-tree/find_next_bit.c
@@ -0,0 +1,57 @@
+/* find_next_bit.c: fallback find next bit implementation
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/bitops.h>
+
+#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+
+/*
+ * Find the next set bit in a memory region.
+ */
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ const unsigned long *p = addr + BITOP_WORD(offset);
+ unsigned long result = offset & ~(BITS_PER_LONG-1);
+ unsigned long tmp;
+
+ if (offset >= size)
+ return size;
+ size -= result;
+ offset %= BITS_PER_LONG;
+ if (offset) {
+ tmp = *(p++);
+ tmp &= (~0UL << offset);
+ if (size < BITS_PER_LONG)
+ goto found_first;
+ if (tmp)
+ goto found_middle;
+ size -= BITS_PER_LONG;
+ result += BITS_PER_LONG;
+ }
+ while (size & ~(BITS_PER_LONG-1)) {
+ if ((tmp = *(p++)))
+ goto found_middle;
+ result += BITS_PER_LONG;
+ size -= BITS_PER_LONG;
+ }
+ if (!size)
+ return result;
+ tmp = *p;
+
+found_first:
+ tmp &= (~0UL >> (BITS_PER_LONG - size));
+ if (tmp == 0UL) /* Are any bits set? */
+ return result + size; /* Nope. */
+found_middle:
+ return result + __ffs(tmp);
+}
diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c
new file mode 100644
index 000000000000..154823737b20
--- /dev/null
+++ b/tools/testing/radix-tree/linux.c
@@ -0,0 +1,60 @@
+#include <stdlib.h>
+#include <string.h>
+#include <malloc.h>
+#include <unistd.h>
+#include <assert.h>
+
+#include <linux/mempool.h>
+#include <linux/slab.h>
+#include <urcu/uatomic.h>
+
+int nr_allocated;
+
+void *mempool_alloc(mempool_t *pool, int gfp_mask)
+{
+ return pool->alloc(gfp_mask, pool->data);
+}
+
+void mempool_free(void *element, mempool_t *pool)
+{
+ pool->free(element, pool->data);
+}
+
+mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data)
+{
+ mempool_t *ret = malloc(sizeof(*ret));
+
+ ret->alloc = alloc_fn;
+ ret->free = free_fn;
+ ret->data = pool_data;
+ return ret;
+}
+
+void *kmem_cache_alloc(struct kmem_cache *cachep, int flags)
+{
+ void *ret = malloc(cachep->size);
+ if (cachep->ctor)
+ cachep->ctor(ret);
+ uatomic_inc(&nr_allocated);
+ return ret;
+}
+
+void kmem_cache_free(struct kmem_cache *cachep, void *objp)
+{
+ assert(objp);
+ uatomic_dec(&nr_allocated);
+ memset(objp, 0, cachep->size);
+ free(objp);
+}
+
+struct kmem_cache *
+kmem_cache_create(const char *name, size_t size, size_t offset,
+ unsigned long flags, void (*ctor)(void *))
+{
+ struct kmem_cache *ret = malloc(sizeof(*ret));
+
+ ret->size = size;
+ ret->ctor = ctor;
+ return ret;
+}
diff --git a/tools/testing/radix-tree/linux/bitops.h b/tools/testing/radix-tree/linux/bitops.h
new file mode 100644
index 000000000000..71d58427ab60
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops.h
@@ -0,0 +1,150 @@
+#ifndef _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
+#define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
+
+#include <linux/types.h>
+
+#define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
+#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+
+/**
+ * __set_bit - Set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * Unlike set_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __set_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+
+ *p |= mask;
+}
+
+static inline void __clear_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+
+ *p &= ~mask;
+}
+
+/**
+ * __change_bit - Toggle a bit in memory
+ * @nr: the bit to change
+ * @addr: the address to start counting from
+ *
+ * Unlike change_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __change_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+
+ *p ^= mask;
+}
+
+/**
+ * __test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail. You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+ unsigned long old = *p;
+
+ *p = old | mask;
+ return (old & mask) != 0;
+}
+
+/**
+ * __test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail. You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+ unsigned long old = *p;
+
+ *p = old & ~mask;
+ return (old & mask) != 0;
+}
+
+/* WARNING: non atomic and it can be reordered! */
+static inline int __test_and_change_bit(int nr,
+ volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+ unsigned long old = *p;
+
+ *p = old ^ mask;
+ return (old & mask) != 0;
+}
+
+/**
+ * test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static inline int test_bit(int nr, const volatile unsigned long *addr)
+{
+ return 1UL & (addr[BITOP_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
+}
+
+/**
+ * __ffs - find first bit in word.
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __ffs(unsigned long word)
+{
+ int num = 0;
+
+ if ((word & 0xffffffff) == 0) {
+ num += 32;
+ word >>= 32;
+ }
+ if ((word & 0xffff) == 0) {
+ num += 16;
+ word >>= 16;
+ }
+ if ((word & 0xff) == 0) {
+ num += 8;
+ word >>= 8;
+ }
+ if ((word & 0xf) == 0) {
+ num += 4;
+ word >>= 4;
+ }
+ if ((word & 0x3) == 0) {
+ num += 2;
+ word >>= 2;
+ }
+ if ((word & 0x1) == 0)
+ num += 1;
+ return num;
+}
+
+unsigned long find_next_bit(const unsigned long *addr,
+ unsigned long size,
+ unsigned long offset);
+
+#endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/__ffs.h b/tools/testing/radix-tree/linux/bitops/__ffs.h
new file mode 100644
index 000000000000..9a3274aecf83
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/__ffs.h
@@ -0,0 +1,43 @@
+#ifndef _ASM_GENERIC_BITOPS___FFS_H_
+#define _ASM_GENERIC_BITOPS___FFS_H_
+
+#include <asm/types.h>
+
+/**
+ * __ffs - find first bit in word.
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __ffs(unsigned long word)
+{
+ int num = 0;
+
+#if BITS_PER_LONG == 64
+ if ((word & 0xffffffff) == 0) {
+ num += 32;
+ word >>= 32;
+ }
+#endif
+ if ((word & 0xffff) == 0) {
+ num += 16;
+ word >>= 16;
+ }
+ if ((word & 0xff) == 0) {
+ num += 8;
+ word >>= 8;
+ }
+ if ((word & 0xf) == 0) {
+ num += 4;
+ word >>= 4;
+ }
+ if ((word & 0x3) == 0) {
+ num += 2;
+ word >>= 2;
+ }
+ if ((word & 0x1) == 0)
+ num += 1;
+ return num;
+}
+
+#endif /* _ASM_GENERIC_BITOPS___FFS_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/ffs.h b/tools/testing/radix-tree/linux/bitops/ffs.h
new file mode 100644
index 000000000000..fbbb43af7dc0
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/ffs.h
@@ -0,0 +1,41 @@
+#ifndef _ASM_GENERIC_BITOPS_FFS_H_
+#define _ASM_GENERIC_BITOPS_FFS_H_
+
+/**
+ * ffs - find first bit set
+ * @x: the word to search
+ *
+ * This is defined the same way as
+ * the libc and compiler builtin ffs routines, therefore
+ * differs in spirit from the above ffz (man ffs).
+ */
+static inline int ffs(int x)
+{
+ int r = 1;
+
+ if (!x)
+ return 0;
+ if (!(x & 0xffff)) {
+ x >>= 16;
+ r += 16;
+ }
+ if (!(x & 0xff)) {
+ x >>= 8;
+ r += 8;
+ }
+ if (!(x & 0xf)) {
+ x >>= 4;
+ r += 4;
+ }
+ if (!(x & 3)) {
+ x >>= 2;
+ r += 2;
+ }
+ if (!(x & 1)) {
+ x >>= 1;
+ r += 1;
+ }
+ return r;
+}
+
+#endif /* _ASM_GENERIC_BITOPS_FFS_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/ffz.h b/tools/testing/radix-tree/linux/bitops/ffz.h
new file mode 100644
index 000000000000..6744bd4cdf46
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/ffz.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_GENERIC_BITOPS_FFZ_H_
+#define _ASM_GENERIC_BITOPS_FFZ_H_
+
+/*
+ * ffz - find first zero in word.
+ * @word: The word to search
+ *
+ * Undefined if no zero exists, so code should check against ~0UL first.
+ */
+#define ffz(x) __ffs(~(x))
+
+#endif /* _ASM_GENERIC_BITOPS_FFZ_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/find.h b/tools/testing/radix-tree/linux/bitops/find.h
new file mode 100644
index 000000000000..72a51e5a12ef
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/find.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_GENERIC_BITOPS_FIND_H_
+#define _ASM_GENERIC_BITOPS_FIND_H_
+
+extern unsigned long find_next_bit(const unsigned long *addr, unsigned long
+ size, unsigned long offset);
+
+extern unsigned long find_next_zero_bit(const unsigned long *addr, unsigned
+ long size, unsigned long offset);
+
+#define find_first_bit(addr, size) find_next_bit((addr), (size), 0)
+#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0)
+
+#endif /*_ASM_GENERIC_BITOPS_FIND_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/fls.h b/tools/testing/radix-tree/linux/bitops/fls.h
new file mode 100644
index 000000000000..850859bc5069
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/fls.h
@@ -0,0 +1,41 @@
+#ifndef _ASM_GENERIC_BITOPS_FLS_H_
+#define _ASM_GENERIC_BITOPS_FLS_H_
+
+/**
+ * fls - find last (most-significant) bit set
+ * @x: the word to search
+ *
+ * This is defined the same way as ffs.
+ * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
+ */
+
+static inline int fls(int x)
+{
+ int r = 32;
+
+ if (!x)
+ return 0;
+ if (!(x & 0xffff0000u)) {
+ x <<= 16;
+ r -= 16;
+ }
+ if (!(x & 0xff000000u)) {
+ x <<= 8;
+ r -= 8;
+ }
+ if (!(x & 0xf0000000u)) {
+ x <<= 4;
+ r -= 4;
+ }
+ if (!(x & 0xc0000000u)) {
+ x <<= 2;
+ r -= 2;
+ }
+ if (!(x & 0x80000000u)) {
+ x <<= 1;
+ r -= 1;
+ }
+ return r;
+}
+
+#endif /* _ASM_GENERIC_BITOPS_FLS_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/fls64.h b/tools/testing/radix-tree/linux/bitops/fls64.h
new file mode 100644
index 000000000000..1b6b17ce2428
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/fls64.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_GENERIC_BITOPS_FLS64_H_
+#define _ASM_GENERIC_BITOPS_FLS64_H_
+
+#include <asm/types.h>
+
+static inline int fls64(__u64 x)
+{
+ __u32 h = x >> 32;
+ if (h)
+ return fls(h) + 32;
+ return fls(x);
+}
+
+#endif /* _ASM_GENERIC_BITOPS_FLS64_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/hweight.h b/tools/testing/radix-tree/linux/bitops/hweight.h
new file mode 100644
index 000000000000..fbbc383771da
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/hweight.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_GENERIC_BITOPS_HWEIGHT_H_
+#define _ASM_GENERIC_BITOPS_HWEIGHT_H_
+
+#include <asm/types.h>
+
+extern unsigned int hweight32(unsigned int w);
+extern unsigned int hweight16(unsigned int w);
+extern unsigned int hweight8(unsigned int w);
+extern unsigned long hweight64(__u64 w);
+
+#endif /* _ASM_GENERIC_BITOPS_HWEIGHT_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/le.h b/tools/testing/radix-tree/linux/bitops/le.h
new file mode 100644
index 000000000000..b9c7e5d2d2ad
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/le.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_GENERIC_BITOPS_LE_H_
+#define _ASM_GENERIC_BITOPS_LE_H_
+
+#include <asm/types.h>
+#include <asm/byteorder.h>
+
+#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+#define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7)
+
+#if defined(__LITTLE_ENDIAN)
+
+#define generic_test_le_bit(nr, addr) test_bit(nr, addr)
+#define generic___set_le_bit(nr, addr) __set_bit(nr, addr)
+#define generic___clear_le_bit(nr, addr) __clear_bit(nr, addr)
+
+#define generic_test_and_set_le_bit(nr, addr) test_and_set_bit(nr, addr)
+#define generic_test_and_clear_le_bit(nr, addr) test_and_clear_bit(nr, addr)
+
+#define generic___test_and_set_le_bit(nr, addr) __test_and_set_bit(nr, addr)
+#define generic___test_and_clear_le_bit(nr, addr) __test_and_clear_bit(nr, addr)
+
+#define generic_find_next_zero_le_bit(addr, size, offset) find_next_zero_bit(addr, size, offset)
+
+#elif defined(__BIG_ENDIAN)
+
+#define generic_test_le_bit(nr, addr) \
+ test_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+#define generic___set_le_bit(nr, addr) \
+ __set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+#define generic___clear_le_bit(nr, addr) \
+ __clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+
+#define generic_test_and_set_le_bit(nr, addr) \
+ test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+#define generic_test_and_clear_le_bit(nr, addr) \
+ test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+
+#define generic___test_and_set_le_bit(nr, addr) \
+ __test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+#define generic___test_and_clear_le_bit(nr, addr) \
+ __test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+
+extern unsigned long generic_find_next_zero_le_bit(const unsigned long *addr,
+ unsigned long size, unsigned long offset);
+
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+
+#define generic_find_first_zero_le_bit(addr, size) \
+ generic_find_next_zero_le_bit((addr), (size), 0)
+
+#endif /* _ASM_GENERIC_BITOPS_LE_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/non-atomic.h b/tools/testing/radix-tree/linux/bitops/non-atomic.h
new file mode 100644
index 000000000000..46a825cf2ae1
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/non-atomic.h
@@ -0,0 +1,111 @@
+#ifndef _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
+#define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
+
+#include <asm/types.h>
+
+#define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
+#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+
+/**
+ * __set_bit - Set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * Unlike set_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __set_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+
+ *p |= mask;
+}
+
+static inline void __clear_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+
+ *p &= ~mask;
+}
+
+/**
+ * __change_bit - Toggle a bit in memory
+ * @nr: the bit to change
+ * @addr: the address to start counting from
+ *
+ * Unlike change_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __change_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+
+ *p ^= mask;
+}
+
+/**
+ * __test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail. You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+ unsigned long old = *p;
+
+ *p = old | mask;
+ return (old & mask) != 0;
+}
+
+/**
+ * __test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail. You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+ unsigned long old = *p;
+
+ *p = old & ~mask;
+ return (old & mask) != 0;
+}
+
+/* WARNING: non atomic and it can be reordered! */
+static inline int __test_and_change_bit(int nr,
+ volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+ unsigned long old = *p;
+
+ *p = old ^ mask;
+ return (old & mask) != 0;
+}
+
+/**
+ * test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static inline int test_bit(int nr, const volatile unsigned long *addr)
+{
+ return 1UL & (addr[BITOP_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
+}
+
+#endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */
diff --git a/tools/testing/radix-tree/linux/bug.h b/tools/testing/radix-tree/linux/bug.h
new file mode 100644
index 000000000000..ccbe444977df
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bug.h
@@ -0,0 +1 @@
+#define WARN_ON_ONCE(x) assert(x)
diff --git a/tools/testing/radix-tree/linux/cpu.h b/tools/testing/radix-tree/linux/cpu.h
new file mode 100644
index 000000000000..b0edb1bc64de
--- /dev/null
+++ b/tools/testing/radix-tree/linux/cpu.h
@@ -0,0 +1,35 @@
+
+#define hotcpu_notifier(a, b)
+
+#define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */
+#define CPU_UP_PREPARE 0x0003 /* CPU (unsigned)v coming up */
+#define CPU_UP_CANCELED 0x0004 /* CPU (unsigned)v NOT coming up */
+#define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */
+#define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */
+#define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */
+#define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task,
+ * not handling interrupts, soon dead.
+ * Called on the dying cpu, interrupts
+ * are already disabled. Must not
+ * sleep, must not fail */
+#define CPU_POST_DEAD 0x0009 /* CPU (unsigned)v dead, cpu_hotplug
+ * lock is dropped */
+#define CPU_STARTING 0x000A /* CPU (unsigned)v soon running.
+ * Called on the new cpu, just before
+ * enabling interrupts. Must not sleep,
+ * must not fail */
+#define CPU_DYING_IDLE 0x000B /* CPU (unsigned)v dying, reached
+ * idle loop. */
+#define CPU_BROKEN 0x000C /* CPU (unsigned)v did not die properly,
+ * perhaps due to preemption. */
+#define CPU_TASKS_FROZEN 0x0010
+
+#define CPU_ONLINE_FROZEN (CPU_ONLINE | CPU_TASKS_FROZEN)
+#define CPU_UP_PREPARE_FROZEN (CPU_UP_PREPARE | CPU_TASKS_FROZEN)
+#define CPU_UP_CANCELED_FROZEN (CPU_UP_CANCELED | CPU_TASKS_FROZEN)
+#define CPU_DOWN_PREPARE_FROZEN (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN)
+#define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN)
+#define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN)
+#define CPU_DYING_FROZEN (CPU_DYING | CPU_TASKS_FROZEN)
+#define CPU_STARTING_FROZEN (CPU_STARTING | CPU_TASKS_FROZEN)
+
diff --git a/tools/testing/radix-tree/linux/export.h b/tools/testing/radix-tree/linux/export.h
new file mode 100644
index 000000000000..b6afd131998d
--- /dev/null
+++ b/tools/testing/radix-tree/linux/export.h
@@ -0,0 +1,2 @@
+
+#define EXPORT_SYMBOL(sym)
diff --git a/tools/testing/radix-tree/linux/gfp.h b/tools/testing/radix-tree/linux/gfp.h
new file mode 100644
index 000000000000..01f1eabba119
--- /dev/null
+++ b/tools/testing/radix-tree/linux/gfp.h
@@ -0,0 +1,8 @@
+#ifndef _GFP_H
+#define _GFP_H
+
+#define __GFP_BITS_SHIFT 22
+#define __GFP_BITS_MASK ((gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
+#define __GFP_WAIT 1
+
+#endif
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h
new file mode 100644
index 000000000000..ae013b0160ac
--- /dev/null
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -0,0 +1,35 @@
+#ifndef _KERNEL_H
+#define _KERNEL_H
+
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <limits.h>
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+#define BUG_ON(expr) assert(!(expr))
+#define __init
+#define __must_check
+#define panic(expr)
+#define printk printf
+#define __force
+#define likely(c) (c)
+#define unlikely(c) (c)
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+#define container_of(ptr, type, member) ({ \
+ const typeof( ((type *)0)->member ) *__mptr = (ptr); \
+ (type *)( (char *)__mptr - offsetof(type, member) );})
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+static inline int in_interrupt(void)
+{
+ return 0;
+}
+#endif /* _KERNEL_H */
diff --git a/tools/testing/radix-tree/linux/kmemleak.h b/tools/testing/radix-tree/linux/kmemleak.h
new file mode 100644
index 000000000000..155f112786c4
--- /dev/null
+++ b/tools/testing/radix-tree/linux/kmemleak.h
@@ -0,0 +1 @@
+static inline void kmemleak_update_trace(const void *ptr) { }
diff --git a/tools/testing/radix-tree/linux/mempool.h b/tools/testing/radix-tree/linux/mempool.h
new file mode 100644
index 000000000000..650a1ac47da6
--- /dev/null
+++ b/tools/testing/radix-tree/linux/mempool.h
@@ -0,0 +1,17 @@
+
+#include <linux/slab.h>
+
+typedef void *(mempool_alloc_t)(int gfp_mask, void *pool_data);
+typedef void (mempool_free_t)(void *element, void *pool_data);
+
+typedef struct {
+ mempool_alloc_t *alloc;
+ mempool_free_t *free;
+ void *data;
+} mempool_t;
+
+void *mempool_alloc(mempool_t *pool, int gfp_mask);
+void mempool_free(void *element, mempool_t *pool);
+mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data);
+
diff --git a/tools/testing/radix-tree/linux/notifier.h b/tools/testing/radix-tree/linux/notifier.h
new file mode 100644
index 000000000000..70e4797d5a46
--- /dev/null
+++ b/tools/testing/radix-tree/linux/notifier.h
@@ -0,0 +1,8 @@
+#ifndef _NOTIFIER_H
+#define _NOTIFIER_H
+
+struct notifier_block;
+
+#define NOTIFY_OK 0x0001 /* Suits me */
+
+#endif
diff --git a/tools/testing/radix-tree/linux/percpu.h b/tools/testing/radix-tree/linux/percpu.h
new file mode 100644
index 000000000000..5837f1d56f17
--- /dev/null
+++ b/tools/testing/radix-tree/linux/percpu.h
@@ -0,0 +1,7 @@
+
+#define DEFINE_PER_CPU(type, val) type val
+
+#define __get_cpu_var(var) var
+#define this_cpu_ptr(var) var
+#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
+#define per_cpu(var, cpu) (*per_cpu_ptr(&(var), cpu))
diff --git a/tools/testing/radix-tree/linux/preempt.h b/tools/testing/radix-tree/linux/preempt.h
new file mode 100644
index 000000000000..c50529e4d98b
--- /dev/null
+++ b/tools/testing/radix-tree/linux/preempt.h
@@ -0,0 +1,5 @@
+/* */
+
+#define preempt_disable() do { } while (0)
+#define preempt_enable() do { } while (0)
+
diff --git a/tools/testing/radix-tree/linux/radix-tree.h b/tools/testing/radix-tree/linux/radix-tree.h
new file mode 100644
index 000000000000..ce694ddd4aea
--- /dev/null
+++ b/tools/testing/radix-tree/linux/radix-tree.h
@@ -0,0 +1 @@
+#include "../../../../include/linux/radix-tree.h"
diff --git a/tools/testing/radix-tree/linux/rcupdate.h b/tools/testing/radix-tree/linux/rcupdate.h
new file mode 100644
index 000000000000..f7129ea2a899
--- /dev/null
+++ b/tools/testing/radix-tree/linux/rcupdate.h
@@ -0,0 +1,9 @@
+#ifndef _RCUPDATE_H
+#define _RCUPDATE_H
+
+#include <urcu.h>
+
+#define rcu_dereference_raw(p) rcu_dereference(p)
+#define rcu_dereference_protected(p, cond) rcu_dereference(p)
+
+#endif
diff --git a/tools/testing/radix-tree/linux/slab.h b/tools/testing/radix-tree/linux/slab.h
new file mode 100644
index 000000000000..57282506c21d
--- /dev/null
+++ b/tools/testing/radix-tree/linux/slab.h
@@ -0,0 +1,28 @@
+#ifndef SLAB_H
+#define SLAB_H
+
+#include <linux/types.h>
+
+#define GFP_KERNEL 1
+#define SLAB_HWCACHE_ALIGN 1
+#define SLAB_PANIC 2
+#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */
+
+static inline int gfpflags_allow_blocking(gfp_t mask)
+{
+ return 1;
+}
+
+struct kmem_cache {
+ int size;
+ void (*ctor)(void *);
+};
+
+void *kmem_cache_alloc(struct kmem_cache *cachep, int flags);
+void kmem_cache_free(struct kmem_cache *cachep, void *objp);
+
+struct kmem_cache *
+kmem_cache_create(const char *name, size_t size, size_t offset,
+ unsigned long flags, void (*ctor)(void *));
+
+#endif /* SLAB_H */
diff --git a/tools/testing/radix-tree/linux/types.h b/tools/testing/radix-tree/linux/types.h
new file mode 100644
index 000000000000..72a9d85f6c76
--- /dev/null
+++ b/tools/testing/radix-tree/linux/types.h
@@ -0,0 +1,28 @@
+#ifndef _TYPES_H
+#define _TYPES_H
+
+#define __rcu
+#define __read_mostly
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+
+struct list_head {
+ struct list_head *next, *prev;
+};
+
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+ list->next = list;
+ list->prev = list;
+}
+
+typedef struct {
+ unsigned int x;
+} spinlock_t;
+
+#define uninitialized_var(x) x = x
+
+typedef unsigned gfp_t;
+#include <linux/gfp.h>
+
+#endif
diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c
new file mode 100644
index 000000000000..0e83cad27a9f
--- /dev/null
+++ b/tools/testing/radix-tree/main.c
@@ -0,0 +1,272 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <assert.h>
+
+#include <linux/slab.h>
+#include <linux/radix-tree.h>
+
+#include "test.h"
+#include "regression.h"
+
+void __gang_check(unsigned long middle, long down, long up, int chunk, int hop)
+{
+ long idx;
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ middle = 1 << 30;
+
+ for (idx = -down; idx < up; idx++)
+ item_insert(&tree, middle + idx);
+
+ item_check_absent(&tree, middle - down - 1);
+ for (idx = -down; idx < up; idx++)
+ item_check_present(&tree, middle + idx);
+ item_check_absent(&tree, middle + up);
+
+ item_gang_check_present(&tree, middle - down,
+ up + down, chunk, hop);
+ item_full_scan(&tree, middle - down, down + up, chunk);
+ item_kill_tree(&tree);
+}
+
+void gang_check(void)
+{
+ __gang_check(1 << 30, 128, 128, 35, 2);
+ __gang_check(1 << 31, 128, 128, 32, 32);
+ __gang_check(1 << 31, 128, 128, 32, 100);
+ __gang_check(1 << 31, 128, 128, 17, 7);
+ __gang_check(0xffff0000, 0, 65536, 17, 7);
+ __gang_check(0xfffffffe, 1, 1, 17, 7);
+}
+
+void __big_gang_check(void)
+{
+ unsigned long start;
+ int wrapped = 0;
+
+ start = 0;
+ do {
+ unsigned long old_start;
+
+// printf("0x%08lx\n", start);
+ __gang_check(start, rand() % 113 + 1, rand() % 71,
+ rand() % 157, rand() % 91 + 1);
+ old_start = start;
+ start += rand() % 1000000;
+ start %= 1ULL << 33;
+ if (start < old_start)
+ wrapped = 1;
+ } while (!wrapped);
+}
+
+void big_gang_check(void)
+{
+ int i;
+
+ for (i = 0; i < 1000; i++) {
+ __big_gang_check();
+ srand(time(0));
+ printf("%d ", i);
+ fflush(stdout);
+ }
+}
+
+void add_and_check(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ item_insert(&tree, 44);
+ item_check_present(&tree, 44);
+ item_check_absent(&tree, 43);
+ item_kill_tree(&tree);
+}
+
+void dynamic_height_check(void)
+{
+ int i;
+ RADIX_TREE(tree, GFP_KERNEL);
+ tree_verify_min_height(&tree, 0);
+
+ item_insert(&tree, 42);
+ tree_verify_min_height(&tree, 42);
+
+ item_insert(&tree, 1000000);
+ tree_verify_min_height(&tree, 1000000);
+
+ assert(item_delete(&tree, 1000000));
+ tree_verify_min_height(&tree, 42);
+
+ assert(item_delete(&tree, 42));
+ tree_verify_min_height(&tree, 0);
+
+ for (i = 0; i < 1000; i++) {
+ item_insert(&tree, i);
+ tree_verify_min_height(&tree, i);
+ }
+
+ i--;
+ for (;;) {
+ assert(item_delete(&tree, i));
+ if (i == 0) {
+ tree_verify_min_height(&tree, 0);
+ break;
+ }
+ i--;
+ tree_verify_min_height(&tree, i);
+ }
+
+ item_kill_tree(&tree);
+}
+
+void check_copied_tags(struct radix_tree_root *tree, unsigned long start, unsigned long end, unsigned long *idx, int count, int fromtag, int totag)
+{
+ int i;
+
+ for (i = 0; i < count; i++) {
+/* if (i % 1000 == 0)
+ putchar('.'); */
+ if (idx[i] < start || idx[i] > end) {
+ if (item_tag_get(tree, idx[i], totag)) {
+ printf("%lu-%lu: %lu, tags %d-%d\n", start, end, idx[i], item_tag_get(tree, idx[i], fromtag), item_tag_get(tree, idx[i], totag));
+ }
+ assert(!item_tag_get(tree, idx[i], totag));
+ continue;
+ }
+ if (item_tag_get(tree, idx[i], fromtag) ^
+ item_tag_get(tree, idx[i], totag)) {
+ printf("%lu-%lu: %lu, tags %d-%d\n", start, end, idx[i], item_tag_get(tree, idx[i], fromtag), item_tag_get(tree, idx[i], totag));
+ }
+ assert(!(item_tag_get(tree, idx[i], fromtag) ^
+ item_tag_get(tree, idx[i], totag)));
+ }
+}
+
+#define ITEMS 50000
+
+void copy_tag_check(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+ unsigned long idx[ITEMS];
+ unsigned long start, end, count = 0, tagged, cur, tmp;
+ int i;
+
+// printf("generating radix tree indices...\n");
+ start = rand();
+ end = rand();
+ if (start > end && (rand() % 10)) {
+ cur = start;
+ start = end;
+ end = cur;
+ }
+ /* Specifically create items around the start and the end of the range
+ * with high probability to check for off by one errors */
+ cur = rand();
+ if (cur & 1) {
+ item_insert(&tree, start);
+ if (cur & 2) {
+ if (start <= end)
+ count++;
+ item_tag_set(&tree, start, 0);
+ }
+ }
+ if (cur & 4) {
+ item_insert(&tree, start-1);
+ if (cur & 8)
+ item_tag_set(&tree, start-1, 0);
+ }
+ if (cur & 16) {
+ item_insert(&tree, end);
+ if (cur & 32) {
+ if (start <= end)
+ count++;
+ item_tag_set(&tree, end, 0);
+ }
+ }
+ if (cur & 64) {
+ item_insert(&tree, end+1);
+ if (cur & 128)
+ item_tag_set(&tree, end+1, 0);
+ }
+
+ for (i = 0; i < ITEMS; i++) {
+ do {
+ idx[i] = rand();
+ } while (item_lookup(&tree, idx[i]));
+
+ item_insert(&tree, idx[i]);
+ if (rand() & 1) {
+ item_tag_set(&tree, idx[i], 0);
+ if (idx[i] >= start && idx[i] <= end)
+ count++;
+ }
+/* if (i % 1000 == 0)
+ putchar('.'); */
+ }
+
+// printf("\ncopying tags...\n");
+ cur = start;
+ tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, ITEMS, 0, 1);
+
+// printf("checking copied tags\n");
+ assert(tagged == count);
+ check_copied_tags(&tree, start, end, idx, ITEMS, 0, 1);
+
+ /* Copy tags in several rounds */
+// printf("\ncopying tags...\n");
+ cur = start;
+ do {
+ tmp = rand() % (count/10+2);
+ tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, tmp, 0, 2);
+ } while (tmp == tagged);
+
+// printf("%lu %lu %lu\n", tagged, tmp, count);
+// printf("checking copied tags\n");
+ check_copied_tags(&tree, start, end, idx, ITEMS, 0, 2);
+ assert(tagged < tmp);
+ verify_tag_consistency(&tree, 0);
+ verify_tag_consistency(&tree, 1);
+ verify_tag_consistency(&tree, 2);
+// printf("\n");
+ item_kill_tree(&tree);
+}
+
+static void single_thread_tests(void)
+{
+ int i;
+
+ tag_check();
+ printf("after tag_check: %d allocated\n", nr_allocated);
+ gang_check();
+ printf("after gang_check: %d allocated\n", nr_allocated);
+ add_and_check();
+ printf("after add_and_check: %d allocated\n", nr_allocated);
+ dynamic_height_check();
+ printf("after dynamic_height_check: %d allocated\n", nr_allocated);
+ big_gang_check();
+ printf("after big_gang_check: %d allocated\n", nr_allocated);
+ for (i = 0; i < 2000; i++) {
+ copy_tag_check();
+ printf("%d ", i);
+ fflush(stdout);
+ }
+ printf("after copy_tag_check: %d allocated\n", nr_allocated);
+}
+
+int main(void)
+{
+ rcu_register_thread();
+ radix_tree_init();
+
+ regression1_test();
+ regression2_test();
+ regression3_test();
+ single_thread_tests();
+
+ sleep(1);
+ printf("after sleep(1): %d allocated\n", nr_allocated);
+ rcu_unregister_thread();
+
+ exit(0);
+}
diff --git a/tools/testing/radix-tree/rcupdate.c b/tools/testing/radix-tree/rcupdate.c
new file mode 100644
index 000000000000..31a2d14225d6
--- /dev/null
+++ b/tools/testing/radix-tree/rcupdate.c
@@ -0,0 +1,86 @@
+#include <linux/rcupdate.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <assert.h>
+
+static pthread_mutex_t rculock = PTHREAD_MUTEX_INITIALIZER;
+static struct rcu_head *rcuhead_global = NULL;
+static __thread int nr_rcuhead = 0;
+static __thread struct rcu_head *rcuhead = NULL;
+static __thread struct rcu_head *rcutail = NULL;
+
+static pthread_cond_t rcu_worker_cond = PTHREAD_COND_INITIALIZER;
+
+/* switch to urcu implementation when it is merged. */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *head))
+{
+ head->func = func;
+ head->next = rcuhead;
+ rcuhead = head;
+ if (!rcutail)
+ rcutail = head;
+ nr_rcuhead++;
+ if (nr_rcuhead >= 1000) {
+ int signal = 0;
+
+ pthread_mutex_lock(&rculock);
+ if (!rcuhead_global)
+ signal = 1;
+ rcutail->next = rcuhead_global;
+ rcuhead_global = head;
+ pthread_mutex_unlock(&rculock);
+
+ nr_rcuhead = 0;
+ rcuhead = NULL;
+ rcutail = NULL;
+
+ if (signal) {
+ pthread_cond_signal(&rcu_worker_cond);
+ }
+ }
+}
+
+static void *rcu_worker(void *arg)
+{
+ struct rcu_head *r;
+
+ rcupdate_thread_init();
+
+ while (1) {
+ pthread_mutex_lock(&rculock);
+ while (!rcuhead_global) {
+ pthread_cond_wait(&rcu_worker_cond, &rculock);
+ }
+ r = rcuhead_global;
+ rcuhead_global = NULL;
+
+ pthread_mutex_unlock(&rculock);
+
+ synchronize_rcu();
+
+ while (r) {
+ struct rcu_head *tmp = r->next;
+ r->func(r);
+ r = tmp;
+ }
+ }
+
+ rcupdate_thread_exit();
+
+ return NULL;
+}
+
+static pthread_t worker_thread;
+void rcupdate_init(void)
+{
+ pthread_create(&worker_thread, NULL, rcu_worker, NULL);
+}
+
+void rcupdate_thread_init(void)
+{
+ rcu_register_thread();
+}
+void rcupdate_thread_exit(void)
+{
+ rcu_unregister_thread();
+}
diff --git a/tools/testing/radix-tree/regression.h b/tools/testing/radix-tree/regression.h
new file mode 100644
index 000000000000..e018c4816688
--- /dev/null
+++ b/tools/testing/radix-tree/regression.h
@@ -0,0 +1,8 @@
+#ifndef __REGRESSION_H__
+#define __REGRESSION_H__
+
+void regression1_test(void);
+void regression2_test(void);
+void regression3_test(void);
+
+#endif
diff --git a/tools/testing/radix-tree/regression1.c b/tools/testing/radix-tree/regression1.c
new file mode 100644
index 000000000000..d471d3bd1345
--- /dev/null
+++ b/tools/testing/radix-tree/regression1.c
@@ -0,0 +1,221 @@
+/*
+ * Regression1
+ * Description:
+ * Salman Qazi describes the following radix-tree bug:
+ *
+ * In the following case, we get can get a deadlock:
+ *
+ * 0. The radix tree contains two items, one has the index 0.
+ * 1. The reader (in this case find_get_pages) takes the rcu_read_lock.
+ * 2. The reader acquires slot(s) for item(s) including the index 0 item.
+ * 3. The non-zero index item is deleted, and as a consequence the other item
+ * is moved to the root of the tree. The place where it used to be is queued
+ * for deletion after the readers finish.
+ * 3b. The zero item is deleted, removing it from the direct slot, it remains in
+ * the rcu-delayed indirect node.
+ * 4. The reader looks at the index 0 slot, and finds that the page has 0 ref
+ * count
+ * 5. The reader looks at it again, hoping that the item will either be freed
+ * or the ref count will increase. This never happens, as the slot it is
+ * looking at will never be updated. Also, this slot can never be reclaimed
+ * because the reader is holding rcu_read_lock and is in an infinite loop.
+ *
+ * The fix is to re-use the same "indirect" pointer case that requires a slot
+ * lookup retry into a general "retry the lookup" bit.
+ *
+ * Running:
+ * This test should run to completion in a few seconds. The above bug would
+ * cause it to hang indefinitely.
+ *
+ * Upstream commit:
+ * Not yet
+ */
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/radix-tree.h>
+#include <linux/rcupdate.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include "regression.h"
+
+static RADIX_TREE(mt_tree, GFP_KERNEL);
+static pthread_mutex_t mt_lock;
+
+struct page {
+ pthread_mutex_t lock;
+ struct rcu_head rcu;
+ int count;
+ unsigned long index;
+};
+
+static struct page *page_alloc(void)
+{
+ struct page *p;
+ p = malloc(sizeof(struct page));
+ p->count = 1;
+ p->index = 1;
+ pthread_mutex_init(&p->lock, NULL);
+
+ return p;
+}
+
+static void page_rcu_free(struct rcu_head *rcu)
+{
+ struct page *p = container_of(rcu, struct page, rcu);
+ assert(!p->count);
+ pthread_mutex_destroy(&p->lock);
+ free(p);
+}
+
+static void page_free(struct page *p)
+{
+ call_rcu(&p->rcu, page_rcu_free);
+}
+
+static unsigned find_get_pages(unsigned long start,
+ unsigned int nr_pages, struct page **pages)
+{
+ unsigned int i;
+ unsigned int ret;
+ unsigned int nr_found;
+
+ rcu_read_lock();
+restart:
+ nr_found = radix_tree_gang_lookup_slot(&mt_tree,
+ (void ***)pages, NULL, start, nr_pages);
+ ret = 0;
+ for (i = 0; i < nr_found; i++) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot((void **)pages[i]);
+ if (unlikely(!page))
+ continue;
+
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ assert((start | i) == 0);
+ goto restart;
+ }
+ /*
+ * No exceptional entries are inserted in this test.
+ */
+ assert(0);
+ }
+
+ pthread_mutex_lock(&page->lock);
+ if (!page->count) {
+ pthread_mutex_unlock(&page->lock);
+ goto repeat;
+ }
+ /* don't actually update page refcount */
+ pthread_mutex_unlock(&page->lock);
+
+ /* Has the page moved? */
+ if (unlikely(page != *((void **)pages[i]))) {
+ goto repeat;
+ }
+
+ pages[ret] = page;
+ ret++;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static pthread_barrier_t worker_barrier;
+
+static void *regression1_fn(void *arg)
+{
+ rcu_register_thread();
+
+ if (pthread_barrier_wait(&worker_barrier) ==
+ PTHREAD_BARRIER_SERIAL_THREAD) {
+ int j;
+
+ for (j = 0; j < 1000000; j++) {
+ struct page *p;
+
+ p = page_alloc();
+ pthread_mutex_lock(&mt_lock);
+ radix_tree_insert(&mt_tree, 0, p);
+ pthread_mutex_unlock(&mt_lock);
+
+ p = page_alloc();
+ pthread_mutex_lock(&mt_lock);
+ radix_tree_insert(&mt_tree, 1, p);
+ pthread_mutex_unlock(&mt_lock);
+
+ pthread_mutex_lock(&mt_lock);
+ p = radix_tree_delete(&mt_tree, 1);
+ pthread_mutex_lock(&p->lock);
+ p->count--;
+ pthread_mutex_unlock(&p->lock);
+ pthread_mutex_unlock(&mt_lock);
+ page_free(p);
+
+ pthread_mutex_lock(&mt_lock);
+ p = radix_tree_delete(&mt_tree, 0);
+ pthread_mutex_lock(&p->lock);
+ p->count--;
+ pthread_mutex_unlock(&p->lock);
+ pthread_mutex_unlock(&mt_lock);
+ page_free(p);
+ }
+ } else {
+ int j;
+
+ for (j = 0; j < 100000000; j++) {
+ struct page *pages[10];
+
+ find_get_pages(0, 10, pages);
+ }
+ }
+
+ rcu_unregister_thread();
+
+ return NULL;
+}
+
+static pthread_t *threads;
+void regression1_test(void)
+{
+ int nr_threads;
+ int i;
+ long arg;
+
+ /* Regression #1 */
+ printf("running regression test 1, should finish in under a minute\n");
+ nr_threads = 2;
+ pthread_barrier_init(&worker_barrier, NULL, nr_threads);
+
+ threads = malloc(nr_threads * sizeof(pthread_t *));
+
+ for (i = 0; i < nr_threads; i++) {
+ arg = i;
+ if (pthread_create(&threads[i], NULL, regression1_fn, (void *)arg)) {
+ perror("pthread_create");
+ exit(1);
+ }
+ }
+
+ for (i = 0; i < nr_threads; i++) {
+ if (pthread_join(threads[i], NULL)) {
+ perror("pthread_join");
+ exit(1);
+ }
+ }
+
+ free(threads);
+
+ printf("regression test 1, done\n");
+}
+
diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c
new file mode 100644
index 000000000000..5d2fa28cdca3
--- /dev/null
+++ b/tools/testing/radix-tree/regression2.c
@@ -0,0 +1,126 @@
+/*
+ * Regression2
+ * Description:
+ * Toshiyuki Okajima describes the following radix-tree bug:
+ *
+ * In the following case, we can get a hangup on
+ * radix_radix_tree_gang_lookup_tag_slot.
+ *
+ * 0. The radix tree contains RADIX_TREE_MAP_SIZE items. And the tag of
+ * a certain item has PAGECACHE_TAG_DIRTY.
+ * 1. radix_tree_range_tag_if_tagged(, start, end, , PAGECACHE_TAG_DIRTY,
+ * PAGECACHE_TAG_TOWRITE) is called to add PAGECACHE_TAG_TOWRITE tag
+ * for the tag which has PAGECACHE_TAG_DIRTY. However, there is no tag with
+ * PAGECACHE_TAG_DIRTY within the range from start to end. As the result,
+ * There is no tag with PAGECACHE_TAG_TOWRITE but the root tag has
+ * PAGECACHE_TAG_TOWRITE.
+ * 2. An item is added into the radix tree and then the level of it is
+ * extended into 2 from 1. At that time, the new radix tree node succeeds
+ * the tag status of the root tag. Therefore the tag of the new radix tree
+ * node has PAGECACHE_TAG_TOWRITE but there is not slot with
+ * PAGECACHE_TAG_TOWRITE tag in the child node of the new radix tree node.
+ * 3. The tag of a certain item is cleared with PAGECACHE_TAG_DIRTY.
+ * 4. All items within the index range from 0 to RADIX_TREE_MAP_SIZE - 1 are
+ * released. (Only the item which index is RADIX_TREE_MAP_SIZE exist in the
+ * radix tree.) As the result, the slot of the radix tree node is NULL but
+ * the tag which corresponds to the slot has PAGECACHE_TAG_TOWRITE.
+ * 5. radix_tree_gang_lookup_tag_slot(PAGECACHE_TAG_TOWRITE) calls
+ * __lookup_tag. __lookup_tag returns with 0. And __lookup_tag doesn't
+ * change the index that is the input and output parameter. Because the 1st
+ * slot of the radix tree node is NULL, but the tag which corresponds to
+ * the slot has PAGECACHE_TAG_TOWRITE.
+ * Therefore radix_tree_gang_lookup_tag_slot tries to get some items by
+ * calling __lookup_tag, but it cannot get any items forever.
+ *
+ * The fix is to change that radix_tree_tag_if_tagged doesn't tag the root tag
+ * if it doesn't set any tags within the specified range.
+ *
+ * Running:
+ * This test should run to completion immediately. The above bug would cause it
+ * to hang indefinitely.
+ *
+ * Upstream commit:
+ * Not yet
+ */
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/radix-tree.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "regression.h"
+
+#ifdef __KERNEL__
+#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6)
+#else
+#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */
+#endif
+
+#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
+#define PAGECACHE_TAG_DIRTY 0
+#define PAGECACHE_TAG_WRITEBACK 1
+#define PAGECACHE_TAG_TOWRITE 2
+
+static RADIX_TREE(mt_tree, GFP_KERNEL);
+unsigned long page_count = 0;
+
+struct page {
+ unsigned long index;
+};
+
+static struct page *page_alloc(void)
+{
+ struct page *p;
+ p = malloc(sizeof(struct page));
+ p->index = page_count++;
+
+ return p;
+}
+
+void regression2_test(void)
+{
+ int i;
+ struct page *p;
+ int max_slots = RADIX_TREE_MAP_SIZE;
+ unsigned long int start, end;
+ struct page *pages[1];
+
+ printf("running regression test 2 (should take milliseconds)\n");
+ /* 0. */
+ for (i = 0; i <= max_slots - 1; i++) {
+ p = page_alloc();
+ radix_tree_insert(&mt_tree, i, p);
+ }
+ radix_tree_tag_set(&mt_tree, max_slots - 1, PAGECACHE_TAG_DIRTY);
+
+ /* 1. */
+ start = 0;
+ end = max_slots - 2;
+ radix_tree_range_tag_if_tagged(&mt_tree, &start, end, 1,
+ PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+
+ /* 2. */
+ p = page_alloc();
+ radix_tree_insert(&mt_tree, max_slots, p);
+
+ /* 3. */
+ radix_tree_tag_clear(&mt_tree, max_slots - 1, PAGECACHE_TAG_DIRTY);
+
+ /* 4. */
+ for (i = max_slots - 1; i >= 0; i--)
+ radix_tree_delete(&mt_tree, i);
+
+ /* 5. */
+ // NOTE: start should not be 0 because radix_tree_gang_lookup_tag_slot
+ // can return.
+ start = 1;
+ end = max_slots - 2;
+ radix_tree_gang_lookup_tag_slot(&mt_tree, (void ***)pages, start, end,
+ PAGECACHE_TAG_TOWRITE);
+
+ /* We remove all the remained nodes */
+ radix_tree_delete(&mt_tree, max_slots);
+
+ printf("regression test 2, done\n");
+}
diff --git a/tools/testing/radix-tree/regression3.c b/tools/testing/radix-tree/regression3.c
new file mode 100644
index 000000000000..17d3ba5f4a0a
--- /dev/null
+++ b/tools/testing/radix-tree/regression3.c
@@ -0,0 +1,86 @@
+/*
+ * Regression3
+ * Description:
+ * Helper radix_tree_iter_retry resets next_index to the current index.
+ * In following radix_tree_next_slot current chunk size becomes zero.
+ * This isn't checked and it tries to dereference null pointer in slot.
+ *
+ * Running:
+ * This test should run to completion immediately. The above bug would
+ * cause it to segfault.
+ *
+ * Upstream commit:
+ * Not yet
+ */
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/radix-tree.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "regression.h"
+
+void regression3_test(void)
+{
+ RADIX_TREE(root, GFP_KERNEL);
+ void *ptr = (void *)4ul;
+ struct radix_tree_iter iter;
+ void **slot;
+ bool first;
+
+ printf("running regression test 3 (should take milliseconds)\n");
+
+ radix_tree_insert(&root, 0, ptr);
+ radix_tree_tag_set(&root, 0, 0);
+
+ first = true;
+ radix_tree_for_each_tagged(slot, &root, &iter, 0, 0) {
+// printk("tagged %ld %p\n", iter.index, *slot);
+ if (first) {
+ radix_tree_insert(&root, 1, ptr);
+ radix_tree_tag_set(&root, 1, 0);
+ first = false;
+ }
+ if (radix_tree_deref_retry(*slot)) {
+// printk("retry %ld\n", iter.index);
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ }
+ radix_tree_delete(&root, 1);
+
+ first = true;
+ radix_tree_for_each_slot(slot, &root, &iter, 0) {
+// printk("slot %ld %p\n", iter.index, *slot);
+ if (first) {
+ radix_tree_insert(&root, 1, ptr);
+ first = false;
+ }
+ if (radix_tree_deref_retry(*slot)) {
+// printk("retry %ld\n", iter.index);
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ }
+ radix_tree_delete(&root, 1);
+
+ first = true;
+ radix_tree_for_each_contig(slot, &root, &iter, 0) {
+// printk("contig %ld %p\n", iter.index, *slot);
+ if (first) {
+ radix_tree_insert(&root, 1, ptr);
+ first = false;
+ }
+ if (radix_tree_deref_retry(*slot)) {
+// printk("retry %ld\n", iter.index);
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ }
+
+ radix_tree_delete(&root, 0);
+ radix_tree_delete(&root, 1);
+
+ printf("regression test 3 passed\n");
+}
diff --git a/tools/testing/radix-tree/tag_check.c b/tools/testing/radix-tree/tag_check.c
new file mode 100644
index 000000000000..83136be552a0
--- /dev/null
+++ b/tools/testing/radix-tree/tag_check.c
@@ -0,0 +1,332 @@
+#include <stdlib.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <linux/slab.h>
+#include <linux/radix-tree.h>
+
+#include "test.h"
+
+
+static void
+__simple_checks(struct radix_tree_root *tree, unsigned long index, int tag)
+{
+ int ret;
+
+ item_check_absent(tree, index);
+ assert(item_tag_get(tree, index, tag) == 0);
+
+ item_insert(tree, index);
+ assert(item_tag_get(tree, index, tag) == 0);
+ item_tag_set(tree, index, tag);
+ ret = item_tag_get(tree, index, tag);
+ assert(ret != 0);
+ ret = item_delete(tree, index);
+ assert(ret != 0);
+ item_insert(tree, index);
+ ret = item_tag_get(tree, index, tag);
+ assert(ret == 0);
+ ret = item_delete(tree, index);
+ assert(ret != 0);
+ ret = item_delete(tree, index);
+ assert(ret == 0);
+}
+
+void simple_checks(void)
+{
+ unsigned long index;
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ for (index = 0; index < 10000; index++) {
+ __simple_checks(&tree, index, 0);
+ __simple_checks(&tree, index, 1);
+ }
+ verify_tag_consistency(&tree, 0);
+ verify_tag_consistency(&tree, 1);
+ printf("before item_kill_tree: %d allocated\n", nr_allocated);
+ item_kill_tree(&tree);
+ printf("after item_kill_tree: %d allocated\n", nr_allocated);
+}
+
+/*
+ * Check that tags propagate correctly when extending a tree.
+ */
+static void extend_checks(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ item_insert(&tree, 43);
+ assert(item_tag_get(&tree, 43, 0) == 0);
+ item_tag_set(&tree, 43, 0);
+ assert(item_tag_get(&tree, 43, 0) == 1);
+ item_insert(&tree, 1000000);
+ assert(item_tag_get(&tree, 43, 0) == 1);
+
+ item_insert(&tree, 0);
+ item_tag_set(&tree, 0, 0);
+ item_delete(&tree, 1000000);
+ assert(item_tag_get(&tree, 43, 0) != 0);
+ item_delete(&tree, 43);
+ assert(item_tag_get(&tree, 43, 0) == 0); /* crash */
+ assert(item_tag_get(&tree, 0, 0) == 1);
+
+ verify_tag_consistency(&tree, 0);
+
+ item_kill_tree(&tree);
+}
+
+/*
+ * Check that tags propagate correctly when contracting a tree.
+ */
+static void contract_checks(void)
+{
+ struct item *item;
+ int tmp;
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ tmp = 1<<RADIX_TREE_MAP_SHIFT;
+ item_insert(&tree, tmp);
+ item_insert(&tree, tmp+1);
+ item_tag_set(&tree, tmp, 0);
+ item_tag_set(&tree, tmp, 1);
+ item_tag_set(&tree, tmp+1, 0);
+ item_delete(&tree, tmp+1);
+ item_tag_clear(&tree, tmp, 1);
+
+ assert(radix_tree_gang_lookup_tag(&tree, (void **)&item, 0, 1, 0) == 1);
+ assert(radix_tree_gang_lookup_tag(&tree, (void **)&item, 0, 1, 1) == 0);
+
+ assert(item_tag_get(&tree, tmp, 0) == 1);
+ assert(item_tag_get(&tree, tmp, 1) == 0);
+
+ verify_tag_consistency(&tree, 0);
+ item_kill_tree(&tree);
+}
+
+/*
+ * Stupid tag thrasher
+ *
+ * Create a large linear array corresponding to the tree. Each element in
+ * the array is coherent with each node in the tree
+ */
+
+enum {
+ NODE_ABSENT = 0,
+ NODE_PRESENT = 1,
+ NODE_TAGGED = 2,
+};
+
+#define THRASH_SIZE 1000 * 1000
+#define N 127
+#define BATCH 33
+
+static void gang_check(struct radix_tree_root *tree,
+ char *thrash_state, int tag)
+{
+ struct item *items[BATCH];
+ int nr_found;
+ unsigned long index = 0;
+ unsigned long last_index = 0;
+
+ while ((nr_found = radix_tree_gang_lookup_tag(tree, (void **)items,
+ index, BATCH, tag))) {
+ int i;
+
+ for (i = 0; i < nr_found; i++) {
+ struct item *item = items[i];
+
+ while (last_index < item->index) {
+ assert(thrash_state[last_index] != NODE_TAGGED);
+ last_index++;
+ }
+ assert(thrash_state[last_index] == NODE_TAGGED);
+ last_index++;
+ }
+ index = items[nr_found - 1]->index + 1;
+ }
+}
+
+static void do_thrash(struct radix_tree_root *tree, char *thrash_state, int tag)
+{
+ int insert_chunk;
+ int delete_chunk;
+ int tag_chunk;
+ int untag_chunk;
+ int total_tagged = 0;
+ int total_present = 0;
+
+ for (insert_chunk = 1; insert_chunk < THRASH_SIZE; insert_chunk *= N)
+ for (delete_chunk = 1; delete_chunk < THRASH_SIZE; delete_chunk *= N)
+ for (tag_chunk = 1; tag_chunk < THRASH_SIZE; tag_chunk *= N)
+ for (untag_chunk = 1; untag_chunk < THRASH_SIZE; untag_chunk *= N) {
+ int i;
+ unsigned long index;
+ int nr_inserted = 0;
+ int nr_deleted = 0;
+ int nr_tagged = 0;
+ int nr_untagged = 0;
+ int actual_total_tagged;
+ int actual_total_present;
+
+ for (i = 0; i < insert_chunk; i++) {
+ index = rand() % THRASH_SIZE;
+ if (thrash_state[index] != NODE_ABSENT)
+ continue;
+ item_check_absent(tree, index);
+ item_insert(tree, index);
+ assert(thrash_state[index] != NODE_PRESENT);
+ thrash_state[index] = NODE_PRESENT;
+ nr_inserted++;
+ total_present++;
+ }
+
+ for (i = 0; i < delete_chunk; i++) {
+ index = rand() % THRASH_SIZE;
+ if (thrash_state[index] == NODE_ABSENT)
+ continue;
+ item_check_present(tree, index);
+ if (item_tag_get(tree, index, tag)) {
+ assert(thrash_state[index] == NODE_TAGGED);
+ total_tagged--;
+ } else {
+ assert(thrash_state[index] == NODE_PRESENT);
+ }
+ item_delete(tree, index);
+ assert(thrash_state[index] != NODE_ABSENT);
+ thrash_state[index] = NODE_ABSENT;
+ nr_deleted++;
+ total_present--;
+ }
+
+ for (i = 0; i < tag_chunk; i++) {
+ index = rand() % THRASH_SIZE;
+ if (thrash_state[index] != NODE_PRESENT) {
+ if (item_lookup(tree, index))
+ assert(item_tag_get(tree, index, tag));
+ continue;
+ }
+ item_tag_set(tree, index, tag);
+ item_tag_set(tree, index, tag);
+ assert(thrash_state[index] != NODE_TAGGED);
+ thrash_state[index] = NODE_TAGGED;
+ nr_tagged++;
+ total_tagged++;
+ }
+
+ for (i = 0; i < untag_chunk; i++) {
+ index = rand() % THRASH_SIZE;
+ if (thrash_state[index] != NODE_TAGGED)
+ continue;
+ item_check_present(tree, index);
+ assert(item_tag_get(tree, index, tag));
+ item_tag_clear(tree, index, tag);
+ item_tag_clear(tree, index, tag);
+ assert(thrash_state[index] != NODE_PRESENT);
+ thrash_state[index] = NODE_PRESENT;
+ nr_untagged++;
+ total_tagged--;
+ }
+
+ actual_total_tagged = 0;
+ actual_total_present = 0;
+ for (index = 0; index < THRASH_SIZE; index++) {
+ switch (thrash_state[index]) {
+ case NODE_ABSENT:
+ item_check_absent(tree, index);
+ break;
+ case NODE_PRESENT:
+ item_check_present(tree, index);
+ assert(!item_tag_get(tree, index, tag));
+ actual_total_present++;
+ break;
+ case NODE_TAGGED:
+ item_check_present(tree, index);
+ assert(item_tag_get(tree, index, tag));
+ actual_total_present++;
+ actual_total_tagged++;
+ break;
+ }
+ }
+
+ gang_check(tree, thrash_state, tag);
+
+ printf("%d(%d) %d(%d) %d(%d) %d(%d) / "
+ "%d(%d) present, %d(%d) tagged\n",
+ insert_chunk, nr_inserted,
+ delete_chunk, nr_deleted,
+ tag_chunk, nr_tagged,
+ untag_chunk, nr_untagged,
+ total_present, actual_total_present,
+ total_tagged, actual_total_tagged);
+ }
+}
+
+static void thrash_tags(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+ char *thrash_state;
+
+ thrash_state = malloc(THRASH_SIZE);
+ memset(thrash_state, 0, THRASH_SIZE);
+
+ do_thrash(&tree, thrash_state, 0);
+
+ verify_tag_consistency(&tree, 0);
+ item_kill_tree(&tree);
+ free(thrash_state);
+}
+
+static void leak_check(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ item_insert(&tree, 1000000);
+ item_delete(&tree, 1000000);
+ item_kill_tree(&tree);
+}
+
+static void __leak_check(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ printf("%d: nr_allocated=%d\n", __LINE__, nr_allocated);
+ item_insert(&tree, 1000000);
+ printf("%d: nr_allocated=%d\n", __LINE__, nr_allocated);
+ item_delete(&tree, 1000000);
+ printf("%d: nr_allocated=%d\n", __LINE__, nr_allocated);
+ item_kill_tree(&tree);
+ printf("%d: nr_allocated=%d\n", __LINE__, nr_allocated);
+}
+
+static void single_check(void)
+{
+ struct item *items[BATCH];
+ RADIX_TREE(tree, GFP_KERNEL);
+ int ret;
+
+ item_insert(&tree, 0);
+ item_tag_set(&tree, 0, 0);
+ ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 0);
+ assert(ret == 1);
+ ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 1, BATCH, 0);
+ assert(ret == 0);
+ verify_tag_consistency(&tree, 0);
+ verify_tag_consistency(&tree, 1);
+ item_kill_tree(&tree);
+}
+
+void tag_check(void)
+{
+ single_check();
+ extend_checks();
+ contract_checks();
+ printf("after extend_checks: %d allocated\n", nr_allocated);
+ __leak_check();
+ leak_check();
+ printf("after leak_check: %d allocated\n", nr_allocated);
+ simple_checks();
+ printf("after simple_checks: %d allocated\n", nr_allocated);
+ thrash_tags();
+ printf("after thrash_tags: %d allocated\n", nr_allocated);
+}
diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c
new file mode 100644
index 000000000000..2bebf34cdc27
--- /dev/null
+++ b/tools/testing/radix-tree/test.c
@@ -0,0 +1,219 @@
+#include <stdlib.h>
+#include <assert.h>
+#include <stdio.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+
+#include "test.h"
+
+struct item *
+item_tag_set(struct radix_tree_root *root, unsigned long index, int tag)
+{
+ return radix_tree_tag_set(root, index, tag);
+}
+
+struct item *
+item_tag_clear(struct radix_tree_root *root, unsigned long index, int tag)
+{
+ return radix_tree_tag_clear(root, index, tag);
+}
+
+int item_tag_get(struct radix_tree_root *root, unsigned long index, int tag)
+{
+ return radix_tree_tag_get(root, index, tag);
+}
+
+int __item_insert(struct radix_tree_root *root, struct item *item)
+{
+ return radix_tree_insert(root, item->index, item);
+}
+
+int item_insert(struct radix_tree_root *root, unsigned long index)
+{
+ return __item_insert(root, item_create(index));
+}
+
+int item_delete(struct radix_tree_root *root, unsigned long index)
+{
+ struct item *item = radix_tree_delete(root, index);
+
+ if (item) {
+ assert(item->index == index);
+ free(item);
+ return 1;
+ }
+ return 0;
+}
+
+struct item *item_create(unsigned long index)
+{
+ struct item *ret = malloc(sizeof(*ret));
+
+ ret->index = index;
+ return ret;
+}
+
+void item_check_present(struct radix_tree_root *root, unsigned long index)
+{
+ struct item *item;
+
+ item = radix_tree_lookup(root, index);
+ assert(item != 0);
+ assert(item->index == index);
+}
+
+struct item *item_lookup(struct radix_tree_root *root, unsigned long index)
+{
+ return radix_tree_lookup(root, index);
+}
+
+void item_check_absent(struct radix_tree_root *root, unsigned long index)
+{
+ struct item *item;
+
+ item = radix_tree_lookup(root, index);
+ assert(item == 0);
+}
+
+/*
+ * Scan only the passed (start, start+nr] for present items
+ */
+void item_gang_check_present(struct radix_tree_root *root,
+ unsigned long start, unsigned long nr,
+ int chunk, int hop)
+{
+ struct item *items[chunk];
+ unsigned long into;
+
+ for (into = 0; into < nr; ) {
+ int nfound;
+ int nr_to_find = chunk;
+ int i;
+
+ if (nr_to_find > (nr - into))
+ nr_to_find = nr - into;
+
+ nfound = radix_tree_gang_lookup(root, (void **)items,
+ start + into, nr_to_find);
+ assert(nfound == nr_to_find);
+ for (i = 0; i < nfound; i++)
+ assert(items[i]->index == start + into + i);
+ into += hop;
+ }
+}
+
+/*
+ * Scan the entire tree, only expecting present items (start, start+nr]
+ */
+void item_full_scan(struct radix_tree_root *root, unsigned long start,
+ unsigned long nr, int chunk)
+{
+ struct item *items[chunk];
+ unsigned long into = 0;
+ unsigned long this_index = start;
+ int nfound;
+ int i;
+
+// printf("%s(0x%08lx, 0x%08lx, %d)\n", __FUNCTION__, start, nr, chunk);
+
+ while ((nfound = radix_tree_gang_lookup(root, (void **)items, into,
+ chunk))) {
+// printf("At 0x%08lx, nfound=%d\n", into, nfound);
+ for (i = 0; i < nfound; i++) {
+ assert(items[i]->index == this_index);
+ this_index++;
+ }
+// printf("Found 0x%08lx->0x%08lx\n",
+// items[0]->index, items[nfound-1]->index);
+ into = this_index;
+ }
+ if (chunk)
+ assert(this_index == start + nr);
+ nfound = radix_tree_gang_lookup(root, (void **)items,
+ this_index, chunk);
+ assert(nfound == 0);
+}
+
+static int verify_node(struct radix_tree_node *slot, unsigned int tag,
+ unsigned int height, int tagged)
+{
+ int anyset = 0;
+ int i;
+ int j;
+
+ slot = indirect_to_ptr(slot);
+
+ /* Verify consistency at this level */
+ for (i = 0; i < RADIX_TREE_TAG_LONGS; i++) {
+ if (slot->tags[tag][i]) {
+ anyset = 1;
+ break;
+ }
+ }
+ if (tagged != anyset) {
+ printf("tag: %u, height %u, tagged: %d, anyset: %d\n", tag, height, tagged, anyset);
+ for (j = 0; j < RADIX_TREE_MAX_TAGS; j++) {
+ printf("tag %d: ", j);
+ for (i = 0; i < RADIX_TREE_TAG_LONGS; i++)
+ printf("%016lx ", slot->tags[j][i]);
+ printf("\n");
+ }
+ return 1;
+ }
+ assert(tagged == anyset);
+
+ /* Go for next level */
+ if (height > 1) {
+ for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
+ if (slot->slots[i])
+ if (verify_node(slot->slots[i], tag, height - 1,
+ !!test_bit(i, slot->tags[tag]))) {
+ printf("Failure at off %d\n", i);
+ for (j = 0; j < RADIX_TREE_MAX_TAGS; j++) {
+ printf("tag %d: ", j);
+ for (i = 0; i < RADIX_TREE_TAG_LONGS; i++)
+ printf("%016lx ", slot->tags[j][i]);
+ printf("\n");
+ }
+ return 1;
+ }
+ }
+ return 0;
+}
+
+void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag)
+{
+ if (!root->height)
+ return;
+ verify_node(root->rnode, tag, root->height, !!root_tag_get(root, tag));
+}
+
+void item_kill_tree(struct radix_tree_root *root)
+{
+ struct item *items[32];
+ int nfound;
+
+ while ((nfound = radix_tree_gang_lookup(root, (void **)items, 0, 32))) {
+ int i;
+
+ for (i = 0; i < nfound; i++) {
+ void *ret;
+
+ ret = radix_tree_delete(root, items[i]->index);
+ assert(ret == items[i]);
+ free(items[i]);
+ }
+ }
+ assert(radix_tree_gang_lookup(root, (void **)items, 0, 32) == 0);
+ assert(root->rnode == NULL);
+}
+
+void tree_verify_min_height(struct radix_tree_root *root, int maxindex)
+{
+ assert(radix_tree_maxindex(root->height) >= maxindex);
+ if (root->height > 1)
+ assert(radix_tree_maxindex(root->height-1) < maxindex);
+ else if (root->height == 1)
+ assert(radix_tree_maxindex(root->height-1) <= maxindex);
+}
diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h
new file mode 100644
index 000000000000..4e1d95faaa94
--- /dev/null
+++ b/tools/testing/radix-tree/test.h
@@ -0,0 +1,40 @@
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/radix-tree.h>
+#include <linux/rcupdate.h>
+
+struct item {
+ unsigned long index;
+};
+
+struct item *item_create(unsigned long index);
+int __item_insert(struct radix_tree_root *root, struct item *item);
+int item_insert(struct radix_tree_root *root, unsigned long index);
+int item_delete(struct radix_tree_root *root, unsigned long index);
+struct item *item_lookup(struct radix_tree_root *root, unsigned long index);
+
+void item_check_present(struct radix_tree_root *root, unsigned long index);
+void item_check_absent(struct radix_tree_root *root, unsigned long index);
+void item_gang_check_present(struct radix_tree_root *root,
+ unsigned long start, unsigned long nr,
+ int chunk, int hop);
+void item_full_scan(struct radix_tree_root *root, unsigned long start,
+ unsigned long nr, int chunk);
+void item_kill_tree(struct radix_tree_root *root);
+
+void tag_check(void);
+
+struct item *
+item_tag_set(struct radix_tree_root *root, unsigned long index, int tag);
+struct item *
+item_tag_clear(struct radix_tree_root *root, unsigned long index, int tag);
+int item_tag_get(struct radix_tree_root *root, unsigned long index, int tag);
+void tree_verify_min_height(struct radix_tree_root *root, int maxindex);
+void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag);
+
+extern int nr_allocated;
+
+/* Normally private parts of lib/radix-tree.c */
+void *indirect_to_ptr(void *ptr);
+int root_tag_get(struct radix_tree_root *root, unsigned int tag);
+unsigned long radix_tree_maxindex(unsigned int height);
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index 5a6016224bb9..dab61c377f54 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -61,6 +61,8 @@
#define PM_PFRAME_BITS 55
#define PM_PFRAME_MASK ((1LL << PM_PFRAME_BITS) - 1)
#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
+#define MAX_SWAPFILES_SHIFT 5
+#define PM_SWAP_OFFSET(x) (((x) & PM_PFRAME_MASK) >> MAX_SWAPFILES_SHIFT)
#define PM_SOFT_DIRTY (1ULL << 55)
#define PM_MMAP_EXCLUSIVE (1ULL << 56)
#define PM_FILE (1ULL << 61)
@@ -73,6 +75,7 @@
#define KPF_BYTES 8
#define PROC_KPAGEFLAGS "/proc/kpageflags"
+#define PROC_KPAGECGROUP "/proc/kpagecgroup"
/* [32-] kernel hacking assistances */
#define KPF_RESERVED 32
@@ -92,7 +95,8 @@
#define KPF_SLOB_FREE 49
#define KPF_SLUB_FROZEN 50
#define KPF_SLUB_DEBUG 51
-#define KPF_FILE 62
+#define KPF_FILE 61
+#define KPF_SWAP 62
#define KPF_MMAP_EXCLUSIVE 63
#define KPF_ALL_BITS ((uint64_t)~0ULL)
@@ -146,6 +150,7 @@ static const char * const page_flag_names[] = {
[KPF_SLUB_DEBUG] = "E:slub_debug",
[KPF_FILE] = "F:file",
+ [KPF_SWAP] = "w:swap",
[KPF_MMAP_EXCLUSIVE] = "1:mmap_exclusive",
};
@@ -164,7 +169,9 @@ static int opt_raw; /* for kernel developers */
static int opt_list; /* list pages (in ranges) */
static int opt_no_summary; /* don't show summary */
static pid_t opt_pid; /* process to walk */
-const char * opt_file;
+const char * opt_file; /* file or directory path */
+static uint64_t opt_cgroup; /* cgroup inode */
+static int opt_list_cgroup;/* list page cgroup */
#define MAX_ADDR_RANGES 1024
static int nr_addr_ranges;
@@ -185,6 +192,7 @@ static int page_size;
static int pagemap_fd;
static int kpageflags_fd;
+static int kpagecgroup_fd = -1;
static int opt_hwpoison;
static int opt_unpoison;
@@ -278,6 +286,16 @@ static unsigned long kpageflags_read(uint64_t *buf,
return do_u64_read(kpageflags_fd, PROC_KPAGEFLAGS, buf, index, pages);
}
+static unsigned long kpagecgroup_read(uint64_t *buf,
+ unsigned long index,
+ unsigned long pages)
+{
+ if (kpagecgroup_fd < 0)
+ return pages;
+
+ return do_u64_read(kpagecgroup_fd, PROC_KPAGEFLAGS, buf, index, pages);
+}
+
static unsigned long pagemap_read(uint64_t *buf,
unsigned long index,
unsigned long pages)
@@ -297,6 +315,10 @@ static unsigned long pagemap_pfn(uint64_t val)
return pfn;
}
+static unsigned long pagemap_swap_offset(uint64_t val)
+{
+ return val & PM_SWAP ? PM_SWAP_OFFSET(val) : 0;
+}
/*
* page flag names
@@ -346,14 +368,15 @@ static char *page_flag_longname(uint64_t flags)
*/
static void show_page_range(unsigned long voffset, unsigned long offset,
- unsigned long size, uint64_t flags)
+ unsigned long size, uint64_t flags, uint64_t cgroup)
{
static uint64_t flags0;
+ static uint64_t cgroup0;
static unsigned long voff;
static unsigned long index;
static unsigned long count;
- if (flags == flags0 && offset == index + count &&
+ if (flags == flags0 && cgroup == cgroup0 && offset == index + count &&
size && voffset == voff + count) {
count += size;
return;
@@ -364,11 +387,14 @@ static void show_page_range(unsigned long voffset, unsigned long offset,
printf("%lx\t", voff);
if (opt_file)
printf("%lu\t", voff);
+ if (opt_list_cgroup)
+ printf("@%llu\t", (unsigned long long)cgroup0);
printf("%lx\t%lx\t%s\n",
index, count, page_flag_name(flags0));
}
flags0 = flags;
+ cgroup0= cgroup;
index = offset;
voff = voffset;
count = size;
@@ -376,16 +402,18 @@ static void show_page_range(unsigned long voffset, unsigned long offset,
static void flush_page_range(void)
{
- show_page_range(0, 0, 0, 0);
+ show_page_range(0, 0, 0, 0, 0);
}
-static void show_page(unsigned long voffset,
- unsigned long offset, uint64_t flags)
+static void show_page(unsigned long voffset, unsigned long offset,
+ uint64_t flags, uint64_t cgroup)
{
if (opt_pid)
printf("%lx\t", voffset);
if (opt_file)
printf("%lu\t", voffset);
+ if (opt_list_cgroup)
+ printf("@%llu\t", (unsigned long long)cgroup);
printf("%lx\t%s\n", offset, page_flag_name(flags));
}
@@ -452,6 +480,8 @@ static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
flags |= BIT(SOFTDIRTY);
if (pme & PM_FILE)
flags |= BIT(FILE);
+ if (pme & PM_SWAP)
+ flags |= BIT(SWAP);
if (pme & PM_MMAP_EXCLUSIVE)
flags |= BIT(MMAP_EXCLUSIVE);
@@ -566,23 +596,26 @@ static size_t hash_slot(uint64_t flags)
exit(EXIT_FAILURE);
}
-static void add_page(unsigned long voffset,
- unsigned long offset, uint64_t flags, uint64_t pme)
+static void add_page(unsigned long voffset, unsigned long offset,
+ uint64_t flags, uint64_t cgroup, uint64_t pme)
{
flags = kpageflags_flags(flags, pme);
if (!bit_mask_ok(flags))
return;
+ if (opt_cgroup && cgroup != (uint64_t)opt_cgroup)
+ return;
+
if (opt_hwpoison)
hwpoison_page(offset);
if (opt_unpoison)
unpoison_page(offset);
if (opt_list == 1)
- show_page_range(voffset, offset, 1, flags);
+ show_page_range(voffset, offset, 1, flags, cgroup);
else if (opt_list == 2)
- show_page(voffset, offset, flags);
+ show_page(voffset, offset, flags, cgroup);
nr_pages[hash_slot(flags)]++;
total_pages++;
@@ -595,24 +628,49 @@ static void walk_pfn(unsigned long voffset,
uint64_t pme)
{
uint64_t buf[KPAGEFLAGS_BATCH];
+ uint64_t cgi[KPAGEFLAGS_BATCH];
unsigned long batch;
unsigned long pages;
unsigned long i;
+ memset(cgi, 0, sizeof cgi);
+
while (count) {
batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH);
pages = kpageflags_read(buf, index, batch);
if (pages == 0)
break;
+ if (kpagecgroup_read(cgi, index, pages) != pages)
+ fatal("kpagecgroup returned fewer pages than expected");
+
for (i = 0; i < pages; i++)
- add_page(voffset + i, index + i, buf[i], pme);
+ add_page(voffset + i, index + i, buf[i], cgi[i], pme);
index += pages;
count -= pages;
}
}
+static void walk_swap(unsigned long voffset, uint64_t pme)
+{
+ uint64_t flags = kpageflags_flags(0, pme);
+
+ if (!bit_mask_ok(flags))
+ return;
+
+ if (opt_cgroup)
+ return;
+
+ if (opt_list == 1)
+ show_page_range(voffset, pagemap_swap_offset(pme), 1, flags, 0);
+ else if (opt_list == 2)
+ show_page(voffset, pagemap_swap_offset(pme), flags, 0);
+
+ nr_pages[hash_slot(flags)]++;
+ total_pages++;
+}
+
#define PAGEMAP_BATCH (64 << 10)
static void walk_vma(unsigned long index, unsigned long count)
{
@@ -632,6 +690,8 @@ static void walk_vma(unsigned long index, unsigned long count)
pfn = pagemap_pfn(buf[i]);
if (pfn)
walk_pfn(index + i, pfn, 1, buf[i]);
+ if (buf[i] & PM_SWAP)
+ walk_swap(index + i, buf[i]);
}
index += pages;
@@ -713,10 +773,12 @@ static void usage(void)
" -d|--describe flags Describe flags\n"
" -a|--addr addr-spec Walk a range of pages\n"
" -b|--bits bits-spec Walk pages with specified bits\n"
+" -c|--cgroup path|@inode Walk pages within memory cgroup\n"
" -p|--pid pid Walk process address space\n"
" -f|--file filename Walk file address space\n"
" -l|--list Show page details in ranges\n"
" -L|--list-each Show page details one by one\n"
+" -C|--list-cgroup Show cgroup inode for pages\n"
" -N|--no-summary Don't show summary info\n"
" -X|--hwpoison hwpoison pages\n"
" -x|--unpoison unpoison pages\n"
@@ -851,6 +913,7 @@ static void walk_file(const char *name, const struct stat *st)
{
uint8_t vec[PAGEMAP_BATCH];
uint64_t buf[PAGEMAP_BATCH], flags;
+ uint64_t cgroup = 0;
unsigned long nr_pages, pfn, i;
off_t off, end = st->st_size;
int fd;
@@ -908,12 +971,15 @@ got_sigbus:
continue;
if (!kpageflags_read(&flags, pfn, 1))
continue;
+ if (!kpagecgroup_read(&cgroup, pfn, 1))
+ fatal("kpagecgroup_read failed");
if (first && opt_list) {
first = 0;
flush_page_range();
show_file(name, st);
}
- add_page(off / page_size + i, pfn, flags, buf[i]);
+ add_page(off / page_size + i, pfn,
+ flags, cgroup, buf[i]);
}
}
@@ -965,6 +1031,24 @@ static void parse_file(const char *name)
opt_file = name;
}
+static void parse_cgroup(const char *path)
+{
+ if (path[0] == '@') {
+ opt_cgroup = parse_number(path + 1);
+ return;
+ }
+
+ struct stat st;
+
+ if (stat(path, &st))
+ fatal("stat failed: %s: %m\n", path);
+
+ if (!S_ISDIR(st.st_mode))
+ fatal("cgroup supposed to be a directory: %s\n", path);
+
+ opt_cgroup = st.st_ino;
+}
+
static void parse_addr_range(const char *optarg)
{
unsigned long offset;
@@ -1088,9 +1172,11 @@ static const struct option opts[] = {
{ "file" , 1, NULL, 'f' },
{ "addr" , 1, NULL, 'a' },
{ "bits" , 1, NULL, 'b' },
+ { "cgroup" , 1, NULL, 'c' },
{ "describe" , 1, NULL, 'd' },
{ "list" , 0, NULL, 'l' },
{ "list-each" , 0, NULL, 'L' },
+ { "list-cgroup", 0, NULL, 'C' },
{ "no-summary", 0, NULL, 'N' },
{ "hwpoison" , 0, NULL, 'X' },
{ "unpoison" , 0, NULL, 'x' },
@@ -1105,7 +1191,7 @@ int main(int argc, char *argv[])
page_size = getpagesize();
while ((c = getopt_long(argc, argv,
- "rp:f:a:b:d:lLNXxh", opts, NULL)) != -1) {
+ "rp:f:a:b:d:c:ClLNXxh", opts, NULL)) != -1) {
switch (c) {
case 'r':
opt_raw = 1;
@@ -1122,6 +1208,12 @@ int main(int argc, char *argv[])
case 'b':
parse_bits_mask(optarg);
break;
+ case 'c':
+ parse_cgroup(optarg);
+ break;
+ case 'C':
+ opt_list_cgroup = 1;
+ break;
case 'd':
describe_flags(optarg);
exit(0);
@@ -1151,10 +1243,15 @@ int main(int argc, char *argv[])
}
}
+ if (opt_cgroup || opt_list_cgroup)
+ kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY);
+
if (opt_list && opt_pid)
printf("voffset\t");
if (opt_list && opt_file)
printf("foffset\t");
+ if (opt_list && opt_list_cgroup)
+ printf("cgroup\t");
if (opt_list == 1)
printf("offset\tlen\tflags\n");
if (opt_list == 2)